Diffstat (limited to 'src/mongo')
-rw-r--r--src/mongo/bson/README7
-rw-r--r--src/mongo/bson/bson-inl.h1007
-rw-r--r--src/mongo/bson/bson.h110
-rw-r--r--src/mongo/bson/bson_db.h88
-rw-r--r--src/mongo/bson/bsondemo/bsondemo.cpp113
-rw-r--r--src/mongo/bson/bsondemo/bsondemo.vcproj243
-rw-r--r--src/mongo/bson/bsondemo/bsondemo.vcxproj193
-rw-r--r--src/mongo/bson/bsondemo/bsondemo.vcxproj.filters52
-rw-r--r--src/mongo/bson/bsonelement.h583
-rw-r--r--src/mongo/bson/bsonmisc.h211
-rw-r--r--src/mongo/bson/bsonobj.h497
-rw-r--r--src/mongo/bson/bsonobjbuilder.h842
-rw-r--r--src/mongo/bson/bsonobjiterator.h161
-rw-r--r--src/mongo/bson/bsontypes.h107
-rw-r--r--src/mongo/bson/inline_decls.h68
-rw-r--r--src/mongo/bson/oid.cpp173
-rw-r--r--src/mongo/bson/oid.h138
-rw-r--r--src/mongo/bson/ordering.h73
-rw-r--r--src/mongo/bson/stringdata.h71
-rw-r--r--src/mongo/bson/util/atomic_int.h106
-rw-r--r--src/mongo/bson/util/builder.h322
-rw-r--r--src/mongo/bson/util/misc.h121
-rw-r--r--src/mongo/client/clientOnly.cpp92
-rw-r--r--src/mongo/client/connpool.cpp426
-rw-r--r--src/mongo/client/connpool.h291
-rw-r--r--src/mongo/client/constants.h26
-rw-r--r--src/mongo/client/dbclient.cpp1087
-rw-r--r--src/mongo/client/dbclient.h1049
-rw-r--r--src/mongo/client/dbclient_rs.cpp993
-rw-r--r--src/mongo/client/dbclient_rs.h355
-rw-r--r--src/mongo/client/dbclientcursor.cpp324
-rw-r--r--src/mongo/client/dbclientcursor.h243
-rw-r--r--src/mongo/client/dbclientmockcursor.h40
-rw-r--r--src/mongo/client/distlock.cpp958
-rw-r--r--src/mongo/client/distlock.h244
-rw-r--r--src/mongo/client/distlock_test.cpp446
-rw-r--r--src/mongo/client/examples/authTest.cpp54
-rw-r--r--src/mongo/client/examples/clientTest.cpp279
-rw-r--r--src/mongo/client/examples/first.cpp86
-rw-r--r--src/mongo/client/examples/httpClientTest.cpp58
-rw-r--r--src/mongo/client/examples/insert_demo.cpp47
-rw-r--r--src/mongo/client/examples/mongoperf.cpp269
-rwxr-xr-xsrc/mongo/client/examples/mongoperf.vcxproj113
-rwxr-xr-xsrc/mongo/client/examples/mongoperf.vcxproj.filters73
-rw-r--r--src/mongo/client/examples/rs.cpp118
-rw-r--r--src/mongo/client/examples/second.cpp56
-rwxr-xr-xsrc/mongo/client/examples/simple_client_demo.vcxproj107
-rwxr-xr-xsrc/mongo/client/examples/simple_client_demo.vcxproj.filters17
-rw-r--r--src/mongo/client/examples/tail.cpp46
-rw-r--r--src/mongo/client/examples/tutorial.cpp71
-rw-r--r--src/mongo/client/examples/whereExample.cpp69
-rw-r--r--src/mongo/client/gridfs.cpp245
-rw-r--r--src/mongo/client/gridfs.h205
-rw-r--r--src/mongo/client/model.cpp138
-rw-r--r--src/mongo/client/model.h62
-rw-r--r--src/mongo/client/mongo_client_lib.cpp82
-rw-r--r--src/mongo/client/parallel.cpp1515
-rw-r--r--src/mongo/client/parallel.h444
-rw-r--r--src/mongo/client/redef_macros.h61
-rw-r--r--src/mongo/client/simple_client_demo.cpp54
-rw-r--r--src/mongo/client/syncclusterconnection.cpp410
-rw-r--r--src/mongo/client/syncclusterconnection.h147
-rw-r--r--src/mongo/client/undef_macros.h61
-rw-r--r--src/mongo/db/background.h56
-rw-r--r--src/mongo/db/btree.cpp1980
-rw-r--r--src/mongo/db/btree.h1174
-rw-r--r--src/mongo/db/btreebuilder.cpp184
-rw-r--r--src/mongo/db/btreebuilder.h53
-rw-r--r--src/mongo/db/btreecursor.cpp457
-rw-r--r--src/mongo/db/cap.cpp457
-rw-r--r--src/mongo/db/client.cpp697
-rw-r--r--src/mongo/db/client.h286
-rw-r--r--src/mongo/db/client_common.h47
-rw-r--r--src/mongo/db/clientcursor.cpp747
-rw-r--r--src/mongo/db/clientcursor.h430
-rw-r--r--src/mongo/db/cloner.cpp763
-rw-r--r--src/mongo/db/cloner.h39
-rw-r--r--src/mongo/db/cmdline.cpp519
-rw-r--r--src/mongo/db/cmdline.h203
-rw-r--r--src/mongo/db/collection.h15
-rwxr-xr-xsrc/mongo/db/commands.cpp209
-rw-r--r--src/mongo/db/commands.h164
-rwxr-xr-xsrc/mongo/db/commands/aggregate.js184
-rw-r--r--src/mongo/db/commands/cloud.cpp90
-rw-r--r--src/mongo/db/commands/distinct.cpp157
-rwxr-xr-xsrc/mongo/db/commands/document_source_cursor.cpp100
-rw-r--r--src/mongo/db/commands/find_and_modify.cpp153
-rw-r--r--src/mongo/db/commands/group.cpp224
-rw-r--r--src/mongo/db/commands/isself.cpp246
-rw-r--r--src/mongo/db/commands/mr.cpp1317
-rw-r--r--src/mongo/db/commands/mr.h319
-rwxr-xr-xsrc/mongo/db/commands/pipeline.cpp405
-rwxr-xr-xsrc/mongo/db/commands/pipeline.h183
-rwxr-xr-xsrc/mongo/db/commands/pipeline_command.cpp187
-rw-r--r--src/mongo/db/common.cpp73
-rw-r--r--src/mongo/db/compact.cpp376
-rw-r--r--src/mongo/db/compact.h50
-rw-r--r--src/mongo/db/concurrency.h21
-rw-r--r--src/mongo/db/curop-inl.h1
-rw-r--r--src/mongo/db/curop.cpp173
-rw-r--r--src/mongo/db/curop.h313
-rw-r--r--src/mongo/db/cursor.cpp166
-rw-r--r--src/mongo/db/cursor.h246
-rwxr-xr-xsrc/mongo/db/d_concurrency.cpp231
-rw-r--r--src/mongo/db/d_concurrency.h67
-rw-r--r--src/mongo/db/d_globals.cpp20
-rw-r--r--src/mongo/db/d_globals.h27
-rw-r--r--src/mongo/db/database.cpp423
-rw-r--r--src/mongo/db/database.h145
-rw-r--r--src/mongo/db/databaseholder.h126
-rw-r--r--src/mongo/db/db.cpp1309
-rw-r--r--src/mongo/db/db.h120
-rwxr-xr-xsrc/mongo/db/db.rc12
-rwxr-xr-xsrc/mongo/db/db.vcxproj934
-rwxr-xr-xsrc/mongo/db/db.vcxproj.filters432
-rwxr-xr-xsrc/mongo/db/db_10.sln168
-rw-r--r--src/mongo/db/dbcommands.cpp1955
-rw-r--r--src/mongo/db/dbcommands_admin.cpp550
-rw-r--r--src/mongo/db/dbcommands_generic.cpp432
-rw-r--r--src/mongo/db/dbeval.cpp136
-rw-r--r--src/mongo/db/dbhelpers.cpp353
-rw-r--r--src/mongo/db/dbhelpers.h159
-rw-r--r--src/mongo/db/dbmessage.cpp108
-rw-r--r--src/mongo/db/dbmessage.h282
-rw-r--r--src/mongo/db/dbwebserver.cpp539
-rw-r--r--src/mongo/db/dbwebserver.h85
-rw-r--r--src/mongo/db/diskloc.h160
-rw-r--r--src/mongo/db/driverHelpers.cpp62
-rw-r--r--src/mongo/db/dur.cpp840
-rw-r--r--src/mongo/db/dur.h209
-rw-r--r--src/mongo/db/dur_commitjob.cpp240
-rw-r--r--src/mongo/db/dur_commitjob.h220
-rw-r--r--src/mongo/db/dur_journal.cpp748
-rw-r--r--src/mongo/db/dur_journal.h68
-rw-r--r--src/mongo/db/dur_journalformat.h174
-rw-r--r--src/mongo/db/dur_journalimpl.h103
-rw-r--r--src/mongo/db/dur_preplogbuffer.cpp177
-rw-r--r--src/mongo/db/dur_recover.cpp544
-rw-r--r--src/mongo/db/dur_recover.h50
-rw-r--r--src/mongo/db/dur_stats.h49
-rw-r--r--src/mongo/db/dur_writetodatafiles.cpp94
-rw-r--r--src/mongo/db/durop.cpp161
-rw-r--r--src/mongo/db/durop.h109
-rw-r--r--src/mongo/db/extsort.cpp245
-rw-r--r--src/mongo/db/extsort.h150
-rw-r--r--src/mongo/db/filever.h30
-rw-r--r--src/mongo/db/flushtest.cpp150
-rw-r--r--src/mongo/db/geo/2d.cpp3289
-rw-r--r--src/mongo/db/geo/core.h550
-rw-r--r--src/mongo/db/geo/haystack.cpp318
-rw-r--r--src/mongo/db/globals.h54
-rw-r--r--src/mongo/db/helpers/dblogger.h31
-rw-r--r--src/mongo/db/index.cpp446
-rw-r--r--src/mongo/db/index.h237
-rw-r--r--src/mongo/db/indexkey.cpp462
-rw-r--r--src/mongo/db/indexkey.h198
-rw-r--r--src/mongo/db/instance.cpp1148
-rw-r--r--src/mongo/db/instance.h174
-rw-r--r--src/mongo/db/introspect.cpp88
-rw-r--r--src/mongo/db/introspect.h34
-rw-r--r--src/mongo/db/javatest.cpp24
-rw-r--r--src/mongo/db/jsobj.cpp1268
-rw-r--r--src/mongo/db/jsobj.h47
-rw-r--r--src/mongo/db/jsobjmanipulator.h94
-rw-r--r--src/mongo/db/json.cpp651
-rw-r--r--src/mongo/db/json.h41
-rw-r--r--src/mongo/db/key.cpp678
-rw-r--r--src/mongo/db/key.h115
-rw-r--r--src/mongo/db/lasterror.cpp142
-rw-r--r--src/mongo/db/lasterror.h146
-rwxr-xr-xsrc/mongo/db/matcher.cpp1128
-rw-r--r--src/mongo/db/matcher.h276
-rw-r--r--src/mongo/db/matcher_covered.cpp101
-rw-r--r--src/mongo/db/minilex.h164
-rw-r--r--src/mongo/db/module.cpp68
-rw-r--r--src/mongo/db/module.h70
-rw-r--r--src/mongo/db/modules/mms.cpp170
-rwxr-xr-xsrc/mongo/db/mongo.icobin0 -> 51262 bytes
-rw-r--r--src/mongo/db/mongommf.cpp339
-rw-r--r--src/mongo/db/mongommf.h145
-rw-r--r--src/mongo/db/mongomutex.h388
-rw-r--r--src/mongo/db/namespace-inl.h132
-rw-r--r--src/mongo/db/namespace.cpp800
-rw-r--r--src/mongo/db/namespace.h629
-rw-r--r--src/mongo/db/namespacestring.h147
-rw-r--r--src/mongo/db/nonce.cpp95
-rw-r--r--src/mongo/db/nonce.h36
-rw-r--r--src/mongo/db/oplog.cpp872
-rw-r--r--src/mongo/db/oplog.h149
-rw-r--r--src/mongo/db/oplogreader.h121
-rw-r--r--src/mongo/db/ops/count.cpp103
-rw-r--r--src/mongo/db/ops/count.h30
-rw-r--r--src/mongo/db/ops/delete.cpp158
-rw-r--r--src/mongo/db/ops/delete.h33
-rw-r--r--src/mongo/db/ops/query.cpp870
-rw-r--r--src/mongo/db/ops/query.h248
-rw-r--r--src/mongo/db/ops/update.cpp1308
-rw-r--r--src/mongo/db/ops/update.h700
-rw-r--r--src/mongo/db/pagefault.cpp55
-rw-r--r--src/mongo/db/pagefault.h46
-rw-r--r--src/mongo/db/pcre.txt15
-rw-r--r--src/mongo/db/pdfile.cpp2425
-rw-r--r--src/mongo/db/pdfile.h546
-rwxr-xr-xsrc/mongo/db/pipeline/accumulator.cpp92
-rwxr-xr-xsrc/mongo/db/pipeline/accumulator.h259
-rwxr-xr-xsrc/mongo/db/pipeline/accumulator_add_to_set.cpp79
-rwxr-xr-xsrc/mongo/db/pipeline/accumulator_avg.cpp123
-rwxr-xr-xsrc/mongo/db/pipeline/accumulator_first.cpp49
-rwxr-xr-xsrc/mongo/db/pipeline/accumulator_last.cpp48
-rwxr-xr-xsrc/mongo/db/pipeline/accumulator_min_max.cpp67
-rwxr-xr-xsrc/mongo/db/pipeline/accumulator_push.cpp73
-rwxr-xr-xsrc/mongo/db/pipeline/accumulator_single_value.cpp32
-rwxr-xr-xsrc/mongo/db/pipeline/accumulator_sum.cpp74
-rwxr-xr-xsrc/mongo/db/pipeline/builder.cpp117
-rwxr-xr-xsrc/mongo/db/pipeline/builder.h95
-rwxr-xr-xsrc/mongo/db/pipeline/doc_mem_monitor.cpp68
-rwxr-xr-xsrc/mongo/db/pipeline/doc_mem_monitor.h94
-rwxr-xr-xsrc/mongo/db/pipeline/document.cpp219
-rwxr-xr-xsrc/mongo/db/pipeline/document.h246
-rwxr-xr-xsrc/mongo/db/pipeline/document_source.cpp52
-rwxr-xr-xsrc/mongo/db/pipeline/document_source.h985
-rwxr-xr-xsrc/mongo/db/pipeline/document_source_bson_array.cpp83
-rwxr-xr-xsrc/mongo/db/pipeline/document_source_command_futures.cpp132
-rwxr-xr-xsrc/mongo/db/pipeline/document_source_filter.cpp98
-rwxr-xr-xsrc/mongo/db/pipeline/document_source_filter_base.cpp85
-rwxr-xr-xsrc/mongo/db/pipeline/document_source_group.cpp391
-rw-r--r--src/mongo/db/pipeline/document_source_limit.cpp83
-rwxr-xr-xsrc/mongo/db/pipeline/document_source_match.cpp80
-rwxr-xr-xsrc/mongo/db/pipeline/document_source_out.cpp56
-rwxr-xr-xsrc/mongo/db/pipeline/document_source_project.cpp201
-rw-r--r--src/mongo/db/pipeline/document_source_skip.cpp99
-rwxr-xr-xsrc/mongo/db/pipeline/document_source_sort.cpp216
-rwxr-xr-xsrc/mongo/db/pipeline/document_source_unwind.cpp234
-rwxr-xr-xsrc/mongo/db/pipeline/expression.cpp2815
-rwxr-xr-xsrc/mongo/db/pipeline/expression.h1223
-rwxr-xr-xsrc/mongo/db/pipeline/expression_context.cpp35
-rwxr-xr-xsrc/mongo/db/pipeline/expression_context.h67
-rwxr-xr-xsrc/mongo/db/pipeline/field_path.cpp87
-rwxr-xr-xsrc/mongo/db/pipeline/field_path.h82
-rwxr-xr-xsrc/mongo/db/pipeline/value.cpp1034
-rwxr-xr-xsrc/mongo/db/pipeline/value.h468
-rw-r--r--src/mongo/db/projection.cpp301
-rw-r--r--src/mongo/db/projection.h129
-rw-r--r--src/mongo/db/queryoptimizer.cpp1337
-rw-r--r--src/mongo/db/queryoptimizer.h599
-rw-r--r--src/mongo/db/queryoptimizercursor.cpp530
-rw-r--r--src/mongo/db/queryoptimizercursor.h150
-rw-r--r--src/mongo/db/querypattern.cpp99
-rw-r--r--src/mongo/db/querypattern.h78
-rw-r--r--src/mongo/db/queryutil-inl.h153
-rw-r--r--src/mongo/db/queryutil.cpp1551
-rw-r--r--src/mongo/db/queryutil.h443
-rw-r--r--src/mongo/db/record.cpp267
-rw-r--r--src/mongo/db/repl.cpp1516
-rw-r--r--src/mongo/db/repl.h199
-rw-r--r--src/mongo/db/repl/connections.h128
-rw-r--r--src/mongo/db/repl/consensus.cpp449
-rw-r--r--src/mongo/db/repl/health.cpp449
-rw-r--r--src/mongo/db/repl/health.h50
-rw-r--r--src/mongo/db/repl/heartbeat.cpp382
-rw-r--r--src/mongo/db/repl/manager.cpp274
-rw-r--r--src/mongo/db/repl/multicmd.h75
-rw-r--r--src/mongo/db/repl/replset_commands.cpp404
-rw-r--r--src/mongo/db/repl/rs.cpp778
-rw-r--r--src/mongo/db/repl/rs.h667
-rw-r--r--src/mongo/db/repl/rs_config.cpp662
-rw-r--r--src/mongo/db/repl/rs_config.h251
-rw-r--r--src/mongo/db/repl/rs_exception.h17
-rw-r--r--src/mongo/db/repl/rs_initialsync.cpp271
-rw-r--r--src/mongo/db/repl/rs_initiate.cpp269
-rw-r--r--src/mongo/db/repl/rs_member.h131
-rw-r--r--src/mongo/db/repl/rs_optime.h58
-rw-r--r--src/mongo/db/repl/rs_rollback.cpp667
-rw-r--r--src/mongo/db/repl/rs_sync.cpp701
-rw-r--r--src/mongo/db/repl/test.html11
-rw-r--r--src/mongo/db/repl/testing.js42
-rw-r--r--src/mongo/db/repl_block.cpp256
-rw-r--r--src/mongo/db/repl_block.h39
-rw-r--r--src/mongo/db/replutil.h102
-rw-r--r--src/mongo/db/resource.h16
-rw-r--r--src/mongo/db/restapi.cpp294
-rw-r--r--src/mongo/db/restapi.h34
-rw-r--r--src/mongo/db/scanandorder.cpp105
-rw-r--r--src/mongo/db/scanandorder.h111
-rw-r--r--src/mongo/db/security.cpp106
-rwxr-xr-xsrc/mongo/db/security.h113
-rw-r--r--src/mongo/db/security_commands.cpp150
-rw-r--r--src/mongo/db/security_common.cpp148
-rw-r--r--src/mongo/db/security_common.h85
-rw-r--r--src/mongo/db/stats/counters.cpp207
-rw-r--r--src/mongo/db/stats/counters.h159
-rw-r--r--src/mongo/db/stats/fine_clock.h67
-rw-r--r--src/mongo/db/stats/service_stats.cpp68
-rw-r--r--src/mongo/db/stats/service_stats.h66
-rw-r--r--src/mongo/db/stats/snapshots.cpp227
-rw-r--r--src/mongo/db/stats/snapshots.h114
-rw-r--r--src/mongo/db/stats/top.cpp183
-rw-r--r--src/mongo/db/stats/top.h247
-rw-r--r--src/mongo/db/taskqueue.h106
-rw-r--r--src/mongo/db/tests.cpp68
-rw-r--r--src/mongo/dbtests/background_job_test.cpp109
-rw-r--r--src/mongo/dbtests/balancer_policy_tests.cpp203
-rw-r--r--src/mongo/dbtests/basictests.cpp695
-rw-r--r--src/mongo/dbtests/btreetests.cpp59
-rw-r--r--src/mongo/dbtests/btreetests.inl1713
-rw-r--r--src/mongo/dbtests/clienttests.cpp197
-rw-r--r--src/mongo/dbtests/commandtests.cpp98
-rw-r--r--src/mongo/dbtests/counttests.cpp142
-rw-r--r--src/mongo/dbtests/cursortests.cpp305
-rw-r--r--src/mongo/dbtests/d_chunk_manager_tests.cpp467
-rw-r--r--src/mongo/dbtests/dbtests.cpp29
-rw-r--r--src/mongo/dbtests/dbtests.h25
-rw-r--r--src/mongo/dbtests/directclienttests.cpp103
-rw-r--r--src/mongo/dbtests/framework.cpp446
-rw-r--r--src/mongo/dbtests/framework.h199
-rw-r--r--src/mongo/dbtests/histogram_test.cpp94
-rw-r--r--src/mongo/dbtests/jsobjtests.cpp2208
-rw-r--r--src/mongo/dbtests/jsontests.cpp1185
-rw-r--r--src/mongo/dbtests/jstests.cpp1052
-rw-r--r--src/mongo/dbtests/macrotests.cpp47
-rw-r--r--src/mongo/dbtests/matchertests.cpp163
-rw-r--r--src/mongo/dbtests/mmaptests.cpp219
-rw-r--r--src/mongo/dbtests/namespacetests.cpp1244
-rw-r--r--src/mongo/dbtests/pdfiletests.cpp407
-rw-r--r--src/mongo/dbtests/perf/btreeperf.cpp442
-rw-r--r--src/mongo/dbtests/perf/perftest.cpp761
-rw-r--r--src/mongo/dbtests/perftests.cpp1029
-rw-r--r--src/mongo/dbtests/queryoptimizercursortests.cpp2521
-rw-r--r--src/mongo/dbtests/queryoptimizertests.cpp1063
-rw-r--r--src/mongo/dbtests/querytests.cpp1408
-rw-r--r--src/mongo/dbtests/queryutiltests.cpp989
-rw-r--r--src/mongo/dbtests/replsettests.cpp227
-rw-r--r--src/mongo/dbtests/repltests.cpp1228
-rw-r--r--src/mongo/dbtests/sharding.cpp56
-rw-r--r--src/mongo/dbtests/socktests.cpp48
-rw-r--r--src/mongo/dbtests/spin_lock_test.cpp114
-rwxr-xr-xsrc/mongo/dbtests/test.sln26
-rw-r--r--src/mongo/dbtests/test.vcxproj776
-rwxr-xr-xsrc/mongo/dbtests/test.vcxproj.filters939
-rw-r--r--src/mongo/dbtests/threadedtests.cpp649
-rw-r--r--src/mongo/dbtests/updatetests.cpp877
-rw-r--r--src/mongo/pch.cpp41
-rw-r--r--src/mongo/pch.h184
-rw-r--r--src/mongo/s/balance.cpp348
-rw-r--r--src/mongo/s/balance.h105
-rw-r--r--src/mongo/s/balancer_policy.cpp192
-rw-r--r--src/mongo/s/balancer_policy.h98
-rw-r--r--src/mongo/s/chunk.cpp1104
-rw-r--r--src/mongo/s/chunk.h420
-rw-r--r--src/mongo/s/client.cpp326
-rw-r--r--src/mongo/s/client.h128
-rw-r--r--src/mongo/s/commands_admin.cpp1239
-rw-r--r--src/mongo/s/commands_public.cpp1565
-rw-r--r--src/mongo/s/config.cpp879
-rw-r--r--src/mongo/s/config.h268
-rw-r--r--src/mongo/s/config_migrate.cpp196
-rw-r--r--src/mongo/s/cursors.cpp316
-rw-r--r--src/mongo/s/cursors.h106
-rw-r--r--src/mongo/s/d_chunk_manager.cpp339
-rw-r--r--src/mongo/s/d_chunk_manager.h167
-rw-r--r--src/mongo/s/d_logic.cpp121
-rw-r--r--src/mongo/s/d_logic.h246
-rw-r--r--src/mongo/s/d_migrate.cpp1728
-rw-r--r--src/mongo/s/d_split.cpp830
-rw-r--r--src/mongo/s/d_state.cpp753
-rw-r--r--src/mongo/s/d_writeback.cpp179
-rw-r--r--src/mongo/s/d_writeback.h106
-rw-r--r--src/mongo/s/dbgrid.vcxproj691
-rwxr-xr-xsrc/mongo/s/dbgrid.vcxproj.filters614
-rw-r--r--src/mongo/s/default_version.cpp52
-rw-r--r--src/mongo/s/grid.cpp531
-rw-r--r--src/mongo/s/grid.h135
-rw-r--r--src/mongo/s/mr_shard.cpp316
-rw-r--r--src/mongo/s/mr_shard.h235
-rw-r--r--src/mongo/s/request.cpp164
-rw-r--r--src/mongo/s/request.h114
-rw-r--r--src/mongo/s/s_only.cpp111
-rw-r--r--src/mongo/s/security.cpp101
-rw-r--r--src/mongo/s/server.cpp429
-rw-r--r--src/mongo/s/server.h29
-rw-r--r--src/mongo/s/shard.cpp410
-rw-r--r--src/mongo/s/shard.h308
-rw-r--r--src/mongo/s/shard_version.cpp269
-rw-r--r--src/mongo/s/shard_version.h32
-rw-r--r--src/mongo/s/shardconnection.cpp248
-rw-r--r--src/mongo/s/shardkey.cpp273
-rw-r--r--src/mongo/s/shardkey.h124
-rw-r--r--src/mongo/s/stats.cpp28
-rw-r--r--src/mongo/s/stats.h30
-rw-r--r--src/mongo/s/strategy.cpp111
-rw-r--r--src/mongo/s/strategy.h59
-rw-r--r--src/mongo/s/strategy_shard.cpp414
-rw-r--r--src/mongo/s/strategy_single.cpp272
-rw-r--r--src/mongo/s/util.h183
-rw-r--r--src/mongo/s/writeback_listener.cpp285
-rw-r--r--src/mongo/s/writeback_listener.h89
-rw-r--r--src/mongo/scripting/bench.cpp785
-rw-r--r--src/mongo/scripting/engine.cpp519
-rw-r--r--src/mongo/scripting/engine.h235
-rw-r--r--src/mongo/scripting/engine_java.cpp764
-rw-r--r--src/mongo/scripting/engine_java.h223
-rw-r--r--src/mongo/scripting/engine_none.cpp24
-rw-r--r--src/mongo/scripting/engine_spidermonkey.cpp1766
-rw-r--r--src/mongo/scripting/engine_spidermonkey.h105
-rw-r--r--src/mongo/scripting/engine_v8.cpp1634
-rw-r--r--src/mongo/scripting/engine_v8.h254
-rw-r--r--src/mongo/scripting/sm_db.cpp1284
-rw-r--r--src/mongo/scripting/utils.cpp77
-rw-r--r--src/mongo/scripting/v8_db.cpp1128
-rw-r--r--src/mongo/scripting/v8_db.h94
-rw-r--r--src/mongo/scripting/v8_utils.cpp295
-rw-r--r--src/mongo/scripting/v8_utils.h43
-rw-r--r--src/mongo/scripting/v8_wrapper.cpp99
-rw-r--r--src/mongo/scripting/v8_wrapper.h34
-rw-r--r--src/mongo/server.h46
-rw-r--r--src/mongo/shell/collection.js893
-rw-r--r--src/mongo/shell/db.js881
-rw-r--r--src/mongo/shell/dbshell.cpp962
-rw-r--r--src/mongo/shell/mongo.js102
-rw-r--r--src/mongo/shell/mr.js95
-rw-r--r--src/mongo/shell/msvc/createCPPfromJavaScriptFiles.js105
-rwxr-xr-xsrc/mongo/shell/msvc/mongo.icobin0 -> 1078 bytes
-rw-r--r--src/mongo/shell/msvc/mongo.sln20
-rw-r--r--src/mongo/shell/msvc/mongo.vcxproj272
-rw-r--r--src/mongo/shell/msvc/mongo.vcxproj.filters285
-rw-r--r--src/mongo/shell/query.js344
-rwxr-xr-xsrc/mongo/shell/servers.js2618
-rw-r--r--src/mongo/shell/shell_utils.cpp985
-rw-r--r--src/mongo/shell/utils.h48
-rw-r--r--src/mongo/shell/utils.js1896
-rw-r--r--src/mongo/shell/utils_sh.js164
-rw-r--r--src/mongo/targetver.h20
-rw-r--r--src/mongo/tools/bridge.cpp166
-rw-r--r--src/mongo/tools/bsondump.cpp140
-rw-r--r--src/mongo/tools/dump.cpp527
-rw-r--r--src/mongo/tools/export.cpp248
-rw-r--r--src/mongo/tools/files.cpp164
-rw-r--r--src/mongo/tools/import.cpp463
-rw-r--r--src/mongo/tools/oplog.cpp108
-rw-r--r--src/mongo/tools/restore.cpp583
-rw-r--r--src/mongo/tools/sniffer.cpp566
-rw-r--r--src/mongo/tools/stat.cpp544
-rw-r--r--src/mongo/tools/stat_util.cpp269
-rw-r--r--src/mongo/tools/stat_util.h78
-rw-r--r--src/mongo/tools/tool.cpp526
-rw-r--r--src/mongo/tools/tool.h160
-rw-r--r--src/mongo/tools/top.cpp200
-rw-r--r--src/mongo/util/admin_access.h52
-rw-r--r--src/mongo/util/alignedbuilder.cpp141
-rw-r--r--src/mongo/util/alignedbuilder.h125
-rw-r--r--src/mongo/util/allocator.h39
-rw-r--r--src/mongo/util/array.h127
-rw-r--r--src/mongo/util/assert_util.cpp213
-rw-r--r--src/mongo/util/assert_util.h275
-rw-r--r--src/mongo/util/background.cpp190
-rw-r--r--src/mongo/util/background.h155
-rw-r--r--src/mongo/util/base64.cpp109
-rw-r--r--src/mongo/util/base64.h68
-rw-r--r--src/mongo/util/bson_util.h42
-rw-r--r--src/mongo/util/bufreader.h100
-rw-r--r--src/mongo/util/checksum.h37
-rw-r--r--src/mongo/util/compress.cpp31
-rw-r--r--src/mongo/util/compress.h21
-rw-r--r--src/mongo/util/concurrency/README39
-rw-r--r--src/mongo/util/concurrency/list.h99
-rw-r--r--src/mongo/util/concurrency/msg.h61
-rw-r--r--src/mongo/util/concurrency/mutex.h228
-rw-r--r--src/mongo/util/concurrency/mutexdebugger.h117
-rw-r--r--src/mongo/util/concurrency/mvar.h118
-rw-r--r--src/mongo/util/concurrency/race.h77
-rw-r--r--src/mongo/util/concurrency/rwlock.h271
-rw-r--r--src/mongo/util/concurrency/rwlockimpl.h170
-rw-r--r--src/mongo/util/concurrency/shared_mutex_win.hpp594
-rw-r--r--src/mongo/util/concurrency/spin_lock.cpp107
-rw-r--r--src/mongo/util/concurrency/spin_lock.h77
-rw-r--r--src/mongo/util/concurrency/synchronization.cpp81
-rw-r--r--src/mongo/util/concurrency/synchronization.h86
-rw-r--r--src/mongo/util/concurrency/task.cpp181
-rw-r--r--src/mongo/util/concurrency/task.h72
-rw-r--r--src/mongo/util/concurrency/thread_pool.cpp141
-rw-r--r--src/mongo/util/concurrency/thread_pool.h82
-rw-r--r--src/mongo/util/concurrency/threadlocal.h126
-rw-r--r--src/mongo/util/concurrency/value.h139
-rw-r--r--src/mongo/util/concurrency/vars.cpp56
-rw-r--r--src/mongo/util/debug_util.cpp60
-rw-r--r--src/mongo/util/debug_util.h106
-rw-r--r--src/mongo/util/embedded_builder.h92
-rw-r--r--src/mongo/util/file.h230
-rw-r--r--src/mongo/util/file_allocator.cpp329
-rw-r--r--src/mongo/util/file_allocator.h91
-rw-r--r--src/mongo/util/goodies.h475
-rw-r--r--src/mongo/util/hashtab.h179
-rw-r--r--src/mongo/util/heapcheck.h33
-rw-r--r--src/mongo/util/hex.h67
-rw-r--r--src/mongo/util/histogram.cpp131
-rw-r--r--src/mongo/util/histogram.h128
-rwxr-xr-xsrc/mongo/util/intrusive_counter.cpp30
-rwxr-xr-xsrc/mongo/util/intrusive_counter.h79
-rw-r--r--src/mongo/util/log.cpp197
-rw-r--r--src/mongo/util/log.h581
-rw-r--r--src/mongo/util/logfile.cpp253
-rw-r--r--src/mongo/util/logfile.h58
-rw-r--r--src/mongo/util/lruishmap.h78
-rw-r--r--src/mongo/util/md5.c381
-rw-r--r--src/mongo/util/md5.h91
-rw-r--r--src/mongo/util/md5.hpp58
-rw-r--r--src/mongo/util/md5main.cpp142
-rwxr-xr-xsrc/mongo/util/mmap.cpp211
-rw-r--r--src/mongo/util/mmap.h305
-rw-r--r--src/mongo/util/mmap_mm.cpp52
-rw-r--r--src/mongo/util/mmap_posix.cpp214
-rw-r--r--src/mongo/util/mmap_win.cpp202
-rwxr-xr-xsrc/mongo/util/mongoutils/README15
-rw-r--r--src/mongo/util/mongoutils/checksum.h32
-rw-r--r--src/mongo/util/mongoutils/hash.h41
-rw-r--r--src/mongo/util/mongoutils/html.h158
-rwxr-xr-xsrc/mongo/util/mongoutils/mongoutils.vcxproj75
-rwxr-xr-xsrc/mongo/util/mongoutils/mongoutils.vcxproj.filters10
-rw-r--r--src/mongo/util/mongoutils/str.h216
-rw-r--r--src/mongo/util/mongoutils/test.cpp45
-rw-r--r--src/mongo/util/moveablebuffer.h51
-rw-r--r--src/mongo/util/net/hostandport.h239
-rw-r--r--src/mongo/util/net/httpclient.cpp177
-rw-r--r--src/mongo/util/net/httpclient.h78
-rw-r--r--src/mongo/util/net/listen.cpp394
-rw-r--r--src/mongo/util/net/listen.h190
-rw-r--r--src/mongo/util/net/message.cpp64
-rw-r--r--src/mongo/util/net/message.h312
-rw-r--r--src/mongo/util/net/message_port.cpp303
-rw-r--r--src/mongo/util/net/message_port.h108
-rw-r--r--src/mongo/util/net/message_server.h66
-rw-r--r--src/mongo/util/net/message_server_asio.cpp261
-rw-r--r--src/mongo/util/net/message_server_port.cpp204
-rw-r--r--src/mongo/util/net/miniwebserver.cpp212
-rw-r--r--src/mongo/util/net/miniwebserver.h60
-rw-r--r--src/mongo/util/net/sock.cpp763
-rw-r--r--src/mongo/util/net/sock.h261
-rw-r--r--src/mongo/util/ntservice.cpp408
-rw-r--r--src/mongo/util/ntservice.h49
-rw-r--r--src/mongo/util/optime.h170
-rw-r--r--src/mongo/util/password.cpp91
-rw-r--r--src/mongo/util/password.h61
-rw-r--r--src/mongo/util/paths.h124
-rw-r--r--src/mongo/util/processinfo.cpp48
-rw-r--r--src/mongo/util/processinfo.h67
-rw-r--r--src/mongo/util/processinfo_darwin.cpp116
-rw-r--r--src/mongo/util/processinfo_linux2.cpp244
-rw-r--r--src/mongo/util/processinfo_none.cpp55
-rw-r--r--src/mongo/util/processinfo_win32.cpp102
-rw-r--r--src/mongo/util/queue.h106
-rw-r--r--src/mongo/util/ramlog.cpp190
-rw-r--r--src/mongo/util/ramlog.h65
-rw-r--r--src/mongo/util/scopeguard.h427
-rw-r--r--src/mongo/util/signal_handlers.cpp122
-rw-r--r--src/mongo/util/signal_handlers.h34
-rwxr-xr-xsrc/mongo/util/string_writer.h28
-rw-r--r--src/mongo/util/stringutils.cpp44
-rw-r--r--src/mongo/util/stringutils.h139
-rwxr-xr-xsrc/mongo/util/systeminfo.h41
-rwxr-xr-xsrc/mongo/util/systeminfo_linux2.cpp47
-rwxr-xr-xsrc/mongo/util/systeminfo_none.cpp26
-rwxr-xr-xsrc/mongo/util/systeminfo_win32.cpp48
-rw-r--r--src/mongo/util/text.cpp115
-rw-r--r--src/mongo/util/text.h148
-rw-r--r--src/mongo/util/time_support.h255
-rw-r--r--src/mongo/util/timer.h115
-rw-r--r--src/mongo/util/unittest.h62
-rw-r--r--src/mongo/util/util.cpp220
-rw-r--r--src/mongo/util/version.cpp288
-rw-r--r--src/mongo/util/version.h27
-rw-r--r--src/mongo/util/winutil.h44
571 files changed, 178514 insertions, 0 deletions
diff --git a/src/mongo/bson/README b/src/mongo/bson/README
new file mode 100644
index 00000000000..01ed654bcd2
--- /dev/null
+++ b/src/mongo/bson/README
@@ -0,0 +1,7 @@
+"BSON" stands for "binary JSON" - a binary storage format that is JSON inspired
+(and adds a couple extra types such as Date).
+
+This is the C++ implementation. Implementations which translate BSON<->JSON
+are available for most languages at bsonspec.org.
+
+
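
For orientation, here is a minimal sketch of the API this commit adds; it assumes an
include path reaching src/mongo/bson/bson.h and mirrors the usage shown in
bsondemo.cpp further below:

    #include "src/mongo/bson/bson.h"   // adjust the include path to your tree
    #include <iostream>

    int main() {
        // build { name: "joe", age: 33.7 } with BSONObjBuilder
        mongo::BSONObjBuilder b;
        b.append("name", "joe");
        b.append("age", 33.7);
        mongo::BSONObj o = b.obj();

        // read the fields back; toString() renders a JSON-like form
        std::cout << o["name"] << ' ' << o["age"].Number() << '\n';
        std::cout << o.toString() << std::endl;
        return 0;
    }
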
diff --git a/src/mongo/bson/bson-inl.h b/src/mongo/bson/bson-inl.h
new file mode 100644
index 00000000000..9e8b3654802
--- /dev/null
+++ b/src/mongo/bson/bson-inl.h
@@ -0,0 +1,1007 @@
+/** @file bson-inl.h
+ a goal here is that the most common bson methods can be used inline-only, a la boost.
+ thus some things are inlined here that wouldn't necessarily be otherwise.
+*/
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <map>
+#include <limits>
+
+#if defined(_WIN32)
+#undef max
+#undef min
+#endif
+
+namespace mongo {
+
+ inline bool isNaN(double d) {
+ return d != d;
+ }
+
+ inline bool isInf(double d, int* sign = 0) {
+ volatile double tmp = d;
+
+ if ((tmp == d) && ((tmp - d) != 0.0)) {
+ if ( sign ) {
+ *sign = (d < 0.0 ? -1 : 1);
+ }
+ return true;
+ }
+
+ if ( sign ) {
+ *sign = 0;
+ }
+
+ return false;
+ }
+
+    /* the two sides must be of the same type when called, unless both are numbers.
+       this large function lives in the header to facilitate inline-only use of bson.
+    */
+ inline int compareElementValues(const BSONElement& l, const BSONElement& r) {
+ int f;
+
+ switch ( l.type() ) {
+ case EOO:
+ case Undefined: // EOO and Undefined are same canonicalType
+ case jstNULL:
+ case MaxKey:
+ case MinKey:
+ f = l.canonicalType() - r.canonicalType();
+ if ( f<0 ) return -1;
+ return f==0 ? 0 : 1;
+ case Bool:
+ return *l.value() - *r.value();
+ case Timestamp:
+ // unsigned compare for timestamps - note they are not really dates but (ordinal + time_t)
+ if ( l.date() < r.date() )
+ return -1;
+ return l.date() == r.date() ? 0 : 1;
+ case Date:
+ {
+ long long a = (long long) l.Date().millis;
+ long long b = (long long) r.Date().millis;
+ if( a < b )
+ return -1;
+ return a == b ? 0 : 1;
+ }
+ case NumberLong:
+ if( r.type() == NumberLong ) {
+ long long L = l._numberLong();
+ long long R = r._numberLong();
+ if( L < R ) return -1;
+ if( L == R ) return 0;
+ return 1;
+ }
+ goto dodouble;
+ case NumberInt:
+ if( r.type() == NumberInt ) {
+ int L = l._numberInt();
+ int R = r._numberInt();
+ if( L < R ) return -1;
+ return L == R ? 0 : 1;
+ }
+ // else fall through
+ case NumberDouble:
+dodouble:
+ {
+ double left = l.number();
+ double right = r.number();
+ if( left < right )
+ return -1;
+ if( left == right )
+ return 0;
+ if( isNaN(left) )
+ return isNaN(right) ? 0 : -1;
+ return 1;
+ }
+ case jstOID:
+ return memcmp(l.value(), r.value(), 12);
+ case Code:
+ case Symbol:
+ case String:
+ /* todo: a utf sort order version one day... */
+ {
+ // we use memcmp as we allow zeros in UTF8 strings
+ int lsz = l.valuestrsize();
+ int rsz = r.valuestrsize();
+ int common = min(lsz, rsz);
+ int res = memcmp(l.valuestr(), r.valuestr(), common);
+ if( res )
+ return res;
+ // longer string is the greater one
+ return lsz-rsz;
+ }
+ case Object:
+ case Array:
+ return l.embeddedObject().woCompare( r.embeddedObject() );
+ case DBRef: {
+ int lsz = l.valuesize();
+ int rsz = r.valuesize();
+ if ( lsz - rsz != 0 ) return lsz - rsz;
+ return memcmp(l.value(), r.value(), lsz);
+ }
+ case BinData: {
+ int lsz = l.objsize(); // our bin data size in bytes, not including the subtype byte
+ int rsz = r.objsize();
+ if ( lsz - rsz != 0 ) return lsz - rsz;
+ return memcmp(l.value()+4, r.value()+4, lsz+1 /*+1 for subtype byte*/);
+ }
+ case RegEx: {
+ int c = strcmp(l.regex(), r.regex());
+ if ( c )
+ return c;
+ return strcmp(l.regexFlags(), r.regexFlags());
+ }
+ case CodeWScope : {
+ f = l.canonicalType() - r.canonicalType();
+ if ( f )
+ return f;
+ f = strcmp( l.codeWScopeCode() , r.codeWScopeCode() );
+ if ( f )
+ return f;
+ f = strcmp( l.codeWScopeScopeData() , r.codeWScopeScopeData() );
+ if ( f )
+ return f;
+ return 0;
+ }
+ default:
+ assert( false);
+ }
+ return -1;
+ }
+
+ /* wo = "well ordered"
+ note: (mongodb related) : this can only change in behavior when index version # changes
+ */
+ inline int BSONElement::woCompare( const BSONElement &e,
+ bool considerFieldName ) const {
+ int lt = (int) canonicalType();
+ int rt = (int) e.canonicalType();
+ int x = lt - rt;
+ if( x != 0 && (!isNumber() || !e.isNumber()) )
+ return x;
+ if ( considerFieldName ) {
+ x = strcmp(fieldName(), e.fieldName());
+ if ( x != 0 )
+ return x;
+ }
+ x = compareElementValues(*this, e);
+ return x;
+ }
+
+ inline BSONObjIterator BSONObj::begin() const {
+ return BSONObjIterator(*this);
+ }
+
+ inline BSONObj BSONElement::embeddedObjectUserCheck() const {
+ if ( MONGO_likely(isABSONObj()) )
+ return BSONObj(value());
+ stringstream ss;
+ ss << "invalid parameter: expected an object (" << fieldName() << ")";
+ uasserted( 10065 , ss.str() );
+ return BSONObj(); // never reachable
+ }
+
+ inline BSONObj BSONElement::embeddedObject() const {
+ assert( isABSONObj() );
+ return BSONObj(value());
+ }
+
+ inline BSONObj BSONElement::codeWScopeObject() const {
+ assert( type() == CodeWScope );
+ int strSizeWNull = *(int *)( value() + 4 );
+ return BSONObj( value() + 4 + 4 + strSizeWNull );
+ }
+
+ // deep (full) equality
+ inline bool BSONObj::equal(const BSONObj &rhs) const {
+ BSONObjIterator i(*this);
+ BSONObjIterator j(rhs);
+ BSONElement l,r;
+ do {
+ // so far, equal...
+ l = i.next();
+ r = j.next();
+ if ( l.eoo() )
+ return r.eoo();
+ } while( l == r );
+ return false;
+ }
+
+ inline NOINLINE_DECL void BSONObj::_assertInvalid() const {
+ StringBuilder ss;
+ int os = objsize();
+ ss << "Invalid BSONObj size: " << os << " (0x" << toHex( &os, 4 ) << ')';
+ try {
+ BSONElement e = firstElement();
+ ss << " first element: " << e.toString();
+ }
+ catch ( ... ) { }
+ massert( 10334 , ss.str() , 0 );
+ }
+
+    /* the idea with NOINLINE_DECL here is to keep this from being inlined into the
+       getOwned() method, on the presumption that doing so is better.
+    */
+ inline NOINLINE_DECL BSONObj BSONObj::copy() const {
+ Holder *h = (Holder*) malloc(objsize() + sizeof(unsigned));
+ h->zero();
+ memcpy(h->data, objdata(), objsize());
+ return BSONObj(h);
+ }
+
+ inline BSONObj BSONObj::getOwned() const {
+ if ( isOwned() )
+ return *this;
+ return copy();
+ }
+
+ // wrap this element up as a singleton object.
+ inline BSONObj BSONElement::wrap() const {
+ BSONObjBuilder b(size()+6);
+ b.append(*this);
+ return b.obj();
+ }
+
+ inline BSONObj BSONElement::wrap( const char * newName ) const {
+ BSONObjBuilder b(size()+6+(int)strlen(newName));
+ b.appendAs(*this,newName);
+ return b.obj();
+ }
+
+ inline void BSONObj::getFields(unsigned n, const char **fieldNames, BSONElement *fields) const {
+ BSONObjIterator i(*this);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ const char *p = e.fieldName();
+            for( unsigned j = 0; j < n; j++ ) { // 'j' avoids shadowing the object iterator 'i' above
+                if( strcmp(p, fieldNames[j]) == 0 ) {
+                    fields[j] = e;
+ break;
+ }
+ }
+ }
+ }
+
+ inline BSONElement BSONObj::getField(const StringData& name) const {
+ BSONObjIterator i(*this);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( strcmp(e.fieldName(), name.data()) == 0 )
+ return e;
+ }
+ return BSONElement();
+ }
+
+ inline int BSONObj::getIntField(const char *name) const {
+ BSONElement e = getField(name);
+ return e.isNumber() ? (int) e.number() : std::numeric_limits< int >::min();
+ }
+
+ inline bool BSONObj::getBoolField(const char *name) const {
+ BSONElement e = getField(name);
+ return e.type() == Bool ? e.boolean() : false;
+ }
+
+ inline const char * BSONObj::getStringField(const char *name) const {
+ BSONElement e = getField(name);
+ return e.type() == String ? e.valuestr() : "";
+ }
+
+ /* add all the fields from the object specified to this object */
+ inline BSONObjBuilder& BSONObjBuilder::appendElements(BSONObj x) {
+ BSONObjIterator it(x);
+ while ( it.moreWithEOO() ) {
+ BSONElement e = it.next();
+ if ( e.eoo() ) break;
+ append(e);
+ }
+ return *this;
+ }
+
+ /* add all the fields from the object specified to this object if they don't exist */
+ inline BSONObjBuilder& BSONObjBuilder::appendElementsUnique(BSONObj x) {
+ set<string> have;
+ {
+ BSONObjIterator i = iterator();
+ while ( i.more() )
+ have.insert( i.next().fieldName() );
+ }
+
+ BSONObjIterator it(x);
+ while ( it.more() ) {
+ BSONElement e = it.next();
+ if ( have.count( e.fieldName() ) )
+ continue;
+ append(e);
+ }
+ return *this;
+ }
+
+
+ inline bool BSONObj::isValid() const {
+ int x = objsize();
+ return x > 0 && x <= BSONObjMaxInternalSize;
+ }
+
+ inline bool BSONObj::getObjectID(BSONElement& e) const {
+ BSONElement f = getField("_id");
+ if( !f.eoo() ) {
+ e = f;
+ return true;
+ }
+ return false;
+ }
+
+ inline BSONObjBuilderValueStream::BSONObjBuilderValueStream( BSONObjBuilder * builder ) {
+ _fieldName = 0;
+ _builder = builder;
+ }
+
+ template<class T>
+ inline BSONObjBuilder& BSONObjBuilderValueStream::operator<<( T value ) {
+ _builder->append(_fieldName, value);
+ _fieldName = 0;
+ return *_builder;
+ }
+
+ inline BSONObjBuilder& BSONObjBuilderValueStream::operator<<( const BSONElement& e ) {
+ _builder->appendAs( e , _fieldName );
+ _fieldName = 0;
+ return *_builder;
+ }
+
+ inline Labeler BSONObjBuilderValueStream::operator<<( const Labeler::Label &l ) {
+ return Labeler( l, this );
+ }
+
+ inline void BSONObjBuilderValueStream::endField( const char *nextFieldName ) {
+ if ( _fieldName && haveSubobj() ) {
+ _builder->append( _fieldName, subobj()->done() );
+ }
+ _subobj.reset();
+ _fieldName = nextFieldName;
+ }
+
+ inline BSONObjBuilder *BSONObjBuilderValueStream::subobj() {
+ if ( !haveSubobj() )
+ _subobj.reset( new BSONObjBuilder() );
+ return _subobj.get();
+ }
+
+ template<class T> inline
+ BSONObjBuilder& Labeler::operator<<( T value ) {
+ s_->subobj()->append( l_.l_, value );
+ return *s_->_builder;
+ }
+
+ inline
+ BSONObjBuilder& Labeler::operator<<( const BSONElement& e ) {
+ s_->subobj()->appendAs( e, l_.l_ );
+ return *s_->_builder;
+ }
+
+ // {a: {b:1}} -> {a.b:1}
+ void nested2dotted(BSONObjBuilder& b, const BSONObj& obj, const string& base="");
+ inline BSONObj nested2dotted(const BSONObj& obj) {
+ BSONObjBuilder b;
+ nested2dotted(b, obj);
+ return b.obj();
+ }
+
+ // {a.b:1} -> {a: {b:1}}
+ void dotted2nested(BSONObjBuilder& b, const BSONObj& obj);
+ inline BSONObj dotted2nested(const BSONObj& obj) {
+ BSONObjBuilder b;
+ dotted2nested(b, obj);
+ return b.obj();
+ }
+
+ inline BSONObjIterator BSONObjBuilder::iterator() const {
+ const char * s = _b.buf() + _offset;
+ const char * e = _b.buf() + _b.len();
+ return BSONObjIterator( s , e );
+ }
+
+ inline bool BSONObjBuilder::hasField( const StringData& name ) const {
+ BSONObjIterator i = iterator();
+ while ( i.more() )
+ if ( strcmp( name.data() , i.next().fieldName() ) == 0 )
+ return true;
+ return false;
+ }
+
+ /* WARNING: nested/dotted conversions are not 100% reversible
+ * nested2dotted(dotted2nested({a.b: {c:1}})) -> {a.b.c: 1}
+ * also, dotted2nested ignores order
+ */
+
+ typedef map<string, BSONElement> BSONMap;
+ inline BSONMap bson2map(const BSONObj& obj) {
+ BSONMap m;
+ BSONObjIterator it(obj);
+ while (it.more()) {
+ BSONElement e = it.next();
+ m[e.fieldName()] = e;
+ }
+ return m;
+ }
+
+ struct BSONElementFieldNameCmp {
+ bool operator()( const BSONElement &l, const BSONElement &r ) const {
+            return strcmp( l.fieldName() , r.fieldName() ) < 0; // strictly less: std::set requires a strict weak ordering
+ }
+ };
+
+ typedef set<BSONElement, BSONElementFieldNameCmp> BSONSortedElements;
+ inline BSONSortedElements bson2set( const BSONObj& obj ) {
+ BSONSortedElements s;
+ BSONObjIterator it(obj);
+ while ( it.more() )
+ s.insert( it.next() );
+ return s;
+ }
+
+ inline string BSONObj::toString( bool isArray, bool full ) const {
+ if ( isEmpty() ) return "{}";
+ StringBuilder s;
+ toString(s, isArray, full);
+ return s.str();
+ }
+ inline void BSONObj::toString(StringBuilder& s, bool isArray, bool full ) const {
+ if ( isEmpty() ) {
+ s << "{}";
+ return;
+ }
+
+ s << ( isArray ? "[ " : "{ " );
+ BSONObjIterator i(*this);
+ bool first = true;
+ while ( 1 ) {
+ massert( 10327 , "Object does not end with EOO", i.moreWithEOO() );
+ BSONElement e = i.next( true );
+ massert( 10328 , "Invalid element size", e.size() > 0 );
+ massert( 10329 , "Element too large", e.size() < ( 1 << 30 ) );
+ int offset = (int) (e.rawdata() - this->objdata());
+ massert( 10330 , "Element extends past end of object",
+ e.size() + offset <= this->objsize() );
+ e.validate();
+ bool end = ( e.size() + offset == this->objsize() );
+ if ( e.eoo() ) {
+ massert( 10331 , "EOO Before end of object", end );
+ break;
+ }
+ if ( first )
+ first = false;
+ else
+ s << ", ";
+ e.toString(s, !isArray, full );
+ }
+ s << ( isArray ? " ]" : " }" );
+ }
+
+ inline void BSONElement::validate() const {
+ const BSONType t = type();
+
+ switch( t ) {
+ case DBRef:
+ case Code:
+ case Symbol:
+ case mongo::String: {
+ unsigned x = (unsigned) valuestrsize();
+ bool lenOk = x > 0 && x < (unsigned) BSONObjMaxInternalSize;
+ if( lenOk && valuestr()[x-1] == 0 )
+ return;
+ StringBuilder buf;
+ buf << "Invalid dbref/code/string/symbol size: " << x;
+ if( lenOk )
+ buf << " strnlen:" << mongo::strnlen( valuestr() , x );
+ msgasserted( 10321 , buf.str() );
+ break;
+ }
+ case CodeWScope: {
+ int totalSize = *( int * )( value() );
+ massert( 10322 , "Invalid CodeWScope size", totalSize >= 8 );
+ int strSizeWNull = *( int * )( value() + 4 );
+ massert( 10323 , "Invalid CodeWScope string size", totalSize >= strSizeWNull + 4 + 4 );
+ massert( 10324 , "Invalid CodeWScope string size",
+ strSizeWNull > 0 &&
+ (strSizeWNull - 1) == mongo::strnlen( codeWScopeCode(), strSizeWNull ) );
+ massert( 10325 , "Invalid CodeWScope size", totalSize >= strSizeWNull + 4 + 4 + 4 );
+ int objSize = *( int * )( value() + 4 + 4 + strSizeWNull );
+ massert( 10326 , "Invalid CodeWScope object size", totalSize == 4 + 4 + strSizeWNull + objSize );
+ // Subobject validation handled elsewhere.
+ }
+ case Object:
+ // We expect Object size validation to be handled elsewhere.
+ default:
+ break;
+ }
+ }
+
+ inline int BSONElement::size( int maxLen ) const {
+ if ( totalSize >= 0 )
+ return totalSize;
+
+ int remain = maxLen - fieldNameSize() - 1;
+
+ int x = 0;
+ switch ( type() ) {
+ case EOO:
+ case Undefined:
+ case jstNULL:
+ case MaxKey:
+ case MinKey:
+ break;
+ case mongo::Bool:
+ x = 1;
+ break;
+ case NumberInt:
+ x = 4;
+ break;
+ case Timestamp:
+ case mongo::Date:
+ case NumberDouble:
+ case NumberLong:
+ x = 8;
+ break;
+ case jstOID:
+ x = 12;
+ break;
+ case Symbol:
+ case Code:
+ case mongo::String:
+ massert( 10313 , "Insufficient bytes to calculate element size", maxLen == -1 || remain > 3 );
+ x = valuestrsize() + 4;
+ break;
+ case CodeWScope:
+ massert( 10314 , "Insufficient bytes to calculate element size", maxLen == -1 || remain > 3 );
+ x = objsize();
+ break;
+
+ case DBRef:
+ massert( 10315 , "Insufficient bytes to calculate element size", maxLen == -1 || remain > 3 );
+ x = valuestrsize() + 4 + 12;
+ break;
+ case Object:
+ case mongo::Array:
+ massert( 10316 , "Insufficient bytes to calculate element size", maxLen == -1 || remain > 3 );
+ x = objsize();
+ break;
+ case BinData:
+ massert( 10317 , "Insufficient bytes to calculate element size", maxLen == -1 || remain > 3 );
+ x = valuestrsize() + 4 + 1/*subtype*/;
+ break;
+ case RegEx: {
+ const char *p = value();
+ size_t len1 = ( maxLen == -1 ) ? strlen( p ) : (size_t)mongo::strnlen( p, remain );
+ //massert( 10318 , "Invalid regex string", len1 != -1 ); // ERH - 4/28/10 - don't think this does anything
+ p = p + len1 + 1;
+ size_t len2;
+ if( maxLen == -1 )
+ len2 = strlen( p );
+ else {
+ size_t x = remain - len1 - 1;
+ assert( x <= 0x7fffffff );
+ len2 = mongo::strnlen( p, (int) x );
+ }
+ //massert( 10319 , "Invalid regex options string", len2 != -1 ); // ERH - 4/28/10 - don't think this does anything
+ x = (int) (len1 + 1 + len2 + 1);
+ }
+ break;
+ default: {
+ StringBuilder ss;
+ ss << "BSONElement: bad type " << (int) type();
+ string msg = ss.str();
+ massert( 13655 , msg.c_str(),false);
+ }
+ }
+ totalSize = x + fieldNameSize() + 1; // BSONType
+
+ return totalSize;
+ }
+
+ inline int BSONElement::size() const {
+ if ( totalSize >= 0 )
+ return totalSize;
+
+ int x = 0;
+ switch ( type() ) {
+ case EOO:
+ case Undefined:
+ case jstNULL:
+ case MaxKey:
+ case MinKey:
+ break;
+ case mongo::Bool:
+ x = 1;
+ break;
+ case NumberInt:
+ x = 4;
+ break;
+ case Timestamp:
+ case mongo::Date:
+ case NumberDouble:
+ case NumberLong:
+ x = 8;
+ break;
+ case jstOID:
+ x = 12;
+ break;
+ case Symbol:
+ case Code:
+ case mongo::String:
+ x = valuestrsize() + 4;
+ break;
+ case DBRef:
+ x = valuestrsize() + 4 + 12;
+ break;
+ case CodeWScope:
+ case Object:
+ case mongo::Array:
+ x = objsize();
+ break;
+ case BinData:
+ x = valuestrsize() + 4 + 1/*subtype*/;
+ break;
+ case RegEx:
+ {
+ const char *p = value();
+ size_t len1 = strlen(p);
+ p = p + len1 + 1;
+ size_t len2;
+ len2 = strlen( p );
+ x = (int) (len1 + 1 + len2 + 1);
+ }
+ break;
+ default:
+ {
+ StringBuilder ss;
+ ss << "BSONElement: bad type " << (int) type();
+ string msg = ss.str();
+ massert(10320 , msg.c_str(),false);
+ }
+ }
+ totalSize = x + fieldNameSize() + 1; // BSONType
+
+ return totalSize;
+ }
+
+ inline string BSONElement::toString( bool includeFieldName, bool full ) const {
+ StringBuilder s;
+ toString(s, includeFieldName, full);
+ return s.str();
+ }
+ inline void BSONElement::toString(StringBuilder& s, bool includeFieldName, bool full ) const {
+ if ( includeFieldName && type() != EOO )
+ s << fieldName() << ": ";
+ switch ( type() ) {
+ case EOO:
+ s << "EOO";
+ break;
+ case mongo::Date:
+ s << "new Date(" << (long long) date() << ')';
+ break;
+ case RegEx: {
+ s << "/" << regex() << '/';
+ const char *p = regexFlags();
+ if ( p ) s << p;
+ }
+ break;
+ case NumberDouble:
+ s.appendDoubleNice( number() );
+ break;
+ case NumberLong:
+ s << _numberLong();
+ break;
+ case NumberInt:
+ s << _numberInt();
+ break;
+ case mongo::Bool:
+ s << ( boolean() ? "true" : "false" );
+ break;
+ case Object:
+ embeddedObject().toString(s, false, full);
+ break;
+ case mongo::Array:
+ embeddedObject().toString(s, true, full);
+ break;
+ case Undefined:
+ s << "undefined";
+ break;
+ case jstNULL:
+ s << "null";
+ break;
+ case MaxKey:
+ s << "MaxKey";
+ break;
+ case MinKey:
+ s << "MinKey";
+ break;
+ case CodeWScope:
+ s << "CodeWScope( "
+ << codeWScopeCode() << ", " << codeWScopeObject().toString(false, full) << ")";
+ break;
+ case Code:
+ if ( !full && valuestrsize() > 80 ) {
+ s.write(valuestr(), 70);
+ s << "...";
+ }
+ else {
+ s.write(valuestr(), valuestrsize()-1);
+ }
+ break;
+ case Symbol:
+ case mongo::String:
+ s << '"';
+ if ( !full && valuestrsize() > 160 ) {
+ s.write(valuestr(), 150);
+ s << "...\"";
+ }
+ else {
+ s.write(valuestr(), valuestrsize()-1);
+ s << '"';
+ }
+ break;
+ case DBRef:
+ s << "DBRef('" << valuestr() << "',";
+ {
+ mongo::OID *x = (mongo::OID *) (valuestr() + valuestrsize());
+ s << *x << ')';
+ }
+ break;
+ case jstOID:
+ s << "ObjectId('";
+ s << __oid() << "')";
+ break;
+ case BinData:
+ s << "BinData";
+ if (full) {
+ int len;
+ const char* data = binDataClean(len);
+ s << '(' << binDataType() << ", " << toHex(data, len) << ')';
+ }
+ break;
+ case Timestamp:
+ s << "Timestamp " << timestampTime() << "|" << timestampInc();
+ break;
+ default:
+ s << "?type=" << type();
+ break;
+ }
+ }
+
+    /* the returned element has eoo() true if there is no match.
+       supports "." notation to reach into embedded objects.
+    */
+ inline BSONElement BSONObj::getFieldDotted(const char *name) const {
+ BSONElement e = getField( name );
+ if ( e.eoo() ) {
+ const char *p = strchr(name, '.');
+ if ( p ) {
+ string left(name, p-name);
+ BSONObj sub = getObjectField(left.c_str());
+ return sub.isEmpty() ? BSONElement() : sub.getFieldDotted(p+1);
+ }
+ }
+
+ return e;
+ }
+
+ inline BSONObj BSONObj::getObjectField(const char *name) const {
+ BSONElement e = getField(name);
+ BSONType t = e.type();
+ return t == Object || t == Array ? e.embeddedObject() : BSONObj();
+ }
+
+ inline int BSONObj::nFields() const {
+ int n = 0;
+ BSONObjIterator i(*this);
+ while ( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ n++;
+ }
+ return n;
+ }
+
+ inline BSONObj::BSONObj() {
+        /* little endian ordering here, but perhaps that is ok regardless, as BSON is spec'd
+           to be little endian external to the system. (i.e. the rest of the bson implementation,
+           not this part, fails to support big endian anyway)
+        */
+ static char p[] = { /*size*/5, 0, 0, 0, /*eoo*/0 };
+ _objdata = p;
+ }
+
+ inline BSONObj BSONElement::Obj() const { return embeddedObjectUserCheck(); }
+
+ inline BSONElement BSONElement::operator[] (const string& field) const {
+ BSONObj o = Obj();
+ return o[field];
+ }
+
+ inline void BSONObj::elems(vector<BSONElement> &v) const {
+ BSONObjIterator i(*this);
+ while( i.more() )
+ v.push_back(i.next());
+ }
+
+ inline void BSONObj::elems(list<BSONElement> &v) const {
+ BSONObjIterator i(*this);
+ while( i.more() )
+ v.push_back(i.next());
+ }
+
+ template <class T>
+ void BSONObj::Vals(vector<T>& v) const {
+ BSONObjIterator i(*this);
+ while( i.more() ) {
+ T t;
+ i.next().Val(t);
+ v.push_back(t);
+ }
+ }
+ template <class T>
+ void BSONObj::Vals(list<T>& v) const {
+ BSONObjIterator i(*this);
+ while( i.more() ) {
+ T t;
+ i.next().Val(t);
+ v.push_back(t);
+ }
+ }
+
+ template <class T>
+ void BSONObj::vals(vector<T>& v) const {
+ BSONObjIterator i(*this);
+ while( i.more() ) {
+ try {
+ T t;
+ i.next().Val(t);
+ v.push_back(t);
+ }
+ catch(...) { }
+ }
+ }
+ template <class T>
+ void BSONObj::vals(list<T>& v) const {
+ BSONObjIterator i(*this);
+ while( i.more() ) {
+ try {
+ T t;
+ i.next().Val(t);
+ v.push_back(t);
+ }
+ catch(...) { }
+ }
+ }
+
+ inline ostream& operator<<( ostream &s, const BSONObj &o ) {
+ return s << o.toString();
+ }
+
+ inline ostream& operator<<( ostream &s, const BSONElement &e ) {
+ return s << e.toString();
+ }
+
+ inline StringBuilder& operator<<( StringBuilder &s, const BSONObj &o ) {
+ o.toString( s );
+ return s;
+ }
+ inline StringBuilder& operator<<( StringBuilder &s, const BSONElement &e ) {
+ e.toString( s );
+ return s;
+ }
+
+
+ inline void BSONElement::Val(BSONObj& v) const { v = Obj(); }
+
+ template<typename T>
+ inline BSONFieldValue<BSONObj> BSONField<T>::query( const char * q , const T& t ) const {
+ BSONObjBuilder b;
+ b.append( q , t );
+ return BSONFieldValue<BSONObj>( _name , b.obj() );
+ }
+
+ // used by jsonString()
+ inline string escape( string s , bool escape_slash=false) {
+ StringBuilder ret;
+ for ( string::iterator i = s.begin(); i != s.end(); ++i ) {
+ switch ( *i ) {
+ case '"':
+ ret << "\\\"";
+ break;
+ case '\\':
+ ret << "\\\\";
+ break;
+ case '/':
+ ret << (escape_slash ? "\\/" : "/");
+ break;
+ case '\b':
+ ret << "\\b";
+ break;
+ case '\f':
+ ret << "\\f";
+ break;
+ case '\n':
+ ret << "\\n";
+ break;
+ case '\r':
+ ret << "\\r";
+ break;
+ case '\t':
+ ret << "\\t";
+ break;
+ default:
+ if ( *i >= 0 && *i <= 0x1f ) {
+ //TODO: these should be utf16 code-units not bytes
+ char c = *i;
+ ret << "\\u00" << toHexLower(&c, 1);
+ }
+ else {
+ ret << *i;
+ }
+ }
+ }
+ return ret.str();
+ }
+
+ inline string BSONObj::hexDump() const {
+ stringstream ss;
+ const char *d = objdata();
+ int size = objsize();
+ for( int i = 0; i < size; ++i ) {
+ ss.width( 2 );
+ ss.fill( '0' );
+ ss << hex << (unsigned)(unsigned char)( d[ i ] ) << dec;
+            if ( ( d[ i ] >= '0' && d[ i ] <= '9' ) || ( d[ i ] >= 'A' && d[ i ] <= 'Z' ) || ( d[ i ] >= 'a' && d[ i ] <= 'z' ) )
+ ss << '\'' << d[ i ] << '\'';
+ if ( i != size - 1 )
+ ss << ' ';
+ }
+ return ss.str();
+ }
+
+ inline void BSONObjBuilder::appendKeys( const BSONObj& keyPattern , const BSONObj& values ) {
+ BSONObjIterator i(keyPattern);
+ BSONObjIterator j(values);
+
+ while ( i.more() && j.more() ) {
+ appendAs( j.next() , i.next().fieldName() );
+ }
+
+ assert( ! i.more() );
+ assert( ! j.more() );
+ }
+
+ inline BSONObj BSONObj::removeField(const StringData& name) const {
+ BSONObjBuilder b;
+ BSONObjIterator i(*this);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ const char *fname = e.fieldName();
+ if( strcmp(name.data(), fname) )
+ b.append(e);
+ }
+ return b.obj();
+ }
+}
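
A note on the ordering implemented above: woCompare() falls through to
compareElementValues() whenever both sides are numeric, so NumberInt, NumberLong
and NumberDouble compare by value rather than by type. A small header-only
sketch, under the same include assumption as before:

    #include "src/mongo/bson/bson.h"   // adjust the include path to your tree
    #include <iostream>

    int main() {
        mongo::BSONObj a = BSON( "x" << 3 );     // stored as NumberInt
        mongo::BSONObj b = BSON( "x" << 3.0 );   // stored as NumberDouble

        // BSONElement::woCompare is inline in bson-inl.h; since both sides
        // are numbers, the values are compared, so 3 and 3.0 are equal here.
        std::cout << a["x"].woCompare( b["x"] ) << std::endl;   // prints 0
        return 0;
    }
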
diff --git a/src/mongo/bson/bson.h b/src/mongo/bson/bson.h
new file mode 100644
index 00000000000..9515adfd829
--- /dev/null
+++ b/src/mongo/bson/bson.h
@@ -0,0 +1,110 @@
+/** @file bson.h
+
+ Main bson include file for mongodb c++ clients. MongoDB includes ../db/jsobj.h instead.
+ This file, however, pulls in much less code / dependencies.
+
+ @see bsondemo
+*/
+
+/*
+ * Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ Main include file for C++ BSON module when using standalone (sans MongoDB client).
+
+ "BSON" stands for "binary JSON" -- ie a binary way to represent objects that would be
+ represented in JSON (plus a few extensions useful for databases & other languages).
+
+ http://www.bsonspec.org/
+*/
+
+#pragma once
+
+#if defined(MONGO_EXPOSE_MACROS)
+#error this header is for client programs, not the mongo database itself. include jsobj.h instead.
+/* because we define simplistic assert helpers here that don't pull in a bunch of util -- so that
+ BSON can be used header only.
+ */
+#endif
+
+#include <cstdlib>
+#include <memory>
+#include <iostream>
+#include <sstream>
+#include <boost/utility.hpp>
+
+namespace bson {
+
+ using std::string;
+ using std::stringstream;
+
+ class assertion : public std::exception {
+ public:
+ assertion( unsigned u , const string& s )
+ : id( u ) , msg( s ) {
+ stringstream ss;
+ ss << "BsonAssertion id: " << u << " " << s;
+ full = ss.str();
+ }
+
+ virtual ~assertion() throw() {}
+
+ virtual const char* what() const throw() { return full.c_str(); }
+
+ unsigned id;
+ string msg;
+ string full;
+ };
+}
+
+namespace mongo {
+#if !defined(assert)
+ inline void assert(bool expr) {
+ if(!expr) {
+ throw bson::assertion( 0 , "assertion failure in bson library" );
+ }
+ }
+#endif
+#if !defined(uassert)
+ inline void uasserted(unsigned msgid, std::string s) {
+ throw bson::assertion( msgid , s );
+ }
+
+ inline void uassert(unsigned msgid, std::string msg, bool expr) {
+ if( !expr )
+ uasserted( msgid , msg );
+ }
+ inline void msgasserted(int msgid, const char *msg) {
+ throw bson::assertion( msgid , msg );
+ }
+ inline void msgasserted(int msgid, const std::string &msg) { msgasserted(msgid, msg.c_str()); }
+ inline void massert(unsigned msgid, std::string msg, bool expr) {
+ if(!expr) {
+ std::cout << "assertion failure in bson library: " << msgid << ' ' << msg << std::endl;
+ throw bson::assertion( msgid , msg );
+ }
+ }
+#endif
+}
+
+#include "util/builder.h"
+#include "bsontypes.h"
+#include "oid.h"
+#include "bsonelement.h"
+#include "bsonobj.h"
+#include "bsonobjbuilder.h"
+#include "bsonobjiterator.h"
+#include "bson-inl.h"
diff --git a/src/mongo/bson/bson_db.h b/src/mongo/bson/bson_db.h
new file mode 100644
index 00000000000..3f597bde3e1
--- /dev/null
+++ b/src/mongo/bson/bson_db.h
@@ -0,0 +1,88 @@
+/** @file bson_db.h
+
+ This file contains the implementation of BSON-related methods that are required
+ by the MongoDB database server.
+
+ Normally, for standalone BSON usage, you do not want this file - it will tend to
+ pull in some other files from the MongoDB project. Thus, bson.h (the main file
+ one would use) does not include this file.
+*/
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../util/optime.h"
+#include "../util/time_support.h"
+
+namespace mongo {
+
+ /**
+ Timestamps are a special BSON datatype that is used internally for replication.
+ Append a timestamp element to the object being ebuilt.
+ @param time - in millis (but stored in seconds)
+ */
+ inline BSONObjBuilder& BSONObjBuilder::appendTimestamp( const StringData& fieldName , unsigned long long time , unsigned int inc ) {
+ OpTime t( (unsigned) (time / 1000) , inc );
+ appendTimestamp( fieldName , t.asDate() );
+ return *this;
+ }
+
+ inline OpTime BSONElement::_opTime() const {
+ if( type() == mongo::Date || type() == Timestamp )
+ return OpTime( *reinterpret_cast< const unsigned long long* >( value() ) );
+ return OpTime();
+ }
+
+ inline string BSONElement::_asCode() const {
+ switch( type() ) {
+ case mongo::String:
+ case Code:
+ return string(valuestr(), valuestrsize()-1);
+ case CodeWScope:
+ return string(codeWScopeCode(), *(int*)(valuestr())-1);
+ default:
+ log() << "can't convert type: " << (int)(type()) << " to code" << endl;
+ }
+ uassert( 10062 , "not code" , 0 );
+ return "";
+ }
+
+ inline BSONObjBuilder& BSONObjBuilderValueStream::operator<<(DateNowLabeler& id) {
+ _builder->appendDate(_fieldName, jsTime());
+ _fieldName = 0;
+ return *_builder;
+ }
+
+ inline BSONObjBuilder& BSONObjBuilderValueStream::operator<<(NullLabeler& id) {
+ _builder->appendNull(_fieldName);
+ _fieldName = 0;
+ return *_builder;
+ }
+
+ inline BSONObjBuilder& BSONObjBuilderValueStream::operator<<(MinKeyLabeler& id) {
+ _builder->appendMinKey(_fieldName);
+ _fieldName = 0;
+ return *_builder;
+ }
+
+ inline BSONObjBuilder& BSONObjBuilderValueStream::operator<<(MaxKeyLabeler& id) {
+ _builder->appendMaxKey(_fieldName);
+ _fieldName = 0;
+ return *_builder;
+ }
+
+}
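
The operator<< overloads above are what make labeler tokens work in the
builder's stream syntax on the server side. A sketch of the intended usage,
assuming the DATENOW, MINKEY and MAXKEY singletons declared in bsonmisc.h
(part of this commit, not shown here) and the server-side jsobj.h include
mentioned in bson.h:

    #include "db/jsobj.h"   // server-side include; assumed to pull in bson_db.h

    mongo::BSONObj makeDoc() {
        // each labeler routes through an operator<< overload defined above
        return BSON( "when" << DATENOW      // appendDate( jsTime() )
                  << "lo"   << MINKEY       // appendMinKey
                  << "hi"   << MAXKEY );    // appendMaxKey
    }
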
diff --git a/src/mongo/bson/bsondemo/bsondemo.cpp b/src/mongo/bson/bsondemo/bsondemo.cpp
new file mode 100644
index 00000000000..b53a7b39baa
--- /dev/null
+++ b/src/mongo/bson/bsondemo/bsondemo.cpp
@@ -0,0 +1,113 @@
+/** @file bsondemo.cpp
+
+ Example of use of BSON from C++.
+
+ Requires boost (headers only).
+    Works header-only (for the parts actually exercised herein, that is - some functions require .cpp files).
+
+ To build and run:
+ g++ -o bsondemo bsondemo.cpp
+ ./bsondemo
+
+ Windows: project files are available in this directory for bsondemo.cpp for use with Visual Studio.
+*/
+
+/*
+ * Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../bson.h"
+#include <iostream>
+#include <vector>
+
+using namespace std;
+using namespace bson;
+
+void iter(bo o) {
+ /* iterator example */
+ cout << "\niter()\n";
+ for( bo::iterator i(o); i.more(); ) {
+ cout << ' ' << i.next().toString() << '\n';
+ }
+}
+
+int main() {
+ cout << "build bits: " << 8 * sizeof(char *) << '\n' << endl;
+
+ /* a bson object defaults on construction to { } */
+ bo empty;
+ cout << "empty: " << empty << endl;
+
+ /* make a simple { name : 'joe', age : 33.7 } object */
+ {
+ bob b;
+ b.append("name", "joe");
+ b.append("age", 33.7);
+ b.obj();
+ }
+
+ /* make { name : 'joe', age : 33.7 } with a more compact notation. */
+ bo x = bob().append("name", "joe").append("age", 33.7).obj();
+
+ /* convert from bson to json */
+ string json = x.toString();
+ cout << "json for x:" << json << endl;
+
+ /* access some fields of bson object x */
+ cout << "Some x things: " << x["name"] << ' ' << x["age"].Number() << ' ' << x.isEmpty() << endl;
+
+ /* make a bit more complex object with some nesting
+ { x : 'asdf', y : true, subobj : { z : 3, q : 4 } }
+ */
+ bo y = BSON( "x" << "asdf" << "y" << true << "subobj" << BSON( "z" << 3 << "q" << 4 ) );
+
+ /* print it */
+ cout << "y: " << y << endl;
+
+ /* reach in and get subobj.z */
+ cout << "subobj.z: " << y.getFieldDotted("subobj.z").Number() << endl;
+
+ /* alternate syntax: */
+ cout << "subobj.z: " << y["subobj"]["z"].Number() << endl;
+
+ /* fetch all *top level* elements from object y into a vector */
+ vector<be> v;
+ y.elems(v);
+ cout << v[0] << endl;
+
+    /* or into a list */
+ list<be> L;
+ y.elems(L);
+
+ bo sub = y["subobj"].Obj();
+
+    /* grab all the ints that were in subobj. if it had elements that were not ints,
+       an exception is thrown (capital V on Vals() means throw if a wrong type is found).
+    */
+ vector<int> myints;
+ sub.Vals(myints);
+ cout << "my ints: " << myints[0] << ' ' << myints[1] << endl;
+
+    /* grab all the string values from x. if a field isn't of string type, just skip it --
+       lowercase v on vals() indicates skip rather than throw.
+    */
+ vector<string> strs;
+ x.vals(strs);
+ cout << strs.size() << " strings, first one: " << strs[0] << endl;
+
+ iter(y);
+ return 0;
+}
+
diff --git a/src/mongo/bson/bsondemo/bsondemo.vcproj b/src/mongo/bson/bsondemo/bsondemo.vcproj
new file mode 100644
index 00000000000..8432cebfd87
--- /dev/null
+++ b/src/mongo/bson/bsondemo/bsondemo.vcproj
@@ -0,0 +1,243 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+ ProjectType="Visual C++"
+ Version="9.00"
+ Name="bsondemo"
+ ProjectGUID="{C9DB5EB7-81AA-4185-BAA1-DA035654402F}"
+ RootNamespace="bsondemo"
+ Keyword="Win32Proj"
+ TargetFrameworkVersion="196613"
+ >
+ <Platforms>
+ <Platform
+ Name="Win32"
+ />
+ </Platforms>
+ <ToolFiles>
+ </ToolFiles>
+ <Configurations>
+ <Configuration
+ Name="Debug|Win32"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+ IntermediateDirectory="$(ConfigurationName)"
+ ConfigurationType="1"
+ CharacterSet="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ AdditionalIncludeDirectories="c:\program files\boost\latest;c:\boost;\boost"
+ PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
+ MinimalRebuild="true"
+ BasicRuntimeChecks="3"
+ RuntimeLibrary="3"
+ UsePrecompiledHeader="0"
+ WarningLevel="3"
+ DebugInformationFormat="4"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ LinkIncremental="2"
+ GenerateDebugInformation="true"
+ SubSystem="1"
+ TargetMachine="1"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="Release|Win32"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+ IntermediateDirectory="$(ConfigurationName)"
+ ConfigurationType="1"
+ CharacterSet="1"
+ WholeProgramOptimization="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ EnableIntrinsicFunctions="true"
+ AdditionalIncludeDirectories="c:\program files\boost\latest;c:\boost;\boost"
+ PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
+ RuntimeLibrary="2"
+ EnableFunctionLevelLinking="true"
+ UsePrecompiledHeader="0"
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ LinkIncremental="1"
+ GenerateDebugInformation="true"
+ SubSystem="1"
+ OptimizeReferences="2"
+ EnableCOMDATFolding="2"
+ TargetMachine="1"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ </Configurations>
+ <References>
+ </References>
+ <Files>
+ <Filter
+ Name="Source Files"
+ Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+ UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+ >
+ <File
+ RelativePath=".\bsondemo.cpp"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="bson"
+ >
+ <File
+ RelativePath="..\bson.h"
+ >
+ </File>
+ <File
+ RelativePath="..\bson_db.h"
+ >
+ </File>
+ <File
+ RelativePath="..\bsonelement.h"
+ >
+ </File>
+ <File
+ RelativePath="..\bsoninlines.h"
+ >
+ </File>
+ <File
+ RelativePath="..\bsonmisc.h"
+ >
+ </File>
+ <File
+ RelativePath="..\bsonobj.h"
+ >
+ </File>
+ <File
+ RelativePath="..\bsonobjbuilder.h"
+ >
+ </File>
+ <File
+ RelativePath="..\bsonobjiterator.h"
+ >
+ </File>
+ <File
+ RelativePath="..\bsontypes.h"
+ >
+ </File>
+ <File
+ RelativePath="..\oid.h"
+ >
+ </File>
+ <File
+ RelativePath="..\ordering.h"
+ >
+ </File>
+ <Filter
+ Name="util"
+ >
+ <File
+ RelativePath="..\util\builder.h"
+ >
+ </File>
+ <File
+ RelativePath="..\util\misc.h"
+ >
+ </File>
+ </Filter>
+ </Filter>
+ </Files>
+ <Globals>
+ </Globals>
+</VisualStudioProject>
diff --git a/src/mongo/bson/bsondemo/bsondemo.vcxproj b/src/mongo/bson/bsondemo/bsondemo.vcxproj
new file mode 100644
index 00000000000..2ad53894d7d
--- /dev/null
+++ b/src/mongo/bson/bsondemo/bsondemo.vcxproj
@@ -0,0 +1,193 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{C9DB5EB7-81AA-4185-BAA1-DA035654402F}</ProjectGuid>
+ <RootNamespace>bsondemo</RootNamespace>
+ <Keyword>Win32Proj</Keyword>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup>
+ <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</IntDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</LinkIncremental>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</LinkIncremental>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <MinimalRebuild>No</MinimalRebuild>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ </ClCompile>
+ <Link>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <TargetMachine>MachineX86</TargetMachine>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <MinimalRebuild>No</MinimalRebuild>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ </ClCompile>
+ <Link>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <AdditionalIncludeDirectories>c:\boost;\boost</AdditionalIncludeDirectories>
+ <MinimalRebuild>No</MinimalRebuild>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ </ClCompile>
+ <Link>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <TargetMachine>MachineX86</TargetMachine>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <AdditionalIncludeDirectories>c:\boost;\boost</AdditionalIncludeDirectories>
+ <MinimalRebuild>No</MinimalRebuild>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ </ClCompile>
+ <Link>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClCompile Include="bsondemo.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\bson.h" />
+ <ClInclude Include="..\bson_db.h" />
+ <ClInclude Include="..\bsonelement.h" />
+ <ClInclude Include="..\bsoninlines.h" />
+ <ClInclude Include="..\bsonmisc.h" />
+ <ClInclude Include="..\bsonobj.h" />
+ <ClInclude Include="..\bsonobjbuilder.h" />
+ <ClInclude Include="..\bsonobjiterator.h" />
+ <ClInclude Include="..\bsontypes.h" />
+ <ClInclude Include="..\oid.h" />
+ <ClInclude Include="..\ordering.h" />
+ <ClInclude Include="..\util\builder.h" />
+ <ClInclude Include="..\util\misc.h" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project>
diff --git a/src/mongo/bson/bsondemo/bsondemo.vcxproj.filters b/src/mongo/bson/bsondemo/bsondemo.vcxproj.filters
new file mode 100644
index 00000000000..35f14d5193b
--- /dev/null
+++ b/src/mongo/bson/bsondemo/bsondemo.vcxproj.filters
@@ -0,0 +1,52 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <ClCompile Include="bsondemo.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\ordering.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bsonelement.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bsoninlines.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bsonmisc.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bsonobj.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bsonobjbuilder.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bsonobjiterator.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bsontypes.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\builder.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\misc.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\oid.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bson_db.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bson.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ </ItemGroup>
+ <ItemGroup>
+ <Filter Include="bson">
+ <UniqueIdentifier>{ea599740-3c6f-40dd-a121-e825d82ae4aa}</UniqueIdentifier>
+ </Filter>
+ </ItemGroup>
+</Project>
diff --git a/src/mongo/bson/bsonelement.h b/src/mongo/bson/bsonelement.h
new file mode 100644
index 00000000000..57cc2ae5775
--- /dev/null
+++ b/src/mongo/bson/bsonelement.h
@@ -0,0 +1,583 @@
+// BSONElement
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <vector>
+#include <string.h>
+#include "util/builder.h"
+#include "bsontypes.h"
+
+namespace mongo {
+ class OpTime;
+ class BSONObj;
+ class BSONElement;
+ class BSONObjBuilder;
+}
+
+namespace bson {
+ typedef mongo::BSONElement be;
+ typedef mongo::BSONObj bo;
+ typedef mongo::BSONObjBuilder bob;
+}
+
+namespace mongo {
+
+ /* l and r MUST have same type when called: check that first. */
+ int compareElementValues(const BSONElement& l, const BSONElement& r);
+
+
+ /** BSONElement represents an "element" in a BSONObj. So for the object { a : 3, b : "abc" },
+ 'a : 3' is the first element (key+value).
+
+ The BSONElement object points into the BSONObj's data. Thus the BSONObj must stay in scope
+ for the life of the BSONElement.
+
+ internals:
+ <type><fieldName ><value>
+ -------- size() ------------
+ -fieldNameSize-
+ value()
+ type()
+ */
+ class BSONElement {
+ public:
+ /** These functions, which start with a capital letter, throw a UserException if the
+ element is not of the required type. Example:
+
+ string foo = obj["foo"].String(); // exception if not a string type or DNE
+ */
+ string String() const { return chk(mongo::String).valuestr(); }
+ Date_t Date() const { return chk(mongo::Date).date(); }
+ double Number() const { return chk(isNumber()).number(); }
+ double Double() const { return chk(NumberDouble)._numberDouble(); }
+ long long Long() const { return chk(NumberLong)._numberLong(); }
+ int Int() const { return chk(NumberInt)._numberInt(); }
+ bool Bool() const { return chk(mongo::Bool).boolean(); }
+ vector<BSONElement> Array() const; // see implementation for detailed comments
+ mongo::OID OID() const { return chk(jstOID).__oid(); }
+ void Null() const { chk(isNull()); } // throw UserException if not null
+ void OK() const { chk(ok()); } // throw UserException if element DNE
+
+ /** @return the embedded object associated with this field.
+ Note the returned object is a reference to within the parent bson object. If that
+ object is out of scope, this pointer will no longer be valid. Call getOwned() on the
+ returned BSONObj if you need your own copy.
+ throws UserException if the element is not of type object.
+ */
+ BSONObj Obj() const;
+
+ /** populate v with the value of the element. If type does not match, throw exception.
+ useful in templates -- see also BSONObj::Vals().
+ */
+ void Val(Date_t& v) const { v = Date(); }
+ void Val(long long& v) const { v = Long(); }
+ void Val(bool& v) const { v = Bool(); }
+ void Val(BSONObj& v) const;
+ void Val(mongo::OID& v) const { v = OID(); }
+ void Val(int& v) const { v = Int(); }
+ void Val(double& v) const { v = Double(); }
+ void Val(string& v) const { v = String(); }
+
+ /** Use ok() to check if a value is assigned:
+ if( myObj["foo"].ok() ) ...
+ */
+ bool ok() const { return !eoo(); }
+
+ string toString( bool includeFieldName = true, bool full=false) const;
+ void toString(StringBuilder& s, bool includeFieldName = true, bool full=false) const;
+ string jsonString( JsonStringFormat format, bool includeFieldNames = true, int pretty = 0 ) const;
+ operator string() const { return toString(); }
+
+ /** Returns the type of the element */
+ BSONType type() const { return (BSONType) *data; }
+
+ /** retrieve a field within this element
+ throws exception if *this is not an embedded object
+ */
+ BSONElement operator[] (const string& field) const;
+
+        /** returns the type of the element normalized to a canonical value, so that
+            comparable types map to the same value. the main purpose is numbers: any
+            numeric type maps to the canonical value for NumberDouble.
+            Note: if the order changes, indexes have to be re-built or there can be corruption
+        */
+ int canonicalType() const;
+
+ /** Indicates if it is the end-of-object element, which is present at the end of
+ every BSON object.
+ */
+ bool eoo() const { return type() == EOO; }
+
+ /** Size of the element.
+ @param maxLen If maxLen is specified, don't scan more than maxLen bytes to calculate size.
+ */
+ int size( int maxLen ) const;
+ int size() const;
+
+ /** Wrap this element up as a singleton object. */
+ BSONObj wrap() const;
+
+ /** Wrap this element up as a singleton object with a new name. */
+ BSONObj wrap( const char* newName) const;
+
+ /** field name of the element. e.g., for
+ name : "Joe"
+ "name" is the fieldname
+ */
+ const char * fieldName() const {
+ if ( eoo() ) return ""; // no fieldname for it.
+ return data + 1;
+ }
+
+ /** raw data of the element's value (so be careful). */
+ const char * value() const {
+ return (data + fieldNameSize() + 1);
+ }
+ /** size in bytes of the element's value (when applicable). */
+ int valuesize() const {
+ return size() - fieldNameSize() - 1;
+ }
+
+ bool isBoolean() const { return type() == mongo::Bool; }
+
+ /** @return value of a boolean element.
+ You must assure element is a boolean before
+ calling. */
+ bool boolean() const {
+ return *value() ? true : false;
+ }
+
+ bool booleanSafe() const { return isBoolean() && boolean(); }
+
+ /** Retrieve a java style date value from the element.
+ Ensure element is of type Date before calling.
+ @see Bool(), trueValue()
+ */
+ Date_t date() const {
+ return *reinterpret_cast< const Date_t* >( value() );
+ }
+
+ /** Convert the value to boolean, regardless of its type, in a javascript-like fashion
+ (i.e., treats zero and null and eoo as false).
+ */
+ bool trueValue() const;
+
+ /** True if number, string, bool, date, OID */
+ bool isSimpleType() const;
+
+ /** True if element is of a numeric type. */
+ bool isNumber() const;
+
+ /** Return double value for this field. MUST be NumberDouble type. */
+ double _numberDouble() const {return *reinterpret_cast< const double* >( value() ); }
+ /** Return int value for this field. MUST be NumberInt type. */
+ int _numberInt() const {return *reinterpret_cast< const int* >( value() ); }
+ /** Return long long value for this field. MUST be NumberLong type. */
+ long long _numberLong() const {return *reinterpret_cast< const long long* >( value() ); }
+
+ /** Retrieve int value for the element safely. Zero returned if not a number. */
+ int numberInt() const;
+ /** Retrieve long value for the element safely. Zero returned if not a number. */
+ long long numberLong() const;
+ /** Retrieve the numeric value of the element. If not of a numeric type, returns 0.
+ Note: casts to double, data loss may occur with large (>52 bit) NumberLong values.
+ */
+ double numberDouble() const;
+ /** Retrieve the numeric value of the element. If not of a numeric type, returns 0.
+ Note: casts to double, data loss may occur with large (>52 bit) NumberLong values.
+ */
+ double number() const { return numberDouble(); }
+
+ /** Retrieve the object ID stored in the object.
+ You must ensure the element is of type jstOID first. */
+ const mongo::OID &__oid() const { return *reinterpret_cast< const mongo::OID* >( value() ); }
+
+ /** True if element is null. */
+ bool isNull() const {
+ return type() == jstNULL;
+ }
+
+ /** Size (length) of a string element.
+ You must assure of type String first.
+ @return string size including terminating null
+ */
+ int valuestrsize() const {
+ return *reinterpret_cast< const int* >( value() );
+ }
+
+ // for objects the size *includes* the size of the size field
+ int objsize() const {
+ return *reinterpret_cast< const int* >( value() );
+ }
+
+ /** Get a string's value. Also gives you start of the real data for an embedded object.
+ You must assure data is of an appropriate type first -- see also valuestrsafe().
+ */
+ const char * valuestr() const {
+ return value() + 4;
+ }
+
+ /** Get the string value of the element. If not a string returns "". */
+ const char *valuestrsafe() const {
+ return type() == mongo::String ? valuestr() : "";
+ }
+ /** Get the string value of the element. If not a string returns "". */
+ string str() const {
+ return type() == mongo::String ? string(valuestr(), valuestrsize()-1) : string();
+ }
+
+ /** Get javascript code of a CodeWScope data element. */
+ const char * codeWScopeCode() const {
+ return value() + 8;
+ }
+ /** Get the scope SavedContext of a CodeWScope data element. */
+ const char * codeWScopeScopeData() const {
+ // TODO fix
+ return codeWScopeCode() + strlen( codeWScopeCode() ) + 1;
+ }
+
+ /** Get the embedded object this element holds. */
+ BSONObj embeddedObject() const;
+
+ /* uasserts if not an object */
+ BSONObj embeddedObjectUserCheck() const;
+
+ BSONObj codeWScopeObject() const;
+
+ /** Get raw binary data. Element must be of type BinData. Doesn't handle type 2 specially */
+ const char *binData(int& len) const {
+ // BinData: <int len> <byte subtype> <byte[len] data>
+ assert( type() == BinData );
+ len = valuestrsize();
+ return value() + 5;
+ }
+ /** Get binary data. Element must be of type BinData. Handles type 2 */
+ const char *binDataClean(int& len) const {
+ // BinData: <int len> <byte subtype> <byte[len] data>
+ if (binDataType() != ByteArrayDeprecated) {
+ return binData(len);
+ }
+ else {
+ // Skip extra size
+ len = valuestrsize() - 4;
+ return value() + 5 + 4;
+ }
+ }
+
+ BinDataType binDataType() const {
+ // BinData: <int len> <byte subtype> <byte[len] data>
+ assert( type() == BinData );
+ unsigned char c = (value() + 4)[0];
+ return (BinDataType)c;
+ }
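+
+        // reading binary data, a sketch (assumes e is a BinData element):
+        //   int len;
+        //   const char *bytes = e.binDataClean( len );  // also strips the extra length
+        //                                               // prefix of deprecated subtype 2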
+
+ /** Retrieve the regex string for a Regex element */
+ const char *regex() const {
+ assert(type() == RegEx);
+ return value();
+ }
+
+ /** Retrieve the regex flags (options) for a Regex element */
+ const char *regexFlags() const {
+ const char *p = regex();
+ return p + strlen(p) + 1;
+ }
+
+ /** like operator== but doesn't check the fieldname,
+ just the value.
+ */
+ bool valuesEqual(const BSONElement& r) const {
+ return woCompare( r , false ) == 0;
+ }
+
+ /** Returns true if elements are equal. */
+ bool operator==(const BSONElement& r) const {
+ return woCompare( r , true ) == 0;
+ }
+ /** Returns true if elements are unequal. */
+ bool operator!=(const BSONElement& r) const { return !operator==(r); }
+
+ /** Well ordered comparison.
+ @return <0: l<r. 0:l==r. >0:l>r
+ order by type, field name, and field value.
+ If considerFieldName is true, pay attention to the field name.
+ */
+ int woCompare( const BSONElement &e, bool considerFieldName = true ) const;
+
+ const char * rawdata() const { return data; }
+
+ /** 0 == Equality, just not defined yet */
+ int getGtLtOp( int def = 0 ) const;
+
+ /** Constructs an empty element */
+ BSONElement();
+
+ /** Check that data is internally consistent. */
+ void validate() const;
+
+ /** True if this element may contain subobjects. */
+ bool mayEncapsulate() const {
+ switch ( type() ) {
+ case Object:
+ case mongo::Array:
+ case CodeWScope:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ /** True if this element can be a BSONObj */
+ bool isABSONObj() const {
+ switch( type() ) {
+ case Object:
+ case mongo::Array:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ Date_t timestampTime() const {
+ unsigned long long t = ((unsigned int*)(value() + 4 ))[0];
+ return t * 1000;
+ }
+ unsigned int timestampInc() const {
+ return ((unsigned int*)(value() ))[0];
+ }
+
+ const char * dbrefNS() const {
+ uassert( 10063 , "not a dbref" , type() == DBRef );
+ return value() + 4;
+ }
+
+ const mongo::OID& dbrefOID() const {
+ uassert( 10064 , "not a dbref" , type() == DBRef );
+ const char * start = value();
+ start += 4 + *reinterpret_cast< const int* >( start );
+ return *reinterpret_cast< const mongo::OID* >( start );
+ }
+
+ /** this does not use fieldName in the comparison, just the value */
+ bool operator<( const BSONElement& other ) const {
+ int x = (int)canonicalType() - (int)other.canonicalType();
+ if ( x < 0 ) return true;
+ else if ( x > 0 ) return false;
+ return compareElementValues(*this,other) < 0;
+ }
+
+ // @param maxLen don't scan more than maxLen bytes
+ explicit BSONElement(const char *d, int maxLen) : data(d) {
+ if ( eoo() ) {
+ totalSize = 1;
+ fieldNameSize_ = 0;
+ }
+ else {
+ totalSize = -1;
+ fieldNameSize_ = -1;
+ if ( maxLen != -1 ) {
+ int size = (int) strnlen( fieldName(), maxLen - 1 );
+ uassert( 10333 , "Invalid field name", size != -1 );
+ fieldNameSize_ = size + 1;
+ }
+ }
+ }
+
+ explicit BSONElement(const char *d) : data(d) {
+ fieldNameSize_ = -1;
+ totalSize = -1;
+ if ( eoo() ) {
+ fieldNameSize_ = 0;
+ totalSize = 1;
+ }
+ }
+
+ string _asCode() const;
+ OpTime _opTime() const;
+
+ private:
+ const char *data;
+ mutable int fieldNameSize_; // cached value
+ int fieldNameSize() const {
+ if ( fieldNameSize_ == -1 )
+ fieldNameSize_ = (int)strlen( fieldName() ) + 1;
+ return fieldNameSize_;
+ }
+ mutable int totalSize; /* caches the computed size */
+
+ friend class BSONObjIterator;
+ friend class BSONObj;
+ const BSONElement& chk(int t) const {
+ if ( t != type() ) {
+ StringBuilder ss;
+ if( eoo() )
+ ss << "field not found, expected type " << t;
+ else
+ ss << "wrong type for field (" << fieldName() << ") " << type() << " != " << t;
+ uasserted(13111, ss.str() );
+ }
+ return *this;
+ }
+ const BSONElement& chk(bool expr) const {
+ uassert(13118, "unexpected or missing type value in BSON object", expr);
+ return *this;
+ }
+ };
+
+
+ inline int BSONElement::canonicalType() const {
+ BSONType t = type();
+ switch ( t ) {
+ case MinKey:
+ case MaxKey:
+ return t;
+ case EOO:
+ case Undefined:
+ return 0;
+ case jstNULL:
+ return 5;
+ case NumberDouble:
+ case NumberInt:
+ case NumberLong:
+ return 10;
+ case mongo::String:
+ case Symbol:
+ return 15;
+ case Object:
+ return 20;
+ case mongo::Array:
+ return 25;
+ case BinData:
+ return 30;
+ case jstOID:
+ return 35;
+ case mongo::Bool:
+ return 40;
+ case mongo::Date:
+ case Timestamp:
+ return 45;
+ case RegEx:
+ return 50;
+ case DBRef:
+ return 55;
+ case Code:
+ return 60;
+ case CodeWScope:
+ return 65;
+ default:
+ assert(0);
+ return -1;
+ }
+ }
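+
+    // One consequence (a sketch): BSON( "a" << 1 ) and BSON( "a" << 1.0 ) compare equal
+    // under woCompare(), since NumberInt and NumberDouble share a canonical type and
+    // numeric values are then compared by value.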
+
+ inline bool BSONElement::trueValue() const {
+ switch( type() ) {
+ case NumberLong:
+ return *reinterpret_cast< const long long* >( value() ) != 0;
+ case NumberDouble:
+ return *reinterpret_cast< const double* >( value() ) != 0;
+ case NumberInt:
+ return *reinterpret_cast< const int* >( value() ) != 0;
+ case mongo::Bool:
+ return boolean();
+ case EOO:
+ case jstNULL:
+ case Undefined:
+ return false;
+
+ default:
+ ;
+ }
+ return true;
+ }
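+
+    // e.g.: 0, 0.0, false, null, undefined and a missing field (EOO) are all false;
+    // everything else, including the empty string "", is true.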
+
+ /** @return true if element is of a numeric type. */
+ inline bool BSONElement::isNumber() const {
+ switch( type() ) {
+ case NumberLong:
+ case NumberDouble:
+ case NumberInt:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ inline bool BSONElement::isSimpleType() const {
+ switch( type() ) {
+ case NumberLong:
+ case NumberDouble:
+ case NumberInt:
+ case mongo::String:
+ case mongo::Bool:
+ case mongo::Date:
+ case jstOID:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ inline double BSONElement::numberDouble() const {
+ switch( type() ) {
+ case NumberDouble:
+ return _numberDouble();
+ case NumberInt:
+ return *reinterpret_cast< const int* >( value() );
+ case NumberLong:
+ return (double) *reinterpret_cast< const long long* >( value() );
+ default:
+ return 0;
+ }
+ }
+
+ /** Retrieve int value for the element safely. Zero returned if not a number. Converted to int if another numeric type. */
+ inline int BSONElement::numberInt() const {
+ switch( type() ) {
+ case NumberDouble:
+ return (int) _numberDouble();
+ case NumberInt:
+ return _numberInt();
+ case NumberLong:
+ return (int) _numberLong();
+ default:
+ return 0;
+ }
+ }
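+
+    // e.g. (a sketch): BSON( "n" << 3.7 )["n"].numberInt() yields 3 (truncated),
+    // while BSON( "n" << "x" )["n"].numberInt() yields 0 (not a number).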
+
+ /** Retrieve long value for the element safely. Zero returned if not a number. */
+ inline long long BSONElement::numberLong() const {
+ switch( type() ) {
+ case NumberDouble:
+ return (long long) _numberDouble();
+ case NumberInt:
+ return _numberInt();
+ case NumberLong:
+ return _numberLong();
+ default:
+ return 0;
+ }
+ }
+
+ inline BSONElement::BSONElement() {
+ static char z = 0;
+ data = &z;
+ fieldNameSize_ = 0;
+ totalSize = 1;
+ }
+
+}
diff --git a/src/mongo/bson/bsonmisc.h b/src/mongo/bson/bsonmisc.h
new file mode 100644
index 00000000000..8a379396d17
--- /dev/null
+++ b/src/mongo/bson/bsonmisc.h
@@ -0,0 +1,211 @@
+// @file bsonmisc.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace mongo {
+
+ int getGtLtOp(const BSONElement& e);
+
+ struct BSONElementCmpWithoutField {
+ bool operator()( const BSONElement &l, const BSONElement &r ) const {
+ return l.woCompare( r, false ) < 0;
+ }
+ };
+
+ class BSONObjCmp {
+ public:
+ BSONObjCmp( const BSONObj &order = BSONObj() ) : _order( order ) {}
+ bool operator()( const BSONObj &l, const BSONObj &r ) const {
+ return l.woCompare( r, _order ) < 0;
+ }
+ BSONObj order() const { return _order; }
+ private:
+ BSONObj _order;
+ };
+
+ typedef set<BSONObj,BSONObjCmp> BSONObjSet;
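+
+    // e.g. (a sketch): a set ordered by ascending field "a":
+    //   BSONObjSet s( BSONObjCmp( BSON( "a" << 1 ) ) );
+    //   s.insert( BSON( "a" << 2 ) );
+    //   s.insert( BSON( "a" << 1 ) );  // iteration now visits a:1 before a:2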
+
+ enum FieldCompareResult {
+ LEFT_SUBFIELD = -2,
+ LEFT_BEFORE = -1,
+ SAME = 0,
+ RIGHT_BEFORE = 1 ,
+ RIGHT_SUBFIELD = 2
+ };
+
+ FieldCompareResult compareDottedFieldNames( const string& l , const string& r );
+
+ /** Use BSON macro to build a BSONObj from a stream
+
+ e.g.,
+ BSON( "name" << "joe" << "age" << 33 )
+
+ with auto-generated object id:
+ BSON( GENOID << "name" << "joe" << "age" << 33 )
+
+ The labels GT, GTE, LT, LTE, NE can be helpful for stream-oriented construction
+ of a BSONObj, particularly when assembling a Query. For example,
+ BSON( "a" << GT << 23.4 << NE << 30 << "b" << 2 ) produces the object
+ { a: { \$gt: 23.4, \$ne: 30 }, b: 2 }.
+ */
+#define BSON(x) (( mongo::BSONObjBuilder(64) << x ).obj())
+
+ /** Use BSON_ARRAY macro like BSON macro, but without keys
+
+ BSONArray arr = BSON_ARRAY( "hello" << 1 << BSON( "foo" << BSON_ARRAY( "bar" << "baz" << "qux" ) ) );
+
+ */
+#define BSON_ARRAY(x) (( mongo::BSONArrayBuilder() << x ).arr())
+
+ /* Utility class to auto assign object IDs.
+ Example:
+ cout << BSON( GENOID << "z" << 3 ); // { _id : ..., z : 3 }
+ */
+ extern struct GENOIDLabeler { } GENOID;
+
+ /* Utility class to add a Date element with the current time
+ Example:
+ cout << BSON( "created" << DATENOW ); // { created : "2009-10-09 11:41:42" }
+ */
+ extern struct DateNowLabeler { } DATENOW;
+
+ /* Utility class to assign a NULL value to a given attribute
+ Example:
+ cout << BSON( "a" << BSONNULL ); // { a : null }
+ */
+ extern struct NullLabeler { } BSONNULL;
+
+ /* Utility class to add the minKey (minus infinity) to a given attribute
+ Example:
+ cout << BSON( "a" << MINKEY ); // { "a" : { "$minKey" : 1 } }
+ */
+ extern struct MinKeyLabeler { } MINKEY;
+ extern struct MaxKeyLabeler { } MAXKEY;
+
+ // Utility class to implement GT, GTE, etc as described above.
+ class Labeler {
+ public:
+ struct Label {
+ Label( const char *l ) : l_( l ) {}
+ const char *l_;
+ };
+ Labeler( const Label &l, BSONObjBuilderValueStream *s ) : l_( l ), s_( s ) {}
+ template<class T>
+ BSONObjBuilder& operator<<( T value );
+
+ /* the value of the element e is appended i.e. for
+ "age" << GT << someElement
+ one gets
+ { age : { $gt : someElement's value } }
+ */
+ BSONObjBuilder& operator<<( const BSONElement& e );
+ private:
+ const Label &l_;
+ BSONObjBuilderValueStream *s_;
+ };
+
+ extern Labeler::Label GT;
+ extern Labeler::Label GTE;
+ extern Labeler::Label LT;
+ extern Labeler::Label LTE;
+ extern Labeler::Label NE;
+ extern Labeler::Label SIZE;
+
+
+ // $or helper: OR(BSON("x" << GT << 7), BSON("y" << LT << 6));
+ // becomes : {$or: [{x: {$gt: 7}}, {y: {$lt: 6}}]}
+ inline BSONObj OR(const BSONObj& a, const BSONObj& b);
+ inline BSONObj OR(const BSONObj& a, const BSONObj& b, const BSONObj& c);
+ inline BSONObj OR(const BSONObj& a, const BSONObj& b, const BSONObj& c, const BSONObj& d);
+ inline BSONObj OR(const BSONObj& a, const BSONObj& b, const BSONObj& c, const BSONObj& d, const BSONObj& e);
+ inline BSONObj OR(const BSONObj& a, const BSONObj& b, const BSONObj& c, const BSONObj& d, const BSONObj& e, const BSONObj& f);
+ // definitions in bsonobjbuilder.h b/c of incomplete types
+
+ // Utility class to implement BSON( key << val ) as described above.
+ class BSONObjBuilderValueStream : public boost::noncopyable {
+ public:
+ friend class Labeler;
+ BSONObjBuilderValueStream( BSONObjBuilder * builder );
+
+ BSONObjBuilder& operator<<( const BSONElement& e );
+
+ template<class T>
+ BSONObjBuilder& operator<<( T value );
+
+ BSONObjBuilder& operator<<(DateNowLabeler& id);
+
+ BSONObjBuilder& operator<<(NullLabeler& id);
+
+ BSONObjBuilder& operator<<(MinKeyLabeler& id);
+ BSONObjBuilder& operator<<(MaxKeyLabeler& id);
+
+ Labeler operator<<( const Labeler::Label &l );
+
+ void endField( const char *nextFieldName = 0 );
+ bool subobjStarted() const { return _fieldName != 0; }
+
+ private:
+ const char * _fieldName;
+ BSONObjBuilder * _builder;
+
+ bool haveSubobj() const { return _subobj.get() != 0; }
+ BSONObjBuilder *subobj();
+ auto_ptr< BSONObjBuilder > _subobj;
+ };
+
+ /**
+       used in conjunction with BSONObjBuilder; tracks recent object sizes so a proper
+       initial buffer size can be chosen, preventing excessive memory usage
+ */
+ class BSONSizeTracker {
+ public:
+ BSONSizeTracker() {
+ _pos = 0;
+ for ( int i=0; i<SIZE; i++ )
+ _sizes[i] = 512; // this is the default, so just be consistent
+ }
+
+ ~BSONSizeTracker() {
+ }
+
+ void got( int size ) {
+ _sizes[_pos++] = size;
+ if ( _pos >= SIZE )
+ _pos = 0;
+ }
+
+ /**
+ * right now choosing largest size
+ */
+ int getSize() const {
+ int x = 16; // sane min
+ for ( int i=0; i<SIZE; i++ ) {
+ if ( _sizes[i] > x )
+ x = _sizes[i];
+ }
+ return x;
+ }
+
+ private:
+ enum { SIZE = 10 };
+ int _pos;
+ int _sizes[SIZE];
+ };
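+
+    // Intended use, a sketch (assumes the BSONObjBuilder constructor that takes a
+    // BSONSizeTracker, declared in bsonobjbuilder.h):
+    //   static BSONSizeTracker tracker;  // shared across builds of similar objects
+    //   BSONObjBuilder b( tracker );     // initial buffer sized from recent history;
+    //                                    // the finished size is reported back via got()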
+
+ // considers order
+ bool fieldsMatch(const BSONObj& lhs, const BSONObj& rhs);
+}
diff --git a/src/mongo/bson/bsonobj.h b/src/mongo/bson/bsonobj.h
new file mode 100644
index 00000000000..e8ce462403b
--- /dev/null
+++ b/src/mongo/bson/bsonobj.h
@@ -0,0 +1,497 @@
+// @file bsonobj.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <set>
+#include <list>
+#include <vector>
+#include "util/atomic_int.h"
+#include "util/builder.h"
+#include "stringdata.h"
+
+namespace mongo {
+
+ typedef set< BSONElement, BSONElementCmpWithoutField > BSONElementSet;
+ typedef multiset< BSONElement, BSONElementCmpWithoutField > BSONElementMSet;
+
+ /**
+ C++ representation of a "BSON" object -- that is, an extended JSON-style
+ object in a binary representation.
+
+ See bsonspec.org.
+
+ Note that BSONObj's have a smart pointer capability built in -- so you can
+ pass them around by value. The reference counts used to implement this
+ do not use locking, so copying and destroying BSONObj's are not thread-safe
+ operations.
+
+ BSON object format:
+
+       \code
+ <unsigned totalSize> {<byte BSONType><cstring FieldName><Data>}* EOO
+
+ totalSize includes itself.
+
+ Data:
+ Bool: <byte>
+ EOO: nothing follows
+ Undefined: nothing follows
+ OID: an OID object
+ NumberDouble: <double>
+ NumberInt: <int32>
+ String: <unsigned32 strsizewithnull><cstring>
+ Date: <8bytes>
+ Regex: <cstring regex><cstring options>
+ Object: a nested object, leading with its entire size, which terminates with EOO.
+ Array: same as object
+ DBRef: <strlen> <cstring ns> <oid>
+ DBRef: a database reference: basically a collection name plus an Object ID
+ BinData: <int len> <byte subtype> <byte[len] data>
+ Code: a function (not a closure): same format as String.
+ Symbol: a language symbol (say a python symbol). same format as String.
+ Code With Scope: <total size><String><Object>
+ \endcode
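+
+       A worked example (a sketch consistent with the format above): the document
+       { a : 1 } with a NumberInt value encodes as 12 bytes:
+       \code
+       0C 00 00 00    totalSize = 12 (includes these four bytes)
+       10             BSONType NumberInt
+       61 00          field name "a", null terminated
+       01 00 00 00    int32 value 1
+       00             EOO
+       \endcode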
+ */
+ class BSONObj {
+ public:
+
+ /** Construct a BSONObj from data in the proper format.
+ * Use this constructor when something else owns msgdata's buffer
+ */
+ explicit BSONObj(const char *msgdata) {
+ init(msgdata);
+ }
+
+ /** Construct a BSONObj from data in the proper format.
+ * Use this constructor when you want BSONObj to free(holder) when it is no longer needed
+ * BSONObj::Holder has an extra 4 bytes for a ref-count before the start of the object
+ */
+ class Holder;
+ explicit BSONObj(Holder* holder) {
+ init(holder);
+ }
+
+ explicit BSONObj(const Record *r);
+
+ /** Construct an empty BSONObj -- that is, {}. */
+ BSONObj();
+
+ ~BSONObj() {
+ _objdata = 0; // defensive
+ }
+
+ /**
+ A BSONObj can use a buffer it "owns" or one it does not.
+
+ OWNED CASE
+ If the BSONObj owns the buffer, the buffer can be shared among several BSONObj's (by assignment).
+ In this case the buffer is basically implemented as a shared_ptr.
+ Since BSONObj's are typically immutable, this works well.
+
+ UNOWNED CASE
+ A BSONObj can also point to BSON data in some other data structure it does not "own" or free later.
+ For example, in a memory mapped file. In this case, it is important the original data stays in
+ scope for as long as the BSONObj is in use. If you think the original data may go out of scope,
+ call BSONObj::getOwned() to promote your BSONObj to having its own copy.
+
+ On a BSONObj assignment, if the source is unowned, both the source and dest will have unowned
+ pointers to the original buffer after the assignment.
+
+ If you are not sure about ownership but need the buffer to last as long as the BSONObj, call
+ getOwned(). getOwned() is a no-op if the buffer is already owned. If not already owned, a malloc
+ and memcpy will result.
+
+ Most ways to create BSONObj's create 'owned' variants. Unowned versions can be created with:
+ (1) specifying true for the ifree parameter in the constructor
+ (2) calling BSONObjBuilder::done(). Use BSONObjBuilder::obj() to get an owned copy
+ (3) retrieving a subobject retrieves an unowned pointer into the parent BSON object
+
+ @return true if this is in owned mode
+ */
+ bool isOwned() const { return _holder.get() != 0; }
+
+ /** assure the data buffer is under the control of this BSONObj and not a remote buffer
+ @see isOwned()
+ */
+ BSONObj getOwned() const;
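+
+        /* a sketch of the unowned case described above:
+             BSONObj sub = obj["x"].Obj();  // unowned view into obj's buffer
+             sub = sub.getOwned();          // safe even after obj goes out of scope
+        */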
+
+ /** @return a new full (and owned) copy of the object. */
+ BSONObj copy() const;
+
+ /** Readable representation of a BSON object in an extended JSON-style notation.
+ This is an abbreviated representation which might be used for logging.
+ */
+ string toString( bool isArray = false, bool full=false ) const;
+ void toString(StringBuilder& s, bool isArray = false, bool full=false ) const;
+
+ /** Properly formatted JSON string.
+ @param pretty if true we try to add some lf's and indentation
+ */
+ string jsonString( JsonStringFormat format = Strict, int pretty = 0 ) const;
+
+ /** note: addFields always adds _id even if not specified */
+ int addFields(BSONObj& from, set<string>& fields); /* returns n added */
+
+ /** remove specified field and return a new object with the remaining fields.
+            slowish as it builds a full new object
+ */
+ BSONObj removeField(const StringData& name) const;
+
+ /** returns # of top level fields in the object
+ note: iterates to count the fields
+ */
+ int nFields() const;
+
+ /** adds the field names to the fields set. does NOT clear it (appends). */
+ int getFieldNames(set<string>& fields) const;
+
+ /** @return the specified element. element.eoo() will be true if not found.
+ @param name field to find. supports dot (".") notation to reach into embedded objects.
+ for example "x.y" means "in the nested object in field x, retrieve field y"
+ */
+ BSONElement getFieldDotted(const char *name) const;
+ /** @return the specified element. element.eoo() will be true if not found.
+ @param name field to find. supports dot (".") notation to reach into embedded objects.
+ for example "x.y" means "in the nested object in field x, retrieve field y"
+ */
+ BSONElement getFieldDotted(const string& name) const {
+ return getFieldDotted( name.c_str() );
+ }
+
+ /** Like getFieldDotted(), but expands arrays and returns all matching objects.
+ * Turning off expandLastArray allows you to retrieve nested array objects instead of
+ * their contents.
+ */
+ void getFieldsDotted(const StringData& name, BSONElementSet &ret, bool expandLastArray = true ) const;
+ void getFieldsDotted(const StringData& name, BSONElementMSet &ret, bool expandLastArray = true ) const;
+
+ /** Like getFieldDotted(), but returns first array encountered while traversing the
+ dotted fields of name. The name variable is updated to represent field
+ names with respect to the returned element. */
+ BSONElement getFieldDottedOrArray(const char *&name) const;
+
+ /** Get the field of the specified name. eoo() is true on the returned
+ element if not found.
+ */
+ BSONElement getField(const StringData& name) const;
+
+ /** Get several fields at once. This is faster than separate getField() calls as the size of
+ elements iterated can then be calculated only once each.
+ @param n number of fieldNames, and number of elements in the fields array
+ @param fields if a field is found its element is stored in its corresponding position in this array.
+ if not found the array element is unchanged.
+ */
+ void getFields(unsigned n, const char **fieldNames, BSONElement *fields) const;
+
+ /** Get the field of the specified name. eoo() is true on the returned
+ element if not found.
+ */
+ BSONElement operator[] (const char *field) const {
+ return getField(field);
+ }
+
+ BSONElement operator[] (const string& field) const {
+ return getField(field);
+ }
+
+ BSONElement operator[] (int field) const {
+ StringBuilder ss;
+ ss << field;
+ string s = ss.str();
+ return getField(s.c_str());
+ }
+
+ /** @return true if field exists */
+ bool hasField( const char * name ) const { return !getField(name).eoo(); }
+ /** @return true if field exists */
+ bool hasElement(const char *name) const { return hasField(name); }
+
+ /** @return "" if DNE or wrong type */
+ const char * getStringField(const char *name) const;
+
+ /** @return subobject of the given name */
+ BSONObj getObjectField(const char *name) const;
+
+ /** @return INT_MIN if not present - does some type conversions */
+ int getIntField(const char *name) const;
+
+ /** @return false if not present
+ @see BSONElement::trueValue()
+ */
+ bool getBoolField(const char *name) const;
+
+ /**
+           extracts the elements named in pattern, setting their field names to the
+           empty string. If a field in pattern is missing, it is omitted from the
+           returned object.
+ */
+ BSONObj extractFieldsUnDotted(BSONObj pattern) const;
+
+ /** extract items from object which match a pattern object.
+ e.g., if pattern is { x : 1, y : 1 }, builds an object with
+ x and y elements of this object, if they are present.
+ returns elements with original field names
+ */
+ BSONObj extractFields(const BSONObj &pattern , bool fillWithNull=false) const;
+
+ BSONObj filterFieldsUndotted(const BSONObj &filter, bool inFilter) const;
+
+ BSONElement getFieldUsingIndexNames(const char *fieldName, const BSONObj &indexKey) const;
+
+ /** arrays are bson objects with numeric and increasing field names
+ @return true if field names are numeric and increasing
+ */
+ bool couldBeArray() const;
+
+ /** @return the raw data of the object */
+ const char *objdata() const {
+ return _objdata;
+ }
+ /** @return total size of the BSON object in bytes */
+ int objsize() const { return *(reinterpret_cast<const int*>(objdata())); }
+
+ /** performs a cursory check on the object's size only. */
+ bool isValid() const;
+
+        /** @return true if this is a valid user document
+            criteria: isValid() and no field names containing '.' or '$'
+ */
+ bool okForStorage() const;
+
+ /** @return true if object is empty -- i.e., {} */
+ bool isEmpty() const { return objsize() <= 5; }
+
+ void dump() const;
+
+ /** Alternative output format */
+ string hexDump() const;
+
+        /** wo = 'well ordered'. fields must be in same order in each object.
+ Ordering is with respect to the signs of the elements
+ and allows ascending / descending key mixing.
+ @return <0 if l<r. 0 if l==r. >0 if l>r
+ */
+ int woCompare(const BSONObj& r, const Ordering &o,
+ bool considerFieldName=true) const;
+
+        /** wo = 'well ordered'. fields must be in same order in each object.
+ Ordering is with respect to the signs of the elements
+ and allows ascending / descending key mixing.
+ @return <0 if l<r. 0 if l==r. >0 if l>r
+ */
+ int woCompare(const BSONObj& r, const BSONObj &ordering = BSONObj(),
+ bool considerFieldName=true) const;
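+
+        // e.g. (a sketch): l.woCompare( r, BSON( "a" << -1 ) ) treats field "a" as
+        // descending, flipping the sign of the comparison for that key.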
+
+ bool operator<( const BSONObj& other ) const { return woCompare( other ) < 0; }
+ bool operator<=( const BSONObj& other ) const { return woCompare( other ) <= 0; }
+ bool operator>( const BSONObj& other ) const { return woCompare( other ) > 0; }
+ bool operator>=( const BSONObj& other ) const { return woCompare( other ) >= 0; }
+
+ /**
+ * @param useDotted whether to treat sort key fields as possibly dotted and expand into them
+ */
+ int woSortOrder( const BSONObj& r , const BSONObj& sortKey , bool useDotted=false ) const;
+
+ bool equal(const BSONObj& r) const;
+
+        /** This is "shallow" (byte-for-byte) equality -- an int and a double holding the
+            same value won't match. For a deeper equality test use woCompare (which is slower).
+ */
+ bool binaryEqual(const BSONObj& r) const {
+ int os = objsize();
+ if ( os == r.objsize() ) {
+ return (os == 0 || memcmp(objdata(),r.objdata(),os)==0);
+ }
+ return false;
+ }
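+
+        // e.g.: BSON( "a" << 1 ).binaryEqual( BSON( "a" << 1.0 ) ) is false because the
+        // byte representations differ, even though equal()/woCompare() consider them equal.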
+
+ /** @return first field of the object */
+ BSONElement firstElement() const { return BSONElement(objdata() + 4); }
+
+ /** faster than firstElement().fieldName() - for the first element we can easily find the fieldname without
+ computing the element size.
+ */
+ const char * firstElementFieldName() const {
+ const char *p = objdata() + 4;
+ return *p == EOO ? "" : p+1;
+ }
+
+ BSONType firstElementType() const {
+ const char *p = objdata() + 4;
+ return (BSONType) *p;
+ }
+
+ /** Get the _id field from the object. For good performance drivers should
+ assure that _id is the first element of the object; however, correct operation
+ is assured regardless.
+ @return true if found
+ */
+ bool getObjectID(BSONElement& e) const;
+
+ /** @return A hash code for the object */
+ int hash() const {
+ unsigned x = 0;
+ const char *p = objdata();
+ for ( int i = 0; i < objsize(); i++ )
+ x = x * 131 + p[i];
+ return (x & 0x7fffffff) | 0x8000000; // must be > 0
+ }
+
+ // Return a version of this object where top level elements of types
+ // that are not part of the bson wire protocol are replaced with
+ // string identifier equivalents.
+ // TODO Support conversion of element types other than min and max.
+ BSONObj clientReadable() const;
+
+ /** Return new object with the field names replaced by those in the
+ passed object. */
+ BSONObj replaceFieldNames( const BSONObj &obj ) const;
+
+ /** true unless corrupt */
+ bool valid() const;
+
+ /** @return an md5 value for this object. */
+ string md5() const;
+
+ bool operator==( const BSONObj& other ) const { return equal( other ); }
+ bool operator!=(const BSONObj& other) const { return !operator==( other); }
+
+ enum MatchType {
+ Equality = 0,
+ LT = 0x1,
+ LTE = 0x3,
+ GTE = 0x6,
+ GT = 0x4,
+ opIN = 0x8, // { x : { $in : [1,2,3] } }
+ NE = 0x9,
+ opSIZE = 0x0A,
+ opALL = 0x0B,
+ NIN = 0x0C,
+ opEXISTS = 0x0D,
+ opMOD = 0x0E,
+ opTYPE = 0x0F,
+ opREGEX = 0x10,
+ opOPTIONS = 0x11,
+ opELEM_MATCH = 0x12,
+ opNEAR = 0x13,
+ opWITHIN = 0x14,
+ opMAX_DISTANCE=0x15
+ };
+
+ /** add all elements of the object to the specified vector */
+ void elems(vector<BSONElement> &) const;
+ /** add all elements of the object to the specified list */
+ void elems(list<BSONElement> &) const;
+
+ /** add all values of the object to the specified vector. If type mismatches, exception.
+ this is most useful when the BSONObj is an array, but can be used with non-arrays too in theory.
+
+ example:
+ bo sub = y["subobj"].Obj();
+ vector<int> myints;
+ sub.Vals(myints);
+ */
+ template <class T>
+ void Vals(vector<T> &) const;
+ /** add all values of the object to the specified list. If type mismatches, exception. */
+ template <class T>
+ void Vals(list<T> &) const;
+
+ /** add all values of the object to the specified vector. If type mismatches, skip. */
+ template <class T>
+ void vals(vector<T> &) const;
+ /** add all values of the object to the specified list. If type mismatches, skip. */
+ template <class T>
+ void vals(list<T> &) const;
+
+ friend class BSONObjIterator;
+ typedef BSONObjIterator iterator;
+
+ /** use something like this:
+ for( BSONObj::iterator i = myObj.begin(); i.more(); ) {
+ BSONElement e = i.next();
+ ...
+ }
+ */
+ BSONObjIterator begin() const;
+
+ void appendSelfToBufBuilder(BufBuilder& b) const {
+ assert( objsize() );
+ b.appendBuf(reinterpret_cast<const void *>( objdata() ), objsize());
+ }
+
+#pragma pack(1)
+ class Holder : boost::noncopyable {
+ private:
+ Holder(); // this class should never be explicitly created
+ AtomicUInt refCount;
+ public:
+ char data[4]; // start of object
+
+ void zero() { refCount.zero(); }
+
+ // these are called automatically by boost::intrusive_ptr
+ friend void intrusive_ptr_add_ref(Holder* h) { h->refCount++; }
+ friend void intrusive_ptr_release(Holder* h) {
+#if defined(_DEBUG) // can't use dassert or DEV here
+ assert((int)h->refCount > 0); // make sure we haven't already freed the buffer
+#endif
+ if(--(h->refCount) == 0){
+#if defined(_DEBUG)
+ unsigned sz = (unsigned&) *h->data;
+ assert(sz < BSONObjMaxInternalSize * 3);
+ memset(h->data, 0xdd, sz);
+#endif
+ free(h);
+ }
+ }
+ };
+#pragma pack()
+
+ private:
+ const char *_objdata;
+ boost::intrusive_ptr< Holder > _holder;
+
+ void _assertInvalid() const;
+
+ void init(Holder *holder) {
+ _holder = holder; // holder is now managed by intrusive_ptr
+ init(holder->data);
+ }
+ void init(const char *data) {
+ _objdata = data;
+ if ( !isValid() )
+ _assertInvalid();
+ }
+ };
+
+ ostream& operator<<( ostream &s, const BSONObj &o );
+ ostream& operator<<( ostream &s, const BSONElement &e );
+
+ StringBuilder& operator<<( StringBuilder &s, const BSONObj &o );
+ StringBuilder& operator<<( StringBuilder &s, const BSONElement &e );
+
+
+ struct BSONArray : BSONObj {
+ // Don't add anything other than forwarding constructors!!!
+ BSONArray(): BSONObj() {}
+ explicit BSONArray(const BSONObj& obj): BSONObj(obj) {}
+ };
+
+}
diff --git a/src/mongo/bson/bsonobjbuilder.h b/src/mongo/bson/bsonobjbuilder.h
new file mode 100644
index 00000000000..1fdbcba18a6
--- /dev/null
+++ b/src/mongo/bson/bsonobjbuilder.h
@@ -0,0 +1,842 @@
+/* bsonobjbuilder.h
+
+ Classes in this file:
+ BSONObjBuilder
+ BSONArrayBuilder
+*/
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <limits>
+#include <cmath>
+#include <boost/static_assert.hpp>
+#include "bsonelement.h"
+#include "bsonobj.h"
+#include "bsonmisc.h"
+
+namespace mongo {
+
+ using namespace std;
+
+#if defined(_WIN32)
+// warning: 'this' : used in base member initializer list
+#pragma warning( disable : 4355 )
+#endif
+
+ template<typename T>
+ class BSONFieldValue {
+ public:
+ BSONFieldValue( const string& name , const T& t ) {
+ _name = name;
+ _t = t;
+ }
+
+ const T& value() const { return _t; }
+ const string& name() const { return _name; }
+
+ private:
+ string _name;
+ T _t;
+ };
+
+ template<typename T>
+ class BSONField {
+ public:
+ BSONField( const string& name , const string& longName="" )
+ : _name(name), _longName(longName) {}
+ const string& name() const { return _name; }
+ operator string() const { return _name; }
+
+ BSONFieldValue<T> make( const T& t ) const {
+ return BSONFieldValue<T>( _name , t );
+ }
+
+ BSONFieldValue<BSONObj> gt( const T& t ) const { return query( "$gt" , t ); }
+ BSONFieldValue<BSONObj> lt( const T& t ) const { return query( "$lt" , t ); }
+
+ BSONFieldValue<BSONObj> query( const char * q , const T& t ) const;
+
+ BSONFieldValue<T> operator()( const T& t ) const {
+ return BSONFieldValue<T>( _name , t );
+ }
+
+ private:
+ string _name;
+ string _longName;
+ };
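+
+    /* Editorial usage sketch for BSONField/BSONFieldValue (not from the original
+       header); assumes the BSON() stream macro is available:
+
+           BSONField<int> age( "age" );
+           BSONObj doc   = BSON( age( 33 ) );      // { age: 33 }
+           BSONObj query = BSON( age.gt( 18 ) );   // { age: { $gt: 18 } }
+    */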
+
+ /** Utility for creating a BSONObj.
+ See also the BSON() and BSON_ARRAY() macros.
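+
+        A minimal usage sketch (editorial example, not from the original comment):
+
+            BSONObjBuilder b;
+            b.append( "name", "Joe" );
+            b.append( "age", 33 );
+            BSONObj o = b.obj();       // { name: "Joe", age: 33 }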
+ */
+ class BSONObjBuilder : boost::noncopyable {
+ public:
+ /** @param initsize this is just a hint as to the final size of the object */
+ BSONObjBuilder(int initsize=512) : _b(_buf), _buf(initsize + sizeof(unsigned)), _offset( sizeof(unsigned) ), _s( this ) , _tracker(0) , _doneCalled(false) {
+ _b.appendNum((unsigned)0); // ref-count
+            _b.skip(4); /*leave room for the size field; the ref-count was appended just above*/
+ }
+
+ /** @param baseBuilder construct a BSONObjBuilder using an existing BufBuilder
+ * This is for more efficient adding of subobjects/arrays. See docs for subobjStart for example.
+ */
+ BSONObjBuilder( BufBuilder &baseBuilder ) : _b( baseBuilder ), _buf( 0 ), _offset( baseBuilder.len() ), _s( this ) , _tracker(0) , _doneCalled(false) {
+ _b.skip( 4 );
+ }
+
+ BSONObjBuilder( const BSONSizeTracker & tracker ) : _b(_buf) , _buf(tracker.getSize() + sizeof(unsigned) ), _offset( sizeof(unsigned) ), _s( this ) , _tracker( (BSONSizeTracker*)(&tracker) ) , _doneCalled(false) {
+ _b.appendNum((unsigned)0); // ref-count
+ _b.skip(4);
+ }
+
+ ~BSONObjBuilder() {
+ if ( !_doneCalled && _b.buf() && _buf.getSize() == 0 ) {
+ _done();
+ }
+ }
+
+ /** add all the fields from the object specified to this object */
+ BSONObjBuilder& appendElements(BSONObj x);
+
+ /** add all the fields from the object specified to this object if they don't exist already */
+ BSONObjBuilder& appendElementsUnique( BSONObj x );
+
+ /** append element to the object we are building */
+ BSONObjBuilder& append( const BSONElement& e) {
+ assert( !e.eoo() ); // do not append eoo, that would corrupt us. the builder auto appends when done() is called.
+ _b.appendBuf((void*) e.rawdata(), e.size());
+ return *this;
+ }
+
+ /** append an element but with a new name */
+ BSONObjBuilder& appendAs(const BSONElement& e, const StringData& fieldName) {
+ assert( !e.eoo() ); // do not append eoo, that would corrupt us. the builder auto appends when done() is called.
+ _b.appendNum((char) e.type());
+ _b.appendStr(fieldName);
+ _b.appendBuf((void *) e.value(), e.valuesize());
+ return *this;
+ }
+
+ /** add a subobject as a member */
+ BSONObjBuilder& append(const StringData& fieldName, BSONObj subObj) {
+ _b.appendNum((char) Object);
+ _b.appendStr(fieldName);
+ _b.appendBuf((void *) subObj.objdata(), subObj.objsize());
+ return *this;
+ }
+
+ /** add a subobject as a member */
+ BSONObjBuilder& appendObject(const StringData& fieldName, const char * objdata , int size = 0 ) {
+ assert( objdata );
+ if ( size == 0 ) {
+ size = *((int*)objdata);
+ }
+
+ assert( size > 4 && size < 100000000 );
+
+ _b.appendNum((char) Object);
+ _b.appendStr(fieldName);
+ _b.appendBuf((void*)objdata, size );
+ return *this;
+ }
+
+ /** add header for a new subobject and return bufbuilder for writing to
+ * the subobject's body
+ *
+ * example:
+ *
+ * BSONObjBuilder b;
+ * BSONObjBuilder sub (b.subobjStart("fieldName"));
+ * // use sub
+ * sub.done()
+ * // use b and convert to object
+ */
+ BufBuilder &subobjStart(const StringData& fieldName) {
+ _b.appendNum((char) Object);
+ _b.appendStr(fieldName);
+ return _b;
+ }
+
+        /** add a subobject as a member with type Array. The subObj should have "0", "1", ...
+            style field names in it.
+ */
+ BSONObjBuilder& appendArray(const StringData& fieldName, const BSONObj &subObj) {
+ _b.appendNum((char) Array);
+ _b.appendStr(fieldName);
+ _b.appendBuf((void *) subObj.objdata(), subObj.objsize());
+ return *this;
+ }
+ BSONObjBuilder& append(const StringData& fieldName, BSONArray arr) {
+ return appendArray(fieldName, arr);
+ }
+
+ /** add header for a new subarray and return bufbuilder for writing to
+ the subarray's body */
+ BufBuilder &subarrayStart(const StringData& fieldName) {
+ _b.appendNum((char) Array);
+ _b.appendStr(fieldName);
+ return _b;
+ }
+
+ /** Append a boolean element */
+ BSONObjBuilder& appendBool(const StringData& fieldName, int val) {
+ _b.appendNum((char) Bool);
+ _b.appendStr(fieldName);
+ _b.appendNum((char) (val?1:0));
+ return *this;
+ }
+
+ /** Append a boolean element */
+ BSONObjBuilder& append(const StringData& fieldName, bool val) {
+ _b.appendNum((char) Bool);
+ _b.appendStr(fieldName);
+ _b.appendNum((char) (val?1:0));
+ return *this;
+ }
+
+ /** Append a 32 bit integer element */
+ BSONObjBuilder& append(const StringData& fieldName, int n) {
+ _b.appendNum((char) NumberInt);
+ _b.appendStr(fieldName);
+ _b.appendNum(n);
+ return *this;
+ }
+
+ /** Append a 32 bit unsigned element - cast to a signed int. */
+ BSONObjBuilder& append(const StringData& fieldName, unsigned n) {
+ return append(fieldName, (int) n);
+ }
+
+ /** Append a NumberLong */
+ BSONObjBuilder& append(const StringData& fieldName, long long n) {
+ _b.appendNum((char) NumberLong);
+ _b.appendStr(fieldName);
+ _b.appendNum(n);
+ return *this;
+ }
+
+        /** Appends a number. If |n| < max(int)/2 it is stored as an int, otherwise as a long long. */
+ BSONObjBuilder& appendIntOrLL( const StringData& fieldName , long long n ) {
+ long long x = n;
+ if ( x < 0 )
+ x = x * -1;
+ if ( x < ( (numeric_limits<int>::max)() / 2 ) ) // extra () to avoid max macro on windows
+ append( fieldName , (int)n );
+ else
+ append( fieldName , n );
+ return *this;
+ }
+
+ /**
+         * appendNumber is a family of methods for appending the smallest sensible numeric type;
+         * it exists mostly for JS
+ */
+ BSONObjBuilder& appendNumber( const StringData& fieldName , int n ) {
+ return append( fieldName , n );
+ }
+
+ BSONObjBuilder& appendNumber( const StringData& fieldName , double d ) {
+ return append( fieldName , d );
+ }
+
+ BSONObjBuilder& appendNumber( const StringData& fieldName , size_t n ) {
+ static size_t maxInt = (size_t)pow( 2.0 , 30.0 );
+
+ if ( n < maxInt )
+ append( fieldName , (int)n );
+ else
+ append( fieldName , (long long)n );
+ return *this;
+ }
+
+ BSONObjBuilder& appendNumber( const StringData& fieldName , long long l ) {
+ static long long maxInt = (int)pow( 2.0 , 30.0 );
+ static long long maxDouble = (long long)pow( 2.0 , 40.0 );
+ long long x = l >= 0 ? l : -l;
+ if ( x < maxInt )
+ append( fieldName , (int)l );
+ else if ( x < maxDouble )
+ append( fieldName , (double)l );
+ else
+ append( fieldName , l );
+ return *this;
+ }
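+
+        /* Editorial sketch of appendNumber()'s type selection (not from the
+           original header):
+
+               BSONObjBuilder b;
+               b.appendNumber( "a", 5LL );        // < 2^30: stored as NumberInt
+               b.appendNumber( "b", 1LL << 35 );  // < 2^40: stored as NumberDouble
+               b.appendNumber( "c", 1LL << 45 );  // stored as NumberLong
+        */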
+
+ /** Append a double element */
+ BSONObjBuilder& append(const StringData& fieldName, double n) {
+ _b.appendNum((char) NumberDouble);
+ _b.appendStr(fieldName);
+ _b.appendNum(n);
+ return *this;
+ }
+
+ /** tries to append the data as a number
+ * @return true if the data was able to be converted to a number
+ */
+ bool appendAsNumber( const StringData& fieldName , const string& data );
+
+ /** Append a BSON Object ID (OID type).
+            @deprecated Generally, it is preferred to use the append(name, oid)
+ method for this.
+ */
+ BSONObjBuilder& appendOID(const StringData& fieldName, OID *oid = 0 , bool generateIfBlank = false ) {
+ _b.appendNum((char) jstOID);
+ _b.appendStr(fieldName);
+ if ( oid )
+ _b.appendBuf( (void *) oid, 12 );
+ else {
+ OID tmp;
+ if ( generateIfBlank )
+ tmp.init();
+ else
+ tmp.clear();
+ _b.appendBuf( (void *) &tmp, 12 );
+ }
+ return *this;
+ }
+
+ /**
+ Append a BSON Object ID.
+ @param fieldName Field name, e.g., "_id".
+ @returns the builder object
+ */
+ BSONObjBuilder& append( const StringData& fieldName, OID oid ) {
+ _b.appendNum((char) jstOID);
+ _b.appendStr(fieldName);
+ _b.appendBuf( (void *) &oid, 12 );
+ return *this;
+ }
+
+ /**
+ Generate and assign an object id for the _id field.
+ _id should be the first element in the object for good performance.
+ */
+ BSONObjBuilder& genOID() {
+ return append("_id", OID::gen());
+ }
+
+ /** Append a time_t date.
+ @param dt a C-style 32 bit date value, that is
+ the number of seconds since January 1, 1970, 00:00:00 GMT
+ */
+ BSONObjBuilder& appendTimeT(const StringData& fieldName, time_t dt) {
+ _b.appendNum((char) Date);
+ _b.appendStr(fieldName);
+ _b.appendNum(static_cast<unsigned long long>(dt) * 1000);
+ return *this;
+ }
+ /** Append a date.
+ @param dt a Java-style 64 bit date value, that is
+ the number of milliseconds since January 1, 1970, 00:00:00 GMT
+ */
+ BSONObjBuilder& appendDate(const StringData& fieldName, Date_t dt) {
+ /* easy to pass a time_t to this and get a bad result. thus this warning. */
+#if defined(_DEBUG) && defined(MONGO_EXPOSE_MACROS)
+ if( dt > 0 && dt <= 0xffffffff ) {
+ static int n;
+ if( n++ == 0 )
+ log() << "DEV WARNING appendDate() called with a tiny (but nonzero) date" << endl;
+ }
+#endif
+ _b.appendNum((char) Date);
+ _b.appendStr(fieldName);
+ _b.appendNum(dt);
+ return *this;
+ }
+ BSONObjBuilder& append(const StringData& fieldName, Date_t dt) {
+ return appendDate(fieldName, dt);
+ }
+
+ /** Append a regular expression value
+ @param regex the regular expression pattern
+            @param options regex options such as "i" or "g"
+ */
+ BSONObjBuilder& appendRegex(const StringData& fieldName, const StringData& regex, const StringData& options = "") {
+ _b.appendNum((char) RegEx);
+ _b.appendStr(fieldName);
+ _b.appendStr(regex);
+ _b.appendStr(options);
+ return *this;
+ }
+
+ BSONObjBuilder& appendCode(const StringData& fieldName, const StringData& code) {
+ _b.appendNum((char) Code);
+ _b.appendStr(fieldName);
+ _b.appendNum((int) code.size()+1);
+ _b.appendStr(code);
+ return *this;
+ }
+
+ /** Append a string element.
+ @param sz size includes terminating null character */
+ BSONObjBuilder& append(const StringData& fieldName, const char *str, int sz) {
+ _b.appendNum((char) String);
+ _b.appendStr(fieldName);
+ _b.appendNum((int)sz);
+ _b.appendBuf(str, sz);
+ return *this;
+ }
+ /** Append a string element */
+ BSONObjBuilder& append(const StringData& fieldName, const char *str) {
+ return append(fieldName, str, (int) strlen(str)+1);
+ }
+ /** Append a string element */
+ BSONObjBuilder& append(const StringData& fieldName, const string& str) {
+ return append(fieldName, str.c_str(), (int) str.size()+1);
+ }
+
+ BSONObjBuilder& appendSymbol(const StringData& fieldName, const StringData& symbol) {
+ _b.appendNum((char) Symbol);
+ _b.appendStr(fieldName);
+ _b.appendNum((int) symbol.size()+1);
+ _b.appendStr(symbol);
+ return *this;
+ }
+
+ /** Append a Null element to the object */
+ BSONObjBuilder& appendNull( const StringData& fieldName ) {
+ _b.appendNum( (char) jstNULL );
+ _b.appendStr( fieldName );
+ return *this;
+ }
+
+ // Append an element that is less than all other keys.
+ BSONObjBuilder& appendMinKey( const StringData& fieldName ) {
+ _b.appendNum( (char) MinKey );
+ _b.appendStr( fieldName );
+ return *this;
+ }
+ // Append an element that is greater than all other keys.
+ BSONObjBuilder& appendMaxKey( const StringData& fieldName ) {
+ _b.appendNum( (char) MaxKey );
+ _b.appendStr( fieldName );
+ return *this;
+ }
+
+ // Append a Timestamp field -- will be updated to next OpTime on db insert.
+ BSONObjBuilder& appendTimestamp( const StringData& fieldName ) {
+ _b.appendNum( (char) Timestamp );
+ _b.appendStr( fieldName );
+ _b.appendNum( (unsigned long long) 0 );
+ return *this;
+ }
+
+ BSONObjBuilder& appendTimestamp( const StringData& fieldName , unsigned long long val ) {
+ _b.appendNum( (char) Timestamp );
+ _b.appendStr( fieldName );
+ _b.appendNum( val );
+ return *this;
+ }
+
+ /**
+ Timestamps are a special BSON datatype that is used internally for replication.
+           Append a timestamp element to the object being built.
+ @param time - in millis (but stored in seconds)
+ */
+ BSONObjBuilder& appendTimestamp( const StringData& fieldName , unsigned long long time , unsigned int inc );
+
+ /*
+ Append an element of the deprecated DBRef type.
+ @deprecated
+ */
+ BSONObjBuilder& appendDBRef( const StringData& fieldName, const StringData& ns, const OID &oid ) {
+ _b.appendNum( (char) DBRef );
+ _b.appendStr( fieldName );
+ _b.appendNum( (int) ns.size() + 1 );
+ _b.appendStr( ns );
+ _b.appendBuf( (void *) &oid, 12 );
+ return *this;
+ }
+
+ /** Append a binary data element
+ @param fieldName name of the field
+ @param len length of the binary data in bytes
+            @param type subtype information for the data. @see enum BinDataType in bsontypes.h.
+ Use BinDataGeneral if you don't care about the type.
+ @param data the byte array
+ */
+ BSONObjBuilder& appendBinData( const StringData& fieldName, int len, BinDataType type, const void *data ) {
+ _b.appendNum( (char) BinData );
+ _b.appendStr( fieldName );
+ _b.appendNum( len );
+ _b.appendNum( (char) type );
+ _b.appendBuf( data, len );
+ return *this;
+ }
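+
+        /* Editorial usage sketch (not from the original header):
+
+               unsigned char bytes[] = { 0xde, 0xad, 0xbe, 0xef };
+               BSONObjBuilder b;
+               b.appendBinData( "payload", sizeof(bytes), BinDataGeneral, bytes );
+        */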
+
+ /**
+           Append a BSON bindata bytearray element of subtype 2 (this subtype is deprecated).
+ @param data a byte array
+ @param len the length of data
+ */
+ BSONObjBuilder& appendBinDataArrayDeprecated( const char * fieldName , const void * data , int len ) {
+ _b.appendNum( (char) BinData );
+ _b.appendStr( fieldName );
+ _b.appendNum( len + 4 );
+ _b.appendNum( (char)0x2 );
+ _b.appendNum( len );
+ _b.appendBuf( data, len );
+ return *this;
+ }
+
+ /** Append to the BSON object a field of type CodeWScope. This is a javascript code
+ fragment accompanied by some scope that goes with it.
+ */
+ BSONObjBuilder& appendCodeWScope( const StringData& fieldName, const StringData& code, const BSONObj &scope ) {
+ _b.appendNum( (char) CodeWScope );
+ _b.appendStr( fieldName );
+ _b.appendNum( ( int )( 4 + 4 + code.size() + 1 + scope.objsize() ) );
+ _b.appendNum( ( int ) code.size() + 1 );
+ _b.appendStr( code );
+ _b.appendBuf( ( void * )scope.objdata(), scope.objsize() );
+ return *this;
+ }
+
+ void appendUndefined( const StringData& fieldName ) {
+ _b.appendNum( (char) Undefined );
+ _b.appendStr( fieldName );
+ }
+
+ /* helper function -- see Query::where() for primary way to do this. */
+ void appendWhere( const StringData& code, const BSONObj &scope ) {
+ appendCodeWScope( "$where" , code , scope );
+ }
+
+ /**
+ these are the min/max when comparing, not strict min/max elements for a given type
+ */
+ void appendMinForType( const StringData& fieldName , int type );
+ void appendMaxForType( const StringData& fieldName , int type );
+
+ /** Append an array of values. */
+ template < class T >
+ BSONObjBuilder& append( const StringData& fieldName, const vector< T >& vals );
+
+ template < class T >
+ BSONObjBuilder& append( const StringData& fieldName, const list< T >& vals );
+
+ /** Append a set of values. */
+ template < class T >
+ BSONObjBuilder& append( const StringData& fieldName, const set< T >& vals );
+
+ /**
+ * destructive
+ * The returned BSONObj will free the buffer when it is finished.
+ * @return owned BSONObj
+ */
+ BSONObj obj() {
+ bool own = owned();
+ massert( 10335 , "builder does not own memory", own );
+ doneFast();
+ BSONObj::Holder* h = (BSONObj::Holder*)_b.buf();
+ decouple(); // sets _b.buf() to NULL
+ return BSONObj(h);
+ }
+
+ /** Fetch the object we have built.
+ BSONObjBuilder still frees the object when the builder goes out of
+ scope -- very important to keep in mind. Use obj() if you
+ would like the BSONObj to last longer than the builder.
+ */
+ BSONObj done() {
+ return BSONObj(_done());
+ }
+
+ // Like 'done' above, but does not construct a BSONObj to return to the caller.
+ void doneFast() {
+ (void)_done();
+ }
+
+ /** Peek at what is in the builder, but leave the builder ready for more appends.
+ The returned object is only valid until the next modification or destruction of the builder.
+ Intended use case: append a field if not already there.
+ */
+ BSONObj asTempObj() {
+ BSONObj temp(_done());
+ _b.setlen(_b.len()-1); //next append should overwrite the EOO
+ _doneCalled = false;
+ return temp;
+ }
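+
+        /* Editorial sketch contrasting the accessors above (not from the original
+           header):
+
+               BSONObjBuilder b;
+               b.append( "x", 1 );
+               BSONObj peek = b.asTempObj();   // builder still usable afterwards
+               BSONObj view = b.done();        // freed when b goes out of scope
+               // or: BSONObj owned = b.obj(); // caller takes ownership instead
+        */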
+
+ /* assume ownership of the buffer - you must then free it (with free()) */
+ char* decouple(int& l) {
+ char *x = _done();
+ assert( x );
+ l = _b.len();
+ _b.decouple();
+ return x;
+ }
+ void decouple() {
+ _b.decouple(); // post done() call version. be sure jsobj frees...
+ }
+
+ void appendKeys( const BSONObj& keyPattern , const BSONObj& values );
+
+ static string numStr( int i ) {
+ if (i>=0 && i<100 && numStrsReady)
+ return numStrs[i];
+ StringBuilder o;
+ o << i;
+ return o.str();
+ }
+
+ /** Stream oriented way to add field names and values. */
+ BSONObjBuilderValueStream &operator<<(const char * name ) {
+ _s.endField( name );
+ return _s;
+ }
+
+ /** Stream oriented way to add field names and values. */
+ BSONObjBuilder& operator<<( GENOIDLabeler ) { return genOID(); }
+
+ // prevent implicit string conversions which would allow bad things like BSON( BSON( "foo" << 1 ) << 2 )
+ struct ForceExplicitString {
+ ForceExplicitString( const string &str ) : str_( str ) {}
+ string str_;
+ };
+
+ /** Stream oriented way to add field names and values. */
+ BSONObjBuilderValueStream &operator<<( const ForceExplicitString& name ) {
+ return operator<<( name.str_.c_str() );
+ }
+
+ Labeler operator<<( const Labeler::Label &l ) {
+ massert( 10336 , "No subobject started", _s.subobjStarted() );
+ return _s << l;
+ }
+
+ template<typename T>
+ BSONObjBuilderValueStream& operator<<( const BSONField<T>& f ) {
+ _s.endField( f.name().c_str() );
+ return _s;
+ }
+
+ template<typename T>
+ BSONObjBuilder& operator<<( const BSONFieldValue<T>& v ) {
+ append( v.name().c_str() , v.value() );
+ return *this;
+ }
+
+ BSONObjBuilder& operator<<( const BSONElement& e ){
+ append( e );
+ return *this;
+ }
+
+ /** @return true if we are using our own bufbuilder, and not an alternate that was given to us in our constructor */
+ bool owned() const { return &_b == &_buf; }
+
+ BSONObjIterator iterator() const ;
+
+ bool hasField( const StringData& name ) const ;
+
+ int len() const { return _b.len(); }
+
+ BufBuilder& bb() { return _b; }
+
+ private:
+ char* _done() {
+ if ( _doneCalled )
+ return _b.buf() + _offset;
+
+ _doneCalled = true;
+ _s.endField();
+ _b.appendNum((char) EOO);
+ char *data = _b.buf() + _offset;
+ int size = _b.len() - _offset;
+ *((int*)data) = size;
+ if ( _tracker )
+ _tracker->got( size );
+ return data;
+ }
+
+ BufBuilder &_b;
+ BufBuilder _buf;
+ int _offset;
+ BSONObjBuilderValueStream _s;
+ BSONSizeTracker * _tracker;
+ bool _doneCalled;
+
+ static const string numStrs[100]; // cache of 0 to 99 inclusive
+ static bool numStrsReady; // for static init safety. see comments in db/jsobj.cpp
+ };
+
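+    /** Builds BSON arrays; element indices ("0", "1", ...) are generated
+        automatically. Editorial usage sketch (not from the original header):
+
+            BSONArrayBuilder ab;
+            ab.append( 1 );
+            ab << "two";
+            BSONArray arr = ab.arr();       // [ 1, "two" ]
+    */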
+ class BSONArrayBuilder : boost::noncopyable {
+ public:
+ BSONArrayBuilder() : _i(0), _b() {}
+ BSONArrayBuilder( BufBuilder &_b ) : _i(0), _b(_b) {}
+ BSONArrayBuilder( int initialSize ) : _i(0), _b(initialSize) {}
+
+ template <typename T>
+ BSONArrayBuilder& append(const T& x) {
+ _b.append(num(), x);
+ return *this;
+ }
+
+ BSONArrayBuilder& append(const BSONElement& e) {
+ _b.appendAs(e, num());
+ return *this;
+ }
+
+ template <typename T>
+ BSONArrayBuilder& operator<<(const T& x) {
+ return append(x);
+ }
+
+ void appendNull() {
+ _b.appendNull(num());
+ }
+
+ /**
+ * destructive - ownership moves to returned BSONArray
+ * @return owned BSONArray
+ */
+ BSONArray arr() { return BSONArray(_b.obj()); }
+
+ BSONObj done() { return _b.done(); }
+
+ void doneFast() { _b.doneFast(); }
+
+ template <typename T>
+ BSONArrayBuilder& append(const StringData& name, const T& x) {
+ fill( name );
+ append( x );
+ return *this;
+ }
+
+ // These two just use next position
+ BufBuilder &subobjStart() { return _b.subobjStart( num() ); }
+ BufBuilder &subarrayStart() { return _b.subarrayStart( num() ); }
+
+        // These fill missing entries up to pos with nulls. If pos is less than the next position, it is ignored.
+ BufBuilder &subobjStart(int pos) {
+ fill(pos);
+ return _b.subobjStart( num() );
+ }
+ BufBuilder &subarrayStart(int pos) {
+ fill(pos);
+ return _b.subarrayStart( num() );
+ }
+
+        // These should only be used where you really need interface compatibility with BSONObjBuilder.
+ // Currently they are only used by update.cpp and it should probably stay that way
+ BufBuilder &subobjStart( const StringData& name ) {
+ fill( name );
+ return _b.subobjStart( num() );
+ }
+
+ BufBuilder &subarrayStart( const char *name ) {
+ fill( name );
+ return _b.subarrayStart( num() );
+ }
+
+ void appendArray( const StringData& name, BSONObj subObj ) {
+ fill( name );
+ _b.appendArray( num(), subObj );
+ }
+
+ void appendAs( const BSONElement &e, const char *name) {
+ fill( name );
+ append( e );
+ }
+
+ int len() const { return _b.len(); }
+ int arrSize() const { return _i; }
+
+ private:
+ // These two are undefined privates to prevent their accidental
+ // use as we don't support unsigned ints in BSON
+ BSONObjBuilder& append(const StringData& fieldName, unsigned int val);
+ BSONObjBuilder& append(const StringData& fieldName, unsigned long long val);
+
+ void fill( const StringData& name ) {
+ char *r;
+ long int n = strtol( name.data(), &r, 10 );
+ if ( *r )
+ uasserted( 13048, (string)"can't append to array using string field name [" + name.data() + "]" );
+ fill(n);
+ }
+
+        void fill( int upTo ) {
+ // if this is changed make sure to update error message and jstests/set7.js
+ const int maxElems = 1500000;
+ BOOST_STATIC_ASSERT(maxElems < (BSONObjMaxUserSize/10));
+ uassert(15891, "can't backfill array to larger than 1,500,000 elements", upTo <= maxElems);
+
+ while( _i < upTo )
+ append( nullElt() );
+ }
+
+ static BSONElement nullElt() {
+ static BSONObj n = nullObj();
+ return n.firstElement();
+ }
+
+ static BSONObj nullObj() {
+ BSONObjBuilder _b;
+ _b.appendNull( "" );
+ return _b.obj();
+ }
+
+ string num() { return _b.numStr(_i++); }
+ int _i;
+ BSONObjBuilder _b;
+ };
+
+ template < class T >
+ inline BSONObjBuilder& BSONObjBuilder::append( const StringData& fieldName, const vector< T >& vals ) {
+ BSONObjBuilder arrBuilder;
+ for ( unsigned int i = 0; i < vals.size(); ++i )
+ arrBuilder.append( numStr( i ), vals[ i ] );
+ appendArray( fieldName, arrBuilder.done() );
+ return *this;
+ }
+
+ template < class L >
+ inline BSONObjBuilder& _appendIt( BSONObjBuilder& _this, const StringData& fieldName, const L& vals ) {
+ BSONObjBuilder arrBuilder;
+ int n = 0;
+ for( typename L::const_iterator i = vals.begin(); i != vals.end(); i++ )
+ arrBuilder.append( BSONObjBuilder::numStr(n++), *i );
+ _this.appendArray( fieldName, arrBuilder.done() );
+ return _this;
+ }
+
+ template < class T >
+ inline BSONObjBuilder& BSONObjBuilder::append( const StringData& fieldName, const list< T >& vals ) {
+ return _appendIt< list< T > >( *this, fieldName, vals );
+ }
+
+ template < class T >
+ inline BSONObjBuilder& BSONObjBuilder::append( const StringData& fieldName, const set< T >& vals ) {
+ return _appendIt< set< T > >( *this, fieldName, vals );
+ }
+
+
+    // $or helper: OR(BSON("x" << GT << 7), BSON("y" << LT << 6));
+ inline BSONObj OR(const BSONObj& a, const BSONObj& b)
+ { return BSON( "$or" << BSON_ARRAY(a << b) ); }
+ inline BSONObj OR(const BSONObj& a, const BSONObj& b, const BSONObj& c)
+ { return BSON( "$or" << BSON_ARRAY(a << b << c) ); }
+ inline BSONObj OR(const BSONObj& a, const BSONObj& b, const BSONObj& c, const BSONObj& d)
+ { return BSON( "$or" << BSON_ARRAY(a << b << c << d) ); }
+ inline BSONObj OR(const BSONObj& a, const BSONObj& b, const BSONObj& c, const BSONObj& d, const BSONObj& e)
+ { return BSON( "$or" << BSON_ARRAY(a << b << c << d << e) ); }
+ inline BSONObj OR(const BSONObj& a, const BSONObj& b, const BSONObj& c, const BSONObj& d, const BSONObj& e, const BSONObj& f)
+ { return BSON( "$or" << BSON_ARRAY(a << b << c << d << e << f) ); }
+
+}
diff --git a/src/mongo/bson/bsonobjiterator.h b/src/mongo/bson/bsonobjiterator.h
new file mode 100644
index 00000000000..39ae24d9b86
--- /dev/null
+++ b/src/mongo/bson/bsonobjiterator.h
@@ -0,0 +1,161 @@
+// bsonobjiterator.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <boost/preprocessor/cat.hpp> // like the ## operator but works with __LINE__
+
+namespace mongo {
+
+ /** iterator for a BSONObj
+
+       Note each BSONObj ends with an EOO element. more() does not report this
+       terminating element (so it is false for an empty object), but moreWithEOO() does.
+
+       The BSONObj must stay in scope for the lifetime of the iterator.
+
+ todo: we may want to make a more stl-like iterator interface for this
+ with things like begin() and end()
+ */
+ class BSONObjIterator {
+ public:
+ /** Create an iterator for a BSON object.
+ */
+ BSONObjIterator(const BSONObj& jso) {
+ int sz = jso.objsize();
+ if ( MONGO_unlikely(sz == 0) ) {
+ _pos = _theend = 0;
+ return;
+ }
+ _pos = jso.objdata() + 4;
+ _theend = jso.objdata() + sz - 1;
+ }
+
+ BSONObjIterator( const char * start , const char * end ) {
+ _pos = start + 4;
+ _theend = end - 1;
+ }
+
+ /** @return true if more elements exist to be enumerated. */
+ bool more() { return _pos < _theend; }
+
+ /** @return true if more elements exist to be enumerated INCLUDING the EOO element which is always at the end. */
+ bool moreWithEOO() { return _pos <= _theend; }
+
+ /** @return the next element in the object. For the final element, element.eoo() will be true. */
+ BSONElement next( bool checkEnd ) {
+ assert( _pos <= _theend );
+ BSONElement e( _pos, checkEnd ? (int)(_theend + 1 - _pos) : -1 );
+ _pos += e.size( checkEnd ? (int)(_theend + 1 - _pos) : -1 );
+ return e;
+ }
+ BSONElement next() {
+ assert( _pos <= _theend );
+ BSONElement e(_pos);
+ _pos += e.size();
+ return e;
+ }
+ void operator++() { next(); }
+ void operator++(int) { next(); }
+
+ BSONElement operator*() {
+ assert( _pos <= _theend );
+ return BSONElement(_pos);
+ }
+
+ private:
+ const char* _pos;
+ const char* _theend;
+ };
+
+ class BSONObjIteratorSorted {
+ public:
+ BSONObjIteratorSorted( const BSONObj& o );
+
+ ~BSONObjIteratorSorted() {
+ assert( _fields );
+ delete[] _fields;
+ _fields = 0;
+ }
+
+ bool more() {
+ return _cur < _nfields;
+ }
+
+ BSONElement next() {
+ assert( _fields );
+ if ( _cur < _nfields )
+ return BSONElement( _fields[_cur++] );
+ return BSONElement();
+ }
+
+ private:
+ const char ** _fields;
+ int _nfields;
+ int _cur;
+ };
+
+ /** transform a BSON array into a vector of BSONElements.
+        We match array index positions with vector positions, and ignore
+        any fields with non-numeric field names.
+ */
+ inline vector<BSONElement> BSONElement::Array() const {
+ chk(mongo::Array);
+ vector<BSONElement> v;
+ BSONObjIterator i(Obj());
+ while( i.more() ) {
+ BSONElement e = i.next();
+ const char *f = e.fieldName();
+ try {
+ unsigned u = stringToNum(f);
+ assert( u < 1000000 );
+ if( u >= v.size() )
+ v.resize(u+1);
+ v[u] = e;
+ }
+ catch(unsigned) { }
+ }
+ return v;
+ }
+
+ /** Similar to BOOST_FOREACH
+ *
+ * because the iterator is defined outside of the for, you must use {} around
+ * the surrounding scope. Don't do this:
+ *
+ * if (foo)
+ * BSONForEach(e, obj)
+ * doSomething(e);
+ *
+ * but this is OK:
+ *
+ * if (foo) {
+ * BSONForEach(e, obj)
+ * doSomething(e);
+ * }
+ *
+ */
+
+#define BSONForEach(e, obj) \
+ BSONObjIterator BOOST_PP_CAT(it_,__LINE__)(obj); \
+ for ( BSONElement e; \
+ (BOOST_PP_CAT(it_,__LINE__).more() ? \
+ (e = BOOST_PP_CAT(it_,__LINE__).next(), true) : \
+ false) ; \
+ /*nothing*/ )
+
+}
diff --git a/src/mongo/bson/bsontypes.h b/src/mongo/bson/bsontypes.h
new file mode 100644
index 00000000000..9d95e8e9ad4
--- /dev/null
+++ b/src/mongo/bson/bsontypes.h
@@ -0,0 +1,107 @@
+// bsontypes.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/misc.h"
+
+namespace bson { }
+
+namespace mongo {
+
+ using namespace std;
+
+ class BSONArrayBuilder;
+ class BSONElement;
+ class BSONObj;
+ class BSONObjBuilder;
+ class BSONObjBuilderValueStream;
+ class BSONObjIterator;
+ class Ordering;
+ class Record;
+ struct BSONArray; // empty subclass of BSONObj useful for overloading
+ struct BSONElementCmpWithoutField;
+
+ extern BSONObj maxKey;
+ extern BSONObj minKey;
+
+ /**
+ the complete list of valid BSON types
+ see also bsonspec.org
+ */
+ enum BSONType {
+ /** smaller than all other types */
+ MinKey=-1,
+ /** end of object */
+ EOO=0,
+ /** double precision floating point value */
+ NumberDouble=1,
+ /** character string, stored in utf8 */
+ String=2,
+ /** an embedded object */
+ Object=3,
+ /** an embedded array */
+ Array=4,
+ /** binary data */
+ BinData=5,
+ /** Undefined type */
+ Undefined=6,
+ /** ObjectId */
+ jstOID=7,
+ /** boolean type */
+ Bool=8,
+ /** date type */
+ Date=9,
+ /** null type */
+ jstNULL=10,
+ /** regular expression, a pattern with options */
+ RegEx=11,
+ /** deprecated / will be redesigned */
+ DBRef=12,
+ /** deprecated / use CodeWScope */
+ Code=13,
+ /** a programming language (e.g., Python) symbol */
+ Symbol=14,
+ /** javascript code that can execute on the database server, with SavedContext */
+ CodeWScope=15,
+ /** 32 bit signed integer */
+ NumberInt = 16,
+ /** Updated to a Date with value next OpTime on insert */
+ Timestamp = 17,
+ /** 64 bit integer */
+ NumberLong = 18,
+ /** max type that is not MaxKey */
+ JSTypeMax=18,
+ /** larger than all other types */
+ MaxKey=127
+ };
+
+ /* subtypes of BinData.
+ bdtCustom and above are ones that the JS compiler understands, but are
+ opaque to the database.
+ */
+ enum BinDataType {
+ BinDataGeneral=0,
+ Function=1,
+        ByteArrayDeprecated=2, /* use BinDataGeneral instead */
+ bdtUUID = 3,
+ MD5Type=5,
+ bdtCustom=128
+ };
+
+}
+
diff --git a/src/mongo/bson/inline_decls.h b/src/mongo/bson/inline_decls.h
new file mode 100644
index 00000000000..30da9b4560d
--- /dev/null
+++ b/src/mongo/bson/inline_decls.h
@@ -0,0 +1,68 @@
+// inline_decls.h
+
+/* Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#if defined(__GNUC__)
+
+#define NOINLINE_DECL __attribute__((noinline))
+
+#elif defined(_MSC_VER)
+
+#define NOINLINE_DECL __declspec(noinline)
+
+#else
+
+#define NOINLINE_DECL
+
+#endif
+
+namespace mongo {
+
+/* Note: do not clutter code with these -- ONLY use in hot spots / significant loops. */
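+
+/* Editorial sketch of intended use (not from the original source; 'n' is a
+   hypothetical hot-loop variable):
+
+       if ( MONGO_unlikely( n == 0 ) )
+           return;      // rare path; hints the compiler to favor the fall-through
+*/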
+
+#if !defined(__GNUC__)
+
+// branch prediction. indicate we expect to be true
+# define MONGO_likely(x) ((bool)(x))
+
+// branch prediction. indicate we expect to be false
+# define MONGO_unlikely(x) ((bool)(x))
+
+# if defined(_WIN32)
+ // prefetch data from memory
+ inline void prefetch(const void *p) {
+#if defined(_MM_HINT_T0)
+ _mm_prefetch((char *) p, _MM_HINT_T0);
+#endif
+ }
+#else
+ inline void prefetch(void *p) { }
+#endif
+
+#else
+
+# define MONGO_likely(x) ( __builtin_expect((bool)(x), 1) )
+# define MONGO_unlikely(x) ( __builtin_expect((bool)(x), 0) )
+
+ inline void prefetch(void *p) {
+ __builtin_prefetch(p);
+ }
+
+#endif
+
+}
diff --git a/src/mongo/bson/oid.cpp b/src/mongo/bson/oid.cpp
new file mode 100644
index 00000000000..3aee14a3585
--- /dev/null
+++ b/src/mongo/bson/oid.cpp
@@ -0,0 +1,173 @@
+// @file oid.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "oid.h"
+#include "util/atomic_int.h"
+#include "../db/nonce.h"
+#include "bsonobjbuilder.h"
+
+BOOST_STATIC_ASSERT( sizeof(mongo::OID) == 12 );
+
+namespace mongo {
+
+ // machine # before folding in the process id
+ OID::MachineAndPid OID::ourMachine;
+
+ unsigned OID::ourPid() {
+ unsigned pid;
+#if defined(_WIN32)
+ pid = (unsigned short) GetCurrentProcessId();
+#elif defined(__linux__) || defined(__APPLE__) || defined(__sunos__)
+ pid = (unsigned short) getpid();
+#else
+ pid = (unsigned short) Security::getNonce();
+#endif
+ return pid;
+ }
+
+ void OID::foldInPid(OID::MachineAndPid& x) {
+ unsigned p = ourPid();
+ x._pid ^= (unsigned short) p;
+        // when the pid needs more than 16 bits, let the high bits modulate the machine id field.
+ unsigned short& rest = (unsigned short &) x._machineNumber[1];
+ rest ^= p >> 16;
+ }
+
+ OID::MachineAndPid OID::genMachineAndPid() {
+ BOOST_STATIC_ASSERT( sizeof(mongo::OID::MachineAndPid) == 5 );
+
+ // this is not called often, so the following is not expensive, and gives us some
+ // testing that nonce generation is working right and that our OIDs are (perhaps) ok.
+ {
+ nonce64 a = Security::getNonceDuringInit();
+ nonce64 b = Security::getNonceDuringInit();
+ nonce64 c = Security::getNonceDuringInit();
+ assert( !(a==b && b==c) );
+ }
+
+ unsigned long long n = Security::getNonceDuringInit();
+ OID::MachineAndPid x = ourMachine = (OID::MachineAndPid&) n;
+ foldInPid(x);
+ return x;
+ }
+
+ // after folding in the process id
+ OID::MachineAndPid OID::ourMachineAndPid = OID::genMachineAndPid();
+
+ void OID::regenMachineId() {
+ ourMachineAndPid = genMachineAndPid();
+ }
+
+ inline bool OID::MachineAndPid::operator!=(const OID::MachineAndPid& rhs) const {
+ return _pid != rhs._pid || _machineNumber != rhs._machineNumber;
+ }
+
+ unsigned OID::getMachineId() {
+ unsigned char x[4];
+ x[0] = ourMachineAndPid._machineNumber[0];
+ x[1] = ourMachineAndPid._machineNumber[1];
+ x[2] = ourMachineAndPid._machineNumber[2];
+ x[3] = 0;
+ return (unsigned&) x[0];
+ }
+
+ void OID::justForked() {
+ MachineAndPid x = ourMachine;
+ // we let the random # for machine go into all 5 bytes of MachineAndPid, and then
+ // xor in the pid into _pid. this reduces the probability of collisions.
+ foldInPid(x);
+ ourMachineAndPid = genMachineAndPid();
+ assert( x != ourMachineAndPid );
+ ourMachineAndPid = x;
+ }
+
+ void OID::init() {
+ static AtomicUInt inc = (unsigned) Security::getNonce();
+
+ {
+ unsigned t = (unsigned) time(0);
+ unsigned char *T = (unsigned char *) &t;
+ _time[0] = T[3]; // big endian order because we use memcmp() to compare OID's
+ _time[1] = T[2];
+ _time[2] = T[1];
+ _time[3] = T[0];
+ }
+
+ _machineAndPid = ourMachineAndPid;
+
+ {
+ int new_inc = inc++;
+ unsigned char *T = (unsigned char *) &new_inc;
+ _inc[0] = T[2];
+ _inc[1] = T[1];
+ _inc[2] = T[0];
+ }
+ }
+
+ void OID::init( string s ) {
+ assert( s.size() == 24 );
+ const char *p = s.c_str();
+ for( int i = 0; i < 12; i++ ) {
+ data[i] = fromHex(p);
+ p += 2;
+ }
+ }
+
+ void OID::init(Date_t date, bool max) {
+ int time = (int) (date / 1000);
+ char* T = (char *) &time;
+ data[0] = T[3];
+ data[1] = T[2];
+ data[2] = T[1];
+ data[3] = T[0];
+
+ if (max)
+ *(long long*)(data + 4) = 0xFFFFFFFFFFFFFFFFll;
+ else
+ *(long long*)(data + 4) = 0x0000000000000000ll;
+ }
+
+ time_t OID::asTimeT() {
+ int time;
+ char* T = (char *) &time;
+ T[0] = data[3];
+ T[1] = data[2];
+ T[2] = data[1];
+ T[3] = data[0];
+ return time;
+ }
+
+ const string BSONObjBuilder::numStrs[] = {
+ "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
+ "10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
+ "20", "21", "22", "23", "24", "25", "26", "27", "28", "29",
+ "30", "31", "32", "33", "34", "35", "36", "37", "38", "39",
+ "40", "41", "42", "43", "44", "45", "46", "47", "48", "49",
+ "50", "51", "52", "53", "54", "55", "56", "57", "58", "59",
+ "60", "61", "62", "63", "64", "65", "66", "67", "68", "69",
+ "70", "71", "72", "73", "74", "75", "76", "77", "78", "79",
+ "80", "81", "82", "83", "84", "85", "86", "87", "88", "89",
+ "90", "91", "92", "93", "94", "95", "96", "97", "98", "99",
+ };
+
+    // This is to ensure that BSONObjBuilder doesn't try to use numStrs before the strings have been constructed.
+    // Making numStrs a char[][] was tested, but the overhead of building a std::string from it on every call was too high.
+    // numStrsReady will be 0 until after numStrs is initialized because it is a static variable.
+ bool BSONObjBuilder::numStrsReady = (numStrs[0].size() > 0);
+
+}
diff --git a/src/mongo/bson/oid.h b/src/mongo/bson/oid.h
new file mode 100644
index 00000000000..e5963a0e81d
--- /dev/null
+++ b/src/mongo/bson/oid.h
@@ -0,0 +1,138 @@
+// oid.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <boost/functional/hash.hpp>
+#include "../util/hex.h"
+
+namespace mongo {
+
+#pragma pack(1)
+ /** Object ID type.
+ BSON objects typically have an _id field for the object id. This field should be the first
+ member of the object when present. class OID is a special type that is a 12 byte id which
+ is likely to be unique to the system. You may also use other types for _id's.
+ When _id field is missing from a BSON object, on an insert the database may insert one
+ automatically in certain circumstances.
+
+        Warning: You must call OID::justForked() after a fork().
+
+        The typical content of a BSON ObjectID is a 12-byte value consisting of a 4-byte timestamp (seconds since epoch),
+        a 3-byte machine id, a 2-byte process id, and a 3-byte counter. Note that the timestamp and counter fields must
+        be stored big endian, unlike the rest of BSON. This is because they are compared byte-by-byte and we want to ensure
+        a mostly increasing order.
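+
+        A usage sketch (editorial example, not from the original comment):
+
+            OID o = OID::gen();
+            string hex = o.str();      // 24 hex digits
+            OID copy( hex );           // round-trips: copy == o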
+ */
+ class OID {
+ public:
+ OID() : a(0), b(0) { }
+
+ /** init from a 24 char hex string */
+ explicit OID(const string &s) { init(s); }
+
+ /** initialize to 'null' */
+ void clear() { a = 0; b = 0; }
+
+ const unsigned char *getData() const { return data; }
+
+ bool operator==(const OID& r) const { return a==r.a && b==r.b; }
+ bool operator!=(const OID& r) const { return a!=r.a || b!=r.b; }
+ int compare( const OID& other ) const { return memcmp( data , other.data , 12 ); }
+ bool operator<( const OID& other ) const { return compare( other ) < 0; }
+ bool operator<=( const OID& other ) const { return compare( other ) <= 0; }
+
+ /** @return the object ID output as 24 hex digits */
+ string str() const { return toHexLower(data, 12); }
+ string toString() const { return str(); }
+
+ static OID gen() { OID o; o.init(); return o; }
+
+ /** sets the contents to a new oid / randomized value */
+ void init();
+
+ /** init from a 24 char hex string */
+ void init( string s );
+
+ /** Set to the min/max OID that could be generated at given timestamp. */
+ void init( Date_t date, bool max=false );
+
+ time_t asTimeT();
+ Date_t asDateT() { return asTimeT() * (long long)1000; }
+
+ bool isSet() const { return a || b; }
+
+ void hash_combine(size_t &seed) const {
+ boost::hash_combine(seed, a);
+ boost::hash_combine(seed, b);
+ }
+
+ /** call this after a fork to update the process id */
+ static void justForked();
+
+ static unsigned getMachineId(); // features command uses
+ static void regenMachineId(); // used by unit tests
+
+ private:
+ struct MachineAndPid {
+ unsigned char _machineNumber[3];
+ unsigned short _pid;
+ bool operator!=(const OID::MachineAndPid& rhs) const;
+ };
+ static MachineAndPid ourMachine, ourMachineAndPid;
+ union {
+ struct {
+ // 12 bytes total
+ unsigned char _time[4];
+ MachineAndPid _machineAndPid;
+ unsigned char _inc[3];
+ };
+ struct {
+ long long a;
+ unsigned b;
+ };
+ unsigned char data[12];
+ };
+
+ static unsigned ourPid();
+ static void foldInPid(MachineAndPid& x);
+ static MachineAndPid genMachineAndPid();
+ };
+#pragma pack()
+
+ ostream& operator<<( ostream &s, const OID &o );
+ inline StringBuilder& operator<< (StringBuilder& s, const OID& o) { return (s << o.str()); }
+
+ /** Formatting mode for generating JSON from BSON.
+ See <http://mongodb.onconfluence.com/display/DOCS/Mongo+Extended+JSON>
+ for details.
+ */
+ enum JsonStringFormat {
+ /** strict RFC format */
+ Strict,
+ /** 10gen format, which is close to JS format. This form is understandable by
+ javascript running inside the Mongo server via eval() */
+ TenGen,
+ /** Javascript JSON compatible */
+ JS
+ };
+
+ inline ostream& operator<<( ostream &s, const OID &o ) {
+ s << o.str();
+ return s;
+ }
+
+}
diff --git a/src/mongo/bson/ordering.h b/src/mongo/bson/ordering.h
new file mode 100644
index 00000000000..bca3296f340
--- /dev/null
+++ b/src/mongo/bson/ordering.h
@@ -0,0 +1,73 @@
+// ordering.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace mongo {
+
+    // todo: ideally move to db/ instead of bson/, but eliminate any dependencies first
+
+ /** A precomputation of a BSON index or sort key pattern. That is something like:
+ { a : 1, b : -1 }
+ The constructor is private to make conversion more explicit so we notice where we call make().
+ Over time we should push this up higher and higher.
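+
+        A usage sketch (editorial example; assumes the BSON() stream macro is available):
+
+            Ordering ord = Ordering::make( BSON( "a" << 1 << "b" << -1 ) );
+            // ord.get(0) == 1, ord.get(1) == -1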
+ */
+ class Ordering {
+ unsigned bits;
+ Ordering(unsigned b) : bits(b) { }
+ public:
+ Ordering(const Ordering& r) : bits(r.bits) { }
+ void operator=(const Ordering& r) {
+ bits = r.bits;
+ }
+
+ /** so, for key pattern { a : 1, b : -1 }
+ get(0) == 1
+ get(1) == -1
+ */
+ int get(int i) const {
+ return ((1 << i) & bits) ? -1 : 1;
+ }
+
+ // for woCompare...
+ unsigned descending(unsigned mask) const { return bits & mask; }
+
+ /*operator string() const {
+ StringBuilder buf(32);
+ for ( unsigned i=0; i<nkeys; i++)
+ buf.append( get(i) > 0 ? "+" : "-" );
+ return buf.str();
+ }*/
+
+ static Ordering make(const BSONObj& obj) {
+ unsigned b = 0;
+ BSONObjIterator k(obj);
+ unsigned n = 0;
+ while( 1 ) {
+ BSONElement e = k.next();
+ if( e.eoo() )
+ break;
+ uassert( 13103, "too many compound keys", n <= 31 );
+ if( e.number() < 0 )
+ b |= (1 << n);
+ n++;
+ }
+ return Ordering(b);
+ }
+ };
+
+}
diff --git a/src/mongo/bson/stringdata.h b/src/mongo/bson/stringdata.h
new file mode 100644
index 00000000000..1fb4e7d25d3
--- /dev/null
+++ b/src/mongo/bson/stringdata.h
@@ -0,0 +1,71 @@
+// stringdata.h
+
+/* Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <string>
+#include <cstring>
+
+namespace mongo {
+
+ using std::string;
+
+ /** A StringData object wraps a 'const string&' or a 'const char*' without
+ * copying its contents. The most common usage is as a function argument that
+     * takes either of the two forms of strings above. Fundamentally, this class tries
+     * to get around the fact that string literals in C++ are char[N]'s.
+ *
+ * Note that the object StringData wraps around must be alive while the StringData
+ * is.
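+     *
+     *  A usage sketch (editorial example, not from the original comment):
+     *
+     *      void log( const StringData& s );      // hypothetical function
+     *      log( "literal" );                     // char* constructor
+     *      log( std::string( "dynamic" ) );      // string constructor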
+ */
+ class StringData {
+ public:
+ /** Construct a StringData, for the case where the length of
+ * string is not known. 'c' must be a pointer to a null-terminated string.
+ */
+ StringData( const char* c )
+ : _data(c), _size((unsigned) strlen(c)) {}
+
+ /** Construct a StringData explicitly, for the case where the length of the string
+     * is already known. 'c' must be a pointer to a null-terminated string, and 'len'
+     * must be the length that std::strlen(c) would return, i.e., the index of the
+ * terminator in c.
+ */
+ StringData( const char* c, unsigned len )
+ : _data(c), _size(len) {}
+
+ /** Construct a StringData, for the case of a std::string. */
+ StringData( const string& s )
+ : _data(s.c_str()), _size((unsigned) s.size()) {}
+
+ // Construct a StringData explicitly, for the case of a literal whose size is
+ // known at compile time.
+ struct LiteralTag {};
+ template<size_t N>
+ StringData( const char (&val)[N], LiteralTag )
+ : _data(&val[0]), _size(N-1) {}
+
+ // accessors
+ const char* data() const { return _data; }
+ unsigned size() const { return _size; }
+
+ private:
+ const char* const _data; // is always null terminated
+ const unsigned _size; // 'size' does not include the null terminator
+ };
+
+} // namespace mongo
diff --git a/src/mongo/bson/util/atomic_int.h b/src/mongo/bson/util/atomic_int.h
new file mode 100644
index 00000000000..e85a023c3bc
--- /dev/null
+++ b/src/mongo/bson/util/atomic_int.h
@@ -0,0 +1,106 @@
+// atomic_int.h
+// atomic wrapper for unsigned
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#if defined(_WIN32)
+# include <windows.h>
+#endif
+
+namespace mongo {
+
+ struct AtomicUInt {
+ AtomicUInt() : x(0) {}
+ AtomicUInt(unsigned z) : x(z) { }
+
+ operator unsigned() const { return x; }
+ unsigned get() const { return x; }
+
+ inline AtomicUInt operator++(); // ++prefix
+ inline AtomicUInt operator++(int);// postfix++
+ inline AtomicUInt operator--(); // --prefix
+ inline AtomicUInt operator--(int); // postfix--
+
+ inline void zero();
+
+ volatile unsigned x;
+ };
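+
+    /* Editorial usage sketch (not from the original source):
+
+           static AtomicUInt counter;
+           unsigned ticket = counter++;    // atomic fetch-and-increment
+    */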
+
+#if defined(_WIN32)
+ void AtomicUInt::zero() {
+ InterlockedExchange((volatile long*)&x, 0);
+ }
+ AtomicUInt AtomicUInt::operator++() {
+ return InterlockedIncrement((volatile long*)&x);
+ }
+ AtomicUInt AtomicUInt::operator++(int) {
+ return InterlockedIncrement((volatile long*)&x)-1;
+ }
+ AtomicUInt AtomicUInt::operator--() {
+ return InterlockedDecrement((volatile long*)&x);
+ }
+ AtomicUInt AtomicUInt::operator--(int) {
+ return InterlockedDecrement((volatile long*)&x)+1;
+ }
+#elif defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4)
+ // this is in GCC >= 4.1
+ inline void AtomicUInt::zero() { x = 0; } // TODO: this isn't thread safe - maybe
+ AtomicUInt AtomicUInt::operator++() {
+ return __sync_add_and_fetch(&x, 1);
+ }
+ AtomicUInt AtomicUInt::operator++(int) {
+ return __sync_fetch_and_add(&x, 1);
+ }
+ AtomicUInt AtomicUInt::operator--() {
+ return __sync_add_and_fetch(&x, -1);
+ }
+ AtomicUInt AtomicUInt::operator--(int) {
+ return __sync_fetch_and_add(&x, -1);
+ }
+#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+ inline void AtomicUInt::zero() { x = 0; } // TODO: this isn't thread safe
+ // from boost 1.39 interprocess/detail/atomic.hpp
+ inline unsigned atomic_int_helper(volatile unsigned *x, int val) {
+ int r;
+ asm volatile
+ (
+ "lock\n\t"
+ "xadd %1, %0":
+ "+m"( *x ), "=r"( r ): // outputs (%0, %1)
+ "1"( val ): // inputs (%2 == %1)
+ "memory", "cc" // clobbers
+ );
+ return r;
+ }
+ AtomicUInt AtomicUInt::operator++() {
+ return atomic_int_helper(&x, 1)+1;
+ }
+ AtomicUInt AtomicUInt::operator++(int) {
+ return atomic_int_helper(&x, 1);
+ }
+ AtomicUInt AtomicUInt::operator--() {
+ return atomic_int_helper(&x, -1)-1;
+ }
+ AtomicUInt AtomicUInt::operator--(int) {
+ return atomic_int_helper(&x, -1);
+ }
+#else
+# error "unsupported compiler or platform"
+#endif
+
+} // namespace mongo
diff --git a/src/mongo/bson/util/builder.h b/src/mongo/bson/util/builder.h
new file mode 100644
index 00000000000..f189f58b27e
--- /dev/null
+++ b/src/mongo/bson/util/builder.h
@@ -0,0 +1,322 @@
+/* builder.h */
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <string>
+#include <string.h>
+#include <stdio.h>
+#include "../inline_decls.h"
+#include "../stringdata.h"
+
+namespace mongo {
+
+    /* Note the limit here is rather arbitrary and is simply a standard: generally the code works
+       with any object that fits in RAM.
+
+       Also note that the server has some basic checks to enforce this limit, but those checks are not exhaustive --
+       for example, size still needs to be checked after:
+         update $push (append) operations
+         various db.eval() type operations
+ */
+ const int BSONObjMaxUserSize = 16 * 1024 * 1024;
+
+ /*
+       Sometimes we need objects slightly larger -- an object in the replication local.oplog
+       collection, for example, is slightly larger than a user object.
+ */
+ const int BSONObjMaxInternalSize = BSONObjMaxUserSize + ( 16 * 1024 );
+
+ const int BufferMaxSize = 64 * 1024 * 1024;
+
+ class StringBuilder;
+
+ void msgasserted(int msgid, const char *msg);
+
+ class TrivialAllocator {
+ public:
+ void* Malloc(size_t sz) { return malloc(sz); }
+ void* Realloc(void *p, size_t sz) { return realloc(p, sz); }
+ void Free(void *p) { free(p); }
+ };
+
+ class StackAllocator {
+ public:
+ enum { SZ = 512 };
+ void* Malloc(size_t sz) {
+ if( sz <= SZ ) return buf;
+ return malloc(sz);
+ }
+ void* Realloc(void *p, size_t sz) {
+ if( p == buf ) {
+ if( sz <= SZ ) return buf;
+ void *d = malloc(sz);
+ if ( d == 0 )
+ msgasserted( 15912 , "out of memory StackAllocator::Realloc" );
+ memcpy(d, p, SZ);
+ return d;
+ }
+ return realloc(p, sz);
+ }
+ void Free(void *p) {
+ if( p != buf )
+ free(p);
+ }
+ private:
+ char buf[SZ];
+ };
+
+ template< class Allocator >
+ class _BufBuilder {
+ // non-copyable, non-assignable
+ _BufBuilder( const _BufBuilder& );
+ _BufBuilder& operator=( const _BufBuilder& );
+ Allocator al;
+ public:
+ _BufBuilder(int initsize = 512) : size(initsize) {
+ if ( size > 0 ) {
+ data = (char *) al.Malloc(size);
+ if( data == 0 )
+ msgasserted(10000, "out of memory BufBuilder");
+ }
+ else {
+ data = 0;
+ }
+ l = 0;
+ }
+ ~_BufBuilder() { kill(); }
+
+ void kill() {
+ if ( data ) {
+ al.Free(data);
+ data = 0;
+ }
+ }
+
+ void reset() {
+ l = 0;
+ }
+ void reset( int maxSize ) {
+ l = 0;
+ if ( maxSize && size > maxSize ) {
+ al.Free(data);
+ data = (char*)al.Malloc(maxSize);
+ if ( data == 0 )
+ msgasserted( 15913 , "out of memory BufBuilder::reset" );
+ size = maxSize;
+ }
+ }
+
+        /** leave room for some data to be filled in later
+            @return pointer to the region that was skipped. the pointer may change later (on realloc),
+                    so it is for immediate use only
+        */
+ char* skip(int n) { return grow(n); }
+
+ /* note this may be deallocated (realloced) if you keep writing. */
+ char* buf() { return data; }
+ const char* buf() const { return data; }
+
+        /* the caller assumes ownership of the buffer - you must then free() it yourself */
+ void decouple() { data = 0; }
+
+ void appendUChar(unsigned char j) {
+ *((unsigned char*)grow(sizeof(unsigned char))) = j;
+ }
+ void appendChar(char j) {
+ *((char*)grow(sizeof(char))) = j;
+ }
+ void appendNum(char j) {
+ *((char*)grow(sizeof(char))) = j;
+ }
+ void appendNum(short j) {
+ *((short*)grow(sizeof(short))) = j;
+ }
+ void appendNum(int j) {
+ *((int*)grow(sizeof(int))) = j;
+ }
+ void appendNum(unsigned j) {
+ *((unsigned*)grow(sizeof(unsigned))) = j;
+ }
+ void appendNum(bool j) {
+ *((bool*)grow(sizeof(bool))) = j;
+ }
+ void appendNum(double j) {
+ *((double*)grow(sizeof(double))) = j;
+ }
+ void appendNum(long long j) {
+ *((long long*)grow(sizeof(long long))) = j;
+ }
+ void appendNum(unsigned long long j) {
+ *((unsigned long long*)grow(sizeof(unsigned long long))) = j;
+ }
+
+ void appendBuf(const void *src, size_t len) {
+ memcpy(grow((int) len), src, len);
+ }
+
+ template<class T>
+ void appendStruct(const T& s) {
+ appendBuf(&s, sizeof(T));
+ }
+
+ void appendStr(const StringData &str , bool includeEndingNull = true ) {
+ const int len = str.size() + ( includeEndingNull ? 1 : 0 );
+ memcpy(grow(len), str.data(), len);
+ }
+
+ /** @return length of current string */
+ int len() const { return l; }
+ void setlen( int newLen ) { l = newLen; }
+ /** @return size of the buffer */
+ int getSize() const { return size; }
+
+ /* returns the pre-grow write position */
+ inline char* grow(int by) {
+ int oldlen = l;
+ l += by;
+ if ( l > size ) {
+ grow_reallocate();
+ }
+ return data + oldlen;
+ }
+
+ private:
+ /* "slow" portion of 'grow()' */
+ void NOINLINE_DECL grow_reallocate() {
+ int a = size * 2;
+ if ( a == 0 )
+ a = 512;
+ if ( l > a )
+ a = l + 16 * 1024;
+ if ( a > BufferMaxSize )
+ msgasserted(13548, "BufBuilder grow() > 64MB");
+ data = (char *) al.Realloc(data, a);
+            size = a;
+ }
+
+ char *data;
+ int l;
+ int size;
+
+ friend class StringBuilder;
+ };
+
+ typedef _BufBuilder<TrivialAllocator> BufBuilder;
+
+    /** The StackBufBuilder builds smaller datasets on the stack instead of using malloc.
+        This can be significantly faster for small buffers. However, you cannot decouple() the
+        buffer with StackBufBuilder.
+        While designed to be a variable on the stack, if you were to dynamically allocate one,
+        nothing bad would happen. In fact in some circumstances this might make sense, say,
+        embedded in some other object.
+    */
+ class StackBufBuilder : public _BufBuilder<StackAllocator> {
+ public:
+ StackBufBuilder() : _BufBuilder<StackAllocator>(StackAllocator::SZ) { }
+ void decouple(); // not allowed. not implemented.
+ };
+
+#if defined(_WIN32)
+#pragma warning( push )
+// warning C4996: 'sprintf': This function or variable may be unsafe. Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS.
+#pragma warning( disable : 4996 )
+#endif
+
+    /** std::stringstream deals with locale, so this is a lot faster than std::stringstream for building UTF-8 strings */
+ class StringBuilder {
+ public:
+ StringBuilder( int initsize=256 )
+ : _buf( initsize ) {
+ }
+
+ StringBuilder& operator<<( double x ) {
+ return SBNUM( x , 25 , "%g" );
+ }
+ StringBuilder& operator<<( int x ) {
+ return SBNUM( x , 11 , "%d" );
+ }
+ StringBuilder& operator<<( unsigned x ) {
+ return SBNUM( x , 11 , "%u" );
+ }
+ StringBuilder& operator<<( long x ) {
+ return SBNUM( x , 22 , "%ld" );
+ }
+ StringBuilder& operator<<( unsigned long x ) {
+ return SBNUM( x , 22 , "%lu" );
+ }
+ StringBuilder& operator<<( long long x ) {
+ return SBNUM( x , 22 , "%lld" );
+ }
+ StringBuilder& operator<<( unsigned long long x ) {
+ return SBNUM( x , 22 , "%llu" );
+ }
+ StringBuilder& operator<<( short x ) {
+ return SBNUM( x , 8 , "%hd" );
+ }
+ StringBuilder& operator<<( char c ) {
+ _buf.grow( 1 )[0] = c;
+ return *this;
+ }
+
+ void appendDoubleNice( double x ) {
+ int prev = _buf.l;
+ char * start = _buf.grow( 32 );
+ int z = sprintf( start , "%.16g" , x );
+ assert( z >= 0 );
+ _buf.l = prev + z;
+            // append ".0" so integral values still read back as doubles; skip if an exponent or nan/inf marker is present
+            if( strchr(start, '.') == 0 && strchr(start, 'e') == 0 && strchr(start, 'E') == 0 && strchr(start, 'n') == 0 && strchr(start, 'N') == 0 ) {
+ write( ".0" , 2 );
+ }
+ }
+
+ void write( const char* buf, int len) { memcpy( _buf.grow( len ) , buf , len ); }
+
+ void append( const StringData& str ) { memcpy( _buf.grow( str.size() ) , str.data() , str.size() ); }
+
+ StringBuilder& operator<<( const StringData& str ) {
+ append( str );
+ return *this;
+ }
+
+ void reset( int maxSize = 0 ) { _buf.reset( maxSize ); }
+
+ std::string str() const { return std::string(_buf.data, _buf.l); }
+
+ int len() const { return _buf.l; }
+
+ private:
+ BufBuilder _buf;
+
+ // non-copyable, non-assignable
+ StringBuilder( const StringBuilder& );
+ StringBuilder& operator=( const StringBuilder& );
+
+ template <typename T>
+        StringBuilder& SBNUM(T val, int maxSize, const char *fmt) {
+            int prev = _buf.l;
+            int z = sprintf( _buf.grow(maxSize) , fmt , (val) );
+ assert( z >= 0 );
+ _buf.l = prev + z;
+ return *this;
+ }
+ };
+
+#if defined(_WIN32)
+#pragma warning( pop )
+#endif
+
+} // namespace mongo
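
A minimal usage sketch for the two builders defined above (illustrative only, not part of the change):

    mongo::StringBuilder sb;
    sb << "n=" << 42 << ' ';
    sb.appendDoubleNice( 3.0 );          // renders as "3.0" rather than "3"
    std::string s = sb.str();            // "n=42 3.0"

    mongo::BufBuilder bb;
    bb.appendNum( (int) 1 );             // raw 4-byte int, native byte order
    bb.appendStr( "abc" );               // the bytes plus the trailing NUL
    // bb.buf()/bb.len() expose the raw buffer; decouple() hands ownership to the caller
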
diff --git a/src/mongo/bson/util/misc.h b/src/mongo/bson/util/misc.h
new file mode 100644
index 00000000000..b547c981bdf
--- /dev/null
+++ b/src/mongo/bson/util/misc.h
@@ -0,0 +1,121 @@
+/* @file misc.h
+*/
+
+/*
+ * Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <ctime>
+
+namespace mongo {
+
+ using namespace std;
+
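+    /* note: buf must be at least 32 bytes -- ctime_s is invoked with that size below */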
+ inline void time_t_to_String(time_t t, char *buf) {
+#if defined(_WIN32)
+ ctime_s(buf, 32, &t);
+#else
+ ctime_r(&t, buf);
+#endif
+ buf[24] = 0; // don't want the \n
+ }
+
+ inline string time_t_to_String(time_t t = time(0) ) {
+ char buf[64];
+#if defined(_WIN32)
+ ctime_s(buf, sizeof(buf), &t);
+#else
+ ctime_r(&t, buf);
+#endif
+ buf[24] = 0; // don't want the \n
+ return buf;
+ }
+
+ inline string time_t_to_String_no_year(time_t t) {
+ char buf[64];
+#if defined(_WIN32)
+ ctime_s(buf, sizeof(buf), &t);
+#else
+ ctime_r(&t, buf);
+#endif
+ buf[19] = 0;
+ return buf;
+ }
+
+ inline string time_t_to_String_short(time_t t) {
+ char buf[64];
+#if defined(_WIN32)
+ ctime_s(buf, sizeof(buf), &t);
+#else
+ ctime_r(&t, buf);
+#endif
+ buf[19] = 0;
+ if( buf[0] && buf[1] && buf[2] && buf[3] )
+ return buf + 4; // skip day of week
+ return buf;
+ }
+
+ struct Date_t {
+ // TODO: make signed (and look for related TODO's)
+ unsigned long long millis;
+ Date_t(): millis(0) {}
+ Date_t(unsigned long long m): millis(m) {}
+ operator unsigned long long&() { return millis; }
+ operator const unsigned long long&() const { return millis; }
+ void toTm (tm *buf) {
+            time_t dtime = (time_t)( millis / 1000 ); // divide before narrowing to time_t
+#if defined(_WIN32)
+ gmtime_s(buf, &dtime);
+#else
+ gmtime_r(&dtime, buf);
+#endif
+ }
+ string toString() const {
+ char buf[64];
+ time_t_to_String(millis/1000, buf);
+ return buf;
+ }
+ };
+
+    // Like strlen, but only scans up to n bytes.
+    // Returns -1 if no '\0' terminator is found.
+ inline int strnlen( const char *s, int n ) {
+ for( int i = 0; i < n; ++i )
+ if ( !s[ i ] )
+ return i;
+ return -1;
+ }
+
+ inline bool isNumber( char c ) {
+ return c >= '0' && c <= '9';
+ }
+
+ inline unsigned stringToNum(const char *str) {
+ unsigned x = 0;
+ const char *p = str;
+ while( 1 ) {
+ if( !isNumber(*p) ) {
+ if( *p == 0 && p != str )
+ break;
+ throw 0;
+ }
+ x = x * 10 + *p++ - '0';
+ }
+ return x;
+ }
+
+}
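
Illustrative use of the time helpers and Date_t above (output is ctime-style):

    time_t now = time(0);
    std::string full  = mongo::time_t_to_String( now );        // e.g. "Wed Jun 30 21:49:08 2010"
    std::string brief = mongo::time_t_to_String_short( now );  // e.g. "Jun 30 21:49:08"

    mongo::Date_t d( (unsigned long long) now * 1000 );        // Date_t holds milliseconds since epoch
    std::string ds = d.toString();
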
diff --git a/src/mongo/client/clientOnly.cpp b/src/mongo/client/clientOnly.cpp
new file mode 100644
index 00000000000..161f0a82a81
--- /dev/null
+++ b/src/mongo/client/clientOnly.cpp
@@ -0,0 +1,92 @@
+// clientOnly.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "../client/dbclient.h"
+#include "../db/cmdline.h"
+#include "../db/client_common.h"
+#include "../s/shard.h"
+#include "../util/timer.h"
+
+namespace mongo {
+
+ CmdLine cmdLine;
+
+ const char * curNs = "in client mode";
+
+ bool dbexitCalled = false;
+
+ string dynHostMyName() { return ""; }
+
+ void dynHostResolve(string& name, int& port) {
+ assert(false);
+ }
+
+ void exitCleanly( ExitCode code ) {
+ dbexit( code );
+ }
+
+ void dbexit( ExitCode returnCode, const char *whyMsg , bool tryToGetLock ) {
+ dbexitCalled = true;
+ out() << "dbexit called" << endl;
+ if ( whyMsg )
+ out() << " b/c " << whyMsg << endl;
+ out() << "exiting" << endl;
+ ::exit( returnCode );
+ }
+
+ bool inShutdown() {
+ return dbexitCalled;
+ }
+
+ void setupSignals() {
+ // maybe should do SIGPIPE here, not sure
+ }
+
+ string getDbContext() {
+ return "in client only mode";
+ }
+
+ bool haveLocalShardingInfo( const string& ns ) {
+ return false;
+ }
+
+ DBClientBase * createDirectClient() {
+ uassert( 10256 , "no createDirectClient in clientOnly" , 0 );
+ return 0;
+ }
+
+ void Shard::getAllShards( vector<Shard>& all ) {
+ assert(0);
+ }
+
+ bool Shard::isAShardNode( const string& ident ) {
+ assert(0);
+ return false;
+ }
+
+ string prettyHostName() {
+ assert(0);
+ return "";
+ }
+
+ ClientBasic* ClientBasic::getCurrent() {
+ return 0;
+ }
+
+
+}
diff --git a/src/mongo/client/connpool.cpp b/src/mongo/client/connpool.cpp
new file mode 100644
index 00000000000..5089471f521
--- /dev/null
+++ b/src/mongo/client/connpool.cpp
@@ -0,0 +1,426 @@
+/* connpool.cpp
+*/
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// _ todo: reconnect?
+
+#include "pch.h"
+#include "connpool.h"
+//#include "../db/commands.h"
+#include "syncclusterconnection.h"
+#include "../s/shard.h"
+
+namespace mongo {
+
+ // ------ PoolForHost ------
+
+ PoolForHost::~PoolForHost() {
+ while ( ! _pool.empty() ) {
+ StoredConnection sc = _pool.top();
+ delete sc.conn;
+ _pool.pop();
+ }
+ }
+
+ void PoolForHost::done( DBConnectionPool * pool, DBClientBase * c ) {
+ if ( _pool.size() >= _maxPerHost ) {
+ pool->onDestroy( c );
+ delete c;
+ }
+ else {
+ _pool.push(c);
+ }
+ }
+
+ DBClientBase * PoolForHost::get( DBConnectionPool * pool , double socketTimeout ) {
+
+ time_t now = time(0);
+
+ while ( ! _pool.empty() ) {
+ StoredConnection sc = _pool.top();
+ _pool.pop();
+
+ if ( ! sc.ok( now ) ) {
+ pool->onDestroy( sc.conn );
+ delete sc.conn;
+ continue;
+ }
+
+ assert( sc.conn->getSoTimeout() == socketTimeout );
+
+ return sc.conn;
+
+ }
+
+ return NULL;
+ }
+
+ void PoolForHost::flush() {
+ vector<StoredConnection> all;
+ while ( ! _pool.empty() ) {
+ StoredConnection c = _pool.top();
+ _pool.pop();
+ all.push_back( c );
+ bool res;
+ c.conn->isMaster( res );
+ }
+
+ for ( vector<StoredConnection>::iterator i=all.begin(); i != all.end(); ++i ) {
+ _pool.push( *i );
+ }
+ }
+
+ void PoolForHost::getStaleConnections( vector<DBClientBase*>& stale ) {
+ time_t now = time(0);
+
+ vector<StoredConnection> all;
+ while ( ! _pool.empty() ) {
+ StoredConnection c = _pool.top();
+ _pool.pop();
+
+ if ( c.ok( now ) )
+ all.push_back( c );
+ else
+ stale.push_back( c.conn );
+ }
+
+ for ( size_t i=0; i<all.size(); i++ ) {
+ _pool.push( all[i] );
+ }
+ }
+
+
+ PoolForHost::StoredConnection::StoredConnection( DBClientBase * c ) {
+ conn = c;
+ when = time(0);
+ }
+
+ bool PoolForHost::StoredConnection::ok( time_t now ) {
+ // if connection has been idle for 30 minutes, kill it
+ return ( now - when ) < 1800;
+ }
+
+ void PoolForHost::createdOne( DBClientBase * base) {
+ if ( _created == 0 )
+ _type = base->type();
+ _created++;
+ }
+
+ unsigned PoolForHost::_maxPerHost = 50;
+
+ // ------ DBConnectionPool ------
+
+ DBConnectionPool pool;
+
+ DBConnectionPool::DBConnectionPool()
+ : _mutex("DBConnectionPool") ,
+ _name( "dbconnectionpool" ) ,
+ _hooks( new list<DBConnectionHook*>() ) {
+ }
+
+ DBClientBase* DBConnectionPool::_get(const string& ident , double socketTimeout ) {
+ assert( ! inShutdown() );
+ scoped_lock L(_mutex);
+ PoolForHost& p = _pools[PoolKey(ident,socketTimeout)];
+ return p.get( this , socketTimeout );
+ }
+
+ DBClientBase* DBConnectionPool::_finishCreate( const string& host , double socketTimeout , DBClientBase* conn ) {
+ {
+ scoped_lock L(_mutex);
+ PoolForHost& p = _pools[PoolKey(host,socketTimeout)];
+ p.createdOne( conn );
+ }
+
+ try {
+ onCreate( conn );
+ onHandedOut( conn );
+ }
+ catch ( std::exception & ) {
+ delete conn;
+ throw;
+ }
+
+ return conn;
+ }
+
+ DBClientBase* DBConnectionPool::get(const ConnectionString& url, double socketTimeout) {
+ DBClientBase * c = _get( url.toString() , socketTimeout );
+ if ( c ) {
+ try {
+ onHandedOut( c );
+ }
+ catch ( std::exception& ) {
+ delete c;
+ throw;
+ }
+ return c;
+ }
+
+ string errmsg;
+ c = url.connect( errmsg, socketTimeout );
+ uassert( 13328 , _name + ": connect failed " + url.toString() + " : " + errmsg , c );
+
+ return _finishCreate( url.toString() , socketTimeout , c );
+ }
+
+ DBClientBase* DBConnectionPool::get(const string& host, double socketTimeout) {
+ DBClientBase * c = _get( host , socketTimeout );
+ if ( c ) {
+ try {
+ onHandedOut( c );
+ }
+ catch ( std::exception& ) {
+ delete c;
+ throw;
+ }
+ return c;
+ }
+
+ string errmsg;
+ ConnectionString cs = ConnectionString::parse( host , errmsg );
+ uassert( 13071 , (string)"invalid hostname [" + host + "]" + errmsg , cs.isValid() );
+
+ c = cs.connect( errmsg, socketTimeout );
+ if ( ! c )
+ throw SocketException( SocketException::CONNECT_ERROR , host , 11002 , str::stream() << _name << " error: " << errmsg );
+ return _finishCreate( host , socketTimeout , c );
+ }
+
+ void DBConnectionPool::release(const string& host, DBClientBase *c) {
+ if ( c->isFailed() ) {
+ onDestroy( c );
+ delete c;
+ return;
+ }
+ scoped_lock L(_mutex);
+ _pools[PoolKey(host,c->getSoTimeout())].done(this,c);
+ }
+
+
+ DBConnectionPool::~DBConnectionPool() {
+ // connection closing is handled by ~PoolForHost
+ }
+
+ void DBConnectionPool::flush() {
+ scoped_lock L(_mutex);
+ for ( PoolMap::iterator i = _pools.begin(); i != _pools.end(); i++ ) {
+ PoolForHost& p = i->second;
+ p.flush();
+ }
+ }
+
+ void DBConnectionPool::addHook( DBConnectionHook * hook ) {
+ _hooks->push_back( hook );
+ }
+
+ void DBConnectionPool::onCreate( DBClientBase * conn ) {
+ if ( _hooks->size() == 0 )
+ return;
+
+ for ( list<DBConnectionHook*>::iterator i = _hooks->begin(); i != _hooks->end(); i++ ) {
+ (*i)->onCreate( conn );
+ }
+ }
+
+ void DBConnectionPool::onHandedOut( DBClientBase * conn ) {
+ if ( _hooks->size() == 0 )
+ return;
+
+ for ( list<DBConnectionHook*>::iterator i = _hooks->begin(); i != _hooks->end(); i++ ) {
+ (*i)->onHandedOut( conn );
+ }
+ }
+
+ void DBConnectionPool::onDestroy( DBClientBase * conn ) {
+ if ( _hooks->size() == 0 )
+ return;
+
+ for ( list<DBConnectionHook*>::iterator i = _hooks->begin(); i != _hooks->end(); i++ ) {
+ (*i)->onDestroy( conn );
+ }
+ }
+
+ void DBConnectionPool::appendInfo( BSONObjBuilder& b ) {
+
+ int avail = 0;
+ long long created = 0;
+
+
+ map<ConnectionString::ConnectionType,long long> createdByType;
+
+ set<string> replicaSets;
+
+ BSONObjBuilder bb( b.subobjStart( "hosts" ) );
+ {
+ scoped_lock lk( _mutex );
+ for ( PoolMap::iterator i=_pools.begin(); i!=_pools.end(); ++i ) {
+ if ( i->second.numCreated() == 0 )
+ continue;
+
+ string s = str::stream() << i->first.ident << "::" << i->first.timeout;
+
+ BSONObjBuilder temp( bb.subobjStart( s ) );
+ temp.append( "available" , i->second.numAvailable() );
+ temp.appendNumber( "created" , i->second.numCreated() );
+ temp.done();
+
+ avail += i->second.numAvailable();
+ created += i->second.numCreated();
+
+ long long& x = createdByType[i->second.type()];
+ x += i->second.numCreated();
+
+ {
+ string setName = i->first.ident;
+ if ( setName.find( "/" ) != string::npos ) {
+ setName = setName.substr( 0 , setName.find( "/" ) );
+ replicaSets.insert( setName );
+ }
+ }
+ }
+ }
+ bb.done();
+
+
+ BSONObjBuilder setBuilder( b.subobjStart( "replicaSets" ) );
+ for ( set<string>::iterator i=replicaSets.begin(); i!=replicaSets.end(); ++i ) {
+ string rs = *i;
+ ReplicaSetMonitorPtr m = ReplicaSetMonitor::get( rs );
+ if ( ! m ) {
+ warning() << "no monitor for set: " << rs << endl;
+ continue;
+ }
+
+ BSONObjBuilder temp( setBuilder.subobjStart( rs ) );
+ m->appendInfo( temp );
+ temp.done();
+ }
+ setBuilder.done();
+
+ {
+            BSONObjBuilder temp( b.subobjStart( "createdByType" ) ); // note: bb was already done() above
+ for ( map<ConnectionString::ConnectionType,long long>::iterator i=createdByType.begin(); i!=createdByType.end(); ++i ) {
+ temp.appendNumber( ConnectionString::typeToString( i->first ) , i->second );
+ }
+ temp.done();
+ }
+
+ b.append( "totalAvailable" , avail );
+ b.appendNumber( "totalCreated" , created );
+ }
+
+ bool DBConnectionPool::serverNameCompare::operator()( const string& a , const string& b ) const{
+ const char* ap = a.c_str();
+ const char* bp = b.c_str();
+
+ while (true){
+ if (*ap == '\0' || *ap == '/'){
+ if (*bp == '\0' || *bp == '/')
+ return false; // equal strings
+ else
+ return true; // a is shorter
+ }
+
+ if (*bp == '\0' || *bp == '/')
+ return false; // b is shorter
+
+ if ( *ap < *bp)
+ return true;
+ else if (*ap > *bp)
+ return false;
+
+ ++ap;
+ ++bp;
+ }
+ assert(false);
+ }
+
+ bool DBConnectionPool::poolKeyCompare::operator()( const PoolKey& a , const PoolKey& b ) const {
+ if (DBConnectionPool::serverNameCompare()( a.ident , b.ident ))
+ return true;
+
+ if (DBConnectionPool::serverNameCompare()( b.ident , a.ident ))
+ return false;
+
+ return a.timeout < b.timeout;
+ }
+
+
+ void DBConnectionPool::taskDoWork() {
+ vector<DBClientBase*> toDelete;
+
+ {
+ // we need to get the connections inside the lock
+ // but we can actually delete them outside
+ scoped_lock lk( _mutex );
+ for ( PoolMap::iterator i=_pools.begin(); i!=_pools.end(); ++i ) {
+ i->second.getStaleConnections( toDelete );
+ }
+ }
+
+ for ( size_t i=0; i<toDelete.size(); i++ ) {
+ try {
+ onDestroy( toDelete[i] );
+ delete toDelete[i];
+ }
+ catch ( ... ) {
+ // we don't care if there was a socket error
+ }
+ }
+ }
+
+ // ------ ScopedDbConnection ------
+
+ ScopedDbConnection * ScopedDbConnection::steal() {
+ assert( _conn );
+ ScopedDbConnection * n = new ScopedDbConnection( _host , _conn, _socketTimeout );
+ _conn = 0;
+ return n;
+ }
+
+ void ScopedDbConnection::_setSocketTimeout(){
+ if( ! _conn ) return;
+ if( _conn->type() == ConnectionString::MASTER )
+ (( DBClientConnection* ) _conn)->setSoTimeout( _socketTimeout );
+ else if( _conn->type() == ConnectionString::SYNC )
+ (( SyncClusterConnection* ) _conn)->setAllSoTimeouts( _socketTimeout );
+ }
+
+ ScopedDbConnection::~ScopedDbConnection() {
+ if ( _conn ) {
+ if ( ! _conn->isFailed() ) {
+ /* see done() comments above for why we log this line */
+ log() << "~ScopedDbConnection: _conn != null" << endl;
+ }
+ kill();
+ }
+ }
+
+ ScopedDbConnection::ScopedDbConnection(const Shard& shard, double socketTimeout )
+ : _host( shard.getConnString() ) , _conn( pool.get(_host, socketTimeout) ), _socketTimeout( socketTimeout ) {
+ _setSocketTimeout();
+ }
+
+ ScopedDbConnection::ScopedDbConnection(const Shard* shard, double socketTimeout )
+ : _host( shard->getConnString() ) , _conn( pool.get(_host, socketTimeout) ), _socketTimeout( socketTimeout ) {
+ _setSocketTimeout();
+ }
+
+ AtomicUInt AScopedConnection::_numConnections;
+
+} // namespace mongo
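
A sketch of the hook mechanism above: a DBConnectionHook subclass registered through addHook() observes every pooled connection's lifecycle. The class and function below are hypothetical, for illustration only:

    class CountingHook : public mongo::DBConnectionHook {    // hypothetical example
    public:
        CountingHook() : created(0), handedOut(0), destroyed(0) {}
        virtual void onCreate( mongo::DBClientBase * )    { created++; }
        virtual void onHandedOut( mongo::DBClientBase * ) { handedOut++; }
        virtual void onDestroy( mongo::DBClientBase * )   { destroyed++; }
        int created, handedOut, destroyed;
    };

    void installHook() {
        // during startup, before using the pool; the pool takes ownership of the pointer
        mongo::pool.addHook( new CountingHook() );
    }
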
diff --git a/src/mongo/client/connpool.h b/src/mongo/client/connpool.h
new file mode 100644
index 00000000000..8733abb1f90
--- /dev/null
+++ b/src/mongo/client/connpool.h
@@ -0,0 +1,291 @@
+/** @file connpool.h */
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <stack>
+#include "dbclient.h"
+#include "redef_macros.h"
+
+#include "../util/background.h"
+
+namespace mongo {
+
+ class Shard;
+ class DBConnectionPool;
+
+ /**
+     * not thread safe; thread safety is handled by DBConnectionPool
+ */
+ class PoolForHost {
+ public:
+ PoolForHost()
+ : _created(0) {}
+
+ PoolForHost( const PoolForHost& other ) {
+ assert(other._pool.size() == 0);
+ _created = other._created;
+ assert( _created == 0 );
+ }
+
+ ~PoolForHost();
+
+ int numAvailable() const { return (int)_pool.size(); }
+
+ void createdOne( DBClientBase * base );
+ long long numCreated() const { return _created; }
+
+ ConnectionString::ConnectionType type() const { assert(_created); return _type; }
+
+ /**
+         * gets a connection from the pool, or returns NULL if none is available
+ */
+ DBClientBase * get( DBConnectionPool * pool , double socketTimeout );
+
+ void done( DBConnectionPool * pool , DBClientBase * c );
+
+ void flush();
+
+ void getStaleConnections( vector<DBClientBase*>& stale );
+
+ static void setMaxPerHost( unsigned max ) { _maxPerHost = max; }
+ static unsigned getMaxPerHost() { return _maxPerHost; }
+ private:
+
+ struct StoredConnection {
+ StoredConnection( DBClientBase * c );
+
+ bool ok( time_t now );
+
+ DBClientBase* conn;
+ time_t when;
+ };
+
+ std::stack<StoredConnection> _pool;
+
+ long long _created;
+ ConnectionString::ConnectionType _type;
+
+ static unsigned _maxPerHost;
+ };
+
+ class DBConnectionHook {
+ public:
+ virtual ~DBConnectionHook() {}
+ virtual void onCreate( DBClientBase * conn ) {}
+ virtual void onHandedOut( DBClientBase * conn ) {}
+ virtual void onDestroy( DBClientBase * conn ) {}
+ };
+
+ /** Database connection pool.
+
+ Generally, use ScopedDbConnection and do not call these directly.
+
+ This class, so far, is suitable for use with unauthenticated connections.
+        Support for authenticated connections requires some adjustments: please
+ request...
+
+ Usage:
+
+ {
+ ScopedDbConnection c("myserver");
+ c.conn()...
+ }
+ */
+ class DBConnectionPool : public PeriodicTask {
+
+ public:
+
+ DBConnectionPool();
+ ~DBConnectionPool();
+
+ /** right now just controls some asserts. defaults to "dbconnectionpool" */
+ void setName( const string& name ) { _name = name; }
+
+ void onCreate( DBClientBase * conn );
+ void onHandedOut( DBClientBase * conn );
+ void onDestroy( DBClientBase * conn );
+
+ void flush();
+
+ DBClientBase *get(const string& host, double socketTimeout = 0);
+ DBClientBase *get(const ConnectionString& host, double socketTimeout = 0);
+
+ void release(const string& host, DBClientBase *c);
+
+ void addHook( DBConnectionHook * hook ); // we take ownership
+ void appendInfo( BSONObjBuilder& b );
+
+        /** compares server names, but is smart about replica set names */
+ struct serverNameCompare {
+ bool operator()( const string& a , const string& b ) const;
+ };
+
+ virtual string taskName() const { return "DBConnectionPool-cleaner"; }
+ virtual void taskDoWork();
+
+ private:
+ DBConnectionPool( DBConnectionPool& p );
+
+ DBClientBase* _get( const string& ident , double socketTimeout );
+
+ DBClientBase* _finishCreate( const string& ident , double socketTimeout, DBClientBase* conn );
+
+ struct PoolKey {
+ PoolKey( string i , double t ) : ident( i ) , timeout( t ) {}
+ string ident;
+ double timeout;
+ };
+
+ struct poolKeyCompare {
+ bool operator()( const PoolKey& a , const PoolKey& b ) const;
+ };
+
+ typedef map<PoolKey,PoolForHost,poolKeyCompare> PoolMap; // servername -> pool
+
+ mongo::mutex _mutex;
+ string _name;
+
+ PoolMap _pools;
+
+ // pointers owned by me, right now they leak on shutdown
+        // _hooks itself is also leaked deliberately, since deleting it would create a shutdown race condition
+ list<DBConnectionHook*> * _hooks;
+
+ };
+
+ extern DBConnectionPool pool;
+
+ class AScopedConnection : boost::noncopyable {
+ public:
+ AScopedConnection() { _numConnections++; }
+ virtual ~AScopedConnection() { _numConnections--; }
+
+ virtual DBClientBase* get() = 0;
+ virtual void done() = 0;
+ virtual string getHost() const = 0;
+
+ /**
+ * @return true iff this has a connection to the db
+ */
+ virtual bool ok() const = 0;
+
+ /**
+ * @return total number of current instances of AScopedConnection
+ */
+ static int getNumConnections() { return _numConnections; }
+
+ private:
+ static AtomicUInt _numConnections;
+ };
+
+ /** Use to get a connection from the pool. On exceptions things
+ clean up nicely (i.e. the socket gets closed automatically when the
+        ScopedDbConnection goes out of scope).
+ */
+ class ScopedDbConnection : public AScopedConnection {
+ public:
+ /** the main constructor you want to use
+ throws UserException if can't connect
+ */
+ explicit ScopedDbConnection(const string& host, double socketTimeout = 0) : _host(host), _conn( pool.get(host, socketTimeout) ), _socketTimeout( socketTimeout ) {
+ _setSocketTimeout();
+ }
+
+ ScopedDbConnection() : _host( "" ) , _conn(0), _socketTimeout( 0 ) {}
+
+ /* @param conn - bind to an existing connection */
+ ScopedDbConnection(const string& host, DBClientBase* conn, double socketTimeout = 0 ) : _host( host ) , _conn( conn ), _socketTimeout( socketTimeout ) {
+ _setSocketTimeout();
+ }
+
+ /** throws UserException if can't connect */
+ explicit ScopedDbConnection(const ConnectionString& url, double socketTimeout = 0 ) : _host(url.toString()), _conn( pool.get(url, socketTimeout) ), _socketTimeout( socketTimeout ) {
+ _setSocketTimeout();
+ }
+
+ /** throws UserException if can't connect */
+ explicit ScopedDbConnection(const Shard& shard, double socketTimeout = 0 );
+ explicit ScopedDbConnection(const Shard* shard, double socketTimeout = 0 );
+
+ ~ScopedDbConnection();
+
+ /** get the associated connection object */
+ DBClientBase* operator->() {
+ uassert( 11004 , "connection was returned to the pool already" , _conn );
+ return _conn;
+ }
+
+ /** get the associated connection object */
+ DBClientBase& conn() {
+ uassert( 11005 , "connection was returned to the pool already" , _conn );
+ return *_conn;
+ }
+
+ /** get the associated connection object */
+ DBClientBase* get() {
+ uassert( 13102 , "connection was returned to the pool already" , _conn );
+ return _conn;
+ }
+
+        bool ok() const { return _conn != 0; }
+
+ string getHost() const { return _host; }
+
+ /** Force closure of the connection. You should call this if you leave it in
+            a bad state. the destructor will do this too, but it also logs a warning line.
+ */
+ void kill() {
+ delete _conn;
+ _conn = 0;
+ }
+
+ /** Call this when you are done with the connection.
+
+ If you do not call done() before this object goes out of scope,
+            we can't be sure we fully read all expected data of a reply on the socket, so
+ we don't try to reuse the connection in that situation.
+ */
+ void done() {
+ if ( ! _conn )
+ return;
+
+            /* we could do this, but instead we assume one is using autoreconnect mode on the connection
+ if ( _conn->isFailed() )
+ kill();
+ else
+ */
+ pool.release(_host, _conn);
+ _conn = 0;
+ }
+
+ ScopedDbConnection * steal();
+
+ private:
+
+ void _setSocketTimeout();
+
+ const string _host;
+ DBClientBase *_conn;
+ const double _socketTimeout;
+
+ };
+
+} // namespace mongo
+
+#include "undef_macros.h"
diff --git a/src/mongo/client/constants.h b/src/mongo/client/constants.h
new file mode 100644
index 00000000000..54f3fd216f2
--- /dev/null
+++ b/src/mongo/client/constants.h
@@ -0,0 +1,26 @@
+// constants.h
+
+#pragma once
+
+namespace mongo {
+
+    /* query results include a 32 bit result flag word consisting of these bits */
+ enum ResultFlagType {
+ /* returned, with zero results, when getMore is called but the cursor id
+ is not valid at the server. */
+ ResultFlag_CursorNotFound = 1,
+
+ /* { $err : ... } is being returned */
+ ResultFlag_ErrSet = 2,
+
+ /* Have to update config from the server, usually $err is also set */
+ ResultFlag_ShardConfigStale = 4,
+
+        /* for backward compatibility: this lets us know the server supports
+           the QueryOption_AwaitData option. if it doesn't, a repl slave client should sleep
+           a little between getMore's.
+ */
+ ResultFlag_AwaitCapable = 8
+ };
+
+}
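
The flags are bits in the reply's result word, so they are tested with bitwise AND; a small sketch (resultFlags stands in for the value read from the reply header):

    int resultFlags = 0; // would come from the OP_REPLY header
    bool cursorGone = ( resultFlags & mongo::ResultFlag_CursorNotFound ) != 0;
    bool hasErrDoc  = ( resultFlags & mongo::ResultFlag_ErrSet ) != 0;
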
diff --git a/src/mongo/client/dbclient.cpp b/src/mongo/client/dbclient.cpp
new file mode 100644
index 00000000000..b38a85d4253
--- /dev/null
+++ b/src/mongo/client/dbclient.cpp
@@ -0,0 +1,1087 @@
+// dbclient.cpp - connect to a Mongo database, from C++
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "dbclient.h"
+#include "../bson/util/builder.h"
+#include "../db/jsobj.h"
+#include "../db/json.h"
+#include "../db/instance.h"
+#include "../util/md5.hpp"
+#include "../db/dbmessage.h"
+#include "../db/cmdline.h"
+#include "connpool.h"
+#include "../s/util.h"
+#include "syncclusterconnection.h"
+
+namespace mongo {
+
+ void ConnectionString::_fillServers( string s ) {
+
+ {
+ string::size_type idx = s.find( '/' );
+ if ( idx != string::npos ) {
+ _setName = s.substr( 0 , idx );
+ s = s.substr( idx + 1 );
+ _type = SET;
+ }
+ }
+
+ string::size_type idx;
+ while ( ( idx = s.find( ',' ) ) != string::npos ) {
+ _servers.push_back( s.substr( 0 , idx ) );
+ s = s.substr( idx + 1 );
+ }
+ _servers.push_back( s );
+
+ }
+
+ void ConnectionString::_finishInit() {
+ stringstream ss;
+ if ( _type == SET )
+ ss << _setName << "/";
+ for ( unsigned i=0; i<_servers.size(); i++ ) {
+ if ( i > 0 )
+ ss << ",";
+ ss << _servers[i].toString();
+ }
+ _string = ss.str();
+ }
+
+
+ DBClientBase* ConnectionString::connect( string& errmsg, double socketTimeout ) const {
+ switch ( _type ) {
+ case MASTER: {
+ DBClientConnection * c = new DBClientConnection(true);
+ c->setSoTimeout( socketTimeout );
+ log(1) << "creating new connection to:" << _servers[0] << endl;
+ if ( ! c->connect( _servers[0] , errmsg ) ) {
+ delete c;
+ return 0;
+ }
+ log(1) << "connected connection!" << endl;
+ return c;
+ }
+
+ case PAIR:
+ case SET: {
+ DBClientReplicaSet * set = new DBClientReplicaSet( _setName , _servers , socketTimeout );
+ if( ! set->connect() ) {
+ delete set;
+ errmsg = "connect failed to set ";
+ errmsg += toString();
+ return 0;
+ }
+ return set;
+ }
+
+ case SYNC: {
+            // TODO: don't copy
+ list<HostAndPort> l;
+ for ( unsigned i=0; i<_servers.size(); i++ )
+ l.push_back( _servers[i] );
+ SyncClusterConnection* c = new SyncClusterConnection( l, socketTimeout );
+ return c;
+ }
+
+ case INVALID:
+ throw UserException( 13421 , "trying to connect to invalid ConnectionString" );
+ break;
+ }
+
+ assert( 0 );
+ return 0;
+ }
+
+ ConnectionString ConnectionString::parse( const string& host , string& errmsg ) {
+
+ string::size_type i = host.find( '/' );
+ if ( i != string::npos && i != 0) {
+ // replica set
+ return ConnectionString( SET , host.substr( i + 1 ) , host.substr( 0 , i ) );
+ }
+
+ int numCommas = str::count( host , ',' );
+
+ if( numCommas == 0 )
+ return ConnectionString( HostAndPort( host ) );
+
+ if ( numCommas == 1 )
+ return ConnectionString( PAIR , host );
+
+ if ( numCommas == 2 )
+ return ConnectionString( SYNC , host );
+
+ errmsg = (string)"invalid hostname [" + host + "]";
+ return ConnectionString(); // INVALID
+ }
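+
+    /* For illustration, the string forms parse() accepts (hosts hypothetical):
+         "server:27017"                    no comma     -> MASTER
+         "a:27017,b:27017"                 one comma    -> PAIR
+         "a:27017,b:27017,c:27017"         two commas   -> SYNC
+         "setname/a:27017,b:27017"         leading "name/" -> SET (replica set)
+       anything else yields an INVALID ConnectionString. */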
+
+ string ConnectionString::typeToString( ConnectionType type ) {
+ switch ( type ) {
+ case INVALID:
+ return "invalid";
+ case MASTER:
+ return "master";
+ case PAIR:
+ return "pair";
+ case SET:
+ return "set";
+ case SYNC:
+ return "sync";
+ }
+ assert(0);
+ return "";
+ }
+
+
+ Query& Query::where(const string &jscode, BSONObj scope) {
+ /* use where() before sort() and hint() and explain(), else this will assert. */
+ assert( ! isComplex() );
+ BSONObjBuilder b;
+ b.appendElements(obj);
+ b.appendWhere(jscode, scope);
+ obj = b.obj();
+ return *this;
+ }
+
+ void Query::makeComplex() {
+ if ( isComplex() )
+ return;
+ BSONObjBuilder b;
+ b.append( "query", obj );
+ obj = b.obj();
+ }
+
+ Query& Query::sort(const BSONObj& s) {
+ appendComplex( "orderby", s );
+ return *this;
+ }
+
+ Query& Query::hint(BSONObj keyPattern) {
+ appendComplex( "$hint", keyPattern );
+ return *this;
+ }
+
+ Query& Query::explain() {
+ appendComplex( "$explain", true );
+ return *this;
+ }
+
+ Query& Query::snapshot() {
+ appendComplex( "$snapshot", true );
+ return *this;
+ }
+
+ Query& Query::minKey( const BSONObj &val ) {
+ appendComplex( "$min", val );
+ return *this;
+ }
+
+ Query& Query::maxKey( const BSONObj &val ) {
+ appendComplex( "$max", val );
+ return *this;
+ }
+
+ bool Query::isComplex( bool * hasDollar ) const {
+ if ( obj.hasElement( "query" ) ) {
+ if ( hasDollar )
+ hasDollar[0] = false;
+ return true;
+ }
+
+ if ( obj.hasElement( "$query" ) ) {
+ if ( hasDollar )
+ hasDollar[0] = true;
+ return true;
+ }
+
+ return false;
+ }
+
+ BSONObj Query::getFilter() const {
+ bool hasDollar;
+ if ( ! isComplex( &hasDollar ) )
+ return obj;
+
+ return obj.getObjectField( hasDollar ? "$query" : "query" );
+ }
+ BSONObj Query::getSort() const {
+ if ( ! isComplex() )
+ return BSONObj();
+ BSONObj ret = obj.getObjectField( "orderby" );
+ if (ret.isEmpty())
+ ret = obj.getObjectField( "$orderby" );
+ return ret;
+ }
+ BSONObj Query::getHint() const {
+ if ( ! isComplex() )
+ return BSONObj();
+ return obj.getObjectField( "$hint" );
+ }
+ bool Query::isExplain() const {
+ return isComplex() && obj.getBoolField( "$explain" );
+ }
+
+ string Query::toString() const {
+ return obj.toString();
+ }
+
+ /* --- dbclientcommands --- */
+
+ bool DBClientWithCommands::isOk(const BSONObj& o) {
+ return o["ok"].trueValue();
+ }
+
+ bool DBClientWithCommands::isNotMasterErrorString( const BSONElement& e ) {
+ return e.type() == String && str::contains( e.valuestr() , "not master" );
+ }
+
+
+ enum QueryOptions DBClientWithCommands::availableOptions() {
+ if ( !_haveCachedAvailableOptions ) {
+ BSONObj ret;
+ if ( runCommand( "admin", BSON( "availablequeryoptions" << 1 ), ret ) ) {
+ _cachedAvailableOptions = ( enum QueryOptions )( ret.getIntField( "options" ) );
+ }
+ _haveCachedAvailableOptions = true;
+ }
+ return _cachedAvailableOptions;
+ }
+
+ inline bool DBClientWithCommands::runCommand(const string &dbname, const BSONObj& cmd, BSONObj &info, int options) {
+ string ns = dbname + ".$cmd";
+ info = findOne(ns, cmd, 0 , options);
+ return isOk(info);
+ }
+
+ /* note - we build a bson obj here -- for something that is super common like getlasterror you
+ should have that object prebuilt as that would be faster.
+ */
+ bool DBClientWithCommands::simpleCommand(const string &dbname, BSONObj *info, const string &command) {
+ BSONObj o;
+ if ( info == 0 )
+ info = &o;
+ BSONObjBuilder b;
+ b.append(command, 1);
+ return runCommand(dbname, b.done(), *info);
+ }
+
+ unsigned long long DBClientWithCommands::count(const string &myns, const BSONObj& query, int options, int limit, int skip ) {
+ NamespaceString ns(myns);
+ BSONObj cmd = _countCmd( myns , query , options , limit , skip );
+ BSONObj res;
+ if( !runCommand(ns.db.c_str(), cmd, res, options) )
+ uasserted(11010,string("count fails:") + res.toString());
+ return res["n"].numberLong();
+ }
+
+ BSONObj DBClientWithCommands::_countCmd(const string &myns, const BSONObj& query, int options, int limit, int skip ) {
+ NamespaceString ns(myns);
+ BSONObjBuilder b;
+ b.append( "count" , ns.coll );
+ b.append( "query" , query );
+ if ( limit )
+ b.append( "limit" , limit );
+ if ( skip )
+ b.append( "skip" , skip );
+ return b.obj();
+ }
+
+ BSONObj DBClientWithCommands::getLastErrorDetailed(bool fsync, bool j, int w, int wtimeout) {
+ BSONObj info;
+ BSONObjBuilder b;
+ b.append( "getlasterror", 1 );
+
+ if ( fsync )
+ b.append( "fsync", 1 );
+ if ( j )
+ b.append( "j", 1 );
+
+        // w only affects the request when more than one node is involved
+ if ( w >= 1 )
+ b.append( "w", w );
+ else if ( w == -1 )
+ b.append( "w", "majority" );
+
+ if ( wtimeout > 0 )
+ b.append( "wtimeout", wtimeout );
+
+ runCommand("admin", b.obj(), info);
+
+ return info;
+ }
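+
+    /* For illustration: getLastErrorDetailed(false, false, 2, 1000) sends
+       { getlasterror: 1, w: 2, wtimeout: 1000 }, while w == -1 is translated
+       into { getlasterror: 1, w: "majority" }. */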
+
+ string DBClientWithCommands::getLastError(bool fsync, bool j, int w, int wtimeout) {
+ BSONObj info = getLastErrorDetailed(fsync, j, w, wtimeout);
+ return getLastErrorString( info );
+ }
+
+ string DBClientWithCommands::getLastErrorString( const BSONObj& info ) {
+ BSONElement e = info["err"];
+ if( e.eoo() ) return "";
+ if( e.type() == Object ) return e.toString();
+ return e.str();
+ }
+
+ const BSONObj getpreverrorcmdobj = fromjson("{getpreverror:1}");
+
+ BSONObj DBClientWithCommands::getPrevError() {
+ BSONObj info;
+ runCommand("admin", getpreverrorcmdobj, info);
+ return info;
+ }
+
+ BSONObj getnoncecmdobj = fromjson("{getnonce:1}");
+
+ string DBClientWithCommands::createPasswordDigest( const string & username , const string & clearTextPassword ) {
+ md5digest d;
+ {
+ md5_state_t st;
+ md5_init(&st);
+ md5_append(&st, (const md5_byte_t *) username.data(), username.length());
+ md5_append(&st, (const md5_byte_t *) ":mongo:", 7 );
+ md5_append(&st, (const md5_byte_t *) clearTextPassword.data(), clearTextPassword.length());
+ md5_finish(&st, d);
+ }
+ return digestToString( d );
+ }
+
+ bool DBClientWithCommands::auth(const string &dbname, const string &username, const string &password_text, string& errmsg, bool digestPassword) {
+ string password = password_text;
+ if( digestPassword )
+ password = createPasswordDigest( username , password_text );
+
+ BSONObj info;
+ string nonce;
+ if( !runCommand(dbname, getnoncecmdobj, info) ) {
+ errmsg = "getnonce fails - connection problem?";
+ return false;
+ }
+ {
+ BSONElement e = info.getField("nonce");
+ assert( e.type() == String );
+ nonce = e.valuestr();
+ }
+
+ BSONObj authCmd;
+ BSONObjBuilder b;
+ {
+
+ b << "authenticate" << 1 << "nonce" << nonce << "user" << username;
+ md5digest d;
+ {
+ md5_state_t st;
+ md5_init(&st);
+ md5_append(&st, (const md5_byte_t *) nonce.c_str(), nonce.size() );
+ md5_append(&st, (const md5_byte_t *) username.data(), username.length());
+ md5_append(&st, (const md5_byte_t *) password.c_str(), password.size() );
+ md5_finish(&st, d);
+ }
+ b << "key" << digestToString( d );
+ authCmd = b.done();
+ }
+
+ if( runCommand(dbname, authCmd, info) )
+ return true;
+
+ errmsg = info.toString();
+ return false;
+ }
+
+ BSONObj ismastercmdobj = fromjson("{\"ismaster\":1}");
+
+ bool DBClientWithCommands::isMaster(bool& isMaster, BSONObj *info) {
+ BSONObj o;
+ if ( info == 0 )
+ info = &o;
+ bool ok = runCommand("admin", ismastercmdobj, *info);
+ isMaster = info->getField("ismaster").trueValue();
+ return ok;
+ }
+
+ bool DBClientWithCommands::createCollection(const string &ns, long long size, bool capped, int max, BSONObj *info) {
+ assert(!capped||size);
+ BSONObj o;
+ if ( info == 0 ) info = &o;
+ BSONObjBuilder b;
+ string db = nsToDatabase(ns.c_str());
+ b.append("create", ns.c_str() + db.length() + 1);
+ if ( size ) b.append("size", size);
+ if ( capped ) b.append("capped", true);
+ if ( max ) b.append("max", max);
+ return runCommand(db.c_str(), b.done(), *info);
+ }
+
+ bool DBClientWithCommands::copyDatabase(const string &fromdb, const string &todb, const string &fromhost, BSONObj *info) {
+ BSONObj o;
+ if ( info == 0 ) info = &o;
+ BSONObjBuilder b;
+ b.append("copydb", 1);
+ b.append("fromhost", fromhost);
+ b.append("fromdb", fromdb);
+ b.append("todb", todb);
+ return runCommand("admin", b.done(), *info);
+ }
+
+ bool DBClientWithCommands::setDbProfilingLevel(const string &dbname, ProfilingLevel level, BSONObj *info ) {
+ BSONObj o;
+ if ( info == 0 ) info = &o;
+
+ if ( level ) {
+ // Create system.profile collection. If it already exists this does nothing.
+ // TODO: move this into the db instead of here so that all
+ // drivers don't have to do this.
+ string ns = dbname + ".system.profile";
+ createCollection(ns.c_str(), 1024 * 1024, true, 0, info);
+ }
+
+ BSONObjBuilder b;
+ b.append("profile", (int) level);
+ return runCommand(dbname, b.done(), *info);
+ }
+
+ BSONObj getprofilingcmdobj = fromjson("{\"profile\":-1}");
+
+ bool DBClientWithCommands::getDbProfilingLevel(const string &dbname, ProfilingLevel& level, BSONObj *info) {
+ BSONObj o;
+ if ( info == 0 ) info = &o;
+ if ( runCommand(dbname, getprofilingcmdobj, *info) ) {
+ level = (ProfilingLevel) info->getIntField("was");
+ return true;
+ }
+ return false;
+ }
+
+ DBClientWithCommands::MROutput DBClientWithCommands::MRInline (BSON("inline" << 1));
+
+ BSONObj DBClientWithCommands::mapreduce(const string &ns, const string &jsmapf, const string &jsreducef, BSONObj query, MROutput output) {
+ BSONObjBuilder b;
+ b.append("mapreduce", nsGetCollection(ns));
+ b.appendCode("map", jsmapf);
+ b.appendCode("reduce", jsreducef);
+ if( !query.isEmpty() )
+ b.append("query", query);
+ b.append("out", output.out);
+ BSONObj info;
+ runCommand(nsGetDB(ns), b.done(), info);
+ return info;
+ }
+
+ bool DBClientWithCommands::eval(const string &dbname, const string &jscode, BSONObj& info, BSONElement& retValue, BSONObj *args) {
+ BSONObjBuilder b;
+ b.appendCode("$eval", jscode);
+ if ( args )
+ b.appendArray("args", *args);
+ bool ok = runCommand(dbname, b.done(), info);
+ if ( ok )
+ retValue = info.getField("retval");
+ return ok;
+ }
+
+ bool DBClientWithCommands::eval(const string &dbname, const string &jscode) {
+ BSONObj info;
+ BSONElement retValue;
+ return eval(dbname, jscode, info, retValue);
+ }
+
+ list<string> DBClientWithCommands::getDatabaseNames() {
+ BSONObj info;
+ uassert( 10005 , "listdatabases failed" , runCommand( "admin" , BSON( "listDatabases" << 1 ) , info ) );
+ uassert( 10006 , "listDatabases.databases not array" , info["databases"].type() == Array );
+
+ list<string> names;
+
+ BSONObjIterator i( info["databases"].embeddedObjectUserCheck() );
+ while ( i.more() ) {
+ names.push_back( i.next().embeddedObjectUserCheck()["name"].valuestr() );
+ }
+
+ return names;
+ }
+
+ list<string> DBClientWithCommands::getCollectionNames( const string& db ) {
+ list<string> names;
+
+ string ns = db + ".system.namespaces";
+ auto_ptr<DBClientCursor> c = query( ns.c_str() , BSONObj() );
+ while ( c->more() ) {
+ string name = c->next()["name"].valuestr();
+ if ( name.find( "$" ) != string::npos )
+ continue;
+ names.push_back( name );
+ }
+ return names;
+ }
+
+ bool DBClientWithCommands::exists( const string& ns ) {
+ string db = nsGetDB( ns ) + ".system.namespaces";
+ BSONObj q = BSON( "name" << ns );
+ return count( db.c_str() , q, QueryOption_SlaveOk ) != 0;
+ }
+
+ /* --- dbclientconnection --- */
+
+ bool DBClientConnection::auth(const string &dbname, const string &username, const string &password_text, string& errmsg, bool digestPassword) {
+ string password = password_text;
+ if( digestPassword )
+ password = createPasswordDigest( username , password_text );
+
+ if( autoReconnect ) {
+ /* note we remember the auth info before we attempt to auth -- if the connection is broken, we will
+ then have it for the next autoreconnect attempt.
+ */
+ pair<string,string> p = pair<string,string>(username, password);
+ authCache[dbname] = p;
+ }
+
+ return DBClientBase::auth(dbname, username, password.c_str(), errmsg, false);
+ }
+
+    /** query N objects from the database into an array. makes sense mostly when you want a small number of results.
+        for a huge number, use query() and iterate the cursor instead.
+ */
+ void DBClientInterface::findN(vector<BSONObj>& out, const string& ns, Query query, int nToReturn, int nToSkip, const BSONObj *fieldsToReturn, int queryOptions) {
+ out.reserve(nToReturn);
+
+ auto_ptr<DBClientCursor> c =
+ this->query(ns, query, nToReturn, nToSkip, fieldsToReturn, queryOptions);
+
+ uassert( 10276 , str::stream() << "DBClientBase::findN: transport error: " << getServerAddress() << " ns: " << ns << " query: " << query.toString(), c.get() );
+
+ if ( c->hasResultFlag( ResultFlag_ShardConfigStale ) )
+ throw RecvStaleConfigException( ns , "findN stale config" );
+
+ for( int i = 0; i < nToReturn; i++ ) {
+ if ( !c->more() )
+ break;
+ out.push_back( c->nextSafe().copy() );
+ }
+ }
+
+ BSONObj DBClientInterface::findOne(const string &ns, const Query& query, const BSONObj *fieldsToReturn, int queryOptions) {
+ vector<BSONObj> v;
+ findN(v, ns, query, 1, 0, fieldsToReturn, queryOptions);
+ return v.empty() ? BSONObj() : v[0];
+ }
+
+ bool DBClientConnection::connect(const HostAndPort& server, string& errmsg) {
+ _server = server;
+ _serverString = _server.toString();
+ return _connect( errmsg );
+ }
+
+ bool DBClientConnection::_connect( string& errmsg ) {
+ _serverString = _server.toString();
+
+ // we keep around SockAddr for connection life -- maybe MessagingPort
+ // requires that?
+ server.reset(new SockAddr(_server.host().c_str(), _server.port()));
+ p.reset(new MessagingPort( _so_timeout, _logLevel ));
+
+ if (_server.host().empty() || server->getAddr() == "0.0.0.0") {
+            errmsg = str::stream() << "couldn't connect to server " << _server.toString();
+ return false;
+ }
+
+ // if( _so_timeout == 0 ){
+ // printStackTrace();
+ // log() << "Connecting to server " << _serverString << " timeout " << _so_timeout << endl;
+ // }
+ if ( !p->connect(*server) ) {
+ errmsg = str::stream() << "couldn't connect to server " << _server.toString();
+ _failed = true;
+ return false;
+ }
+
+#ifdef MONGO_SSL
+ if ( cmdLine.sslOnNormalPorts ) {
+ p->secure( sslManager() );
+ }
+#endif
+
+ return true;
+ }
+
+
+ inline bool DBClientConnection::runCommand(const string &dbname, const BSONObj& cmd, BSONObj &info, int options) {
+ if ( DBClientWithCommands::runCommand( dbname , cmd , info , options ) )
+ return true;
+
+ if ( clientSet && isNotMasterErrorString( info["errmsg"] ) ) {
+ clientSet->isntMaster();
+ }
+
+ return false;
+ }
+
+
+ void DBClientConnection::_checkConnection() {
+ if ( !_failed )
+ return;
+ if ( lastReconnectTry && time(0)-lastReconnectTry < 2 ) {
+            // we wait a little before a reconnect attempt to avoid constant hammering.
+            // but we throw, because we don't want to try to use a connection in a bad state
+ throw SocketException( SocketException::FAILED_STATE , toString() );
+ }
+ if ( !autoReconnect )
+ throw SocketException( SocketException::FAILED_STATE , toString() );
+
+ lastReconnectTry = time(0);
+ log(_logLevel) << "trying reconnect to " << _serverString << endl;
+ string errmsg;
+ _failed = false;
+ if ( ! _connect(errmsg) ) {
+ _failed = true;
+ log(_logLevel) << "reconnect " << _serverString << " failed " << errmsg << endl;
+ throw SocketException( SocketException::CONNECT_ERROR , toString() );
+ }
+
+ log(_logLevel) << "reconnect " << _serverString << " ok" << endl;
+ for( map< string, pair<string,string> >::iterator i = authCache.begin(); i != authCache.end(); i++ ) {
+ const char *dbname = i->first.c_str();
+ const char *username = i->second.first.c_str();
+ const char *password = i->second.second.c_str();
+ if( !DBClientBase::auth(dbname, username, password, errmsg, false) )
+ log(_logLevel) << "reconnect: auth failed db:" << dbname << " user:" << username << ' ' << errmsg << '\n';
+ }
+ }
+
+ auto_ptr<DBClientCursor> DBClientBase::query(const string &ns, Query query, int nToReturn,
+ int nToSkip, const BSONObj *fieldsToReturn, int queryOptions , int batchSize ) {
+ auto_ptr<DBClientCursor> c( new DBClientCursor( this,
+ ns, query.obj, nToReturn, nToSkip,
+ fieldsToReturn, queryOptions , batchSize ) );
+ if ( c->init() )
+ return c;
+ return auto_ptr< DBClientCursor >( 0 );
+ }
+
+ auto_ptr<DBClientCursor> DBClientBase::getMore( const string &ns, long long cursorId, int nToReturn, int options ) {
+ auto_ptr<DBClientCursor> c( new DBClientCursor( this, ns, cursorId, nToReturn, options ) );
+ if ( c->init() )
+ return c;
+ return auto_ptr< DBClientCursor >( 0 );
+ }
+
+ struct DBClientFunConvertor {
+ void operator()( DBClientCursorBatchIterator &i ) {
+ while( i.moreInCurrentBatch() ) {
+ _f( i.nextSafe() );
+ }
+ }
+ boost::function<void(const BSONObj &)> _f;
+ };
+
+ unsigned long long DBClientConnection::query( boost::function<void(const BSONObj&)> f, const string& ns, Query query, const BSONObj *fieldsToReturn, int queryOptions ) {
+ DBClientFunConvertor fun;
+ fun._f = f;
+ boost::function<void(DBClientCursorBatchIterator &)> ptr( fun );
+ return DBClientConnection::query( ptr, ns, query, fieldsToReturn, queryOptions );
+ }
+
+ unsigned long long DBClientConnection::query( boost::function<void(DBClientCursorBatchIterator &)> f, const string& ns, Query query, const BSONObj *fieldsToReturn, int queryOptions ) {
+ // mask options
+ queryOptions &= (int)( QueryOption_NoCursorTimeout | QueryOption_SlaveOk );
+ unsigned long long n = 0;
+
+ bool doExhaust = ( availableOptions() & QueryOption_Exhaust );
+ if ( doExhaust ) {
+ queryOptions |= (int)QueryOption_Exhaust;
+ }
+ auto_ptr<DBClientCursor> c( this->query(ns, query, 0, 0, fieldsToReturn, queryOptions) );
+ uassert( 13386, "socket error for mapping query", c.get() );
+
+ if ( !doExhaust ) {
+ while( c->more() ) {
+ DBClientCursorBatchIterator i( *c );
+ f( i );
+ n += i.n();
+ }
+ return n;
+ }
+
+ try {
+ while( 1 ) {
+ while( c->moreInCurrentBatch() ) {
+ DBClientCursorBatchIterator i( *c );
+ f( i );
+ n += i.n();
+ }
+
+ if( c->getCursorId() == 0 )
+ break;
+
+ c->exhaustReceiveMore();
+ }
+ }
+ catch(std::exception&) {
+ /* connection CANNOT be used anymore as more data may be on the way from the server.
+ we have to reconnect.
+ */
+ _failed = true;
+ p->shutdown();
+ throw;
+ }
+
+ return n;
+ }
+
+ void DBClientBase::insert( const string & ns , BSONObj obj , int flags) {
+ Message toSend;
+
+ BufBuilder b;
+ b.appendNum( flags );
+ b.appendStr( ns );
+ obj.appendSelfToBufBuilder( b );
+
+ toSend.setData( dbInsert , b.buf() , b.len() );
+
+ say( toSend );
+ }
+
+ void DBClientBase::insert( const string & ns , const vector< BSONObj > &v , int flags) {
+ Message toSend;
+
+ BufBuilder b;
+ b.appendNum( flags );
+ b.appendStr( ns );
+ for( vector< BSONObj >::const_iterator i = v.begin(); i != v.end(); ++i )
+ i->appendSelfToBufBuilder( b );
+
+ toSend.setData( dbInsert, b.buf(), b.len() );
+
+ say( toSend );
+ }
+
+ void DBClientBase::remove( const string & ns , Query obj , bool justOne ) {
+ Message toSend;
+
+ BufBuilder b;
+ int opts = 0;
+ b.appendNum( opts );
+ b.appendStr( ns );
+
+ int flags = 0;
+ if ( justOne )
+ flags |= RemoveOption_JustOne;
+ b.appendNum( flags );
+
+ obj.obj.appendSelfToBufBuilder( b );
+
+ toSend.setData( dbDelete , b.buf() , b.len() );
+
+ say( toSend );
+ }
+
+ void DBClientBase::update( const string & ns , Query query , BSONObj obj , bool upsert , bool multi ) {
+
+ BufBuilder b;
+ b.appendNum( (int)0 ); // reserved
+ b.appendStr( ns );
+
+ int flags = 0;
+ if ( upsert ) flags |= UpdateOption_Upsert;
+ if ( multi ) flags |= UpdateOption_Multi;
+ b.appendNum( flags );
+
+ query.obj.appendSelfToBufBuilder( b );
+ obj.appendSelfToBufBuilder( b );
+
+ Message toSend;
+ toSend.setData( dbUpdate , b.buf() , b.len() );
+
+ say( toSend );
+ }
+
+ auto_ptr<DBClientCursor> DBClientWithCommands::getIndexes( const string &ns ) {
+ return query( Namespace( ns.c_str() ).getSisterNS( "system.indexes" ).c_str() , BSON( "ns" << ns ) );
+ }
+
+ void DBClientWithCommands::dropIndex( const string& ns , BSONObj keys ) {
+ dropIndex( ns , genIndexName( keys ) );
+ }
+
+
+ void DBClientWithCommands::dropIndex( const string& ns , const string& indexName ) {
+ BSONObj info;
+ if ( ! runCommand( nsToDatabase( ns.c_str() ) ,
+ BSON( "deleteIndexes" << NamespaceString( ns ).coll << "index" << indexName ) ,
+ info ) ) {
+ log(_logLevel) << "dropIndex failed: " << info << endl;
+ uassert( 10007 , "dropIndex failed" , 0 );
+ }
+ resetIndexCache();
+ }
+
+ void DBClientWithCommands::dropIndexes( const string& ns ) {
+ BSONObj info;
+ uassert( 10008 , "dropIndexes failed" , runCommand( nsToDatabase( ns.c_str() ) ,
+ BSON( "deleteIndexes" << NamespaceString( ns ).coll << "index" << "*") ,
+ info ) );
+ resetIndexCache();
+ }
+
+ void DBClientWithCommands::reIndex( const string& ns ) {
+ list<BSONObj> all;
+ auto_ptr<DBClientCursor> i = getIndexes( ns );
+ while ( i->more() ) {
+ all.push_back( i->next().getOwned() );
+ }
+
+ dropIndexes( ns );
+
+ for ( list<BSONObj>::iterator i=all.begin(); i!=all.end(); i++ ) {
+ BSONObj o = *i;
+ insert( Namespace( ns.c_str() ).getSisterNS( "system.indexes" ).c_str() , o );
+ }
+
+ }
+
+
+ string DBClientWithCommands::genIndexName( const BSONObj& keys ) {
+ stringstream ss;
+
+        bool first = true;
+ for ( BSONObjIterator i(keys); i.more(); ) {
+ BSONElement f = i.next();
+
+ if ( first )
+                first = false;
+ else
+ ss << "_";
+
+ ss << f.fieldName() << "_";
+ if( f.isNumber() )
+ ss << f.numberInt();
+ }
+ return ss.str();
+ }
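+
+    /* For illustration: genIndexName( BSON( "a" << 1 << "b" << -1 ) ) yields "a_1_b_-1". */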
+
+ bool DBClientWithCommands::ensureIndex( const string &ns , BSONObj keys , bool unique, const string & name , bool cache, bool background, int version ) {
+ BSONObjBuilder toSave;
+ toSave.append( "ns" , ns );
+ toSave.append( "key" , keys );
+
+ string cacheKey(ns);
+ cacheKey += "--";
+
+ if ( name != "" ) {
+ toSave.append( "name" , name );
+ cacheKey += name;
+ }
+ else {
+ string nn = genIndexName( keys );
+ toSave.append( "name" , nn );
+ cacheKey += nn;
+ }
+
+ if( version >= 0 )
+ toSave.append("v", version);
+
+ if ( unique )
+ toSave.appendBool( "unique", unique );
+
+ if( background )
+ toSave.appendBool( "background", true );
+
+ if ( _seenIndexes.count( cacheKey ) )
+            return false;
+
+ if ( cache )
+ _seenIndexes.insert( cacheKey );
+
+ insert( Namespace( ns.c_str() ).getSisterNS( "system.indexes" ).c_str() , toSave.obj() );
+        return true;
+ }
+
+ void DBClientWithCommands::resetIndexCache() {
+ _seenIndexes.clear();
+ }
+
+ /* -- DBClientCursor ---------------------------------------------- */
+
+#ifdef _DEBUG
+#define CHECK_OBJECT( o , msg ) massert( 10337 , (string)"object not valid" + (msg) , (o).isValid() )
+#else
+#define CHECK_OBJECT( o , msg )
+#endif
+
+ void assembleRequest( const string &ns, BSONObj query, int nToReturn, int nToSkip, const BSONObj *fieldsToReturn, int queryOptions, Message &toSend ) {
+ CHECK_OBJECT( query , "assembleRequest query" );
+ // see query.h for the protocol we are using here.
+ BufBuilder b;
+ int opts = queryOptions;
+ b.appendNum(opts);
+ b.appendStr(ns);
+ b.appendNum(nToSkip);
+ b.appendNum(nToReturn);
+ query.appendSelfToBufBuilder(b);
+ if ( fieldsToReturn )
+ fieldsToReturn->appendSelfToBufBuilder(b);
+ toSend.setData(dbQuery, b.buf(), b.len());
+ }
+
+ void DBClientConnection::say( Message &toSend, bool isRetry ) {
+ checkConnection();
+ try {
+ port().say( toSend );
+ }
+ catch( SocketException & ) {
+ _failed = true;
+ throw;
+ }
+ }
+
+ void DBClientConnection::sayPiggyBack( Message &toSend ) {
+ port().piggyBack( toSend );
+ }
+
+ bool DBClientConnection::recv( Message &m ) {
+ return port().recv(m);
+ }
+
+ bool DBClientConnection::call( Message &toSend, Message &response, bool assertOk , string * actualServer ) {
+ /* todo: this is very ugly. MessagingPort::call returns an error code AND can throw
+ an exception; we should make it return void and just throw an exception any time
+ it fails.
+ */
+ try {
+ if ( !port().call(toSend, response) ) {
+ _failed = true;
+ if ( assertOk )
+ uasserted( 10278 , str::stream() << "dbclient error communicating with server: " << getServerAddress() );
+
+ return false;
+ }
+ }
+ catch( SocketException & ) {
+ _failed = true;
+ throw;
+ }
+ return true;
+ }
+
+ BSONElement getErrField(const BSONObj& o) {
+ BSONElement first = o.firstElement();
+ if( strcmp(first.fieldName(), "$err") == 0 )
+ return first;
+
+ // temp - will be DEV only later
+ /*DEV*/
+ if( 1 ) {
+ BSONElement e = o["$err"];
+ if( !e.eoo() ) {
+ wassert(false);
+ }
+ return e;
+ }
+
+ return BSONElement();
+ }
+
+ bool hasErrField( const BSONObj& o ){
+ return ! getErrField( o ).eoo();
+ }
+
+ void DBClientConnection::checkResponse( const char *data, int nReturned, bool* retry, string* host ) {
+ /* check for errors. the only one we really care about at
+ * this stage is "not master"
+ */
+
+ *retry = false;
+ *host = _serverString;
+
+ if ( clientSet && nReturned ) {
+ assert(data);
+ BSONObj o(data);
+ if ( isNotMasterErrorString( getErrField(o) ) ) {
+ clientSet->isntMaster();
+ }
+ }
+ }
+
+ void DBClientConnection::killCursor( long long cursorId ) {
+ StackBufBuilder b;
+ b.appendNum( (int)0 ); // reserved
+ b.appendNum( (int)1 ); // number
+ b.appendNum( cursorId );
+
+ Message m;
+ m.setData( dbKillCursors , b.buf() , b.len() );
+
+ if ( _lazyKillCursor )
+ sayPiggyBack( m );
+ else
+ say(m);
+ }
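+
+ // For reference, the OP_KILL_CURSORS body built above is:
+ // int32 reserved | int32 numberOfCursorIds | int64 cursorId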
+
+#ifdef MONGO_SSL
+ SSLManager* DBClientConnection::sslManager() {
+ if ( _sslManager )
+ return _sslManager;
+
+ SSLManager* s = new SSLManager(true);
+ _sslManager = s;
+ return s;
+ }
+
+ SSLManager* DBClientConnection::_sslManager = 0;
+#endif
+
+ AtomicUInt DBClientConnection::_numConnections;
+ bool DBClientConnection::_lazyKillCursor = true;
+
+
+ bool serverAlive( const string &uri ) {
+ DBClientConnection c( false, 0, 20 ); // potentially the connection to server could fail while we're checking if it's alive - so use timeouts
+ string err;
+ if ( !c.connect( uri, err ) )
+ return false;
+ if ( !c.simpleCommand( "admin", 0, "ping" ) )
+ return false;
+ return true;
+ }
+
+
+ /** @return the database name portion of an ns string */
+ string nsGetDB( const string &ns ) {
+ string::size_type pos = ns.find( "." );
+ if ( pos == string::npos )
+ return ns;
+
+ return ns.substr( 0 , pos );
+ }
+
+ /** @return the collection name portion of an ns string */
+ string nsGetCollection( const string &ns ) {
+ string::size_type pos = ns.find( "." );
+ if ( pos == string::npos )
+ return "";
+
+ return ns.substr( pos + 1 );
+ }
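+
+ // Illustrative examples (not part of the driver):
+ // nsGetDB( "test.foo" ) == "test"
+ // nsGetCollection( "test.foo" ) == "foo"
+ // nsGetCollection( "test" ) == ""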
+
+
+} // namespace mongo
diff --git a/src/mongo/client/dbclient.h b/src/mongo/client/dbclient.h
new file mode 100644
index 00000000000..76c1358f752
--- /dev/null
+++ b/src/mongo/client/dbclient.h
@@ -0,0 +1,1049 @@
+/** @file dbclient.h
+
+ Core MongoDB C++ driver interfaces are defined here.
+*/
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../pch.h"
+#include "../util/net/message.h"
+#include "../util/net/message_port.h"
+#include "../db/jsobj.h"
+#include "../db/json.h"
+#include <stack>
+
+namespace mongo {
+
+ /** the query field 'options' can have these bits set: */
+ enum QueryOptions {
+ /** Tailable means the cursor is not closed when the last data is retrieved. Rather, the cursor marks
+ the final object's position. You can resume using the cursor later, from where it was located,
+ if more data is received. Set on dbQuery and dbGetMore.
+
+ Like any "latent cursor", the cursor may become invalid at some point -- for example if the
+ final object it references is deleted. Thus, you should be prepared to requery if you get back
+ ResultFlag_CursorNotFound.
+ */
+ QueryOption_CursorTailable = 1 << 1,
+
+ /** Allow queries of a replica slave. Normally these return an error except for the namespace "local".
+ */
+ QueryOption_SlaveOk = 1 << 2,
+
+ // findingStart mode is used to find the first operation of interest when
+ // we are scanning through a repl log. For efficiency in the common case,
+ // where the first operation of interest is closer to the tail than the head,
+ // we start from the tail of the log and work backwards until we find the
+ // first operation of interest. Then we scan forward from that first operation,
+ // actually returning results to the client. During the findingStart phase,
+ // we release the db mutex occasionally to avoid blocking the db process for
+ // an extended period of time.
+ QueryOption_OplogReplay = 1 << 3,
+
+ /** The server normally times out idle cursors after an inactivity period to prevent excess memory use.
+ Set this option to prevent that.
+ */
+ QueryOption_NoCursorTimeout = 1 << 4,
+
+ /** Use with QueryOption_CursorTailable. If we are at the end of the data, block for a while rather
+ than returning no data. After a timeout period, we do return as normal.
+ */
+ QueryOption_AwaitData = 1 << 5,
+
+ /** Stream the data down full blast in multiple "more" packages, on the assumption that the client
+ will fully read all data queried. Faster when you are pulling a lot of data and know you want to
+ pull it all down. Note: you must read all the returned data unless you close the connection.
+
+ Use the query( boost::function<void(const BSONObj&)> f, ... ) version of the connection's query()
+ method, and it will take care of all the details for you.
+ */
+ QueryOption_Exhaust = 1 << 6,
+
+ /** When sharded, this means it's ok to return partial results.
+ Usually we will fail a query if all required shards aren't up.
+ If this is set, a partial result set is returned instead.
+ */
+ QueryOption_PartialResults = 1 << 7 ,
+
+ QueryOption_AllSupported = QueryOption_CursorTailable | QueryOption_SlaveOk | QueryOption_OplogReplay | QueryOption_NoCursorTimeout | QueryOption_AwaitData | QueryOption_Exhaust | QueryOption_PartialResults
+
+ };
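+
+ /* Illustrative sketch (not part of the driver): the options are bit flags
+ and may be OR'd together when issuing a query, e.g. for a tailable oplog
+ cursor (assuming a connected DBClientConnection 'conn'):
+
+ conn.query( "local.oplog.rs" , Query() , 0 , 0 , 0 ,
+ QueryOption_CursorTailable | QueryOption_AwaitData );
+ */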
+
+ enum UpdateOptions {
+ /** Upsert - that is, insert the item if no matching item is found. */
+ UpdateOption_Upsert = 1 << 0,
+
+ /** Update multiple documents (if multiple documents match query expression).
+ (Default is update a single document and stop.) */
+ UpdateOption_Multi = 1 << 1,
+
+ /** flag from mongo saying this update went everywhere */
+ UpdateOption_Broadcast = 1 << 2
+ };
+
+ enum RemoveOptions {
+ /** only delete one matching document */
+ RemoveOption_JustOne = 1 << 0,
+
+ /** flag from mongo saying this update went everywhere */
+ RemoveOption_Broadcast = 1 << 1
+ };
+
+
+ /**
+ * need to put in DbMessage::ReservedOptions as well
+ */
+ enum InsertOptions {
+ /** With multi-insert, keep processing inserts if one fails */
+ InsertOption_ContinueOnError = 1 << 0
+ };
+
+ class DBClientBase;
+
+ /**
+ * ConnectionString handles parsing the different ways to connect to mongo and determining the connection method
+ * samples:
+ * server
+ * server:port
+ * foo/server:port,server:port SET
+ * server,server,server SYNC
+ *
+ * typical use:
+ * string errmsg;
+ * ConnectionString cs = ConnectionString::parse( url , errmsg );
+ * if ( ! cs.isValid() ) throw "bad: " + errmsg;
+ * DBClientBase * conn = cs.connect( errmsg );
+ */
+ class ConnectionString {
+ public:
+ enum ConnectionType { INVALID , MASTER , PAIR , SET , SYNC };
+
+ ConnectionString() {
+ _type = INVALID;
+ }
+
+ ConnectionString( const HostAndPort& server ) {
+ _type = MASTER;
+ _servers.push_back( server );
+ _finishInit();
+ }
+
+ ConnectionString( ConnectionType type , const string& s , const string& setName = "" ) {
+ _type = type;
+ _setName = setName;
+ _fillServers( s );
+
+ switch ( _type ) {
+ case MASTER:
+ assert( _servers.size() == 1 );
+ break;
+ case SET:
+ assert( _setName.size() );
+ assert( _servers.size() >= 1 ); // 1 is ok since we can derive
+ break;
+ case PAIR:
+ assert( _servers.size() == 2 );
+ break;
+ default:
+ assert( _servers.size() > 0 );
+ }
+
+ _finishInit();
+ }
+
+ ConnectionString( const string& s , ConnectionType favoredMultipleType ) {
+ _type = INVALID;
+
+ _fillServers( s );
+ if ( _type != INVALID ) {
+ // set already
+ }
+ else if ( _servers.size() == 1 ) {
+ _type = MASTER;
+ }
+ else {
+ _type = favoredMultipleType;
+ assert( _type == SET || _type == SYNC );
+ }
+ _finishInit();
+ }
+
+ bool isValid() const { return _type != INVALID; }
+
+ string toString() const { return _string; }
+
+ DBClientBase* connect( string& errmsg, double socketTimeout = 0 ) const;
+
+ string getSetName() const { return _setName; }
+
+ vector<HostAndPort> getServers() const { return _servers; }
+
+ ConnectionType type() const { return _type; }
+
+ static ConnectionString parse( const string& url , string& errmsg );
+
+ static string typeToString( ConnectionType type );
+
+ private:
+
+ void _fillServers( string s );
+ void _finishInit();
+
+ ConnectionType _type;
+ vector<HostAndPort> _servers;
+ string _string;
+ string _setName;
+ };
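+
+ /* Illustrative sketch (not part of the driver): parsing the forms documented
+ above.
+
+ string errmsg;
+ ConnectionString single = ConnectionString::parse( "server:27017" , errmsg ); // MASTER
+ ConnectionString set = ConnectionString::parse( "foo/server:27017,server:27018" , errmsg ); // SET
+ if ( set.isValid() ) {
+ DBClientBase* conn = set.connect( errmsg );
+ }
+ */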
+
+ /**
+ * controls how much a client cares about writes
+ * default is NORMAL
+ */
+ enum WriteConcern {
+ W_NONE = 0 , // TODO: not every connection type fully supports this
+ W_NORMAL = 1
+ // TODO SAFE = 2
+ };
+
+ class BSONObj;
+ class ScopedDbConnection;
+ class DBClientCursor;
+ class DBClientCursorBatchIterator;
+
+ /** Represents a Mongo query expression. Typically one uses the QUERY(...) macro to construct a Query object.
+ Examples:
+ QUERY( "age" << 33 << "school" << "UCLA" ).sort("name")
+ QUERY( "age" << GT << 30 << LT << 50 )
+ */
+ class Query {
+ public:
+ BSONObj obj;
+ Query() : obj(BSONObj()) { }
+ Query(const BSONObj& b) : obj(b) { }
+ Query(const string &json) :
+ obj(fromjson(json)) { }
+ Query(const char * json) :
+ obj(fromjson(json)) { }
+
+ /** Add a sort (ORDER BY) criteria to the query expression.
+ @param sortPattern the sort order template. For example to order by name ascending, time descending:
+ { name : 1, ts : -1 }
+ i.e.
+ BSON( "name" << 1 << "ts" << -1 )
+ or
+ fromjson(" name : 1, ts : -1 ")
+ */
+ Query& sort(const BSONObj& sortPattern);
+
+ /** Add a sort (ORDER BY) criteria to the query expression.
+ This version of sort() assumes you want to sort on a single field.
+ @param asc = 1 for ascending order
+ asc = -1 for descending order
+ */
+ Query& sort(const string &field, int asc = 1) { sort( BSON( field << asc ) ); return *this; }
+
+ /** Provide a hint to the query.
+ @param keyPattern Key pattern for the index to use.
+ Example:
+ hint("{ts:1}")
+ */
+ Query& hint(BSONObj keyPattern);
+ Query& hint(const string &jsonKeyPatt) { return hint(fromjson(jsonKeyPatt)); }
+
+ /** Provide min and/or max index limits for the query.
+ min <= x < max
+ */
+ Query& minKey(const BSONObj &val);
+ /**
+ max is exclusive
+ */
+ Query& maxKey(const BSONObj &val);
+
+ /** Return explain information about execution of this query instead of the actual query results.
+ Normally it is easier to use the mongo shell to run db.collection.find(...).explain().
+ */
+ Query& explain();
+
+ /** Use snapshot mode for the query. Snapshot mode assures no duplicates are returned, or objects missed, which were
+ present at both the start and end of the query's execution (if an object is new during the query, or deleted during
+ the query, it may or may not be returned, even with snapshot mode).
+
+ Note that short query responses (less than 1MB) are always effectively snapshotted.
+
+ Currently, snapshot mode may not be used with sorting or explicit hints.
+ */
+ Query& snapshot();
+
+ /** Queries to the Mongo database support a $where parameter option which contains
+ a javascript function that is evaluated to see whether objects being queried match
+ its criteria. Use this helper to append such a function to a query object.
+ Your query may also contain other traditional Mongo query terms.
+
+ @param jscode The javascript function to evaluate against each potential object
+ match. The function must return true for matched objects. Use the this
+ variable to inspect the current object.
+ @param scope Scope for the javascript code. List in a BSON object any
+ variables you would like defined when the jscode executes. One can think
+ of these as "bind variables".
+
+ Examples:
+ conn.findOne("test.coll", Query("{a:3}").where("this.b == 2 || this.c == 3"));
+ Query badBalance = Query().where("this.debits - this.credits < 0");
+ */
+ Query& where(const string &jscode, BSONObj scope);
+ Query& where(const string &jscode) { return where(jscode, BSONObj()); }
+
+ /**
+ * @return true if this query has an orderby, hint, or some other field
+ */
+ bool isComplex( bool * hasDollar = 0 ) const;
+
+ BSONObj getFilter() const;
+ BSONObj getSort() const;
+ BSONObj getHint() const;
+ bool isExplain() const;
+
+ string toString() const;
+ operator string() const { return toString(); }
+ private:
+ void makeComplex();
+ template< class T >
+ void appendComplex( const char *fieldName, const T& val ) {
+ makeComplex();
+ BSONObjBuilder b;
+ b.appendElements(obj);
+ b.append(fieldName, val);
+ obj = b.obj();
+ }
+ };
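+
+ /* Illustrative sketch (not part of the driver): chaining Query modifiers,
+ assuming a connected client 'conn'.
+
+ Query q = QUERY( "age" << GT << 30 ).sort( "name" ).hint( BSON( "age" << 1 ) );
+ BSONObj doc = conn.findOne( "test.people" , q );
+ */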
+
+ /**
+ * Represents a full query description, including all options required for the query to be passed on
+ * to other hosts
+ */
+ class QuerySpec {
+ public:
+
+ string _ns;
+ int _ntoskip;
+ int _ntoreturn;
+ int _options;
+ BSONObj _query;
+ BSONObj _fields;
+ Query _queryObj;
+
+ QuerySpec( const string& ns,
+ const BSONObj& query, const BSONObj& fields,
+ int ntoskip, int ntoreturn, int options )
+ : _ns( ns ), _ntoskip( ntoskip ), _ntoreturn( ntoreturn ), _options( options ),
+ _query( query ), _fields( fields )
+ {
+ _query = _query.getOwned();
+ _fields = _fields.getOwned();
+ _queryObj = Query( _query );
+ }
+
+ QuerySpec() {}
+
+ bool isEmpty() const {
+ return _ns.size() == 0;
+ }
+
+ bool isExplain() const {
+ return _queryObj.isExplain();
+ }
+
+ BSONObj filter() const {
+ return _queryObj.getFilter();
+ }
+
+ BSONObj hint() const {
+ return _queryObj.getHint();
+ }
+
+ BSONObj sort() const {
+ return _queryObj.getSort();
+ }
+
+ BSONObj query(){
+ return _query;
+ }
+
+ BSONObj fields() const { return _fields; }
+
+ string ns() const { return _ns; }
+
+ int ntoskip() const { return _ntoskip; }
+
+ int ntoreturn() const { return _ntoreturn; }
+
+ int options() const { return _options; }
+
+ string toString() const {
+ return str::stream() << "QSpec " << BSON( "ns" << _ns << "n2skip" << _ntoskip << "n2return" << _ntoreturn << "options" << _options
+ << "query" << _query << "fields" << _fields );
+ }
+
+ };
+
+
+ /** Typically one uses the QUERY(...) macro to construct a Query object.
+ Example: QUERY( "age" << 33 << "school" << "UCLA" )
+ */
+#define QUERY(x) mongo::Query( BSON(x) )
+
+ // Useful utilities for namespaces
+ /** @return the database name portion of an ns string */
+ string nsGetDB( const string &ns );
+
+ /** @return the collection name portion of an ns string */
+ string nsGetCollection( const string &ns );
+
+ /**
+ interface that handles communication with the db
+ */
+ class DBConnector {
+ public:
+ virtual ~DBConnector() {}
+ /** actualServer is set to the actual server where they call went if there was a choice (SlaveOk) */
+ virtual bool call( Message &toSend, Message &response, bool assertOk=true , string * actualServer = 0 ) = 0;
+ virtual void say( Message &toSend, bool isRetry = false ) = 0;
+ virtual void sayPiggyBack( Message &toSend ) = 0;
+ /* used by QueryOption_Exhaust. To use that your subclass must implement this. */
+ virtual bool recv( Message& m ) { assert(false); return false; }
+ // In general, for lazy queries, we'll need to say, recv, then checkResponse
+ virtual void checkResponse( const char* data, int nReturned, bool* retry = NULL, string* targetHost = NULL ) {
+ if( retry ) *retry = false;
+ if( targetHost ) *targetHost = "";
+ }
+ virtual bool lazySupported() const = 0;
+ };
+
+ /**
+ The interface that any db connection should implement
+ */
+ class DBClientInterface : boost::noncopyable {
+ public:
+ virtual auto_ptr<DBClientCursor> query(const string &ns, Query query, int nToReturn = 0, int nToSkip = 0,
+ const BSONObj *fieldsToReturn = 0, int queryOptions = 0 , int batchSize = 0 ) = 0;
+
+ virtual void insert( const string &ns, BSONObj obj , int flags=0) = 0;
+
+ virtual void insert( const string &ns, const vector< BSONObj >& v , int flags=0) = 0;
+
+ virtual void remove( const string &ns , Query query, bool justOne = false ) = 0;
+
+ virtual void update( const string &ns , Query query , BSONObj obj , bool upsert = false , bool multi = false ) = 0;
+
+ virtual ~DBClientInterface() { }
+
+ /**
+ @return a single object that matches the query. if none do, then the object is empty
+ @throws AssertionException
+ */
+ virtual BSONObj findOne(const string &ns, const Query& query, const BSONObj *fieldsToReturn = 0, int queryOptions = 0);
+
+ /** query N objects from the database into an array. makes sense mostly when you want a small number of results. if a huge number, use
+ query() and iterate the cursor.
+ */
+ void findN(vector<BSONObj>& out, const string&ns, Query query, int nToReturn, int nToSkip = 0, const BSONObj *fieldsToReturn = 0, int queryOptions = 0);
+
+ virtual string getServerAddress() const = 0;
+
+ /** don't use this - called automatically by DBClientCursor for you */
+ virtual auto_ptr<DBClientCursor> getMore( const string &ns, long long cursorId, int nToReturn = 0, int options = 0 ) = 0;
+ };
+
+ /**
+ DB "commands"
+ Basically just invocations of connection.$cmd.findOne({...});
+ */
+ class DBClientWithCommands : public DBClientInterface {
+ set<string> _seenIndexes;
+ public:
+ /** controls how chatty the client is about network errors & such. See log.h */
+ int _logLevel;
+
+ DBClientWithCommands() : _logLevel(0), _cachedAvailableOptions( (enum QueryOptions)0 ), _haveCachedAvailableOptions(false) { }
+
+ /** helper function. run a simple command where the command expression is simply
+ { command : 1 }
+ @param info -- where to put result object. may be null if caller doesn't need that info
+ @param command -- command name
+ @return true if the command returned "ok".
+ */
+ bool simpleCommand(const string &dbname, BSONObj *info, const string &command);
+
+ /** Run a database command. Database commands are represented as BSON objects. Common database
+ commands have prebuilt helper functions -- see below. If a helper is not available you can
+ directly call runCommand.
+
+ @param dbname database name. Use "admin" for global administrative commands.
+ @param cmd the command object to execute. For example, { ismaster : 1 }
+ @param info the result object the database returns. Typically has { ok : ..., errmsg : ... } fields
+ set.
+ @param options see enum QueryOptions - normally not needed to run a command
+ @return true if the command returned "ok".
+ */
+ virtual bool runCommand(const string &dbname, const BSONObj& cmd, BSONObj &info, int options=0);
+
+ /** Authorize access to a particular database.
+ Authentication is separate for each database on the server -- you may authenticate for any
+ number of databases on a single connection.
+ The "admin" database is special and once authenticated provides access to all databases on the
+ server.
+ @param digestPassword if password is plain text, set this to true. otherwise assumed to be pre-digested
+ @return true if successful
+ */
+ virtual bool auth(const string &dbname, const string &username, const string &pwd, string& errmsg, bool digestPassword = true);
+
+ /** count number of objects in collection ns that match the query criteria specified
+ throws UserAssertion if database returns an error
+ */
+ virtual unsigned long long count(const string &ns, const BSONObj& query = BSONObj(), int options=0, int limit=0, int skip=0 );
+
+ string createPasswordDigest( const string &username , const string &clearTextPassword );
+
+ /** returns true in the isMaster param if this db is the current master
+ of a replica pair.
+
+ pass in info for more details e.g.:
+ { "ismaster" : 1.0 , "msg" : "not paired" , "ok" : 1.0 }
+
+ returns true if command invoked successfully.
+ */
+ virtual bool isMaster(bool& isMaster, BSONObj *info=0);
+
+ /**
+ Create a new collection in the database. Normally, collection creation is automatic. You would
+ use this function if you wish to specify special options on creation.
+
+ If the collection already exists, no action occurs.
+
+ @param ns fully qualified collection name
+ @param size desired initial extent size for the collection.
+ Must be <= 1000000000 for normal collections.
+ For fixed size (capped) collections, this size is the total/max size of the
+ collection.
+ @param capped if true, this is a fixed size collection (where old data rolls out).
+ @param max maximum number of objects if capped (optional).
+
+ returns true if successful.
+ */
+ bool createCollection(const string &ns, long long size = 0, bool capped = false, int max = 0, BSONObj *info = 0);
+
+ /** Get error result from the last write operation (insert/update/delete) on this connection.
+ @return error message text, or empty string if no error.
+ */
+ string getLastError(bool fsync = false, bool j = false, int w = 0, int wtimeout = 0);
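+
+ /* Illustrative sketch (not part of the driver): checking the result of a
+ write, assuming a connected client 'conn'.
+
+ conn.insert( "test.people" , BSON( "name" << "joe" ) );
+ string err = conn.getLastError();
+ if ( !err.empty() )
+ cout << "write failed: " << err << endl;
+ */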
+
+ /** Get error result from the last write operation (insert/update/delete) on this connection.
+ @return full error object.
+
+ If "w" is -1, wait for propagation to majority of nodes.
+ If "wtimeout" is 0, the operation will block indefinitely if needed.
+ */
+ virtual BSONObj getLastErrorDetailed(bool fsync = false, bool j = false, int w = 0, int wtimeout = 0);
+
+ /** Can be called with the returned value from getLastErrorDetailed to extract an error string.
+ If all you need is the string, just call getLastError() instead.
+ */
+ static string getLastErrorString( const BSONObj& res );
+
+ /** Return the last error which has occurred, even if not the very last operation.
+
+ @return { err : <error message>, nPrev : <how_many_ops_back_occurred>, ok : 1 }
+
+ result.err will be null if no error has occurred.
+ */
+ BSONObj getPrevError();
+
+ /** Reset the previous error state for this connection (accessed via getLastError and
+ getPrevError). Useful when performing several operations at once and then checking
+ for an error after attempting all operations.
+ */
+ bool resetError() { return simpleCommand("admin", 0, "reseterror"); }
+
+ /** Delete the specified collection. */
+ virtual bool dropCollection( const string &ns ) {
+ string db = nsGetDB( ns );
+ string coll = nsGetCollection( ns );
+ uassert( 10011 , "no collection name", coll.size() );
+
+ BSONObj info;
+
+ bool res = runCommand( db.c_str() , BSON( "drop" << coll ) , info );
+ resetIndexCache();
+ return res;
+ }
+
+ /** Perform a repair and compaction of the specified database. May take a long time to run. Disk space
+ must be available equal to the size of the database while repairing.
+ */
+ bool repairDatabase(const string &dbname, BSONObj *info = 0) {
+ return simpleCommand(dbname, info, "repairDatabase");
+ }
+
+ /** Copy database from one server or name to another server or name.
+
+ Generally, you should dropDatabase() first as otherwise the copied information will MERGE
+ into whatever data is already present in this database.
+
+ For security reasons this function only works when you are authorized to access the "admin" db. However,
+ if you have access to said db, you can copy any database from one place to another.
+ TODO: this needs enhancement to be more flexible in terms of security.
+
+ This method provides a way to "rename" a database by copying it to a new db name and
+ location. The copy is "repaired" and compacted.
+
+ fromdb database name from which to copy.
+ todb database name to copy to.
+ fromhost hostname of the database (and optionally, ":port") from which to
+ copy the data. copies from self if "".
+
+ returns true if successful
+ */
+ bool copyDatabase(const string &fromdb, const string &todb, const string &fromhost = "", BSONObj *info = 0);
+
+ /** The Mongo database provides built-in performance profiling capabilities. Use setDbProfilingLevel()
+ to enable. Profiling information is then written to the system.profile collection, which one can
+ then query.
+ */
+ enum ProfilingLevel {
+ ProfileOff = 0,
+ ProfileSlow = 1, // log very slow (>100ms) operations
+ ProfileAll = 2
+
+ };
+ bool setDbProfilingLevel(const string &dbname, ProfilingLevel level, BSONObj *info = 0);
+ bool getDbProfilingLevel(const string &dbname, ProfilingLevel& level, BSONObj *info = 0);
+
+
+ /** This implicitly converts from char*, string, and BSONObj to be an argument to mapreduce
+ You shouldn't need to explicitly construct this
+ */
+ struct MROutput {
+ MROutput(const char* collection) : out(BSON("replace" << collection)) {}
+ MROutput(const string& collection) : out(BSON("replace" << collection)) {}
+ MROutput(const BSONObj& obj) : out(obj) {}
+
+ BSONObj out;
+ };
+ static MROutput MRInline;
+
+ /** Run a map/reduce job on the server.
+
+ See http://www.mongodb.org/display/DOCS/MapReduce
+
+ ns namespace (db+collection name) of input data
+ jsmapf javascript map function code
+ jsreducef javascript reduce function code.
+ query optional query filter for the input
+ output either a string collection name or an object representing output type
+ if not specified uses inline output type
+
+ returns a result object which contains:
+ { result : <collection_name>,
+ numObjects : <number_of_objects_scanned>,
+ timeMillis : <job_time>,
+ ok : <1_if_ok>,
+ [, err : <errmsg_if_error>]
+ }
+
+ For example one might call:
+ result.getField("ok").trueValue()
+ on the result to check if ok.
+ */
+ BSONObj mapreduce(const string &ns, const string &jsmapf, const string &jsreducef, BSONObj query = BSONObj(), MROutput output = MRInline);
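+
+ /* Illustrative sketch (not part of the driver): a word-count style job with
+ hypothetical map/reduce bodies, writing to the collection "test.event_counts".
+
+ BSONObj res = conn.mapreduce(
+ "test.events" ,
+ "function(){ emit( this.type , 1 ); }" ,
+ "function(k,vals){ var n = 0; vals.forEach(function(v){ n += v; }); return n; }" ,
+ BSONObj() , "test.event_counts" );
+ bool ok = res["ok"].trueValue();
+ */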
+
+ /** Run javascript code on the database server.
+ dbname database context in which the code runs. The javascript variable 'db' will be assigned
+ to this database when the function is invoked.
+ jscode source code for a javascript function.
+ info the command object which contains any information on the invocation result including
+ the return value and other information. If an error occurs running the jscode, error
+ information will be in info. (try "out() << info.toString()")
+ retValue return value from the jscode function.
+ args args to pass to the jscode function. when invoked, the 'args' variable will be defined
+ for use by the jscode.
+
+ returns true if runs ok.
+
+ See testDbEval() in dbclient.cpp for an example of usage.
+ */
+ bool eval(const string &dbname, const string &jscode, BSONObj& info, BSONElement& retValue, BSONObj *args = 0);
+
+ /** validate a collection, checking for errors and reporting back statistics.
+ this operation is slow and blocking.
+ */
+ bool validate( const string &ns , bool scandata=true ) {
+ BSONObj cmd = BSON( "validate" << nsGetCollection( ns ) << "scandata" << scandata );
+ BSONObj info;
+ return runCommand( nsGetDB( ns ).c_str() , cmd , info );
+ }
+
+ /* The following helpers are simply more convenient forms of eval() for certain common cases */
+
+ /* invocation with no return value of interest -- with or without one simple parameter */
+ bool eval(const string &dbname, const string &jscode);
+ template< class T >
+ bool eval(const string &dbname, const string &jscode, T parm1) {
+ BSONObj info;
+ BSONElement retValue;
+ BSONObjBuilder b;
+ b.append("0", parm1);
+ BSONObj args = b.done();
+ return eval(dbname, jscode, info, retValue, &args);
+ }
+
+ /** eval invocation with one parm to server and one numeric field (either int or double) returned */
+ template< class T, class NumType >
+ bool eval(const string &dbname, const string &jscode, T parm1, NumType& ret) {
+ BSONObj info;
+ BSONElement retValue;
+ BSONObjBuilder b;
+ b.append("0", parm1);
+ BSONObj args = b.done();
+ if ( !eval(dbname, jscode, info, retValue, &args) )
+ return false;
+ ret = (NumType) retValue.number();
+ return true;
+ }
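+
+ /* Illustrative sketch (not part of the driver): passing one parameter to the
+ server and reading back a numeric result.
+
+ int out = 0;
+ if ( conn.eval( "test" , "function(x){ return x * 2; }" , 21 , out ) )
+ cout << out << endl; // prints 42
+ */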
+
+ /**
+ get a list of all the current databases
+ uses the { listDatabases : 1 } command.
+ throws on error
+ */
+ list<string> getDatabaseNames();
+
+ /**
+ get a list of all the current collections in db
+ */
+ list<string> getCollectionNames( const string& db );
+
+ bool exists( const string& ns );
+
+ /** Create an index if it does not already exist.
+ ensureIndex calls are remembered so it is safe/fast to call this function many
+ times in your code.
+ @param ns collection to be indexed
+ @param keys the "key pattern" for the index. e.g., { name : 1 }
+ @param unique if true, indicates that key uniqueness should be enforced for this index
+ @param name if not specified, it will be created from the keys automatically (which is recommended)
+ @param cache if set to false, the index cache for the connection won't remember this call
+ @param background build index in the background (see mongodb docs/wiki for details)
+ @param v index version. leave at default value. (unit tests set this parameter.)
+ @return whether or not sent message to db.
+ should be true on first call, false on subsequent unless resetIndexCache was called
+ */
+ virtual bool ensureIndex( const string &ns , BSONObj keys , bool unique = false, const string &name = "",
+ bool cache = true, bool background = false, int v = -1 );
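+
+ /* Illustrative sketch (not part of the driver): repeated calls are cheap
+ because of the index cache.
+
+ conn.ensureIndex( "test.people" , BSON( "name" << 1 ) ); // sends to server, returns true
+ conn.ensureIndex( "test.people" , BSON( "name" << 1 ) ); // cached, no-op, returns false
+ */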
+
+ /**
+ clears the index cache, so the subsequent call to ensureIndex for any index will go to the server
+ */
+ virtual void resetIndexCache();
+
+ virtual auto_ptr<DBClientCursor> getIndexes( const string &ns );
+
+ virtual void dropIndex( const string& ns , BSONObj keys );
+ virtual void dropIndex( const string& ns , const string& indexName );
+
+ /**
+ drops all indexes for the collection
+ */
+ virtual void dropIndexes( const string& ns );
+
+ virtual void reIndex( const string& ns );
+
+ string genIndexName( const BSONObj& keys );
+
+ /** Erase / drop an entire database */
+ virtual bool dropDatabase(const string &dbname, BSONObj *info = 0) {
+ bool ret = simpleCommand(dbname, info, "dropDatabase");
+ resetIndexCache();
+ return ret;
+ }
+
+ virtual string toString() = 0;
+
+ protected:
+ /** if the result of a command is ok*/
+ bool isOk(const BSONObj&);
+
+ /** if the element contains a not master error */
+ bool isNotMasterErrorString( const BSONElement& e );
+
+ BSONObj _countCmd(const string &ns, const BSONObj& query, int options, int limit, int skip );
+
+ enum QueryOptions availableOptions();
+
+ private:
+ enum QueryOptions _cachedAvailableOptions;
+ bool _haveCachedAvailableOptions;
+ };
+
+ /**
+ abstract class that implements the core db operations
+ */
+ class DBClientBase : public DBClientWithCommands, public DBConnector {
+ protected:
+ WriteConcern _writeConcern;
+
+ public:
+ DBClientBase() {
+ _writeConcern = W_NORMAL;
+ }
+
+ WriteConcern getWriteConcern() const { return _writeConcern; }
+ void setWriteConcern( WriteConcern w ) { _writeConcern = w; }
+
+ /** send a query to the database.
+ @param ns namespace to query, format is <dbname>.<collectname>[.<collectname>]*
+ @param query query to perform on the collection. this is a BSONObj (binary JSON)
+ You may format as
+ { query: { ... }, orderby: { ... } }
+ to specify a sort order.
+ @param nToReturn n to return (i.e., limit). 0 = unlimited
+ @param nToSkip start with the nth item
+ @param fieldsToReturn optional template of which fields to select. if unspecified, returns all fields
+ @param queryOptions see options enum at top of this file
+
+ @return cursor. 0 if error (connection failure)
+ @throws AssertionException
+ */
+ virtual auto_ptr<DBClientCursor> query(const string &ns, Query query, int nToReturn = 0, int nToSkip = 0,
+ const BSONObj *fieldsToReturn = 0, int queryOptions = 0 , int batchSize = 0 );
+
+ /** don't use this - called automatically by DBClientCursor for you
+ @param cursorId id of cursor to retrieve
+ @return a handle to a previously allocated cursor
+ @throws AssertionException
+ */
+ virtual auto_ptr<DBClientCursor> getMore( const string &ns, long long cursorId, int nToReturn = 0, int options = 0 );
+
+ /**
+ insert an object into the database
+ */
+ virtual void insert( const string &ns , BSONObj obj , int flags=0);
+
+ /**
+ insert a vector of objects into the database
+ */
+ virtual void insert( const string &ns, const vector< BSONObj >& v , int flags=0);
+
+ /**
+ remove matching objects from the database
+ @param justOne if this is true, the remove stops after a single match is found
+ */
+ virtual void remove( const string &ns , Query q , bool justOne = false );
+
+ /**
+ updates objects matching query
+ */
+ virtual void update( const string &ns , Query query , BSONObj obj , bool upsert = false , bool multi = false );
+
+ virtual bool isFailed() const = 0;
+
+ virtual void killCursor( long long cursorID ) = 0;
+
+ virtual bool callRead( Message& toSend , Message& response ) = 0;
+ // virtual bool callWrite( Message& toSend , Message& response ) = 0; // TODO: add this if needed
+
+ virtual ConnectionString::ConnectionType type() const = 0;
+
+ virtual double getSoTimeout() const = 0;
+
+ }; // DBClientBase
+
+ class DBClientReplicaSet;
+
+ class ConnectException : public UserException {
+ public:
+ ConnectException(string msg) : UserException(9000,msg) { }
+ };
+
+ /**
+ A basic connection to the database.
+ This is the main entry point for talking to a simple Mongo setup
+ */
+ class DBClientConnection : public DBClientBase {
+ public:
+ /**
+ @param _autoReconnect if true, automatically reconnect on a connection failure
+ @param cp used by DBClientReplicaSet. You do not need to specify this parameter
+ @param timeout tcp timeout in seconds - this is for read/write, not connect.
+ Connect timeout is fixed, but short, at 5 seconds.
+ */
+ DBClientConnection(bool _autoReconnect=false, DBClientReplicaSet* cp=0, double so_timeout=0) :
+ clientSet(cp), _failed(false), autoReconnect(_autoReconnect), lastReconnectTry(0), _so_timeout(so_timeout) {
+ _numConnections++;
+ }
+
+ virtual ~DBClientConnection() {
+ _numConnections--;
+ }
+
+ /** Connect to a Mongo database server.
+
+ If autoReconnect is true, you can try to use the DBClientConnection even when
+ false was returned -- it will try to connect again.
+
+ @param serverHostname host to connect to. can include port number ( 127.0.0.1 , 127.0.0.1:5555 )
+ If you use IPv6 you must add a port number ( ::1:27017 )
+ @param errmsg any relevant error message will be appended to the string
+ @deprecated please use HostAndPort
+ @return false if fails to connect.
+ */
+ virtual bool connect(const char * hostname, string& errmsg) {
+ // TODO: remove this method
+ HostAndPort t( hostname );
+ return connect( t , errmsg );
+ }
+
+ /** Connect to a Mongo database server.
+
+ If autoReconnect is true, you can try to use the DBClientConnection even when
+ false was returned -- it will try to connect again.
+
+ @param server server to connect to.
+ @param errmsg any relevant error message will be appended to the string
+ @return false if fails to connect.
+ */
+ virtual bool connect(const HostAndPort& server, string& errmsg);
+
+ /** Connect to a Mongo database server. Exception throwing version.
+ Throws a UserException if cannot connect.
+
+ If autoReconnect is true, you can try to use the DBClientConnection even when
+ false was returned -- it will try to connect again.
+
+ @param serverHostname host to connect to. can include port number ( 127.0.0.1 , 127.0.0.1:5555 )
+ */
+ void connect(const string& serverHostname) {
+ string errmsg;
+ if( !connect(HostAndPort(serverHostname), errmsg) )
+ throw ConnectException(string("can't connect ") + errmsg);
+ }
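+
+ /* Illustrative sketch (not part of the driver): the typical connect-and-query
+ sequence with this class.
+
+ DBClientConnection conn;
+ conn.connect( "localhost:27017" ); // throws ConnectException on failure
+ auto_ptr<DBClientCursor> c = conn.query( "test.people" , Query() );
+ while ( c->more() )
+ cout << c->next().toString() << endl;
+ */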
+
+ virtual bool auth(const string &dbname, const string &username, const string &pwd, string& errmsg, bool digestPassword = true);
+
+ virtual auto_ptr<DBClientCursor> query(const string &ns, Query query=Query(), int nToReturn = 0, int nToSkip = 0,
+ const BSONObj *fieldsToReturn = 0, int queryOptions = 0 , int batchSize = 0 ) {
+ checkConnection();
+ return DBClientBase::query( ns, query, nToReturn, nToSkip, fieldsToReturn, queryOptions , batchSize );
+ }
+
+ /** Uses QueryOption_Exhaust
+ Exhaust mode streams back all query data as fast as possible, with no back-and-forth for OP_GETMORE. If you are certain
+ you will exhaust the query, it could be useful.
+
+ Use DBClientCursorBatchIterator version if you want to do items in large blocks, perhaps to avoid granular locking and such.
+ */
+ unsigned long long query( boost::function<void(const BSONObj&)> f, const string& ns, Query query, const BSONObj *fieldsToReturn = 0, int queryOptions = 0);
+ unsigned long long query( boost::function<void(DBClientCursorBatchIterator&)> f, const string& ns, Query query, const BSONObj *fieldsToReturn = 0, int queryOptions = 0);
+
+ virtual bool runCommand(const string &dbname, const BSONObj& cmd, BSONObj &info, int options=0);
+
+ /**
+ @return true if this connection is currently in a failed state. When autoreconnect is on,
+ a connection will transition back to an ok state after reconnecting.
+ */
+ bool isFailed() const { return _failed; }
+
+ MessagingPort& port() { assert(p); return *p; }
+
+ string toStringLong() const {
+ stringstream ss;
+ ss << _serverString;
+ if ( _failed ) ss << " failed";
+ return ss.str();
+ }
+
+ /** Returns the address of the server */
+ string toString() { return _serverString; }
+
+ string getServerAddress() const { return _serverString; }
+
+ virtual void killCursor( long long cursorID );
+ virtual bool callRead( Message& toSend , Message& response ) { return call( toSend , response ); }
+ virtual void say( Message &toSend, bool isRetry = false );
+ virtual bool recv( Message& m );
+ virtual void checkResponse( const char *data, int nReturned, bool* retry = NULL, string* host = NULL );
+ virtual bool call( Message &toSend, Message &response, bool assertOk = true , string * actualServer = 0 );
+ virtual ConnectionString::ConnectionType type() const { return ConnectionString::MASTER; }
+ void setSoTimeout(double to) { _so_timeout = to; }
+ double getSoTimeout() const { return _so_timeout; }
+
+ virtual bool lazySupported() const { return true; }
+
+ static int getNumConnections() {
+ return _numConnections;
+ }
+
+ static void setLazyKillCursor( bool lazy ) { _lazyKillCursor = lazy; }
+ static bool getLazyKillCursor() { return _lazyKillCursor; }
+
+ protected:
+ friend class SyncClusterConnection;
+ virtual void sayPiggyBack( Message &toSend );
+
+ DBClientReplicaSet *clientSet;
+ boost::scoped_ptr<MessagingPort> p;
+ boost::scoped_ptr<SockAddr> server;
+ bool _failed;
+ const bool autoReconnect;
+ time_t lastReconnectTry;
+ HostAndPort _server; // remember for reconnects
+ string _serverString;
+ void _checkConnection();
+
+ // throws SocketException if in failed state and not reconnecting or if waiting to reconnect
+ void checkConnection() { if( _failed ) _checkConnection(); }
+
+ map< string, pair<string,string> > authCache;
+ double _so_timeout;
+ bool _connect( string& errmsg );
+
+ static AtomicUInt _numConnections;
+ static bool _lazyKillCursor; // lazy means we piggy back kill cursors on next op
+
+#ifdef MONGO_SSL
+ static SSLManager* sslManager();
+ static SSLManager* _sslManager;
+#endif
+ };
+
+ /** pings server to check if it's up
+ */
+ bool serverAlive( const string &uri );
+
+ DBClientBase * createDirectClient();
+
+ BSONElement getErrField( const BSONObj& result );
+ bool hasErrField( const BSONObj& result );
+
+ inline std::ostream& operator<<( std::ostream &s, const Query &q ) {
+ return s << q.toString();
+ }
+
+} // namespace mongo
+
+#include "dbclientcursor.h"
+#include "dbclient_rs.h"
+#include "undef_macros.h"
diff --git a/src/mongo/client/dbclient_rs.cpp b/src/mongo/client/dbclient_rs.cpp
new file mode 100644
index 00000000000..2d9e0fbabba
--- /dev/null
+++ b/src/mongo/client/dbclient_rs.cpp
@@ -0,0 +1,993 @@
+// dbclient_rs.cpp - connect to a MongoDB replica set from C++
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "dbclient.h"
+#include "../bson/util/builder.h"
+#include "../db/jsobj.h"
+#include "../db/json.h"
+#include "../db/dbmessage.h"
+#include "connpool.h"
+#include "dbclient_rs.h"
+#include "../util/background.h"
+#include "../util/timer.h"
+
+namespace mongo {
+
+ // --------------------------------
+ // ----- ReplicaSetMonitor ---------
+ // --------------------------------
+
+ // global background job responsible for periodically checking all registered replica set monitors
+ class ReplicaSetMonitorWatcher : public BackgroundJob {
+ public:
+ ReplicaSetMonitorWatcher() : _safego("ReplicaSetMonitorWatcher::_safego") , _started(false) {}
+
+ virtual string name() const { return "ReplicaSetMonitorWatcher"; }
+
+ void safeGo() {
+ // check outside of lock for speed
+ if ( _started )
+ return;
+
+ scoped_lock lk( _safego );
+ if ( _started )
+ return;
+ _started = true;
+
+ go();
+ }
+ protected:
+ void run() {
+ log() << "starting" << endl;
+ while ( ! inShutdown() ) {
+ sleepsecs( 10 );
+ try {
+ ReplicaSetMonitor::checkAll( true );
+ }
+ catch ( std::exception& e ) {
+ error() << "check failed: " << e.what() << endl;
+ }
+ catch ( ... ) {
+ error() << "unkown error" << endl;
+ }
+ }
+ }
+
+ mongo::mutex _safego;
+ bool _started;
+
+ } replicaSetMonitorWatcher;
+
+ string seedString( const vector<HostAndPort>& servers ){
+ string seedStr;
+ for ( unsigned i = 0; i < servers.size(); i++ ){
+ seedStr += servers[i].toString();
+ if( i < servers.size() - 1 ) seedStr += ",";
+ }
+
+ return seedStr;
+ }
+
+ ReplicaSetMonitor::ReplicaSetMonitor( const string& name , const vector<HostAndPort>& servers )
+ : _lock( "ReplicaSetMonitor instance" ) , _checkConnectionLock( "ReplicaSetMonitor check connection lock" ), _name( name ) , _master(-1), _nextSlave(0) {
+
+ uassert( 13642 , "need at least 1 node for a replica set" , servers.size() > 0 );
+
+ if ( _name.size() == 0 ) {
+ warning() << "replica set name empty, first node: " << servers[0] << endl;
+ }
+
+ log() << "starting new replica set monitor for replica set " << _name << " with seed of " << seedString( servers ) << endl;
+
+ string errmsg;
+ for ( unsigned i = 0; i < servers.size(); i++ ) {
+
+ // Don't check servers we have already
+ if( _find_inlock( servers[i] ) >= 0 ) continue;
+
+ auto_ptr<DBClientConnection> conn( new DBClientConnection( true , 0, 5.0 ) );
+ try{
+ if( ! conn->connect( servers[i] , errmsg ) ){
+ throw DBException( errmsg, 15928 );
+ }
+ log() << "successfully connected to seed " << servers[i] << " for replica set " << this->_name << endl;
+ }
+ catch( DBException& e ){
+ log() << "error connecting to seed " << servers[i] << causedBy( e ) << endl;
+ // skip seeds that don't work
+ continue;
+ }
+
+ string maybePrimary;
+ _checkConnection( conn.get(), maybePrimary, false, -1 );
+ }
+
+ // Check everything to get the first data
+ _check( true );
+
+ log() << "replica set monitor for replica set " << _name << " started, address is " << getServerAddress() << endl;
+
+ }
+
+ ReplicaSetMonitor::~ReplicaSetMonitor() {
+ _nodes.clear();
+ _master = -1;
+ }
+
+ ReplicaSetMonitorPtr ReplicaSetMonitor::get( const string& name , const vector<HostAndPort>& servers ) {
+ scoped_lock lk( _setsLock );
+ ReplicaSetMonitorPtr& m = _sets[name];
+ if ( ! m )
+ m.reset( new ReplicaSetMonitor( name , servers ) );
+
+ replicaSetMonitorWatcher.safeGo();
+
+ return m;
+ }
+
+ ReplicaSetMonitorPtr ReplicaSetMonitor::get( const string& name ) {
+ scoped_lock lk( _setsLock );
+ map<string,ReplicaSetMonitorPtr>::const_iterator i = _sets.find( name );
+ if ( i == _sets.end() )
+ return ReplicaSetMonitorPtr();
+ return i->second;
+ }
+
+
+ void ReplicaSetMonitor::checkAll( bool checkAllSecondaries ) {
+ set<string> seen;
+
+ while ( true ) {
+ ReplicaSetMonitorPtr m;
+ {
+ scoped_lock lk( _setsLock );
+ for ( map<string,ReplicaSetMonitorPtr>::iterator i=_sets.begin(); i!=_sets.end(); ++i ) {
+ string name = i->first;
+ if ( seen.count( name ) )
+ continue;
+ LOG(1) << "checking replica set: " << name << endl;
+ seen.insert( name );
+ m = i->second;
+ break;
+ }
+ }
+
+ if ( ! m )
+ break;
+
+ m->check( checkAllSecondaries );
+ }
+
+
+ }
+
+ void ReplicaSetMonitor::setConfigChangeHook( ConfigChangeHook hook ) {
+ massert( 13610 , "ConfigChangeHook already specified" , _hook == 0 );
+ _hook = hook;
+ }
+
+ string ReplicaSetMonitor::getServerAddress() const {
+ scoped_lock lk( _lock );
+ return _getServerAddress_inlock();
+ }
+
+ string ReplicaSetMonitor::_getServerAddress_inlock() const {
+ StringBuilder ss;
+ if ( _name.size() )
+ ss << _name << "/";
+
+ for ( unsigned i=0; i<_nodes.size(); i++ ) {
+ if ( i > 0 )
+ ss << ",";
+ ss << _nodes[i].addr.toString();
+ }
+
+ return ss.str();
+ }
+
+ bool ReplicaSetMonitor::contains( const string& server ) const {
+ scoped_lock lk( _lock );
+ for ( unsigned i=0; i<_nodes.size(); i++ ) {
+ if ( _nodes[i].addr == server )
+ return true;
+ }
+ return false;
+ }
+
+
+ void ReplicaSetMonitor::notifyFailure( const HostAndPort& server ) {
+ scoped_lock lk( _lock );
+ if ( _master >= 0 && _master < (int)_nodes.size() ) {
+ if ( server == _nodes[_master].addr ) {
+ _nodes[_master].ok = false;
+ _master = -1;
+ }
+ }
+ }
+
+
+
+ HostAndPort ReplicaSetMonitor::getMaster() {
+ {
+ scoped_lock lk( _lock );
+ if ( _master >= 0 && _nodes[_master].ok )
+ return _nodes[_master].addr;
+ }
+
+ _check( false );
+
+ scoped_lock lk( _lock );
+ uassert( 10009 , str::stream() << "ReplicaSetMonitor no master found for set: " << _name , _master >= 0 );
+ return _nodes[_master].addr;
+ }
+
+ HostAndPort ReplicaSetMonitor::getSlave( const HostAndPort& prev ) {
+ // make sure it's valid
+
+ bool wasFound = false;
+ bool wasMaster = false;
+
+ // This is always true, since checked in port()
+ assert( prev.port() >= 0 );
+ if( prev.host().size() ){
+ scoped_lock lk( _lock );
+ for ( unsigned i=0; i<_nodes.size(); i++ ) {
+ if ( prev != _nodes[i].addr )
+ continue;
+
+ wasFound = true;
+
+ if ( _nodes[i].okForSecondaryQueries() )
+ return prev;
+
+ wasMaster = _nodes[i].ok && ! _nodes[i].secondary;
+
+ break;
+ }
+ }
+
+ if( prev.host().size() ){
+ if( wasFound ){
+ LOG(1) << "slave '" << prev << ( wasMaster ? "' is master node, trying to find another node" :
+ "' is no longer ok to use" ) << endl;
+ }
+ else{
+ LOG(1) << "slave '" << prev << "' was not found in the replica set" << endl;
+ }
+ }
+ else LOG(1) << "slave '" << prev << "' is not initialized or invalid" << endl;
+
+ return getSlave();
+ }
+
+ HostAndPort ReplicaSetMonitor::getSlave() {
+ LOG(2) << "dbclient_rs getSlave " << getServerAddress() << endl;
+
+ scoped_lock lk( _lock );
+
+ for ( unsigned ii = 0; ii < _nodes.size(); ii++ ) {
+ _nextSlave = ( _nextSlave + 1 ) % _nodes.size();
+ if ( _nextSlave != _master ) {
+ if ( _nodes[ _nextSlave ].okForSecondaryQueries() )
+ return _nodes[ _nextSlave ].addr;
+ LOG(2) << "dbclient_rs getSlave not selecting " << _nodes[_nextSlave] << ", not currently okForSecondaryQueries" << endl;
+ }
+ }
+ uassert(15899, str::stream() << "No suitable member found for slaveOk query in replica set: " << _name, _master >= 0 && _nodes[_master].ok);
+
+ // Fall back to primary
+ assert( static_cast<unsigned>(_master) < _nodes.size() );
+ LOG(2) << "dbclient_rs getSlave no member in secondary state found, returning primary " << _nodes[ _master ] << endl;
+ return _nodes[_master].addr;
+ }
+
+ /**
+ * notify the monitor that the server has failed
+ */
+ void ReplicaSetMonitor::notifySlaveFailure( const HostAndPort& server ) {
+ int x = _find( server );
+ if ( x >= 0 ) {
+ scoped_lock lk( _lock );
+ _nodes[x].ok = false;
+ }
+ }
+
+ void ReplicaSetMonitor::_checkStatus(DBClientConnection *conn) {
+ BSONObj status;
+
+ if (!conn->runCommand("admin", BSON("replSetGetStatus" << 1), status) ) {
+ LOG(1) << "dbclient_rs replSetGetStatus failed" << endl;
+ return;
+ }
+ if( !status.hasField("members") ) {
+ log() << "dbclient_rs error expected members field in replSetGetStatus result" << endl;
+ return;
+ }
+ if( status["members"].type() != Array) {
+ log() << "dbclient_rs error expected members field in replSetGetStatus result to be an array" << endl;
+ return;
+ }
+
+ BSONObjIterator hi(status["members"].Obj());
+ while (hi.more()) {
+ BSONObj member = hi.next().Obj();
+ string host = member["name"].String();
+
+ int m = -1;
+ if ((m = _find(host)) < 0) {
+ LOG(1) << "dbclient_rs _checkStatus couldn't _find(" << host << ')' << endl;
+ continue;
+ }
+
+ double state = member["state"].Number();
+ if (member["health"].Number() == 1 && (state == 1 || state == 2)) {
+ LOG(1) << "dbclient_rs nodes["<<m<<"].ok = true " << host << endl;
+ scoped_lock lk( _lock );
+ _nodes[m].ok = true;
+ }
+ else {
+ LOG(1) << "dbclient_rs nodes["<<m<<"].ok = false " << host << endl;
+ scoped_lock lk( _lock );
+ _nodes[m].ok = false;
+ }
+ }
+ }
+
+ NodeDiff ReplicaSetMonitor::_getHostDiff_inlock( const BSONObj& hostList ){
+
+ NodeDiff diff;
+ set<int> nodesFound;
+
+ int index = 0;
+ BSONObjIterator hi( hostList );
+ while( hi.more() ){
+
+ string toCheck = hi.next().String();
+ int nodeIndex = _find_inlock( toCheck );
+
+ // Node-to-add
+ if( nodeIndex < 0 ) diff.first.insert( toCheck );
+ else nodesFound.insert( nodeIndex );
+
+ index++;
+ }
+
+ for( size_t i = 0; i < _nodes.size(); i++ ){
+ if( nodesFound.find( static_cast<int>(i) ) == nodesFound.end() ) diff.second.insert( static_cast<int>(i) );
+ }
+
+ return diff;
+ }
+
+ bool ReplicaSetMonitor::_shouldChangeHosts( const BSONObj& hostList, bool inlock ){
+
+ int origHosts = 0;
+ if( ! inlock ){
+ scoped_lock lk( _lock );
+ origHosts = _nodes.size();
+ }
+ else origHosts = _nodes.size();
+ int numHosts = 0;
+ bool changed = false;
+
+ BSONObjIterator hi(hostList);
+ while ( hi.more() ) {
+ string toCheck = hi.next().String();
+
+ numHosts++;
+ int index = 0;
+ if( ! inlock ) index = _find( toCheck );
+ else index = _find_inlock( toCheck );
+
+ if ( index >= 0 ) continue;
+
+ changed = true;
+ break;
+ }
+
+ return changed || origHosts != numHosts;
+
+ }
+
+ void ReplicaSetMonitor::_checkHosts( const BSONObj& hostList, bool& changed ) {
+
+ // Fast path, still requires intermittent locking
+ if( ! _shouldChangeHosts( hostList, false ) ){
+ changed = false;
+ return;
+ }
+
+ // Slow path, double-checked though
+ scoped_lock lk( _lock );
+
+ // Our host list may have changed while we were waiting for another thread,
+ // so double-check here.
+ // TODO: Do we really need this much protection? This should be pretty rare and not
+ // triggered from lots of threads; duplicating old behavior for safety.
+ if( ! _shouldChangeHosts( hostList, true ) ){
+ changed = false;
+ return;
+ }
+
+ // LogLevel can be pretty low, since replica set reconfiguration should be pretty rare and we
+ // want to record our changes
+ log() << "changing hosts to " << hostList << " from " << _getServerAddress_inlock() << endl;
+
+ NodeDiff diff = _getHostDiff_inlock( hostList );
+ set<string> added = diff.first;
+ set<int> removed = diff.second;
+
+ assert( added.size() > 0 || removed.size() > 0 );
+ changed = true;
+
+ // Delete from the end so we don't invalidate earlier indices as we delete; deletion indices are ascending
+ for( set<int>::reverse_iterator i = removed.rbegin(), end = removed.rend(); i != end; ++i ){
+
+ log() << "erasing host " << _nodes[ *i ] << " from replica set " << this->_name << endl;
+
+ _nodes.erase( _nodes.begin() + *i );
+ }
+
+ // Add new nodes
+ for( set<string>::iterator i = added.begin(), end = added.end(); i != end; ++i ){
+
+ log() << "trying to add new host " << *i << " to replica set " << this->_name << endl;
+
+ // Connect to new node
+ HostAndPort h( *i );
+ DBClientConnection * newConn = new DBClientConnection( true, 0, 5.0 );
+
+ string errmsg;
+ try{
+ if( ! newConn->connect( h , errmsg ) ){
+ throw DBException( errmsg, 15927 );
+ }
+ log() << "successfully connected to new host " << *i << " in replica set " << this->_name << endl;
+ }
+ catch( DBException& e ){
+ warning() << "cannot connect to new host " << *i << " to replica set " << this->_name << causedBy( e ) << endl;
+ delete newConn;
+ newConn = NULL;
+ }
+
+ _nodes.push_back( Node( h , newConn ) );
+ }
+
+ }
+
+
+
+ bool ReplicaSetMonitor::_checkConnection( DBClientConnection * c , string& maybePrimary , bool verbose , int nodesOffset ) {
+ scoped_lock lk( _checkConnectionLock );
+ bool isMaster = false;
+ bool changed = false;
+ try {
+ Timer t;
+ BSONObj o;
+ c->isMaster(isMaster, &o);
+ if ( o["setName"].type() != String || o["setName"].String() != _name ) {
+ warning() << "node: " << c->getServerAddress() << " isn't a part of set: " << _name
+ << " ismaster: " << o << endl;
+ if ( nodesOffset >= 0 )
+ _nodes[nodesOffset].ok = false;
+ return false;
+ }
+
+ if ( nodesOffset >= 0 ) {
+ _nodes[nodesOffset].pingTimeMillis = t.millis();
+ _nodes[nodesOffset].hidden = o["hidden"].trueValue();
+ _nodes[nodesOffset].secondary = o["secondary"].trueValue();
+ _nodes[nodesOffset].ismaster = o["ismaster"].trueValue();
+
+ _nodes[nodesOffset].lastIsMaster = o.copy();
+ }
+
+ log( ! verbose ) << "ReplicaSetMonitor::_checkConnection: " << c->toString() << ' ' << o << endl;
+
+ // add other nodes
+ BSONArrayBuilder b;
+ if ( o["hosts"].type() == Array ) {
+ if ( o["primary"].type() == String )
+ maybePrimary = o["primary"].String();
+
+ BSONObjIterator it( o["hosts"].Obj() );
+ while( it.more() ) b.append( it.next() );
+ }
+ if (o.hasField("passives") && o["passives"].type() == Array) {
+ BSONObjIterator it( o["passives"].Obj() );
+ while( it.more() ) b.append( it.next() );
+ }
+
+ _checkHosts( b.arr(), changed);
+ _checkStatus(c);
+
+
+ }
+ catch ( std::exception& e ) {
+ log( ! verbose ) << "ReplicaSetMonitor::_checkConnection: caught exception " << c->toString() << ' ' << e.what() << endl;
+ // nodesOffset may be -1 (e.g. when checking a seed connection), so guard the index
+ if ( nodesOffset >= 0 )
+ _nodes[nodesOffset].ok = false;
+ }
+
+ if ( changed && _hook )
+ _hook( this );
+
+ return isMaster;
+ }
+
+ void ReplicaSetMonitor::_check( bool checkAllSecondaries ) {
+
+ bool triedQuickCheck = false;
+
+ LOG(1) << "_check : " << getServerAddress() << endl;
+
+ int newMaster = -1;
+
+ for ( int retry = 0; retry < 2; retry++ ) {
+ for ( unsigned i=0; i<_nodes.size(); i++ ) {
+ shared_ptr<DBClientConnection> c;
+ {
+ scoped_lock lk( _lock );
+ c = _nodes[i].conn;
+ }
+
+ string maybePrimary;
+ if ( _checkConnection( c.get() , maybePrimary , retry , i ) ) {
+ _master = i;
+ newMaster = i;
+ if ( ! checkAllSecondaries )
+ return;
+ }
+
+ if ( ! triedQuickCheck && maybePrimary.size() ) {
+ int x = _find( maybePrimary );
+ if ( x >= 0 ) {
+ triedQuickCheck = true;
+ string dummy;
+ shared_ptr<DBClientConnection> testConn;
+ {
+ scoped_lock lk( _lock );
+ testConn = _nodes[x].conn;
+ }
+ if ( _checkConnection( testConn.get() , dummy , false , x ) ) {
+ _master = x;
+ newMaster = x;
+ if ( ! checkAllSecondaries )
+ return;
+ }
+ }
+ }
+
+ }
+
+ if ( newMaster >= 0 )
+ return;
+
+ sleepsecs(1);
+ }
+
+ }
+
+ void ReplicaSetMonitor::check( bool checkAllSecondaries ) {
+ // first see if the current master is fine
+ if ( _master >= 0 ) {
+ string temp;
+ if ( _checkConnection( _nodes[_master].conn.get() , temp , false , _master ) ) {
+ if ( ! checkAllSecondaries ) {
+ // current master is fine, so we're done
+ return;
+ }
+ }
+ }
+
+ // we either have no master, or the current is dead
+ _check( checkAllSecondaries );
+ }
+
+ int ReplicaSetMonitor::_find( const string& server ) const {
+ scoped_lock lk( _lock );
+ return _find_inlock( server );
+ }
+
+ int ReplicaSetMonitor::_find_inlock( const string& server ) const {
+ for ( unsigned i=0; i<_nodes.size(); i++ )
+ if ( _nodes[i].addr == server )
+ return i;
+ return -1;
+ }
+
+
+ int ReplicaSetMonitor::_find( const HostAndPort& server ) const {
+ scoped_lock lk( _lock );
+ for ( unsigned i=0; i<_nodes.size(); i++ )
+ if ( _nodes[i].addr == server )
+ return i;
+ return -1;
+ }
+
+ void ReplicaSetMonitor::appendInfo( BSONObjBuilder& b ) const {
+ scoped_lock lk( _lock );
+ BSONArrayBuilder hosts( b.subarrayStart( "hosts" ) );
+ for ( unsigned i=0; i<_nodes.size(); i++ ) {
+ hosts.append( BSON( "addr" << _nodes[i].addr <<
+ // "lastIsMaster" << _nodes[i].lastIsMaster << // this is a potential race, so only used when debugging
+ "ok" << _nodes[i].ok <<
+ "ismaster" << _nodes[i].ismaster <<
+ "hidden" << _nodes[i].hidden <<
+ "secondary" << _nodes[i].secondary <<
+ "pingTimeMillis" << _nodes[i].pingTimeMillis ) );
+
+ }
+ hosts.done();
+
+ b.append( "master" , _master );
+ b.append( "nextSlave" , _nextSlave );
+ }
+
+
+ mongo::mutex ReplicaSetMonitor::_setsLock( "ReplicaSetMonitor" );
+ map<string,ReplicaSetMonitorPtr> ReplicaSetMonitor::_sets;
+ ReplicaSetMonitor::ConfigChangeHook ReplicaSetMonitor::_hook;
+ // --------------------------------
+ // ----- DBClientReplicaSet ---------
+ // --------------------------------
+
+ DBClientReplicaSet::DBClientReplicaSet( const string& name , const vector<HostAndPort>& servers, double so_timeout )
+ : _monitor( ReplicaSetMonitor::get( name , servers ) ),
+ _so_timeout( so_timeout ) {
+ }
+
+ DBClientReplicaSet::~DBClientReplicaSet() {
+ }
+
+ DBClientConnection * DBClientReplicaSet::checkMaster() {
+ HostAndPort h = _monitor->getMaster();
+
+ if ( h == _masterHost && _master ) {
+            // a master is selected. let's just make sure the connection didn't die
+ if ( ! _master->isFailed() )
+ return _master.get();
+ _monitor->notifyFailure( _masterHost );
+ }
+
+ _masterHost = _monitor->getMaster();
+ _master.reset( new DBClientConnection( true , this , _so_timeout ) );
+ string errmsg;
+ if ( ! _master->connect( _masterHost , errmsg ) ) {
+ _monitor->notifyFailure( _masterHost );
+ uasserted( 13639 , str::stream() << "can't connect to new replica set master [" << _masterHost.toString() << "] err: " << errmsg );
+ }
+ _auth( _master.get() );
+ return _master.get();
+ }
+
+ DBClientConnection * DBClientReplicaSet::checkSlave() {
+ HostAndPort h = _monitor->getSlave( _slaveHost );
+
+ if ( h == _slaveHost && _slave ) {
+ if ( ! _slave->isFailed() )
+ return _slave.get();
+ _monitor->notifySlaveFailure( _slaveHost );
+ _slaveHost = _monitor->getSlave();
+ }
+ else {
+ _slaveHost = h;
+ }
+
+ _slave.reset( new DBClientConnection( true , this , _so_timeout ) );
+ _slave->connect( _slaveHost );
+ _auth( _slave.get() );
+ return _slave.get();
+ }
+
+
+ void DBClientReplicaSet::_auth( DBClientConnection * conn ) {
+ for ( list<AuthInfo>::iterator i=_auths.begin(); i!=_auths.end(); ++i ) {
+ const AuthInfo& a = *i;
+ string errmsg;
+ if ( ! conn->auth( a.dbname , a.username , a.pwd , errmsg, a.digestPassword ) )
+ warning() << "cached auth failed for set: " << _monitor->getName() << " db: " << a.dbname << " user: " << a.username << endl;
+
+ }
+
+ }
+
+ DBClientConnection& DBClientReplicaSet::masterConn() {
+ return *checkMaster();
+ }
+
+ DBClientConnection& DBClientReplicaSet::slaveConn() {
+ return *checkSlave();
+ }
+
+ bool DBClientReplicaSet::connect() {
+ try {
+ checkMaster();
+ }
+ catch (AssertionException&) {
+ if (_master && _monitor) {
+ _monitor->notifyFailure(_masterHost);
+ }
+ return false;
+ }
+ return true;
+ }
+
+ bool DBClientReplicaSet::auth(const string &dbname, const string &username, const string &pwd, string& errmsg, bool digestPassword ) {
+ DBClientConnection * m = checkMaster();
+
+ // first make sure it actually works
+ if( ! m->auth(dbname, username, pwd, errmsg, digestPassword ) )
+ return false;
+
+        // now that it does, save the credentials so that we can re-auth against a new node
+ _auths.push_back( AuthInfo( dbname , username , pwd , digestPassword ) );
+ return true;
+ }
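+
+    /* Illustrative call ( credentials are made up; 'conn' is assumed to be a
+       connected DBClientReplicaSet ). On success the credentials are cached in
+       _auths and replayed against any node we fail over to:
+
+           string errmsg;
+           if ( ! conn.auth( "admin" , "someUser" , "somePwd" , errmsg ) )
+               cout << "auth failed: " << errmsg << endl;
+    */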
+
+ // ------------- simple functions -----------------
+
+ void DBClientReplicaSet::insert( const string &ns , BSONObj obj , int flags) {
+ checkMaster()->insert(ns, obj, flags);
+ }
+
+ void DBClientReplicaSet::insert( const string &ns, const vector< BSONObj >& v , int flags) {
+ checkMaster()->insert(ns, v, flags);
+ }
+
+ void DBClientReplicaSet::remove( const string &ns , Query obj , bool justOne ) {
+ checkMaster()->remove(ns, obj, justOne);
+ }
+
+ void DBClientReplicaSet::update( const string &ns , Query query , BSONObj obj , bool upsert , bool multi ) {
+ return checkMaster()->update(ns, query, obj, upsert,multi);
+ }
+
+ auto_ptr<DBClientCursor> DBClientReplicaSet::query(const string &ns, Query query, int nToReturn, int nToSkip,
+ const BSONObj *fieldsToReturn, int queryOptions, int batchSize) {
+
+ if ( queryOptions & QueryOption_SlaveOk ) {
+            // we're ok sending to a slave
+            // we'll make up to three attempts against slaves before falling back to the master
+            // checkSlave will try a different slave automatically after a failure
+ for ( int i=0; i<3; i++ ) {
+ try {
+ return checkSlaveQueryResult( checkSlave()->query(ns,query,nToReturn,nToSkip,fieldsToReturn,queryOptions,batchSize) );
+ }
+ catch ( DBException &e ) {
+ LOG(1) << "can't query replica set slave " << i << " : " << _slaveHost << causedBy( e ) << endl;
+ }
+ }
+ }
+
+ return checkMaster()->query(ns,query,nToReturn,nToSkip,fieldsToReturn,queryOptions,batchSize);
+ }
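+
+    /* Caller-side sketch ( namespace and connection are illustrative; 'conn' is
+       assumed to be a connected DBClientReplicaSet ). Passing QueryOption_SlaveOk
+       routes the read through the slave retry loop above:
+
+           auto_ptr<DBClientCursor> c =
+               conn.query( "test.coll" , Query() , 0 , 0 , 0 , QueryOption_SlaveOk );
+    */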
+
+ BSONObj DBClientReplicaSet::findOne(const string &ns, const Query& query, const BSONObj *fieldsToReturn, int queryOptions) {
+ if ( queryOptions & QueryOption_SlaveOk ) {
+            // we're ok sending to a slave
+            // we'll make up to three attempts against slaves before falling back to the master
+            // checkSlave will try a different slave automatically after a failure
+ for ( int i=0; i<3; i++ ) {
+ try {
+ return checkSlave()->findOne(ns,query,fieldsToReturn,queryOptions);
+ }
+ catch ( DBException &e ) {
+ LOG(1) << "can't findone replica set slave " << i << " : " << _slaveHost << causedBy( e ) << endl;
+ }
+ }
+ }
+
+ return checkMaster()->findOne(ns,query,fieldsToReturn,queryOptions);
+ }
+
+ void DBClientReplicaSet::killCursor( long long cursorID ) {
+        // we should never call killCursor on a replica set connection
+        // since we don't know which server the cursor belongs to;
+        // we can't assume the master because of slaveOk,
+        // and a cursor can survive a master change
+ assert(0);
+ }
+
+ void DBClientReplicaSet::isntMaster() {
+ log() << "got not master for: " << _masterHost << endl;
+ _monitor->notifyFailure( _masterHost );
+ _master.reset();
+ }
+
+ auto_ptr<DBClientCursor> DBClientReplicaSet::checkSlaveQueryResult( auto_ptr<DBClientCursor> result ){
+ BSONObj error;
+ bool isError = result->peekError( &error );
+ if( ! isError ) return result;
+
+ // We only check for "not master or secondary" errors here
+
+ // If the error code here ever changes, we need to change this code also
+ BSONElement code = error["code"];
+ if( code.isNumber() && code.Int() == 13436 /* not master or secondary */ ){
+ isntSecondary();
+ throw DBException( str::stream() << "slave " << _slaveHost.toString() << " is no longer secondary", 14812 );
+ }
+
+ return result;
+ }
+
+ void DBClientReplicaSet::isntSecondary() {
+ log() << "slave no longer has secondary status: " << _slaveHost << endl;
+ // Failover to next slave
+ _monitor->notifySlaveFailure( _slaveHost );
+ _slave.reset();
+ }
+
+ void DBClientReplicaSet::say( Message& toSend, bool isRetry ) {
+
+ if( ! isRetry )
+ _lazyState = LazyState();
+
+ int lastOp = -1;
+ bool slaveOk = false;
+
+ if ( ( lastOp = toSend.operation() ) == dbQuery ) {
+ // TODO: might be possible to do this faster by changing api
+ DbMessage dm( toSend );
+ QueryMessage qm( dm );
+ if ( ( slaveOk = ( qm.queryOptions & QueryOption_SlaveOk ) ) ) {
+
+ for ( int i = _lazyState._retries; i < 3; i++ ) {
+ try {
+ DBClientConnection* slave = checkSlave();
+ slave->say( toSend );
+
+ _lazyState._lastOp = lastOp;
+ _lazyState._slaveOk = slaveOk;
+ _lazyState._retries = i;
+ _lazyState._lastClient = slave;
+ return;
+ }
+ catch ( DBException &e ) {
+ LOG(1) << "can't callLazy replica set slave " << i << " : " << _slaveHost << causedBy( e ) << endl;
+ }
+ }
+ }
+ }
+
+ DBClientConnection* master = checkMaster();
+ master->say( toSend );
+
+ _lazyState._lastOp = lastOp;
+ _lazyState._slaveOk = slaveOk;
+ _lazyState._retries = 3;
+ _lazyState._lastClient = master;
+ return;
+ }
+
+ bool DBClientReplicaSet::recv( Message& m ) {
+
+ assert( _lazyState._lastClient );
+
+ // TODO: It would be nice if we could easily wrap a conn error as a result error
+ try {
+ return _lazyState._lastClient->recv( m );
+ }
+ catch( DBException& e ){
+ log() << "could not receive data from " << _lazyState._lastClient << causedBy( e ) << endl;
+ return false;
+ }
+ }
+
+ void DBClientReplicaSet::checkResponse( const char* data, int nReturned, bool* retry, string* targetHost ){
+
+ // For now, do exactly as we did before, so as not to break things. In general though, we
+ // should fix this so checkResponse has a more consistent contract.
+ if( ! retry ){
+ if( _lazyState._lastClient )
+ return _lazyState._lastClient->checkResponse( data, nReturned );
+ else
+ return checkMaster()->checkResponse( data, nReturned );
+ }
+
+ *retry = false;
+ if( targetHost && _lazyState._lastClient ) *targetHost = _lazyState._lastClient->getServerAddress();
+ else if (targetHost) *targetHost = "";
+
+ if( ! _lazyState._lastClient ) return;
+ if( nReturned != 1 && nReturned != -1 ) return;
+
+ BSONObj dataObj;
+ if( nReturned == 1 ) dataObj = BSONObj( data );
+
+ // Check if we should retry here
+ if( _lazyState._lastOp == dbQuery && _lazyState._slaveOk ){
+
+ // Check the error code for a slave not secondary error
+ if( nReturned == -1 ||
+ ( hasErrField( dataObj ) && ! dataObj["code"].eoo() && dataObj["code"].Int() == 13436 ) ){
+
+ bool wasMaster = false;
+ if( _lazyState._lastClient == _slave.get() ){
+ isntSecondary();
+ }
+ else if( _lazyState._lastClient == _master.get() ){
+ wasMaster = true;
+ isntMaster();
+ }
+ else
+ warning() << "passed " << dataObj << " but last rs client " << _lazyState._lastClient->toString() << " is not master or secondary" << endl;
+
+ if( _lazyState._retries < 3 ){
+ _lazyState._retries++;
+ *retry = true;
+ }
+ else{
+ (void)wasMaster; // silence set-but-not-used warning
+ // assert( wasMaster );
+ // printStackTrace();
+ log() << "too many retries (" << _lazyState._retries << "), could not get data from replica set" << endl;
+ }
+ }
+ }
+ }
+
+
+ bool DBClientReplicaSet::call( Message &toSend, Message &response, bool assertOk , string * actualServer ) {
+ const char * ns = 0;
+
+ if ( toSend.operation() == dbQuery ) {
+ // TODO: might be possible to do this faster by changing api
+ DbMessage dm( toSend );
+ QueryMessage qm( dm );
+ ns = qm.ns;
+
+ if ( qm.queryOptions & QueryOption_SlaveOk ) {
+ for ( int i=0; i<3; i++ ) {
+ try {
+ DBClientConnection* s = checkSlave();
+ if ( actualServer )
+ *actualServer = s->getServerAddress();
+ return s->call( toSend , response , assertOk );
+ }
+ catch ( DBException &e ) {
+ LOG(1) << "can't call replica set slave " << i << " : " << _slaveHost << causedBy( e ) << endl;
+ if ( actualServer )
+ *actualServer = "";
+ }
+ }
+ }
+ }
+
+ DBClientConnection* m = checkMaster();
+ if ( actualServer )
+ *actualServer = m->getServerAddress();
+
+ if ( ! m->call( toSend , response , assertOk ) )
+ return false;
+
+ if ( ns ) {
+ QueryResult * res = (QueryResult*)response.singleData();
+ if ( res->nReturned == 1 ) {
+ BSONObj x(res->data() );
+ if ( str::contains( ns , "$cmd" ) ) {
+ if ( isNotMasterErrorString( x["errmsg"] ) )
+ isntMaster();
+ }
+ else {
+ if ( isNotMasterErrorString( getErrField( x ) ) )
+ isntMaster();
+ }
+ }
+ }
+
+ return true;
+ }
+
+}
diff --git a/src/mongo/client/dbclient_rs.h b/src/mongo/client/dbclient_rs.h
new file mode 100644
index 00000000000..0edcea42716
--- /dev/null
+++ b/src/mongo/client/dbclient_rs.h
@@ -0,0 +1,355 @@
+/** @file dbclient_rs.h Connect to a Replica Set, from C++ */
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../pch.h"
+#include "dbclient.h"
+
+namespace mongo {
+
+ class ReplicaSetMonitor;
+ typedef shared_ptr<ReplicaSetMonitor> ReplicaSetMonitorPtr;
+ typedef pair<set<string>,set<int> > NodeDiff;
+
+ /**
+     * manages state about a replica set for a client
+     * keeps tabs on which node is master and which slaves are up
+     * can hand a slave to someone for SLAVE_OK
+     * one instance per process per replica set
+ * TODO: we might be able to use a regular Node * to avoid _lock
+ */
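+    /* Illustrative use ( set name and seed list are made up ):
+
+           vector<HostAndPort> seeds;
+           seeds.push_back( HostAndPort( "a.example.com:27017" ) );
+           ReplicaSetMonitorPtr m = ReplicaSetMonitor::get( "mySet" , seeds );
+           HostAndPort master = m->getMaster(); // throws if no master can be found
+    */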
+ class ReplicaSetMonitor {
+ public:
+
+ typedef boost::function1<void,const ReplicaSetMonitor*> ConfigChangeHook;
+
+ /**
+         * gets a cached Monitor per name, creating it if it doesn't exist
+ */
+ static ReplicaSetMonitorPtr get( const string& name , const vector<HostAndPort>& servers );
+
+ /**
+         * gets a cached Monitor per name, or returns an empty pointer if it doesn't exist
+ */
+ static ReplicaSetMonitorPtr get( const string& name );
+
+
+ /**
+ * checks all sets for current master and new secondaries
+ * usually only called from a BackgroundJob
+ */
+ static void checkAll( bool checkAllSecondaries );
+
+ /**
+         * this is called whenever the config of any replica set changes
+ * currently only 1 globally
+ * asserts if one already exists
+ * ownership passes to ReplicaSetMonitor and the hook will actually never be deleted
+ */
+ static void setConfigChangeHook( ConfigChangeHook hook );
+
+ ~ReplicaSetMonitor();
+
+ /** @return HostAndPort or throws an exception */
+ HostAndPort getMaster();
+
+ /**
+         * notify the monitor that the server has failed
+ */
+ void notifyFailure( const HostAndPort& server );
+
+        /** @return prev if it's still ok; if not, returns a random slave that is ok for reads */
+ HostAndPort getSlave( const HostAndPort& prev );
+
+ /** @return a random slave that is ok for reads */
+ HostAndPort getSlave();
+
+
+ /**
+         * notify the monitor that the server has failed
+ */
+ void notifySlaveFailure( const HostAndPort& server );
+
+ /**
+ * checks for current master and new secondaries
+ */
+ void check( bool checkAllSecondaries );
+
+ string getName() const { return _name; }
+
+ string getServerAddress() const;
+
+ bool contains( const string& server ) const;
+
+ void appendInfo( BSONObjBuilder& b ) const;
+
+ private:
+ /**
+ * This populates a list of hosts from the list of seeds (discarding the
+ * seed list).
+ * @param name set name
+ * @param servers seeds
+ */
+ ReplicaSetMonitor( const string& name , const vector<HostAndPort>& servers );
+
+ void _check( bool checkAllSecondaries );
+
+ /**
+ * Use replSetGetStatus command to make sure hosts in host list are up
+ * and readable. Sets Node::ok appropriately.
+ */
+ void _checkStatus(DBClientConnection *conn);
+
+ /**
+ * Add array of hosts to host list. Doesn't do anything if hosts are
+ * already in host list.
+ * @param hostList the list of hosts to add
+         * @param changed set to true if new hosts were added
+ */
+ void _checkHosts(const BSONObj& hostList, bool& changed);
+
+ /**
+ * Updates host list.
+ * @param c the connection to check
+ * @param maybePrimary OUT
+ * @param verbose
+ * @param nodesOffset - offset into _nodes array, -1 for not in it
+ * @return if the connection is good
+ */
+ bool _checkConnection( DBClientConnection * c , string& maybePrimary , bool verbose , int nodesOffset );
+
+ string _getServerAddress_inlock() const;
+
+ NodeDiff _getHostDiff_inlock( const BSONObj& hostList );
+ bool _shouldChangeHosts( const BSONObj& hostList, bool inlock );
+
+
+ int _find( const string& server ) const ;
+ int _find_inlock( const string& server ) const ;
+ int _find( const HostAndPort& server ) const ;
+
+ mutable mongo::mutex _lock; // protects _nodes
+ mutable mongo::mutex _checkConnectionLock;
+
+ string _name;
+
+ // note these get copied around in the nodes vector so be sure to maintain copyable semantics here
+ struct Node {
+ Node( const HostAndPort& a , DBClientConnection* c )
+ : addr( a ) , conn(c) , ok(true) ,
+ ismaster(false), secondary( false ) , hidden( false ) , pingTimeMillis(0) {
+                ok = conn.get() != NULL; // a node without a live connection is not usable
+ }
+
+ bool okForSecondaryQueries() const {
+ return ok && secondary && ! hidden;
+ }
+
+ BSONObj toBSON() const {
+ return BSON( "addr" << addr.toString() <<
+ "isMaster" << ismaster <<
+ "secondary" << secondary <<
+ "hidden" << hidden <<
+ "ok" << ok );
+ }
+
+ string toString() const {
+ return toBSON().toString();
+ }
+
+ HostAndPort addr;
+ shared_ptr<DBClientConnection> conn;
+
+ // if this node is in a failure state
+ // used for slave routing
+ // this is too simple, should make it better
+ bool ok;
+
+ // as reported by ismaster
+ BSONObj lastIsMaster;
+
+ bool ismaster;
+ bool secondary;
+ bool hidden;
+
+ int pingTimeMillis;
+
+ };
+
+ /**
+ * Host list.
+ */
+ vector<Node> _nodes;
+
+ int _master; // which node is the current master. -1 means no master is known
+ int _nextSlave; // which node is the current slave
+
+ static mongo::mutex _setsLock; // protects _sets
+ static map<string,ReplicaSetMonitorPtr> _sets; // set name to Monitor
+
+ static ConfigChangeHook _hook;
+ };
+
+ /** Use this class to connect to a replica set of servers. The class will manage
+ checking for which server in a replica set is master, and do failover automatically.
+
+ This can also be used to connect to replica pairs since pairs are a subset of sets
+
+ On a failover situation, expect at least one operation to return an error (throw
+ an exception) before the failover is complete. Operations are not retried.
+ */
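+    /* Minimal usage sketch ( set name, hosts and namespace are illustrative ):
+
+           vector<HostAndPort> servers;
+           servers.push_back( HostAndPort( "a.example.com:27017" ) );
+           servers.push_back( HostAndPort( "b.example.com:27017" ) );
+           DBClientReplicaSet conn( "mySet" , servers );
+           if ( conn.connect() ) {
+               BSONObj doc = conn.findOne( "test.coll" , Query() );
+           }
+    */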
+ class DBClientReplicaSet : public DBClientBase {
+
+ public:
+ /** Call connect() after constructing. autoReconnect is always on for DBClientReplicaSet connections. */
+ DBClientReplicaSet( const string& name , const vector<HostAndPort>& servers, double so_timeout=0 );
+ virtual ~DBClientReplicaSet();
+
+        /** Returns false if no member of the set was reachable, or none is
+         * master. Even when false is returned, you can still try to use this
+         * connection object; it will attempt to reconnect.
+         */
+ bool connect();
+
+ /** Authorize. Authorizes all nodes as needed
+ */
+ virtual bool auth(const string &dbname, const string &username, const string &pwd, string& errmsg, bool digestPassword = true );
+
+ // ----------- simple functions --------------
+
+ /** throws userassertion "no master found" */
+ virtual auto_ptr<DBClientCursor> query(const string &ns, Query query, int nToReturn = 0, int nToSkip = 0,
+ const BSONObj *fieldsToReturn = 0, int queryOptions = 0 , int batchSize = 0 );
+
+ /** throws userassertion "no master found" */
+ virtual BSONObj findOne(const string &ns, const Query& query, const BSONObj *fieldsToReturn = 0, int queryOptions = 0);
+
+ virtual void insert( const string &ns , BSONObj obj , int flags=0);
+
+ /** insert multiple objects. Note that single object insert is asynchronous, so this version
+ is only nominally faster and not worth a special effort to try to use. */
+ virtual void insert( const string &ns, const vector< BSONObj >& v , int flags=0);
+
+ virtual void remove( const string &ns , Query obj , bool justOne = 0 );
+
+ virtual void update( const string &ns , Query query , BSONObj obj , bool upsert = 0 , bool multi = 0 );
+
+ virtual void killCursor( long long cursorID );
+
+ // ---- access raw connections ----
+
+ DBClientConnection& masterConn();
+ DBClientConnection& slaveConn();
+
+ // ---- callback pieces -------
+
+ virtual void say( Message &toSend, bool isRetry = false );
+ virtual bool recv( Message &toRecv );
+ virtual void checkResponse( const char* data, int nReturned, bool* retry = NULL, string* targetHost = NULL );
+
+ /* this is the callback from our underlying connections to notify us that we got a "not master" error.
+ */
+ void isntMaster();
+
+ /* this is used to indicate we got a "not master or secondary" error from a secondary.
+ */
+ void isntSecondary();
+
+ // ----- status ------
+
+ virtual bool isFailed() const { return ! _master || _master->isFailed(); }
+
+ // ----- informational ----
+
+ double getSoTimeout() const { return _so_timeout; }
+
+ string toString() { return getServerAddress(); }
+
+ string getServerAddress() const { return _monitor->getServerAddress(); }
+
+ virtual ConnectionString::ConnectionType type() const { return ConnectionString::SET; }
+ virtual bool lazySupported() const { return true; }
+
+ // ---- low level ------
+
+ virtual bool call( Message &toSend, Message &response, bool assertOk=true , string * actualServer = 0 );
+ virtual bool callRead( Message& toSend , Message& response ) { return checkMaster()->callRead( toSend , response ); }
+
+
+ protected:
+ virtual void sayPiggyBack( Message &toSend ) { checkMaster()->say( toSend ); }
+
+ private:
+
+ // Used to simplify slave-handling logic on errors
+ auto_ptr<DBClientCursor> checkSlaveQueryResult( auto_ptr<DBClientCursor> result );
+
+ DBClientConnection * checkMaster();
+ DBClientConnection * checkSlave();
+
+ void _auth( DBClientConnection * conn );
+
+ ReplicaSetMonitorPtr _monitor;
+
+ HostAndPort _masterHost;
+ scoped_ptr<DBClientConnection> _master;
+
+ HostAndPort _slaveHost;
+ scoped_ptr<DBClientConnection> _slave;
+
+ double _so_timeout;
+
+ /**
+ * for storing authentication info
+ * fields are exactly for DBClientConnection::auth
+ */
+ struct AuthInfo {
+ AuthInfo( string d , string u , string p , bool di )
+ : dbname( d ) , username( u ) , pwd( p ) , digestPassword( di ) {}
+ string dbname;
+ string username;
+ string pwd;
+ bool digestPassword;
+ };
+
+        // we need to store these so that when we connect to a new node on failure
+        // we can re-auth
+        // this could be a security issue, as the password is stored in memory;
+        // not sure if/how we should handle it
+ list<AuthInfo> _auths;
+
+ protected:
+
+ /**
+ * for storing (non-threadsafe) information between lazy calls
+ */
+ class LazyState {
+ public:
+ LazyState() : _lastClient( NULL ), _lastOp( -1 ), _slaveOk( false ), _retries( 0 ) {}
+ DBClientConnection* _lastClient;
+ int _lastOp;
+ bool _slaveOk;
+ int _retries;
+
+ } _lazyState;
+
+ };
+
+
+}
diff --git a/src/mongo/client/dbclientcursor.cpp b/src/mongo/client/dbclientcursor.cpp
new file mode 100644
index 00000000000..79510b766d8
--- /dev/null
+++ b/src/mongo/client/dbclientcursor.cpp
@@ -0,0 +1,324 @@
+// dbclientcursor.cpp - cursor over query results from a Mongo database, from C++
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "dbclient.h"
+#include "../db/dbmessage.h"
+#include "../db/cmdline.h"
+#include "connpool.h"
+#include "../s/shard.h"
+#include "../s/util.h"
+
+namespace mongo {
+
+ void assembleRequest( const string &ns, BSONObj query, int nToReturn, int nToSkip, const BSONObj *fieldsToReturn, int queryOptions, Message &toSend );
+
+ int DBClientCursor::nextBatchSize() {
+
+ if ( nToReturn == 0 )
+ return batchSize;
+
+ if ( batchSize == 0 )
+ return nToReturn;
+
+ return batchSize < nToReturn ? batchSize : nToReturn;
+ }
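+
+    // Worked examples ( illustrative values ): nToReturn 0, batchSize 5 -> 5;
+    // nToReturn 3, batchSize 0 -> 3; nToReturn 10, batchSize 4 -> 4
+    // ( the smaller of the two wins when both are set ).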
+
+ void DBClientCursor::_assembleInit( Message& toSend ) {
+ if ( !cursorId ) {
+ assembleRequest( ns, query, nextBatchSize() , nToSkip, fieldsToReturn, opts, toSend );
+ }
+ else {
+ BufBuilder b;
+ b.appendNum( opts );
+ b.appendStr( ns );
+ b.appendNum( nToReturn );
+ b.appendNum( cursorId );
+ toSend.setData( dbGetMore, b.buf(), b.len() );
+ }
+ }
+
+ bool DBClientCursor::init() {
+ Message toSend;
+ _assembleInit( toSend );
+
+ if ( !_client->call( toSend, *batch.m, false ) ) {
+ // log msg temp?
+ log() << "DBClientCursor::init call() failed" << endl;
+ return false;
+ }
+ if ( batch.m->empty() ) {
+ // log msg temp?
+ log() << "DBClientCursor::init message from call() was empty" << endl;
+ return false;
+ }
+ dataReceived();
+ return true;
+ }
+
+ void DBClientCursor::initLazy( bool isRetry ) {
+ verify( 15875 , _client->lazySupported() );
+ Message toSend;
+ _assembleInit( toSend );
+ _client->say( toSend, isRetry );
+ }
+
+ bool DBClientCursor::initLazyFinish( bool& retry ) {
+
+ bool recvd = _client->recv( *batch.m );
+
+ // If we get a bad response, return false
+ if ( ! recvd || batch.m->empty() ) {
+
+ if( !recvd )
+ log() << "DBClientCursor::init lazy say() failed" << endl;
+ if( batch.m->empty() )
+ log() << "DBClientCursor::init message from say() was empty" << endl;
+
+ _client->checkResponse( NULL, -1, &retry, &_lazyHost );
+
+ return false;
+
+ }
+
+ dataReceived( retry, _lazyHost );
+ return ! retry;
+ }
+
+ void DBClientCursor::requestMore() {
+ assert( cursorId && batch.pos == batch.nReturned );
+
+ if (haveLimit) {
+ nToReturn -= batch.nReturned;
+ assert(nToReturn > 0);
+ }
+ BufBuilder b;
+ b.appendNum(opts);
+ b.appendStr(ns);
+ b.appendNum(nextBatchSize());
+ b.appendNum(cursorId);
+
+ Message toSend;
+ toSend.setData(dbGetMore, b.buf(), b.len());
+ auto_ptr<Message> response(new Message());
+
+ if ( _client ) {
+ _client->call( toSend, *response );
+ this->batch.m = response;
+ dataReceived();
+ }
+ else {
+ assert( _scopedHost.size() );
+ ScopedDbConnection conn( _scopedHost );
+ conn->call( toSend , *response );
+ _client = conn.get();
+ this->batch.m = response;
+ dataReceived();
+ _client = 0;
+ conn.done();
+ }
+ }
+
+ /** with QueryOption_Exhaust, the server just blasts data at us (marked at end with cursorid==0). */
+ void DBClientCursor::exhaustReceiveMore() {
+ assert( cursorId && batch.pos == batch.nReturned );
+ assert( !haveLimit );
+ auto_ptr<Message> response(new Message());
+ assert( _client );
+ if ( _client->recv(*response) ) {
+ batch.m = response;
+ dataReceived();
+ }
+ }
+
+ void DBClientCursor::dataReceived( bool& retry, string& host ) {
+
+ QueryResult *qr = (QueryResult *) batch.m->singleData();
+ resultFlags = qr->resultFlags();
+
+ if ( qr->resultFlags() & ResultFlag_ErrSet ) {
+ wasError = true;
+ }
+
+ if ( qr->resultFlags() & ResultFlag_CursorNotFound ) {
+ // cursor id no longer valid at the server.
+ assert( qr->cursorId == 0 );
+ cursorId = 0; // 0 indicates no longer valid (dead)
+ if ( ! ( opts & QueryOption_CursorTailable ) )
+ throw UserException( 13127 , "getMore: cursor didn't exist on server, possible restart or timeout?" );
+ }
+
+ if ( cursorId == 0 || ! ( opts & QueryOption_CursorTailable ) ) {
+ // only set initially: we don't want to kill it on end of data
+ // if it's a tailable cursor
+ cursorId = qr->cursorId;
+ }
+
+ batch.nReturned = qr->nReturned;
+ batch.pos = 0;
+ batch.data = qr->data();
+
+ _client->checkResponse( batch.data, batch.nReturned, &retry, &host ); // watches for "not master"
+
+ if( qr->resultFlags() & ResultFlag_ShardConfigStale ) {
+ BSONObj error;
+ assert( peekError( &error ) );
+ throw RecvStaleConfigException( error["ns"].String(),
+ (string)"stale config on lazy receive" + causedBy( getErrField( error ) ) );
+ }
+
+ /* this assert would fire the way we currently work:
+ assert( nReturned || cursorId == 0 );
+ */
+ }
+
+ /** If true, safe to call next(). Requests more from server if necessary. */
+ bool DBClientCursor::more() {
+ _assertIfNull();
+
+ if ( !_putBack.empty() )
+ return true;
+
+ if (haveLimit && batch.pos >= nToReturn)
+ return false;
+
+ if ( batch.pos < batch.nReturned )
+ return true;
+
+ if ( cursorId == 0 )
+ return false;
+
+ requestMore();
+ return batch.pos < batch.nReturned;
+ }
+
+ BSONObj DBClientCursor::next() {
+ DEV _assertIfNull();
+ if ( !_putBack.empty() ) {
+ BSONObj ret = _putBack.top();
+ _putBack.pop();
+ return ret;
+ }
+
+ uassert(13422, "DBClientCursor next() called but more() is false", batch.pos < batch.nReturned);
+
+ batch.pos++;
+ BSONObj o(batch.data);
+ batch.data += o.objsize();
+ /* todo would be good to make data null at end of batch for safety */
+ return o;
+ }
+
+ void DBClientCursor::peek(vector<BSONObj>& v, int atMost) {
+ int m = atMost;
+
+ /*
+ for( stack<BSONObj>::iterator i = _putBack.begin(); i != _putBack.end(); i++ ) {
+ if( m == 0 )
+ return;
+ v.push_back(*i);
+ m--;
+ n++;
+ }
+ */
+
+ int p = batch.pos;
+ const char *d = batch.data;
+ while( m && p < batch.nReturned ) {
+ BSONObj o(d);
+ d += o.objsize();
+ p++;
+ m--;
+ v.push_back(o);
+ }
+ }
+
+ BSONObj DBClientCursor::peekFirst(){
+ vector<BSONObj> v;
+ peek( v, 1 );
+
+ if( v.size() > 0 ) return v[0];
+ else return BSONObj();
+ }
+
+ bool DBClientCursor::peekError(BSONObj* error){
+ if( ! wasError ) return false;
+
+ vector<BSONObj> v;
+ peek(v, 1);
+
+ assert( v.size() == 1 );
+ assert( hasErrField( v[0] ) );
+
+ if( error ) *error = v[0].getOwned();
+ return true;
+ }
+
+ void DBClientCursor::attach( AScopedConnection * conn ) {
+ assert( _scopedHost.size() == 0 );
+ assert( conn );
+ assert( conn->get() );
+
+ if ( conn->get()->type() == ConnectionString::SET ||
+ conn->get()->type() == ConnectionString::SYNC ) {
+ if( _lazyHost.size() > 0 )
+ _scopedHost = _lazyHost;
+ else if( _client )
+ _scopedHost = _client->getServerAddress();
+ else
+ massert(14821, "No client or lazy client specified, cannot store multi-host connection.", false);
+ }
+ else {
+ _scopedHost = conn->getHost();
+ }
+
+ conn->done();
+ _client = 0;
+ _lazyHost = "";
+ }
+
+ DBClientCursor::~DBClientCursor() {
+ if (!this)
+ return;
+
+ DESTRUCTOR_GUARD (
+
+ if ( cursorId && _ownCursor && ! inShutdown() ) {
+ BufBuilder b;
+ b.appendNum( (int)0 ); // reserved
+ b.appendNum( (int)1 ); // number
+ b.appendNum( cursorId );
+
+ Message m;
+ m.setData( dbKillCursors , b.buf() , b.len() );
+
+ if ( _client ) {
+ _client->sayPiggyBack( m );
+ }
+ else {
+ assert( _scopedHost.size() );
+ ScopedDbConnection conn( _scopedHost );
+ conn->sayPiggyBack( m );
+ conn.done();
+ }
+ }
+
+ );
+ }
+
+
+} // namespace mongo
diff --git a/src/mongo/client/dbclientcursor.h b/src/mongo/client/dbclientcursor.h
new file mode 100644
index 00000000000..31bf1bb1d5e
--- /dev/null
+++ b/src/mongo/client/dbclientcursor.h
@@ -0,0 +1,243 @@
+// file dbclientcursor.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../pch.h"
+#include "../util/net/message.h"
+#include "../db/jsobj.h"
+#include "../db/json.h"
+#include <stack>
+
+namespace mongo {
+
+ class AScopedConnection;
+
+ /** for mock purposes only -- do not create variants of DBClientCursor, nor hang code here
+ @see DBClientMockCursor
+ */
+ class DBClientCursorInterface : boost::noncopyable {
+ public:
+ virtual ~DBClientCursorInterface() {}
+ virtual bool more() = 0;
+ virtual BSONObj next() = 0;
+ // TODO bring more of the DBClientCursor interface to here
+ protected:
+ DBClientCursorInterface() {}
+ };
+
+ /** Queries return a cursor object */
+ class DBClientCursor : public DBClientCursorInterface {
+ public:
+ /** If true, safe to call next(). Requests more from server if necessary. */
+ bool more();
+
+ /** If true, there is more in our local buffers to be fetched via next(). Returns
+ false when a getMore request back to server would be required. You can use this
+ if you want to exhaust whatever data has been fetched to the client already but
+ then perhaps stop.
+ */
+ int objsLeftInBatch() const { _assertIfNull(); return _putBack.size() + batch.nReturned - batch.pos; }
+ bool moreInCurrentBatch() { return objsLeftInBatch() > 0; }
+
+ /** next
+ @return next object in the result cursor.
+ on an error at the remote server, you will get back:
+ { $err: <string> }
+ if you do not want to handle that yourself, call nextSafe().
+ */
+ BSONObj next();
+
+ /**
+ restore an object previously returned by next() to the cursor
+ */
+ void putBack( const BSONObj &o ) { _putBack.push( o.getOwned() ); }
+
+ /** throws AssertionException if get back { $err : ... } */
+ BSONObj nextSafe() {
+ BSONObj o = next();
+ if( strcmp(o.firstElementFieldName(), "$err") == 0 ) {
+ string s = "nextSafe(): " + o.toString();
+ if( logLevel >= 5 )
+ log() << s << endl;
+ uasserted(13106, s);
+ }
+ return o;
+ }
+
+ /** peek ahead at items buffered for future next() calls.
+            never requests new data from the server, so peek is only effective
+            on data that is already buffered.
+ WARNING: no support for _putBack yet!
+ */
+ void peek(vector<BSONObj>&, int atMost);
+
+        // Peeks at the first element, if one exists
+ BSONObj peekFirst();
+
+ /**
+ * peek ahead and see if an error occurred, and get the error if so.
+ */
+ bool peekError(BSONObj* error = NULL);
+
+ /**
+            iterate the rest of the cursor and return the number of items
+ */
+ int itcount() {
+ int c = 0;
+ while ( more() ) {
+ next();
+ c++;
+ }
+ return c;
+ }
+
+ /** cursor no longer valid -- use with tailable cursors.
+ note you should only rely on this once more() returns false;
+            'dead' may be set yet some data may still be queued and locally
+ available from the dbclientcursor.
+ */
+ bool isDead() const { return !this || cursorId == 0; }
+
+ bool tailable() const { return (opts & QueryOption_CursorTailable) != 0; }
+
+ /** see ResultFlagType (constants.h) for flag values
+ mostly these flags are for internal purposes -
+ ResultFlag_ErrSet is the possible exception to that
+ */
+ bool hasResultFlag( int flag ) {
+ _assertIfNull();
+ return (resultFlags & flag) != 0;
+ }
+
+ DBClientCursor( DBClientBase* client, const string &_ns, BSONObj _query, int _nToReturn,
+ int _nToSkip, const BSONObj *_fieldsToReturn, int queryOptions , int bs ) :
+ _client(client),
+ ns(_ns),
+ query(_query),
+ nToReturn(_nToReturn),
+ haveLimit( _nToReturn > 0 && !(queryOptions & QueryOption_CursorTailable)),
+ nToSkip(_nToSkip),
+ fieldsToReturn(_fieldsToReturn),
+ opts(queryOptions),
+ batchSize(bs==1?2:bs),
+ cursorId(),
+ _ownCursor( true ),
+ wasError( false ) {
+ }
+
+ DBClientCursor( DBClientBase* client, const string &_ns, long long _cursorId, int _nToReturn, int options ) :
+ _client(client),
+ ns(_ns),
+ nToReturn( _nToReturn ),
+ haveLimit( _nToReturn > 0 && !(options & QueryOption_CursorTailable)),
+ opts( options ),
+ cursorId(_cursorId),
+ _ownCursor( true ) {
+ }
+
+ virtual ~DBClientCursor();
+
+ long long getCursorId() const { return cursorId; }
+
+ /** by default we "own" the cursor and will send the server a KillCursor
+ message when ~DBClientCursor() is called. This function overrides that.
+ */
+ void decouple() { _ownCursor = false; }
+
+ void attach( AScopedConnection * conn );
+
+ Message* getMessage(){ return batch.m.get(); }
+
+ /**
+ * actually does the query
+ */
+ bool init();
+
+ void initLazy( bool isRetry = false );
+ bool initLazyFinish( bool& retry );
+
+ class Batch : boost::noncopyable {
+ friend class DBClientCursor;
+ auto_ptr<Message> m;
+ int nReturned;
+ int pos;
+ const char *data;
+ public:
+ Batch() : m( new Message() ), nReturned(), pos(), data() { }
+ };
+
+ private:
+ friend class DBClientBase;
+ friend class DBClientConnection;
+
+ int nextBatchSize();
+
+ Batch batch;
+ DBClientBase* _client;
+ string ns;
+ BSONObj query;
+ int nToReturn;
+ bool haveLimit;
+ int nToSkip;
+ const BSONObj *fieldsToReturn;
+ int opts;
+ int batchSize;
+ stack< BSONObj > _putBack;
+ int resultFlags;
+ long long cursorId;
+ bool _ownCursor; // see decouple()
+ string _scopedHost;
+ string _lazyHost;
+ bool wasError;
+
+ void dataReceived() { bool retry; string lazyHost; dataReceived( retry, lazyHost ); }
+ void dataReceived( bool& retry, string& lazyHost );
+ void requestMore();
+ void exhaustReceiveMore(); // for exhaust
+
+ // Don't call from a virtual function
+ void _assertIfNull() const { uassert(13348, "connection died", this); }
+
+ // non-copyable , non-assignable
+ DBClientCursor( const DBClientCursor& );
+ DBClientCursor& operator=( const DBClientCursor& );
+
+ // init pieces
+ void _assembleInit( Message& toSend );
+ };
+
+ /** iterate over objects in current batch only - will not cause a network call
+ */
+ class DBClientCursorBatchIterator {
+ public:
+ DBClientCursorBatchIterator( DBClientCursor &c ) : _c( c ), _n() {}
+ bool moreInCurrentBatch() { return _c.moreInCurrentBatch(); }
+ BSONObj nextSafe() {
+ massert( 13383, "BatchIterator empty", moreInCurrentBatch() );
+ ++_n;
+ return _c.nextSafe();
+ }
+ int n() const { return _n; }
+ private:
+ DBClientCursor &_c;
+ int _n;
+ };
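+
+    /* Illustrative loop over the already-fetched batch only ( 'c' is assumed to
+       be an initialized DBClientCursor ); no getMore round trip is ever issued:
+
+           DBClientCursorBatchIterator it( c );
+           while ( it.moreInCurrentBatch() ) {
+               BSONObj o = it.nextSafe();
+               // ... process o ...
+           }
+    */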
+
+} // namespace mongo
+
+#include "undef_macros.h"
diff --git a/src/mongo/client/dbclientmockcursor.h b/src/mongo/client/dbclientmockcursor.h
new file mode 100644
index 00000000000..8d85ff5ad2e
--- /dev/null
+++ b/src/mongo/client/dbclientmockcursor.h
@@ -0,0 +1,40 @@
+//@file dbclientmockcursor.h
+
+/* Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "dbclientcursor.h"
+
+namespace mongo {
+
+ class DBClientMockCursor : public DBClientCursorInterface {
+ public:
+ DBClientMockCursor( const BSONArray& mockCollection ) : _iter( mockCollection ) {}
+ virtual ~DBClientMockCursor() {}
+
+ bool more() { return _iter.more(); }
+ BSONObj next() { return _iter.next().Obj(); }
+
+ private:
+ BSONObjIterator _iter;
+
+ // non-copyable , non-assignable
+ DBClientMockCursor( const DBClientMockCursor& );
+ DBClientMockCursor& operator=( const DBClientMockCursor& );
+ };
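+
+    /* Illustrative use with an in-memory array ( values are made up ):
+
+           BSONArray data = BSON_ARRAY( BSON( "x" << 1 ) << BSON( "x" << 2 ) );
+           DBClientMockCursor c( data );
+           while ( c.more() )
+               cout << c.next() << endl;
+    */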
+
+} // namespace mongo
diff --git a/src/mongo/client/distlock.cpp b/src/mongo/client/distlock.cpp
new file mode 100644
index 00000000000..595fc38197c
--- /dev/null
+++ b/src/mongo/client/distlock.cpp
@@ -0,0 +1,958 @@
+// @file distlock.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "dbclient.h"
+#include "distlock.h"
+
+namespace mongo {
+
+ LabeledLevel DistributedLock::logLvl( 1 );
+ DistributedLock::LastPings DistributedLock::lastPings;
+
+ ThreadLocalValue<string> distLockIds("");
+
+ /* ==================
+ * Module initialization
+ */
+
+ boost::once_flag _init = BOOST_ONCE_INIT;
+ static string* _cachedProcessString = NULL;
+
+ static void initModule() {
+ // cache process string
+ stringstream ss;
+ ss << getHostName() << ":" << cmdLine.port << ":" << time(0) << ":" << rand();
+ _cachedProcessString = new string( ss.str() );
+ }
+
+ /* =================== */
+
+ string getDistLockProcess() {
+ boost::call_once( initModule, _init );
+ assert( _cachedProcessString );
+ return *_cachedProcessString;
+ }
+
+ string getDistLockId() {
+ string s = distLockIds.get();
+ if ( s.empty() ) {
+ stringstream ss;
+ ss << getDistLockProcess() << ":" << getThreadName() << ":" << rand();
+ s = ss.str();
+ distLockIds.set( s );
+ }
+ return s;
+ }
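+
+    // From the code above, a process string has the form
+    // "<host>:<port>:<startTime>:<rand>" and a lock id appends ":<threadName>:<rand>",
+    // e.g. ( illustrative values ) "db1.example.com:27017:1321742427:717" and
+    // "db1.example.com:27017:1321742427:717:conn8:4262".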
+
+
+ class DistributedLockPinger {
+ public:
+
+ DistributedLockPinger()
+ : _mutex( "DistributedLockPinger" ) {
+ }
+
+ void _distLockPingThread( ConnectionString addr, string process, unsigned long long sleepTime ) {
+
+ setThreadName( "LockPinger" );
+
+ string pingId = pingThreadId( addr, process );
+
+ log( DistributedLock::logLvl - 1 ) << "creating distributed lock ping thread for " << addr
+ << " and process " << process
+ << " (sleeping for " << sleepTime << "ms)" << endl;
+
+ static int loops = 0;
+ while( ! inShutdown() && ! shouldKill( addr, process ) ) {
+
+ log( DistributedLock::logLvl + 2 ) << "distributed lock pinger '" << pingId << "' about to ping." << endl;
+
+ Date_t pingTime;
+
+ try {
+ ScopedDbConnection conn( addr, 30.0 );
+
+ pingTime = jsTime();
+
+ // refresh the entry corresponding to this process in the lockpings collection
+ conn->update( DistributedLock::lockPingNS ,
+ BSON( "_id" << process ) ,
+ BSON( "$set" << BSON( "ping" << pingTime ) ) ,
+ true );
+
+ string err = conn->getLastError();
+ if ( ! err.empty() ) {
+ warning() << "pinging failed for distributed lock pinger '" << pingId << "'."
+ << causedBy( err ) << endl;
+ conn.done();
+
+ // Sleep for normal ping time
+ sleepmillis(sleepTime);
+ continue;
+ }
+
+ // remove really old entries from the lockpings collection if they're not holding a lock
+ // (this may happen if an instance of a process was taken down and no new instance came up to
+                    // replace it for quite a while)
+ // if the lock is taken, the take-over mechanism should handle the situation
+ auto_ptr<DBClientCursor> c = conn->query( DistributedLock::locksNS , BSONObj() );
+ set<string> pids;
+ while ( c->more() ) {
+ BSONObj lock = c->next();
+ if ( ! lock["process"].eoo() ) {
+ pids.insert( lock["process"].valuestrsafe() );
+ }
+ }
+
+ Date_t fourDays = pingTime - ( 4 * 86400 * 1000 ); // 4 days
+ conn->remove( DistributedLock::lockPingNS , BSON( "_id" << BSON( "$nin" << pids ) << "ping" << LT << fourDays ) );
+ err = conn->getLastError();
+ if ( ! err.empty() ) {
+                        warning() << "ping cleanup for distributed lock pinger '" << pingId << "' failed."
+ << causedBy( err ) << endl;
+ conn.done();
+
+ // Sleep for normal ping time
+ sleepmillis(sleepTime);
+ continue;
+ }
+
+ // create index so remove is fast even with a lot of servers
+ if ( loops++ == 0 ) {
+ conn->ensureIndex( DistributedLock::lockPingNS , BSON( "ping" << 1 ) );
+ }
+
+ log( DistributedLock::logLvl - ( loops % 10 == 0 ? 1 : 0 ) ) << "cluster " << addr << " pinged successfully at " << pingTime
+ << " by distributed lock pinger '" << pingId
+ << "', sleeping for " << sleepTime << "ms" << endl;
+
+ // Remove old locks, if possible
+ // Make sure no one else is adding to this list at the same time
+ scoped_lock lk( _mutex );
+
+ int numOldLocks = _oldLockOIDs.size();
+ if( numOldLocks > 0 )
+ log( DistributedLock::logLvl - 1 ) << "trying to delete " << _oldLockOIDs.size() << " old lock entries for process " << process << endl;
+
+ bool removed = false;
+ for( list<OID>::iterator i = _oldLockOIDs.begin(); i != _oldLockOIDs.end();
+ i = ( removed ? _oldLockOIDs.erase( i ) : ++i ) ) {
+ removed = false;
+ try {
+ // Got OID from lock with id, so we don't need to specify id again
+ conn->update( DistributedLock::locksNS ,
+ BSON( "ts" << *i ),
+ BSON( "$set" << BSON( "state" << 0 ) ) );
+
+ // Either the update went through or it didn't, either way we're done trying to
+ // unlock
+ log( DistributedLock::logLvl - 1 ) << "handled late remove of old distributed lock with ts " << *i << endl;
+ removed = true;
+ }
+ catch( UpdateNotTheSame& ) {
+ log( DistributedLock::logLvl - 1 ) << "partially removed old distributed lock with ts " << *i << endl;
+ removed = true;
+ }
+ catch ( std::exception& e) {
+ warning() << "could not remove old distributed lock with ts " << *i
+ << causedBy( e ) << endl;
+ }
+
+ }
+
+ if( numOldLocks > 0 && _oldLockOIDs.size() > 0 ){
+ log( DistributedLock::logLvl - 1 ) << "not all old lock entries could be removed for process " << process << endl;
+ }
+
+ conn.done();
+
+ }
+ catch ( std::exception& e ) {
+ warning() << "distributed lock pinger '" << pingId << "' detected an exception while pinging."
+ << causedBy( e ) << endl;
+ }
+
+ sleepmillis(sleepTime);
+ }
+
+ warning() << "removing distributed lock ping thread '" << pingId << "'" << endl;
+
+
+ if( shouldKill( addr, process ) )
+ finishKill( addr, process );
+
+ }
+
+ void distLockPingThread( ConnectionString addr, long long clockSkew, string processId, unsigned long long sleepTime ) {
+ try {
+ jsTimeVirtualThreadSkew( clockSkew );
+ _distLockPingThread( addr, processId, sleepTime );
+ }
+ catch ( std::exception& e ) {
+ error() << "unexpected error while running distributed lock pinger for " << addr << ", process " << processId << causedBy( e ) << endl;
+ }
+ catch ( ... ) {
+ error() << "unknown error while running distributed lock pinger for " << addr << ", process " << processId << endl;
+ }
+ }
+
+ string pingThreadId( const ConnectionString& conn, const string& processId ) {
+ return conn.toString() + "/" + processId;
+ }
+
+ string got( DistributedLock& lock, unsigned long long sleepTime ) {
+
+ // Make sure we don't start multiple threads for a process id
+ scoped_lock lk( _mutex );
+
+ const ConnectionString& conn = lock.getRemoteConnection();
+ const string& processId = lock.getProcessId();
+ string s = pingThreadId( conn, processId );
+
+ // Ignore if we already have a pinging thread for this process.
+ if ( _seen.count( s ) > 0 ) return s;
+
+ // Check our clock skew
+ try {
+ if( lock.isRemoteTimeSkewed() ) {
+ throw LockException( str::stream() << "clock skew of the cluster " << conn.toString() << " is too far out of bounds to allow distributed locking." , 13650 );
+ }
+ }
+ catch( LockException& e) {
+ throw LockException( str::stream() << "error checking clock skew of cluster " << conn.toString() << causedBy( e ) , 13651);
+ }
+
+ boost::thread t( boost::bind( &DistributedLockPinger::distLockPingThread, this, conn, getJSTimeVirtualThreadSkew(), processId, sleepTime) );
+
+ _seen.insert( s );
+
+ return s;
+ }
+
+ void addUnlockOID( const OID& oid ) {
+ // Modifying the lock from some other thread
+ scoped_lock lk( _mutex );
+ _oldLockOIDs.push_back( oid );
+ }
+
+ bool willUnlockOID( const OID& oid ) {
+ scoped_lock lk( _mutex );
+ return find( _oldLockOIDs.begin(), _oldLockOIDs.end(), oid ) != _oldLockOIDs.end();
+ }
+
+ void kill( const ConnectionString& conn, const string& processId ) {
+ // Make sure we're in a consistent state before other threads can see us
+ scoped_lock lk( _mutex );
+
+ string pingId = pingThreadId( conn, processId );
+
+ assert( _seen.count( pingId ) > 0 );
+ _kill.insert( pingId );
+
+ }
+
+ bool shouldKill( const ConnectionString& conn, const string& processId ) {
+ return _kill.count( pingThreadId( conn, processId ) ) > 0;
+ }
+
+ void finishKill( const ConnectionString& conn, const string& processId ) {
+ // Make sure we're in a consistent state before other threads can see us
+ scoped_lock lk( _mutex );
+
+ string pingId = pingThreadId( conn, processId );
+
+ _kill.erase( pingId );
+ _seen.erase( pingId );
+
+ }
+
+ set<string> _kill;
+ set<string> _seen;
+ mongo::mutex _mutex;
+ list<OID> _oldLockOIDs;
+
+ } distLockPinger;
+
+
+ const string DistributedLock::lockPingNS = "config.lockpings";
+ const string DistributedLock::locksNS = "config.locks";
+
+    /**
+     * Create a new distributed lock, potentially with a custom timeout. The ping
+     * interval and the maximum tolerated clock skew are both derived from the
+     * lock timeout ( lockTimeout / LOCK_SKEW_FACTOR ).
+     */
+ DistributedLock::DistributedLock( const ConnectionString& conn , const string& name , unsigned long long lockTimeout, bool asProcess )
+ : _conn(conn) , _name(name) , _id( BSON( "_id" << name ) ), _processId( asProcess ? getDistLockId() : getDistLockProcess() ),
+ _lockTimeout( lockTimeout == 0 ? LOCK_TIMEOUT : lockTimeout ), _maxClockSkew( _lockTimeout / LOCK_SKEW_FACTOR ), _maxNetSkew( _maxClockSkew ), _lockPing( _maxClockSkew ),
+ _mutex( "DistributedLock" )
+ {
+ log( logLvl - 1 ) << "created new distributed lock for " << name << " on " << conn
+ << " ( lock timeout : " << _lockTimeout
+ << ", ping interval : " << _lockPing << ", process : " << asProcess << " )" << endl;
+
+
+ }
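+
+    /* Illustrative use ( config server address and lock name are made up ):
+
+           string errmsg;
+           ConnectionString cs = ConnectionString::parse( "cfg.example.com:27019" , errmsg );
+           DistributedLock lk( cs , "balancer" );
+           BSONObj other;
+           if ( lk.lock_try( "rebalancing chunks" , false , &other ) ) {
+               // ... do the protected work, then release the lock ...
+           }
+    */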
+
+ DistributedLock::PingData DistributedLock::LastPings::getLastPing( const ConnectionString& conn, const string& lockName ){
+ scoped_lock lock( _mutex );
+ return _lastPings[ std::pair< string, string >( conn.toString(), lockName ) ];
+ }
+
+ void DistributedLock::LastPings::setLastPing( const ConnectionString& conn, const string& lockName, const PingData& pd ){
+ scoped_lock lock( _mutex );
+ _lastPings[ std::pair< string, string >( conn.toString(), lockName ) ] = pd;
+ }
+
+ Date_t DistributedLock::getRemoteTime() {
+ return DistributedLock::remoteTime( _conn, _maxNetSkew );
+ }
+
+ bool DistributedLock::isRemoteTimeSkewed() {
+ return !DistributedLock::checkSkew( _conn, NUM_LOCK_SKEW_CHECKS, _maxClockSkew, _maxNetSkew );
+ }
+
+ const ConnectionString& DistributedLock::getRemoteConnection() {
+ return _conn;
+ }
+
+ const string& DistributedLock::getProcessId() {
+ return _processId;
+ }
+
+ /**
+ * Returns the remote time as reported by the cluster or server. The maximum difference between the reported time
+ * and the actual time on the remote server (at the completion of the function) is the maxNetSkew
+ */
+ Date_t DistributedLock::remoteTime( const ConnectionString& cluster, unsigned long long maxNetSkew ) {
+
+ ConnectionString server( *cluster.getServers().begin() );
+ ScopedDbConnection conn( server );
+
+ BSONObj result;
+ long long delay;
+
+ try {
+ Date_t then = jsTime();
+ bool success = conn->runCommand( string("admin"), BSON( "serverStatus" << 1 ), result );
+ delay = jsTime() - then;
+
+ if( !success )
+ throw TimeNotFoundException( str::stream() << "could not get status from server "
+ << server.toString() << " in cluster " << cluster.toString()
+ << " to check time", 13647 );
+
+ // Make sure that our delay is not more than 2x our maximum network skew, since this is the max our remote
+ // time value can be off by if we assume a response in the middle of the delay.
+ if( delay > (long long) (maxNetSkew * 2) )
+ throw TimeNotFoundException( str::stream() << "server " << server.toString()
+ << " in cluster " << cluster.toString()
+ << " did not respond within max network delay of "
+ << maxNetSkew << "ms", 13648 );
+ }
+ catch(...) {
+ conn.done();
+ throw;
+ }
+
+ conn.done();
+
+ return result["localTime"].Date() - (delay / 2);
+
+ }
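+
+    // Worked example ( illustrative numbers ): if the serverStatus round trip took
+    // delay = 100ms and the server reported localTime = T, we assume T was sampled
+    // mid-flight and return T - 50ms; the estimate is then off by at most
+    // delay / 2, which the check above keeps at or below maxNetSkew.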
+
+ bool DistributedLock::checkSkew( const ConnectionString& cluster, unsigned skewChecks, unsigned long long maxClockSkew, unsigned long long maxNetSkew ) {
+
+ vector<HostAndPort> servers = cluster.getServers();
+
+ if(servers.size() < 1) return true;
+
+ vector<long long> avgSkews;
+
+ for(unsigned i = 0; i < skewChecks; i++) {
+
+ // Find the average skew for each server
+ unsigned s = 0;
+ for(vector<HostAndPort>::iterator si = servers.begin(); si != servers.end(); ++si,s++) {
+
+ if(i == 0) avgSkews.push_back(0);
+
+ // Could check if this is self, but shouldn't matter since local network connection should be fast.
+ ConnectionString server( *si );
+
+ Date_t remote = remoteTime( server, maxNetSkew );
+ Date_t local = jsTime();
+
+ // Remote time can be delayed by at most MAX_NET_SKEW
+
+ // Skew is how much time we'd have to add to local to get to remote
+ avgSkews[s] += (long long) (remote - local);
+
+ log( logLvl + 1 ) << "skew from remote server " << server << " found: " << (long long) (remote - local) << endl;
+
+ }
+ }
+
+ // Analyze skews
+
+ long long serverMaxSkew = 0;
+ long long serverMinSkew = 0;
+
+ for(unsigned s = 0; s < avgSkews.size(); s++) {
+
+ long long avgSkew = (avgSkews[s] /= skewChecks);
+
+ // Keep track of max and min skews
+ if(s == 0) {
+ serverMaxSkew = avgSkew;
+ serverMinSkew = avgSkew;
+ }
+ else {
+ if(avgSkew > serverMaxSkew)
+ serverMaxSkew = avgSkew;
+ if(avgSkew < serverMinSkew)
+ serverMinSkew = avgSkew;
+ }
+
+ }
+
+ long long totalSkew = serverMaxSkew - serverMinSkew;
+
+ // Make sure our max skew is not more than our pre-set limit
+ if(totalSkew > (long long) maxClockSkew) {
+ log( logLvl + 1 ) << "total clock skew of " << totalSkew << "ms for servers " << cluster << " is out of " << maxClockSkew << "ms bounds." << endl;
+ return false;
+ }
+
+ log( logLvl + 1 ) << "total clock skew of " << totalSkew << "ms for servers " << cluster << " is in " << maxClockSkew << "ms bounds." << endl;
+ return true;
+ }
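+
+    // Worked example ( illustrative numbers ): with average skews of +10ms, -20ms
+    // and +5ms across three servers, totalSkew = 10 - (-20) = 30ms, and the check
+    // passes only if 30ms is within maxClockSkew.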
+
+    // For use in testing only; in practice the ping thread should run indefinitely.
+ bool DistributedLock::killPinger( DistributedLock& lock ) {
+ if( lock._threadId == "") return false;
+
+ distLockPinger.kill( lock._conn, lock._processId );
+ return true;
+ }
+
+    // Semantics of this method are basically that if the lock cannot be acquired, it returns false
+    // and the acquisition can be retried. If the lock should not be tried again (some unexpected error),
+    // a LockException is thrown.
+    // If we are only trying to re-enter a currently held lock, reenter should be true.
+    // Note: reenter doesn't actually make this lock re-entrant in the normal sense, since it can still only
+    // be unlocked once; instead it is used to verify that the lock is already held.
+ bool DistributedLock::lock_try( const string& why , bool reenter, BSONObj * other ) {
+
+ // TODO: Start pinging only when we actually get the lock?
+        // If we don't have a ping thread yet, start one now
+ if( _threadId == "" ){
+ scoped_lock lk( _mutex );
+ _threadId = distLockPinger.got( *this, _lockPing );
+ }
+
+ // This should always be true, if not, we are using the lock incorrectly.
+ assert( _name != "" );
+
+ // write to dummy if 'other' is null
+ BSONObj dummyOther;
+ if ( other == NULL )
+ other = &dummyOther;
+
+ ScopedDbConnection conn( _conn );
+
+ BSONObjBuilder queryBuilder;
+ queryBuilder.appendElements( _id );
+ queryBuilder.append( "state" , 0 );
+
+ {
+            // make sure it's there so we can use simple update logic below
+ BSONObj o = conn->findOne( locksNS , _id ).getOwned();
+
+ // Case 1: No locks
+ if ( o.isEmpty() ) {
+ try {
+ log( logLvl ) << "inserting initial doc in " << locksNS << " for lock " << _name << endl;
+ conn->insert( locksNS , BSON( "_id" << _name << "state" << 0 << "who" << "" ) );
+ }
+ catch ( UserException& e ) {
+ warning() << "could not insert initial doc for distributed lock " << _name << causedBy( e ) << endl;
+ }
+ }
+
+ // Case 2: A set lock that we might be able to force
+ else if ( o["state"].numberInt() > 0 ) {
+
+ string lockName = o["_id"].String() + string("/") + o["process"].String();
+
+ bool canReenter = reenter && o["process"].String() == _processId && ! distLockPinger.willUnlockOID( o["ts"].OID() ) && o["state"].numberInt() == 2;
+ if( reenter && ! canReenter ) {
+ log( logLvl - 1 ) << "not re-entering distributed lock " << lockName;
+ if( o["process"].String() != _processId ) log( logLvl - 1 ) << ", different process " << _processId << endl;
+                    else if( o["state"].numberInt() != 2 ) log( logLvl - 1 ) << ", state not finalized" << endl;
+ else log( logLvl - 1 ) << ", ts " << o["ts"].OID() << " scheduled for late unlock" << endl;
+
+ // reset since we've been bounced by a previous lock not being where we thought it was,
+ // and should go through full forcing process if required.
+ // (in theory we should never see a ping here if used correctly)
+                    *other = o.getOwned(); conn.done(); resetLastPing();
+ return false;
+ }
+
+ BSONObj lastPing = conn->findOne( lockPingNS , o["process"].wrap( "_id" ) );
+ if ( lastPing.isEmpty() ) {
+ log( logLvl ) << "empty ping found for process in lock '" << lockName << "'" << endl;
+                // TODO: Using 0 as a "no time found" value will fail if dates roll over, but then, so will a lot.
+ lastPing = BSON( "_id" << o["process"].String() << "ping" << (Date_t) 0 );
+ }
+
+ unsigned long long elapsed = 0;
+ unsigned long long takeover = _lockTimeout;
+ PingData _lastPingCheck = getLastPing();
+
+ log( logLvl ) << "checking last ping for lock '" << lockName << "'" << " against process " << _lastPingCheck.get<0>() << " and ping " << _lastPingCheck.get<1>() << endl;
+
+ try {
+
+ Date_t remote = remoteTime( _conn );
+
+ // Measure the elapsed time by comparing remote clock readings:
+ // non-finalized locks time out 15 minutes after last seen (ts),
+ // finalized locks time out 15 minutes after the last ping
+ bool recPingChange = o["state"].numberInt() == 2 && ( _lastPingCheck.get<0>() != lastPing["_id"].String() || _lastPingCheck.get<1>() != lastPing["ping"].Date() );
+ bool recTSChange = _lastPingCheck.get<3>() != o["ts"].OID();
+
+ if( recPingChange || recTSChange ) {
+ // If the ping has changed since we last checked, mark the current date and time
+ setLastPing( PingData( lastPing["_id"].String().c_str(), lastPing["ping"].Date(), remote, o["ts"].OID() ) );
+ }
+ else {
+
+ // GOTCHA! Due to network issues, it is possible that the current time
+ // is less than the remote time. We *have* to check this here, otherwise
+ // we overflow and our lock breaks.
+ if(_lastPingCheck.get<2>() >= remote)
+ elapsed = 0;
+ else
+ elapsed = remote - _lastPingCheck.get<2>();
+ }
+ }
+ catch( LockException& e ) {
+
+ // Remote server cannot be found / is not responsive
+ warning() << "Could not get remote time from " << _conn << causedBy( e ) << endl;
+ // If our config server is having issues, forget all the pings until we can see it again
+ resetLastPing();
+
+ }
+
+ if ( elapsed <= takeover && ! canReenter ) {
+ log( logLvl ) << "could not force lock '" << lockName << "' because elapsed time " << elapsed << " <= takeover time " << takeover << endl;
+ *other = o.getOwned(); conn.done();
+ return false;
+ }
+ else if( elapsed > takeover && canReenter ) {
+ log( logLvl - 1 ) << "not re-entering distributed lock '" << lockName << "' because elapsed time " << elapsed << " > takeover time " << takeover << endl;
+ *other = o.getOwned(); conn.done();
+ return false;
+ }
+
+ if( canReenter ) log( logLvl - 1 ) << "re-entering lock '" << lockName << "' because re-entering is allowed" << endl;
+ else log( logLvl - 1 ) << "forcing lock '" << lockName << "' because elapsed time " << elapsed << " > takeover time " << takeover << endl;
+
+ if( elapsed > takeover ) {
+
+ // Lock may be forced; reset our timer whether forcing succeeds or fails.
+ // Ensures that another timeout must pass if something goes wrong here, and resets our
+ // pristine ping state if the lock is acquired.
+ resetLastPing();
+
+ try {
+
+ // Check the clock skew again. If we check this before we get a lock
+ // and after the lock times out, we can be pretty sure the time is
+ // increasing at the same rate on all servers and therefore our
+ // timeout is accurate
+ uassert( 14023, str::stream() << "remote time in cluster " << _conn.toString() << " is now skewed, cannot force lock.", !isRemoteTimeSkewed() );
+
+ // Make sure we break the lock with the correct "ts" (OID) value, otherwise
+ // we can overwrite a new lock inserted in the meantime.
+ conn->update( locksNS , BSON( "_id" << _id["_id"].String() << "state" << o["state"].numberInt() << "ts" << o["ts"] ),
+ BSON( "$set" << BSON( "state" << 0 ) ) );
+
+ BSONObj err = conn->getLastErrorDetailed();
+ string errMsg = DBClientWithCommands::getLastErrorString(err);
+
+ // TODO: Clean up all the extra code to exit this method, probably with a refactor
+ if ( !errMsg.empty() || !err["n"].type() || err["n"].numberInt() < 1 ) {
+ ( errMsg.empty() ? log( logLvl - 1 ) : warning() ) << "Could not force lock '" << lockName << "' "
+ << ( !errMsg.empty() ? causedBy(errMsg) : string("(another force won)") ) << endl;
+ *other = o.getOwned(); conn.done();
+ return false;
+ }
+
+ }
+ catch( UpdateNotTheSame& ) {
+ // Ok to continue since we know we forced at least one lock document, and all lock docs
+ // are required for a lock to be held.
+ warning() << "lock forcing " << lockName << " inconsistent" << endl;
+ }
+ catch( std::exception& e ) {
+ conn.done();
+ throw LockException( str::stream() << "exception forcing distributed lock "
+ << lockName << causedBy( e ), 13660);
+ }
+
+ }
+ else {
+
+ assert( canReenter );
+
+ // Lock may be re-entered; reset our timer whether this succeeds or fails.
+ // Not strictly necessary, but helpful for small timeouts where thread scheduling is significant.
+ // This ensures that two attempts are still required for a force if not acquired, and resets our
+ // state if we are acquired.
+ resetLastPing();
+
+ // Test that the lock is held by trying to update its finalized state to the same state.
+ // If the update does not apply, or does not apply on all servers, we can't re-enter.
+ try {
+
+ // Test the lock with the correct "ts" (OID) value
+ conn->update( locksNS , BSON( "_id" << _id["_id"].String() << "state" << 2 << "ts" << o["ts"] ),
+ BSON( "$set" << BSON( "state" << 2 ) ) );
+
+ BSONObj err = conn->getLastErrorDetailed();
+ string errMsg = DBClientWithCommands::getLastErrorString(err);
+
+ // TODO: Clean up all the extra code to exit this method, probably with a refactor
+ if ( ! errMsg.empty() || ! err["n"].type() || err["n"].numberInt() < 1 ) {
+ ( errMsg.empty() ? log( logLvl - 1 ) : warning() ) << "Could not re-enter lock '" << lockName << "' "
+ << ( !errMsg.empty() ? causedBy(errMsg) : string("(not sure lock is held)") )
+ << " gle: " << err
+ << endl;
+ *other = o.getOwned(); conn.done();
+ return false;
+ }
+
+ }
+ catch( UpdateNotTheSame& ) {
+ // NOT ok to continue since our lock isn't held by all servers, so isn't valid.
+ warning() << "inconsistent state re-entering lock, lock " << lockName << " not held" << endl;
+ *other = o.getOwned(); conn.done();
+ return false;
+ }
+ catch( std::exception& e ) {
+ conn.done();
+ throw LockException( str::stream() << "exception re-entering distributed lock "
+ << lockName << causedBy( e ), 13660);
+ }
+
+ log( logLvl - 1 ) << "re-entered distributed lock '" << lockName << "'" << endl;
+ *other = o.getOwned(); conn.done();
+ return true;
+
+ }
+
+ log( logLvl - 1 ) << "lock '" << lockName << "' successfully forced" << endl;
+
+ // We don't need the ts value in the query, since we will only ever replace locks with state=0.
+ }
+ // Case 3: We have an expired lock
+ else if ( o["ts"].type() ) {
+ queryBuilder.append( o["ts"] );
+ }
+ }
+
+ // Always reset our ping if we're trying to get a lock, since getting a lock implies the lock state is open
+ // and no locks need to be forced. If anything goes wrong, we don't want to remember an old lock.
+ resetLastPing();
+
+ bool gotLock = false;
+ BSONObj currLock;
+
+ BSONObj lockDetails = BSON( "state" << 1 << "who" << getDistLockId() << "process" << _processId <<
+ "when" << jsTime() << "why" << why << "ts" << OID::gen() );
+ BSONObj whatIWant = BSON( "$set" << lockDetails );
+
+ BSONObj query = queryBuilder.obj();
+
+ string lockName = _name + string("/") + _processId;
+
+ try {
+
+ // Main codepath to acquire lock
+
+ log( logLvl ) << "about to acquire distributed lock '" << lockName << "':\n"
+ << lockDetails.jsonString(Strict, true) << "\n"
+ << query.jsonString(Strict, true) << endl;
+
+ conn->update( locksNS , query , whatIWant );
+
+ BSONObj err = conn->getLastErrorDetailed();
+ string errMsg = DBClientWithCommands::getLastErrorString(err);
+
+ currLock = conn->findOne( locksNS , _id );
+
+ if ( !errMsg.empty() || !err["n"].type() || err["n"].numberInt() < 1 ) {
+ ( errMsg.empty() ? log( logLvl - 1 ) : warning() ) << "could not acquire lock '" << lockName << "' "
+ << ( !errMsg.empty() ? causedBy( errMsg ) : string("(another update won)") ) << endl;
+ *other = currLock.getOwned();
+ gotLock = false;
+ }
+ else {
+ gotLock = true;
+ }
+
+ }
+ catch ( UpdateNotTheSame& up ) {
+
+ // this means our update got through on some, but not others
+ warning() << "distributed lock '" << lockName << "' did not propagate properly." << causedBy( up ) << endl;
+
+ // Overall protection derives from the fact that all unlocking updates use the ts value
+ // when setting state to 0. This ensures that during locking, we can override all
+ // smaller-ts locks with our own safe ts value and not be unlocked afterward.
+ for ( unsigned i = 0; i < up.size(); i++ ) {
+
+ ScopedDbConnection indDB( up[i].first );
+ BSONObj indUpdate;
+
+ try {
+
+ indUpdate = indDB->findOne( locksNS , _id );
+
+ // If we override this lock in any way, grab and protect it.
+ // We assume/ensure that if a process does not have all lock documents, it is no longer
+ // holding the lock.
+ // Note - finalized locks may compete too, but we know they've won already if competing
+ // in this round. Cleanup of crashes during finalizing may take a few tries.
+ if( indUpdate["ts"] < lockDetails["ts"] || indUpdate["state"].numberInt() == 0 ) {
+
+ BSONObj grabQuery = BSON( "_id" << _id["_id"].String() << "ts" << indUpdate["ts"].OID() );
+
+ // Change ts so we won't be forced, state so we won't be relocked
+ BSONObj grabChanges = BSON( "ts" << lockDetails["ts"].OID() << "state" << 1 );
+
+ // Either our update will succeed, and we'll grab the lock, or it will fail b/c some other
+ // process grabbed the lock (which will change the ts), but the lock will be set until forcing
+ indDB->update( locksNS, grabQuery, BSON( "$set" << grabChanges ) );
+
+ indUpdate = indDB->findOne( locksNS, _id );
+
+ // Our lock should now be set until forcing.
+ assert( indUpdate["state"].numberInt() == 1 );
+
+ }
+ // else our lock is the same, in which case we're safe, or it's a bigger lock,
+ // in which case we won't need to protect anything since we won't have the lock.
+
+ }
+ catch( std::exception& e ) {
+ conn.done();
+ throw LockException( str::stream() << "distributed lock " << lockName
+ << " had errors communicating with individual server "
+ << up[i].first << causedBy( e ), 13661 );
+ }
+
+ assert( !indUpdate.isEmpty() );
+
+ // Find max TS value
+ if ( currLock.isEmpty() || currLock["ts"] < indUpdate["ts"] ) {
+ currLock = indUpdate.getOwned();
+ }
+
+ indDB.done();
+
+ }
+
+ // Locks on all servers are now set and safe until forcing
+
+ if ( currLock["ts"] == lockDetails["ts"] ) {
+ log( logLvl - 1 ) << "lock update won, completing lock propagation for '" << lockName << "'" << endl;
+ gotLock = true;
+ }
+ else {
+ log( logLvl - 1 ) << "lock update lost, lock '" << lockName << "' not propagated." << endl;
+
+ // Register the lock for deletion, to speed up failover
+ // Not strictly necessary, but helpful
+ distLockPinger.addUnlockOID( lockDetails["ts"].OID() );
+
+ gotLock = false;
+ }
+ }
+ catch( std::exception& e ) {
+ conn.done();
+ throw LockException( str::stream() << "exception creating distributed lock "
+ << lockName << causedBy( e ), 13663 );
+ }
+
+ // Complete lock propagation
+ if( gotLock ) {
+
+ // This is now safe, since we know that no new locks will be placed on top of the ones
+ // we've checked for at least 15 minutes. Set state = 2 so that future clients can
+ // determine that the lock is truly set. The invariant for rollbacks is that we will
+ // never force locks with state = 2 and active pings, since that indicates the lock is
+ // active; but this means the process creating/destroying them must explicitly poll
+ // when something goes wrong.
+ try {
+
+ BSONObjBuilder finalLockDetails;
+ BSONObjIterator bi( lockDetails );
+ while( bi.more() ) {
+ BSONElement el = bi.next();
+ if( (string) ( el.fieldName() ) == "state" )
+ finalLockDetails.append( "state", 2 );
+ else finalLockDetails.append( el );
+ }
+
+ conn->update( locksNS , _id , BSON( "$set" << finalLockDetails.obj() ) );
+
+ BSONObj err = conn->getLastErrorDetailed();
+ string errMsg = DBClientWithCommands::getLastErrorString(err);
+
+ currLock = conn->findOne( locksNS , _id );
+
+ if ( !errMsg.empty() || !err["n"].type() || err["n"].numberInt() < 1 ) {
+ warning() << "could not finalize winning lock " << lockName
+ << ( !errMsg.empty() ? causedBy( errMsg ) : " (did not update lock) " ) << endl;
+ gotLock = false;
+ }
+ else {
+ // SUCCESS!
+ gotLock = true;
+ }
+
+ }
+ catch( std::exception& e ) {
+ conn.done();
+
+ // Register the bad final lock for deletion, in case it exists
+ distLockPinger.addUnlockOID( lockDetails["ts"].OID() );
+
+ throw LockException( str::stream() << "exception finalizing winning lock"
+ << causedBy( e ), 13662 );
+ }
+
+ }
+
+ *other = currLock.getOwned();
+
+ // Log our lock results
+ if(gotLock)
+ log( logLvl - 1 ) << "distributed lock '" << lockName << "' acquired, ts : " << currLock["ts"].OID() << endl;
+ else
+ log( logLvl - 1 ) << "distributed lock '" << lockName << "' was not acquired." << endl;
+
+ conn.done();
+
+ return gotLock;
+ }
+
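+ // Editor's sketch (illustrative, not part of the original change): the calling pattern
+ // implied by lock_try()'s semantics above -- a false return may simply be retried later,
+ // while a LockException means the attempt must be abandoned. Compiled out; doWork() is a
+ // hypothetical stand-in for the work done under the lock.
+#if 0
+ static bool tryLockedWork( DistributedLock& lk ) {
+ BSONObj lockDoc; // competing holder's doc on failure, our own lock doc on success
+ try {
+ if ( ! lk.lock_try( "doing protected work" , false , &lockDoc ) )
+ return false; // lock busy, safe to retry later
+ }
+ catch ( LockException& e ) {
+ warning() << "lock attempt failed, not retrying" << causedBy( e ) << endl;
+ return false;
+ }
+ doWork();
+ lk.unlock( &lockDoc );
+ return true;
+ }
+#endif
+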
+ // unlock() takes an optional pointer to the lock document, so you can be specific about
+ // which particular lock you want to unlock. This is required when the config server is
+ // down and so cannot tell you which lock ts you should try later.
+ void DistributedLock::unlock( BSONObj* oldLockPtr ) {
+
+ assert( _name != "" );
+
+ string lockName = _name + string("/") + _processId;
+
+ const int maxAttempts = 3;
+ int attempted = 0;
+
+ BSONObj oldLock;
+ if( oldLockPtr ) oldLock = *oldLockPtr;
+
+ while ( ++attempted <= maxAttempts ) {
+
+ ScopedDbConnection conn( _conn );
+
+ try {
+
+ if( oldLock.isEmpty() )
+ oldLock = conn->findOne( locksNS, _id );
+
+ if( oldLock["state"].eoo() || oldLock["state"].numberInt() != 2 || oldLock["ts"].eoo() ) {
+ warning() << "cannot unlock invalid distributed lock " << oldLock << endl;
+ conn.done();
+ break;
+ }
+
+ // Use ts when updating lock, so that new locks can be sure they won't get trampled.
+ conn->update( locksNS ,
+ BSON( "_id" << _id["_id"].String() << "ts" << oldLock["ts"].OID() ),
+ BSON( "$set" << BSON( "state" << 0 ) ) );
+
+ // Check that the lock was actually unlocked... if not, try again
+ BSONObj err = conn->getLastErrorDetailed();
+ string errMsg = DBClientWithCommands::getLastErrorString(err);
+
+ if ( !errMsg.empty() || !err["n"].type() || err["n"].numberInt() < 1 ){
+ warning() << "distributed lock unlock update failed, retrying "
+ << ( errMsg.empty() ? causedBy( "( update not registered )" ) : causedBy( errMsg ) ) << endl;
+ conn.done();
+ continue;
+ }
+
+ log( logLvl - 1 ) << "distributed lock '" << lockName << "' unlocked. " << endl;
+ conn.done();
+ return;
+ }
+ catch( UpdateNotTheSame& ) {
+ log( logLvl - 1 ) << "distributed lock '" << lockName << "' unlocked (messily). " << endl;
+ conn.done();
+ break;
+ }
+ catch ( std::exception& e) {
+ warning() << "distributed lock '" << lockName << "' failed unlock attempt."
+ << causedBy( e ) << endl;
+
+ conn.done();
+ // TODO: If our lock timeout is small, sleeping this long may be unsafe.
+ if( attempted != maxAttempts) sleepsecs(1 << attempted);
+ }
+ }
+
+ if( attempted > maxAttempts && ! oldLock.isEmpty() && ! oldLock["ts"].eoo() ) {
+
+ log( logLvl - 1 ) << "could not unlock distributed lock with ts " << oldLock["ts"].OID()
+ << ", will attempt again later" << endl;
+
+ // We couldn't unlock the lock at all, so try again later in the pinging thread...
+ distLockPinger.addUnlockOID( oldLock["ts"].OID() );
+ }
+ else if( attempted > maxAttempts ) {
+ warning() << "could not unlock untracked distributed lock, a manual force may be required" << endl;
+ }
+
+ warning() << "distributed lock '" << lockName << "' couldn't consummate unlock request. "
+ << "lock may be taken over after " << ( _lockTimeout / (60 * 1000) )
+ << " minutes timeout." << endl;
+ }
+
+
+
+}
diff --git a/src/mongo/client/distlock.h b/src/mongo/client/distlock.h
new file mode 100644
index 00000000000..106a5d00001
--- /dev/null
+++ b/src/mongo/client/distlock.h
@@ -0,0 +1,244 @@
+// distlock.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../pch.h"
+#include "dbclient.h"
+#include "connpool.h"
+#include "redef_macros.h"
+#include "syncclusterconnection.h"
+
+#define LOCK_TIMEOUT (15 * 60 * 1000)
+#define LOCK_SKEW_FACTOR (30)
+#define LOCK_PING (LOCK_TIMEOUT / LOCK_SKEW_FACTOR)
+#define MAX_LOCK_NET_SKEW (LOCK_TIMEOUT / LOCK_SKEW_FACTOR)
+#define MAX_LOCK_CLOCK_SKEW (LOCK_TIMEOUT / LOCK_SKEW_FACTOR)
+#define NUM_LOCK_SKEW_CHECKS (3)
+
+// The maximum clock skew we need to handle between config servers is
+// 2 * MAX_LOCK_NET_SKEW + MAX_LOCK_CLOCK_SKEW.
+
+// Net effect of *this* clock being slow is effectively a multiplier on the max net skew
+// and a linear increase or decrease of the max clock skew.
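+
+// Editor's note, arithmetic from the defaults above: LOCK_TIMEOUT = 15 * 60 * 1000
+// = 900000 ms and LOCK_SKEW_FACTOR = 30, so LOCK_PING, MAX_LOCK_NET_SKEW and
+// MAX_LOCK_CLOCK_SKEW are each 900000 / 30 = 30000 ms, and the maximum clock skew
+// handled between config servers is 2 * 30000 + 30000 = 90000 ms (1.5 minutes).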
+
+namespace mongo {
+
+ /**
+ * Exception class to encapsulate exceptions while managing distributed locks
+ */
+ class LockException : public DBException {
+ public:
+ LockException( const char * msg , int code ) : DBException( msg, code ) {}
+ LockException( const string& msg, int code ) : DBException( msg, code ) {}
+ virtual ~LockException() throw() { }
+ };
+
+ /**
+ * Indicates an error in retrieving time values from remote servers.
+ */
+ class TimeNotFoundException : public LockException {
+ public:
+ TimeNotFoundException( const char * msg , int code ) : LockException( msg, code ) {}
+ TimeNotFoundException( const string& msg, int code ) : LockException( msg, code ) {}
+ virtual ~TimeNotFoundException() throw() { }
+ };
+
+ /**
+ * The distributed lock is a configdb backed way of synchronizing system-wide tasks. A task must be identified by a
+ * unique name across the system (e.g., "balancer"). A lock is taken by writing a document in the configdb's locks
+ * collection with that name.
+ *
+ * To be maintained, each taken lock needs to be revalidated ("pinged") within a pre-established amount of time. This
+ * class does this maintenance automatically once a DistributedLock object is constructed.
+ */
+ class DistributedLock {
+ public:
+
+ static LabeledLevel logLvl;
+
+ typedef boost::tuple<string, Date_t, Date_t, OID> PingData;
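+ // Tuple fields (editor's annotation, inferred from lock_try()'s usage):
+ // <0> pinger process id, <1> last ping time seen, <2> remote config server
+ // time when that ping was recorded, <3> the lock's ts OID at that point.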
+
+ class LastPings {
+ public:
+ LastPings() : _mutex( "DistributedLock::LastPings" ) {}
+ ~LastPings(){}
+
+ PingData getLastPing( const ConnectionString& conn, const string& lockName );
+ void setLastPing( const ConnectionString& conn, const string& lockName, const PingData& pd );
+
+ mongo::mutex _mutex;
+ map< std::pair<string, string>, PingData > _lastPings;
+ };
+
+ static LastPings lastPings;
+
+ /**
+ * The constructor does not connect to the configdb yet and constructing does not mean the lock was acquired.
+ * Construction does trigger a lock "pinging" mechanism, though.
+ *
+ * @param conn address of config(s) server(s)
+ * @param name identifier for the lock
+ * @param lockTimeout how long the lock can go "unpinged" before a new attempt to lock steals it (in ms)
+ * @param asProcess whether the lock's identity (and its pinging) is tied to the process as a
+ * whole rather than to this particular object
+ *
+ */
+ DistributedLock( const ConnectionString& conn , const string& name , unsigned long long lockTimeout = 0, bool asProcess = false );
+ ~DistributedLock(){};
+
+ /**
+ * Attempts to acquire 'this' lock, checking if it could or should be stolen from the previous holder. Please
+ * consider using the dist_lock_try construct to acquire this lock in an exception safe way.
+ *
+ * @param why human readable description of why the lock is being taken (used to log)
+ * @param reenter whether this is a lock re-entry or a new lock
+ * @param other configdb's lock document that is currently holding the lock, if lock is taken, or our own lock
+ * details if not
+ * @return true if it managed to grab the lock
+ */
+ bool lock_try( const string& why , bool reenter = false, BSONObj * other = 0 );
+
+ /**
+ * Releases a previously taken lock.
+ */
+ void unlock( BSONObj* oldLockPtr = NULL );
+
+ Date_t getRemoteTime();
+
+ bool isRemoteTimeSkewed();
+
+ const string& getProcessId();
+
+ const ConnectionString& getRemoteConnection();
+
+ /**
+ * Check the skew between a cluster of servers
+ */
+ static bool checkSkew( const ConnectionString& cluster, unsigned skewChecks = NUM_LOCK_SKEW_CHECKS, unsigned long long maxClockSkew = MAX_LOCK_CLOCK_SKEW, unsigned long long maxNetSkew = MAX_LOCK_NET_SKEW );
+
+ /**
+ * Get the remote time from a server or cluster
+ */
+ static Date_t remoteTime( const ConnectionString& cluster, unsigned long long maxNetSkew = MAX_LOCK_NET_SKEW );
+
+ static bool killPinger( DistributedLock& lock );
+
+ /**
+ * Namespace for lock pings
+ */
+ static const string lockPingNS;
+
+ /**
+ * Namespace for locks
+ */
+ static const string locksNS;
+
+ const ConnectionString _conn;
+ const string _name;
+ const BSONObj _id;
+ const string _processId;
+
+ // Timeout for lock, usually LOCK_TIMEOUT
+ const unsigned long long _lockTimeout;
+ const unsigned long long _maxClockSkew;
+ const unsigned long long _maxNetSkew;
+ const unsigned long long _lockPing;
+
+ private:
+
+ void resetLastPing(){ lastPings.setLastPing( _conn, _name, PingData() ); }
+ void setLastPing( const PingData& pd ){ lastPings.setLastPing( _conn, _name, pd ); }
+ PingData getLastPing(){ return lastPings.getLastPing( _conn, _name ); }
+
+ // Pinging-thread state; the thread may or may not exist yet, depending on usage
+ mongo::mutex _mutex;
+ string _threadId;
+
+ };
+
+ class dist_lock_try {
+ public:
+
+ dist_lock_try() : _lock(NULL), _got(false) {}
+
+ dist_lock_try( const dist_lock_try& that ) : _lock(that._lock), _got(that._got), _other(that._other) {
+ _other = _other.getOwned();
+
+ // Make sure the lock ownership passes to this object,
+ // so we only unlock once.
+ ((dist_lock_try&) that)._got = false;
+ ((dist_lock_try&) that)._lock = NULL;
+ ((dist_lock_try&) that)._other = BSONObj();
+ }
+
+ // Needed so we can handle lock exceptions in context of lock try.
+ dist_lock_try& operator=( const dist_lock_try& that ){
+
+ if( this == &that ) return *this;
+
+ _lock = that._lock;
+ _got = that._got;
+ _other = that._other.getOwned();
+ _why = that._why;
+
+ // Make sure the lock ownership passes to this object,
+ // so we only unlock once.
+ ((dist_lock_try&) that)._got = false;
+ ((dist_lock_try&) that)._lock = NULL;
+ ((dist_lock_try&) that)._other = BSONObj();
+
+ return *this;
+ }
+
+ dist_lock_try( DistributedLock * lock , string why )
+ : _lock(lock), _why(why) {
+ _got = _lock->lock_try( why , false , &_other );
+ }
+
+ ~dist_lock_try() {
+ if ( _got ) {
+ assert( ! _other.isEmpty() );
+ _lock->unlock( &_other );
+ }
+ }
+
+ bool reestablish(){
+ return retry();
+ }
+
+ bool retry() {
+ assert( _lock );
+ assert( _got );
+ assert( ! _other.isEmpty() );
+
+ return _got = _lock->lock_try( _why , true, &_other );
+ }
+
+ bool got() const { return _got; }
+ BSONObj other() const { return _other; }
+
+ private:
+ DistributedLock * _lock;
+ bool _got;
+ BSONObj _other;
+ string _why;
+ };
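+
+ // Editor's sketch (illustrative, not part of the original change): dist_lock_try is the
+ // exception-safe way to take the lock -- the destructor unlocks if and only if the lock
+ // was acquired. Compiled out; doWork() is a hypothetical stand-in.
+#if 0
+ void lockedWork( DistributedLock& lk ) {
+ dist_lock_try myLock( &lk , "doing protected work" );
+ if ( ! myLock.got() ) {
+ // someone else holds the lock; myLock.other() is their lock document
+ return;
+ }
+ doWork(); // if this throws, ~dist_lock_try still unlocks
+ } // lock released here
+#endif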
+
+}
+
diff --git a/src/mongo/client/distlock_test.cpp b/src/mongo/client/distlock_test.cpp
new file mode 100644
index 00000000000..a46caa44c11
--- /dev/null
+++ b/src/mongo/client/distlock_test.cpp
@@ -0,0 +1,446 @@
+// distlock_test.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include "../pch.h"
+#include "dbclient.h"
+#include "distlock.h"
+#include "../db/commands.h"
+#include "../util/bson_util.h"
+#include "../util/timer.h"
+
+// Work around some boost RNG config options that break the MSVC build
+#include <boost/config.hpp>
+
+#if defined(BOOST_MSVC) && defined(BOOST_NO_MEMBER_TEMPLATE_FRIENDS)
+#undef BOOST_NO_MEMBER_TEMPLATE_FRIENDS
+#define BOOST_RNG_HACK
+#endif
+
+// Well, sort-of cross-platform RNG
+#include <boost/random/mersenne_twister.hpp>
+
+#ifdef BOOST_RNG_HACK
+#define BOOST_NO_MEMBER_TEMPLATE_FRIENDS
+#undef BOOST_RNG_HACK
+#endif
+
+
+#include <boost/random/uniform_int.hpp>
+#include <boost/random/variate_generator.hpp>
+
+
+// TODO: Make a method in BSONObj if useful, don't modify for now
+#define string_field(obj, name, def) ( obj.hasField(name) ? obj[name].String() : def )
+#define number_field(obj, name, def) ( obj.hasField(name) ? obj[name].Number() : def )
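+// e.g. number_field(cmdObj, "secs", 10) reads cmdObj["secs"] as a number, defaulting to 10 (editor's example)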
+
+namespace mongo {
+
+ class TestDistLockWithSync: public Command {
+ public:
+ TestDistLockWithSync() :
+ Command("_testDistLockWithSyncCluster") {
+ }
+ virtual void help(stringstream& help) const {
+ help << "should not be calling this directly" << endl;
+ }
+
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual bool adminOnly() const {
+ return true;
+ }
+ virtual LockType locktype() const {
+ return NONE;
+ }
+
+ static void runThread() {
+ while (keepGoing) {
+ if (current->lock_try( "test" )) {
+ count++;
+ int before = count;
+ sleepmillis(3);
+ int after = count;
+
+ if (after != before) {
+ error() << " before: " << before << " after: " << after
+ << endl;
+ }
+
+ current->unlock();
+ }
+ }
+ }
+
+ bool run(const string&, BSONObj& cmdObj, int, string& errmsg,
+ BSONObjBuilder& result, bool) {
+ Timer t;
+ DistributedLock lk(ConnectionString(cmdObj["host"].String(),
+ ConnectionString::SYNC), "testdistlockwithsync", 0, 0);
+ current = &lk;
+ count = 0;
+ gotit = 0;
+ errors = 0;
+ keepGoing = true;
+
+ vector<shared_ptr<boost::thread> > l;
+ for (int i = 0; i < 4; i++) {
+ l.push_back(
+ shared_ptr<boost::thread> (new boost::thread(runThread)));
+ }
+
+ int secs = 10;
+ if (cmdObj["secs"].isNumber())
+ secs = cmdObj["secs"].numberInt();
+ sleepsecs(secs);
+ keepGoing = false;
+
+ for (unsigned i = 0; i < l.size(); i++)
+ l[i]->join();
+
+ current = 0;
+
+ result.append("count", count);
+ result.append("gotit", gotit);
+ result.append("errors", errors);
+ result.append("timeMS", t.millis());
+
+ return errors == 0;
+ }
+
+ // variables for test
+ static DistributedLock * current;
+ static int gotit;
+ static int errors;
+ static AtomicUInt count;
+
+ static bool keepGoing;
+
+ } testDistLockWithSyncCmd;
+
+ DistributedLock * TestDistLockWithSync::current;
+ AtomicUInt TestDistLockWithSync::count;
+ int TestDistLockWithSync::gotit;
+ int TestDistLockWithSync::errors;
+ bool TestDistLockWithSync::keepGoing;
+
+
+
+ class TestDistLockWithSkew: public Command {
+ public:
+
+ static const int logLvl = 1;
+
+ TestDistLockWithSkew() :
+ Command("_testDistLockWithSkew") {
+ }
+ virtual void help(stringstream& help) const {
+ help << "should not be calling this directly" << endl;
+ }
+
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual bool adminOnly() const {
+ return true;
+ }
+ virtual LockType locktype() const {
+ return NONE;
+ }
+
+ void runThread(ConnectionString& hostConn, unsigned threadId, unsigned seed,
+ BSONObj& cmdObj, BSONObjBuilder& result) {
+
+ stringstream ss;
+ ss << "thread-" << threadId;
+ setThreadName(ss.str().c_str());
+
+ // Lock name
+ string lockName = string_field(cmdObj, "lockName", this->name + "_lock");
+
+ // Range of clock skew in diff threads
+ int skewRange = (int) number_field(cmdObj, "skewRange", 1);
+
+ // How long to wait with the lock
+ int threadWait = (int) number_field(cmdObj, "threadWait", 30);
+ if(threadWait <= 0) threadWait = 1;
+
+ // Max amount of time (ms) a thread waits before checking the lock again
+ int threadSleep = (int) number_field(cmdObj, "threadSleep", 30);
+ if(threadSleep <= 0) threadSleep = 1;
+
+ // How long until the lock is forced in ms, only compared locally
+ unsigned long long takeoverMS = (unsigned long long) number_field(cmdObj, "takeoverMS", 0);
+
+ // Whether or not we should hang some threads
+ int hangThreads = (int) number_field(cmdObj, "hangThreads", 0);
+
+
+ boost::mt19937 gen((boost::mt19937::result_type) seed);
+
+ boost::variate_generator<boost::mt19937&, boost::uniform_int<> > randomSkew(gen, boost::uniform_int<>(0, skewRange));
+ boost::variate_generator<boost::mt19937&, boost::uniform_int<> > randomWait(gen, boost::uniform_int<>(1, threadWait));
+ boost::variate_generator<boost::mt19937&, boost::uniform_int<> > randomSleep(gen, boost::uniform_int<>(1, threadSleep));
+ boost::variate_generator<boost::mt19937&, boost::uniform_int<> > randomNewLock(gen, boost::uniform_int<>(0, 3));
+
+
+ int skew = 0;
+ if (!lock.get()) {
+
+ // Pick a skew, but the first two threads skew the whole range
+ if(threadId == 0)
+ skew = -skewRange / 2;
+ else if(threadId == 1)
+ skew = skewRange / 2;
+ else skew = randomSkew() - (skewRange / 2);
+
+ // Skew this thread
+ jsTimeVirtualThreadSkew( skew );
+
+ log() << "Initializing lock with skew of " << skew << " for thread " << threadId << endl;
+
+ lock.reset(new DistributedLock(hostConn, lockName, takeoverMS, true ));
+
+ log() << "Skewed time " << jsTime() << " for thread " << threadId << endl
+ << " max wait (with lock: " << threadWait << ", after lock: " << threadSleep << ")" << endl
+ << " takeover in " << takeoverMS << "(ms remote)" << endl;
+
+ }
+
+ DistributedLock* myLock = lock.get();
+
+ bool errors = false;
+ BSONObj lockObj;
+ while (keepGoing) {
+ try {
+
+ if (myLock->lock_try("Testing distributed lock with skew.", false, &lockObj )) {
+
+ log() << "**** Locked for thread " << threadId << " with ts " << lockObj["ts"] << endl;
+
+ if( count % 2 == 1 && ! myLock->lock_try( "Testing lock re-entry.", true ) ) {
+ errors = true;
+ log() << "**** !Could not re-enter lock already held" << endl;
+ break;
+ }
+
+ if( count % 3 == 1 && myLock->lock_try( "Testing lock non-re-entry.", false ) ) {
+ errors = true;
+ log() << "**** !Invalid lock re-entry" << endl;
+ break;
+ }
+
+ count++;
+ int before = count;
+ int sleep = randomWait();
+ sleepmillis(sleep);
+ int after = count;
+
+ if(after != before) {
+ errors = true;
+ log() << "**** !Bad increment while sleeping with lock for: " << sleep << "ms" << endl;
+ break;
+ }
+
+ // Unlock unless this thread is designated to hang (simulating a crashed process below)
+ if(hangThreads == 0 || threadId % hangThreads != 0) {
+ log() << "**** Unlocking for thread " << threadId << " with ts " << lockObj["ts"] << endl;
+ myLock->unlock( &lockObj );
+ }
+ else {
+ log() << "**** Not unlocking for thread " << threadId << endl;
+ assert( DistributedLock::killPinger( *myLock ) );
+ // We're simulating a crashed process...
+ break;
+ }
+ }
+
+ }
+ catch( LockException& e ) {
+ log() << "*** !Could not try distributed lock." << causedBy( e ) << endl;
+ break;
+ }
+
+ // Create a new lock object half the time
+ if( randomNewLock() > 1 ){
+ lock.reset(new DistributedLock( hostConn, lockName, takeoverMS, true ));
+ myLock = lock.get();
+ }
+
+ sleepmillis(randomSleep());
+ }
+
+ result << "errors" << errors
+ << "skew" << skew
+ << "takeover" << (long long) takeoverMS
+ << "localTimeout" << (takeoverMS > 0);
+
+ }
+
+ void test(ConnectionString& hostConn, string& lockName, unsigned seed) {
+ return;
+ }
+
+ bool run(const string&, BSONObj& cmdObj, int, string& errmsg,
+ BSONObjBuilder& result, bool) {
+
+ Timer t;
+
+ ConnectionString hostConn(cmdObj["host"].String(),
+ ConnectionString::SYNC);
+
+ unsigned seed = (unsigned) number_field(cmdObj, "seed", 0);
+ int numThreads = (int) number_field(cmdObj, "numThreads", 4);
+ int wait = (int) number_field(cmdObj, "wait", 10000);
+
+ log() << "Starting " << this->name << " with -" << endl
+ << " seed: " << seed << endl
+ << " numThreads: " << numThreads << endl
+ << " total wait: " << wait << endl << endl;
+
+ // Skew host clocks if needed
+ try {
+ skewClocks( hostConn, cmdObj );
+ }
+ catch( DBException& e ) {
+ errmsg = str::stream() << "Clocks could not be skewed." << causedBy( e );
+ return false;
+ }
+
+ count = 0;
+ keepGoing = true;
+
+ vector<shared_ptr<boost::thread> > threads;
+ vector<shared_ptr<BSONObjBuilder> > results;
+ for (int i = 0; i < numThreads; i++) {
+ results.push_back(shared_ptr<BSONObjBuilder> (new BSONObjBuilder()));
+ threads.push_back(shared_ptr<boost::thread> (new boost::thread(
+ boost::bind(&TestDistLockWithSkew::runThread, this,
+ hostConn, (unsigned) i, seed + i, boost::ref(cmdObj),
+ boost::ref(*(results[i].get()))))));
+ }
+
+ sleepsecs(wait / 1000);
+ keepGoing = false;
+
+ bool errors = false;
+ for (unsigned i = 0; i < threads.size(); i++) {
+ threads[i]->join();
+ errors = errors || results[i].get()->obj()["errors"].Bool();
+ }
+
+ result.append("count", count);
+ result.append("errors", errors);
+ result.append("timeMS", t.millis());
+
+ return !errors;
+
+ }
+
+ /**
+ * Skews the clocks of a remote cluster by a particular amount, specified by
+ * the "skewHosts" element in a BSONObj.
+ */
+ static void skewClocks( ConnectionString& cluster, BSONObj& cmdObj ) {
+
+ vector<long long> skew;
+ if(cmdObj.hasField("skewHosts")) {
+ bsonArrToNumVector<long long>(cmdObj["skewHosts"], skew);
+ }
+ else {
+ log( logLvl ) << "No host clocks to skew." << endl;
+ return;
+ }
+
+ log( logLvl ) << "Skewing clocks of hosts " << cluster << endl;
+
+ unsigned s = 0;
+ for(vector<long long>::iterator i = skew.begin(); i != skew.end(); ++i,s++) {
+
+ ConnectionString server( cluster.getServers()[s] );
+ ScopedDbConnection conn( server );
+
+ BSONObj result;
+ try {
+ bool success = conn->runCommand( string("admin"), BSON( "_skewClockCommand" << 1 << "skew" << *i ), result );
+
+ uassert(13678, str::stream() << "Could not communicate with server " << server.toString() << " in cluster " << cluster.toString() << " to change skew by " << *i, success );
+
+ log( logLvl + 1 ) << " Skewed host " << server << " clock by " << *i << endl;
+ }
+ catch(...) {
+ conn.done();
+ throw;
+ }
+
+ conn.done();
+
+ }
+
+ }
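+
+ // Editor's example (illustrative invocation, run against the admin db):
+ // db.runCommand({ _testDistLockWithSkew : 1, host : "cfg1,cfg2,cfg3",
+ // skewHosts : [ -5000, 0, 5000 ], numThreads : 4, wait : 10000 })
+ // skews the three config hosts' clocks by -5s, 0 and +5s before the test runs.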
+
+ // variables for test
+ thread_specific_ptr<DistributedLock> lock;
+ AtomicUInt count;
+ bool keepGoing;
+
+ } testDistLockWithSkewCmd;
+
+
+ /**
+ * Utility command to virtually skew the clock of a mongo server a particular amount.
+ * This skews the clock globally; per-thread skew is also possible.
+ */
+ class SkewClockCommand: public Command {
+ public:
+ SkewClockCommand() :
+ Command("_skewClockCommand") {
+ }
+ virtual void help(stringstream& help) const {
+ help << "should not be calling this directly" << endl;
+ }
+
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual bool adminOnly() const {
+ return true;
+ }
+ virtual LockType locktype() const {
+ return NONE;
+ }
+
+ bool run(const string&, BSONObj& cmdObj, int, string& errmsg,
+ BSONObjBuilder& result, bool) {
+
+ long long skew = (long long) number_field(cmdObj, "skew", 0);
+
+ log() << "Adjusting jsTime() clock skew to " << skew << endl;
+
+ jsTimeVirtualSkew( skew );
+
+ log() << "JSTime adjusted, now is " << jsTime() << endl;
+
+ return true;
+
+ }
+
+ } testSkewClockCommand;
+
+}
+
diff --git a/src/mongo/client/examples/authTest.cpp b/src/mongo/client/examples/authTest.cpp
new file mode 100644
index 00000000000..71cdd390cff
--- /dev/null
+++ b/src/mongo/client/examples/authTest.cpp
@@ -0,0 +1,54 @@
+// authTest.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+
+#include "client/dbclient.h"
+
+using namespace mongo;
+
+int main( int argc, const char **argv ) {
+
+ const char *port = "27017";
+ if ( argc != 1 ) {
+ if ( argc != 3 )
+ throw -12;
+ port = argv[ 2 ];
+ }
+
+ DBClientConnection conn;
+ string errmsg;
+ if ( ! conn.connect( string( "127.0.0.1:" ) + port , errmsg ) ) {
+ cout << "couldn't connect : " << errmsg << endl;
+ throw -11;
+ }
+
+ {
+ // clean up old data from any previous tests
+ conn.remove( "test.system.users" , BSONObj() );
+ }
+
+ conn.insert( "test.system.users" , BSON( "user" << "eliot" << "pwd" << conn.createPasswordDigest( "eliot" , "bar" ) ) );
+
+ errmsg.clear();
+ bool ok = conn.auth( "test" , "eliot" , "bar" , errmsg );
+ if ( ! ok )
+ cout << errmsg << endl;
+ MONGO_assert( ok );
+
+ MONGO_assert( ! conn.auth( "test" , "eliot" , "bars" , errmsg ) );
+}
diff --git a/src/mongo/client/examples/clientTest.cpp b/src/mongo/client/examples/clientTest.cpp
new file mode 100644
index 00000000000..aaea6bd1bdf
--- /dev/null
+++ b/src/mongo/client/examples/clientTest.cpp
@@ -0,0 +1,279 @@
+// clientTest.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * a simple test for the c++ driver
+ */
+
+// this header should be first to ensure that it includes cleanly in any context
+#include "client/dbclient.h"
+
+#include <iostream>
+
+#ifndef assert
+# define assert(x) MONGO_assert(x)
+#endif
+
+using namespace std;
+using namespace mongo;
+
+int main( int argc, const char **argv ) {
+
+ const char *port = "27017";
+ if ( argc != 1 ) {
+ if ( argc != 3 )
+ throw -12;
+ port = argv[ 2 ];
+ }
+
+ DBClientConnection conn;
+ string errmsg;
+ if ( ! conn.connect( string( "127.0.0.1:" ) + port , errmsg ) ) {
+ cout << "couldn't connect : " << errmsg << endl;
+ throw -11;
+ }
+
+ const char * ns = "test.test1";
+
+ conn.dropCollection(ns);
+
+ // clean up old data from any previous tests
+ conn.remove( ns, BSONObj() );
+ assert( conn.findOne( ns , BSONObj() ).isEmpty() );
+
+ // test insert
+ conn.insert( ns ,BSON( "name" << "eliot" << "num" << 1 ) );
+ assert( ! conn.findOne( ns , BSONObj() ).isEmpty() );
+
+ // test remove
+ conn.remove( ns, BSONObj() );
+ assert( conn.findOne( ns , BSONObj() ).isEmpty() );
+
+
+ // insert, findOne testing
+ conn.insert( ns , BSON( "name" << "eliot" << "num" << 1 ) );
+ {
+ BSONObj res = conn.findOne( ns , BSONObj() );
+ assert( strstr( res.getStringField( "name" ) , "eliot" ) );
+ assert( ! strstr( res.getStringField( "name2" ) , "eliot" ) );
+ assert( 1 == res.getIntField( "num" ) );
+ }
+
+
+ // cursor
+ conn.insert( ns ,BSON( "name" << "sara" << "num" << 2 ) );
+ {
+ auto_ptr<DBClientCursor> cursor = conn.query( ns , BSONObj() );
+ int count = 0;
+ while ( cursor->more() ) {
+ count++;
+ BSONObj obj = cursor->next();
+ }
+ assert( count == 2 );
+ }
+
+ {
+ auto_ptr<DBClientCursor> cursor = conn.query( ns , BSON( "num" << 1 ) );
+ int count = 0;
+ while ( cursor->more() ) {
+ count++;
+ BSONObj obj = cursor->next();
+ }
+ assert( count == 1 );
+ }
+
+ {
+ auto_ptr<DBClientCursor> cursor = conn.query( ns , BSON( "num" << 3 ) );
+ int count = 0;
+ while ( cursor->more() ) {
+ count++;
+ BSONObj obj = cursor->next();
+ }
+ assert( count == 0 );
+ }
+
+ // update
+ {
+ BSONObj res = conn.findOne( ns , BSONObjBuilder().append( "name" , "eliot" ).obj() );
+ assert( ! strstr( res.getStringField( "name2" ) , "eliot" ) );
+
+ BSONObj after = BSONObjBuilder().appendElements( res ).append( "name2" , "h" ).obj();
+
+ conn.update( ns , BSONObjBuilder().append( "name" , "eliot2" ).obj() , after );
+ res = conn.findOne( ns , BSONObjBuilder().append( "name" , "eliot" ).obj() );
+ assert( ! strstr( res.getStringField( "name2" ) , "eliot" ) );
+ assert( conn.findOne( ns , BSONObjBuilder().append( "name" , "eliot2" ).obj() ).isEmpty() );
+
+ conn.update( ns , BSONObjBuilder().append( "name" , "eliot" ).obj() , after );
+ res = conn.findOne( ns , BSONObjBuilder().append( "name" , "eliot" ).obj() );
+ assert( strstr( res.getStringField( "name" ) , "eliot" ) );
+ assert( strstr( res.getStringField( "name2" ) , "h" ) );
+ assert( conn.findOne( ns , BSONObjBuilder().append( "name" , "eliot2" ).obj() ).isEmpty() );
+
+ // upsert
+ conn.update( ns , BSONObjBuilder().append( "name" , "eliot2" ).obj() , after , 1 );
+ assert( ! conn.findOne( ns , BSONObjBuilder().append( "name" , "eliot" ).obj() ).isEmpty() );
+
+ }
+
+ {
+ // ensure index
+ assert( conn.ensureIndex( ns , BSON( "name" << 1 ) ) );
+ assert( ! conn.ensureIndex( ns , BSON( "name" << 1 ) ) );
+ }
+
+ {
+ // hint related tests
+ assert( conn.findOne(ns, "{}")["name"].str() == "sara" );
+
+ assert( conn.findOne(ns, "{ name : 'eliot' }")["name"].str() == "eliot" );
+ assert( conn.getLastError() == "" );
+
+ // nonexistent index test
+ bool asserted = false;
+ try {
+ conn.findOne(ns, Query("{name:\"eliot\"}").hint("{foo:1}"));
+ }
+ catch ( ... ) {
+ asserted = true;
+ }
+ assert( asserted );
+
+ //existing index
+ assert( conn.findOne(ns, Query("{name:'eliot'}").hint("{name:1}")).hasElement("name") );
+
+ // run validate
+ assert( conn.validate( ns ) );
+ }
+
+ {
+ // timestamp test
+
+ const char * tsns = "test.tstest1";
+ conn.dropCollection( tsns );
+
+ {
+ mongo::BSONObjBuilder b;
+ b.appendTimestamp( "ts" );
+ conn.insert( tsns , b.obj() );
+ }
+
+ mongo::BSONObj out = conn.findOne( tsns , mongo::BSONObj() );
+ Date_t oldTime = out["ts"].timestampTime();
+ unsigned int oldInc = out["ts"].timestampInc();
+
+ {
+ mongo::BSONObjBuilder b1;
+ b1.append( out["_id"] );
+
+ mongo::BSONObjBuilder b2;
+ b2.append( out["_id"] );
+ b2.appendTimestamp( "ts" );
+
+ conn.update( tsns , b1.obj() , b2.obj() );
+ }
+
+ BSONObj found = conn.findOne( tsns , mongo::BSONObj() );
+ cout << "old: " << out << "\nnew: " << found << endl;
+ assert( ( oldTime < found["ts"].timestampTime() ) ||
+ ( oldTime == found["ts"].timestampTime() && oldInc < found["ts"].timestampInc() ) );
+
+ }
+
+ {
+ // check that killcursors doesn't affect last error
+ assert( conn.getLastError().empty() );
+
+ BufBuilder b;
+ b.appendNum( (int)0 ); // reserved
+ b.appendNum( (int)-1 ); // invalid # of cursors triggers exception
+ b.appendNum( (int)-1 ); // bogus cursor id
+
+ Message m;
+ m.setData( dbKillCursors, b.buf(), b.len() );
+
+ // say() is protected in DBClientConnection, so get superclass
+ static_cast< DBConnector* >( &conn )->say( m );
+
+ assert( conn.getLastError().empty() );
+ }
+
+ {
+ list<string> l = conn.getDatabaseNames();
+ for ( list<string>::iterator i = l.begin(); i != l.end(); i++ ) {
+ cout << "db name : " << *i << endl;
+ }
+
+ l = conn.getCollectionNames( "test" );
+ for ( list<string>::iterator i = l.begin(); i != l.end(); i++ ) {
+ cout << "coll name : " << *i << endl;
+ }
+ }
+
+ {
+ //Map Reduce (this mostly just tests that it compiles with all output types)
+ const string ns = "test.mr";
+ conn.insert(ns, BSON("a" << 1));
+ conn.insert(ns, BSON("a" << 1));
+
+ const char* map = "function() { emit(this.a, 1); }";
+ const char* reduce = "function(key, values) { return Array.sum(values); }";
+
+ const string outcoll = ns + ".out";
+
+ BSONObj out;
+ out = conn.mapreduce(ns, map, reduce, BSONObj()); // default to inline
+ //MONGO_PRINT(out);
+ out = conn.mapreduce(ns, map, reduce, BSONObj(), outcoll);
+ //MONGO_PRINT(out);
+ out = conn.mapreduce(ns, map, reduce, BSONObj(), outcoll.c_str());
+ //MONGO_PRINT(out);
+ out = conn.mapreduce(ns, map, reduce, BSONObj(), BSON("reduce" << outcoll));
+ //MONGO_PRINT(out);
+ }
+
+ {
+ // test timeouts
+
+ DBClientConnection conn( true , 0 , 2 );
+ if ( ! conn.connect( string( "127.0.0.1:" ) + port , errmsg ) ) {
+ cout << "couldn't connect : " << errmsg << endl;
+ throw -11;
+ }
+ conn.insert( "test.totest" , BSON( "x" << 1 ) );
+ BSONObj res;
+
+ bool gotError = false;
+ assert( conn.eval( "test" , "return db.totest.findOne().x" , res ) );
+ try {
+ conn.eval( "test" , "sleep(5000); return db.totest.findOne().x" , res );
+ }
+ catch ( std::exception& e ) {
+ gotError = true;
+ log() << e.what() << endl;
+ }
+ assert( gotError );
+ // sleep so the server isn't locked anymore
+ sleepsecs( 4 );
+
+ assert( conn.eval( "test" , "return db.totest.findOne().x" , res ) );
+
+
+ }
+
+ cout << "client test finished!" << endl;
+}
diff --git a/src/mongo/client/examples/first.cpp b/src/mongo/client/examples/first.cpp
new file mode 100644
index 00000000000..ab5efb325f5
--- /dev/null
+++ b/src/mongo/client/examples/first.cpp
@@ -0,0 +1,86 @@
+// first.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * this is a good first example of how to use mongo from c++
+ */
+
+#include <iostream>
+
+#include "client/dbclient.h"
+
+using namespace std;
+
+void insert( mongo::DBClientConnection & conn , const char * name , int num ) {
+ mongo::BSONObjBuilder obj;
+ obj.append( "name" , name );
+ obj.append( "num" , num );
+ conn.insert( "test.people" , obj.obj() );
+}
+
+int main( int argc, const char **argv ) {
+
+ const char *port = "27017";
+ if ( argc != 1 ) {
+ if ( argc != 3 )
+ throw -12;
+ port = argv[ 2 ];
+ }
+
+ mongo::DBClientConnection conn;
+ string errmsg;
+ if ( ! conn.connect( string( "127.0.0.1:" ) + port , errmsg ) ) {
+ cout << "couldn't connect : " << errmsg << endl;
+ throw -11;
+ }
+
+ {
+ // clean up old data from any previous tests
+ mongo::BSONObjBuilder query;
+ conn.remove( "test.people" , query.obj() );
+ }
+
+ insert( conn , "eliot" , 15 );
+ insert( conn , "sara" , 23 );
+
+ {
+ mongo::BSONObjBuilder query;
+ auto_ptr<mongo::DBClientCursor> cursor = conn.query( "test.people" , query.obj() );
+ cout << "using cursor" << endl;
+ while ( cursor->more() ) {
+ mongo::BSONObj obj = cursor->next();
+ cout << "\t" << obj.jsonString() << endl;
+ }
+
+ }
+
+ {
+ mongo::BSONObjBuilder query;
+ query.append( "name" , "eliot" );
+ mongo::BSONObj res = conn.findOne( "test.people" , query.obj() );
+ cout << res.isEmpty() << "\t" << res.jsonString() << endl;
+ }
+
+ {
+ mongo::BSONObjBuilder query;
+ query.append( "name" , "asd" );
+ mongo::BSONObj res = conn.findOne( "test.people" , query.obj() );
+ cout << res.isEmpty() << "\t" << res.jsonString() << endl;
+ }
+
+
+}
diff --git a/src/mongo/client/examples/httpClientTest.cpp b/src/mongo/client/examples/httpClientTest.cpp
new file mode 100644
index 00000000000..4055d4492d5
--- /dev/null
+++ b/src/mongo/client/examples/httpClientTest.cpp
@@ -0,0 +1,58 @@
+// httpClientTest.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+
+#include "client/dbclient.h"
+#include "util/net/httpclient.h"
+
+using namespace mongo;
+
+void play( string url ) {
+ cout << "[" << url << "]" << endl;
+
+ HttpClient c;
+ HttpClient::Result r;
+ MONGO_assert( c.get( url , &r ) == 200 );
+
+ HttpClient::Headers h = r.getHeaders();
+ MONGO_assert( h["Content-Type"].find( "text/html" ) == 0 );
+
+ cout << "\tHeaders" << endl;
+ for ( HttpClient::Headers::iterator i = h.begin() ; i != h.end(); ++i ) {
+ cout << "\t\t" << i->first << "\t" << i->second << endl;
+ }
+
+}
+
+int main( int argc, const char **argv ) {
+
+ int port = 27017;
+ if ( argc != 1 ) {
+ if ( argc != 3 )
+ throw -12;
+ port = atoi( argv[ 2 ] );
+ }
+ port += 1000;
+
+ play( str::stream() << "http://localhost:" << port << "/" );
+
+#ifdef MONGO_SSL
+ play( "https://www.10gen.com/" );
+#endif
+
+}
diff --git a/src/mongo/client/examples/insert_demo.cpp b/src/mongo/client/examples/insert_demo.cpp
new file mode 100644
index 00000000000..14ac79ee1a0
--- /dev/null
+++ b/src/mongo/client/examples/insert_demo.cpp
@@ -0,0 +1,47 @@
+/*
+ C++ client program which inserts documents in a MongoDB database.
+
+ How to build and run:
+
+ Using mongo_client_lib.cpp:
+ g++ -I .. -I ../.. insert_demo.cpp ../mongo_client_lib.cpp -lboost_thread-mt -lboost_filesystem
+ ./a.out
+*/
+
+#include <iostream>
+#include "dbclient.h" // the mongo c++ driver
+
+using namespace std;
+using namespace mongo;
+using namespace bson;
+
+int main() {
+ try {
+ cout << "connecting to localhost..." << endl;
+ DBClientConnection c;
+ c.connect("localhost");
+ cout << "connected ok" << endl;
+
+ bo o = BSON( "hello" << "world" );
+
+ cout << "inserting..." << endl;
+
+ time_t start = time(0);
+ for( unsigned i = 0; i < 1000000; i++ ) {
+ c.insert("test.foo", o);
+ }
+
+ // wait until all operations applied
+ cout << "getlasterror returns: \"" << c.getLastError() << '"' << endl;
+
+ time_t done = time(0);
+ time_t dt = done-start;
+ if( dt == 0 ) dt = 1; // guard against division by zero on very fast runs
+ cout << dt << " seconds " << 1000000/dt << " per second" << endl;
+ }
+ catch(DBException& e) {
+ cout << "caught DBException " << e.toString() << endl;
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/src/mongo/client/examples/mongoperf.cpp b/src/mongo/client/examples/mongoperf.cpp
new file mode 100644
index 00000000000..68ebd6b10f2
--- /dev/null
+++ b/src/mongo/client/examples/mongoperf.cpp
@@ -0,0 +1,269 @@
+/*
+ How to build and run:
+
+ scons mongoperf
+ ./mongoperf -h
+*/
+
+#define MONGO_EXPOSE_MACROS 1
+
+#include <iostream>
+#include "../dbclient.h" // the mongo c++ driver
+#include "../../util/mmap.h"
+#include <assert.h>
+#include "../../util/logfile.h"
+#include "../../util/timer.h"
+#include "../../util/time_support.h"
+#include "../../bson/util/atomic_int.h"
+
+using namespace std;
+using namespace mongo;
+using namespace bson;
+
+int dummy;
+LogFile *lf = 0;
+MemoryMappedFile *mmfFile;
+char *mmf = 0;
+bo options;
+unsigned long long len; // file len
+const unsigned PG = 4096;
+unsigned nThreadsRunning = 0;
+
+// this is incremented A LOT; at some point it becomes a bottleneck when very high (in-cache) ops/second are happening.
+AtomicUInt iops;
+
+SimpleMutex m("mperf");
+
+int syncDelaySecs = 0;
+
+void syncThread() {
+ while( 1 ) {
+ mongo::Timer t;
+ mmfFile->flush(true);
+ cout << " mmf sync took " << t.millis() << "ms" << endl;
+ sleepsecs(syncDelaySecs);
+ }
+}
+
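+// round x up to the next PG-byte (4096, page-size) boundary (editor's annotation)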
+char* round(char* x) {
+ size_t f = (size_t) x;
+ char *p = (char *) ((f+PG-1)/PG*PG);
+ return p;
+}
+
+struct Aligned {
+ char x[8192];
+ char* addr() { return round(x); }
+};
+
+unsigned long long rrand() {
+ // RAND_MAX is very small on windows
+ return (static_cast<unsigned long long>(rand()) << 15) ^ rand();
+}
+
+void workerThread() {
+ bool r = options["r"].trueValue();
+ bool w = options["w"].trueValue();
+ //cout << "read:" << r << " write:" << w << endl;
+ long long su = options["sleepMicros"].numberLong();
+ Aligned a;
+ while( 1 ) {
+ unsigned long long rofs = (rrand() * PG) % len;
+ unsigned long long wofs = (rrand() * PG) % len;
+ if( mmf ) {
+ if( r ) {
+ dummy += mmf[rofs];
+ iops++;
+ }
+ if( w ) {
+ mmf[wofs] = 3;
+ iops++;
+ }
+ }
+ else {
+ if( r ) {
+ lf->readAt(rofs, a.addr(), PG);
+ iops++;
+ }
+ if( w ) {
+ lf->writeAt(wofs, a.addr(), PG);
+ iops++;
+ }
+ }
+ long long micros = su / nThreadsRunning;
+ if( micros ) {
+ sleepmicros(micros);
+ }
+ }
+}
+
+void go() {
+ assert( options["r"].trueValue() || options["w"].trueValue() );
+ MemoryMappedFile f;
+ cout << "creating test file size:";
+ len = options["fileSizeMB"].numberLong();
+ if( len == 0 ) len = 1;
+ cout << len << "MB ..." << endl;
+
+ if( 0 && len > 2000 && !options["mmf"].trueValue() ) {
+ // todo make tests use 64 bit offsets in their i/o -- i.e. adjust LogFile::writeAt and such
+ cout << "\nsizes > 2GB not yet supported with mmf:false" << endl;
+ return;
+ }
+ len *= 1024 * 1024;
+ const char *fname = "./mongoperf__testfile__tmp";
+ try {
+ boost::filesystem::remove(fname);
+ }
+ catch(...) {
+ cout << "error deleting file " << fname << endl;
+ return;
+ }
+ lf = new LogFile(fname,true);
+ const unsigned sz = 1024 * 1024 * 32; // needs to be big as we are using synchronousAppend. if we used a regular MongoFile it wouldn't have to be
+ char *buf = (char*) malloc(sz+4096);
+ const char *p = round(buf);
+ for( unsigned long long i = 0; i < len; i += sz ) {
+ lf->synchronousAppend(p, sz);
+ if( i % (1024ULL*1024*1024) == 0 && i ) {
+ cout << i / (1024ULL*1024*1024) << "GB..." << endl;
+ }
+ }
+ BSONObj& o = options;
+
+ if( o["mmf"].trueValue() ) {
+ delete lf;
+ lf = 0;
+ mmfFile = new MemoryMappedFile();
+ mmf = (char *) mmfFile->map(fname);
+ assert( mmf );
+
+ syncDelaySecs = options["syncDelay"].numberInt();
+ if( syncDelaySecs ) {
+ boost::thread t(syncThread);
+ }
+ }
+
+ cout << "testing..."<< endl;
+
+ unsigned wthr = o["nThreads"].eoo() ? 1 : (unsigned) o["nThreads"].Int(); // default 1, per usage text
+ if( wthr < 1 ) {
+ cout << "bad threads field value" << endl;
+ return;
+ }
+ unsigned i = 0;
+ unsigned d = 1;
+ unsigned &nthr = nThreadsRunning;
+ while( 1 ) {
+ if( i++ % 8 == 0 ) {
+ if( nthr < wthr ) {
+ while( nthr < wthr && nthr < d ) {
+ nthr++;
+ boost::thread w(workerThread);
+ }
+ cout << "new thread, total running : " << nthr << endl;
+ d *= 2;
+ }
+ }
+ sleepsecs(1);
+ unsigned long long w = iops.get();
+ iops.zero();
+ w /= 1; // 1 secs
+ cout << w << " ops/sec ";
+ if( mmf == 0 )
+ // only writing 4 bytes with mmf so we don't say this
+ cout << (w * PG / 1024 / 1024) << " MB/sec";
+ cout << endl;
+ }
+}
+
+int main(int argc, char *argv[]) {
+
+ try {
+ cout << "mongoperf" << endl;
+
+ if( argc > 1 ) {
+cout <<
+
+"\n"
+"usage:\n"
+"\n"
+" mongoperf < myjsonconfigfile\n"
+"\n"
+" {\n"
+" nThreads:<n>, // number of threads (default 1)\n"
+" fileSizeMB:<n>, // test file size (default 1MB)\n"
+" sleepMicros:<n>, // pause for sleepMicros/nThreads between each operation (default 0)\n"
+" mmf:<bool>, // if true do i/o's via memory mapped files (default false)\n"
+" r:<bool>, // do reads (default false)\n"
+" w:<bool>, // do writes (default false)\n"
+" syncDelay:<n> // secs between fsyncs, like --syncdelay in mongod. (default 0/never)\n"
+" }\n"
+"\n"
+"mongoperf is a performance testing tool. the initial tests are of disk subsystem performance; \n"
+" tests of mongos and mongod will be added later.\n"
+"most fields are optional.\n"
+"non-mmf io is direct io (no caching). use a large file size to test making the heads\n"
+" move significantly and to avoid i/o coalescing\n"
+"mmf io uses caching (the file system cache).\n"
+"\n"
+
+<< endl;
+ return 0;
+ }
+
+ cout << "use -h for help" << endl;
+
+ char input[1024];
+ memset(input, 0, sizeof(input));
+ cin.read(input, 1000);
+ if( *input == 0 ) {
+ cout << "error no options found on stdin for mongoperf" << endl;
+ return 2;
+ }
+
+ string s = input;
+ str::stripTrailing(s, "\n\r\x1a");
+ try {
+ options = fromjson(s);
+ }
+ catch(...) {
+ cout << s << endl;
+ cout << "couldn't parse json options" << endl;
+ return -1;
+ }
+ cout << "options:\n" << options.toString() << endl;
+
+ go();
+#if 0
+ cout << "connecting to localhost..." << endl;
+ DBClientConnection c;
+ c.connect("localhost");
+ cout << "connected ok" << endl;
+ unsigned long long count = c.count("test.foo");
+ cout << "count of existing documents in collection test.foo : " << count << endl;
+
+ bo o = BSON( "hello" << "world" );
+ c.insert("test.foo", o);
+
+ string e = c.getLastError();
+ if( !e.empty() ) {
+ cout << "insert #1 failed: " << e << endl;
+ }
+
+ // make an index with a unique key constraint
+ c.ensureIndex("test.foo", BSON("hello"<<1), /*unique*/true);
+
+ c.insert("test.foo", o); // will cause a dup key error on "hello" field
+ cout << "we expect a dup key error here:" << endl;
+ cout << " " << c.getLastErrorDetailed().toString() << endl;
+#endif
+ }
+ catch(DBException& e) {
+ cout << "caught DBException " << e.toString() << endl;
+ return 1;
+ }
+
+ return 0;
+}
+
diff --git a/src/mongo/client/examples/mongoperf.vcxproj b/src/mongo/client/examples/mongoperf.vcxproj
new file mode 100755
index 00000000000..89168370733
--- /dev/null
+++ b/src/mongo/client/examples/mongoperf.vcxproj
@@ -0,0 +1,113 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{79D4E297-BFB7-4FF2-9B13-08A146582E46}</ProjectGuid>
+ <Keyword>Win32Proj</Keyword>
+ <RootNamespace>mongoperf</RootNamespace>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup>
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <UseDebugLibraries>true</UseDebugLibraries>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup>
+ <IncludePath>..\..;..\..\third_party\pcre-7.4;$(IncludePath)</IncludePath>
+ <LibraryPath>\boost\lib\vs2010_32;$(LibraryPath)</LibraryPath>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <LinkIncremental>true</LinkIncremental>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <LinkIncremental>false</LinkIncremental>
+ </PropertyGroup>
+ <ItemDefinitionGroup>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <AdditionalIncludeDirectories>c:\boost;\boost</AdditionalIncludeDirectories>
+ </ClCompile>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions> _CRT_SECURE_NO_WARNINGS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>c:\boost;\boost</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalDependencies>ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions> _CRT_SECURE_NO_WARNINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <AdditionalDependencies>ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClCompile Include="..\..\util\logfile.cpp" />
+ <ClCompile Include="..\..\util\mmap.cpp" />
+ <ClCompile Include="..\..\util\mmap_win.cpp" />
+ <ClCompile Include="..\mongo_client_lib.cpp" />
+ <ClCompile Include="mongoperf.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\..\bson\bson-inl.h" />
+ <ClInclude Include="..\..\bson\bson.h" />
+ <ClInclude Include="..\..\bson\bsonelement.h" />
+ <ClInclude Include="..\..\bson\bsonmisc.h" />
+ <ClInclude Include="..\..\bson\bsonobj.h" />
+ <ClInclude Include="..\..\bson\bsonobjbuilder.h" />
+ <ClInclude Include="..\..\bson\bsonobjiterator.h" />
+ <ClInclude Include="..\..\bson\bsontypes.h" />
+ <ClInclude Include="..\..\bson\bson_db.h" />
+ <ClInclude Include="..\..\bson\inline_decls.h" />
+ <ClInclude Include="..\..\bson\oid.h" />
+ <ClInclude Include="..\..\bson\ordering.h" />
+ <ClInclude Include="..\..\bson\stringdata.h" />
+ <ClInclude Include="..\..\util\logfile.h" />
+ <ClInclude Include="..\..\util\mmap.h" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project> \ No newline at end of file
diff --git a/src/mongo/client/examples/mongoperf.vcxproj.filters b/src/mongo/client/examples/mongoperf.vcxproj.filters
new file mode 100755
index 00000000000..ab12575af08
--- /dev/null
+++ b/src/mongo/client/examples/mongoperf.vcxproj.filters
@@ -0,0 +1,73 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <ClCompile Include="mongoperf.cpp" />
+ <ClCompile Include="..\mongo_client_lib.cpp">
+ <Filter>shared files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\mmap.cpp">
+ <Filter>shared files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\mmap_win.cpp">
+ <Filter>shared files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\logfile.cpp">
+ <Filter>shared files</Filter>
+ </ClCompile>
+ </ItemGroup>
+ <ItemGroup>
+ <Filter Include="shared files">
+ <UniqueIdentifier>{847e788b-8e8c-48de-829f-6876c9008440}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="includes">
+ <UniqueIdentifier>{d855a95e-71ad-4f54-ae1b-94e7aa894394}</UniqueIdentifier>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\..\bson\bson-inl.h">
+ <Filter>includes</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\bson\inline_decls.h">
+ <Filter>includes</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\bson\bson.h">
+ <Filter>includes</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\bson\bson_db.h">
+ <Filter>includes</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\bson\bsonelement.h">
+ <Filter>includes</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\bson\bsonmisc.h">
+ <Filter>includes</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\bson\bsonobj.h">
+ <Filter>includes</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\bson\bsonobjbuilder.h">
+ <Filter>includes</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\bson\bsonobjiterator.h">
+ <Filter>includes</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\bson\bsontypes.h">
+ <Filter>includes</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\util\logfile.h">
+ <Filter>includes</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\bson\stringdata.h">
+ <Filter>includes</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\bson\oid.h">
+ <Filter>includes</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\bson\ordering.h">
+ <Filter>includes</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\util\mmap.h">
+ <Filter>includes</Filter>
+ </ClInclude>
+ </ItemGroup>
+</Project> \ No newline at end of file
diff --git a/src/mongo/client/examples/rs.cpp b/src/mongo/client/examples/rs.cpp
new file mode 100644
index 00000000000..3307d87b56b
--- /dev/null
+++ b/src/mongo/client/examples/rs.cpp
@@ -0,0 +1,118 @@
+// rs.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * example of using replica sets from c++
+ */
+
+#include "client/dbclient.h"
+#include <iostream>
+#include <vector>
+
+using namespace mongo;
+using namespace std;
+
+void workerThread( string collName , bool print , DBClientReplicaSet * conn ) {
+
+ while ( true ) {
+ try {
+ conn->update( collName , BSONObj() , BSON( "$inc" << BSON( "x" << 1 ) ) , true );
+
+ BSONObj x = conn->findOne( collName , BSONObj() );
+
+ if ( print ) {
+ cout << x << endl;
+ }
+
+ BSONObj a = conn->slaveConn().findOne( collName , BSONObj() , 0 , QueryOption_SlaveOk );
+ BSONObj b = conn->findOne( collName , BSONObj() , 0 , QueryOption_SlaveOk );
+
+ if ( print ) {
+ cout << "\t A " << a << endl;
+ cout << "\t B " << b << endl;
+ }
+ }
+ catch ( std::exception& e ) {
+ cout << "ERROR: " << e.what() << endl;
+ }
+ sleepmillis( 10 );
+ }
+}
+
+int main( int argc , const char ** argv ) {
+
+ unsigned nThreads = 1;
+ bool print = false;
+ bool testTimeout = false;
+
+ for ( int i=1; i<argc; i++ ) {
+ if ( mongoutils::str::equals( "--threads" , argv[i] ) ) {
+ nThreads = atoi( argv[++i] );
+ }
+ else if ( mongoutils::str::equals( "--print" , argv[i] ) ) {
+ print = true;
+ }
+ // Run a special mode to demonstrate the DBClientReplicaSet so_timeout option.
+ else if ( mongoutils::str::equals( "--testTimeout" , argv[i] ) ) {
+ testTimeout = true;
+ }
+ else {
+ cerr << "unknown option: " << argv[i] << endl;
+ return 1;
+ }
+
+ }
+
+ string errmsg;
+ ConnectionString cs = ConnectionString::parse( "foo/127.0.0.1" , errmsg );
+ if ( ! cs.isValid() ) {
+ cout << "error parsing url: " << errmsg << endl;
+ return 1;
+ }
+
+ DBClientReplicaSet * conn = dynamic_cast<DBClientReplicaSet*>(cs.connect( errmsg, testTimeout ? 10 : 0 ));
+ if ( ! conn ) {
+ cout << "error connecting: " << errmsg << endl;
+ return 2;
+ }
+
+ string collName = "test.rs1";
+
+ conn->dropCollection( collName );
+
+ if ( testTimeout ) {
+ conn->insert( collName, BSONObj() );
+ try {
+ conn->count( collName, BSON( "$where" << "sleep(40000)" ) );
+ } catch( DBException& ) {
+ return 0;
+ }
+ cout << "expected socket exception" << endl;
+ return 1;
+ }
+
+ vector<boost::shared_ptr<boost::thread> > threads;
+ for ( unsigned i=0; i<nThreads; i++ ) {
+ string errmsg;
+ threads.push_back( boost::shared_ptr<boost::thread>( new boost::thread( boost::bind( workerThread , collName , print , (DBClientReplicaSet*)cs.connect(errmsg) ) ) ) );
+ }
+
+ for ( unsigned i=0; i<threads.size(); i++ ) {
+ threads[i]->join();
+ }
+
+}
diff --git a/src/mongo/client/examples/second.cpp b/src/mongo/client/examples/second.cpp
new file mode 100644
index 00000000000..6cc2111580f
--- /dev/null
+++ b/src/mongo/client/examples/second.cpp
@@ -0,0 +1,56 @@
+// second.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+
+#include "client/dbclient.h"
+
+using namespace std;
+using namespace mongo;
+
+int main( int argc, const char **argv ) {
+
+ const char *port = "27017";
+ if ( argc != 1 ) {
+ if ( argc != 3 )
+ throw -12;
+ port = argv[ 2 ];
+ }
+
+ DBClientConnection conn;
+ string errmsg;
+ if ( ! conn.connect( string( "127.0.0.1:" ) + port , errmsg ) ) {
+ cout << "couldn't connect : " << errmsg << endl;
+ throw -11;
+ }
+
+ const char * ns = "test.second";
+
+ conn.remove( ns , BSONObj() );
+
+ conn.insert( ns , BSON( "name" << "eliot" << "num" << 17 ) );
+ conn.insert( ns , BSON( "name" << "sara" << "num" << 24 ) );
+
+ auto_ptr<DBClientCursor> cursor = conn.query( ns , BSONObj() );
+ cout << "using cursor" << endl;
+ while ( cursor->more() ) {
+ BSONObj obj = cursor->next();
+ cout << "\t" << obj.jsonString() << endl;
+ }
+
+ conn.ensureIndex( ns , BSON( "name" << 1 << "num" << -1 ) );
+}
diff --git a/src/mongo/client/examples/simple_client_demo.vcxproj b/src/mongo/client/examples/simple_client_demo.vcxproj
new file mode 100755
index 00000000000..358513f307a
--- /dev/null
+++ b/src/mongo/client/examples/simple_client_demo.vcxproj
@@ -0,0 +1,107 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{89C30BC3-2874-4F2C-B4DA-EB04E9782236}</ProjectGuid>
+ <Keyword>Win32Proj</Keyword>
+ <RootNamespace>simple_client_demo</RootNamespace>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+
+ <PropertyGroup>
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <UseDebugLibraries>true</UseDebugLibraries>
+ </PropertyGroup>
+
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup>
+ <IncludePath>..\..;..\..\third_party\pcre-7.4;$(IncludePath)</IncludePath>
+ <LibraryPath>\boost\lib\vs2010_32;$(LibraryPath)</LibraryPath>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <LinkIncremental>true</LinkIncremental>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <LinkIncremental>false</LinkIncremental>
+ </PropertyGroup>
+
+ <ItemDefinitionGroup>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <AdditionalIncludeDirectories>c:\boost;\boost</AdditionalIncludeDirectories>
+ </ClCompile>
+ </ItemDefinitionGroup>
+
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions> _CRT_SECURE_NO_WARNINGS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>c:\boost;\boost</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalDependencies>ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions> _CRT_SECURE_NO_WARNINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <AdditionalDependencies>ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+
+ <ItemGroup>
+ <ClCompile Include="..\mongo_client_lib.cpp" />
+ <ClCompile Include="..\simple_client_demo.cpp" />
+ </ItemGroup>
+
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+
+</Project> \ No newline at end of file
diff --git a/src/mongo/client/examples/simple_client_demo.vcxproj.filters b/src/mongo/client/examples/simple_client_demo.vcxproj.filters
new file mode 100755
index 00000000000..8aa5a1a96c5
--- /dev/null
+++ b/src/mongo/client/examples/simple_client_demo.vcxproj.filters
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <Filter Include="Source Files">
+ <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+ <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="..\simple_client_demo.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\mongo_client_lib.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ </ItemGroup>
+</Project> \ No newline at end of file
diff --git a/src/mongo/client/examples/tail.cpp b/src/mongo/client/examples/tail.cpp
new file mode 100644
index 00000000000..90e62d279c1
--- /dev/null
+++ b/src/mongo/client/examples/tail.cpp
@@ -0,0 +1,46 @@
+// tail.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* example of using a tailable cursor */
+
+#include "../../client/dbclient.h"
+#include "../../util/goodies.h"
+
+using namespace mongo;
+
+void tail(DBClientBase& conn, const char *ns) {
+ BSONElement lastId = minKey.firstElement();
+ Query query = Query();
+
+ auto_ptr<DBClientCursor> c =
+ conn.query(ns, query, 0, 0, 0, QueryOption_CursorTailable);
+
+ while( 1 ) {
+ if( !c->more() ) {
+ if( c->isDead() ) {
+ break; // we need to requery
+ }
+
+ // all data (so far) exhausted, wait for more
+ sleepsecs(1);
+ continue;
+ }
+ BSONObj o = c->next();
+ lastId = o["_id"];
+ cout << o.toString() << endl;
+ }
+}
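+
+/* Sketch (not part of the original example) of how a caller might drive tail().
+   Assumes "test.capped" is an existing capped collection; a real tailer would
+   resume with QUERY( "_id" << GT << lastId ) instead of re-reading from the start.
+
+   int main() {
+       DBClientConnection conn;
+       conn.connect("localhost");
+       while( true )
+           tail(conn, "test.capped"); // returns when the cursor dies; then requery
+   }
+*/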
diff --git a/src/mongo/client/examples/tutorial.cpp b/src/mongo/client/examples/tutorial.cpp
new file mode 100644
index 00000000000..aa5ad02b55d
--- /dev/null
+++ b/src/mongo/client/examples/tutorial.cpp
@@ -0,0 +1,71 @@
+//tutorial.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include "../../client/dbclient.h"
+
+// g++ tutorial.cpp -lmongoclient -lboost_thread -lboost_filesystem -o tutorial
+// Might need a variant of the above compile line. This worked for me:
+// g++ tutorial.cpp -L[mongo directory] -L/opt/local/lib -lmongoclient -lboost_thread-mt -lboost_filesystem -lboost_system -I/opt/local/include -o tutorial
+
+using namespace mongo;
+
+void printIfAge(DBClientConnection& c, int age) {
+ auto_ptr<DBClientCursor> cursor = c.query("tutorial.persons", QUERY( "age" << age ).sort("name") );
+ while( cursor->more() ) {
+ BSONObj p = cursor->next();
+ cout << p.getStringField("name") << endl;
+ }
+}
+
+void run() {
+ DBClientConnection c;
+    c.connect("localhost"); // or e.g. "192.168.58.1"
+ cout << "connected ok" << endl;
+ BSONObj p = BSON( "name" << "Joe" << "age" << 33 );
+ c.insert("tutorial.persons", p);
+ p = BSON( "name" << "Jane" << "age" << 40 );
+ c.insert("tutorial.persons", p);
+ p = BSON( "name" << "Abe" << "age" << 33 );
+ c.insert("tutorial.persons", p);
+ p = BSON( "name" << "Methuselah" << "age" << BSONNULL);
+ c.insert("tutorial.persons", p);
+ p = BSON( "name" << "Samantha" << "age" << 21 << "city" << "Los Angeles" << "state" << "CA" );
+ c.insert("tutorial.persons", p);
+
+ c.ensureIndex("tutorial.persons", fromjson("{age:1}"));
+
+ cout << "count:" << c.count("tutorial.persons") << endl;
+
+ auto_ptr<DBClientCursor> cursor = c.query("tutorial.persons", BSONObj());
+ while( cursor->more() ) {
+ cout << cursor->next().toString() << endl;
+ }
+
+ cout << "\nprintifage:\n";
+ printIfAge(c, 33);
+}
+
+int main() {
+ try {
+ run();
+ }
+ catch( DBException &e ) {
+ cout << "caught " << e.what() << endl;
+ }
+ return 0;
+}
diff --git a/src/mongo/client/examples/whereExample.cpp b/src/mongo/client/examples/whereExample.cpp
new file mode 100644
index 00000000000..12b68d7add3
--- /dev/null
+++ b/src/mongo/client/examples/whereExample.cpp
@@ -0,0 +1,69 @@
+// @file whereExample.cpp
+// @see http://www.mongodb.org/display/DOCS/Server-side+Code+Execution
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+
+#include "client/dbclient.h"
+
+using namespace std;
+using namespace mongo;
+
+int main( int argc, const char **argv ) {
+
+ const char *port = "27017";
+ if ( argc != 1 ) {
+ if ( argc != 3 )
+ throw -12;
+ port = argv[ 2 ];
+ }
+
+ DBClientConnection conn;
+ string errmsg;
+ if ( ! conn.connect( string( "127.0.0.1:" ) + port , errmsg ) ) {
+ cout << "couldn't connect : " << errmsg << endl;
+ throw -11;
+ }
+
+ const char * ns = "test.where";
+
+ conn.remove( ns , BSONObj() );
+
+ conn.insert( ns , BSON( "name" << "eliot" << "num" << 17 ) );
+ conn.insert( ns , BSON( "name" << "sara" << "num" << 24 ) );
+
+ auto_ptr<DBClientCursor> cursor = conn.query( ns , BSONObj() );
+
+ while ( cursor->more() ) {
+ BSONObj obj = cursor->next();
+ cout << "\t" << obj.jsonString() << endl;
+ }
+
+ cout << "now using $where" << endl;
+
+ Query q = Query("{}").where("this.name == name" , BSON( "name" << "sara" ));
+
+ cursor = conn.query( ns , q );
+
+ int num = 0;
+ while ( cursor->more() ) {
+ BSONObj obj = cursor->next();
+ cout << "\t" << obj.jsonString() << endl;
+ num++;
+ }
+ MONGO_assert( num == 1 );
+}
diff --git a/src/mongo/client/gridfs.cpp b/src/mongo/client/gridfs.cpp
new file mode 100644
index 00000000000..449cb4067d2
--- /dev/null
+++ b/src/mongo/client/gridfs.cpp
@@ -0,0 +1,245 @@
+// gridfs.cpp
+
+/* Copyright 2009 10gen
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include <fcntl.h>
+#include <utility>
+
+#include "gridfs.h"
+#include <boost/smart_ptr.hpp>
+
+#if defined(_WIN32)
+#include <io.h>
+#endif
+
+#ifndef MIN
+#define MIN(a,b) ( (a) < (b) ? (a) : (b) )
+#endif
+
+namespace mongo {
+
+ const unsigned DEFAULT_CHUNK_SIZE = 256 * 1024;
+
+ GridFSChunk::GridFSChunk( BSONObj o ) {
+ _data = o;
+ }
+
+ GridFSChunk::GridFSChunk( BSONObj fileObject , int chunkNumber , const char * data , int len ) {
+ BSONObjBuilder b;
+ b.appendAs( fileObject["_id"] , "files_id" );
+ b.append( "n" , chunkNumber );
+ b.appendBinData( "data" , len, BinDataGeneral, data );
+ _data = b.obj();
+ }
+
+
+ GridFS::GridFS( DBClientBase& client , const string& dbName , const string& prefix ) : _client( client ) , _dbName( dbName ) , _prefix( prefix ) {
+ _filesNS = dbName + "." + prefix + ".files";
+ _chunksNS = dbName + "." + prefix + ".chunks";
+ _chunkSize = DEFAULT_CHUNK_SIZE;
+
+ client.ensureIndex( _filesNS , BSON( "filename" << 1 ) );
+ client.ensureIndex( _chunksNS , BSON( "files_id" << 1 << "n" << 1 ) );
+ }
+
+ GridFS::~GridFS() {
+
+ }
+
+ void GridFS::setChunkSize(unsigned int size) {
+        massert( 13296 , "invalid chunk size specified" , (size != 0) );
+ _chunkSize = size;
+ }
+
+ BSONObj GridFS::storeFile( const char* data , size_t length , const string& remoteName , const string& contentType) {
+ char const * const end = data + length;
+
+ OID id;
+ id.init();
+ BSONObj idObj = BSON("_id" << id);
+
+ int chunkNumber = 0;
+ while (data < end) {
+ int chunkLen = MIN(_chunkSize, (unsigned)(end-data));
+ GridFSChunk c(idObj, chunkNumber, data, chunkLen);
+ _client.insert( _chunksNS.c_str() , c._data );
+
+ chunkNumber++;
+ data += chunkLen;
+ }
+
+ return insertFile(remoteName, id, length, contentType);
+ }
+
+
+ BSONObj GridFS::storeFile( const string& fileName , const string& remoteName , const string& contentType) {
+ uassert( 10012 , "file doesn't exist" , fileName == "-" || boost::filesystem::exists( fileName ) );
+
+ FILE* fd;
+ if (fileName == "-")
+ fd = stdin;
+ else
+ fd = fopen( fileName.c_str() , "rb" );
+ uassert( 10013 , "error opening file", fd);
+
+ OID id;
+ id.init();
+ BSONObj idObj = BSON("_id" << id);
+
+ int chunkNumber = 0;
+ gridfs_offset length = 0;
+ while (!feof(fd)) {
+            char * buf = new char[_chunkSize+1];
+            char * bufPos = buf;
+ unsigned int chunkLen = 0; // how much in the chunk now
+ while(chunkLen != _chunkSize && !feof(fd)) {
+ int readLen = fread(bufPos, 1, _chunkSize - chunkLen, fd);
+ chunkLen += readLen;
+ bufPos += readLen;
+
+ assert(chunkLen <= _chunkSize);
+ }
+
+ GridFSChunk c(idObj, chunkNumber, buf, chunkLen);
+ _client.insert( _chunksNS.c_str() , c._data );
+
+ length += chunkLen;
+ chunkNumber++;
+ delete[] buf;
+ }
+
+ if (fd != stdin)
+ fclose( fd );
+
+ return insertFile((remoteName.empty() ? fileName : remoteName), id, length, contentType);
+ }
+
+ BSONObj GridFS::insertFile(const string& name, const OID& id, gridfs_offset length, const string& contentType) {
+
+ BSONObj res;
+ if ( ! _client.runCommand( _dbName.c_str() , BSON( "filemd5" << id << "root" << _prefix ) , res ) )
+ throw UserException( 9008 , "filemd5 failed" );
+
+ BSONObjBuilder file;
+ file << "_id" << id
+ << "filename" << name
+ << "chunkSize" << _chunkSize
+ << "uploadDate" << DATENOW
+ << "md5" << res["md5"]
+ ;
+
+ if (length < 1024*1024*1024) { // 2^30
+ file << "length" << (int) length;
+ }
+ else {
+ file << "length" << (long long) length;
+ }
+
+ if (!contentType.empty())
+ file << "contentType" << contentType;
+
+ BSONObj ret = file.obj();
+ _client.insert(_filesNS.c_str(), ret);
+
+ return ret;
+ }
+
+ void GridFS::removeFile( const string& fileName ) {
+ auto_ptr<DBClientCursor> files = _client.query( _filesNS , BSON( "filename" << fileName ) );
+ while (files->more()) {
+ BSONObj file = files->next();
+ BSONElement id = file["_id"];
+ _client.remove( _filesNS.c_str() , BSON( "_id" << id ) );
+ _client.remove( _chunksNS.c_str() , BSON( "files_id" << id ) );
+ }
+ }
+
+ GridFile::GridFile(const GridFS * grid , BSONObj obj ) {
+ _grid = grid;
+ _obj = obj;
+ }
+
+ GridFile GridFS::findFile( const string& fileName ) const {
+ return findFile( BSON( "filename" << fileName ) );
+    }
+
+ GridFile GridFS::findFile( BSONObj query ) const {
+ query = BSON("query" << query << "orderby" << BSON("uploadDate" << -1));
+ return GridFile( this , _client.findOne( _filesNS.c_str() , query ) );
+ }
+
+ auto_ptr<DBClientCursor> GridFS::list() const {
+ return _client.query( _filesNS.c_str() , BSONObj() );
+ }
+
+ auto_ptr<DBClientCursor> GridFS::list( BSONObj o ) const {
+ return _client.query( _filesNS.c_str() , o );
+ }
+
+ BSONObj GridFile::getMetadata() const {
+ BSONElement meta_element = _obj["metadata"];
+ if( meta_element.eoo() ) {
+ return BSONObj();
+ }
+
+ return meta_element.embeddedObject();
+ }
+
+ GridFSChunk GridFile::getChunk( int n ) const {
+ _exists();
+ BSONObjBuilder b;
+ b.appendAs( _obj["_id"] , "files_id" );
+ b.append( "n" , n );
+
+ BSONObj o = _grid->_client.findOne( _grid->_chunksNS.c_str() , b.obj() );
+ uassert( 10014 , "chunk is empty!" , ! o.isEmpty() );
+ return GridFSChunk(o);
+ }
+
+ gridfs_offset GridFile::write( ostream & out ) const {
+ _exists();
+
+ const int num = getNumChunks();
+
+ for ( int i=0; i<num; i++ ) {
+ GridFSChunk c = getChunk( i );
+
+ int len;
+ const char * data = c.data( len );
+ out.write( data , len );
+ }
+
+ return getContentLength();
+ }
+
+ gridfs_offset GridFile::write( const string& where ) const {
+ if (where == "-") {
+ return write( cout );
+ }
+ else {
+ ofstream out(where.c_str() , ios::out | ios::binary );
+ uassert(13325, "couldn't open file: " + where, out.is_open() );
+ return write( out );
+ }
+ }
+
+ void GridFile::_exists() const {
+        uassert( 10015 , "file doesn't exist" , exists() );
+ }
+
+}
diff --git a/src/mongo/client/gridfs.h b/src/mongo/client/gridfs.h
new file mode 100644
index 00000000000..5a19aa142ca
--- /dev/null
+++ b/src/mongo/client/gridfs.h
@@ -0,0 +1,205 @@
+/** @file gridfs.h */
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "dbclient.h"
+#include "redef_macros.h"
+
+namespace mongo {
+
+ typedef unsigned long long gridfs_offset;
+
+ class GridFS;
+ class GridFile;
+
+ class GridFSChunk {
+ public:
+ GridFSChunk( BSONObj data );
+ GridFSChunk( BSONObj fileId , int chunkNumber , const char * data , int len );
+
+ int len() const {
+ int len;
+ _data["data"].binDataClean( len );
+ return len;
+ }
+
+ const char * data( int & len ) const {
+ return _data["data"].binDataClean( len );
+ }
+
+ private:
+ BSONObj _data;
+ friend class GridFS;
+ };
+
+
+ /**
+ GridFS is for storing large file-style objects in MongoDB.
+ @see http://www.mongodb.org/display/DOCS/GridFS+Specification
+ */
+ class GridFS {
+ public:
+ /**
+ * @param client - db connection
+ * @param dbName - root database name
+ * @param prefix - if you want your data somewhere besides <dbname>.fs
+ */
+ GridFS( DBClientBase& client , const string& dbName , const string& prefix="fs" );
+ ~GridFS();
+
+ /**
+         * @param size chunk size in bytes; must be nonzero
+ */
+ void setChunkSize(unsigned int size);
+
+ /**
+         * puts the file referenced by fileName into the db
+ * @param fileName local filename relative to process
+ * @param remoteName optional filename to use for file stored in GridFS
+ * (default is to use fileName parameter)
+ * @param contentType optional MIME type for this object.
+ * (default is to omit)
+ * @return the file object
+ */
+ BSONObj storeFile( const string& fileName , const string& remoteName="" , const string& contentType="");
+
+ /**
+ * puts the file represented by data into the db
+ * @param data pointer to buffer to store in GridFS
+ * @param length length of buffer
+         * @param remoteName filename to use for the file stored in GridFS
+ * @param contentType optional MIME type for this object.
+ * (default is to omit)
+ * @return the file object
+ */
+ BSONObj storeFile( const char* data , size_t length , const string& remoteName , const string& contentType="");
+
+ /**
+ * removes file referenced by fileName from the db
+ * @param fileName filename (in GridFS) of the file to remove
+ */
+ void removeFile( const string& fileName );
+
+ /**
+ * returns a file object matching the query
+ */
+ GridFile findFile( BSONObj query ) const;
+
+ /**
+ * equiv to findFile( { filename : filename } )
+ */
+ GridFile findFile( const string& fileName ) const;
+
+ /**
+ * convenience method to get all the files
+ */
+ auto_ptr<DBClientCursor> list() const;
+
+ /**
+ * convenience method to get all the files with a filter
+ */
+ auto_ptr<DBClientCursor> list( BSONObj query ) const;
+
+ private:
+ DBClientBase& _client;
+ string _dbName;
+ string _prefix;
+ string _filesNS;
+ string _chunksNS;
+ unsigned int _chunkSize;
+
+ // insert fileobject. All chunks must be in DB.
+ BSONObj insertFile(const string& name, const OID& id, gridfs_offset length, const string& contentType);
+
+ friend class GridFile;
+ };
+
+ /**
+ wrapper for a file stored in the Mongo database
+ */
+ class GridFile {
+ public:
+ /**
+ * @return whether or not this file exists
+         * findFile always returns a GridFile, so you need to check this
+ */
+ bool exists() const {
+ return ! _obj.isEmpty();
+ }
+
+ string getFilename() const {
+ return _obj["filename"].str();
+ }
+
+ int getChunkSize() const {
+ return (int)(_obj["chunkSize"].number());
+ }
+
+ gridfs_offset getContentLength() const {
+ return (gridfs_offset)(_obj["length"].number());
+ }
+
+ string getContentType() const {
+ return _obj["contentType"].valuestr();
+ }
+
+ Date_t getUploadDate() const {
+ return _obj["uploadDate"].date();
+ }
+
+ string getMD5() const {
+ return _obj["md5"].str();
+ }
+
+ BSONElement getFileField( const string& name ) const {
+ return _obj[name];
+ }
+
+ BSONObj getMetadata() const;
+
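+        // e.g. a 600KB file stored with 256KB chunks reports ceil(600/256) = 3 chunks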
+ int getNumChunks() const {
+ return (int) ceil( (double)getContentLength() / (double)getChunkSize() );
+ }
+
+ GridFSChunk getChunk( int n ) const;
+
+ /**
+ write the file to the output stream
+ */
+ gridfs_offset write( ostream & out ) const;
+
+ /**
+ write the file to this filename
+ */
+ gridfs_offset write( const string& where ) const;
+
+ private:
+ GridFile(const GridFS * grid , BSONObj obj );
+
+ void _exists() const;
+
+ const GridFS * _grid;
+ BSONObj _obj;
+
+ friend class GridFS;
+ };
+}
+
+#include "undef_macros.h"
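+
+/* Usage sketch (illustrative, not part of the header):
+
+   DBClientConnection conn;
+   string errmsg;
+   if ( conn.connect( "localhost" , errmsg ) ) {
+       GridFS fs( conn , "test" );             // stores in test.fs.files / test.fs.chunks
+       fs.storeFile( "data.bin" );             // upload a local file
+       GridFile f = fs.findFile( "data.bin" );
+       if ( f.exists() )
+           f.write( "copy.bin" );              // stream every chunk back to disk
+   }
+*/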
diff --git a/src/mongo/client/model.cpp b/src/mongo/client/model.cpp
new file mode 100644
index 00000000000..bd10a3c5528
--- /dev/null
+++ b/src/mongo/client/model.cpp
@@ -0,0 +1,138 @@
+// model.cpp
+
+/* Copyright 2009 10gen
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "model.h"
+#include "connpool.h"
+
+namespace mongo {
+
+ bool Model::load(BSONObj& query) {
+ ScopedDbConnection conn( modelServer() );
+
+ BSONObj b = conn->findOne(getNS(), query);
+ conn.done();
+
+ if ( b.isEmpty() )
+ return false;
+
+ unserialize(b);
+ _id = b["_id"].wrap().getOwned();
+ return true;
+ }
+
+ void Model::remove( bool safe ) {
+ uassert( 10016 , "_id isn't set - needed for remove()" , _id["_id"].type() );
+
+ ScopedDbConnection conn( modelServer() );
+ conn->remove( getNS() , _id );
+
+ string errmsg = "";
+ if ( safe )
+ errmsg = conn->getLastError();
+
+ conn.done();
+
+ if ( safe && errmsg.size() )
+ throw UserException( 9002 , (string)"error on Model::remove: " + errmsg );
+ }
+
+ void Model::save( bool safe ) {
+ ScopedDbConnection conn( modelServer() );
+
+ BSONObjBuilder b;
+ serialize( b );
+
+ BSONElement myId;
+ {
+ BSONObjIterator i = b.iterator();
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( strcmp( e.fieldName() , "_id" ) == 0 ) {
+ myId = e;
+ break;
+ }
+ }
+ }
+
+ if ( myId.type() ) {
+ if ( _id.isEmpty() ) {
+ _id = myId.wrap();
+ }
+ else if ( myId.woCompare( _id.firstElement() ) ) {
+ stringstream ss;
+ ss << "_id from serialize and stored differ: ";
+ ss << '[' << myId << "] != ";
+ ss << '[' << _id.firstElement() << ']';
+ throw UserException( 13121 , ss.str() );
+ }
+ }
+
+ if ( _id.isEmpty() ) {
+ OID oid;
+ oid.init();
+ b.appendOID( "_id" , &oid );
+
+ BSONObj o = b.obj();
+ conn->insert( getNS() , o );
+ _id = o["_id"].wrap().getOwned();
+
+ log(4) << "inserted new model " << getNS() << " " << o << endl;
+ }
+ else {
+ if ( myId.eoo() ) {
+ myId = _id["_id"];
+ b.append( myId );
+ }
+
+ assert( ! myId.eoo() );
+
+ BSONObjBuilder qb;
+ qb.append( myId );
+
+ BSONObj q = qb.obj();
+ BSONObj o = b.obj();
+
+            log(4) << "updated model " << getNS() << " " << q << " " << o << endl;
+
+ conn->update( getNS() , q , o , true );
+
+ }
+
+ string errmsg = "";
+ if ( safe )
+ errmsg = conn->getLastError();
+
+ conn.done();
+
+ if ( safe && errmsg.size() )
+ throw UserException( 9003 , (string)"error on Model::save: " + errmsg );
+ }
+
+ BSONObj Model::toObject() {
+ BSONObjBuilder b;
+ serialize( b );
+ return b.obj();
+ }
+
+ void Model::append( const char * name , BSONObjBuilder& b ) {
+ BSONObjBuilder bb( b.subobjStart( name ) );
+ serialize( bb );
+ bb.done();
+ }
+
+} // namespace mongo
diff --git a/src/mongo/client/model.h b/src/mongo/client/model.h
new file mode 100644
index 00000000000..7dd31434f49
--- /dev/null
+++ b/src/mongo/client/model.h
@@ -0,0 +1,62 @@
+/** @file model.h */
+
+/* Copyright 2009 10gen
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "dbclient.h"
+#include "redef_macros.h"
+
+namespace mongo {
+
+ /** Model is a base class for defining objects which are serializable to the Mongo
+ database via the database driver.
+
+ Definition
+ Your serializable class should inherit from Model and implement the abstract methods
+ below.
+
+ Loading
+ To load, first construct an (empty) object. Then call load(). Do not load an object
+ more than once.
+ */
+ class Model {
+ public:
+ Model() { }
+ virtual ~Model() { }
+
+ virtual const char * getNS() = 0;
+ virtual void serialize(BSONObjBuilder& to) = 0;
+ virtual void unserialize(const BSONObj& from) = 0;
+ virtual BSONObj toObject();
+ virtual void append( const char * name , BSONObjBuilder& b );
+
+ virtual string modelServer() = 0;
+
+ /** Load a single object.
+ @return true if successful.
+ */
+ virtual bool load(BSONObj& query);
+ virtual void save( bool safe=false );
+ virtual void remove( bool safe=false );
+
+ protected:
+ BSONObj _id;
+ };
+
+} // namespace mongo
+
+#include "undef_macros.h"
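+
+/* Subclassing sketch (illustrative):
+
+   class Person : public Model {
+   public:
+       string name;
+       const char * getNS() { return "test.persons"; }
+       string modelServer() { return "localhost"; }
+       void serialize(BSONObjBuilder& to) { to.append( "name" , name ); }
+       void unserialize(const BSONObj& from) { name = from["name"].str(); }
+   };
+
+   Person p;
+   p.name = "joe";
+   p.save();  // first save() inserts and records _id; later calls update in place
+*/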
diff --git a/src/mongo/client/mongo_client_lib.cpp b/src/mongo/client/mongo_client_lib.cpp
new file mode 100644
index 00000000000..58e3f6c6c35
--- /dev/null
+++ b/src/mongo/client/mongo_client_lib.cpp
@@ -0,0 +1,82 @@
+/* @file mongo_client_lib.cpp
+
+ MongoDB C++ Driver
+
+ Normally one includes dbclient.h, and links against libmongoclient.a, when connecting to MongoDB
+ from C++. However, if you have a situation where the pre-built library does not work, you can use
+ this file instead to build all the necessary symbols. To do so, include mongo_client_lib.cpp in your
+ project.
+
+ GCC
+ ---
+   For example, to build simple_client_demo.cpp with GCC and run it:
+
+ g++ -I .. simple_client_demo.cpp mongo_client_lib.cpp -lboost_thread-mt -lboost_filesystem
+ ./a.out
+
+ Visual Studio (2010 tested)
+ ---------------------------
+ First, see client/examples/simple_client_demo.vcxproj.
+ - Be sure to include your boost include directory in your project as an Additional Include Directory.
+ - Define _CRT_SECURE_NO_WARNINGS to avoid warnings on use of strncpy and such by the MongoDB client code.
+ - Include the boost libraries directory.
+ - Linker.Input.Additional Dependencies - add ws2_32.lib for the Winsock library.
+*/
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined(_WIN32)
+// C4800 forcing value to bool 'true' or 'false' (performance warning)
+#pragma warning( disable : 4800 )
+#endif
+
+#include "../util/md5main.cpp"
+
+#define MONGO_EXPOSE_MACROS
+#include "../pch.h"
+
+#include "../util/assert_util.cpp"
+#include "../util/net/message.cpp"
+#include "../util/util.cpp"
+#include "../util/background.cpp"
+#include "../util/base64.cpp"
+#include "../util/net/sock.cpp"
+#include "../util/log.cpp"
+#include "../util/password.cpp"
+#include "../util/net/message_port.cpp"
+#include "../util/concurrency/thread_pool.cpp"
+#include "../util/concurrency/vars.cpp"
+#include "../util/concurrency/task.cpp"
+#include "../util/concurrency/spin_lock.cpp"
+#include "connpool.cpp"
+#include "syncclusterconnection.cpp"
+#include "dbclient.cpp"
+#include "clientOnly.cpp"
+#include "gridfs.cpp"
+#include "dbclientcursor.cpp"
+#include "../util/text.cpp"
+#include "dbclient_rs.cpp"
+#include "../bson/oid.cpp"
+#include "../db/lasterror.cpp"
+#include "../db/json.cpp"
+#include "../db/jsobj.cpp"
+#include "../db/nonce.cpp"
+#include "../pch.cpp"
+
+extern "C" {
+#include "../util/md5.c"
+}
+
diff --git a/src/mongo/client/parallel.cpp b/src/mongo/client/parallel.cpp
new file mode 100644
index 00000000000..5324de52c84
--- /dev/null
+++ b/src/mongo/client/parallel.cpp
@@ -0,0 +1,1515 @@
+// parallel.cpp
+/*
+ * Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "pch.h"
+#include "parallel.h"
+#include "connpool.h"
+#include "../db/queryutil.h"
+#include "../db/dbmessage.h"
+#include "../s/util.h"
+#include "../s/shard.h"
+#include "../s/chunk.h"
+#include "../s/config.h"
+#include "../s/grid.h"
+
+namespace mongo {
+
+ LabeledLevel pc( "pcursor", 2 );
+
+ // -------- ClusteredCursor -----------
+
+ ClusteredCursor::ClusteredCursor( const QuerySpec& q ) {
+ _ns = q.ns();
+ _query = q.filter().copy();
+ _options = q.options();
+ _fields = q.fields().copy();
+ _batchSize = q.ntoreturn();
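+        // an ntoreturn of exactly 1 tells the server to close the cursor after the
+        // first batch (findOne semantics), so bump it to 2 to keep iteration working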
+ if ( _batchSize == 1 )
+ _batchSize = 2;
+
+ _done = false;
+ _didInit = false;
+ }
+
+ ClusteredCursor::ClusteredCursor( QueryMessage& q ) {
+ _ns = q.ns;
+ _query = q.query.copy();
+ _options = q.queryOptions;
+ _fields = q.fields.copy();
+ _batchSize = q.ntoreturn;
+ if ( _batchSize == 1 )
+ _batchSize = 2;
+
+ _done = false;
+ _didInit = false;
+ }
+
+ ClusteredCursor::ClusteredCursor( const string& ns , const BSONObj& q , int options , const BSONObj& fields ) {
+ _ns = ns;
+ _query = q.getOwned();
+ _options = options;
+ _fields = fields.getOwned();
+ _batchSize = 0;
+
+ _done = false;
+ _didInit = false;
+ }
+
+ ClusteredCursor::~ClusteredCursor() {
+ _done = true; // just in case
+ }
+
+ void ClusteredCursor::init() {
+ if ( _didInit )
+ return;
+ _didInit = true;
+ _init();
+ }
+
+ void ClusteredCursor::_checkCursor( DBClientCursor * cursor ) {
+ assert( cursor );
+
+ if ( cursor->hasResultFlag( ResultFlag_ShardConfigStale ) ) {
+ throw RecvStaleConfigException( _ns , "ClusteredCursor::_checkCursor" );
+ }
+
+ if ( cursor->hasResultFlag( ResultFlag_ErrSet ) ) {
+ BSONObj o = cursor->next();
+ throw UserException( o["code"].numberInt() , o["$err"].String() );
+ }
+ }
+
+ auto_ptr<DBClientCursor> ClusteredCursor::query( const string& server , int num , BSONObj extra , int skipLeft , bool lazy ) {
+ uassert( 10017 , "cursor already done" , ! _done );
+ assert( _didInit );
+
+ BSONObj q = _query;
+ if ( ! extra.isEmpty() ) {
+ q = concatQuery( q , extra );
+ }
+
+ try {
+ ShardConnection conn( server , _ns );
+
+ if ( conn.setVersion() ) {
+ conn.done();
+ throw RecvStaleConfigException( _ns , "ClusteredCursor::query" , true );
+ }
+
+ LOG(5) << "ClusteredCursor::query (" << type() << ") server:" << server
+ << " ns:" << _ns << " query:" << q << " num:" << num
+ << " _fields:" << _fields << " options: " << _options << endl;
+
+ auto_ptr<DBClientCursor> cursor =
+ conn->query( _ns , q , num , 0 , ( _fields.isEmpty() ? 0 : &_fields ) , _options , _batchSize == 0 ? 0 : _batchSize + skipLeft );
+
+ if ( ! cursor.get() && _options & QueryOption_PartialResults ) {
+ _done = true;
+ conn.done();
+ return cursor;
+ }
+
+ massert( 13633 , str::stream() << "error querying server: " << server , cursor.get() );
+
+ cursor->attach( &conn ); // this calls done on conn
+ assert( ! conn.ok() );
+ _checkCursor( cursor.get() );
+ return cursor;
+ }
+ catch ( SocketException& e ) {
+ if ( ! ( _options & QueryOption_PartialResults ) )
+ throw e;
+ _done = true;
+ return auto_ptr<DBClientCursor>();
+ }
+ }
+
+ BSONObj ClusteredCursor::explain( const string& server , BSONObj extra ) {
+ BSONObj q = _query;
+ if ( ! extra.isEmpty() ) {
+ q = concatQuery( q , extra );
+ }
+
+ BSONObj o;
+
+ ShardConnection conn( server , _ns );
+ auto_ptr<DBClientCursor> cursor = conn->query( _ns , Query( q ).explain() , abs( _batchSize ) * -1 , 0 , _fields.isEmpty() ? 0 : &_fields );
+ if ( cursor.get() && cursor->more() )
+ o = cursor->next().getOwned();
+ conn.done();
+ return o;
+ }
+
+ BSONObj ClusteredCursor::concatQuery( const BSONObj& query , const BSONObj& extraFilter ) {
+ if ( ! query.hasField( "query" ) )
+ return _concatFilter( query , extraFilter );
+
+ BSONObjBuilder b;
+ BSONObjIterator i( query );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+
+ if ( strcmp( e.fieldName() , "query" ) ) {
+ b.append( e );
+ continue;
+ }
+
+ b.append( "query" , _concatFilter( e.embeddedObjectUserCheck() , extraFilter ) );
+ }
+ return b.obj();
+ }
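+    // illustrative: concatQuery( fromjson("{query:{a:1},orderby:{x:1}}") , fromjson("{b:2}") )
+    //               yields { query: { a: 1, b: 2 }, orderby: { x: 1 } }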
+
+ BSONObj ClusteredCursor::_concatFilter( const BSONObj& filter , const BSONObj& extra ) {
+ BSONObjBuilder b;
+ b.appendElements( filter );
+ b.appendElements( extra );
+ return b.obj();
+        // TODO: ideally, simplify the combined filter here where possible
+ }
+
+ void ClusteredCursor::explain(BSONObjBuilder& b) {
+ // Note: by default we filter out allPlans and oldPlan in the shell's
+ // explain() function. If you add any recursive structures, make sure to
+ // edit the JS to make sure everything gets filtered.
+
+ b.append( "clusteredType" , type() );
+
+ long long millis = 0;
+ double numExplains = 0;
+
+ map<string,long long> counters;
+
+ map<string,list<BSONObj> > out;
+ {
+ _explain( out );
+
+ BSONObjBuilder x( b.subobjStart( "shards" ) );
+ for ( map<string,list<BSONObj> >::iterator i=out.begin(); i!=out.end(); ++i ) {
+ string shard = i->first;
+ list<BSONObj> l = i->second;
+ BSONArrayBuilder y( x.subarrayStart( shard ) );
+ for ( list<BSONObj>::iterator j=l.begin(); j!=l.end(); ++j ) {
+ BSONObj temp = *j;
+ y.append( temp );
+
+ BSONObjIterator k( temp );
+ while ( k.more() ) {
+ BSONElement z = k.next();
+ if ( z.fieldName()[0] != 'n' )
+ continue;
+ long long& c = counters[z.fieldName()];
+ c += z.numberLong();
+ }
+
+ millis += temp["millis"].numberLong();
+ numExplains++;
+ }
+ y.done();
+ }
+ x.done();
+ }
+
+ for ( map<string,long long>::iterator i=counters.begin(); i!=counters.end(); ++i )
+ b.appendNumber( i->first , i->second );
+
+ b.appendNumber( "millisShardTotal" , millis );
+ b.append( "millisShardAvg" , (int)((double)millis / numExplains ) );
+ b.append( "numQueries" , (int)numExplains );
+ b.append( "numShards" , (int)out.size() );
+ }
+
+ // -------- FilteringClientCursor -----------
+ FilteringClientCursor::FilteringClientCursor( const BSONObj filter )
+ : _matcher( filter ) , _done( true ) {
+ }
+
+ FilteringClientCursor::FilteringClientCursor( auto_ptr<DBClientCursor> cursor , const BSONObj filter )
+ : _matcher( filter ) , _cursor( cursor ) , _done( cursor.get() == 0 ) {
+ }
+
+ FilteringClientCursor::FilteringClientCursor( DBClientCursor* cursor , const BSONObj filter )
+ : _matcher( filter ) , _cursor( cursor ) , _done( cursor == 0 ) {
+ }
+
+
+ FilteringClientCursor::~FilteringClientCursor() {
+ }
+
+ void FilteringClientCursor::reset( auto_ptr<DBClientCursor> cursor ) {
+ _cursor = cursor;
+ _next = BSONObj();
+ _done = _cursor.get() == 0;
+ }
+
+ void FilteringClientCursor::reset( DBClientCursor* cursor ) {
+ _cursor.reset( cursor );
+ _next = BSONObj();
+ _done = cursor == 0;
+ }
+
+
+ bool FilteringClientCursor::more() {
+ if ( ! _next.isEmpty() )
+ return true;
+
+ if ( _done )
+ return false;
+
+ _advance();
+ return ! _next.isEmpty();
+ }
+
+ BSONObj FilteringClientCursor::next() {
+ assert( ! _next.isEmpty() );
+ assert( ! _done );
+
+ BSONObj ret = _next;
+ _next = BSONObj();
+ _advance();
+ return ret;
+ }
+
+ BSONObj FilteringClientCursor::peek() {
+ if ( _next.isEmpty() )
+ _advance();
+ return _next;
+ }
+
+ void FilteringClientCursor::_advance() {
+ assert( _next.isEmpty() );
+ if ( ! _cursor.get() || _done )
+ return;
+
+ while ( _cursor->more() ) {
+ _next = _cursor->next();
+ if ( _matcher.matches( _next ) ) {
+ if ( ! _cursor->moreInCurrentBatch() )
+ _next = _next.getOwned();
+ return;
+ }
+ _next = BSONObj();
+ }
+ _done = true;
+ }
+
+ // -------- SerialServerClusteredCursor -----------
+
+ SerialServerClusteredCursor::SerialServerClusteredCursor( const set<ServerAndQuery>& servers , QueryMessage& q , int sortOrder) : ClusteredCursor( q ) {
+ for ( set<ServerAndQuery>::const_iterator i = servers.begin(); i!=servers.end(); i++ )
+ _servers.push_back( *i );
+
+ if ( sortOrder > 0 )
+ sort( _servers.begin() , _servers.end() );
+ else if ( sortOrder < 0 )
+ sort( _servers.rbegin() , _servers.rend() );
+
+ _serverIndex = 0;
+
+ _needToSkip = q.ntoskip;
+ }
+
+ bool SerialServerClusteredCursor::more() {
+
+ // TODO: optimize this by sending on first query and then back counting
+ // tricky in case where 1st server doesn't have any after
+ // need it to send n skipped
+ while ( _needToSkip > 0 && _current.more() ) {
+ _current.next();
+ _needToSkip--;
+ }
+
+ if ( _current.more() )
+ return true;
+
+ if ( _serverIndex >= _servers.size() ) {
+ return false;
+ }
+
+ ServerAndQuery& sq = _servers[_serverIndex++];
+
+ _current.reset( query( sq._server , 0 , sq._extra ) );
+ return more();
+ }
+
+ BSONObj SerialServerClusteredCursor::next() {
+ uassert( 10018 , "no more items" , more() );
+ return _current.next();
+ }
+
+ void SerialServerClusteredCursor::_explain( map< string,list<BSONObj> >& out ) {
+ for ( unsigned i=0; i<_servers.size(); i++ ) {
+ ServerAndQuery& sq = _servers[i];
+ list<BSONObj> & l = out[sq._server];
+ l.push_back( explain( sq._server , sq._extra ) );
+ }
+ }
+
+ // -------- ParallelSortClusteredCursor -----------
+
+ ParallelSortClusteredCursor::ParallelSortClusteredCursor( const set<ServerAndQuery>& servers , QueryMessage& q ,
+ const BSONObj& sortKey )
+ : ClusteredCursor( q ) , _servers( servers ) {
+ _sortKey = sortKey.getOwned();
+ _needToSkip = q.ntoskip;
+ _finishCons();
+ }
+
+ ParallelSortClusteredCursor::ParallelSortClusteredCursor( const set<ServerAndQuery>& servers , const string& ns ,
+ const Query& q ,
+ int options , const BSONObj& fields )
+ : ClusteredCursor( ns , q.obj , options , fields ) , _servers( servers ) {
+ _sortKey = q.getSort().copy();
+ _needToSkip = 0;
+ _finishCons();
+ }
+
+ ParallelSortClusteredCursor::ParallelSortClusteredCursor( const QuerySpec& qSpec, const CommandInfo& cInfo )
+ : ClusteredCursor( qSpec ),
+ _qSpec( qSpec ), _cInfo( cInfo ), _totalTries( 0 )
+ {
+ _finishCons();
+ }
+
+ ParallelSortClusteredCursor::ParallelSortClusteredCursor( const set<Shard>& qShards, const QuerySpec& qSpec )
+ : ClusteredCursor( qSpec ),
+ _qSpec( qSpec ), _totalTries( 0 )
+ {
+ for( set<Shard>::const_iterator i = qShards.begin(), end = qShards.end(); i != end; ++i )
+ _qShards.insert( *i );
+
+ _finishCons();
+ }
+
+ void ParallelSortClusteredCursor::_finishCons() {
+ _numServers = _servers.size();
+ _cursors = 0;
+
+ if( ! _qSpec.isEmpty() ){
+
+ _needToSkip = _qSpec.ntoskip();
+ _cursors = 0;
+ _sortKey = _qSpec.sort();
+ _fields = _qSpec.fields();
+
+ if( ! isVersioned() ) assert( _cInfo.isEmpty() );
+ }
+
+ if ( ! _sortKey.isEmpty() && ! _fields.isEmpty() ) {
+ // we need to make sure the sort key is in the projection
+
+ set<string> sortKeyFields;
+ _sortKey.getFieldNames(sortKeyFields);
+
+ BSONObjBuilder b;
+ bool isNegative = false;
+ {
+ BSONObjIterator i( _fields );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ b.append( e );
+
+ string fieldName = e.fieldName();
+
+ // exact field
+ bool found = sortKeyFields.erase(fieldName);
+
+ // subfields
+ set<string>::const_iterator begin = sortKeyFields.lower_bound(fieldName + ".\x00");
+ set<string>::const_iterator end = sortKeyFields.lower_bound(fieldName + ".\xFF");
+ sortKeyFields.erase(begin, end);
+
+ if ( ! e.trueValue() ) {
+                        uassert( 13431 , "cannot exclude a field that is part of the sort key from the projection" , !found && begin == end );
+ }
+ else if (!e.isABSONObj()) {
+ isNegative = true;
+ }
+ }
+ }
+
+ if (isNegative) {
+ for (set<string>::const_iterator it(sortKeyFields.begin()), end(sortKeyFields.end()); it != end; ++it) {
+ b.append(*it, 1);
+ }
+ }
+
+ _fields = b.obj();
+ }
+
+ if( ! _qSpec.isEmpty() ){
+ _qSpec._fields = _fields;
+ }
+ }
+
+ void ParallelConnectionMetadata::cleanup( bool full ){
+
+ if( full || errored ) retryNext = false;
+
+ if( ! retryNext && pcState ){
+
+ if( errored && pcState->conn ){
+ // Don't return this conn to the pool if it's bad
+ pcState->conn->kill();
+ pcState->conn.reset();
+ }
+ else if( initialized ){
+
+ assert( pcState->cursor );
+ assert( pcState->conn );
+
+ if( ! finished && pcState->conn->ok() ){
+ try{
+ // Complete the call if only halfway done
+ bool retry = false;
+ pcState->cursor->initLazyFinish( retry );
+ }
+ catch( std::exception& ){
+ warning() << "exception closing cursor" << endl;
+ }
+ catch( ... ){
+ warning() << "unknown exception closing cursor" << endl;
+ }
+ }
+ }
+
+ // Double-check conn is closed
+ if( pcState->conn ){
+ pcState->conn->done();
+ }
+
+ pcState.reset();
+ }
+ else assert( finished || ! initialized );
+
+ initialized = false;
+ finished = false;
+ completed = false;
+ errored = false;
+ }
+
+
+
+ BSONObj ParallelConnectionState::toBSON() const {
+
+ BSONObj stateObj =
+ BSON( "conn" << ( conn ? ( conn->ok() ? conn->conn().toString() : "(done)" ) : "" ) <<
+ "vinfo" << ( manager ? ( str::stream() << manager->getns() << " @ " << manager->getVersion().toString() ) :
+ primary->toString() ) );
+
+ // Append cursor data if exists
+ BSONObjBuilder stateB;
+ stateB.appendElements( stateObj );
+ if( ! cursor ) stateB.append( "cursor", "(none)" );
+ else {
+ vector<BSONObj> v;
+ cursor->peek( v, 1 );
+ if( v.size() == 0 ) stateB.append( "cursor", "(empty)" );
+ else stateB.append( "cursor", v[0] );
+ }
+ return stateB.obj().getOwned();
+ }
+
+ BSONObj ParallelConnectionMetadata::toBSON() const {
+ return BSON( "state" << ( pcState ? pcState->toBSON() : BSONObj() ) <<
+ "retryNext" << retryNext <<
+ "init" << initialized <<
+ "finish" << finished <<
+ "errored" << errored );
+ }
+
+ BSONObj ParallelSortClusteredCursor::toBSON() const {
+
+ BSONObjBuilder b;
+
+ b.append( "tries", _totalTries );
+
+ {
+ BSONObjBuilder bb;
+ for( map< Shard, PCMData >::const_iterator i = _cursorMap.begin(), end = _cursorMap.end(); i != end; ++i ){
+ bb.append( i->first.toString(), i->second.toBSON() );
+ }
+ b.append( "cursors", bb.obj().getOwned() );
+ }
+
+ {
+ BSONObjBuilder bb;
+ for( map< string, int >::const_iterator i = _staleNSMap.begin(), end = _staleNSMap.end(); i != end; ++i ){
+ bb.append( i->first, i->second );
+ }
+ b.append( "staleTries", bb.obj().getOwned() );
+ }
+
+ return b.obj().getOwned();
+ }
+
+ string ParallelSortClusteredCursor::toString() const {
+ return str::stream() << "PCursor : " << toBSON();
+ }
+
+ void ParallelSortClusteredCursor::fullInit(){
+ startInit();
+ finishInit();
+ }
+
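+    // Escalating retry policy for a namespace repeatedly reported stale: the 2nd
+    // occurrence forces a chunk manager reload, the 3rd also fully reloads the
+    // DBConfig, and the 5th gives up and rethrows.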
+ void ParallelSortClusteredCursor::_markStaleNS( const NamespaceString& staleNS, bool& forceReload, bool& fullReload ){
+ if( _staleNSMap.find( staleNS ) == _staleNSMap.end() ){
+ forceReload = false;
+ fullReload = false;
+ _staleNSMap[ staleNS ] = 1;
+ }
+ else{
+ int tries = ++_staleNSMap[ staleNS ];
+
+ if( tries >= 5 ) throw SendStaleConfigException( staleNS, str::stream() << "too many retries of stale version info" );
+
+ forceReload = tries > 1;
+ fullReload = tries > 2;
+ }
+ }
+
+ void ParallelSortClusteredCursor::_handleStaleNS( const NamespaceString& staleNS, bool forceReload, bool fullReload ){
+
+ DBConfigPtr config = grid.getDBConfig( staleNS.db );
+
+ // Reload db if needed, make sure it works
+ if( config && fullReload && ! config->reload() ){
+ // We didn't find the db after the reload, the db may have been dropped,
+ // reset this ptr
+ config.reset();
+ }
+
+ if( ! config ){
+ warning() << "cannot reload database info for stale namespace " << staleNS << endl;
+ }
+ else {
+ // Reload chunk manager, potentially forcing the namespace
+ config->getChunkManagerIfExists( staleNS, true, forceReload );
+ }
+
+ }
+
+ void ParallelSortClusteredCursor::startInit() {
+
+ bool returnPartial = ( _qSpec.options() & QueryOption_PartialResults );
+ bool specialVersion = _cInfo.versionedNS.size() > 0;
+ bool specialFilter = ! _cInfo.cmdFilter.isEmpty();
+ NamespaceString ns = specialVersion ? _cInfo.versionedNS : _qSpec.ns();
+
+ ChunkManagerPtr manager;
+ ShardPtr primary;
+
+ log( pc ) << "creating pcursor over " << _qSpec << " and " << _cInfo << endl;
+
+ set<Shard> todoStorage;
+ set<Shard>& todo = todoStorage;
+ string vinfo;
+
+ if( isVersioned() ){
+
+ DBConfigPtr config = grid.getDBConfig( ns.db ); // Gets or loads the config
+ uassert( 15989, "database not found for parallel cursor request", config );
+
+ // Try to get either the chunk manager or the primary shard
+ int cmRetries = 0;
+            // Test config->isSharded() first so getChunkManager doesn't throw in the
+            // common case. This loop retries fetching either the chunk manager or the
+            // primary shard; one or the other *should* eventually exist.
+            // TODO: Verify whether the loop is needed, i.e. whether we are protected by const fields or mutexes
+ while( ! ( config->isSharded( ns ) && ( manager = config->getChunkManagerIfExists( ns ) ).get() ) &&
+ ! ( primary = config->getShardIfExists( ns ) ) &&
+ cmRetries++ < 5 ) sleepmillis( 100 ); // TODO: Do we need to loop here?
+
+ uassert( 15919, "too many retries for chunk manager or primary", cmRetries < 5 );
+ assert( manager || primary );
+ assert( ! manager || ! primary );
+
+ if( manager ) vinfo = ( str::stream() << "[" << manager->getns() << " @ " << manager->getVersion().toString() << "]" );
+ else vinfo = (str::stream() << "[unsharded @ " << primary->toString() << "]" );
+
+ if( manager ) manager->getShardsForQuery( todo, specialFilter ? _cInfo.cmdFilter : _qSpec.filter() );
+ else if( primary ) todo.insert( *primary );
+
+ // Close all cursors on extra shards first, as these will be invalid
+ for( map< Shard, PCMData >::iterator i = _cursorMap.begin(), end = _cursorMap.end(); i != end; ++i ){
+
+ log( pc ) << "closing cursor on shard " << i->first << " as the connection is no longer required by " << vinfo << endl;
+
+ // Force total cleanup of these connections
+ if( todo.find( i->first ) == todo.end() ) i->second.cleanup();
+ }
+ }
+ else{
+
+ // Don't use version to get shards here
+ todo = _qShards;
+ vinfo = str::stream() << "[" << _qShards.size() << " shards specified]";
+
+ }
+
+ assert( todo.size() );
+
+ log( pc ) << "initializing over " << todo.size() << " shards required by " << vinfo << endl;
+
+        // Cap the total number of retries, whatever the cause
+ _totalTries++;
+ uassert( 15986, "too many retries in total", _totalTries < 10 );
+
+ for( set<Shard>::iterator i = todo.begin(), end = todo.end(); i != end; ++i ){
+
+ const Shard& shard = *i;
+ PCMData& mdata = _cursorMap[ shard ];
+
+ log( pc ) << "initializing on shard " << shard << ", current connection state is " << mdata.toBSON() << endl;
+
+ // This may be the first time connecting to this shard, if so we can get an error here
+ try {
+
+ if( mdata.initialized ){
+
+ assert( mdata.pcState );
+
+ PCStatePtr state = mdata.pcState;
+
+ bool compatiblePrimary = true;
+ bool compatibleManager = true;
+
+ // Only check for compatibility if we aren't forcing the shard choices
+ if( isVersioned() ){
+
+ if( primary && ! state->primary )
+ warning() << "Collection becoming unsharded detected" << endl;
+ if( manager && ! state->manager )
+ warning() << "Collection becoming sharded detected" << endl;
+ if( primary && state->primary && primary != state->primary )
+ warning() << "Weird shift of primary detected" << endl;
+
+ compatiblePrimary = primary && state->primary && primary == state->primary;
+ compatibleManager = manager && state->manager && manager->compatibleWith( state->manager, shard );
+
+ }
+
+ if( compatiblePrimary || compatibleManager ){
+ // If we're compatible, don't need to retry unless forced
+ if( ! mdata.retryNext ) continue;
+ // Do partial cleanup
+ mdata.cleanup( false );
+ }
+ else {
+ // Force total cleanup of connection if no longer compatible
+ mdata.cleanup();
+ }
+ }
+ else {
+ // Cleanup connection if we're not yet initialized
+ mdata.cleanup( false );
+ }
+
+ mdata.pcState.reset( new PCState() );
+ PCStatePtr state = mdata.pcState;
+
+ // Setup manager / primary
+ if( manager ) state->manager = manager;
+ else if( primary ) state->primary = primary;
+
+ assert( ! primary || shard == *primary || ! isVersioned() );
+
+ // Setup conn
+ if( ! state->conn ) state->conn.reset( new ShardConnection( shard, ns, manager ) );
+
+ if( state->conn->setVersion() ){
+ // It's actually okay if we set the version here, since either the manager will be verified as
+ // compatible, or if the manager doesn't exist, we don't care about version consistency
+ log( pc ) << "needed to set remote version on connection to value compatible with " << vinfo << endl;
+ }
+
+ // Setup cursor
+ if( ! state->cursor ){
+ state->cursor.reset( new DBClientCursor( state->conn->get(), _qSpec.ns(), _qSpec.query(),
+ isCommand() ? 1 : 0, // nToReturn (0 if query indicates multi)
+ 0, // nToSkip
+ // Does this need to be a ptr?
+ _qSpec.fields().isEmpty() ? 0 : &_qSpec._fields, // fieldsToReturn
+ _qSpec.options(), // options
+ _qSpec.ntoreturn() == 0 ? 0 : _qSpec.ntoreturn() + _qSpec.ntoskip() ) ); // batchSize
+ }
+
+ bool lazyInit = state->conn->get()->lazySupported();
+ if( lazyInit ){
+
+ // Need to keep track if this is a second or third try for replica sets
+ state->cursor->initLazy( mdata.retryNext );
+ mdata.retryNext = false;
+ mdata.initialized = true;
+ }
+ else{
+
+ // Without full initialization, throw an exception
+ uassert( 15987, str::stream() << "could not fully initialize cursor on shard " << shard.toString() << ", current connection state is " << mdata.toBSON().toString(), state->cursor->init() );
+ mdata.retryNext = false;
+ mdata.initialized = true;
+ mdata.finished = true;
+ }
+
+
+ log( pc ) << "initialized " << ( isCommand() ? "command " : "query " ) << ( lazyInit ? "(lazily) " : "(full) " ) << "on shard " << shard << ", current connection state is " << mdata.toBSON() << endl;
+
+ }
+ catch( SendStaleConfigException& e ){
+
+ // Our version isn't compatible with the current version anymore on at least one shard, need to retry immediately
+ NamespaceString staleNS = e.getns();
+
+ // Probably need to retry fully
+ bool forceReload, fullReload;
+ _markStaleNS( staleNS, forceReload, fullReload );
+
+ int logLevel = fullReload ? 0 : 1;
+ log( pc + logLevel ) << "stale config of ns " << staleNS << " during initialization, will retry with forced : " << forceReload << ", full : " << fullReload << endl;
+
+ // This is somewhat strange
+ if( staleNS != ns )
+ warning() << "versioned ns " << ns << " doesn't match stale config namespace " << staleNS << endl;
+
+ _handleStaleNS( staleNS, forceReload, fullReload );
+
+ // Restart with new chunk manager
+ startInit();
+ return;
+ }
+ catch( SocketException& e ){
+ warning() << "socket exception when initializing on " << shard << ", current connection state is " << mdata.toBSON() << causedBy( e ) << endl;
+ mdata.errored = true;
+ if( returnPartial ){
+ mdata.cleanup();
+ continue;
+ }
+ throw;
+ }
+ catch( DBException& e ){
+ warning() << "db exception when initializing on " << shard << ", current connection state is " << mdata.toBSON() << causedBy( e ) << endl;
+ mdata.errored = true;
+ if( returnPartial && e.getCode() == 15925 /* From above! */ ){
+ mdata.cleanup();
+ continue;
+ }
+ throw;
+ }
+ catch( std::exception& e){
+ warning() << "exception when initializing on " << shard << ", current connection state is " << mdata.toBSON() << causedBy( e ) << endl;
+ mdata.errored = true;
+ throw;
+ }
+ catch( ... ){
+ warning() << "unknown exception when initializing on " << shard << ", current connection state is " << mdata.toBSON() << endl;
+ mdata.errored = true;
+ throw;
+ }
+ }
+
+ // Sanity check final init'ed connections
+ for( map< Shard, PCMData >::iterator i = _cursorMap.begin(), end = _cursorMap.end(); i != end; ++i ){
+
+ const Shard& shard = i->first;
+ PCMData& mdata = i->second;
+
+ if( ! mdata.pcState ) continue;
+
+ // Make sure all state is in shards
+ assert( todo.find( shard ) != todo.end() );
+            assert( mdata.initialized );
+ if( ! mdata.completed ) assert( mdata.pcState->conn->ok() );
+ assert( mdata.pcState->cursor );
+ if( isVersioned() ) assert( mdata.pcState->primary || mdata.pcState->manager );
+ else assert( ! mdata.pcState->primary || ! mdata.pcState->manager );
+ assert( ! mdata.retryNext );
+
+ if( mdata.completed ) assert( mdata.finished );
+ if( mdata.finished ) assert( mdata.initialized );
+ if( ! returnPartial ) assert( mdata.initialized );
+ }
+
+ }
+
+
+ void ParallelSortClusteredCursor::finishInit(){
+
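+        // Completes the lazy initialization started in startInit() for every shard
+        // cursor; stale-config errors are collected and handled once for all stale
+        // namespaces, then startInit()/finishInit() are retried from the top.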
+ bool returnPartial = ( _qSpec.options() & QueryOption_PartialResults );
+ bool specialVersion = _cInfo.versionedNS.size() > 0;
+ string ns = specialVersion ? _cInfo.versionedNS : _qSpec.ns();
+
+ bool retry = false;
+ set< string > staleNSes;
+
+ log( pc ) << "finishing over " << _cursorMap.size() << " shards" << endl;
+
+ for( map< Shard, PCMData >::iterator i = _cursorMap.begin(), end = _cursorMap.end(); i != end; ++i ){
+
+ const Shard& shard = i->first;
+ PCMData& mdata = i->second;
+
+ log( pc ) << "finishing on shard " << shard << ", current connection state is " << mdata.toBSON() << endl;
+
+ // Ignore empty conns for now
+ if( ! mdata.pcState ) continue;
+
+ PCStatePtr state = mdata.pcState;
+
+ try {
+
+ // Sanity checks
+ if( ! mdata.completed ) assert( state->conn && state->conn->ok() );
+ assert( state->cursor );
+ if( isVersioned() ){
+ assert( state->manager || state->primary );
+ assert( ! state->manager || ! state->primary );
+ }
+ else assert( ! state->manager && ! state->primary );
+
+
+ // If we weren't init'ing lazily, ignore this
+ if( ! mdata.finished ){
+
+ mdata.finished = true;
+
+ // Mark the cursor as non-retry by default
+ mdata.retryNext = false;
+
+ if( ! state->cursor->initLazyFinish( mdata.retryNext ) ){
+ if( ! mdata.retryNext ){
+ uassert( 15988, "error querying server", false );
+ }
+ else{
+ retry = true;
+ continue;
+ }
+ }
+
+ mdata.completed = false;
+ }
+
+ if( ! mdata.completed ){
+
+ mdata.completed = true;
+
+ // Make sure we didn't get an error we should rethrow
+ // TODO : Rename/refactor this to something better
+ _checkCursor( state->cursor.get() );
+
+ // Finalize state
+ state->cursor->attach( state->conn.get() ); // Closes connection for us
+
+ log( pc ) << "finished on shard " << shard << ", current connection state is " << mdata.toBSON() << endl;
+ }
+ }
+ catch( RecvStaleConfigException& e ){
+ retry = true;
+
+ // Will retry all at once
+ staleNSes.insert( e.getns() );
+
+ // Fully clear this cursor, as it needs to be re-established
+ mdata.cleanup();
+ continue;
+ }
+ catch ( MsgAssertionException& e ){
+ warning() << "socket (msg) exception when finishing on " << shard << ", current connection state is " << mdata.toBSON() << causedBy( e ) << endl;
+ mdata.errored = true;
+ if( returnPartial ){
+ mdata.cleanup();
+ continue;
+ }
+ throw;
+ }
+ catch( SocketException& e ){
+ warning() << "socket exception when finishing on " << shard << ", current connection state is " << mdata.toBSON() << causedBy( e ) << endl;
+ mdata.errored = true;
+ if( returnPartial ){
+ mdata.cleanup();
+ continue;
+ }
+ throw;
+ }
+ catch( DBException& e ){
+ warning() << "db exception when finishing on " << shard << ", current connection state is " << mdata.toBSON() << causedBy( e ) << endl;
+ mdata.errored = true;
+ throw;
+ }
+ catch( std::exception& e){
+ warning() << "exception when finishing on " << shard << ", current connection state is " << mdata.toBSON() << causedBy( e ) << endl;
+ mdata.errored = true;
+ throw;
+ }
+ catch( ... ){
+ warning() << "unknown exception when finishing on " << shard << ", current connection state is " << mdata.toBSON() << endl;
+ mdata.errored = true;
+ throw;
+ }
+
+ }
+
+ // Retry logic for single refresh of namespaces / retry init'ing connections
+ if( retry ){
+
+ // Refresh stale namespaces
+ if( staleNSes.size() ){
+ for( set<string>::iterator i = staleNSes.begin(), end = staleNSes.end(); i != end; ++i ){
+
+ const string& staleNS = *i;
+
+ bool forceReload, fullReload;
+ _markStaleNS( staleNS, forceReload, fullReload );
+
+ int logLevel = fullReload ? 0 : 1;
+ log( pc + logLevel ) << "stale config of ns " << staleNS << " on finishing query, will retry with forced : " << forceReload << ", full : " << fullReload << endl;
+
+ // This is somewhat strange
+ if( staleNS != ns )
+ warning() << "versioned ns " << ns << " doesn't match stale config namespace " << staleNS << endl;
+
+ _handleStaleNS( staleNS, forceReload, fullReload );
+ }
+ }
+
+ // Re-establish connections we need to
+ startInit();
+ finishInit();
+ return;
+ }
+
+ // Sanity check and clean final connections
+ map< Shard, PCMData >::iterator i = _cursorMap.begin();
+ while( i != _cursorMap.end() ){
+
+ // const Shard& shard = i->first;
+ PCMData& mdata = i->second;
+
+ // Erase empty stuff
+ if( ! mdata.pcState ){
+ log() << "PCursor erasing empty state " << mdata.toBSON() << endl;
+ _cursorMap.erase( i++ );
+ continue;
+ }
+ else ++i;
+
+ // Make sure all state is in shards
+            assert( mdata.initialized );
+            assert( mdata.finished );
+            assert( mdata.completed );
+ assert( ! mdata.pcState->conn->ok() );
+ assert( mdata.pcState->cursor );
+ if( isVersioned() ) assert( mdata.pcState->primary || mdata.pcState->manager );
+ else assert( ! mdata.pcState->primary && ! mdata.pcState->manager );
+ }
+
+ // TODO : More cleanup of metadata?
+
+ // LEGACY STUFF NOW
+
+ _cursors = new FilteringClientCursor[ _cursorMap.size() ];
+
+ // Put the cursors in the legacy format
+ int index = 0;
+ for( map< Shard, PCMData >::iterator i = _cursorMap.begin(), end = _cursorMap.end(); i != end; ++i ){
+
+ PCMData& mdata = i->second;
+
+ _cursors[ index ].reset( mdata.pcState->cursor.get() );
+ _servers.insert( ServerAndQuery( i->first.getConnString(), BSONObj() ) );
+
+ index++;
+ }
+
+ _numServers = _cursorMap.size();
+
+ }
+
+ bool ParallelSortClusteredCursor::isSharded() {
+ // LEGACY is always unsharded
+ if( _qSpec.isEmpty() ) return false;
+
+ if( ! isVersioned() ) return false;
+
+ if( _cursorMap.size() > 1 ) return true;
+ if( _cursorMap.begin()->second.pcState->manager ) return true;
+ return false;
+ }
+
+ ShardPtr ParallelSortClusteredCursor::getPrimary() {
+ if( isSharded() || ! isVersioned() ) return ShardPtr();
+ return _cursorMap.begin()->second.pcState->primary;
+ }
+
+ void ParallelSortClusteredCursor::getQueryShards( set<Shard>& shards ) {
+ for( map< Shard, PCMData >::iterator i = _cursorMap.begin(), end = _cursorMap.end(); i != end; ++i ){
+ shards.insert( i->first );
+ }
+ }
+
+ ChunkManagerPtr ParallelSortClusteredCursor::getChunkManager( const Shard& shard ) {
+ if( ! isSharded() ) return ChunkManagerPtr();
+
+ map<Shard,PCMData>::iterator i = _cursorMap.find( shard );
+
+ if( i == _cursorMap.end() ) return ChunkManagerPtr();
+ else return i->second.pcState->manager;
+ }
+
+ DBClientCursorPtr ParallelSortClusteredCursor::getShardCursor( const Shard& shard ) {
+ map<Shard,PCMData>::iterator i = _cursorMap.find( shard );
+
+ if( i == _cursorMap.end() ) return DBClientCursorPtr();
+ else return i->second.pcState->cursor;
+ }
+
+ void ParallelSortClusteredCursor::_init() {
+ if( ! _qSpec.isEmpty() ) fullInit();
+ else _oldInit();
+ }
+
+
+ // DEPRECATED
+
+
+ // TODO: Merge with futures API? We do a lot of error checking here that would be useful elsewhere.
+ void ParallelSortClusteredCursor::_oldInit() {
+
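+        // Legacy init path: lazily send the query to every server in parallel, then
+        // finish each response, retrying individual connections until everything is
+        // done or a non-retryable error is recorded.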
+ // log() << "Starting parallel search..." << endl;
+
+ // make sure we're not already initialized
+ assert( ! _cursors );
+ _cursors = new FilteringClientCursor[_numServers];
+
+ bool returnPartial = ( _options & QueryOption_PartialResults );
+
+ vector<ServerAndQuery> queries( _servers.begin(), _servers.end() );
+ set<int> retryQueries;
+ int finishedQueries = 0;
+
+ vector< shared_ptr<ShardConnection> > conns;
+ vector<string> servers;
+
+ // Since we may get all sorts of errors, record them all as they come and throw them later if necessary
+ vector<string> staleConfigExs;
+ vector<string> socketExs;
+ vector<string> otherExs;
+ bool allConfigStale = false;
+
+ int retries = -1;
+
+ // Loop through all the queries until we've finished or gotten a socket exception on all of them
+ // We break early for non-socket exceptions, and socket exceptions if we aren't returning partial results
+ do {
+ retries++;
+
+ bool firstPass = retryQueries.size() == 0;
+
+ if( ! firstPass ){
+ log() << "retrying " << ( returnPartial ? "(partial) " : "" ) << "parallel connection to ";
+ for( set<int>::iterator it = retryQueries.begin(); it != retryQueries.end(); ++it ){
+ log() << queries[*it]._server << ", ";
+ }
+ log() << finishedQueries << " finished queries." << endl;
+ }
+
+ size_t num = 0;
+ for ( vector<ServerAndQuery>::iterator it = queries.begin(); it != queries.end(); ++it ) {
+ size_t i = num++;
+
+ const ServerAndQuery& sq = *it;
+
+ // If we're not retrying this cursor on later passes, continue
+ if( ! firstPass && retryQueries.find( i ) == retryQueries.end() ) continue;
+
+ // log() << "Querying " << _query << " from " << _ns << " for " << sq._server << endl;
+
+ BSONObj q = _query;
+ if ( ! sq._extra.isEmpty() ) {
+ q = concatQuery( q , sq._extra );
+ }
+
+ string errLoc = " @ " + sq._server;
+
+ if( firstPass ){
+
+ // This may be the first time connecting to this shard, if so we can get an error here
+ try {
+ conns.push_back( shared_ptr<ShardConnection>( new ShardConnection( sq._server , _ns ) ) );
+ }
+ catch( std::exception& e ){
+ socketExs.push_back( e.what() + errLoc );
+ if( ! returnPartial ){
+ num--;
+ break;
+ }
+ conns.push_back( shared_ptr<ShardConnection>() );
+ continue;
+ }
+
+ servers.push_back( sq._server );
+ }
+
+ if ( conns[i]->setVersion() ) {
+ conns[i]->done();
+ staleConfigExs.push_back( (string)"stale config detected for " + RecvStaleConfigException( _ns , "ParallelCursor::_init" , true ).what() + errLoc );
+ break;
+ }
+
+ LOG(5) << "ParallelSortClusteredCursor::init server:" << sq._server << " ns:" << _ns
+ << " query:" << q << " _fields:" << _fields << " options: " << _options << endl;
+
+ if( ! _cursors[i].raw() )
+ _cursors[i].reset( new DBClientCursor( conns[i]->get() , _ns , q ,
+ 0 , // nToReturn
+ 0 , // nToSkip
+ _fields.isEmpty() ? 0 : &_fields , // fieldsToReturn
+ _options ,
+ _batchSize == 0 ? 0 : _batchSize + _needToSkip // batchSize
+ ) );
+
+ try{
+ _cursors[i].raw()->initLazy( ! firstPass );
+ }
+ catch( SocketException& e ){
+ socketExs.push_back( e.what() + errLoc );
+ _cursors[i].reset( NULL );
+ conns[i]->done();
+ if( ! returnPartial ) break;
+ }
+ catch( std::exception& e){
+ otherExs.push_back( e.what() + errLoc );
+ _cursors[i].reset( NULL );
+ conns[i]->done();
+ break;
+ }
+
+ }
+
+ // Go through all the potentially started cursors and finish initializing them or log any errors and
+ // potentially retry
+ // TODO: Better error classification would make this easier, errors are indicated in all sorts of ways
+ // here that we need to trap.
+ for ( size_t i = 0; i < num; i++ ) {
+
+ // log() << "Finishing query for " << cons[i].get()->getHost() << endl;
+ string errLoc = " @ " + queries[i]._server;
+
+ if( ! _cursors[i].raw() || ( ! firstPass && retryQueries.find( i ) == retryQueries.end() ) ){
+ if( conns[i] ) conns[i].get()->done();
+ continue;
+ }
+
+ assert( conns[i] );
+ retryQueries.erase( i );
+
+ bool retry = false;
+
+ try {
+
+ if( ! _cursors[i].raw()->initLazyFinish( retry ) ) {
+
+ warning() << "invalid result from " << conns[i]->getHost() << ( retry ? ", retrying" : "" ) << endl;
+ _cursors[i].reset( NULL );
+
+ if( ! retry ){
+ socketExs.push_back( str::stream() << "error querying server: " << servers[i] );
+ conns[i]->done();
+ }
+ else {
+ retryQueries.insert( i );
+ }
+
+ continue;
+ }
+ }
+ catch ( StaleConfigException& e ){
+ // Our stored configuration data is actually stale, we need to reload it
+ // when we throw our exception
+ allConfigStale = true;
+
+ staleConfigExs.push_back( (string)"stale config detected when receiving response for " + e.what() + errLoc );
+ _cursors[i].reset( NULL );
+ conns[i]->done();
+ continue;
+ }
+ catch ( MsgAssertionException& e ){
+ socketExs.push_back( e.what() + errLoc );
+ _cursors[i].reset( NULL );
+ conns[i]->done();
+ continue;
+ }
+ catch ( SocketException& e ) {
+ socketExs.push_back( e.what() + errLoc );
+ _cursors[i].reset( NULL );
+ conns[i]->done();
+ continue;
+ }
+ catch( std::exception& e ){
+ otherExs.push_back( e.what() + errLoc );
+ _cursors[i].reset( NULL );
+ conns[i]->done();
+ continue;
+ }
+
+ try {
+ _cursors[i].raw()->attach( conns[i].get() ); // this calls done on conn
+ _checkCursor( _cursors[i].raw() );
+
+ finishedQueries++;
+ }
+ catch ( StaleConfigException& e ){
+
+ // Our stored configuration data is actually stale, we need to reload it
+ // when we throw our exception
+ allConfigStale = true;
+
+ staleConfigExs.push_back( (string)"stale config detected for " + e.what() + errLoc );
+ _cursors[i].reset( NULL );
+ conns[i]->done();
+ continue;
+ }
+ catch( std::exception& e ){
+ otherExs.push_back( e.what() + errLoc );
+ _cursors[i].reset( NULL );
+ conns[i]->done();
+ continue;
+ }
+ }
+
+ // Don't exceed our max retries, should not happen
+ assert( retries < 5 );
+ }
+ while( retryQueries.size() > 0 /* something to retry */ &&
+ ( socketExs.size() == 0 || returnPartial ) /* no conn issues */ &&
+ staleConfigExs.size() == 0 /* no config issues */ &&
+ otherExs.size() == 0 /* no other issues */);
+
+ // Assert that our conns are all closed!
+ for( vector< shared_ptr<ShardConnection> >::iterator i = conns.begin(); i < conns.end(); ++i ){
+ assert( ! (*i) || ! (*i)->ok() );
+ }
+
+ // Handle errors we got during initialization.
+ // If we're returning partial results, we can ignore socketExs, but nothing else
+ // Log a warning in any case, so we don't lose these messages
+ bool throwException = ( socketExs.size() > 0 && ! returnPartial ) || staleConfigExs.size() > 0 || otherExs.size() > 0;
+
+ if( socketExs.size() > 0 || staleConfigExs.size() > 0 || otherExs.size() > 0 ) {
+
+ vector<string> errMsgs;
+
+ errMsgs.insert( errMsgs.end(), staleConfigExs.begin(), staleConfigExs.end() );
+ errMsgs.insert( errMsgs.end(), otherExs.begin(), otherExs.end() );
+ errMsgs.insert( errMsgs.end(), socketExs.begin(), socketExs.end() );
+
+ stringstream errMsg;
+ errMsg << "could not initialize cursor across all shards because : ";
+ for( vector<string>::iterator i = errMsgs.begin(); i != errMsgs.end(); i++ ){
+ if( i != errMsgs.begin() ) errMsg << " :: and :: ";
+ errMsg << *i;
+ }
+
+ if( throwException && staleConfigExs.size() > 0 )
+ throw RecvStaleConfigException( _ns , errMsg.str() , ! allConfigStale );
+ else if( throwException )
+ throw DBException( errMsg.str(), 14827 );
+ else
+ warning() << errMsg.str() << endl;
+ }
+
+ if( retries > 0 )
+ log() << "successfully finished parallel query after " << retries << " retries" << endl;
+
+ }
+
+ ParallelSortClusteredCursor::~ParallelSortClusteredCursor() {
+ // Clear out our metadata before removing legacy cursor data
+ _cursorMap.clear();
+ for( int i = 0; i < _numServers; i++ ) _cursors[i].release();
+
+ delete [] _cursors;
+ _cursors = 0;
+ }
+
+ bool ParallelSortClusteredCursor::more() {
+
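+        // Consume any outstanding skip first; the member is zeroed before recursing so
+        // the nested more()/next() calls don't re-enter this block, then the remainder
+        // is stored back.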
+ if ( _needToSkip > 0 ) {
+ int n = _needToSkip;
+ _needToSkip = 0;
+
+ while ( n > 0 && more() ) {
+ BSONObj x = next();
+ n--;
+ }
+
+ _needToSkip = n;
+ }
+
+ for ( int i=0; i<_numServers; i++ ) {
+ if ( _cursors[i].more() )
+ return true;
+ }
+ return false;
+ }
+
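+    // k-way merge: peek every server cursor and return the smallest document under
+    // _sortKey (or simply the first non-empty cursor's document if there is no sort).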
+ BSONObj ParallelSortClusteredCursor::next() {
+ BSONObj best = BSONObj();
+ int bestFrom = -1;
+
+ for ( int i=0; i<_numServers; i++) {
+ if ( ! _cursors[i].more() )
+ continue;
+
+ BSONObj me = _cursors[i].peek();
+
+ if ( best.isEmpty() ) {
+ best = me;
+ bestFrom = i;
+ if( _sortKey.isEmpty() ) break;
+ continue;
+ }
+
+ int comp = best.woSortOrder( me , _sortKey , true );
+ if ( comp < 0 )
+ continue;
+
+ best = me;
+ bestFrom = i;
+ }
+
+ uassert( 10019 , "no more elements" , ! best.isEmpty() );
+ _cursors[bestFrom].next();
+
+ return best;
+ }
+
+ void ParallelSortClusteredCursor::_explain( map< string,list<BSONObj> >& out ) {
+ for ( set<ServerAndQuery>::iterator i=_servers.begin(); i!=_servers.end(); ++i ) {
+ const ServerAndQuery& sq = *i;
+ list<BSONObj> & l = out[sq._server];
+ l.push_back( explain( sq._server , sq._extra ) );
+ }
+
+ }
+
+ // -----------------
+ // ---- Future -----
+ // -----------------
+
+ Future::CommandResult::CommandResult( const string& server , const string& db , const BSONObj& cmd , int options , DBClientBase * conn )
+ :_server(server) ,_db(db) , _options(options), _cmd(cmd) ,_conn(conn) ,_done(false)
+ {
+ init();
+ }
+
+ void Future::CommandResult::init(){
+ try {
+ if ( ! _conn ){
+ _connHolder.reset( new ScopedDbConnection( _server ) );
+ _conn = _connHolder->get();
+ }
+
+ if ( _conn->lazySupported() ) {
+ _cursor.reset( new DBClientCursor(_conn, _db + ".$cmd", _cmd, -1/*limit*/, 0, NULL, _options, 0));
+ _cursor->initLazy();
+ }
+ else {
+ _done = true; // we set _done first because even if there is an error we're done
+ _ok = _conn->runCommand( _db , _cmd , _res , _options );
+ }
+ }
+ catch ( std::exception& e ) {
+ error() << "Future::spawnComand (part 1) exception: " << e.what() << endl;
+ _ok = false;
+ _done = true;
+ }
+ }
+
+ bool Future::CommandResult::join( int maxRetries ) {
+ if (_done)
+ return _ok;
+
+
+ _ok = false;
+ for( int i = 1; i <= maxRetries; i++ ){
+
+ try {
+ bool retry = false;
+ bool finished = _cursor->initLazyFinish( retry );
+
+ // Shouldn't need to communicate with server any more
+ if ( _connHolder )
+ _connHolder->done();
+
+ uassert(14812, str::stream() << "Error running command on server: " << _server, finished);
+ massert(14813, "Command returned nothing", _cursor->more());
+
+ _res = _cursor->nextSafe();
+ _ok = _res["ok"].trueValue();
+
+ break;
+ }
+ catch ( RecvStaleConfigException& e ){
+
+ assert( versionManager.isVersionableCB( _conn ) );
+
+ if( i >= maxRetries ){
+ error() << "Future::spawnComand (part 2) stale config exception" << causedBy( e ) << endl;
+ throw e;
+ }
+
+ if( i >= maxRetries / 2 ){
+ if( ! versionManager.forceRemoteCheckShardVersionCB( e.getns() ) ){
+ error() << "Future::spawnComand (part 2) no config detected" << causedBy( e ) << endl;
+ throw e;
+ }
+ }
+
+ versionManager.checkShardVersionCB( _conn, e.getns(), false, 1 );
+
+ LOG( i > 1 ? 0 : 1 ) << "retrying lazy command" << causedBy( e ) << endl;
+
+ assert( _conn->lazySupported() );
+ _done = false;
+ init();
+ continue;
+ }
+ catch ( std::exception& e ) {
+ error() << "Future::spawnComand (part 2) exception: " << causedBy( e ) << endl;
+ break;
+ }
+
+ }
+
+ _done = true;
+ return _ok;
+ }
+
+ shared_ptr<Future::CommandResult> Future::spawnCommand( const string& server , const string& db , const BSONObj& cmd , int options , DBClientBase * conn ) {
+ shared_ptr<Future::CommandResult> res (new Future::CommandResult( server , db , cmd , options , conn ));
+ return res;
+ }
+
+}
diff --git a/src/mongo/client/parallel.h b/src/mongo/client/parallel.h
new file mode 100644
index 00000000000..1cbbd9a1cb5
--- /dev/null
+++ b/src/mongo/client/parallel.h
@@ -0,0 +1,444 @@
+// parallel.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+   tools for working in a parallel/sharded/clustered environment
+ */
+
+#pragma once
+
+#include "../pch.h"
+#include "dbclient.h"
+#include "redef_macros.h"
+#include "../db/dbmessage.h"
+#include "../db/matcher.h"
+#include "../util/concurrency/mvar.h"
+
+namespace mongo {
+
+ /**
+ * holder for a server address and a query to run
+ */
+ class ServerAndQuery {
+ public:
+ ServerAndQuery( const string& server , BSONObj extra = BSONObj() , BSONObj orderObject = BSONObj() ) :
+ _server( server ) , _extra( extra.getOwned() ) , _orderObject( orderObject.getOwned() ) {
+ }
+
+ bool operator<( const ServerAndQuery& other ) const {
+ if ( ! _orderObject.isEmpty() )
+ return _orderObject.woCompare( other._orderObject ) < 0;
+
+ if ( _server < other._server )
+ return true;
+            if ( other._server < _server )
+ return false;
+ return _extra.woCompare( other._extra ) < 0;
+ }
+
+ string toString() const {
+ StringBuilder ss;
+ ss << "server:" << _server << " _extra:" << _extra.toString() << " _orderObject:" << _orderObject.toString();
+ return ss.str();
+ }
+
+ operator string() const {
+ return toString();
+ }
+
+ string _server;
+ BSONObj _extra;
+ BSONObj _orderObject;
+ };
+
+ /**
+ * this is a cursor that works over a set of servers
+     * can be used in serial/parallel as controlled by subclasses
+ */
+ class ClusteredCursor {
+ public:
+ ClusteredCursor( const QuerySpec& q );
+ ClusteredCursor( QueryMessage& q );
+ ClusteredCursor( const string& ns , const BSONObj& q , int options=0 , const BSONObj& fields=BSONObj() );
+ virtual ~ClusteredCursor();
+
+ /** call before using */
+ void init();
+
+ virtual bool more() = 0;
+ virtual BSONObj next() = 0;
+
+ static BSONObj concatQuery( const BSONObj& query , const BSONObj& extraFilter );
+
+ virtual string type() const = 0;
+
+ virtual void explain(BSONObjBuilder& b);
+
+ protected:
+
+ virtual void _init() = 0;
+
+ auto_ptr<DBClientCursor> query( const string& server , int num = 0 , BSONObj extraFilter = BSONObj() , int skipLeft = 0 , bool lazy=false );
+ BSONObj explain( const string& server , BSONObj extraFilter = BSONObj() );
+
+ /**
+ * checks the cursor for any errors
+     * will throw an exception if an error is encountered
+ */
+ void _checkCursor( DBClientCursor * cursor );
+
+ static BSONObj _concatFilter( const BSONObj& filter , const BSONObj& extraFilter );
+
+ virtual void _explain( map< string,list<BSONObj> >& out ) = 0;
+
+ string _ns;
+ BSONObj _query;
+ int _options;
+ BSONObj _fields;
+ int _batchSize;
+
+ bool _didInit;
+
+ bool _done;
+ };
+
+
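+    /**
+     * wraps a DBClientCursor and filters its results through a Matcher,
+     * buffering one matching document ahead for more()/peek()/next()
+     */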
+ class FilteringClientCursor {
+ public:
+ FilteringClientCursor( const BSONObj filter = BSONObj() );
+ FilteringClientCursor( DBClientCursor* cursor , const BSONObj filter = BSONObj() );
+ FilteringClientCursor( auto_ptr<DBClientCursor> cursor , const BSONObj filter = BSONObj() );
+ ~FilteringClientCursor();
+
+ void reset( auto_ptr<DBClientCursor> cursor );
+ void reset( DBClientCursor* cursor );
+
+ bool more();
+ BSONObj next();
+
+ BSONObj peek();
+
+ DBClientCursor* raw() { return _cursor.get(); }
+
+ // Required for new PCursor
+ void release(){ _cursor.release(); }
+
+ private:
+ void _advance();
+
+ Matcher _matcher;
+ auto_ptr<DBClientCursor> _cursor;
+
+ BSONObj _next;
+ bool _done;
+ };
+
+
+ class Servers {
+ public:
+ Servers() {
+ }
+
+ void add( const ServerAndQuery& s ) {
+ add( s._server , s._extra );
+ }
+
+ void add( const string& server , const BSONObj& filter ) {
+ vector<BSONObj>& mine = _filters[server];
+ mine.push_back( filter.getOwned() );
+ }
+
+        // TODO: pick a less horrible name
+ class View {
+ View( const Servers* s ) {
+ for ( map<string, vector<BSONObj> >::const_iterator i=s->_filters.begin(); i!=s->_filters.end(); ++i ) {
+ _servers.push_back( i->first );
+ _filters.push_back( i->second );
+ }
+ }
+ public:
+ int size() const {
+ return _servers.size();
+ }
+
+ string getServer( int n ) const {
+ return _servers[n];
+ }
+
+ vector<BSONObj> getFilter( int n ) const {
+ return _filters[ n ];
+ }
+
+ private:
+ vector<string> _servers;
+ vector< vector<BSONObj> > _filters;
+
+ friend class Servers;
+ };
+
+ View view() const {
+ return View( this );
+ }
+
+
+ private:
+ map<string, vector<BSONObj> > _filters;
+
+ friend class View;
+ };
+
+
+ /**
+ * runs a query in serial across any number of servers
+ * returns all results from 1 server, then the next, etc...
+ */
+ class SerialServerClusteredCursor : public ClusteredCursor {
+ public:
+ SerialServerClusteredCursor( const set<ServerAndQuery>& servers , QueryMessage& q , int sortOrder=0);
+ virtual bool more();
+ virtual BSONObj next();
+ virtual string type() const { return "SerialServer"; }
+
+ protected:
+ virtual void _explain( map< string,list<BSONObj> >& out );
+
+ void _init() {}
+
+ vector<ServerAndQuery> _servers;
+ unsigned _serverIndex;
+
+ FilteringClientCursor _current;
+
+ int _needToSkip;
+ };
+
+
+
+ class CommandInfo {
+ public:
+ string versionedNS;
+ BSONObj cmdFilter;
+
+ CommandInfo() {}
+ CommandInfo( const string& vns, const BSONObj& filter ) : versionedNS( vns ), cmdFilter( filter ) {}
+
+ bool isEmpty(){
+ return versionedNS.size() == 0;
+ }
+
+ string toString() const {
+ return str::stream() << "CInfo " << BSON( "v_ns" << versionedNS << "filter" << cmdFilter );
+ }
+ };
+
+ class ShardConnection;
+ typedef shared_ptr<ShardConnection> ShardConnectionPtr;
+
+ class DBClientCursor;
+ typedef shared_ptr<DBClientCursor> DBClientCursorPtr;
+
+ class Shard;
+ typedef shared_ptr<Shard> ShardPtr;
+
+ class ChunkManager;
+ typedef shared_ptr<const ChunkManager> ChunkManagerPtr;
+
+ class ParallelConnectionState {
+ public:
+
+ ShardConnectionPtr conn;
+ DBClientCursorPtr cursor;
+
+ // Version information
+ ChunkManagerPtr manager;
+ ShardPtr primary;
+
+ BSONObj toBSON() const;
+
+ string toString() const {
+ return str::stream() << "PCState : " << toBSON();
+ }
+ };
+
+ typedef ParallelConnectionState PCState;
+ typedef shared_ptr<PCState> PCStatePtr;
+
+ class ParallelConnectionMetadata {
+ public:
+
+ ParallelConnectionMetadata() :
+ retryNext( false ), initialized( false ), finished( false ), completed( false ), errored( false ) { }
+
+ ~ParallelConnectionMetadata(){
+ cleanup( true );
+ }
+
+ void cleanup( bool full = true );
+
+ PCStatePtr pcState;
+
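+        // Lifecycle flags: initialized means the query was sent, finished means the
+        // response was received, completed means results were verified and the
+        // connection returned; retryNext requests a retry on the next init pass.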
+ bool retryNext;
+
+ bool initialized;
+ bool finished;
+ bool completed;
+
+ bool errored;
+
+ BSONObj toBSON() const;
+
+ string toString() const {
+ return str::stream() << "PCMData : " << toBSON();
+ }
+ };
+
+ typedef ParallelConnectionMetadata PCMData;
+ typedef shared_ptr<PCMData> PCMDataPtr;
+
+ /**
+ * Runs a query in parallel across N servers. New logic has several modes -
+ * 1) Standard query, enforces compatible chunk versions for queries across all results
+ * 2) Standard query, sent to particular servers with no compatible chunk version enforced, but handling
+ * stale configuration exceptions
+ * 3) Command query, either enforcing compatible chunk versions or sent to particular shards.
+ */
+ class ParallelSortClusteredCursor : public ClusteredCursor {
+ public:
+
+ ParallelSortClusteredCursor( const QuerySpec& qSpec, const CommandInfo& cInfo = CommandInfo() );
+ ParallelSortClusteredCursor( const set<Shard>& servers, const QuerySpec& qSpec );
+
+ // LEGACY Constructors
+ ParallelSortClusteredCursor( const set<ServerAndQuery>& servers , QueryMessage& q , const BSONObj& sortKey );
+ ParallelSortClusteredCursor( const set<ServerAndQuery>& servers , const string& ns ,
+ const Query& q , int options=0, const BSONObj& fields=BSONObj() );
+
+ virtual ~ParallelSortClusteredCursor();
+ virtual bool more();
+ virtual BSONObj next();
+ virtual string type() const { return "ParallelSort"; }
+
+ void fullInit();
+ void startInit();
+ void finishInit();
+
+ bool isCommand(){ return NamespaceString( _qSpec.ns() ).isCommand(); }
+ bool isVersioned(){ return _qShards.size() == 0; }
+
+ bool isSharded();
+ ShardPtr getPrimary();
+ void getQueryShards( set<Shard>& shards );
+ ChunkManagerPtr getChunkManager( const Shard& shard );
+ DBClientCursorPtr getShardCursor( const Shard& shard );
+
+ BSONObj toBSON() const;
+ string toString() const;
+
+ protected:
+ void _finishCons();
+ void _init();
+ void _oldInit();
+
+ virtual void _explain( map< string,list<BSONObj> >& out );
+
+ void _markStaleNS( const NamespaceString& staleNS, bool& forceReload, bool& fullReload );
+ void _handleStaleNS( const NamespaceString& staleNS, bool forceReload, bool fullReload );
+
+ set<Shard> _qShards;
+ QuerySpec _qSpec;
+ CommandInfo _cInfo;
+
+ // Count round-trips req'd for namespaces and total
+ map<string,int> _staleNSMap;
+ int _totalTries;
+
+ map<Shard,PCMData> _cursorMap;
+
+ // LEGACY BELOW
+ int _numServers;
+ set<ServerAndQuery> _servers;
+ BSONObj _sortKey;
+
+ FilteringClientCursor * _cursors;
+ int _needToSkip;
+ };
+
+ /**
+ * tools for doing asynchronous operations
+ * right now uses underlying sync network ops and uses another thread
+ * should be changed to use non-blocking io
+ */
+ class Future {
+ public:
+ class CommandResult {
+ public:
+
+ string getServer() const { return _server; }
+
+ bool isDone() const { return _done; }
+
+ bool ok() const {
+ assert( _done );
+ return _ok;
+ }
+
+ BSONObj result() const {
+ assert( _done );
+ return _res;
+ }
+
+ /**
+ blocks until command is done
+ returns ok()
+ */
+ bool join( int maxRetries = 1 );
+
+ private:
+
+ CommandResult( const string& server , const string& db , const BSONObj& cmd , int options , DBClientBase * conn );
+ void init();
+
+ string _server;
+ string _db;
+ int _options;
+ BSONObj _cmd;
+ DBClientBase * _conn;
+ scoped_ptr<ScopedDbConnection> _connHolder; // used if not provided a connection
+
+ scoped_ptr<DBClientCursor> _cursor;
+
+ BSONObj _res;
+ bool _ok;
+ bool _done;
+
+ friend class Future;
+ };
+
+
+ /**
+ * @param server server name
+ * @param db db name
+ * @param cmd cmd to exec
+ * @param conn optional connection to use. will use standard pooled if non-specified
+ */
+ static shared_ptr<CommandResult> spawnCommand( const string& server , const string& db , const BSONObj& cmd , int options , DBClientBase * conn = 0 );
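+
+        /* illustrative usage sketch (host name is hypothetical):
+         *
+         *   shared_ptr<Future::CommandResult> res =
+         *       Future::spawnCommand( "shard0:27017", "admin", BSON( "ping" << 1 ), 0 );
+         *   if ( res->join() )
+         *       cout << res->result() << endl;
+         */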
+ };
+
+
+}
+
+#include "undef_macros.h"
diff --git a/src/mongo/client/redef_macros.h b/src/mongo/client/redef_macros.h
new file mode 100644
index 00000000000..897912dba2b
--- /dev/null
+++ b/src/mongo/client/redef_macros.h
@@ -0,0 +1,61 @@
+/** @file redef_macros.h macros the implementation uses.
+
+ @see undef_macros.h undefines these after use to minimize name pollution.
+*/
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// If you define a new global un-prefixed macro, please add it here and in undef_macros
+
+// #pragma once // this file is intended to be processed multiple times
+
+#if defined(MONGO_MACROS_CLEANED)
+
+// util/allocator.h
+#define malloc MONGO_malloc
+#define realloc MONGO_realloc
+
+// util/assert_util.h
+#define assert MONGO_assert
+#define dassert MONGO_dassert
+#define wassert MONGO_wassert
+#define massert MONGO_massert
+#define uassert MONGO_uassert
+#define BOOST_CHECK_EXCEPTION MONGO_BOOST_CHECK_EXCEPTION
+#define DESTRUCTOR_GUARD MONGO_DESTRUCTOR_GUARD
+
+// util/goodies.h
+#define PRINT MONGO_PRINT
+#define PRINTFL MONGO_PRINTFL
+#define asctime MONGO_asctime
+#define gmtime MONGO_gmtime
+#define localtime MONGO_localtime
+#define ctime MONGO_ctime
+
+// util/debug_util.h
+#define DEV MONGO_DEV
+#define DEBUGGING MONGO_DEBUGGING
+#define SOMETIMES MONGO_SOMETIMES
+#define OCCASIONALLY MONGO_OCCASIONALLY
+#define RARELY MONGO_RARELY
+#define ONCE MONGO_ONCE
+
+// util/log.h
+#define LOG MONGO_LOG
+
+#undef MONGO_MACROS_CLEANED
+#endif
+
diff --git a/src/mongo/client/simple_client_demo.cpp b/src/mongo/client/simple_client_demo.cpp
new file mode 100644
index 00000000000..f4278dd4e54
--- /dev/null
+++ b/src/mongo/client/simple_client_demo.cpp
@@ -0,0 +1,54 @@
+/* simple_client_demo.cpp
+
+ See also : http://www.mongodb.org/pages/viewpage.action?pageId=133415
+
+ How to build and run:
+
+ (1) Using the mongoclient:
+ g++ simple_client_demo.cpp -lmongoclient -lboost_thread-mt -lboost_filesystem -lboost_program_options
+ ./a.out
+
+ (2) using client_lib.cpp:
+ g++ -I .. simple_client_demo.cpp mongo_client_lib.cpp -lboost_thread-mt -lboost_filesystem
+ ./a.out
+*/
+
+#include <iostream>
+#include "dbclient.h" // the mongo c++ driver
+
+using namespace std;
+using namespace mongo;
+using namespace bson;
+
+int main() {
+ try {
+ cout << "connecting to localhost..." << endl;
+ DBClientConnection c;
+ c.connect("localhost");
+ cout << "connected ok" << endl;
+ unsigned long long count = c.count("test.foo");
+ cout << "count of exiting documents in collection test.foo : " << count << endl;
+
+ bo o = BSON( "hello" << "world" );
+ c.insert("test.foo", o);
+
+ string e = c.getLastError();
+ if( !e.empty() ) {
+ cout << "insert #1 failed: " << e << endl;
+ }
+
+ // make an index with a unique key constraint
+ c.ensureIndex("test.foo", BSON("hello"<<1), /*unique*/true);
+
+ c.insert("test.foo", o); // will cause a dup key error on "hello" field
+ cout << "we expect a dup key error here:" << endl;
+ cout << " " << c.getLastErrorDetailed().toString() << endl;
+ }
+ catch(DBException& e) {
+ cout << "caught DBException " << e.toString() << endl;
+ return 1;
+ }
+
+ return 0;
+}
+
diff --git a/src/mongo/client/syncclusterconnection.cpp b/src/mongo/client/syncclusterconnection.cpp
new file mode 100644
index 00000000000..601cdcbd758
--- /dev/null
+++ b/src/mongo/client/syncclusterconnection.cpp
@@ -0,0 +1,410 @@
+// syncclusterconnection.cpp
+/*
+ * Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "pch.h"
+#include "syncclusterconnection.h"
+#include "../db/dbmessage.h"
+
+// error codes 8000-8009
+
+namespace mongo {
+
+ SyncClusterConnection::SyncClusterConnection( const list<HostAndPort> & L, double socketTimeout) : _mutex("SyncClusterConnection"), _socketTimeout( socketTimeout ) {
+ {
+ stringstream s;
+ int n=0;
+ for( list<HostAndPort>::const_iterator i = L.begin(); i != L.end(); i++ ) {
+ if( ++n > 1 ) s << ',';
+ s << i->toString();
+ }
+ _address = s.str();
+ }
+ for( list<HostAndPort>::const_iterator i = L.begin(); i != L.end(); i++ )
+ _connect( i->toString() );
+ }
+
+    SyncClusterConnection::SyncClusterConnection( string commaSeparated, double socketTimeout) : _mutex("SyncClusterConnection"), _socketTimeout( socketTimeout ) {
+        _address = commaSeparated;
+        string::size_type idx;
+        while ( ( idx = commaSeparated.find( ',' ) ) != string::npos ) {
+            string h = commaSeparated.substr( 0 , idx );
+            commaSeparated = commaSeparated.substr( idx + 1 );
+            _connect( h );
+        }
+        _connect( commaSeparated );
+ uassert( 8004 , "SyncClusterConnection needs 3 servers" , _conns.size() == 3 );
+ }
+
+ SyncClusterConnection::SyncClusterConnection( string a , string b , string c, double socketTimeout) : _mutex("SyncClusterConnection"), _socketTimeout( socketTimeout ) {
+ _address = a + "," + b + "," + c;
+ // connect to all even if not working
+ _connect( a );
+ _connect( b );
+ _connect( c );
+ }
+
+ SyncClusterConnection::SyncClusterConnection( SyncClusterConnection& prev, double socketTimeout) : _mutex("SyncClusterConnection"), _socketTimeout( socketTimeout ) {
+ assert(0);
+ }
+
+ SyncClusterConnection::~SyncClusterConnection() {
+ for ( size_t i=0; i<_conns.size(); i++ )
+ delete _conns[i];
+ _conns.clear();
+ }
+
+ bool SyncClusterConnection::prepare( string& errmsg ) {
+ _lastErrors.clear();
+ return fsync( errmsg );
+ }
+
+ bool SyncClusterConnection::fsync( string& errmsg ) {
+ bool ok = true;
+ errmsg = "";
+ for ( size_t i=0; i<_conns.size(); i++ ) {
+ BSONObj res;
+ try {
+ if ( _conns[i]->simpleCommand( "admin" , &res , "fsync" ) )
+ continue;
+ }
+ catch ( DBException& e ) {
+ errmsg += e.toString();
+ }
+ catch ( std::exception& e ) {
+ errmsg += e.what();
+ }
+ catch ( ... ) {
+ }
+ ok = false;
+ errmsg += " " + _conns[i]->toString() + ":" + res.toString();
+ }
+ return ok;
+ }
+
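+    // Runs { getlasterror: 1, fsync: 1 } on every member, keeping each raw reply in
+    // _lastErrors; throws if any member reports failure or did not fsync/wait.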
+ void SyncClusterConnection::_checkLast() {
+ _lastErrors.clear();
+ vector<string> errors;
+
+ for ( size_t i=0; i<_conns.size(); i++ ) {
+ BSONObj res;
+ string err;
+ try {
+ if ( ! _conns[i]->runCommand( "admin" , BSON( "getlasterror" << 1 << "fsync" << 1 ) , res ) )
+ err = "cmd failed: ";
+ }
+ catch ( std::exception& e ) {
+ err += e.what();
+ }
+ catch ( ... ) {
+ err += "unknown failure";
+ }
+ _lastErrors.push_back( res.getOwned() );
+ errors.push_back( err );
+ }
+
+ assert( _lastErrors.size() == errors.size() && _lastErrors.size() == _conns.size() );
+
+ stringstream err;
+ bool ok = true;
+
+ for ( size_t i = 0; i<_conns.size(); i++ ) {
+ BSONObj res = _lastErrors[i];
+ if ( res["ok"].trueValue() && (res["fsyncFiles"].numberInt() > 0 || res.hasElement("waited")))
+ continue;
+ ok = false;
+ err << _conns[i]->toString() << ": " << res << " " << errors[i];
+ }
+
+ if ( ok )
+ return;
+ throw UserException( 8001 , (string)"SyncClusterConnection write op failed: " + err.str() );
+ }
+
+ BSONObj SyncClusterConnection::getLastErrorDetailed(bool fsync, bool j, int w, int wtimeout) {
+ if ( _lastErrors.size() )
+ return _lastErrors[0];
+ return DBClientBase::getLastErrorDetailed(fsync,j,w,wtimeout);
+ }
+
+ void SyncClusterConnection::_connect( string host ) {
+ log() << "SyncClusterConnection connecting to [" << host << "]" << endl;
+ DBClientConnection * c = new DBClientConnection( true );
+ c->setSoTimeout( _socketTimeout );
+ string errmsg;
+ if ( ! c->connect( host , errmsg ) )
+ log() << "SyncClusterConnection connect fail to: " << host << " errmsg: " << errmsg << endl;
+ _connAddresses.push_back( host );
+ _conns.push_back( c );
+ }
+
+ bool SyncClusterConnection::callRead( Message& toSend , Message& response ) {
+ // TODO: need to save state of which one to go back to somehow...
+ return _conns[0]->callRead( toSend , response );
+ }
+
+ BSONObj SyncClusterConnection::findOne(const string &ns, const Query& query, const BSONObj *fieldsToReturn, int queryOptions) {
+
+ if ( ns.find( ".$cmd" ) != string::npos ) {
+ string cmdName = query.obj.firstElementFieldName();
+
+ int lockType = _lockType( cmdName );
+
+ if ( lockType > 0 ) { // write $cmd
+ string errmsg;
+ if ( ! prepare( errmsg ) )
+ throw UserException( 13104 , (string)"SyncClusterConnection::findOne prepare failed: " + errmsg );
+
+ vector<BSONObj> all;
+ for ( size_t i=0; i<_conns.size(); i++ ) {
+ all.push_back( _conns[i]->findOne( ns , query , 0 , queryOptions ).getOwned() );
+ }
+
+ _checkLast();
+
+ for ( size_t i=0; i<all.size(); i++ ) {
+ BSONObj temp = all[i];
+ if ( isOk( temp ) )
+ continue;
+ stringstream ss;
+ ss << "write $cmd failed on a node: " << temp.jsonString();
+ ss << " " << _conns[i]->toString();
+ ss << " ns: " << ns;
+ ss << " cmd: " << query.toString();
+ throw UserException( 13105 , ss.str() );
+ }
+
+ return all[0];
+ }
+ }
+
+ return DBClientBase::findOne( ns , query , fieldsToReturn , queryOptions );
+ }
+
+ bool SyncClusterConnection::auth(const string &dbname, const string &username, const string &password_text, string& errmsg, bool digestPassword) {
+ for (vector<DBClientConnection*>::iterator it = _conns.begin(); it < _conns.end(); it++) {
+ massert( 15848, "sync cluster of sync clusters?", (*it)->type() != ConnectionString::SYNC);
+
+ if (!(*it)->auth(dbname, username, password_text, errmsg, digestPassword)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ auto_ptr<DBClientCursor> SyncClusterConnection::query(const string &ns, Query query, int nToReturn, int nToSkip,
+ const BSONObj *fieldsToReturn, int queryOptions, int batchSize ) {
+ _lastErrors.clear();
+ if ( ns.find( ".$cmd" ) != string::npos ) {
+ string cmdName = query.obj.firstElementFieldName();
+ int lockType = _lockType( cmdName );
+ uassert( 13054 , (string)"write $cmd not supported in SyncClusterConnection::query for:" + cmdName , lockType <= 0 );
+ }
+
+ return _queryOnActive( ns , query , nToReturn , nToSkip , fieldsToReturn , queryOptions , batchSize );
+ }
+
+ bool SyncClusterConnection::_commandOnActive(const string &dbname, const BSONObj& cmd, BSONObj &info, int options ) {
+ auto_ptr<DBClientCursor> cursor = _queryOnActive( dbname + ".$cmd" , cmd , 1 , 0 , 0 , options , 0 );
+ if ( cursor->more() )
+ info = cursor->next().copy();
+ else
+ info = BSONObj();
+ return isOk( info );
+ }
+
+ auto_ptr<DBClientCursor> SyncClusterConnection::_queryOnActive(const string &ns, Query query, int nToReturn, int nToSkip,
+ const BSONObj *fieldsToReturn, int queryOptions, int batchSize ) {
+
+ for ( size_t i=0; i<_conns.size(); i++ ) {
+ try {
+ auto_ptr<DBClientCursor> cursor =
+ _conns[i]->query( ns , query , nToReturn , nToSkip , fieldsToReturn , queryOptions , batchSize );
+ if ( cursor.get() )
+ return cursor;
+ log() << "query failed to: " << _conns[i]->toString() << " no data" << endl;
+ }
+ catch ( std::exception& e ) {
+ log() << "query failed to: " << _conns[i]->toString() << " exception: " << e.what() << endl;
+ }
+ catch ( ... ) {
+ log() << "query failed to: " << _conns[i]->toString() << " exception" << endl;
+ }
+ }
+ throw UserException( 8002 , "all servers down!" );
+ }
+
+ auto_ptr<DBClientCursor> SyncClusterConnection::getMore( const string &ns, long long cursorId, int nToReturn, int options ) {
+ uassert( 10022 , "SyncClusterConnection::getMore not supported yet" , 0);
+ auto_ptr<DBClientCursor> c;
+ return c;
+ }
+
+ void SyncClusterConnection::insert( const string &ns, BSONObj obj , int flags) {
+
+ uassert( 13119 , (string)"SyncClusterConnection::insert obj has to have an _id: " + obj.jsonString() ,
+ ns.find( ".system.indexes" ) != string::npos || obj["_id"].type() );
+
+ string errmsg;
+ if ( ! prepare( errmsg ) )
+ throw UserException( 8003 , (string)"SyncClusterConnection::insert prepare failed: " + errmsg );
+
+ for ( size_t i=0; i<_conns.size(); i++ ) {
+ _conns[i]->insert( ns , obj , flags);
+ }
+
+ _checkLast();
+ }
+
+ void SyncClusterConnection::insert( const string &ns, const vector< BSONObj >& v , int flags) {
+ uassert( 10023 , "SyncClusterConnection bulk insert not implemented" , 0);
+ }
+
+ void SyncClusterConnection::remove( const string &ns , Query query, bool justOne ) {
+ string errmsg;
+ if ( ! prepare( errmsg ) )
+ throw UserException( 8020 , (string)"SyncClusterConnection::remove prepare failed: " + errmsg );
+
+ for ( size_t i=0; i<_conns.size(); i++ ) {
+ _conns[i]->remove( ns , query , justOne );
+ }
+
+ _checkLast();
+ }
+
+ void SyncClusterConnection::update( const string &ns , Query query , BSONObj obj , bool upsert , bool multi ) {
+
+ if ( upsert ) {
+ uassert( 13120 , "SyncClusterConnection::update upsert query needs _id" , query.obj["_id"].type() );
+ }
+
+ if ( _writeConcern ) {
+ string errmsg;
+ if ( ! prepare( errmsg ) )
+                throw UserException( 8005 , (string)"SyncClusterConnection::update prepare failed: " + errmsg );
+ }
+
+ for ( size_t i = 0; i < _conns.size(); i++ ) {
+ try {
+ _conns[i]->update( ns , query , obj , upsert , multi );
+ }
+ catch ( std::exception& ) {
+ if ( _writeConcern )
+ throw; // rethrow the original exception; 'throw e' would slice it down to std::exception
+ }
+ }
+
+ if ( _writeConcern ) {
+ _checkLast();
+ assert( _lastErrors.size() > 1 );
+
+ int a = _lastErrors[0]["n"].numberInt();
+ for ( unsigned i=1; i<_lastErrors.size(); i++ ) {
+ int b = _lastErrors[i]["n"].numberInt();
+ if ( a == b )
+ continue;
+
+ throw UpdateNotTheSame( 8017 ,
+ str::stream()
+ << "update not consistent "
+ << " ns: " << ns
+ << " query: " << query.toString()
+ << " update: " << obj
+ << " gle1: " << _lastErrors[0]
+ << " gle2: " << _lastErrors[i] ,
+ _connAddresses , _lastErrors );
+ }
+ }
+ }
+
+ string SyncClusterConnection::_toString() const {
+ stringstream ss;
+ ss << "SyncClusterConnection [" << _address << "]";
+ return ss.str();
+ }
+
+ bool SyncClusterConnection::call( Message &toSend, Message &response, bool assertOk , string * actualServer ) {
+ uassert( 8006 , "SyncClusterConnection::call can only be used directly for dbQuery" ,
+ toSend.operation() == dbQuery );
+
+ DbMessage d( toSend );
+ uassert( 8007 , "SyncClusterConnection::call can't handle $cmd" , strstr( d.getns(), "$cmd" ) == 0 );
+
+ for ( size_t i=0; i<_conns.size(); i++ ) {
+ try {
+ bool ok = _conns[i]->call( toSend , response , assertOk );
+ if ( ok ) {
+ if ( actualServer )
+ *actualServer = _connAddresses[i];
+ return ok;
+ }
+ log() << "call failed to: " << _conns[i]->toString() << " no data" << endl;
+ }
+ catch ( ... ) {
+ log() << "call failed to: " << _conns[i]->toString() << " exception" << endl;
+ }
+ }
+ throw UserException( 8008 , "all servers down!" );
+ }
+
+ void SyncClusterConnection::say( Message &toSend, bool isRetry ) {
+ string errmsg;
+ if ( ! prepare( errmsg ) )
+ throw UserException( 13397 , (string)"SyncClusterConnection::say prepare failed: " + errmsg );
+
+ for ( size_t i=0; i<_conns.size(); i++ ) {
+ _conns[i]->say( toSend );
+ }
+
+ _checkLast();
+ }
+
+ void SyncClusterConnection::sayPiggyBack( Message &toSend ) {
+ assert(0);
+ }
+
+ int SyncClusterConnection::_lockType( const string& name ) {
+ {
+ scoped_lock lk(_mutex);
+ map<string,int>::iterator i = _lockTypes.find( name );
+ if ( i != _lockTypes.end() )
+ return i->second;
+ }
+
+ BSONObj info;
+ uassert( 13053 , str::stream() << "help failed: " << info , _commandOnActive( "admin" , BSON( name << "1" << "help" << 1 ) , info ) );
+
+ int lockType = info["lockType"].numberInt();
+
+ scoped_lock lk(_mutex);
+ _lockTypes[name] = lockType;
+ return lockType;
+ }
+
+ void SyncClusterConnection::killCursor( long long cursorID ) {
+ // should never need to do this
+ assert(0);
+ }
+
+ void SyncClusterConnection::setAllSoTimeouts( double socketTimeout ){
+ _socketTimeout = socketTimeout;
+ for ( size_t i=0; i<_conns.size(); i++ )
+ if( _conns[i] ) _conns[i]->setSoTimeout( socketTimeout );
+ }
+
+}
diff --git a/src/mongo/client/syncclusterconnection.h b/src/mongo/client/syncclusterconnection.h
new file mode 100644
index 00000000000..d2374ddaa45
--- /dev/null
+++ b/src/mongo/client/syncclusterconnection.h
@@ -0,0 +1,147 @@
+// @file syncclusterconnection.h
+
+/*
+ * Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../pch.h"
+#include "dbclient.h"
+#include "redef_macros.h"
+
+namespace mongo {
+
+ /**
+ * This is a connection to a cluster of servers that operate as one
+ * for super high durability.
+ *
+ * Write operations are two-phase. First, all nodes are asked to fsync. If successful
+ * everywhere, the write is sent everywhere and then followed by an fsync. There is no
+ * rollback if a problem occurs during the second phase. Naturally, with all these fsyncs,
+ * these operations will be quite slow -- use sparingly.
+ *
+ * Read operations are sent to a single random node.
+ *
+ * The class checks whether a command is read- or write-style: read-lock
+ * commands are sent to a single node, while write-style commands go to all
+ * nodes in two phases.
+ */
+ class SyncClusterConnection : public DBClientBase {
+ public:
+ /**
+ * @param commaSeparated should be 3 comma-separated hosts
+ */
+ SyncClusterConnection( const list<HostAndPort> &, double socketTimeout = 0);
+ SyncClusterConnection( string commaSeparated, double socketTimeout = 0);
+ SyncClusterConnection( string a , string b , string c, double socketTimeout = 0 );
+ ~SyncClusterConnection();
+
+ /**
+ * @return true if all servers are up and ready for writes
+ */
+ bool prepare( string& errmsg );
+
+ /**
+ * runs fsync on all servers
+ */
+ bool fsync( string& errmsg );
+
+ // --- from DBClientInterface
+
+ virtual BSONObj findOne(const string &ns, const Query& query, const BSONObj *fieldsToReturn, int queryOptions);
+
+ virtual auto_ptr<DBClientCursor> query(const string &ns, Query query, int nToReturn, int nToSkip,
+ const BSONObj *fieldsToReturn, int queryOptions, int batchSize );
+
+ virtual auto_ptr<DBClientCursor> getMore( const string &ns, long long cursorId, int nToReturn, int options );
+
+ virtual void insert( const string &ns, BSONObj obj, int flags=0);
+
+ virtual void insert( const string &ns, const vector< BSONObj >& v, int flags=0);
+
+ virtual void remove( const string &ns , Query query, bool justOne );
+
+ virtual void update( const string &ns , Query query , BSONObj obj , bool upsert , bool multi );
+
+ virtual bool call( Message &toSend, Message &response, bool assertOk , string * actualServer );
+ virtual void say( Message &toSend, bool isRetry = false );
+ virtual void sayPiggyBack( Message &toSend );
+
+ virtual void killCursor( long long cursorID );
+
+ virtual string getServerAddress() const { return _address; }
+ virtual bool isFailed() const { return false; }
+ virtual string toString() { return _toString(); }
+
+ virtual BSONObj getLastErrorDetailed(bool fsync=false, bool j=false, int w=0, int wtimeout=0);
+
+ virtual bool callRead( Message& toSend , Message& response );
+
+ virtual ConnectionString::ConnectionType type() const { return ConnectionString::SYNC; }
+
+ void setAllSoTimeouts( double socketTimeout );
+ double getSoTimeout() const { return _socketTimeout; }
+
+ virtual bool auth(const string &dbname, const string &username, const string &password_text, string& errmsg, bool digestPassword);
+
+ virtual bool lazySupported() const { return false; }
+ private:
+ SyncClusterConnection( SyncClusterConnection& prev, double socketTimeout = 0 );
+ string _toString() const;
+ bool _commandOnActive(const string &dbname, const BSONObj& cmd, BSONObj &info, int options=0);
+ auto_ptr<DBClientCursor> _queryOnActive(const string &ns, Query query, int nToReturn, int nToSkip,
+ const BSONObj *fieldsToReturn, int queryOptions, int batchSize );
+ int _lockType( const string& name );
+ void _checkLast();
+ void _connect( string host );
+
+ string _address;
+ vector<string> _connAddresses;
+ vector<DBClientConnection*> _conns;
+ map<string,int> _lockTypes;
+ mongo::mutex _mutex;
+
+ vector<BSONObj> _lastErrors;
+
+ double _socketTimeout;
+ };
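+
+ /* Example usage (a minimal sketch; the host names and namespace below are
+ illustrative, not fixed by this class):
+
+ SyncClusterConnection conn( "cfg1:27019,cfg2:27019,cfg3:27019" );
+ // write path: fsync-prepare on all nodes, then the write plus a
+ // getLastError check on every node
+ conn.insert( "config.shards" , BSON( "_id" << "s1" << "host" << "h1:27018" ) );
+ // read path: the query goes to a single node
+ BSONObj doc = conn.findOne( "config.shards" , Query() , 0 , 0 );
+ */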
+
+ class UpdateNotTheSame : public UserException {
+ public:
+ UpdateNotTheSame( int code , const string& msg , const vector<string>& addrs , const vector<BSONObj>& lastErrors )
+ : UserException( code , msg ) , _addrs( addrs ) , _lastErrors( lastErrors ) {
+ assert( _addrs.size() == _lastErrors.size() );
+ }
+
+ virtual ~UpdateNotTheSame() throw() {
+ }
+
+ unsigned size() const {
+ return _addrs.size();
+ }
+
+ pair<string,BSONObj> operator[](unsigned i) const {
+ return make_pair( _addrs[i] , _lastErrors[i] );
+ }
+
+ private:
+
+ vector<string> _addrs;
+ vector<BSONObj> _lastErrors;
+ };
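+
+ /* Handling sketch (illustrative; ns, q, o are placeholders): after an
+ update through SyncClusterConnection, a caller can inspect the per-node
+ getLastError results when the nodes disagree on how many documents were
+ touched:
+
+ try {
+ conn.update( ns , q , o , false , false );
+ }
+ catch ( UpdateNotTheSame& e ) {
+ for ( unsigned i = 0; i < e.size(); i++ ) {
+ pair<string,BSONObj> p = e[i]; // (server address, its getLastError result)
+ log() << p.first << " reported: " << p.second << endl;
+ }
+ }
+ */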
+
+} // namespace mongo
+
+#include "undef_macros.h"
diff --git a/src/mongo/client/undef_macros.h b/src/mongo/client/undef_macros.h
new file mode 100644
index 00000000000..30ece615747
--- /dev/null
+++ b/src/mongo/client/undef_macros.h
@@ -0,0 +1,61 @@
+/** @file undef_macros.h remove mongo implementation macros after using */
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// If you define a new global un-prefixed macro, please add it here and in redef_macros
+
+// #pragma once // this file is intended to be processed multiple times
+
+
+/** MONGO_EXPOSE_MACROS - when defined, indicates that you are compiling a mongo program rather
+ than just using the C++ driver.
+*/
+#if !defined(MONGO_EXPOSE_MACROS) && !defined(MONGO_MACROS_CLEANED)
+
+// util/allocator.h
+#undef malloc
+#undef realloc
+
+// util/assert_util.h
+#undef assert
+#undef dassert
+#undef wassert
+#undef massert
+#undef uassert
+#undef BOOST_CHECK_EXCEPTION
+#undef DESTRUCTOR_GUARD
+
+// util/goodies.h
+#undef PRINT
+#undef PRINTFL
+#undef asctime
+#undef gmtime
+#undef localtime
+#undef ctime
+
+// util/debug_util.h
+#undef DEV
+#undef DEBUGGING
+#undef SOMETIMES
+#undef OCCASIONALLY
+#undef RARELY
+#undef ONCE
+
+// util/log.h
+#undef LOG
+
+#define MONGO_MACROS_CLEANED
+#endif
diff --git a/src/mongo/db/background.h b/src/mongo/db/background.h
new file mode 100644
index 00000000000..ea424c97107
--- /dev/null
+++ b/src/mongo/db/background.h
@@ -0,0 +1,56 @@
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* background.h
+
+ Concurrency coordination for administrative operations.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ /* These are administrative operations / jobs run in the background for a
+ namespace. Only one at a time is permitted per namespace; while one is in
+ progress, other major NamespaceDetails manipulations (such as dropping the
+ ns or db) are not allowed even in the foreground and must uassert instead.
+
+ It's assumed this is not for super-high RPS things, so we don't do
+ anything special in the implementation here to be fast.
+ */
+ class BackgroundOperation : public boost::noncopyable {
+ public:
+ static bool inProgForDb(const char *db);
+ static bool inProgForNs(const char *ns);
+ static void assertNoBgOpInProgForDb(const char *db);
+ static void assertNoBgOpInProgForNs(const char *ns);
+ static void dump(stringstream&);
+
+ /* check for in progress before instantiating */
+ BackgroundOperation(const char *ns);
+
+ virtual ~BackgroundOperation();
+
+ private:
+ NamespaceString _ns;
+ static map<string, unsigned> dbsInProg;
+ static set<string> nsInProg;
+ };
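+
+ /* Typical use (a minimal sketch; the surrounding function is illustrative):
+
+ void adminJobOnNs( const char *ns ) {
+ if ( BackgroundOperation::inProgForNs( ns ) )
+ return; // per the note above, check before instantiating
+ BackgroundOperation bgOp( ns ); // registers ns for this scope
+ // ... long-running work against ns ...
+ } // dtor deregisters ns
+
+ While bgOp is alive, assertNoBgOpInProgForNs(ns) uasserts, so foreground
+ drops of the ns or its db are refused rather than racing with the job.
+ */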
+
+} // namespace mongo
+
diff --git a/src/mongo/db/btree.cpp b/src/mongo/db/btree.cpp
new file mode 100644
index 00000000000..5c55fad33c3
--- /dev/null
+++ b/src/mongo/db/btree.cpp
@@ -0,0 +1,1980 @@
+// btree.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "db.h"
+#include "btree.h"
+#include "pdfile.h"
+#include "json.h"
+#include "clientcursor.h"
+#include "client.h"
+#include "dbhelpers.h"
+#include "curop-inl.h"
+#include "stats/counters.h"
+#include "dur_commitjob.h"
+#include "btreebuilder.h"
+#include "../util/unittest.h"
+#include "../server.h"
+
+namespace mongo {
+
+ BOOST_STATIC_ASSERT( Record::HeaderSize == 16 );
+ BOOST_STATIC_ASSERT( Record::HeaderSize + BtreeData_V1::BucketSize == 8192 );
+
+ NOINLINE_DECL void checkFailed(unsigned line) {
+ static time_t last;
+ if( time(0) - last >= 10 ) {
+ last = time(0); // rate-limit: assert at most once per 10 seconds
+ msgasserted(15898, str::stream() << "error in index, possible corruption, consider repairing " << line);
+ }
+ }
+
+ /** data check. like assert, but gives a reasonable error message to the user. */
+#define check(expr) do { if( !(expr) ) { checkFailed(__LINE__); } } while( 0 )
+
+#define VERIFYTHISLOC dassert( thisLoc.btree<V>() == this );
+
+ template< class Loc >
+ __KeyNode<Loc> & __KeyNode<Loc>::writing() const {
+ return *getDur().writing( const_cast< __KeyNode<Loc> * >( this ) );
+ }
+
+ // BucketBasics::lowWaterMark()
+ //
+ // We define this value as the maximum number of bytes such that, if we have
+ // fewer than this many bytes, we must be able to either merge with or receive
+ // keys from any neighboring node. If our utilization goes below this value we
+ // know we can bring up the utilization with a simple operation. Ignoring the
+ // 90/10 split policy which is sometimes employed and our 'unused' nodes, this
+ // is a lower bound on bucket utilization for non root buckets.
+ //
+ // Note that the exact value here depends on the implementation of
+ // rebalancedSeparatorPos(). The conditions for lowWaterMark - 1 are as
+ // follows: We know we cannot merge with the neighbor, so the total data size
+ // for us, the neighbor, and the separator must be at least
+ // BtreeBucket<V>::bodySize() + 1. We must be able to accept one key of any
+ // allowed size, so our size plus storage for that additional key must be
+ // <= BtreeBucket<V>::bodySize() / 2. This way, with the extra key we'll have a
+ // new bucket data size < half the total data size and by the implementation
+ // of rebalancedSeparatorPos() the key must be added.
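+ //
+ // Worked example (all numbers hypothetical): if bodySize() were 8000 bytes
+ // and the largest allowed key plus its _KeyNode were 1024 bytes, then any
+ // bucket holding at most 8000/2 - 1024 = 2976 bytes could always accept one
+ // more key after a rebalance, so lowWaterMark could be set near 2977.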
+
+ static const int split_debug = 0;
+ static const int insert_debug = 0;
+
+ /**
+ * this error is ok/benign when doing background indexing -- that logic in pdfile checks explicitly
+ * for the 10287 error code.
+ */
+ static void alreadyInIndex() {
+ // we don't use massert() here as that does logging and this is 'benign' - see catches in _indexRecord()
+ throw MsgAssertionException(10287, "btree: key+recloc already in index");
+ }
+
+ /* BucketBasics --------------------------------------------------- */
+
+ template< class V >
+ void BucketBasics<V>::assertWritable() {
+ if( cmdLine.dur )
+ dur::assertAlreadyDeclared(this, V::BucketSize);
+ }
+
+ template< class V >
+ string BtreeBucket<V>::bucketSummary() const {
+ stringstream ss;
+ ss << " Bucket info:" << endl;
+ ss << " n: " << this->n << endl;
+ ss << " parent: " << this->parent.toString() << endl;
+ ss << " nextChild: " << this->nextChild.toString() << endl;
+ ss << " flags:" << this->flags << endl;
+ ss << " emptySize: " << this->emptySize << " topSize: " << this->topSize << endl;
+ return ss.str();
+ }
+
+ template< class V >
+ int BucketBasics<V>::Size() const {
+ return V::BucketSize;
+ }
+
+ template< class V >
+ void BucketBasics<V>::_shape(int level, stringstream& ss) const {
+ for ( int i = 0; i < level; i++ ) ss << ' ';
+ ss << "*[" << this->n << "]\n";
+ for ( int i = 0; i < this->n; i++ ) {
+ if ( !k(i).prevChildBucket.isNull() ) {
+ DiskLoc ll = k(i).prevChildBucket;
+ ll.btree<V>()->_shape(level+1,ss);
+ }
+ }
+ if ( !this->nextChild.isNull() ) {
+ DiskLoc ll = this->nextChild;
+ ll.btree<V>()->_shape(level+1,ss);
+ }
+ }
+
+ int bt_fv=0;
+ int bt_dmp=0;
+
+ template< class V >
+ void BtreeBucket<V>::dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const {
+ bt_dmp=1;
+ fullValidate(thisLoc, order);
+ bt_dmp=0;
+ }
+
+ template< class V >
+ long long BtreeBucket<V>::fullValidate(const DiskLoc& thisLoc, const BSONObj &order, long long *unusedCount, bool strict, unsigned depth) const {
+ {
+ bool f = false;
+ assert( f = true );
+ massert( 10281 , "assert is misdefined", f);
+ }
+
+ killCurrentOp.checkForInterrupt();
+ this->assertValid(order, true);
+
+ if ( bt_dmp ) {
+ _log() << thisLoc.toString() << ' ';
+ ((BtreeBucket *) this)->dump(depth);
+ }
+
+ // keycount
+ long long kc = 0;
+
+ for ( int i = 0; i < this->n; i++ ) {
+ const _KeyNode& kn = this->k(i);
+
+ if ( kn.isUsed() ) {
+ kc++;
+ }
+ else {
+ if ( unusedCount ) {
+ ++( *unusedCount );
+ }
+ }
+ if ( !kn.prevChildBucket.isNull() ) {
+ DiskLoc left = kn.prevChildBucket;
+ const BtreeBucket *b = left.btree<V>();
+ if ( strict ) {
+ assert( b->parent == thisLoc );
+ }
+ else {
+ wassert( b->parent == thisLoc );
+ }
+ kc += b->fullValidate(kn.prevChildBucket, order, unusedCount, strict, depth+1);
+ }
+ }
+ if ( !this->nextChild.isNull() ) {
+ DiskLoc ll = this->nextChild;
+ const BtreeBucket *b = ll.btree<V>();
+ if ( strict ) {
+ assert( b->parent == thisLoc );
+ }
+ else {
+ wassert( b->parent == thisLoc );
+ }
+ kc += b->fullValidate(this->nextChild, order, unusedCount, strict, depth+1);
+ }
+
+ return kc;
+ }
+
+ int nDumped = 0;
+
+ template< class V >
+ void BucketBasics<V>::assertValid(const Ordering &order, bool force) const {
+ if ( !debug && !force )
+ return;
+ {
+ int foo = this->n;
+ wassert( foo >= 0 && this->n < Size() );
+ foo = this->emptySize;
+ wassert( foo >= 0 && this->emptySize < V::BucketSize );
+ wassert( this->topSize >= this->n && this->topSize <= V::BucketSize );
+ }
+
+ // this is very slow so don't do often
+ {
+ static int _k;
+ if( ++_k % 128 )
+ return;
+ }
+
+ DEV {
+ // slow:
+ for ( int i = 0; i < this->n-1; i++ ) {
+ Key k1 = keyNode(i).key;
+ Key k2 = keyNode(i+1).key;
+ int z = k1.woCompare(k2, order); //OK
+ if ( z > 0 ) {
+ out() << "ERROR: btree key order corrupt. Keys:" << endl;
+ if ( ++nDumped < 5 ) {
+ for ( int j = 0; j < this->n; j++ ) {
+ out() << " " << keyNode(j).key.toString() << endl;
+ }
+ ((BtreeBucket<V> *) this)->dump();
+ }
+ wassert(false);
+ break;
+ }
+ else if ( z == 0 ) {
+ if ( !(k(i).recordLoc < k(i+1).recordLoc) ) {
+ out() << "ERROR: btree key order corrupt (recordloc's wrong):" << endl;
+ out() << " k(" << i << ")" << keyNode(i).key.toString() << " RL:" << k(i).recordLoc.toString() << endl;
+ out() << " k(" << i+1 << ")" << keyNode(i+1).key.toString() << " RL:" << k(i+1).recordLoc.toString() << endl;
+ wassert( k(i).recordLoc < k(i+1).recordLoc );
+ }
+ }
+ }
+ }
+ else {
+ //faster:
+ if ( this->n > 1 ) {
+ Key k1 = keyNode(0).key;
+ Key k2 = keyNode(this->n-1).key;
+ int z = k1.woCompare(k2, order);
+ //wassert( z <= 0 );
+ if ( z > 0 ) {
+ problem() << "btree keys out of order" << '\n';
+ ONCE {
+ ((BtreeBucket<V> *) this)->dump();
+ }
+ assert(false);
+ }
+ }
+ }
+ }
+
+ template< class V >
+ inline void BucketBasics<V>::markUnused(int keypos) {
+ assert( keypos >= 0 && keypos < this->n );
+ k(keypos).setUnused();
+ }
+
+ template< class V >
+ inline int BucketBasics<V>::totalDataSize() const {
+ return (int) (Size() - (this->data-(char*)this));
+ }
+
+ template< class V >
+ void BucketBasics<V>::init() {
+ this->_init();
+ this->parent.Null();
+ this->nextChild.Null();
+ this->flags = Packed;
+ this->n = 0;
+ this->emptySize = totalDataSize();
+ this->topSize = 0;
+ }
+
+ /** see _alloc */
+ template< class V >
+ inline void BucketBasics<V>::_unalloc(int bytes) {
+ this->topSize -= bytes;
+ this->emptySize += bytes;
+ }
+
+ /**
+ * we allocate space from the end of the buffer for data.
+ * the keynodes grow from the front.
+ */
+ template< class V >
+ inline int BucketBasics<V>::_alloc(int bytes) {
+ assert( this->emptySize >= bytes );
+ this->topSize += bytes;
+ this->emptySize -= bytes;
+ int ofs = totalDataSize() - this->topSize;
+ assert( ofs > 0 );
+ return ofs;
+ }
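+
+ // Layout sketch (proportions illustrative): _KeyNodes grow from the front
+ // of the data area while _alloc() hands out key-data space from the back,
+ // with emptySize bytes in between:
+ //
+ //   [k(0)][k(1)]...[k(n-1)]   <-- emptySize -->   [key data ... key data]
+ //   ^ data                                        ^ ofs = totalDataSize() - topSize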
+
+ template< class V >
+ void BucketBasics<V>::_delKeyAtPos(int keypos, bool mayEmpty) {
+ // TODO This should be keypos < n
+ assert( keypos >= 0 && keypos <= this->n );
+ assert( childForPos(keypos).isNull() );
+ // TODO audit cases where nextChild is null
+ assert( ( mayEmpty && this->n > 0 ) || this->n > 1 || this->nextChild.isNull() );
+ this->emptySize += sizeof(_KeyNode);
+ this->n--;
+ for ( int j = keypos; j < this->n; j++ )
+ k(j) = k(j+1);
+ setNotPacked();
+ }
+
+ /**
+ * pull rightmost key from the bucket. this version requires its right child to be null so it
+ * does not bother returning that value.
+ */
+ template< class V >
+ void BucketBasics<V>::popBack(DiskLoc& recLoc, Key &key) {
+ massert( 10282 , "n==0 in btree popBack()", this->n > 0 );
+ assert( k(this->n-1).isUsed() ); // no unused skipping in this function at this point - btreebuilder doesn't require that
+ KeyNode kn = keyNode(this->n-1);
+ recLoc = kn.recordLoc;
+ key.assign(kn.key);
+ int keysize = kn.key.dataSize();
+
+ massert( 10283 , "rchild not null in btree popBack()", this->nextChild.isNull());
+
+ // weirdly, we also put the rightmost down pointer in nextchild, even when bucket isn't full.
+ this->nextChild = kn.prevChildBucket;
+
+ this->n--;
+ // This is risky because the key we are returning points to this unalloc'ed memory,
+ // and we are assuming that the last key points to the last allocated
+ // bson region.
+ this->emptySize += sizeof(_KeyNode);
+ _unalloc(keysize);
+ }
+
+ /** add a key. must be > all existing. be careful to set next ptr right. */
+ template< class V >
+ bool BucketBasics<V>::_pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild) {
+ int bytesNeeded = key.dataSize() + sizeof(_KeyNode);
+ if ( bytesNeeded > this->emptySize )
+ return false;
+ assert( bytesNeeded <= this->emptySize );
+ if( this->n ) {
+ const KeyNode klast = keyNode(this->n-1);
+ if( klast.key.woCompare(key, order) > 0 ) {
+ log() << "btree bucket corrupt? consider reindexing or running validate command" << endl;
+ log() << " klast: " << keyNode(this->n-1).key.toString() << endl;
+ log() << " key: " << key.toString() << endl;
+ DEV klast.key.woCompare(key, order);
+ assert(false);
+ }
+ }
+ this->emptySize -= sizeof(_KeyNode);
+ _KeyNode& kn = k(this->n++);
+ kn.prevChildBucket = prevChild;
+ kn.recordLoc = recordLoc;
+ kn.setKeyDataOfs( (short) _alloc(key.dataSize()) );
+ short ofs = kn.keyDataOfs();
+ char *p = dataAt(ofs);
+ memcpy(p, key.data(), key.dataSize());
+
+ return true;
+ }
+
+ /* durability note
+ we do separate intent declarations herein. arguably one could just declare
+ the whole bucket given we do group commits. this is something we could investigate
+ later as to what is faster under what situations.
+ */
+ /** insert a key in a bucket with no complexity -- no splits required
+ @return false if a split is required.
+ */
+ template< class V >
+ bool BucketBasics<V>::basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const Key& key, const Ordering &order) const {
+ check( this->n < 1024 );
+ check( keypos >= 0 && keypos <= this->n );
+ int bytesNeeded = key.dataSize() + sizeof(_KeyNode);
+ if ( bytesNeeded > this->emptySize ) {
+ _pack(thisLoc, order, keypos);
+ if ( bytesNeeded > this->emptySize )
+ return false;
+ }
+
+ BucketBasics *b;
+ {
+ const char *p = (const char *) &k(keypos);
+ const char *q = (const char *) &k(this->n+1);
+ // declare that we will write to [k(keypos),k(n)]
+ // todo: this writes a medium amount to the journal. we may want to add a verb "shift" to the redo log so
+ // we can log a very small amount.
+ b = (BucketBasics*) getDur().writingAtOffset((void *) this, p-(char*)this, q-p);
+
+ // e.g. n==3, keypos==2
+ // 1 4 9
+ // ->
+ // 1 4 _ 9
+ for ( int j = this->n; j > keypos; j-- ) // make room
+ b->k(j) = b->k(j-1);
+ }
+
+ getDur().declareWriteIntent(&b->emptySize, sizeof(this->emptySize)+sizeof(this->topSize)+sizeof(this->n));
+ b->emptySize -= sizeof(_KeyNode);
+ b->n++;
+
+ // This _KeyNode was marked for writing above.
+ _KeyNode& kn = b->k(keypos);
+ kn.prevChildBucket.Null();
+ kn.recordLoc = recordLoc;
+ kn.setKeyDataOfs((short) b->_alloc(key.dataSize()) );
+ char *p = b->dataAt(kn.keyDataOfs());
+ getDur().declareWriteIntent(p, key.dataSize());
+ memcpy(p, key.data(), key.dataSize());
+ return true;
+ }
+
+ /**
+ * With this implementation, refPos == 0 disregards effect of refPos.
+ * index > 0 prevents creation of an empty bucket.
+ */
+ template< class V >
+ bool BucketBasics<V>::mayDropKey( int index, int refPos ) const {
+ return index > 0 && ( index != refPos ) && k( index ).isUnused() && k( index ).prevChildBucket.isNull();
+ }
+
+ template< class V >
+ int BucketBasics<V>::packedDataSize( int refPos ) const {
+ if ( this->flags & Packed ) {
+ return V::BucketSize - this->emptySize - headerSize();
+ }
+ int size = 0;
+ for( int j = 0; j < this->n; ++j ) {
+ if ( mayDropKey( j, refPos ) ) {
+ continue;
+ }
+ size += keyNode( j ).key.dataSize() + sizeof( _KeyNode );
+ }
+ return size;
+ }
+
+ /**
+ * when we delete things we just leave empty space until the node is
+ * full and then we repack it.
+ */
+ template< class V >
+ void BucketBasics<V>::_pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const {
+ if ( this->flags & Packed )
+ return;
+
+ VERIFYTHISLOC
+
+ /** TODO perhaps this can be optimized. for example if packing does no write, we can skip intent decl.
+ an empirical approach is probably better than just adding new code: perhaps the bucket would need
+ declaration anyway within the group commit interval, in which case we would just be adding
+ code and complexity without benefit.
+ */
+ thisLoc.btreemod<V>()->_packReadyForMod(order, refPos);
+ }
+
+ /** version when write intent already declared */
+ template< class V >
+ void BucketBasics<V>::_packReadyForMod( const Ordering &order, int &refPos ) {
+ assertWritable();
+
+ if ( this->flags & Packed )
+ return;
+
+ int tdz = totalDataSize();
+ char temp[V::BucketSize];
+ int ofs = tdz;
+ this->topSize = 0;
+ int i = 0;
+ for ( int j = 0; j < this->n; j++ ) {
+ if( mayDropKey( j, refPos ) ) {
+ continue; // key is unused and has no children - drop it
+ }
+ if( i != j ) {
+ if ( refPos == j ) {
+ refPos = i; // i < j so j will never be refPos again
+ }
+ k( i ) = k( j );
+ }
+ short ofsold = k(i).keyDataOfs();
+ int sz = keyNode(i).key.dataSize();
+ ofs -= sz;
+ this->topSize += sz;
+ memcpy(temp+ofs, dataAt(ofsold), sz);
+ k(i).setKeyDataOfsSavingUse( ofs );
+ ++i;
+ }
+ if ( refPos == this->n ) {
+ refPos = i;
+ }
+ this->n = i;
+ int dataUsed = tdz - ofs;
+ memcpy(this->data + ofs, temp + ofs, dataUsed);
+
+ // assertWritable();
+ // TEMP TEST getDur().declareWriteIntent(this, sizeof(*this));
+
+ this->emptySize = tdz - dataUsed - this->n * sizeof(_KeyNode);
+ {
+ int foo = this->emptySize;
+ assert( foo >= 0 );
+ }
+
+ setPacked();
+
+ assertValid( order );
+ }
+
+ template< class V >
+ inline void BucketBasics<V>::truncateTo(int N, const Ordering &order, int &refPos) {
+ d.dbMutex.assertWriteLocked();
+ assertWritable();
+
+ this->n = N;
+ setNotPacked();
+ _packReadyForMod( order, refPos );
+ }
+
+ /**
+ * In the standard btree algorithm, we would split based on the
+ * existing keys _and_ the new key. But that's more work to
+ * implement, so we split the existing keys and then add the new key.
+ *
+ * There are several published heuristic algorithms for doing splits,
+ * but basically what you want are (1) even balancing between the two
+ * sides and (2) a small split key so the parent can have a larger
+ * branching factor.
+ *
+ * We just have a simple algorithm right now: if a key includes the
+ * halfway point (or 10% way point) in terms of bytes, split on that key;
+ * otherwise split on the key immediately to the left of the halfway
+ * point (or 10% point).
+ *
+ * This function is expected to be called on a packed bucket.
+ */
+ template< class V >
+ int BucketBasics<V>::splitPos( int keypos ) const {
+ assert( this->n > 2 );
+ int split = 0;
+ int rightSize = 0;
+ // when splitting a btree node, if the new key is greater than all the other keys, we should not do an even split, but a 90/10 split.
+ // see SERVER-983
+ // TODO I think we only want to do the 90% split on the rhs node of the tree.
+ int rightSizeLimit = ( this->topSize + sizeof( _KeyNode ) * this->n ) / ( keypos == this->n ? 10 : 2 );
+ for( int i = this->n - 1; i > -1; --i ) {
+ rightSize += keyNode( i ).key.dataSize() + sizeof( _KeyNode );
+ if ( rightSize > rightSizeLimit ) {
+ split = i;
+ break;
+ }
+ }
+ // safeguards - we must not create an empty bucket
+ if ( split < 1 ) {
+ split = 1;
+ }
+ else if ( split > this->n - 2 ) {
+ split = this->n - 2;
+ }
+
+ return split;
+ }
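+
+ /* Worked example (numbers hypothetical): with topSize + n*sizeof(_KeyNode)
+ at 4000 bytes, an insert in the middle aims for a right side of at most
+ 4000/2 = 2000 bytes, while an insert of the rightmost key (keypos == n)
+ aims for at most 4000/10 = 400 bytes, keeping ~90% of the data on the
+ left so that ascending inserts leave well-packed buckets behind. */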
+
+ template< class V >
+ void BucketBasics<V>::reserveKeysFront( int nAdd ) {
+ assert( this->emptySize >= int( sizeof( _KeyNode ) * nAdd ) );
+ this->emptySize -= sizeof( _KeyNode ) * nAdd;
+ for( int i = this->n - 1; i > -1; --i ) {
+ k( i + nAdd ) = k( i );
+ }
+ this->n += nAdd;
+ }
+
+ template< class V >
+ void BucketBasics<V>::setKey( int i, const DiskLoc recordLoc, const Key &key, const DiskLoc prevChildBucket ) {
+ _KeyNode &kn = k( i );
+ kn.recordLoc = recordLoc;
+ kn.prevChildBucket = prevChildBucket;
+ short ofs = (short) _alloc( key.dataSize() );
+ kn.setKeyDataOfs( ofs );
+ char *p = dataAt( ofs );
+ memcpy( p, key.data(), key.dataSize() );
+ }
+
+ template< class V >
+ void BucketBasics<V>::dropFront( int nDrop, const Ordering &order, int &refpos ) {
+ for( int i = nDrop; i < this->n; ++i ) {
+ k( i - nDrop ) = k( i );
+ }
+ this->n -= nDrop;
+ setNotPacked();
+ _packReadyForMod( order, refpos );
+ }
+
+ /* - BtreeBucket --------------------------------------------------- */
+
+ /** @return largest key in the subtree. */
+ template< class V >
+ void BtreeBucket<V>::findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey) {
+ DiskLoc loc = thisLoc;
+ while ( 1 ) {
+ const BtreeBucket *b = loc.btree<V>();
+ if ( !b->nextChild.isNull() ) {
+ loc = b->nextChild;
+ continue;
+ }
+
+ assert(b->n>0);
+ largestLoc = loc;
+ largestKey = b->n-1;
+
+ break;
+ }
+ }
+
+ /**
+ * NOTE Currently the Ordering implementation assumes a compound index will
+ * not have more keys than an unsigned variable has bits. The same
+ * assumption is used in the implementation below with respect to the 'mask'
+ * variable.
+ *
+ * @param l a regular bsonobj
+ * @param rBegin composed partly of an existing bsonobj, and the remaining keys are taken from a vector of elements that frequently changes
+ *
+ * see
+ * jstests/index_check6.js
+ * https://jira.mongodb.org/browse/SERVER-371
+ */
+ /* static */
+ template< class V >
+ int BtreeBucket<V>::customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction ) {
+ BSONObjIterator ll( l );
+ BSONObjIterator rr( rBegin );
+ vector< const BSONElement * >::const_iterator rr2 = rEnd.begin();
+ vector< bool >::const_iterator inc = rEndInclusive.begin();
+ unsigned mask = 1;
+ for( int i = 0; i < rBeginLen; ++i, mask <<= 1 ) {
+ BSONElement lll = ll.next();
+ BSONElement rrr = rr.next();
+ ++rr2;
+ ++inc;
+
+ int x = lll.woCompare( rrr, false );
+ if ( o.descending( mask ) )
+ x = -x;
+ if ( x != 0 )
+ return x;
+ }
+ if ( rSup ) {
+ return -direction;
+ }
+ for( ; ll.more(); mask <<= 1 ) {
+ BSONElement lll = ll.next();
+ BSONElement rrr = **rr2;
+ ++rr2;
+ int x = lll.woCompare( rrr, false );
+ if ( o.descending( mask ) )
+ x = -x;
+ if ( x != 0 )
+ return x;
+ if ( !*inc ) {
+ return -direction;
+ }
+ ++inc;
+ }
+ return 0;
+ }
+
+ template< class V >
+ bool BtreeBucket<V>::exists(const IndexDetails& idx, const DiskLoc &thisLoc, const Key& key, const Ordering& order) const {
+ int pos;
+ bool found;
+ DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc);
+
+ // skip unused keys
+ while ( 1 ) {
+ if( b.isNull() )
+ break;
+ const BtreeBucket *bucket = b.btree<V>();
+ const _KeyNode& kn = bucket->k(pos);
+ if ( kn.isUsed() )
+ return bucket->keyAt(pos).woEqual(key);
+ b = bucket->advance(b, pos, 1, "BtreeBucket<V>::exists");
+ }
+ return false;
+ }
+
+ template< class V >
+ bool BtreeBucket<V>::wouldCreateDup(
+ const IndexDetails& idx, const DiskLoc &thisLoc,
+ const Key& key, const Ordering& order,
+ const DiskLoc &self) const {
+ int pos;
+ bool found;
+ DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc);
+
+ while ( !b.isNull() ) {
+ // we skip unused keys
+ const BtreeBucket *bucket = b.btree<V>();
+ const _KeyNode& kn = bucket->k(pos);
+ if ( kn.isUsed() ) {
+ if( bucket->keyAt(pos).woEqual(key) )
+ return kn.recordLoc != self;
+ break;
+ }
+ b = bucket->advance(b, pos, 1, "BtreeBucket<V>::dupCheck");
+ }
+
+ return false;
+ }
+
+ template< class V >
+ string BtreeBucket<V>::dupKeyError( const IndexDetails& idx , const Key& key ) {
+ stringstream ss;
+ ss << "E11000 duplicate key error ";
+ ss << "index: " << idx.indexNamespace() << " ";
+ ss << "dup key: " << key.toString();
+ return ss.str();
+ }
+
+ /**
+ * Find a key within this btree bucket.
+ *
+ * When duplicate keys are allowed, we use the DiskLoc of the record as if it were part of the
+ * key. That assures that even when there are many duplicates (e.g., 1 million) for a key,
+ * our performance is still good.
+ *
+ * assertIfDup: if the key exists (ignoring the recordLoc), uassert
+ *
+ * pos: for existing keys k0...kn-1.
+ * returns # it goes BEFORE. so key[pos-1] < key < key[pos]
+ * returns n if it goes after the last existing key.
+ * note result might be an Unused location!
+ */
+
+ bool guessIncreasing = false;
+ template< class V >
+ bool BtreeBucket<V>::find(const IndexDetails& idx, const Key& key, const DiskLoc &rl,
+ const Ordering &order, int& pos, bool assertIfDup) const {
+ Loc recordLoc;
+ recordLoc = rl;
+ globalIndexCounters.btree( (char*)this );
+
+ // binary search for this key
+ bool dupsChecked = false;
+ int l=0;
+ int h=this->n-1;
+ int m = (l+h)/2;
+ if( guessIncreasing ) {
+ m = h;
+ }
+ while ( l <= h ) {
+ KeyNode M = this->keyNode(m);
+ int x = key.woCompare(M.key, order);
+ if ( x == 0 ) {
+ if( assertIfDup ) {
+ if( k(m).isUnused() ) {
+ // ok that key is there if unused. but we need to check that there aren't other
+ // entries for the key then. as it is very rare that we get here, we don't put any
+ // coding effort in here to make this particularly fast
+ if( !dupsChecked ) {
+ dupsChecked = true;
+ if( idx.head.btree<V>()->exists(idx, idx.head, key, order) ) {
+ if( idx.head.btree<V>()->wouldCreateDup(idx, idx.head, key, order, recordLoc) )
+ uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) );
+ else
+ alreadyInIndex();
+ }
+ }
+ }
+ else {
+ if( M.recordLoc == recordLoc )
+ alreadyInIndex();
+ uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) );
+ }
+ }
+
+ // dup keys allowed. use recordLoc as if it is part of the key
+ Loc unusedRL = M.recordLoc;
+ unusedRL.GETOFS() &= ~1; // so we can test equality without the used bit messing us up
+ x = recordLoc.compare(unusedRL);
+ }
+ if ( x < 0 ) // key < M.key
+ h = m-1;
+ else if ( x > 0 )
+ l = m+1;
+ else {
+ // found it.
+ pos = m;
+ return true;
+ }
+ m = (l+h)/2;
+ }
+ // not found
+ pos = l;
+ if ( pos != this->n ) {
+ Key keyatpos = keyNode(pos).key;
+ wassert( key.woCompare(keyatpos, order) <= 0 );
+ if ( pos > 0 ) {
+ if( !( keyNode(pos-1).key.woCompare(key, order) <= 0 ) ) {
+ DEV {
+ log() << key.toString() << endl;
+ log() << keyNode(pos-1).key.toString() << endl;
+ }
+ wassert(false);
+ }
+ }
+ }
+
+ return false;
+ }
+
+ template< class V >
+ void BtreeBucket<V>::delBucket(const DiskLoc thisLoc, const IndexDetails& id) {
+ ClientCursor::informAboutToDeleteBucket(thisLoc); // slow...
+ assert( !isHead() );
+
+ DiskLoc ll = this->parent;
+ const BtreeBucket *p = ll.btree<V>();
+ int parentIdx = indexInParent( thisLoc );
+ p->childForPos( parentIdx ).writing().Null();
+ deallocBucket( thisLoc, id );
+ }
+
+ template< class V >
+ void BtreeBucket<V>::deallocBucket(const DiskLoc thisLoc, const IndexDetails &id) {
+#if 0
+ // as a temporary defensive measure, we zap the whole bucket, AND don't truly delete
+ // it (meaning it is ineligible for reuse).
+ memset(this, 0, Size());
+#else
+ // defensive:
+ this->n = -1;
+ this->parent.Null();
+ string ns = id.indexNamespace();
+ theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), thisLoc.rec(), thisLoc);
+#endif
+ }
+
+ /** note: may delete the entire bucket! 'this' may be invalid upon return. */
+ template< class V >
+ void BtreeBucket<V>::delKeyAtPos( const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order) {
+ assert(this->n>0);
+ DiskLoc left = this->childForPos(p);
+
+ if ( this->n == 1 ) {
+ if ( left.isNull() && this->nextChild.isNull() ) {
+ this->_delKeyAtPos(p);
+ if ( isHead() ) {
+ // we don't delete the top bucket ever
+ }
+ else {
+ if ( !mayBalanceWithNeighbors( thisLoc, id, order ) ) {
+ // An empty bucket is only allowed as a transient state. If
+ // there are no neighbors to balance with, we delete ourself.
+ // This condition is only expected in legacy btrees.
+ delBucket(thisLoc, id);
+ }
+ }
+ return;
+ }
+ deleteInternalKey( thisLoc, p, id, order );
+ return;
+ }
+
+ if ( left.isNull() ) {
+ this->_delKeyAtPos(p);
+ mayBalanceWithNeighbors( thisLoc, id, order );
+ }
+ else {
+ deleteInternalKey( thisLoc, p, id, order );
+ }
+ }
+
+ /**
+ * This function replaces the specified key (k) by either the prev or next
+ * key in the btree (k'). We require that k have either a left or right
+ * child. If k has a left child, we set k' to the prev key of k, which must
+ * be a leaf present in the left child. If k does not have a left child, we
+ * set k' to the next key of k, which must be a leaf present in the right
+ * child. When we replace k with k', we copy k' over k (which may cause a
+ * split) and then remove k' from its original location. Because k' is
+ * stored in a descendent of k, replacing k by k' will not modify the
+ * storage location of the original k', and we can easily remove k' from
+ * its original location.
+ *
+ * This function is only needed in cases where k has a left or right child;
+ * in other cases a simpler key removal implementation is possible.
+ *
+ * NOTE on legacy btree structures:
+ * In legacy btrees, k' can be a nonleaf. In such a case we 'delete' k by
+ * marking it as an unused node rather than replacing it with k'. Also, k'
+ * may be a leaf but marked as an unused node. In such a case we replace
+ * k by k', preserving the key's unused marking. This function is only
+ * expected to mark a key as unused when handling a legacy btree.
+ */
+ template< class V >
+ void BtreeBucket<V>::deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order ) {
+ DiskLoc lchild = this->childForPos( keypos );
+ DiskLoc rchild = this->childForPos( keypos + 1 );
+ assert( !lchild.isNull() || !rchild.isNull() );
+ int advanceDirection = lchild.isNull() ? 1 : -1;
+ int advanceKeyOfs = keypos;
+ DiskLoc advanceLoc = advance( thisLoc, advanceKeyOfs, advanceDirection, __FUNCTION__ );
+ // advanceLoc must be a descendant of thisLoc, because thisLoc has a
+ // child in the proper direction and all descendants of thisLoc are
+ // nonempty, as they are not the root.
+
+ if ( !advanceLoc.btree<V>()->childForPos( advanceKeyOfs ).isNull() ||
+ !advanceLoc.btree<V>()->childForPos( advanceKeyOfs + 1 ).isNull() ) {
+ // only expected with legacy btrees, see note above
+ this->markUnused( keypos );
+ return;
+ }
+
+ KeyNode kn = advanceLoc.btree<V>()->keyNode( advanceKeyOfs );
+ // Because advanceLoc is a descendant of thisLoc, updating thisLoc will
+ // not affect packing or keys of advanceLoc and kn will be stable
+ // during the following setInternalKey()
+ setInternalKey( thisLoc, keypos, kn.recordLoc, kn.key, order, this->childForPos( keypos ), this->childForPos( keypos + 1 ), id );
+ advanceLoc.btreemod<V>()->delKeyAtPos( advanceLoc, id, advanceKeyOfs, order );
+ }
+
+//#define BTREE(loc) (static_cast<DiskLoc>(loc).btree<V>())
+#define BTREE(loc) (loc.template btree<V>())
+//#define BTREEMOD(loc) (static_cast<DiskLoc>(loc).btreemod<V>())
+#define BTREEMOD(loc) (loc.template btreemod<V>())
+
+ template< class V >
+ void BtreeBucket<V>::replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id ) {
+ assert( this->n == 0 && !this->nextChild.isNull() );
+ if ( this->parent.isNull() ) {
+ assert( id.head == thisLoc );
+ id.head.writing() = this->nextChild;
+ }
+ else {
+ DiskLoc ll = this->parent;
+ ll.btree<V>()->childForPos( indexInParent( thisLoc ) ).writing() = this->nextChild;
+ }
+ BTREE(this->nextChild)->parent.writing() = this->parent;
+ ClientCursor::informAboutToDeleteBucket( thisLoc );
+ deallocBucket( thisLoc, id );
+ }
+
+ template< class V >
+ bool BtreeBucket<V>::canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const {
+ assert( leftIndex >= 0 && leftIndex < this->n );
+ DiskLoc leftNodeLoc = this->childForPos( leftIndex );
+ DiskLoc rightNodeLoc = this->childForPos( leftIndex + 1 );
+ if ( leftNodeLoc.isNull() || rightNodeLoc.isNull() ) {
+ // TODO if this situation is possible in long term implementation, maybe we should compact somehow anyway
+ return false;
+ }
+ int pos = 0;
+ {
+ const BtreeBucket *l = leftNodeLoc.btree<V>();
+ const BtreeBucket *r = rightNodeLoc.btree<V>();
+ if ( ( this->headerSize() + l->packedDataSize( pos ) + r->packedDataSize( pos ) + keyNode( leftIndex ).key.dataSize() + sizeof(_KeyNode) > unsigned( V::BucketSize ) ) ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * This implementation must respect the meaning and value of lowWaterMark.
+ * Also see comments in splitPos().
+ */
+ template< class V >
+ int BtreeBucket<V>::rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const {
+ int split = -1;
+ int rightSize = 0;
+ const BtreeBucket *l = BTREE(this->childForPos( leftIndex ));
+ const BtreeBucket *r = BTREE(this->childForPos( leftIndex + 1 ));
+
+ int KNS = sizeof( _KeyNode );
+ int rightSizeLimit = ( l->topSize + l->n * KNS + keyNode( leftIndex ).key.dataSize() + KNS + r->topSize + r->n * KNS ) / 2;
+ // This constraint should be ensured by only calling this function
+ // if we go below the low water mark.
+ assert( rightSizeLimit < BtreeBucket<V>::bodySize() );
+ for( int i = r->n - 1; i > -1; --i ) {
+ rightSize += r->keyNode( i ).key.dataSize() + KNS;
+ if ( rightSize > rightSizeLimit ) {
+ split = l->n + 1 + i;
+ break;
+ }
+ }
+ if ( split == -1 ) {
+ rightSize += keyNode( leftIndex ).key.dataSize() + KNS;
+ if ( rightSize > rightSizeLimit ) {
+ split = l->n;
+ }
+ }
+ if ( split == -1 ) {
+ for( int i = l->n - 1; i > -1; --i ) {
+ rightSize += l->keyNode( i ).key.dataSize() + KNS;
+ if ( rightSize > rightSizeLimit ) {
+ split = i;
+ break;
+ }
+ }
+ }
+ // safeguards - we must not create an empty bucket
+ if ( split < 1 ) {
+ split = 1;
+ }
+ else if ( split > l->n + 1 + r->n - 2 ) {
+ split = l->n + 1 + r->n - 2;
+ }
+
+ return split;
+ }
+
+ template< class V >
+ void BtreeBucket<V>::doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) {
+ DiskLoc leftNodeLoc = this->childForPos( leftIndex );
+ DiskLoc rightNodeLoc = this->childForPos( leftIndex + 1 );
+ BtreeBucket *l = leftNodeLoc.btreemod<V>();
+ BtreeBucket *r = rightNodeLoc.btreemod<V>();
+ int pos = 0;
+ l->_packReadyForMod( order, pos );
+ r->_packReadyForMod( order, pos ); // pack r in case there are droppable keys
+
+ // We know the additional keys below will fit in l because canMergeChildren()
+ // must be true.
+ int oldLNum = l->n;
+ {
+ KeyNode kn = keyNode( leftIndex );
+ l->pushBack( kn.recordLoc, kn.key, order, l->nextChild ); // left child's right child becomes old parent key's left child
+ }
+ for( int i = 0; i < r->n; ++i ) {
+ KeyNode kn = r->keyNode( i );
+ l->pushBack( kn.recordLoc, kn.key, order, kn.prevChildBucket );
+ }
+ l->nextChild = r->nextChild;
+ l->fixParentPtrs( leftNodeLoc, oldLNum );
+ r->delBucket( rightNodeLoc, id );
+ this->childForPos( leftIndex + 1 ) = leftNodeLoc;
+ this->childForPos( leftIndex ) = DiskLoc();
+ this->_delKeyAtPos( leftIndex, true );
+ if ( this->n == 0 ) {
+ // will trash this and thisLoc
+ // TODO To ensure all leaves are of equal height, we should ensure
+ // this is only called on the root.
+ replaceWithNextChild( thisLoc, id );
+ }
+ else {
+ // balance recursively - maybe we should do this even when n == 0?
+ mayBalanceWithNeighbors( thisLoc, id, order );
+ }
+ }
+
+ template< class V >
+ int BtreeBucket<V>::indexInParent( const DiskLoc &thisLoc ) const {
+ assert( !this->parent.isNull() );
+ const BtreeBucket *p = BTREE(this->parent);
+ if ( p->nextChild == thisLoc ) {
+ return p->n;
+ }
+ else {
+ for( int i = 0; i < p->n; ++i ) {
+ if ( p->k( i ).prevChildBucket == thisLoc ) {
+ return i;
+ }
+ }
+ }
+ out() << "ERROR: can't find ref to child bucket.\n";
+ out() << "child: " << thisLoc << "\n";
+ dump();
+ out() << "Parent: " << this->parent << "\n";
+ p->dump();
+ assert(false);
+ return -1; // just to compile
+ }
+
+ template< class V >
+ bool BtreeBucket<V>::tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const {
+ // If we can merge, then we must merge rather than balance to preserve
+ // bucket utilization constraints.
+ if ( canMergeChildren( thisLoc, leftIndex ) ) {
+ return false;
+ }
+ thisLoc.btreemod<V>()->doBalanceChildren( thisLoc, leftIndex, id, order );
+ return true;
+ }
+
+ template< class V >
+ void BtreeBucket<V>::doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split,
+ BtreeBucket *l, const DiskLoc lchild,
+ BtreeBucket *r, const DiskLoc rchild,
+ IndexDetails &id, const Ordering &order ) {
+ // TODO maybe do some audits the same way pushBack() does?
+ // As a precondition, rchild + the old separator are <= half a body size,
+ // and lchild is at most completely full. Based on the value of split,
+ // rchild will get <= half of the total bytes which is at most 75%
+ // of a full body. So rchild will have room for the following keys:
+ int rAdd = l->n - split;
+ r->reserveKeysFront( rAdd );
+ for( int i = split + 1, j = 0; i < l->n; ++i, ++j ) {
+ KeyNode kn = l->keyNode( i );
+ r->setKey( j, kn.recordLoc, kn.key, kn.prevChildBucket );
+ }
+ {
+ KeyNode kn = keyNode( leftIndex );
+ r->setKey( rAdd - 1, kn.recordLoc, kn.key, l->nextChild ); // left child's right child becomes old parent key's left child
+ }
+ r->fixParentPtrs( rchild, 0, rAdd - 1 );
+ {
+ KeyNode kn = l->keyNode( split );
+ l->nextChild = kn.prevChildBucket;
+ // Because lchild is a descendant of thisLoc, updating thisLoc will
+ // not affect packing or keys of lchild and kn will be stable
+ // during the following setInternalKey()
+ setInternalKey( thisLoc, leftIndex, kn.recordLoc, kn.key, order, lchild, rchild, id );
+ }
+ int zeropos = 0;
+ // lchild and rchild cannot be merged, so there must be >0 (actually more)
+ // keys to the left of split.
+ l->truncateTo( split, order, zeropos );
+ }
+
+ template< class V >
+ void BtreeBucket<V>::doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split,
+ BtreeBucket *l, const DiskLoc lchild,
+ BtreeBucket *r, const DiskLoc rchild,
+ IndexDetails &id, const Ordering &order ) {
+ // As a precondition, lchild + the old separator are <= half a body size,
+ // and rchild is at most completely full. Based on the value of split,
+ // lchild will get less than half of the total bytes which is at most 75%
+ // of a full body. So lchild will have room for the following keys:
+ int lN = l->n;
+ {
+ KeyNode kn = keyNode( leftIndex );
+ l->pushBack( kn.recordLoc, kn.key, order, l->nextChild ); // left child's right child becomes old parent key's left child
+ }
+ for( int i = 0; i < split - lN - 1; ++i ) {
+ KeyNode kn = r->keyNode( i );
+ l->pushBack( kn.recordLoc, kn.key, order, kn.prevChildBucket );
+ }
+ {
+ KeyNode kn = r->keyNode( split - lN - 1 );
+ l->nextChild = kn.prevChildBucket;
+ // Child lN was lchild's old nextChild, so we don't need to fix that one.
+ l->fixParentPtrs( lchild, lN + 1, l->n );
+ // Because rchild is a descendant of thisLoc, updating thisLoc will
+ // not affect packing or keys of rchild and kn will be stable
+ // during the following setInternalKey()
+ setInternalKey( thisLoc, leftIndex, kn.recordLoc, kn.key, order, lchild, rchild, id );
+ }
+ int zeropos = 0;
+ // lchild and rchild cannot be merged, so there must be >0 (actually more)
+ // keys to the right of split.
+ r->dropFront( split - lN, order, zeropos );
+ }
+
+ template< class V >
+ void BtreeBucket<V>::doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) {
+ DiskLoc lchild = this->childForPos( leftIndex );
+ DiskLoc rchild = this->childForPos( leftIndex + 1 );
+ int zeropos = 0;
+ BtreeBucket *l = lchild.btreemod<V>();
+ l->_packReadyForMod( order, zeropos );
+ BtreeBucket *r = rchild.btreemod<V>();
+ r->_packReadyForMod( order, zeropos );
+ int split = rebalancedSeparatorPos( thisLoc, leftIndex );
+
+ // By definition, if we are below the low water mark and cannot merge
+ // then we must actively balance.
+ assert( split != l->n );
+ if ( split < l->n ) {
+ doBalanceLeftToRight( thisLoc, leftIndex, split, l, lchild, r, rchild, id, order );
+ }
+ else {
+ doBalanceRightToLeft( thisLoc, leftIndex, split, l, lchild, r, rchild, id, order );
+ }
+ }
+
+ template< class V >
+ bool BtreeBucket<V>::mayBalanceWithNeighbors( const DiskLoc thisLoc, IndexDetails &id, const Ordering &order ) const {
+ if ( this->parent.isNull() ) { // we are root, there are no neighbors
+ return false;
+ }
+
+ if ( this->packedDataSize( 0 ) >= this->lowWaterMark() ) {
+ return false;
+ }
+
+ const BtreeBucket *p = BTREE(this->parent);
+ int parentIdx = indexInParent( thisLoc );
+
+ // TODO will missing neighbor case be possible long term? Should we try to merge/balance somehow in that case if so?
+ bool mayBalanceRight = ( ( parentIdx < p->n ) && !p->childForPos( parentIdx + 1 ).isNull() );
+ bool mayBalanceLeft = ( ( parentIdx > 0 ) && !p->childForPos( parentIdx - 1 ).isNull() );
+
+ // Balance if possible on one side - we merge only if absolutely necessary
+ // to preserve btree bucket utilization constraints since that's a more
+ // heavy duty operation (especially if we must re-split later).
+ if ( mayBalanceRight &&
+ p->tryBalanceChildren( this->parent, parentIdx, id, order ) ) {
+ return true;
+ }
+ if ( mayBalanceLeft &&
+ p->tryBalanceChildren( this->parent, parentIdx - 1, id, order ) ) {
+ return true;
+ }
+
+ BtreeBucket *pm = BTREEMOD(this->parent);
+ if ( mayBalanceRight ) {
+ pm->doMergeChildren( this->parent, parentIdx, id, order );
+ return true;
+ }
+ else if ( mayBalanceLeft ) {
+ pm->doMergeChildren( this->parent, parentIdx - 1, id, order );
+ return true;
+ }
+
+ return false;
+ }
+
+ /** remove a key from the index */
+ template< class V >
+ bool BtreeBucket<V>::unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc ) const {
+ int pos;
+ bool found;
+ const Ordering ord = Ordering::make(id.keyPattern());
+ DiskLoc loc = locate(id, thisLoc, key, ord, pos, found, recordLoc, 1);
+ if ( found ) {
+ if ( key.objsize() > this->KeyMax ) {
+ OCCASIONALLY problem() << "unindex: key too large to index but was found for " << id.indexNamespace() << " reIndex suggested" << endl;
+ }
+ loc.btreemod<V>()->delKeyAtPos(loc, id, pos, ord);
+ return true;
+ }
+ return false;
+ }
+
+ template< class V >
+ inline void BtreeBucket<V>::fix(const DiskLoc thisLoc, const DiskLoc child) {
+ if ( !child.isNull() ) {
+ if ( insert_debug )
+ out() << " fix " << child.toString() << ".parent=" << thisLoc.toString() << endl;
+ child.btree<V>()->parent.writing() = thisLoc;
+ }
+ }
+
+ /**
+ * This can cause a lot of additional page writes when we assign buckets to
+ * different parents. Maybe get rid of parent ptrs?
+ */
+ template< class V >
+ void BtreeBucket<V>::fixParentPtrs(const DiskLoc thisLoc, int firstIndex, int lastIndex) const {
+ VERIFYTHISLOC
+ if ( lastIndex == -1 ) {
+ lastIndex = this->n;
+ }
+ for ( int i = firstIndex; i <= lastIndex; i++ ) {
+ fix(thisLoc, this->childForPos(i));
+ }
+ }
+
+ template< class V >
+ void BtreeBucket<V>::setInternalKey( const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const Key &key, const Ordering &order,
+ const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx ) {
+ this->childForPos( keypos ).Null();
+
+ // This may leave the bucket empty (n == 0) which is ok only as a
+ // transient state. In the instant case, the implementation of
+ // insertHere behaves correctly when n == 0 and as a side effect
+ // increments n.
+ this->_delKeyAtPos( keypos, true );
+
+ // Ensure we do not orphan neighbor's old child.
+ assert( this->childForPos( keypos ) == rchild );
+
+ // Just set temporarily - required to pass validation in insertHere()
+ this->childForPos( keypos ) = lchild;
+
+ insertHere( thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx );
+ }
+
+ /**
+ * insert a key in this bucket, splitting if necessary.
+ * @keypos - where to insert the key in range 0..n. 0=make leftmost, n=make rightmost.
+ * NOTE this function may free some data, and as a result the value passed for keypos may
+ * be invalid after calling insertHere()
+ *
+ * Some of the write intent signaling below relies on the implementation of
+ * the optimized write intent code in basicInsert().
+ */
+ template< class V >
+ void BtreeBucket<V>::insertHere( const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const Key& key, const Ordering& order,
+ const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) const {
+ if ( insert_debug )
+ out() << " " << thisLoc.toString() << ".insertHere " << key.toString() << '/' << recordLoc.toString() << ' '
+ << lchild.toString() << ' ' << rchild.toString() << " keypos:" << keypos << endl;
+
+ if ( !this->basicInsert(thisLoc, keypos, recordLoc, key, order) ) {
+ // If basicInsert() fails, the bucket will be packed as required by split().
+ thisLoc.btreemod<V>()->split(thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx);
+ return;
+ }
+
+ {
+ const _KeyNode *_kn = &k(keypos);
+ _KeyNode *kn = (_KeyNode *) getDur().alreadyDeclared((_KeyNode*) _kn); // already declared intent in basicInsert()
+ if ( keypos+1 == this->n ) { // last key
+ if ( this->nextChild != lchild ) {
+ out() << "ERROR nextChild != lchild" << endl;
+ out() << " thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl;
+ out() << " keyPos: " << keypos << " n:" << this->n << endl;
+ out() << " nextChild: " << this->nextChild.toString() << " lchild: " << lchild.toString() << endl;
+ out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl;
+ out() << " key: " << key.toString() << endl;
+ dump();
+ assert(false);
+ }
+ kn->prevChildBucket = this->nextChild;
+ assert( kn->prevChildBucket == lchild );
+ this->nextChild.writing() = rchild;
+ if ( !rchild.isNull() )
+ BTREE(rchild)->parent.writing() = thisLoc;
+ }
+ else {
+ kn->prevChildBucket = lchild;
+ if ( k(keypos+1).prevChildBucket != lchild ) {
+ out() << "ERROR k(keypos+1).prevChildBucket != lchild" << endl;
+ out() << " thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl;
+ out() << " keyPos: " << keypos << " n:" << this->n << endl;
+ out() << " k(keypos+1).pcb: " << k(keypos+1).prevChildBucket.toString() << " lchild: " << lchild.toString() << endl;
+ out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl;
+ out() << " key: " << key.toString() << endl;
+ dump();
+ assert(false);
+ }
+ const Loc *pc = &k(keypos+1).prevChildBucket;
+ *getDur().alreadyDeclared( const_cast<Loc*>(pc) ) = rchild; // declared in basicInsert()
+ if ( !rchild.isNull() )
+ rchild.btree<V>()->parent.writing() = thisLoc;
+ }
+ return;
+ }
+ }
+
+ template< class V >
+ void BtreeBucket<V>::split(const DiskLoc thisLoc, int keypos, const DiskLoc recordLoc, const Key& key, const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) {
+ this->assertWritable();
+
+ if ( split_debug )
+ out() << " " << thisLoc.toString() << ".split" << endl;
+
+ int split = this->splitPos( keypos );
+ DiskLoc rLoc = addBucket(idx);
+ BtreeBucket *r = rLoc.btreemod<V>();
+ if ( split_debug )
+ out() << " split:" << split << ' ' << keyNode(split).key.toString() << " n:" << this->n << endl;
+ for ( int i = split+1; i < this->n; i++ ) {
+ KeyNode kn = keyNode(i);
+ r->pushBack(kn.recordLoc, kn.key, order, kn.prevChildBucket);
+ }
+ r->nextChild = this->nextChild;
+ r->assertValid( order );
+
+ if ( split_debug )
+ out() << " new rLoc:" << rLoc.toString() << endl;
+ r = 0;
+ rLoc.btree<V>()->fixParentPtrs(rLoc);
+
+ {
+ KeyNode splitkey = keyNode(split);
+ this->nextChild = splitkey.prevChildBucket; // splitkey key gets promoted, its children will be thisLoc (l) and rLoc (r)
+ if ( split_debug ) {
+ out() << " splitkey key:" << splitkey.key.toString() << endl;
+ }
+
+ // Because thisLoc is a descendant of parent, updating parent will
+ // not affect packing or keys of thisLoc and splitkey will be stable
+ // during the following:
+
+ // promote splitkey to a parent node
+ if ( this->parent.isNull() ) {
+ // make a new parent if we were the root
+ DiskLoc L = addBucket(idx);
+ BtreeBucket *p = L.btreemod<V>();
+ p->pushBack(splitkey.recordLoc, splitkey.key, order, thisLoc);
+ p->nextChild = rLoc;
+ p->assertValid( order );
+ this->parent = idx.head.writing() = L;
+ if ( split_debug )
+ out() << " we were root, making new root:" << hex << this->parent.getOfs() << dec << endl;
+ rLoc.btree<V>()->parent.writing() = this->parent;
+ }
+ else {
+ // set this before calling _insert - if it splits it will do fixParent() logic and change the value.
+ rLoc.btree<V>()->parent.writing() = this->parent;
+ if ( split_debug )
+ out() << " promoting splitkey key " << splitkey.key.toString() << endl;
+ BTREE(this->parent)->_insert(this->parent, splitkey.recordLoc, splitkey.key, order, /*dupsallowed*/true, thisLoc, rLoc, idx);
+ }
+ }
+
+ int newpos = keypos;
+ // note this may trash splitkey.key. thus we had to promote it before finishing up here.
+ this->truncateTo(split, order, newpos);
+
+ // add our new key, there is room now
+ {
+ if ( keypos <= split ) {
+ if ( split_debug )
+ out() << " keypos<split, insertHere() the new key" << endl;
+ insertHere(thisLoc, newpos, recordLoc, key, order, lchild, rchild, idx);
+ }
+ else {
+ int kp = keypos-split-1;
+ assert(kp>=0);
+ BTREE(rLoc)->insertHere(rLoc, kp, recordLoc, key, order, lchild, rchild, idx);
+ }
+ }
+
+ if ( split_debug )
+ out() << " split end " << hex << thisLoc.getOfs() << dec << endl;
+ }
+
+ /** start a new index off, empty */
+ template< class V >
+ DiskLoc BtreeBucket<V>::addBucket(const IndexDetails& id) {
+ string ns = id.indexNamespace();
+ DiskLoc loc = theDataFileMgr.insert(ns.c_str(), 0, V::BucketSize, true);
+ BtreeBucket *b = BTREEMOD(loc);
+ b->init();
+ return loc;
+ }
+
+ void renameIndexNamespace(const char *oldNs, const char *newNs) {
+ renameNamespace( oldNs, newNs );
+ }
+
+ template< class V >
+ const DiskLoc BtreeBucket<V>::getHead(const DiskLoc& thisLoc) const {
+ DiskLoc p = thisLoc;
+ while ( !BTREE(p)->isHead() )
+ p = BTREE(p)->parent;
+ return p;
+ }
+
+ template< class V >
+ DiskLoc BtreeBucket<V>::advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const {
+ if ( keyOfs < 0 || keyOfs >= this->n ) {
+ out() << "ASSERT failure BtreeBucket<V>::advance, caller: " << caller << endl;
+ out() << " thisLoc: " << thisLoc.toString() << endl;
+ out() << " keyOfs: " << keyOfs << " n:" << this->n << " direction: " << direction << endl;
+ out() << bucketSummary() << endl;
+ assert(false);
+ }
+ int adj = direction < 0 ? 1 : 0;
+ int ko = keyOfs + direction;
+ DiskLoc nextDown = this->childForPos(ko+adj);
+ if ( !nextDown.isNull() ) {
+ while ( 1 ) {
+ keyOfs = direction>0 ? 0 : BTREE(nextDown)->n - 1;
+ DiskLoc loc = BTREE(nextDown)->childForPos(keyOfs + adj);
+ if ( loc.isNull() )
+ break;
+ nextDown = loc;
+ }
+ return nextDown;
+ }
+
+ if ( ko < this->n && ko >= 0 ) {
+ keyOfs = ko;
+ return thisLoc;
+ }
+
+ // end of bucket. traverse back up.
+ DiskLoc childLoc = thisLoc;
+ DiskLoc ancestor = this->parent;
+ while ( 1 ) {
+ if ( ancestor.isNull() )
+ break;
+ const BtreeBucket *an = BTREE(ancestor);
+ for ( int i = 0; i < an->n; i++ ) {
+ if ( an->childForPos(i+adj) == childLoc ) {
+ keyOfs = i;
+ return ancestor;
+ }
+ }
+ assert( direction<0 || an->nextChild == childLoc );
+ // parent exhausted also, keep going up
+ childLoc = ancestor;
+ ancestor = an->parent;
+ }
+
+ return DiskLoc();
+ }
+
+ template< class V >
+ DiskLoc BtreeBucket<V>::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, int& pos, bool& found, const DiskLoc &recordLoc, int direction) const {
+ KeyOwned k(key);
+ return locate(idx, thisLoc, k, order, pos, found, recordLoc, direction);
+ }
+
+ template< class V >
+ DiskLoc BtreeBucket<V>::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const Key& key, const Ordering &order, int& pos, bool& found, const DiskLoc &recordLoc, int direction) const {
+ int p;
+ found = find(idx, key, recordLoc, order, p, /*assertIfDup*/ false);
+ if ( found ) {
+ pos = p;
+ return thisLoc;
+ }
+
+ DiskLoc child = this->childForPos(p);
+
+ if ( !child.isNull() ) {
+ DiskLoc l = BTREE(child)->locate(idx, child, key, order, pos, found, recordLoc, direction);
+ if ( !l.isNull() )
+ return l;
+ }
+
+ pos = p;
+ if ( direction < 0 )
+ return --pos == -1 ? DiskLoc() /*theend*/ : thisLoc;
+ else
+ return pos == this->n ? DiskLoc() /*theend*/ : thisLoc;
+ }
+
+ template< class V >
+ bool BtreeBucket<V>::customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) {
+ const BtreeBucket<V> * bucket = BTREE(thisLoc);
+ while( 1 ) {
+ if ( l + 1 == h ) {
+ keyOfs = ( direction > 0 ) ? h : l;
+ DiskLoc next = bucket->k( h ).prevChildBucket;
+ if ( !next.isNull() ) {
+ bestParent = make_pair( thisLoc, keyOfs );
+ thisLoc = next;
+ return true;
+ }
+ else {
+ return false;
+ }
+ }
+ int m = l + ( h - l ) / 2;
+ int cmp = customBSONCmp( bucket->keyNode( m ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction );
+ if ( cmp < 0 ) {
+ l = m;
+ }
+ else if ( cmp > 0 ) {
+ h = m;
+ }
+ else {
+ if ( direction < 0 ) {
+ l = m;
+ }
+ else {
+ h = m;
+ }
+ }
+ }
+ }
+
+ /**
+ * find the smallest/biggest key greater-equal/less-equal to the one specified.
+ * The starting thisLoc + keyOfs must be strictly less than/strictly greater than the target keyBegin/keyBeginLen/keyEnd.
+ * All the direction checks below allowed me to refactor the code, but possibly separate forward and reverse implementations would be more efficient.
+ */
+ template< class V >
+ void BtreeBucket<V>::advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const {
+ int l,h;
+ bool dontGoUp;
+ if ( direction > 0 ) {
+ l = keyOfs;
+ h = this->n - 1;
+ dontGoUp = ( customBSONCmp( keyNode( h ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 );
+ }
+ else {
+ l = 0;
+ h = keyOfs;
+ dontGoUp = ( customBSONCmp( keyNode( l ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 );
+ }
+ pair< DiskLoc, int > bestParent;
+ if ( dontGoUp ) {
+ // this comparison result assures h > l
+ if ( !customFind( l, h, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, thisLoc, keyOfs, bestParent ) ) {
+ return;
+ }
+ }
+ else {
+ // go up parents until rightmost/leftmost node is >=/<= target or at top
+ while( !BTREE(thisLoc)->parent.isNull() ) {
+ thisLoc = BTREE(thisLoc)->parent;
+ if ( direction > 0 ) {
+ if ( customBSONCmp( BTREE(thisLoc)->keyNode( BTREE(thisLoc)->n - 1 ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ) {
+ break;
+ }
+ }
+ else {
+ if ( customBSONCmp( BTREE(thisLoc)->keyNode( 0 ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ) {
+ break;
+ }
+ }
+ }
+ }
+ customLocate( thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, bestParent );
+ }
+
+ /** @param locInOut in/out param.
+ It is used by advanceTo, which skips
+ from one key to another key without necessarily checking all the keys
+ between them in the btree (it can skip to different btree buckets).
+ The advanceTo function can get called a lot, and for the different targets
+ we want to advance to, we don't want to create a bson obj in a new
+ buffer each time we call that function. The
+ customLocate function is necessary for advanceTo, and does the same thing
+ as the normal locate function but takes basically the same arguments
+ as advanceTo.
+ */
+ template< class V >
+ void BtreeBucket<V>::customLocate(DiskLoc &locInOut, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey,
+ const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive,
+ const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) {
+ dassert( direction == 1 || direction == -1 );
+ const BtreeBucket<V> *bucket = BTREE(locInOut);
+ if ( bucket->n == 0 ) {
+ locInOut = DiskLoc();
+ return;
+ }
+ // go down until find smallest/biggest >=/<= target
+ while( 1 ) {
+ int l = 0;
+ int h = bucket->n - 1;
+
+ // +direction: 0, -direction: h
+ int z = (1-direction)/2*h;
+
+ // leftmost/rightmost key may possibly be >=/<= search key
+ int res = customBSONCmp( bucket->keyNode( z ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction );
+ bool firstCheck = direction*res >= 0;
+
+ if ( firstCheck ) {
+ DiskLoc next;
+ keyOfs = z;
+ if ( direction > 0 ) {
+ dassert( z == 0 );
+ next = bucket->k( 0 ).prevChildBucket;
+ }
+ else {
+ next = bucket->nextChild;
+ }
+ if ( !next.isNull() ) {
+ bestParent = pair< DiskLoc, int >( locInOut, keyOfs );
+ locInOut = next;
+ bucket = BTREE(locInOut);
+ continue;
+ }
+ else {
+ return;
+ }
+ }
+
+ res = customBSONCmp( bucket->keyNode( h-z ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction );
+ bool secondCheck = direction*res < 0;
+
+ if ( secondCheck ) {
+ DiskLoc next;
+ if ( direction > 0 ) {
+ next = bucket->nextChild;
+ }
+ else {
+ next = bucket->k( 0 ).prevChildBucket;
+ }
+ if ( next.isNull() ) {
+ // if bestParent is null, we've hit the end and locInOut gets set to DiskLoc()
+ locInOut = bestParent.first;
+ keyOfs = bestParent.second;
+ return;
+ }
+ else {
+ locInOut = next;
+ bucket = BTREE(locInOut);
+ continue;
+ }
+ }
+
+ if ( !customFind( l, h, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, locInOut, keyOfs, bestParent ) ) {
+ return;
+ }
+ bucket = BTREE(locInOut);
+ }
+ }
+
+ /** @param thisLoc disk location of *this */
+ template< class V >
+ void BtreeBucket<V>::insertStepOne(DiskLoc thisLoc,
+ Continuation<V>& c,
+ bool dupsAllowed) const {
+ dassert( c.key.dataSize() <= this->KeyMax );
+ assert( c.key.dataSize() > 0 );
+
+ int pos;
+ bool found = find(c.idx, c.key, c.recordLoc, c.order, pos, !dupsAllowed);
+
+ if ( found ) {
+ const _KeyNode& kn = k(pos);
+ if ( kn.isUnused() ) {
+ log(4) << "btree _insert: reusing unused key" << endl;
+ c.b = this;
+ c.pos = pos;
+ c.op = Continuation<V>::SetUsed;
+ return;
+ }
+
+ DEV {
+ log() << "_insert(): key already exists in index (ok for background:true)\n";
+ log() << " " << c.idx.indexNamespace() << " thisLoc:" << thisLoc.toString() << '\n';
+ log() << " " << c.key.toString() << '\n';
+ log() << " " << "recordLoc:" << c.recordLoc.toString() << " pos:" << pos << endl;
+ log() << " old l r: " << this->childForPos(pos).toString() << ' ' << this->childForPos(pos+1).toString() << endl;
+ }
+ alreadyInIndex();
+ }
+
+ Loc ch = this->childForPos(pos);
+ DiskLoc child = ch;
+
+ if ( child.isNull() ) {
+ // A new key will be inserted at the same tree height as an adjacent existing key.
+ c.bLoc = thisLoc;
+ c.b = this;
+ c.pos = pos;
+ c.op = Continuation<V>::InsertHere;
+ return;
+ }
+
+ child.btree<V>()->insertStepOne(child, c, dupsAllowed);
+ }
+
+ /** @param thisLoc disk location of *this */
+ template< class V >
+ int BtreeBucket<V>::_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const Key& key, const Ordering &order, bool dupsAllowed,
+ const DiskLoc lChild, const DiskLoc rChild, IndexDetails& idx) const {
+ if ( key.dataSize() > this->KeyMax ) {
+ problem() << "ERROR: key too large len:" << key.dataSize() << " max:" << this->KeyMax << ' ' << key.dataSize() << ' ' << idx.indexNamespace() << endl;
+ return 2;
+ }
+ assert( key.dataSize() > 0 );
+
+ int pos;
+ bool found = find(idx, key, recordLoc, order, pos, !dupsAllowed);
+ if ( insert_debug ) {
+ out() << " " << thisLoc.toString() << '.' << "_insert " <<
+ key.toString() << '/' << recordLoc.toString() <<
+ " l:" << lChild.toString() << " r:" << rChild.toString() << endl;
+ out() << " found:" << found << " pos:" << pos << " n:" << this->n << endl;
+ }
+
+ if ( found ) {
+ const _KeyNode& kn = k(pos);
+ if ( kn.isUnused() ) {
+ log(4) << "btree _insert: reusing unused key" << endl;
+ massert( 10285 , "_insert: reuse key but lchild is not null", lChild.isNull());
+ massert( 10286 , "_insert: reuse key but rchild is not null", rChild.isNull());
+ kn.writing().setUsed();
+ return 0;
+ }
+
+ DEV {
+ log() << "_insert(): key already exists in index (ok for background:true)\n";
+ log() << " " << idx.indexNamespace() << " thisLoc:" << thisLoc.toString() << '\n';
+ log() << " " << key.toString() << '\n';
+ log() << " " << "recordLoc:" << recordLoc.toString() << " pos:" << pos << endl;
+ log() << " old l r: " << this->childForPos(pos).toString() << ' ' << this->childForPos(pos+1).toString() << endl;
+ log() << " new l r: " << lChild.toString() << ' ' << rChild.toString() << endl;
+ }
+ alreadyInIndex();
+ }
+
+ DEBUGGING out() << "TEMP: key: " << key.toString() << endl;
+ Loc ch = this->childForPos(pos);
+ DiskLoc child = ch;
+ if ( insert_debug )
+ out() << " getChild(" << pos << "): " << child.toString() << endl;
+ // In current usage, rChild.isNull() is true for a new key and false when we
+ // are promoting a split key. These are the only two cases where _insert()
+ // is currently called.
+ if ( child.isNull() || !rChild.isNull() ) {
+ // A new key will be inserted at the same tree height as an adjacent existing key.
+ insertHere(thisLoc, pos, recordLoc, key, order, lChild, rChild, idx);
+ return 0;
+ }
+
+ return child.btree<V>()->_insert(child, recordLoc, key, order, dupsAllowed, /*lchild*/DiskLoc(), /*rchild*/DiskLoc(), idx);
+ }
+
+ template< class V >
+ void BtreeBucket<V>::dump(unsigned depth) const {
+ string indent = string(depth, ' ');
+ _log() << "BUCKET n:" << this->n;
+ _log() << " parent:" << hex << this->parent.getOfs() << dec;
+ for ( int i = 0; i < this->n; i++ ) {
+ _log() << '\n' << indent;
+ KeyNode k = keyNode(i);
+ string ks = k.key.toString();
+ _log() << " " << hex << k.prevChildBucket.getOfs() << '\n';
+ _log() << indent << " " << i << ' ' << ks.substr(0, 30) << " Loc:" << k.recordLoc.toString() << dec;
+ if ( this->k(i).isUnused() )
+ _log() << " UNUSED";
+ }
+ _log() << "\n" << indent << " " << hex << this->nextChild.getOfs() << dec << endl;
+ }
+
+ template< class V >
+ void BtreeBucket<V>::twoStepInsert(DiskLoc thisLoc, Continuation<V> &c, bool dupsAllowed) const
+ {
+
+ if ( c.key.dataSize() > this->KeyMax ) {
+ problem() << "ERROR: key too large len:" << c.key.dataSize() << " max:" << this->KeyMax << ' ' << c.key.dataSize() << ' ' << c.idx.indexNamespace() << endl;
+ return; // op=Nothing
+ }
+ insertStepOne(thisLoc, c, dupsAllowed);
+ }
+
+ /** TODO: the meaning of the return code is unclear; clean up */
+ template< class V >
+ int BtreeBucket<V>::bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const BSONObj& _key, const Ordering &order, bool dupsAllowed,
+ IndexDetails& idx, bool toplevel) const
+ {
+ guessIncreasing = _key.firstElementType() == jstOID && idx.isIdIndex();
+ KeyOwned key(_key);
+
+ dassert(toplevel);
+ if ( toplevel ) {
+ if ( key.dataSize() > this->KeyMax ) {
+ problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace() << ' ' << key.dataSize() << ' ' << key.toString() << endl;
+ return 3;
+ }
+ }
+
+ int x;
+ try {
+ x = _insert(thisLoc, recordLoc, key, order, dupsAllowed, DiskLoc(), DiskLoc(), idx);
+ this->assertValid( order );
+ }
+ catch( ... ) {
+ guessIncreasing = false;
+ throw;
+ }
+ guessIncreasing = false;
+ return x;
+ }
+
+ template< class V >
+ void BtreeBucket<V>::shape(stringstream& ss) const {
+ this->_shape(0, ss);
+ }
+
+ template< class V >
+ int BtreeBucket<V>::getKeyMax() {
+ return V::KeyMax;
+ }
+
+ template< class V >
+ DiskLoc BtreeBucket<V>::findSingle( const IndexDetails& indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const {
+ int pos;
+ bool found;
+ // TODO: is it really ok here that the order is a default?
+ // for findById() use, yes. for checkNoIndexConflicts, no?
+ Ordering o = Ordering::make(BSONObj());
+ DiskLoc bucket = locate( indexdetails , indexdetails.head , key , o , pos , found , minDiskLoc );
+ if ( bucket.isNull() )
+ return bucket;
+
+ const BtreeBucket<V> *b = bucket.btree<V>();
+ while ( 1 ) {
+ const _KeyNode& knraw = b->k(pos);
+ if ( knraw.isUsed() )
+ break;
+ bucket = b->advance( bucket , pos , 1 , "findSingle" );
+ if ( bucket.isNull() )
+ return bucket;
+ b = bucket.btree<V>();
+ }
+ KeyNode kn = b->keyNode( pos );
+ if ( KeyOwned(key).woCompare( kn.key, o ) != 0 )
+ return DiskLoc();
+ return kn.recordLoc;
+ }
+
+} // namespace mongo
+
+#include "db.h"
+#include "dbhelpers.h"
+
+namespace mongo {
+
+ template< class V >
+ void BtreeBucket<V>::a_test(IndexDetails& id) {
+ BtreeBucket *b = id.head.btreemod<V>();
+
+ // record locs for testing
+ DiskLoc A(1, 20);
+ DiskLoc B(1, 30);
+ DiskLoc C(1, 40);
+
+ DiskLoc rl;
+ BSONObj key = fromjson("{x:9}");
+ BSONObj orderObj = fromjson("{}");
+ Ordering order = Ordering::make(orderObj);
+
+ b->bt_insert(id.head, A, key, order, true, id);
+ A.GETOFS() += 2;
+ b->bt_insert(id.head, A, key, order, true, id);
+ A.GETOFS() += 2;
+ b->bt_insert(id.head, A, key, order, true, id);
+ A.GETOFS() += 2;
+ b->bt_insert(id.head, A, key, order, true, id);
+ A.GETOFS() += 2;
+ assert( b->k(0).isUsed() );
+// b->k(0).setUnused();
+ b->k(1).setUnused();
+ b->k(2).setUnused();
+ b->k(3).setUnused();
+
+ b->dumpTree(id.head, orderObj);
+
+ /* b->bt_insert(id.head, B, key, order, false, id);
+ b->k(1).setUnused();
+ b->dumpTree(id.head, order);
+ b->bt_insert(id.head, A, key, order, false, id);
+ b->dumpTree(id.head, order);
+ */
+
+ // this should assert. does it? (it might assert "accidentally" though; not asserting proves a problem, asserting proves nothing)
+ b->bt_insert(id.head, C, key, order, false, id);
+
+ // b->dumpTree(id.head, order);
+ }
+
+ template class BucketBasics<V0>;
+ template class BucketBasics<V1>;
+ template class BtreeBucket<V0>;
+ template class BtreeBucket<V1>;
+ template struct __KeyNode<DiskLoc>;
+ template struct __KeyNode<DiskLoc56Bit>;
+
+ struct BTUnitTest : public UnitTest {
+ void run() {
+ DiskLoc big(0xf12312, 0x70001234);
+ DiskLoc56Bit bigl;
+ {
+ bigl = big;
+ assert( big == bigl );
+ DiskLoc e = bigl;
+ assert( big == e );
+ }
+ {
+ DiskLoc d;
+ assert( d.isNull() );
+ DiskLoc56Bit l;
+ l = d;
+ assert( l.isNull() );
+ d = l;
+ assert( d.isNull() );
+ assert( l < bigl );
+ }
+ }
+ } btunittest;
+
+}
diff --git a/src/mongo/db/btree.h b/src/mongo/db/btree.h
new file mode 100644
index 00000000000..85e5172d163
--- /dev/null
+++ b/src/mongo/db/btree.h
@@ -0,0 +1,1174 @@
+// btree.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "jsobj.h"
+#include "diskloc.h"
+#include "pdfile.h"
+#include "key.h"
+
+namespace mongo {
+
+ /**
+ * Our btree implementation generally follows the standard btree algorithm,
+ * which is described in many places. The nodes of our btree are referred to
+ * as buckets below. These buckets are of size BucketSize and their body is
+ * an ordered array of <bson key, disk loc> pairs, where disk loc is the disk
+ * location of a document and bson key is a projection of this document into
+ * the schema of the index for this btree. Ordering is determined on the
+ * basis of bson key first and then disk loc in case of a tie. All bson keys
+ * for a btree have identical schemas with empty string field names and may
+ * not have an objsize() exceeding KeyMax. The btree's buckets are
+ * themselves organized into an ordered tree. Although there are exceptions,
+ * generally buckets with n keys have n+1 children and the body of a bucket is
+ * at least lowWaterMark bytes. A more strictly enforced requirement is that
+ * a non root bucket must have at least one key except in certain transient
+ * states.
+ *
+ * Our btrees support the following primary read operations: finding a
+ * specified key; iterating from a starting key to the next or previous
+ * ordered key; and skipping from a starting key to another specified key
+ * without checking every intermediate key. The primary write operations
+ * are insertion and deletion of keys. Insertion may trigger a bucket split
+ * if necessary to avoid bucket overflow. In such a case, subsequent splits
+ * will occur recursively as necessary. Deletion may trigger a bucket
+ * rebalance, in which a size deficient bucket is filled with keys from an
+ * adjacent bucket. In this case, splitting may potentially occur in the
+ * parent. Deletion may alternatively trigger a merge, in which the keys
+ * from two buckets and a key from their shared parent are combined into the
+ * same bucket. In such a case, rebalancing or merging may proceed
+ * recursively from the parent.
+ *
+ * While the btree data format has been relatively constant over time, btrees
+ * initially created by versions of mongo earlier than the current version
+ * may embody different properties than freshly created btrees (while
+ * following the same data format). These older btrees are referred to
+ * below as legacy btrees.
+ */
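+
+ /*
+ * Illustrative sketch of the ordering described above (a hedged example,
+ * not part of the implementation; Entry and before() are hypothetical):
+ * a bucket's body behaves like an ordered array of <bson key, disk loc>
+ * pairs compared first by key and then by loc to break ties.
+ *
+ * struct Entry { BSONObj key; DiskLoc loc; };
+ * bool before(const Entry& a, const Entry& b, const Ordering& o) {
+ * int c = a.key.woCompare(b.key, o); // bson key first
+ * return c != 0 ? c < 0 : a.loc < b.loc; // disk loc breaks a tie
+ * }
+ */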
+
+ const int OldBucketSize = 8192;
+
+#pragma pack(1)
+ template< class Version > class BucketBasics;
+
+ /**
+ * This is the fixed width data component for storage of a key within a
+ * bucket. It contains an offset pointer to the variable width bson
+ * data component. A _KeyNode may be 'unused', please see below.
+ */
+ template< class Loc >
+ struct __KeyNode {
+ /** Signals that we are writing this _KeyNode and casts away const */
+ __KeyNode<Loc> & writing() const;
+ /**
+ * The 'left' child bucket of this key. If this is the i-th key, it
+ * points to the i index child bucket.
+ */
+ Loc prevChildBucket;
+ /** The location of the record associated with this key. */
+ Loc recordLoc;
+ short keyDataOfs() const { return (short) _kdo; }
+
+ /** Offset within current bucket of the variable width bson key for this _KeyNode. */
+ unsigned short _kdo;
+ void setKeyDataOfs(short s) {
+ _kdo = s;
+ assert(s>=0);
+ }
+ /** Seems to be redundant. */
+ void setKeyDataOfsSavingUse(short s) {
+ _kdo = s;
+ assert(s>=0);
+ }
+ /**
+ * Unused keys are not returned by read operations. Keys may be marked
+ * as unused in cases where it is difficult to delete them while
+ * maintaining the constraints required of a btree.
+ *
+ * Setting ofs to odd is the sentinel for unused, as real recordLoc's
+ * are always even numbers. Note we need to keep its value basically
+ * the same as we use the recordLoc as part of the key in the index
+ * (to handle duplicate keys efficiently).
+ *
+ * Flagging keys as unused is a feature that is being phased out in favor
+ * of deleting the keys outright. The current btree implementation is
+ * not expected to mark a key as unused in a non legacy btree.
+ */
+ void setUnused() {
+ recordLoc.GETOFS() |= 1;
+ }
+ void setUsed() { recordLoc.GETOFS() &= ~1; }
+ int isUnused() const {
+ return recordLoc.getOfs() & 1;
+ }
+ int isUsed() const {
+ return !isUnused();
+ }
+ };
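+
+ /*
+ * Illustrative sketch of the unused flag (hypothetical offset values):
+ * because real record offsets are always even, the low bit of recordLoc's
+ * ofs can double as the flag.
+ *
+ * kn.setUnused(); // ofs 0x1000 -> 0x1001, isUnused() != 0
+ * kn.setUsed(); // ofs 0x1001 -> 0x1000, isUsed() != 0
+ */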
+
+ /**
+ * This structure represents header data for a btree bucket. An object of
+ * this type is typically allocated inside of a buffer of size BucketSize,
+ * resulting in a full bucket with an appropriate header.
+ *
+ * The body of a btree bucket contains an array of _KeyNode objects starting
+ * from its lowest indexed bytes and growing to higher indexed bytes. The
+ * body also contains variable width bson keys, which are allocated from the
+ * highest indexed bytes toward lower indexed bytes.
+ *
+ * |hhhh|kkkkkkk--------bbbbbbbbbbbuuubbbuubbb|
+ * h = header data
+ * k = KeyNode data
+ * - = empty space
+ * b = bson key data
+ * u = unused (old) bson key data, that may be garbage collected
+ */
+ class BtreeData_V0 {
+ protected:
+ /** Parent bucket of this bucket, which isNull() for the root bucket. */
+ DiskLoc parent;
+ /** Given that there are n keys, this is the n index child. */
+ DiskLoc nextChild;
+ /** can be reused, value is 8192 in current pdfile version Apr2010 */
+ unsigned short _wasSize;
+ /** zero */
+ unsigned short _reserved1;
+ int flags;
+
+ void _init() {
+ _reserved1 = 0;
+ _wasSize = BucketSize;
+ reserved = 0;
+ }
+
+ /** basicInsert() assumes the next three members are consecutive and in this order: */
+
+ /** Size of the empty region. */
+ int emptySize;
+ /** Size used for bson storage, including storage of old keys. */
+ int topSize;
+ /* Number of keys in the bucket. */
+ int n;
+
+ int reserved;
+ /* Beginning of the bucket's body */
+ char data[4];
+
+ public:
+ typedef __KeyNode<DiskLoc> _KeyNode;
+ typedef DiskLoc Loc;
+ typedef KeyBson Key;
+ typedef KeyBson KeyOwned;
+ enum { BucketSize = 8192 };
+
+ // largest key size we allow. note we very much need to support bigger keys (somehow) in the future.
+ static const int KeyMax = OldBucketSize / 10;
+ };
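+
+ /*
+ * Illustrative invariant for the layout pictured above (a sketch, not a
+ * verbatim quote of the implementation's checks): the three body regions
+ * account for the entire body, i.e.
+ *
+ * emptySize + topSize + n * sizeof(_KeyNode) == BucketSize - headerSize()
+ *
+ * where the right-hand side is bodySize() as defined in BucketBasics below.
+ */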
+
+ // byte layout, most significant first: a a a ofs ofs ofs ofs (3-byte file # 'a', 4-byte offset 'ofs')
+ class DiskLoc56Bit {
+ int ofs;
+ unsigned char _a[3];
+ unsigned long long Z() const {
+ // endian
+ return *((unsigned long long*)this) & 0x00ffffffffffffffULL;
+ }
+ enum {
+ // the first bit of the offset is used as the 'unused' flag in _KeyNode, so we don't use -1 here.
+ OurNullOfs = -2
+ };
+ public:
+ template< class V >
+ const BtreeBucket<V> * btree() const {
+ return DiskLoc(*this).btree<V>();
+ }
+ template< class V >
+ BtreeBucket<V> * btreemod() const {
+ return DiskLoc(*this).btreemod<V>();
+ }
+ operator const DiskLoc() const {
+ // endian
+ if( isNull() ) return DiskLoc();
+ unsigned a = *((unsigned *) (_a-1));
+ return DiskLoc(a >> 8, ofs);
+ }
+ int& GETOFS() { return ofs; }
+ int getOfs() const { return ofs; }
+ bool operator<(const DiskLoc56Bit& rhs) const {
+ // the ordering of dup keys in btrees isn't too critical, but we'd like to put items that are
+ // close together on disk close together in the tree, so we do want the file # to be the most significant
+ // bytes
+ return Z() < rhs.Z();
+ }
+ int compare(const DiskLoc56Bit& rhs) const {
+ unsigned long long a = Z();
+ unsigned long long b = rhs.Z();
+ if( a < b ) return -1;
+ return a == b ? 0 : 1;
+ }
+ bool operator==(const DiskLoc56Bit& rhs) const { return Z() == rhs.Z(); }
+ bool operator!=(const DiskLoc56Bit& rhs) const { return Z() != rhs.Z(); }
+ bool operator==(const DiskLoc& rhs) const {
+ return DiskLoc(*this) == rhs;
+ }
+ bool operator!=(const DiskLoc& rhs) const { return !(*this==rhs); }
+ bool isNull() const { return ofs < 0; }
+ void Null() {
+ ofs = OurNullOfs;
+ _a[0] = _a[1] = _a[2] = 0;
+ }
+ string toString() const { return DiskLoc(*this).toString(); }
+ void operator=(const DiskLoc& loc) {
+ ofs = loc.getOfs();
+ int la = loc.a();
+ assert( la <= 0xffffff ); // must fit in 3 bytes
+ if( la < 0 ) {
+ assert( la == -1 );
+ la = 0;
+ ofs = OurNullOfs;
+ }
+ memcpy(_a, &la, 3); // endian
+ dassert( ofs != 0 );
+ }
+ DiskLoc56Bit& writing() const {
+ return *((DiskLoc56Bit*) getDur().writingPtr((void*)this, 7));
+ }
+ };
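+
+ /*
+ * Usage sketch for DiskLoc56Bit (modeled on BTUnitTest in btree.cpp;
+ * assumes a little-endian host, as the "endian" notes above do):
+ * conversion to and from DiskLoc round-trips the file # and offset.
+ *
+ * DiskLoc d(5, 4096); // file #5, offset 4096
+ * DiskLoc56Bit l;
+ * l = d; // pack: 3 bytes of file #, 4 bytes of offset
+ * DiskLoc back = l; // unpack
+ * assert( back == d );
+ */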
+
+ class BtreeData_V1 {
+ public:
+ typedef DiskLoc56Bit Loc;
+ //typedef DiskLoc Loc;
+ typedef __KeyNode<Loc> _KeyNode;
+ typedef KeyV1 Key;
+ typedef KeyV1Owned KeyOwned;
+ enum { BucketSize = 8192-16 }; // leave room for Record header
+ // largest key size we allow. note we very much need to support bigger keys (somehow) in the future.
+ static const int KeyMax = 1024;
+ protected:
+ /** Parent bucket of this bucket, which isNull() for the root bucket. */
+ Loc parent;
+ /** Given that there are n keys, this is the n index child. */
+ Loc nextChild;
+
+ unsigned short flags;
+
+ /** basicInsert() assumes the next three members are consecutive and in this order: */
+
+ /** Size of the empty region. */
+ unsigned short emptySize;
+ /** Size used for bson storage, including storage of old keys. */
+ unsigned short topSize;
+ /* Number of keys in the bucket. */
+ unsigned short n;
+
+ /* Beginning of the bucket's body */
+ char data[4];
+
+ void _init() { }
+ };
+
+ typedef BtreeData_V0 V0;
+ typedef BtreeData_V1 V1;
+
+ /**
+ * This class adds functionality to BtreeData for managing a single bucket.
+ * The following policies are used in an attempt to encourage simplicity:
+ *
+ * Const member functions of this class are those which may be called on
+ * an object for which writing has not been signaled. Non const member
+ * functions may only be called on objects for which writing has been
+ * signaled. Note that currently some const functions write to the
+ * underlying memory representation of this bucket using optimized methods
+ * to signal write operations.
+ *
+ * DiskLoc parameters that may shadow references within the btree should
+ * be passed by value rather than by reference to non const member
+ * functions or to const member functions which may perform writes. This way
+ * a callee need not worry that write operations will change or invalidate
+ * its arguments.
+ *
+ * The current policy for dealing with bson arguments is the opposite of
+ * what is described above for DiskLoc arguments. We do not want to copy
+ * bson into memory as an intermediate step for btree changes, and if bson
+ * is to be moved it must be copied to the new location before the old
+ * location is invalidated. Care should be taken in cases where that invalid
+ * memory may be implicitly referenced by function arguments.
+ *
+ * A number of functions below require a thisLoc argument, which must be the
+ * disk location of the bucket mapped to 'this'.
+ */
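+
+ /*
+ * Sketch of the DiskLoc pass-by-value policy above (hypothetical callee):
+ *
+ * void touch(const DiskLoc thisLoc); // safe: the callee owns a copy
+ * void touch(const DiskLoc& thisLoc); // risky if the argument aliases a
+ * // Loc inside the bucket and a write
+ * // moves or clears it mid-call
+ */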
+ template< class Version >
+ class BucketBasics : public Version {
+ public:
+ template <class U> friend class BtreeBuilder;
+ typedef typename Version::Key Key;
+ typedef typename Version::_KeyNode _KeyNode;
+ typedef typename Version::Loc Loc;
+
+ int getN() const { return this->n; }
+
+ /**
+ * This is an in memory wrapper for a _KeyNode, and not itself part of btree
+ * storage. This object and its BSONObj 'key' will become invalid if the
+ * _KeyNode data that generated it is moved within the btree. In general,
+ * a KeyNode should not be expected to be valid after a write.
+ */
+ class KeyNode {
+ public:
+ KeyNode(const BucketBasics<Version>& bb, const _KeyNode &k);
+ const Loc& prevChildBucket;
+ const Loc& recordLoc;
+ /* Points to the bson key storage for a _KeyNode */
+ Key key;
+ };
+ friend class KeyNode;
+
+ /** Assert write intent declared for this bucket already. */
+ void assertWritable();
+
+ void assertValid(const Ordering &order, bool force = false) const;
+ void assertValid(const BSONObj &orderObj, bool force = false) const { return assertValid(Ordering::make(orderObj),force); }
+
+ /**
+ * @return KeyNode for key at index i. The KeyNode will become invalid
+ * if the key is moved or reassigned, or if the node is packed. In general
+ * a KeyNode should not be expected to be valid after a write.
+ */
+ const KeyNode keyNode(int i) const {
+ if ( i >= this->n ) {
+ massert( 13000 , (string)"invalid keyNode: " + BSON( "i" << i << "n" << this->n ).jsonString() , i < this->n );
+ }
+ return KeyNode(*this, k(i));
+ }
+
+ static int headerSize() {
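+ // offsetof-style computation on a null pointer: the header spans from
+ // the first member (parent) to the start of the body (data).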
+ const BucketBasics *d = 0;
+ return (char*)&(d->data) - (char*)&(d->parent);
+ }
+ static int bodySize() { return Version::BucketSize - headerSize(); }
+ static int lowWaterMark() { return bodySize() / 2 - Version::KeyMax - sizeof( _KeyNode ) + 1; } // see comment in btree.cpp
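+
+ /*
+ * Worked example (a sketch; figures assume the pack(1) layout above):
+ * for V1, headerSize() is 22 bytes (two 7-byte Locs plus four unsigned
+ * shorts), so bodySize() == 8176 - 22 == 8154, sizeof(_KeyNode) == 16,
+ * and lowWaterMark() == 8154/2 - 1024 - 16 + 1 == 3038.
+ */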
+
+ // for testing
+ int nKeys() const { return this->n; }
+ const DiskLoc getNextChild() const { return this->nextChild; }
+
+ protected:
+ char * dataAt(short ofs) { return this->data + ofs; }
+
+ /** Initialize the header for a new node. */
+ void init();
+
+ /**
+ * Preconditions:
+ * - 0 <= keypos <= n
+ * - If key is inserted at position keypos, the bucket's keys will still be
+ * in order.
+ * Postconditions:
+ * - If key can fit in the bucket, the bucket may be packed and keypos
+ * may be decreased to reflect deletion of earlier indexed keys during
+ * packing, the key will be inserted at the updated keypos index with
+ * a null prevChildBucket, the subsequent keys shifted to the right,
+ * and the function will return true.
+ * - If key cannot fit in the bucket, the bucket will be packed and
+ * the function will return false.
+ * Although this function is marked const, it modifies the underlying
+ * btree representation through an optimized write intent mechanism.
+ */
+ bool basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const Key& key, const Ordering &order) const;
+
+ /**
+ * Preconditions:
+ * - key / recordLoc are > all existing keys
+ * - The keys in prevChild and their descendents are between all existing
+ * keys and 'key'.
+ * Postconditions:
+ * - If there is space for key without packing, it is inserted as the
+ * last key with specified prevChild and true is returned.
+ * Importantly, nextChild is not updated!
+ * - Otherwise false is returned and there is no change.
+ */
+ bool _pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild);
+ void pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild) {
+ bool ok = _pushBack( recordLoc , key , order , prevChild );
+ assert(ok);
+ }
+
+ /**
+ * This is a special purpose function used by BtreeBuilder. The
+ * interface is quite dangerous if you're not careful. The bson key
+ * returned here points to bucket memory that has been invalidated but
+ * not yet reclaimed.
+ *
+ * TODO Maybe this could be replaced with two functions, one which
+ * returns the last key without deleting it and another which simply
+ * deletes the last key. Then the caller would have enough control to
+ * ensure proper memory integrity.
+ *
+ * Preconditions:
+ * - bucket is not empty
+ * - last key of bucket is used (not unused)
+ * - nextChild isNull()
+ * - _unalloc will work correctly as used - see code
+ * Postconditions:
+ * - The last key of the bucket is removed, and its key and recLoc are
+ * returned. As mentioned above, the key points to unallocated memory.
+ */
+ void popBack(DiskLoc& recLoc, Key &key);
+
+ /**
+ * Preconditions:
+ * - 0 <= keypos < n
+ * - there is no child bucket at keypos
+ * - n > 1
+ * - if mayEmpty == false or nextChild.isNull(), n > 0
+ * Postconditions:
+ * - The key at keypos is removed, and remaining keys are shifted over.
+ * - The bucket becomes unpacked.
+ * - if mayEmpty is true and nextChild.isNull(), the bucket may have no keys.
+ */
+ void _delKeyAtPos(int keypos, bool mayEmpty = false);
+
+ /* !Packed means there is deleted fragment space within the bucket.
+ We "repack" when we run out of space before considering the node
+ to be full.
+ */
+ enum Flags { Packed=1 };
+
+ /** n == 0 is ok */
+ const Loc& childForPos(int p) const { return p == this->n ? this->nextChild : k(p).prevChildBucket; }
+ Loc& childForPos(int p) { return p == this->n ? this->nextChild : k(p).prevChildBucket; }
+
+ /** Same as bodySize(). */
+ int totalDataSize() const;
+ /**
+ * @return true when a key may be dropped by pack()
+ * @param index index of the key that may be dropped
+ * @param refPos index of a particular key of interest, which must not
+ * be dropped; = 0 to safely ignore
+ */
+ bool mayDropKey( int index, int refPos ) const;
+
+ /**
+ * Pack the bucket to reclaim space from invalidated memory.
+ * @refPos is an index in the bucket which may be updated if we
+ * delete keys from the bucket
+ * This function may cast away const and perform a write.
+ * Preconditions: none
+ * Postconditions:
+ * - Bucket will be packed
+ * - Some unused nodes may be dropped, but not ones at index 0 or refPos
+ * - Some used nodes may be moved
+ * - If refPos is the index of an existing key, it will be updated to that
+ * key's new index if the key is moved.
+ */
+ void _pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const;
+ /** Pack when already writable */
+ void _packReadyForMod(const Ordering &order, int &refPos);
+
+ /** @return the size the bucket's body would have if we were to call pack() */
+ int packedDataSize( int refPos ) const;
+ void setNotPacked() { this->flags &= ~Packed; }
+ void setPacked() { this->flags |= Packed; }
+ /**
+ * Preconditions: 'bytes' is <= emptySize
+ * Postconditions: A buffer of size 'bytes' is allocated on the top side,
+ * and its offset is returned.
+ */
+ int _alloc(int bytes);
+ /**
+ * This function can be used to deallocate the lowest byte index bson
+ * buffer in the top region, which in some but not all cases is for the
+ * n - 1 index key. This function only works correctly in certain
+ * special cases, please be careful.
+ * Preconditions: 'bytes' <= topSize
+ * Postconditions: The top region is decreased
+ */
+ void _unalloc(int bytes);
+ /**
+ * Preconditions: 'N' <= n
+ * Postconditions:
+ * - All keys after the N index key are dropped.
+ * - The bucket is then packed, without dropping refPos if refPos < N.
+ */
+ void truncateTo(int N, const Ordering &order, int &refPos);
+ /**
+ * Preconditions:
+ * - 'nDrop' < n
+ * - for now, refPos should be zero.
+ * Postconditions:
+ * - All keys before the nDrop index key are dropped.
+ * - The bucket is packed.
+ */
+ void dropFront(int nDrop, const Ordering &order, int &refPos);
+ /**
+ * Preconditions: 0 <= keypos < n
+ * Postconditions: keypos indexed key is marked unused.
+ */
+ void markUnused(int keypos);
+
+ /**
+ * BtreeBuilder uses the parent var as a temp place to maintain a linked list chain.
+ * we use tempNext() when we do that to be less confusing. (one might have written a union in C)
+ */
+ DiskLoc tempNext() const { return this->parent; }
+ void setTempNext(DiskLoc l) { this->parent = l; }
+
+ void _shape(int level, stringstream&) const;
+ int Size() const;
+
+ /** @return i-indexed _KeyNode, without bounds checking */
+ public:
+ const _KeyNode& k(int i) const { return ((const _KeyNode*)this->data)[i]; }
+ _KeyNode& _k(int i) { return ((_KeyNode*)this->data)[i]; }
+ protected:
+ _KeyNode& k(int i) { return ((_KeyNode*)this->data)[i]; }
+
+ /**
+ * Preconditions: 'this' is packed
+ * @return the key index to be promoted on split
+ * @param keypos The requested index of a key to insert, which may affect
+ * the choice of split position.
+ */
+ int splitPos( int keypos ) const;
+
+ /**
+ * Preconditions: nAdd * sizeof( _KeyNode ) <= emptySize
+ * Postconditions:
+ * - Increases indexes of existing _KeyNode objects by nAdd, reserving
+ * space for additional _KeyNode objects at front.
+ * - Does not initialize ofs values for the bson data of these
+ * _KeyNode objects.
+ */
+ void reserveKeysFront( int nAdd );
+
+ /**
+ * Preconditions:
+ * - 0 <= i < n
+ * - The bson 'key' must fit in the bucket without packing.
+ * - If 'key' and 'prevChildBucket' are set at index i, the btree
+ * ordering properties will be maintained.
+ * Postconditions:
+ * - The specified key is set at index i, replacing the existing
+ * _KeyNode data and without shifting any other _KeyNode objects.
+ */
+ void setKey( int i, const DiskLoc recordLoc, const Key& key, const DiskLoc prevChildBucket );
+ };
+
+ template< class V>
+ struct Continuation;
+
+ /**
+ * This class adds functionality for manipulating buckets that are assembled
+ * in a tree. The requirements for const and non const functions and
+ * arguments are generally the same as in BtreeBucket. Because this class
+ * deals with tree structure, some functions that are marked const may
+ * trigger modification of another node in the btree or potentially of the
+ * current node. In such cases, the function's implementation explicitly
+ * casts away const when indicating an intent to write to the durability
+ * layer. The DiskLocs provided to such functions should be passed by
+ * value if they shadow pointers within the btree.
+ *
+ * To clarify enforcement of referential integrity in this implementation,
+ * we use the following pattern when deleting data we have a persistent
+ * pointer to. The pointer is cleared or removed explicitly, then the data
+ * it pointed to is cleaned up with a helper function.
+ *
+ * TODO It might make sense to put some of these functions in a class
+ * representing a full btree instead of a single btree bucket. That would
+ * allow us to use the const qualifier in a manner more consistent with
+ * standard usage. Right now the interface is for both a node and a tree,
+ * so assignment of const is sometimes nonideal.
+ *
+ * TODO There are several cases in which the 'this' pointer is invalidated
+ * as a result of deallocation. A separate class representing a btree would
+ * alleviate some fragile cases where the implementation must currently
+ * behave correctly if the 'this' pointer is suddenly invalidated by a
+ * callee.
+ */
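+
+ /*
+ * Sketch of the referential-integrity pattern described above (hypothetical
+ * call sequence; durability write-intent signaling omitted for brevity):
+ *
+ * // 1. clear the persistent pointer first...
+ * parent->childForPos(i).Null();
+ * // 2. ...then clean up what it pointed to
+ * child.btreemod<V>()->deallocBucket(child, id);
+ */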
+ template< class V >
+ class BtreeBucket : public BucketBasics<V> {
+ friend class BtreeCursor;
+ friend struct Continuation<V>;
+ public:
+ // make compiler happy:
+ typedef typename V::Key Key;
+ typedef typename V::KeyOwned KeyOwned;
+ typedef typename BucketBasics<V>::KeyNode KeyNode;
+ typedef typename BucketBasics<V>::_KeyNode _KeyNode;
+ typedef typename BucketBasics<V>::Loc Loc;
+ const _KeyNode& k(int i) const { return static_cast< const BucketBasics<V> * >(this)->k(i); }
+ protected:
+ _KeyNode& k(int i) { return static_cast< BucketBasics<V> * >(this)->_k(i); }
+ public:
+ const KeyNode keyNode(int i) const { return static_cast< const BucketBasics<V> * >(this)->keyNode(i); }
+
+ bool isHead() const { return this->parent.isNull(); }
+ void dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const;
+ long long fullValidate(const DiskLoc& thisLoc, const BSONObj &order, long long *unusedCount = 0, bool strict = false, unsigned depth=0) const; /* traverses everything */
+
+ bool isUsed( int i ) const { return this->k(i).isUsed(); }
+ string bucketSummary() const;
+ void dump(unsigned depth=0) const;
+
+ /**
+ * @return true if key exists in index
+ *
+ * @order - indicates order of keys in the index. this is basically the index's key pattern, e.g.:
+ * BSONObj order = ((IndexDetails&)idx).keyPattern();
+ * likewise below in bt_insert() etc.
+ */
+ private:
+ bool exists(const IndexDetails& idx, const DiskLoc &thisLoc, const Key& key, const Ordering& order) const;
+ public:
+
+ /**
+ * @param self - Don't complain about ourself already being in the index; a match against 'self' is not considered a duplicate.
+ * @return true = There is a duplicate used key.
+ */
+ bool wouldCreateDup(
+ const IndexDetails& idx, const DiskLoc &thisLoc,
+ const Key& key, const Ordering& order,
+ const DiskLoc &self) const;
+
+ /**
+ * Preconditions: none
+ * Postconditions: @return a new bucket allocated from pdfile storage
+ * and init()-ed. This bucket is suitable for use as a new root
+ * or any other new node in the tree.
+ */
+ static DiskLoc addBucket(const IndexDetails&);
+
+ /**
+ * Preconditions: none
+ * Postconditions:
+ * - Some header values in this bucket are cleared, and the bucket is
+ * deallocated from pdfile storage.
+ * - The memory at thisLoc is invalidated, and 'this' is invalidated.
+ */
+ void deallocBucket(const DiskLoc thisLoc, const IndexDetails &id);
+
+ /**
+ * Preconditions:
+ * - 'key' has a valid schema for this index.
+ * - All other parameters are valid and consistent with this index if applicable.
+ * Postconditions:
+ * - If key is bigger than KeyMax, @return 2 or 3 and no change.
+ * - If key / recordLoc exist in the btree as an unused key, set them
+ * as used and @return 0
+ * - If key / recordLoc exist in the btree as a used key, @throw
+ * exception 10287 and no change.
+ * - If key / recordLoc do not exist in the btree, they are inserted
+ * and @return 0. The root of the btree may be changed, so
+ * 'this'/thisLoc may no longer be the root upon return.
+ */
+ int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const BSONObj& key, const Ordering &order, bool dupsAllowed,
+ IndexDetails& idx, bool toplevel = true) const;
+
+ /** Does the insert in two steps - one can then use an upgradable lock for step 1, which
+ is the part that may have page faults; that step is also most of the computational work.
+ */
+ void twoStepInsert(DiskLoc thisLoc, Continuation<V> &c, bool dupsAllowed) const;
+
+ /**
+ * Preconditions:
+ * - 'key' has a valid schema for this index, and may have objsize() > KeyMax.
+ * Postconditions:
+ * - If key / recordLoc are in the btree, they are removed (possibly
+ * by being marked as an unused key), @return true, and potentially
+ * invalidate 'this' / thisLoc and change the head.
+ * - If key / recordLoc are not in the btree, @return false and do nothing.
+ */
+ bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const;
+
+ /**
+ * locate may return an "unused" key that is just a marker. so be careful.
+ * looks for a key:recordloc pair.
+ *
+ * @found - returns true if exact match found. note you can get back a position
+ * result even if found is false.
+ */
+ DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
+ int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) const;
+ DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const Key& key, const Ordering &order,
+ int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) const;
+
+ /**
+ * find the first instance of the key
+ * does not handle dups
+ * WARNING: findSingle may not be compound index safe. this may need to change. see notes in
+ * findSingle code.
+ * @return the record location of the first match
+ */
+ DiskLoc findSingle( const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const;
+
+ /**
+ * Advance to the next or previous key in the index.
+ * @param direction direction to advance in.
+ */
+ DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const;
+
+ /** Advance in specified direction to the specified key */
+ void advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const;
+
+ /** Locate a key with fields comprised of a combination of keyBegin fields and keyEnd fields. */
+ static void customLocate(DiskLoc &locInOut, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) ;
+
+ /** @return head of the btree by traversing from current bucket. */
+ const DiskLoc getHead(const DiskLoc& thisLoc) const;
+
+ /** get tree shape */
+ void shape(stringstream&) const;
+
+ static void a_test(IndexDetails&);
+
+ static int getKeyMax();
+
+ protected:
+ /**
+ * Preconditions:
+ * - 0 <= firstIndex <= n
+ * - -1 <= lastIndex <= n ( -1 is equivalent to n )
+ * Postconditions:
+ * - Any children at indexes firstIndex through lastIndex (inclusive)
+ * will have their parent pointers set to thisLoc.
+ */
+ void fixParentPtrs(const DiskLoc thisLoc, int firstIndex = 0, int lastIndex = -1) const;
+
+ /**
+ * Preconditions:
+ * - thisLoc is not the btree head.
+ * - n == 0 is ok
+ * Postconditions:
+ * - All cursors pointing to this bucket will be updated.
+ * - This bucket's parent's child pointer is set to null.
+ * - This bucket is deallocated from pdfile storage.
+ * - 'this' and thisLoc are invalidated.
+ */
+ void delBucket(const DiskLoc thisLoc, const IndexDetails&);
+
+ /**
+ * Preconditions: 0 <= p < n
+ * Postconditions:
+ * - The key at index p is removed from the btree.
+ * - 'this' and thisLoc may be invalidated.
+ * - The tree head may change.
+ */
+ void delKeyAtPos(const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order);
+
+ /**
+ * Preconditions:
+ * - n == 0 is ok
+ * Postconditions:
+ * - If thisLoc is head, or if its body has at least lowWaterMark bytes,
+ * return false and do nothing.
+ * - Otherwise, if thisLoc has left or right neighbors, either balance
+ * or merge with them and return true. Also, 'this' and thisLoc may
+ * be invalidated and the tree head may change.
+ */
+ bool mayBalanceWithNeighbors(const DiskLoc thisLoc, IndexDetails &id, const Ordering &order) const;
+
+ /**
+ * Preconditions:
+ * - 0 <= leftIndex < n
+ * - The child at leftIndex or the child at leftIndex + 1 contains
+ * fewer than lowWaterMark bytes.
+ * Postconditions:
+ * - If the child bucket at leftIndex can merge with the child bucket
+ * at leftIndex + 1, do nothing and return false.
+ * - Otherwise, balance keys between the leftIndex child and the
+ * leftIndex + 1 child, return true, and possibly change the tree head.
+ */
+ bool tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const;
+
+ /**
+ * Preconditions:
+ * - All preconditions of tryBalanceChildren.
+ * - The leftIndex child and leftIndex + 1 child cannot be merged.
+ * Postconditions:
+ * - Keys are moved between the leftIndex child and the leftIndex + 1
+ * child such that neither child has fewer than lowWaterMark bytes.
+ * The tree head may change.
+ */
+ void doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order );
+
+ /**
+ * Preconditions:
+ * - All preconditions of doBalanceChildren
+ * - The leftIndex and leftIndex + 1 children are packed.
+ * - The leftIndex + 1 child has fewer than lowWaterMark bytes.
+ * - split returned by rebalancedSeparatorPos()
+ * Postconditions:
+ * - The key in lchild at index split is set as thisLoc's key at index
+ * leftIndex, which may trigger a split and change the tree head.
+ * The previous key in thisLoc at index leftIndex and all keys with
+ * indexes greater than split in lchild are moved to rchild.
+ */
+ void doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split,
+ BtreeBucket<V> *l, const DiskLoc lchild,
+ BtreeBucket<V> *r, const DiskLoc rchild,
+ IndexDetails &id, const Ordering &order );
+ /**
+ * Preconditions:
+ * - All preconditions of doBalanceChildren
+ * - The leftIndex and leftIndex + 1 children are packed.
+ * - The leftIndex child has fewer than lowWaterMark bytes.
+ * - split returned by rebalancedSeparatorPos()
+ * Postconditions:
+ * - The key in rchild at index split - l->n - 1 is set as thisLoc's key
+ * at index leftIndex, which may trigger a split and change the tree
+ * head. The previous key in thisLoc at index leftIndex and all keys
+ * with indexes less than split - l->n - 1 in rchild are moved to
+ * lchild.
+ */
+ void doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split,
+ BtreeBucket<V> *l, const DiskLoc lchild,
+ BtreeBucket<V> *r, const DiskLoc rchild,
+ IndexDetails &id, const Ordering &order );
+
+ /**
+ * Preconditions:
+ * - 0 <= leftIndex < n
+ * - this->canMergeChildren( thisLoc, leftIndex ) == true
+ * Postconditions:
+ * - All of the above mentioned keys will be placed in the left child.
+ * - The tree may be updated recursively, resulting in 'this' and
+ * thisLoc being invalidated and the tree head being changed.
+ */
+ void doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order);
+
+ /**
+ * Preconditions:
+ * - n == 0
+ * - !nextChild.isNull()
+ * Postconditions:
+ * - 'this' and thisLoc are deallocated (and invalidated), any cursors
+ * to them are updated, and the tree head may change.
+ * - nextChild replaces thisLoc in the btree structure.
+ */
+ void replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id );
+
+ /**
+ * @return true iff the leftIndex and leftIndex + 1 children both exist,
+ * and if their body sizes when packed and the thisLoc key at leftIndex
+ * would fit in a single bucket body.
+ */
+ bool canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const;
+
+ /**
+ * Preconditions:
+ * - leftIndex and leftIndex + 1 children are packed
+ * - leftIndex or leftIndex + 1 child is below lowWaterMark
+ * @return index of the rebalanced separator; the index value is
+ * determined as if we had a bucket with body
+ * <left bucket keys array>.push( <old separator> ).concat( <right bucket keys array> )
+ * and called splitPos( 0 ) on it.
+ */
+ int rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const;
+
+ /**
+ * Preconditions: thisLoc has a parent
+ * @return parent's index of thisLoc.
+ */
+ int indexInParent( const DiskLoc &thisLoc ) const;
+
+ public:
+ Key keyAt(int i) const {
+ if( i >= this->n )
+ return Key();
+ return Key(this->data + k(i).keyDataOfs());
+ }
+ protected:
+
+ /**
+ * Preconditions:
+ * - This bucket is packed.
+ * - Cannot add a key of size KeyMax to this bucket.
+ * - 0 <= keypos <= n is the position of a new key that will be inserted
+ * - lchild is equal to the existing child at index keypos.
+ * Postconditions:
+ * - The thisLoc bucket is split into two packed buckets, possibly
+ * invalidating the initial position of keypos, with a split key
+ * promoted to the parent. The new key/recordLoc will be inserted
+ * into one of the split buckets, and lchild/rchild set appropriately.
+ * Splitting may occur recursively, possibly changing the tree head.
+ */
+ void split(const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const Key& key,
+ const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx);
+
+ /**
+ * Preconditions:
+ * - 0 <= keypos <= n
+ * - If key / recordLoc are inserted at position keypos, with provided
+ * lchild and rchild, the btree ordering requirements will be
+ * maintained.
+ * - lchild is equal to the existing child at index keypos.
+ * - n == 0 is ok.
+ * Postconditions:
+ * - The key / recordLoc are inserted at position keypos, and the
+ * bucket is split if necessary, which may change the tree head.
+ * - The bucket may be packed or split, invalidating the specified value
+ * of keypos.
+ * This function will always modify thisLoc, but it's marked const because
+ * it commonly relies on the specialized write intent mechanism of basicInsert().
+ */
+ void insertHere(const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const Key& key, const Ordering &order,
+ const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx) const;
+
+ /** bt_insert() is basically just a wrapper around this. */
+ int _insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const Key& key, const Ordering &order, bool dupsAllowed,
+ const DiskLoc lChild, const DiskLoc rChild, IndexDetails &idx) const;
+
+ void insertStepOne(DiskLoc thisLoc, Continuation<V>& c, bool dupsAllowed) const;
+
+ bool find(const IndexDetails& idx, const Key& key, const DiskLoc &recordLoc, const Ordering &order, int& pos, bool assertIfDup) const;
+ static bool customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) ;
+ static void findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey);
+ static int customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction );
+
+ /** If child is non null, set its parent to thisLoc */
+ static void fix(const DiskLoc thisLoc, const DiskLoc child);
+
+ /**
+ * Preconditions:
+ * - 0 <= keypos < n
+ * - If the specified key and recordLoc are placed in keypos of thisLoc,
+ * and lchild and rchild are set, the btree ordering properties will
+ * be maintained.
+ * - rchild == childForPos( keypos + 1 )
+ * - childForPos( keypos ) is referenced elsewhere if nonnull.
+ * Postconditions:
+ * - The key at keypos will be replaced with the specified key and
+ * lchild, potentially splitting this bucket and changing the tree
+ * head.
+ * - childForPos( keypos ) will be orphaned.
+ */
+ void setInternalKey( const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const Key &key, const Ordering &order,
+ const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx);
+
+ /**
+ * Preconditions:
+ * - 0 <= keypos < n
+ * - The keypos or keypos+1 indexed child is non null.
+ * Postconditions:
+ * - The specified key is deleted by replacing it with another key if
+ * possible. This replacement may cause a split and change the tree
+ * head. The replacement key will be deleted from its original
+ * location, potentially causing merges and splits that may invalidate
+ * 'this' and thisLoc and change the tree head.
+ * - If the key cannot be replaced, it will be marked as unused. This
+ * is only expected in legacy btrees.
+ */
+ void deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order );
+ public:
+ /** simply builds and returns a dup key error message string */
+ static string dupKeyError( const IndexDetails& idx , const Key& key );
+ };
+#pragma pack()
+
+ class FieldRangeVector;
+ class FieldRangeVectorIterator;
+
+ class BtreeCursor : public Cursor {
+ protected:
+ BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction );
+ BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction );
+ public:
+ virtual ~BtreeCursor();
+ /** makes an appropriate subclass depending on the index version */
+ static BtreeCursor* make( NamespaceDetails *_d, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction );
+ static BtreeCursor* make( NamespaceDetails *_d, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction );
+ static BtreeCursor* make( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction );
+ static BtreeCursor* make( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction );
+
+ virtual bool ok() { return !bucket.isNull(); }
+ virtual bool advance();
+ virtual void noteLocation(); // updates keyAtKeyOfs...
+ virtual void checkLocation() = 0;
+ virtual bool supportGetMore() { return true; }
+ virtual bool supportYields() { return true; }
+
+ /**
+         * Used for multikey index traversal to avoid returning dups; see Matcher::matches().
+         * On a multikey index traversal:
+         *   if loc has already been sent, returns true;
+         *   otherwise, marks loc as sent.
+         * @return false if loc has not been seen before
+ */
+ virtual bool getsetdup(DiskLoc loc) {
+ if( _multikey ) {
+ pair<set<DiskLoc>::iterator, bool> p = _dups.insert(loc);
+ return !p.second;
+ }
+ return false;
+ }
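+
+        /* Illustrative sketch, not part of the interface: a multikey index can
+         * yield the same record under several keys, so a scan dedupes with
+         * getsetdup(). Roughly (hypothetical driver loop):
+         *
+         *   while( c->ok() ) {
+         *       if( !c->getsetdup( c->currLoc() ) ) {
+         *           // first time this record is seen on this traversal
+         *       }
+         *       c->advance();
+         *   }
+         */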
+
+ virtual bool modifiedKeys() const { return _multikey; }
+ virtual bool isMultiKey() const { return _multikey; }
+
+ /*const _KeyNode& _currKeyNode() const {
+ assert( !bucket.isNull() );
+ const _KeyNode& kn = keyNode(keyOfs);
+ assert( kn.isUsed() );
+ return kn;
+ }*/
+
+ /** returns BSONObj() if ofs is out of range */
+ virtual BSONObj keyAt(int ofs) const = 0;
+
+ virtual BSONObj currKey() const = 0;
+ virtual BSONObj indexKeyPattern() { return indexDetails.keyPattern(); }
+
+ virtual void aboutToDeleteBucket(const DiskLoc& b) {
+ if ( bucket == b )
+ keyOfs = -1;
+ }
+
+ virtual DiskLoc currLoc() = 0; // { return !bucket.isNull() ? _currKeyNode().recordLoc : DiskLoc(); }
+ virtual DiskLoc refLoc() { return currLoc(); }
+ virtual Record* _current() { return currLoc().rec(); }
+ virtual BSONObj current() { return BSONObj(_current()); }
+ virtual string toString();
+
+ BSONObj prettyKey( const BSONObj &key ) const {
+ return key.replaceFieldNames( indexDetails.keyPattern() ).clientReadable();
+ }
+
+ virtual BSONObj prettyIndexBounds() const;
+
+ virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); }
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return _matcher; }
+
+ virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { _matcher = matcher; }
+
+ virtual long long nscanned() { return _nscanned; }
+
+ /** for debugging only */
+ const DiskLoc getBucket() const { return bucket; }
+ int getKeyOfs() const { return keyOfs; }
+
+ // just for unit tests
+ virtual bool curKeyHasChild() = 0;
+
+ protected:
+ /**
+ * Our btrees may (rarely) have "unused" keys when items are deleted.
+ * Skip past them.
+ */
+ virtual bool skipUnusedKeys() = 0;
+
+ bool skipOutOfRangeKeysAndCheckEnd();
+ void skipAndCheck();
+ void checkEnd();
+
+ /** selective audits on construction */
+ void audit();
+
+ virtual void _audit() = 0;
+ virtual DiskLoc _locate(const BSONObj& key, const DiskLoc& loc) = 0;
+ virtual DiskLoc _advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) = 0;
+ virtual void _advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) = 0;
+
+ /** set initial bucket */
+ void initWithoutIndependentFieldRanges();
+
+ /** if afterKey is true, we want the first key with values of the keyBegin fields greater than keyBegin */
+ void advanceTo( const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive );
+
+ set<DiskLoc> _dups;
+ NamespaceDetails * const d;
+ const int idxNo;
+ BSONObj startKey;
+ BSONObj endKey;
+ bool _endKeyInclusive;
+        bool _multikey; // this must be updated every getmore batch in case the index became multikey
+ const IndexDetails& indexDetails;
+ const BSONObj _order;
+ const Ordering _ordering;
+ DiskLoc bucket;
+ int keyOfs;
+ const int _direction; // 1=fwd,-1=reverse
+ BSONObj keyAtKeyOfs; // so we can tell if things moved around on us between the query and the getMore call
+ DiskLoc locAtKeyOfs;
+ const shared_ptr< FieldRangeVector > _bounds;
+ auto_ptr< FieldRangeVectorIterator > _boundsIterator;
+ shared_ptr< CoveredIndexMatcher > _matcher;
+ bool _independentFieldRanges;
+ long long _nscanned;
+ };
+
+ template< class V >
+ struct Continuation {
+ //Continuation(const typename V::Key & k);
+ Continuation(DiskLoc thisLoc, DiskLoc _recordLoc, const BSONObj &_key,
+ Ordering _order, IndexDetails& _idx) :
+ bLoc(thisLoc), recordLoc(_recordLoc), key(_key), order(_order), idx(_idx) {
+ op = Nothing;
+ }
+
+ DiskLoc bLoc;
+ DiskLoc recordLoc;
+ typename V::KeyOwned key;
+ const Ordering order;
+ IndexDetails& idx;
+ enum Op { Nothing, SetUsed, InsertHere } op;
+
+ int pos;
+ const BtreeBucket<V> *b;
+
+ void stepTwo() {
+ if( op == Nothing )
+ return;
+ else if( op == SetUsed ) {
+ const typename V::_KeyNode& kn = b->k(pos);
+ kn.writing().setUsed();
+ }
+ else {
+ b->insertHere(bLoc, pos, recordLoc, key, order, DiskLoc(), DiskLoc(), idx);
+ }
+ }
+ };
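+
+    /* Sketch of how the two-phase Continuation is meant to be driven (the
+     * driver shown here is illustrative, not a committed API): step one finds
+     * the position and records an Op; stepTwo() applies it.
+     *
+     *   Continuation<V1> c(bucketLoc, recLoc, keyObj, ordering, idx);
+     *   bucketLoc.btree<V1>()->insertStepOne(bucketLoc, c, dupsAllowed);
+     *   c.stepTwo(); // Nothing, SetUsed, or InsertHere depending on c.op
+     */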
+
+ /** Renames the index namespace for this btree's index. */
+ void renameIndexNamespace(const char *oldNs, const char *newNs);
+
+ /**
+ * give us a writable version of the btree bucket (declares write intent).
+ * note it is likely more efficient to declare write intent on something smaller when you can.
+ */
+ template< class V >
+ BtreeBucket<V> * DiskLoc::btreemod() const {
+ assert( _a != -1 );
+ BtreeBucket<V> *b = const_cast< BtreeBucket<V> * >( btree<V>() );
+ return static_cast< BtreeBucket<V>* >( getDur().writingPtr( b, V::BucketSize ) );
+ }
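+
+    /* Usage sketch (illustrative): btreemod() casts away const and declares
+     * write intent on the whole bucket, so changes made through the returned
+     * pointer are journaled:
+     *
+     *   BtreeBucket<V1> *b = loc.btreemod<V1>();
+     *   b->setTempNext(next); // covered by the intent declared in btreemod()
+     *
+     * As noted above, declaring intent on something smaller is cheaper when
+     * that is possible.
+     */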
+
+ template< class V >
+ BucketBasics<V>::KeyNode::KeyNode(const BucketBasics<V>& bb, const _KeyNode &k) :
+ prevChildBucket(k.prevChildBucket),
+ recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs())
+ { }
+
+} // namespace mongo
diff --git a/src/mongo/db/btreebuilder.cpp b/src/mongo/db/btreebuilder.cpp
new file mode 100644
index 00000000000..0ec587a1958
--- /dev/null
+++ b/src/mongo/db/btreebuilder.cpp
@@ -0,0 +1,184 @@
+// btreebuilder.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "db.h"
+#include "btree.h"
+#include "pdfile.h"
+#include "json.h"
+#include "clientcursor.h"
+#include "client.h"
+#include "dbhelpers.h"
+#include "curop-inl.h"
+#include "stats/counters.h"
+#include "dur_commitjob.h"
+#include "btreebuilder.h"
+
+namespace mongo {
+
+ /* --- BtreeBuilder --- */
+
+ template<class V>
+ BtreeBuilder<V>::BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx) :
+ dupsAllowed(_dupsAllowed),
+ idx(_idx),
+ n(0),
+ order( idx.keyPattern() ),
+ ordering( Ordering::make(idx.keyPattern()) ) {
+ first = cur = BtreeBucket<V>::addBucket(idx);
+ b = cur.btreemod<V>();
+ committed = false;
+ }
+
+ template<class V>
+ void BtreeBuilder<V>::newBucket() {
+ DiskLoc L = BtreeBucket<V>::addBucket(idx);
+ b->setTempNext(L);
+ cur = L;
+ b = cur.btreemod<V>();
+ }
+
+ template<class V>
+ void BtreeBuilder<V>::mayCommitProgressDurably() {
+ if ( getDur().commitIfNeeded() ) {
+ b = cur.btreemod<V>();
+ }
+ }
+
+ template<class V>
+ void BtreeBuilder<V>::addKey(BSONObj& _key, DiskLoc loc) {
+
+ auto_ptr< KeyOwned > key( new KeyOwned(_key) );
+ if ( key->dataSize() > BtreeBucket<V>::KeyMax ) {
+ problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace()
+ << ' ' << key->dataSize() << ' ' << key->toString() << endl;
+ return;
+ }
+
+ if( !dupsAllowed ) {
+ if( n > 0 ) {
+ int cmp = keyLast->woCompare(*key, ordering);
+ massert( 10288 , "bad key order in BtreeBuilder - server internal error", cmp <= 0 );
+ if( cmp == 0 ) {
+ //if( !dupsAllowed )
+ uasserted( ASSERT_ID_DUPKEY , BtreeBucket<V>::dupKeyError( idx , *keyLast ) );
+ }
+ }
+ }
+
+ if ( ! b->_pushBack(loc, *key, ordering, DiskLoc()) ) {
+ // bucket was full
+ newBucket();
+ b->pushBack(loc, *key, ordering, DiskLoc());
+ }
+ keyLast = key;
+ n++;
+ mayCommitProgressDurably();
+ }
+
+ template<class V>
+ void BtreeBuilder<V>::buildNextLevel(DiskLoc loc) {
+ int levels = 1;
+ while( 1 ) {
+ if( loc.btree<V>()->tempNext().isNull() ) {
+ // only 1 bucket at this level. we are done.
+ getDur().writingDiskLoc(idx.head) = loc;
+ break;
+ }
+ levels++;
+
+ DiskLoc upLoc = BtreeBucket<V>::addBucket(idx);
+ DiskLoc upStart = upLoc;
+ BtreeBucket<V> *up = upLoc.btreemod<V>();
+
+ DiskLoc xloc = loc;
+ while( !xloc.isNull() ) {
+ if ( getDur().commitIfNeeded() ) {
+ b = cur.btreemod<V>();
+ up = upLoc.btreemod<V>();
+ }
+
+ BtreeBucket<V> *x = xloc.btreemod<V>();
+ Key k;
+ DiskLoc r;
+ x->popBack(r,k);
+ bool keepX = ( x->n != 0 );
+ DiskLoc keepLoc = keepX ? xloc : x->nextChild;
+
+ if ( ! up->_pushBack(r, k, ordering, keepLoc) ) {
+ // current bucket full
+ DiskLoc n = BtreeBucket<V>::addBucket(idx);
+ up->setTempNext(n);
+ upLoc = n;
+ up = upLoc.btreemod<V>();
+ up->pushBack(r, k, ordering, keepLoc);
+ }
+
+ DiskLoc nextLoc = x->tempNext(); // get next in chain at current level
+ if ( keepX ) {
+ x->parent = upLoc;
+ }
+ else {
+ if ( !x->nextChild.isNull() ) {
+ DiskLoc ll = x->nextChild;
+ ll.btreemod<V>()->parent = upLoc;
+ //(x->nextChild.btreemod<V>())->parent = upLoc;
+ }
+ x->deallocBucket( xloc, idx );
+ }
+ xloc = nextLoc;
+ }
+
+ loc = upStart;
+ mayCommitProgressDurably();
+ }
+
+ if( levels > 1 )
+ log(2) << "btree levels: " << levels << endl;
+ }
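+
+    /* Shape of the pass above, as a sketch: the leaf level is a chain of
+     * buckets linked by tempNext(). Each iteration pops the last key of every
+     * bucket in the chain and pushes it, with that bucket as its child, into a
+     * new chain one level up; the loop repeats on the new chain until a level
+     * consists of a single bucket, which becomes idx.head. */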
+
+ /** when all addKeys are done, we then build the higher levels of the tree */
+ template<class V>
+ void BtreeBuilder<V>::commit() {
+ buildNextLevel(first);
+ committed = true;
+ }
+
+ template<class V>
+ BtreeBuilder<V>::~BtreeBuilder() {
+ DESTRUCTOR_GUARD(
+ if( !committed ) {
+ log(2) << "Rolling back partially built index space" << endl;
+ DiskLoc x = first;
+ while( !x.isNull() ) {
+ DiskLoc next = x.btree<V>()->tempNext();
+ string ns = idx.indexNamespace();
+ theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), x.rec(), x);
+ x = next;
+ getDur().commitIfNeeded();
+ }
+ assert( idx.head.isNull() );
+ log(2) << "done rollback" << endl;
+ }
+ )
+ }
+
+ template class BtreeBuilder<V0>;
+ template class BtreeBuilder<V1>;
+
+}
diff --git a/src/mongo/db/btreebuilder.h b/src/mongo/db/btreebuilder.h
new file mode 100644
index 00000000000..6de55d89299
--- /dev/null
+++ b/src/mongo/db/btreebuilder.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include "btree.h"
+
+namespace mongo {
+
+ /**
+ * build btree from the bottom up
+ */
+ template< class V >
+ class BtreeBuilder {
+ typedef typename V::KeyOwned KeyOwned;
+ typedef typename V::Key Key;
+
+ bool dupsAllowed;
+ IndexDetails& idx;
+ /** Number of keys added to btree. */
+ unsigned long long n;
+ /** Last key passed to addKey(). */
+ auto_ptr< typename V::KeyOwned > keyLast;
+ BSONObj order;
+ Ordering ordering;
+ /** true iff commit() completed successfully. */
+ bool committed;
+
+ DiskLoc cur, first;
+ BtreeBucket<V> *b;
+
+ void newBucket();
+ void buildNextLevel(DiskLoc);
+ void mayCommitProgressDurably();
+
+ public:
+ ~BtreeBuilder();
+
+ BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx);
+
+ /**
+ * Preconditions: 'key' is > or >= last key passed to this function (depends on _dupsAllowed)
+ * Postconditions: 'key' is added to intermediate storage.
+ */
+ void addKey(BSONObj& key, DiskLoc loc);
+
+ /**
+ * commit work. if not called, destructor will clean up partially completed work
+ * (in case exception has happened).
+ */
+ void commit();
+
+ unsigned long long getn() { return n; }
+ };
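+
+    /* Typical use, sketched (variable names are illustrative): keys must be
+     * fed in index order, and commit() must be called to keep the result.
+     *
+     *   BtreeBuilder<V1> builder( dupsAllowed, idx );
+     *   // for each (key, loc) in sorted order:
+     *   //     builder.addKey( key, loc );
+     *   builder.commit(); // otherwise ~BtreeBuilder rolls the buckets back
+     */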
+
+}
diff --git a/src/mongo/db/btreecursor.cpp b/src/mongo/db/btreecursor.cpp
new file mode 100644
index 00000000000..7ddd4874ef6
--- /dev/null
+++ b/src/mongo/db/btreecursor.cpp
@@ -0,0 +1,457 @@
+// btreecursor.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "btree.h"
+#include "pdfile.h"
+#include "jsobj.h"
+#include "curop-inl.h"
+#include "queryutil.h"
+
+namespace mongo {
+
+ template< class V >
+ class BtreeCursorImpl : public BtreeCursor {
+ public:
+ typedef typename BucketBasics<V>::KeyNode KeyNode;
+ typedef typename V::Key Key;
+ typedef typename V::_KeyNode _KeyNode;
+
+ BtreeCursorImpl(NamespaceDetails *a, int b, const IndexDetails& c, const BSONObj &d, const BSONObj &e, bool f, int g) :
+ BtreeCursor(a,b,c,d,e,f,g) { }
+ BtreeCursorImpl(NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction ) :
+ BtreeCursor(_d,_idxNo,_id,_bounds,_direction )
+ {
+ pair< DiskLoc, int > noBestParent;
+ indexDetails.head.btree<V>()->customLocate( bucket, keyOfs, startKey, 0, false, _boundsIterator->cmp(), _boundsIterator->inc(), _ordering, _direction, noBestParent );
+ skipAndCheck();
+ dassert( _dups.size() == 0 );
+ }
+
+ virtual DiskLoc currLoc() {
+ if( bucket.isNull() ) return DiskLoc();
+ return currKeyNode().recordLoc;
+ }
+
+ virtual BSONObj keyAt(int ofs) const {
+ assert( !bucket.isNull() );
+ const BtreeBucket<V> *b = bucket.btree<V>();
+ int n = b->getN();
+            if( n == 0xffff ) {
+                // n == 0xffff marks a bucket that has been deallocated
+                throw UserException(15850, "keyAt bucket deleted");
+            }
+ dassert( n >= 0 && n < 10000 );
+ return ofs >= n ? BSONObj() : b->keyNode(ofs).key.toBson();
+ }
+
+ virtual BSONObj currKey() const {
+ assert( !bucket.isNull() );
+ return bucket.btree<V>()->keyNode(keyOfs).key.toBson();
+ }
+
+ virtual bool curKeyHasChild() {
+ return !currKeyNode().prevChildBucket.isNull();
+ }
+
+ bool skipUnusedKeys() {
+ int u = 0;
+ while ( 1 ) {
+ if ( !ok() )
+ break;
+ const _KeyNode& kn = keyNode(keyOfs);
+ if ( kn.isUsed() )
+ break;
+ bucket = _advance(bucket, keyOfs, _direction, "skipUnusedKeys");
+ u++;
+ //don't include unused keys in nscanned
+ //++_nscanned;
+ }
+ if ( u > 10 )
+ OCCASIONALLY log() << "btree unused skipped:" << u << '\n';
+ return u;
+ }
+
+ /* Since the last noteLocation(), our key may have moved around, and that old cached
+ information may thus be stale and wrong (although often it is right). We check
+ that here; if we have moved, we have to search back for where we were at.
+
+ i.e., after operations on the index, the BtreeCursor's cached location info may
+ be invalid. This function ensures validity, so you should call it before using
+ the cursor if other writers have used the database since the last noteLocation
+ call.
+ */
+ void checkLocation() {
+ if ( eof() )
+ return;
+
+ _multikey = d->isMultikey(idxNo);
+
+ if ( keyOfs >= 0 ) {
+ assert( !keyAtKeyOfs.isEmpty() );
+
+ try {
+ // Note keyAt() returns an empty BSONObj if keyOfs is now out of range,
+ // which is possible as keys may have been deleted.
+ int x = 0;
+ while( 1 ) {
+ // if ( b->keyAt(keyOfs).woEqual(keyAtKeyOfs) &&
+ // b->k(keyOfs).recordLoc == locAtKeyOfs ) {
+ if ( keyAt(keyOfs).binaryEqual(keyAtKeyOfs) ) {
+ const _KeyNode& kn = keyNode(keyOfs);
+ if( kn.recordLoc == locAtKeyOfs ) {
+ if ( !kn.isUsed() ) {
+ // we were deleted but still exist as an unused
+ // marker key. advance.
+ skipUnusedKeys();
+ }
+ return;
+ }
+ }
+
+ // we check one key earlier too, in case a key was just deleted. this is
+ // important so that multi updates are reasonably fast.
+ if( keyOfs == 0 || x++ )
+ break;
+ keyOfs--;
+ }
+ }
+ catch(UserException& e) {
+ if( e.getCode() != 15850 )
+ throw;
+ // hack: fall through if bucket was just deleted. should only happen under deleteObjects()
+ DEV log() << "debug info: bucket was deleted" << endl;
+ }
+ }
+
+ /* normally we don't get to here. when we do, old position is no longer
+ valid and we must refind where we left off (which is expensive)
+ */
+
+ /* TODO: Switch to keep indexdetails and do idx.head! */
+ bucket = _locate(keyAtKeyOfs, locAtKeyOfs);
+ RARELY log() << "key seems to have moved in the index, refinding. " << bucket.toString() << endl;
+ if ( ! bucket.isNull() )
+ skipUnusedKeys();
+
+ }
+
+ protected:
+ virtual void _advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) {
+ thisLoc.btree<V>()->advanceTo(thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction);
+ }
+ virtual DiskLoc _advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) {
+ return thisLoc.btree<V>()->advance(thisLoc, keyOfs, direction, caller);
+ }
+ virtual void _audit() {
+ out() << "BtreeCursor(). dumping head bucket" << endl;
+ indexDetails.head.btree<V>()->dump();
+ }
+ virtual DiskLoc _locate(const BSONObj& key, const DiskLoc& loc) {
+ bool found;
+ return indexDetails.head.btree<V>()->
+ locate(indexDetails, indexDetails.head, key, _ordering, keyOfs, found, loc, _direction);
+ }
+
+ const _KeyNode& keyNode(int keyOfs) const {
+ return bucket.btree<V>()->k(keyOfs);
+ }
+
+ private:
+ const KeyNode currKeyNode() const {
+ assert( !bucket.isNull() );
+ const BtreeBucket<V> *b = bucket.btree<V>();
+ return b->keyNode(keyOfs);
+ }
+ };
+
+ template class BtreeCursorImpl<V0>;
+ template class BtreeCursorImpl<V1>;
+
+ /*
+ class BtreeCursorV1 : public BtreeCursor {
+ public:
+ typedef BucketBasics<V1>::KeyNode KeyNode;
+ typedef V1::Key Key;
+
+ BtreeCursorV1(NamespaceDetails *a, int b, const IndexDetails& c, const BSONObj &d, const BSONObj &e, bool f, int g) :
+ BtreeCursor(a,b,c,d,e,f,g) { }
+ BtreeCursorV1(NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction) :
+ BtreeCursor(_d,_idxNo,_id,_bounds,_direction)
+ {
+ pair< DiskLoc, int > noBestParent;
+ indexDetails.head.btree<V1>()->customLocate( bucket, keyOfs, startKey, 0, false, _boundsIterator->cmp(), _boundsIterator->inc(), _ordering, _direction, noBestParent );
+ skipAndCheck();
+ dassert( _dups.size() == 0 );
+ }
+
+ virtual DiskLoc currLoc() {
+ if( bucket.isNull() ) return DiskLoc();
+ return currKeyNode().recordLoc;
+ }
+
+ virtual BSONObj currKey() const {
+ assert( !bucket.isNull() );
+ return bucket.btree<V1>()->keyNode(keyOfs).key.toBson();
+ }
+
+ protected:
+ virtual void _advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) {
+ thisLoc.btree<V1>()->advanceTo(thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction);
+ }
+ virtual DiskLoc _advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) {
+ return thisLoc.btree<V1>()->advance(thisLoc, keyOfs, direction, caller);
+ }
+ virtual void _audit() {
+ out() << "BtreeCursor(). dumping head bucket" << endl;
+ indexDetails.head.btree<V1>()->dump();
+ }
+ virtual DiskLoc _locate(const BSONObj& key, const DiskLoc& loc);
+ virtual const _KeyNode& keyNode(int keyOfs) {
+ return bucket.btree<V1>()->k(keyOfs);
+ }
+
+ private:
+ const KeyNode currKeyNode() const {
+ assert( !bucket.isNull() );
+ const BtreeBucket<V1> *b = bucket.btree<V1>();
+ return b->keyNode(keyOfs);
+ }
+ };*/
+
+ BtreeCursor* BtreeCursor::make(
+ NamespaceDetails *_d, const IndexDetails& _id,
+ const shared_ptr< FieldRangeVector > &_bounds, int _direction )
+ {
+ return make( _d, _d->idxNo( (IndexDetails&) _id), _id, _bounds, _direction );
+ }
+
+ BtreeCursor* BtreeCursor::make(
+ NamespaceDetails *_d, const IndexDetails& _id,
+ const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction)
+ {
+ return make( _d, _d->idxNo( (IndexDetails&) _id), _id, startKey, endKey, endKeyInclusive, direction );
+ }
+
+
+ BtreeCursor* BtreeCursor::make(
+ NamespaceDetails *_d, int _idxNo, const IndexDetails& _id,
+ const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction)
+ {
+ int v = _id.version();
+ BtreeCursor *c = 0;
+ if( v == 1 ) {
+ c = new BtreeCursorImpl<V1>(_d,_idxNo,_id,startKey,endKey,endKeyInclusive,direction);
+ }
+ else if( v == 0 ) {
+ c = new BtreeCursorImpl<V0>(_d,_idxNo,_id,startKey,endKey,endKeyInclusive,direction);
+ }
+ else {
+ uasserted(14800, str::stream() << "unsupported index version " << v);
+ }
+ c->initWithoutIndependentFieldRanges();
+ dassert( c->_dups.size() == 0 );
+ return c;
+ }
+
+ BtreeCursor* BtreeCursor::make(
+ NamespaceDetails *_d, int _idxNo, const IndexDetails& _id,
+ const shared_ptr< FieldRangeVector > &_bounds, int _direction )
+ {
+ int v = _id.version();
+ if( v == 1 )
+ return new BtreeCursorImpl<V1>(_d,_idxNo,_id,_bounds,_direction);
+ if( v == 0 )
+ return new BtreeCursorImpl<V0>(_d,_idxNo,_id,_bounds,_direction);
+ uasserted(14801, str::stream() << "unsupported index version " << v);
+
+ // just check we are in sync with this method
+ dassert( IndexDetails::isASupportedIndexVersionNumber(v) );
+
+ return 0;
+ }
+
+ BtreeCursor::BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails &_id,
+ const BSONObj &_startKey, const BSONObj &_endKey, bool endKeyInclusive, int _direction ) :
+ d(_d), idxNo(_idxNo),
+ startKey( _startKey ),
+ endKey( _endKey ),
+ _endKeyInclusive( endKeyInclusive ),
+ _multikey( d->isMultikey( idxNo ) ),
+ indexDetails( _id ),
+ _order( _id.keyPattern() ),
+ _ordering( Ordering::make( _order ) ),
+ _direction( _direction ),
+ _independentFieldRanges( false ),
+ _nscanned( 0 ) {
+ audit();
+ }
+
+ BtreeCursor::BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction )
+ :
+ d(_d), idxNo(_idxNo),
+ _endKeyInclusive( true ),
+ _multikey( d->isMultikey( idxNo ) ),
+ indexDetails( _id ),
+ _order( _id.keyPattern() ),
+ _ordering( Ordering::make( _order ) ),
+ _direction( _direction ),
+ _bounds( ( assert( _bounds.get() ), _bounds ) ),
+ _boundsIterator( new FieldRangeVectorIterator( *_bounds ) ),
+ _independentFieldRanges( true ),
+ _nscanned( 0 ) {
+ audit();
+ startKey = _bounds->startKey();
+ _boundsIterator->advance( startKey ); // handles initialization
+ _boundsIterator->prepDive();
+ bucket = indexDetails.head;
+ keyOfs = 0;
+ }
+
+ /** Properly destroy forward declared class members. */
+ BtreeCursor::~BtreeCursor() {}
+
+ void BtreeCursor::audit() {
+ dassert( d->idxNo((IndexDetails&) indexDetails) == idxNo );
+ }
+
+ void BtreeCursor::initWithoutIndependentFieldRanges() {
+ if ( indexDetails.getSpec().getType() ) {
+ startKey = indexDetails.getSpec().getType()->fixKey( startKey );
+ endKey = indexDetails.getSpec().getType()->fixKey( endKey );
+ }
+ bucket = _locate(startKey, _direction > 0 ? minDiskLoc : maxDiskLoc);
+ if ( ok() ) {
+ _nscanned = 1;
+ }
+ skipUnusedKeys();
+ checkEnd();
+ }
+
+ void BtreeCursor::skipAndCheck() {
+ long long startNscanned = _nscanned;
+ skipUnusedKeys();
+ while( 1 ) {
+ if ( !skipOutOfRangeKeysAndCheckEnd() ) {
+ break;
+ }
+ do {
+ if ( _nscanned > startNscanned + 20 ) {
+ skipUnusedKeys();
+ return;
+ }
+ } while( skipOutOfRangeKeysAndCheckEnd() );
+ if ( !skipUnusedKeys() ) {
+ break;
+ }
+ }
+ }
+
+ bool BtreeCursor::skipOutOfRangeKeysAndCheckEnd() {
+ if ( !ok() ) {
+ return false;
+ }
+ int ret = _boundsIterator->advance( currKey() );
+ if ( ret == -2 ) {
+ bucket = DiskLoc();
+ return false;
+ }
+ else if ( ret == -1 ) {
+ ++_nscanned;
+ return false;
+ }
+ ++_nscanned;
+ advanceTo( currKey(), ret, _boundsIterator->after(), _boundsIterator->cmp(), _boundsIterator->inc() );
+ return true;
+ }
+
+ // Return a value in the set {-1, 0, 1} to represent the sign of parameter i.
+ int sgn( int i ) {
+ if ( i == 0 )
+ return 0;
+ return i > 0 ? 1 : -1;
+ }
+
+ // Check if the current key is beyond endKey.
+ void BtreeCursor::checkEnd() {
+ if ( bucket.isNull() )
+ return;
+ if ( !endKey.isEmpty() ) {
+ int cmp = sgn( endKey.woCompare( currKey(), _order ) );
+ if ( ( cmp != 0 && cmp != _direction ) ||
+ ( cmp == 0 && !_endKeyInclusive ) )
+ bucket = DiskLoc();
+ }
+ }
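+
+    // Worked example of the test above: with _direction == 1 (forward) and an
+    // exclusive endKey, cmp is 1 while currKey < endKey (keep scanning), 0 at
+    // the end key (stop, because !_endKeyInclusive), and -1 past it (stop,
+    // because cmp != 0 and cmp != _direction).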
+
+ void BtreeCursor::advanceTo( const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive) {
+ _advanceTo( bucket, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, _ordering, _direction );
+ }
+
+ bool BtreeCursor::advance() {
+ killCurrentOp.checkForInterrupt();
+ if ( bucket.isNull() )
+ return false;
+
+ bucket = _advance(bucket, keyOfs, _direction, "BtreeCursor::advance");
+
+ if ( !_independentFieldRanges ) {
+ skipUnusedKeys();
+ checkEnd();
+ if ( ok() ) {
+ ++_nscanned;
+ }
+ }
+ else {
+ skipAndCheck();
+ }
+ return ok();
+ }
+
+ void BtreeCursor::noteLocation() {
+ if ( !eof() ) {
+ BSONObj o = currKey().getOwned();
+ keyAtKeyOfs = o;
+ locAtKeyOfs = currLoc();
+ }
+ }
+
+ string BtreeCursor::toString() {
+ string s = string("BtreeCursor ") + indexDetails.indexName();
+ if ( _direction < 0 ) s += " reverse";
+ if ( _bounds.get() && _bounds->size() > 1 ) s += " multi";
+ return s;
+ }
+
+ BSONObj BtreeCursor::prettyIndexBounds() const {
+ if ( !_independentFieldRanges ) {
+ return BSON( "start" << prettyKey( startKey ) << "end" << prettyKey( endKey ) );
+ }
+ else {
+ return _bounds->obj();
+ }
+ }
+
+ /* ----------------------------------------------------------------------------- */
+
+ struct BtreeCursorUnitTest {
+ BtreeCursorUnitTest() {
+ assert( minDiskLoc.compare(maxDiskLoc) < 0 );
+ }
+ } btut;
+
+} // namespace mongo
diff --git a/src/mongo/db/cap.cpp b/src/mongo/db/cap.cpp
new file mode 100644
index 00000000000..a8be2383115
--- /dev/null
+++ b/src/mongo/db/cap.cpp
@@ -0,0 +1,457 @@
+// @file cap.cpp capped collection related
+// the "old" version (<= v1.6)
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "pdfile.h"
+#include "db.h"
+#include "../util/mmap.h"
+#include "../util/hashtab.h"
+#include "../scripting/engine.h"
+#include "btree.h"
+#include <algorithm>
+#include <list>
+#include "json.h"
+#include "clientcursor.h"
+
+/*
+ capped collection layout
+
+ d's below won't exist if things align perfectly:
+
+ extent1 -> extent2 -> extent3
+ ------------------- ----------------------- ---------------------
+ d r r r r r r r r d d r r r r d r r r r r d d r r r r r r r r r d
+ ^ ^
+ oldest newest
+
+ ^cappedFirstDeletedInCurExtent()
+ ^cappedLastDelRecLastExtent()
+ ^cappedListOfAllDeletedRecords()
+*/
+
+
+namespace mongo {
+
+ /* combine adjacent deleted records *for the current extent* of the capped collection
+
+ this is O(n^2) but we call it for capped tables where typically n==1 or 2!
+ (or 3...there will be a little unused sliver at the end of the extent.)
+ */
+ void NamespaceDetails::compact() {
+ assert(capped);
+
+ list<DiskLoc> drecs;
+
+ // Pull out capExtent's DRs from deletedList
+ DiskLoc i = cappedFirstDeletedInCurExtent();
+ for (; !i.isNull() && inCapExtent( i ); i = i.drec()->nextDeleted )
+ drecs.push_back( i );
+
+ getDur().writingDiskLoc( cappedFirstDeletedInCurExtent() ) = i;
+
+ // This is the O(n^2) part.
+ drecs.sort();
+
+ list<DiskLoc>::iterator j = drecs.begin();
+ assert( j != drecs.end() );
+ DiskLoc a = *j;
+ while ( 1 ) {
+ j++;
+ if ( j == drecs.end() ) {
+ DEBUGGING out() << "TEMP: compact adddelrec\n";
+ addDeletedRec(a.drec(), a);
+ break;
+ }
+ DiskLoc b = *j;
+ while ( a.a() == b.a() && a.getOfs() + a.drec()->lengthWithHeaders == b.getOfs() ) {
+ // a & b are adjacent. merge.
+ getDur().writingInt( a.drec()->lengthWithHeaders ) += b.drec()->lengthWithHeaders;
+ j++;
+ if ( j == drecs.end() ) {
+ DEBUGGING out() << "temp: compact adddelrec2\n";
+ addDeletedRec(a.drec(), a);
+ return;
+ }
+ b = *j;
+ }
+ DEBUGGING out() << "temp: compact adddelrec3\n";
+ addDeletedRec(a.drec(), a);
+ a = b;
+ }
+ }
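+
+    /* The merge step above can be modeled in isolation. A minimal standalone
+     * sketch (hypothetical DRec type; assumes input sorted by offset, as drecs
+     * is after drecs.sort()): two deleted records coalesce exactly when
+     * a.ofs + a.len == b.ofs.
+     *
+     *   struct DRec { int ofs; int len; };
+     *   std::vector<DRec> coalesce( const std::vector<DRec>& v ) {
+     *       std::vector<DRec> out;
+     *       for ( size_t i = 0; i < v.size(); i++ ) {
+     *           if ( !out.empty() && out.back().ofs + out.back().len == v[i].ofs )
+     *               out.back().len += v[i].len; // adjacent: merge
+     *           else
+     *               out.push_back( v[i] );
+     *       }
+     *       return out;
+     *   }
+     */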
+
+ DiskLoc &NamespaceDetails::cappedFirstDeletedInCurExtent() {
+ if ( cappedLastDelRecLastExtent().isNull() )
+ return cappedListOfAllDeletedRecords();
+ else
+ return cappedLastDelRecLastExtent().drec()->nextDeleted;
+ }
+
+ void NamespaceDetails::cappedCheckMigrate() {
+ // migrate old NamespaceDetails format
+ assert( capped );
+ if ( capExtent.a() == 0 && capExtent.getOfs() == 0 ) {
+ //capFirstNewRecord = DiskLoc();
+ capFirstNewRecord.writing().setInvalid();
+ // put all the DeletedRecords in cappedListOfAllDeletedRecords()
+ for ( int i = 1; i < Buckets; ++i ) {
+ DiskLoc first = deletedList[ i ];
+ if ( first.isNull() )
+ continue;
+ DiskLoc last = first;
+ for (; !last.drec()->nextDeleted.isNull(); last = last.drec()->nextDeleted );
+ last.drec()->nextDeleted.writing() = cappedListOfAllDeletedRecords();
+ cappedListOfAllDeletedRecords().writing() = first;
+ deletedList[i].writing() = DiskLoc();
+ }
+ // NOTE cappedLastDelRecLastExtent() set to DiskLoc() in above
+
+ // Last, in case we're killed before getting here
+ capExtent.writing() = firstExtent;
+ }
+ }
+
+ bool NamespaceDetails::inCapExtent( const DiskLoc &dl ) const {
+ assert( !dl.isNull() );
+ // We could have a rec or drec, doesn't matter.
+ bool res = dl.drec()->myExtentLoc(dl) == capExtent;
+ DEV {
+            // old implementation. this check is temporary, to verify the new impl behaves the same; the new impl should be a little faster.
+ assert( res == (dl.drec()->myExtent( dl ) == capExtent.ext()) );
+ }
+ return res;
+ }
+
+ bool NamespaceDetails::nextIsInCapExtent( const DiskLoc &dl ) const {
+ assert( !dl.isNull() );
+ DiskLoc next = dl.drec()->nextDeleted;
+ if ( next.isNull() )
+ return false;
+ return inCapExtent( next );
+ }
+
+ void NamespaceDetails::advanceCapExtent( const char *ns ) {
+ // We want cappedLastDelRecLastExtent() to be the last DeletedRecord of the prev cap extent
+ // (or DiskLoc() if new capExtent == firstExtent)
+ if ( capExtent == lastExtent )
+ getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = DiskLoc();
+ else {
+ DiskLoc i = cappedFirstDeletedInCurExtent();
+ for (; !i.isNull() && nextIsInCapExtent( i ); i = i.drec()->nextDeleted );
+ getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = i;
+ }
+
+ getDur().writingDiskLoc( capExtent ) = theCapExtent()->xnext.isNull() ? firstExtent : theCapExtent()->xnext;
+
+        /* this isn't true if a collection has been renamed... that is ok; it is just used for diagnostics */
+ //dassert( theCapExtent()->ns == ns );
+
+ theCapExtent()->assertOk();
+ getDur().writingDiskLoc( capFirstNewRecord ) = DiskLoc();
+ }
+
+ DiskLoc NamespaceDetails::__capAlloc( int len ) {
+ DiskLoc prev = cappedLastDelRecLastExtent();
+ DiskLoc i = cappedFirstDeletedInCurExtent();
+ DiskLoc ret;
+ for (; !i.isNull() && inCapExtent( i ); prev = i, i = i.drec()->nextDeleted ) {
+ // We need to keep at least one DR per extent in cappedListOfAllDeletedRecords(),
+ // so make sure there's space to create a DR at the end.
+ if ( i.drec()->lengthWithHeaders >= len + 24 ) {
+ ret = i;
+ break;
+ }
+ }
+
+ /* unlink ourself from the deleted list */
+ if ( !ret.isNull() ) {
+ if ( prev.isNull() )
+ cappedListOfAllDeletedRecords().writing() = ret.drec()->nextDeleted;
+ else
+ prev.drec()->nextDeleted.writing() = ret.drec()->nextDeleted;
+ ret.drec()->nextDeleted.writing().setInvalid(); // defensive.
+ assert( ret.drec()->extentOfs < ret.getOfs() );
+ }
+
+ return ret;
+ }
+
+ DiskLoc NamespaceDetails::cappedAlloc(const char *ns, int len) {
+ // signal done allocating new extents.
+ if ( !cappedLastDelRecLastExtent().isValid() )
+ getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = DiskLoc();
+
+ assert( len < 400000000 );
+ int passes = 0;
+ int maxPasses = ( len / 30 ) + 2; // 30 is about the smallest entry that could go in the oplog
+ if ( maxPasses < 5000 ) {
+            // this is for backwards safety since 5000 was the old value
+ maxPasses = 5000;
+ }
+ DiskLoc loc;
+
+        // delete records until we have room and the max # objects limit is satisfied.
+
+ /* this fails on a rename -- that is ok but must keep commented out */
+ //assert( theCapExtent()->ns == ns );
+
+ theCapExtent()->assertOk();
+ DiskLoc firstEmptyExtent;
+ while ( 1 ) {
+ if ( stats.nrecords < max ) {
+ loc = __capAlloc( len );
+ if ( !loc.isNull() )
+ break;
+ }
+
+ // If on first iteration through extents, don't delete anything.
+ if ( !capFirstNewRecord.isValid() ) {
+ advanceCapExtent( ns );
+
+ if ( capExtent != firstExtent )
+ capFirstNewRecord.writing().setInvalid();
+ // else signal done with first iteration through extents.
+ continue;
+ }
+
+ if ( !capFirstNewRecord.isNull() &&
+ theCapExtent()->firstRecord == capFirstNewRecord ) {
+ // We've deleted all records that were allocated on the previous
+ // iteration through this extent.
+ advanceCapExtent( ns );
+ continue;
+ }
+
+ if ( theCapExtent()->firstRecord.isNull() ) {
+ if ( firstEmptyExtent.isNull() )
+ firstEmptyExtent = capExtent;
+ advanceCapExtent( ns );
+ if ( firstEmptyExtent == capExtent ) {
+ maybeComplain( ns, len );
+ return DiskLoc();
+ }
+ continue;
+ }
+
+ DiskLoc fr = theCapExtent()->firstRecord;
+ theDataFileMgr.deleteRecord(ns, fr.rec(), fr, true); // ZZZZZZZZZZZZ
+ compact();
+ if( ++passes > maxPasses ) {
+ log() << "passes ns:" << ns << " len:" << len << " maxPasses: " << maxPasses << '\n';
+ log() << "passes max:" << max << " nrecords:" << stats.nrecords << " datasize: " << stats.datasize << endl;
+ massert( 10345 , "passes >= maxPasses in capped collection alloc", false );
+ }
+ }
+
+ // Remember first record allocated on this iteration through capExtent.
+ if ( capFirstNewRecord.isValid() && capFirstNewRecord.isNull() )
+ getDur().writingDiskLoc(capFirstNewRecord) = loc;
+
+ return loc;
+ }
+
+ void NamespaceDetails::dumpExtents() {
+ cout << "dumpExtents:" << endl;
+ for ( DiskLoc i = firstExtent; !i.isNull(); i = i.ext()->xnext ) {
+ Extent *e = i.ext();
+ stringstream ss;
+ e->dump(ss);
+ cout << ss.str() << endl;
+ }
+ }
+
+ void NamespaceDetails::cappedDumpDelInfo() {
+ cout << "dl[0]: " << deletedList[0].toString() << endl;
+ for( DiskLoc z = deletedList[0]; !z.isNull(); z = z.drec()->nextDeleted ) {
+ cout << " drec:" << z.toString() << " dreclen:" << hex << z.drec()->lengthWithHeaders <<
+ " ext:" << z.drec()->myExtent(z)->myLoc.toString() << endl;
+ }
+ cout << "dl[1]: " << deletedList[1].toString() << endl;
+ }
+
+ void NamespaceDetails::cappedTruncateLastDelUpdate() {
+ if ( capExtent == firstExtent ) {
+ // Only one extent of the collection is in use, so there
+ // is no deleted record in a previous extent, so nullify
+ // cappedLastDelRecLastExtent().
+ cappedLastDelRecLastExtent().writing() = DiskLoc();
+ }
+ else {
+ // Scan through all deleted records in the collection
+ // until the last deleted record for the extent prior
+ // to the new capExtent is found. Then set
+ // cappedLastDelRecLastExtent() to that deleted record.
+ DiskLoc i = cappedListOfAllDeletedRecords();
+ for( ;
+ !i.drec()->nextDeleted.isNull() &&
+ !inCapExtent( i.drec()->nextDeleted );
+ i = i.drec()->nextDeleted );
+ // In our capped storage model, every extent must have at least one
+ // deleted record. Here we check that 'i' is not the last deleted
+ // record. (We expect that there will be deleted records in the new
+ // capExtent as well.)
+ assert( !i.drec()->nextDeleted.isNull() );
+ cappedLastDelRecLastExtent().writing() = i;
+ }
+ }
+
+ void NamespaceDetails::cappedTruncateAfter(const char *ns, DiskLoc end, bool inclusive) {
+ DEV assert( this == nsdetails(ns) );
+ assert( cappedLastDelRecLastExtent().isValid() );
+
+ // We iteratively remove the newest document until the newest document
+ // is 'end', then we remove 'end' if requested.
+ bool foundLast = false;
+ while( 1 ) {
+ if ( foundLast ) {
+ // 'end' has been found and removed, so break.
+ break;
+ }
+ getDur().commitIfNeeded();
+ // 'curr' will point to the newest document in the collection.
+ DiskLoc curr = theCapExtent()->lastRecord;
+ assert( !curr.isNull() );
+ if ( curr == end ) {
+ if ( inclusive ) {
+ // 'end' has been found, so break next iteration.
+ foundLast = true;
+ }
+ else {
+ // 'end' has been found, so break.
+ break;
+ }
+ }
+
+ // TODO The algorithm used in this function cannot generate an
+ // empty collection, but we could call emptyCappedCollection() in
+ // this case instead of asserting.
+ uassert( 13415, "emptying the collection is not allowed", stats.nrecords > 1 );
+
+ // Delete the newest record, and coalesce the new deleted
+ // record with existing deleted records.
+ theDataFileMgr.deleteRecord(ns, curr.rec(), curr, true);
+ compact();
+
+ // This is the case where we have not yet had to remove any
+ // documents to make room for other documents, and we are allocating
+ // documents from free space in fresh extents instead of reusing
+ // space from familiar extents.
+ if ( !capLooped() ) {
+
+ // We just removed the last record from the 'capExtent', and
+ // the 'capExtent' can't be empty, so we set 'capExtent' to
+ // capExtent's prev extent.
+ if ( theCapExtent()->lastRecord.isNull() ) {
+ assert( !theCapExtent()->xprev.isNull() );
+ // NOTE Because we didn't delete the last document, and
+ // capLooped() is false, capExtent is not the first extent
+ // so xprev will be nonnull.
+ capExtent.writing() = theCapExtent()->xprev;
+ theCapExtent()->assertOk();
+
+ // update cappedLastDelRecLastExtent()
+ cappedTruncateLastDelUpdate();
+ }
+ continue;
+ }
+
+ // This is the case where capLooped() is true, and we just deleted
+ // from capExtent, and we just deleted capFirstNewRecord, which was
+ // the last record on the fresh side of capExtent.
+ // NOTE In this comparison, curr and potentially capFirstNewRecord
+ // may point to invalid data, but we can still compare the
+ // references themselves.
+ if ( curr == capFirstNewRecord ) {
+
+ // Set 'capExtent' to the first nonempty extent prior to the
+ // initial capExtent. There must be such an extent because we
+ // have not deleted the last document in the collection. It is
+ // possible that all extents other than the capExtent are empty.
+ // In this case we will keep the initial capExtent and specify
+ // that all records contained within are on the fresh rather than
+ // stale side of the extent.
+ DiskLoc newCapExtent = capExtent;
+ do {
+ // Find the previous extent, looping if necessary.
+ newCapExtent = ( newCapExtent == firstExtent ) ? lastExtent : newCapExtent.ext()->xprev;
+ newCapExtent.ext()->assertOk();
+ }
+ while ( newCapExtent.ext()->firstRecord.isNull() );
+ capExtent.writing() = newCapExtent;
+
+ // Place all documents in the new capExtent on the fresh side
+ // of the capExtent by setting capFirstNewRecord to the first
+ // document in the new capExtent.
+ capFirstNewRecord.writing() = theCapExtent()->firstRecord;
+
+ // update cappedLastDelRecLastExtent()
+ cappedTruncateLastDelUpdate();
+ }
+ }
+ }
+
+ void NamespaceDetails::emptyCappedCollection( const char *ns ) {
+ DEV assert( this == nsdetails(ns) );
+ massert( 13424, "collection must be capped", capped );
+ massert( 13425, "background index build in progress", !indexBuildInProgress );
+ massert( 13426, "indexes present", nIndexes == 0 );
+
+ // Clear all references to this namespace.
+ ClientCursor::invalidate( ns );
+ NamespaceDetailsTransient::clearForPrefix( ns );
+
+ // Get a writeable reference to 'this' and reset all pertinent
+ // attributes.
+ NamespaceDetails *t = writingWithoutExtra();
+
+ t->cappedLastDelRecLastExtent() = DiskLoc();
+ t->cappedListOfAllDeletedRecords() = DiskLoc();
+
+ // preserve firstExtent/lastExtent
+ t->capExtent = firstExtent;
+ t->stats.datasize = stats.nrecords = 0;
+ // lastExtentSize preserve
+ // nIndexes preserve 0
+ // capped preserve true
+ // max preserve
+ t->paddingFactor = 1.0;
+ t->flags = 0;
+ t->capFirstNewRecord = DiskLoc();
+ t->capFirstNewRecord.setInvalid();
+ t->cappedLastDelRecLastExtent().setInvalid();
+ // dataFileVersion preserve
+ // indexFileVersion preserve
+ t->multiKeyIndexBits = 0;
+ t->reservedA = 0;
+ t->extraOffset = 0;
+ // indexBuildInProgress preserve 0
+ memset(t->reserved, 0, sizeof(t->reserved));
+
+ // Reset all existing extents and recreate the deleted list.
+ for( DiskLoc ext = firstExtent; !ext.isNull(); ext = ext.ext()->xnext ) {
+ DiskLoc prev = ext.ext()->xprev;
+ DiskLoc next = ext.ext()->xnext;
+ DiskLoc empty = ext.ext()->reuse( ns, true );
+ ext.ext()->xprev.writing() = prev;
+ ext.ext()->xnext.writing() = next;
+ addDeletedRec( empty.drec(), empty );
+ }
+ }
+
+}
diff --git a/src/mongo/db/client.cpp b/src/mongo/db/client.cpp
new file mode 100644
index 00000000000..92b78d87ee5
--- /dev/null
+++ b/src/mongo/db/client.cpp
@@ -0,0 +1,697 @@
+// client.cpp
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* Client represents a connection to the database (the server-side) and corresponds
+ to an open socket (or logical connection if pooling on sockets) from a client.
+*/
+
+#include "pch.h"
+#include "db.h"
+#include "client.h"
+#include "curop-inl.h"
+#include "json.h"
+#include "security.h"
+#include "commands.h"
+#include "instance.h"
+#include "../s/d_logic.h"
+#include "dbwebserver.h"
+#include "../util/mongoutils/html.h"
+#include "../util/mongoutils/checksum.h"
+#include "../util/file_allocator.h"
+#include "repl/rs.h"
+#include "../scripting/engine.h"
+
+namespace mongo {
+
+ Client* Client::syncThread;
+ mongo::mutex Client::clientsMutex("clientsMutex");
+    set<Client*> Client::clients; // always hold clientsMutex when manipulating this
+
+ TSP_DEFINE(Client, currentClient)
+
+#if defined(_DEBUG)
+ struct StackChecker;
+ ThreadLocalValue<StackChecker *> checker;
+
+ struct StackChecker {
+ enum { SZ = 256 * 1024 };
+ char buf[SZ];
+ StackChecker() {
+ checker.set(this);
+ }
+ void init() {
+ memset(buf, 42, sizeof(buf));
+ }
+ static void check(const char *tname) {
+ static int max;
+ StackChecker *sc = checker.get();
+ const char *p = sc->buf;
+ int i = 0;
+ for( ; i < SZ; i++ ) {
+ if( p[i] != 42 )
+ break;
+ }
+ int z = SZ-i;
+ if( z > max ) {
+ max = z;
+ log() << "thread " << tname << " stack usage was " << z << " bytes" << endl;
+ }
+ wassert( i > 16000 );
+ }
+ };
+#endif
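+
+    /* The StackChecker above is a high-water-mark trick: fill a big stack
+     * buffer with a sentinel byte (42) when the thread starts, then scan later
+     * for the first overwritten byte; everything past it was really used. A
+     * minimal standalone model (illustrative only):
+     *
+     *   char buf[SZ];
+     *   memset( buf, 42, sizeof(buf) );       // at thread start
+     *   // ... run the thread's work ...
+     *   int i = 0;
+     *   while ( i < SZ && buf[i] == 42 ) i++; // first dirtied byte
+     *   int used = SZ - i;                    // approximate stack usage
+     */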
+
+ /* each thread which does db operations has a Client object in TLS.
+ call this when your thread starts.
+ */
+#if defined _DEBUG
+ static unsigned long long nThreads = 0;
+ void assertStartingUp() {
+ assert( nThreads <= 1 );
+ }
+#else
+ void assertStartingUp() { }
+#endif
+
+ Client& Client::initThread(const char *desc, AbstractMessagingPort *mp) {
+#if defined(_DEBUG)
+ {
+                nThreads++; // never decremented; only used by startup sanity asserts (assertStartingUp)
+ if( sizeof(void*) == 8 ) {
+ StackChecker sc;
+ sc.init();
+ }
+ }
+#endif
+ assert( currentClient.get() == 0 );
+ Client *c = new Client(desc, mp);
+ currentClient.reset(c);
+ mongo::lastError.initThread();
+ return *c;
+ }
+
+ Client::Client(const char *desc, AbstractMessagingPort *p) :
+ _context(0),
+ _shutdown(false),
+ _desc(desc),
+ _god(0),
+ _lastOp(0),
+ _mp(p),
+ _sometimes(0)
+ {
+ _hasWrittenThisPass = false;
+ _pageFaultRetryableSection = 0;
+ _connectionId = setThreadName(desc);
+ _curOp = new CurOp( this );
+#ifndef _WIN32
+ stringstream temp;
+ temp << hex << showbase << pthread_self();
+ _threadId = temp.str();
+#endif
+ scoped_lock bl(clientsMutex);
+ clients.insert(this);
+ }
+
+ Client::~Client() {
+ _god = 0;
+
+ if ( _context )
+ error() << "Client::~Client _context should be null but is not; client:" << _desc << endl;
+
+ if ( ! _shutdown ) {
+ error() << "Client::shutdown not called: " << _desc << endl;
+ }
+
+ if ( ! inShutdown() ) {
+ // we can't clean up safely once we're in shutdown
+ scoped_lock bl(clientsMutex);
+ if ( ! _shutdown )
+ clients.erase(this);
+ delete _curOp;
+ }
+ }
+
+ bool Client::shutdown() {
+#if defined(_DEBUG)
+ {
+ if( sizeof(void*) == 8 ) {
+ StackChecker::check( desc() );
+ }
+ }
+#endif
+ _shutdown = true;
+ if ( inShutdown() )
+ return false;
+ {
+ scoped_lock bl(clientsMutex);
+ clients.erase(this);
+ if ( isSyncThread() ) {
+ syncThread = 0;
+ }
+ }
+
+ return false;
+ }
+
+ BSONObj CachedBSONObj::_tooBig = fromjson("{\"$msg\":\"query not recording (too large)\"}");
+ Client::Context::Context( string ns , Database * db, bool doauth ) :
+ _client( currentClient.get() ),
+ _oldContext( _client->_context ),
+ _path( mongo::dbpath ), // is this right? could be a different db? may need a dassert for this
+ _justCreated(false),
+ _ns( ns ),
+ _db(db)
+ {
+ assert( db == 0 || db->isOk() );
+ _client->_context = this;
+ checkNsAccess( doauth );
+ _client->checkLocks();
+ }
+
+ Client::Context::Context(const string& ns, string path , bool doauth ) :
+ _client( currentClient.get() ),
+ _oldContext( _client->_context ),
+ _path( path ),
+ _justCreated(false), // set for real in finishInit
+ _ns( ns ),
+ _db(0)
+ {
+ _finishInit( doauth );
+ _client->checkLocks();
+ }
+
+ /** "read lock, and set my context, all in one operation"
+ * This handles (if not recursively locked) opening an unopened database.
+ */
+ Client::ReadContext::ReadContext(const string& ns, string path, bool doauth ) {
+ {
+ lk.reset( new _LockCollectionForReading(ns) );
+ Database *db = dbHolder().get(ns, path);
+ if( db ) {
+ c.reset( new Context(path, ns, db, doauth) );
+ return;
+ }
+ }
+
+ // we usually don't get here, so doesn't matter how fast this part is
+ {
+ int x = d.dbMutex.getState();
+ if( x > 0 ) {
+ // write locked already
+ DEV RARELY log() << "write locked on ReadContext construction " << ns << endl;
+ c.reset( new Context(ns, path, doauth) );
+ }
+ else if( x == -1 ) {
+ lk.reset(0);
+ {
+ writelock w;
+ Context c(ns, path, doauth);
+ }
+ // db could be closed at this interim point -- that is ok, we will throw, and don't mind throwing.
+ lk.reset( new _LockCollectionForReading(ns) );
+ c.reset( new Context(ns, path, doauth) );
+ }
+ else {
+ assert( x < -1 );
+ uasserted(15928, str::stream() << "can't open a database from a nested read lock " << ns);
+ }
+ }
+
+        // todo: are receipts of thousands of queries for a nonexistent database a potential
+        // cause of bad performance due to the write lock acquisition above? let's fix that.
+        // it would be easy to first check that there is at least a .ns file, or something similar.
+ }
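+
+    /* Summary of the d.dbMutex.getState() cases handled above (a sketch of the
+     * intent, not a spec of the lock manager):
+     *   x > 0   : already write locked, so the db can be opened in place
+     *   x == -1 : we hold the only read lock; release it, open the db under a
+     *             write lock, then reacquire the read lock and the context
+     *   x < -1  : nested read lock; opening a db here is not allowed (15928)
+     */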
+
+ void Client::Context::checkNotStale() const {
+ switch ( _client->_curOp->getOp() ) {
+ case dbGetMore: // getMore's are special and should be handled else where
+ case dbUpdate: // update & delete check shard version in instance.cpp, so don't check here as well
+ case dbDelete:
+ break;
+ default: {
+ string errmsg;
+ if ( ! shardVersionOk( _ns , errmsg ) ) {
+ ostringstream os;
+ os << "[" << _ns << "] shard version not ok in Client::Context: " << errmsg;
+ throw SendStaleConfigException( _ns, os.str() );
+ }
+ }
+ }
+ }
+
+ // invoked from ReadContext
+ Client::Context::Context(const string& path, const string& ns, Database *db , bool doauth) :
+ _client( currentClient.get() ),
+ _oldContext( _client->_context ),
+ _path( path ),
+ _justCreated(false),
+ _ns( ns ),
+ _db(db)
+ {
+ assert(_db);
+ checkNotStale();
+ _client->_context = this;
+ _client->_curOp->enter( this );
+ checkNsAccess( doauth, d.dbMutex.getState() );
+ _client->checkLocks();
+ }
+
+ void Client::Context::_finishInit( bool doauth ) {
+ int lockState = d.dbMutex.getState();
+ assert( lockState );
+ if ( lockState > 0 && FileAllocator::get()->hasFailed() ) {
+ uassert(14031, "Can't take a write lock while out of disk space", false);
+ }
+
+ _db = dbHolderUnchecked().getOrCreate( _ns , _path , _justCreated );
+ assert(_db);
+ checkNotStale();
+ _client->_context = this;
+ _client->_curOp->enter( this );
+ checkNsAccess( doauth, lockState );
+ }
+
+ void Client::Context::_auth( int lockState ) {
+ if ( _client->_ai.isAuthorizedForLock( _db->name , lockState ) )
+ return;
+
+ // before we assert, do a little cleanup
+ _client->_context = _oldContext; // note: _oldContext may be null
+
+ stringstream ss;
+ ss << "unauthorized db:" << _db->name << " lock type:" << lockState << " client:" << _client->clientAddress();
+ uasserted( 10057 , ss.str() );
+ }
+
+ Client::Context::~Context() {
+ DEV assert( _client == currentClient.get() );
+ _client->_curOp->leave( this );
+ _client->_context = _oldContext; // note: _oldContext may be null
+ }
+
+ bool Client::Context::inDB( const string& db , const string& path ) const {
+ if ( _path != path )
+ return false;
+
+ if ( db == _ns )
+ return true;
+
+ string::size_type idx = _ns.find( db );
+ if ( idx != 0 )
+ return false;
+
+ return _ns[db.size()] == '.';
+ }
+
+ void Client::Context::checkNsAccess( bool doauth, int lockState ) {
+ if ( 0 ) { // SERVER-4276
+ uassert( 15929, "client access to index backing namespace prohibited", NamespaceString::normal( _ns.c_str() ) );
+ }
+ if ( doauth ) {
+ _auth( lockState );
+ }
+ }
+
+ void Client::appendLastOp( BSONObjBuilder& b ) const {
+ // _lastOp is never set if replication is off
+ if( theReplSet || ! _lastOp.isNull() ) {
+ b.appendTimestamp( "lastOp" , _lastOp.asDate() );
+ }
+ }
+
+ string Client::clientAddress(bool includePort) const {
+ if( _curOp )
+ return _curOp->getRemoteString(includePort);
+ return "";
+ }
+
+ string Client::toString() const {
+ stringstream ss;
+ if ( _curOp )
+ ss << _curOp->infoNoauth().jsonString();
+ return ss.str();
+ }
+
+ string sayClientState() {
+ Client* c = currentClient.get();
+ if ( !c )
+ return "no client";
+ return c->toString();
+ }
+
+ Client* curopWaitingForLock( int type ) {
+ Client * c = currentClient.get();
+ assert( c );
+ CurOp * co = c->curop();
+ if ( co ) {
+ co->waitingForLock( type );
+ }
+ return c;
+ }
+ void curopGotLock(Client *c) {
+ assert(c);
+ CurOp * co = c->curop();
+ if ( co )
+ co->gotLock();
+ }
+
+ void KillCurrentOp::interruptJs( AtomicUInt *op ) {
+ if ( !globalScriptEngine )
+ return;
+ if ( !op ) {
+ globalScriptEngine->interruptAll();
+ }
+ else {
+ globalScriptEngine->interrupt( *op );
+ }
+ }
+
+ void KillCurrentOp::killAll() {
+ _globalKill = true;
+ interruptJs( 0 );
+ }
+
+ void KillCurrentOp::kill(AtomicUInt i) {
+ bool found = false;
+ {
+ scoped_lock l( Client::clientsMutex );
+ for( set< Client* >::const_iterator j = Client::clients.begin(); !found && j != Client::clients.end(); ++j ) {
+ for( CurOp *k = ( *j )->curop(); !found && k; k = k->parent() ) {
+ if ( k->opNum() == i ) {
+ k->kill();
+ for( CurOp *l = ( *j )->curop(); l != k; l = l->parent() ) {
+ l->kill();
+ }
+ found = true;
+ }
+ }
+ }
+ }
+ if ( found ) {
+ interruptJs( &i );
+ }
+ }
+
+ void Client::gotHandshake( const BSONObj& o ) {
+ BSONObjIterator i(o);
+
+ {
+ BSONElement id = i.next();
+ assert( id.type() );
+ _remoteId = id.wrap( "_id" );
+ }
+
+ BSONObjBuilder b;
+ while ( i.more() )
+ b.append( i.next() );
+
+ b.appendElementsUnique( _handshake );
+
+ _handshake = b.obj();
+
+ if (theReplSet && o.hasField("member")) {
+ theReplSet->ghost->associateSlave(_remoteId, o["member"].Int());
+ }
+ }
+
+ ClientBasic* ClientBasic::getCurrent() {
+ return currentClient.get();
+ }
+
+ class HandshakeCmd : public Command {
+ public:
+ void help(stringstream& h) const { h << "internal"; }
+ HandshakeCmd() : Command( "handshake" ) {}
+ virtual LockType locktype() const { return NONE; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return false; }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ Client& c = cc();
+ c.gotHandshake( cmdObj );
+ return 1;
+ }
+
+ } handshakeCmd;
+
+ class ClientListPlugin : public WebStatusPlugin {
+ public:
+ ClientListPlugin() : WebStatusPlugin( "clients" , 20 ) {}
+ virtual void init() {}
+
+ virtual void run( stringstream& ss ) {
+ using namespace mongoutils::html;
+
+ ss << "\n<table border=1 cellpadding=2 cellspacing=0>";
+ ss << "<tr align='left'>"
+ << th( a("", "Connections to the database, both internal and external.", "Client") )
+ << th( a("http://www.mongodb.org/display/DOCS/Viewing+and+Terminating+Current+Operation", "", "OpId") )
+ << "<th>Active</th>"
+ << "<th>LockType</th>"
+ << "<th>Waiting</th>"
+ << "<th>SecsRunning</th>"
+ << "<th>Op</th>"
+ << th( a("http://www.mongodb.org/display/DOCS/Developer+FAQ#DeveloperFAQ-What%27sa%22namespace%22%3F", "", "Namespace") )
+ << "<th>Query</th>"
+ << "<th>client</th>"
+ << "<th>msg</th>"
+ << "<th>progress</th>"
+
+ << "</tr>\n";
+ {
+ scoped_lock bl(Client::clientsMutex);
+ for( set<Client*>::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) {
+ Client *c = *i;
+ CurOp& co = *(c->curop());
+ ss << "<tr><td>" << c->desc() << "</td>";
+
+ tablecell( ss , co.opNum() );
+ tablecell( ss , co.active() );
+ {
+ int lt = co.getLockType();
+ if( lt == -1 ) tablecell(ss, "R");
+ else if( lt == 1 ) tablecell(ss, "W");
+ else
+ tablecell( ss , lt);
+ }
+ tablecell( ss , co.isWaitingForLock() );
+ if ( co.active() )
+ tablecell( ss , co.elapsedSeconds() );
+ else
+ tablecell( ss , "" );
+ tablecell( ss , co.getOp() );
+ tablecell( ss , co.getNS() );
+ if ( co.haveQuery() ) {
+ tablecell( ss , co.query() );
+ }
+ else
+ tablecell( ss , "" );
+ tablecell( ss , co.getRemoteString() );
+
+ tablecell( ss , co.getMessage() );
+ tablecell( ss , co.getProgressMeter().toString() );
+
+
+ ss << "</tr>\n";
+ }
+ }
+ ss << "</table>\n";
+
+ }
+
+ } clientListPlugin;
+
+ int Client::recommendedYieldMicros( int * writers , int * readers ) {
+ int num = 0;
+ int w = 0;
+ int r = 0;
+ {
+ scoped_lock bl(clientsMutex);
+ for ( set<Client*>::iterator i=clients.begin(); i!=clients.end(); ++i ) {
+ Client* c = *i;
+ if ( c->curop()->isWaitingForLock() ) {
+ num++;
+ if ( c->curop()->getLockType() > 0 )
+ w++;
+ else
+ r++;
+ }
+ }
+ }
+
+ if ( writers )
+ *writers = w;
+ if ( readers )
+ *readers = r;
+
+ int time = r * 100;
+ time += w * 500;
+
+ time = min( time , 1000000 );
+
+        // if there has been a kill request for this op, we should yield to allow the op to stop.
+        // checkForInterruptNoAssert() returns an empty string if we aren't interrupted.
+ if ( *killCurrentOp.checkForInterruptNoAssert() ) {
+ return 100;
+ }
+
+ return time;
+ }
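+
+    /* illustrative arithmetic for recommendedYieldMicros(), not part of the build:
+       with 3 readers and 2 writers waiting on the lock,
+         time = 3*100 + 2*500 = 1300 micros,
+       capped at 1000000 (one second); a pending kill request short-circuits to a
+       100 micro yield so the op can notice the interrupt promptly. */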
+
+ int Client::getActiveClientCount( int& writers, int& readers ) {
+ writers = 0;
+ readers = 0;
+
+ scoped_lock bl(clientsMutex);
+ for ( set<Client*>::iterator i=clients.begin(); i!=clients.end(); ++i ) {
+ Client* c = *i;
+ if ( ! c->curop()->active() )
+ continue;
+
+ int l = c->curop()->getLockType();
+ if ( l > 0 )
+ writers++;
+ else if ( l < 0 )
+ readers++;
+
+ }
+
+ return writers + readers;
+ }
+
+ void OpDebug::reset() {
+ extra.reset();
+
+ op = 0;
+ iscommand = false;
+ ns = "";
+ query = BSONObj();
+ updateobj = BSONObj();
+
+ cursorid = -1;
+ ntoreturn = -1;
+ ntoskip = -1;
+ exhaust = false;
+
+ nscanned = -1;
+ idhack = false;
+ scanAndOrder = false;
+ moved = false;
+ fastmod = false;
+ fastmodinsert = false;
+ upsert = false;
+ keyUpdates = 0; // unsigned, so -1 not possible
+
+ exceptionInfo.reset();
+
+ executionTime = 0;
+ nreturned = -1;
+ responseLength = -1;
+ }
+
+
+#define OPDEBUG_TOSTRING_HELP(x) if( x >= 0 ) s << " " #x ":" << (x)
+#define OPDEBUG_TOSTRING_HELP_BOOL(x) if( x ) s << " " #x ":" << (x)
+ string OpDebug::toString() const {
+ StringBuilder s( ns.size() + 64 );
+ if ( iscommand )
+ s << "command ";
+ else
+ s << opToString( op ) << ' ';
+ s << ns.toString();
+
+ if ( ! query.isEmpty() ) {
+ if ( iscommand )
+ s << " command: ";
+ else
+ s << " query: ";
+ s << query.toString();
+ }
+
+ if ( ! updateobj.isEmpty() ) {
+ s << " update: ";
+ updateobj.toString( s );
+ }
+
+ OPDEBUG_TOSTRING_HELP( cursorid );
+ OPDEBUG_TOSTRING_HELP( ntoreturn );
+ OPDEBUG_TOSTRING_HELP( ntoskip );
+ OPDEBUG_TOSTRING_HELP_BOOL( exhaust );
+
+ OPDEBUG_TOSTRING_HELP( nscanned );
+ OPDEBUG_TOSTRING_HELP_BOOL( idhack );
+ OPDEBUG_TOSTRING_HELP_BOOL( scanAndOrder );
+ OPDEBUG_TOSTRING_HELP_BOOL( moved );
+ OPDEBUG_TOSTRING_HELP_BOOL( fastmod );
+ OPDEBUG_TOSTRING_HELP_BOOL( fastmodinsert );
+ OPDEBUG_TOSTRING_HELP_BOOL( upsert );
+ OPDEBUG_TOSTRING_HELP( keyUpdates );
+
+ if ( extra.len() )
+ s << " " << extra.str();
+
+ if ( ! exceptionInfo.empty() ) {
+ s << " exception: " << exceptionInfo.msg;
+ if ( exceptionInfo.code )
+ s << " code:" << exceptionInfo.code;
+ }
+
+ OPDEBUG_TOSTRING_HELP( nreturned );
+ if ( responseLength )
+ s << " reslen:" << responseLength;
+ s << " " << executionTime << "ms";
+
+ return s.str();
+ }
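+
+    /* hypothetical example of a line produced by toString(), for illustration only:
+         query test.foo query: { x: 1.0 } ntoreturn:1 nscanned:10 keyUpdates:0 nreturned:1 reslen:53 4ms
+       numeric fields still at their -1 sentinel and false booleans are omitted
+       by the OPDEBUG_TOSTRING_HELP macros above. */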
+
+#define OPDEBUG_APPEND_NUMBER(x) if( x != -1 ) b.append( #x , (x) )
+#define OPDEBUG_APPEND_BOOL(x) if( x ) b.appendBool( #x , (x) )
+ void OpDebug::append( const CurOp& curop, BSONObjBuilder& b ) const {
+ b.append( "op" , iscommand ? "command" : opToString( op ) );
+ b.append( "ns" , ns.toString() );
+ if ( ! query.isEmpty() )
+ b.append( iscommand ? "command" : "query" , query );
+ else if ( ! iscommand && curop.haveQuery() )
+ curop.appendQuery( b , "query" );
+
+ if ( ! updateobj.isEmpty() )
+ b.append( "updateobj" , updateobj );
+
+ OPDEBUG_APPEND_NUMBER( cursorid );
+ OPDEBUG_APPEND_NUMBER( ntoreturn );
+ OPDEBUG_APPEND_NUMBER( ntoskip );
+ OPDEBUG_APPEND_BOOL( exhaust );
+
+ OPDEBUG_APPEND_NUMBER( nscanned );
+ OPDEBUG_APPEND_BOOL( idhack );
+ OPDEBUG_APPEND_BOOL( scanAndOrder );
+ OPDEBUG_APPEND_BOOL( moved );
+ OPDEBUG_APPEND_BOOL( fastmod );
+ OPDEBUG_APPEND_BOOL( fastmodinsert );
+ OPDEBUG_APPEND_BOOL( upsert );
+ OPDEBUG_APPEND_NUMBER( keyUpdates );
+
+ if ( ! exceptionInfo.empty() )
+ exceptionInfo.append( b , "exception" , "exceptionCode" );
+
+ OPDEBUG_APPEND_NUMBER( nreturned );
+ OPDEBUG_APPEND_NUMBER( responseLength );
+ b.append( "millis" , executionTime );
+
+ }
+
+}
diff --git a/src/mongo/db/client.h b/src/mongo/db/client.h
new file mode 100644
index 00000000000..6aa8bc00f02
--- /dev/null
+++ b/src/mongo/db/client.h
@@ -0,0 +1,286 @@
+/* @file db/client.h
+
+ "Client" represents a connection to the database (the server-side) and corresponds
+ to an open socket (or logical connection if pooling on sockets) from a client.
+
+ todo: switch to asio...this will fit nicely with that.
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "security.h"
+#include "namespace-inl.h"
+#include "lasterror.h"
+#include "stats/top.h"
+#include "../db/client_common.h"
+#include "../util/concurrency/threadlocal.h"
+#include "../util/net/message_port.h"
+#include "../util/concurrency/rwlock.h"
+#include "d_concurrency.h"
+
+namespace mongo {
+
+ extern class ReplSet *theReplSet;
+ class AuthenticationInfo;
+ class Database;
+ class CurOp;
+ class Command;
+ class Client;
+ class AbstractMessagingPort;
+ class LockCollectionForReading;
+ class PageFaultRetryableSection;
+
+#if defined(CLC)
+ typedef LockCollectionForReading _LockCollectionForReading;
+#else
+ typedef readlock _LockCollectionForReading;
+#endif
+
+ TSP_DECLARE(Client, currentClient)
+
+ typedef long long ConnectionId;
+
+ /** the database's concept of an outside "client" */
+ class Client : public ClientBasic {
+ static Client *syncThread;
+ public:
+        // always hold clientsMutex when manipulating this; the killop code uses these.
+ static set<Client*> clients;
+ static mongo::mutex clientsMutex;
+ static int getActiveClientCount( int& writers , int& readers );
+ class Context;
+ ~Client();
+ static int recommendedYieldMicros( int * writers = 0 , int * readers = 0 );
+
+ /** each thread which does db operations has a Client object in TLS.
+ * call this when your thread starts.
+ */
+ static Client& initThread(const char *desc, AbstractMessagingPort *mp = 0);
+
+ static void initThreadIfNotAlready(const char *desc) {
+ if( currentClient.get() )
+ return;
+ initThread(desc);
+ }
+
+ /** this has to be called as the client goes away, but before thread termination
+ * @return true if anything was done
+ */
+ bool shutdown();
+
+ /** set so isSyncThread() works */
+ void iAmSyncThread() {
+ wassert( syncThread == 0 );
+ syncThread = this;
+ }
+ /** @return true if this client is the replication secondary pull thread. not used much, is used in create index sync code. */
+ bool isSyncThread() const { return this == syncThread; }
+
+ string clientAddress(bool includePort=false) const;
+ const AuthenticationInfo * getAuthenticationInfo() const { return &_ai; }
+ AuthenticationInfo * getAuthenticationInfo() { return &_ai; }
+ bool isAdmin() { return _ai.isAuthorized( "admin" ); }
+ CurOp* curop() const { return _curOp; }
+ Context* getContext() const { return _context; }
+ Database* database() const { return _context ? _context->db() : 0; }
+ const char *ns() const { return _context->ns(); }
+ const char *desc() const { return _desc; }
+ void setLastOp( OpTime op ) { _lastOp = op; }
+ OpTime getLastOp() const { return _lastOp; }
+
+ /** caution -- use Context class instead */
+ void setContext(Context *c) { _context = c; }
+
+ /* report what the last operation was. used by getlasterror */
+ void appendLastOp( BSONObjBuilder& b ) const;
+
+ bool isGod() const { return _god; } /* this is for map/reduce writes */
+ string toString() const;
+ void gotHandshake( const BSONObj& o );
+ bool hasRemote() const { return _mp; }
+ HostAndPort getRemote() const { assert( _mp ); return _mp->remote(); }
+ BSONObj getRemoteID() const { return _remoteId; }
+ BSONObj getHandshake() const { return _handshake; }
+ AbstractMessagingPort * port() const { return _mp; }
+ ConnectionId getConnectionId() const { return _connectionId; }
+ private:
+ Client(const char *desc, AbstractMessagingPort *p = 0);
+ friend class CurOp;
+        ConnectionId _connectionId; // > 0 for connection ("conn") threads, 0 otherwise
+        string _threadId; // "" on unsupported systems
+ CurOp * _curOp;
+ Context * _context;
+ bool _shutdown; // to track if Client::shutdown() gets called
+ const char * const _desc;
+ bool _god;
+ AuthenticationInfo _ai;
+ OpTime _lastOp;
+ BSONObj _handshake;
+ BSONObj _remoteId;
+ AbstractMessagingPort * const _mp;
+ unsigned _sometimes;
+ public:
+ bool _hasWrittenThisPass;
+ PageFaultRetryableSection *_pageFaultRetryableSection;
+
+        /** the concept here is the same as MONGO_SOMETIMES. however, that
+            macro uses a static shared by all threads; each increment may evict
+            that cache line from the other cpus' caches, so the idea is that a
+            per-client counter behaves better.
+ */
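+        // usage sketch (illustrative):
+        //     if( cc().sometimes(128) ) { ...do some occasional work... }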
+ bool sometimes(unsigned howOften) { return ++_sometimes % howOften == 0; }
+
+ /* set _god=true temporarily, safely */
+ class GodScope {
+ bool _prev;
+ public:
+ GodScope();
+ ~GodScope();
+ };
+
+ //static void assureDatabaseIsOpen(const string& ns, string path=dbpath);
+
+ /** "read lock, and set my context, all in one operation"
+ * This handles (if not recursively locked) opening an unopened database.
+ */
+ class ReadContext : boost::noncopyable {
+ public:
+ ReadContext(const string& ns, string path=dbpath, bool doauth=true );
+ Context& ctx() { return *c.get(); }
+ private:
+ scoped_ptr<_LockCollectionForReading> lk;
+ scoped_ptr<Context> c;
+ };
+
+    /* Set the database we want to use, then restore the previous one when we finish (go out of scope).
+       Note this is also helpful if an exception happens, as the state is fixed up.
+ */
+ class Context : boost::noncopyable {
+ public:
+ /** this is probably what you want */
+ Context(const string& ns, string path=dbpath, bool doauth=true );
+
+ /** note: this does not call finishInit -- i.e., does not call
+ shardVersionOk() for example.
+ see also: reset().
+ */
+ Context( string ns , Database * db, bool doauth=true );
+
+ // used by ReadContext
+ Context(const string& path, const string& ns, Database *db, bool doauth);
+
+ ~Context();
+ Client* getClient() const { return _client; }
+ Database* db() const { return _db; }
+ const char * ns() const { return _ns.c_str(); }
+ bool equals( const string& ns , const string& path=dbpath ) const { return _ns == ns && _path == path; }
+
+ /** @return if the db was created by this Context */
+ bool justCreated() const { return _justCreated; }
+
+ /** @return true iff the current Context is using db/path */
+ bool inDB( const string& db , const string& path=dbpath ) const;
+
+        void _clear() { // this is sort of an "early destruct" indication; once cleared, _ns stays cleared
+            const_cast<string&>(_ns).clear(); // clear() actually empties the string; empty() is only a const query
+ _db = 0;
+ }
+
+        /** call before unlocking, to clear any non-thread-safe state;
+         * _db gets restored on the relock
+ */
+ void unlocked() { _db = 0; }
+
+        /** call after going back into the lock; re-establishes non-thread-safe state */
+ void relocked() { _finishInit(); }
+
+ private:
+ friend class CurOp;
+ void _finishInit( bool doauth=true);
+ void _auth( int lockState );
+ void checkNotStale() const;
+ void checkNsAccess( bool doauth, int lockState = d.dbMutex.getState() );
+ Client * const _client;
+ Context * const _oldContext;
+ const string _path;
+ bool _justCreated;
+ const string _ns;
+ Database * _db;
+ }; // class Client::Context
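+
+    /* usage sketch (illustrative, not part of the build): a Context is typically
+       constructed right after the lock is taken, and restores the previous
+       context when it goes out of scope:
+
+         writelock lk("test.foo");          // take the db lock first
+         Client::Context ctx("test.foo");   // sets this thread's context
+         // ... operate on ctx.db() ...
+                                            // ~Context() restores the old context
+    */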
+
+ struct LockStatus {
+ LockStatus();
+ string whichCollection;
+ unsigned excluder, global, collection;
+ string toString() const;
+ } lockStatus;
+
+#if defined(CLC)
+ void checkLocks() const;
+#else
+ void checkLocks() const { }
+#endif
+
+ }; // class Client
+
+ /** get the Client object for this thread. */
+ inline Client& cc() {
+ Client * c = currentClient.get();
+ assert( c );
+ return *c;
+ }
+
+ inline Client::GodScope::GodScope() {
+ _prev = cc()._god;
+ cc()._god = true;
+ }
+ inline Client::GodScope::~GodScope() { cc()._god = _prev; }
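+
+    /* usage sketch (illustrative): temporarily mark the current client as "god"
+       (e.g. around internal writes such as map/reduce output), restoring the
+       previous value even if an exception is thrown:
+
+         {
+             Client::GodScope gs;    // sets cc()._god = true
+             // ... privileged writes ...
+         }                           // destructor restores the prior value
+    */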
+
+ /* this unreadlocks and then writelocks; i.e. it does NOT upgrade inside the
+       lock (and is thus wrong to use if you need that, which you usually do).
+ that said we use it today for a specific case where the usage is correct.
+ */
+#if 0
+ inline void mongolock::releaseAndWriteLock() {
+ if( !_writelock ) {
+
+#if BOOST_VERSION >= 103500
+ int s = d.dbMutex.getState();
+ if( s != -1 ) {
+ log() << "error: releaseAndWriteLock() s == " << s << endl;
+ msgasserted( 12600, "releaseAndWriteLock: unlock_shared failed, probably recursive" );
+ }
+#endif
+
+ _writelock = true;
+ d.dbMutex.unlock_shared();
+ d.dbMutex.lock();
+
+ // todo: unlocked() method says to call it before unlocking, not after. so fix this here,
+ // or fix the doc there.
+ if ( cc().getContext() )
+ cc().getContext()->unlocked();
+ }
+ }
+#endif
+
+    inline bool haveClient() { return currentClient.get() != 0; }
+
+};
diff --git a/src/mongo/db/client_common.h b/src/mongo/db/client_common.h
new file mode 100644
index 00000000000..eb70105ef99
--- /dev/null
+++ b/src/mongo/db/client_common.h
@@ -0,0 +1,47 @@
+// client_common.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+//#include "../pch.h"
+//#include "security.h"
+#include "../util/net/hostandport.h"
+
+namespace mongo {
+
+ class AuthenticationInfo;
+
+ /**
+ * this is the base class for Client and ClientInfo
+ * Client is for mongod
+     * ClientInfo is for mongos
+ * They should converge slowly
+ * The idea is this has the basic api so that not all code has to be duplicated
+ */
+ class ClientBasic : boost::noncopyable {
+ public:
+ virtual ~ClientBasic(){}
+ virtual const AuthenticationInfo * getAuthenticationInfo() const = 0;
+ virtual AuthenticationInfo * getAuthenticationInfo() = 0;
+
+ virtual bool hasRemote() const = 0;
+ virtual HostAndPort getRemote() const = 0;
+
+ static ClientBasic* getCurrent();
+ };
+}
diff --git a/src/mongo/db/clientcursor.cpp b/src/mongo/db/clientcursor.cpp
new file mode 100644
index 00000000000..dc04ec38f63
--- /dev/null
+++ b/src/mongo/db/clientcursor.cpp
@@ -0,0 +1,747 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* clientcursor.cpp
+
+ ClientCursor is a wrapper that represents a cursorid from our database
+ application's perspective.
+
+ Cursor -- and its derived classes -- are our internal cursors.
+*/
+
+#include "pch.h"
+#include "clientcursor.h"
+#include "introspect.h"
+#include <time.h>
+#include "db.h"
+#include "commands.h"
+#include "repl_block.h"
+#include "../util/processinfo.h"
+#include "../util/timer.h"
+#include "../server.h"
+
+namespace mongo {
+
+ CCById ClientCursor::clientCursorsById;
+ boost::recursive_mutex& ClientCursor::ccmutex( *(new boost::recursive_mutex()) );
+ long long ClientCursor::numberTimedOut = 0;
+
+ void aboutToDeleteForSharding( const Database* db , const DiskLoc& dl ); // from s/d_logic.h
+
+ /*static*/ void ClientCursor::assertNoCursors() {
+ recursive_scoped_lock lock(ccmutex);
+ if( clientCursorsById.size() ) {
+ log() << "ERROR clientcursors exist but should not at this point" << endl;
+ ClientCursor *cc = clientCursorsById.begin()->second;
+ log() << "first one: " << cc->_cursorid << ' ' << cc->_ns << endl;
+ clientCursorsById.clear();
+ assert(false);
+ }
+ }
+
+
+ void ClientCursor::setLastLoc_inlock(DiskLoc L) {
+ assert( _pos != -2 ); // defensive - see ~ClientCursor
+
+ if ( L == _lastLoc )
+ return;
+
+ CCByLoc& bl = byLoc();
+
+ if ( !_lastLoc.isNull() ) {
+ bl.erase( ByLocKey( _lastLoc, _cursorid ) );
+ }
+
+ if ( !L.isNull() )
+ bl[ByLocKey(L,_cursorid)] = this;
+ _lastLoc = L;
+ }
+
+ /* ------------------------------------------- */
+
+ /* must call this when a btree node is updated */
+ //void removedKey(const DiskLoc& btreeLoc, int keyPos) {
+ //}
+
+ // ns is either a full namespace or "dbname." when invalidating for a whole db
+ void ClientCursor::invalidate(const char *ns) {
+ d.dbMutex.assertWriteLocked();
+ int len = strlen(ns);
+ const char* dot = strchr(ns, '.');
+ assert( len > 0 && dot);
+
+ bool isDB = (dot == &ns[len-1]); // first (and only) dot is the last char
+
+ {
+ //cout << "\nTEMP invalidate " << ns << endl;
+ recursive_scoped_lock lock(ccmutex);
+
+ Database *db = cc().database();
+ assert(db);
+ assert( str::startsWith(ns, db->name) );
+
+ for( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); /*++i*/ ) {
+ ClientCursor *cc = i->second;
+
+ ++i; // we may be removing this node
+
+ if( cc->_db != db )
+ continue;
+
+ if (isDB) {
+ // already checked that db matched above
+ dassert( str::startsWith(cc->_ns.c_str(), ns) );
+ delete cc; //removes self from ccByID
+ }
+ else {
+ if ( str::equals(cc->_ns.c_str(), ns) )
+ delete cc; //removes self from ccByID
+ }
+ }
+
+ /*
+ note : we can't iterate byloc because clientcursors may exist with a loc of null in which case
+ they are not in the map. perhaps they should not exist though in the future? something to
+ change???
+
+ CCByLoc& bl = db->ccByLoc;
+ for ( CCByLoc::iterator i = bl.begin(); i != bl.end(); ++i ) {
+ ClientCursor *cc = i->second;
+ if ( strncmp(ns, cc->ns.c_str(), len) == 0 ) {
+ assert( cc->_db == db );
+ toDelete.push_back(i->second);
+ }
+ }*/
+
+ /*cout << "TEMP after invalidate " << endl;
+ for( auto i = clientCursorsById.begin(); i != clientCursorsById.end(); ++i ) {
+ cout << " " << i->second->ns << endl;
+ }
+ cout << "TEMP after invalidate done" << endl;*/
+ }
+ }
+
+ /* note called outside of locks (other than ccmutex) so care must be exercised */
+ bool ClientCursor::shouldTimeout( unsigned millis ) {
+ _idleAgeMillis += millis;
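+        // 600000 millis = 10 minutes of idle time; a nonzero _pinValue (the
+        // no-timeout flag, or an active Pointer pin) exempts the cursor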
+ return _idleAgeMillis > 600000 && _pinValue == 0;
+ }
+
+ /* called every 4 seconds. millis is amount of idle time passed since the last call -- could be zero */
+ void ClientCursor::idleTimeReport(unsigned millis) {
+ bool foundSomeToTimeout = false;
+
+ // two passes so that we don't need to readlock unless we really do some timeouts
+ // we assume here that incrementing _idleAgeMillis outside readlock is ok.
+ {
+ recursive_scoped_lock lock(ccmutex);
+ {
+ unsigned sz = clientCursorsById.size();
+ static time_t last;
+ if( sz >= 100000 ) {
+ if( time(0) - last > 300 ) {
+ last = time(0);
+ log() << "warning number of open cursors is very large: " << sz << endl;
+ }
+ }
+ }
+ for ( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); ) {
+ CCById::iterator j = i;
+ i++;
+ if( j->second->shouldTimeout( millis ) ) {
+ foundSomeToTimeout = true;
+ break;
+ }
+ }
+ }
+
+ if( foundSomeToTimeout ) {
+ // todo: ideally all readlocks automatically note what we are locking for so this
+ // can be reported in currentop command. e.g. something like:
+ // readlock lk("", "timeout cursors");
+ readlock lk("");
+ recursive_scoped_lock lock(ccmutex);
+ for ( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); ) {
+ CCById::iterator j = i;
+ i++;
+ if( j->second->shouldTimeout(0) ) {
+ numberTimedOut++;
+ LOG(1) << "killing old cursor " << j->second->_cursorid << ' ' << j->second->_ns
+ << " idle:" << j->second->idleTime() << "ms\n";
+ delete j->second;
+ }
+ }
+ }
+ }
+
+ /* must call when a btree bucket going away.
+ note this is potentially slow
+ */
+ void ClientCursor::informAboutToDeleteBucket(const DiskLoc& b) {
+ recursive_scoped_lock lock(ccmutex);
+ Database *db = cc().database();
+ CCByLoc& bl = db->ccByLoc;
+ RARELY if ( bl.size() > 70 ) {
+ log() << "perf warning: byLoc.size=" << bl.size() << " in aboutToDeleteBucket\n";
+ }
+ if( bl.size() == 0 ) {
+ DEV tlog() << "debug warning: no cursors found in informAboutToDeleteBucket()" << endl;
+ }
+ for ( CCByLoc::iterator i = bl.begin(); i != bl.end(); i++ )
+ i->second->_c->aboutToDeleteBucket(b);
+ }
+ void aboutToDeleteBucket(const DiskLoc& b) {
+ ClientCursor::informAboutToDeleteBucket(b);
+ }
+
+ /* must call this on a delete so we clean up the cursors. */
+ void ClientCursor::aboutToDelete(const DiskLoc& dl) {
+ recursive_scoped_lock lock(ccmutex);
+
+ Database *db = cc().database();
+ assert(db);
+
+ aboutToDeleteForSharding( db , dl );
+
+ CCByLoc& bl = db->ccByLoc;
+ CCByLoc::iterator j = bl.lower_bound(ByLocKey::min(dl));
+ CCByLoc::iterator stop = bl.upper_bound(ByLocKey::max(dl));
+ if ( j == stop )
+ return;
+
+ vector<ClientCursor*> toAdvance;
+
+ while ( 1 ) {
+ toAdvance.push_back(j->second);
+ DEV assert( j->first.loc == dl );
+ ++j;
+ if ( j == stop )
+ break;
+ }
+
+ if( toAdvance.size() >= 3000 ) {
+ log() << "perf warning MPW101: " << toAdvance.size() << " cursors for one diskloc "
+ << dl.toString()
+ << ' ' << toAdvance[1000]->_ns
+ << ' ' << toAdvance[2000]->_ns
+ << ' ' << toAdvance[1000]->_pinValue
+ << ' ' << toAdvance[2000]->_pinValue
+ << ' ' << toAdvance[1000]->_pos
+ << ' ' << toAdvance[2000]->_pos
+ << ' ' << toAdvance[1000]->_idleAgeMillis
+ << ' ' << toAdvance[2000]->_idleAgeMillis
+ << ' ' << toAdvance[1000]->_doingDeletes
+ << ' ' << toAdvance[2000]->_doingDeletes
+ << endl;
+ //wassert( toAdvance.size() < 5000 );
+ }
+
+ for ( vector<ClientCursor*>::iterator i = toAdvance.begin(); i != toAdvance.end(); ++i ) {
+ ClientCursor* cc = *i;
+ wassert(cc->_db == db);
+
+ if ( cc->_doingDeletes ) continue;
+
+ Cursor *c = cc->_c.get();
+ if ( c->capped() ) {
+ /* note we cannot advance here. if this condition occurs, writes to the oplog
+                   have "caught" the reader. skipping ahead, the reader would miss potentially
+ important data.
+ */
+ delete cc;
+ continue;
+ }
+
+ c->checkLocation();
+ DiskLoc tmp1 = c->refLoc();
+ if ( tmp1 != dl ) {
+ // This might indicate a failure to call ClientCursor::updateLocation() but it can
+ // also happen during correct operation, see SERVER-2009.
+ problem() << "warning: cursor loc " << tmp1 << " does not match byLoc position " << dl << " !" << endl;
+ }
+ else {
+ c->advance();
+ }
+ while (!c->eof() && c->refLoc() == dl) {
+ /* We don't delete at EOF because we want to return "no more results" rather than "no such cursor".
+ * The loop is to handle MultiKey indexes where the deleted record is pointed to by multiple adjacent keys.
+ * In that case we need to advance until we get to the next distinct record or EOF.
+ * SERVER-4154
+ */
+ c->advance();
+ }
+ cc->updateLocation();
+ }
+ }
+ void aboutToDelete(const DiskLoc& dl) { ClientCursor::aboutToDelete(dl); }
+
+ ClientCursor::ClientCursor(int queryOptions, const shared_ptr<Cursor>& c, const string& ns, BSONObj query ) :
+ _ns(ns), _db( cc().database() ),
+ _c(c), _pos(0),
+ _query(query), _queryOptions(queryOptions),
+ _idleAgeMillis(0), _pinValue(0),
+ _doingDeletes(false), _yieldSometimesTracker(128,10) {
+
+ d.dbMutex.assertAtLeastReadLocked();
+
+ assert( _db );
+ assert( str::startsWith(_ns, _db->name) );
+ if( queryOptions & QueryOption_NoCursorTimeout )
+ noTimeout();
+ recursive_scoped_lock lock(ccmutex);
+ _cursorid = allocCursorId_inlock();
+ clientCursorsById.insert( make_pair(_cursorid, this) );
+
+ if ( ! _c->modifiedKeys() ) {
+ // store index information so we can decide if we can
+ // get something out of the index key rather than full object
+
+ int x = 0;
+ BSONObjIterator i( _c->indexKeyPattern() );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.isNumber() ) {
+ // only want basic index fields, not "2d" etc
+ _indexedFields[e.fieldName()] = x;
+ }
+ x++;
+ }
+ }
+
+ }
+
+
+ ClientCursor::~ClientCursor() {
+ if( _pos == -2 ) {
+ // defensive: destructor called twice
+ wassert(false);
+ return;
+ }
+
+ {
+ recursive_scoped_lock lock(ccmutex);
+ setLastLoc_inlock( DiskLoc() ); // removes us from bylocation multimap
+ clientCursorsById.erase(_cursorid);
+
+ // defensive:
+ (CursorId&)_cursorid = -1;
+ _pos = -2;
+ }
+ }
+
+ bool ClientCursor::getFieldsDotted( const string& name, BSONElementSet &ret, BSONObj& holder ) {
+
+ map<string,int>::const_iterator i = _indexedFields.find( name );
+ if ( i == _indexedFields.end() ) {
+ current().getFieldsDotted( name , ret );
+ return false;
+ }
+
+ int x = i->second;
+
+ holder = currKey();
+ BSONObjIterator it( holder );
+ while ( x && it.more() ) {
+ it.next();
+ x--;
+ }
+ assert( x == 0 );
+ ret.insert( it.next() );
+ return true;
+ }
+
+ BSONElement ClientCursor::getFieldDotted( const string& name , BSONObj& holder , bool * fromKey ) {
+
+ map<string,int>::const_iterator i = _indexedFields.find( name );
+ if ( i == _indexedFields.end() ) {
+ if ( fromKey )
+ *fromKey = false;
+ holder = current();
+ return holder.getFieldDotted( name );
+ }
+
+ int x = i->second;
+
+ holder = currKey();
+ BSONObjIterator it( holder );
+ while ( x && it.more() ) {
+ it.next();
+ x--;
+ }
+ assert( x == 0 );
+
+ if ( fromKey )
+ *fromKey = true;
+ return it.next();
+ }
+
+ BSONObj ClientCursor::extractFields(const BSONObj &pattern , bool fillWithNull ) {
+ BSONObjBuilder b( pattern.objsize() * 2 );
+
+ BSONObj holder;
+
+ BSONObjIterator i( pattern );
+ while ( i.more() ) {
+ BSONElement key = i.next();
+ BSONElement value = getFieldDotted( key.fieldName() , holder );
+
+ if ( value.type() ) {
+ b.appendAs( value , key.fieldName() );
+ continue;
+ }
+
+ if ( fillWithNull )
+ b.appendNull( key.fieldName() );
+
+ }
+
+ return b.obj();
+ }
+
+
+ /* call when cursor's location changes so that we can update the
+ cursorsbylocation map. if you are locked and internally iterating, only
+ need to call when you are ready to "unlock".
+ */
+ void ClientCursor::updateLocation() {
+ assert( _cursorid );
+ _idleAgeMillis = 0;
+ DiskLoc cl = _c->refLoc();
+ if ( lastLoc() == cl ) {
+ //log() << "info: lastloc==curloc " << ns << '\n';
+ }
+ else {
+ recursive_scoped_lock lock(ccmutex);
+ setLastLoc_inlock(cl);
+ }
+ // may be necessary for MultiCursor even when cl hasn't changed
+ _c->noteLocation();
+ }
+
+ int ClientCursor::suggestYieldMicros() {
+ int writers = 0;
+ int readers = 0;
+
+ int micros = Client::recommendedYieldMicros( &writers , &readers );
+
+ if ( micros > 0 && writers == 0 && d.dbMutex.getState() <= 0 ) {
+ // we have a read lock, and only reads are coming on, so why bother unlocking
+ return 0;
+ }
+
+ wassert( micros < 10000000 );
+ dassert( micros < 1000001 );
+ return micros;
+ }
+
+ Record* ClientCursor::_recordForYield( ClientCursor::RecordNeeds need ) {
+ if ( need == DontNeed ) {
+ return 0;
+ }
+ else if ( need == MaybeCovered ) {
+ // TODO
+ return 0;
+ }
+ else if ( need == WillNeed ) {
+ // no-op
+ }
+ else {
+ warning() << "don't understand RecordNeeds: " << (int)need << endl;
+ return 0;
+ }
+
+ DiskLoc l = currLoc();
+ if ( l.isNull() )
+ return 0;
+
+ Record * rec = l.rec();
+ if ( rec->likelyInPhysicalMemory() )
+ return 0;
+
+ return rec;
+ }
+
+ bool ClientCursor::yieldSometimes( RecordNeeds need, bool *yielded ) {
+ if ( yielded ) {
+ *yielded = false;
+ }
+ if ( ! _yieldSometimesTracker.intervalHasElapsed() ) {
+ Record* rec = _recordForYield( need );
+ if ( rec ) {
+ // yield for page fault
+ if ( yielded ) {
+ *yielded = true;
+ }
+ return yield( suggestYieldMicros() , rec );
+ }
+ return true;
+ }
+
+ int micros = suggestYieldMicros();
+ if ( micros > 0 ) {
+ if ( yielded ) {
+ *yielded = true;
+ }
+ return yield( micros , _recordForYield( need ) );
+ }
+ return true;
+ }
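+
+    /* usage sketch (illustrative, not part of the build): a typical scan loop
+       yields periodically and treats a false return as "cursor may be gone":
+
+         while ( cursor->ok() ) {
+             if ( ! cursor->yieldSometimes( ClientCursor::WillNeed ) )
+                 break;   // the ClientCursor may have been deleted in the yield
+             // ... examine cursor->current() ...
+             cursor->advance();
+         }
+    */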
+
+ void ClientCursor::staticYield( int micros , const StringData& ns , Record * rec ) {
+ killCurrentOp.checkForInterrupt( false );
+ {
+ auto_ptr<LockMongoFilesShared> lk;
+ if ( rec ) {
+                // need to lock this, else rec->touch() won't be safe; the file could disappear
+ lk.reset( new LockMongoFilesShared() );
+ }
+
+ dbtempreleasecond unlock;
+ if ( unlock.unlocked() ) {
+ if ( micros == -1 )
+ micros = Client::recommendedYieldMicros();
+ if ( micros > 0 )
+ sleepmicros( micros );
+ }
+ else {
+ CurOp * c = cc().curop();
+ while ( c->parent() )
+ c = c->parent();
+ LOGSOME << "warning ClientCursor::yield can't unlock b/c of recursive lock"
+ << " ns: " << ns
+ << " top: " << c->info()
+ << endl;
+ }
+
+ if ( rec )
+ rec->touch();
+
+ lk.reset(0); // need to release this before dbtempreleasecond
+ }
+ }
+
+ bool ClientCursor::prepareToYield( YieldData &data ) {
+ if ( ! _c->supportYields() )
+ return false;
+ if ( ! _c->prepareToYield() ) {
+ return false;
+ }
+ // need to store in case 'this' gets deleted
+ data._id = _cursorid;
+
+ data._doingDeletes = _doingDeletes;
+ _doingDeletes = false;
+
+ updateLocation();
+
+ {
+ /* a quick test that our temprelease is safe.
+ todo: make a YieldingCursor class
+ and then make the following code part of a unit test.
+ */
+ const int test = 0;
+ static bool inEmpty = false;
+ if( test && !inEmpty ) {
+ inEmpty = true;
+ log() << "TEST: manipulate collection during cc:yield" << endl;
+ if( test == 1 )
+ Helpers::emptyCollection(_ns.c_str());
+ else if( test == 2 ) {
+ BSONObjBuilder b; string m;
+ dropCollection(_ns.c_str(), m, b);
+ }
+ else {
+ dropDatabase(_ns.c_str());
+ }
+ }
+ }
+ return true;
+ }
+
+ bool ClientCursor::recoverFromYield( const YieldData &data ) {
+ ClientCursor *cc = ClientCursor::find( data._id , false );
+ if ( cc == 0 ) {
+ // id was deleted
+ return false;
+ }
+
+ cc->_doingDeletes = data._doingDeletes;
+ cc->_c->recoverFromYield();
+ return true;
+ }
+
+ /** @return true if cursor is still ok */
+ bool ClientCursor::yield( int micros , Record * recordToLoad ) {
+
+        if ( ! _c->supportYields() ) // some cursors (geo@oct2011) don't support yielding
+ return true;
+
+ YieldData data;
+ prepareToYield( data );
+ staticYield( micros , _ns , recordToLoad );
+ return ClientCursor::recoverFromYield( data );
+ }
+
+    long long ctmLast = 0; // so we don't have to call find(), which is a little slow, very often.
+ long long ClientCursor::allocCursorId_inlock() {
+ long long ctm = curTimeMillis64();
+ dassert( ctm );
+ long long x;
+ while ( 1 ) {
+ x = (((long long)rand()) << 32);
+ x = x ^ ctm;
+ if ( ctm != ctmLast || ClientCursor::find_inlock(x, false) == 0 )
+ break;
+ }
+ ctmLast = ctm;
+ return x;
+ }
+
+ void ClientCursor::storeOpForSlave( DiskLoc last ) {
+ if ( ! ( _queryOptions & QueryOption_OplogReplay ))
+ return;
+
+ if ( last.isNull() )
+ return;
+
+ BSONElement e = last.obj()["ts"];
+ if ( e.type() == Date || e.type() == Timestamp )
+ _slaveReadTill = e._opTime();
+ }
+
+ void ClientCursor::updateSlaveLocation( CurOp& curop ) {
+ if ( _slaveReadTill.isNull() )
+ return;
+ mongo::updateSlaveLocation( curop , _ns.c_str() , _slaveReadTill );
+ }
+
+
+ void ClientCursor::appendStats( BSONObjBuilder& result ) {
+ recursive_scoped_lock lock(ccmutex);
+ result.appendNumber("totalOpen", clientCursorsById.size() );
+ result.appendNumber("clientCursors_size", (int) numCursors());
+ result.appendNumber("timedOut" , numberTimedOut);
+ unsigned pinned = 0;
+ unsigned notimeout = 0;
+ for ( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); i++ ) {
+ unsigned p = i->second->_pinValue;
+ if( p >= 100 )
+ pinned++;
+ else if( p > 0 )
+ notimeout++;
+ }
+ if( pinned )
+ result.append("pinned", pinned);
+ if( notimeout )
+ result.append("totalNoTimeout", notimeout);
+ }
+
+ // QUESTION: Restrict to the namespace from which this command was issued?
+ // Alternatively, make this command admin-only?
+ class CmdCursorInfo : public Command {
+ public:
+ CmdCursorInfo() : Command( "cursorInfo", true ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual void help( stringstream& help ) const {
+ help << " example: { cursorInfo : 1 }";
+ }
+ virtual LockType locktype() const { return NONE; }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ ClientCursor::appendStats( result );
+ return true;
+ }
+ } cmdCursorInfo;
+
+ struct Mem {
+ Mem() { res = virt = mapped = 0; }
+ int res;
+ int virt;
+ int mapped;
+ bool grew(const Mem& r) {
+ return (r.res && (((double)res)/r.res)>1.1 ) ||
+ (r.virt && (((double)virt)/r.virt)>1.1 ) ||
+ (r.mapped && (((double)mapped)/r.mapped)>1.1 );
+ }
+ };
+
+ /** called once a minute from killcursors thread */
+ void sayMemoryStatus() {
+ static time_t last;
+ static Mem mlast;
+ try {
+ ProcessInfo p;
+ if ( !cmdLine.quiet && p.supported() ) {
+ Mem m;
+ m.res = p.getResidentSize();
+ m.virt = p.getVirtualMemorySize();
+ m.mapped = (int) (MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ));
+ if( time(0)-last >= 300 || m.grew(mlast) ) {
+ log() << "mem (MB) res:" << m.res << " virt:" << m.virt << " mapped:" << m.mapped << endl;
+ if( m.virt - (cmdLine.dur?2:1)*m.mapped > 5000 ) {
+ ONCE log() << "warning virtual/mapped memory differential is large. journaling:" << cmdLine.dur << endl;
+ }
+ last = time(0);
+ mlast = m;
+ }
+ }
+ }
+ catch(...) {
+ log() << "ProcessInfo exception" << endl;
+ }
+ }
+
+ /** thread for timing out old cursors */
+ void ClientCursorMonitor::run() {
+ Client::initThread("clientcursormon");
+ Client& client = cc();
+ Timer t;
+ const int Secs = 4;
+ unsigned n = 0;
+ while ( ! inShutdown() ) {
+ ClientCursor::idleTimeReport( t.millisReset() );
+ sleepsecs(Secs);
+ if( ++n % (60/4) == 0 /*once a minute*/ ) {
+ sayMemoryStatus();
+ }
+ }
+ client.shutdown();
+ }
+
+ void ClientCursor::find( const string& ns , set<CursorId>& all ) {
+ recursive_scoped_lock lock(ccmutex);
+
+ for ( CCById::iterator i=clientCursorsById.begin(); i!=clientCursorsById.end(); ++i ) {
+ if ( i->second->_ns == ns )
+ all.insert( i->first );
+ }
+ }
+
+ int ClientCursor::erase(int n, long long *ids) {
+ int found = 0;
+ for ( int i = 0; i < n; i++ ) {
+ if ( erase(ids[i]) )
+ found++;
+
+ if ( inShutdown() )
+ break;
+ }
+ return found;
+
+ }
+
+ ClientCursorMonitor clientCursorMonitor;
+
+} // namespace mongo
diff --git a/src/mongo/db/clientcursor.h b/src/mongo/db/clientcursor.h
new file mode 100644
index 00000000000..e570820f62c
--- /dev/null
+++ b/src/mongo/db/clientcursor.h
@@ -0,0 +1,430 @@
+/* clientcursor.h */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* Cursor -- and its derived classes -- are our internal cursors.
+
+ ClientCursor is a wrapper that represents a cursorid from our database
+ application's perspective.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "cursor.h"
+#include "jsobj.h"
+#include "../util/net/message.h"
+#include "../util/net/listen.h"
+#include "../util/background.h"
+#include "diskloc.h"
+#include "dbhelpers.h"
+#include "matcher.h"
+#include "../client/dbclient.h"
+#include "projection.h"
+#include "s/d_chunk_manager.h"
+
+namespace mongo {
+
+ typedef long long CursorId; /* passed to the client so it can send back on getMore */
+ class Cursor; /* internal server cursor base class */
+ class ClientCursor;
+ class ParsedQuery;
+
+ struct ByLocKey {
+
+ ByLocKey( const DiskLoc & l , const CursorId& i ) : loc(l), id(i) {}
+
+ static ByLocKey min( const DiskLoc& l ) { return ByLocKey( l , numeric_limits<long long>::min() ); }
+ static ByLocKey max( const DiskLoc& l ) { return ByLocKey( l , numeric_limits<long long>::max() ); }
+
+ bool operator<( const ByLocKey &other ) const {
+ int x = loc.compare( other.loc );
+ if ( x )
+ return x < 0;
+ return id < other.id;
+ }
+
+ DiskLoc loc;
+ CursorId id;
+
+ };
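+
+    /* illustrative sketch: ByLocKey::min/max bracket every possible cursor id
+       for one DiskLoc, so a CCByLoc map can be range-scanned for all cursors
+       positioned at that location (see ClientCursor::aboutToDelete):
+
+         CCByLoc::iterator begin = byLoc.lower_bound( ByLocKey::min( dl ) );
+         CCByLoc::iterator end   = byLoc.upper_bound( ByLocKey::max( dl ) );
+    */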
+
+ /* todo: make this map be per connection. this will prevent cursor hijacking security attacks perhaps.
+ * ERH: 9/2010 this may not work since some drivers send getMore over a different connection
+ */
+ typedef map<CursorId, ClientCursor*> CCById;
+ typedef map<ByLocKey, ClientCursor*> CCByLoc;
+
+ extern BSONObj id_obj;
+
+ class ClientCursor {
+ friend class CmdCursorInfo;
+ public:
+ static void assertNoCursors();
+
+        /* use this to assure we don't time out a cursor in the background while it is in use.
+ if you are using noTimeout() already, there is no risk anyway.
+ Further, this mechanism guards against two getMore requests on the same cursor executing
+ at the same time - which might be bad. That should never happen, but if a client driver
+ had a bug, it could (or perhaps some sort of attack situation).
+ */
+ class Pointer : boost::noncopyable {
+ ClientCursor *_c;
+ public:
+ ClientCursor * c() { return _c; }
+ void release() {
+ if( _c ) {
+ assert( _c->_pinValue >= 100 );
+ _c->_pinValue -= 100;
+ _c = 0;
+ }
+ }
+ /**
+ * call this if during a yield, the cursor got deleted
+ * if so, we don't want to use the point address
+ */
+ void deleted() {
+ _c = 0;
+ }
+ ~Pointer() { release(); }
+ Pointer(long long cursorid) {
+ recursive_scoped_lock lock(ccmutex);
+ _c = ClientCursor::find_inlock(cursorid, true);
+ if( _c ) {
+ if( _c->_pinValue >= 100 ) {
+ _c = 0;
+ uasserted(12051, "clientcursor already in use? driver problem?");
+ }
+ _c->_pinValue += 100;
+ }
+ }
+ };
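+
+        // usage sketch (illustrative): pin a cursor for the duration of a getMore
+        // so the background timeout thread cannot delete it out from under us:
+        //
+        //     ClientCursor::Pointer p( cursorid );
+        //     ClientCursor *cursor = p.c();   // 0 if not found (uasserts if already pinned)
+        //     if ( cursor ) { ... }           // pin released when p destructs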
+
+ // This object assures safe and reliable cleanup of the ClientCursor.
+ // The implementation assumes that there will be no duplicate ids among cursors
+ // (which is assured if cursors must last longer than 1 second).
+ class CleanupPointer : boost::noncopyable {
+ public:
+ CleanupPointer() : _c( 0 ), _id( -1 ) {}
+ void reset( ClientCursor *c = 0 ) {
+ if ( c == _c )
+ return;
+ if ( _c ) {
+ // be careful in case cursor was deleted by someone else
+ ClientCursor::erase( _id );
+ }
+ if ( c ) {
+ _c = c;
+ _id = c->_cursorid;
+ }
+ else {
+ _c = 0;
+ _id = -1;
+ }
+ }
+ ~CleanupPointer() {
+ DESTRUCTOR_GUARD ( reset(); );
+ }
+ operator bool() { return _c; }
+ ClientCursor * operator-> () { return _c; }
+ private:
+ ClientCursor *_c;
+ CursorId _id;
+ };
+
+ ClientCursor(int queryOptions, const shared_ptr<Cursor>& c, const string& ns, BSONObj query = BSONObj() );
+
+ ~ClientCursor();
+
+ // *************** basic accessors *******************
+
+ CursorId cursorid() const { return _cursorid; }
+ string ns() const { return _ns; }
+ Database * db() const { return _db; }
+ const BSONObj& query() const { return _query; }
+ int queryOptions() const { return _queryOptions; }
+
+ DiskLoc lastLoc() const { return _lastLoc; }
+
+ /* Get rid of cursors for namespaces 'ns'. When dropping a db, ns is "dbname."
+ Used by drop, dropIndexes, dropDatabase.
+ */
+ static void invalidate(const char *ns);
+
+ /**
+ * @param microsToSleep -1 : ask client
+ * >=0 : sleep for that amount
+ * @param recordToLoad after yielding lock, load this record with only mmutex
+ * do a dbtemprelease
+ * note: caller should check matcher.docMatcher().atomic() first and not yield if atomic -
+         *       we don't do so herein, as this->matcher (above) is only initialized for true queries/getmore.
+ * (ie not set for remote/update)
+ * @return if the cursor is still valid.
+ * if false is returned, then this ClientCursor should be considered deleted -
+ * in fact, the whole database could be gone.
+ */
+ bool yield( int microsToSleep = -1 , Record * recordToLoad = 0 );
+
+ enum RecordNeeds {
+ DontNeed = -1 , MaybeCovered = 0 , WillNeed = 100
+ };
+
+ /**
+ * @param needRecord whether or not the next record has to be read from disk for sure
+         *                   if this is true, will yield if the next record isn't in memory
+         * @param yielded set to true if a yield occurred (and may be set even when a yield was attempted but did not complete)
+ * @return same as yield()
+ */
+ bool yieldSometimes( RecordNeeds need, bool *yielded = 0 );
+
+ static int suggestYieldMicros();
+ static void staticYield( int micros , const StringData& ns , Record * rec );
+
+ struct YieldData { CursorId _id; bool _doingDeletes; };
+ bool prepareToYield( YieldData &data );
+ static bool recoverFromYield( const YieldData &data );
+
+ struct YieldLock : boost::noncopyable {
+ explicit YieldLock( ptr<ClientCursor> cc )
+ : _canYield(cc->_c->supportYields()) {
+ if ( _canYield ) {
+ cc->prepareToYield( _data );
+ _unlock.reset(new dbtempreleasecond());
+ }
+ }
+ ~YieldLock() {
+ if ( _unlock ) {
+ log( LL_WARNING ) << "ClientCursor::YieldLock not closed properly" << endl;
+ relock();
+ }
+ }
+ bool stillOk() {
+ if ( ! _canYield )
+ return true;
+ relock();
+ return ClientCursor::recoverFromYield( _data );
+ }
+ void relock() {
+ _unlock.reset();
+ }
+ private:
+ const bool _canYield;
+ YieldData _data;
+ scoped_ptr<dbtempreleasecond> _unlock;
+ };
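+
+        // usage sketch (illustrative): yield around a slow section, then verify
+        // the cursor survived before touching it again:
+        //
+        //     ClientCursor::YieldLock yl( cursor );
+        //     // ... slow work with the db lock released ...
+        //     if ( ! yl.stillOk() )
+        //         return;   // the cursor was deleted during the yield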
+
+ // --- some pass through helpers for Cursor ---
+
+ Cursor* c() const { return _c.get(); }
+ int pos() const { return _pos; }
+
+ void incPos( int n ) { _pos += n; } // TODO: this is bad
+ void setPos( int n ) { _pos = n; } // TODO : this is bad too
+
+ BSONObj indexKeyPattern() { return _c->indexKeyPattern(); }
+ bool modifiedKeys() const { return _c->modifiedKeys(); }
+ bool isMultiKey() const { return _c->isMultiKey(); }
+
+ bool ok() { return _c->ok(); }
+ bool advance() { return _c->advance(); }
+ BSONObj current() { return _c->current(); }
+ DiskLoc currLoc() { return _c->currLoc(); }
+ BSONObj currKey() const { return _c->currKey(); }
+
+ /**
+ * same as BSONObj::getFieldsDotted
+ * if it can be retrieved from key, it is
+ * @param holder keeps the currKey in scope by keeping a reference to it here. generally you'll want
+ * holder and ret to destruct about the same time.
+ * @return if this was retrieved from key
+ */
+ bool getFieldsDotted( const string& name, BSONElementSet &ret, BSONObj& holder );
+
+ /**
+ * same as BSONObj::getFieldDotted
+ * if it can be retrieved from key, it is
+ * @return if this was retrieved from key
+ */
+ BSONElement getFieldDotted( const string& name , BSONObj& holder , bool * fromKey = 0 ) ;
+
+ /** extract items from object which match a pattern object.
+ * e.g., if pattern is { x : 1, y : 1 }, builds an object with
+ * x and y elements of this object, if they are present.
+ * returns elements with original field names
+ * NOTE: copied from BSONObj::extractFields
+ */
+ BSONObj extractFields(const BSONObj &pattern , bool fillWithNull = false) ;
+
+ bool currentIsDup() { return _c->getsetdup( _c->currLoc() ); }
+
+ bool currentMatches() {
+ if ( ! _c->matcher() )
+ return true;
+ return _c->matcher()->matchesCurrent( _c.get() );
+ }
+
+ void setChunkManager( ShardChunkManagerPtr manager ){ _chunkManager = manager; }
+ ShardChunkManagerPtr getChunkManager(){ return _chunkManager; }
+
+ private:
+ void setLastLoc_inlock(DiskLoc);
+
+ static ClientCursor* find_inlock(CursorId id, bool warn = true) {
+ CCById::iterator it = clientCursorsById.find(id);
+ if ( it == clientCursorsById.end() ) {
+ if ( warn )
+ OCCASIONALLY out() << "ClientCursor::find(): cursor not found in map " << id << " (ok after a drop)\n";
+ return 0;
+ }
+ return it->second;
+ }
+ public:
+ static ClientCursor* find(CursorId id, bool warn = true) {
+ recursive_scoped_lock lock(ccmutex);
+ ClientCursor *c = find_inlock(id, warn);
+ // if this asserts, your code was not thread safe - you either need to set no timeout
+ // for the cursor or keep a ClientCursor::Pointer in scope for it.
+ massert( 12521, "internal error: use of an unlocked ClientCursor", c == 0 || c->_pinValue );
+ return c;
+ }
+
+ static bool erase(CursorId id) {
+ recursive_scoped_lock lock(ccmutex);
+ ClientCursor *cc = find_inlock(id);
+ if ( cc ) {
+ assert( cc->_pinValue < 100 ); // you can't still have an active ClientCursor::Pointer
+ delete cc;
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * @return number of cursors found
+ */
+ static int erase( int n , long long * ids );
+
+ /* call when cursor's location changes so that we can update the
+ cursorsbylocation map. if you are locked and internally iterating, only
+ need to call when you are ready to "unlock".
+ */
+ void updateLocation();
+
+ void mayUpgradeStorage() {
+ /* if ( !ids_.get() )
+ return;
+ stringstream ss;
+ ss << ns << "." << cursorid;
+ ids_->mayUpgradeStorage( ss.str() );*/
+ }
+
+ /**
+ * @param millis amount of idle passed time since last call
+ */
+ bool shouldTimeout( unsigned millis );
+
+ void storeOpForSlave( DiskLoc last );
+ void updateSlaveLocation( CurOp& curop );
+
+ unsigned idleTime() const { return _idleAgeMillis; }
+
+ void setDoingDeletes( bool doingDeletes ) {_doingDeletes = doingDeletes; }
+
+ void slaveReadTill( const OpTime& t ) { _slaveReadTill = t; }
+
+ public: // static methods
+
+ static void idleTimeReport(unsigned millis);
+
+ static void appendStats( BSONObjBuilder& result );
+ static unsigned numCursors() { return clientCursorsById.size(); }
+ static void informAboutToDeleteBucket(const DiskLoc& b);
+ static void aboutToDelete(const DiskLoc& dl);
+ static void find( const string& ns , set<CursorId>& all );
+
+
+ private: // methods
+
+        // cursors normally time out after an inactivity period to prevent excess memory use
+ // setting this prevents timeout of the cursor in question.
+ void noTimeout() { _pinValue++; }
+
+ CCByLoc& byLoc() { return _db->ccByLoc; }
+
+ Record* _recordForYield( RecordNeeds need );
+
+ private:
+
+ CursorId _cursorid;
+
+ const string _ns;
+ Database * _db;
+
+ const shared_ptr<Cursor> _c;
+ map<string,int> _indexedFields; // map from indexed field to offset in key object
+ int _pos; // # objects into the cursor so far
+
+ const BSONObj _query; // used for logging diags only; optional in constructor
+ int _queryOptions; // see enum QueryOptions dbclient.h
+
+ OpTime _slaveReadTill;
+
+ DiskLoc _lastLoc; // use getter and setter not this (important)
+ unsigned _idleAgeMillis; // how long has the cursor been around, relative to server idle time
+
+ /* 0 = normal
+ 1 = no timeout allowed
+ 100 = in use (pinned) -- see Pointer class
+ */
+ unsigned _pinValue;
+
+        bool _doingDeletes; // when true we are performing the delete, so aboutToDelete shouldn't manipulate us
+ ElapsedTracker _yieldSometimesTracker;
+
+ ShardChunkManagerPtr _chunkManager;
+
+ public:
+ shared_ptr<ParsedQuery> pq;
+ shared_ptr<Projection> fields; // which fields query wants returned
+ Message originalMessage; // this is effectively an auto ptr for data the matcher points to
+
+
+
+ private: // static members
+
+ static CCById clientCursorsById;
+ static long long numberTimedOut;
+ static boost::recursive_mutex& ccmutex; // must use this for all statics above!
+ static CursorId allocCursorId_inlock();
+
+ };
+
+ class ClientCursorMonitor : public BackgroundJob {
+ public:
+ string name() const { return "ClientCursorMonitor"; }
+ void run();
+ };
+
+} // namespace mongo
+
+// ClientCursor should only be used with auto_ptr because it needs to be
+// release()ed after a yield if stillOk() returns false and these pointer types
+// do not support releasing. This will prevent them from being used accidentally
+namespace boost{
+ template<> class scoped_ptr<mongo::ClientCursor> {};
+ template<> class shared_ptr<mongo::ClientCursor> {};
+}
diff --git a/src/mongo/db/cloner.cpp b/src/mongo/db/cloner.cpp
new file mode 100644
index 00000000000..e35ae95052d
--- /dev/null
+++ b/src/mongo/db/cloner.cpp
@@ -0,0 +1,763 @@
+// cloner.cpp - copy a database (export/import basically)
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "cloner.h"
+#include "pdfile.h"
+#include "../client/dbclient.h"
+#include "../bson/util/builder.h"
+#include "jsobj.h"
+#include "ops/query.h"
+#include "commands.h"
+#include "db.h"
+#include "instance.h"
+#include "repl.h"
+
+namespace mongo {
+
+ BSONElement getErrField(const BSONObj& o);
+
+ void ensureHaveIdIndex(const char *ns);
+
+ bool replAuthenticate(DBClientBase *);
+
+ /** Selectively release the mutex based on a parameter. */
+ class dbtempreleaseif {
+ public:
+ dbtempreleaseif( bool release ) : _impl( release ? new dbtemprelease() : 0 ) {}
+ private:
+ shared_ptr< dbtemprelease > _impl;
+ };
+
+ void mayInterrupt( bool mayBeInterrupted ) {
+ if ( mayBeInterrupted ) {
+ killCurrentOp.checkForInterrupt( false );
+ }
+ }
+
+ class Cloner: boost::noncopyable {
+ auto_ptr< DBClientWithCommands > conn;
+ void copy(const char *from_ns, const char *to_ns, bool isindex, bool logForRepl,
+ bool masterSameProcess, bool slaveOk, bool mayYield, bool mayBeInterrupted, Query q = Query());
+ struct Fun;
+ public:
+ Cloner() { }
+
+ /* slaveOk - if true it is ok if the source of the data is !ismaster.
+ useReplAuth - use the credentials we normally use as a replication slave for the cloning
+ snapshot - use $snapshot mode for copying collections. note this should not be used when it isn't required, as it will be slower.
+ for example repairDatabase need not use it.
+ */
+ void setConnection( DBClientWithCommands *c ) { conn.reset( c ); }
+
+ /** copy the entire database */
+ bool go(const char *masterHost, string& errmsg, const string& fromdb, bool logForRepl, bool slaveOk, bool useReplAuth, bool snapshot, bool mayYield, bool mayBeInterrupted, int *errCode = 0);
+
+ bool copyCollection( const string& ns , const BSONObj& query , string& errmsg , bool mayYield, bool mayBeInterrupted, bool copyIndexes = true, bool logForRepl = true );
+ };
+
+ /* for index info object:
+ { "name" : "name_1" , "ns" : "foo.index3" , "key" : { "name" : 1.0 } }
+ we need to fix up the value in the "ns" parameter so that the name prefix is correct on a
+ copy to a new name.
+ */
+ BSONObj fixindex(BSONObj o) {
+ BSONObjBuilder b;
+ BSONObjIterator i(o);
+ while ( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+
+ // for now, skip the "v" field so that v:0 indexes will be upgraded to v:1
+ if ( string("v") == e.fieldName() ) {
+ continue;
+ }
+
+ if ( string("ns") == e.fieldName() ) {
+ uassert( 10024 , "bad ns field for index during dbcopy", e.type() == String);
+ const char *p = strchr(e.valuestr(), '.');
+ uassert( 10025 , "bad ns field for index during dbcopy [2]", p);
+ string newname = cc().database()->name + p;
+ b.append("ns", newname);
+ }
+ else
+ b.append(e);
+ }
+ BSONObj res= b.obj();
+
+ /* if( mod ) {
+ out() << "before: " << o.toString() << endl;
+ o.dump();
+ out() << "after: " << res.toString() << endl;
+ res.dump();
+ }*/
+
+ return res;
+ }
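+
+    /* illustrative example of the rewrite fixindex() performs, assuming the
+       current database is named "todb":
+         in:  { name: "name_1", ns: "fromdb.index3", key: { name: 1.0 }, v: 0 }
+         out: { name: "name_1", ns: "todb.index3", key: { name: 1.0 } }
+       ("v" is dropped so v:0 indexes are rebuilt as v:1 on the destination) */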
+
+ struct Cloner::Fun {
+ Fun() : lastLog(0) { }
+ time_t lastLog;
+ void operator()( DBClientCursorBatchIterator &i ) {
+ mongolock l( true );
+ if ( context ) {
+ context->relocked();
+ }
+
+ while( i.moreInCurrentBatch() ) {
+ if ( n % 128 == 127 /*yield some*/ ) {
+ time_t now = time(0);
+ if( now - lastLog >= 60 ) {
+ // report progress
+ if( lastLog )
+ log() << "clone " << to_collection << ' ' << n << endl;
+ lastLog = now;
+ }
+ mayInterrupt( _mayBeInterrupted );
+ dbtempreleaseif t( _mayYield );
+ }
+
+ BSONObj tmp = i.nextSafe();
+
+ /* assure object is valid. note this will slow us down a little. */
+ if ( !tmp.valid() ) {
+ stringstream ss;
+ ss << "Cloner: skipping corrupt object from " << from_collection;
+ BSONElement e = tmp.firstElement();
+ try {
+ e.validate();
+ ss << " firstElement: " << e;
+ }
+ catch( ... ) {
+ ss << " firstElement corrupt";
+ }
+ out() << ss.str() << endl;
+ continue;
+ }
+
+ ++n;
+
+ BSONObj js = tmp;
+ if ( isindex ) {
+ assert( strstr(from_collection, "system.indexes") );
+ js = fixindex(tmp);
+ storedForLater->push_back( js.getOwned() );
+ continue;
+ }
+
+ try {
+ theDataFileMgr.insertWithObjMod(to_collection, js);
+ if ( logForRepl )
+ logOp("i", to_collection, js);
+
+ getDur().commitIfNeeded();
+ }
+ catch( UserException& e ) {
+ log() << "warning: exception cloning object in " << from_collection << ' ' << e.what() << " obj:" << js.toString() << '\n';
+ }
+
+ RARELY if ( time( 0 ) - saveLast > 60 ) {
+ log() << n << " objects cloned so far from collection " << from_collection << endl;
+ saveLast = time( 0 );
+ }
+ }
+ }
+ int n;
+ bool isindex;
+ const char *from_collection;
+ const char *to_collection;
+ time_t saveLast;
+ list<BSONObj> *storedForLater;
+ bool logForRepl;
+ Client::Context *context;
+ bool _mayYield;
+ bool _mayBeInterrupted;
+ };
+
+ /* copy the specified collection
+ isindex - if true, this is system.indexes collection, in which we do some transformation when copying.
+ */
+ void Cloner::copy(const char *from_collection, const char *to_collection, bool isindex, bool logForRepl, bool masterSameProcess, bool slaveOk, bool mayYield, bool mayBeInterrupted, Query query) {
+ list<BSONObj> storedForLater;
+
+ Fun f;
+ f.n = 0;
+ f.isindex = isindex;
+ f.from_collection = from_collection;
+ f.to_collection = to_collection;
+ f.saveLast = time( 0 );
+ f.storedForLater = &storedForLater;
+ f.logForRepl = logForRepl;
+ f._mayYield = mayYield;
+ f._mayBeInterrupted = mayBeInterrupted;
+
+ int options = QueryOption_NoCursorTimeout | ( slaveOk ? QueryOption_SlaveOk : 0 );
+ {
+ f.context = cc().getContext();
+ mayInterrupt( mayBeInterrupted );
+ dbtempreleaseif r( mayYield );
+ DBClientConnection *remote = dynamic_cast< DBClientConnection* >( conn.get() );
+ if ( remote ) {
+ remote->query( boost::function<void(DBClientCursorBatchIterator &)>( f ), from_collection, query, 0, options );
+ }
+ else {
+ // there is no exhaust mode for direct client, so we have this hack
+ auto_ptr<DBClientCursor> c = conn->query( from_collection, query, 0, 0, 0, options );
+ assert( c.get() );
+ while( c->more() ) {
+ DBClientCursorBatchIterator i( *c );
+ f( i );
+ }
+ }
+ }
+
+ if ( storedForLater.size() ) {
+ for ( list<BSONObj>::iterator i = storedForLater.begin(); i!=storedForLater.end(); i++ ) {
+ BSONObj js = *i;
+ try {
+ theDataFileMgr.insertWithObjMod(to_collection, js);
+ if ( logForRepl )
+ logOp("i", to_collection, js);
+
+ getDur().commitIfNeeded();
+ }
+ catch( UserException& e ) {
+ log() << "warning: exception cloning object in " << from_collection << ' ' << e.what() << " obj:" << js.toString() << '\n';
+ }
+ }
+ }
+ }
+
+ bool copyCollectionFromRemote(const string& host, const string& ns, string& errmsg) {
+ Cloner c;
+
+ DBClientConnection *conn = new DBClientConnection();
+ // cloner owns conn in auto_ptr
+ c.setConnection(conn);
+ uassert(15908, errmsg, conn->connect(host, errmsg) && replAuthenticate(conn));
+
+ return c.copyCollection(ns, BSONObj(), errmsg, true, false, /*copyIndexes*/ true, false);
+ }
+
+ bool Cloner::copyCollection( const string& ns, const BSONObj& query, string& errmsg,
+ bool mayYield, bool mayBeInterrupted, bool copyIndexes, bool logForRepl ) {
+
+ writelock lk(ns); // TODO: make this lower down
+ Client::Context ctx(ns);
+
+ {
+ // config
+ string temp = ctx.db()->name + ".system.namespaces";
+ BSONObj config = conn->findOne( temp , BSON( "name" << ns ) );
+ if ( config["options"].isABSONObj() )
+ if ( ! userCreateNS( ns.c_str() , config["options"].Obj() , errmsg, logForRepl , 0 ) )
+ return false;
+ }
+
+ {
+ // main data
+ copy( ns.c_str() , ns.c_str() , /*isindex*/false , logForRepl , false , true , mayYield, mayBeInterrupted, Query(query).snapshot() );
+ }
+
+ /* TODO : copyIndexes bool does not seem to be implemented! */
+ if( !copyIndexes ) {
+ log() << "ERROR copy collection copyIndexes not implemented? " << ns << endl;
+ }
+
+ {
+ // indexes
+ string temp = ctx.db()->name + ".system.indexes";
+ copy( temp.c_str() , temp.c_str() , /*isindex*/true , logForRepl , false , true , mayYield, mayBeInterrupted, BSON( "ns" << ns ) );
+ }
+ getDur().commitIfNeeded();
+ return true;
+ }
+
+ extern bool inDBRepair;
+ void ensureIdIndexForNewNs(const char *ns);
+
+ bool Cloner::go(const char *masterHost, string& errmsg, const string& fromdb, bool logForRepl, bool slaveOk, bool useReplAuth, bool snapshot, bool mayYield, bool mayBeInterrupted, int *errCode) {
+ if ( errCode ) {
+ *errCode = 0;
+ }
+ massert( 10289 , "useReplAuth is not written to replication log", !useReplAuth || !logForRepl );
+
+ string todb = cc().database()->name;
+ stringstream a,b;
+ a << "localhost:" << cmdLine.port;
+ b << "127.0.0.1:" << cmdLine.port;
+ bool masterSameProcess = ( a.str() == masterHost || b.str() == masterHost );
+ if ( masterSameProcess ) {
+ if ( fromdb == todb && cc().database()->path == dbpath ) {
+ // guard against an "infinite" loop
+ /* if you are replicating, the local.sources config may be wrong if you get this */
+ errmsg = "can't clone from self (localhost).";
+ return false;
+ }
+ }
+ /* todo: we can put these releases inside dbclient or a dbclient specialization.
+ or just wait until we get rid of global lock anyway.
+ */
+ string ns = fromdb + ".system.namespaces";
+ list<BSONObj> toClone;
+ {
+ mayInterrupt( mayBeInterrupted );
+ dbtempreleaseif r( mayYield );
+
+ // just using exhaust for collection copying right now
+ auto_ptr<DBClientCursor> c;
+ {
+ if ( conn.get() ) {
+ // nothing to do
+ }
+ else if ( !masterSameProcess ) {
+ ConnectionString cs = ConnectionString::parse( masterHost, errmsg );
+ auto_ptr<DBClientBase> con( cs.connect( errmsg ));
+ if ( !con.get() )
+ return false;
+ if( !replAuthenticate(con.get()) )
+ return false;
+
+ conn = con;
+ }
+ else {
+ conn.reset( new DBDirectClient() );
+ }
+            // todo: if snapshot (bool param to this func) is true, should this query
+            // be snapshotted as well? it would only matter if there were thousands of
+            // collections -- and even then it is hard to exceed a single cursor batch.
+            // for repl it is probably ok as we apply the oplog after the clone
+            // (i.e. repl does not use snapshot=true).
+ c = conn->query( ns.c_str(), BSONObj(), 0, 0, 0, slaveOk ? QueryOption_SlaveOk : 0 );
+ }
+
+ if ( c.get() == 0 ) {
+ errmsg = "query failed " + ns;
+ return false;
+ }
+
+ if ( c->more() ) {
+ BSONObj first = c->next();
+ if( !getErrField(first).eoo() ) {
+ if ( errCode ) {
+ *errCode = first.getIntField("code");
+ }
+ errmsg = "query failed " + ns;
+ return false;
+ }
+ c->putBack( first );
+ }
+
+ while ( c->more() ) {
+ BSONObj collection = c->next();
+
+ log(2) << "\t cloner got " << collection << endl;
+
+ BSONElement e = collection.getField("name");
+ if ( e.eoo() ) {
+ string s = "bad system.namespaces object " + collection.toString();
+ massert( 10290 , s.c_str(), false);
+ }
+ assert( !e.eoo() );
+ assert( e.type() == String );
+ const char *from_name = e.valuestr();
+
+ if( strstr(from_name, ".system.") ) {
+                /* system.users and system.js are cloned -- but nothing else from system.
+                 * system.indexes is handled specially at the end */
+ if( legalClientSystemNS( from_name , true ) == 0 ) {
+ log(2) << "\t\t not cloning because system collection" << endl;
+ continue;
+ }
+ }
+ if( ! NamespaceString::normal( from_name ) ) {
+ log(2) << "\t\t not cloning because has $ " << endl;
+ continue;
+ }
+ toClone.push_back( collection.getOwned() );
+ }
+ }
+
+ for ( list<BSONObj>::iterator i=toClone.begin(); i != toClone.end(); i++ ) {
+ {
+ mayInterrupt( mayBeInterrupted );
+ dbtempreleaseif r( mayYield );
+ }
+ BSONObj collection = *i;
+ log(2) << " really will clone: " << collection << endl;
+ const char * from_name = collection["name"].valuestr();
+ BSONObj options = collection.getObjectField("options");
+
+ /* change name "<fromdb>.collection" -> <todb>.collection */
+ const char *p = strchr(from_name, '.');
+ assert(p);
+ string to_name = todb + p;
+
+ bool wantIdIndex = false;
+ {
+ string err;
+ const char *toname = to_name.c_str();
+ /* we defer building id index for performance - building it in batch is much faster */
+ userCreateNS(toname, options, err, logForRepl, &wantIdIndex);
+ }
+ log(1) << "\t\t cloning " << from_name << " -> " << to_name << endl;
+ Query q;
+ if( snapshot )
+ q.snapshot();
+ copy(from_name, to_name.c_str(), false, logForRepl, masterSameProcess, slaveOk, mayYield, mayBeInterrupted, q);
+
+ if( wantIdIndex ) {
+ /* we need dropDups to be true as we didn't do a true snapshot and this is before applying oplog operations
+ that occur during the initial sync. inDBRepair makes dropDups be true.
+ */
+ bool old = inDBRepair;
+ try {
+ inDBRepair = true;
+ ensureIdIndexForNewNs(to_name.c_str());
+ inDBRepair = old;
+ }
+ catch(...) {
+ inDBRepair = old;
+ throw;
+ }
+ }
+ }
+
+ // now build the indexes
+
+ string system_indexes_from = fromdb + ".system.indexes";
+ string system_indexes_to = todb + ".system.indexes";
+ /* [dm]: is the ID index sometimes not called "_id_"? There is other code in the system that looks for a "_id" prefix
+ rather than this exact value. we should standardize. OR, remove names - which is in the bugdb. Anyway, this
+ is dubious here at the moment.
+ */
+        // won't need to snapshot the system.indexes query, as there can never be very many entries.
+ copy(system_indexes_from.c_str(), system_indexes_to.c_str(), true, logForRepl, masterSameProcess, slaveOk, mayYield, mayBeInterrupted, BSON( "name" << NE << "_id_" ) );
+
+ return true;
+ }
+
+ bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication,
+ bool slaveOk, bool useReplAuth, bool snapshot, bool mayYield, bool mayBeInterrupted, int *errCode) {
+ Cloner c;
+ return c.go(masterHost, errmsg, fromdb, logForReplication, slaveOk, useReplAuth, snapshot, mayYield, mayBeInterrupted, errCode);
+ }
+
+ /* Usage:
+ mydb.$cmd.findOne( { clone: "fromhost" } );
+ */
+ class CmdClone : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream &help ) const {
+ help << "clone this database from an instance of the db on another host\n";
+ help << "{ clone : \"host13\" }";
+ }
+ CmdClone() : Command("clone") { }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string from = cmdObj.getStringField("clone");
+ if ( from.empty() )
+ return false;
+ /* replication note: we must logOp() not the command, but the cloned data -- if the slave
+ were to clone it would get a different point-in-time and not match.
+ */
+ return cloneFrom(from.c_str(), errmsg, dbname,
+ /*logForReplication=*/!fromRepl, /*slaveOk*/false, /*usereplauth*/false, /*snapshot*/true, /*mayYield*/true, /*mayBeInterrupted*/false);
+ }
+ } cmdclone;
+
+ class CmdCloneCollection : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return NONE; }
+ CmdCloneCollection() : Command("cloneCollection") { }
+ virtual void help( stringstream &help ) const {
+ help << "{ cloneCollection: <namespace>, from: <host> [,query: <query_filter>] [,copyIndexes:<bool>] }"
+ "\nCopies a collection from one server to another. Do not use on a single server as the destination "
+ "is placed at the same db.collection (namespace) as the source.\n"
+ "Warning: the local copy of 'ns' is emptied before the copying begins. Any existing data will be lost there."
+ ;
+ }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string fromhost = cmdObj.getStringField("from");
+ if ( fromhost.empty() ) {
+ errmsg = "missing 'from' parameter";
+ return false;
+ }
+ {
+ HostAndPort h(fromhost);
+ if( h.isSelf() ) {
+ errmsg = "can't cloneCollection from self";
+ return false;
+ }
+ }
+ string collection = cmdObj.getStringField("cloneCollection");
+ if ( collection.empty() ) {
+ errmsg = "bad 'cloneCollection' value";
+ return false;
+ }
+ BSONObj query = cmdObj.getObjectField("query");
+ if ( query.isEmpty() )
+ query = BSONObj();
+
+ BSONElement copyIndexesSpec = cmdObj.getField("copyindexes");
+ bool copyIndexes = copyIndexesSpec.isBoolean() ? copyIndexesSpec.boolean() : true;
+
+ log() << "cloneCollection. db:" << dbname << " collection:" << collection << " from: " << fromhost
+ << " query: " << query << " " << ( copyIndexes ? "" : ", not copying indexes" ) << endl;
+
+ Cloner c;
+ auto_ptr<DBClientConnection> myconn;
+ myconn.reset( new DBClientConnection() );
+ if ( ! myconn->connect( fromhost , errmsg ) )
+ return false;
+
+ c.setConnection( myconn.release() );
+
+ return c.copyCollection( collection , query, errmsg , true, false, copyIndexes );
+ }
+ } cmdclonecollection;
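+
+    /* Illustrative shell invocation (a sketch; the host and field values below
+       are hypothetical):
+         db.runCommand( { cloneCollection: "test.users", from: "host2:27017",
+                          query: { active: true }, copyIndexes: true } );
+    */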
+
+
+ thread_specific_ptr< DBClientConnection > authConn_;
+ /* Usage:
+ admindb.$cmd.findOne( { copydbgetnonce: 1, fromhost: <hostname> } );
+ */
+ class CmdCopyDbGetNonce : public Command {
+ public:
+ CmdCopyDbGetNonce() : Command("copydbgetnonce") { }
+ virtual bool adminOnly() const {
+ return true;
+ }
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream &help ) const {
+ help << "get a nonce for subsequent copy db request from secure server\n";
+ help << "usage: {copydbgetnonce: 1, fromhost: <hostname>}";
+ }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string fromhost = cmdObj.getStringField("fromhost");
+ if ( fromhost.empty() ) {
+ /* copy from self */
+ stringstream ss;
+ ss << "localhost:" << cmdLine.port;
+ fromhost = ss.str();
+ }
+ authConn_.reset( new DBClientConnection() );
+ BSONObj ret;
+ {
+ dbtemprelease t;
+ if ( !authConn_->connect( fromhost, errmsg ) )
+ return false;
+ if( !authConn_->runCommand( "admin", BSON( "getnonce" << 1 ), ret ) ) {
+ errmsg = "couldn't get nonce " + ret.toString();
+ return false;
+ }
+ }
+ result.appendElements( ret );
+ return true;
+ }
+ } cmdcopydbgetnonce;
+
+ /* Usage:
+ admindb.$cmd.findOne( { copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>[, username: <username>, nonce: <nonce>, key: <key>] } );
+ */
+ class CmdCopyDb : public Command {
+ public:
+ CmdCopyDb() : Command("copydb") { }
+ virtual bool adminOnly() const {
+ return true;
+ }
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream &help ) const {
+ help << "copy a database from another host to this host\n";
+ help << "usage: {copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>[, slaveOk: <bool>, username: <username>, nonce: <nonce>, key: <key>]}";
+ }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool slaveOk = cmdObj["slaveOk"].trueValue();
+ string fromhost = cmdObj.getStringField("fromhost");
+ if ( fromhost.empty() ) {
+ /* copy from self */
+ stringstream ss;
+ ss << "localhost:" << cmdLine.port;
+ fromhost = ss.str();
+ }
+ string fromdb = cmdObj.getStringField("fromdb");
+ string todb = cmdObj.getStringField("todb");
+ if ( fromhost.empty() || todb.empty() || fromdb.empty() ) {
+ errmsg = "parms missing - {copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>}";
+ return false;
+ }
+ Cloner c;
+ string username = cmdObj.getStringField( "username" );
+ string nonce = cmdObj.getStringField( "nonce" );
+ string key = cmdObj.getStringField( "key" );
+ if ( !username.empty() && !nonce.empty() && !key.empty() ) {
+ uassert( 13008, "must call copydbgetnonce first", authConn_.get() );
+ BSONObj ret;
+ {
+ dbtemprelease t;
+ if ( !authConn_->runCommand( fromdb, BSON( "authenticate" << 1 << "user" << username << "nonce" << nonce << "key" << key ), ret ) ) {
+ errmsg = "unable to login " + ret.toString();
+ return false;
+ }
+ }
+ c.setConnection( authConn_.release() );
+ }
+ Client::Context ctx(todb);
+ bool res = c.go(fromhost.c_str(), errmsg, fromdb, /*logForReplication=*/!fromRepl, slaveOk, /*replauth*/false, /*snapshot*/true, /*mayYield*/true, /*mayBeInterrupted*/ false);
+ return res;
+ }
+ } cmdcopydb;
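+
+    /* Illustrative authenticated copydb flow from the shell (a sketch -- the key
+       derivation assumes the classic nonce scheme; names and hosts are hypothetical):
+         var n = admindb.runCommand( { copydbgetnonce: 1, fromhost: "host1" } ).nonce;
+         var key = hex_md5( n + username + hex_md5( username + ":mongo:" + password ) );
+         admindb.runCommand( { copydb: 1, fromhost: "host1", fromdb: "a", todb: "b",
+                               username: username, nonce: n, key: key } );
+    */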
+
+ class CmdRenameCollection : public Command {
+ public:
+ CmdRenameCollection() : Command( "renameCollection" ) {}
+ virtual bool adminOnly() const {
+ return true;
+ }
+ virtual bool requiresAuth() { return false; } // do our own auth
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ virtual bool logTheOp() {
+ return true; // can't log steps when doing fast rename within a db, so always log the op rather than individual steps comprising it.
+ }
+ virtual void help( stringstream &help ) const {
+ help << " example: { renameCollection: foo.a, to: bar.b }";
+ }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string source = cmdObj.getStringField( name.c_str() );
+ string target = cmdObj.getStringField( "to" );
+ uassert(15967,"invalid collection name: " + target, NamespaceString::validCollectionName(target.c_str()));
+ if ( source.empty() || target.empty() ) {
+ errmsg = "invalid command syntax";
+ return false;
+ }
+
+ bool capped = false;
+ long long size = 0;
+ {
+ Client::Context ctx( source ); // auths against source
+ NamespaceDetails *nsd = nsdetails( source.c_str() );
+ uassert( 10026 , "source namespace does not exist", nsd );
+ capped = nsd->capped;
+ if ( capped )
+ for( DiskLoc i = nsd->firstExtent; !i.isNull(); i = i.ext()->xnext )
+ size += i.ext()->length;
+ }
+
+ Client::Context ctx( target ); //auths against target
+
+ if ( nsdetails( target.c_str() ) ) {
+ uassert( 10027 , "target namespace exists", cmdObj["dropTarget"].trueValue() );
+ BSONObjBuilder bb( result.subobjStart( "dropTarget" ) );
+ dropCollection( target , errmsg , bb );
+ bb.done();
+ if ( errmsg.size() > 0 )
+ return false;
+ }
+
+ {
+ char from[256];
+ nsToDatabase( source.c_str(), from );
+ char to[256];
+ nsToDatabase( target.c_str(), to );
+ if ( strcmp( from, to ) == 0 ) {
+ renameNamespace( source.c_str(), target.c_str() );
+ // make sure we drop counters etc
+ Top::global.collectionDropped( source );
+ return true;
+ }
+ }
+
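+            // cross-database rename: there is no fast path, so create the target
+            // with matching options, copy every document and index through the
+            // DBDirectClient below, then drop the source collection.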
+ BSONObjBuilder spec;
+ if ( capped ) {
+ spec.appendBool( "capped", true );
+ spec.append( "size", double( size ) );
+ }
+ if ( !userCreateNS( target.c_str(), spec.done(), errmsg, false ) )
+ return false;
+
+ auto_ptr< DBClientCursor > c;
+ DBDirectClient bridge;
+
+            c = bridge.query( source, BSONObj() );
+            while( c->more() ) {
+ BSONObj o = c->next();
+ theDataFileMgr.insertWithObjMod( target.c_str(), o );
+ }
+
+ char cl[256];
+ nsToDatabase( source.c_str(), cl );
+ string sourceIndexes = string( cl ) + ".system.indexes";
+ nsToDatabase( target.c_str(), cl );
+ string targetIndexes = string( cl ) + ".system.indexes";
+            c = bridge.query( sourceIndexes, QUERY( "ns" << source ) );
+            while( c->more() ) {
+ BSONObj o = c->next();
+ BSONObjBuilder b;
+ BSONObjIterator i( o );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ if ( strcmp( e.fieldName(), "ns" ) == 0 ) {
+ b.append( "ns", target );
+ }
+ else {
+ b.append( e );
+ }
+ }
+ BSONObj n = b.done();
+ theDataFileMgr.insertWithObjMod( targetIndexes.c_str(), n );
+ }
+
+ {
+ Client::Context ctx( source );
+ dropCollection( source, errmsg, result );
+ }
+ return true;
+ }
+ } cmdrenamecollection;
+
+} // namespace mongo
diff --git a/src/mongo/db/cloner.h b/src/mongo/db/cloner.h
new file mode 100644
index 00000000000..130fea0fac1
--- /dev/null
+++ b/src/mongo/db/cloner.h
@@ -0,0 +1,39 @@
+// cloner.h - copy a database (export/import basically)
+
+/**
+ * Copyright (C) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+
+namespace mongo {
+
+ /**
+ * @param slaveOk - if true it is ok if the source of the data is !ismaster.
+ * @param useReplAuth - use the credentials we normally use as a replication slave for the cloning
+ * @param snapshot - use $snapshot mode for copying collections. note this should not be used when it isn't required, as it will be slower.
+ * for example repairDatabase need not use it.
+ * @param errCode - If provided, this will be set on error to the server's error code. Currently
+ * this will only be set if there is an error in the initial system.namespaces query.
+ */
+ bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication,
+ bool slaveOk, bool useReplAuth, bool snapshot, bool mayYield,
+ bool mayBeInterrupted, int *errCode = 0);
+
+ bool copyCollectionFromRemote(const string& host, const string& ns, string& errmsg);
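+
+    /* Illustrative call (a sketch, not from the original source; host and ns
+       are hypothetical):
+         string errmsg;
+         if ( !copyCollectionFromRemote( "host2:27017", "test.users", errmsg ) )
+             log() << "copy failed: " << errmsg << endl;
+    */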
+
+} // namespace mongo
diff --git a/src/mongo/db/cmdline.cpp b/src/mongo/db/cmdline.cpp
new file mode 100644
index 00000000000..a9b0d7097ca
--- /dev/null
+++ b/src/mongo/db/cmdline.cpp
@@ -0,0 +1,519 @@
+// cmdline.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "cmdline.h"
+#include "commands.h"
+#include "../util/password.h"
+#include "../util/processinfo.h"
+#include "../util/net/listen.h"
+#include "security_common.h"
+#ifdef _WIN32
+#include <direct.h>
+#else
+#include <sys/types.h>
+#include <sys/wait.h>
+#endif
+#include "globals.h"
+
+#define MAX_LINE_LENGTH 256
+
+namespace po = boost::program_options;
+namespace fs = boost::filesystem;
+
+namespace mongo {
+
+ void setupSignals( bool inFork );
+ string getHostNameCached();
+ static BSONArray argvArray;
+ static BSONObj parsedOpts;
+
+ void CmdLine::addGlobalOptions( boost::program_options::options_description& general ,
+ boost::program_options::options_description& hidden ) {
+ /* support for -vv -vvvv etc. */
+ for (string s = "vv"; s.length() <= 12; s.append("v")) {
+ hidden.add_options()(s.c_str(), "verbose");
+ }
+
+ general.add_options()
+ ("help,h", "show this usage information")
+ ("version", "show version information")
+ ("config,f", po::value<string>(), "configuration file specifying additional options")
+ ("verbose,v", "be more verbose (include multiple times for more verbosity e.g. -vvvvv)")
+ ("quiet", "quieter output")
+ ("port", po::value<int>(&cmdLine.port), "specify port number")
+ ("bind_ip", po::value<string>(&cmdLine.bind_ip), "comma separated list of ip addresses to listen on - all local ips by default")
+ ("maxConns",po::value<int>(), "max number of simultaneous connections")
+ ("objcheck", "inspect client data for validity on receipt")
+ ("logpath", po::value<string>() , "log file to send write to instead of stdout - has to be a file, not directory" )
+ ("logappend" , "append to logpath instead of over-writing" )
+ ("pidfilepath", po::value<string>(), "full path to pidfile (if not set, no pidfile is created)")
+ ("keyFile", po::value<string>(), "private key for cluster authentication (only for replica sets)")
+#ifndef _WIN32
+ ("nounixsocket", "disable listening on unix sockets")
+ ("unixSocketPrefix", po::value<string>(), "alternative directory for UNIX domain sockets (defaults to /tmp)")
+ ("fork" , "fork server process" )
+ ("syslog" , "log to system's syslog facility instead of file or stdout" )
+#endif
+ ;
+
+ hidden.add_options()
+ ("cloud", po::value<string>(), "custom dynamic host naming")
+#ifdef MONGO_SSL
+ ("sslOnNormalPorts" , "use ssl on configured ports" )
+ ("sslPEMKeyFile" , po::value<string>(&cmdLine.sslPEMKeyFile), "PEM file for ssl" )
+ ("sslPEMKeyPassword" , new PasswordValue(&cmdLine.sslPEMKeyPassword) , "PEM file password" )
+#endif
+ ;
+
+ }
+
+
+#if defined(_WIN32)
+ void CmdLine::addWindowsOptions( boost::program_options::options_description& windows ,
+ boost::program_options::options_description& hidden ) {
+ windows.add_options()
+ ("install", "install mongodb service")
+ ("remove", "remove mongodb service")
+ ("reinstall", "reinstall mongodb service (equivilant of mongod --remove followed by mongod --install)")
+ ("serviceName", po::value<string>(), "windows service name")
+ ("serviceDisplayName", po::value<string>(), "windows service display name")
+ ("serviceDescription", po::value<string>(), "windows service description")
+ ("serviceUser", po::value<string>(), "user name service executes as")
+ ("servicePassword", po::value<string>(), "password used to authenticate serviceUser")
+ ;
+ hidden.add_options()("service", "start mongodb service");
+ }
+#endif
+
+ void CmdLine::parseConfigFile( istream &f, stringstream &ss ) {
+ string s;
+ char line[MAX_LINE_LENGTH];
+
+ while ( f ) {
+ f.getline(line, MAX_LINE_LENGTH);
+ s = line;
+            s.erase( std::remove(s.begin(), s.end(), ' '), s.end() );
+            s.erase( std::remove(s.begin(), s.end(), '\t'), s.end() );
+ boost::to_upper(s);
+
+ if ( s.find( "FASTSYNC" ) != string::npos )
+ cout << "warning \"fastsync\" should not be put in your configuration file" << endl;
+
+ if ( s.c_str()[0] == '#' ) {
+ // skipping commented line
+ } else if ( s.find( "=FALSE" ) == string::npos ) {
+ ss << line << endl;
+ } else {
+ cout << "warning: remove or comment out this line by starting it with \'#\', skipping now : " << line << endl;
+ }
+ }
+ return;
+ }
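+
+    /* Illustrative config file accepted by the parser above (a sketch; the
+       option names are examples only):
+         # comment lines are skipped
+         port = 27117
+         logpath = /var/log/mongod.log
+       note a line such as "logappend = false" is skipped with a warning, since
+       "option = false" lines are filtered out rather than parsed. */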
+
+#ifndef _WIN32
+ // support for exit value propagation with fork
+ void launchSignal( int sig ) {
+ if ( sig == SIGUSR2 ) {
+ pid_t cur = getpid();
+
+ if ( cur == cmdLine.parentProc || cur == cmdLine.leaderProc ) {
+ // signal indicates successful start allowing us to exit
+ _exit(0);
+ }
+ }
+ }
+
+ void setupLaunchSignals() {
+ assert( signal(SIGUSR2 , launchSignal ) != SIG_ERR );
+ }
+
+
+ void CmdLine::launchOk() {
+ if ( cmdLine.doFork ) {
+ // killing leader will propagate to parent
+ assert( kill( cmdLine.leaderProc, SIGUSR2 ) == 0 );
+ }
+ }
+#endif
+
+ bool CmdLine::store( int argc , char ** argv ,
+ boost::program_options::options_description& visible,
+ boost::program_options::options_description& hidden,
+ boost::program_options::positional_options_description& positional,
+ boost::program_options::variables_map &params ) {
+
+
+ {
+ // setup binary name
+ cmdLine.binaryName = argv[0];
+ size_t i = cmdLine.binaryName.rfind( '/' );
+ if ( i != string::npos )
+ cmdLine.binaryName = cmdLine.binaryName.substr( i + 1 );
+
+ // setup cwd
+ char buffer[1024];
+#ifdef _WIN32
+ assert( _getcwd( buffer , 1000 ) );
+#else
+ assert( getcwd( buffer , 1000 ) );
+#endif
+ cmdLine.cwd = buffer;
+ }
+
+
+        /* don't allow guessing - creates ambiguities when some options are
+         * prefixes of others. allow long disguise and disallow sticky so that
+         * our -vvvvv verbosity trick keeps working. */
+ int style = (((po::command_line_style::unix_style ^
+ po::command_line_style::allow_guessing) |
+ po::command_line_style::allow_long_disguise) ^
+ po::command_line_style::allow_sticky);
+
+
+ try {
+
+ po::options_description all;
+ all.add( visible );
+ all.add( hidden );
+
+ po::store( po::command_line_parser(argc, argv)
+ .options( all )
+ .positional( positional )
+ .style( style )
+ .run(),
+ params );
+
+ if ( params.count("config") ) {
+ ifstream f( params["config"].as<string>().c_str() );
+ if ( ! f.is_open() ) {
+ cout << "ERROR: could not read from config file" << endl << endl;
+ cout << visible << endl;
+ return false;
+ }
+
+ stringstream ss;
+ CmdLine::parseConfigFile( f, ss );
+ po::store( po::parse_config_file( ss , all ) , params );
+ f.close();
+ }
+
+ po::notify(params);
+ }
+ catch (po::error &e) {
+ cout << "error command line: " << e.what() << endl;
+ cout << "use --help for help" << endl;
+ //cout << visible << endl;
+ return false;
+ }
+
+ if (params.count("verbose")) {
+ logLevel = 1;
+ }
+
+ for (string s = "vv"; s.length() <= 12; s.append("v")) {
+ if (params.count(s)) {
+ logLevel = s.length();
+ }
+ }
+
+ if (params.count("quiet")) {
+ cmdLine.quiet = true;
+ }
+
+ if ( params.count( "maxConns" ) ) {
+ int newSize = params["maxConns"].as<int>();
+ if ( newSize < 5 ) {
+ out() << "maxConns has to be at least 5" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ else if ( newSize >= 10000000 ) {
+ out() << "maxConns can't be greater than 10000000" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ connTicketHolder.resize( newSize );
+ }
+
+ if (params.count("objcheck")) {
+ cmdLine.objcheck = true;
+ }
+
+ string logpath;
+
+#ifndef _WIN32
+ if (params.count("unixSocketPrefix")) {
+ cmdLine.socket = params["unixSocketPrefix"].as<string>();
+ if (!fs::is_directory(cmdLine.socket)) {
+ cout << cmdLine.socket << " must be a directory" << endl;
+ ::exit(-1);
+ }
+ }
+
+ if (params.count("nounixsocket")) {
+ cmdLine.noUnixSocket = true;
+ }
+
+ if (params.count("fork")) {
+ cmdLine.doFork = true;
+ if ( ! params.count( "logpath" ) && ! params.count( "syslog" ) ) {
+ cout << "--fork has to be used with --logpath or --syslog" << endl;
+ ::exit(-1);
+ }
+
+ if ( params.count( "logpath" ) ) {
+ // test logpath
+ logpath = params["logpath"].as<string>();
+ assert( logpath.size() );
+ if ( logpath[0] != '/' ) {
+ logpath = cmdLine.cwd + "/" + logpath;
+ }
+ FILE * test = fopen( logpath.c_str() , "a" );
+ if ( ! test ) {
+ cout << "can't open [" << logpath << "] for log file: " << errnoWithDescription() << endl;
+ ::exit(-1);
+ }
+ fclose( test );
+ }
+
+ cout.flush();
+ cerr.flush();
+
+ cmdLine.parentProc = getpid();
+
+ // facilitate clean exit when child starts successfully
+ setupLaunchSignals();
+
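+            // double fork to daemonize: the parent waits on an intermediate
+            // "leader" process, which setsid()s and forks the real server.
+            // on successful startup launchOk() sends the leader SIGUSR2; the
+            // leader _exit(0)s and the waiting parent then exits 0 as well.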
+ pid_t c = fork();
+ if ( c ) {
+ int pstat;
+ waitpid(c, &pstat, 0);
+
+ if ( WIFEXITED(pstat) ) {
+ if ( ! WEXITSTATUS(pstat) ) {
+ cout << "child process started successfully, parent exiting" << endl;
+ }
+
+ _exit( WEXITSTATUS(pstat) );
+ }
+
+ _exit(50);
+ }
+
+ if ( chdir("/") < 0 ) {
+ cout << "Cant chdir() while forking server process: " << strerror(errno) << endl;
+ ::exit(-1);
+ }
+ setsid();
+
+ cmdLine.leaderProc = getpid();
+
+ pid_t c2 = fork();
+ if ( c2 ) {
+ int pstat;
+ cout << "forked process: " << c2 << endl;
+ waitpid(c2, &pstat, 0);
+
+ if ( WIFEXITED(pstat) ) {
+ _exit( WEXITSTATUS(pstat) );
+ }
+
+ _exit(51);
+ }
+
+ // stdout handled in initLogging
+ //fclose(stdout);
+ //freopen("/dev/null", "w", stdout);
+
+ fclose(stderr);
+ fclose(stdin);
+
+ FILE* f = freopen("/dev/null", "w", stderr);
+ if ( f == NULL ) {
+ cout << "Cant reassign stderr while forking server process: " << strerror(errno) << endl;
+ ::exit(-1);
+ }
+
+ f = freopen("/dev/null", "r", stdin);
+ if ( f == NULL ) {
+ cout << "Cant reassign stdin while forking server process: " << strerror(errno) << endl;
+ ::exit(-1);
+ }
+
+ setupCoreSignals();
+ setupSignals( true );
+ }
+
+ if (params.count("syslog")) {
+ StringBuilder sb(128);
+ sb << cmdLine.binaryName << "." << cmdLine.port;
+ Logstream::useSyslog( sb.str().c_str() );
+ }
+#endif
+ if (params.count("logpath")) {
+ if ( params.count("syslog") ) {
+ cout << "Cant use both a logpath and syslog " << endl;
+ ::exit(-1);
+ }
+
+ if ( logpath.size() == 0 )
+ logpath = params["logpath"].as<string>();
+ uassert( 10033 , "logpath has to be non-zero" , logpath.size() );
+ initLogging( logpath , params.count( "logappend" ) );
+ }
+
+ if ( params.count("pidfilepath")) {
+ writePidFile( params["pidfilepath"].as<string>() );
+ }
+
+ if (params.count("keyFile")) {
+ const string f = params["keyFile"].as<string>();
+
+ if (!setUpSecurityKey(f)) {
+                // error message printed in setUpSecurityKey
+ dbexit(EXIT_BADOPTIONS);
+ }
+
+ cmdLine.keyFile = true;
+ noauth = false;
+ }
+ else {
+ cmdLine.keyFile = false;
+ }
+
+#ifdef MONGO_SSL
+ if (params.count("sslOnNormalPorts") ) {
+ cmdLine.sslOnNormalPorts = true;
+
+ if ( cmdLine.sslPEMKeyPassword.size() == 0 ) {
+ log() << "need sslPEMKeyPassword" << endl;
+ dbexit(EXIT_BADOPTIONS);
+ }
+
+ if ( cmdLine.sslPEMKeyFile.size() == 0 ) {
+ log() << "need sslPEMKeyFile" << endl;
+ dbexit(EXIT_BADOPTIONS);
+ }
+
+ cmdLine.sslServerManager = new SSLManager( false );
+ cmdLine.sslServerManager->setupPEM( cmdLine.sslPEMKeyFile , cmdLine.sslPEMKeyPassword );
+        }
+        else if ( cmdLine.sslPEMKeyFile.size() || cmdLine.sslPEMKeyPassword.size() ) {
+ log() << "need to enable sslOnNormalPorts" << endl;
+ dbexit(EXIT_BADOPTIONS);
+ }
+#endif
+
+ {
+ BSONObjBuilder b;
+ for (po::variables_map::const_iterator it(params.begin()), end(params.end()); it != end; it++){
+ if (!it->second.defaulted()){
+ const string& key = it->first;
+ const po::variable_value& value = it->second;
+ const type_info& type = value.value().type();
+
+ if (type == typeid(string)){
+ if (value.as<string>().empty())
+ b.appendBool(key, true); // boost po uses empty string for flags like --quiet
+ else
+ b.append(key, value.as<string>());
+ }
+ else if (type == typeid(int))
+ b.append(key, value.as<int>());
+ else if (type == typeid(double))
+ b.append(key, value.as<double>());
+ else if (type == typeid(bool))
+ b.appendBool(key, value.as<bool>());
+ else if (type == typeid(long))
+ b.appendNumber(key, (long long)value.as<long>());
+ else if (type == typeid(unsigned))
+ b.appendNumber(key, (long long)value.as<unsigned>());
+ else if (type == typeid(unsigned long long))
+ b.appendNumber(key, (long long)value.as<unsigned long long>());
+ else if (type == typeid(vector<string>))
+ b.append(key, value.as<vector<string> >());
+ else
+ b.append(key, "UNKNOWN TYPE: " + demangleName(type));
+ }
+ }
+ parsedOpts = b.obj();
+ }
+
+ {
+ BSONArrayBuilder b;
+ for (int i=0; i < argc; i++)
+ b << argv[i];
+ argvArray = b.arr();
+ }
+
+ return true;
+ }
+
+ void printCommandLineOpts() {
+ log() << "options: " << parsedOpts << endl;
+ }
+
+ void ignoreSignal( int sig ) {}
+
+ void setupCoreSignals() {
+#if !defined(_WIN32)
+ assert( signal(SIGUSR1 , rotateLogs ) != SIG_ERR );
+ assert( signal(SIGHUP , ignoreSignal ) != SIG_ERR );
+#endif
+ }
+
+ class CmdGetCmdLineOpts : Command {
+ public:
+ CmdGetCmdLineOpts(): Command("getCmdLineOpts") {}
+ void help(stringstream& h) const { h << "get argv"; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool slaveOk() const { return true; }
+
+ virtual bool run(const string&, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ result.append("argv", argvArray);
+ result.append("parsed", parsedOpts);
+ return true;
+ }
+
+ } cmdGetCmdLineOpts;
+
+ string prettyHostName() {
+ StringBuilder s(128);
+ s << getHostNameCached();
+ if( cmdLine.port != CmdLine::DefaultDBPort )
+ s << ':' << mongo::cmdLine.port;
+ return s.str();
+ }
+
+ casi< map<string,ParameterValidator*> * > pv_all (NULL);
+
+ ParameterValidator::ParameterValidator( const string& name ) : _name( name ) {
+ if ( ! pv_all)
+ pv_all.ref() = new map<string,ParameterValidator*>();
+ (*pv_all.ref())[_name] = this;
+ }
+
+ ParameterValidator * ParameterValidator::get( const string& name ) {
+ map<string,ParameterValidator*>::const_iterator i = pv_all.get()->find( name );
+ if ( i == pv_all.get()->end() )
+ return NULL;
+ return i->second;
+ }
+
+}
diff --git a/src/mongo/db/cmdline.h b/src/mongo/db/cmdline.h
new file mode 100644
index 00000000000..5fe6ceb1005
--- /dev/null
+++ b/src/mongo/db/cmdline.h
@@ -0,0 +1,203 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "jsobj.h"
+
+namespace mongo {
+
+#ifdef MONGO_SSL
+ class SSLManager;
+#endif
+
+ /* command line options
+ */
+ /* concurrency: OK/READ */
+ struct CmdLine {
+
+ CmdLine();
+
+ string binaryName; // mongod or mongos
+        string cwd;        // cwd at the time the process started
+
+ // this is suboptimal as someone could rename a binary. todo...
+ bool isMongos() const { return binaryName == "mongos"; }
+
+ int port; // --port
+ enum {
+ DefaultDBPort = 27017,
+ ConfigServerPort = 27019,
+ ShardServerPort = 27018
+ };
+ bool isDefaultPort() const { return port == DefaultDBPort; }
+
+ string bind_ip; // --bind_ip
+ bool rest; // --rest
+ bool jsonp; // --jsonp
+
+ string _replSet; // --replSet[/<seedlist>]
+ string ourSetName() const {
+ string setname;
+ size_t sl = _replSet.find('/');
+ if( sl == string::npos )
+ return _replSet;
+ return _replSet.substr(0, sl);
+ }
+ bool usingReplSets() const { return !_replSet.empty(); }
+
+ // for master/slave replication
+ string source; // --source
+ string only; // --only
+
+ bool quiet; // --quiet
+ bool noTableScan; // --notablescan no table scans allowed
+ bool prealloc; // --noprealloc no preallocation of data files
+ bool preallocj; // --nopreallocj no preallocation of journal files
+ bool smallfiles; // --smallfiles allocate smaller data files
+
+ bool configsvr; // --configsvr
+
+ bool quota; // --quota
+ int quotaFiles; // --quotaFiles
+ bool cpu; // --cpu show cpu time periodically
+
+ bool dur; // --dur durability (now --journal)
+ unsigned journalCommitInterval; // group/batch commit interval ms
+
+ /** --durOptions 7 dump journal and terminate without doing anything further
+ --durOptions 4 recover and terminate without listening
+ */
+ enum { // bits to be ORed
+ DurDumpJournal = 1, // dump diagnostics on the journal during recovery
+ DurScanOnly = 2, // don't do any real work, just scan and dump if dump specified
+ DurRecoverOnly = 4, // terminate after recovery step
+ DurParanoid = 8, // paranoid mode enables extra checks
+ DurAlwaysCommit = 16, // do a group commit every time the writelock is released
+ DurAlwaysRemap = 32, // remap the private view after every group commit (may lag to the next write lock acquisition, but will do all files then)
+ DurNoCheckSpace = 64 // don't check that there is enough room for journal files before startup (for diskfull tests)
+ };
+ int durOptions; // --durOptions <n> for debugging
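+        // e.g. a hypothetical --durOptions 5 == DurDumpJournal|DurRecoverOnly:
+        // dump journal diagnostics during recovery, then terminate.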
+
+ bool objcheck; // --objcheck
+
+ long long oplogSize; // --oplogSize
+ int defaultProfile; // --profile
+        int slowMS;        // --slowms: threshold in ms for an operation to count as "slow"
+
+ int pretouch; // --pretouch for replication application (experimental)
+ bool moveParanoia; // for move chunk paranoia
+ double syncdelay; // seconds between fsyncs
+
+ bool noUnixSocket; // --nounixsocket
+ bool doFork; // --fork
+ string socket; // UNIX domain socket directory
+
+ bool keyFile;
+
+#ifndef _WIN32
+ pid_t parentProc; // --fork pid of initial process
+ pid_t leaderProc; // --fork pid of leader process
+#endif
+
+#ifdef MONGO_SSL
+ bool sslOnNormalPorts; // --sslOnNormalPorts
+ string sslPEMKeyFile; // --sslPEMKeyFile
+ string sslPEMKeyPassword; // --sslPEMKeyPassword
+
+ SSLManager* sslServerManager; // currently leaks on close
+#endif
+
+ static void launchOk();
+
+ static void addGlobalOptions( boost::program_options::options_description& general ,
+ boost::program_options::options_description& hidden );
+
+ static void addWindowsOptions( boost::program_options::options_description& windows ,
+ boost::program_options::options_description& hidden );
+
+
+ static void parseConfigFile( istream &f, stringstream &ss);
+ /**
+ * @return true if should run program, false if should exit
+ */
+ static bool store( int argc , char ** argv ,
+ boost::program_options::options_description& visible,
+ boost::program_options::options_description& hidden,
+ boost::program_options::positional_options_description& positional,
+ boost::program_options::variables_map &output );
+
+ time_t started;
+ };
+
+ // todo move to cmdline.cpp?
+ inline CmdLine::CmdLine() :
+ port(DefaultDBPort), rest(false), jsonp(false), quiet(false), noTableScan(false), prealloc(true), preallocj(true), smallfiles(sizeof(int*) == 4),
+ configsvr(false),
+ quota(false), quotaFiles(8), cpu(false), durOptions(0), objcheck(false), oplogSize(0), defaultProfile(0), slowMS(100), pretouch(0), moveParanoia( true ),
+ syncdelay(60), noUnixSocket(false), doFork(0), socket("/tmp")
+ {
+ started = time(0);
+
+ journalCommitInterval = 0; // 0 means use default
+ dur = false;
+#if defined(_DURABLEDEFAULTON)
+ dur = true;
+#endif
+ if( sizeof(void*) == 8 )
+ dur = true;
+#if defined(_DURABLEDEFAULTOFF)
+ dur = false;
+#endif
+
+#ifdef MONGO_SSL
+ sslOnNormalPorts = false;
+ sslServerManager = 0;
+#endif
+ }
+
+ extern CmdLine cmdLine;
+
+ void setupLaunchSignals();
+ void setupCoreSignals();
+
+ string prettyHostName();
+
+ void printCommandLineOpts();
+
+    /**
+     * used for the setParameter command
+     * so you can write validation code that lives with the code using it
+     * rather than centrally in the command implementation
+     * also lets you have mongos- or mongod-specific validation
+     * without pulling in all sorts of dependencies
+     */
+ class ParameterValidator {
+ public:
+ ParameterValidator( const string& name );
+ virtual ~ParameterValidator() {}
+
+ virtual bool isValid( BSONElement e , string& errmsg ) const = 0;
+
+ static ParameterValidator * get( const string& name );
+
+ private:
+ const string _name;
+ };
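+
+    /* Illustrative subclass (a sketch, not from the original source; the
+       parameter name and bounds are hypothetical):
+
+           class DemoIntervalValidator : public ParameterValidator {
+           public:
+               DemoIntervalValidator() : ParameterValidator( "demoInterval" ) {}
+               virtual bool isValid( BSONElement e , string& errmsg ) const {
+                   if ( e.isNumber() && e.numberInt() >= 1 && e.numberInt() <= 500 )
+                       return true;
+                   errmsg = "demoInterval must be a number between 1 and 500";
+                   return false;
+               }
+           } demoIntervalValidator;
+    */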
+
+}
+
diff --git a/src/mongo/db/collection.h b/src/mongo/db/collection.h
new file mode 100644
index 00000000000..998b2f0beac
--- /dev/null
+++ b/src/mongo/db/collection.h
@@ -0,0 +1,15 @@
+// @file collection.h
+
+#pragma once
+
+#include "namespace.h"
+
+namespace mongo {
+
+ class Collection {
+ public:
+ NamespaceDetails * const d;
+ NamespaceDetailsTransient * const nsd;
+ };
+
+}
diff --git a/src/mongo/db/commands.cpp b/src/mongo/db/commands.cpp
new file mode 100755
index 00000000000..cbe9ffc6861
--- /dev/null
+++ b/src/mongo/db/commands.cpp
@@ -0,0 +1,209 @@
+/* commands.cpp
+ db "commands" (sent via db.$cmd.findOne(...))
+ */
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "jsobj.h"
+#include "commands.h"
+#include "client.h"
+#include "replutil.h"
+
+namespace mongo {
+
+ map<string,Command*> * Command::_commandsByBestName;
+ map<string,Command*> * Command::_webCommands;
+ map<string,Command*> * Command::_commands;
+
+ string Command::parseNsFullyQualified(const string& dbname, const BSONObj& cmdObj) const {
+ string s = cmdObj.firstElement().valuestr();
+ NamespaceString nss(s);
+ // these are for security, do not remove:
+ verify(15966, dbname == nss.db || dbname == "admin" );
+ verify(15962, !nss.db.empty() );
+ return s;
+ }
+
+ /*virtual*/ string Command::parseNs(const string& dbname, const BSONObj& cmdObj) const {
+ string coll = cmdObj.firstElement().valuestr();
+#if defined(CLC)
+ DEV if( mongoutils::str::startsWith(coll, dbname+'.') ) {
+ log() << "DEBUG parseNs Command's collection name looks like it includes the db name\n"
+ << dbname << '\n'
+ << coll << '\n'
+ << cmdObj.toString() << endl;
+ dassert(false);
+ }
+#endif
+ return dbname + '.' + coll;
+ }
+
+ void Command::htmlHelp(stringstream& ss) const {
+ string helpStr;
+ {
+ stringstream h;
+ help(h);
+ helpStr = h.str();
+ }
+ ss << "\n<tr><td>";
+ bool web = _webCommands->count(name) != 0;
+ if( web ) ss << "<a href=\"/" << name << "?text=1\">";
+ ss << name;
+ if( web ) ss << "</a>";
+ ss << "</td>\n";
+ ss << "<td>";
+ int l = locktype();
+ //if( l == NONE ) ss << "N ";
+ if( l == READ ) ss << "R ";
+ else if( l == WRITE ) ss << "W ";
+ if( slaveOk() )
+ ss << "S ";
+ if( adminOnly() )
+ ss << "A";
+ ss << "</td>";
+ ss << "<td>";
+ if( helpStr != "no help defined" ) {
+ const char *p = helpStr.c_str();
+ while( *p ) {
+ if( *p == '<' ) {
+ ss << "&lt;";
+ p++; continue;
+ }
+ else if( *p == '{' )
+ ss << "<code>";
+ else if( *p == '}' ) {
+ ss << "}</code>";
+ p++;
+ continue;
+ }
+ if( strncmp(p, "http:", 5) == 0 ) {
+ ss << "<a href=\"";
+ const char *q = p;
+ while( *q && *q != ' ' && *q != '\n' )
+ ss << *q++;
+ ss << "\">";
+ q = p;
+ if( startsWith(q, "http://www.mongodb.org/display/") )
+ q += 31;
+ while( *q && *q != ' ' && *q != '\n' ) {
+ ss << (*q == '+' ? ' ' : *q);
+ q++;
+ if( *q == '#' )
+ while( *q && *q != ' ' && *q != '\n' ) q++;
+ }
+ ss << "</a>";
+ p = q;
+ continue;
+ }
+ if( *p == '\n' ) ss << "<br>";
+ else ss << *p;
+ p++;
+ }
+ }
+ ss << "</td>";
+ ss << "</tr>\n";
+ }
+
+ Command::Command(const char *_name, bool web, const char *oldName) : name(_name) {
+ // register ourself.
+ if ( _commands == 0 )
+ _commands = new map<string,Command*>;
+ if( _commandsByBestName == 0 )
+ _commandsByBestName = new map<string,Command*>;
+ Command*& c = (*_commands)[name];
+ if ( c )
+ log() << "warning: 2 commands with name: " << _name << endl;
+ c = this;
+ (*_commandsByBestName)[name] = this;
+
+ if( web ) {
+ if( _webCommands == 0 )
+ _webCommands = new map<string,Command*>;
+ (*_webCommands)[name] = this;
+ }
+
+ if( oldName )
+ (*_commands)[oldName] = this;
+ }
+
+ void Command::help( stringstream& help ) const {
+ help << "no help defined";
+ }
+
+ Command* Command::findCommand( const string& name ) {
+ map<string,Command*>::iterator i = _commands->find( name );
+ if ( i == _commands->end() )
+ return 0;
+ return i->second;
+ }
+
+
+ Command::LockType Command::locktype( const string& name ) {
+ Command * c = findCommand( name );
+ if ( ! c )
+ return WRITE;
+ return c->locktype();
+ }
+
+ void Command::logIfSlow( const Timer& timer, const string& msg ) {
+ int ms = timer.millis();
+ if ( ms > cmdLine.slowMS ) {
+ out() << msg << " took " << ms << " ms." << endl;
+ }
+ }
+
+}
+
+#include "../client/connpool.h"
+
+namespace mongo {
+
+ extern DBConnectionPool pool;
+
+ class PoolFlushCmd : public Command {
+ public:
+ PoolFlushCmd() : Command( "connPoolSync" , false , "connpoolsync" ) {}
+ virtual void help( stringstream &help ) const { help<<"internal"; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool run(const string&, mongo::BSONObj&, int, std::string&, mongo::BSONObjBuilder& result, bool) {
+ pool.flush();
+ return true;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+
+ } poolFlushCmd;
+
+ class PoolStats : public Command {
+ public:
+ PoolStats() : Command( "connPoolStats" ) {}
+ virtual void help( stringstream &help ) const { help<<"stats about connection pool"; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool run(const string&, mongo::BSONObj&, int, std::string&, mongo::BSONObjBuilder& result, bool) {
+ pool.appendInfo( result );
+ result.append( "numDBClientConnection" , DBClientConnection::getNumConnections() );
+ result.append( "numAScopedConnection" , AScopedConnection::getNumConnections() );
+ return true;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+
+ } poolStatsCmd;
+
+} // namespace mongo
diff --git a/src/mongo/db/commands.h b/src/mongo/db/commands.h
new file mode 100644
index 00000000000..85cdd38d7a4
--- /dev/null
+++ b/src/mongo/db/commands.h
@@ -0,0 +1,164 @@
+// commands.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+#include "../util/mongoutils/str.h"
+
+namespace mongo {
+
+ class BSONObj;
+ class BSONObjBuilder;
+ class Client;
+ class Timer;
+
+ /** mongodb "commands" (sent via db.$cmd.findOne(...))
+ subclass to make a command. define a singleton object for it.
+ */
+ class Command {
+ protected:
+ string parseNsFullyQualified(const string& dbname, const BSONObj& cmdObj) const;
+ public:
+ // only makes sense for commands where 1st parm is the collection.
+ virtual string parseNs(const string& dbname, const BSONObj& cmdObj) const;
+
+ enum LockType { READ = -1 , NONE = 0 , WRITE = 1 };
+
+ const string name;
+
+ /* run the given command
+ implement this...
+
+ fromRepl - command is being invoked as part of replication syncing. In this situation you
+ normally do not want to log the command to the local oplog.
+
+ return value is true if succeeded. if false, set errmsg text.
+ */
+ virtual bool run(const string& db, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl = false ) = 0;
+
+ /*
+           note: logTheOp() MUST be false if READ
+ if NONE, can't use Client::Context setup
+ use with caution
+ */
+ virtual LockType locktype() const = 0;
+
+ /* Return true if only the admin ns has privileges to run this command. */
+ virtual bool adminOnly() const {
+ return false;
+ }
+
+ void htmlHelp(stringstream&) const;
+
+ /* Like adminOnly, but even stricter: we must either be authenticated for admin db,
+ or, if running without auth, on the local interface. Used for things which
+ are so major that remote invocation may not make sense (e.g., shutdownServer).
+
+ When localHostOnlyIfNoAuth() is true, adminOnly() must also be true.
+ */
+ virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) { return false; }
+
+ /* Return true if slaves are allowed to execute the command
+ (the command directly from a client -- if fromRepl, always allowed).
+ */
+ virtual bool slaveOk() const = 0;
+
+        /* Return true if the client can force a command to be run on a slave by
+ turning on the 'slaveOk' option in the command query.
+ */
+ virtual bool slaveOverrideOk() {
+ return false;
+ }
+
+        /* Override and return true to log the operation (logOp()) to the replication log.
+ (not done if fromRepl of course)
+
+ Note if run() returns false, we do NOT log.
+ */
+ virtual bool logTheOp() { return false; }
+
+ virtual void help( stringstream& help ) const;
+
+ /* Return true if authentication and security applies to the commands. Some commands
+ (e.g., getnonce, authenticate) can be done by anyone even unauthorized.
+ */
+ virtual bool requiresAuth() { return true; }
+
+ /* Return true if a replica set secondary should go into "recovering"
+ (unreadable) state while running this command.
+ */
+ virtual bool maintenanceMode() const { return false; }
+
+ /* Return true if command should be permitted when a replica set secondary is in "recovering"
+ (unreadable) state.
+ */
+ virtual bool maintenanceOk() const { return true; /* assumed true prior to commit */ }
+
+ /** @param webUI expose the command in the web ui as localhost:28017/<name>
+ @param oldName an optional old, deprecated name for the command
+ */
+ Command(const char *_name, bool webUI = false, const char *oldName = 0);
+
+ virtual ~Command() {}
+
+ protected:
+ BSONObj getQuery( const BSONObj& cmdObj ) {
+ if ( cmdObj["query"].type() == Object )
+ return cmdObj["query"].embeddedObject();
+ if ( cmdObj["q"].type() == Object )
+ return cmdObj["q"].embeddedObject();
+ return BSONObj();
+ }
+
+ static void logIfSlow( const Timer& cmdTimer, const string& msg);
+
+ static map<string,Command*> * _commands;
+ static map<string,Command*> * _commandsByBestName;
+ static map<string,Command*> * _webCommands;
+
+ public:
+ static const map<string,Command*>* commandsByBestName() { return _commandsByBestName; }
+ static const map<string,Command*>* webCommands() { return _webCommands; }
+ /** @return if command was found and executed */
+ static bool runAgainstRegistered(const char *ns, BSONObj& jsobj, BSONObjBuilder& anObjBuilder, int queryOptions = 0);
+ static LockType locktype( const string& name );
+ static Command * findCommand( const string& name );
+ };
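+
+    /* Minimal example of defining a command (an illustrative sketch only --
+       "echoDemo" is hypothetical, not a command in this codebase):
+
+           class CmdEchoDemo : public Command {
+           public:
+               CmdEchoDemo() : Command("echoDemo") {}
+               virtual bool slaveOk() const { return true; }
+               virtual LockType locktype() const { return NONE; }
+               virtual void help( stringstream& h ) const { h << "echo the command object back"; }
+               virtual bool run(const string& db, BSONObj& cmdObj, int, string& errmsg,
+                                BSONObjBuilder& result, bool fromRepl) {
+                   result.append( "echoed", cmdObj );
+                   return true;
+               }
+           } cmdEchoDemo;
+    */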
+
+ class CmdShutdown : public Command {
+ public:
+ virtual bool requiresAuth() { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) { return true; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream& help ) const;
+ CmdShutdown() : Command("shutdown") {}
+ bool run(const string& dbname, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl);
+ private:
+ bool shutdownHelper();
+ };
+
+ bool _runCommands(const char *ns, BSONObj& jsobj, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions);
+
+} // namespace mongo
diff --git a/src/mongo/db/commands/aggregate.js b/src/mongo/db/commands/aggregate.js
new file mode 100755
index 00000000000..7741e3121ff
--- /dev/null
+++ b/src/mongo/db/commands/aggregate.js
@@ -0,0 +1,184 @@
+/* sample aggregate command queries */
+
+// make sure we're using the right db; this is the same as "use mydb;" in shell
+db = db.getSisterDB("mydb");
+
+// just passing through fields
+var p1 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ tags : 1,
+ pageViews : 1
+ }}
+]});
+
+// unwinding an array
+var p2 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" },
+ pageViews : 1
+ }}
+]});
+
+// pulling values out of subdocuments
+var p3 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ otherfoo : "other.foo",
+ otherbar : "other.bar"
+ }}
+]});
+
+// projection includes a computed value
+var p4 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ daveWroteIt : { $eq:["$author", "dave"] }
+ }}
+]});
+
+// projection includes a virtual (fabricated) document
+var p5 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ pageViews : 1,
+ tag : { $unwind : "tags" }
+ }},
+ { $project : {
+ author : 1,
+ subDocument : { foo : "pageViews", bar : "tag" }
+ }}
+]});
+
+// multi-step aggregate
+// nested expressions in computed fields
+var p6 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" },
+ pageViews : 1
+ }},
+ { $project : {
+ author : 1,
+ tag : 1,
+ pageViews : 1,
+ daveWroteIt : { $eq:["$author", "dave"] },
+ weLikeIt : { $or:[ { $eq:["$author", "dave"] },
+ { $eq:["$tag", "good"] } ] }
+ }}
+]});
+
+// slightly more complex computed expression; $ifnull
+var p7 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ theSum : { $add:["$pageViews",
+ { $ifnull:["$other.foo",
+ "$other.bar"] } ] }
+ }}
+]});
+
+// dotted path inclusion; _id exclusion
+var p8 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ _id : 0,
+ author : 1,
+ tag : { $unwind : "tags" },
+ "comments.author" : 1
+ }}
+]});
+
+
+// simple matching
+var m1 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $match : { author : "dave" } }
+]});
+
+// combining matching with a projection
+var m2 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ title : 1,
+ author : 1,
+ pageViews : 1,
+ tag : { $unwind : "tags" },
+ comments : 1
+ }},
+ { $match : { tag : "nasty" } }
+]});
+
+
+// group by tag
+var g1 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" },
+ pageViews : 1
+ }},
+ { $group : {
+ _id: { tag : 1 },
+ docsByTag : { $sum : 1 },
+ viewsByTag : { $sum : "$pageViews" }
+ }}
+]});
+
+// $max, and averaging in a final projection
+var g2 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" },
+ pageViews : 1
+ }},
+ { $group : {
+ _id: { tag : 1 },
+ docsByTag : { $sum : 1 },
+ viewsByTag : { $sum : "$pageViews" },
+ mostViewsByTag : { $max : "$pageViews" },
+ }},
+ { $project : {
+ _id: false,
+ tag : "_id.tag",
+ mostViewsByTag : 1,
+ docsByTag : 1,
+ viewsByTag : 1,
+ avgByTag : { $divide:["$viewsByTag", "$docsByTag"] }
+ }}
+]});
+
+// $push as an accumulator; can pivot data
+var g3 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" }
+ }},
+ { $group : {
+ _id : { tag : 1 },
+ authors : { $push : "$author" }
+ }}
+]});
+
+// $avg, and averaging in a final projection
+var g4 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" },
+ pageViews : 1
+ }},
+ { $group : {
+ _id: { tag : 1 },
+ docsByTag : { $sum : 1 },
+ viewsByTag : { $sum : "$pageViews" },
+ avgByTag : { $avg : "$pageViews" },
+ }}
+]});
diff --git a/src/mongo/db/commands/cloud.cpp b/src/mongo/db/commands/cloud.cpp
new file mode 100644
index 00000000000..8f9d9d2e4b5
--- /dev/null
+++ b/src/mongo/db/commands/cloud.cpp
@@ -0,0 +1,90 @@
+#include "../commands.h"
+#include <map>
+#include "../../util/concurrency/value.h"
+#include "../../util/mongoutils/str.h"
+#include "../../util/net/hostandport.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ mapsf<string,string> dynHostNames;
+ extern DiagStr _hostNameCached;
+
+ string dynHostMyName() {
+ if( !str::startsWith(_hostNameCached, '#') )
+ return "";
+ return _hostNameCached;
+ }
+
+ void dynHostResolve(string& name, int& port) {
+ assert( !name.empty() );
+ assert( !str::contains(name, ':') );
+ assert( str::startsWith(name, '#') );
+ string s = dynHostNames.get(name);
+ if( s.empty() ) {
+ name.clear();
+ return;
+ }
+ assert( !str::startsWith(s, '#') );
+ HostAndPort hp(s);
+ if( hp.hasPort() ) {
+ port = hp.port();
+ log() << "info: dynhost in:" << name << " out:" << hp.toString() << endl;
+ }
+ name = hp.host();
+ }
+
+ /**
+ { cloud:1, nodes: {
+ name : <ip>, ...
+ },
+ me : <mylogicalname>
+ }
+ */
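+    // e.g. (hypothetical values):
+    //   { cloud:1, nodes: { "#a" : "10.0.0.1:27017", "#b" : "10.0.0.2" }, me : "#a" }
+    // per the checks in run() below, node names must begin with '#' and the
+    // values are host or host:port strings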
+ class CmdCloud : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool logTheOp() { return false; }
+ virtual bool adminOnly() const { return true; } // very important
+ virtual bool localHostOnlyIfNoAuth(const BSONObj&) { return true; }
+ virtual bool slaveOk() const { return true; }
+ virtual void help( stringstream& help ) const {
+ help << "internal\n";
+ help << "{cloud:1,nodes:...,me:<my_logical_name>}";
+ }
+ CmdCloud() : Command("cloud") {}
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ assert(!fromRepl);
+ BSONObj nodes = cmdObj["nodes"].Obj();
+ map<string,string> ipmap;
+ for( BSONObj::iterator i(nodes); i.more(); ) {
+ BSONElement e = i.next();
+ assert( *e.fieldName() == '#' );
+ ipmap[e.fieldName()] = e.String();
+ }
+
+ string me = cmdObj["me"].String();
+ assert( !me.empty() && me[0] == '#' );
+
+ log(/*1*/) << "CmdCloud" << endl;
+
+ if( me != _hostNameCached.get() ) {
+ log() << "CmdCloud new 'me' value:" << me << endl;
+ _hostNameCached = me;
+ }
+
+ dynHostNames.swap(ipmap);
+ return true;
+ }
+ } cmdCloud;
+
+ BSONObj fromjson(const string &str);
+
+ void cloudCmdLineParamIs(string cmd) {
+ string errmsg;
+ BSONObjBuilder res;
+ BSONObj o = fromjson(cmd);
+ cmdCloud.run("", o, 0, errmsg, res, false);
+ }
+}
diff --git a/src/mongo/db/commands/distinct.cpp b/src/mongo/db/commands/distinct.cpp
new file mode 100644
index 00000000000..1926e6abddb
--- /dev/null
+++ b/src/mongo/db/commands/distinct.cpp
@@ -0,0 +1,157 @@
+// distinct.cpp
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+//#include "pch.h"
+#include "../commands.h"
+#include "../instance.h"
+#include "../queryoptimizer.h"
+#include "../clientcursor.h"
+#include "../../util/timer.h"
+
+namespace mongo {
+
+ class DistinctCommand : public Command {
+ public:
+ DistinctCommand() : Command("distinct") {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return READ; }
+ virtual void help( stringstream &help ) const {
+ help << "{ distinct : 'collection name' , key : 'a.b' , query : {} }";
+ }
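+        // sample invocation and response shape (hypothetical data):
+        //   > db.runCommand({ distinct : "foo", key : "a.b", query : {} })
+        //   { values : [ ... ], stats : { n, nscanned, nscannedObjects, timems, cursor }, ok : 1 }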
+
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ Timer t;
+ string ns = dbname + '.' + cmdObj.firstElement().valuestr();
+
+ string key = cmdObj["key"].valuestrsafe();
+ BSONObj keyPattern = BSON( key << 1 );
+
+ BSONObj query = getQuery( cmdObj );
+
+ int bufSize = BSONObjMaxUserSize - 4096;
+ BufBuilder bb( bufSize );
+ char * start = bb.buf();
+
+ BSONArrayBuilder arr( bb );
+ BSONElementSet values;
+
+ long long nscanned = 0; // locations looked at
+ long long nscannedObjects = 0; // full objects looked at
+ long long n = 0; // matches
+ MatchDetails md;
+
+ NamespaceDetails * d = nsdetails( ns.c_str() );
+
+ if ( ! d ) {
+ result.appendArray( "values" , BSONObj() );
+ result.append( "stats" , BSON( "n" << 0 << "nscanned" << 0 << "nscannedObjects" << 0 ) );
+ return true;
+ }
+
+ shared_ptr<Cursor> cursor;
+ if ( ! query.isEmpty() ) {
+ cursor = NamespaceDetailsTransient::getCursor(ns.c_str() , query , BSONObj() );
+ }
+ else {
+
+                // query is empty, so let's see if we can find an index
+ // with the key so we don't have to hit the raw data
+ NamespaceDetails::IndexIterator ii = d->ii();
+ while ( ii.more() ) {
+ IndexDetails& idx = ii.next();
+
+ if ( d->isMultikey( ii.pos() - 1 ) )
+ continue;
+
+ if ( idx.inKeyPattern( key ) ) {
+ cursor = bestGuessCursor( ns.c_str() , BSONObj() , idx.keyPattern() );
+ if( cursor.get() ) break;
+ }
+
+ }
+
+ if ( ! cursor.get() )
+ cursor = NamespaceDetailsTransient::getCursor(ns.c_str() , query , BSONObj() );
+
+ }
+
+
+ assert( cursor );
+ string cursorName = cursor->toString();
+
+ auto_ptr<ClientCursor> cc (new ClientCursor(QueryOption_NoCursorTimeout, cursor, ns));
+
+ while ( cursor->ok() ) {
+ nscanned++;
+ bool loadedObject = false;
+
+ if ( cursor->currentMatches( &md ) && !cursor->getsetdup( cursor->currLoc() ) ) {
+ n++;
+
+ BSONObj holder;
+ BSONElementSet temp;
+ loadedObject = ! cc->getFieldsDotted( key , temp, holder );
+
+ for ( BSONElementSet::iterator i=temp.begin(); i!=temp.end(); ++i ) {
+ BSONElement e = *i;
+ if ( values.count( e ) )
+ continue;
+
+ int now = bb.len();
+
+ uassert(10044, "distinct too big, 16mb cap", ( now + e.size() + 1024 ) < bufSize );
+
+ arr.append( e );
+ BSONElement x( start + now );
+
+ values.insert( x );
+ }
+ }
+
+ if ( loadedObject || md._loadedObject )
+ nscannedObjects++;
+
+ cursor->advance();
+
+ if (!cc->yieldSometimes( ClientCursor::MaybeCovered )) {
+ cc.release();
+ break;
+ }
+
+ RARELY killCurrentOp.checkForInterrupt();
+ }
+
+ assert( start == bb.buf() );
+
+ result.appendArray( "values" , arr.done() );
+
+ {
+ BSONObjBuilder b;
+ b.appendNumber( "n" , n );
+ b.appendNumber( "nscanned" , nscanned );
+ b.appendNumber( "nscannedObjects" , nscannedObjects );
+ b.appendNumber( "timems" , t.millis() );
+ b.append( "cursor" , cursorName );
+ result.append( "stats" , b.obj() );
+ }
+
+ return true;
+ }
+
+ } distinctCmd;
+
+}
diff --git a/src/mongo/db/commands/document_source_cursor.cpp b/src/mongo/db/commands/document_source_cursor.cpp
new file mode 100755
index 00000000000..49bb9f19d9e
--- /dev/null
+++ b/src/mongo/db/commands/document_source_cursor.cpp
@@ -0,0 +1,100 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/cursor.h"
+#include "db/pipeline/document.h"
+
+namespace mongo {
+
+ DocumentSourceCursor::~DocumentSourceCursor() {
+ }
+
+ bool DocumentSourceCursor::eof() {
+ /* if we haven't gotten the first one yet, do so now */
+ if (!pCurrent.get())
+ findNext();
+
+ return (pCurrent.get() == NULL);
+ }
+
+ bool DocumentSourceCursor::advance() {
+ /* if we haven't gotten the first one yet, do so now */
+ if (!pCurrent.get())
+ findNext();
+
+ findNext();
+ return (pCurrent.get() != NULL);
+ }
+
+ intrusive_ptr<Document> DocumentSourceCursor::getCurrent() {
+ /* if we haven't gotten the first one yet, do so now */
+ if (!pCurrent.get())
+ findNext();
+
+ return pCurrent;
+ }
+
+ void DocumentSourceCursor::findNext() {
+ /* standard cursor usage pattern */
+ while(pCursor->ok()) {
+ CoveredIndexMatcher *pCIM; // save intermediate result
+ if ((!(pCIM = pCursor->matcher()) ||
+ pCIM->matchesCurrent(pCursor.get())) &&
+ !pCursor->getsetdup(pCursor->currLoc())) {
+
+ /* grab the matching document */
+ BSONObj documentObj(pCursor->current());
+ pCurrent = Document::createFromBsonObj(&documentObj);
+ pCursor->advance();
+ return;
+ }
+
+ pCursor->advance();
+ }
+
+ /* if we got here, there aren't any more documents */
+ pCurrent.reset();
+ }
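+
+    /* sketch of the iteration contract callers rely on:
+           for (bool more = !pSource->eof(); more; more = pSource->advance())
+               intrusive_ptr<Document> pDoc(pSource->getCurrent());
+       eof() and getCurrent() lazily pull the first document via findNext() */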
+
+ void DocumentSourceCursor::setSource(
+ const intrusive_ptr<DocumentSource> &pSource) {
+ /* this doesn't take a source */
+ assert(false);
+ }
+
+ void DocumentSourceCursor::sourceToBson(BSONObjBuilder *pBuilder) const {
+ /* this has no analog in the BSON world */
+ assert(false);
+ }
+
+ DocumentSourceCursor::DocumentSourceCursor(
+ const shared_ptr<Cursor> &pTheCursor):
+ pCursor(pTheCursor),
+ pCurrent() {
+ }
+
+ intrusive_ptr<DocumentSourceCursor> DocumentSourceCursor::create(
+ const shared_ptr<Cursor> &pCursor) {
+ assert(pCursor.get());
+ intrusive_ptr<DocumentSourceCursor> pSource(
+ new DocumentSourceCursor(pCursor));
+ return pSource;
+ }
+}
diff --git a/src/mongo/db/commands/find_and_modify.cpp b/src/mongo/db/commands/find_and_modify.cpp
new file mode 100644
index 00000000000..0cf766fcf87
--- /dev/null
+++ b/src/mongo/db/commands/find_and_modify.cpp
@@ -0,0 +1,153 @@
+// find_and_modify.cpp
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../commands.h"
+#include "../instance.h"
+#include "../clientcursor.h"
+
+namespace mongo {
+
+    /* Find and modify an object, returning either the old (default) or new value. */
+ class CmdFindAndModify : public Command {
+ public:
+ virtual void help( stringstream &help ) const {
+ help <<
+ "{ findAndModify: \"collection\", query: {processed:false}, update: {$set: {processed:true}}, new: true}\n"
+ "{ findAndModify: \"collection\", query: {processed:false}, remove: true, sort: {priority:-1}}\n"
+ "Either update or remove is required, all other fields have default values.\n"
+ "Output is in the \"value\" field\n";
+ }
+
+ CmdFindAndModify() : Command("findAndModify", false, "findandmodify") { }
+ virtual bool logTheOp() { return false; } // the modifications will be logged directly
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ static DBDirectClient db;
+
+ string ns = dbname + '.' + cmdObj.firstElement().valuestr();
+
+ BSONObj origQuery = cmdObj.getObjectField("query"); // defaults to {}
+ Query q (origQuery);
+ BSONElement sort = cmdObj["sort"];
+ if (!sort.eoo())
+ q.sort(sort.embeddedObjectUserCheck());
+
+ bool upsert = cmdObj["upsert"].trueValue();
+
+ BSONObj fieldsHolder (cmdObj.getObjectField("fields"));
+ const BSONObj* fields = (fieldsHolder.isEmpty() ? NULL : &fieldsHolder);
+
+ Projection projection;
+ if (fields) {
+ projection.init(fieldsHolder);
+ if (!projection.includeID())
+ fields = NULL; // do projection in post-processing
+ }
+
+ BSONObj out = db.findOne(ns, q, fields);
+ if (out.isEmpty()) {
+ if (!upsert) {
+ result.appendNull("value");
+ return true;
+ }
+
+ BSONElement update = cmdObj["update"];
+ uassert(13329, "upsert mode requires update field", !update.eoo());
+ uassert(13330, "upsert mode requires query field", !origQuery.isEmpty());
+ db.update(ns, origQuery, update.embeddedObjectUserCheck(), true);
+
+ BSONObj gle = db.getLastErrorDetailed();
+ result.append("lastErrorObject", gle);
+ if (gle["err"].type() == String) {
+ errmsg = gle["err"].String();
+ return false;
+ }
+
+ if (cmdObj["new"].trueValue()) {
+ BSONElement _id = gle["upserted"];
+ if (_id.eoo())
+ _id = origQuery["_id"];
+
+ out = db.findOne(ns, QUERY("_id" << _id), fields);
+ }
+
+ }
+ else {
+
+ if (cmdObj["remove"].trueValue()) {
+ uassert(12515, "can't remove and update", cmdObj["update"].eoo());
+ db.remove(ns, QUERY("_id" << out["_id"]), 1);
+
+ BSONObj gle = db.getLastErrorDetailed();
+ result.append("lastErrorObject", gle);
+ if (gle["err"].type() == String) {
+ errmsg = gle["err"].String();
+ return false;
+ }
+
+ }
+ else { // update
+
+ BSONElement queryId = origQuery["_id"];
+ if (queryId.eoo() || getGtLtOp(queryId) != BSONObj::Equality) {
+ // need to include original query for $ positional operator
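+                    // e.g. (hypothetical) query { tags : "x" } matching doc _id 5 is
+                    // rebuilt as { _id : 5, tags : "x" } so an update such as
+                    // { $set : { "tags.$" : "y" } } can still resolve the positional element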
+
+ BSONObjBuilder b;
+ b.append(out["_id"]);
+ BSONObjIterator it(origQuery);
+ while (it.more()) {
+ BSONElement e = it.next();
+ if (strcmp(e.fieldName(), "_id"))
+ b.append(e);
+ }
+ q = Query(b.obj());
+ }
+
+ if (q.isComplex()) // update doesn't work with complex queries
+ q = Query(q.getFilter().getOwned());
+
+ BSONElement update = cmdObj["update"];
+ uassert(12516, "must specify remove or update", !update.eoo());
+ db.update(ns, q, update.embeddedObjectUserCheck());
+
+ BSONObj gle = db.getLastErrorDetailed();
+ result.append("lastErrorObject", gle);
+ if (gle["err"].type() == String) {
+ errmsg = gle["err"].String();
+ return false;
+ }
+
+ if (cmdObj["new"].trueValue())
+ out = db.findOne(ns, QUERY("_id" << out["_id"]), fields);
+ }
+ }
+
+ if (!fieldsHolder.isEmpty() && !fields){
+ // we need to run projection but haven't yet
+ out = projection.transform(out);
+ }
+
+ result.append("value", out);
+
+ return true;
+ }
+ } cmdFindAndModify;
+
+
+}
diff --git a/src/mongo/db/commands/group.cpp b/src/mongo/db/commands/group.cpp
new file mode 100644
index 00000000000..69fee587a47
--- /dev/null
+++ b/src/mongo/db/commands/group.cpp
@@ -0,0 +1,224 @@
+// group.cpp
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../commands.h"
+#include "../instance.h"
+#include "../queryoptimizer.h"
+#include "../../scripting/engine.h"
+#include "../clientcursor.h"
+
+namespace mongo {
+
+ class GroupCommand : public Command {
+ public:
+ GroupCommand() : Command("group") {}
+ virtual LockType locktype() const { return READ; }
+ virtual bool slaveOk() const { return false; }
+ virtual bool slaveOverrideOk() { return true; }
+ virtual void help( stringstream &help ) const {
+ help << "http://www.mongodb.org/display/DOCS/Aggregation";
+ }
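+        // sample invocation (hypothetical collection "foo"):
+        //   db.runCommand({ group : { ns : "foo", key : { a : 1 }, cond : {},
+        //                             initial : { count : 0 },
+        //                             $reduce : function(obj, prev) { prev.count++; } } })
+        //   => { retval : [ ... ], count : <docs matched>, keys : <distinct keys>, ok : 1 }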
+
+ BSONObj getKey( const BSONObj& obj , const BSONObj& keyPattern , ScriptingFunction func , double avgSize , Scope * s ) {
+ if ( func ) {
+ BSONObjBuilder b( obj.objsize() + 32 );
+ b.append( "0" , obj );
+ const BSONObj& key = b.obj();
+ int res = s->invoke( func , &key, 0 );
+ uassert( 10041 , (string)"invoke failed in $keyf: " + s->getError() , res == 0 );
+ int type = s->type("return");
+            uassert( 10042 , "return of $keyf has to be an object" , type == Object );
+ return s->getObject( "return" );
+ }
+ return obj.extractFields( keyPattern , true ).getOwned();
+ }
+
+ bool group( string realdbname , const string& ns , const BSONObj& query ,
+ BSONObj keyPattern , string keyFunctionCode , string reduceCode , const char * reduceScope ,
+ BSONObj initial , string finalize ,
+ string& errmsg , BSONObjBuilder& result ) {
+
+
+ auto_ptr<Scope> s = globalScriptEngine->getPooledScope( realdbname );
+ s->localConnect( realdbname.c_str() );
+
+ if ( reduceScope )
+ s->init( reduceScope );
+
+ s->setObject( "$initial" , initial , true );
+
+ s->exec( "$reduce = " + reduceCode , "reduce setup" , false , true , true , 100 );
+ s->exec( "$arr = [];" , "reduce setup 2" , false , true , true , 100 );
+ ScriptingFunction f = s->createFunction(
+ "function(){ "
+ " if ( $arr[n] == null ){ "
+ " next = {}; "
+ " Object.extend( next , $key ); "
+ " Object.extend( next , $initial , true ); "
+ " $arr[n] = next; "
+ " next = null; "
+ " } "
+ " $reduce( obj , $arr[n] ); "
+ "}" );
+
+ ScriptingFunction keyFunction = 0;
+ if ( keyFunctionCode.size() ) {
+ keyFunction = s->createFunction( keyFunctionCode.c_str() );
+ }
+
+
+ double keysize = keyPattern.objsize() * 3;
+ double keynum = 1;
+
+ map<BSONObj,int,BSONObjCmp> map;
+ list<BSONObj> blah;
+
+ shared_ptr<Cursor> cursor = NamespaceDetailsTransient::getCursor(ns.c_str() , query);
+ ClientCursor::CleanupPointer ccPointer;
+ ccPointer.reset( new ClientCursor( QueryOption_NoCursorTimeout, cursor, ns ) );
+
+ while ( cursor->ok() ) {
+
+ if ( !ccPointer->yieldSometimes( ClientCursor::MaybeCovered ) ||
+ !cursor->ok() ) {
+ break;
+ }
+
+ if ( !cursor->currentMatches() || cursor->getsetdup( cursor->currLoc() ) ) {
+ cursor->advance();
+ continue;
+ }
+
+ if ( !ccPointer->yieldSometimes( ClientCursor::WillNeed ) ||
+ !cursor->ok() ) {
+ break;
+ }
+
+ BSONObj obj = cursor->current();
+ cursor->advance();
+
+ BSONObj key = getKey( obj , keyPattern , keyFunction , keysize / keynum , s.get() );
+ keysize += key.objsize();
+ keynum++;
+
+ int& n = map[key];
+ if ( n == 0 ) {
+ n = map.size();
+ s->setObject( "$key" , key , true );
+
+ uassert( 10043 , "group() can't handle more than 20000 unique keys" , n <= 20000 );
+ }
+
+ s->setObject( "obj" , obj , true );
+ s->setNumber( "n" , n - 1 );
+ if ( s->invoke( f , 0, 0 , 0 , true ) ) {
+ throw UserException( 9010 , (string)"reduce invoke failed: " + s->getError() );
+ }
+ }
+ ccPointer.reset();
+
+ if (!finalize.empty()) {
+ s->exec( "$finalize = " + finalize , "finalize define" , false , true , true , 100 );
+ ScriptingFunction g = s->createFunction(
+ "function(){ "
+ " for(var i=0; i < $arr.length; i++){ "
+ " var ret = $finalize($arr[i]); "
+ " if (ret !== undefined) "
+ " $arr[i] = ret; "
+ " } "
+ "}" );
+ s->invoke( g , 0, 0 , 0 , true );
+ }
+
+ result.appendArray( "retval" , s->getObject( "$arr" ) );
+ result.append( "count" , keynum - 1 );
+ result.append( "keys" , (int)(map.size()) );
+ s->exec( "$arr = [];" , "reduce setup 2" , false , true , true , 100 );
+ s->gc();
+
+ return true;
+ }
+
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+
+ if ( !globalScriptEngine ) {
+ errmsg = "server-side JavaScript execution is disabled";
+ return false;
+ }
+
+ /* db.$cmd.findOne( { group : <p> } ) */
+ const BSONObj& p = jsobj.firstElement().embeddedObjectUserCheck();
+
+ BSONObj q;
+ if ( p["cond"].type() == Object )
+ q = p["cond"].embeddedObject();
+ else if ( p["condition"].type() == Object )
+ q = p["condition"].embeddedObject();
+ else
+ q = getQuery( p );
+
+ if ( p["ns"].type() != String ) {
+ errmsg = "ns has to be set";
+ return false;
+ }
+
+ string ns = dbname + "." + p["ns"].String();
+
+ BSONObj key;
+ string keyf;
+ if ( p["key"].type() == Object ) {
+ key = p["key"].embeddedObjectUserCheck();
+ if ( ! p["$keyf"].eoo() ) {
+ errmsg = "can't have key and $keyf";
+ return false;
+ }
+ }
+ else if ( p["$keyf"].type() ) {
+ keyf = p["$keyf"]._asCode();
+ }
+ else {
+ // no key specified, will use entire object as key
+ }
+
+ BSONElement reduce = p["$reduce"];
+ if ( reduce.eoo() ) {
+ errmsg = "$reduce has to be set";
+ return false;
+ }
+
+ BSONElement initial = p["initial"];
+ if ( initial.type() != Object ) {
+ errmsg = "initial has to be an object";
+ return false;
+ }
+
+
+ string finalize;
+ if (p["finalize"].type())
+ finalize = p["finalize"]._asCode();
+
+ return group( dbname , ns , q ,
+ key , keyf , reduce._asCode() , reduce.type() != CodeWScope ? 0 : reduce.codeWScopeScopeData() ,
+ initial.embeddedObject() , finalize ,
+ errmsg , result );
+ }
+
+ } cmdGroup;
+
+
+} // namespace mongo
diff --git a/src/mongo/db/commands/isself.cpp b/src/mongo/db/commands/isself.cpp
new file mode 100644
index 00000000000..ebf6d5bceec
--- /dev/null
+++ b/src/mongo/db/commands/isself.cpp
@@ -0,0 +1,246 @@
+// isself.cpp
+
+#include "pch.h"
+#include "../../util/net/listen.h"
+#include "../commands.h"
+#include "../../client/dbclient.h"
+#include "../security.h"
+
+#include <boost/algorithm/string.hpp>
+
+#ifndef _WIN32
+# ifndef __sunos__
+# include <ifaddrs.h>
+# endif
+# include <sys/resource.h>
+# include <sys/stat.h>
+
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <errno.h>
+#include <netdb.h>
+#ifdef __openbsd__
+# include <sys/uio.h>
+#endif
+
+#endif
+
+
+namespace mongo {
+
+#if !defined(_WIN32) && !defined(__sunos__)
+
+ vector<string> getMyAddrs() {
+ vector<string> out;
+ ifaddrs * addrs;
+
+ if ( ! cmdLine.bind_ip.empty() ) {
+ boost::split( out, cmdLine.bind_ip, boost::is_any_of( ", " ) );
+ return out;
+ }
+
+ int status = getifaddrs(&addrs);
+ massert(13469, "getifaddrs failure: " + errnoWithDescription(errno), status == 0);
+
+ // based on example code from linux getifaddrs manpage
+ for (ifaddrs * addr = addrs; addr != NULL; addr = addr->ifa_next) {
+ if ( addr->ifa_addr == NULL ) continue;
+ int family = addr->ifa_addr->sa_family;
+ char host[NI_MAXHOST];
+
+ if (family == AF_INET || family == AF_INET6) {
+ status = getnameinfo(addr->ifa_addr,
+ (family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)),
+ host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST);
+ if ( status != 0 ) {
+ freeifaddrs( addrs );
+ addrs = NULL;
+ msgasserted( 13470, string("getnameinfo() failed: ") + gai_strerror(status) );
+ }
+
+ out.push_back(host);
+ }
+
+ }
+
+ freeifaddrs( addrs );
+ addrs = NULL;
+
+ if (logLevel >= 1) {
+ log(1) << "getMyAddrs():";
+ for (vector<string>::const_iterator it=out.begin(), end=out.end(); it!=end; ++it) {
+ log(1) << " [" << *it << ']';
+ }
+ log(1) << endl;
+ }
+
+ return out;
+ }
+
+ vector<string> getAllIPs(StringData iporhost) {
+ addrinfo* addrs = NULL;
+ addrinfo hints;
+ memset(&hints, 0, sizeof(addrinfo));
+ hints.ai_socktype = SOCK_STREAM;
+ hints.ai_family = (IPv6Enabled() ? AF_UNSPEC : AF_INET);
+
+ static string portNum = BSONObjBuilder::numStr(cmdLine.port);
+
+ vector<string> out;
+
+ int ret = getaddrinfo(iporhost.data(), portNum.c_str(), &hints, &addrs);
+ if ( ret ) {
+ warning() << "getaddrinfo(\"" << iporhost.data() << "\") failed: " << gai_strerror(ret) << endl;
+ return out;
+ }
+
+ for (addrinfo* addr = addrs; addr != NULL; addr = addr->ai_next) {
+ int family = addr->ai_family;
+ char host[NI_MAXHOST];
+
+ if (family == AF_INET || family == AF_INET6) {
+ int status = getnameinfo(addr->ai_addr, addr->ai_addrlen, host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST);
+
+ massert(13472, string("getnameinfo() failed: ") + gai_strerror(status), status == 0);
+
+ out.push_back(host);
+ }
+
+ }
+
+ freeaddrinfo(addrs);
+
+ if (logLevel >= 1) {
+ log(1) << "getallIPs(\"" << iporhost << "\"):";
+ for (vector<string>::const_iterator it=out.begin(), end=out.end(); it!=end; ++it) {
+ log(1) << " [" << *it << ']';
+ }
+ log(1) << endl;
+ }
+
+ return out;
+ }
+#endif
+
+
+ class IsSelfCommand : public Command {
+ public:
+ IsSelfCommand() : Command("_isSelf") , _cacheLock( "IsSelfCommand::_cacheLock" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "{ _isSelf : 1 } INTERNAL ONLY";
+ }
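+        // responds with { id : <ObjectId unique to this process>, ok : 1 }; callers
+        // compare the returned id with their own to decide whether the remote end
+        // is this process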
+
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ init();
+ result.append( "id" , _id );
+ return true;
+ }
+
+ void init() {
+ scoped_lock lk( _cacheLock );
+ if ( ! _id.isSet() )
+ _id.init();
+ }
+
+ OID _id;
+
+ mongo::mutex _cacheLock;
+ map<string,bool> _cache;
+ } isSelfCommand;
+
+ bool HostAndPort::isSelf() const {
+
+ if( dyn() ) {
+ LOG(2) << "isSelf " << _dynName << ' ' << dynHostMyName() << endl;
+ return dynHostMyName() == _dynName;
+ }
+
+ int _p = port();
+ int p = _p == -1 ? CmdLine::DefaultDBPort : _p;
+
+ if( p != cmdLine.port ) {
+ // shortcut - ports have to match at the very least
+ return false;
+ }
+
+ string host = str::stream() << this->host() << ":" << p;
+
+ {
+ // check cache for this host
+ // debatably something _could_ change, but I'm not sure right now (erh 10/14/2010)
+ scoped_lock lk( isSelfCommand._cacheLock );
+ map<string,bool>::const_iterator i = isSelfCommand._cache.find( host );
+ if ( i != isSelfCommand._cache.end() )
+ return i->second;
+ }
+
+#if !defined(_WIN32) && !defined(__sunos__)
+ // on linux and os x we can do a quick check for an ip match
+
+ const vector<string> myaddrs = getMyAddrs();
+ const vector<string> addrs = getAllIPs(_host);
+
+ for (vector<string>::const_iterator i=myaddrs.begin(), iend=myaddrs.end(); i!=iend; ++i) {
+ for (vector<string>::const_iterator j=addrs.begin(), jend=addrs.end(); j!=jend; ++j) {
+ string a = *i;
+ string b = *j;
+
+ if ( a == b ||
+ ( str::startsWith( a , "127." ) && str::startsWith( b , "127." ) ) // 127. is all loopback
+ ) {
+
+ // add to cache
+ scoped_lock lk( isSelfCommand._cacheLock );
+ isSelfCommand._cache[host] = true;
+ return true;
+ }
+ }
+ }
+
+#endif
+
+ if ( ! Listener::getTimeTracker() ) {
+ // this ensures we are actually running a server
+ // this may return true later, so may want to retry
+ return false;
+ }
+
+ try {
+ isSelfCommand.init();
+ DBClientConnection conn;
+ string errmsg;
+ if ( ! conn.connect( host , errmsg ) ) {
+ // should this go in the cache?
+ return false;
+ }
+
+ if (!noauth && cmdLine.keyFile &&
+ !conn.auth("local", internalSecurity.user, internalSecurity.pwd, errmsg, false)) {
+ return false;
+ }
+
+ BSONObj out;
+ bool ok = conn.simpleCommand( "admin" , &out , "_isSelf" );
+ bool me = ok && out["id"].type() == jstOID && isSelfCommand._id == out["id"].OID();
+
+ // add to cache
+ scoped_lock lk( isSelfCommand._cacheLock );
+ isSelfCommand._cache[host] = me;
+
+ return me;
+ }
+ catch ( std::exception& e ) {
+ warning() << "could't check isSelf (" << host << ") " << e.what() << endl;
+ }
+
+ return false;
+ }
+
+}
diff --git a/src/mongo/db/commands/mr.cpp b/src/mongo/db/commands/mr.cpp
new file mode 100644
index 00000000000..add76c39c47
--- /dev/null
+++ b/src/mongo/db/commands/mr.cpp
@@ -0,0 +1,1317 @@
+// mr.cpp
+
+/**
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../db.h"
+#include "../instance.h"
+#include "../commands.h"
+#include "../../scripting/engine.h"
+#include "../../client/dbclient.h"
+#include "../../client/connpool.h"
+#include "../../client/parallel.h"
+#include "../queryoptimizer.h"
+#include "../matcher.h"
+#include "../clientcursor.h"
+#include "../replutil.h"
+#include "../../s/d_chunk_manager.h"
+#include "../../s/d_logic.h"
+#include "../../s/grid.h"
+
+#include "mr.h"
+
+namespace mongo {
+
+ namespace mr {
+
+ AtomicUInt Config::JOB_NUMBER;
+
+ JSFunction::JSFunction( string type , const BSONElement& e ) {
+ _type = type;
+ _code = e._asCode();
+
+ if ( e.type() == CodeWScope )
+ _wantedScope = e.codeWScopeObject();
+ }
+
+ void JSFunction::init( State * state ) {
+ _scope = state->scope();
+ assert( _scope );
+ _scope->init( &_wantedScope );
+
+ _func = _scope->createFunction( _code.c_str() );
+ uassert( 13598 , str::stream() << "couldn't compile code for: " << _type , _func );
+
+ // install in JS scope so that it can be called in JS mode
+ _scope->setFunction(_type.c_str(), _code.c_str());
+ }
+
+ void JSMapper::init( State * state ) {
+ _func.init( state );
+ _params = state->config().mapParams;
+ }
+
+ /**
+ * Applies the map function to an object, which should internally call emit()
+ */
+ void JSMapper::map( const BSONObj& o ) {
+ Scope * s = _func.scope();
+ assert( s );
+ if ( s->invoke( _func.func() , &_params, &o , 0 , true, false, true ) )
+ throw UserException( 9014, str::stream() << "map invoke failed: " + s->getError() );
+ }
+
+ /**
+ * Applies the finalize function to a tuple obj (key, val)
+ * Returns tuple obj {_id: key, value: newval}
+ */
+ BSONObj JSFinalizer::finalize( const BSONObj& o ) {
+ Scope * s = _func.scope();
+
+ Scope::NoDBAccess no = s->disableDBAccess( "can't access db inside finalize" );
+ s->invokeSafe( _func.func() , &o, 0 );
+
+ // don't want to use o.objsize() to size b
+ // since there are many cases where the point of finalize
+ // is converting many fields to 1
+ BSONObjBuilder b;
+ b.append( o.firstElement() );
+ s->append( b , "value" , "return" );
+ return b.obj();
+ }
+
+ void JSReducer::init( State * state ) {
+ _func.init( state );
+ }
+
+ /**
+ * Reduces a list of tuple objects (key, value) to a single tuple {"0": key, "1": value}
+ */
+ BSONObj JSReducer::reduce( const BSONList& tuples ) {
+ if (tuples.size() <= 1)
+ return tuples[0];
+ BSONObj key;
+ int endSizeEstimate = 16;
+ _reduce( tuples , key , endSizeEstimate );
+
+ BSONObjBuilder b(endSizeEstimate);
+ b.appendAs( key.firstElement() , "0" );
+ _func.scope()->append( b , "1" , "return" );
+ return b.obj();
+ }
+
+ /**
+ * Reduces a list of tuple object (key, value) to a single tuple {_id: key, value: val}
+ * Also applies a finalizer method if present.
+ */
+ BSONObj JSReducer::finalReduce( const BSONList& tuples , Finalizer * finalizer ) {
+
+ BSONObj res;
+ BSONObj key;
+
+ if (tuples.size() == 1) {
+ // 1 obj, just use it
+ key = tuples[0];
+ BSONObjBuilder b(key.objsize());
+ BSONObjIterator it(key);
+ b.appendAs( it.next() , "_id" );
+ b.appendAs( it.next() , "value" );
+ res = b.obj();
+ }
+ else {
+ // need to reduce
+ int endSizeEstimate = 16;
+ _reduce( tuples , key , endSizeEstimate );
+ BSONObjBuilder b(endSizeEstimate);
+ b.appendAs( key.firstElement() , "_id" );
+ _func.scope()->append( b , "value" , "return" );
+ res = b.obj();
+ }
+
+ if ( finalizer ) {
+ res = finalizer->finalize( res );
+ }
+
+ return res;
+ }
+
+ /**
+ * actually applies a reduce, to a list of tuples (key, value).
+ * After the call, tuples will hold a single tuple {"0": key, "1": value}
+ */
+ void JSReducer::_reduce( const BSONList& tuples , BSONObj& key , int& endSizeEstimate ) {
+ uassert( 10074 , "need values" , tuples.size() );
+
+ int sizeEstimate = ( tuples.size() * tuples.begin()->getField( "value" ).size() ) + 128;
+
+ // need to build the reduce args: ( key, [values] )
+ BSONObjBuilder reduceArgs( sizeEstimate );
+ boost::scoped_ptr<BSONArrayBuilder> valueBuilder;
+ int sizeSoFar = 0;
+ unsigned n = 0;
+ for ( ; n<tuples.size(); n++ ) {
+ BSONObjIterator j(tuples[n]);
+ BSONElement keyE = j.next();
+ if ( n == 0 ) {
+ reduceArgs.append( keyE );
+ key = keyE.wrap();
+ sizeSoFar = 5 + keyE.size();
+ valueBuilder.reset(new BSONArrayBuilder( reduceArgs.subarrayStart( "tuples" ) ));
+ }
+
+ BSONElement ee = j.next();
+
+ uassert( 13070 , "value too large to reduce" , ee.size() < ( BSONObjMaxUserSize / 2 ) );
+
+ if ( sizeSoFar + ee.size() > BSONObjMaxUserSize ) {
+ assert( n > 1 ); // if not, inf. loop
+ break;
+ }
+
+ valueBuilder->append( ee );
+ sizeSoFar += ee.size();
+ }
+ assert(valueBuilder);
+ valueBuilder->done();
+ BSONObj args = reduceArgs.obj();
+
+ Scope * s = _func.scope();
+
+ s->invokeSafe( _func.func() , &args, 0, 0, false, true, true );
+ ++numReduces;
+
+ if ( s->type( "return" ) == Array ) {
+ uasserted( 10075 , "reduce -> multiple not supported yet");
+ return;
+ }
+
+ endSizeEstimate = key.objsize() + ( args.objsize() / tuples.size() );
+
+ if ( n == tuples.size() )
+ return;
+
+ // the input list was too large, add the rest of elmts to new tuples and reduce again
+ // note: would be better to use loop instead of recursion to avoid stack overflow
+ BSONList x;
+ for ( ; n < tuples.size(); n++ ) {
+ x.push_back( tuples[n] );
+ }
+ BSONObjBuilder temp( endSizeEstimate );
+ temp.append( key.firstElement() );
+ s->append( temp , "1" , "return" );
+ x.push_back( temp.obj() );
+ _reduce( x , key , endSizeEstimate );
+ }
+
+ Config::Config( const string& _dbname , const BSONObj& cmdObj ) {
+
+ dbname = _dbname;
+ ns = dbname + "." + cmdObj.firstElement().valuestr();
+
+ verbose = cmdObj["verbose"].trueValue();
+ jsMode = cmdObj["jsMode"].trueValue();
+ splitInfo = 0;
+ if (cmdObj.hasField("splitInfo"))
+ splitInfo = cmdObj["splitInfo"].Int();
+
+ jsMaxKeys = 500000;
+ reduceTriggerRatio = 10.0;
+ maxInMemSize = 500 * 1024;
+
+ uassert( 13602 , "outType is no longer a valid option" , cmdObj["outType"].eoo() );
+
+ if ( cmdObj["out"].type() == String ) {
+ finalShort = cmdObj["out"].String();
+ outType = REPLACE;
+ }
+ else if ( cmdObj["out"].type() == Object ) {
+ BSONObj o = cmdObj["out"].embeddedObject();
+
+ BSONElement e = o.firstElement();
+ string t = e.fieldName();
+
+ if ( t == "normal" || t == "replace" ) {
+ outType = REPLACE;
+ finalShort = e.String();
+ }
+ else if ( t == "merge" ) {
+ outType = MERGE;
+ finalShort = e.String();
+ }
+ else if ( t == "reduce" ) {
+ outType = REDUCE;
+ finalShort = e.String();
+ }
+ else if ( t == "inline" ) {
+ outType = INMEMORY;
+ }
+ else {
+ uasserted( 13522 , str::stream() << "unknown out specifier [" << t << "]" );
+ }
+
+ if (o.hasElement("db")) {
+ outDB = o["db"].String();
+ }
+
+ if (o.hasElement("nonAtomic")) {
+ outNonAtomic = o["nonAtomic"].Bool();
+ if (outNonAtomic)
+ uassert( 15895 , "nonAtomic option cannot be used with this output type", (outType == REDUCE || outType == MERGE) );
+ }
+ }
+ else {
+ uasserted( 13606 , "'out' has to be a string or an object" );
+ }
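+
+            // accepted forms, per the parsing above (collection names hypothetical):
+            //   out : "coll" | { normal|replace : "coll" } | { merge : "coll" } |
+            //   { reduce : "coll", nonAtomic : true } | { inline : 1 },
+            //   optionally with db : "otherdb" in the object forms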
+
+ if ( outType != INMEMORY ) { // setup names
+ tempLong = str::stream() << (outDB.empty() ? dbname : outDB) << ".tmp.mr." << cmdObj.firstElement().String() << "_" << JOB_NUMBER++;
+
+ incLong = tempLong + "_inc";
+
+ finalLong = str::stream() << (outDB.empty() ? dbname : outDB) << "." << finalShort;
+ }
+
+ {
+ // scope and code
+
+ if ( cmdObj["scope"].type() == Object )
+ scopeSetup = cmdObj["scope"].embeddedObjectUserCheck();
+
+ mapper.reset( new JSMapper( cmdObj["map"] ) );
+ reducer.reset( new JSReducer( cmdObj["reduce"] ) );
+ if ( cmdObj["finalize"].type() && cmdObj["finalize"].trueValue() )
+ finalizer.reset( new JSFinalizer( cmdObj["finalize"] ) );
+
+ if ( cmdObj["mapparams"].type() == Array ) {
+ mapParams = cmdObj["mapparams"].embeddedObjectUserCheck();
+ }
+
+ }
+
+ {
+ // query options
+ BSONElement q = cmdObj["query"];
+ if ( q.type() == Object )
+ filter = q.embeddedObjectUserCheck();
+ else
+ uassert( 13608 , "query has to be blank or an Object" , ! q.trueValue() );
+
+
+ BSONElement s = cmdObj["sort"];
+ if ( s.type() == Object )
+ sort = s.embeddedObjectUserCheck();
+ else
+ uassert( 13609 , "sort has to be blank or an Object" , ! s.trueValue() );
+
+ if ( cmdObj["limit"].isNumber() )
+ limit = cmdObj["limit"].numberLong();
+ else
+ limit = 0;
+ }
+ }
+
+ /**
+ * Create temporary collection, set up indexes
+ */
+ void State::prepTempCollection() {
+ if ( ! _onDisk )
+ return;
+
+ if (_config.incLong != _config.tempLong) {
+ // create the inc collection and make sure we have index on "0" key
+ _db.dropCollection( _config.incLong );
+ {
+ writelock l( _config.incLong );
+ Client::Context ctx( _config.incLong );
+ string err;
+ if ( ! userCreateNS( _config.incLong.c_str() , BSON( "autoIndexId" << 0 ) , err , false ) ) {
+ uasserted( 13631 , str::stream() << "userCreateNS failed for mr incLong ns: " << _config.incLong << " err: " << err );
+ }
+ }
+
+ BSONObj sortKey = BSON( "0" << 1 );
+ _db.ensureIndex( _config.incLong , sortKey );
+ }
+
+ // create temp collection
+ _db.dropCollection( _config.tempLong );
+ {
+ writelock lock( _config.tempLong.c_str() );
+ Client::Context ctx( _config.tempLong.c_str() );
+ string errmsg;
+ if ( ! userCreateNS( _config.tempLong.c_str() , BSONObj() , errmsg , true ) ) {
+ uasserted( 13630 , str::stream() << "userCreateNS failed for mr tempLong ns: " << _config.tempLong << " err: " << errmsg );
+ }
+ }
+
+ {
+ // copy indexes
+ auto_ptr<DBClientCursor> idx = _db.getIndexes( _config.finalLong );
+ while ( idx->more() ) {
+ BSONObj i = idx->next();
+
+ BSONObjBuilder b( i.objsize() + 16 );
+ b.append( "ns" , _config.tempLong );
+ BSONObjIterator j( i );
+ while ( j.more() ) {
+ BSONElement e = j.next();
+ if ( str::equals( e.fieldName() , "_id" ) ||
+ str::equals( e.fieldName() , "ns" ) )
+ continue;
+
+ b.append( e );
+ }
+
+ BSONObj indexToInsert = b.obj();
+ insert( Namespace( _config.tempLong.c_str() ).getSisterNS( "system.indexes" ).c_str() , indexToInsert );
+ }
+
+ }
+
+ }
+
+ /**
+ * For inline mode, appends results to output object.
+ * Makes sure (key, value) tuple is formatted as {_id: key, value: val}
+ */
+ void State::appendResults( BSONObjBuilder& final ) {
+ if ( _onDisk ) {
+ if (!_config.outDB.empty()) {
+ BSONObjBuilder loc;
+ if ( !_config.outDB.empty())
+ loc.append( "db" , _config.outDB );
+ if ( !_config.finalShort.empty() )
+ loc.append( "collection" , _config.finalShort );
+ final.append("result", loc.obj());
+ }
+ else {
+ if ( !_config.finalShort.empty() )
+ final.append( "result" , _config.finalShort );
+ }
+
+ if ( _config.splitInfo > 0 ) {
+ // add split points, used for shard
+ BSONObj res;
+ BSONObj idKey = BSON( "_id" << 1 );
+ if ( ! _db.runCommand( "admin" , BSON( "splitVector" << _config.finalLong << "keyPattern" << idKey << "maxChunkSizeBytes" << _config.splitInfo ) , res ) ) {
+ uasserted( 15921 , str::stream() << "splitVector failed: " << res );
+ }
+ if ( res.hasField( "splitKeys" ) )
+ final.append( res.getField( "splitKeys" ) );
+ }
+ return;
+ }
+
+ if (_jsMode) {
+ ScriptingFunction getResult = _scope->createFunction("var map = _mrMap; var result = []; for (key in map) { result.push({_id: key, value: map[key]}) } return result;");
+ _scope->invoke(getResult, 0, 0, 0, false);
+ BSONObj obj = _scope->getObject("return");
+ final.append("results", BSONArray(obj));
+ return;
+ }
+
+ uassert( 13604 , "too much data for in memory map/reduce" , _size < BSONObjMaxUserSize );
+
+ BSONArrayBuilder b( (int)(_size * 1.2) ); // _size is data size, doesn't count overhead and keys
+
+ for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); ++i ) {
+ BSONObj key = i->first;
+ BSONList& all = i->second;
+
+ assert( all.size() == 1 );
+
+ BSONObjIterator vi( all[0] );
+ vi.next();
+
+ BSONObjBuilder temp( b.subobjStart() );
+ temp.appendAs( key.firstElement() , "_id" );
+ temp.appendAs( vi.next() , "value" );
+ temp.done();
+ }
+
+ BSONArray res = b.arr();
+ final.append( "results" , res );
+ }
+
+ /**
+ * Does post processing on output collection.
+ * This may involve replacing, merging or reducing.
+ */
+ long long State::postProcessCollection(CurOp* op, ProgressMeterHolder& pm) {
+ if ( _onDisk == false || _config.outType == Config::INMEMORY )
+ return _temp->size();
+
+ if (_config.outNonAtomic)
+ return postProcessCollectionNonAtomic(op, pm);
+ writelock lock;
+ return postProcessCollectionNonAtomic(op, pm);
+ }
+
+ long long State::postProcessCollectionNonAtomic(CurOp* op, ProgressMeterHolder& pm) {
+
+ if ( _config.finalLong == _config.tempLong )
+ return _db.count( _config.finalLong );
+
+ if ( _config.outType == Config::REPLACE || _db.count( _config.finalLong ) == 0 ) {
+ writelock lock;
+ // replace: just rename from temp to final collection name, dropping previous collection
+ _db.dropCollection( _config.finalLong );
+ BSONObj info;
+ if ( ! _db.runCommand( "admin" , BSON( "renameCollection" << _config.tempLong << "to" << _config.finalLong ) , info ) ) {
+ uasserted( 10076 , str::stream() << "rename failed: " << info );
+ }
+
+ _db.dropCollection( _config.tempLong );
+ }
+ else if ( _config.outType == Config::MERGE ) {
+ // merge: upsert new docs into old collection
+ op->setMessage( "m/r: merge post processing" , _db.count( _config.tempLong, BSONObj() ) );
+ auto_ptr<DBClientCursor> cursor = _db.query( _config.tempLong , BSONObj() );
+ while ( cursor->more() ) {
+ writelock lock;
+ BSONObj o = cursor->next();
+ Helpers::upsert( _config.finalLong , o );
+ getDur().commitIfNeeded();
+ pm.hit();
+ }
+ _db.dropCollection( _config.tempLong );
+ pm.finished();
+ }
+ else if ( _config.outType == Config::REDUCE ) {
+ // reduce: apply reduce op on new result and existing one
+ BSONList values;
+
+ op->setMessage( "m/r: reduce post processing" , _db.count( _config.tempLong, BSONObj() ) );
+ auto_ptr<DBClientCursor> cursor = _db.query( _config.tempLong , BSONObj() );
+ while ( cursor->more() ) {
+ writelock lock;
+ BSONObj temp = cursor->next();
+ BSONObj old;
+
+ bool found;
+ {
+ Client::Context tx( _config.finalLong );
+ found = Helpers::findOne( _config.finalLong.c_str() , temp["_id"].wrap() , old , true );
+ }
+
+ if ( found ) {
+ // need to reduce
+ values.clear();
+ values.push_back( temp );
+ values.push_back( old );
+ Helpers::upsert( _config.finalLong , _config.reducer->finalReduce( values , _config.finalizer.get() ) );
+ }
+ else {
+ Helpers::upsert( _config.finalLong , temp );
+ }
+ getDur().commitIfNeeded();
+ pm.hit();
+ }
+ _db.dropCollection( _config.tempLong );
+ pm.finished();
+ }
+
+ return _db.count( _config.finalLong );
+ }
+
+ /**
+ * Insert doc in collection
+ */
+ void State::insert( const string& ns , const BSONObj& o ) {
+ assert( _onDisk );
+
+ writelock l( ns );
+ Client::Context ctx( ns );
+
+ theDataFileMgr.insertAndLog( ns.c_str() , o , false );
+ }
+
+ /**
+ * Insert doc into the inc collection, taking proper lock
+ */
+ void State::insertToInc( BSONObj& o ) {
+ writelock l(_config.incLong);
+ Client::Context ctx(_config.incLong);
+ _insertToInc(o);
+ }
+
+ /**
+ * Insert doc into the inc collection
+ */
+ void State::_insertToInc( BSONObj& o ) {
+ assert( _onDisk );
+ theDataFileMgr.insertWithObjMod( _config.incLong.c_str() , o , true );
+ getDur().commitIfNeeded();
+ }
+
+ State::State( const Config& c ) : _config( c ), _size(0), _dupCount(0), _numEmits(0) {
+ _temp.reset( new InMemory() );
+ _onDisk = _config.outType != Config::INMEMORY;
+ }
+
+ bool State::sourceExists() {
+ return _db.exists( _config.ns );
+ }
+
+ long long State::incomingDocuments() {
+ return _db.count( _config.ns , _config.filter , QueryOption_SlaveOk , (unsigned) _config.limit );
+ }
+
+ State::~State() {
+ if ( _onDisk ) {
+ try {
+ _db.dropCollection( _config.tempLong );
+ _db.dropCollection( _config.incLong );
+ }
+ catch ( std::exception& e ) {
+ error() << "couldn't cleanup after map reduce: " << e.what() << endl;
+ }
+ }
+
+ if (_scope) {
+ // cleanup js objects
+ ScriptingFunction cleanup = _scope->createFunction("delete _emitCt; delete _keyCt; delete _mrMap;");
+ _scope->invoke(cleanup, 0, 0, 0, true);
+ }
+ }
+
+ /**
+ * Initialize the mapreduce operation, creating the inc collection
+ */
+ void State::init() {
+ // setup js
+ _scope.reset(globalScriptEngine->getPooledScope( _config.dbname ).release() );
+ _scope->localConnect( _config.dbname.c_str() );
+
+ if ( ! _config.scopeSetup.isEmpty() )
+ _scope->init( &_config.scopeSetup );
+
+ _config.mapper->init( this );
+ _config.reducer->init( this );
+ if ( _config.finalizer )
+ _config.finalizer->init( this );
+ _scope->setBoolean("_doFinal", _config.finalizer);
+
+ // by default start in JS mode, will be faster for small jobs
+ _jsMode = _config.jsMode;
+// _jsMode = true;
+ switchMode(_jsMode);
+
+ // global JS map/reduce hashmap
+ // we use a standard JS object which means keys are only simple types
+ // we could also add a real hashmap from a library, still we need to add object comparison methods
+// _scope->setObject("_mrMap", BSONObj(), false);
+ ScriptingFunction init = _scope->createFunction("_emitCt = 0; _keyCt = 0; _dupCt = 0; _redCt = 0; if (typeof(_mrMap) === 'undefined') { _mrMap = {}; }");
+ _scope->invoke(init, 0, 0, 0, true);
+
+ // js function to run reduce on all keys
+// redfunc = _scope->createFunction("for (var key in hashmap) { print('Key is ' + key); list = hashmap[key]; ret = reduce(key, list); print('Value is ' + ret); };");
+ _reduceAll = _scope->createFunction("var map = _mrMap; var list, ret; for (var key in map) { list = map[key]; if (list.length != 1) { ret = _reduce(key, list); map[key] = [ret]; ++_redCt; } } _dupCt = 0;");
+ _reduceAndEmit = _scope->createFunction("var map = _mrMap; var list, ret; for (var key in map) { list = map[key]; if (list.length == 1) { ret = list[0]; } else { ret = _reduce(key, list); ++_redCt; } emit(key, ret); }; delete _mrMap;");
+ _reduceAndFinalize = _scope->createFunction("var map = _mrMap; var list, ret; for (var key in map) { list = map[key]; if (list.length == 1) { if (!_doFinal) {continue;} ret = list[0]; } else { ret = _reduce(key, list); ++_redCt; }; if (_doFinal){ ret = _finalize(key, ret); } map[key] = ret; }");
+ _reduceAndFinalizeAndInsert = _scope->createFunction("var map = _mrMap; var list, ret; for (var key in map) { list = map[key]; if (list.length == 1) { ret = list[0]; } else { ret = _reduce(key, list); ++_redCt; }; if (_doFinal){ ret = _finalize(key, ret); } _nativeToTemp({_id: key, value: ret}); }");
+
+ }
+
+ void State::switchMode(bool jsMode) {
+ _jsMode = jsMode;
+ if (jsMode) {
+ // emit function that stays in JS
+ _scope->setFunction("emit", "function(key, value) { if (typeof(key) === 'object') { _bailFromJS(key, value); return; }; ++_emitCt; var map = _mrMap; var list = map[key]; if (!list) { ++_keyCt; list = []; map[key] = list; } else { ++_dupCt; } list.push(value); }");
+ _scope->injectNative("_bailFromJS", _bailFromJS, this);
+ }
+ else {
+ // emit now populates C++ map
+ _scope->injectNative( "emit" , fast_emit, this );
+ }
+ }
+
+ void State::bailFromJS() {
+ log(1) << "M/R: Switching from JS mode to mixed mode" << endl;
+
+ // reduce and reemit into c++
+ switchMode(false);
+ _scope->invoke(_reduceAndEmit, 0, 0, 0, true);
+ // need to get the real number emitted so far
+ _numEmits = _scope->getNumberInt("_emitCt");
+ _config.reducer->numReduces = _scope->getNumberInt("_redCt");
+ }
+
+ /**
+ * Applies last reduce and finalize on a list of tuples (key, val)
+ * Inserts single result {_id: key, value: val} into temp collection
+ */
+ void State::finalReduce( BSONList& values ) {
+ if ( !_onDisk || values.size() == 0 )
+ return;
+
+ BSONObj res = _config.reducer->finalReduce( values , _config.finalizer.get() );
+ insert( _config.tempLong , res );
+ }
+
+ BSONObj _nativeToTemp( const BSONObj& args, void* data ) {
+ State* state = (State*) data;
+ BSONObjIterator it(args);
+ state->insert(state->_config.tempLong, it.next().Obj());
+ return BSONObj();
+ }
+
+// BSONObj _nativeToInc( const BSONObj& args, void* data ) {
+// State* state = (State*) data;
+// BSONObjIterator it(args);
+// const BSONObj& obj = it.next().Obj();
+// state->_insertToInc(const_cast<BSONObj&>(obj));
+// return BSONObj();
+// }
+
+ /**
+ * Applies last reduce and finalize.
+ * After calling this method, the temp collection will be completed.
+ * If inline, the results will be in the in memory map
+ */
+ void State::finalReduce( CurOp * op , ProgressMeterHolder& pm ) {
+
+ if (_jsMode) {
+ // apply the reduce within JS
+ if (_onDisk) {
+ _scope->injectNative("_nativeToTemp", _nativeToTemp, this);
+ _scope->invoke(_reduceAndFinalizeAndInsert, 0, 0, 0, true);
+ return;
+ }
+ else {
+ _scope->invoke(_reduceAndFinalize, 0, 0, 0, true);
+ return;
+ }
+ }
+
+ if ( ! _onDisk ) {
+ // all data has already been reduced, just finalize
+ if ( _config.finalizer ) {
+ long size = 0;
+ for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); ++i ) {
+ BSONObj key = i->first;
+ BSONList& all = i->second;
+
+ assert( all.size() == 1 );
+
+ BSONObj res = _config.finalizer->finalize( all[0] );
+
+ all.clear();
+ all.push_back( res );
+ size += res.objsize();
+ }
+ _size = size;
+ }
+ return;
+ }
+
+ // use index on "0" to pull sorted data
+ assert( _temp->size() == 0 );
+ BSONObj sortKey = BSON( "0" << 1 );
+ {
+ bool foundIndex = false;
+
+ auto_ptr<DBClientCursor> idx = _db.getIndexes( _config.incLong );
+ while ( idx.get() && idx->more() ) {
+ BSONObj x = idx->next();
+ if ( sortKey.woCompare( x["key"].embeddedObject() ) == 0 ) {
+ foundIndex = true;
+ break;
+ }
+ }
+
+ assert( foundIndex );
+ }
+
+ readlock rl( _config.incLong.c_str() );
+ Client::Context ctx( _config.incLong );
+
+ BSONObj prev;
+ BSONList all;
+
+ assert( pm == op->setMessage( "m/r: (3/3) final reduce to collection" , _db.count( _config.incLong, BSONObj(), QueryOption_SlaveOk ) ) );
+
+ shared_ptr<Cursor> temp = bestGuessCursor( _config.incLong.c_str() , BSONObj() , sortKey );
+ auto_ptr<ClientCursor> cursor( new ClientCursor( QueryOption_NoCursorTimeout , temp , _config.incLong.c_str() ) );
+
+ // iterate over all sorted objects
+ while ( cursor->ok() ) {
+ BSONObj o = cursor->current().getOwned();
+ cursor->advance();
+
+ pm.hit();
+
+ if ( o.woSortOrder( prev , sortKey ) == 0 ) {
+ // object is same as previous, add to array
+ all.push_back( o );
+ if ( pm->hits() % 1000 == 0 ) {
+ if ( ! cursor->yield() ) {
+ cursor.release();
+ break;
+ }
+ killCurrentOp.checkForInterrupt();
+ }
+ continue;
+ }
+
+ ClientCursor::YieldLock yield (cursor.get());
+
+ try {
+ // reduce a finalize array
+ finalReduce( all );
+ }
+ catch (...) {
+ yield.relock();
+ cursor.release();
+ throw;
+ }
+
+ all.clear();
+ prev = o;
+ all.push_back( o );
+
+ if ( ! yield.stillOk() ) {
+ cursor.release();
+ break;
+ }
+
+ killCurrentOp.checkForInterrupt();
+ }
+
+ // we need to release here since we temp release below
+ cursor.release();
+
+ {
+ dbtempreleasecond tl;
+ if ( ! tl.unlocked() )
+ log( LL_WARNING ) << "map/reduce can't temp release" << endl;
+ // reduce and finalize last array
+ finalReduce( all );
+ }
+
+ pm.finished();
+ }
+
+ /**
+ * Attempts to reduce objects in the memory map.
+ * A new memory map will be created to hold the results.
+ * If applicable, objects with unique key may be dumped to inc collection.
+ * Input and output objects are both {"0": key, "1": val}
+ */
+ void State::reduceInMemory() {
+
+ if (_jsMode) {
+ // in js mode the reduce is applied when writing to collection
+ return;
+ }
+
+ auto_ptr<InMemory> n( new InMemory() ); // for new data
+ long nSize = 0;
+ _dupCount = 0;
+
+ for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); ++i ) {
+ BSONObj key = i->first;
+ BSONList& all = i->second;
+
+ if ( all.size() == 1 ) {
+ // only 1 value for this key
+ if ( _onDisk ) {
+ // this key has low cardinality, so just write to collection
+ writelock l(_config.incLong);
+ Client::Context ctx(_config.incLong.c_str());
+ _insertToInc( *(all.begin()) );
+ }
+ else {
+ // add to new map
+ _add( n.get() , all[0] , nSize );
+ }
+ }
+ else if ( all.size() > 1 ) {
+ // several values, reduce and add to map
+ BSONObj res = _config.reducer->reduce( all );
+ _add( n.get() , res , nSize );
+ }
+ }
+
+ // swap maps
+ _temp.reset( n.release() );
+ _size = nSize;
+ }
+
+ /**
+ * Dumps the entire in memory map to the inc collection.
+ */
+ void State::dumpToInc() {
+ if ( ! _onDisk )
+ return;
+
+ writelock l(_config.incLong);
+ Client::Context ctx(_config.incLong);
+
+ for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); i++ ) {
+ BSONList& all = i->second;
+ if ( all.size() < 1 )
+ continue;
+
+ for ( BSONList::iterator j=all.begin(); j!=all.end(); j++ )
+ _insertToInc( *j );
+ }
+ _temp->clear();
+ _size = 0;
+
+ }
+
+ /**
+ * Adds object to in memory map
+ */
+ void State::emit( const BSONObj& a ) {
+ _numEmits++;
+ _add( _temp.get() , a , _size );
+ }
+
+ void State::_add( InMemory* im, const BSONObj& a , long& size ) {
+ BSONList& all = (*im)[a];
+ all.push_back( a );
+ size += a.objsize() + 16;
+ if (all.size() > 1)
+ ++_dupCount;
+ }
+
+ /**
+ * this method checks the size of in memory map and potentially flushes to disk
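+         * e.g. with the defaults set in Config (maxInMemSize 500KB,
+         * reduceTriggerRatio 10), a map holding 100 distinct keys is re-reduced
+         * in place once ~1000 duplicate values accumulate, then dumped to the
+         * inc collection if it is still over 500KB afterwards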
+ */
+ void State::checkSize() {
+ if (_jsMode) {
+ // try to reduce if it is beneficial
+ int dupCt = _scope->getNumberInt("_dupCt");
+ int keyCt = _scope->getNumberInt("_keyCt");
+
+ if (keyCt > _config.jsMaxKeys) {
+ // too many keys for JS, switch to mixed
+ _bailFromJS(BSONObj(), this);
+ // then fall through to check map size
+ }
+ else if (dupCt > (keyCt * _config.reduceTriggerRatio)) {
+ // reduce now to lower mem usage
+ Timer t;
+ _scope->invoke(_reduceAll, 0, 0, 0, true);
+ log(1) << " MR - did reduceAll: keys=" << keyCt << " dups=" << dupCt << " newKeys=" << _scope->getNumberInt("_keyCt") << " time=" << t.millis() << "ms" << endl;
+ return;
+ }
+ }
+
+ if (_jsMode)
+ return;
+
+ if (_size > _config.maxInMemSize || _dupCount > (_temp->size() * _config.reduceTriggerRatio)) {
+ // attempt to reduce in memory map, if memory is too high or we have many duplicates
+ long oldSize = _size;
+ Timer t;
+ reduceInMemory();
+ log(1) << " MR - did reduceInMemory: size=" << oldSize << " dups=" << _dupCount << " newSize=" << _size << " time=" << t.millis() << "ms" << endl;
+
+ // if size is still high, or values are not reducing well, dump
+ if ( _onDisk && (_size > _config.maxInMemSize || _size > oldSize / 2) ) {
+ dumpToInc();
+ log(1) << " MR - dumping to db" << endl;
+ }
+ }
+ }
+
+ /**
+ * emit that will be called by js function
+ */
+ BSONObj fast_emit( const BSONObj& args, void* data ) {
+ uassert( 10077 , "fast_emit takes 2 args" , args.nFields() == 2 );
+ uassert( 13069 , "an emit can't be more than half max bson size" , args.objsize() < ( BSONObjMaxUserSize / 2 ) );
+
+ State* state = (State*) data;
+ if ( args.firstElement().type() == Undefined ) {
+ BSONObjBuilder b( args.objsize() );
+ b.appendNull( "" );
+ BSONObjIterator i( args );
+ i.next();
+ b.append( i.next() );
+ state->emit( b.obj() );
+ }
+ else {
+ state->emit( args );
+ }
+ return BSONObj();
+ }
+
+ /**
+         * called when we realize we can't use JS mode for m/r on the 1st key
+ */
+ BSONObj _bailFromJS( const BSONObj& args, void* data ) {
+ State* state = (State*) data;
+ state->bailFromJS();
+
+ // emit this particular key if there is one
+ if (!args.isEmpty()) {
+ fast_emit(args, data);
+ }
+ return BSONObj();
+ }
+
+ /**
+ * This class represents a map/reduce command executed on a single server
+ */
+ class MapReduceCommand : public Command {
+ public:
+ MapReduceCommand() : Command("mapReduce", false, "mapreduce") {}
+
+ /* why !replset ?
+ bad things happen with --slave (i think because of this)
+ */
+ virtual bool slaveOk() const { return !replSet; }
+
+ virtual bool slaveOverrideOk() { return true; }
+
+ virtual void help( stringstream &help ) const {
+ help << "Run a map/reduce operation on the server.\n";
+ help << "Note this is used for aggregation, not querying, in MongoDB.\n";
+ help << "http://www.mongodb.org/display/DOCS/MapReduce";
+ }
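+
+            // minimal invocation sketch (hypothetical collection and functions):
+            //   db.runCommand({ mapreduce : "events",
+            //                   map : function() { emit(this.type, 1); },
+            //                   reduce : function(k, vals) {
+            //                       var t = 0; vals.forEach(function(v) { t += v; }); return t; },
+            //                   out : { inline : 1 } })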
+
+ virtual LockType locktype() const { return NONE; }
+
+ bool run(const string& dbname , BSONObj& cmd, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ Timer t;
+ Client& client = cc();
+ CurOp * op = client.curop();
+
+ Config config( dbname , cmd );
+
+ log(1) << "mr ns: " << config.ns << endl;
+
+ bool shouldHaveData = false;
+
+ long long num = 0;
+ long long inReduce = 0;
+
+ BSONObjBuilder countsBuilder;
+ BSONObjBuilder timingBuilder;
+ State state( config );
+ if ( ! state.sourceExists() ) {
+ errmsg = "ns doesn't exist";
+ return false;
+ }
+
+ if (replSet && state.isOnDisk()) {
+ // this means that it will be doing a write operation, make sure we are on Master
+ // ideally this check should be in slaveOk(), but at that point config is not known
+ if (!isMaster(dbname.c_str())) {
+ errmsg = "not master";
+ return false;
+ }
+ }
+
+ if (state.isOnDisk() && !client.getAuthenticationInfo()->isAuthorized(dbname)) {
+ errmsg = "read-only user cannot output mapReduce to collection, use inline instead";
+ return false;
+ }
+
+ try {
+ state.init();
+ state.prepTempCollection();
+ ProgressMeterHolder pm( op->setMessage( "m/r: (1/3) emit phase" , state.incomingDocuments() ) );
+
+                wassert( config.limit < 0x4000000 ); // sanity check: keep limit well within 32 bit unsigned range
+ long long mapTime = 0;
+ {
+ readlock lock( config.ns );
+ Client::Context ctx( config.ns );
+
+ ShardChunkManagerPtr chunkManager;
+ if ( shardingState.needShardChunkManager( config.ns ) ) {
+ chunkManager = shardingState.getShardChunkManager( config.ns );
+ }
+
+                    // obtain a sorted cursor over the data to apply m/r to
+ shared_ptr<Cursor> temp = NamespaceDetailsTransient::getCursor( config.ns.c_str(), config.filter, config.sort );
+ uassert( 15876, str::stream() << "could not create cursor over " << config.ns << " for query : " << config.filter << " sort : " << config.sort, temp.get() );
+ auto_ptr<ClientCursor> cursor( new ClientCursor( QueryOption_NoCursorTimeout , temp , config.ns.c_str() ) );
+ uassert( 15877, str::stream() << "could not create client cursor over " << config.ns << " for query : " << config.filter << " sort : " << config.sort, cursor.get() );
+
+ Timer mt;
+ // go through each doc
+ while ( cursor->ok() ) {
+ if ( ! cursor->currentMatches() ) {
+ cursor->advance();
+ continue;
+ }
+
+                        // make sure we don't process duplicates in case data gets moved around during map
+ // TODO This won't actually help when data gets moved, it's to handle multikeys.
+ if ( cursor->currentIsDup() ) {
+ cursor->advance();
+ continue;
+ }
+
+ BSONObj o = cursor->current();
+ cursor->advance();
+
+ // check to see if this is a new object we don't own yet
+ // because of a chunk migration
+ if ( chunkManager && ! chunkManager->belongsToMe( o ) )
+ continue;
+
+ // do map
+ if ( config.verbose ) mt.reset();
+ config.mapper->map( o );
+ if ( config.verbose ) mapTime += mt.micros();
+
+ num++;
+ if ( num % 1000 == 0 ) {
+ // try to yield lock regularly
+ ClientCursor::YieldLock yield (cursor.get());
+ Timer t;
+ // check if map needs to be dumped to disk
+ state.checkSize();
+ inReduce += t.micros();
+
+ if ( ! yield.stillOk() ) {
+ cursor.release();
+ break;
+ }
+
+ killCurrentOp.checkForInterrupt();
+ }
+ pm.hit();
+
+ if ( config.limit && num >= config.limit )
+ break;
+ }
+ }
+ pm.finished();
+
+ killCurrentOp.checkForInterrupt();
+ // update counters
+ countsBuilder.appendNumber( "input" , num );
+ countsBuilder.appendNumber( "emit" , state.numEmits() );
+ if ( state.numEmits() )
+ shouldHaveData = true;
+
+ timingBuilder.append( "mapTime" , mapTime / 1000 );
+ timingBuilder.append( "emitLoop" , t.millis() );
+
+ op->setMessage( "m/r: (2/3) final reduce in memory" );
+                Timer rt; // reduce timer (distinct from the overall timer t used for "total" below)
+ // do reduce in memory
+ // this will be the last reduce needed for inline mode
+ state.reduceInMemory();
+                // if not inline: dump the in-memory map to the inc collection so all data is on disk
+ state.dumpToInc();
+ // final reduce
+ state.finalReduce( op , pm );
+                inReduce += rt.micros();
+ countsBuilder.appendNumber( "reduce" , state.numReduces() );
+ timingBuilder.append( "reduceTime" , inReduce / 1000 );
+ timingBuilder.append( "mode" , state.jsMode() ? "js" : "mixed" );
+
+ long long finalCount = state.postProcessCollection(op, pm);
+ state.appendResults( result );
+
+ timingBuilder.append( "total" , t.millis() );
+ result.append( "timeMillis" , t.millis() );
+ countsBuilder.appendNumber( "output" , finalCount );
+ if ( config.verbose ) result.append( "timing" , timingBuilder.obj() );
+ result.append( "counts" , countsBuilder.obj() );
+
+ if ( finalCount == 0 && shouldHaveData ) {
+ result.append( "cmd" , cmd );
+ errmsg = "there were emits but no data!";
+ return false;
+ }
+
+ }
+ catch( SendStaleConfigException& e ){
+ log() << "mr detected stale config, should retry" << causedBy(e) << endl;
+ throw e;
+ }
+            // TODO: the error handling code for queries is very fragile and
+            // *requires* rethrowing AssertionExceptions - should probably fix.
+ catch ( AssertionException& e ){
+ log() << "mr failed, removing collection" << causedBy(e) << endl;
+ throw e;
+ }
+ catch ( std::exception& e ){
+ log() << "mr failed, removing collection" << causedBy(e) << endl;
+ throw e;
+ }
+ catch ( ... ) {
+ log() << "mr failed for unknown reason, removing collection" << endl;
+ throw;
+ }
+
+ return true;
+ }
+
+ } mapReduceCommand;
+
+ /**
+     * This class represents a map/reduce command executed on the output server of a sharded environment
+ */
+ class MapReduceFinishCommand : public Command {
+ public:
+ MapReduceFinishCommand() : Command( "mapreduce.shardedfinish" ) {}
+ virtual bool slaveOk() const { return !replSet; }
+ virtual bool slaveOverrideOk() { return true; }
+
+ virtual LockType locktype() const { return NONE; }
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ ShardedConnectionInfo::addHook();
+ // legacy name
+ string shardedOutputCollection = cmdObj["shardedOutputCollection"].valuestrsafe();
+ string inputNS = cmdObj["inputNS"].valuestrsafe();
+ if (inputNS.empty())
+ inputNS = dbname + "." + shardedOutputCollection;
+
+ Client& client = cc();
+ CurOp * op = client.curop();
+
+ Config config( dbname , cmdObj.firstElement().embeddedObjectUserCheck() );
+ State state(config);
+ state.init();
+
+ // no need for incremental collection because records are already sorted
+ config.incLong = config.tempLong;
+
+ BSONObj shardCounts = cmdObj["shardCounts"].embeddedObjectUserCheck();
+ BSONObj counts = cmdObj["counts"].embeddedObjectUserCheck();
+
+ ProgressMeterHolder pm( op->setMessage( "m/r: merge sort and reduce" ) );
+ set<ServerAndQuery> servers;
+ vector< auto_ptr<DBClientCursor> > shardCursors;
+
+ {
+ // parse per shard results
+ BSONObjIterator i( shardCounts );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ string shard = e.fieldName();
+// BSONObj res = e.embeddedObjectUserCheck();
+ servers.insert( shard );
+ }
+ }
+
+ state.prepTempCollection();
+
+ BSONList values;
+ if (!config.outDB.empty()) {
+ BSONObjBuilder loc;
+                loc.append( "db" , config.outDB ); // outDB known non-empty from the enclosing check
+ if ( !config.finalShort.empty() )
+ loc.append( "collection" , config.finalShort );
+ result.append("result", loc.obj());
+ }
+ else {
+ if ( !config.finalShort.empty() )
+ result.append( "result" , config.finalShort );
+ }
+
+ // fetch result from other shards 1 chunk at a time
+ // it would be better to do just one big $or query, but then the sorting would not be efficient
+ string shardName = shardingState.getShardName();
+ DBConfigPtr confOut = grid.getDBConfig( dbname , false );
+ vector<ChunkPtr> chunks;
+ if ( confOut->isSharded(config.finalLong) ) {
+ ChunkManagerPtr cm = confOut->getChunkManager( config.finalLong );
+ const ChunkMap& chunkMap = cm->getChunkMap();
+ for ( ChunkMap::const_iterator it = chunkMap.begin(); it != chunkMap.end(); ++it ) {
+ ChunkPtr chunk = it->second;
+ if (chunk->getShard().getName() == shardName) chunks.push_back(chunk);
+ }
+ }
+
+ long long inputCount = 0;
+ unsigned int index = 0;
+ BSONObj query;
+ BSONArrayBuilder chunkSizes;
+ while (true) {
+ ChunkPtr chunk;
+ if (chunks.size() > 0) {
+ chunk = chunks[index];
+ BSONObjBuilder b;
+ b.appendAs(chunk->getMin().firstElement(), "$gte");
+ b.appendAs(chunk->getMax().firstElement(), "$lt");
+ query = BSON("_id" << b.obj());
+// chunkSizes.append(min);
+ }
+
+ // reduce from each shard for a chunk
+ BSONObj sortKey = BSON( "_id" << 1 );
+ ParallelSortClusteredCursor cursor( servers , inputNS , Query( query ).sort( sortKey ) );
+ cursor.init();
+ int chunkSize = 0;
+
+ while ( cursor.more() || !values.empty() ) {
+ BSONObj t;
+ if (cursor.more()) {
+ t = cursor.next().getOwned();
+ ++inputCount;
+
+ if ( values.size() == 0 ) {
+ values.push_back( t );
+ continue;
+ }
+
+ if ( t.woSortOrder( *(values.begin()) , sortKey ) == 0 ) {
+ values.push_back( t );
+ continue;
+ }
+ }
+
+ BSONObj res = config.reducer->finalReduce( values , config.finalizer.get());
+ chunkSize += res.objsize();
+ if (state.isOnDisk())
+ state.insertToInc(res);
+ else
+ state.emit(res);
+ values.clear();
+ if (!t.isEmpty())
+ values.push_back( t );
+ }
+
+ if (chunk) {
+ chunkSizes.append(chunk->getMin());
+ chunkSizes.append(chunkSize);
+ }
+ if (++index >= chunks.size())
+ break;
+ }
+
+ result.append( "chunkSizes" , chunkSizes.arr() );
+
+ long long outputCount = state.postProcessCollection(op, pm);
+ state.appendResults( result );
+
+ BSONObjBuilder countsB(32);
+ countsB.append("input", inputCount);
+ countsB.append("reduce", state.numReduces());
+ countsB.append("output", outputCount);
+ result.append( "counts" , countsB.obj() );
+
+            return true;
+ }
+ } mapReduceFinishCommand;
+
+ }
+
+}
+
diff --git a/src/mongo/db/commands/mr.h b/src/mongo/db/commands/mr.h
new file mode 100644
index 00000000000..592769d82da
--- /dev/null
+++ b/src/mongo/db/commands/mr.h
@@ -0,0 +1,319 @@
+// mr.h
+
+/**
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+namespace mongo {
+
+ namespace mr {
+
+ typedef vector<BSONObj> BSONList;
+
+ class State;
+
+ // ------------ function interfaces -----------
+
+ class Mapper : boost::noncopyable {
+ public:
+ virtual ~Mapper() {}
+ virtual void init( State * state ) = 0;
+
+ virtual void map( const BSONObj& o ) = 0;
+ };
+
+ class Finalizer : boost::noncopyable {
+ public:
+ virtual ~Finalizer() {}
+ virtual void init( State * state ) = 0;
+
+ /**
+ * this takes a tuple and returns a tuple
+ */
+ virtual BSONObj finalize( const BSONObj& tuple ) = 0;
+ };
+
+ class Reducer : boost::noncopyable {
+ public:
+ Reducer() : numReduces(0) {}
+ virtual ~Reducer() {}
+ virtual void init( State * state ) = 0;
+
+ virtual BSONObj reduce( const BSONList& tuples ) = 0;
+            /** this means it's a final reduce, even if there is no finalizer */
+ virtual BSONObj finalReduce( const BSONList& tuples , Finalizer * finalizer ) = 0;
+
+ long long numReduces;
+ };
+
+ // ------------ js function implementations -----------
+
+ /**
+         * used as a holder for a Scope and ScriptingFunction;
+         * the Scope is obtained lazily on first access (visitor-like pattern)
+ */
+ class JSFunction : boost::noncopyable {
+ public:
+ /**
+ * @param type (map|reduce|finalize)
+ */
+ JSFunction( string type , const BSONElement& e );
+ virtual ~JSFunction() {}
+
+ virtual void init( State * state );
+
+ Scope * scope() const { return _scope; }
+ ScriptingFunction func() const { return _func; }
+
+ private:
+ string _type;
+ string _code; // actual javascript code
+ BSONObj _wantedScope; // this is for CodeWScope
+
+ Scope * _scope; // this is not owned by us, and might be shared
+ ScriptingFunction _func;
+ };
+
+ class JSMapper : public Mapper {
+ public:
+ JSMapper( const BSONElement & code ) : _func( "_map" , code ) {}
+ virtual void map( const BSONObj& o );
+ virtual void init( State * state );
+
+ private:
+ JSFunction _func;
+ BSONObj _params;
+ };
+
+ class JSReducer : public Reducer {
+ public:
+ JSReducer( const BSONElement& code ) : _func( "_reduce" , code ) {}
+ virtual void init( State * state );
+
+ virtual BSONObj reduce( const BSONList& tuples );
+ virtual BSONObj finalReduce( const BSONList& tuples , Finalizer * finalizer );
+
+ private:
+
+ /**
+             * runs the reduce; its result is left in "return"
+ * @param key OUT
+ * @param endSizeEstimate OUT
+ */
+ void _reduce( const BSONList& values , BSONObj& key , int& endSizeEstimate );
+
+ JSFunction _func;
+ };
+
+ class JSFinalizer : public Finalizer {
+ public:
+ JSFinalizer( const BSONElement& code ) : _func( "_finalize" , code ) {}
+ virtual BSONObj finalize( const BSONObj& o );
+ virtual void init( State * state ) { _func.init( state ); }
+ private:
+ JSFunction _func;
+
+ };
+
+ // -----------------
+
+
+ class TupleKeyCmp {
+ public:
+ TupleKeyCmp() {}
+ bool operator()( const BSONObj &l, const BSONObj &r ) const {
+ return l.firstElement().woCompare( r.firstElement() ) < 0;
+ }
+ };
+
+ typedef map< BSONObj,BSONList,TupleKeyCmp > InMemory; // from key to list of tuples
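+
+        /* Illustrative grouping behavior (a sketch): TupleKeyCmp compares only the
+           first element, so tuples emitted with the same key share one list:
+
+               InMemory im;
+               BSONObj a1 = BSON( "k" << "a" << "v" << 1 );
+               BSONObj a2 = BSON( "k" << "a" << "v" << 2 );
+               im[a1].push_back( a1 );
+               im[a2].push_back( a2 );   // same first element "a" -> same entry
+               // im.size() == 1 and im[a1].size() == 2
+        */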
+
+ /**
+ * holds map/reduce config information
+ */
+ class Config {
+ public:
+ Config( const string& _dbname , const BSONObj& cmdObj );
+
+ string dbname;
+ string ns;
+
+ // options
+ bool verbose;
+ bool jsMode;
+ int splitInfo;
+
+ // query options
+
+ BSONObj filter;
+ BSONObj sort;
+ long long limit;
+
+ // functions
+
+ scoped_ptr<Mapper> mapper;
+ scoped_ptr<Reducer> reducer;
+ scoped_ptr<Finalizer> finalizer;
+
+ BSONObj mapParams;
+ BSONObj scopeSetup;
+
+ // output tables
+ string incLong;
+ string tempLong;
+
+ string finalShort;
+ string finalLong;
+
+ string outDB;
+
+ // max number of keys allowed in JS map before switching mode
+ long jsMaxKeys;
+ // ratio of duplicates vs unique keys before reduce is triggered in js mode
+ float reduceTriggerRatio;
+ // maximum size of map before it gets dumped to disk
+ long maxInMemSize;
+
+ enum { REPLACE , // atomically replace the collection
+ MERGE , // merge keys, override dups
+ REDUCE , // merge keys, reduce dups
+ INMEMORY // only store in memory, limited in size
+ } outType;
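+
+            /* Illustrative mapping from the command's "out" spec to outType (a
+               sketch of the usual forms):
+                   out: "coll"             -> REPLACE
+                   out: { merge: "coll" }  -> MERGE
+                   out: { reduce: "coll" } -> REDUCE
+                   out: { inline: 1 }      -> INMEMORY
+            */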
+
+ // if true, no lock during output operation
+ bool outNonAtomic;
+
+ static AtomicUInt JOB_NUMBER;
+        }; // end Config
+
+ /**
+ * stores information about intermediate map reduce state
+ * controls flow of data from map->reduce->finalize->output
+ */
+ class State {
+ public:
+ State( const Config& c );
+ ~State();
+
+ void init();
+
+ // ---- prep -----
+ bool sourceExists();
+
+ long long incomingDocuments();
+
+ // ---- map stage ----
+
+ /**
+             * stages an emitted object in in-memory storage
+ */
+ void emit( const BSONObj& a );
+
+ /**
+ * if size is big, run a reduce
+             * if it's still big, dump to the temp collection
+ */
+ void checkSize();
+
+ /**
+ * run reduce on _temp
+ */
+ void reduceInMemory();
+
+ /**
+             * transfers in-memory storage to the temp collection
+ */
+ void dumpToInc();
+ void insertToInc( BSONObj& o );
+ void _insertToInc( BSONObj& o );
+
+ // ------ reduce stage -----------
+
+ void prepTempCollection();
+
+ void finalReduce( BSONList& values );
+
+ void finalReduce( CurOp * op , ProgressMeterHolder& pm );
+
+ // ------- cleanup/data positioning ----------
+
+ /**
+               @return number of objects in the collection
+ */
+ long long postProcessCollection( CurOp* op , ProgressMeterHolder& pm );
+ long long postProcessCollectionNonAtomic( CurOp* op , ProgressMeterHolder& pm );
+
+ /**
+             * if output mode is INMEMORY, appends the results;
+             * may also append stats or anything else it likes
+ */
+ void appendResults( BSONObjBuilder& b );
+
+ // -------- util ------------
+
+ /**
+ * inserts with correct replication semantics
+ */
+ void insert( const string& ns , const BSONObj& o );
+
+ // ------ simple accessors -----
+
+            /** State maintains ownership; do not use past State's lifetime */
+ Scope* scope() { return _scope.get(); }
+
+ const Config& config() { return _config; }
+
+            bool isOnDisk() const { return _onDisk; }
+
+ long long numEmits() const { if (_jsMode) return _scope->getNumberLongLong("_emitCt"); return _numEmits; }
+ long long numReduces() const { if (_jsMode) return _scope->getNumberLongLong("_redCt"); return _config.reducer->numReduces; }
+
+ bool jsMode() {return _jsMode;}
+ void switchMode(bool jsMode);
+ void bailFromJS();
+
+ const Config& _config;
+ DBDirectClient _db;
+
+ protected:
+
+ void _add( InMemory* im , const BSONObj& a , long& size );
+
+ scoped_ptr<Scope> _scope;
+            bool _onDisk; // whether the end result of this map/reduce lands on disk
+
+ scoped_ptr<InMemory> _temp;
+ long _size; // bytes in _temp
+ long _dupCount; // number of duplicate key entries
+
+ long long _numEmits;
+
+ bool _jsMode;
+ ScriptingFunction _reduceAll;
+ ScriptingFunction _reduceAndEmit;
+ ScriptingFunction _reduceAndFinalize;
+ ScriptingFunction _reduceAndFinalizeAndInsert;
+ };
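+
+        /* Illustrative driver sequence (a sketch of how MapReduceCommand::run() in
+           mr.cpp exercises this class):
+
+               State state( config );
+               state.init();
+               state.prepTempCollection();
+               // ... for each input doc: config.mapper->map( o ), which calls State::emit()
+               state.checkSize();          // periodically; may reduce and/or dump to disk
+               state.reduceInMemory();
+               state.dumpToInc();
+               state.finalReduce( op, pm );
+               state.postProcessCollection( op, pm );
+        */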
+
+ BSONObj fast_emit( const BSONObj& args, void* data );
+ BSONObj _bailFromJS( const BSONObj& args, void* data );
+
+ } // end mr namespace
+}
+
+
diff --git a/src/mongo/db/commands/pipeline.cpp b/src/mongo/db/commands/pipeline.cpp
new file mode 100755
index 00000000000..4ad5e342aed
--- /dev/null
+++ b/src/mongo/db/commands/pipeline.cpp
@@ -0,0 +1,405 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/commands/pipeline.h"
+
+#include "db/cursor.h"
+#include "db/pipeline/accumulator.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/document_source.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pdfile.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+
+ const char Pipeline::commandName[] = "aggregate";
+ const char Pipeline::pipelineName[] = "pipeline";
+ const char Pipeline::fromRouterName[] = "fromRouter";
+ const char Pipeline::splitMongodPipelineName[] = "splitMongodPipeline";
+
+ Pipeline::~Pipeline() {
+ }
+
+ Pipeline::Pipeline(const intrusive_ptr<ExpressionContext> &pTheCtx):
+ collectionName(),
+ sourceVector(),
+ splitMongodPipeline(DEBUG_BUILD == 1), /* test: always split for DEV */
+ pCtx(pTheCtx) {
+ }
+
+
+
+ /* this structure is used to make a lookup table of operators */
+ struct StageDesc {
+ const char *pName;
+ intrusive_ptr<DocumentSource> (*pFactory)(
+ BSONElement *, const intrusive_ptr<ExpressionContext> &);
+ };
+
+ /* this table must be in alphabetical order by name for bsearch() */
+ static const StageDesc stageDesc[] = {
+#ifdef NEVER /* disabled for now in favor of $match */
+ {DocumentSourceFilter::filterName,
+ DocumentSourceFilter::createFromBson},
+#endif
+ {DocumentSourceGroup::groupName,
+ DocumentSourceGroup::createFromBson},
+ {DocumentSourceLimit::limitName,
+ DocumentSourceLimit::createFromBson},
+ {DocumentSourceMatch::matchName,
+ DocumentSourceMatch::createFromBson},
+#ifdef LATER /* https://jira.mongodb.org/browse/SERVER-3253 */
+ {DocumentSourceOut::outName,
+ DocumentSourceOut::createFromBson},
+#endif
+ {DocumentSourceProject::projectName,
+ DocumentSourceProject::createFromBson},
+ {DocumentSourceSkip::skipName,
+ DocumentSourceSkip::createFromBson},
+ {DocumentSourceSort::sortName,
+ DocumentSourceSort::createFromBson},
+ {DocumentSourceUnwind::unwindName,
+ DocumentSourceUnwind::createFromBson},
+ };
+ static const size_t nStageDesc = sizeof(stageDesc) / sizeof(StageDesc);
+
+ static int stageDescCmp(const void *pL, const void *pR) {
+ return strcmp(((const StageDesc *)pL)->pName,
+ ((const StageDesc *)pR)->pName);
+ }
+
+ boost::shared_ptr<Pipeline> Pipeline::parseCommand(
+ string &errmsg, BSONObj &cmdObj,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ boost::shared_ptr<Pipeline> pPipeline(new Pipeline(pCtx));
+ vector<BSONElement> pipeline;
+
+ /* gather the specification for the aggregation */
+ for(BSONObj::iterator cmdIterator = cmdObj.begin();
+ cmdIterator.more(); ) {
+ BSONElement cmdElement(cmdIterator.next());
+ const char *pFieldName = cmdElement.fieldName();
+
+            /* look for the aggregation command; its value is the collection name */
+ if (!strcmp(pFieldName, commandName)) {
+ pPipeline->collectionName = cmdElement.String();
+ continue;
+ }
+
+            /* check for the pipeline specification */
+ if (!strcmp(pFieldName, pipelineName)) {
+ pipeline = cmdElement.Array();
+ continue;
+ }
+
+ /* if the request came from the router, we're in a shard */
+ if (!strcmp(pFieldName, fromRouterName)) {
+ pCtx->setInShard(cmdElement.Bool());
+ continue;
+ }
+
+ /* check for debug options */
+ if (!strcmp(pFieldName, splitMongodPipelineName)) {
+ pPipeline->splitMongodPipeline = true;
+ continue;
+ }
+
+ /* we didn't recognize a field in the command */
+ ostringstream sb;
+ sb <<
+ "Pipeline::parseCommand(): unrecognized field \"" <<
+                cmdElement.fieldName() << "\"";
+ errmsg = sb.str();
+ return boost::shared_ptr<Pipeline>();
+ }
+
+ /*
+ If we get here, we've harvested the fields we expect for a pipeline.
+
+ Set up the specified document source pipeline.
+ */
+ SourceVector *pSourceVector = &pPipeline->sourceVector; // shorthand
+
+ /* iterate over the steps in the pipeline */
+ const size_t nSteps = pipeline.size();
+ for(size_t iStep = 0; iStep < nSteps; ++iStep) {
+ /* pull out the pipeline element as an object */
+ BSONElement pipeElement(pipeline[iStep]);
+ uassert(15942, str::stream() << "pipeline element " <<
+ iStep << " is not an object",
+ pipeElement.type() == Object);
+ BSONObj bsonObj(pipeElement.Obj());
+
+ intrusive_ptr<DocumentSource> pSource;
+
+ /* use the object to add a DocumentSource to the processing chain */
+ BSONObjIterator bsonIterator(bsonObj);
+ while(bsonIterator.more()) {
+ BSONElement bsonElement(bsonIterator.next());
+ const char *pFieldName = bsonElement.fieldName();
+
+ /* select the appropriate operation and instantiate */
+ StageDesc key;
+ key.pName = pFieldName;
+ const StageDesc *pDesc = (const StageDesc *)
+ bsearch(&key, stageDesc, nStageDesc, sizeof(StageDesc),
+ stageDescCmp);
+ if (pDesc)
+ pSource = (*pDesc->pFactory)(&bsonElement, pCtx);
+ else {
+                    ostringstream sb;
+                    sb <<
+                        "Pipeline::parseCommand(): unrecognized pipeline op \"" <<
+                        pFieldName << "\"";
+ errmsg = sb.str();
+ return shared_ptr<Pipeline>();
+ }
+ }
+
+ pSourceVector->push_back(pSource);
+ }
+
+ /* if there aren't any pipeline stages, there's nothing more to do */
+ if (!pSourceVector->size())
+ return pPipeline;
+
+ /*
+ Move filters up where possible.
+
+          CW TODO -- move filters past projections where possible, noting
+          the corresponding field renamings.
+ */
+
+ /*
+ Wherever there is a match immediately following a sort, swap them.
+ This means we sort fewer items. Neither changes the documents in
+ the stream, so this transformation shouldn't affect the result.
+
+ We do this first, because then when we coalesce operators below,
+ any adjacent matches will be combined.
+ */
+ for(size_t srcn = pSourceVector->size(), srci = 1;
+ srci < srcn; ++srci) {
+ intrusive_ptr<DocumentSource> &pSource = pSourceVector->at(srci);
+ if (dynamic_cast<DocumentSourceMatch *>(pSource.get())) {
+ intrusive_ptr<DocumentSource> &pPrevious =
+ pSourceVector->at(srci - 1);
+ if (dynamic_cast<DocumentSourceSort *>(pPrevious.get())) {
+ /* swap this item with the previous */
+ intrusive_ptr<DocumentSource> pTemp(pPrevious);
+ pPrevious = pSource;
+ pSource = pTemp;
+ }
+ }
+ }
+
+ /*
+ Coalesce adjacent filters where possible. Two adjacent filters
+ are equivalent to one filter whose predicate is the conjunction of
+ the two original filters' predicates. For now, capture this by
+          giving any DocumentSource the option to absorb its successor; this
+ will also allow adjacent projections to coalesce when possible.
+
+ Run through the DocumentSources, and give each one the opportunity
+ to coalesce with its successor. If successful, remove the
+ successor.
+
+ Move all document sources to a temporary list.
+ */
+ SourceVector tempVector(*pSourceVector);
+ pSourceVector->clear();
+
+ /* move the first one to the final list */
+ pSourceVector->push_back(tempVector[0]);
+
+ /* run through the sources, coalescing them or keeping them */
+ for(size_t tempn = tempVector.size(), tempi = 1;
+ tempi < tempn; ++tempi) {
+ /*
+ If we can't coalesce the source with the last, then move it
+ to the final list, and make it the new last. (If we succeeded,
+ then we're still on the same last, and there's no need to move
+ or do anything with the source -- the destruction of tempVector
+ will take care of the rest.)
+ */
+ intrusive_ptr<DocumentSource> &pLastSource = pSourceVector->back();
+ intrusive_ptr<DocumentSource> &pTemp = tempVector.at(tempi);
+ if (!pLastSource->coalesce(pTemp))
+ pSourceVector->push_back(pTemp);
+ }
+
+ /* optimize the elements in the pipeline */
+ for(SourceVector::iterator iter(pSourceVector->begin()),
+ listEnd(pSourceVector->end()); iter != listEnd; ++iter)
+ (*iter)->optimize();
+
+ return pPipeline;
+ }
+
+ shared_ptr<Pipeline> Pipeline::splitForSharded() {
+        /* create and initialize the shard spec we'll return */
+ shared_ptr<Pipeline> pShardPipeline(new Pipeline(pCtx));
+ pShardPipeline->collectionName = collectionName;
+
+ /* put the source list aside */
+ SourceVector tempVector(sourceVector);
+ sourceVector.clear();
+
+ /*
+ Run through the pipeline, looking for points to split it into
+ shard pipelines, and the rest.
+ */
+ while(!tempVector.empty()) {
+ intrusive_ptr<DocumentSource> &pSource = tempVector.front();
+
+#ifdef MONGODB_SERVER3832 /* see https://jira.mongodb.org/browse/SERVER-3832 */
+ DocumentSourceSort *pSort =
+ dynamic_cast<DocumentSourceSort *>(pSource.get());
+ if (pSort) {
+ /*
+ There's no point in sorting until the result is combined.
+ Therefore, sorts should be done in mongos, and not in
+ the shard at all. Add all the remaining operators to
+ the mongos list and quit.
+
+ TODO: unless the sort key is the shard key.
+ TODO: we could also do a merge sort in mongos in the
+ future, and split here.
+ */
+ for(size_t tempn = tempVector.size(), tempi = 0;
+ tempi < tempn; ++tempi)
+ sourceVector.push_back(tempVector[tempi]);
+ break;
+ }
+#endif
+
+ /* hang on to this in advance, in case it is a group */
+ DocumentSourceGroup *pGroup =
+ dynamic_cast<DocumentSourceGroup *>(pSource.get());
+
+ /* move the source from the tempVector to the shard sourceVector */
+ pShardPipeline->sourceVector.push_back(pSource);
+ tempVector.erase(tempVector.begin());
+
+ /*
+ If we found a group, that's a split point.
+ */
+ if (pGroup) {
+ /* start this pipeline with the group merger */
+ sourceVector.push_back(pGroup->createMerger());
+
+ /* and then add everything that remains and quit */
+ for(size_t tempn = tempVector.size(), tempi = 0;
+ tempi < tempn; ++tempi)
+ sourceVector.push_back(tempVector[tempi]);
+ break;
+ }
+ }
+
+ return pShardPipeline;
+ }
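+
+    /* Illustrative split (a sketch): for a pipeline of
+           [ $match, $group, $sort ]
+       splitForSharded() returns a shard pipeline of [ $match, $group ] and leaves
+       [ <merger from DocumentSourceGroup::createMerger()>, $sort ] in this
+       pipeline, to be run in mongos over the combined shard results. */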
+
+ void Pipeline::getCursorMods(BSONObjBuilder *pQueryBuilder,
+ BSONObjBuilder *pSortBuilder) {
+ /* look for an initial $match */
+ if (!sourceVector.size())
+ return;
+ const intrusive_ptr<DocumentSource> &pMC = sourceVector.front();
+ const DocumentSourceMatch *pMatch =
+ dynamic_cast<DocumentSourceMatch *>(pMC.get());
+
+ if (pMatch) {
+ /* build the query */
+ pMatch->toMatcherBson(pQueryBuilder);
+
+ /* remove the match from the pipeline */
+ sourceVector.erase(sourceVector.begin());
+ }
+
+ /* look for an initial $sort */
+ if (!sourceVector.size())
+ return;
+#ifdef MONGODB_SERVER3832 /* see https://jira.mongodb.org/browse/SERVER-3832 */
+ const intrusive_ptr<DocumentSource> &pSC = sourceVector.front();
+ const DocumentSourceSort *pSort =
+ dynamic_cast<DocumentSourceSort *>(pSC.get());
+
+ if (pSort) {
+ /* build the sort key */
+ pSort->sortKeyToBson(pSortBuilder, false);
+
+ /* remove the sort from the pipeline */
+ sourceVector.erase(sourceVector.begin());
+ }
+#endif
+ }
+
+ void Pipeline::toBson(BSONObjBuilder *pBuilder) const {
+ /* create an array out of the pipeline operations */
+ BSONArrayBuilder arrayBuilder;
+ for(SourceVector::const_iterator iter(sourceVector.begin()),
+ listEnd(sourceVector.end()); iter != listEnd; ++iter) {
+ intrusive_ptr<DocumentSource> pSource(*iter);
+ pSource->addToBsonArray(&arrayBuilder);
+ }
+
+ /* add the top-level items to the command */
+ pBuilder->append(commandName, getCollectionName());
+ pBuilder->append(pipelineName, arrayBuilder.arr());
+
+ bool btemp;
+ if ((btemp = getSplitMongodPipeline())) {
+ pBuilder->append(splitMongodPipelineName, btemp);
+ }
+ if ((btemp = pCtx->getInRouter())) {
+ pBuilder->append(fromRouterName, btemp);
+ }
+ }
+
+ bool Pipeline::run(BSONObjBuilder &result, string &errmsg,
+ intrusive_ptr<DocumentSource> pSource) {
+ /* chain together the sources we found */
+ for(SourceVector::iterator iter(sourceVector.begin()),
+ listEnd(sourceVector.end()); iter != listEnd; ++iter) {
+ intrusive_ptr<DocumentSource> pTemp(*iter);
+ pTemp->setSource(pSource);
+ pSource = pTemp;
+ }
+ /* pSource is left pointing at the last source in the chain */
+
+ /*
+ Iterate through the resulting documents, and add them to the result.
+ */
+ BSONArrayBuilder resultArray; // where we'll stash the results
+ for(bool hasDocument = !pSource->eof(); hasDocument;
+ hasDocument = pSource->advance()) {
+ boost::intrusive_ptr<Document> pDocument(pSource->getCurrent());
+
+ /* add the document to the result set */
+ BSONObjBuilder documentBuilder;
+ pDocument->toBson(&documentBuilder);
+ resultArray.append(documentBuilder.done());
+ }
+
+ result.appendArray("result", resultArray.arr());
+
+ return true;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/commands/pipeline.h b/src/mongo/db/commands/pipeline.h
new file mode 100755
index 00000000000..ef9cc6afe51
--- /dev/null
+++ b/src/mongo/db/commands/pipeline.h
@@ -0,0 +1,183 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include "db/jsobj.h"
+#include "util/timer.h"
+#include "db/commands.h"
+
+namespace mongo {
+ class BSONObj;
+ class BSONObjBuilder;
+ class DocumentSource;
+ class DocumentSourceProject;
+ class Expression;
+ class ExpressionContext;
+ class ExpressionNary;
+ struct OpDesc; // local private struct
+
+    /** a pipeline of document sources; the execution plan built for an
+        aggregation request
+        */
+ class Pipeline :
+ boost::noncopyable {
+ public:
+ virtual ~Pipeline();
+
+ /*
+ Create a pipeline from the command.
+
+ @param errmsg where to write errors, if there are any
+ @param cmdObj the command object sent from the client
+ @returns the pipeline, if created, otherwise a NULL reference
+ */
+ static boost::shared_ptr<Pipeline> parseCommand(
+ string &errmsg, BSONObj &cmdObj,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Get the collection name from the command.
+
+ @returns the collection name
+ */
+ string getCollectionName() const;
+
+ /*
+ Split the current Pipeline into a Pipeline for each shard, and
+ a Pipeline that combines the results within mongos.
+
+ This permanently alters this pipeline for the merging operation.
+
+ @returns the Spec for the pipeline command that should be sent
+ to the shards
+ */
+ boost::shared_ptr<Pipeline> splitForSharded();
+
+        /*
+          Get Cursor creation modifiers.
+
+          If the pipeline begins with a $match and/or a $sort, these can be
+          extracted and used to modify the cursor we'll use for the initial
+          collection scan.
+
+          Optimization steps in parseCommand() guarantee that in any adjacent
+          match/sort pair the match comes first; this minimizes the number of
+          items sorted without changing the result.  getCursorMods() therefore
+          looks for an initial $match and, if present, adds its terms to the
+          object under construction in pQueryBuilder.  The next stage (or the
+          first, if there was no match) is then checked for a $sort, whose
+          terms are added to pSortBuilder.  A stage that is not present adds
+          nothing to its builder.
+
+          As a side effect, retrieving the Cursor modifications removes them
+          from the pipeline.
+
+          @param pQueryBuilder an initialized object builder
+          @param pSortBuilder an initialized object builder
+        */
+ void getCursorMods(BSONObjBuilder *pQueryBuilder,
+ BSONObjBuilder *pSortBuilder);
+
+ /*
+ Write the Pipeline as a BSONObj command. This should be the
+ inverse of parseCommand().
+
+ This is only intended to be used by the shard command obtained
+ from splitForSharded(). Some pipeline operations in the merge
+ process do not have equivalent command forms, and using this on
+ the mongos Pipeline will cause assertions.
+
+          @param pBuilder the builder to write the command to
+ */
+ void toBson(BSONObjBuilder *pBuilder) const;
+
+ /*
+ Run the Pipeline on the given source.
+
+ @param result builder to write the result to
+ @param errmsg place to put error messages, if any
+ @param pSource the document source to use at the head of the chain
+ @returns true on success, false if an error occurs
+ */
+ bool run(BSONObjBuilder &result, string &errmsg,
+ intrusive_ptr<DocumentSource> pSource);
+
+ /*
+ Debugging: should the processing pipeline be split within
+ mongod, simulating the real mongos/mongod split? This is determined
+ by setting the splitMongodPipeline field in an "aggregate"
+ command.
+
+ The split itself is handled by the caller, which is currently
+ pipeline_command.cpp.
+
+ @returns true if the pipeline is to be split
+ */
+ bool getSplitMongodPipeline() const;
+
+ /*
+ The aggregation command name.
+ */
+ static const char commandName[];
+
+ private:
+ static const char pipelineName[];
+ static const char fromRouterName[];
+ static const char splitMongodPipelineName[];
+
+ Pipeline(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ string collectionName;
+ typedef vector<intrusive_ptr<DocumentSource> > SourceVector;
+ SourceVector sourceVector;
+
+ bool splitMongodPipeline;
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+} // namespace mongo
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline string Pipeline::getCollectionName() const {
+ return collectionName;
+ }
+
+ inline bool Pipeline::getSplitMongodPipeline() const {
+ if (!DEBUG_BUILD)
+ return false;
+
+ return splitMongodPipeline;
+ }
+
+} // namespace mongo
+
+
diff --git a/src/mongo/db/commands/pipeline_command.cpp b/src/mongo/db/commands/pipeline_command.cpp
new file mode 100755
index 00000000000..9863e14556c
--- /dev/null
+++ b/src/mongo/db/commands/pipeline_command.cpp
@@ -0,0 +1,187 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/commands/pipeline.h"
+#include "db/cursor.h"
+#include "db/pdfile.h"
+#include "db/pipeline/accumulator.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/document_source.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/queryoptimizer.h"
+
+namespace mongo {
+
+ /** mongodb "commands" (sent via db.$cmd.findOne(...))
+ subclass to make a command. define a singleton object for it.
+ */
+ class PipelineCommand :
+ public Command {
+ public:
+ // virtuals from Command
+ virtual ~PipelineCommand();
+ virtual bool run(const string &db, BSONObj &cmdObj, int options,
+ string &errmsg, BSONObjBuilder &result, bool fromRepl);
+ virtual LockType locktype() const;
+ virtual bool slaveOk() const;
+ virtual void help(stringstream &help) const;
+
+ PipelineCommand();
+ };
+
+ // self-registering singleton static instance
+ static PipelineCommand pipelineCommand;
+
+ PipelineCommand::PipelineCommand():
+ Command(Pipeline::commandName) {
+ }
+
+ Command::LockType PipelineCommand::locktype() const {
+ return READ;
+ }
+
+ bool PipelineCommand::slaveOk() const {
+ return true;
+ }
+
+ void PipelineCommand::help(stringstream &help) const {
+ help << "{ pipeline : [ { <data-pipe-op>: {...}}, ... ] }";
+ }
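+
+    /* Illustrative invocation from the shell (a sketch; stage spellings come from
+       the stageDesc table in pipeline.cpp, and the collection and field names are
+       hypothetical):
+
+           db.runCommand( { aggregate: "articles",
+                            pipeline: [ { $match: { author: "dave" } },
+                                        { $group: { _id: "$author",
+                                                    docsByAuthor: { $sum: 1 } } } ] } );
+    */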
+
+ PipelineCommand::~PipelineCommand() {
+ }
+
+ bool PipelineCommand::run(const string &db, BSONObj &cmdObj,
+ int options, string &errmsg,
+ BSONObjBuilder &result, bool fromRepl) {
+
+ intrusive_ptr<ExpressionContext> pCtx(ExpressionContext::create());
+
+ /* try to parse the command; if this fails, then we didn't run */
+ boost::shared_ptr<Pipeline> pPipeline(
+ Pipeline::parseCommand(errmsg, cmdObj, pCtx));
+ if (!pPipeline.get())
+ return false;
+
+ /* get a query to use, if any */
+ BSONObjBuilder queryBuilder;
+ BSONObjBuilder sortBuilder;
+ pPipeline->getCursorMods(&queryBuilder, &sortBuilder);
+ BSONObj query(queryBuilder.done());
+ BSONObj sort(sortBuilder.done());
+
+ /* for debugging purposes, show what the query and sort are */
+ DEV {
+ (log() << "\n---- query BSON\n" <<
+ query.jsonString(Strict, 1) << "\n----\n").flush();
+ (log() << "\n---- sort BSON\n" <<
+ sort.jsonString(Strict, 1) << "\n----\n").flush();
+ }
+
+ /* create a cursor for that query */
+ string fullName(db + "." + pPipeline->getCollectionName());
+ shared_ptr<Cursor> pCursor(
+ NamespaceDetailsTransient::getCursor(
+ fullName.c_str(), query
+#ifdef MONGODB_SERVER3832 /* see https://jira.mongodb.org/browse/SERVER-3832 */
+ , sort
+#endif
+ ));
+
+ /* wrap the cursor with a DocumentSource */
+ intrusive_ptr<DocumentSource> pSource(
+ DocumentSourceCursor::create(pCursor));
+
+ /* this is the normal non-debug path */
+ if (!pPipeline->getSplitMongodPipeline())
+ return pPipeline->run(result, errmsg, pSource);
+
+ /* setup as if we're in the router */
+ pCtx->setInRouter(true);
+
+ /*
+ Here, we'll split the pipeline in the same way we would for sharding,
+ for testing purposes.
+
+ Run the shard pipeline first, then feed the results into the remains
+ of the existing pipeline.
+
+ Start by splitting the pipeline.
+ */
+ shared_ptr<Pipeline> pShardSplit(
+ pPipeline->splitForSharded());
+
+ /*
+ Write the split pipeline as we would in order to transmit it to
+ the shard servers.
+ */
+ BSONObjBuilder shardBuilder;
+ pShardSplit->toBson(&shardBuilder);
+ BSONObj shardBson(shardBuilder.done());
+
+ DEV (log() << "\n---- shardBson\n" <<
+ shardBson.jsonString(Strict, 1) << "\n----\n").flush();
+
+ /* for debugging purposes, show what the pipeline now looks like */
+ DEV {
+ BSONObjBuilder pipelineBuilder;
+ pPipeline->toBson(&pipelineBuilder);
+ BSONObj pipelineBson(pipelineBuilder.done());
+ (log() << "\n---- pipelineBson\n" <<
+ pipelineBson.jsonString(Strict, 1) << "\n----\n").flush();
+ }
+
+ /* on the shard servers, create the local pipeline */
+ intrusive_ptr<ExpressionContext> pShardCtx(ExpressionContext::create());
+ shared_ptr<Pipeline> pShardPipeline(
+ Pipeline::parseCommand(errmsg, shardBson, pShardCtx));
+ if (!pShardPipeline.get()) {
+ return false;
+ }
+
+ /* run the shard pipeline */
+ BSONObjBuilder shardResultBuilder;
+ string shardErrmsg;
+ pShardPipeline->run(shardResultBuilder, shardErrmsg, pSource);
+ BSONObj shardResult(shardResultBuilder.done());
+
+ /* pick out the shard result, and prepare to read it */
+ intrusive_ptr<DocumentSourceBsonArray> pShardSource;
+ BSONObjIterator shardIter(shardResult);
+ while(shardIter.more()) {
+ BSONElement shardElement(shardIter.next());
+ const char *pFieldName = shardElement.fieldName();
+
+ if (strcmp(pFieldName, "result") == 0) {
+ pShardSource = DocumentSourceBsonArray::create(&shardElement);
+
+ /*
+ Connect the output of the shard pipeline with the mongos
+ pipeline that will merge the results.
+ */
+ return pPipeline->run(result, errmsg, pShardSource);
+ }
+ }
+
+ /* NOTREACHED */
+ assert(false);
+ return false;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/common.cpp b/src/mongo/db/common.cpp
new file mode 100644
index 00000000000..cd073f8b059
--- /dev/null
+++ b/src/mongo/db/common.cpp
@@ -0,0 +1,73 @@
+/** @file common.cpp
+ Common code for server binaries (mongos, mongod, test).
+    Nothing used by the driver should be here.
+ */
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+//#include "pch.h"
+//#include "concurrency.h"
+#include "jsobjmanipulator.h"
+
+/**
+ * this just has globals
+ */
+namespace mongo {
+
+ /** called by mongos, mongod, test. do not call from clients and such.
+        invoked before just about everything except global variable construction.
+ */
+ void doPreServerStartupInits() {
+#if defined(RLIMIT_NPROC) && defined(RLIMIT_NOFILE)
+        // Check that the open-files rlimit is > 1000, and the process rlimit is at least half the files rlimit
+ const unsigned int minNumFiles = 1000;
+ const double filesToProcsRatio = 2.0;
+ struct rlimit rlnproc;
+ struct rlimit rlnofile;
+
+ if(!getrlimit(RLIMIT_NPROC,&rlnproc) && !getrlimit(RLIMIT_NOFILE,&rlnofile)){
+ if(rlnofile.rlim_cur < minNumFiles){
+ log() << "Warning: soft rlimits too low. Number of files is " << rlnofile.rlim_cur << ", should be at least " << minNumFiles << endl;
+ }
+ if(rlnproc.rlim_cur < rlnofile.rlim_cur/filesToProcsRatio){
+ log() << "Warning: soft rlimits too low. " << rlnproc.rlim_cur << " processes, " << rlnofile.rlim_cur << " files. Number of processes should be at least "<< 1/filesToProcsRatio << " times number of files." << endl;
+ }
+ }
+ else{
+ log() << "Warning: getrlimit failed" << endl;
+ }
+#endif
+ }
+
+ NOINLINE_DECL OpTime OpTime::skewed() {
+ bool toLog = false;
+ ONCE toLog = true;
+ RARELY toLog = true;
+ last.i++;
+ if ( last.i & 0x80000000 )
+ toLog = true;
+ if ( toLog ) {
+ log() << "clock skew detected prev: " << last.secs << " now: " << (unsigned) time(0) << endl;
+ }
+ if ( last.i & 0x80000000 ) {
+ log() << "error large clock skew detected, shutting down" << endl;
+ throw ClockSkewException();
+ }
+ return last;
+ }
+
+}
diff --git a/src/mongo/db/compact.cpp b/src/mongo/db/compact.cpp
new file mode 100644
index 00000000000..32931b6c5fd
--- /dev/null
+++ b/src/mongo/db/compact.cpp
@@ -0,0 +1,376 @@
+/** @file compact.cpp
+ compaction of deleted space in pdfiles (datafiles)
+*/
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "pdfile.h"
+#include "concurrency.h"
+#include "commands.h"
+#include "curop-inl.h"
+#include "background.h"
+#include "extsort.h"
+#include "compact.h"
+#include "../util/concurrency/task.h"
+#include "../util/timer.h"
+
+namespace mongo {
+
+ char faux;
+
+ void addRecordToRecListInExtent(Record *r, DiskLoc loc);
+ DiskLoc allocateSpaceForANewRecord(const char *ns, NamespaceDetails *d, int lenWHdr, bool god);
+ void freeExtents(DiskLoc firstExt, DiskLoc lastExt);
+
+    /* this should be done in record allocation, not here, but doing it here for now.
+ really dumb; it's a start.
+ */
+ unsigned quantizeMask(unsigned x) {
+ if( x > 4096 * 20 )
+ return ~4095;
+ if( x >= 512 )
+ return ~63;
+ return ~0;
+ }
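+
+    /* Worked examples of the rounding produced when the mask is applied as
+       len & quantizeMask(len):
+           len = 300    -> mask ~0    -> 300    (no rounding below 512 bytes)
+           len = 1000   -> mask ~63   -> 960    (rounded down to a 64-byte boundary)
+           len = 100000 -> mask ~4095 -> 98304  (rounded down to a 4KB boundary)
+    */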
+
+ /** @return number of skipped (invalid) documents */
+ unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc ext, int n,
+ const scoped_array<IndexSpec> &indexSpecs,
+ scoped_array<SortPhaseOne>& phase1, int nidx, bool validate,
+ double pf, int pb)
+ {
+ log() << "compact extent #" << n << endl;
+ unsigned oldObjSize = 0; // we'll report what the old padding was
+ unsigned oldObjSizeWithPadding = 0;
+
+ Extent *e = ext.ext();
+ e->assertOk();
+ assert( e->validates() );
+ unsigned skipped = 0;
+
+ {
+ // the next/prev pointers within the extent might not be in order so we first page the whole thing in
+ // sequentially
+ log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
+ Timer t;
+ MAdvise adv(e, e->length, MAdvise::Sequential);
+ const char *p = (const char *) e;
+ for( int i = 0; i < e->length; i += 4096 ) {
+ faux += p[i];
+ }
+ int ms = t.millis();
+ if( ms > 1000 )
+ log() << "compact end paging in " << ms << "ms " << e->length/1000000.0/ms << "MB/sec" << endl;
+ }
+
+ {
+ log() << "compact copying records" << endl;
+ unsigned totalSize = 0;
+ int nrecs = 0;
+ DiskLoc L = e->firstRecord;
+ if( !L.isNull() ) {
+ while( 1 ) {
+ Record *recOld = L.rec();
+ L = recOld->nextInExtent(L);
+ nrecs++;
+ BSONObj objOld(recOld);
+
+ if( !validate || objOld.valid() ) {
+ unsigned sz = objOld.objsize();
+
+ oldObjSize += sz;
+ oldObjSizeWithPadding += recOld->netLength();
+
+ unsigned lenWHdr = sz + Record::HeaderSize;
+ unsigned lenWPadding = lenWHdr;
+ {
+ lenWPadding = static_cast<unsigned>(pf*lenWPadding);
+ lenWPadding += pb;
+ lenWPadding = lenWPadding & quantizeMask(lenWPadding);
+ if( lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) {
+ lenWPadding = lenWHdr;
+ }
+ }
+ totalSize += lenWPadding;
+ DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false);
+ uassert(14024, "compact error out of space during compaction", !loc.isNull());
+ Record *recNew = loc.rec();
+ recNew = (Record *) getDur().writingPtr(recNew, lenWHdr);
+ addRecordToRecListInExtent(recNew, loc);
+ memcpy(recNew->data, objOld.objdata(), sz);
+
+ {
+ // extract keys for all indexes we will be rebuilding
+ for( int x = 0; x < nidx; x++ ) {
+ phase1[x].addKeys(indexSpecs[x], objOld, loc);
+ }
+ }
+ }
+ else {
+ if( ++skipped <= 10 )
+ log() << "compact skipping invalid object" << endl;
+ }
+
+ if( L.isNull() ) {
+ // we just did the very last record from the old extent. it's still pointed to
+ // by the old extent ext, but that will be fixed below after this loop
+ break;
+ }
+
+ // remove the old records (orphan them) periodically so our commit block doesn't get too large
+ bool stopping = false;
+ RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
+ if( stopping || getDur().aCommitIsNeeded() ) {
+ e->firstRecord.writing() = L;
+ Record *r = L.rec();
+ getDur().writingInt(r->prevOfs) = DiskLoc::NullOfs;
+ getDur().commitIfNeeded();
+ killCurrentOp.checkForInterrupt(false);
+ }
+ }
+ } // if !L.isNull()
+
+ assert( d->firstExtent == ext );
+ assert( d->lastExtent != ext );
+ DiskLoc newFirst = e->xnext;
+ d->firstExtent.writing() = newFirst;
+ newFirst.ext()->xprev.writing().Null();
+ getDur().writing(e)->markEmpty();
+ freeExtents(ext,ext);
+ getDur().commitIfNeeded();
+
+ {
+ double op = 1.0;
+ if( oldObjSize )
+ op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
+ log() << "compact " << nrecs << " documents " << totalSize/1000000.0 << "MB"
+ << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100
+ << endl;
+ }
+ }
+
+ return skipped;
+ }
+
+ extern SortPhaseOne *precalced;
+
+ bool _compact(const char *ns, NamespaceDetails *d, string& errmsg, bool validate, BSONObjBuilder& result, double pf, int pb) {
+ //int les = d->lastExtentSize;
+
+ // this is a big job, so might as well make things tidy before we start just to be nice.
+ getDur().commitNow();
+
+ list<DiskLoc> extents;
+ for( DiskLoc L = d->firstExtent; !L.isNull(); L = L.ext()->xnext )
+ extents.push_back(L);
+ log() << "compact " << extents.size() << " extents" << endl;
+
+ ProgressMeterHolder pm( cc().curop()->setMessage( "compact extent" , extents.size() ) );
+
+        // same data, but might perform a little differently after compact?
+ NamespaceDetailsTransient::get(ns).clearQueryCache();
+
+ int nidx = d->nIndexes;
+ scoped_array<IndexSpec> indexSpecs( new IndexSpec[nidx] );
+ scoped_array<SortPhaseOne> phase1( new SortPhaseOne[nidx] );
+ {
+ NamespaceDetails::IndexIterator ii = d->ii();
+ int x = 0;
+ while( ii.more() ) {
+ BSONObjBuilder b;
+ IndexDetails& idx = ii.next();
+ BSONObj::iterator i(idx.info.obj());
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if( !str::equals(e.fieldName(), "v") && !str::equals(e.fieldName(), "background") ) {
+ b.append(e);
+ }
+ }
+ BSONObj o = b.obj().getOwned();
+ phase1[x].sorter.reset( new BSONObjExternalSorter( idx.idxInterface(), o.getObjectField("key") ) );
+ phase1[x].sorter->hintNumObjects( d->stats.nrecords );
+ indexSpecs[x++].reset(o);
+ }
+ }
+
+ log() << "compact orphan deleted lists" << endl;
+ for( int i = 0; i < Buckets; i++ ) {
+ d->deletedList[i].writing().Null();
+ }
+
+
+
+ // Start over from scratch with our extent sizing and growth
+ d->lastExtentSize=0;
+
+ // before dropping indexes, at least make sure we can allocate one extent!
+ uassert(14025, "compact error no space available to allocate", !allocateSpaceForANewRecord(ns, d, Record::HeaderSize+1, false).isNull());
+
+ // note that the drop indexes call also invalidates all clientcursors for the namespace, which is important and wanted here
+ log() << "compact dropping indexes" << endl;
+ BSONObjBuilder b;
+ if( !dropIndexes(d, ns, "*", errmsg, b, true) ) {
+ errmsg = "compact drop indexes failed";
+ log() << errmsg << endl;
+ return false;
+ }
+
+ getDur().commitNow();
+
+ long long skipped = 0;
+ int n = 0;
+ for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) {
+ skipped += compactExtent(ns, d, *i, n++, indexSpecs, phase1, nidx, validate, pf, pb);
+ pm.hit();
+ }
+
+ if( skipped ) {
+ result.append("invalidObjects", skipped);
+ }
+
+ assert( d->firstExtent.ext()->xprev.isNull() );
+
+ // indexes will do their own progress meter?
+ pm.finished();
+
+ // build indexes
+ NamespaceString s(ns);
+ string si = s.db + ".system.indexes";
+ for( int i = 0; i < nidx; i++ ) {
+ killCurrentOp.checkForInterrupt(false);
+ BSONObj info = indexSpecs[i].info;
+ log() << "compact create index " << info["key"].Obj().toString() << endl;
+ try {
+ precalced = &phase1[i];
+ theDataFileMgr.insert(si.c_str(), info.objdata(), info.objsize());
+ }
+ catch(...) {
+ precalced = 0;
+ throw;
+ }
+ precalced = 0;
+ }
+
+ return true;
+ }
+
+ bool compact(const string& ns, string &errmsg, bool validate, BSONObjBuilder& result, double pf, int pb) {
+ massert( 14028, "bad ns", NamespaceString::normal(ns.c_str()) );
+ massert( 14027, "can't compact a system namespace", !str::contains(ns, ".system.") ); // items in system.indexes cannot be moved there are pointers to those disklocs in NamespaceDetails
+
+ bool ok;
+ {
+ writelock lk;
+ BackgroundOperation::assertNoBgOpInProgForNs(ns.c_str());
+ Client::Context ctx(ns);
+ NamespaceDetails *d = nsdetails(ns.c_str());
+ massert( 13660, str::stream() << "namespace " << ns << " does not exist", d );
+ massert( 13661, "cannot compact capped collection", !d->capped );
+ log() << "compact " << ns << " begin" << endl;
+ if( pf != 0 || pb != 0 ) {
+ log() << "paddingFactor:" << pf << " paddingBytes:" << pb << endl;
+ }
+ try {
+ ok = _compact(ns.c_str(), d, errmsg, validate, result, pf, pb);
+ }
+ catch(...) {
+ log() << "compact " << ns << " end (with error)" << endl;
+ throw;
+ }
+ log() << "compact " << ns << " end" << endl;
+ }
+ return ok;
+ }
+
+ bool isCurrentlyAReplSetPrimary();
+
+ class CompactCmd : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool adminOnly() const { return false; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool maintenanceMode() const { return true; }
+ virtual bool logTheOp() { return false; }
+ virtual void help( stringstream& help ) const {
+            help << "compact collection\n"
+                "warning: this operation blocks the server and is slow. you can cancel with killOp()\n"
+                "{ compact : <collection_name>, [force:true], [validate:true] }\n"
+                "  force - allows running on a replica set primary\n"
+                "  validate - check that records are not corrupt before adding them to the newly compacted extents. slower but safer (default is true in this version)\n";
+ }
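+
+        /* Illustrative invocation from the shell (a sketch; the collection name is
+           hypothetical, and the padding arguments must satisfy the range checks in
+           run() below):
+
+               db.runCommand( { compact: "foo", paddingFactor: 1.1, paddingBytes: 128 } );
+        */
+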
+ virtual bool requiresAuth() { return true; }
+ CompactCmd() : Command("compact") { }
+
+ virtual bool run(const string& db, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string coll = cmdObj.firstElement().valuestr();
+ if( coll.empty() || db.empty() ) {
+ errmsg = "no collection name specified";
+ return false;
+ }
+
+ if( isCurrentlyAReplSetPrimary() && !cmdObj["force"].trueValue() ) {
+ errmsg = "will not run compact on an active replica set primary as this is a slow blocking operation. use force:true to force";
+ return false;
+ }
+
+ string ns = db + '.' + coll;
+ if ( ! NamespaceString::normal(ns.c_str()) ) {
+ errmsg = "bad namespace name";
+ return false;
+ }
+
+ // parameter validation to avoid triggering assertions in compact()
+ if ( str::contains(ns, ".system.") ) {
+ errmsg = "can't compact a system namespace";
+ return false;
+ }
+
+ {
+ writelock lk;
+ Client::Context ctx(ns);
+ NamespaceDetails *d = nsdetails(ns.c_str());
+ if( ! d ) {
+ errmsg = "namespace does not exist";
+ return false;
+ }
+
+ if ( d->capped ) {
+ errmsg = "cannot compact a capped collection";
+ return false;
+ }
+ }
+
+ double pf = 1.0;
+ int pb = 0;
+ if( cmdObj.hasElement("paddingFactor") ) {
+ pf = cmdObj["paddingFactor"].Number();
+ assert( pf >= 1.0 && pf <= 4.0 );
+ }
+ if( cmdObj.hasElement("paddingBytes") ) {
+ pb = (int) cmdObj["paddingBytes"].Number();
+ assert( pb >= 0 && pb <= 1024 * 1024 );
+ }
+
+ bool validate = !cmdObj.hasElement("validate") || cmdObj["validate"].trueValue(); // default is true at the moment
+ bool ok = compact(ns, errmsg, validate, result, pf, pb);
+ return ok;
+ }
+ };
+ static CompactCmd compactCmd;
+
+}
diff --git a/src/mongo/db/compact.h b/src/mongo/db/compact.h
new file mode 100644
index 00000000000..7bf49c8e1b8
--- /dev/null
+++ b/src/mongo/db/compact.h
@@ -0,0 +1,50 @@
+// compact.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+    /** for bottom-up fast index build (where we presort keys) */
+ struct SortPhaseOne {
+ SortPhaseOne() {
+ n = 0;
+ nkeys = 0;
+ multi = false;
+ }
+ shared_ptr<BSONObjExternalSorter> sorter;
+ unsigned long long n; // # of records
+ unsigned long long nkeys;
+ bool multi; // multikey index
+
+ void addKeys(const IndexSpec& spec, const BSONObj& o, DiskLoc loc) {
+ BSONObjSet keys;
+ spec.getKeys(o, keys);
+ int k = 0;
+ for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) {
+ if( ++k == 2 ) {
+ multi = true;
+ }
+ sorter->add(*i, loc);
+ nkeys++;
+ }
+ n++;
+ }
+ };
+
+}
diff --git a/src/mongo/db/concurrency.h b/src/mongo/db/concurrency.h
new file mode 100644
index 00000000000..33bc0caac77
--- /dev/null
+++ b/src/mongo/db/concurrency.h
@@ -0,0 +1,21 @@
+// @file concurrency.h
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "mongomutex.h"
diff --git a/src/mongo/db/curop-inl.h b/src/mongo/db/curop-inl.h
new file mode 100644
index 00000000000..7dd678b185d
--- /dev/null
+++ b/src/mongo/db/curop-inl.h
@@ -0,0 +1 @@
+#include "curop.h"
diff --git a/src/mongo/db/curop.cpp b/src/mongo/db/curop.cpp
new file mode 100644
index 00000000000..3cc452b46cc
--- /dev/null
+++ b/src/mongo/db/curop.cpp
@@ -0,0 +1,173 @@
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "curop.h"
+#include "database.h"
+
+namespace mongo {
+
+ // todo : move more here
+
+ CurOp::CurOp( Client * client , CurOp * wrapped ) :
+ _client(client),
+ _wrapped(wrapped)
+ {
+ if ( _wrapped )
+ _client->_curOp = this;
+ _start = _checkpoint = 0;
+ _active = false;
+ _reset();
+ _op = 0;
+ // These addresses should never be written to again. The zeroes are
+ // placed here as a precaution because currentOp may be accessed
+ // without the db mutex.
+ memset(_ns, 0, sizeof(_ns));
+ }
+
+ void CurOp::_reset() {
+ _command = false;
+ _lockType = 0;
+ _dbprofile = 0;
+ _end = 0;
+ _waitingForLock = false;
+ _message = "";
+ _progressMeter.finished();
+ _killed = false;
+ _numYields = 0;
+ }
+
+ void CurOp::reset() {
+ _reset();
+ _start = _checkpoint = 0;
+ _opNum = _nextOpNum++;
+ _ns[0] = 0;
+ _debug.reset();
+ _query.reset();
+ _active = true; // this should be last for ui clarity
+ }
+
+ void CurOp::reset( const HostAndPort& remote, int op ) {
+ reset();
+ if( _remote != remote ) {
+ // todo : _remote is not thread safe yet is used as such!
+ _remote = remote;
+ }
+ _op = op;
+ }
+
+ ProgressMeter& CurOp::setMessage( const char * msg , unsigned long long progressMeterTotal , int secondsBetween ) {
+ if ( progressMeterTotal ) {
+ if ( _progressMeter.isActive() ) {
+ cout << "about to assert, old _message: " << _message << " new message:" << msg << endl;
+ assert( ! _progressMeter.isActive() );
+ }
+ _progressMeter.reset( progressMeterTotal , secondsBetween );
+ }
+ else {
+ _progressMeter.finished();
+ }
+ _message = msg;
+ return _progressMeter;
+ }
+
+
+ BSONObj CurOp::info() {
+ if( ! cc().getAuthenticationInfo()->isAuthorized("admin") ) {
+ BSONObjBuilder b;
+ b.append("err", "unauthorized");
+ return b.obj();
+ }
+ return infoNoauth();
+ }
+
+ CurOp::~CurOp() {
+ if ( _wrapped ) {
+ scoped_lock bl(Client::clientsMutex);
+ _client->_curOp = _wrapped;
+ }
+ _client = 0;
+ }
+
+ void CurOp::enter( Client::Context * context ) {
+ ensureStarted();
+ setNS( context->ns() );
+ _dbprofile = context->_db ? context->_db->profile : 0;
+ }
+
+ void CurOp::leave( Client::Context * context ) {
+ unsigned long long now = curTimeMicros64();
+ Top::global.record( _ns , _op , _lockType , now - _checkpoint , _command );
+ _checkpoint = now;
+ }
+
+ BSONObj CurOp::infoNoauth() {
+ BSONObjBuilder b;
+ b.append("opid", _opNum);
+ bool a = _active && _start;
+ b.append("active", a);
+ if ( _lockType )
+ b.append("lockType" , _lockType > 0 ? "write" : "read" );
+ b.append("waitingForLock" , _waitingForLock );
+
+ if( a ) {
+ b.append("secs_running", elapsedSeconds() );
+ }
+
+ b.append( "op" , opToString( _op ) );
+
+ b.append("ns", _ns);
+
+ _query.append( b , "query" );
+
+ if( !_remote.empty() ) {
+ b.append("client", _remote.toString());
+ }
+
+ if ( _client ) {
+ b.append( "desc" , _client->desc() );
+ if ( _client->_threadId.size() )
+ b.append( "threadId" , _client->_threadId );
+ if ( _client->_connectionId )
+ b.appendNumber( "connectionId" , _client->_connectionId );
+ }
+
+ if ( ! _message.empty() ) {
+ if ( _progressMeter.isActive() ) {
+ StringBuilder buf(128);
+ buf << _message.toString() << " " << _progressMeter.toString();
+ b.append( "msg" , buf.str() );
+ BSONObjBuilder sub( b.subobjStart( "progress" ) );
+ sub.appendNumber( "done" , (long long)_progressMeter.done() );
+ sub.appendNumber( "total" , (long long)_progressMeter.total() );
+ sub.done();
+ }
+ else {
+ b.append( "msg" , _message.toString() );
+ }
+ }
+
+ if( killed() )
+ b.append("killed", true);
+
+ b.append( "numYields" , _numYields );
+
+ return b.obj();
+ }
+
+ AtomicUInt CurOp::_nextOpNum;
+
+}
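
For reference, the document assembled by infoNoauth() above is what a currentOp listing surfaces per operation. All values below are illustrative only:

    { opid: 12345, active: true, lockType: "write", waitingForLock: false,
      secs_running: 3, op: "update", ns: "test.people",
      query: { name: "eliot" }, client: "127.0.0.1:54321",
      desc: "conn", threadId: "0x2b6...", connectionId: 8,
      msg: "step 1 of 3 45/100 45%",
      progress: { done: 45, total: 100 }, numYields: 2 }
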
diff --git a/src/mongo/db/curop.h b/src/mongo/db/curop.h
new file mode 100644
index 00000000000..192404d8796
--- /dev/null
+++ b/src/mongo/db/curop.h
@@ -0,0 +1,313 @@
+// @file curop.h
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+#include "namespace-inl.h"
+#include "client.h"
+#include "../bson/util/atomic_int.h"
+#include "../util/concurrency/spin_lock.h"
+#include "../util/time_support.h"
+#include "../util/net/hostandport.h"
+
+namespace mongo {
+
+ class CurOp;
+
+ /* lifespan differs from CurOp's because of recursive calls via DBDirectClient */
+ class OpDebug {
+ public:
+ OpDebug() : ns(""){ reset(); }
+
+ void reset();
+
+ string toString() const;
+ void append( const CurOp& curop, BSONObjBuilder& b ) const;
+
+ // -------------------
+
+ StringBuilder extra; // weird things we need to fix later
+
+ // basic options
+ int op;
+ bool iscommand;
+ Namespace ns;
+ BSONObj query;
+ BSONObj updateobj;
+
+ // detailed options
+ long long cursorid;
+ int ntoreturn;
+ int ntoskip;
+ bool exhaust;
+
+ // debugging/profile info
+ int nscanned;
+ bool idhack; // indicates short-circuited code path on an update to make the update faster
+ bool scanAndOrder; // scanandorder query plan aspect was used
+ bool moved; // update resulted in a move (moves are expensive)
+ bool fastmod;
+ bool fastmodinsert; // upsert using a $ operator; builds a default object
+ bool upsert; // true if the update actually did an insert
+ int keyUpdates;
+
+ // error handling
+ ExceptionInfo exceptionInfo;
+
+ // response info
+ int executionTime;
+ int nreturned;
+ int responseLength;
+ };
+
+ /**
+ * stores a copy of a bson obj in a fixed size buffer
+ * if it's too big for the buffer, says "too big"
+ * useful for keeping a copy around indefinitely without wasting a lot of space or doing malloc
+ */
+ class CachedBSONObj {
+ public:
+ enum { TOO_BIG_SENTINEL = 1 } ;
+ static BSONObj _tooBig; // { $msg : "query not recording (too large)" }
+
+ CachedBSONObj() {
+ _size = (int*)_buf;
+ reset();
+ }
+
+ void reset( int sz = 0 ) {
+ _lock.lock();
+ _reset( sz );
+ _lock.unlock();
+ }
+
+ void set( const BSONObj& o ) {
+ scoped_spinlock lk(_lock);
+ int sz = o.objsize();
+ if ( sz > (int) sizeof(_buf) ) {
+ _reset(TOO_BIG_SENTINEL);
+ }
+ else {
+ memcpy(_buf, o.objdata(), sz );
+ }
+ }
+
+ int size() const { return *_size; }
+ bool have() const { return size() > 0; }
+
+ BSONObj get() const {
+ scoped_spinlock lk(_lock);
+ return _get();
+ }
+
+ void append( BSONObjBuilder& b , const StringData& name ) const {
+ scoped_spinlock lk(_lock);
+ BSONObj temp = _get();
+ b.append( name , temp );
+ }
+
+ private:
+ /** you have to be locked when you call this */
+ BSONObj _get() const {
+ int sz = size();
+ if ( sz == 0 )
+ return BSONObj();
+ if ( sz == TOO_BIG_SENTINEL )
+ return _tooBig;
+ return BSONObj( _buf ).copy();
+ }
+
+ /** you have to be locked when you call this */
+ void _reset( int sz ) { _size[0] = sz; }
+
+ mutable SpinLock _lock;
+ int * _size;
+ char _buf[512];
+ };
+
+ /* Current operation (for the current Client).
+ an embedded member of Client class, and typically used from within the mutex there.
+ */
+ class CurOp : boost::noncopyable {
+ public:
+ CurOp( Client * client , CurOp * wrapped = 0 );
+ ~CurOp();
+
+ bool haveQuery() const { return _query.have(); }
+ BSONObj query() { return _query.get(); }
+ void appendQuery( BSONObjBuilder& b , const StringData& name ) const { _query.append( b , name ); }
+
+ void ensureStarted() {
+ if ( _start == 0 )
+ _start = _checkpoint = curTimeMicros64();
+ }
+ bool isStarted() const { return _start > 0; }
+ void enter( Client::Context * context );
+ void leave( Client::Context * context );
+ void reset();
+ void reset( const HostAndPort& remote, int op );
+ void markCommand() { _command = true; }
+
+ void waitingForLock( int type ) {
+ _waitingForLock = true;
+ if ( type > 0 )
+ _lockType = 1;
+ else
+ _lockType = -1;
+ }
+ void gotLock() { _waitingForLock = false; }
+ OpDebug& debug() { return _debug; }
+ int profileLevel() const { return _dbprofile; }
+ const char * getNS() const { return _ns; }
+
+ bool shouldDBProfile( int ms ) const {
+ if ( _dbprofile <= 0 )
+ return false;
+
+ return _dbprofile >= 2 || ms >= cmdLine.slowMS;
+ }
+
+ AtomicUInt opNum() const { return _opNum; }
+
+ /** if this op is running */
+ bool active() const { return _active; }
+
+ int getLockType() const { return _lockType; }
+ bool isWaitingForLock() const { return _waitingForLock; }
+ int getOp() const { return _op; }
+ unsigned long long startTime() { // micros
+ ensureStarted();
+ return _start;
+ }
+ void done() {
+ _active = false;
+ _end = curTimeMicros64();
+ }
+ unsigned long long totalTimeMicros() {
+ massert( 12601 , "CurOp not marked done yet" , ! _active );
+ return _end - startTime();
+ }
+ int totalTimeMillis() { return (int) (totalTimeMicros() / 1000); }
+ int elapsedMillis() {
+ unsigned long long total = curTimeMicros64() - startTime();
+ return (int) (total / 1000);
+ }
+ int elapsedSeconds() { return elapsedMillis() / 1000; }
+ void setQuery(const BSONObj& query) { _query.set( query ); }
+ Client * getClient() const { return _client; }
+ BSONObj info();
+ BSONObj infoNoauth();
+ string getRemoteString( bool includePort = true ) { return _remote.toString(includePort); }
+ ProgressMeter& setMessage( const char * msg , unsigned long long progressMeterTotal = 0 , int secondsBetween = 3 );
+ string getMessage() const { return _message.toString(); }
+ ProgressMeter& getProgressMeter() { return _progressMeter; }
+ CurOp *parent() const { return _wrapped; }
+ void kill() { _killed = true; }
+ bool killed() const { return _killed; }
+ void yielded() { _numYields++; }
+ void setNS(const char *ns) {
+ strncpy(_ns, ns, Namespace::MaxNsLen);
+ _ns[Namespace::MaxNsLen] = 0;
+ }
+
+ private:
+ friend class Client;
+ void _reset();
+
+ static AtomicUInt _nextOpNum;
+ Client * _client;
+ CurOp * _wrapped;
+ unsigned long long _start;
+ unsigned long long _checkpoint;
+ unsigned long long _end;
+ bool _active;
+ int _op;
+ bool _command;
+ int _lockType; // see concurrency.h for values
+ bool _waitingForLock;
+ int _dbprofile; // 0=off, 1=slow, 2=all
+ AtomicUInt _opNum; // todo: simple being "unsigned" may make more sense here
+ char _ns[Namespace::MaxNsLen+2];
+ HostAndPort _remote; // CAREFUL here with thread safety
+ CachedBSONObj _query; // CachedBSONObj is thread safe
+ OpDebug _debug;
+ ThreadSafeString _message;
+ ProgressMeter _progressMeter;
+ volatile bool _killed;
+ int _numYields;
+ };
+
+ /* _globalKill: we are shutting down
+ otherwise kill attribute set on specified CurOp
+ this class does not handle races between interruptJs and the checkForInterrupt functions - those must be
+ handled by the client of this class
+ */
+ extern class KillCurrentOp {
+ public:
+ void killAll();
+ void kill(AtomicUInt i);
+
+ /** @return true if global interrupt and should terminate the operation */
+ bool globalInterruptCheck() const { return _globalKill; }
+
+ void checkForInterrupt( bool heedMutex = true ) {
+ Client& c = cc();
+ if ( heedMutex && d.dbMutex.isWriteLocked() )
+ return;
+ if( _globalKill )
+ uasserted(11600,"interrupted at shutdown");
+ if( c.curop()->killed() )
+ uasserted(11601,"interrupted");
+ if( c.sometimes(1024) ) {
+ AbstractMessagingPort *p = cc().port();
+ if( p )
+ p->assertStillConnected();
+ }
+ }
+
+ /** @return "" if not interrupted. otherwise, you should stop. */
+ const char *checkForInterruptNoAssert( /*bool heedMutex = true*/ ) {
+ Client& c = cc();
+ // always called with false, so commented out:
+ /*if ( heedMutex && d.dbMutex.isWriteLocked() )
+ return "";*/
+ if( _globalKill )
+ return "interrupted at shutdown";
+ if( c.curop()->killed() )
+ return "interrupted";
+ if( c.sometimes(1024) ) {
+ try {
+ AbstractMessagingPort *p = cc().port();
+ if( p )
+ p->assertStillConnected();
+ }
+ catch(...) {
+ log() << "no longer connected to client";
+ return "no longer connected to client";
+ }
+ }
+ return "";
+ }
+
+ private:
+ void interruptJs( AtomicUInt *op );
+ volatile bool _globalKill;
+ } killCurrentOp;
+
+}
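
CachedBSONObj above relies on BSON's layout: an object's first four bytes are its little-endian total size, so _size aliases the cached object's own length field and set()'s memcpy updates it implicitly. A standalone sketch of that trick (plain C++, no mongo headers):

    #include <cstdio>
    #include <cstring>

    int main() {
        // minimal valid BSON document {}: int32 size (5) + terminating 0
        const char empty[5] = { 5, 0, 0, 0, 0 };  // little-endian assumed

        char buf[512];
        int* size = (int*)buf;   // aliases the cached object's length field
        *size = 0;               // reset(): "nothing cached"

        memcpy(buf, empty, sizeof(empty)); // set(): the copy also writes *size
        printf("cached object size: %d\n", *size); // prints 5
        return 0;
    }
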
diff --git a/src/mongo/db/cursor.cpp b/src/mongo/db/cursor.cpp
new file mode 100644
index 00000000000..ac7afc1532b
--- /dev/null
+++ b/src/mongo/db/cursor.cpp
@@ -0,0 +1,166 @@
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "pdfile.h"
+#include "curop-inl.h"
+
+namespace mongo {
+
+ bool BasicCursor::advance() {
+ killCurrentOp.checkForInterrupt();
+ if ( eof() ) {
+ if ( tailable_ && !last.isNull() ) {
+ curr = s->next( last );
+ }
+ else {
+ return false;
+ }
+ }
+ else {
+ last = curr;
+ curr = s->next( curr );
+ }
+ incNscanned();
+ return ok();
+ }
+
+ /* these will be used outside of mutexes - really functors - thus the const */
+ class Forward : public AdvanceStrategy {
+ virtual DiskLoc next( const DiskLoc &prev ) const {
+ return prev.rec()->getNext( prev );
+ }
+ } _forward;
+
+ class Reverse : public AdvanceStrategy {
+ virtual DiskLoc next( const DiskLoc &prev ) const {
+ return prev.rec()->getPrev( prev );
+ }
+ } _reverse;
+
+ const AdvanceStrategy *forward() {
+ return &_forward;
+ }
+ const AdvanceStrategy *reverse() {
+ return &_reverse;
+ }
+
+ DiskLoc nextLoop( NamespaceDetails *nsd, const DiskLoc &prev ) {
+ assert( nsd->capLooped() );
+ DiskLoc next = forward()->next( prev );
+ if ( !next.isNull() )
+ return next;
+ return nsd->firstRecord();
+ }
+
+ DiskLoc prevLoop( NamespaceDetails *nsd, const DiskLoc &curr ) {
+ assert( nsd->capLooped() );
+ DiskLoc prev = reverse()->next( curr );
+ if ( !prev.isNull() )
+ return prev;
+ return nsd->lastRecord();
+ }
+
+ ForwardCappedCursor::ForwardCappedCursor( NamespaceDetails *_nsd, const DiskLoc &startLoc ) :
+ nsd( _nsd ) {
+ if ( !nsd )
+ return;
+ DiskLoc start = startLoc;
+ if ( start.isNull() ) {
+ if ( !nsd->capLooped() )
+ start = nsd->firstRecord();
+ else {
+ start = nsd->capExtent.ext()->firstRecord;
+ if ( !start.isNull() && start == nsd->capFirstNewRecord ) {
+ start = nsd->capExtent.ext()->lastRecord;
+ start = nextLoop( nsd, start );
+ }
+ }
+ }
+ curr = start;
+ s = this;
+ incNscanned();
+ }
+
+ DiskLoc ForwardCappedCursor::next( const DiskLoc &prev ) const {
+ assert( nsd );
+ if ( !nsd->capLooped() )
+ return forward()->next( prev );
+
+ DiskLoc i = prev;
+ // Last record
+ if ( i == nsd->capExtent.ext()->lastRecord )
+ return DiskLoc();
+ i = nextLoop( nsd, i );
+ // If we become capFirstNewRecord from same extent, advance to next extent.
+ if ( i == nsd->capFirstNewRecord &&
+ i != nsd->capExtent.ext()->firstRecord )
+ i = nextLoop( nsd, nsd->capExtent.ext()->lastRecord );
+ // If we have just gotten to beginning of capExtent, skip to capFirstNewRecord
+ if ( i == nsd->capExtent.ext()->firstRecord )
+ i = nsd->capFirstNewRecord;
+ return i;
+ }
+
+ ReverseCappedCursor::ReverseCappedCursor( NamespaceDetails *_nsd, const DiskLoc &startLoc ) :
+ nsd( _nsd ) {
+ if ( !nsd )
+ return;
+ DiskLoc start = startLoc;
+ if ( start.isNull() ) {
+ if ( !nsd->capLooped() ) {
+ start = nsd->lastRecord();
+ }
+ else {
+ start = nsd->capExtent.ext()->lastRecord;
+ }
+ }
+ curr = start;
+ s = this;
+ incNscanned();
+ }
+
+ DiskLoc ReverseCappedCursor::next( const DiskLoc &prev ) const {
+ assert( nsd );
+ if ( !nsd->capLooped() )
+ return reverse()->next( prev );
+
+ DiskLoc i = prev;
+ // Last record
+ if ( nsd->capFirstNewRecord == nsd->capExtent.ext()->firstRecord ) {
+ if ( i == nextLoop( nsd, nsd->capExtent.ext()->lastRecord ) ) {
+ return DiskLoc();
+ }
+ }
+ else {
+ if ( i == nsd->capExtent.ext()->firstRecord ) {
+ return DiskLoc();
+ }
+ }
+ // If we are capFirstNewRecord, advance to prev extent, otherwise just get prev.
+ if ( i == nsd->capFirstNewRecord )
+ i = prevLoop( nsd, nsd->capExtent.ext()->firstRecord );
+ else
+ i = prevLoop( nsd, i );
+ // If we just became last in cap extent, advance past capFirstNewRecord
+ // (We know capExtent.ext()->firstRecord != capFirstNewRecord, since would
+ // have returned DiskLoc() earlier otherwise.)
+ if ( i == nsd->capExtent.ext()->lastRecord )
+ i = reverse()->next( nsd->capFirstNewRecord );
+
+ return i;
+ }
+} // namespace mongo
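
The wrap-around rules above exist so a forward scan of a "looped" capped extent still visits records in insertion order. A toy ring-buffer model of the visit order (standalone C++, no mongo types):

    #include <cstdio>

    int main() {
        // 6-slot capped "extent"; values are insertion times, and slot
        // `first` plays the role of capFirstNewRecord (oldest survivor).
        int ring[6] = { 106, 107, 102, 103, 104, 105 };
        int first = 2;

        // forward scan: oldest to newest, wrapping at the end of the extent
        for ( int k = 0; k < 6; k++ )
            printf( "%d ", ring[(first + k) % 6] ); // 102 103 104 105 106 107
        printf( "\n" );
        return 0;
    }
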
diff --git a/src/mongo/db/cursor.h b/src/mongo/db/cursor.h
new file mode 100644
index 00000000000..8e9e922733d
--- /dev/null
+++ b/src/mongo/db/cursor.h
@@ -0,0 +1,246 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+
+#include "jsobj.h"
+#include "diskloc.h"
+#include "matcher.h"
+
+namespace mongo {
+
+ class NamespaceDetails;
+ class Record;
+ class CoveredIndexMatcher;
+
+ /* Query cursors, base class. This is for our internal cursors. "ClientCursor" is a separate
+ concept and is for the user's cursor.
+
+ WARNING concurrency: the vfunctions below are called back from within a
+ ClientCursor::ccmutex. Don't cause a deadlock, you've been warned.
+ */
+ class Cursor : boost::noncopyable {
+ public:
+ virtual ~Cursor() {}
+ virtual bool ok() = 0;
+ bool eof() { return !ok(); }
+ virtual Record* _current() = 0;
+ virtual BSONObj current() = 0;
+ virtual DiskLoc currLoc() = 0;
+ virtual bool advance() = 0; /*true=ok*/
+ virtual BSONObj currKey() const { return BSONObj(); }
+
+ // DiskLoc the cursor requires for continued operation. Before this
+ // DiskLoc is deleted, the cursor must be advanced or destroyed.
+ virtual DiskLoc refLoc() = 0;
+
+ /* Implement these if you want the cursor to be "tailable" */
+
+ /* Request that the cursor starts tailing after advancing past last record. */
+ /* The implementation may or may not honor this request. */
+ virtual void setTailable() {}
+ /* indicates if tailing is enabled. */
+ virtual bool tailable() {
+ return false;
+ }
+
+ virtual void aboutToDeleteBucket(const DiskLoc& b) { }
+
+ /* optional to implement. if implemented, means 'this' is a prototype */
+ virtual Cursor* clone() {
+ return 0;
+ }
+
+ virtual BSONObj indexKeyPattern() {
+ return BSONObj();
+ }
+
+ virtual bool supportGetMore() = 0;
+
+ /* called after every query block is iterated -- i.e. between getMore() blocks
+ so you can note where we are, if necessary.
+ */
+ virtual void noteLocation() { }
+
+ /* called before query getmore block is iterated */
+ virtual void checkLocation() { }
+
+ /**
+ * Called before a document pointed at by an earlier iterate of this cursor is to be
+ * modified. It is ok if the current iterate also points to the document to be modified.
+ */
+ virtual void prepareToTouchEarlierIterate() { noteLocation(); }
+
+ /** Recover from a previous call to prepareToTouchEarlierIterate(). */
+ virtual void recoverFromTouchingEarlierIterate() { checkLocation(); }
+
+ virtual bool supportYields() = 0;
+
+ /** Called before a ClientCursor yield. */
+ virtual bool prepareToYield() { noteLocation(); return supportYields(); }
+
+ /** Called after a ClientCursor yield. Recovers from a previous call to prepareToYield(). */
+ virtual void recoverFromYield() { checkLocation(); }
+
+ virtual string toString() { return "abstract?"; }
+
+ /* used for multikey index traversal to avoid sending back dups. see Matcher::matches().
+ if a multikey index traversal:
+ if loc has already been sent, returns true.
+ otherwise, marks loc as sent.
+ */
+ virtual bool getsetdup(DiskLoc loc) = 0;
+
+ virtual bool isMultiKey() const = 0;
+
+ virtual bool autoDedup() const { return true; }
+
+ /**
+ * return true if the keys in the index have been modified from the main doc
+ * if you have { a : 1 , b : [ 1 , 2 ] }
+ * an index on { a : 1 } would not be modified
+ * an index on { b : 1 } would be since the values of the array are put in the index
+ * not the array
+ */
+ virtual bool modifiedKeys() const = 0;
+
+ virtual BSONObj prettyIndexBounds() const { return BSONArray(); }
+
+ virtual bool capped() const { return false; }
+
+ virtual long long nscanned() = 0;
+
+ // The implementation may return different matchers depending on the
+ // position of the cursor. If matcher() is nonzero at the start,
+ // matcher() should be checked each time advance() is called.
+ // Implementations which generate their own matcher should return this
+ // to avoid a matcher being set manually.
+ // Note that the return values of matcher() and matcherPtr() differ subtly here.
+
+ // Used when we want fast matcher lookup
+ virtual CoveredIndexMatcher *matcher() const { return 0; }
+ // Used when we need to share this matcher with someone else
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return shared_ptr< CoveredIndexMatcher >(); }
+
+ virtual bool currentMatches( MatchDetails *details = 0 ) {
+ return !matcher() || matcher()->matchesCurrent( this, details );
+ }
+
+ // A convenience function for setting the value of matcher() manually
+ // so it may accessed later. Implementations which must generate
+ // their own matcher() should assert here.
+ virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) {
+ massert( 13285, "manual matcher config not allowed", false );
+ }
+
+ virtual void explainDetails( BSONObjBuilder& b ) { return; }
+ };
+
+ // strategy object implementing direction of traversal.
+ class AdvanceStrategy {
+ public:
+ virtual ~AdvanceStrategy() { }
+ virtual DiskLoc next( const DiskLoc &prev ) const = 0;
+ };
+
+ const AdvanceStrategy *forward();
+ const AdvanceStrategy *reverse();
+
+ /* table-scan style cursor */
+ class BasicCursor : public Cursor {
+ public:
+ BasicCursor(DiskLoc dl, const AdvanceStrategy *_s = forward()) : curr(dl), s( _s ), _nscanned() {
+ incNscanned();
+ init();
+ }
+ BasicCursor(const AdvanceStrategy *_s = forward()) : s( _s ), _nscanned() {
+ init();
+ }
+ bool ok() { return !curr.isNull(); }
+ Record* _current() {
+ assert( ok() );
+ return curr.rec();
+ }
+ BSONObj current() {
+ Record *r = _current();
+ BSONObj j(r);
+ return j;
+ }
+ virtual DiskLoc currLoc() { return curr; }
+ virtual DiskLoc refLoc() { return curr.isNull() ? last : curr; }
+ bool advance();
+ virtual string toString() { return "BasicCursor"; }
+ virtual void setTailable() {
+ if ( !curr.isNull() || !last.isNull() )
+ tailable_ = true;
+ }
+ virtual bool tailable() { return tailable_; }
+ virtual bool getsetdup(DiskLoc loc) { return false; }
+ virtual bool isMultiKey() const { return false; }
+ virtual bool modifiedKeys() const { return false; }
+ virtual bool supportGetMore() { return true; }
+ virtual bool supportYields() { return true; }
+ virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); }
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return _matcher; }
+ virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { _matcher = matcher; }
+ virtual long long nscanned() { return _nscanned; }
+
+ protected:
+ DiskLoc curr, last;
+ const AdvanceStrategy *s;
+ void incNscanned() { if ( !curr.isNull() ) { ++_nscanned; } }
+ private:
+ bool tailable_;
+ shared_ptr< CoveredIndexMatcher > _matcher;
+ long long _nscanned;
+ void init() { tailable_ = false; }
+ };
+
+ /* used for order { $natural: -1 } */
+ class ReverseCursor : public BasicCursor {
+ public:
+ ReverseCursor(DiskLoc dl) : BasicCursor( dl, reverse() ) { }
+ ReverseCursor() : BasicCursor( reverse() ) { }
+ virtual string toString() { return "ReverseCursor"; }
+ };
+
+ class ForwardCappedCursor : public BasicCursor, public AdvanceStrategy {
+ public:
+ ForwardCappedCursor( NamespaceDetails *nsd = 0, const DiskLoc &startLoc = DiskLoc() );
+ virtual string toString() {
+ return "ForwardCappedCursor";
+ }
+ virtual DiskLoc next( const DiskLoc &prev ) const;
+ virtual bool capped() const { return true; }
+ private:
+ NamespaceDetails *nsd;
+ };
+
+ class ReverseCappedCursor : public BasicCursor, public AdvanceStrategy {
+ public:
+ ReverseCappedCursor( NamespaceDetails *nsd = 0, const DiskLoc &startLoc = DiskLoc() );
+ virtual string toString() {
+ return "ReverseCappedCursor";
+ }
+ virtual DiskLoc next( const DiskLoc &prev ) const;
+ virtual bool capped() const { return true; }
+ private:
+ NamespaceDetails *nsd;
+ };
+
+} // namespace mongo
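
A sketch of the typical server-side scan loop over these cursors, honoring the matcher and dedup contracts described above; it assumes a read lock is held, that ns names the collection, and that theDataFileMgr.findAll() is how such a table-scan cursor is obtained.

    shared_ptr<Cursor> c( theDataFileMgr.findAll( ns ) ); // BasicCursor under the hood
    while ( c->ok() ) {
        if ( c->currentMatches() && !c->getsetdup( c->currLoc() ) ) {
            BSONObj obj = c->current();
            // ... use obj ...
        }
        c->advance();
    }
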
diff --git a/src/mongo/db/d_concurrency.cpp b/src/mongo/db/d_concurrency.cpp
new file mode 100755
index 00000000000..e3ad974cbfc
--- /dev/null
+++ b/src/mongo/db/d_concurrency.cpp
@@ -0,0 +1,231 @@
+// @file d_concurrency.cpp
+
+#include "pch.h"
+#include "d_concurrency.h"
+#include "../util/concurrency/threadlocal.h"
+#include "../util/concurrency/rwlock.h"
+#include "../util/concurrency/value.h"
+#include "../util/assert_util.h"
+#include "client.h"
+#include "namespacestring.h"
+#include "d_globals.h"
+
+// oplog locking
+// no top level read locks
+// system.profile writing
+// oplog now
+// yielding
+// commitIfNeeded
+
+namespace mongo {
+
+ using namespace clcimpl;
+
+ Client::LockStatus::LockStatus() {
+ excluder=global=collection=0;
+ }
+
+ namespace clcimpl {
+ Shared::Shared(unsigned& _state, RWLock& lock) : state(_state) {
+ rw = 0;
+ if( state ) {
+ // already locked
+ dassert( (state & (AcquireShared|AcquireExclusive)) == 0 );
+ return;
+ }
+ rw = &lock;
+ state = AcquireShared;
+ rw->lock_shared();
+ state = LockedShared;
+ }
+ Shared::~Shared() {
+ if( rw ) {
+ state = Unlocked;
+ rw->unlock_shared();
+ }
+ }
+ Exclusive::Exclusive(unsigned& _state, RWLock& lock) : state(_state) {
+ rw = 0;
+ if( state ) {
+ // already locked
+ dassert( (state & (AcquireShared|AcquireExclusive)) == 0 );
+ assert( state == LockedExclusive ); // can't be in shared state
+ return;
+ }
+ rw = &lock;
+ state = AcquireExclusive;
+ rw->lock();
+ state = LockedExclusive;
+ }
+ Exclusive::~Exclusive() {
+ if( rw ) {
+ state = Unlocked;
+ rw->unlock();
+ }
+ }
+ } // clcimpl namespace
+
+ // this tie-in temporary until MongoMutex is folded in more directly.
+ // called when the lock has been achieved
+ void MongoMutex::lockedExclusively() {
+ Client& c = cc();
+ curopGotLock(&c); // hopefully lockStatus replaces one day
+ c.lockStatus.global = clcimpl::LockedExclusive;
+ _minfo.entered(); // hopefully eliminate one day
+ }
+
+ void MongoMutex::unlockingExclusively() {
+ Client& c = cc();
+ _minfo.leaving();
+ c.lockStatus.global = Unlocked;
+ }
+
+ MongoMutex::MongoMutex(const char *name) : _m(name) {
+ static int n = 0;
+ assert( ++n == 1 ); // releasingWriteLock below assumes MongoMutex is a singleton and uses the dbMutex ref above
+ _remapPrivateViewRequested = false;
+ }
+
+ bool subcollectionOf(const string& parent, const char *child) {
+ if( parent == child )
+ return true;
+ if( !str::startsWith(child, parent) )
+ return false;
+ const char *p = child + parent.size();
+ uassert(15963, str::stream() << "bad collection name: " << child, !str::endsWith(p, '.'));
+ return *p == '.' && p[1] == '$';
+ }
+
+ // (maybe tbd) ...
+ // we will use the global write lock for writing to system.* collections for simplicity
+ // for now; this has some advantages in speed as we don't need to latch just for that then;
+ // also there are cases to be handled carefully otherwise such as namespacedetails methods
+ // reaching into system.indexes implicitly
+ // exception : system.profile
+ static bool lkspecial(const string& ns) {
+ NamespaceString s(ns);
+ return s.isSystem() && s.coll != "system.profile";
+ }
+
+ /** Notes on d.writeExcluder
+ we want to be able to block any attempted write while allowing reads; additionally
+ force non-greedy acquisition so that reads can continue --
+ that is, disallow greediness of write lock acquisitions. This is for that purpose. The
+ #1 need is by groupCommitWithLimitedLocks() but useful elsewhere such as for lock and fsync.
+ */
+
+ ExcludeAllWrites::ExcludeAllWrites() :
+ lk(cc().lockStatus.excluder, d.writeExcluder),
+ gslk()
+ {
+ LOG(3) << "ExcludeAllWrites" << endl;
+ wassert( !d.dbMutex.isWriteLocked() );
+ }
+ ExcludeAllWrites::~ExcludeAllWrites() {
+ }
+
+ // CLC turns on the "collection level concurrency" code
+ // (which is under development and not finished)
+#if defined(CLC)
+ // called after a context is set. check that the correct collection is locked
+ void Client::checkLocks() const {
+ DEV {
+ if( !d.dbMutex.isWriteLocked() ) {
+ const char *n = ns();
+ if( lockStatus.whichCollection.empty() ) {
+ log() << "DEBUG checkLocks error expected to already be locked: " << n << endl;
+ dassert(false);
+ }
+ dassert( subcollectionOf(lockStatus.whichCollection, n) || lkspecial(n) );
+ }
+ }
+ }
+#endif
+
+ // we don't keep these locks in the namespacedetailstransient and Database
+ // objects -- that makes things safer as we need not prove to ourselves that they
+ // are always in scope when we need them.
+ // todo: we don't clean these locks up yet.
+ // todo: avoiding the mutex here might be nice.
+ class LockObjectForEachCollection {
+ //mapsf<string,RWLock*> dblocks;
+ mapsf<string,RWLock*> nslocks;
+ public:
+ /*RWLock& fordb(string db) {
+ mapsf<string,RWLock*>::ref r(dblocks);
+ RWLock*& rw = r[db];
+ if( rw == 0 )
+ rw = new RWLock(0);
+ return *rw;
+ }*/
+ RWLock& forns(string ns) {
+ mapsf<string,RWLock*>::ref r(nslocks);
+#if defined(CLC)
+ massert(15964, str::stream() << "bad collection name to lock: " << ns, str::contains(ns, '.'));
+#endif
+ RWLock*& rw = r[ns];
+ if( rw == 0 ) {
+ rw = new RWLock(0);
+ }
+ return *rw;
+ }
+ } theLocks;
+
+#if defined(CLC)
+ LockCollectionForWriting::Locks::Locks(string ns) :
+ excluder(d.writeExcluder),
+ gslk(),
+ clk(theLocks.forns(ns),true)
+ { }
+ LockCollectionForWriting::~LockCollectionForWriting() {
+ if( locks.get() ) {
+ Client::LockStatus& s = cc().lockStatus;
+ s.whichCollection.clear();
+ }
+ }
+ LockCollectionForWriting::LockCollectionForWriting(string coll)
+ {
+ Client::LockStatus& s = cc().lockStatus;
+ LockBits b(s.state);
+ if( !s.whichCollection.empty() ) {
+ if( !subcollectionOf(s.whichCollection, coll.c_str()) ) {
+ massert(15937, str::stream() << "can't nest lock of " << coll << " beneath " << s.whichCollection, false);
+ }
+ if( b.get(LockBits::Collection) != LockBits::Exclusive ) {
+ massert(15938, str::stream() << "want collection write lock but it is already read locked " << s.state, false);
+ }
+ return;
+ }
+ verify(15965, !lkspecial(coll)); // you must hold the global write lock for writes to specials
+ s.whichCollection = coll;
+ b.set(LockBits::Collection, LockBits::NotLocked, LockBits::Exclusive);
+ locks.reset( new Locks(coll) );
+ }
+#endif
+
+ LockCollectionForReading::LockCollectionForReading(string ns) :
+ gslk(),
+ clk( cc().lockStatus.collection, theLocks.forns(ns) )
+ {
+ Client::LockStatus& s = cc().lockStatus;
+ if( s.whichCollection.empty() ) {
+ s.whichCollection = ns;
+ }
+ else {
+ if( !subcollectionOf(s.whichCollection, ns.c_str()) ) {
+ if( lkspecial(ns) )
+ return;
+ massert(15939,
+ str::stream() << "can't nest lock of " << ns << " beneath " << s.whichCollection,
+ false);
+ }
+ }
+ }
+ LockCollectionForReading::~LockCollectionForReading() {
+ if( !clk.recursed() ) {
+ Client::LockStatus& s = cc().lockStatus;
+ s.whichCollection.clear();
+ }
+ }
+
+}
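
Spelled out, subcollectionOf() above accepts an exact match or a '$'-suffixed child only (assert-style sketch):

    assert(  subcollectionOf( "test.foo", "test.foo" ) );      // exact match
    assert(  subcollectionOf( "test.foo", "test.foo.$bar" ) ); // '$' child, e.g. an index namespace
    assert( !subcollectionOf( "test.foo", "test.foobar" ) );   // prefix but no '.' boundary
    assert( !subcollectionOf( "test.foo", "test.foo.bar" ) );  // '.' child but not '$'
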
diff --git a/src/mongo/db/d_concurrency.h b/src/mongo/db/d_concurrency.h
new file mode 100644
index 00000000000..ba2f64f5126
--- /dev/null
+++ b/src/mongo/db/d_concurrency.h
@@ -0,0 +1,67 @@
+// @file d_concurrency.h
+
+#pragma once
+
+#include "../util/concurrency/rwlock.h"
+#include "db/mongomutex.h"
+
+namespace mongo {
+
+ namespace clcimpl {
+ enum LockStates { Unlocked, AcquireShared=1, LockedShared=2, AcquireExclusive=4, LockedExclusive=8 };
+ class Shared : boost::noncopyable {
+ unsigned& state;
+ RWLock *rw;
+ public:
+ Shared(unsigned& state, RWLock& lock);
+ ~Shared();
+ bool recursed() const { return rw == 0; }
+ };
+ class Exclusive : boost::noncopyable {
+ unsigned& state;
+ RWLock *rw;
+ public:
+ Exclusive(unsigned& state, RWLock& lock);
+ ~Exclusive();
+ };
+ }
+
+ typedef readlock GlobalSharedLock;
+
+ class ExcludeAllWrites : boost::noncopyable {
+ clcimpl::Exclusive lk;
+ GlobalSharedLock gslk;
+ public:
+ ExcludeAllWrites();
+ ~ExcludeAllWrites();
+ };
+
+ class todoGlobalWriteLock : boost::noncopyable {
+ public:
+ };
+
+ class LockCollectionForReading : boost::noncopyable {
+ GlobalSharedLock gslk;
+ clcimpl::Shared clk;
+ public:
+ LockCollectionForReading(string coll);
+ ~LockCollectionForReading();
+ };
+
+#if defined(CLC)
+ class LockCollectionForWriting : boost::noncopyable {
+ struct Locks {
+ Locks(string ns);
+ SimpleRWLock::Shared excluder;
+ GlobalSharedLock gslk;
+ rwlock clk;
+ };
+ scoped_ptr<Locks> locks;
+ public:
+ LockCollectionForWriting(string db);
+ ~LockCollectionForWriting();
+ };
+#else
+#endif
+
+}
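
A sketch of RAII usage of the scoped lock types declared above (LockCollectionForWriting exists only under CLC, as noted):

    {
        LockCollectionForReading lk( "test.foo" ); // global shared lock, then collection lock
        // ... reads against test.foo ...
    } // both released; lockStatus.whichCollection cleared unless recursed

    {
        ExcludeAllWrites nowrites; // blocks all writers while readers proceed
        // ... e.g. fsync/group-commit style work ...
    }
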
diff --git a/src/mongo/db/d_globals.cpp b/src/mongo/db/d_globals.cpp
new file mode 100644
index 00000000000..7e0fd9e8cb0
--- /dev/null
+++ b/src/mongo/db/d_globals.cpp
@@ -0,0 +1,20 @@
+// @file d_globals.cpp
+
+#include "pch.h"
+#include "d_globals.h"
+#include "../util/concurrency/rwlock.h"
+#include "clientcursor.h"
+#include "mongomutex.h"
+
+namespace mongo {
+
+ DGlobals::DGlobals() :
+ writeExcluder( *(new RWLock("writeexcluder")) ),
+ dbMutex( *(new MongoMutex("dbMutex")) ),
+ clientCursorMonitor( *(new ClientCursorMonitor()) )
+ {
+ }
+
+ DGlobals d;
+
+}
diff --git a/src/mongo/db/d_globals.h b/src/mongo/db/d_globals.h
new file mode 100644
index 00000000000..7c95d463cc3
--- /dev/null
+++ b/src/mongo/db/d_globals.h
@@ -0,0 +1,27 @@
+// @file d_globals.h
+//
+// these are global variables used in mongod ("d"). they are also used in the test binary, as that is effectively a variation on mongod code.
+// that is, these are not in mongos.
+//
+
+#pragma once
+
+namespace mongo {
+
+ class RWLock;
+ class MongoMutex;
+ class ClientCursorMonitor;
+
+ struct DGlobals : boost::noncopyable {
+ DGlobals();
+
+ // these are intentionally never deleted:
+ RWLock& writeExcluder;
+ MongoMutex &dbMutex;
+ ClientCursorMonitor& clientCursorMonitor;
+
+ };
+
+ extern DGlobals d;
+
+}
diff --git a/src/mongo/db/database.cpp b/src/mongo/db/database.cpp
new file mode 100644
index 00000000000..2d55fd35626
--- /dev/null
+++ b/src/mongo/db/database.cpp
@@ -0,0 +1,423 @@
+// database.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "pdfile.h"
+#include "database.h"
+#include "instance.h"
+#include "clientcursor.h"
+#include "databaseholder.h"
+
+namespace mongo {
+
+ bool Database::_openAllFiles = true;
+
+ void assertDbAtLeastReadLocked(const Database *) {
+ // temp impl
+ d.dbMutex.assertAtLeastReadLocked();
+ }
+
+ void assertDbWriteLocked(const Database *) {
+ // temp impl
+ d.dbMutex.assertWriteLocked();
+ }
+
+ Database::~Database() {
+ d.dbMutex.assertWriteLocked();
+ magic = 0;
+ size_t n = _files.size();
+ for ( size_t i = 0; i < n; i++ )
+ delete _files[i];
+ if( ccByLoc.size() ) {
+ log() << "\n\n\nWARNING: ccByLoc not empty on database close! " << ccByLoc.size() << ' ' << name << endl;
+ }
+ }
+
+ Database::Database(const char *nm, bool& newDb, const string& _path )
+ : name(nm), path(_path), namespaceIndex( path, name ),
+ profileName(name + ".system.profile")
+ {
+ try {
+ {
+ // check db name is valid
+ size_t L = strlen(nm);
+ uassert( 10028 , "db name is empty", L > 0 );
+ uassert( 10032 , "db name too long", L < 64 );
+ uassert( 10029 , "bad db name [1]", *nm != '.' );
+ uassert( 10030 , "bad db name [2]", nm[L-1] != '.' );
+ uassert( 10031 , "bad char(s) in db name", strchr(nm, ' ') == 0 );
+ }
+ newDb = namespaceIndex.exists();
+ profile = cmdLine.defaultProfile;
+ checkDuplicateUncasedNames(true);
+ // If already exists, open. Otherwise behave as if empty until
+ // there's a write, then open.
+ if ( ! newDb || cmdLine.defaultProfile ) {
+ namespaceIndex.init();
+ if( _openAllFiles )
+ openAllFiles();
+ }
+ magic = 781231;
+ } catch(std::exception& e) {
+ log() << "warning database " << path << ' ' << nm << " could not be opened" << endl;
+ log() << e.what() << endl;
+ // since destructor won't be called:
+ for ( size_t i = 0; i < _files.size(); i++ )
+ delete _files[i];
+ throw;
+ }
+ }
+
+ void Database::checkDuplicateUncasedNames(bool inholderlock) const {
+ string duplicate = duplicateUncasedName(inholderlock, name, path );
+ if ( !duplicate.empty() ) {
+ stringstream ss;
+ ss << "db already exists with different case other: [" << duplicate << "] me [" << name << "]";
+ uasserted( DatabaseDifferCaseCode , ss.str() );
+ }
+ }
+
+ /*static*/
+ string Database::duplicateUncasedName( bool inholderlock, const string &name, const string &path, set< string > *duplicates ) {
+ d.dbMutex.assertAtLeastReadLocked();
+
+ if ( duplicates ) {
+ duplicates->clear();
+ }
+
+ vector<string> others;
+ getDatabaseNames( others , path );
+
+ set<string> allShortNames;
+ dbHolder().getAllShortNames( inholderlock, allShortNames );
+
+ others.insert( others.end(), allShortNames.begin(), allShortNames.end() );
+
+ for ( unsigned i=0; i<others.size(); i++ ) {
+
+ if ( strcasecmp( others[i].c_str() , name.c_str() ) )
+ continue;
+
+ if ( strcmp( others[i].c_str() , name.c_str() ) == 0 )
+ continue;
+
+ if ( duplicates ) {
+ duplicates->insert( others[i] );
+ } else {
+ return others[i];
+ }
+ }
+ if ( duplicates ) {
+ return duplicates->empty() ? "" : *duplicates->begin();
+ }
+ return "";
+ }
+
+ boost::filesystem::path Database::fileName( int n ) const {
+ stringstream ss;
+ ss << name << '.' << n;
+ boost::filesystem::path fullName;
+ fullName = boost::filesystem::path(path);
+ if ( directoryperdb )
+ fullName /= name;
+ fullName /= ss.str();
+ return fullName;
+ }
+
+ bool Database::openExistingFile( int n ) {
+ assert(this);
+ d.dbMutex.assertWriteLocked();
+ {
+ // must not yet be visible to others as we aren't in the db's write lock and
+ // we will write to _files vector - thus this assert.
+ bool loaded = dbHolder().__isLoaded(name, path);
+ assert( !loaded );
+ }
+ // additionally must be in the dbholder mutex (no assert for that yet)
+
+ // todo: why here? that could be bad as we may be read locked only here
+ namespaceIndex.init();
+
+ if ( n < 0 || n >= DiskLoc::MaxFiles ) {
+ massert( 15924 , str::stream() << "getFile(): bad file number value " << n << " (corrupt db?): run repair", false);
+ }
+
+ {
+ if( n < (int) _files.size() && _files[n] ) {
+ dlog(2) << "openExistingFile " << n << " is already open" << endl;
+ return true;
+ }
+ }
+
+ {
+ boost::filesystem::path fullName = fileName( n );
+ string fullNameString = fullName.string();
+ MongoDataFile *df = new MongoDataFile(n);
+ try {
+ if( !df->openExisting( fullNameString.c_str() ) ) {
+ delete df;
+ return false;
+ }
+ }
+ catch ( AssertionException& ) {
+ delete df;
+ throw;
+ }
+ while ( n >= (int) _files.size() ) {
+ _files.push_back(0);
+ }
+ _files[n] = df;
+ }
+
+ return true;
+ }
+
+ // todo : we stop once a datafile doesn't exist.
+ // if one datafile were missing we should keep going for
+ // repair purposes, yet we do not.
+ void Database::openAllFiles() {
+ //log() << "TEMP openallfiles " << path << ' ' << name << endl;
+ assert(this);
+ int n = 0;
+ while( openExistingFile(n) ) {
+ n++;
+ }
+
+ /*
+ int n = 0;
+ while( exists(n) ) {
+ getFile(n);
+ n++;
+ }
+ // If last file is empty, consider it preallocated and make sure it's not mapped
+ // until a write is requested
+ if ( n > 1 && getFile( n - 1 )->getHeader()->isEmpty() ) {
+ delete _files[ n - 1 ];
+ _files.pop_back();
+ }*/
+ }
+
+ // todo: this is called a lot. streamline the common case
+ MongoDataFile* Database::getFile( int n, int sizeNeeded , bool preallocateOnly) {
+ assert(this);
+ DEV assertDbAtLeastReadLocked(this);
+
+ namespaceIndex.init();
+ if ( n < 0 || n >= DiskLoc::MaxFiles ) {
+ out() << "getFile(): n=" << n << endl;
+ massert( 10295 , "getFile(): bad file number value (corrupt db?): run repair", false);
+ }
+ DEV {
+ if ( n > 100 ) {
+ out() << "getFile(): n=" << n << endl;
+ }
+ }
+ MongoDataFile* p = 0;
+ if ( !preallocateOnly ) {
+ while ( n >= (int) _files.size() ) {
+ DEV if( !d.dbMutex.isWriteLocked() ) {
+ log() << "error: getFile() called in a read lock, yet file to return is not yet open" << endl;
+ log() << " getFile(" << n << ") _files.size:" <<_files.size() << ' ' << fileName(n).string() << endl;
+ log() << " context ns: " << cc().ns() << " openallfiles:" << _openAllFiles << endl;
+ }
+ assertDbWriteLocked(this);
+ _files.push_back(0);
+ }
+ p = _files[n];
+ }
+ if ( p == 0 ) {
+ assertDbWriteLocked(this);
+ boost::filesystem::path fullName = fileName( n );
+ string fullNameString = fullName.string();
+ p = new MongoDataFile(n);
+ int minSize = 0;
+ if ( n != 0 && _files[ n - 1 ] )
+ minSize = _files[ n - 1 ]->getHeader()->fileLength;
+ if ( sizeNeeded + DataFileHeader::HeaderSize > minSize )
+ minSize = sizeNeeded + DataFileHeader::HeaderSize;
+ try {
+ p->open( fullNameString.c_str(), minSize, preallocateOnly );
+ }
+ catch ( AssertionException& ) {
+ delete p;
+ throw;
+ }
+ if ( preallocateOnly )
+ delete p;
+ else
+ _files[n] = p;
+ }
+ return preallocateOnly ? 0 : p;
+ }
+
+ MongoDataFile* Database::addAFile( int sizeNeeded, bool preallocateNextFile ) {
+ assertDbWriteLocked(this);
+ int n = (int) _files.size();
+ MongoDataFile *ret = getFile( n, sizeNeeded );
+ if ( preallocateNextFile )
+ preallocateAFile();
+ return ret;
+ }
+
+ bool fileIndexExceedsQuota( const char *ns, int fileIndex, bool enforceQuota ) {
+ return
+ cmdLine.quota &&
+ enforceQuota &&
+ fileIndex >= cmdLine.quotaFiles &&
+ // we don't enforce the quota on "special" namespaces as that could lead to problems -- e.g.
+ // rejecting an index insert after inserting the main record.
+ !NamespaceString::special( ns ) &&
+ NamespaceString( ns ).db != "local";
+ }
+
+ MongoDataFile* Database::suitableFile( const char *ns, int sizeNeeded, bool preallocate, bool enforceQuota ) {
+
+ // check existing files
+ for ( int i=numFiles()-1; i>=0; i-- ) {
+ MongoDataFile* f = getFile( i );
+ if ( f->getHeader()->unusedLength >= sizeNeeded ) {
+ if ( fileIndexExceedsQuota( ns, i-1, enforceQuota ) ) // NOTE i-1 is the value used historically for this check.
+ ;
+ else
+ return f;
+ }
+ }
+
+ if ( fileIndexExceedsQuota( ns, numFiles(), enforceQuota ) )
+ uasserted(12501, "quota exceeded");
+
+ // allocate files until we either get one big enough or hit maxSize
+ for ( int i = 0; i < 8; i++ ) {
+ MongoDataFile* f = addAFile( sizeNeeded, preallocate );
+
+ if ( f->getHeader()->unusedLength >= sizeNeeded )
+ return f;
+
+ if ( f->getHeader()->fileLength >= MongoDataFile::maxSize() ) // this is as big as they get so might as well stop
+ return f;
+ }
+
+ uasserted(14810, "couldn't allocate space (suitableFile)"); // callers don't check for null return code
+ return 0;
+ }
+
+ MongoDataFile* Database::newestFile() {
+ int n = numFiles();
+ if ( n == 0 )
+ return 0;
+ return getFile(n-1);
+ }
+
+
+ Extent* Database::allocExtent( const char *ns, int size, bool capped, bool enforceQuota ) {
+ // todo: when profiling, these may be worth logging into profile collection
+ bool fromFreeList = true;
+ Extent *e = DataFileMgr::allocFromFreeList( ns, size, capped );
+ if( e == 0 ) {
+ fromFreeList = false;
+ e = suitableFile( ns, size, !capped, enforceQuota )->createExtent( ns, size, capped );
+ }
+ LOG(1) << "allocExtent " << ns << " size " << size << ' ' << fromFreeList << endl;
+ return e;
+ }
+
+
+ bool Database::setProfilingLevel( int newLevel , string& errmsg ) {
+ if ( profile == newLevel )
+ return true;
+
+ if ( newLevel < 0 || newLevel > 2 ) {
+ errmsg = "profiling level has to be >=0 and <= 2";
+ return false;
+ }
+
+ if ( newLevel == 0 ) {
+ profile = 0;
+ return true;
+ }
+
+ assert( cc().database() == this );
+
+ if ( ! namespaceIndex.details( profileName.c_str() ) ) {
+ log() << "creating profile collection: " << profileName << endl;
+ BSONObjBuilder spec;
+ spec.appendBool( "capped", true );
+ spec.append( "size", 1024*1024 );
+ if ( ! userCreateNS( profileName.c_str(), spec.done(), errmsg , false /* we don't replicate profile messages */ ) ) {
+ return false;
+ }
+ }
+ profile = newLevel;
+ return true;
+ }
+
+ bool Database::exists(int n) const {
+ return boost::filesystem::exists( fileName( n ) );
+ }
+
+ int Database::numFiles() const {
+ DEV assertDbAtLeastReadLocked(this);
+ return (int) _files.size();
+ }
+
+ void Database::flushFiles( bool sync ) {
+ assertDbAtLeastReadLocked(this);
+ for( vector<MongoDataFile*>::iterator i = _files.begin(); i != _files.end(); i++ ) {
+ MongoDataFile *f = *i;
+ f->flush(sync);
+ }
+ }
+
+ long long Database::fileSize() const {
+ long long size=0;
+ for (int n=0; exists(n); n++)
+ size += boost::filesystem::file_size( fileName(n) );
+ return size;
+ }
+
+ Database* DatabaseHolder::getOrCreate( const string& ns , const string& path , bool& justCreated ) {
+ d.dbMutex.assertAtLeastReadLocked();
+
+ DBs& m = _paths[path];
+
+ string dbname = _todb( ns );
+
+ {
+ DBs::iterator i = m.find(dbname);
+ if( i != m.end() ) {
+ justCreated = false;
+ return i->second;
+ }
+ }
+
+ // todo: protect against getting sprayed with requests for different db names that DNE -
+ // that would make the DBs map very large. not clear how to handle that though;
+ // perhaps just log it, which is what we do here with the "> 40" :
+ bool cant = !d.dbMutex.isWriteLocked();
+ if( logLevel >= 1 || m.size() > 40 || cant || DEBUG_BUILD ) {
+ log() << "opening db: " << (path==dbpath?"":path) << ' ' << dbname << endl;
+ }
+ massert(15927, "can't open database in a read lock. if db was just closed, consider retrying the query. might otherwise indicate an internal error", !cant);
+
+ Database *db = new Database( dbname.c_str() , justCreated , path );
+ m[dbname] = db;
+ _size++;
+ return db;
+ }
+
+} // namespace mongo
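
A standalone mirror of the constructor's db-name checks above, for illustration only (the real code uasserts with ids 10028-10032):

    #include <cassert>
    #include <cstring>

    static bool validDbName( const char* nm ) {
        size_t L = strlen( nm );
        if ( L == 0 || L >= 64 )            return false; // 10028 / 10032
        if ( *nm == '.' || nm[L-1] == '.' ) return false; // 10029 / 10030
        if ( strchr( nm, ' ' ) != 0 )       return false; // 10031
        return true;
    }

    int main() {
        assert(  validDbName( "mydb" ) );
        assert( !validDbName( "" ) );
        assert( !validDbName( ".foo" ) );
        assert( !validDbName( "foo." ) );
        assert( !validDbName( "my db" ) );
        return 0;
    }
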
diff --git a/src/mongo/db/database.h b/src/mongo/db/database.h
new file mode 100644
index 00000000000..a7867e20e8c
--- /dev/null
+++ b/src/mongo/db/database.h
@@ -0,0 +1,145 @@
+// database.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "cmdline.h"
+#include "namespace.h"
+
+namespace mongo {
+
+ class Extent;
+ class MongoDataFile;
+ class ClientCursor;
+ struct ByLocKey;
+ typedef map<ByLocKey, ClientCursor*> CCByLoc;
+
+ /**
+ * Database represents a database.
+ * Each database has its own set of files -- dbname.ns, dbname.0, dbname.1, ...
+ * NOT memory mapped
+ */
+ class Database {
+ public:
+ static bool _openAllFiles;
+
+ // you probably need to be in dbHolderMutex when constructing this
+ Database(const char *nm, /*out*/ bool& newDb, const string& _path = dbpath);
+ private:
+ ~Database(); // closes files and other cleanup see below.
+ public:
+ /* you must use this to close - there is essential code in this method that is not in the ~Database destructor.
+ thus the destructor is private. this could be cleaned up one day...
+ */
+ static void closeDatabase( const char *db, const string& path );
+
+ void openAllFiles();
+
+ /**
+ * tries to make sure that this hasn't been deleted
+ */
+ bool isOk() const { return magic == 781231; }
+
+ bool isEmpty() { return ! namespaceIndex.allocated(); }
+
+ /**
+ * total file size of Database in bytes
+ */
+ long long fileSize() const;
+
+ int numFiles() const;
+
+ /**
+ * returns file valid for file number n
+ */
+ boost::filesystem::path fileName( int n ) const;
+
+ private:
+ bool exists(int n) const;
+ bool openExistingFile( int n );
+
+ public:
+ /**
+ * return file n. if it doesn't exist, create it
+ */
+ MongoDataFile* getFile( int n, int sizeNeeded = 0, bool preallocateOnly = false );
+
+ MongoDataFile* addAFile( int sizeNeeded, bool preallocateNextFile );
+
+ /**
+ * makes sure we have an extra file at the end that is empty
+ * safe to call this multiple times - the implementation will only preallocate one file
+ */
+ void preallocateAFile() { getFile( numFiles() , 0, true ); }
+
+ MongoDataFile* suitableFile( const char *ns, int sizeNeeded, bool preallocate, bool enforceQuota );
+
+ Extent* allocExtent( const char *ns, int size, bool capped, bool enforceQuota );
+
+ MongoDataFile* newestFile();
+
+ /**
+ * @return true if success. false if bad level or error creating profile ns
+ */
+ bool setProfilingLevel( int newLevel , string& errmsg );
+
+ void flushFiles( bool sync );
+
+ /**
+ * @return true if ns is part of the database
+ * ns=foo.bar, db=foo returns true
+ */
+ bool ownsNS( const string& ns ) const {
+ if ( ! startsWith( ns , name ) )
+ return false;
+ return ns[name.size()] == '.';
+ }
+ private:
+ /**
+ * @throws DatabaseDifferCaseCode if the name is a duplicate based on
+ * case insensitive matching.
+ */
+ void checkDuplicateUncasedNames(bool inholderlockalready) const;
+ public:
+ /**
+ * @return name of an existing database with same text name but different
+ * casing, if one exists. Otherwise the empty string is returned. If
+ * 'duplicates' is specified, it is filled with all duplicate names.
+ */
+ static string duplicateUncasedName( bool inholderlockalready, const string &name, const string &path, set< string > *duplicates = 0 );
+
+ const string name; // "alleyinsider"
+ const string path;
+
+ private:
+
+ // must be in the dbLock when touching this (and write locked when writing to of course)
+ // however during Database object construction we aren't, which is ok as it isn't yet visible
+ // to others and we are in the dbholder lock then.
+ vector<MongoDataFile*> _files;
+
+ public: // this should be private later
+
+ NamespaceIndex namespaceIndex;
+ int profile; // 0=off.
+ const string profileName; // "alleyinsider.system.profile"
+ CCByLoc ccByLoc;
+ int magic; // used for making sure the object is still loaded in memory
+ };
+
+} // namespace mongo
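
A standalone mirror of ownsNS() above, with a bounds check added for the sketch; a namespace belongs to a db iff it begins with "<dbname>.":

    #include <cassert>
    #include <string>

    static bool ownsNS( const std::string& name, const std::string& ns ) {
        if ( ns.compare( 0, name.size(), name ) != 0 )
            return false;
        return ns.size() > name.size() && ns[name.size()] == '.';
    }

    int main() {
        assert(  ownsNS( "foo", "foo.bar" ) );
        assert(  ownsNS( "foo", "foo.bar.baz" ) );
        assert( !ownsNS( "foo", "foobar.x" ) ); // prefix but no '.' boundary
        assert( !ownsNS( "foo", "fo" ) );
        return 0;
    }
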
diff --git a/src/mongo/db/databaseholder.h b/src/mongo/db/databaseholder.h
new file mode 100644
index 00000000000..7c878c4ed63
--- /dev/null
+++ b/src/mongo/db/databaseholder.h
@@ -0,0 +1,126 @@
+// @file databaseholder.h
+
+#pragma once
+
+namespace mongo {
+
+ /**
+ * path + dbname -> Database
+ */
+ class DatabaseHolder {
+ typedef map<string,Database*> DBs;
+ typedef map<string,DBs> Paths;
+ public:
+ DatabaseHolder() : _size(0) { }
+
+ bool __isLoaded( const string& ns , const string& path ) const {
+ Paths::const_iterator x = _paths.find( path );
+ if ( x == _paths.end() )
+ return false;
+ const DBs& m = x->second;
+
+ string db = _todb( ns );
+
+ DBs::const_iterator it = m.find(db);
+ return it != m.end();
+ }
+ // must be write locked as otherwise isLoaded could go false->true on you
+ // in the background and you might not expect that.
+ bool _isLoaded( const string& ns , const string& path ) const {
+ d.dbMutex.assertWriteLocked();
+ return __isLoaded(ns,path);
+ }
+
+ Database * get( const string& ns , const string& path ) const {
+ d.dbMutex.assertAtLeastReadLocked();
+ Paths::const_iterator x = _paths.find( path );
+ if ( x == _paths.end() )
+ return 0;
+ const DBs& m = x->second;
+ string db = _todb( ns );
+ DBs::const_iterator it = m.find(db);
+ if ( it != m.end() )
+ return it->second;
+ return 0;
+ }
+
+ void _put( const string& ns , const string& path , Database * db ) {
+ d.dbMutex.assertAtLeastReadLocked();
+ DBs& m = _paths[path];
+ Database*& d = m[_todb(ns)];
+ if( d ) {
+ dlog(2) << "info dbholder put db was already set " << ns << endl;
+ }
+ else {
+ _size++;
+ }
+ d = db;
+ }
+
+ Database* getOrCreate( const string& ns , const string& path , bool& justCreated );
+
+ void erase( const string& ns , const string& path ) {
+ d.dbMutex.assertWriteLocked(); // write lock req'd as a Database obj can be in use; dbHolderMutex is mainly just to control the holder itself
+ DBs& m = _paths[path];
+ _size -= (int)m.erase( _todb( ns ) );
+ }
+
+ /** @param force - force close even if something underway - use at shutdown */
+ bool closeAll( const string& path , BSONObjBuilder& result, bool force );
+
+ // "info" as this is informational only could change on you if you are not write locked
+ int sizeInfo() const { return _size; }
+
+ void forEach(boost::function<void(Database *)> f) const {
+ d.dbMutex.assertWriteLocked();
+ for ( Paths::const_iterator i=_paths.begin(); i!=_paths.end(); i++ ) {
+ DBs m = i->second;
+ for( DBs::const_iterator j=m.begin(); j!=m.end(); j++ ) {
+ f(j->second);
+ }
+ }
+ }
+
+ /**
+ * gets all unique db names, ignoring paths
+ */
+ void getAllShortNames( bool locked, set<string>& all ) const {
+ d.dbMutex.assertAtLeastReadLocked();
+ for ( Paths::const_iterator i=_paths.begin(); i!=_paths.end(); i++ ) {
+ DBs m = i->second;
+ for( DBs::const_iterator j=m.begin(); j!=m.end(); j++ ) {
+ all.insert( j->first );
+ }
+ }
+ }
+
+ private:
+ static string _todb( const string& ns ) {
+ string d = __todb( ns );
+ uassert( 13280 , (string)"invalid db name: " + ns , NamespaceString::validDBName( d ) );
+ return d;
+ }
+ static string __todb( const string& ns ) {
+ size_t i = ns.find( '.' );
+ if ( i == string::npos ) {
+ uassert( 13074 , "db name can't be empty" , ns.size() );
+ return ns;
+ }
+ uassert( 13075 , "db name can't be empty" , i > 0 );
+ return ns.substr( 0 , i );
+ }
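+ // e.g. _todb( "alleyinsider.system.profile" ) returns "alleyinsider";
+ // a bare db name such as "local" is returned unchanged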
+ Paths _paths;
+ int _size;
+ };
+
+ DatabaseHolder& dbHolderUnchecked();
+ inline const DatabaseHolder& dbHolder() {
+ dassert( d.dbMutex.atLeastReadLocked() );
+ return dbHolderUnchecked();
+ }
+ inline DatabaseHolder& dbHolderW() {
+ dassert( d.dbMutex.isWriteLocked() );
+ return dbHolderUnchecked();
+ }
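+
+ // a minimal usage sketch (hypothetical caller; 'dbpath' is the data directory in use):
+ //
+ // readlock lk;
+ // Database* db = dbHolder().get( "alleyinsider.foo", dbpath );
+ // if ( !db ) { /* not open yet -- getOrCreate() under a write lock would open it */ }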
+
+} // namespace mongo
diff --git a/src/mongo/db/db.cpp b/src/mongo/db/db.cpp
new file mode 100644
index 00000000000..af03b447976
--- /dev/null
+++ b/src/mongo/db/db.cpp
@@ -0,0 +1,1309 @@
+// @file db.cpp : Defines main() for the mongod program.
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "db.h"
+#include "introspect.h"
+#include "repl.h"
+#include "../util/unittest.h"
+#include "../util/file_allocator.h"
+#include "../util/background.h"
+#include "../util/text.h"
+#include "dbmessage.h"
+#include "instance.h"
+#include "clientcursor.h"
+#include "pdfile.h"
+#include "stats/counters.h"
+#include "repl/rs.h"
+#include "../scripting/engine.h"
+#include "module.h"
+#include "cmdline.h"
+#include "stats/snapshots.h"
+#include "../util/concurrency/task.h"
+#include "../util/version.h"
+#include "../util/ramlog.h"
+#include "../util/net/message_server.h"
+#include "client.h"
+#include "restapi.h"
+#include "dbwebserver.h"
+#include "dur.h"
+#include "concurrency.h"
+#include "../s/d_writeback.h"
+#include "d_globals.h"
+
+#if defined(_WIN32)
+# include "../util/ntservice.h"
+#else
+# include <sys/file.h>
+#endif
+
+namespace mongo {
+
+ namespace dur {
+ extern unsigned long long DataLimitPerJournalFile;
+ }
+
+ /* only off if --nocursors which is for debugging. */
+ extern bool useCursors;
+
+ /* only off if --nohints */
+ extern bool useHints;
+
+ extern int diagLogging;
+ extern unsigned lenForNewNsFiles;
+ extern int lockFile;
+ extern bool checkNsFilesOnLoad;
+ extern string repairpath;
+
+ void setupSignals( bool inFork );
+ void startReplication();
+ void exitCleanly( ExitCode code );
+
+ CmdLine cmdLine;
+ static bool scriptingEnabled = true;
+ bool noHttpInterface = false;
+ bool shouldRepairDatabases = 0;
+ static bool forceRepair = 0;
+ Timer startupSrandTimer;
+
+ const char *ourgetns() {
+ Client *c = currentClient.get();
+ if ( ! c )
+ return "";
+ Client::Context* cc = c->getContext();
+ return cc ? cc->ns() : "";
+ }
+
+ struct MyStartupTests {
+ MyStartupTests() {
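+ // a BSON ObjectID is 12 bytes on the wire -- catch any accidental growth of OID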
+ assert( sizeof(OID) == 12 );
+ }
+ } mystartupdbcpp;
+
+ QueryResult* emptyMoreResult(long long);
+
+
+ /* todo: make this a real test. the stuff in dbtests/ seems to be all dbdirectclient, which exhaust doesn't support yet. */
+// QueryOption_Exhaust
+#define TESTEXHAUST 0
+#if( TESTEXHAUST )
+ void testExhaust() {
+ sleepsecs(1);
+ unsigned n = 0;
+ auto f = [&n](const BSONObj& o) {
+ assert( o.valid() );
+ //cout << o << endl;
+ n++;
+ bool testClosingSocketOnError = false;
+ if( testClosingSocketOnError )
+ assert(false);
+ };
+ DBClientConnection db(false);
+ db.connect("localhost");
+ const char *ns = "local.foo";
+ if( db.count(ns) < 10000 )
+ for( int i = 0; i < 20000; i++ )
+ db.insert(ns, BSON("aaa" << 3 << "b" << "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"));
+
+ try {
+ db.query(f, ns, Query() );
+ }
+ catch(...) {
+ cout << "hmmm" << endl;
+ }
+
+ try {
+ db.query(f, ns, Query() );
+ }
+ catch(...) {
+ cout << "caught" << endl;
+ }
+
+ cout << n << endl;
+ }
+#endif
+
+ void sysRuntimeInfo() {
+ out() << "sysinfo:" << endl;
+#if defined(_SC_PAGE_SIZE)
+ out() << " page size: " << (int) sysconf(_SC_PAGE_SIZE) << endl;
+#endif
+#if defined(_SC_PHYS_PAGES)
+ out() << " _SC_PHYS_PAGES: " << sysconf(_SC_PHYS_PAGES) << endl;
+#endif
+#if defined(_SC_AVPHYS_PAGES)
+ out() << " _SC_AVPHYS_PAGES: " << sysconf(_SC_AVPHYS_PAGES) << endl;
+#endif
+ }
+
+ /* if server is really busy, wait a bit */
+ void beNice() {
+ sleepmicros( Client::recommendedYieldMicros() );
+ }
+
+ class MyMessageHandler : public MessageHandler {
+ public:
+ virtual void connected( AbstractMessagingPort* p ) {
+ Client& c = Client::initThread("conn", p);
+ c.getAuthenticationInfo()->isLocalHost = p->remote().isLocalHost();
+ }
+
+ virtual void process( Message& m , AbstractMessagingPort* port , LastError * le) {
+ while ( true ) {
+ if ( inShutdown() ) {
+ log() << "got request after shutdown()" << endl;
+ break;
+ }
+
+ lastError.startRequest( m , le );
+
+ DbResponse dbresponse;
+ assembleResponse( m, dbresponse, port->remote() );
+
+ if ( dbresponse.response ) {
+ port->reply(m, *dbresponse.response, dbresponse.responseTo);
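+ // exhaust mode: instead of waiting for the client to send getMore requests,
+ // we synthesize the next getMore ourselves and loop until the cursor is done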
+ if( dbresponse.exhaust ) {
+ MsgData *header = dbresponse.response->header();
+ QueryResult *qr = (QueryResult *) header;
+ long long cursorid = qr->cursorId;
+ if( cursorid ) {
+ assert( dbresponse.exhaust && *dbresponse.exhaust != 0 );
+ string ns = dbresponse.exhaust; // copy before reset() frees it
+ m.reset();
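+ // build the getMore message by hand: standard message header (size, id,
+ // responseTo, opcode), then int32 reserved-zero, the ns cstring, int32
+ // ntoreturn and the int64 cursorid -- the wire layout implied by the appends below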
+ BufBuilder b(512);
+ b.appendNum((int) 0 /*size set later in appendData()*/);
+ b.appendNum(header->id);
+ b.appendNum(header->responseTo);
+ b.appendNum((int) dbGetMore);
+ b.appendNum((int) 0);
+ b.appendStr(ns);
+ b.appendNum((int) 0); // ntoreturn
+ b.appendNum(cursorid);
+ m.appendData(b.buf(), b.len());
+ b.decouple();
+ DEV log() << "exhaust=true sending more" << endl;
+ beNice();
+ continue; // this goes back to top loop
+ }
+ }
+ }
+ break;
+ }
+ }
+
+ virtual void disconnected( AbstractMessagingPort* p ) {
+ Client * c = currentClient.get();
+ if( c ) c->shutdown();
+ globalScriptEngine->threadDone();
+ }
+
+ };
+
+ void listen(int port) {
+ //testTheDb();
+ MessageServer::Options options;
+ options.port = port;
+ options.ipList = cmdLine.bind_ip;
+
+ MessageServer * server = createServer( options , new MyMessageHandler() );
+ server->setAsTimeTracker();
+
+ startReplication();
+ if ( !noHttpInterface )
+ boost::thread web( boost::bind(&webServerThread, new RestAdminAccess() /* takes ownership */));
+
+#if(TESTEXHAUST)
+ boost::thread thr(testExhaust);
+#endif
+ server->run();
+ }
+
+
+ bool doDBUpgrade( const string& dbName , string& errmsg , DataFileHeader * h ) {
+ static DBDirectClient db;
+
+ if ( h->version == 4 && h->versionMinor == 4 ) {
+ assert( PDFILE_VERSION == 4 );
+ assert( PDFILE_VERSION_MINOR == 5 );
+
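+ // the 4.4 -> 4.5 upgrade just reindexes every collection and then
+ // bumps the on-disk minor version below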
+ list<string> colls = db.getCollectionNames( dbName );
+ for ( list<string>::iterator i=colls.begin(); i!=colls.end(); i++) {
+ string c = *i;
+ log() << "\t upgrading collection:" << c << endl;
+ BSONObj out;
+ bool ok = db.runCommand( dbName , BSON( "reIndex" << c.substr( dbName.size() + 1 ) ) , out );
+ if ( ! ok ) {
+ errmsg = "reindex failed";
+ log() << "\t\t reindex failed: " << out << endl;
+ return false;
+ }
+ }
+
+ h->versionMinor = 5;
+ return true;
+ }
+
+ // do this in the general case
+ return repairDatabase( dbName.c_str(), errmsg );
+ }
+
+ // run at startup.
+ static void repairDatabasesAndCheckVersion() {
+ // LastError * le = lastError.get( true );
+ Client::GodScope gs;
+ log(1) << "enter repairDatabases (to check pdfile version #)" << endl;
+
+ //assert(checkNsFilesOnLoad);
+ checkNsFilesOnLoad = false; // we are mainly just checking the header - don't scan the whole .ns file for every db here.
+
+ dblock lk;
+ vector< string > dbNames;
+ getDatabaseNames( dbNames );
+ for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) {
+ string dbName = *i;
+ log(1) << "\t" << dbName << endl;
+ Client::Context ctx( dbName );
+ MongoDataFile *p = cc().database()->getFile( 0 );
+ DataFileHeader *h = p->getHeader();
+ if ( !h->isCurrentVersion() || forceRepair ) {
+
+ if( h->version <= 0 ) {
+ uasserted(14026,
+ str::stream() << "db " << dbName << " appears corrupt pdfile version: " << h->version
+ << " info: " << h->versionMinor << ' ' << h->fileLength);
+ }
+
+ log() << "****" << endl;
+ log() << "****" << endl;
+ log() << "need to upgrade database " << dbName << " with pdfile version " << h->version << "." << h->versionMinor << ", "
+ << "new version: " << PDFILE_VERSION << "." << PDFILE_VERSION_MINOR << endl;
+ if ( shouldRepairDatabases ) {
+ // QUESTION: Repair even if file format is higher version than code?
+ log() << "\t starting upgrade" << endl;
+ string errmsg;
+ assert( doDBUpgrade( dbName , errmsg , h ) );
+ }
+ else {
+ log() << "\t Not upgrading, exiting" << endl;
+ log() << "\t run --upgrade to upgrade dbs, then start again" << endl;
+ log() << "****" << endl;
+ dbexit( EXIT_NEED_UPGRADE );
+ shouldRepairDatabases = 1;
+ return;
+ }
+ }
+ else {
+ Database::closeDatabase( dbName.c_str(), dbpath );
+ }
+ }
+
+ log(1) << "done repairDatabases" << endl;
+
+ if ( shouldRepairDatabases ) {
+ log() << "finished checking dbs" << endl;
+ cc().shutdown();
+ dbexit( EXIT_CLEAN );
+ }
+
+ checkNsFilesOnLoad = true;
+ }
+
+ void clearTmpFiles() {
+ boost::filesystem::path path( dbpath );
+ for ( boost::filesystem::directory_iterator i( path );
+ i != boost::filesystem::directory_iterator(); ++i ) {
+ string fileName = boost::filesystem::path(*i).leaf();
+ if ( boost::filesystem::is_directory( *i ) &&
+ fileName.length() && fileName[ 0 ] == '$' )
+ boost::filesystem::remove_all( *i );
+ }
+ }
+
+ void checkIfReplMissingFromCommandLine() {
+ if( !cmdLine.usingReplSets() ) {
+ Client::GodScope gs;
+ DBDirectClient c;
+ unsigned long long x =
+ c.count("local.system.replset");
+ if( x ) {
+ log() << endl;
+ log() << "** warning: mongod started without --replSet yet " << x << " documents are present in local.system.replset" << endl;
+ log() << "** restart with --replSet unless you are doing maintenance and no other clients are connected" << endl;
+ log() << endl;
+ }
+ }
+ }
+
+ void clearTmpCollections() {
+ writelock lk; // _openAllFiles is false at this point; the write lock lets the query below work, since files can't be opened while read locked
+ Client::GodScope gs;
+ vector< string > toDelete;
+ DBDirectClient cli;
+ auto_ptr< DBClientCursor > c = cli.query( "local.system.namespaces", Query( fromjson( "{name:/^local.temp./}" ) ) );
+ while( c->more() ) {
+ BSONObj o = c->next();
+ toDelete.push_back( o.getStringField( "name" ) );
+ }
+ for( vector< string >::iterator i = toDelete.begin(); i != toDelete.end(); ++i ) {
+ log() << "Dropping old temporary collection: " << *i << endl;
+ cli.dropCollection( *i );
+ }
+ }
+
+ /**
+ * does background async flushes of mmapped files
+ */
+ class DataFileSync : public BackgroundJob {
+ public:
+ string name() const { return "DataFileSync"; }
+ void run() {
+ if( cmdLine.syncdelay == 0 )
+ log() << "warning: --syncdelay 0 is not recommended and can have strange performance" << endl;
+ else if( cmdLine.syncdelay == 1 )
+ log() << "--syncdelay 1" << endl;
+ else if( cmdLine.syncdelay != 60 )
+ log(1) << "--syncdelay " << cmdLine.syncdelay << endl;
+ int time_flushing = 0;
+ while ( ! inShutdown() ) {
+ _diaglog.flush();
+ if ( cmdLine.syncdelay == 0 ) {
+ // in case at some point we add an option to change at runtime
+ sleepsecs(5);
+ continue;
+ }
+
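+ // sleep off the remainder of the interval: e.g. with the default --syncdelay 60
+ // and a flush that took 2000ms, we sleep roughly 58 seconds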
+ sleepmillis( (long long) std::max(0.0, (cmdLine.syncdelay * 1000) - time_flushing) );
+
+ if ( inShutdown() ) {
+ // occasional issue trying to flush during shutdown when sleep interrupted
+ break;
+ }
+
+ Date_t start = jsTime();
+ int numFiles = MemoryMappedFile::flushAll( true );
+ time_flushing = (int) (jsTime() - start);
+
+ globalFlushCounters.flushed(time_flushing);
+
+ if( logLevel >= 1 || time_flushing >= 10000 ) {
+ log() << "flushing mmaps took " << time_flushing << "ms for " << numFiles << " files" << endl;
+ }
+ }
+ }
+
+ } dataFileSync;
+
+ const char * jsInterruptCallback() {
+ // should be safe to interrupt in js code, even if we have a write lock
+ return killCurrentOp.checkForInterruptNoAssert();
+ }
+
+ unsigned jsGetInterruptSpecCallback() {
+ return cc().curop()->opNum();
+ }
+
+ void _initAndListen(int listenPort ) {
+
+ Client::initThread("initandlisten");
+
+ Database::_openAllFiles = false;
+
+ Logstream::get().addGlobalTee( new RamLog("global") );
+
+ bool is32bit = sizeof(int*) == 4;
+
+ {
+#if !defined(_WIN32)
+ pid_t pid = getpid();
+#else
+ DWORD pid=GetCurrentProcessId();
+#endif
+ Nullstream& l = log();
+ l << "MongoDB starting : pid=" << pid << " port=" << cmdLine.port << " dbpath=" << dbpath;
+ if( replSettings.master ) l << " master=" << replSettings.master;
+ if( replSettings.slave ) l << " slave=" << (int) replSettings.slave;
+ l << ( is32bit ? " 32" : " 64" ) << "-bit host=" << getHostNameCached() << endl;
+ }
+ DEV log() << "_DEBUG build (which is slower)" << endl;
+ show_warnings();
+ log() << mongodVersion() << endl;
+ printGitVersion();
+ printSysInfo();
+ printCommandLineOpts();
+
+ {
+ stringstream ss;
+ ss << endl;
+ ss << "*********************************************************************" << endl;
+ ss << " ERROR: dbpath (" << dbpath << ") does not exist." << endl;
+ ss << " Create this directory or give existing directory in --dbpath." << endl;
+ ss << " See http://www.mongodb.org/display/DOCS/Starting+and+Stopping+Mongo" << endl;
+ ss << "*********************************************************************" << endl;
+ uassert( 10296 , ss.str().c_str(), boost::filesystem::exists( dbpath ) );
+ }
+ {
+ stringstream ss;
+ ss << "repairpath (" << repairpath << ") does not exist";
+ uassert( 12590 , ss.str().c_str(), boost::filesystem::exists( repairpath ) );
+ }
+
+ acquirePathLock(forceRepair);
+ remove_all( dbpath + "/_tmp/" );
+
+ FileAllocator::get()->start();
+
+ MONGO_BOOST_CHECK_EXCEPTION_WITH_MSG( clearTmpFiles(), "clear tmp files" );
+
+ dur::startup();
+
+ if( cmdLine.durOptions & CmdLine::DurRecoverOnly )
+ return;
+
+ // comes after dur::startup() because this reads from the database
+ clearTmpCollections();
+
+ checkIfReplMissingFromCommandLine();
+
+ Module::initAll();
+
+ if ( scriptingEnabled ) {
+ ScriptEngine::setup();
+ globalScriptEngine->setCheckInterruptCallback( jsInterruptCallback );
+ globalScriptEngine->setGetInterruptSpecCallback( jsGetInterruptSpecCallback );
+ }
+
+ repairDatabasesAndCheckVersion();
+
+ /* we didn't want to pre-open all files for the repair check above. for regular
+ operation we do for read/write lock concurrency reasons.
+ */
+ Database::_openAllFiles = true;
+
+ if ( shouldRepairDatabases )
+ return;
+
+ /* this is for security on certain platforms (nonce generation) */
+ srand((unsigned) (curTimeMicros() ^ startupSrandTimer.micros()));
+
+ snapshotThread.go();
+ d.clientCursorMonitor.go();
+ PeriodicTask::theRunner->go();
+
+#ifndef _WIN32
+ CmdLine::launchOk();
+#endif
+ listen(listenPort);
+
+ // listen() returns only when the shutdown path closes its socket.
+ exitCleanly(EXIT_NET_ERROR);
+ }
+
+ void testPretouch();
+
+ void initAndListen(int listenPort) {
+ try {
+ _initAndListen(listenPort);
+ }
+ catch ( DBException &e ) {
+ log() << "exception in initAndListen: " << e.toString() << ", terminating" << endl;
+ dbexit( EXIT_UNCAUGHT );
+ }
+ catch ( std::exception &e ) {
+ log() << "exception in initAndListen std::exception: " << e.what() << ", terminating" << endl;
+ dbexit( EXIT_UNCAUGHT );
+ }
+ catch ( int& n ) {
+ log() << "exception in initAndListen int: " << n << ", terminating" << endl;
+ dbexit( EXIT_UNCAUGHT );
+ }
+ catch(...) {
+ log() << "exception in initAndListen, terminating" << endl;
+ dbexit( EXIT_UNCAUGHT );
+ }
+ }
+
+#if defined(_WIN32)
+ bool initService() {
+ ServiceController::reportStatus( SERVICE_RUNNING );
+ initAndListen( cmdLine.port );
+ return true;
+ }
+#endif
+
+} // namespace mongo
+
+using namespace mongo;
+
+#include <boost/program_options.hpp>
+#undef assert
+#define assert MONGO_assert
+
+namespace po = boost::program_options;
+
+void show_help_text(const po::options_description& options) {
+ show_warnings();
+ cout << options << endl;
+}
+
+/* Return error string or "" if no errors. */
+string arg_error_check(int argc, char* argv[]) {
+ return ""; // no argument checks at present
+}
+
+int main(int argc, char* argv[]) {
+ static StaticObserver staticObserver;
+ doPreServerStartupInits();
+ getcurns = ourgetns;
+
+ po::options_description general_options("General options");
+#if defined(_WIN32)
+ po::options_description windows_scm_options("Windows Service Control Manager options");
+#endif
+ po::options_description replication_options("Replication options");
+ po::options_description ms_options("Master/slave options");
+ po::options_description rs_options("Replica set options");
+ po::options_description sharding_options("Sharding options");
+ po::options_description visible_options("Allowed options");
+ po::options_description hidden_options("Hidden options");
+
+ po::positional_options_description positional_options;
+
+ CmdLine::addGlobalOptions( general_options , hidden_options );
+
+ general_options.add_options()
+ ("auth", "run with security")
+ ("cpu", "periodically show cpu and iowait utilization")
+ ("dbpath", po::value<string>() , "directory for datafiles")
+ ("diaglog", po::value<int>(), "0=off 1=W 2=R 3=both 7=W+some reads")
+ ("directoryperdb", "each database will be stored in a separate directory")
+ ("journal", "enable journaling")
+ ("journalOptions", po::value<int>(), "journal diagnostic options")
+ ("journalCommitInterval", po::value<unsigned>(), "how often to group/batch commit (ms)")
+ ("ipv6", "enable IPv6 support (disabled by default)")
+ ("jsonp","allow JSONP access via http (has security implications)")
+ ("noauth", "run without security")
+ ("nohttpinterface", "disable http interface")
+ ("nojournal", "disable journaling (journaling is on by default for 64 bit)")
+ ("noprealloc", "disable data file preallocation - will often hurt performance")
+ ("noscripting", "disable scripting engine")
+ ("notablescan", "do not allow table scans")
+ ("nssize", po::value<int>()->default_value(16), ".ns file size (in MB) for new databases")
+ ("profile",po::value<int>(), "0=off, 1=slow, 2=all")
+ ("quota", "limits each database to a certain number of files (8 default)")
+ ("quotaFiles", po::value<int>(), "number of files allowed per db, requires --quota")
+ ("rest","turn on simple rest api")
+ ("repair", "run repair on all dbs")
+ ("repairpath", po::value<string>() , "root directory for repair files - defaults to dbpath" )
+ ("slowms",po::value<int>(&cmdLine.slowMS)->default_value(100), "value of slow for profile and console log" )
+ ("smallfiles", "use a smaller default file size")
+#if defined(__linux__)
+ ("shutdown", "kill a running server (for init scripts)")
+#endif
+ ("syncdelay",po::value<double>(&cmdLine.syncdelay)->default_value(60), "seconds between disk syncs (0=never, but not recommended)")
+ ("sysinfo", "print some diagnostic system information")
+ ("upgrade", "upgrade db if needed")
+ ;
+
+#if defined(_WIN32)
+ CmdLine::addWindowsOptions( windows_scm_options, hidden_options );
+#endif
+
+ replication_options.add_options()
+ ("oplogSize", po::value<int>(), "size limit (in MB) for op log")
+ ;
+
+ ms_options.add_options()
+ ("master", "master mode")
+ ("slave", "slave mode")
+ ("source", po::value<string>(), "when slave: specify master as <server:port>")
+ ("only", po::value<string>(), "when slave: specify a single database to replicate")
+ ("slavedelay", po::value<int>(), "specify delay (in seconds) to be used when applying master ops to slave")
+ ("autoresync", "automatically resync if slave data is stale")
+ ;
+
+ rs_options.add_options()
+ ("replSet", po::value<string>(), "arg is <setname>[/<optionalseedhostlist>]")
+ ;
+
+ sharding_options.add_options()
+ ("configsvr", "declare this is a config db of a cluster; default port 27019; default dir /data/configdb")
+ ("shardsvr", "declare this is a shard db of a cluster; default port 27018")
+ ("noMoveParanoia" , "turn off paranoid saving of data for moveChunk. this is on by default for now, but default will switch" )
+ ;
+
+ hidden_options.add_options()
+ ("fastsync", "indicate that this instance is starting from a dbpath snapshot of the repl peer")
+ ("pretouch", po::value<int>(), "n pretouch threads for applying replicated operations") // experimental
+ ("command", po::value< vector<string> >(), "command")
+ ("cacheSize", po::value<long>(), "cache size (in MB) for rec store")
+ ("nodur", "disable journaling")
+ // things we don't want people to use
+ ("nocursors", "diagnostic/debugging option that turns off cursors DO NOT USE IN PRODUCTION")
+ ("nohints", "ignore query hints")
+ ("nopreallocj", "don't preallocate journal files")
+ ("dur", "enable journaling") // old name for --journal
+ ("durOptions", po::value<int>(), "durability diagnostic options") // deprecated name
+ // deprecated pairing command line options
+ ("pairwith", "DEPRECATED")
+ ("arbiter", "DEPRECATED")
+ ("opIdMem", "DEPRECATED")
+ ;
+
+
+ positional_options.add("command", 3);
+ visible_options.add(general_options);
+#if defined(_WIN32)
+ visible_options.add(windows_scm_options);
+#endif
+ visible_options.add(replication_options);
+ visible_options.add(ms_options);
+ visible_options.add(rs_options);
+ visible_options.add(sharding_options);
+ Module::addOptions( visible_options );
+
+ setupCoreSignals();
+ setupSignals( false );
+
+ dbExecCommand = argv[0];
+
+ srand(curTimeMicros());
+#if( BOOST_VERSION >= 104500 )
+ boost::filesystem::path::default_name_check( boost::filesystem2::no_check );
+#else
+ boost::filesystem::path::default_name_check( boost::filesystem::no_check );
+#endif
+
+ {
+ unsigned x = 0x12345678;
+ unsigned char& b = (unsigned char&) x;
+ if ( b != 0x78 ) {
+ out() << "big endian cpus not yet supported" << endl;
+ return 33;
+ }
+ }
+
+ if( argc == 1 )
+ cout << dbExecCommand << " --help for help and startup options" << endl;
+
+ {
+ po::variables_map params;
+
+ string error_message = arg_error_check(argc, argv);
+ if (error_message != "") {
+ cout << error_message << endl << endl;
+ show_help_text(visible_options);
+ return 0;
+ }
+
+ if ( ! CmdLine::store( argc , argv , visible_options , hidden_options , positional_options , params ) )
+ return 0;
+
+ if (params.count("help")) {
+ show_help_text(visible_options);
+ return 0;
+ }
+ if (params.count("version")) {
+ cout << mongodVersion() << endl;
+ printGitVersion();
+ return 0;
+ }
+ if ( params.count( "dbpath" ) ) {
+ dbpath = params["dbpath"].as<string>();
+ if ( params.count( "fork" ) && dbpath[0] != '/' ) {
+ // if we fork we change cwd to "/", so a relative dbpath must be made absolute
+ // first; fork only exists on *nix, so '/' is a safe path separator here
+ dbpath = cmdLine.cwd + "/" + dbpath;
+ }
+ }
+ else {
+ dbpath = "/data/db/";
+ }
+#ifdef _WIN32
+ if (dbpath.size() > 1 && dbpath[dbpath.size()-1] == '/') {
+ // size() check is for the unlikely possibility of --dbpath "/"
+ dbpath = dbpath.erase(dbpath.size()-1);
+ }
+#endif
+
+ if ( params.count("directoryperdb")) {
+ directoryperdb = true;
+ }
+ if (params.count("cpu")) {
+ cmdLine.cpu = true;
+ }
+ if (params.count("noauth")) {
+ noauth = true;
+ }
+ if (params.count("auth")) {
+ noauth = false;
+ }
+ if (params.count("quota")) {
+ cmdLine.quota = true;
+ }
+ if (params.count("quotaFiles")) {
+ cmdLine.quota = true;
+ cmdLine.quotaFiles = params["quotaFiles"].as<int>() - 1;
+ }
+ bool journalExplicit = false;
+ if( params.count("nodur") || params.count( "nojournal" ) ) {
+ journalExplicit = true;
+ cmdLine.dur = false;
+ }
+ if( params.count("dur") || params.count( "journal" ) ) {
+ if (journalExplicit) {
+ log() << "Can't specify both --journal and --nojournal options." << endl;
+ return EXIT_BADOPTIONS;
+ }
+ journalExplicit = true;
+ cmdLine.dur = true;
+ }
+ if (params.count("durOptions")) {
+ cmdLine.durOptions = params["durOptions"].as<int>();
+ }
+ if( params.count("journalCommitInterval") ) {
+ // don't check whether dur is false here: many users just take the default, which is
+ // off on win32, so no point complicating a dev environment by raising an error.
+ cmdLine.journalCommitInterval = params["journalCommitInterval"].as<unsigned>();
+ if( cmdLine.journalCommitInterval <= 1 || cmdLine.journalCommitInterval > 300 ) {
+ out() << "--journalCommitInterval out of allowed range (2-300ms)" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ }
+ if (params.count("journalOptions")) {
+ cmdLine.durOptions = params["journalOptions"].as<int>();
+ }
+ if (params.count("repairpath")) {
+ repairpath = params["repairpath"].as<string>();
+ if (!repairpath.size()) {
+ out() << "repairpath is empty" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ }
+ if (params.count("nocursors")) {
+ useCursors = false;
+ }
+ if (params.count("nohints")) {
+ useHints = false;
+ }
+ if (params.count("nopreallocj")) {
+ cmdLine.preallocj = false;
+ }
+ if (params.count("nohttpinterface")) {
+ noHttpInterface = true;
+ }
+ if (params.count("rest")) {
+ cmdLine.rest = true;
+ }
+ if (params.count("jsonp")) {
+ cmdLine.jsonp = true;
+ }
+ if (params.count("noscripting")) {
+ scriptingEnabled = false;
+ }
+ if (params.count("noprealloc")) {
+ cmdLine.prealloc = false;
+ cout << "note: noprealloc may hurt performance in many applications" << endl;
+ }
+ if (params.count("smallfiles")) {
+ cmdLine.smallfiles = true;
+ assert( dur::DataLimitPerJournalFile >= 128 * 1024 * 1024 );
+ dur::DataLimitPerJournalFile = 128 * 1024 * 1024;
+ }
+ if (params.count("diaglog")) {
+ int x = params["diaglog"].as<int>();
+ if ( x < 0 || x > 7 ) {
+ out() << "can't interpret --diaglog setting" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ _diaglog.setLevel(x);
+ }
+ if (params.count("sysinfo")) {
+ sysRuntimeInfo();
+ return 0;
+ }
+ if (params.count("repair")) {
+ Record::MemoryTrackingEnabled = false;
+ shouldRepairDatabases = 1;
+ forceRepair = 1;
+ }
+ if (params.count("upgrade")) {
+ Record::MemoryTrackingEnabled = false;
+ shouldRepairDatabases = 1;
+ }
+ if (params.count("notablescan")) {
+ cmdLine.noTableScan = true;
+ }
+ if (params.count("master")) {
+ replSettings.master = true;
+ }
+ if (params.count("slave")) {
+ replSettings.slave = SimpleSlave;
+ }
+ if (params.count("slavedelay")) {
+ replSettings.slavedelay = params["slavedelay"].as<int>();
+ }
+ if (params.count("fastsync")) {
+ replSettings.fastsync = true;
+ }
+ if (params.count("autoresync")) {
+ replSettings.autoresync = true;
+ if( params.count("replSet") ) {
+ out() << "--autoresync is not used with --replSet" << endl;
+ out() << "see http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ }
+ if (params.count("source")) {
+ /* specifies what the source in local.sources should be */
+ cmdLine.source = params["source"].as<string>().c_str();
+ }
+ if( params.count("pretouch") ) {
+ cmdLine.pretouch = params["pretouch"].as<int>();
+ }
+ if (params.count("replSet")) {
+ if (params.count("slavedelay")) {
+ out() << "--slavedelay cannot be used with --replSet" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ else if (params.count("only")) {
+ out() << "--only cannot be used with --replSet" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ /* seed list of hosts for the repl set */
+ cmdLine._replSet = params["replSet"].as<string>().c_str();
+ }
+ if (params.count("only")) {
+ cmdLine.only = params["only"].as<string>().c_str();
+ }
+ if( params.count("nssize") ) {
+ int x = params["nssize"].as<int>();
+ if (x <= 0 || x > (0x7fffffff/1024/1024)) {
+ out() << "bad --nssize arg" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ lenForNewNsFiles = x * 1024 * 1024;
+ assert(lenForNewNsFiles > 0);
+ }
+ if (params.count("oplogSize")) {
+ long long x = params["oplogSize"].as<int>();
+ if (x <= 0) {
+ out() << "bad --oplogSize arg" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ // note a small size such as x==1 is ok for an arbiter.
+ if( x > 1000 && sizeof(void*) == 4 ) {
+ out() << "--oplogSize of " << x << "MB is too big for 32 bit version. Use 64 bit build instead." << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ cmdLine.oplogSize = x * 1024 * 1024;
+ assert(cmdLine.oplogSize > 0);
+ }
+ if (params.count("cacheSize")) {
+ long x = params["cacheSize"].as<long>();
+ if (x <= 0) {
+ out() << "bad --cacheSize arg" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ log() << "--cacheSize option not currently supported" << endl;
+ }
+ if (params.count("port") == 0 ) {
+ if( params.count("configsvr") ) {
+ cmdLine.port = CmdLine::ConfigServerPort;
+ }
+ if( params.count("shardsvr") ) {
+ if( params.count("configsvr") ) {
+ log() << "can't do --shardsvr and --configsvr at the same time" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ cmdLine.port = CmdLine::ShardServerPort;
+ }
+ }
+ else {
+ if ( cmdLine.port <= 0 || cmdLine.port > 65535 ) {
+ out() << "bad --port number" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ }
+ if ( params.count("configsvr" ) ) {
+ cmdLine.configsvr = true;
+ if (cmdLine.usingReplSets() || replSettings.master || replSettings.slave) {
+ log() << "replication should not be enabled on a config server" << endl;
+ ::exit(-1);
+ }
+ if ( params.count( "nodur" ) == 0 && params.count( "nojournal" ) == 0 )
+ cmdLine.dur = true;
+ if ( params.count( "dbpath" ) == 0 )
+ dbpath = "/data/configdb";
+ }
+ if ( params.count( "profile" ) ) {
+ cmdLine.defaultProfile = params["profile"].as<int>();
+ }
+ if (params.count("ipv6")) {
+ enableIPv6();
+ }
+ if (params.count("noMoveParanoia")) {
+ cmdLine.moveParanoia = false;
+ }
+ if (params.count("pairwith") || params.count("arbiter") || params.count("opIdMem")) {
+ out() << "****" << endl;
+ out() << "Replica Pairs have been deprecated. Invalid options: --pairwith, --arbiter, and/or --opIdMem" << endl;
+ out() << "<http://www.mongodb.org/display/DOCS/Replica+Pairs>" << endl;
+ out() << "****" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+
+ // needs to be after things like --configsvr parsing, thus here.
+ if( repairpath.empty() )
+ repairpath = dbpath;
+
+ Module::configAll( params );
+ dataFileSync.go();
+
+ if (params.count("command")) {
+ vector<string> command = params["command"].as< vector<string> >();
+
+ if (command[0].compare("run") == 0) {
+ if (command.size() > 1) {
+ cout << "Too many parameters to 'run' command" << endl;
+ cout << visible_options << endl;
+ return 0;
+ }
+
+ initAndListen(cmdLine.port);
+ return 0;
+ }
+
+ if (command[0].compare("dbpath") == 0) {
+ cout << dbpath << endl;
+ return 0;
+ }
+
+ cout << "Invalid command: " << command[0] << endl;
+ cout << visible_options << endl;
+ return 0;
+ }
+
+ if( cmdLine.pretouch )
+ log() << "--pretouch " << cmdLine.pretouch << endl;
+
+#ifdef __linux__
+ if (params.count("shutdown")){
+ bool failed = false;
+
+ string name = ( boost::filesystem::path( dbpath ) / "mongod.lock" ).native_file_string();
+ if ( !boost::filesystem::exists( name ) || boost::filesystem::file_size( name ) == 0 )
+ failed = true;
+
+ pid_t pid;
+ string procPath;
+ if (!failed){
+ try {
+ ifstream f (name.c_str());
+ f >> pid;
+ procPath = (str::stream() << "/proc/" << pid);
+ if (!boost::filesystem::exists(procPath))
+ failed = true;
+
+ string exePath = procPath + "/exe";
+ if (boost::filesystem::exists(exePath)){
+ char buf[256];
+ int ret = readlink(exePath.c_str(), buf, sizeof(buf)-1);
+ if (ret == -1) {
+ int e = errno;
+ cerr << "Error resolving " << exePath << ": " << errnoWithDescription(e);
+ failed = true;
+ }
+ else {
+ buf[ret] = '\0'; // readlink doesn't terminate the string; only safe after the error check
+ if (!endsWith(buf, "mongod")){
+ cerr << "Process " << pid << " is running " << buf << " not mongod" << endl;
+ ::exit(-1);
+ }
+ }
+ }
+ }
+ catch (const std::exception& e){
+ cerr << "Error reading pid from lock file [" << name << "]: " << e.what() << endl;
+ failed = true;
+ }
+ }
+
+ if (failed) {
+ cerr << "There doesn't seem to be a server running with dbpath: " << dbpath << endl;
+ ::exit(-1);
+ }
+
+ cout << "killing process with pid: " << pid << endl;
+ int ret = kill(pid, SIGTERM);
+ if (ret) {
+ int e = errno;
+ cerr << "failed to kill process: " << errnoWithDescription(e) << endl;
+ ::exit(-1);
+ }
+
+ while (boost::filesystem::exists(procPath)) {
+ sleepsecs(1);
+ }
+
+ ::exit(0);
+ }
+#endif
+
+#if defined(_WIN32)
+ if (serviceParamsCheck( params, dbpath, argc, argv )) {
+ return 0;
+ }
+#endif
+
+
+ if (sizeof(void*) == 4 && !journalExplicit){
+ // trying to make this stand out more like startup warnings
+ log() << endl;
+ warning() << "32-bit servers don't have journaling enabled by default. Please use --journal if you want durability." << endl;
+ log() << endl;
+ }
+
+ }
+
+ UnitTest::runTests();
+ initAndListen(cmdLine.port);
+ dbexit(EXIT_CLEAN);
+ return 0;
+}
+
+namespace mongo {
+
+ string getDbContext();
+
+#undef out
+
+
+#if !defined(_WIN32)
+
+} // namespace mongo
+
+#include <signal.h>
+#include <string.h>
+
+namespace mongo {
+
+ void pipeSigHandler( int signal ) {
+#ifdef psignal
+ psignal( signal, "Signal Received : ");
+#else
+ cout << "got pipe signal:" << signal << endl;
+#endif
+ }
+
+ void abruptQuit(int x) {
+ ostringstream ossSig;
+ ossSig << "Got signal: " << x << " (" << strsignal( x ) << ")." << endl;
+ rawOut( ossSig.str() );
+
+ /*
+ ostringstream ossOp;
+ ossOp << "Last op: " << currentOp.infoNoauth() << endl;
+ rawOut( ossOp.str() );
+ */
+
+ ostringstream oss;
+ oss << "Backtrace:" << endl;
+ printStackTrace( oss );
+ rawOut( oss.str() );
+
+ // Don't go through normal shutdown procedure. It may make things worse.
+ ::exit(EXIT_ABRUPT);
+
+ }
+
+ void abruptQuitWithAddrSignal( int signal, siginfo_t *siginfo, void * ) {
+ ostringstream oss;
+ oss << "Invalid";
+ if ( signal == SIGSEGV || signal == SIGBUS ) {
+ oss << " access";
+ } else {
+ oss << " operation";
+ }
+ oss << " at address: " << siginfo->si_addr << endl;
+ rawOut( oss.str() );
+ abruptQuit( signal );
+ }
+
+ sigset_t asyncSignals;
+ // the signals in asyncSignals (populated in setupSignals below) are processed by this
+ // thread only, ensuring the db and log mutexes aren't held when they are handled.
+ void interruptThread() {
+ int x;
+ sigwait( &asyncSignals, &x );
+ log() << "got kill or ctrl c or hup signal " << x << " (" << strsignal( x ) << "), will terminate after current cmd ends" << endl;
+ Client::initThread( "interruptThread" );
+ exitCleanly( EXIT_KILL );
+ }
+
+ // this will be called in certain c++ error cases, for example if there are two active
+ // exceptions
+ void myterminate() {
+ rawOut( "terminate() called, printing stack:" );
+ printStackTrace();
+ ::abort();
+ }
+
+ // this gets called when new fails to allocate memory
+ void my_new_handler() {
+ rawOut( "out of memory, printing stack and exiting:" );
+ printStackTrace();
+ ::exit(EXIT_ABRUPT);
+ }
+
+ void setupSignals_ignoreHelper( int signal ) {}
+
+ void setupSignals( bool inFork ) {
+ struct sigaction addrSignals;
+ memset( &addrSignals, 0, sizeof( struct sigaction ) );
+ addrSignals.sa_sigaction = abruptQuitWithAddrSignal;
+ sigemptyset( &addrSignals.sa_mask );
+ addrSignals.sa_flags = SA_SIGINFO;
+
+ assert( sigaction(SIGSEGV, &addrSignals, 0) == 0 );
+ assert( sigaction(SIGBUS, &addrSignals, 0) == 0 );
+ assert( sigaction(SIGILL, &addrSignals, 0) == 0 );
+ assert( sigaction(SIGFPE, &addrSignals, 0) == 0 );
+
+ assert( signal(SIGABRT, abruptQuit) != SIG_ERR );
+ assert( signal(SIGQUIT, abruptQuit) != SIG_ERR );
+ assert( signal(SIGPIPE, pipeSigHandler) != SIG_ERR );
+
+ setupSIGTRAPforGDB();
+
+ sigemptyset( &asyncSignals );
+
+ if ( inFork )
+ assert( signal( SIGHUP , setupSignals_ignoreHelper ) != SIG_ERR );
+ else
+ sigaddset( &asyncSignals, SIGHUP );
+
+ sigaddset( &asyncSignals, SIGINT );
+ sigaddset( &asyncSignals, SIGTERM );
+ assert( pthread_sigmask( SIG_SETMASK, &asyncSignals, 0 ) == 0 );
+ boost::thread it( interruptThread );
+
+ set_terminate( myterminate );
+ set_new_handler( my_new_handler );
+ }
+
+#else
+ void consoleTerminate( const char* controlCodeName ) {
+ Client::initThread( "consoleTerminate" );
+ log() << "got " << controlCodeName << ", will terminate after current cmd ends" << endl;
+ exitCleanly( EXIT_KILL );
+ }
+
+ BOOL CtrlHandler( DWORD fdwCtrlType ) {
+
+ switch( fdwCtrlType ) {
+
+ case CTRL_C_EVENT:
+ rawOut( "Ctrl-C signal" );
+ consoleTerminate( "CTRL_C_EVENT" );
+ return TRUE ;
+
+ case CTRL_CLOSE_EVENT:
+ rawOut( "CTRL_CLOSE_EVENT signal" );
+ consoleTerminate( "CTRL_CLOSE_EVENT" );
+ return TRUE ;
+
+ case CTRL_BREAK_EVENT:
+ rawOut( "CTRL_BREAK_EVENT signal" );
+ consoleTerminate( "CTRL_BREAK_EVENT" );
+ return TRUE;
+
+ case CTRL_LOGOFF_EVENT:
+ rawOut( "CTRL_LOGOFF_EVENT signal" );
+ consoleTerminate( "CTRL_LOGOFF_EVENT" );
+ return TRUE;
+
+ case CTRL_SHUTDOWN_EVENT:
+ rawOut( "CTRL_SHUTDOWN_EVENT signal" );
+ consoleTerminate( "CTRL_SHUTDOWN_EVENT" );
+ return TRUE;
+
+ default:
+ return FALSE;
+ }
+ }
+
+ LPTOP_LEVEL_EXCEPTION_FILTER filtLast = 0;
+ ::HANDLE standardOut = GetStdHandle(STD_OUTPUT_HANDLE);
+ LONG WINAPI exceptionFilter(struct _EXCEPTION_POINTERS *ExceptionInfo) {
+ {
+ // given the severity of the event we write to the console in addition to the log
+ // file (rawOut writes to the log file, if one was specified)
+ DWORD written;
+ const char* exMsg = "unhandled windows exception\n";
+ WriteFile(standardOut, exMsg, (DWORD)strlen(exMsg), &written, 0); // write the whole message
+ FlushFileBuffers(standardOut);
+ }
+
+ DWORD ec = ExceptionInfo->ExceptionRecord->ExceptionCode;
+ if( ec == EXCEPTION_ACCESS_VIOLATION ) {
+ rawOut("access violation");
+ }
+ else {
+ rawOut("unhandled windows exception");
+ char buf[64];
+ strcpy(buf, "ec=0x");
+ _ui64toa(ec, buf+5, 16);
+ rawOut(buf);
+ }
+ if( filtLast )
+ return filtLast(ExceptionInfo);
+ return EXCEPTION_EXECUTE_HANDLER;
+ }
+
+ // called by mongoAbort()
+ extern void (*reportEventToSystem)(const char *msg);
+ void reportEventToSystemImpl(const char *msg) {
+ static ::HANDLE hEventLog = RegisterEventSource( NULL, TEXT("mongod") );
+ if( hEventLog ) {
+ std::wstring s = toNativeString(msg);
+ LPCTSTR txt = s.c_str();
+ BOOL ok = ReportEvent(
+ hEventLog, EVENTLOG_ERROR_TYPE,
+ 0, 0, NULL,
+ 1,
+ 0,
+ &txt,
+ 0);
+ wassert(ok);
+ }
+ }
+
+ void myPurecallHandler() {
+ printStackTrace();
+ mongoAbort("pure virtual");
+ }
+
+ void setupSignals( bool inFork ) {
+ reportEventToSystem = reportEventToSystemImpl;
+ filtLast = SetUnhandledExceptionFilter(exceptionFilter);
+ massert(10297 , "Couldn't register Windows Ctrl-C handler", SetConsoleCtrlHandler((PHANDLER_ROUTINE) CtrlHandler, TRUE));
+ _set_purecall_handler( myPurecallHandler );
+ }
+
+#endif
+
+} // namespace mongo
diff --git a/src/mongo/db/db.h b/src/mongo/db/db.h
new file mode 100644
index 00000000000..6a31a06f77c
--- /dev/null
+++ b/src/mongo/db/db.h
@@ -0,0 +1,120 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "../util/net/message.h"
+#include "concurrency.h"
+#include "pdfile.h"
+#include "curop.h"
+#include "client.h"
+#include "databaseholder.h"
+
+namespace mongo {
+
+ struct dbtemprelease {
+ Client::Context * _context;
+ int _locktype;
+
+ dbtemprelease() {
+ const Client& c = cc();
+ _context = c.getContext();
+ _locktype = d.dbMutex.getState();
+ assert( _locktype );
+
+ if ( _locktype > 0 ) {
+ massert( 10298 , "can't temprelease nested write lock", _locktype == 1);
+ if ( _context ) _context->unlocked();
+ d.dbMutex.unlock();
+ }
+ else {
+ massert( 10299 , "can't temprelease nested read lock", _locktype == -1);
+ if ( _context ) _context->unlocked();
+ d.dbMutex.unlock_shared();
+ }
+
+ verify( 14814 , c.curop() );
+ c.curop()->yielded();
+
+ }
+ ~dbtemprelease() {
+ if ( _locktype > 0 )
+ d.dbMutex.lock();
+ else
+ d.dbMutex.lock_shared();
+
+ if ( _context ) _context->relocked();
+ }
+ };
+
+ /** must be write locked
+ no assert (and no release) if nested write lock
+ a lot like dbtempreleasecond but no malloc so should be a tiny bit faster
+ */
+ struct dbtempreleasewritelock {
+ Client::Context * _context;
+ int _locktype;
+ dbtempreleasewritelock() {
+ const Client& c = cc();
+ _context = c.getContext();
+ _locktype = d.dbMutex.getState();
+ assert( _locktype >= 1 );
+ if( _locktype > 1 )
+ return; // nested
+ if ( _context )
+ _context->unlocked();
+ d.dbMutex.unlock();
+ verify( 14845 , c.curop() );
+ c.curop()->yielded();
+ }
+ ~dbtempreleasewritelock() {
+ if ( _locktype == 1 )
+ d.dbMutex.lock();
+ if ( _context )
+ _context->relocked();
+ }
+ };
+
+ /**
+ only does a temp release if we're not nested and have a lock
+ */
+ struct dbtempreleasecond {
+ dbtemprelease * real;
+ int locktype;
+
+ dbtempreleasecond() {
+ real = 0;
+ locktype = d.dbMutex.getState();
+ if ( locktype == 1 || locktype == -1 )
+ real = new dbtemprelease();
+ }
+
+ ~dbtempreleasecond() {
+ if ( real ) {
+ delete real;
+ real = 0;
+ }
+ }
+
+ bool unlocked() {
+ return real != 0;
+ }
+ };
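+
+ // a minimal usage sketch (hypothetical caller that already holds the write lock):
+ //
+ // {
+ // dbtempreleasecond t; // yields the lock unless nested
+ // sleepmillis( 2 ); // other operations may run here
+ // } // destructor reacquires the lock and calls Context::relocked()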
+
+} // namespace mongo
+
+#include "concurrency.h"
diff --git a/src/mongo/db/db.rc b/src/mongo/db/db.rc
new file mode 100755
index 00000000000..b589458cf73
--- /dev/null
+++ b/src/mongo/db/db.rc
@@ -0,0 +1,12 @@
+// Microsoft Visual C++ generated resource script.
+//
+#include "resource.h"
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Icon
+//
+// Icon with lowest ID value placed first to ensure application icon
+// remains consistent on all systems.
+IDI_ICON2 ICON "mongo.ico"
+///////////////////////////////////////////////////////////////////////////// \ No newline at end of file
diff --git a/src/mongo/db/db.vcxproj b/src/mongo/db/db.vcxproj
new file mode 100755
index 00000000000..8963f0af580
--- /dev/null
+++ b/src/mongo/db/db.vcxproj
@@ -0,0 +1,934 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectName>mongod</ProjectName>
+ <ProjectGuid>{215B2D68-0A70-4D10-8E75-B31010C62A91}</ProjectGuid>
+ <RootNamespace>db</RootNamespace>
+ <Keyword>Win32Proj</Keyword>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseOfMfc>false</UseOfMfc>
+ <UseOfAtl>false</UseOfAtl>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseOfMfc>false</UseOfMfc>
+ <UseOfAtl>false</UseOfAtl>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup>
+ <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</IntDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</LinkIncremental>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</LinkIncremental>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.;..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..;$(IncludePath)</IncludePath>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <MinimalRebuild>No</MinimalRebuild>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;Psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_32;\boost\lib\vs2010_32;\boost\lib</AdditionalLibraryDirectories>
+ <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+ <IgnoreSpecificDefaultLibraries>%(IgnoreSpecificDefaultLibraries)</IgnoreSpecificDefaultLibraries>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <TargetMachine>MachineX86</TargetMachine>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;4267;4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ <MinimalRebuild>No</MinimalRebuild>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;Psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_64;\boost\lib\vs2010_64;\boost\lib</AdditionalLibraryDirectories>
+ <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+ <IgnoreSpecificDefaultLibraries>%(IgnoreSpecificDefaultLibraries)</IgnoreSpecificDefaultLibraries>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>_UNICODE;UNICODE;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ <MinimalRebuild>No</MinimalRebuild>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_32;\boost\lib\vs2010_32;\boost\lib</AdditionalLibraryDirectories>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <TargetMachine>MachineX86</TargetMachine>
+ <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;4267;4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ <MinimalRebuild>No</MinimalRebuild>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_64;\boost\lib\vs2010_64;\boost\lib</AdditionalLibraryDirectories>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClCompile Include="..\bson\oid.cpp" />
+ <ClCompile Include="..\client\dbclientcursor.cpp" />
+ <ClCompile Include="..\client\dbclient_rs.cpp" />
+ <ClCompile Include="..\client\distlock.cpp" />
+ <ClCompile Include="..\client\model.cpp" />
+ <ClCompile Include="..\s\default_version.cpp" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcrecpp.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_chartables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_compile.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_config.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_dfa_exec.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_exec.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_fullinfo.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_get.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_globals.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_info.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_maketables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_newline.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ord2utf8.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_refcount.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_scanner.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_stringpiece.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_study.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_tables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_try_flipped.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ucp_searchfuncs.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_valid_utf8.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_version.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_xclass.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcreposix.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\scripting\bench.cpp" />
+ <ClCompile Include="..\shell\mongo.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\s\chunk.cpp" />
+ <ClCompile Include="..\s\config.cpp" />
+ <ClCompile Include="..\s\d_chunk_manager.cpp" />
+ <ClCompile Include="..\s\d_migrate.cpp" />
+ <ClCompile Include="..\s\d_split.cpp" />
+ <ClCompile Include="..\s\d_state.cpp" />
+ <ClCompile Include="..\s\d_writeback.cpp" />
+ <ClCompile Include="..\s\grid.cpp" />
+ <ClCompile Include="..\s\shard.cpp" />
+ <ClCompile Include="..\s\shardconnection.cpp" />
+ <ClCompile Include="..\s\shardkey.cpp" />
+ <ClCompile Include="..\third_party\snappy\snappy-sinksource.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\alignedbuilder.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\compress.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\concurrency\spin_lock.cpp" />
+ <ClCompile Include="..\util\concurrency\synchronization.cpp" />
+ <ClCompile Include="..\util\concurrency\task.cpp" />
+ <ClCompile Include="..\util\concurrency\thread_pool.cpp" />
+ <ClCompile Include="..\util\concurrency\vars.cpp" />
+ <ClCompile Include="..\util\file_allocator.cpp" />
+ <ClCompile Include="..\util\intrusive_counter.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\log.cpp" />
+ <ClCompile Include="..\util\logfile.cpp" />
+ <ClCompile Include="..\util\net\listen.cpp" />
+ <ClCompile Include="..\util\net\miniwebserver.cpp" />
+ <ClCompile Include="..\util\processinfo.cpp" />
+ <ClCompile Include="..\util\ramlog.cpp" />
+ <ClCompile Include="..\util\stringutils.cpp" />
+ <ClCompile Include="..\util\systeminfo_win32.cpp" />
+ <ClCompile Include="..\util\text.cpp" />
+ <ClCompile Include="..\util\version.cpp" />
+ <ClCompile Include="btreebuilder.cpp" />
+ <ClCompile Include="cap.cpp" />
+ <ClCompile Include="commands\cloud.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="commands\distinct.cpp">
+ <PrecompiledHeader>NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="commands\document_source_cursor.cpp" />
+ <ClCompile Include="commands\find_and_modify.cpp" />
+ <ClCompile Include="commands\group.cpp" />
+ <ClCompile Include="commands\isself.cpp" />
+ <ClCompile Include="commands\mr.cpp" />
+ <ClCompile Include="commands\pipeline_command.cpp" />
+ <ClCompile Include="commands\pipeline.cpp" />
+ <ClCompile Include="compact.cpp" />
+ <ClCompile Include="curop.cpp" />
+ <ClCompile Include="dbcommands_generic.cpp" />
+ <ClCompile Include="dbmessage.cpp" />
+ <ClCompile Include="dur.cpp" />
+ <ClCompile Include="durop.cpp" />
+ <ClCompile Include="dur_commitjob.cpp" />
+ <ClCompile Include="dur_journal.cpp" />
+ <ClCompile Include="dur_preplogbuffer.cpp" />
+ <ClCompile Include="dur_recover.cpp" />
+ <ClCompile Include="dur_writetodatafiles.cpp" />
+ <ClCompile Include="d_concurrency.cpp" />
+ <ClCompile Include="d_globals.cpp" />
+ <ClCompile Include="geo\2d.cpp" />
+ <ClCompile Include="geo\haystack.cpp" />
+ <ClCompile Include="key.cpp" />
+ <ClCompile Include="mongommf.cpp" />
+ <ClCompile Include="oplog.cpp" />
+ <ClCompile Include="ops\count.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="ops\delete.cpp" />
+ <ClCompile Include="ops\query.cpp" />
+ <ClCompile Include="ops\update.cpp" />
+ <ClCompile Include="pagefault.cpp" />
+ <ClCompile Include="pipeline\accumulator.cpp" />
+ <ClCompile Include="pipeline\accumulator_add_to_set.cpp" />
+ <ClCompile Include="pipeline\accumulator_avg.cpp" />
+ <ClCompile Include="pipeline\accumulator_first.cpp" />
+ <ClCompile Include="pipeline\accumulator_last.cpp" />
+ <ClCompile Include="pipeline\accumulator_min_max.cpp" />
+ <ClCompile Include="pipeline\accumulator_push.cpp" />
+ <ClCompile Include="pipeline\accumulator_single_value.cpp" />
+ <ClCompile Include="pipeline\accumulator_sum.cpp" />
+ <ClCompile Include="pipeline\builder.cpp" />
+ <ClCompile Include="pipeline\document.cpp" />
+ <ClCompile Include="pipeline\document_source.cpp" />
+ <ClCompile Include="pipeline\document_source_bson_array.cpp" />
+ <ClCompile Include="pipeline\document_source_command_futures.cpp" />
+ <ClCompile Include="pipeline\document_source_filter.cpp" />
+ <ClCompile Include="pipeline\document_source_filter_base.cpp" />
+ <ClCompile Include="pipeline\document_source_group.cpp" />
+ <ClCompile Include="pipeline\document_source_limit.cpp" />
+ <ClCompile Include="pipeline\document_source_match.cpp" />
+ <ClCompile Include="pipeline\document_source_out.cpp" />
+ <ClCompile Include="pipeline\document_source_project.cpp" />
+ <ClCompile Include="pipeline\document_source_skip.cpp" />
+ <ClCompile Include="pipeline\document_source_sort.cpp" />
+ <ClCompile Include="pipeline\document_source_unwind.cpp" />
+ <ClCompile Include="pipeline\doc_mem_monitor.cpp" />
+ <ClCompile Include="pipeline\expression.cpp" />
+ <ClCompile Include="pipeline\expression_context.cpp" />
+ <ClCompile Include="pipeline\field_path.cpp" />
+ <ClCompile Include="pipeline\value.cpp" />
+ <ClCompile Include="projection.cpp" />
+ <ClCompile Include="queryoptimizercursor.cpp" />
+ <ClCompile Include="querypattern.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="record.cpp" />
+ <ClCompile Include="repl.cpp" />
+ <ClCompile Include="repl\consensus.cpp" />
+ <ClCompile Include="repl\heartbeat.cpp" />
+ <ClCompile Include="repl\manager.cpp" />
+ <ClCompile Include="repl\rs_initialsync.cpp" />
+ <ClCompile Include="repl\rs_initiate.cpp" />
+ <ClCompile Include="repl\rs_rollback.cpp" />
+ <ClCompile Include="repl\rs_sync.cpp" />
+ <ClCompile Include="repl_block.cpp" />
+ <ClCompile Include="restapi.cpp" />
+ <ClCompile Include="..\client\connpool.cpp" />
+ <ClCompile Include="..\client\dbclient.cpp" />
+ <ClCompile Include="..\client\syncclusterconnection.cpp" />
+ <ClCompile Include="..\pch.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="client.cpp" />
+ <ClCompile Include="clientcursor.cpp" />
+ <ClCompile Include="cloner.cpp" />
+ <ClCompile Include="commands.cpp" />
+ <ClCompile Include="common.cpp">
+ <PrecompiledHeader>NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="cursor.cpp" />
+ <ClCompile Include="database.cpp" />
+ <ClCompile Include="db.cpp" />
+ <ClCompile Include="dbcommands.cpp" />
+ <ClCompile Include="dbcommands_admin.cpp" />
+ <ClCompile Include="dbeval.cpp" />
+ <ClCompile Include="dbhelpers.cpp" />
+ <ClCompile Include="dbwebserver.cpp" />
+ <ClCompile Include="extsort.cpp" />
+ <ClCompile Include="index.cpp" />
+ <ClCompile Include="indexkey.cpp" />
+ <ClCompile Include="instance.cpp" />
+ <ClCompile Include="introspect.cpp" />
+ <ClCompile Include="jsobj.cpp" />
+ <ClCompile Include="json.cpp" />
+ <ClCompile Include="lasterror.cpp" />
+ <ClCompile Include="matcher.cpp" />
+ <ClCompile Include="matcher_covered.cpp" />
+ <ClCompile Include="..\util\mmap_win.cpp" />
+ <ClCompile Include="modules\mms.cpp" />
+ <ClCompile Include="module.cpp" />
+ <ClCompile Include="namespace.cpp" />
+ <ClCompile Include="nonce.cpp" />
+ <ClCompile Include="..\client\parallel.cpp" />
+ <ClCompile Include="pdfile.cpp" />
+ <ClCompile Include="queryoptimizer.cpp" />
+ <ClCompile Include="scanandorder.cpp" />
+ <ClCompile Include="security.cpp" />
+ <ClCompile Include="security_commands.cpp" />
+ <ClCompile Include="security_common.cpp" />
+ <ClCompile Include="tests.cpp" />
+ <ClCompile Include="cmdline.cpp" />
+ <ClCompile Include="queryutil.cpp" />
+ <ClCompile Include="..\util\assert_util.cpp" />
+ <ClCompile Include="..\util\background.cpp" />
+ <ClCompile Include="..\util\base64.cpp" />
+ <ClCompile Include="..\util\mmap.cpp" />
+ <ClCompile Include="..\util\ntservice.cpp" />
+ <ClCompile Include="..\util\processinfo_win32.cpp" />
+ <ClCompile Include="..\util\util.cpp" />
+ <ClCompile Include="..\util\net\httpclient.cpp" />
+ <ClCompile Include="..\util\md5.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeaderFile Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeaderFile>
+ <PrecompiledHeaderFile Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeaderFile>
+ </ClCompile>
+ <ClCompile Include="..\util\md5main.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Use</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Use</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\net\message.cpp" />
+ <ClCompile Include="..\util\net\message_port.cpp" />
+ <ClCompile Include="..\util\net\message_server_port.cpp" />
+ <ClCompile Include="..\util\net\sock.cpp" />
+ <ClCompile Include="..\s\d_logic.cpp" />
+ <ClCompile Include="..\scripting\engine.cpp" />
+ <ClCompile Include="..\scripting\engine_spidermonkey.cpp" />
+ <ClCompile Include="..\scripting\utils.cpp" />
+ <ClCompile Include="stats\counters.cpp" />
+ <ClCompile Include="stats\snapshots.cpp" />
+ <ClCompile Include="stats\top.cpp" />
+ <ClCompile Include="btree.cpp" />
+ <ClCompile Include="btreecursor.cpp" />
+ <ClCompile Include="repl\health.cpp" />
+ <ClCompile Include="repl\rs.cpp" />
+ <ClCompile Include="repl\replset_commands.cpp" />
+ <ClCompile Include="repl\rs_config.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <None Include="..\jstests\dur\basic1.sh" />
+ <None Include="..\jstests\dur\dur1.js" />
+ <None Include="..\jstests\replsets\replset1.js" />
+ <None Include="..\jstests\replsets\replset2.js" />
+ <None Include="..\jstests\replsets\replset3.js" />
+ <None Include="..\jstests\replsets\replset4.js" />
+ <None Include="..\jstests\replsets\replset5.js" />
+ <None Include="..\jstests\replsets\replsetadd.js" />
+ <None Include="..\jstests\replsets\replsetarb1.js" />
+ <None Include="..\jstests\replsets\replsetarb2.js" />
+ <None Include="..\jstests\replsets\replsetprio1.js" />
+ <None Include="..\jstests\replsets\replsetrestart1.js" />
+ <None Include="..\jstests\replsets\replsetrestart2.js" />
+ <None Include="..\jstests\replsets\replset_remove_node.js" />
+ <None Include="..\jstests\replsets\rollback.js" />
+ <None Include="..\jstests\replsets\rollback2.js" />
+ <None Include="..\jstests\replsets\sync1.js" />
+ <None Include="..\jstests\replsets\twosets.js" />
+ <None Include="..\SConstruct" />
+ <None Include="..\util\mongoutils\README" />
+ <None Include="mongo.ico" />
+ <None Include="repl\notes.txt" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\bson\bson-inl.h" />
+ <ClInclude Include="..\bson\bson.h" />
+ <ClInclude Include="..\bson\bson_db.h" />
+ <ClInclude Include="..\bson\inline_decls.h" />
+ <ClInclude Include="..\bson\stringdata.h" />
+ <ClInclude Include="..\bson\util\atomic_int.h" />
+ <ClInclude Include="..\bson\util\builder.h" />
+ <ClInclude Include="..\bson\util\misc.h" />
+ <ClInclude Include="..\client\dbclientcursor.h" />
+ <ClInclude Include="..\client\distlock.h" />
+ <ClInclude Include="..\client\gridfs.h" />
+ <ClInclude Include="..\client\parallel.h" />
+ <ClInclude Include="..\s\d_logic.h" />
+ <ClInclude Include="..\targetver.h" />
+ <ClInclude Include="..\third_party\pcre-7.4\config.h" />
+ <ClInclude Include="..\third_party\pcre-7.4\pcre.h" />
+ <ClInclude Include="..\third_party\snappy\config.h" />
+ <ClInclude Include="..\third_party\snappy\snappy.h" />
+ <ClInclude Include="..\util\alignedbuilder.h" />
+ <ClInclude Include="..\util\concurrency\race.h" />
+ <ClInclude Include="..\util\concurrency\rwlock.h" />
+ <ClInclude Include="..\util\concurrency\msg.h" />
+ <ClInclude Include="..\util\concurrency\mutex.h" />
+ <ClInclude Include="..\util\concurrency\mvar.h" />
+ <ClInclude Include="..\util\concurrency\task.h" />
+ <ClInclude Include="..\util\concurrency\thread_pool.h" />
+ <ClInclude Include="..\util\intrusive_counter.h" />
+ <ClInclude Include="..\util\logfile.h" />
+ <ClInclude Include="..\util\mongoutils\checksum.h" />
+ <ClInclude Include="..\util\mongoutils\html.h" />
+ <ClInclude Include="..\util\mongoutils\str.h" />
+ <ClInclude Include="..\util\net\hostandport.h" />
+ <ClInclude Include="..\util\net\listen.h" />
+ <ClInclude Include="..\util\net\message_port.h" />
+ <ClInclude Include="..\util\net\miniwebserver.h" />
+ <ClInclude Include="..\util\paths.h" />
+ <ClInclude Include="..\util\ramlog.h" />
+ <ClInclude Include="..\util\systeminfo.h" />
+ <ClInclude Include="..\util\text.h" />
+ <ClInclude Include="..\util\time_support.h" />
+ <ClInclude Include="databaseholder.h" />
+ <ClInclude Include="durop.h" />
+ <ClInclude Include="dur_commitjob.h" />
+ <ClInclude Include="dur_journal.h" />
+ <ClInclude Include="dur_journalformat.h" />
+ <ClInclude Include="dur_journalimpl.h" />
+ <ClInclude Include="dur_stats.h" />
+ <ClInclude Include="d_globals.h" />
+ <ClInclude Include="geo\core.h" />
+ <ClInclude Include="globals.h" />
+ <ClInclude Include="helpers\dblogger.h" />
+ <ClInclude Include="instance.h" />
+ <ClInclude Include="mongommf.h" />
+ <ClInclude Include="mongomutex.h" />
+ <ClInclude Include="namespace-inl.h" />
+ <ClInclude Include="namespacestring.h" />
+ <ClInclude Include="oplogreader.h" />
+ <ClInclude Include="ops\count.h" />
+ <ClInclude Include="ops\delete.h" />
+ <ClInclude Include="ops\update.h" />
+ <ClInclude Include="pagefault.h" />
+ <ClInclude Include="pipeline\accumulator.h" />
+ <ClInclude Include="pipeline\builder.h" />
+ <ClInclude Include="pipeline\document.h" />
+ <ClInclude Include="pipeline\document_source.h" />
+ <ClInclude Include="pipeline\doc_mem_monitor.h" />
+ <ClInclude Include="pipeline\expression.h" />
+ <ClInclude Include="pipeline\expression_context.h" />
+ <ClInclude Include="pipeline\field_path.h" />
+ <ClInclude Include="pipeline\value.h" />
+ <ClInclude Include="projection.h" />
+ <ClInclude Include="queryutil.h" />
+ <ClInclude Include="repl.h" />
+ <ClInclude Include="replpair.h" />
+ <ClInclude Include="repl\connections.h" />
+ <ClInclude Include="repl\multicmd.h" />
+ <ClInclude Include="repl\rsmember.h" />
+ <ClInclude Include="repl\rs_optime.h" />
+ <ClInclude Include="stats\counters.h" />
+ <ClInclude Include="stats\snapshots.h" />
+ <ClInclude Include="stats\top.h" />
+ <ClInclude Include="..\client\connpool.h" />
+ <ClInclude Include="..\client\dbclient.h" />
+ <ClInclude Include="..\client\model.h" />
+ <ClInclude Include="..\client\redef_macros.h" />
+ <ClInclude Include="..\client\syncclusterconnection.h" />
+ <ClInclude Include="..\client\undef_macros.h" />
+ <ClInclude Include="background.h" />
+ <ClInclude Include="client.h" />
+ <ClInclude Include="clientcursor.h" />
+ <ClInclude Include="cmdline.h" />
+ <ClInclude Include="commands.h" />
+ <ClInclude Include="concurrency.h" />
+ <ClInclude Include="curop.h" />
+ <ClInclude Include="cursor.h" />
+ <ClInclude Include="database.h" />
+ <ClInclude Include="db.h" />
+ <ClInclude Include="dbhelpers.h" />
+ <ClInclude Include="dbinfo.h" />
+ <ClInclude Include="dbmessage.h" />
+ <ClInclude Include="diskloc.h" />
+ <ClInclude Include="index.h" />
+ <ClInclude Include="indexkey.h" />
+ <ClInclude Include="introspect.h" />
+ <ClInclude Include="json.h" />
+ <ClInclude Include="matcher.h" />
+ <ClInclude Include="namespace.h" />
+ <ClInclude Include="..\pch.h" />
+ <ClInclude Include="pdfile.h" />
+ <ClInclude Include="..\grid\protocol.h" />
+ <ClInclude Include="query.h" />
+ <ClInclude Include="queryoptimizer.h" />
+ <ClInclude Include="resource.h" />
+ <ClInclude Include="scanandorder.h" />
+ <ClInclude Include="security.h" />
+ <ClInclude Include="..\util\allocator.h" />
+ <ClInclude Include="..\util\array.h" />
+ <ClInclude Include="..\util\assert_util.h" />
+ <ClInclude Include="..\util\background.h" />
+ <ClInclude Include="..\util\base64.h" />
+ <ClInclude Include="..\util\builder.h" />
+ <ClInclude Include="..\util\debug_util.h" />
+ <ClInclude Include="..\util\embedded_builder.h" />
+ <ClInclude Include="..\util\file.h" />
+ <ClInclude Include="..\util\file_allocator.h" />
+ <ClInclude Include="..\util\goodies.h" />
+ <ClInclude Include="..\util\hashtab.h" />
+ <ClInclude Include="..\util\hex.h" />
+ <ClInclude Include="lasterror.h" />
+ <ClInclude Include="..\util\log.h" />
+ <ClInclude Include="..\util\lruishmap.h" />
+ <ClInclude Include="..\util\mmap.h" />
+ <ClInclude Include="..\util\ntservice.h" />
+ <ClInclude Include="..\util\optime.h" />
+ <ClInclude Include="..\util\processinfo.h" />
+ <ClInclude Include="..\util\queue.h" />
+ <ClInclude Include="..\util\ramstore.h" />
+ <ClInclude Include="..\util\unittest.h" />
+ <ClInclude Include="..\util\concurrency\list.h" />
+ <ClInclude Include="..\util\concurrency\value.h" />
+ <ClInclude Include="..\util\web\html.h" />
+ <ClInclude Include="..\util\net\httpclient.h" />
+ <ClInclude Include="..\util\md5.h" />
+ <ClInclude Include="..\util\md5.hpp" />
+ <ClInclude Include="..\util\net\message.h" />
+ <ClInclude Include="..\util\net\message_server.h" />
+ <ClInclude Include="..\util\net\sock.h" />
+ <ClInclude Include="..\scripting\engine.h" />
+ <ClInclude Include="..\scripting\engine_spidermonkey.h" />
+ <ClInclude Include="..\scripting\engine_v8.h" />
+ <ClInclude Include="..\scripting\v8_db.h" />
+ <ClInclude Include="..\scripting\v8_utils.h" />
+ <ClInclude Include="..\scripting\v8_wrapper.h" />
+ <ClInclude Include="btree.h" />
+ <ClInclude Include="repl\health.h" />
+ <ClInclude Include="repl\rs.h" />
+ <ClInclude Include="repl\rs_config.h" />
+ <ClInclude Include="..\bson\bsonelement.h" />
+ <ClInclude Include="..\bson\bsoninlines.h" />
+ <ClInclude Include="..\bson\bsonmisc.h" />
+ <ClInclude Include="..\bson\bsonobj.h" />
+ <ClInclude Include="..\bson\bsonobjbuilder.h" />
+ <ClInclude Include="..\bson\bsonobjiterator.h" />
+ <ClInclude Include="..\bson\bsontypes.h" />
+ <ClInclude Include="jsobj.h" />
+ <ClInclude Include="..\bson\oid.h" />
+ <ClInclude Include="..\bson\ordering.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <Library Include="..\..\js\js32d.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js32r.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js64d.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js64r.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ </ItemGroup>
+ <ItemGroup>
+ <ResourceCompile Include="db.rc" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/src/mongo/db/db.vcxproj.filters b/src/mongo/db/db.vcxproj.filters
new file mode 100755
index 00000000000..a39df0dc796
--- /dev/null
+++ b/src/mongo/db/db.vcxproj.filters
@@ -0,0 +1,432 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <ClCompile Include="..\bson\oid.cpp" />
+ <ClCompile Include="..\client\dbclientcursor.cpp" />
+ <ClCompile Include="..\client\dbclient_rs.cpp" />
+ <ClCompile Include="..\client\distlock.cpp" />
+ <ClCompile Include="..\client\model.cpp" />
+ <ClCompile Include="..\scripting\bench.cpp" />
+ <ClCompile Include="..\shell\mongo.cpp" />
+ <ClCompile Include="..\s\chunk.cpp" />
+ <ClCompile Include="..\s\config.cpp" />
+ <ClCompile Include="..\s\d_chunk_manager.cpp" />
+ <ClCompile Include="..\s\d_migrate.cpp" />
+ <ClCompile Include="..\s\d_split.cpp" />
+ <ClCompile Include="..\s\d_state.cpp" />
+ <ClCompile Include="..\s\d_writeback.cpp" />
+ <ClCompile Include="..\s\grid.cpp" />
+ <ClCompile Include="..\s\shard.cpp" />
+ <ClCompile Include="..\s\shardconnection.cpp" />
+ <ClCompile Include="..\s\shardkey.cpp" />
+ <ClCompile Include="..\util\alignedbuilder.cpp" />
+ <ClCompile Include="..\util\concurrency\spin_lock.cpp" />
+ <ClCompile Include="..\util\concurrency\synchronization.cpp" />
+ <ClCompile Include="..\util\concurrency\task.cpp" />
+ <ClCompile Include="..\util\concurrency\thread_pool.cpp" />
+ <ClCompile Include="..\util\concurrency\vars.cpp" />
+ <ClCompile Include="..\util\log.cpp" />
+ <ClCompile Include="..\util\logfile.cpp" />
+ <ClCompile Include="..\util\processinfo.cpp" />
+ <ClCompile Include="..\util\stringutils.cpp" />
+ <ClCompile Include="..\util\text.cpp" />
+ <ClCompile Include="..\util\version.cpp" />
+ <ClCompile Include="cap.cpp" />
+ <ClCompile Include="commands\distinct.cpp" />
+ <ClCompile Include="commands\group.cpp" />
+ <ClCompile Include="commands\isself.cpp" />
+ <ClCompile Include="commands\mr.cpp" />
+ <ClCompile Include="compact.cpp" />
+ <ClCompile Include="dbcommands_generic.cpp" />
+ <ClCompile Include="dur.cpp" />
+ <ClCompile Include="durop.cpp" />
+ <ClCompile Include="dur_commitjob.cpp" />
+ <ClCompile Include="dur_journal.cpp" />
+ <ClCompile Include="dur_preplogbuffer.cpp" />
+ <ClCompile Include="dur_recover.cpp" />
+ <ClCompile Include="dur_writetodatafiles.cpp" />
+ <ClCompile Include="geo\2d.cpp" />
+ <ClCompile Include="geo\haystack.cpp" />
+ <ClCompile Include="mongommf.cpp" />
+ <ClCompile Include="oplog.cpp" />
+ <ClCompile Include="projection.cpp" />
+ <ClCompile Include="repl.cpp" />
+ <ClCompile Include="repl\consensus.cpp" />
+ <ClCompile Include="repl\heartbeat.cpp" />
+ <ClCompile Include="repl\manager.cpp" />
+ <ClCompile Include="repl\rs_initialsync.cpp" />
+ <ClCompile Include="repl\rs_initiate.cpp" />
+ <ClCompile Include="repl\rs_rollback.cpp" />
+ <ClCompile Include="repl\rs_sync.cpp" />
+ <ClCompile Include="repl_block.cpp" />
+ <ClCompile Include="restapi.cpp" />
+ <ClCompile Include="..\client\connpool.cpp" />
+ <ClCompile Include="..\client\dbclient.cpp" />
+ <ClCompile Include="..\client\syncclusterconnection.cpp" />
+ <ClCompile Include="..\pch.cpp" />
+ <ClCompile Include="client.cpp" />
+ <ClCompile Include="clientcursor.cpp" />
+ <ClCompile Include="cloner.cpp" />
+ <ClCompile Include="commands.cpp" />
+ <ClCompile Include="common.cpp" />
+ <ClCompile Include="cursor.cpp" />
+ <ClCompile Include="database.cpp" />
+ <ClCompile Include="db.cpp" />
+ <ClCompile Include="dbcommands.cpp" />
+ <ClCompile Include="dbcommands_admin.cpp" />
+ <ClCompile Include="dbeval.cpp" />
+ <ClCompile Include="dbhelpers.cpp" />
+ <ClCompile Include="dbwebserver.cpp" />
+ <ClCompile Include="extsort.cpp" />
+ <ClCompile Include="index.cpp" />
+ <ClCompile Include="indexkey.cpp" />
+ <ClCompile Include="instance.cpp" />
+ <ClCompile Include="introspect.cpp" />
+ <ClCompile Include="jsobj.cpp" />
+ <ClCompile Include="json.cpp" />
+ <ClCompile Include="lasterror.cpp" />
+ <ClCompile Include="matcher.cpp" />
+ <ClCompile Include="matcher_covered.cpp" />
+ <ClCompile Include="..\util\mmap_win.cpp" />
+ <ClCompile Include="modules\mms.cpp" />
+ <ClCompile Include="module.cpp" />
+ <ClCompile Include="namespace.cpp" />
+ <ClCompile Include="nonce.cpp" />
+ <ClCompile Include="..\client\parallel.cpp" />
+ <ClCompile Include="pdfile.cpp" />
+ <ClCompile Include="queryoptimizer.cpp" />
+ <ClCompile Include="security.cpp" />
+ <ClCompile Include="security_commands.cpp" />
+ <ClCompile Include="tests.cpp" />
+ <ClCompile Include="cmdline.cpp" />
+ <ClCompile Include="queryutil.cpp" />
+ <ClCompile Include="..\util\assert_util.cpp" />
+ <ClCompile Include="..\util\background.cpp" />
+ <ClCompile Include="..\util\base64.cpp" />
+ <ClCompile Include="..\util\mmap.cpp" />
+ <ClCompile Include="..\util\ntservice.cpp" />
+ <ClCompile Include="..\util\processinfo_win32.cpp" />
+ <ClCompile Include="..\util\util.cpp" />
+ <ClCompile Include="..\util\md5.c" />
+ <ClCompile Include="..\util\md5main.cpp" />
+ <ClCompile Include="..\s\d_logic.cpp" />
+ <ClCompile Include="..\scripting\engine.cpp" />
+ <ClCompile Include="..\scripting\engine_spidermonkey.cpp" />
+ <ClCompile Include="..\scripting\utils.cpp" />
+ <ClCompile Include="stats\counters.cpp" />
+ <ClCompile Include="stats\snapshots.cpp" />
+ <ClCompile Include="stats\top.cpp" />
+ <ClCompile Include="btree.cpp" />
+ <ClCompile Include="btreecursor.cpp" />
+ <ClCompile Include="repl\health.cpp" />
+ <ClCompile Include="repl\rs.cpp" />
+ <ClCompile Include="repl\replset_commands.cpp" />
+ <ClCompile Include="repl\rs_config.cpp" />
+ <ClCompile Include="..\util\file_allocator.cpp" />
+ <ClCompile Include="querypattern.cpp" />
+ <ClCompile Include="..\util\ramlog.cpp" />
+ <ClCompile Include="key.cpp" />
+ <ClCompile Include="btreebuilder.cpp" />
+ <ClCompile Include="queryoptimizercursor.cpp" />
+ <ClCompile Include="record.cpp" />
+ <ClCompile Include="ops\delete.cpp" />
+ <ClCompile Include="ops\update.cpp" />
+ <ClCompile Include="security_common.cpp" />
+ <ClCompile Include="ops\query.cpp" />
+ <ClCompile Include="..\util\net\httpclient.cpp" />
+ <ClCompile Include="..\util\net\message.cpp" />
+ <ClCompile Include="..\util\net\message_server_port.cpp" />
+ <ClCompile Include="..\util\net\sock.cpp" />
+ <ClCompile Include="..\util\net\miniwebserver.cpp" />
+ <ClCompile Include="..\util\net\listen.cpp" />
+ <ClCompile Include="..\util\net\message_port.cpp" />
+ <ClCompile Include="dbmessage.cpp" />
+ <ClCompile Include="commands\find_and_modify.cpp" />
+ <ClCompile Include="..\util\compress.cpp">
+ <Filter>snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy-sinksource.cc">
+ <Filter>snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy.cc">
+ <Filter>snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="scanandorder.cpp" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcrecpp.cc" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_chartables.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_compile.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_config.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_dfa_exec.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_exec.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_fullinfo.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_get.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_globals.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_info.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_maketables.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_newline.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ord2utf8.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_refcount.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_scanner.cc" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_stringpiece.cc" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_study.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_tables.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_try_flipped.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ucp_searchfuncs.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_valid_utf8.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_version.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_xclass.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcreposix.c" />
+ <ClCompile Include="commands\cloud.cpp" />
+ <ClCompile Include="commands\pipeline_command.cpp" />
+ <ClCompile Include="commands\pipeline.cpp" />
+ <ClCompile Include="pipeline\accumulator.cpp" />
+ <ClCompile Include="pipeline\accumulator_add_to_set.cpp" />
+ <ClCompile Include="pipeline\accumulator_avg.cpp" />
+ <ClCompile Include="pipeline\accumulator_first.cpp" />
+ <ClCompile Include="pipeline\accumulator_last.cpp" />
+ <ClCompile Include="pipeline\accumulator_min_max.cpp" />
+ <ClCompile Include="pipeline\accumulator_push.cpp" />
+ <ClCompile Include="pipeline\accumulator_single_value.cpp" />
+ <ClCompile Include="pipeline\accumulator_sum.cpp" />
+ <ClCompile Include="pipeline\builder.cpp" />
+ <ClCompile Include="pipeline\doc_mem_monitor.cpp" />
+ <ClCompile Include="pipeline\document.cpp" />
+ <ClCompile Include="pipeline\document_source.cpp" />
+ <ClCompile Include="pipeline\document_source_bson_array.cpp" />
+ <ClCompile Include="pipeline\document_source_command_futures.cpp" />
+ <ClCompile Include="pipeline\document_source_filter.cpp" />
+ <ClCompile Include="pipeline\document_source_filter_base.cpp" />
+ <ClCompile Include="pipeline\document_source_group.cpp" />
+ <ClCompile Include="pipeline\document_source_limit.cpp" />
+ <ClCompile Include="pipeline\document_source_match.cpp" />
+ <ClCompile Include="pipeline\document_source_out.cpp" />
+ <ClCompile Include="pipeline\document_source_project.cpp" />
+ <ClCompile Include="pipeline\document_source_skip.cpp" />
+ <ClCompile Include="pipeline\document_source_sort.cpp" />
+ <ClCompile Include="pipeline\document_source_unwind.cpp" />
+ <ClCompile Include="pipeline\expression.cpp" />
+ <ClCompile Include="pipeline\expression_context.cpp" />
+ <ClCompile Include="pipeline\field_path.cpp" />
+ <ClCompile Include="pipeline\value.cpp" />
+ <ClCompile Include="..\util\intrusive_counter.cpp" />
+ <ClCompile Include="..\util\systeminfo_win32.cpp" />
+ <ClCompile Include="commands\document_source_cursor.cpp" />
+ <ClCompile Include="d_concurrency.cpp" />
+ <ClCompile Include="..\s\default_version.cpp" />
+ <ClCompile Include="ops\count.cpp" />
+ <ClCompile Include="pagefault.cpp" />
+ <ClCompile Include="d_globals.cpp" />
+ <ClCompile Include="curop.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\client\dbclientcursor.h" />
+ <ClInclude Include="..\client\distlock.h" />
+ <ClInclude Include="..\client\gridfs.h" />
+ <ClInclude Include="..\client\parallel.h" />
+ <ClInclude Include="..\s\d_logic.h" />
+ <ClInclude Include="..\targetver.h" />
+ <ClInclude Include="..\util\concurrency\rwlock.h" />
+ <ClInclude Include="..\util\concurrency\msg.h" />
+ <ClInclude Include="..\util\concurrency\mutex.h" />
+ <ClInclude Include="..\util\concurrency\mvar.h" />
+ <ClInclude Include="..\util\concurrency\task.h" />
+ <ClInclude Include="..\util\concurrency\thread_pool.h" />
+ <ClInclude Include="..\util\logfile.h" />
+ <ClInclude Include="..\util\mongoutils\checksum.h" />
+ <ClInclude Include="..\util\mongoutils\html.h" />
+ <ClInclude Include="..\util\mongoutils\str.h" />
+ <ClInclude Include="..\util\paths.h" />
+ <ClInclude Include="..\util\ramlog.h" />
+ <ClInclude Include="..\util\text.h" />
+ <ClInclude Include="..\util\time_support.h" />
+ <ClInclude Include="durop.h" />
+ <ClInclude Include="dur_commitjob.h" />
+ <ClInclude Include="dur_journal.h" />
+ <ClInclude Include="dur_journalformat.h" />
+ <ClInclude Include="dur_stats.h" />
+ <ClInclude Include="geo\core.h" />
+ <ClInclude Include="helpers\dblogger.h" />
+ <ClInclude Include="instance.h" />
+ <ClInclude Include="mongommf.h" />
+ <ClInclude Include="mongomutex.h" />
+ <ClInclude Include="namespace-inl.h" />
+ <ClInclude Include="oplogreader.h" />
+ <ClInclude Include="projection.h" />
+ <ClInclude Include="repl.h" />
+ <ClInclude Include="replpair.h" />
+ <ClInclude Include="repl\connections.h" />
+ <ClInclude Include="repl\multicmd.h" />
+ <ClInclude Include="repl\rsmember.h" />
+ <ClInclude Include="repl\rs_optime.h" />
+ <ClInclude Include="stats\counters.h" />
+ <ClInclude Include="stats\snapshots.h" />
+ <ClInclude Include="stats\top.h" />
+ <ClInclude Include="..\client\connpool.h" />
+ <ClInclude Include="..\client\dbclient.h" />
+ <ClInclude Include="..\client\model.h" />
+ <ClInclude Include="..\client\redef_macros.h" />
+ <ClInclude Include="..\client\syncclusterconnection.h" />
+ <ClInclude Include="..\client\undef_macros.h" />
+ <ClInclude Include="background.h" />
+ <ClInclude Include="client.h" />
+ <ClInclude Include="clientcursor.h" />
+ <ClInclude Include="cmdline.h" />
+ <ClInclude Include="commands.h" />
+ <ClInclude Include="concurrency.h" />
+ <ClInclude Include="curop.h" />
+ <ClInclude Include="cursor.h" />
+ <ClInclude Include="database.h" />
+ <ClInclude Include="db.h" />
+ <ClInclude Include="dbhelpers.h" />
+ <ClInclude Include="dbinfo.h" />
+ <ClInclude Include="dbmessage.h" />
+ <ClInclude Include="diskloc.h" />
+ <ClInclude Include="index.h" />
+ <ClInclude Include="indexkey.h" />
+ <ClInclude Include="introspect.h" />
+ <ClInclude Include="json.h" />
+ <ClInclude Include="matcher.h" />
+ <ClInclude Include="namespace.h" />
+ <ClInclude Include="..\pch.h" />
+ <ClInclude Include="pdfile.h" />
+ <ClInclude Include="..\grid\protocol.h" />
+ <ClInclude Include="query.h" />
+ <ClInclude Include="queryoptimizer.h" />
+ <ClInclude Include="resource.h" />
+ <ClInclude Include="scanandorder.h" />
+ <ClInclude Include="security.h" />
+ <ClInclude Include="..\util\allocator.h" />
+ <ClInclude Include="..\util\array.h" />
+ <ClInclude Include="..\util\assert_util.h" />
+ <ClInclude Include="..\util\background.h" />
+ <ClInclude Include="..\util\base64.h" />
+ <ClInclude Include="..\util\builder.h" />
+ <ClInclude Include="..\util\debug_util.h" />
+ <ClInclude Include="..\util\embedded_builder.h" />
+ <ClInclude Include="..\util\file.h" />
+ <ClInclude Include="..\util\file_allocator.h" />
+ <ClInclude Include="..\util\goodies.h" />
+ <ClInclude Include="..\util\hashtab.h" />
+ <ClInclude Include="..\util\hex.h" />
+ <ClInclude Include="lasterror.h" />
+ <ClInclude Include="..\util\log.h" />
+ <ClInclude Include="..\util\lruishmap.h" />
+ <ClInclude Include="..\util\mmap.h" />
+ <ClInclude Include="..\util\ntservice.h" />
+ <ClInclude Include="..\util\optime.h" />
+ <ClInclude Include="..\util\processinfo.h" />
+ <ClInclude Include="..\util\queue.h" />
+ <ClInclude Include="..\util\ramstore.h" />
+ <ClInclude Include="..\util\unittest.h" />
+ <ClInclude Include="..\util\concurrency\list.h" />
+ <ClInclude Include="..\util\concurrency\value.h" />
+ <ClInclude Include="..\util\web\html.h" />
+ <ClInclude Include="..\util\md5.h" />
+ <ClInclude Include="..\util\md5.hpp" />
+ <ClInclude Include="..\scripting\engine.h" />
+ <ClInclude Include="..\scripting\engine_spidermonkey.h" />
+ <ClInclude Include="..\scripting\engine_v8.h" />
+ <ClInclude Include="..\scripting\v8_db.h" />
+ <ClInclude Include="..\scripting\v8_utils.h" />
+ <ClInclude Include="..\scripting\v8_wrapper.h" />
+ <ClInclude Include="btree.h" />
+ <ClInclude Include="repl\health.h" />
+ <ClInclude Include="repl\rs.h" />
+ <ClInclude Include="repl\rs_config.h" />
+ <ClInclude Include="..\bson\bsonelement.h" />
+ <ClInclude Include="..\bson\bsoninlines.h" />
+ <ClInclude Include="..\bson\bsonmisc.h" />
+ <ClInclude Include="..\bson\bsonobj.h" />
+ <ClInclude Include="..\bson\bsonobjbuilder.h" />
+ <ClInclude Include="..\bson\bsonobjiterator.h" />
+ <ClInclude Include="..\bson\bsontypes.h" />
+ <ClInclude Include="jsobj.h" />
+ <ClInclude Include="..\bson\oid.h" />
+ <ClInclude Include="..\bson\ordering.h" />
+ <ClInclude Include="dur_journalimpl.h" />
+ <ClInclude Include="..\util\concurrency\race.h" />
+ <ClInclude Include="..\util\alignedbuilder.h" />
+ <ClInclude Include="queryutil.h" />
+ <ClInclude Include="..\bson\bson.h" />
+ <ClInclude Include="..\bson\bson_db.h" />
+ <ClInclude Include="..\bson\bson-inl.h" />
+ <ClInclude Include="..\bson\inline_decls.h" />
+ <ClInclude Include="..\bson\stringdata.h" />
+ <ClInclude Include="..\bson\util\atomic_int.h" />
+ <ClInclude Include="..\bson\util\builder.h" />
+ <ClInclude Include="..\bson\util\misc.h" />
+ <ClInclude Include="ops\delete.h" />
+ <ClInclude Include="ops\update.h" />
+ <ClInclude Include="..\util\net\httpclient.h" />
+ <ClInclude Include="..\util\net\message.h" />
+ <ClInclude Include="..\util\net\message_server.h" />
+ <ClInclude Include="..\util\net\sock.h" />
+ <ClInclude Include="..\third_party\snappy\config.h">
+ <Filter>snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\snappy\snappy.h">
+ <Filter>snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\pcre-7.4\config.h" />
+ <ClInclude Include="..\third_party\pcre-7.4\pcre.h" />
+ <ClInclude Include="globals.h" />
+ <ClInclude Include="..\util\net\hostandport.h" />
+ <ClInclude Include="..\util\net\listen.h" />
+ <ClInclude Include="..\util\net\message_port.h" />
+ <ClInclude Include="..\util\net\miniwebserver.h" />
+ <ClInclude Include="databaseholder.h" />
+ <ClInclude Include="pipeline\accumulator.h" />
+ <ClInclude Include="pipeline\builder.h" />
+ <ClInclude Include="pipeline\doc_mem_monitor.h" />
+ <ClInclude Include="pipeline\document.h" />
+ <ClInclude Include="pipeline\document_source.h" />
+ <ClInclude Include="pipeline\expression.h" />
+ <ClInclude Include="pipeline\expression_context.h" />
+ <ClInclude Include="pipeline\field_path.h" />
+ <ClInclude Include="pipeline\value.h" />
+ <ClInclude Include="..\util\intrusive_counter.h" />
+ <ClInclude Include="..\util\systeminfo.h" />
+ <ClInclude Include="namespacestring.h" />
+ <ClInclude Include="ops\count.h" />
+ <ClInclude Include="pagefault.h" />
+ <ClInclude Include="d_globals.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <ResourceCompile Include="db.rc" />
+ </ItemGroup>
+ <ItemGroup>
+ <None Include="..\jstests\dur\basic1.sh" />
+ <None Include="..\jstests\dur\dur1.js" />
+ <None Include="..\jstests\replsets\replset1.js" />
+ <None Include="..\jstests\replsets\replset2.js" />
+ <None Include="..\jstests\replsets\replset3.js" />
+ <None Include="..\jstests\replsets\replset4.js" />
+ <None Include="..\jstests\replsets\replset5.js" />
+ <None Include="..\jstests\replsets\replsetadd.js" />
+ <None Include="..\jstests\replsets\replsetarb1.js" />
+ <None Include="..\jstests\replsets\replsetarb2.js" />
+ <None Include="..\jstests\replsets\replsetprio1.js" />
+ <None Include="..\jstests\replsets\replsetrestart1.js" />
+ <None Include="..\jstests\replsets\replsetrestart2.js" />
+ <None Include="..\jstests\replsets\replset_remove_node.js" />
+ <None Include="..\jstests\replsets\rollback.js" />
+ <None Include="..\jstests\replsets\rollback2.js" />
+ <None Include="..\jstests\replsets\sync1.js" />
+ <None Include="..\jstests\replsets\twosets.js" />
+ <None Include="..\SConstruct" />
+ <None Include="..\util\mongoutils\README" />
+ <None Include="mongo.ico" />
+ <None Include="repl\notes.txt" />
+ </ItemGroup>
+ <ItemGroup>
+ <Library Include="..\..\js\js32d.lib" />
+ <Library Include="..\..\js\js32r.lib" />
+ <Library Include="..\..\js\js64d.lib" />
+ <Library Include="..\..\js\js64r.lib" />
+ </ItemGroup>
+ <ItemGroup>
+ <Filter Include="snappy">
+ <UniqueIdentifier>{bb99c086-7926-4f50-838d-f5f0c18397c0}</UniqueIdentifier>
+ </Filter>
+ </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/src/mongo/db/db_10.sln b/src/mongo/db/db_10.sln
new file mode 100755
index 00000000000..c1d83f3901a
--- /dev/null
+++ b/src/mongo/db/db_10.sln
@@ -0,0 +1,168 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "examples", "examples", "{4082881B-EB00-486F-906C-843B8EC06E18}"
+ ProjectSection(SolutionItems) = preProject
+ driverHelpers.cpp = driverHelpers.cpp
+ EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tools", "tools", "{2B262D59-9DC7-4BF1-A431-1BD4966899A5}"
+ ProjectSection(SolutionItems) = preProject
+ ..\shell\msvc\createCPPfromJavaScriptFiles.js = ..\shell\msvc\createCPPfromJavaScriptFiles.js
+ EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "unix files", "unix files", "{2F760952-C71B-4865-998F-AABAE96D1373}"
+ ProjectSection(SolutionItems) = preProject
+ ..\util\processinfo_darwin.cpp = ..\util\processinfo_darwin.cpp
+ ..\util\processinfo_linux2.cpp = ..\util\processinfo_linux2.cpp
+ ..\util\processinfo_none.cpp = ..\util\processinfo_none.cpp
+ EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "other", "other", "{12B11474-2D74-48C3-BB3D-F03249BEA88F}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongod", "db.vcxproj", "{215B2D68-0A70-4D10-8E75-B31010C62A91}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongos", "..\s\dbgrid.vcxproj", "{E03717ED-69B4-4D21-BC55-DF6690B585C6}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "test", "..\dbtests\test.vcxproj", "{215B2D68-0A70-4D10-8E75-B33010C62A91}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bsondemo", "..\bson\bsondemo\bsondemo.vcxproj", "{C9DB5EB7-81AA-4185-BAA1-DA035654402F}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongoutils test program", "..\util\mongoutils\mongoutils.vcxproj", "{7B84584E-92BC-4DB9-971B-A1A8F93E5053}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simple_client_demo", "..\client\examples\simple_client_demo.vcxproj", "{89C30BC3-2874-4F2C-B4DA-EB04E9782236}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongo", "..\shell\msvc\mongo.vcxproj", "{FE959BD8-8EE2-4555-AE59-9FA14FFD410E}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongoperf", "..\client\examples\mongoperf.vcxproj", "{79D4E297-BFB7-4FF2-9B13-08A146582E46}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Any CPU = Debug|Any CPU
+ Debug|Mixed Platforms = Debug|Mixed Platforms
+ Debug|Win32 = Debug|Win32
+ Debug|x64 = Debug|x64
+ Release|Any CPU = Release|Any CPU
+ Release|Mixed Platforms = Release|Mixed Platforms
+ Release|Win32 = Release|Win32
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Any CPU.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Mixed Platforms.Build.0 = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Win32.ActiveCfg = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Win32.Build.0 = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|x64.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|x64.Build.0 = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Any CPU.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Mixed Platforms.Build.0 = Release|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Win32.ActiveCfg = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Win32.Build.0 = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|x64.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|x64.Build.0 = Release|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Any CPU.ActiveCfg = Debug|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Mixed Platforms.Build.0 = Debug|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Win32.ActiveCfg = Debug|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Win32.Build.0 = Debug|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|x64.ActiveCfg = Debug|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|x64.Build.0 = Debug|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Any CPU.ActiveCfg = Release|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Mixed Platforms.Build.0 = Release|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Win32.ActiveCfg = Release|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Win32.Build.0 = Release|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|x64.ActiveCfg = Release|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|x64.Build.0 = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Any CPU.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Mixed Platforms.Build.0 = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Win32.ActiveCfg = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Win32.Build.0 = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|x64.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|x64.Build.0 = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Any CPU.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Mixed Platforms.Build.0 = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Win32.ActiveCfg = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Win32.Build.0 = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|x64.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|x64.Build.0 = Release|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Any CPU.ActiveCfg = Debug|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Mixed Platforms.Build.0 = Debug|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Win32.ActiveCfg = Debug|Win32
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Win32.Build.0 = Debug|Win32
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|x64.ActiveCfg = Debug|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|x64.Build.0 = Debug|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Any CPU.ActiveCfg = Release|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Mixed Platforms.Build.0 = Release|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Win32.ActiveCfg = Release|Win32
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Win32.Build.0 = Release|Win32
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|x64.ActiveCfg = Release|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|x64.Build.0 = Release|x64
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|Mixed Platforms.Build.0 = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|Win32.ActiveCfg = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|Win32.Build.0 = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|x64.ActiveCfg = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Any CPU.ActiveCfg = Release|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Mixed Platforms.ActiveCfg = Release|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Mixed Platforms.Build.0 = Release|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Win32.ActiveCfg = Release|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Win32.Build.0 = Release|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|x64.ActiveCfg = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Mixed Platforms.Build.0 = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Win32.ActiveCfg = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Win32.Build.0 = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|x64.ActiveCfg = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Any CPU.ActiveCfg = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Mixed Platforms.ActiveCfg = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Mixed Platforms.Build.0 = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Win32.ActiveCfg = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Win32.Build.0 = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|x64.ActiveCfg = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Mixed Platforms.Build.0 = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Win32.ActiveCfg = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Win32.Build.0 = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|x64.ActiveCfg = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Any CPU.ActiveCfg = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Mixed Platforms.ActiveCfg = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Mixed Platforms.Build.0 = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Win32.ActiveCfg = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Win32.Build.0 = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|x64.ActiveCfg = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|Mixed Platforms.Build.0 = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|Win32.ActiveCfg = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|Win32.Build.0 = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|x64.ActiveCfg = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|Any CPU.ActiveCfg = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|Mixed Platforms.ActiveCfg = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|Mixed Platforms.Build.0 = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|Win32.ActiveCfg = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|Win32.Build.0 = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|x64.ActiveCfg = Release|Win32
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(NestedProjects) = preSolution
+ {2B262D59-9DC7-4BF1-A431-1BD4966899A5} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {2F760952-C71B-4865-998F-AABAE96D1373} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {4082881B-EB00-486F-906C-843B8EC06E18} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ EndGlobalSection
+EndGlobal
diff --git a/src/mongo/db/dbcommands.cpp b/src/mongo/db/dbcommands.cpp
new file mode 100644
index 00000000000..570c897fae4
--- /dev/null
+++ b/src/mongo/db/dbcommands.cpp
@@ -0,0 +1,1955 @@
+// dbcommands.cpp
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* SHARDING:
+ I believe this file is for mongod only.
+   See s/commands_public.cpp for mongos.
+*/
+
+#include "pch.h"
+#include "ops/count.h"
+#include "ops/query.h"
+#include "pdfile.h"
+#include "jsobj.h"
+#include "../bson/util/builder.h"
+#include <time.h>
+#include "introspect.h"
+#include "btree.h"
+#include "../util/lruishmap.h"
+#include "../util/md5.hpp"
+#include "../util/processinfo.h"
+#include "../util/ramlog.h"
+#include "json.h"
+#include "repl.h"
+#include "repl_block.h"
+#include "replutil.h"
+#include "commands.h"
+#include "db.h"
+#include "instance.h"
+#include "lasterror.h"
+#include "security.h"
+#include "queryoptimizer.h"
+#include "../scripting/engine.h"
+#include "stats/counters.h"
+#include "background.h"
+#include "../util/version.h"
+#include "../s/d_writeback.h"
+#include "dur_stats.h"
+
+namespace mongo {
+
+ namespace dur {
+ void setAgeOutJournalFiles(bool rotate);
+ }
+ /** @return true if fields found */
+ bool setParmsMongodSpecific(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ BSONElement e = cmdObj["ageOutJournalFiles"];
+ if( !e.eoo() ) {
+ bool r = e.trueValue();
+ log() << "ageOutJournalFiles " << r << endl;
+ dur::setAgeOutJournalFiles(r);
+ return true;
+ }
+ return false;
+ }
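+
+    /* Usage sketch (illustrative, not part of this file): this helper reads the
+       field out of a setParameter-style command document, so presumably a toggle
+       would be issued against the admin db as something like
+           { setParameter : 1, ageOutJournalFiles : false }
+    */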
+
+ /* reset any errors so that getlasterror comes back clean.
+
+ useful before performing a long series of operations where we want to
+ see if any of the operations triggered an error, but don't want to check
+       after each op as that would cost a client/server round trip.
+ */
+ class CmdResetError : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "reset error state (used with getpreverror)";
+ }
+ CmdResetError() : Command("resetError", false, "reseterror") {}
+ bool run(const string& db, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ LastError *le = lastError.get();
+ assert( le );
+ le->reset();
+ return true;
+ }
+ } cmdResetError;
+
+ /* set by replica sets if specified in the configuration.
+       a pointer is used to avoid any possible locking issues with lockless reading (locktype() below
+       is NONE and we would like to keep it that way)
+ (for now, it simply orphans any old copy as config changes should be extremely rare).
+ note: once non-null, never goes to null again.
+ */
+ BSONObj *getLastErrorDefault = 0;
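+    /* Sketch of where this default plausibly comes from (an assumption, not
+       verified in this file): a replica set configuration may carry a
+       getLastErrorDefaults subdocument, e.g.
+           settings : { getLastErrorDefaults : { w : 2, wtimeout : 5000 } }
+       which the replica set code would parse into this object. */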
+
+ class CmdGetLastError : public Command {
+ public:
+ CmdGetLastError() : Command("getLastError", false, "getlasterror") { }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool logTheOp() { return false; }
+ virtual bool slaveOk() const { return true; }
+ virtual void help( stringstream& help ) const {
+ help << "return error status of the last operation on this connection\n"
+ << "options:\n"
+ << " { fsync:true } - fsync before returning, or wait for journal commit if running with --journal\n"
+ << " { j:true } - wait for journal commit if running with --journal\n"
+ << " { w:n } - await replication to n servers (including self) before returning\n"
+ << " { wtimeout:m} - timeout for w in m milliseconds";
+ }
+ bool run(const string& dbname, BSONObj& _cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ LastError *le = lastError.disableForCommand();
+
+ bool err = false;
+
+ if ( le->nPrev != 1 )
+ err = LastError::noError.appendSelf( result , false );
+ else
+ err = le->appendSelf( result , false );
+
+ Client& c = cc();
+ c.appendLastOp( result );
+
+ result.appendNumber( "connectionId" , c.getConnectionId() ); // for sharding; also useful in general for debugging
+
+ BSONObj cmdObj = _cmdObj;
+ {
+ BSONObj::iterator i(_cmdObj);
+ i.next();
+ if( !i.more() ) {
+ /* empty, use default */
+ BSONObj *def = getLastErrorDefault;
+ if( def )
+ cmdObj = *def;
+ }
+ }
+
+ if ( cmdObj["j"].trueValue() ) {
+ if( !getDur().awaitCommit() ) {
+ // --journal is off
+ result.append("jnote", "journaling not enabled on this server");
+ }
+ if( cmdObj["fsync"].trueValue() ) {
+ errmsg = "fsync and j options are not used together";
+ return false;
+ }
+ }
+ else if ( cmdObj["fsync"].trueValue() ) {
+ Timer t;
+ if( !getDur().awaitCommit() ) {
+ // if get here, not running with --journal
+ log() << "fsync from getlasterror" << endl;
+ result.append( "fsyncFiles" , MemoryMappedFile::flushAll( true ) );
+ }
+ else {
+                // this is perhaps temporary: how long we waited for the group commit to occur.
+ result.append( "waited", t.millis() );
+ }
+ }
+
+ if ( err ) {
+ // doesn't make sense to wait for replication
+ // if there was an error
+ return true;
+ }
+
+ BSONElement e = cmdObj["w"];
+ if ( e.ok() ) {
+ int timeout = cmdObj["wtimeout"].numberInt();
+ Timer t;
+
+ long long passes = 0;
+ char buf[32];
+ while ( 1 ) {
+ OpTime op(c.getLastOp());
+
+ if ( op.isNull() ) {
+ if ( anyReplEnabled() ) {
+ result.append( "wnote" , "no write has been done on this connection" );
+ }
+ else if ( e.isNumber() && e.numberInt() <= 1 ) {
+ // don't do anything
+ // w=1 and no repl, so this is fine
+ }
+ else {
+ // w=2 and no repl
+ result.append( "wnote" , "no replication has been enabled, so w=2+ won't work" );
+ result.append( "err", "norepl" );
+ return true;
+ }
+ break;
+ }
+
+ // check this first for w=0 or w=1
+ if ( opReplicatedEnough( op, e ) ) {
+ break;
+ }
+
+ // if replication isn't enabled (e.g., config servers)
+ if ( ! anyReplEnabled() ) {
+ result.append( "err", "norepl" );
+ return true;
+ }
+
+
+ if ( timeout > 0 && t.millis() >= timeout ) {
+ result.append( "wtimeout" , true );
+ errmsg = "timed out waiting for slaves";
+ result.append( "waited" , t.millis() );
+ result.append( "err" , "timeout" );
+ return true;
+ }
+
+ assert( sprintf( buf , "w block pass: %lld" , ++passes ) < 30 );
+ c.curop()->setMessage( buf );
+ sleepmillis(1);
+ killCurrentOp.checkForInterrupt();
+ }
+ result.appendNumber( "wtime" , t.millis() );
+ }
+
+ result.appendNull( "err" );
+ return true;
+ }
+ } cmdGetLastError;
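+
+    /* Client-side usage sketch (hypothetical driver code, not part of this file;
+       it assumes the DBClientConnection interface from client/dbclient.h):
+
+           DBClientConnection c;
+           c.connect( "localhost" );
+           c.insert( "test.foo" , BSON( "x" << 1 ) );
+           BSONObj err;
+           c.runCommand( "test" , BSON( "getLastError" << 1 << "w" << 2 << "wtimeout" << 5000 ) , err );
+           // on success err["err"] is null; on timeout err["wtimeout"] is true
+    */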
+
+ class CmdGetPrevError : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "check for errors since last reseterror commandcal";
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ CmdGetPrevError() : Command("getPrevError", false, "getpreverror") {}
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ LastError *le = lastError.disableForCommand();
+ le->appendSelf( result );
+ if ( le->valid )
+ result.append( "nPrev", le->nPrev );
+ else
+ result.append( "nPrev", -1 );
+ return true;
+ }
+ } cmdGetPrevError;
+
+ CmdShutdown cmdShutdown;
+
+ void CmdShutdown::help( stringstream& help ) const {
+ help << "shutdown the database. must be ran against admin db and "
+ << "either (1) ran from localhost or (2) authenticated. If "
+ << "this is a primary in a replica set and there is no member "
+ << "within 10 seconds of its optime, it will not shutdown "
+ << "without force : true. You can also specify timeoutSecs : "
+ << "N to wait N seconds for other members to catch up.";
+ }
+
+ bool CmdShutdown::run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool force = cmdObj.hasField("force") && cmdObj["force"].trueValue();
+
+ if (!force && theReplSet && theReplSet->isPrimary()) {
+ long long timeout, now, start;
+ timeout = now = start = curTimeMicros64()/1000000;
+ if (cmdObj.hasField("timeoutSecs")) {
+ timeout += cmdObj["timeoutSecs"].numberLong();
+ }
+
+ OpTime lastOp = theReplSet->lastOpTimeWritten;
+ OpTime closest = theReplSet->lastOtherOpTime();
+ long long int diff = lastOp.getSecs() - closest.getSecs();
+ while (now <= timeout && (diff < 0 || diff > 10)) {
+ sleepsecs(1);
+ now++;
+
+ lastOp = theReplSet->lastOpTimeWritten;
+ closest = theReplSet->lastOtherOpTime();
+ diff = lastOp.getSecs() - closest.getSecs();
+ }
+
+ if (diff < 0 || diff > 10) {
+ errmsg = "no secondaries within 10 seconds of my optime";
+ result.append("closest", closest.getSecs());
+ result.append("difference", diff);
+ return false;
+ }
+
+ // step down
+ theReplSet->stepDown(120);
+
+ log() << "waiting for secondaries to catch up" << endl;
+
+ lastOp = theReplSet->lastOpTimeWritten;
+ while (lastOp != closest && now - start < 60) {
+ closest = theReplSet->lastOtherOpTime();
+
+ now++;
+ sleepsecs(1);
+ }
+
+ // regardless of whether they caught up, we'll shut down
+ }
+
+ return shutdownHelper();
+ }
+
+ class CmdDropDatabase : public Command {
+ public:
+ virtual bool logTheOp() {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "drop (delete) this database";
+ }
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ CmdDropDatabase() : Command("dropDatabase") {}
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ BSONElement e = cmdObj.firstElement();
+ log() << "dropDatabase " << dbname << endl;
+ int p = (int) e.number();
+ if ( p != 1 )
+ return false;
+ dropDatabase(dbname);
+ result.append( "dropped" , dbname );
+ return true;
+ }
+ } cmdDropDatabase;
+
+ class CmdRepairDatabase : public Command {
+ public:
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual bool maintenanceMode() const { return true; }
+ virtual void help( stringstream& help ) const {
+ help << "repair database. also compacts. note: slow.";
+ }
+ virtual LockType locktype() const { return WRITE; }
+ CmdRepairDatabase() : Command("repairDatabase") {}
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ BSONElement e = cmdObj.firstElement();
+ log() << "repairDatabase " << dbname << endl;
+ int p = (int) e.number();
+ if ( p != 1 ) {
+ errmsg = "bad option";
+ return false;
+ }
+ e = cmdObj.getField( "preserveClonedFilesOnFailure" );
+ bool preserveClonedFilesOnFailure = e.isBoolean() && e.boolean();
+ e = cmdObj.getField( "backupOriginalFiles" );
+ bool backupOriginalFiles = e.isBoolean() && e.boolean();
+ return repairDatabase( dbname, errmsg, preserveClonedFilesOnFailure, backupOriginalFiles );
+ }
+ } cmdRepairDatabase;
+
+ /* set db profiling level
+ todo: how do we handle profiling information put in the db with replication?
+ sensibly or not?
+ */
+ class CmdProfile : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "enable or disable performance profiling\n";
+ help << "{ profile : <n> }\n";
+ help << "0=off 1=log slow ops 2=log all\n";
+ help << "-1 to get current values\n";
+ help << "http://www.mongodb.org/display/DOCS/Database+Profiler";
+ }
+ virtual LockType locktype() const { return WRITE; }
+ CmdProfile() : Command("profile") {}
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ BSONElement e = cmdObj.firstElement();
+ result.append("was", cc().database()->profile);
+ result.append("slowms", cmdLine.slowMS );
+
+ int p = (int) e.number();
+ bool ok = false;
+
+ if ( p == -1 )
+ ok = true;
+ else if ( p >= 0 && p <= 2 ) {
+ ok = cc().database()->setProfilingLevel( p , errmsg );
+ }
+
+ BSONElement slow = cmdObj["slowms"];
+ if ( slow.isNumber() )
+ cmdLine.slowMS = slow.numberInt();
+
+ return ok;
+ }
+ } cmdProfile;
+
+ class CmdServerStatus : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return true;
+ }
+ CmdServerStatus() : Command("serverStatus", true) {}
+
+ virtual LockType locktype() const { return NONE; }
+
+ virtual void help( stringstream& help ) const {
+ help << "returns lots of administrative server statistics";
+ }
+
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ long long start = Listener::getElapsedTimeMillis();
+ BSONObjBuilder timeBuilder(128);
+
+
+ bool authed = cc().getAuthenticationInfo()->isAuthorizedReads("admin");
+
+ result.append( "host" , prettyHostName() );
+ result.append("version", versionString);
+ result.append("process","mongod");
+ result.append("uptime",(double) (time(0)-cmdLine.started));
+ result.append("uptimeEstimate",(double) (start/1000));
+ result.appendDate( "localTime" , jsTime() );
+
+ {
+ BSONObjBuilder t;
+
+ unsigned long long last, start, timeLocked;
+ d.dbMutex.info().getTimingInfo(start, timeLocked);
+ last = curTimeMicros64();
+ double tt = (double) last-start;
+ double tl = (double) timeLocked;
+ t.append("totalTime", tt);
+ t.append("lockTime", tl);
+ t.append("ratio", (tt ? tl/tt : 0));
+
+ {
+ BSONObjBuilder ttt( t.subobjStart( "currentQueue" ) );
+ int w=0, r=0;
+ Client::recommendedYieldMicros( &w , &r );
+ ttt.append( "total" , w + r );
+ ttt.append( "readers" , r );
+ ttt.append( "writers" , w );
+ ttt.done();
+ }
+
+ {
+ BSONObjBuilder ttt( t.subobjStart( "activeClients" ) );
+ int w=0, r=0;
+ Client::getActiveClientCount( w , r );
+ ttt.append( "total" , w + r );
+ ttt.append( "readers" , r );
+ ttt.append( "writers" , w );
+ ttt.done();
+ }
+
+
+
+ result.append( "globalLock" , t.obj() );
+ }
+ timeBuilder.appendNumber( "after basic" , Listener::getElapsedTimeMillis() - start );
+
+ {
+
+ BSONObjBuilder t( result.subobjStart( "mem" ) );
+
+ t.append("bits", ( sizeof(int*) == 4 ? 32 : 64 ) );
+
+ ProcessInfo p;
+ int v = 0;
+ if ( p.supported() ) {
+ t.appendNumber( "resident" , p.getResidentSize() );
+ v = p.getVirtualMemorySize();
+ t.appendNumber( "virtual" , v );
+ t.appendBool( "supported" , true );
+ }
+ else {
+ result.append( "note" , "not all mem info support on this platform" );
+ t.appendBool( "supported" , false );
+ }
+
+ timeBuilder.appendNumber( "middle of mem" , Listener::getElapsedTimeMillis() - start );
+
+ int m = (int) (MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ));
+ t.appendNumber( "mapped" , m );
+
+ if ( cmdLine.dur ) {
+ m *= 2;
+ t.appendNumber( "mappedWithJournal" , m );
+ }
+
+ int overhead = v - m - connTicketHolder.used();
+
+ if( overhead > 4000 ) {
+ t.append("note", "virtual minus mapped is large. could indicate a memory leak");
+ log() << "warning: virtual size (" << v << "MB) - mapped size (" << m << "MB) is large (" << overhead << "MB). could indicate a memory leak" << endl;
+ }
+
+ t.done();
+
+ }
+ timeBuilder.appendNumber( "after mem" , Listener::getElapsedTimeMillis() - start );
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "connections" ) );
+ bb.append( "current" , connTicketHolder.used() );
+ bb.append( "available" , connTicketHolder.available() );
+ bb.done();
+ }
+ timeBuilder.appendNumber( "after connections" , Listener::getElapsedTimeMillis() - start );
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "extra_info" ) );
+ bb.append("note", "fields vary by platform");
+ ProcessInfo p;
+ p.getExtraInfo(bb);
+ bb.done();
+ timeBuilder.appendNumber( "after extra info" , Listener::getElapsedTimeMillis() - start );
+
+ }
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "indexCounters" ) );
+ globalIndexCounters.append( bb );
+ bb.done();
+ }
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "backgroundFlushing" ) );
+ globalFlushCounters.append( bb );
+ bb.done();
+ }
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "cursors" ) );
+ ClientCursor::appendStats( bb );
+ bb.done();
+ }
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "network" ) );
+ networkCounter.append( bb );
+ bb.done();
+ }
+
+
+ timeBuilder.appendNumber( "after counters" , Listener::getElapsedTimeMillis() - start );
+
+ if ( anyReplEnabled() ) {
+ BSONObjBuilder bb( result.subobjStart( "repl" ) );
+ appendReplicationInfo( bb , authed , cmdObj["repl"].numberInt() );
+ bb.done();
+
+ if ( ! _isMaster() ) {
+ result.append( "opcountersRepl" , replOpCounters.getObj() );
+ }
+
+ }
+
+ timeBuilder.appendNumber( "after repl" , Listener::getElapsedTimeMillis() - start );
+
+ result.append( "opcounters" , globalOpCounters.getObj() );
+
+ {
+ BSONObjBuilder asserts( result.subobjStart( "asserts" ) );
+ asserts.append( "regular" , assertionCount.regular );
+ asserts.append( "warning" , assertionCount.warning );
+ asserts.append( "msg" , assertionCount.msg );
+ asserts.append( "user" , assertionCount.user );
+ asserts.append( "rollovers" , assertionCount.rollovers );
+ asserts.done();
+ }
+
+ timeBuilder.appendNumber( "after asserts" , Listener::getElapsedTimeMillis() - start );
+
+ result.append( "writeBacksQueued" , ! writeBackManager.queuesEmpty() );
+
+ if( cmdLine.dur ) {
+ result.append("dur", dur::stats.asObj());
+ }
+
+ timeBuilder.appendNumber( "after dur" , Listener::getElapsedTimeMillis() - start );
+
+ {
+ RamLog* rl = RamLog::get( "warnings" );
+ verify(15880, rl);
+
+ if (rl->lastWrite() >= time(0)-(10*60)){ // only show warnings from last 10 minutes
+ vector<const char*> lines;
+ rl->get( lines );
+
+ BSONArrayBuilder arr( result.subarrayStart( "warnings" ) );
+ for ( unsigned i=std::max(0,(int)lines.size()-10); i<lines.size(); i++ )
+ arr.append( lines[i] );
+ arr.done();
+ }
+ }
+
+ if ( ! authed )
+ result.append( "note" , "run against admin for more info" );
+
+ timeBuilder.appendNumber( "at end" , Listener::getElapsedTimeMillis() - start );
+ if ( Listener::getElapsedTimeMillis() - start > 1000 ) {
+ BSONObj t = timeBuilder.obj();
+ log() << "serverStatus was very slow: " << t << endl;
+ result.append( "timing" , t );
+ }
+
+ return true;
+ }
+ } cmdServerStatus;
+
+ class CmdGetOpTime : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual void help( stringstream& help ) const { help << "internal"; }
+ virtual LockType locktype() const { return NONE; }
+ CmdGetOpTime() : Command("getoptime") { }
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ writelock l( "" );
+ result.appendDate("optime", OpTime::now().asDate());
+ return true;
+ }
+ } cmdgetoptime;
+
+ /*
+ class Cmd : public Command {
+ public:
+ Cmd() : Command("") { }
+ bool adminOnly() const { return true; }
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result) {
+ return true;
+ }
+ } cmd;
+ */
+
+ class CmdDiagLogging : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return true;
+ }
+ CmdDiagLogging() : Command("diagLogging") { }
+ bool adminOnly() const {
+ return true;
+ }
+ void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/Monitoring+and+Diagnostics#MonitoringandDiagnostics-DatabaseRecord%2FReplay"; }
+ virtual LockType locktype() const { return WRITE; }
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ int was = _diaglog.setLevel( cmdObj.firstElement().numberInt() );
+ _diaglog.flush();
+ if ( !cmdLine.quiet )
+ tlog() << "CMD: diagLogging set to " << _diaglog.getLevel() << " from: " << was << endl;
+ result.append( "was" , was );
+ return true;
+ }
+ } cmddiaglogging;
+
+    /* remove a bit from a bit array - actually removes its slot, not just clearing it.
+       note: this function does not work with x == 63 -- that is ok for now,
+       but keep in mind that if the maximum number of indexes were ever extended
+       to exactly 64 it would be a problem
+    */
+ unsigned long long removeBit(unsigned long long b, int x) {
+ unsigned long long tmp = b;
+ return
+ (tmp & ((((unsigned long long) 1) << x)-1)) |
+ ((tmp >> (x+1)) << x);
+ }
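+
+    /* worked example: removeBit(21, 2). 21 is 10101 in binary; the bits below
+       slot 2 (01) are kept in place, the bits above slot 2 (10) are shifted down
+       one slot, giving 1001 binary == 9 - matching the unit test below. */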
+
+ struct DBCommandsUnitTest {
+ DBCommandsUnitTest() {
+ assert( removeBit(1, 0) == 0 );
+ assert( removeBit(2, 0) == 1 );
+ assert( removeBit(2, 1) == 0 );
+ assert( removeBit(255, 1) == 127 );
+ assert( removeBit(21, 2) == 9 );
+ assert( removeBit(0x4000000000000001ULL, 62) == 1 );
+ }
+ } dbc_unittest;
+
+ void assureSysIndexesEmptied(const char *ns, IndexDetails *exceptForIdIndex);
+ int removeFromSysIndexes(const char *ns, const char *idxName);
+
+ bool dropIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool mayDeleteIdIndex ) {
+
+ BackgroundOperation::assertNoBgOpInProgForNs(ns);
+
+ d = d->writingWithExtra();
+ d->aboutToDeleteAnIndex();
+
+ /* there may be pointers pointing at keys in the btree(s). kill them. */
+ ClientCursor::invalidate(ns);
+
+ // delete a specific index or all?
+ if ( *name == '*' && name[1] == 0 ) {
+ log(4) << " d->nIndexes was " << d->nIndexes << '\n';
+ anObjBuilder.append("nIndexesWas", (double)d->nIndexes);
+ IndexDetails *idIndex = 0;
+ if( d->nIndexes ) {
+ for ( int i = 0; i < d->nIndexes; i++ ) {
+ if ( !mayDeleteIdIndex && d->idx(i).isIdIndex() ) {
+ idIndex = &d->idx(i);
+ }
+ else {
+ d->idx(i).kill_idx();
+ }
+ }
+ d->nIndexes = 0;
+ }
+ if ( idIndex ) {
+ d->addIndex(ns) = *idIndex;
+ wassert( d->nIndexes == 1 );
+ }
+ /* assuming here that id index is not multikey: */
+ d->multiKeyIndexBits = 0;
+ assureSysIndexesEmptied(ns, idIndex);
+ anObjBuilder.append("msg", mayDeleteIdIndex ?
+ "indexes dropped for collection" :
+ "non-_id indexes dropped for collection");
+ }
+ else {
+ // delete just one index
+ int x = d->findIndexByName(name);
+ if ( x >= 0 ) {
+ log(4) << " d->nIndexes was " << d->nIndexes << endl;
+ anObjBuilder.append("nIndexesWas", (double)d->nIndexes);
+
+ /* note it is important we remove the IndexDetails with this
+ call, otherwise, on recreate, the old one would be reused, and its
+ IndexDetails::info ptr would be bad info.
+ */
+ IndexDetails *id = &d->idx(x);
+ if ( !mayDeleteIdIndex && id->isIdIndex() ) {
+ errmsg = "may not delete _id index";
+ return false;
+ }
+ id->kill_idx();
+ d->multiKeyIndexBits = removeBit(d->multiKeyIndexBits, x);
+ d->nIndexes--;
+ for ( int i = x; i < d->nIndexes; i++ )
+ d->idx(i) = d->idx(i+1);
+ }
+ else {
+                int n = removeFromSysIndexes(ns, name); // just in case there is an orphaned listing there - i.e. it should have been repaired but wasn't
+ if( n ) {
+ log() << "info: removeFromSysIndexes cleaned up " << n << " entries" << endl;
+ }
+ log() << "dropIndexes: " << name << " not found" << endl;
+ errmsg = "index not found";
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /* drop collection */
+ class CmdDrop : public Command {
+ public:
+ CmdDrop() : Command("drop") { }
+ virtual bool logTheOp() {
+ return true;
+ }
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual bool adminOnly() const {
+ return false;
+ }
+ virtual void help( stringstream& help ) const { help << "drop a collection\n{drop : <collectionName>}"; }
+ virtual LockType locktype() const { return WRITE; }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string nsToDrop = dbname + '.' + cmdObj.firstElement().valuestr();
+ NamespaceDetails *d = nsdetails(nsToDrop.c_str());
+ if ( !cmdLine.quiet )
+ tlog() << "CMD: drop " << nsToDrop << endl;
+ if ( d == 0 ) {
+ errmsg = "ns not found";
+ return false;
+ }
+ uassert( 10039 , "can't drop collection with reserved $ character in name", strchr(nsToDrop.c_str(), '$') == 0 );
+ dropCollection( nsToDrop, errmsg, result );
+ return true;
+ }
+ } cmdDrop;
+
+ /* select count(*) */
+ class CmdCount : public Command {
+ public:
+ virtual LockType locktype() const { return READ; }
+ CmdCount() : Command("count") { }
+ virtual bool logTheOp() { return false; }
+ virtual bool slaveOk() const {
+ // ok on --slave setups
+ return replSettings.slave == SimpleSlave;
+ }
+ virtual bool slaveOverrideOk() { return true; }
+ virtual bool maintenanceOk() const { return false; }
+ virtual bool adminOnly() const { return false; }
+ virtual void help( stringstream& help ) const { help << "count objects in collection"; }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string ns = parseNs(dbname, cmdObj);
+ string err;
+ long long n = runCount(ns.c_str(), cmdObj, err);
+ long long nn = n;
+ bool ok = true;
+ if ( n == -1 ) {
+ nn = 0;
+ result.appendBool( "missing" , true );
+ }
+ else if ( n < 0 ) {
+ nn = 0;
+ ok = false;
+ if ( !err.empty() )
+ errmsg = err;
+ }
+ result.append("n", (double) nn);
+ return ok;
+ }
+ } cmdCount;
+
+ /* create collection */
+ class CmdCreate : public Command {
+ public:
+ CmdCreate() : Command("create") { }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual bool adminOnly() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream& help ) const {
+ help << "create a collection explicitly\n"
+ "{ create: <ns>[, capped: <bool>, size: <collSizeInBytes>, max: <nDocs>] }";
+ }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ uassert(15888, "must pass name of collection to create", cmdObj.firstElement().valuestrsafe()[0] != '\0');
+ string ns = dbname + '.' + cmdObj.firstElement().valuestr();
+ string err;
+ uassert(14832, "specify size:<n> when capped is true", !cmdObj["capped"].trueValue() || cmdObj["size"].isNumber() || cmdObj.hasField("$nExtents"));
+ bool ok = userCreateNS(ns.c_str(), cmdObj, err, ! fromRepl );
+ if ( !ok && !err.empty() )
+ errmsg = err;
+ return ok;
+ }
+ } cmdCreate;
+
+ /* "dropIndexes" is now the preferred form - "deleteIndexes" deprecated */
+ class CmdDropIndexes : public Command {
+ public:
+ virtual bool logTheOp() {
+ return true;
+ }
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream& help ) const {
+ help << "drop indexes for a collection";
+ }
+ CmdDropIndexes() : Command("dropIndexes", false, "deleteIndexes") { }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& anObjBuilder, bool /*fromRepl*/) {
+ BSONElement e = jsobj.firstElement();
+ string toDeleteNs = dbname + '.' + e.valuestr();
+ NamespaceDetails *d = nsdetails(toDeleteNs.c_str());
+ if ( !cmdLine.quiet )
+ tlog() << "CMD: dropIndexes " << toDeleteNs << endl;
+ if ( d ) {
+ BSONElement f = jsobj.getField("index");
+ if ( f.type() == String ) {
+ return dropIndexes( d, toDeleteNs.c_str(), f.valuestr(), errmsg, anObjBuilder, false );
+ }
+ else if ( f.type() == Object ) {
+ int idxId = d->findIndexByKeyPattern( f.embeddedObject() );
+ if ( idxId < 0 ) {
+ errmsg = "can't find index with key:";
+ errmsg += f.embeddedObject().toString();
+ return false;
+ }
+ else {
+ IndexDetails& ii = d->idx( idxId );
+ string iName = ii.indexName();
+ return dropIndexes( d, toDeleteNs.c_str(), iName.c_str() , errmsg, anObjBuilder, false );
+ }
+ }
+ else {
+ errmsg = "invalid index name spec";
+ return false;
+ }
+ }
+ else {
+ errmsg = "ns not found";
+ return false;
+ }
+ }
+ } cmdDropIndexes;
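+
+    /* usage sketch, derived from the branches in run() above - the index field
+       may be an index name, a key pattern, or "*":
+           { dropIndexes : "foo", index : "x_1" }      // drop by name
+           { dropIndexes : "foo", index : { x : 1 } }  // drop by key pattern
+           { dropIndexes : "foo", index : "*" }        // drop all non-_id indexes
+    */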
+
+ class CmdReIndex : public Command {
+ public:
+ virtual bool logTheOp() { return false; } // only reindexes on the one node
+ virtual bool slaveOk() const { return true; } // can reindex on a secondary
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream& help ) const {
+ help << "re-index a collection";
+ }
+ CmdReIndex() : Command("reIndex") { }
+ bool run(const string& dbname , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+ static DBDirectClient db;
+
+ BSONElement e = jsobj.firstElement();
+ string toDeleteNs = dbname + '.' + e.valuestr();
+ NamespaceDetails *d = nsdetails(toDeleteNs.c_str());
+ tlog() << "CMD: reIndex " << toDeleteNs << endl;
+ BackgroundOperation::assertNoBgOpInProgForNs(toDeleteNs.c_str());
+
+ if ( ! d ) {
+ errmsg = "ns not found";
+ return false;
+ }
+
+ list<BSONObj> all;
+ auto_ptr<DBClientCursor> i = db.query( dbname + ".system.indexes" , BSON( "ns" << toDeleteNs ) , 0 , 0 , 0 , QueryOption_SlaveOk );
+ BSONObjBuilder b;
+ while ( i->more() ) {
+ BSONObj o = i->next().removeField("v").getOwned();
+ b.append( BSONObjBuilder::numStr( all.size() ) , o );
+ all.push_back( o );
+ }
+
+
+ bool ok = dropIndexes( d, toDeleteNs.c_str(), "*" , errmsg, result, true );
+ if ( ! ok ) {
+ errmsg = "dropIndexes failed";
+ return false;
+ }
+
+ for ( list<BSONObj>::iterator i=all.begin(); i!=all.end(); i++ ) {
+ BSONObj o = *i;
+ log(1) << "reIndex ns: " << toDeleteNs << " index: " << o << endl;
+ theDataFileMgr.insertWithObjMod( Namespace( toDeleteNs.c_str() ).getSisterNS( "system.indexes" ).c_str() , o , true );
+ }
+
+ result.append( "nIndexes" , (int)all.size() );
+ result.appendArray( "indexes" , b.obj() );
+ return true;
+ }
+ } cmdReIndex;
+
+ class CmdListDatabases : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual bool slaveOverrideOk() {
+ return true;
+ }
+ virtual bool adminOnly() const {
+ return true;
+ }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream& help ) const { help << "list databases on this server"; }
+ CmdListDatabases() : Command("listDatabases" , true ) {}
+ bool run(const string& dbname , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+ vector< string > dbNames;
+ getDatabaseNames( dbNames );
+ vector< BSONObj > dbInfos;
+
+ set<string> seen;
+ boost::intmax_t totalSize = 0;
+ for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) {
+ BSONObjBuilder b;
+ b.append( "name", *i );
+
+ boost::intmax_t size = dbSize( i->c_str() );
+ b.append( "sizeOnDisk", (double) size );
+ totalSize += size;
+
+ {
+ Client::ReadContext rc( *i + ".system.namespaces" );
+ b.appendBool( "empty", rc.ctx().db()->isEmpty() );
+ }
+
+ dbInfos.push_back( b.obj() );
+
+ seen.insert( i->c_str() );
+ }
+
+ // TODO: erh 1/1/2010 I think this is broken where path != dbpath ??
+ set<string> allShortNames;
+ {
+ readlock lk;
+ dbHolder().getAllShortNames( false, allShortNames );
+ }
+
+ for ( set<string>::iterator i = allShortNames.begin(); i != allShortNames.end(); i++ ) {
+ string name = *i;
+
+ if ( seen.count( name ) )
+ continue;
+
+ BSONObjBuilder b;
+ b.append( "name" , name );
+ b.append( "sizeOnDisk" , (double)1.0 );
+
+ {
+ readlock lk( name );
+ Client::Context ctx( name );
+ b.appendBool( "empty", ctx.db()->isEmpty() );
+ }
+
+ dbInfos.push_back( b.obj() );
+ }
+
+ result.append( "databases", dbInfos );
+ result.append( "totalSize", double( totalSize ) );
+ return true;
+ }
+ } cmdListDatabases;
+
+    /* note: an access to a database right after this will open it back up - so this is mainly
+ for diagnostic purposes.
+ */
+ class CmdCloseAllDatabases : public Command {
+ public:
+ virtual void help( stringstream& help ) const { help << "Close all database files.\nA new request will cause an immediate reopening; thus, this is mostly for testing purposes."; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+
+ CmdCloseAllDatabases() : Command( "closeAllDatabases" ) {}
+ bool run(const string& dbname , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+ bool ok;
+ try {
+ ok = dbHolderW().closeAll( dbpath , result, false );
+ }
+ catch(DBException&) {
+ throw;
+ }
+ catch(...) {
+ log() << "ERROR uncaught exception in command closeAllDatabases" << endl;
+ errmsg = "unexpected uncaught exception";
+ return false;
+ }
+ return ok;
+ }
+ } cmdCloseAllDatabases;
+
+ class CmdFileMD5 : public Command {
+ public:
+ CmdFileMD5() : Command( "filemd5" ) {}
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << " example: { filemd5 : ObjectId(aaaaaaa) , root : \"fs\" }";
+ }
+ virtual LockType locktype() const { return READ; }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ string ns = dbname;
+ ns += ".";
+ {
+ string root = jsobj.getStringField( "root" );
+ if ( root.size() == 0 )
+ root = "fs";
+ ns += root;
+ }
+ ns += ".chunks"; // make this an option in jsobj
+
+ md5digest d;
+ md5_state_t st;
+ md5_init(&st);
+
+ BSONObj query = BSON( "files_id" << jsobj["filemd5"] );
+ BSONObj sort = BSON( "files_id" << 1 << "n" << 1 );
+
+ shared_ptr<Cursor> cursor = bestGuessCursor(ns.c_str(), query, sort);
+ if ( ! cursor ) {
+ errmsg = "need an index on { files_id : 1 , n : 1 }";
+ return false;
+ }
+ auto_ptr<ClientCursor> cc (new ClientCursor(QueryOption_NoCursorTimeout, cursor, ns.c_str()));
+
+ int n = 0;
+ while ( cursor->ok() ) {
+ if ( ! cursor->matcher()->matchesCurrent( cursor.get() ) ) {
+ log() << "**** NOT MATCHING ****" << endl;
+ PRINT(cursor->current());
+ cursor->advance();
+ continue;
+ }
+
+ BSONObj obj = cursor->current();
+ cursor->advance();
+
+ BSONElement ne = obj["n"];
+ assert(ne.isNumber());
+ int myn = ne.numberInt();
+ if ( n != myn ) {
+ log() << "should have chunk: " << n << " have:" << myn << endl;
+ dumpChunks( ns , query , sort );
+ uassert( 10040 , "chunks out of order" , n == myn );
+ }
+
+ int len;
+ const char * data = obj["data"].binDataClean( len );
+
+ ClientCursor::YieldLock yield (cc.get());
+ try {
+ md5_append( &st , (const md5_byte_t*)(data) , len );
+ n++;
+ }
+ catch (...) {
+ if ( ! yield.stillOk() ) // relocks
+ cc.release();
+ throw;
+ }
+
+ if ( ! yield.stillOk() ) {
+ cc.release();
+ uasserted(13281, "File deleted during filemd5 command");
+ }
+ }
+
+ md5_finish(&st, d);
+
+ result.append( "numChunks" , n );
+ result.append( "md5" , digestToString( d ) );
+ return true;
+ }
+
+ void dumpChunks( const string& ns , const BSONObj& query , const BSONObj& sort ) {
+ DBDirectClient client;
+ Query q(query);
+ q.sort(sort);
+ auto_ptr<DBClientCursor> c = client.query(ns, q);
+ while(c->more())
+ PRINT(c->nextSafe());
+ }
+ } cmdFileMD5;
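+
+    /* usage sketch (hypothetical client code; fileId stands in for a GridFS
+       file's _id):
+
+           BSONObj res;
+           conn.runCommand( "test" , BSON( "filemd5" << fileId << "root" << "fs" ) , res );
+           // res["md5"] is the hex digest, res["numChunks"] the number of chunks
+
+       the command walks test.fs.chunks in { files_id : 1, n : 1 } order, so that
+       index must exist (see the errmsg above). */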
+
+ static IndexDetails *cmdIndexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern ) {
+ if ( ns[ 0 ] == '\0' || min.isEmpty() || max.isEmpty() ) {
+ errmsg = "invalid command syntax (note: min and max are required)";
+ return 0;
+ }
+ return indexDetailsForRange( ns, errmsg, min, max, keyPattern );
+ }
+
+ class CmdDatasize : public Command {
+ virtual string parseNs(const string& dbname, const BSONObj& cmdObj) const {
+ return parseNsFullyQualified(dbname, cmdObj);
+ }
+ public:
+ CmdDatasize() : Command( "dataSize", false, "datasize" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return READ; }
+ virtual void help( stringstream &help ) const {
+ help <<
+ "determine data size for a set of data in a certain range"
+ "\nexample: { dataSize:\"blog.posts\", keyPattern:{x:1}, min:{x:10}, max:{x:55} }"
+ "\nkeyPattern, min, and max parameters are optional."
+ "\nnote: This command may take a while to run";
+ }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ Timer timer;
+
+ string ns = jsobj.firstElement().String();
+ BSONObj min = jsobj.getObjectField( "min" );
+ BSONObj max = jsobj.getObjectField( "max" );
+ BSONObj keyPattern = jsobj.getObjectField( "keyPattern" );
+ bool estimate = jsobj["estimate"].trueValue();
+
+ Client::Context ctx( ns );
+ NamespaceDetails *d = nsdetails(ns.c_str());
+
+ if ( ! d || d->stats.nrecords == 0 ) {
+ result.appendNumber( "size" , 0 );
+ result.appendNumber( "numObjects" , 0 );
+ result.append( "millis" , timer.millis() );
+ return true;
+ }
+
+ result.appendBool( "estimate" , estimate );
+
+ shared_ptr<Cursor> c;
+ if ( min.isEmpty() && max.isEmpty() ) {
+ if ( estimate ) {
+ result.appendNumber( "size" , d->stats.datasize );
+ result.appendNumber( "numObjects" , d->stats.nrecords );
+ result.append( "millis" , timer.millis() );
+ return 1;
+ }
+ c = theDataFileMgr.findAll( ns.c_str() );
+ }
+ else if ( min.isEmpty() || max.isEmpty() ) {
+ errmsg = "only one of min or max specified";
+ return false;
+ }
+ else {
+ IndexDetails *idx = cmdIndexDetailsForRange( ns.c_str(), errmsg, min, max, keyPattern );
+ if ( idx == 0 )
+ return false;
+
+ c.reset( BtreeCursor::make( d, d->idxNo(*idx), *idx, min, max, false, 1 ) );
+ }
+
+ long long avgObjSize = d->stats.datasize / d->stats.nrecords;
+
+ long long maxSize = jsobj["maxSize"].numberLong();
+ long long maxObjects = jsobj["maxObjects"].numberLong();
+
+ long long size = 0;
+ long long numObjects = 0;
+ while( c->ok() ) {
+
+ if ( estimate )
+ size += avgObjSize;
+ else
+ size += c->currLoc().rec()->netLength();
+
+ numObjects++;
+
+ if ( ( maxSize && size > maxSize ) ||
+ ( maxObjects && numObjects > maxObjects ) ) {
+ result.appendBool( "maxReached" , true );
+ break;
+ }
+
+ c->advance();
+ }
+
+ ostringstream os;
+ os << "Finding size for ns: " << ns;
+ if ( ! min.isEmpty() ) {
+ os << " between " << min << " and " << max;
+ }
+ logIfSlow( timer , os.str() );
+
+ result.appendNumber( "size", size );
+ result.appendNumber( "numObjects" , numObjects );
+ result.append( "millis" , timer.millis() );
+ return true;
+ }
+ } cmdDatasize;
+
+ namespace {
+ long long getIndexSizeForCollection(string db, string ns, BSONObjBuilder* details=NULL, int scale = 1 ) {
+ d.dbMutex.assertAtLeastReadLocked();
+
+ NamespaceDetails * nsd = nsdetails( ns.c_str() );
+ if ( ! nsd )
+ return 0;
+
+ long long totalSize = 0;
+
+ NamespaceDetails::IndexIterator ii = nsd->ii();
+ while ( ii.more() ) {
+ IndexDetails& d = ii.next();
+ string collNS = d.indexNamespace();
+ NamespaceDetails * mine = nsdetails( collNS.c_str() );
+ if ( ! mine ) {
+ log() << "error: have index [" << collNS << "] but no NamespaceDetails" << endl;
+ continue;
+ }
+ totalSize += mine->stats.datasize;
+ if ( details )
+ details->appendNumber( d.indexName() , mine->stats.datasize / scale );
+ }
+ return totalSize;
+ }
+ }
+
+ class CollectionStats : public Command {
+ public:
+ CollectionStats() : Command( "collStats", false, "collstats" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return READ; }
+ virtual void help( stringstream &help ) const {
+ help << "{ collStats:\"blog.posts\" , scale : 1 } scale divides sizes e.g. for KB use 1024\n"
+ " avgObjSize - in bytes";
+ }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ string ns = dbname + "." + jsobj.firstElement().valuestr();
+ Client::Context cx( ns );
+
+ NamespaceDetails * nsd = nsdetails( ns.c_str() );
+ if ( ! nsd ) {
+ errmsg = "ns not found";
+ return false;
+ }
+
+ result.append( "ns" , ns.c_str() );
+
+ int scale = 1;
+ if ( jsobj["scale"].isNumber() ) {
+ scale = jsobj["scale"].numberInt();
+ if ( scale <= 0 ) {
+ errmsg = "scale has to be > 0";
+ return false;
+ }
+ }
+ else if ( jsobj["scale"].trueValue() ) {
+ errmsg = "scale has to be a number > 0";
+ return false;
+ }
+
+ bool verbose = jsobj["verbose"].trueValue();
+
+ long long size = nsd->stats.datasize / scale;
+ result.appendNumber( "count" , nsd->stats.nrecords );
+ result.appendNumber( "size" , size );
+ if( nsd->stats.nrecords )
+ result.append ( "avgObjSize" , double(size) / double(nsd->stats.nrecords) );
+
+ int numExtents;
+ BSONArrayBuilder extents;
+
+ result.appendNumber( "storageSize" , nsd->storageSize( &numExtents , verbose ? &extents : 0 ) / scale );
+ result.append( "numExtents" , numExtents );
+ result.append( "nindexes" , nsd->nIndexes );
+ result.append( "lastExtentSize" , nsd->lastExtentSize / scale );
+ result.append( "paddingFactor" , nsd->paddingFactor );
+ result.append( "flags" , nsd->flags );
+
+ BSONObjBuilder indexSizes;
+ result.appendNumber( "totalIndexSize" , getIndexSizeForCollection(dbname, ns, &indexSizes, scale) / scale );
+ result.append("indexSizes", indexSizes.obj());
+
+ if ( nsd->capped ) {
+ result.append( "capped" , nsd->capped );
+ result.append( "max" , nsd->max );
+ }
+
+ if ( verbose )
+ result.appendArray( "extents" , extents.arr() );
+
+ return true;
+ }
+ } cmdCollectionStats;
+
+ class DBStats : public Command {
+ public:
+ DBStats() : Command( "dbStats", false, "dbstats" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return READ; }
+ virtual void help( stringstream &help ) const {
+ help <<
+ "Get stats on a database. Not instantaneous. Slower for databases with large .ns files.\n" <<
+ "Example: { dbStats:1, scale:1 }";
+ }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ int scale = 1;
+ if ( jsobj["scale"].isNumber() ) {
+ scale = jsobj["scale"].numberInt();
+ if ( scale <= 0 ) {
+ errmsg = "scale has to be > 0";
+ return false;
+ }
+ }
+ else if ( jsobj["scale"].trueValue() ) {
+ errmsg = "scale has to be a number > 0";
+ return false;
+ }
+
+ list<string> collections;
+ Database* d = cc().database();
+ if ( d )
+ d->namespaceIndex.getNamespaces( collections );
+
+ long long ncollections = 0;
+ long long objects = 0;
+ long long size = 0;
+ long long storageSize = 0;
+ long long numExtents = 0;
+ long long indexes = 0;
+ long long indexSize = 0;
+
+ for (list<string>::const_iterator it = collections.begin(); it != collections.end(); ++it) {
+ const string ns = *it;
+
+ NamespaceDetails * nsd = nsdetails( ns.c_str() );
+ if ( ! nsd ) {
+ errmsg = "missing ns: ";
+ errmsg += ns;
+ return false;
+ }
+
+ ncollections += 1;
+ objects += nsd->stats.nrecords;
+ size += nsd->stats.datasize;
+
+ int temp;
+ storageSize += nsd->storageSize( &temp );
+ numExtents += temp;
+
+ indexes += nsd->nIndexes;
+ indexSize += getIndexSizeForCollection(dbname, ns);
+ }
+
+ result.append ( "db" , dbname );
+ result.appendNumber( "collections" , ncollections );
+ result.appendNumber( "objects" , objects );
+ result.append ( "avgObjSize" , objects == 0 ? 0 : double(size) / double(objects) );
+ result.appendNumber( "dataSize" , size / scale );
+ result.appendNumber( "storageSize" , storageSize / scale);
+ result.appendNumber( "numExtents" , numExtents );
+ result.appendNumber( "indexes" , indexes );
+ result.appendNumber( "indexSize" , indexSize / scale );
+ result.appendNumber( "fileSize" , d->fileSize() / scale );
+ if( d )
+ result.appendNumber( "nsSizeMB", (int) d->namespaceIndex.fileLength() / 1024 / 1024 );
+
+ return true;
+ }
+ } cmdDBStats;
+
+ /* convertToCapped seems to use this */
+ class CmdCloneCollectionAsCapped : public Command {
+ public:
+ CmdCloneCollectionAsCapped() : Command( "cloneCollectionAsCapped" ) {}
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream &help ) const {
+ help << "{ cloneCollectionAsCapped:<fromName>, toCollection:<toName>, size:<sizeInBytes> }";
+ }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ string from = jsobj.getStringField( "cloneCollectionAsCapped" );
+ string to = jsobj.getStringField( "toCollection" );
+ long long size = (long long)jsobj.getField( "size" ).number();
+
+ if ( from.empty() || to.empty() || size == 0 ) {
+ errmsg = "invalid command spec";
+ return false;
+ }
+
+ string fromNs = dbname + "." + from;
+ string toNs = dbname + "." + to;
+ NamespaceDetails *nsd = nsdetails( fromNs.c_str() );
+ massert( 10301 , "source collection " + fromNs + " does not exist", nsd );
+ long long excessSize = nsd->stats.datasize - size * 2; // datasize and extentSize can't be compared exactly, so add some padding to 'size'
+ DiskLoc extent = nsd->firstExtent;
+ for( ; excessSize > extent.ext()->length && extent != nsd->lastExtent; extent = extent.ext()->xnext ) {
+ excessSize -= extent.ext()->length;
+ log( 2 ) << "cloneCollectionAsCapped skipping extent of size " << extent.ext()->length << endl;
+ log( 6 ) << "excessSize: " << excessSize << endl;
+ }
+ DiskLoc startLoc = extent.ext()->firstRecord;
+
+ CursorId id;
+ {
+ shared_ptr<Cursor> c = theDataFileMgr.findAll( fromNs.c_str(), startLoc );
+ ClientCursor *cc = new ClientCursor(0, c, fromNs.c_str());
+ id = cc->cursorid();
+ }
+
+ DBDirectClient client;
+ Client::Context ctx( toNs );
+ BSONObjBuilder spec;
+ spec.appendBool( "capped", true );
+ spec.append( "size", double( size ) );
+ if ( !userCreateNS( toNs.c_str(), spec.done(), errmsg, true ) )
+ return false;
+
+ auto_ptr< DBClientCursor > c = client.getMore( fromNs, id );
+ while( c->more() ) {
+ BSONObj obj = c->next();
+ theDataFileMgr.insertAndLog( toNs.c_str(), obj, true );
+ getDur().commitIfNeeded();
+ }
+
+ return true;
+ }
+ } cmdCloneCollectionAsCapped;
+
+ /* jan2010:
+ Converts the given collection to a capped collection w/ the specified size.
+       This command is not used heavily, and is not currently supported in sharded
+       environments.
+ */
+ class CmdConvertToCapped : public Command {
+ public:
+ CmdConvertToCapped() : Command( "convertToCapped" ) {}
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream &help ) const {
+ help << "{ convertToCapped:<fromCollectionName>, size:<sizeInBytes> }";
+ }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ BackgroundOperation::assertNoBgOpInProgForDb(dbname.c_str());
+
+ string from = jsobj.getStringField( "convertToCapped" );
+ long long size = (long long)jsobj.getField( "size" ).number();
+
+ if ( from.empty() || size == 0 ) {
+ errmsg = "invalid command spec";
+ return false;
+ }
+
+ string shortTmpName = str::stream() << ".tmp.convertToCapped." << from;
+ string longTmpName = str::stream() << dbname << "." << shortTmpName;
+
+ DBDirectClient client;
+ client.dropCollection( longTmpName );
+
+ BSONObj info;
+ if ( !client.runCommand( dbname ,
+ BSON( "cloneCollectionAsCapped" << from << "toCollection" << shortTmpName << "size" << double( size ) ),
+ info ) ) {
+ errmsg = "cloneCollectionAsCapped failed: " + info.toString();
+ return false;
+ }
+
+ if ( !client.dropCollection( dbname + "." + from ) ) {
+ errmsg = "failed to drop original collection";
+ return false;
+ }
+
+ if ( !client.runCommand( "admin",
+ BSON( "renameCollection" << longTmpName <<
+ "to" << ( dbname + "." + from ) ),
+ info ) ) {
+ errmsg = "renameCollection failed: " + info.toString();
+ return false;
+ }
+
+ return true;
+ }
+ } cmdConvertToCapped;
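+
+    /* usage sketch (illustrative; "test" and "events" are placeholder names):
+
+           DBDirectClient client;
+           BSONObj info;
+           bool ok = client.runCommand( "test",
+                                        BSON( "convertToCapped" << "events" << "size" << 1048576.0 ),
+                                        info );
+    */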
+
+ /* Returns client's uri */
+ class CmdWhatsMyUri : public Command {
+ public:
+ CmdWhatsMyUri() : Command("whatsmyuri") { }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "{whatsmyuri:1}";
+ }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ BSONObj info = cc().curop()->infoNoauth();
+ result << "you" << info[ "client" ];
+ return true;
+ }
+ } cmdWhatsMyUri;
+
+ /* For testing only, not for general use */
+ class GodInsert : public Command {
+ public:
+ GodInsert() : Command( "godinsert" ) { }
+ virtual bool adminOnly() const { return false; }
+ virtual bool logTheOp() { return false; }
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool requiresAuth() { return true; }
+ virtual void help( stringstream &help ) const {
+ help << "internal. for testing only.";
+ }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+
+ AuthenticationInfo *ai = cc().getAuthenticationInfo();
+ if ( ! ai->isLocalHost ) {
+ errmsg = "godinsert only works locally";
+ return false;
+ }
+
+ string coll = cmdObj[ "godinsert" ].valuestrsafe();
+ log() << "test only command godinsert invoked coll:" << coll << endl;
+ uassert( 13049, "godinsert must specify a collection", !coll.empty() );
+ string ns = dbname + "." + coll;
+ BSONObj obj = cmdObj[ "obj" ].embeddedObjectUserCheck();
+ {
+ dblock lk;
+ Client::Context ctx( ns );
+ theDataFileMgr.insertWithObjMod( ns.c_str(), obj, true );
+ }
+ return true;
+ }
+ } cmdGodInsert;
+
+ class DBHashCmd : public Command {
+ public:
+ DBHashCmd() : Command( "dbHash", false, "dbhash" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return READ; }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ list<string> colls;
+ Database* db = cc().database();
+ if ( db )
+ db->namespaceIndex.getNamespaces( colls );
+ colls.sort();
+
+ result.appendNumber( "numCollections" , (long long)colls.size() );
+ result.append( "host" , prettyHostName() );
+
+ md5_state_t globalState;
+ md5_init(&globalState);
+
+ BSONObjBuilder bb( result.subobjStart( "collections" ) );
+ for ( list<string>::iterator i=colls.begin(); i != colls.end(); i++ ) {
+ string c = *i;
+ if ( c.find( ".system.profil" ) != string::npos )
+ continue;
+
+ shared_ptr<Cursor> cursor;
+
+ NamespaceDetails * nsd = nsdetails( c.c_str() );
+
+ // debug SERVER-761
+ NamespaceDetails::IndexIterator ii = nsd->ii();
+ while( ii.more() ) {
+ const IndexDetails &idx = ii.next();
+ if ( !idx.head.isValid() || !idx.info.isValid() ) {
+ log() << "invalid index for ns: " << c << " " << idx.head << " " << idx.info;
+ if ( idx.info.isValid() )
+ log() << " " << idx.info.obj();
+ log() << endl;
+ }
+ }
+
+ int idNum = nsd->findIdIndex();
+ if ( idNum >= 0 ) {
+ cursor.reset( BtreeCursor::make( nsd , idNum , nsd->idx( idNum ) , BSONObj() , BSONObj() , false , 1 ) );
+ }
+ else if ( c.find( ".system." ) != string::npos ) {
+ continue;
+ }
+ else if ( nsd->capped ) {
+ cursor = findTableScan( c.c_str() , BSONObj() );
+ }
+ else {
+ log() << "can't find _id index for: " << c << endl;
+ continue;
+ }
+
+ md5_state_t st;
+ md5_init(&st);
+
+ long long n = 0;
+ while ( cursor->ok() ) {
+ BSONObj c = cursor->current();
+ md5_append( &st , (const md5_byte_t*)c.objdata() , c.objsize() );
+ n++;
+ cursor->advance();
+ }
+ md5digest d;
+ md5_finish(&st, d);
+ string hash = digestToString( d );
+
+ bb.append( c.c_str() + ( dbname.size() + 1 ) , hash );
+
+ md5_append( &globalState , (const md5_byte_t*)hash.c_str() , hash.size() );
+ }
+ bb.done();
+
+ md5digest d;
+ md5_finish(&globalState, d);
+ string hash = digestToString( d );
+
+ result.append( "md5" , hash );
+
+ return 1;
+ }
+
+ } dbhashCmd;
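+
+    /* hashing scheme, spelled out (follows the code above): each collection is
+       hashed by feeding every document's raw BSON bytes, in cursor order, into a
+       per-collection md5; the database-level "md5" field is then the md5 of the
+       concatenated hex digests, with collections visited in sorted namespace
+       order. Roughly: dbhash = md5( hex(md5(coll1)) + hex(md5(coll2)) + ... ).
+    */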
+
+ /* for diagnostic / testing purposes. */
+ class CmdSleep : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool logTheOp() { return false; }
+ virtual bool slaveOk() const { return true; }
+ virtual void help( stringstream& help ) const {
+ help << "internal testing command. Makes db block (in a read lock) for 100 seconds\n";
+ help << "w:true write lock. secs:<seconds>";
+ }
+ CmdSleep() : Command("sleep") { }
+ bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ log() << "test only command sleep invoked" << endl;
+ int secs = 100;
+ if ( cmdObj["secs"].isNumber() )
+ secs = cmdObj["secs"].numberInt();
+ if( cmdObj.getBoolField("w") ) {
+ writelock lk("");
+ sleepsecs(secs);
+ }
+ else {
+ readlock lk("");
+ sleepsecs(secs);
+ }
+ return true;
+ }
+ } cmdSleep;
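+
+    /* usage sketch (illustrative): adminOnly, so run against the admin db; the
+       field names follow help() above.
+
+           DBDirectClient client;
+           BSONObj info;
+           client.runCommand( "admin", BSON( "sleep" << 1 << "w" << true << "secs" << 5 ), info );
+    */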
+
+ // just for testing
+ class CapTrunc : public Command {
+ public:
+ CapTrunc() : Command( "captrunc" ) {}
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ virtual bool requiresAuth() { return true; }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string coll = cmdObj[ "captrunc" ].valuestrsafe();
+ uassert( 13416, "captrunc must specify a collection", !coll.empty() );
+ string ns = dbname + "." + coll;
+ int n = cmdObj.getIntField( "n" );
+
+ // inclusive range?
+ bool inc = cmdObj.getBoolField( "inc" );
+ NamespaceDetails *nsd = nsdetails( ns.c_str() );
+ ReverseCappedCursor c( nsd );
+ massert( 13417, "captrunc collection not found or empty", c.ok() );
+ for( int i = 0; i < n; ++i ) {
+ massert( 13418, "captrunc invalid n", c.advance() );
+ }
+ DiskLoc end = c.currLoc();
+ nsd->cappedTruncateAfter( ns.c_str(), end, inc );
+ return true;
+ }
+ } capTruncCmd;
+
+ // just for testing
+ class EmptyCapped : public Command {
+ public:
+ EmptyCapped() : Command( "emptycapped" ) {}
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ virtual bool requiresAuth() { return true; }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string coll = cmdObj[ "emptycapped" ].valuestrsafe();
+ uassert( 13428, "emptycapped must specify a collection", !coll.empty() );
+ string ns = dbname + "." + coll;
+ NamespaceDetails *nsd = nsdetails( ns.c_str() );
+ massert( 13429, "emptycapped no such collection", nsd );
+ nsd->emptyCappedCollection( ns.c_str() );
+ return true;
+ }
+ } emptyCappedCmd;
+
+ bool _execCommand(Command *c, const string& dbname, BSONObj& cmdObj, int queryOptions, BSONObjBuilder& result, bool fromRepl) {
+
+ try {
+ string errmsg;
+ if ( ! c->run(dbname, cmdObj, queryOptions, errmsg, result, fromRepl ) ) {
+ result.append( "errmsg" , errmsg );
+ return false;
+ }
+ }
+ catch ( SendStaleConfigException& e ){
+ log(1) << "command failed because of stale config, can retry" << causedBy( e ) << endl;
+ throw;
+ }
+ catch ( DBException& e ) {
+
+ // TODO: Rethrown errors have issues here, should divorce SendStaleConfigException from the DBException tree
+
+ stringstream ss;
+ ss << "exception: " << e.what();
+ result.append( "errmsg" , ss.str() );
+ result.append( "code" , e.getCode() );
+ return false;
+ }
+
+ return true;
+ }
+
+ /**
+ * this handles
+ - auth
+ - maintenance mode
+ - locking
+ - context
+ then calls run()
+ */
+ bool execCommand( Command * c ,
+ Client& client , int queryOptions ,
+ const char *cmdns, BSONObj& cmdObj ,
+ BSONObjBuilder& result,
+ bool fromRepl ) {
+
+ string dbname = nsToDatabase( cmdns );
+
+ AuthenticationInfo *ai = client.getAuthenticationInfo();
+
+ if( c->adminOnly() && c->localHostOnlyIfNoAuth( cmdObj ) && noauth && !ai->isLocalHost ) {
+ result.append( "errmsg" ,
+ "unauthorized: this command must run from localhost when running db without auth" );
+ log() << "command denied: " << cmdObj.toString() << endl;
+ return false;
+ }
+
+ if ( c->adminOnly() && ! fromRepl && dbname != "admin" ) {
+ result.append( "errmsg" , "access denied; use admin db" );
+ log() << "command denied: " << cmdObj.toString() << endl;
+ return false;
+ }
+
+ if ( cmdObj["help"].trueValue() ) {
+ client.curop()->ensureStarted();
+ stringstream ss;
+ ss << "help for: " << c->name << " ";
+ c->help( ss );
+ result.append( "help" , ss.str() );
+ result.append( "lockType" , c->locktype() );
+ return true;
+ }
+
+ bool canRunHere =
+ isMaster( dbname.c_str() ) ||
+ c->slaveOk() ||
+ ( c->slaveOverrideOk() && ( queryOptions & QueryOption_SlaveOk ) ) ||
+ fromRepl;
+
+ if ( ! canRunHere ) {
+ result.append( "errmsg" , "not master" );
+ result.append( "note" , "from execCommand" );
+ return false;
+ }
+
+ if ( ! c->maintenanceOk() && theReplSet && ! isMaster( dbname.c_str() ) && ! theReplSet->isSecondary() ) {
+ result.append( "errmsg" , "node is recovering" );
+ result.append( "note" , "from execCommand" );
+ return false;
+ }
+
+ if ( c->adminOnly() )
+ log( 2 ) << "command: " << cmdObj << endl;
+
+ if (c->maintenanceMode() && theReplSet && theReplSet->isSecondary()) {
+ theReplSet->setMaintenanceMode(true);
+ }
+
+ bool retval = false;
+ if ( c->locktype() == Command::NONE ) {
+ // we also trust that this won't crash
+ retval = true;
+
+ if ( c->requiresAuth() ) {
+                // test that the user has at least read permissions
+ if ( ! client.getAuthenticationInfo()->isAuthorizedReads( dbname ) ) {
+ result.append( "errmsg" , "need to login" );
+ retval = false;
+ }
+ }
+
+ if (retval) {
+ client.curop()->ensureStarted();
+ retval = _execCommand(c, dbname , cmdObj , queryOptions, result , fromRepl );
+ }
+ }
+ else if( c->locktype() != Command::WRITE ) {
+ // read lock
+ assert( ! c->logTheOp() );
+ string ns = c->parseNs(dbname, cmdObj);
+ Client::ReadContext ctx( ns , dbpath, c->requiresAuth() ); // read locks
+ client.curop()->ensureStarted();
+ retval = _execCommand(c, dbname , cmdObj , queryOptions, result , fromRepl );
+ }
+ else {
+ dassert( c->locktype() == Command::WRITE );
+ writelock lk;
+ client.curop()->ensureStarted();
+ Client::Context ctx( dbname , dbpath , c->requiresAuth() );
+ retval = _execCommand(c, dbname , cmdObj , queryOptions, result , fromRepl );
+ if ( retval && c->logTheOp() && ! fromRepl ) {
+ logOp("c", cmdns, cmdObj);
+ }
+ }
+
+ if (c->maintenanceMode() && theReplSet) {
+ theReplSet->setMaintenanceMode(false);
+ }
+
+ return retval;
+ }
+
+
+ /* TODO make these all command objects -- legacy stuff here
+
+ usage:
+ abc.$cmd.findOne( { ismaster:1 } );
+
+ returns true if ran a cmd
+ */
+ bool _runCommands(const char *ns, BSONObj& _cmdobj, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) {
+ string dbname = nsToDatabase( ns );
+
+ if( logLevel >= 1 )
+ log() << "run command " << ns << ' ' << _cmdobj << endl;
+
+ const char *p = strchr(ns, '.');
+ if ( !p ) return false;
+ if ( strcmp(p, ".$cmd") != 0 ) return false;
+
+ BSONObj jsobj;
+ {
+ BSONElement e = _cmdobj.firstElement();
+ if ( e.type() == Object && (e.fieldName()[0] == '$'
+ ? str::equals("query", e.fieldName()+1)
+ : str::equals("query", e.fieldName())))
+ {
+ jsobj = e.embeddedObject();
+ }
+ else {
+ jsobj = _cmdobj;
+ }
+ }
+
+ Client& client = cc();
+ bool ok = false;
+
+ BSONElement e = jsobj.firstElement();
+
+ Command * c = e.type() ? Command::findCommand( e.fieldName() ) : 0;
+
+ if ( c ) {
+ ok = execCommand( c , client , queryOptions , ns , jsobj , anObjBuilder , fromRepl );
+ }
+ else {
+ anObjBuilder.append("errmsg", str::stream() << "no such cmd: " << e.fieldName() );
+ anObjBuilder.append("bad cmd" , _cmdobj );
+ }
+
+ // switch to bool, but wait a bit longer before switching?
+ // anObjBuilder.append("ok", ok);
+ anObjBuilder.append("ok", ok?1.0:0.0);
+ BSONObj x = anObjBuilder.done();
+ b.appendBuf((void*) x.objdata(), x.objsize());
+
+ return true;
+ }
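+
+    /* accepted shapes, per the unwrapping above (illustrative):
+           abc.$cmd.findOne( { ismaster:1 } )             // plain command
+           abc.$cmd.findOne( { query: { ismaster:1 } } )  // wrapped
+           abc.$cmd.findOne( { $query: { ismaster:1 } } ) // wrapped, $-prefixed
+       all three run the same command; the unwrap only triggers when the first
+       element is an Object named "query" or "$query".
+    */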
+
+} // namespace mongo
diff --git a/src/mongo/db/dbcommands_admin.cpp b/src/mongo/db/dbcommands_admin.cpp
new file mode 100644
index 00000000000..ffcc3f261fe
--- /dev/null
+++ b/src/mongo/db/dbcommands_admin.cpp
@@ -0,0 +1,550 @@
+// dbcommands_admin.cpp
+
+/**
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ this file has dbcommands that are for dba type administration
+ mostly around dbs and collections
+ NOT system stuff
+*/
+
+
+#include "pch.h"
+#include "jsobj.h"
+#include "pdfile.h"
+#include "namespace-inl.h"
+#include "commands.h"
+#include "cmdline.h"
+#include "btree.h"
+#include "curop-inl.h"
+#include "../util/background.h"
+#include "../util/logfile.h"
+#include "../util/alignedbuilder.h"
+#include "../util/paths.h"
+#include "../scripting/engine.h"
+#include "../util/timer.h"
+
+namespace mongo {
+
+ class CleanCmd : public Command {
+ public:
+ CleanCmd() : Command( "clean" ) {}
+
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return WRITE; }
+
+ virtual void help(stringstream& h) const { h << "internal"; }
+
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ string dropns = dbname + "." + cmdObj.firstElement().valuestrsafe();
+
+ if ( !cmdLine.quiet )
+ tlog() << "CMD: clean " << dropns << endl;
+
+ NamespaceDetails *d = nsdetails(dropns.c_str());
+
+ if ( ! d ) {
+ errmsg = "ns not found";
+ return 0;
+ }
+
+ for ( int i = 0; i < Buckets; i++ )
+ d->deletedList[i].Null();
+
+ result.append("ns", dropns.c_str());
+ return 1;
+ }
+
+ } cleanCmd;
+
+ namespace dur {
+ boost::filesystem::path getJournalDir();
+ }
+
+ class JournalLatencyTestCmd : public Command {
+ public:
+ JournalLatencyTestCmd() : Command( "journalLatencyTest" ) {}
+
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool adminOnly() const { return true; }
+ virtual void help(stringstream& h) const { h << "test how long to write and fsync to a test file in the journal/ directory"; }
+
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ boost::filesystem::path p = dur::getJournalDir();
+ p /= "journalLatencyTest";
+
+ // remove file if already present
+ try {
+ remove(p);
+ }
+ catch(...) { }
+
+ BSONObjBuilder bb[2];
+ for( int pass = 0; pass < 2; pass++ ) {
+ LogFile f(p.string());
+ AlignedBuilder b(1024 * 1024);
+ {
+ Timer t;
+ for( int i = 0 ; i < 100; i++ ) {
+ f.synchronousAppend(b.buf(), 8192);
+ }
+ bb[pass].append("8KB", t.millis() / 100.0);
+ }
+ {
+ const int N = 50;
+ Timer t2;
+ long long x = 0;
+ for( int i = 0 ; i < N; i++ ) {
+ Timer t;
+ f.synchronousAppend(b.buf(), 8192);
+ x += t.micros();
+ sleepmillis(4);
+ }
+ long long y = t2.micros() - 4*N*1000;
+                // not really trusting timer granularity on all platforms, so use whichever of x and y is higher
+ bb[pass].append("8KBWithPauses", max(x,y) / (N*1000.0));
+ }
+ {
+ Timer t;
+ for( int i = 0 ; i < 20; i++ ) {
+ f.synchronousAppend(b.buf(), 1024 * 1024);
+ }
+ bb[pass].append("1MB", t.millis() / 20.0);
+ }
+ // second time around, we are prealloced.
+ }
+ result.append("timeMillis", bb[0].obj());
+ result.append("timeMillisWithPrealloc", bb[1].obj());
+
+ try {
+ remove(p);
+ }
+ catch(...) { }
+
+ try {
+ result.append("onSamePartition", onSamePartition(dur::getJournalDir().string(), dbpath));
+ }
+ catch(...) { }
+
+ return 1;
+ }
+ } journalLatencyTestCmd;
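+
+    /* worked numbers (illustrative): in the paused pass, x sums the measured
+       per-append times in micros over N=50 appends, while y is the whole-loop
+       elapsed time minus the 4ms*N spent sleeping; max(x,y)/(N*1000.0) converts
+       to average milliseconds per 8KB append. E.g. if each append+fsync takes
+       ~2000 micros, x is ~100000 and the reported "8KBWithPauses" is ~2.0.
+    */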
+
+ class ValidateCmd : public Command {
+ public:
+ ValidateCmd() : Command( "validate" ) {}
+
+ virtual bool slaveOk() const {
+ return true;
+ }
+
+ virtual void help(stringstream& h) const { h << "Validate contents of a namespace by scanning its data structures for correctness. Slow.\n"
+ "Add full:true option to do a more thorough check"; }
+
+ virtual LockType locktype() const { return READ; }
+        // { validate: "collectionnamewithoutthedbpart" [, scandata: <bool>] [, full: <bool>] }
+
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ string ns = dbname + "." + cmdObj.firstElement().valuestrsafe();
+ NamespaceDetails * d = nsdetails( ns.c_str() );
+ if ( !cmdLine.quiet )
+ tlog() << "CMD: validate " << ns << endl;
+
+ if ( ! d ) {
+ errmsg = "ns not found";
+ return 0;
+ }
+
+ result.append( "ns", ns );
+ validateNS( ns.c_str() , d, cmdObj, result);
+ return 1;
+ }
+
+ private:
+ void validateNS(const char *ns, NamespaceDetails *d, const BSONObj& cmdObj, BSONObjBuilder& result) {
+ const bool full = cmdObj["full"].trueValue();
+ const bool scanData = full || cmdObj["scandata"].trueValue();
+
+ bool valid = true;
+ BSONArrayBuilder errors; // explanation(s) for why valid = false
+ if ( d->capped ){
+ result.append("capped", d->capped);
+ result.append("max", d->max);
+ }
+
+ result.append("firstExtent", str::stream() << d->firstExtent.toString() << " ns:" << d->firstExtent.ext()->nsDiagnostic.toString());
+ result.append( "lastExtent", str::stream() << d->lastExtent.toString() << " ns:" << d->lastExtent.ext()->nsDiagnostic.toString());
+
+ BSONArrayBuilder extentData;
+
+ try {
+ d->firstExtent.ext()->assertOk();
+ d->lastExtent.ext()->assertOk();
+
+ DiskLoc el = d->firstExtent;
+ int ne = 0;
+ while( !el.isNull() ) {
+ Extent *e = el.ext();
+ e->assertOk();
+ el = e->xnext;
+ ne++;
+ if ( full )
+ extentData << e->dump();
+
+ killCurrentOp.checkForInterrupt();
+ }
+ result.append("extentCount", ne);
+ }
+ catch (...) {
+ valid=false;
+ errors << "extent asserted";
+ }
+
+ if ( full )
+ result.appendArray( "extents" , extentData.arr() );
+
+
+ result.appendNumber("datasize", d->stats.datasize);
+ result.appendNumber("nrecords", d->stats.nrecords);
+ result.appendNumber("lastExtentSize", d->lastExtentSize);
+ result.appendNumber("padding", d->paddingFactor);
+
+
+ try {
+
+ try {
+ result.append("firstExtentDetails", d->firstExtent.ext()->dump());
+
+ valid = valid && d->firstExtent.ext()->validates() &&
+ d->firstExtent.ext()->xprev.isNull();
+ }
+ catch (...) {
+ errors << "exception firstextent";
+ valid = false;
+ }
+
+ set<DiskLoc> recs;
+ if( scanData ) {
+ shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
+ int n = 0;
+ int nInvalid = 0;
+ long long len = 0;
+ long long nlen = 0;
+ int outOfOrder = 0;
+ DiskLoc cl_last;
+ while ( c->ok() ) {
+ n++;
+
+ DiskLoc cl = c->currLoc();
+ if ( n < 1000000 )
+ recs.insert(cl);
+ if ( d->capped ) {
+ if ( cl < cl_last )
+ outOfOrder++;
+ cl_last = cl;
+ }
+
+ Record *r = c->_current();
+ len += r->lengthWithHeaders;
+ nlen += r->netLength();
+
+ if (full){
+ BSONObj obj(r);
+ if (!obj.isValid() || !obj.valid()){ // both fast and deep checks
+ valid = false;
+ if (nInvalid == 0) // only log once;
+ errors << "invalid bson object detected (see logs for more info)";
+
+ nInvalid++;
+ if (strcmp("_id", obj.firstElementFieldName()) == 0){
+ try {
+ obj.firstElement().validate(); // throws on error
+ log() << "Invalid bson detected in " << ns << " with _id: " << obj.firstElement().toString(false) << endl;
+ }
+ catch(...){
+ log() << "Invalid bson detected in " << ns << " with corrupt _id" << endl;
+ }
+ }
+ else {
+ log() << "Invalid bson detected in " << ns << " and couldn't find _id" << endl;
+ }
+ }
+ }
+
+ c->advance();
+ }
+ if ( d->capped && !d->capLooped() ) {
+ result.append("cappedOutOfOrder", outOfOrder);
+ if ( outOfOrder > 1 ) {
+ valid = false;
+ errors << "too many out of order records";
+ }
+ }
+ result.append("objectsFound", n);
+
+ if (full) {
+ result.append("invalidObjects", nInvalid);
+ }
+
+ result.appendNumber("bytesWithHeaders", len);
+ result.appendNumber("bytesWithoutHeaders", nlen);
+ }
+
+ BSONArrayBuilder deletedListArray;
+ for ( int i = 0; i < Buckets; i++ ) {
+ deletedListArray << d->deletedList[i].isNull();
+ }
+
+ int ndel = 0;
+ long long delSize = 0;
+ int incorrect = 0;
+ for ( int i = 0; i < Buckets; i++ ) {
+ DiskLoc loc = d->deletedList[i];
+ try {
+ int k = 0;
+ while ( !loc.isNull() ) {
+ if ( recs.count(loc) )
+ incorrect++;
+ ndel++;
+
+ if ( loc.questionable() ) {
+ if( d->capped && !loc.isValid() && i == 1 ) {
+ /* the constructor for NamespaceDetails intentionally sets deletedList[1] to invalid
+ see comments in namespace.h
+ */
+ break;
+ }
+
+ if ( loc.a() <= 0 || strstr(ns, "hudsonSmall") == 0 ) {
+ string err (str::stream() << "bad deleted loc: " << loc.toString() << " bucket:" << i << " k:" << k);
+ errors << err;
+
+ valid = false;
+ break;
+ }
+ }
+
+ DeletedRecord *d = loc.drec();
+ delSize += d->lengthWithHeaders;
+ loc = d->nextDeleted;
+ k++;
+ killCurrentOp.checkForInterrupt();
+ }
+ }
+ catch (...) {
+ errors << ("exception in deleted chain for bucket " + BSONObjBuilder::numStr(i));
+ valid = false;
+ }
+ }
+ result.appendNumber("deletedCount", ndel);
+ result.appendNumber("deletedSize", delSize);
+
+ if ( incorrect ) {
+ errors << (BSONObjBuilder::numStr(incorrect) + " records from datafile are in deleted list");
+ valid = false;
+ }
+
+ int idxn = 0;
+ try {
+ result.append("nIndexes", d->nIndexes);
+ BSONObjBuilder indexes; // not using subObjStart to be exception safe
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& id = i.next();
+ long long keys = id.idxInterface().fullValidate(id.head, id.keyPattern());
+ indexes.appendNumber(id.indexNamespace(), keys);
+ }
+ result.append("keysPerIndex", indexes.done());
+ }
+ catch (...) {
+ errors << ("exception during index validate idxn " + BSONObjBuilder::numStr(idxn));
+ valid=false;
+ }
+
+ }
+ catch (AssertionException) {
+ errors << "exception during validate";
+ valid = false;
+ }
+
+ result.appendBool("valid", valid);
+ result.append("errors", errors.arr());
+
+ if ( !full ){
+ result.append("warning", "Some checks omitted for speed. use {full:true} option to do more thorough scan.");
+ }
+
+ if ( !valid ) {
+ result.append("advice", "ns corrupt, requires repair");
+ }
+
+ }
+ } validateCmd;
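+
+    /* usage sketch (illustrative; "test" and "events" are placeholder names):
+
+           DBDirectClient client;
+           BSONObj info;
+           client.runCommand( "test", BSON( "validate" << "events" << "full" << true ), info );
+           // info then carries "valid", "errors", "datasize", "keysPerIndex", ...
+    */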
+
+ bool lockedForWriting = false; // read from db/instance.cpp
+ static bool unlockRequested = false;
+ static mongo::mutex fsyncLockMutex("fsyncLock");
+ static boost::condition fsyncLockCondition;
+ static OID fsyncLockID; // identifies the current lock job
+
+ /*
+ class UnlockCommand : public Command {
+ public:
+ UnlockCommand() : Command( "unlock" ) { }
+ virtual bool readOnly() { return true; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( lockedForWriting ) {
+ log() << "command: unlock requested" << endl;
+ errmsg = "unlock requested";
+ unlockRequested = true;
+ }
+ else {
+ errmsg = "not locked, so cannot unlock";
+ return 0;
+ }
+ return 1;
+ }
+
+ } unlockCommand;
+ */
+ /* see unlockFsync() for unlocking:
+ db.$cmd.sys.unlock.findOne()
+ */
+ class FSyncCommand : public Command {
+ static const char* url() { return "http://www.mongodb.org/display/DOCS/fsync+Command"; }
+ class LockDBJob : public BackgroundJob {
+ protected:
+ virtual string name() const { return "lockdbjob"; }
+ void run() {
+ Client::initThread("fsyncjob");
+ Client& c = cc();
+ {
+ scoped_lock lk(fsyncLockMutex);
+ while (lockedForWriting){ // there is a small window for two LockDBJob's to be active. This prevents it.
+ fsyncLockCondition.wait(lk.boost());
+ }
+ lockedForWriting = true;
+ fsyncLockID.init();
+ }
+ readlock lk("");
+ MemoryMappedFile::flushAll(true);
+ log() << "db is now locked for snapshotting, no writes allowed. db.fsyncUnlock() to unlock" << endl;
+ log() << " For more info see " << FSyncCommand::url() << endl;
+ _ready = true;
+ {
+ scoped_lock lk(fsyncLockMutex);
+ while( !unlockRequested ) {
+ fsyncLockCondition.wait(lk.boost());
+ }
+ unlockRequested = false;
+ lockedForWriting = false;
+ fsyncLockCondition.notify_all();
+ }
+ c.shutdown();
+ }
+ public:
+ bool& _ready;
+ LockDBJob(bool& ready) : BackgroundJob( true /* delete self */ ), _ready(ready) {
+ _ready = false;
+ }
+ };
+ public:
+ FSyncCommand() : Command( "fsync" ) {}
+ virtual LockType locktype() const { return WRITE; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ /*virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) {
+ string x = cmdObj["exec"].valuestrsafe();
+ return !x.empty();
+ }*/
+ virtual void help(stringstream& h) const { h << url(); }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool sync = !cmdObj["async"].trueValue(); // async means do an fsync, but return immediately
+ bool lock = cmdObj["lock"].trueValue();
+ log() << "CMD fsync: sync:" << sync << " lock:" << lock << endl;
+
+ if( lock ) {
+ // fsync and lock variation
+
+ uassert(12034, "fsync: can't lock while an unlock is pending", !unlockRequested);
+ uassert(12032, "fsync: sync option must be true when using lock", sync);
+ /* With releaseEarly(), we must be extremely careful we don't do anything
+               where we would have assumed we were locked. Profiling is one of those things.
+               Perhaps at profile time we could check if we released early -- however,
+               we need to keep that code very fast, as it's a very common code path when profiling is on.
+ */
+ uassert(12033, "fsync: profiling must be off to enter locked mode", cc().database()->profile == 0);
+
+ // todo future: Perhaps we could do this in the background thread. As is now, writes may interleave between
+ // the releaseEarly below and the acquisition of the readlock in the background thread.
+ // However the real problem is that it seems complex to unlock here and then have a window for
+ // writes before the bg job -- can be done correctly but harder to reason about correctness.
+ // If this command ran within a read lock in the first place, would it work, and then that
+ // would be quite easy?
+ // Or, could we downgrade the write lock to a read lock, wait for ready, then release?
+ getDur().syncDataAndTruncateJournal();
+
+ bool ready = false;
+ LockDBJob *l = new LockDBJob(ready);
+
+ d.dbMutex.releaseEarly();
+
+ // There is a narrow window for another lock request to come in
+ // here before the LockDBJob grabs the readlock. LockDBJob will
+ // ensure that the requests are serialized and never running
+ // concurrently
+
+ l->go();
+ // don't return until background thread has acquired the read lock
+ while( !ready ) {
+ sleepmillis(10);
+ }
+ result.append("info", "now locked against writes, use db.fsyncUnlock() to unlock");
+ result.append("seeAlso", url());
+ }
+ else {
+ // the simple fsync command case
+
+ if (sync)
+ getDur().commitNow();
+ result.append( "numFiles" , MemoryMappedFile::flushAll( sync ) );
+ }
+ return 1;
+ }
+
+ } fsyncCmd;
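+
+    /* usage sketch (illustrative): lock for a snapshot, then release. adminOnly,
+       so run against the admin db; unlocking goes through the sys.unlock path
+       noted above (db.fsyncUnlock() in the shell), which lands in
+       unlockFsyncAndWait() below.
+
+           DBDirectClient client;
+           BSONObj info;
+           client.runCommand( "admin", BSON( "fsync" << 1 << "lock" << true ), info );
+           // ... copy data files ...
+           // then unlock, e.g. db.fsyncUnlock() from a shell connection
+    */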
+
+ // Note that this will only unlock the current lock. If another thread
+ // relocks before we return we still consider the unlocking successful.
+    // This is important because if two scripts are trying to fsync-lock, each
+    // one must be assured that between the fsync return and the call to unlock
+    // the database is fully locked.
+ void unlockFsyncAndWait(){
+ scoped_lock lk(fsyncLockMutex);
+ if (lockedForWriting) { // could have handled another unlock before we grabbed the lock
+ OID curOp = fsyncLockID;
+ unlockRequested = true;
+ fsyncLockCondition.notify_all();
+ while (lockedForWriting && fsyncLockID == curOp){
+ fsyncLockCondition.wait( lk.boost() );
+ }
+ }
+ }
+}
+
diff --git a/src/mongo/db/dbcommands_generic.cpp b/src/mongo/db/dbcommands_generic.cpp
new file mode 100644
index 00000000000..cfd833aa72d
--- /dev/null
+++ b/src/mongo/db/dbcommands_generic.cpp
@@ -0,0 +1,432 @@
+/** @file dbcommands_generic.cpp commands suited for any mongo server (both mongod, mongos) */
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "ops/query.h"
+#include "pdfile.h"
+#include "jsobj.h"
+#include "../bson/util/builder.h"
+#include <time.h>
+#include "introspect.h"
+#include "btree.h"
+#include "../util/lruishmap.h"
+#include "../util/md5.hpp"
+#include "../util/processinfo.h"
+#include "json.h"
+#include "repl.h"
+#include "repl_block.h"
+#include "replutil.h"
+#include "commands.h"
+#include "db.h"
+#include "instance.h"
+#include "lasterror.h"
+#include "security.h"
+#include "../scripting/engine.h"
+#include "stats/counters.h"
+#include "background.h"
+#include "../util/version.h"
+#include "../util/ramlog.h"
+#include "repl/multicmd.h"
+#include "server.h"
+
+namespace mongo {
+
+#if 0
+ namespace cloud {
+ SimpleMutex mtx("cloud");
+ Guarded< vector<string>, mtx > ips;
+ bool startedThread = false;
+
+ void thread() {
+ bson::bo cmd;
+ while( 1 ) {
+ list<Target> L;
+ {
+ SimpleMutex::scoped_lock lk(mtx);
+ if( ips.ref(lk).empty() )
+ continue;
+ for( unsigned i = 0; i < ips.ref(lk).size(); i++ ) {
+ L.push_back( Target(ips.ref(lk)[i]) );
+ }
+ }
+
+
+ /** repoll as machines might be down on the first lookup (only if not found previously) */
+ sleepsecs(6);
+ }
+ }
+ }
+
+ class CmdCloud : public Command {
+ public:
+ CmdCloud() : Command( "cloud" ) { }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "internal command facilitating running in certain cloud computing environments";
+ }
+ bool run(const string& dbname, BSONObj& obj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ if( !obj.hasElement("servers") ) {
+ vector<string> ips;
+ obj["servers"].Obj().Vals(ips);
+ {
+ SimpleMutex::scoped_lock lk(cloud::mtx);
+ cloud::ips.ref(lk).swap(ips);
+ if( !cloud::startedThread ) {
+ cloud::startedThread = true;
+ boost::thread thr(cloud::thread);
+ }
+ }
+ }
+ return true;
+ }
+ } cmdCloud;
+#endif
+
+ class CmdBuildInfo : public Command {
+ public:
+ CmdBuildInfo() : Command( "buildInfo", true, "buildinfo" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return false; }
+ virtual bool requiresAuth() { return false; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "get version #, etc.\n";
+ help << "{ buildinfo:1 }";
+ }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ result << "version" << versionString << "gitVersion" << gitVersion() << "sysInfo" << sysInfo();
+ result << "versionArray" << versionArray;
+ result << "bits" << ( sizeof( int* ) == 4 ? 32 : 64 );
+ result.appendBool( "debug" , debug );
+ result.appendNumber("maxBsonObjectSize", BSONObjMaxUserSize);
+ return true;
+ }
+ } cmdBuildInfo;
+
+ /** experimental. either remove or add support in repl sets also. in a repl set, getting this setting from the
+ repl set config could make sense.
+ */
+ unsigned replApplyBatchSize = 1;
+
+ class CmdGet : public Command {
+ public:
+ CmdGet() : Command( "getParameter" ) { }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "get administrative option(s)\nexample:\n";
+ help << "{ getParameter:1, notablescan:1 }\n";
+ help << "supported so far:\n";
+ help << " quiet\n";
+ help << " notablescan\n";
+ help << " logLevel\n";
+ help << " syncdelay\n";
+ help << "{ getParameter:'*' } to get everything\n";
+ }
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool all = *cmdObj.firstElement().valuestrsafe() == '*';
+
+ int before = result.len();
+
+ if( all || cmdObj.hasElement("quiet") ) {
+ result.append("quiet", cmdLine.quiet );
+ }
+ if( all || cmdObj.hasElement("notablescan") ) {
+ result.append("notablescan", cmdLine.noTableScan);
+ }
+ if( all || cmdObj.hasElement("logLevel") ) {
+ result.append("logLevel", logLevel);
+ }
+ if( all || cmdObj.hasElement("syncdelay") ) {
+ result.append("syncdelay", cmdLine.syncdelay);
+ }
+ if( all || cmdObj.hasElement("replApplyBatchSize") ) {
+ result.append("replApplyBatchSize", replApplyBatchSize);
+ }
+
+ if ( before == result.len() ) {
+ errmsg = "no option found to get";
+ return false;
+ }
+ return true;
+ }
+ } cmdGet;
+
+ // tempish
+ bool setParmsMongodSpecific(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl );
+
+ class CmdSet : public Command {
+ public:
+ CmdSet() : Command( "setParameter" ) { }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "set administrative option(s)\n";
+ help << "{ setParameter:1, <param>:<value> }\n";
+ help << "supported so far:\n";
+ help << " journalCommitInterval\n";
+ help << " logLevel\n";
+ help << " notablescan\n";
+ help << " quiet\n";
+ help << " syncdelay\n";
+ }
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ int s = 0;
+ bool found = setParmsMongodSpecific(dbname, cmdObj, errmsg, result, fromRepl);
+ if( cmdObj.hasElement("journalCommitInterval") ) {
+ if( !cmdLine.dur ) {
+ errmsg = "journaling is off";
+ return false;
+ }
+ int x = (int) cmdObj["journalCommitInterval"].Number();
+ assert( x > 1 && x < 500 );
+ cmdLine.journalCommitInterval = x;
+ log() << "setParameter journalCommitInterval=" << x << endl;
+ s++;
+ }
+ if( cmdObj.hasElement("notablescan") ) {
+ assert( !cmdLine.isMongos() );
+ if( s == 0 )
+ result.append("was", cmdLine.noTableScan);
+ cmdLine.noTableScan = cmdObj["notablescan"].Bool();
+ s++;
+ }
+ if( cmdObj.hasElement("quiet") ) {
+ if( s == 0 )
+ result.append("was", cmdLine.quiet );
+ cmdLine.quiet = cmdObj["quiet"].Bool();
+ s++;
+ }
+ if( cmdObj.hasElement("syncdelay") ) {
+ assert( !cmdLine.isMongos() );
+ if( s == 0 )
+ result.append("was", cmdLine.syncdelay );
+ cmdLine.syncdelay = cmdObj["syncdelay"].Number();
+ s++;
+ }
+ if( cmdObj.hasElement( "logLevel" ) ) {
+ if( s == 0 )
+ result.append("was", logLevel );
+ logLevel = cmdObj["logLevel"].numberInt();
+ s++;
+ }
+ if( cmdObj.hasElement( "replApplyBatchSize" ) ) {
+ if( s == 0 )
+ result.append("was", replApplyBatchSize );
+ BSONElement e = cmdObj["replApplyBatchSize"];
+ ParameterValidator * v = ParameterValidator::get( e.fieldName() );
+ assert( v );
+ if ( ! v->isValid( e , errmsg ) )
+ return false;
+ replApplyBatchSize = e.numberInt();
+ s++;
+ }
+ if( cmdObj.hasElement( "traceExceptions" ) ) {
+ if( s == 0 ) result.append( "was", DBException::traceExceptions );
+ DBException::traceExceptions = cmdObj["traceExceptions"].Bool();
+ s++;
+ }
+
+ if( s == 0 && !found ) {
+ errmsg = "no option found to set, use help:true to see options ";
+ return false;
+ }
+
+ return true;
+ }
+ } cmdSet;
+
+ class PingCommand : public Command {
+ public:
+ PingCommand() : Command( "ping" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual void help( stringstream &help ) const { help << "a way to check that the server is alive. responds immediately even if server is in a db lock."; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool requiresAuth() { return false; }
+ virtual bool run(const string& badns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ // IMPORTANT: Don't put anything in here that might lock db - including authentication
+ return true;
+ }
+ } pingCmd;
+
+ class FeaturesCmd : public Command {
+ public:
+ FeaturesCmd() : Command( "features", true ) {}
+ void help(stringstream& h) const { h << "return build level feature settings"; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool readOnly() { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if ( globalScriptEngine ) {
+ BSONObjBuilder bb( result.subobjStart( "js" ) );
+ result.append( "utf8" , globalScriptEngine->utf8Ok() );
+ bb.done();
+ }
+ if ( cmdObj["oidReset"].trueValue() ) {
+ result.append( "oidMachineOld" , OID::getMachineId() );
+ OID::regenMachineId();
+ }
+ result.append( "oidMachine" , OID::getMachineId() );
+ return true;
+ }
+
+ } featuresCmd;
+
+ class LogRotateCmd : public Command {
+ public:
+ LogRotateCmd() : Command( "logRotate" ) {}
+ virtual LockType locktype() const { return NONE; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ rotateLogs();
+ return 1;
+ }
+
+ } logRotateCmd;
+
+ class ListCommandsCmd : public Command {
+ public:
+ virtual void help( stringstream &help ) const { help << "get a list of all db commands"; }
+ ListCommandsCmd() : Command( "listCommands", false ) {}
+ virtual LockType locktype() const { return NONE; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return false; }
+ virtual bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ BSONObjBuilder b( result.subobjStart( "commands" ) );
+ for ( map<string,Command*>::iterator i=_commands->begin(); i!=_commands->end(); ++i ) {
+ Command * c = i->second;
+
+ // don't show oldnames
+ if (i->first != c->name)
+ continue;
+
+ BSONObjBuilder temp( b.subobjStart( c->name ) );
+
+ {
+ stringstream help;
+ c->help( help );
+ temp.append( "help" , help.str() );
+ }
+ temp.append( "lockType" , c->locktype() );
+ temp.append( "slaveOk" , c->slaveOk() );
+ temp.append( "adminOnly" , c->adminOnly() );
+ //optionally indicates that the command can be forced to run on a slave/secondary
+ if ( c->slaveOverrideOk() ) temp.append( "slaveOverrideOk" , c->slaveOverrideOk() );
+ temp.done();
+ }
+ b.done();
+
+ return 1;
+ }
+
+ } listCommandsCmd;
+
+ bool CmdShutdown::shutdownHelper() {
+ Client * c = currentClient.get();
+ if ( c ) {
+ c->shutdown();
+ }
+
+ log() << "terminating, shutdown command received" << endl;
+
+ dbexit( EXIT_CLEAN , "shutdown called" , true ); // this never returns
+ assert(0);
+ return true;
+ }
+
+ /* for testing purposes only */
+ class CmdForceError : public Command {
+ public:
+ virtual void help( stringstream& help ) const {
+ help << "for testing purposes only. forces a user assertion exception";
+ }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual LockType locktype() const { return NONE; }
+ CmdForceError() : Command("forceerror") {}
+        bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ uassert( 10038 , "forced error", false);
+ return true;
+ }
+ } cmdForceError;
+
+ class AvailableQueryOptions : public Command {
+ public:
+ AvailableQueryOptions() : Command( "availableQueryOptions" , false , "availablequeryoptions" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ result << "options" << QueryOption_AllSupported;
+ return true;
+ }
+ } availableQueryOptionsCmd;
+
+ class GetLogCmd : public Command {
+ public:
+ GetLogCmd() : Command( "getLog" ){}
+
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool adminOnly() const { return true; }
+
+ virtual void help( stringstream& help ) const {
+ help << "{ getLog : '*' } OR { getLog : 'global' }";
+ }
+
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string p = cmdObj.firstElement().String();
+ if ( p == "*" ) {
+ vector<string> names;
+ RamLog::getNames( names );
+
+ BSONArrayBuilder arr;
+ for ( unsigned i=0; i<names.size(); i++ ) {
+ arr.append( names[i] );
+ }
+
+ result.appendArray( "names" , arr.arr() );
+ }
+ else {
+ RamLog* rl = RamLog::get( p );
+ if ( ! rl ) {
+ errmsg = str::stream() << "no RamLog named: " << p;
+ return false;
+ }
+
+ vector<const char*> lines;
+ rl->get( lines );
+
+ BSONArrayBuilder arr( result.subarrayStart( "log" ) );
+ for ( unsigned i=0; i<lines.size(); i++ )
+ arr.append( lines[i] );
+ arr.done();
+ }
+ return true;
+ }
+
+ } getLogCmd;
+
+}
diff --git a/src/mongo/db/dbeval.cpp b/src/mongo/db/dbeval.cpp
new file mode 100644
index 00000000000..9e77d8c8097
--- /dev/null
+++ b/src/mongo/db/dbeval.cpp
@@ -0,0 +1,136 @@
+/* dbeval.cpp
+ db "commands" (sent via db.$cmd.findOne(...))
+ */
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "ops/query.h"
+#include "pdfile.h"
+#include "jsobj.h"
+#include "../bson/util/builder.h"
+#include <time.h>
+#include "introspect.h"
+#include "btree.h"
+#include "../util/lruishmap.h"
+#include "json.h"
+#include "repl.h"
+#include "commands.h"
+#include "cmdline.h"
+
+#include "../scripting/engine.h"
+
+namespace mongo {
+
+ const int edebug=0;
+
+ bool dbEval(const string& dbName, BSONObj& cmd, BSONObjBuilder& result, string& errmsg) {
+ BSONElement e = cmd.firstElement();
+ uassert( 10046 , "eval needs Code" , e.type() == Code || e.type() == CodeWScope || e.type() == String );
+
+ const char *code = 0;
+ switch ( e.type() ) {
+ case String:
+ case Code:
+ code = e.valuestr();
+ break;
+ case CodeWScope:
+ code = e.codeWScopeCode();
+ break;
+ default:
+ assert(0);
+ }
+ assert( code );
+
+ if ( ! globalScriptEngine ) {
+ errmsg = "db side execution is disabled";
+ return false;
+ }
+
+ auto_ptr<Scope> s = globalScriptEngine->getPooledScope( dbName );
+ ScriptingFunction f = s->createFunction(code);
+ if ( f == 0 ) {
+ errmsg = (string)"compile failed: " + s->getError();
+ return false;
+ }
+
+ if ( e.type() == CodeWScope )
+ s->init( e.codeWScopeScopeData() );
+ s->localConnect( dbName.c_str() );
+
+ BSONObj args;
+ {
+ BSONElement argsElement = cmd.getField("args");
+ if ( argsElement.type() == Array ) {
+ args = argsElement.embeddedObject();
+ if ( edebug ) {
+ out() << "args:" << args.toString() << endl;
+ out() << "code:\n" << code << endl;
+ }
+ }
+ }
+
+ int res;
+ {
+ Timer t;
+ res = s->invoke(f, &args, 0, cmdLine.quota ? 10 * 60 * 1000 : 0 );
+ int m = t.millis();
+ if ( m > cmdLine.slowMS ) {
+ out() << "dbeval slow, time: " << dec << m << "ms " << dbName << endl;
+ if ( m >= 1000 ) log() << code << endl;
+ else OCCASIONALLY log() << code << endl;
+ }
+ }
+ if ( res ) {
+ result.append("errno", (double) res);
+ errmsg = "invoke failed: ";
+ errmsg += s->getError();
+ return false;
+ }
+
+ s->append( result , "retval" , "return" );
+
+ return true;
+ }
+
+ class CmdEval : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual void help( stringstream &help ) const {
+ help << "Evaluate javascript at the server.\n" "http://www.mongodb.org/display/DOCS/Server-side+Code+Execution";
+ }
+ virtual LockType locktype() const { return NONE; }
+ CmdEval() : Command("eval", false, "$eval") { }
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+
+ AuthenticationInfo *ai = cc().getAuthenticationInfo();
+ uassert( 12598 , "$eval reads unauthorized", ai->isAuthorizedReads(dbname.c_str()) );
+
+ if ( cmdObj["nolock"].trueValue() ) {
+ return dbEval(dbname, cmdObj, result, errmsg);
+ }
+
+ // write security will be enforced in DBDirectClient
+ mongolock lk( ai->isAuthorized( dbname.c_str() ) );
+ Client::Context ctx( dbname );
+
+ return dbEval(dbname, cmdObj, result, errmsg);
+ }
+ } cmdeval;
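+
+    /* usage sketch (illustrative; the argument 5 is arbitrary). "eval" also
+       answers to its old name "$eval":
+
+           DBDirectClient client;
+           BSONObj info;
+           client.runCommand( "test",
+                              BSON( "eval" << "function(x){ return x*2; }"
+                                    << "args" << BSON_ARRAY( 5 ) ),
+                              info );
+           // on success info["retval"] holds 10
+    */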
+
+} // namespace mongo
diff --git a/src/mongo/db/dbhelpers.cpp b/src/mongo/db/dbhelpers.cpp
new file mode 100644
index 00000000000..39540c9ce89
--- /dev/null
+++ b/src/mongo/db/dbhelpers.cpp
@@ -0,0 +1,353 @@
+// dbhelpers.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "db.h"
+#include "dbhelpers.h"
+#include "json.h"
+#include "queryoptimizer.h"
+#include "btree.h"
+#include "pdfile.h"
+#include "oplog.h"
+#include "ops/update.h"
+#include "ops/delete.h"
+
+namespace mongo {
+
+ void Helpers::ensureIndex(const char *ns, BSONObj keyPattern, bool unique, const char *name) {
+ NamespaceDetails *d = nsdetails(ns);
+ if( d == 0 )
+ return;
+
+ {
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ if( i.next().keyPattern().woCompare(keyPattern) == 0 )
+ return;
+ }
+ }
+
+ if( d->nIndexes >= NamespaceDetails::NIndexesMax ) {
+ problem() << "Helper::ensureIndex fails, MaxIndexes exceeded " << ns << '\n';
+ return;
+ }
+
+ string system_indexes = cc().database()->name + ".system.indexes";
+
+ BSONObjBuilder b;
+ b.append("name", name);
+ b.append("ns", ns);
+ b.append("key", keyPattern);
+ b.appendBool("unique", unique);
+ BSONObj o = b.done();
+
+ theDataFileMgr.insert(system_indexes.c_str(), o.objdata(), o.objsize());
+ }
+
+ /* fetch a single object from collection ns that matches query
+ set your db SavedContext first
+ */
+ bool Helpers::findOne(const char *ns, const BSONObj &query, BSONObj& result, bool requireIndex) {
+ DiskLoc loc = findOne( ns, query, requireIndex );
+ if ( loc.isNull() )
+ return false;
+ result = loc.obj();
+ return true;
+ }
+
+ /* fetch a single object from collection ns that matches query
+ set your db SavedContext first
+ */
+ DiskLoc Helpers::findOne(const char *ns, const BSONObj &query, bool requireIndex) {
+ shared_ptr<Cursor> c = NamespaceDetailsTransient::getCursor( ns, query, BSONObj(), requireIndex );
+ while( c->ok() ) {
+ if ( c->currentMatches() && !c->getsetdup( c->currLoc() ) ) {
+ return c->currLoc();
+ }
+ c->advance();
+ }
+ return DiskLoc();
+ }
+
+ bool Helpers::findById(Client& c, const char *ns, BSONObj query, BSONObj& result ,
+ bool * nsFound , bool * indexFound ) {
+ d.dbMutex.assertAtLeastReadLocked();
+ Database *database = c.database();
+ assert( database );
+ NamespaceDetails *d = database->namespaceIndex.details(ns);
+ if ( ! d )
+ return false;
+ if ( nsFound )
+ *nsFound = 1;
+
+ int idxNo = d->findIdIndex();
+ if ( idxNo < 0 )
+ return false;
+ if ( indexFound )
+ *indexFound = 1;
+
+ IndexDetails& i = d->idx( idxNo );
+
+ BSONObj key = i.getKeyFromQuery( query );
+
+ DiskLoc loc = i.idxInterface().findSingle(i , i.head , key);
+ if ( loc.isNull() )
+ return false;
+ result = loc.obj();
+ return true;
+ }
+
+ DiskLoc Helpers::findById(NamespaceDetails *d, BSONObj idquery) {
+ assert(d);
+ int idxNo = d->findIdIndex();
+ uassert(13430, "no _id index", idxNo>=0);
+ IndexDetails& i = d->idx( idxNo );
+ BSONObj key = i.getKeyFromQuery( idquery );
+ return i.idxInterface().findSingle(i , i.head , key);
+ }
+
+ bool Helpers::isEmpty(const char *ns, bool doAuth) {
+ Client::Context context(ns, dbpath, doAuth);
+ shared_ptr<Cursor> c = DataFileMgr::findAll(ns);
+ return !c->ok();
+ }
+
+ /* Get the first object from a collection. Generally only useful if the collection
+       only ever has a single object -- which is a "singleton collection".
+
+ Returns: true if object exists.
+ */
+ bool Helpers::getSingleton(const char *ns, BSONObj& result) {
+ Client::Context context(ns);
+
+ shared_ptr<Cursor> c = DataFileMgr::findAll(ns);
+ if ( !c->ok() ) {
+ context.getClient()->curop()->done();
+ return false;
+ }
+
+ result = c->current();
+ context.getClient()->curop()->done();
+ return true;
+ }
+
+ bool Helpers::getLast(const char *ns, BSONObj& result) {
+ Client::Context ctx(ns);
+ shared_ptr<Cursor> c = findTableScan(ns, reverseNaturalObj);
+ if( !c->ok() )
+ return false;
+ result = c->current();
+ return true;
+ }
+
+ void Helpers::upsert( const string& ns , const BSONObj& o ) {
+ BSONElement e = o["_id"];
+ assert( e.type() );
+ BSONObj id = e.wrap();
+
+ OpDebug debug;
+ Client::Context context(ns);
+ updateObjects(ns.c_str(), o, /*pattern=*/id, /*upsert=*/true, /*multi=*/false , /*logtheop=*/true , debug );
+ }
+
+ void Helpers::putSingleton(const char *ns, BSONObj obj) {
+ OpDebug debug;
+ Client::Context context(ns);
+ updateObjects(ns, obj, /*pattern=*/BSONObj(), /*upsert=*/true, /*multi=*/false , /*logtheop=*/true , debug );
+ context.getClient()->curop()->done();
+ }
+
+ void Helpers::putSingletonGod(const char *ns, BSONObj obj, bool logTheOp) {
+ OpDebug debug;
+ Client::Context context(ns);
+ _updateObjects(/*god=*/true, ns, obj, /*pattern=*/BSONObj(), /*upsert=*/true, /*multi=*/false , logTheOp , debug );
+ context.getClient()->curop()->done();
+ }
+
+ BSONObj Helpers::toKeyFormat( const BSONObj& o , BSONObj& key ) {
+ BSONObjBuilder me;
+ BSONObjBuilder k;
+
+ BSONObjIterator i( o );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ k.append( e.fieldName() , 1 );
+ me.appendAs( e , "" );
+ }
+ key = k.obj();
+ return me.obj();
+ }
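+
+    /* worked example (illustrative): given o = { a : 1, b : "x" }, toKeyFormat
+       returns the key value { "" : 1, "" : "x" } and sets key to the pattern
+       { a : 1, b : 1 } -- values keep their order but lose their field names,
+       matching the unnamed-element form used for btree keys.
+    */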
+
+ long long Helpers::removeRange( const string& ns , const BSONObj& min , const BSONObj& max , bool yield , bool maxInclusive , RemoveCallback * callback ) {
+ BSONObj keya , keyb;
+ BSONObj minClean = toKeyFormat( min , keya );
+ BSONObj maxClean = toKeyFormat( max , keyb );
+ assert( keya == keyb );
+
+ Client::Context ctx(ns);
+ NamespaceDetails* nsd = nsdetails( ns.c_str() );
+ if ( ! nsd )
+ return 0;
+
+ int ii = nsd->findIndexByKeyPattern( keya );
+ assert( ii >= 0 );
+
+ long long num = 0;
+
+ IndexDetails& i = nsd->idx( ii );
+
+ shared_ptr<Cursor> c( BtreeCursor::make( nsd , ii , i , minClean , maxClean , maxInclusive, 1 ) );
+ auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) );
+ cc->setDoingDeletes( true );
+
+ while ( c->ok() ) {
+
+ if ( yield && ! cc->yieldSometimes( ClientCursor::WillNeed) ) {
+ // cursor got finished by someone else, so we're done
+ cc.release(); // if the collection/db is dropped, cc may be deleted
+ break;
+ }
+
+ if ( ! c->ok() )
+ break;
+
+ DiskLoc rloc = c->currLoc();
+
+ if ( callback )
+ callback->goingToDelete( c->current() );
+
+ c->advance();
+ c->noteLocation();
+
+ logOp( "d" , ns.c_str() , rloc.obj()["_id"].wrap() );
+ theDataFileMgr.deleteRecord(ns.c_str() , rloc.rec(), rloc);
+ num++;
+
+ c->checkLocation();
+
+ getDur().commitIfNeeded();
+
+
+ }
+
+ return num;
+ }
+
+ void Helpers::emptyCollection(const char *ns) {
+ Client::Context context(ns);
+ deleteObjects(ns, BSONObj(), false);
+ }
+
+ DbSet::~DbSet() {
+ if ( name_.empty() )
+ return;
+ try {
+ Client::Context c( name_.c_str() );
+ if ( nsdetails( name_.c_str() ) ) {
+ string errmsg;
+ BSONObjBuilder result;
+ dropCollection( name_, errmsg, result );
+ }
+ }
+ catch ( ... ) {
+ problem() << "exception cleaning up DbSet" << endl;
+ }
+ }
+
+ void DbSet::reset( const string &name, const BSONObj &key ) {
+ if ( !name.empty() )
+ name_ = name;
+ if ( !key.isEmpty() )
+ key_ = key.getOwned();
+ Client::Context c( name_.c_str() );
+ if ( nsdetails( name_.c_str() ) ) {
+ Helpers::emptyCollection( name_.c_str() );
+ }
+ else {
+ string err;
+ massert( 10303 , err, userCreateNS( name_.c_str(), fromjson( "{autoIndexId:false}" ), err, false ) );
+ }
+ Helpers::ensureIndex( name_.c_str(), key_, true, "setIdx" );
+ }
+
+ bool DbSet::get( const BSONObj &obj ) const {
+ Client::Context c( name_.c_str() );
+ BSONObj temp;
+ return Helpers::findOne( name_.c_str(), obj, temp, true );
+ }
+
+ void DbSet::set( const BSONObj &obj, bool val ) {
+ Client::Context c( name_.c_str() );
+ if ( val ) {
+ try {
+ BSONObj k = obj;
+ theDataFileMgr.insertWithObjMod( name_.c_str(), k, false );
+ }
+ catch ( DBException& ) {
+ // dup key - already in set
+ }
+ }
+ else {
+ deleteObjects( name_.c_str(), obj, true, false, false );
+ }
+ }
+
+ RemoveSaver::RemoveSaver( const string& a , const string& b , const string& why) : _out(0) {
+ static int NUM = 0;
+
+ _root = dbpath;
+ if ( a.size() )
+ _root /= a;
+ if ( b.size() )
+ _root /= b;
+ assert( a.size() || b.size() );
+
+ _file = _root;
+
+ stringstream ss;
+ ss << why << "." << terseCurrentTime(false) << "." << NUM++ << ".bson";
+ _file /= ss.str();
+
+ }
+
+ RemoveSaver::~RemoveSaver() {
+ if ( _out ) {
+ _out->close();
+ delete _out;
+ _out = 0;
+ }
+ }
+
+ void RemoveSaver::goingToDelete( const BSONObj& o ) {
+ if ( ! _out ) {
+ create_directories( _root );
+ _out = new ofstream();
+ _out->open( _file.string().c_str() , ios_base::out | ios_base::binary );
+ if ( ! _out->good() ) {
+ log( LL_WARNING ) << "couldn't create file: " << _file.string() << " for remove saving" << endl;
+ delete _out;
+ _out = 0;
+ return;
+ }
+
+ }
+ _out->write( o.objdata() , o.objsize() );
+ }
+
+
+} // namespace mongo
diff --git a/src/mongo/db/dbhelpers.h b/src/mongo/db/dbhelpers.h
new file mode 100644
index 00000000000..99d401fa1f8
--- /dev/null
+++ b/src/mongo/db/dbhelpers.h
@@ -0,0 +1,159 @@
+/* @file dbhelpers.h
+
+ db helpers are helper functions and classes that let us easily manipulate the local
+ database instance in-proc.
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "client.h"
+#include "db.h"
+
+namespace mongo {
+
+ const BSONObj reverseNaturalObj = BSON( "$natural" << -1 );
+
+ class Cursor;
+ class CoveredIndexMatcher;
+
+ /**
+ all helpers assume locking is handled above them
+ */
+ struct Helpers {
+
+ /* ensure the specified index exists.
+
+ @param keyPattern key pattern, e.g., { ts : 1 }
+ @param name index name, e.g., "name_1"
+
+ This method can be a little (not much) cpu-slow, so you may wish to use
+ OCCASIONALLY ensureIndex(...);
+
+ Note: use ensureHaveIdIndex() for the _id index: it is faster.
+ Note: does nothing if collection does not yet exist.
+ */
+ static void ensureIndex(const char *ns, BSONObj keyPattern, bool unique, const char *name);
+
+ /* fetch a single object from collection ns that matches query.
+ set your db SavedContext first.
+
+ @param query - the query to perform. note this is the low level portion of query so "orderby : ..."
+ won't work.
+
+ @param requireIndex if true, assert if no index for the query. a way to guard against
+ writing a slow query.
+
+ @return true if object found
+ */
+ static bool findOne(const char *ns, const BSONObj &query, BSONObj& result, bool requireIndex = false);
+ static DiskLoc findOne(const char *ns, const BSONObj &query, bool requireIndex);
+
+ /**
+         * @param nsFound if non-null, set to true if the namespace was found
+         * @param indexFound if non-null, set to true if the _id index was found
+ * @return true if object found
+ */
+ static bool findById(Client&, const char *ns, BSONObj query, BSONObj& result ,
+ bool * nsFound = 0 , bool * indexFound = 0 );
+
+ /* uasserts if no _id index.
+ @return null loc if not found */
+ static DiskLoc findById(NamespaceDetails *d, BSONObj query);
+
+ /** Get/put the first (or last) object from a collection. Generally only useful if the collection
+ only ever has a single object -- which is a "singleton collection".
+
+ You do not need to set the database (Context) before calling.
+
+ @return true if object exists.
+ */
+ static bool getSingleton(const char *ns, BSONObj& result);
+ static void putSingleton(const char *ns, BSONObj obj);
+ static void putSingletonGod(const char *ns, BSONObj obj, bool logTheOp);
+ static bool getFirst(const char *ns, BSONObj& result) { return getSingleton(ns, result); }
+        static bool getLast(const char *ns, BSONObj& result); // get last object in the collection; i.e. {$natural : -1} order
+
+ /**
+ * you have to lock
+ * you do not have to have Context set
+ * o has to have an _id field or will assert
+ */
+ static void upsert( const string& ns , const BSONObj& o );
+
+ /** You do not need to set the database before calling.
+ @return true if collection is empty.
+ */
+ static bool isEmpty(const char *ns, bool doAuth=true);
+
+ // TODO: this should be somewhere else probably
+ static BSONObj toKeyFormat( const BSONObj& o , BSONObj& key );
+
+ class RemoveCallback {
+ public:
+ virtual ~RemoveCallback() {}
+ virtual void goingToDelete( const BSONObj& o ) = 0;
+ };
+ /* removeRange: operation is oplog'd */
+ static long long removeRange( const string& ns , const BSONObj& min , const BSONObj& max , bool yield = false , bool maxInclusive = false , RemoveCallback * callback = 0 );
+
+ /* Remove all objects from a collection.
+ You do not need to set the database before calling.
+ */
+ static void emptyCollection(const char *ns);
+
+ };
+
+ class Database;
+
+ // manage a set using collection backed storage
+ class DbSet {
+ public:
+ DbSet( const string &name = "", const BSONObj &key = BSONObj() ) :
+ name_( name ),
+ key_( key.getOwned() ) {
+ }
+ ~DbSet();
+ void reset( const string &name = "", const BSONObj &key = BSONObj() );
+ bool get( const BSONObj &obj ) const;
+ void set( const BSONObj &obj, bool val );
+ private:
+ string name_;
+ BSONObj key_;
+ };
+
+
+ /**
+     * used for saving deleted bson objects to a flat file
+ */
+ class RemoveSaver : public Helpers::RemoveCallback , boost::noncopyable {
+ public:
+ RemoveSaver( const string& type , const string& ns , const string& why);
+ ~RemoveSaver();
+
+ void goingToDelete( const BSONObj& o );
+
+ private:
+ path _root;
+ path _file;
+ ofstream* _out;
+
+ };
+
+
+} // namespace mongo
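
DbSet, declared above, is a set with collection-backed storage: set(obj, true) inserts and relies on the unique index to swallow duplicate-key errors, set(obj, false) deletes, and reset() empties or recreates the backing collection. A rough in-memory analogue of that contract (std::set standing in for the collection, string keys standing in for the indexed key fields):

    #include <cassert>
    #include <set>
    #include <string>

    // Sketch: membership keyed by a projected string, as DbSet keys on its key_ fields.
    class InMemorySet {
        std::set<std::string> _members;
    public:
        bool get(const std::string& key) const { return _members.count(key) != 0; }
        void set(const std::string& key, bool val) {
            if (val)
                _members.insert(key);  // duplicate insert is a no-op, like the dup-key catch
            else
                _members.erase(key);   // like deleteObjects(..., justOne=true)
        }
        void reset() { _members.clear(); } // like Helpers::emptyCollection
    };

    int main() {
        InMemorySet s;
        s.set("shard0:chunk42", true);  // hypothetical member name
        assert(s.get("shard0:chunk42"));
        s.set("shard0:chunk42", false);
        assert(!s.get("shard0:chunk42"));
    }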
diff --git a/src/mongo/db/dbmessage.cpp b/src/mongo/db/dbmessage.cpp
new file mode 100644
index 00000000000..c86b5a05240
--- /dev/null
+++ b/src/mongo/db/dbmessage.cpp
@@ -0,0 +1,108 @@
+// dbmessage.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "dbmessage.h"
+#include "../client/dbclient.h"
+
+namespace mongo {
+
+ string Message::toString() const {
+ stringstream ss;
+ ss << "op: " << opToString( operation() ) << " len: " << size();
+ if ( operation() >= 2000 && operation() < 2100 ) {
+ DbMessage d(*this);
+ ss << " ns: " << d.getns();
+ switch ( operation() ) {
+ case dbUpdate: {
+ int flags = d.pullInt();
+ BSONObj q = d.nextJsObj();
+ BSONObj o = d.nextJsObj();
+ ss << " flags: " << flags << " query: " << q << " update: " << o;
+ break;
+ }
+ case dbInsert:
+ ss << d.nextJsObj();
+ break;
+ case dbDelete: {
+ int flags = d.pullInt();
+ BSONObj q = d.nextJsObj();
+ ss << " flags: " << flags << " query: " << q;
+ break;
+ }
+ default:
+ ss << " CANNOT HANDLE YET";
+ }
+
+
+ }
+ return ss.str();
+ }
+
+
+ void replyToQuery(int queryResultFlags,
+ AbstractMessagingPort* p, Message& requestMsg,
+ void *data, int size,
+ int nReturned, int startingFrom,
+ long long cursorId
+ ) {
+ BufBuilder b(32768);
+ b.skip(sizeof(QueryResult));
+ b.appendBuf(data, size);
+ QueryResult *qr = (QueryResult *) b.buf();
+ qr->_resultFlags() = queryResultFlags;
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->cursorId = cursorId;
+ qr->startingFrom = startingFrom;
+ qr->nReturned = nReturned;
+ b.decouple();
+ Message resp(qr, true);
+ p->reply(requestMsg, resp, requestMsg.header()->id);
+ }
+
+ void replyToQuery(int queryResultFlags,
+ AbstractMessagingPort* p, Message& requestMsg,
+ BSONObj& responseObj) {
+ replyToQuery(queryResultFlags,
+ p, requestMsg,
+ (void *) responseObj.objdata(), responseObj.objsize(), 1);
+ }
+
+ void replyToQuery(int queryResultFlags, Message &m, DbResponse &dbresponse, BSONObj obj) {
+ BufBuilder b;
+ b.skip(sizeof(QueryResult));
+ b.appendBuf((void*) obj.objdata(), obj.objsize());
+ QueryResult* msgdata = (QueryResult *) b.buf();
+ b.decouple();
+ QueryResult *qr = msgdata;
+ qr->_resultFlags() = queryResultFlags;
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->cursorId = 0;
+ qr->startingFrom = 0;
+ qr->nReturned = 1;
+ Message *resp = new Message();
+ resp->setData(msgdata, true); // transport will free
+ dbresponse.response = resp;
+ dbresponse.responseTo = m.header()->id;
+ }
+
+
+
+}
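
Both replyToQuery() overloads build the reply the same way: skip() reserves room for the fixed QueryResult header, the payload is appended, and only then is the buffer start cast back to the header type and backfilled with the final length and counts. A self-contained sketch of that skip-then-backfill idiom, with a simplified stand-in header:

    #include <cassert>
    #include <cstring>
    #include <vector>

    #pragma pack(1)
    struct ReplyHeader {          // simplified stand-in for QueryResult
        int       len;            // total message length, known only at the end
        long long cursorId;
        int       startingFrom;
        int       nReturned;
    };
    #pragma pack()

    std::vector<char> buildReply(const char* payload, int payloadLen, int nReturned) {
        std::vector<char> buf(sizeof(ReplyHeader));           // skip(sizeof(header))
        buf.insert(buf.end(), payload, payload + payloadLen); // appendBuf(data, size)
        // only now, after all appends, fill in the header at the front
        ReplyHeader* h = reinterpret_cast<ReplyHeader*>(&buf[0]);
        h->len = static_cast<int>(buf.size());
        h->cursorId = 0;
        h->startingFrom = 0;
        h->nReturned = nReturned;
        return buf;
    }

    int main() {
        // with the implicit trailing NUL this is the 5-byte empty BSON document {}
        const char doc[] = "\x05\x00\x00\x00";
        std::vector<char> msg = buildReply(doc, sizeof(doc), 1);
        assert(msg.size() == sizeof(ReplyHeader) + sizeof(doc));
    }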
diff --git a/src/mongo/db/dbmessage.h b/src/mongo/db/dbmessage.h
new file mode 100644
index 00000000000..a789bff849c
--- /dev/null
+++ b/src/mongo/db/dbmessage.h
@@ -0,0 +1,282 @@
+// dbmessage.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "diskloc.h"
+#include "jsobj.h"
+#include "namespace-inl.h"
+#include "../util/net/message.h"
+#include "../client/constants.h"
+#include "instance.h"
+
+namespace mongo {
+
+ /* db response format
+
+ Query or GetMore: // see struct QueryResult
+ int resultFlags;
+ int64 cursorID;
+ int startingFrom;
+ int nReturned;
+ list of marshalled JSObjects;
+ */
+
+/* db request message format
+
+   unsigned opid;         // arbitrary; will be echoed back
+ byte operation;
+ int options;
+
+ then for:
+
+ dbInsert:
+ string collection;
+ a series of JSObjects
+ dbDelete:
+ string collection;
+ int flags=0; // 1=DeleteSingle
+ JSObject query;
+ dbUpdate:
+ string collection;
+ int flags; // 1=upsert
+ JSObject query;
+ JSObject objectToUpdate;
+ objectToUpdate may include { $inc: <field> } or { $set: ... }, see struct Mod.
+ dbQuery:
+ string collection;
+ int nToSkip;
+ int nToReturn; // how many you want back as the beginning of the cursor data (0=no limit)
+ // greater than zero is simply a hint on how many objects to send back per "cursor batch".
+ // a negative number indicates a hard limit.
+ JSObject query;
+ [JSObject fieldsToReturn]
+ dbGetMore:
+ string collection; // redundant, might use for security.
+ int nToReturn;
+ int64 cursorID;
+ dbKillCursors=2007:
+ int n;
+ int64 cursorIDs[n];
+
+ Note that on Update, there is only one object, which is different
+ from insert where you can pass a list of objects to insert in the db.
+   Note that the update message layout is very similar to that of Query.
+*/
+
+
+#pragma pack(1)
+ struct QueryResult : public MsgData {
+ long long cursorId;
+ int startingFrom;
+ int nReturned;
+ const char *data() {
+ return (char *) (((int *)&nReturned)+1);
+ }
+ int resultFlags() {
+ return dataAsInt();
+ }
+ int& _resultFlags() {
+ return dataAsInt();
+ }
+ void setResultFlagsToOk() {
+ _resultFlags() = ResultFlag_AwaitCapable;
+ }
+ void initializeResultFlags() {
+ _resultFlags() = 0;
+ }
+ };
+
+#pragma pack()
+
+ /* For the database/server protocol, these objects and functions encapsulate
+ the various messages transmitted over the connection.
+
+ See http://www.mongodb.org/display/DOCS/Mongo+Wire+Protocol
+ */
+ class DbMessage {
+ public:
+ DbMessage(const Message& _m) : m(_m) , mark(0) {
+ // for received messages, Message has only one buffer
+ theEnd = _m.singleData()->_data + _m.header()->dataLen();
+ char *r = _m.singleData()->_data;
+ reserved = (int *) r;
+ data = r + 4;
+ nextjsobj = data;
+ }
+
+ /** the 32 bit field before the ns
+         * track all bit usage here as it's cross-op
+ * 0: InsertOption_ContinueOnError
+ * 1: fromWriteback
+ */
+ int& reservedField() { return *reserved; }
+
+ const char * getns() const {
+ return data;
+ }
+ void getns(Namespace& ns) const {
+ ns = data;
+ }
+
+ const char * afterNS() const {
+ return data + strlen( data ) + 1;
+ }
+
+ int getInt( int num ) const {
+ const int * foo = (const int*)afterNS();
+ return foo[num];
+ }
+
+ int getQueryNToReturn() const {
+ return getInt( 1 );
+ }
+
+ /**
+ * get an int64 at specified offsetBytes after ns
+ */
+ long long getInt64( int offsetBytes ) const {
+ const char * x = afterNS();
+ x += offsetBytes;
+ const long long * ll = (const long long*)x;
+ return ll[0];
+ }
+
+ void resetPull() { nextjsobj = data; }
+        int pullInt() const { return const_cast<DbMessage*>(this)->pullInt(); } // delegate to the non-const overload; calling pullInt() here would recurse forever
+ int& pullInt() {
+ if ( nextjsobj == data )
+ nextjsobj += strlen(data) + 1; // skip namespace
+ int& i = *((int *)nextjsobj);
+ nextjsobj += 4;
+ return i;
+ }
+        long long pullInt64() const {
+            // delegate to the non-const overload; calling pullInt64() here would recurse forever
+            return const_cast<DbMessage*>(this)->pullInt64();
+        }
+ long long &pullInt64() {
+ if ( nextjsobj == data )
+ nextjsobj += strlen(data) + 1; // skip namespace
+ long long &i = *((long long *)nextjsobj);
+ nextjsobj += 8;
+ return i;
+ }
+
+ OID* getOID() const {
+ return (OID *) (data + strlen(data) + 1); // skip namespace
+ }
+
+ void getQueryStuff(const char *&query, int& ntoreturn) {
+ int *i = (int *) (data + strlen(data) + 1);
+ ntoreturn = *i;
+ i++;
+ query = (const char *) i;
+ }
+
+ /* for insert and update msgs */
+ bool moreJSObjs() const {
+ return nextjsobj != 0;
+ }
+ BSONObj nextJsObj() {
+ if ( nextjsobj == data ) {
+ nextjsobj += strlen(data) + 1; // skip namespace
+ massert( 13066 , "Message contains no documents", theEnd > nextjsobj );
+ }
+ massert( 10304 , "Client Error: Remaining data too small for BSON object", theEnd - nextjsobj > 3 );
+ BSONObj js(nextjsobj);
+ massert( 10305 , "Client Error: Invalid object size", js.objsize() > 3 );
+            massert( 10306 , "Client Error: Next object larger than space left in message",
+                     js.objsize() < ( theEnd - nextjsobj ) );
+ if ( cmdLine.objcheck && !js.valid() ) {
+ massert( 10307 , "Client Error: bad object in message", false);
+ }
+ nextjsobj += js.objsize();
+ if ( nextjsobj >= theEnd )
+ nextjsobj = 0;
+ return js;
+ }
+
+ const Message& msg() const { return m; }
+
+ void markSet() {
+ mark = nextjsobj;
+ }
+
+ void markReset() {
+ assert( mark );
+ nextjsobj = mark;
+ }
+
+ private:
+ const Message& m;
+ int* reserved;
+ const char *data;
+ const char *nextjsobj;
+ const char *theEnd;
+
+ const char * mark;
+
+ public:
+ enum ReservedOptions {
+ Reserved_InsertOption_ContinueOnError = 1 << 0 ,
+ Reserved_FromWriteback = 1 << 1
+ };
+ };
+
+
+ /* a request to run a query, received from the database */
+ class QueryMessage {
+ public:
+ const char *ns;
+ int ntoskip;
+ int ntoreturn;
+ int queryOptions;
+ BSONObj query;
+ BSONObj fields;
+
+ /* parses the message into the above fields */
+ QueryMessage(DbMessage& d) {
+ ns = d.getns();
+ ntoskip = d.pullInt();
+ ntoreturn = d.pullInt();
+ query = d.nextJsObj();
+ if ( d.moreJSObjs() ) {
+ fields = d.nextJsObj();
+ }
+ queryOptions = d.msg().header()->dataAsInt();
+ }
+ };
+
+ void replyToQuery(int queryResultFlags,
+ AbstractMessagingPort* p, Message& requestMsg,
+ void *data, int size,
+ int nReturned, int startingFrom = 0,
+ long long cursorId = 0
+ );
+
+
+ /* object reply helper. */
+ void replyToQuery(int queryResultFlags,
+ AbstractMessagingPort* p, Message& requestMsg,
+ BSONObj& responseObj);
+
+ /* helper to do a reply using a DbResponse object */
+ void replyToQuery(int queryResultFlags, Message &m, DbResponse &dbresponse, BSONObj obj);
+
+
+} // namespace mongo
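
DbMessage treats the request as a cursor over raw bytes: getns() reads a NUL-terminated namespace, and pullInt()/pullInt64()/nextJsObj() advance a position pointer past each field in order. A minimal sketch of that sequential-pull parsing style, without the BSON validation (assumes a little-endian host, matching the wire format):

    #include <cassert>
    #include <cstring>

    // Sketch of DbMessage-style sequential reads over a received buffer.
    class BufReader {
        const char* _p;
        const char* _end;
    public:
        BufReader(const char* data, size_t len) : _p(data), _end(data + len) {}

        const char* pullCString() {             // like getns(): read to the NUL, skip past it
            const char* s = _p;
            _p += strlen(_p) + 1;
            assert(_p <= _end);
            return s;
        }
        int pullInt() {                         // like pullInt(): copy 4 bytes, advance
            assert(_end - _p >= 4);
            int v; memcpy(&v, _p, 4);           // memcpy avoids unaligned access
            _p += 4;
            return v;
        }
        bool more() const { return _p < _end; } // like moreJSObjs()
    };

    int main() {
        // hypothetical dbQuery fragment: ns, nToSkip, nToReturn (little-endian ints)
        const char raw[] = "test.foo\0\x00\x00\x00\x00\x01\x00\x00\x00";
        BufReader r(raw, sizeof(raw) - 1);
        assert(strcmp(r.pullCString(), "test.foo") == 0);
        assert(r.pullInt() == 0);  // nToSkip
        assert(r.pullInt() == 1);  // nToReturn
    }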
diff --git a/src/mongo/db/dbwebserver.cpp b/src/mongo/db/dbwebserver.cpp
new file mode 100644
index 00000000000..eb19ba3be6c
--- /dev/null
+++ b/src/mongo/db/dbwebserver.cpp
@@ -0,0 +1,539 @@
+/* dbwebserver.cpp
+
+ This is the administrative web page displayed on port 28017.
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../util/net/miniwebserver.h"
+#include "../util/mongoutils/html.h"
+#include "../util/md5.hpp"
+#include "db.h"
+#include "instance.h"
+#include "security.h"
+#include "stats/snapshots.h"
+#include "background.h"
+#include "commands.h"
+#include "../util/version.h"
+#include "../util/ramlog.h"
+#include "pcrecpp.h"
+#include "../util/admin_access.h"
+#include "dbwebserver.h"
+#include <boost/date_time/posix_time/posix_time.hpp>
+#undef assert
+#define assert MONGO_assert
+
+namespace mongo {
+
+ using namespace mongoutils::html;
+ using namespace bson;
+
+ time_t started = time(0);
+
+ struct Timing {
+ Timing() {
+ start = timeLocked = 0;
+ }
+ unsigned long long start, timeLocked;
+ };
+
+ bool execCommand( Command * c ,
+ Client& client , int queryOptions ,
+ const char *ns, BSONObj& cmdObj ,
+ BSONObjBuilder& result,
+ bool fromRepl );
+
+ class DbWebServer : public MiniWebServer {
+ public:
+ DbWebServer(const string& ip, int port, const AdminAccess* webUsers)
+ : MiniWebServer("admin web console", ip, port), _webUsers(webUsers) {
+ WebStatusPlugin::initAll();
+ }
+
+ private:
+ const AdminAccess* _webUsers; // not owned here
+
+ void doUnlockedStuff(stringstream& ss) {
+ /* this is in the header already ss << "port: " << port << '\n'; */
+ ss << "<pre>";
+ ss << mongodVersion() << '\n';
+ ss << "git hash: " << gitVersion() << '\n';
+ ss << "sys info: " << sysInfo() << '\n';
+ ss << "uptime: " << time(0)-started << " seconds\n";
+ ss << "</pre>";
+ }
+
+ bool allowed( const char * rq , vector<string>& headers, const SockAddr &from ) {
+ if ( from.isLocalHost() || !_webUsers->haveAdminUsers() ) {
+ cmdAuthenticate.authenticate( "admin", "RestUser", false );
+ return true;
+ }
+
+ string auth = getHeader( rq , "Authorization" );
+
+ if ( auth.size() > 0 && auth.find( "Digest " ) == 0 ) {
+ auth = auth.substr( 7 ) + ", ";
+
+ map<string,string> parms;
+ pcrecpp::StringPiece input( auth );
+
+ string name, val;
+ pcrecpp::RE re("(\\w+)=\"?(.*?)\"?, ");
+ while ( re.Consume( &input, &name, &val) ) {
+ parms[name] = val;
+ }
+
+ BSONObj user = _webUsers->getAdminUser( parms["username"] );
+ if ( ! user.isEmpty() ) {
+ string ha1 = user["pwd"].str();
+ string ha2 = md5simpledigest( (string)"GET" + ":" + parms["uri"] );
+
+ stringstream r;
+ r << ha1 << ':' << parms["nonce"];
+ if ( parms["nc"].size() && parms["cnonce"].size() && parms["qop"].size() ) {
+ r << ':';
+ r << parms["nc"];
+ r << ':';
+ r << parms["cnonce"];
+ r << ':';
+ r << parms["qop"];
+ }
+ r << ':';
+ r << ha2;
+ string r1 = md5simpledigest( r.str() );
+
+ if ( r1 == parms["response"] ) {
+ cmdAuthenticate.authenticate( "admin", user["user"].str(), user[ "readOnly" ].isBoolean() && user[ "readOnly" ].boolean() );
+ return true;
+ }
+ }
+ }
+
+ stringstream authHeader;
+ authHeader
+ << "WWW-Authenticate: "
+ << "Digest realm=\"mongo\", "
+ << "nonce=\"abc\", "
+ << "algorithm=MD5, qop=\"auth\" "
+ ;
+
+ headers.push_back( authHeader.str() );
+            return false;
+ }
+
+ virtual void doRequest(
+ const char *rq, // the full request
+ string url,
+ // set these and return them:
+ string& responseMsg,
+ int& responseCode,
+ vector<string>& headers, // if completely empty, content-type: text/html will be added
+ const SockAddr &from
+ ) {
+ if ( url.size() > 1 ) {
+
+ if ( ! allowed( rq , headers, from ) ) {
+ responseCode = 401;
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
+ responseMsg = "not allowed\n";
+ return;
+ }
+
+ {
+ BSONObj params;
+ const size_t pos = url.find( "?" );
+ if ( pos != string::npos ) {
+ MiniWebServer::parseParams( params , url.substr( pos + 1 ) );
+ url = url.substr(0, pos);
+ }
+
+ DbWebHandler * handler = DbWebHandler::findHandler( url );
+ if ( handler ) {
+ if ( handler->requiresREST( url ) && ! cmdLine.rest ) {
+ _rejectREST( responseMsg , responseCode , headers );
+ }
+ else {
+ string callback = params.getStringField("jsonp");
+ uassert(13453, "server not started with --jsonp", callback.empty() || cmdLine.jsonp);
+
+ handler->handle( rq , url , params , responseMsg , responseCode , headers , from );
+
+ if (responseCode == 200 && !callback.empty()) {
+ responseMsg = callback + '(' + responseMsg + ')';
+ }
+ }
+ return;
+ }
+ }
+
+
+ if ( ! cmdLine.rest ) {
+ _rejectREST( responseMsg , responseCode , headers );
+ return;
+ }
+
+ responseCode = 404;
+ headers.push_back( "Content-Type: text/html;charset=utf-8" );
+ responseMsg = "<html><body>unknown url</body></html>\n";
+ return;
+ }
+
+ // generate home page
+
+ if ( ! allowed( rq , headers, from ) ) {
+ responseCode = 401;
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
+ responseMsg = "not allowed\n";
+ return;
+ }
+
+ responseCode = 200;
+ stringstream ss;
+ string dbname;
+ {
+ stringstream z;
+ z << cmdLine.binaryName << ' ' << prettyHostName();
+ dbname = z.str();
+ }
+ ss << start(dbname) << h2(dbname);
+ ss << "<p><a href=\"/_commands\">List all commands</a> | \n";
+ ss << "<a href=\"/_replSet\">Replica set status</a></p>\n";
+
+ //ss << "<a href=\"/_status\">_status</a>";
+ {
+ const map<string, Command*> *m = Command::webCommands();
+ if( m ) {
+ ss <<
+ a("",
+ "These read-only context-less commands can be executed from the web interface. "
+ "Results are json format, unless ?text=1 is appended in which case the result is output as text "
+ "for easier human viewing",
+ "Commands")
+ << ": ";
+ for( map<string, Command*>::const_iterator i = m->begin(); i != m->end(); i++ ) {
+ stringstream h;
+ i->second->help(h);
+ string help = h.str();
+ ss << "<a href=\"/" << i->first << "?text=1\"";
+ if( help != "no help defined" )
+ ss << " title=\"" << help << '"';
+ ss << ">" << i->first << "</a> ";
+ }
+ ss << '\n';
+ }
+ }
+ ss << '\n';
+ /*
+ ss << "HTTP <a "
+ "title=\"click for documentation on this http interface\""
+ "href=\"http://www.mongodb.org/display/DOCS/Http+Interface\">admin port</a>:" << _port << "<p>\n";
+ */
+
+ doUnlockedStuff(ss);
+
+ WebStatusPlugin::runAll( ss );
+
+ ss << "</body></html>\n";
+ responseMsg = ss.str();
+ headers.push_back( "Content-Type: text/html;charset=utf-8" );
+ }
+
+ void _rejectREST( string& responseMsg , int& responseCode, vector<string>& headers ) {
+ responseCode = 403;
+ stringstream ss;
+            ss << "REST is not enabled. Use --rest to turn it on.\n";
+ ss << "check that port " << _port << " is secured for the network too.\n";
+ responseMsg = ss.str();
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
+ }
+
+ };
+ // ---
+
+ bool prisort( const Prioritizable * a , const Prioritizable * b ) {
+ return a->priority() < b->priority();
+ }
+
+ // -- status framework ---
+    WebStatusPlugin::WebStatusPlugin( const string& sectionName , double priority , const string& subheader )
+        : Prioritizable(priority), _name( sectionName ) , _subHeading( subheader ) {
+ if ( ! _plugins )
+ _plugins = new vector<WebStatusPlugin*>();
+ _plugins->push_back( this );
+ }
+
+ void WebStatusPlugin::initAll() {
+ if ( ! _plugins )
+ return;
+
+ sort( _plugins->begin(), _plugins->end() , prisort );
+
+ for ( unsigned i=0; i<_plugins->size(); i++ )
+ (*_plugins)[i]->init();
+ }
+
+ void WebStatusPlugin::runAll( stringstream& ss ) {
+ if ( ! _plugins )
+ return;
+
+ for ( unsigned i=0; i<_plugins->size(); i++ ) {
+ WebStatusPlugin * p = (*_plugins)[i];
+ ss << "<hr>\n"
+ << "<b>" << p->_name << "</b>";
+
+ ss << " " << p->_subHeading;
+
+ ss << "<br>\n";
+
+ p->run(ss);
+ }
+
+ }
+
+ vector<WebStatusPlugin*> * WebStatusPlugin::_plugins = 0;
+
+    // -- basic status plugins --
+
+ class LogPlugin : public WebStatusPlugin {
+ public:
+ LogPlugin() : WebStatusPlugin( "Log" , 100 ), _log(0) {
+ }
+
+ virtual void init() {
+ _log = RamLog::get( "global" );
+ if ( ! _log ) {
+ _log = new RamLog("global");
+ Logstream::get().addGlobalTee( _log );
+ }
+ }
+
+ virtual void run( stringstream& ss ) {
+ _log->toHTML( ss );
+ }
+ RamLog * _log;
+ };
+
+ LogPlugin * logPlugin = new LogPlugin();
+
+ // -- handler framework ---
+
+ DbWebHandler::DbWebHandler( const string& name , double priority , bool requiresREST )
+ : Prioritizable(priority), _name(name) , _requiresREST(requiresREST) {
+
+ {
+ // setup strings
+ _defaultUrl = "/";
+ _defaultUrl += name;
+
+ stringstream ss;
+ ss << name << " priority: " << priority << " rest: " << requiresREST;
+ _toString = ss.str();
+ }
+
+ {
+ // add to handler list
+ if ( ! _handlers )
+ _handlers = new vector<DbWebHandler*>();
+ _handlers->push_back( this );
+ sort( _handlers->begin() , _handlers->end() , prisort );
+ }
+ }
+
+ DbWebHandler * DbWebHandler::findHandler( const string& url ) {
+ if ( ! _handlers )
+ return 0;
+
+ for ( unsigned i=0; i<_handlers->size(); i++ ) {
+ DbWebHandler * h = (*_handlers)[i];
+ if ( h->handles( url ) )
+ return h;
+ }
+
+ return 0;
+ }
+
+ vector<DbWebHandler*> * DbWebHandler::_handlers = 0;
+
+ // --- basic handlers ---
+
+ class FavIconHandler : public DbWebHandler {
+ public:
+ FavIconHandler() : DbWebHandler( "favicon.ico" , 0 , false ) {}
+
+ virtual void handle( const char *rq, string url, BSONObj params,
+ string& responseMsg, int& responseCode,
+ vector<string>& headers, const SockAddr &from ) {
+ responseCode = 404;
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
+ responseMsg = "no favicon\n";
+ }
+
+ } faviconHandler;
+
+ class StatusHandler : public DbWebHandler {
+ public:
+ StatusHandler() : DbWebHandler( "_status" , 1 , false ) {}
+
+ virtual void handle( const char *rq, string url, BSONObj params,
+ string& responseMsg, int& responseCode,
+ vector<string>& headers, const SockAddr &from ) {
+ headers.push_back( "Content-Type: application/json;charset=utf-8" );
+ responseCode = 200;
+
+ static vector<string> commands;
+ if ( commands.size() == 0 ) {
+ commands.push_back( "serverStatus" );
+ commands.push_back( "buildinfo" );
+ }
+
+ BSONObjBuilder buf(1024);
+
+ for ( unsigned i=0; i<commands.size(); i++ ) {
+ string cmd = commands[i];
+
+ Command * c = Command::findCommand( cmd );
+ assert( c );
+ assert( c->locktype() == 0 );
+
+ BSONObj co;
+ {
+ BSONObjBuilder b;
+ b.append( cmd , 1 );
+
+ if ( cmd == "serverStatus" && params["repl"].type() ) {
+ b.append( "repl" , atoi( params["repl"].valuestr() ) );
+ }
+
+ co = b.obj();
+ }
+
+ string errmsg;
+
+ BSONObjBuilder sub;
+ if ( ! c->run( "admin.$cmd" , co , 0, errmsg , sub , false ) )
+ buf.append( cmd , errmsg );
+ else
+ buf.append( cmd , sub.obj() );
+ }
+
+ responseMsg = buf.obj().jsonString();
+
+ }
+
+ } statusHandler;
+
+ class CommandListHandler : public DbWebHandler {
+ public:
+ CommandListHandler() : DbWebHandler( "_commands" , 1 , true ) {}
+
+ virtual void handle( const char *rq, string url, BSONObj params,
+ string& responseMsg, int& responseCode,
+ vector<string>& headers, const SockAddr &from ) {
+ headers.push_back( "Content-Type: text/html;charset=utf-8" );
+ responseCode = 200;
+
+ stringstream ss;
+ ss << start("Commands List");
+ ss << p( a("/", "back", "Home") );
+ ss << p( "<b>MongoDB List of <a href=\"http://www.mongodb.org/display/DOCS/Commands\">Commands</a></b>\n" );
+ const map<string, Command*> *m = Command::commandsByBestName();
+ ss << "S:slave-ok R:read-lock W:write-lock A:admin-only<br>\n";
+ ss << table();
+ ss << "<tr><th>Command</th><th>Attributes</th><th>Help</th></tr>\n";
+ for( map<string, Command*>::const_iterator i = m->begin(); i != m->end(); i++ )
+ i->second->htmlHelp(ss);
+ ss << _table() << _end();
+
+ responseMsg = ss.str();
+ }
+ } commandListHandler;
+
+ class CommandsHandler : public DbWebHandler {
+ public:
+ CommandsHandler() : DbWebHandler( "DUMMY COMMANDS" , 2 , true ) {}
+
+ bool _cmd( const string& url , string& cmd , bool& text, bo params ) const {
+ cmd = str::after(url, '/');
+ text = params["text"].boolean();
+ return true;
+ }
+
+ Command * _cmd( const string& cmd ) const {
+ const map<string,Command*> *m = Command::webCommands();
+ if( ! m )
+ return 0;
+
+ map<string,Command*>::const_iterator i = m->find(cmd);
+ if ( i == m->end() )
+ return 0;
+
+ return i->second;
+ }
+
+ virtual bool handles( const string& url ) const {
+ string cmd;
+ bool text;
+ if ( ! _cmd( url , cmd , text, bo() ) )
+ return false;
+ return _cmd(cmd) != 0;
+ }
+
+ virtual void handle( const char *rq, string url, BSONObj params,
+ string& responseMsg, int& responseCode,
+ vector<string>& headers, const SockAddr &from ) {
+ string cmd;
+ bool text = false;
+ assert( _cmd( url , cmd , text, params ) );
+ Command * c = _cmd( cmd );
+ assert( c );
+
+ BSONObj cmdObj = BSON( cmd << 1 );
+ Client& client = cc();
+
+ BSONObjBuilder result;
+ execCommand(c, client, 0, "admin.", cmdObj , result, false);
+
+ responseCode = 200;
+
+ string j = result.done().jsonString(Strict, text );
+ responseMsg = j;
+
+ if( text ) {
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
+ responseMsg += '\n';
+ }
+ else {
+ headers.push_back( "Content-Type: application/json;charset=utf-8" );
+ }
+
+ }
+
+ } commandsHandler;
+
+ // --- external ----
+
+ void webServerThread(const AdminAccess* adminAccess) {
+ boost::scoped_ptr<const AdminAccess> adminAccessPtr(adminAccess); // adminAccess is owned here
+ Client::initThread("websvr");
+ const int p = cmdLine.port + 1000;
+ DbWebServer mini(cmdLine.bind_ip, p, adminAccessPtr.get());
+ mini.initAndListen();
+ cc().shutdown();
+ }
+
+} // namespace mongo
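
The allowed() method above normalizes the Digest Authorization header into a run of 'key=value, ' pairs and pulls them into a map with a pcrecpp::RE::Consume loop. The same parse can be sketched with std::regex (an assumption of this sketch; the server itself uses pcrecpp), over a hypothetical header value:

    #include <iostream>
    #include <map>
    #include <regex>
    #include <string>

    int main() {
        // hypothetical header value, already stripped of "Digest " and given a trailing ", "
        std::string auth = "username=\"admin\", realm=\"mongo\", nonce=\"abc\", qop=auth, ";

        std::map<std::string, std::string> parms;
        std::regex re("(\\w+)=\"?(.*?)\"?, ");  // same pattern the server feeds pcrecpp
        for (std::sregex_iterator it(auth.begin(), auth.end(), re), end; it != end; ++it)
            parms[(*it)[1].str()] = (*it)[2].str();

        std::cout << parms["username"] << ' ' << parms["nonce"] << ' ' << parms["qop"] << '\n';
        // prints: admin abc auth
    }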
diff --git a/src/mongo/db/dbwebserver.h b/src/mongo/db/dbwebserver.h
new file mode 100644
index 00000000000..bdbcba2c07d
--- /dev/null
+++ b/src/mongo/db/dbwebserver.h
@@ -0,0 +1,85 @@
+/** @file dbwebserver.h
+ */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "../util/admin_access.h"
+
+namespace mongo {
+
+ class Prioritizable {
+ public:
+ Prioritizable( double p ) : _priority(p) {}
+ double priority() const { return _priority; }
+ private:
+ double _priority;
+ };
+
+ class DbWebHandler : public Prioritizable {
+ public:
+ DbWebHandler( const string& name , double priority , bool requiresREST );
+ virtual ~DbWebHandler() {}
+
+ virtual bool handles( const string& url ) const { return url == _defaultUrl; }
+
+ virtual bool requiresREST( const string& url ) const { return _requiresREST; }
+
+ virtual void handle( const char *rq, // the full request
+ string url,
+ BSONObj params,
+ // set these and return them:
+ string& responseMsg,
+ int& responseCode,
+ vector<string>& headers, // if completely empty, content-type: text/html will be added
+ const SockAddr &from
+ ) = 0;
+
+ string toString() const { return _toString; }
+ static DbWebHandler * findHandler( const string& url );
+
+ private:
+ string _name;
+ bool _requiresREST;
+
+ string _defaultUrl;
+ string _toString;
+
+ static vector<DbWebHandler*> * _handlers;
+ };
+
+ class WebStatusPlugin : public Prioritizable {
+ public:
+        WebStatusPlugin( const string& sectionName , double priority , const string& subheader = "" );
+ virtual ~WebStatusPlugin() {}
+
+ virtual void run( stringstream& ss ) = 0;
+        /** called when web server starts up */
+ virtual void init() = 0;
+
+ static void initAll();
+ static void runAll( stringstream& ss );
+ private:
+ string _name;
+ string _subHeading;
+ static vector<WebStatusPlugin*> * _plugins;
+
+ };
+
+ void webServerThread( const AdminAccess* admins );
+ string prettyHostName();
+
+} // namespace mongo
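
DbWebHandler and WebStatusPlugin share one registration idiom: the base-class constructor appends this to a static vector kept sorted by priority, and lookup walks the vector returning the first handler whose handles(url) succeeds. A stripped-down sketch of that self-registering pattern (names are illustrative, not the server's):

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <vector>

    class Handler {
    public:
        Handler(const std::string& url, double priority) : _url(url), _priority(priority) {
            registry().push_back(this);  // self-register at construction
            std::sort(registry().begin(), registry().end(), byPriority);
        }
        virtual ~Handler() {}
        virtual bool handles(const std::string& url) const { return url == _url; }
        virtual void handle() const { std::cout << "handled by " << _url << "\n"; }

        static Handler* find(const std::string& url) {  // first match in priority order
            for (size_t i = 0; i < registry().size(); i++)
                if (registry()[i]->handles(url))
                    return registry()[i];
            return 0;
        }
    private:
        static std::vector<Handler*>& registry() {  // function-local static avoids init-order issues
            static std::vector<Handler*> v;
            return v;
        }
        static bool byPriority(const Handler* a, const Handler* b) {
            return a->_priority < b->_priority;
        }
        std::string _url;
        double _priority;
    };

    // file-scope instances register themselves, like faviconHandler / statusHandler above
    Handler favicon("/favicon.ico", 0);
    Handler status("/_status", 1);

    int main() {
        if (Handler* h = Handler::find("/_status"))
            h->handle();
    }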
diff --git a/src/mongo/db/diskloc.h b/src/mongo/db/diskloc.h
new file mode 100644
index 00000000000..5295df3e260
--- /dev/null
+++ b/src/mongo/db/diskloc.h
@@ -0,0 +1,160 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* @file diskloc.h
+
+ Storage subsystem management.
+ Lays out our datafiles on disk, manages disk space.
+*/
+
+#pragma once
+
+#include "jsobj.h"
+
+namespace mongo {
+
+ class Record;
+ class DeletedRecord;
+ class Extent;
+ class MongoDataFile;
+ class DiskLoc;
+
+ template< class Version > class BtreeBucket;
+
+#pragma pack(1)
+ /** represents a disk location/offset on disk in a database. 64 bits.
+ it is assumed these will be passed around by value a lot so don't do anything to make them large
+ (such as adding a virtual function)
+ */
+ class DiskLoc {
+        int _a;      // this will be volume, file #, etc. but is a logical value that could be anything depending on the storage engine
+ int ofs;
+
+ public:
+
+ enum SentinelValues {
+ /* note NullOfs is different. todo clean up. see refs to NullOfs in code - use is valid but outside DiskLoc context so confusing as-is. */
+ NullOfs = -1,
+ MaxFiles=16000 // thus a limit of about 32TB of data per db
+ };
+
+ DiskLoc(int a, int Ofs) : _a(a), ofs(Ofs) { }
+ DiskLoc() { Null(); }
+ DiskLoc(const DiskLoc& l) {
+ _a=l._a;
+ ofs=l.ofs;
+ }
+
+ bool questionable() const {
+ return ofs < -1 ||
+ _a < -1 ||
+ _a > 524288;
+ }
+
+ bool isNull() const { return _a == -1; }
+ void Null() {
+ _a = -1;
+ ofs = 0; /* note NullOfs is different. todo clean up. see refs to NullOfs in code - use is valid but outside DiskLoc context so confusing as-is. */
+ }
+ void assertOk() { assert(!isNull()); }
+ void setInvalid() {
+ _a = -2;
+ ofs = 0;
+ }
+ bool isValid() const { return _a != -2; }
+
+ string toString() const {
+ if ( isNull() )
+ return "null";
+ stringstream ss;
+ ss << hex << _a << ':' << ofs;
+ return ss.str();
+ }
+
+ BSONObj toBSONObj() const { return BSON( "file" << _a << "offset" << ofs ); }
+
+ int a() const { return _a; }
+
+ int& GETOFS() { return ofs; }
+ int getOfs() const { return ofs; }
+ void set(int a, int b) {
+ _a=a;
+ ofs=b;
+ }
+
+ void inc(int amt) {
+ assert( !isNull() );
+ ofs += amt;
+ }
+
+ bool sameFile(DiskLoc b) {
+            return _a == b._a;
+ }
+
+ bool operator==(const DiskLoc& b) const {
+            return _a == b._a && ofs == b.ofs;
+ }
+ bool operator!=(const DiskLoc& b) const {
+ return !(*this==b);
+ }
+ const DiskLoc& operator=(const DiskLoc& b) {
+ _a=b._a;
+ ofs = b.ofs;
+ //assert(ofs!=0);
+ return *this;
+ }
+ int compare(const DiskLoc& b) const {
+ int x = _a - b._a;
+ if ( x )
+ return x;
+ return ofs - b.ofs;
+ }
+ bool operator<(const DiskLoc& b) const {
+ return compare(b) < 0;
+ }
+
+ /**
+ * Marks this disk loc for writing
+ * @returns a non const reference to this disk loc
+ * This function explicitly signals we are writing and casts away const
+ */
+ DiskLoc& writing() const; // see dur.h
+
+ /* Get the "thing" associated with this disk location.
+ it is assumed the object is what you say it is -- you must assure that
+ (think of this as an unchecked type cast)
+ Note: set your Context first so that the database to which the diskloc applies is known.
+ */
+ BSONObj obj() const;
+ Record* rec() const;
+ DeletedRecord* drec() const;
+ Extent* ext() const;
+
+ template< class V >
+ const BtreeBucket<V> * btree() const;
+
+ // Explicitly signals we are writing and casts away const
+ template< class V >
+ BtreeBucket<V> * btreemod() const;
+
+ /*MongoDataFile& pdf() const;*/
+ };
+#pragma pack()
+
+ const DiskLoc minDiskLoc(0, 1);
+ const DiskLoc maxDiskLoc(0x7fffffff, 0x7fffffff);
+
+} // namespace mongo
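
DiskLoc packs a file number and byte offset into eight bytes and orders locations file-first, offset-second, with file number -1 as the null sentinel. A sketch that exercises the layout-sensitive parts, under the same #pragma pack(1) assumption:

    #include <cassert>
    #include <cstdio>

    #pragma pack(1)
    struct Loc {                  // simplified DiskLoc: (file #, offset)
        int a;                    // file number; -1 means null
        int ofs;

        bool isNull() const { return a == -1; }
        int compare(const Loc& b) const {  // file first, then offset
            int x = a - b.a;
            return x ? x : ofs - b.ofs;
        }
        bool operator<(const Loc& b) const { return compare(b) < 0; }
    };
    #pragma pack()

    int main() {
        // with #pragma pack(1) there is no padding, so a Loc is exactly 8 bytes
        assert(sizeof(Loc) == 8);

        Loc x = { 0, 4096 };
        Loc y = { 1, 0 };
        assert(x < y);            // lower file number sorts first regardless of offset
        printf("%x:%x\n", x.a, x.ofs);
    }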
diff --git a/src/mongo/db/driverHelpers.cpp b/src/mongo/db/driverHelpers.cpp
new file mode 100644
index 00000000000..12aa01886c4
--- /dev/null
+++ b/src/mongo/db/driverHelpers.cpp
@@ -0,0 +1,62 @@
+// driverHelpers.cpp
+
+/**
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ this file has dbcommands that are for drivers
+ mostly helpers
+*/
+
+
+#include "pch.h"
+#include "jsobj.h"
+#include "pdfile.h"
+#include "namespace-inl.h"
+#include "commands.h"
+#include "cmdline.h"
+#include "btree.h"
+#include "curop-inl.h"
+#include "../util/background.h"
+#include "../scripting/engine.h"
+
+namespace mongo {
+
+ class BasicDriverHelper : public Command {
+ public:
+ BasicDriverHelper( const char * name ) : Command( name ) {}
+
+ virtual LockType locktype() const { return NONE; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool slaveOverrideOk() { return true; }
+ };
+
+ class ObjectIdTest : public BasicDriverHelper {
+ public:
+ ObjectIdTest() : BasicDriverHelper( "driverOIDTest" ) {}
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if ( cmdObj.firstElement().type() != jstOID ) {
+ errmsg = "not oid";
+ return false;
+ }
+
+ const OID& oid = cmdObj.firstElement().__oid();
+ result.append( "oid" , oid );
+ result.append( "str" , oid.str() );
+
+ return true;
+ }
+ } driverObjectIdTest;
+}
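
driverOIDTest simply echoes an ObjectId back as both an OID value and its string form, so a driver can verify that its 12-byte encoding and the 24-character hex rendering agree with the server's. A sketch of that hex rendering over a hypothetical OID:

    #include <cstdio>
    #include <string>

    // Sketch: the 24-character hex form a driver round-trips, from a 12-byte ObjectId.
    std::string oidToHex(const unsigned char (&bytes)[12]) {
        static const char hex[] = "0123456789abcdef";
        std::string s;
        s.reserve(24);
        for (int i = 0; i < 12; i++) {
            s += hex[bytes[i] >> 4];   // high nibble
            s += hex[bytes[i] & 0xf];  // low nibble
        }
        return s;
    }

    int main() {
        // hypothetical OID bytes: 4-byte time, 3-byte machine, 2-byte pid, 3-byte counter
        unsigned char oid[12] = { 0x4f, 0x2b, 0x01, 0x00,
                                  0xaa, 0xbb, 0xcc,
                                  0x12, 0x34,
                                  0x00, 0x00, 0x01 };
        printf("%s\n", oidToHex(oid).c_str());  // 4f2b0100aabbcc1234000001
    }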
diff --git a/src/mongo/db/dur.cpp b/src/mongo/db/dur.cpp
new file mode 100644
index 00000000000..822fa5232c0
--- /dev/null
+++ b/src/mongo/db/dur.cpp
@@ -0,0 +1,840 @@
+// @file dur.cpp durability in the storage engine (crash-safeness / journaling)
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ phases:
+
+ PREPLOGBUFFER
+      we will build an output buffer ourselves and then use O_DIRECT
+ we could be in read lock for this
+ for very large objects write directly to redo log in situ?
+ WRITETOJOURNAL
+      we could be unlocked (the main db lock that is...) for this, with sufficient care, but there is some complexity:
+        we have to handle falling behind, which would use too much ram (going back into a read lock would suffice to stop that).
+ for now (1.7.5/1.8.0) we are in read lock which is not ideal.
+ WRITETODATAFILES
+ apply the writes back to the non-private MMF after they are for certain in redo log
+ REMAPPRIVATEVIEW
+ we could in a write lock quickly flip readers back to the main view, then stay in read lock and do our real
+ remapping. with many files (e.g., 1000), remapping could be time consuming (several ms), so we don't want
+ to be too frequent.
+ there could be a slow down immediately after remapping as fresh copy-on-writes for commonly written pages will
+ be required. so doing these remaps fractionally is helpful.
+
+ mutexes:
+
+ READLOCK dbMutex
+ LOCK groupCommitMutex
+ PREPLOGBUFFER()
+ READLOCK mmmutex
+ commitJob.reset()
+ UNLOCK dbMutex // now other threads can write
+ WRITETOJOURNAL()
+ WRITETODATAFILES()
+ UNLOCK mmmutex
+ UNLOCK groupCommitMutex
+
+ on the next write lock acquisition for dbMutex: // see MongoMutex::_acquiredWriteLock()
+ REMAPPRIVATEVIEW()
+
+ @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc
+*/
+
+#include "pch.h"
+#include "cmdline.h"
+#include "client.h"
+#include "dur.h"
+#include "dur_journal.h"
+#include "dur_commitjob.h"
+#include "dur_recover.h"
+#include "dur_stats.h"
+#include "../util/concurrency/race.h"
+#include "../util/mongoutils/hash.h"
+#include "../util/mongoutils/str.h"
+#include "../util/timer.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ namespace dur {
+
+ void PREPLOGBUFFER(JSectHeader& outParm);
+ void WRITETOJOURNAL(JSectHeader h, AlignedBuilder& uncompressed);
+ void WRITETODATAFILES(const JSectHeader& h, AlignedBuilder& uncompressed);
+
+ /** declared later in this file
+ only used in this file -- use DurableInterface::commitNow() outside
+ */
+ static void groupCommit();
+
+ CommitJob& commitJob = *(new CommitJob()); // don't destroy
+
+ Stats stats;
+
+ void Stats::S::reset() {
+ memset(this, 0, sizeof(*this));
+ }
+
+ Stats::Stats() {
+ _a.reset();
+ _b.reset();
+ curr = &_a;
+ _intervalMicros = 3000000;
+ }
+
+ Stats::S * Stats::other() {
+ return curr == &_a ? &_b : &_a;
+ }
+ string _CSVHeader();
+
+ string Stats::S::_CSVHeader() {
+ return "cmts jrnMB\twrDFMB\tcIWLk\tearly\tprpLgB wrToJ\twrToDF\trmpPrVw";
+ }
+
+ string Stats::S::_asCSV() {
+ stringstream ss;
+ ss <<
+ setprecision(2) <<
+ _commits << '\t' << fixed <<
+ _journaledBytes / 1000000.0 << '\t' <<
+ _writeToDataFilesBytes / 1000000.0 << '\t' <<
+ _commitsInWriteLock << '\t' <<
+ _earlyCommits << '\t' <<
+ (unsigned) (_prepLogBufferMicros/1000) << '\t' <<
+ (unsigned) (_writeToJournalMicros/1000) << '\t' <<
+ (unsigned) (_writeToDataFilesMicros/1000) << '\t' <<
+ (unsigned) (_remapPrivateViewMicros/1000);
+ return ss.str();
+ }
+
+ //int getAgeOutJournalFiles();
+ BSONObj Stats::S::_asObj() {
+ BSONObjBuilder b;
+ b <<
+ "commits" << _commits <<
+ "journaledMB" << _journaledBytes / 1000000.0 <<
+ "writeToDataFilesMB" << _writeToDataFilesBytes / 1000000.0 <<
+ "compression" << _journaledBytes / (_uncompressedBytes+1.0) <<
+ "commitsInWriteLock" << _commitsInWriteLock <<
+ "earlyCommits" << _earlyCommits <<
+ "timeMs" <<
+ BSON( "dt" << _dtMillis <<
+ "prepLogBuffer" << (unsigned) (_prepLogBufferMicros/1000) <<
+ "writeToJournal" << (unsigned) (_writeToJournalMicros/1000) <<
+ "writeToDataFiles" << (unsigned) (_writeToDataFilesMicros/1000) <<
+ "remapPrivateView" << (unsigned) (_remapPrivateViewMicros/1000)
+ );
+ /*int r = getAgeOutJournalFiles();
+ if( r == -1 )
+ b << "ageOutJournalFiles" << "mutex timeout";
+ if( r == 0 )
+ b << "ageOutJournalFiles" << false;*/
+ if( cmdLine.journalCommitInterval != 0 )
+ b << "journalCommitIntervalMs" << cmdLine.journalCommitInterval;
+ return b.obj();
+ }
+
+ BSONObj Stats::asObj() {
+ return other()->_asObj();
+ }
+
+ void Stats::rotate() {
+ unsigned long long now = curTimeMicros64();
+ unsigned long long dt = now - _lastRotate;
+ if( dt >= _intervalMicros && _intervalMicros ) {
+ // rotate
+ curr->_dtMillis = (unsigned) (dt/1000);
+ _lastRotate = now;
+ curr = other();
+ curr->reset();
+ }
+ }
+
+ void NonDurableImpl::setNoJournal(void *dst, void *src, unsigned len) {
+ memcpy(dst, src, len);
+ }
+
+ void DurableImpl::setNoJournal(void *dst, void *src, unsigned len) {
+ // we are at least read locked, so we need not worry about REMAPPRIVATEVIEW herein.
+ DEV d.dbMutex.assertAtLeastReadLocked();
+
+ MemoryMappedFile::makeWritable(dst, len);
+
+ // we enter the RecoveryJob mutex here, so that if WRITETODATAFILES is happening we do not
+ // conflict with it
+ scoped_lock lk1( RecoveryJob::get()._mx );
+
+ // we stay in this mutex for everything to work with DurParanoid/validateSingleMapMatches
+ //
+ // either of these mutexes also makes setNoJournal threadsafe, which is good as we call it from a read
+ // (not a write) lock in class SlaveTracking
+ //
+ scoped_lock lk( privateViews._mutex() );
+
+ size_t ofs;
+ MongoMMF *f = privateViews.find_inlock(dst, ofs);
+ assert(f);
+ void *w = (((char *)f->view_write())+ofs);
+ // first write it to the writable (file) view
+ memcpy(w, src, len);
+ if( memcmp(w, dst, len) ) {
+ // if we get here, a copy-on-write had previously occurred. so write it to the private view too
+ // to keep them in sync. we do this as we do not want to cause a copy on write unnecessarily.
+ memcpy(dst, src, len);
+ }
+ }
+
+ /** base declare write intent function that all the helpers call. */
+ void DurableImpl::declareWriteIntent(void *p, unsigned len) {
+ commitJob.note(p, len);
+ }
+
+ static DurableImpl* durableImpl = new DurableImpl();
+ static NonDurableImpl* nonDurableImpl = new NonDurableImpl();
+ DurableInterface* DurableInterface::_impl = nonDurableImpl;
+
+ void DurableInterface::enableDurability() {
+ assert(_impl == nonDurableImpl);
+ _impl = durableImpl;
+ }
+
+ void DurableInterface::disableDurability() {
+ assert(_impl == durableImpl);
+ massert(13616, "can't disable durability with pending writes", !commitJob.hasWritten());
+ _impl = nonDurableImpl;
+ }
+
+ bool DurableImpl::commitNow() {
+ stats.curr->_earlyCommits++;
+ groupCommit();
+ return true;
+ }
+
+ bool DurableImpl::awaitCommit() {
+ commitJob._notify.awaitBeyondNow();
+ return true;
+ }
+
+ /** Declare that a file has been created
+ Normally writes are applied only after journaling, for safety. But here the file
+ is created first, and the journal will just replay the creation if the create didn't
+ happen because of crashing.
+ */
+ void DurableImpl::createdFile(string filename, unsigned long long len) {
+ shared_ptr<DurOp> op( new FileCreatedOp(filename, len) );
+ commitJob.noteOp(op);
+ }
+
+ void* DurableImpl::writingPtr(void *x, unsigned len) {
+ void *p = x;
+ declareWriteIntent(p, len);
+ return p;
+ }
+
+ /** declare intent to write
+ @param ofs offset within buf at which we will write
+ @param len the length at ofs we will write
+ @return new buffer pointer.
+ */
+ void* DurableImpl::writingAtOffset(void *buf, unsigned ofs, unsigned len) {
+ char *p = (char *) buf;
+ declareWriteIntent(p+ofs, len);
+ return p;
+ }
+
+ void* DurableImpl::writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges ) {
+ char *p = (char *) buf;
+ for( vector< pair< long long, unsigned > >::const_iterator i = ranges.begin();
+ i != ranges.end(); ++i ) {
+ declareWriteIntent( p + i->first, i->second );
+ }
+ return p;
+ }
+
+ bool DurableImpl::aCommitIsNeeded() const {
+ DEV commitJob._nSinceCommitIfNeededCall = 0;
+ return commitJob.bytes() > UncommittedBytesLimit;
+ }
+
+ bool DurableImpl::commitIfNeeded() {
+ if ( !d.dbMutex.isWriteLocked() )
+ return false;
+
+ DEV commitJob._nSinceCommitIfNeededCall = 0;
+ if (commitJob.bytes() > UncommittedBytesLimit) { // should this also fire if CmdLine::DurAlwaysCommit?
+ stats.curr->_earlyCommits++;
+ groupCommit();
+ return true;
+ }
+ return false;
+ }
+
+ /** Used in _DEBUG builds to check that we didn't overwrite the last intent
+ that was declared. called just before writelock release. we check a few
+ bytes after the declared region to see if they changed.
+
+ @see MongoMutex::_releasedWriteLock
+
+ SLOW
+ */
+#if 0
+ void DurableImpl::debugCheckLastDeclaredWrite() {
+ static int n;
+ ++n;
+
+ assert(debug && cmdLine.dur);
+ if (commitJob.writes().empty())
+ return;
+ const WriteIntent &i = commitJob.lastWrite();
+ size_t ofs;
+ MongoMMF *mmf = privateViews.find(i.start(), ofs);
+ if( mmf == 0 )
+ return;
+ size_t past = ofs + i.length();
+ if( mmf->length() < past + 8 )
+ return; // too close to end of view
+ char *priv = (char *) mmf->getView();
+ char *writ = (char *) mmf->view_write();
+ unsigned long long *a = (unsigned long long *) (priv+past);
+ unsigned long long *b = (unsigned long long *) (writ+past);
+ if( *a != *b ) {
+                for( set<WriteIntent>::iterator it(commitJob.writes().begin()), end(commitJob.writes().end()); it != end; ++it ) {
+ const WriteIntent& wi = *it;
+ char *r1 = (char*) wi.start();
+ char *r2 = (char*) wi.end();
+ if( r1 <= (((char*)a)+8) && r2 > (char*)a ) {
+ //log() << "it's ok " << wi.p << ' ' << wi.len << endl;
+ return;
+ }
+ }
+ log() << "journal data after write area " << i.start() << " does not agree" << endl;
+ log() << " was: " << ((void*)b) << " " << hexdump((char*)b, 8) << endl;
+ log() << " now: " << ((void*)a) << " " << hexdump((char*)a, 8) << endl;
+ log() << " n: " << n << endl;
+ log() << endl;
+ }
+ }
+#endif
+
+ // Functor to be called over all MongoFiles
+
+ class validateSingleMapMatches {
+ public:
+ validateSingleMapMatches(unsigned long long& bytes) :_bytes(bytes) {}
+ void operator () (MongoFile *mf) {
+ if( mf->isMongoMMF() ) {
+ MongoMMF *mmf = (MongoMMF*) mf;
+ const unsigned char *p = (const unsigned char *) mmf->getView();
+ const unsigned char *w = (const unsigned char *) mmf->view_write();
+
+ if (!p || !w) return; // File not fully opened yet
+
+ _bytes += mmf->length();
+
+ assert( mmf->length() == (unsigned) mmf->length() );
+ {
+ scoped_lock lk( privateViews._mutex() ); // see setNoJournal
+ if (memcmp(p, w, (unsigned) mmf->length()) == 0)
+ return; // next file
+ }
+
+ unsigned low = 0xffffffff;
+ unsigned high = 0;
+ log() << "DurParanoid mismatch in " << mmf->filename() << endl;
+ int logged = 0;
+ unsigned lastMismatch = 0xffffffff;
+ for( unsigned i = 0; i < mmf->length(); i++ ) {
+ if( p[i] != w[i] ) {
+ if( lastMismatch != 0xffffffff && lastMismatch+1 != i )
+ log() << endl; // separate blocks of mismatches
+ lastMismatch= i;
+ if( ++logged < 60 ) {
+ if( logged == 1 )
+ log() << "ofs % 628 = 0x" << hex << (i%628) << endl; // for .ns files to find offset in record
+ stringstream ss;
+ ss << "mismatch ofs:" << hex << i << "\tfilemap:" << setw(2) << (unsigned) w[i] << "\tprivmap:" << setw(2) << (unsigned) p[i];
+ if( p[i] > 32 && p[i] <= 126 )
+ ss << '\t' << p[i];
+ log() << ss.str() << endl;
+ }
+ if( logged == 60 )
+ log() << "..." << endl;
+ if( i < low ) low = i;
+ if( i > high ) high = i;
+ }
+ }
+ if( low != 0xffffffff ) {
+ std::stringstream ss;
+ ss << "journal error warning views mismatch " << mmf->filename() << ' ' << (hex) << low << ".." << high << " len:" << high-low+1;
+ log() << ss.str() << endl;
+ log() << "priv loc: " << (void*)(p+low) << ' ' << endl;
+ set<WriteIntent>& b = commitJob.writes();
+ (void)b; // mark as unused. Useful for inspection in debugger
+
+ // should we abort() here so this isn't unnoticed in some circumstances?
+ massert(13599, "Written data does not match in-memory view. Missing WriteIntent?", false);
+ }
+ }
+ }
+ private:
+ unsigned long long& _bytes;
+ };
+
+ /** (SLOW) diagnostic to check that the private view and the non-private view are in sync.
+ */
+ void debugValidateAllMapsMatch() {
+ if( ! (cmdLine.durOptions & CmdLine::DurParanoid) )
+ return;
+
+ unsigned long long bytes = 0;
+ Timer t;
+ MongoFile::forEach(validateSingleMapMatches(bytes));
+ OCCASIONALLY log() << "DurParanoid map check " << t.millis() << "ms for " << (bytes / (1024*1024)) << "MB" << endl;
+ }
+
+ extern size_t privateMapBytes;
+
+ static void _REMAPPRIVATEVIEW() {
+ // todo: Consider using ProcessInfo herein and watching for getResidentSize to drop. that could be a way
+ // to assure very good behavior here.
+
+ static unsigned startAt;
+ static unsigned long long lastRemap;
+
+ LOG(4) << "journal REMAPPRIVATEVIEW" << endl;
+
+ d.dbMutex.assertWriteLocked();
+ d.dbMutex._remapPrivateViewRequested = false;
+ assert( !commitJob.hasWritten() );
+
+ // we want to remap all private views about every 2 seconds. there could be ~1000 views so
+ // we do a little each pass; beyond the remap time, more significantly, there will be copy on write
+ // faults after remapping, so doing a little bit at a time will avoid big load spikes on
+ // remapping.
+ unsigned long long now = curTimeMicros64();
+ double fraction = (now-lastRemap)/2000000.0;
+ if( cmdLine.durOptions & CmdLine::DurAlwaysRemap )
+ fraction = 1;
+ lastRemap = now;
+
+ LockMongoFilesShared lk;
+ set<MongoFile*>& files = MongoFile::getAllFiles();
+ unsigned sz = files.size();
+ if( sz == 0 )
+ return;
+
+ {
+ // be careful not to use too much memory if the write rate is
+ // extremely high
+ double f = privateMapBytes / ((double)UncommittedBytesLimit);
+ if( f > fraction ) {
+ fraction = f;
+ }
+ privateMapBytes = 0;
+ }
+
+ unsigned ntodo = (unsigned) (sz * fraction);
+ if( ntodo < 1 ) ntodo = 1;
+ if( ntodo > sz ) ntodo = sz;
+
+ const set<MongoFile*>::iterator b = files.begin();
+ const set<MongoFile*>::iterator e = files.end();
+ set<MongoFile*>::iterator i = b;
+ // skip to our starting position
+ for( unsigned x = 0; x < startAt; x++ ) {
+ i++;
+ if( i == e ) i = b;
+ }
+ unsigned startedAt = startAt;
+ startAt = (startAt + ntodo) % sz; // mark where to start next time
+
+ Timer t;
+ for( unsigned x = 0; x < ntodo; x++ ) {
+ dassert( i != e );
+ if( (*i)->isMongoMMF() ) {
+ MongoMMF *mmf = (MongoMMF*) *i;
+ assert(mmf);
+ if( mmf->willNeedRemap() ) {
+ mmf->willNeedRemap() = false;
+ mmf->remapThePrivateView();
+ }
+ i++;
+ if( i == e ) i = b;
+ }
+ }
+ LOG(2) << "journal REMAPPRIVATEVIEW done startedAt: " << startedAt << " n:" << ntodo << ' ' << t.millis() << "ms" << endl;
+ }
+
+ /** We need to remap the private views periodically. otherwise they would become very large.
+ Call within write lock. See top of file for more commentary.
+ */
+ void REMAPPRIVATEVIEW() {
+ Timer t;
+ _REMAPPRIVATEVIEW();
+ stats.curr->_remapPrivateViewMicros += t.micros();
+ }
+
+ // lock order: dbMutex first, then this
+ mutex groupCommitMutex("groupCommit");
+
+ bool _groupCommitWithLimitedLocks() {
+
+ int p = 0;
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ scoped_ptr<ExcludeAllWrites> lk1( new ExcludeAllWrites() );
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ scoped_lock lk2(groupCommitMutex);
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ commitJob.beginCommit();
+
+ if( !commitJob.hasWritten() ) {
+                // getlasterror request could have come after the data was already committed
+ commitJob.notifyCommitted();
+ return true;
+ }
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ JSectHeader h;
+ PREPLOGBUFFER(h); // need to be in readlock (writes excluded) for this
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ LockMongoFilesShared lk3;
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ unsigned abLen = commitJob._ab.len();
+ commitJob.reset(); // must be reset before allowing anyone to write
+ DEV assert( !commitJob.hasWritten() );
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ // release the readlock -- allowing others to now write while we are writing to the journal (etc.)
+ lk1.reset();
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ // ****** now other threads can do writes ******
+
+ WRITETOJOURNAL(h, commitJob._ab);
+ assert( abLen == commitJob._ab.len() ); // a check that no one touched the builder while we were doing work. if so, our locking is wrong.
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ // data is now in the journal, which is sufficient for acknowledging getLastError.
+ // (ok to crash after that)
+ commitJob.notifyCommitted();
+
+ LOG(4) << "groupcommitll " << p++ << " WRITETODATAFILES()" << endl;
+
+ WRITETODATAFILES(h, commitJob._ab);
+ assert( abLen == commitJob._ab.len() ); // check again wasn't modded
+ commitJob._ab.reset();
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ // can't : d.dbMutex._remapPrivateViewRequested = true;
+
+ return true;
+ }
+
+ /** @return true if committed; false if lock acquisition timed out (we only try for a read lock herein and only wait for a certain duration). */
+ bool groupCommitWithLimitedLocks() {
+ try {
+ return _groupCommitWithLimitedLocks();
+ }
+ catch(DBException& e ) {
+ log() << "dbexception in groupCommitLL causing immediate shutdown: " << e.toString() << endl;
+ mongoAbort("dur1");
+ }
+ catch(std::ios_base::failure& e) {
+ log() << "ios_base exception in groupCommitLL causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("dur2");
+ }
+ catch(std::bad_alloc& e) {
+ log() << "bad_alloc exception in groupCommitLL causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("dur3");
+ }
+ catch(std::exception& e) {
+ log() << "exception in dur::groupCommitLL causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("dur4");
+ }
+ return false;
+ }
+
+ static void _groupCommit() {
+
+ LOG(4) << "_groupCommit " << endl;
+
+ // we need to be at least read locked on the dbMutex so that we know the write intent data
+ // structures are not changing while we work
+ d.dbMutex.assertAtLeastReadLocked();
+
+ commitJob.beginCommit();
+
+ if( !commitJob.hasWritten() ) {
+                // getlasterror request could have come after the data was already committed
+ commitJob.notifyCommitted();
+ return;
+ }
+
+ // we need to make sure two group commits aren't running at the same time
+ // (and we are only read locked in the dbMutex, so it could happen)
+ scoped_lock lk(groupCommitMutex);
+
+ JSectHeader h;
+ PREPLOGBUFFER(h);
+
+ // todo : write to the journal outside locks, as this write can be slow.
+ // however, be careful then about remapprivateview as that cannot be done
+ // if new writes are then pending in the private maps.
+ WRITETOJOURNAL(h, commitJob._ab);
+
+ // data is now in the journal, which is sufficient for acknowledging getLastError.
+ // (ok to crash after that)
+ commitJob.notifyCommitted();
+
+ WRITETODATAFILES(h, commitJob._ab);
+ debugValidateAllMapsMatch();
+
+ commitJob.reset();
+ commitJob._ab.reset();
+
+ // REMAPPRIVATEVIEW
+ //
+ // remapping private views must occur after WRITETODATAFILES otherwise
+ // we wouldn't see newly written data on reads.
+ //
+ DEV assert( !commitJob.hasWritten() );
+ if( !d.dbMutex.isWriteLocked() ) {
+ // this needs to be done in a write lock (as there is a short window during remapping when each view
+ // might not exist), so we do it on the next write-lock acquisition instead of here (there is no
+ // rush if you aren't writing anyway -- but when it is done, it must happen before any uncommitted
+ // writes occur). If desired, perhaps this can be eliminated on posix as it may be that the remap
+ // is race-free there.
+ //
+ d.dbMutex._remapPrivateViewRequested = true;
+ }
+ else {
+ stats.curr->_commitsInWriteLock++;
+ // however, if we are already write locked, we must do it now -- up the call tree someone
+ // may do a write without a new lock acquisition. this can happen when MongoMMF::close() calls
+ // this method when a file (and its views) is about to go away.
+ //
+ REMAPPRIVATEVIEW();
+ }
+ }
+
+ /** locking: in read lock when called
+ or, for early commits (commitIfNeeded), in write lock
+ @see MongoMMF::close()
+ */
+ static void groupCommit() {
+ try {
+ _groupCommit();
+ }
+ catch(DBException& e ) {
+ log() << "dbexception in groupCommit causing immediate shutdown: " << e.toString() << endl;
+ mongoAbort("gc1");
+ }
+ catch(std::ios_base::failure& e) {
+ log() << "ios_base exception in groupCommit causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("gc2");
+ }
+ catch(std::bad_alloc& e) {
+ log() << "bad_alloc exception in groupCommit causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("gc3");
+ }
+ catch(std::exception& e) {
+ log() << "exception in dur::groupCommit causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("gc4");
+ }
+ LOG(4) << "groupCommit end" << endl;
+ }
+
+ static void go() {
+ const int N = 10;
+ static int n;
+ if( privateMapBytes < UncommittedBytesLimit && ++n % N && (cmdLine.durOptions&CmdLine::DurAlwaysRemap)==0 ) {
+ // the limited-locks version doesn't do any remapprivateview at all, so only try it if privateMapBytes
+ // is in an acceptable range. also, every Nth commit we do everything so some remapping gets done;
+ // remapping a lot all at once could cause jitter from a large burst of copy-on-write faults.
+ if( groupCommitWithLimitedLocks() )
+ return;
+ }
+ else {
+ readlocktry lk("", 1000);
+ if( lk.got() ) {
+ groupCommit();
+ return;
+ }
+ }
+
+ // starvation on read locks could occur. so if read lock acquisition is slow, try to get a
+ // write lock instead. otherwise journaling could be delayed too long (not too much data will
+ // accumulate though, as the commitIfNeeded logic will have executed in the meantime if there
+ // have been writes)
+ writelock lk;
+ groupCommit();
+ }
+
+ /** called when a MongoMMF is closing -- we need to go ahead and group commit in that case before its
+ views disappear
+ */
+ void closingFileNotification() {
+ if (!cmdLine.dur)
+ return;
+
+ if( d.dbMutex.atLeastReadLocked() ) {
+ groupCommit();
+ }
+ else {
+ assert( inShutdown() );
+ if( commitJob.hasWritten() ) {
+ log() << "journal warning files are closing outside locks with writes pending" << endl;
+ }
+ }
+ }
+
+ extern int groupCommitIntervalMs;
+ boost::filesystem::path getJournalDir();
+
+ void durThread() {
+ Client::initThread("journal");
+
+ bool samePartition = true;
+ try {
+ const string dbpathDir = boost::filesystem::path(dbpath).native_directory_string();
+ samePartition = onSamePartition(getJournalDir().string(), dbpathDir);
+ }
+ catch(...) {
+ }
+
+ while( !inShutdown() ) {
+ RACECHECK
+
+ unsigned ms = cmdLine.journalCommitInterval;
+ if( ms == 0 ) {
+ // use default
+ ms = samePartition ? 100 : 30;
+ }
+
+ unsigned oneThird = (ms / 3) + 1; // +1 so never zero
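+ // e.g. with the defaults (same partition), ms == 100 and oneThird == 34:
+ // the loop below sleeps ~34ms up to three times per pass, committing sooner
+ // if a getLastError j:true request is already waiting.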
+
+ try {
+ stats.rotate();
+
+ // we do this in a couple of blocks (the invoke() calls), which makes throughput a tiny bit
+ // faster (only a little) and is likely also less spiky in cpu usage, which is good.
+
+ // commit sooner if one or more getLastError j:true is pending
+ sleepmillis(oneThird);
+ for( unsigned i = 1; i <= 2; i++ ) {
+ if( commitJob._notify.nWaiting() )
+ break;
+ commitJob.wi()._deferred.invoke();
+ sleepmillis(oneThird);
+ }
+
+ go();
+ }
+ catch(std::exception& e) {
+ log() << "exception in durThread causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("exception in durThread");
+ }
+ }
+ cc().shutdown();
+ }
+
+ void recover();
+
+ unsigned notesThisLock = 0;
+
+ void releasingWriteLock() {
+ DEV notesThisLock = 0;
+ // implicit commitIfNeeded check on each write unlock
+ DEV commitJob._nSinceCommitIfNeededCall = 0; // implicit commit if needed
+ if( commitJob.bytes() > UncommittedBytesLimit || cmdLine.durOptions & CmdLine::DurAlwaysCommit ) {
+ stats.curr->_earlyCommits++;
+ groupCommit();
+ }
+ }
+
+ void preallocateFiles();
+
+ /** at startup, recover, and then start the journal threads */
+ void startup() {
+ if( !cmdLine.dur )
+ return;
+
+#if defined(_DURABLEDEFAULTON)
+ DEV {
+ if( time(0) & 1 ) {
+ cmdLine.durOptions |= CmdLine::DurAlwaysCommit;
+ log() << "_DEBUG _DURABLEDEFAULTON : forcing DurAlwaysCommit mode for this run" << endl;
+ }
+ if( time(0) & 2 ) {
+ cmdLine.durOptions |= CmdLine::DurAlwaysRemap;
+ log() << "_DEBUG _DURABLEDEFAULTON : forcing DurAlwaysRemap mode for this run" << endl;
+ }
+ }
+#endif
+
+ DurableInterface::enableDurability();
+
+ journalMakeDir();
+ try {
+ recover();
+ }
+ catch(...) {
+ log() << "exception during recovery" << endl;
+ throw;
+ }
+
+ preallocateFiles();
+
+ boost::thread t(durThread);
+ }
+
+ void DurableImpl::syncDataAndTruncateJournal() {
+ d.dbMutex.assertWriteLocked();
+
+ // a commit from the commit thread won't begin while we are in the write lock,
+ // but it may already be in progress and the end of that work is done outside
+ // (dbMutex) locks. This line waits for that to complete if already underway.
+ {
+ scoped_lock lk(groupCommitMutex);
+ }
+
+ groupCommit();
+ MongoFile::flushAll(true);
+ journalCleanup();
+
+ assert(!haveJournalFiles()); // Double check post-conditions
+ }
+
+ } // namespace dur
+
+} // namespace mongo
diff --git a/src/mongo/db/dur.h b/src/mongo/db/dur.h
new file mode 100644
index 00000000000..f06ff500195
--- /dev/null
+++ b/src/mongo/db/dur.h
@@ -0,0 +1,209 @@
+// @file dur.h durability support
+
+#pragma once
+
+#include "diskloc.h"
+#include "mongommf.h"
+
+namespace mongo {
+
+ class NamespaceDetails;
+
+ void mongoAbort(const char *msg);
+ void abort(); // not defined -- use mongoAbort() instead
+
+ namespace dur {
+
+ // a smaller limit is likely better on 32 bit
+#if defined(__i386__) || defined(_M_IX86)
+ const unsigned UncommittedBytesLimit = 50 * 1024 * 1024;
+#else
+ const unsigned UncommittedBytesLimit = 100 * 1024 * 1024;
+#endif
+
+ /** Call during startup so durability module can initialize
+ Throws if fatal error
+ Does nothing if cmdLine.dur is false
+ */
+ void startup();
+
+ class DurableInterface : boost::noncopyable {
+ public:
+ virtual ~DurableInterface() { log() << "ERROR warning ~DurableInterface not intended to be called" << endl; }
+
+ /** Declare that a file has been created
+ Normally writes are applied only after journaling, for safety. But here the file
+ is created first, and the journal will just replay the creation if the create didn't
+ happen because of crashing.
+ */
+ virtual void createdFile(string filename, unsigned long long len) = 0;
+
+ /** Declarations of write intent.
+
+ Use these methods to declare "i'm about to write to x and it should be logged for redo."
+
+ Failure to call writing...() is checked in _DEBUG mode by using a read-only mapped view
+ (i.e., you'll segfault if the code is covered in that situation). The _DEBUG check doesn't
+ verify that your length is correct though.
+ */
+
+ /** declare intent to write to x for up to len
+ @return pointer where to write. this is modified when testIntent is true.
+ */
+ virtual void* writingPtr(void *x, unsigned len) = 0;
+
+ /** declare write intent; should already be in the write view to work correctly when testIntent is true.
+ if you aren't, use writingPtr() instead.
+ */
+ virtual void declareWriteIntent(void *x, unsigned len) = 0;
+
+ /** declare intent to write
+ @param ofs offset within buf at which we will write
+ @param len the length at ofs we will write
+ @return new buffer pointer. this is modified when testIntent is true.
+ */
+ virtual void* writingAtOffset(void *buf, unsigned ofs, unsigned len) = 0;
+
+ /** declare intent to write
+ @param ranges vector of pairs representing ranges. Each pair
+ comprises an offset from buf where a range begins, then the
+ range length.
+ @return new buffer pointer. this is modified when testIntent is true.
+ */
+ virtual void* writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges ) = 0;
+
+ /** Wait for acknowledgement of the next group commit.
+ @return true if --dur is on. There will be a delay.
+ @return false if --dur is off.
+ */
+ virtual bool awaitCommit() = 0;
+
+ /** Commit immediately.
+
+ Generally, you do not want to do this often, as highly granular committing may affect
+ performance.
+
+ Does not return until the commit is complete.
+
+ You must be at least read locked when you call this. Ideally, you are not write locked
+ and then read operations can occur concurrently.
+
+ @return true if --dur is on.
+ @return false if --dur is off (in which case no action is taken).
+ */
+ virtual bool commitNow() = 0;
+
+ /** Commit if enough bytes have been modified. Current threshold is UncommittedBytesLimit
+ (50MB on 32-bit builds, 100MB otherwise).
+
+ The idea is that long-running write operations that don't yield
+ (like creating an index or an update with $atomic) can call this
+ whenever the db is in a sane state and it will prevent commits
+ from growing too large.
+ @return true if committed
+ */
+ virtual bool commitIfNeeded() = 0;
+
+ /** @return true if it is time to commit, but does NOT perform the commit */
+ virtual bool aCommitIsNeeded() const = 0;
+
+ /** Declare write intent for a DiskLoc. @see DiskLoc::writing() */
+ inline DiskLoc& writingDiskLoc(DiskLoc& d) { return *((DiskLoc*) writingPtr(&d, sizeof(d))); }
+
+ /** Declare write intent for an int */
+ inline int& writingInt(const int& d) { return *((int*) writingPtr((int*) &d, sizeof(d))); }
+
+ /** "assume i've already indicated write intent, let me write"
+ redeclaration is fine too, but this is faster.
+ */
+ template <typename T>
+ inline
+ T* alreadyDeclared(T *x) {
+#if defined(_TESTINTENT)
+ return (T*) MongoMMF::switchToPrivateView(x);
+#else
+ return x;
+#endif
+ }
+
+ /** declare intent to write to x for sizeof(*x) */
+ template <typename T>
+ inline
+ T* writing(T *x) {
+ return (T*) writingPtr(x, sizeof(T));
+ }
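+
+ /* illustrative usage sketch (not part of the interface); Foo and rec are
+ hypothetical objects mapped in a datafile:
+
+ Foo *foo = ...; // pointer into the private mapped view
+ getDur().writing(foo)->x = 3; // declare intent for sizeof(Foo), then write
+ getDur().writingInt(rec->len) = 16; // declare intent for a single int
+ getDur().commitIfNeeded(); // optional early commit in long-running ops
+ */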
+
+ /** write something that doesn't have to be journaled, as this write is "unimportant".
+ a good example is paddingFactor.
+ can be thought of as memcpy(dst,src,len).
+ the dur implementation acquires a mutex in this method, so do not assume it is faster
+ without measuring!
+ */
+ virtual void setNoJournal(void *dst, void *src, unsigned len) = 0;
+
+ /** Commits pending changes, flushes all changes to main data
+ files, then removes the journal.
+
+ This is useful as a "barrier" to ensure that writes before this
+ call will never go through recovery and be applied to files
+ that already contain changes made after this call.
+ */
+ virtual void syncDataAndTruncateJournal() = 0;
+
+ static DurableInterface& getDur() { return *_impl; }
+
+ private:
+ /** Intentionally unimplemented method.
+ It's very easy to manipulate Record::data in an open-ended way, so a call to writing(Record*) is suspect.
+ This will override the templated version and yield an unresolved external.
+ */
+ Record* writing(Record* r);
+ /** Intentionally unimplemented method. BtreeBuckets are allocated in buffers larger than sizeof( BtreeBucket ). */
+// BtreeBucket* writing( BtreeBucket* );
+ /** Intentionally unimplemented method. NamespaceDetails may be based on references to 'Extra' objects. */
+ NamespaceDetails* writing( NamespaceDetails* );
+
+ static DurableInterface* _impl; // NonDurableImpl at startup()
+ static void enableDurability(); // makes _impl a DurableImpl
+ static void disableDurability(); // makes _impl a NonDurableImpl
+
+ // these need to be able to enable/disable Durability
+ friend void startup();
+ friend class TempDisableDurability;
+ }; // class DurableInterface
+
+ class NonDurableImpl : public DurableInterface {
+ void* writingPtr(void *x, unsigned len) { return x; }
+ void* writingAtOffset(void *buf, unsigned ofs, unsigned len) { return buf; }
+ void* writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges) { return buf; }
+ void declareWriteIntent(void *, unsigned) { }
+ void createdFile(string filename, unsigned long long len) { }
+ bool awaitCommit() { return false; }
+ bool commitNow() { return false; }
+ bool commitIfNeeded() { return false; }
+ bool aCommitIsNeeded() const { return false; }
+ void setNoJournal(void *dst, void *src, unsigned len);
+ void syncDataAndTruncateJournal() {}
+ };
+
+ class DurableImpl : public DurableInterface {
+ void* writingPtr(void *x, unsigned len);
+ void* writingAtOffset(void *buf, unsigned ofs, unsigned len);
+ void* writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges);
+ void declareWriteIntent(void *, unsigned);
+ void createdFile(string filename, unsigned long long len);
+ bool awaitCommit();
+ bool commitNow();
+ bool aCommitIsNeeded() const;
+ bool commitIfNeeded();
+ void setNoJournal(void *dst, void *src, unsigned len);
+ void syncDataAndTruncateJournal();
+ };
+
+ } // namespace dur
+
+ inline dur::DurableInterface& getDur() { return dur::DurableInterface::getDur(); }
+
+ /** declare that we are modifying a diskloc and this is a datafile write. */
+ inline DiskLoc& DiskLoc::writing() const { return getDur().writingDiskLoc(*const_cast< DiskLoc * >( this )); }
+
+}
diff --git a/src/mongo/db/dur_commitjob.cpp b/src/mongo/db/dur_commitjob.cpp
new file mode 100644
index 00000000000..5a9e9cb5679
--- /dev/null
+++ b/src/mongo/db/dur_commitjob.cpp
@@ -0,0 +1,240 @@
+/* @file dur_commitjob.cpp */
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "dur_commitjob.h"
+#include "dur_stats.h"
+#include "taskqueue.h"
+#include "client.h"
+
+namespace mongo {
+
+ namespace dur {
+
+ BOOST_STATIC_ASSERT( UncommittedBytesLimit > BSONObjMaxInternalSize * 3 );
+ BOOST_STATIC_ASSERT( sizeof(void*)==4 || UncommittedBytesLimit > BSONObjMaxInternalSize * 6 );
+
+ void Writes::D::go(const Writes::D& d) {
+ commitJob.wi()._insertWriteIntent(d.p, d.len);
+ }
+
+ void WriteIntent::absorb(const WriteIntent& other) {
+ dassert(overlaps(other));
+
+ void* newStart = min(start(), other.start());
+ p = max(p, other.p);
+ len = (char*)p - (char*)newStart;
+
+ dassert(contains(other));
+ }
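+
+ // e.g. if this intent covers [0x1080, 0x1180) and other covers [0x1000, 0x1100),
+ // absorb(other) leaves this covering [0x1000, 0x1180): newStart takes the lower
+ // start, p the higher end, and len spans both.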
+
+ void Writes::clear() {
+ d.dbMutex.assertAtLeastReadLocked();
+
+ _alreadyNoted.clear();
+ _writes.clear();
+ _ops.clear();
+ _drained = false;
+#if defined(DEBUG_WRITE_INTENT)
+ cout << "_debug clear\n";
+ _debug.clear();
+#endif
+ }
+
+#if defined(DEBUG_WRITE_INTENT)
+ void assertAlreadyDeclared(void *p, int len) {
+ if( commitJob.wi()._debug[p] >= len )
+ return;
+ log() << "assertAlreadyDeclared fails " << (void*)p << " len:" << len << ' ' << commitJob.wi()._debug[p] << endl;
+ printStackTrace();
+ abort();
+ }
+#endif
+
+ void Writes::_insertWriteIntent(void* p, int len) {
+ WriteIntent wi(p, len);
+
+ if (_writes.empty()) {
+ _writes.insert(wi);
+ return;
+ }
+
+ typedef set<WriteIntent>::const_iterator iterator; // shorter
+
+ iterator closest = _writes.lower_bound(wi);
+ // closest.end() >= wi.end()
+
+ if ((closest != _writes.end() && closest->overlaps(wi)) || // high end
+ (closest != _writes.begin() && (--closest)->overlaps(wi))) { // low end
+ if (closest->contains(wi))
+ return; // nothing to do
+
+ // find overlapping range and merge into wi
+ iterator end(closest);
+ iterator begin(closest);
+ while ( end->overlaps(wi)) { wi.absorb(*end); ++end; if (end == _writes.end()) break; } // look forwards
+ while (begin->overlaps(wi)) { wi.absorb(*begin); if (begin == _writes.begin()) break; --begin; } // look backwards
+ if (!begin->overlaps(wi)) ++begin; // make inclusive
+
+ DEV { // ensure we're not deleting anything we shouldn't
+ for (iterator it(begin); it != end; ++it) {
+ assert(wi.contains(*it));
+ }
+ }
+
+ _writes.erase(begin, end);
+ _writes.insert(wi);
+
+ DEV { // ensure there are no overlaps
+ // this can be very slow - n^2 - so make it RARELY
+ RARELY {
+ for (iterator it(_writes.begin()), end(boost::prior(_writes.end())); it != end; ++it) {
+ assert(!it->overlaps(*boost::next(it)));
+ }
+ }
+ }
+ }
+ else { // no entries overlapping wi
+ _writes.insert(closest, wi);
+ }
+ }
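+
+ // e.g. with existing intents [0x1000,0x1100) and [0x1180,0x1200), inserting
+ // [0x10c0,0x11c0) overlaps both; the loops above absorb them into a single
+ // intent [0x1000,0x1200) which replaces the two originals in the set.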
+
+ /** note an operation other than a "basic write" */
+ void CommitJob::noteOp(shared_ptr<DurOp> p) {
+ d.dbMutex.assertWriteLocked();
+ dassert( cmdLine.dur );
+ cc()._hasWrittenThisPass = true;
+ if( !_hasWritten ) {
+ assert( !d.dbMutex._remapPrivateViewRequested );
+ _hasWritten = true;
+ }
+ _wi._ops.push_back(p);
+ }
+
+ size_t privateMapBytes = 0; // used by _REMAPPRIVATEVIEW to track how much / how fast to remap
+
+ void CommitJob::beginCommit() {
+ DEV d.dbMutex.assertAtLeastReadLocked();
+ _commitNumber = _notify.now();
+ stats.curr->_commits++;
+ }
+
+ void CommitJob::reset() {
+ _hasWritten = false;
+ _wi.clear();
+ privateMapBytes += _bytes;
+ _bytes = 0;
+ _nSinceCommitIfNeededCall = 0;
+ }
+
+ CommitJob::CommitJob() : _ab(4 * 1024 * 1024) , _hasWritten(false),
+ _bytes(0), _nSinceCommitIfNeededCall(0) {
+ _commitNumber = 0;
+ }
+
+ extern unsigned notesThisLock;
+
+ void CommitJob::note(void* p, int len) {
+ // from the point of view of the dur module, it would be fine (i think) to only
+ // be read locked here. but we must be at least read locked to avoid a race with
+ // remapprivateview
+ DEV notesThisLock++;
+ DEV d.dbMutex.assertWriteLocked();
+ dassert( cmdLine.dur );
+ cc()._hasWrittenThisPass = true;
+ if( !_wi._alreadyNoted.checkAndSet(p, len) ) {
+ MemoryMappedFile::makeWritable(p, len);
+
+ if( !_hasWritten ) {
+ // you can't be writing if one of these is pending, so this is a verification.
+ assert( !d.dbMutex._remapPrivateViewRequested ); // safe to assert here since it must be the first write in a write lock
+
+ // we don't bother doing a group commit when nothing is written, so we have a var to track that
+ _hasWritten = true;
+ }
+
+ /** tips for debugging:
+ if you have an incorrect diff between data files in different folders
+ (see jstests/dur/quick.js for example),
+ turn this on and see what is logged. if you have a copy of its output from before the
+ regression, a simple diff of these lines would likely tell you a lot.
+ */
+#if 0 && defined(_DEBUG)
+ {
+ static int n;
+ if( ++n < 10000 ) {
+ size_t ofs;
+ MongoMMF *mmf = privateViews._find(w.p, ofs);
+ if( mmf ) {
+ log() << "DEBUG note write intent " << w.p << ' ' << mmf->filename() << " ofs:" << hex << ofs << " len:" << w.len << endl;
+ }
+ else {
+ log() << "DEBUG note write intent " << w.p << ' ' << w.len << " NOT FOUND IN privateViews" << endl;
+ }
+ }
+ else if( n == 10000 ) {
+ log() << "DEBUG stopping write intent logging, too much to log" << endl;
+ }
+ }
+#endif
+
+ // remember intent. we will journal it in a bit
+ _wi.insertWriteIntent(p, len);
+ wassert( _wi._writes.size() < 2000000 );
+ //assert( _wi._writes.size() < 20000000 );
+
+ {
+ // a bit over-conservative in counting pagebytes used
+ static size_t lastPos; // note this doesn't reset with each commit, but that is ok; we aren't being that precise
+ size_t x = ((size_t) p) & ~0xfff; // round off to page address (4KB)
+ if( x != lastPos ) {
+ lastPos = x;
+ unsigned b = (len+4095) & ~0xfff;
+ _bytes += b;
+#if defined(_DEBUG)
+ _nSinceCommitIfNeededCall++;
+ if( _nSinceCommitIfNeededCall >= 80 ) {
+ if( _nSinceCommitIfNeededCall % 40 == 0 ) {
+ log() << "debug nsincecommitifneeded:" << _nSinceCommitIfNeededCall << " bytes:" << _bytes << endl;
+ if( _nSinceCommitIfNeededCall == 120 || _nSinceCommitIfNeededCall == 1200 ) {
+ log() << "_DEBUG printing stack given high nsinccommitifneeded number" << endl;
+ printStackTrace();
+ }
+ }
+ }
+#endif
+ if (_bytes > UncommittedBytesLimit * 3) {
+ static time_t lastComplain;
+ static unsigned nComplains;
+ // throttle logging
+ if( ++nComplains < 100 || time(0) - lastComplain >= 60 ) {
+ lastComplain = time(0);
+ warning() << "DR102 too much data written uncommitted " << _bytes/1000000.0 << "MB" << endl;
+ if( nComplains < 10 || nComplains % 10 == 0 ) {
+ // wassert makes getLastError show an error, so we just print stack trace
+ printStackTrace();
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ }
+}
diff --git a/src/mongo/db/dur_commitjob.h b/src/mongo/db/dur_commitjob.h
new file mode 100644
index 00000000000..bfc5e3c268f
--- /dev/null
+++ b/src/mongo/db/dur_commitjob.h
@@ -0,0 +1,220 @@
+/* @file dur_commitjob.h used by dur.cpp
+*/
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../util/alignedbuilder.h"
+#include "../util/mongoutils/hash.h"
+#include "../util/concurrency/synchronization.h"
+#include "cmdline.h"
+#include "durop.h"
+#include "dur.h"
+#include "taskqueue.h"
+
+//#define DEBUG_WRITE_INTENT 1
+
+namespace mongo {
+ namespace dur {
+
+ /** declaration of an intent to write to a region of a memory mapped view
+ *
+ * We store the end rather than the start pointer to make operator< faster
+ * since that is heavily used in set lookup.
+ */
+ struct WriteIntent { /* copyable */
+ WriteIntent() : /*w_ptr(0), */ p(0) { }
+ WriteIntent(void *a, unsigned b) : /*w_ptr(0), */ p((char*)a+b), len(b) { }
+
+ void* start() const { return (char*)p - len; }
+ void* end() const { return p; }
+ unsigned length() const { return len; }
+
+ bool operator < (const WriteIntent& rhs) const { return end() < rhs.end(); }
+
+ // can they be merged?
+ bool overlaps(const WriteIntent& rhs) const {
+ return (start() <= rhs.end() && end() >= rhs.start());
+ }
+
+ // is merging necessary?
+ bool contains(const WriteIntent& rhs) const {
+ return (start() <= rhs.start() && end() >= rhs.end());
+ }
+
+ // merge into me
+ void absorb(const WriteIntent& other);
+
+ friend ostream& operator << (ostream& out, const WriteIntent& wi) {
+ return (out << "p: " << wi.p << " end: " << wi.end() << " len: " << wi.len);
+ }
+
+ //mutable void *w_ptr; // writable mapping of p.
+ // mutable because set::iterator is const but this isn't used in op<
+#if defined(_EXPERIMENTAL)
+ mutable unsigned ofsInJournalBuffer;
+#endif
+ private:
+ void *p; // intent to write up to p
+ unsigned len; // up to this len
+ };
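+
+ // e.g. WriteIntent(p, 16) stores end = (char*)p + 16 and len = 16, so start()
+ // recomputes p; operator< compares the stored end pointers directly.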
+
+ /** try to remember things we have already marked for journaling. false negatives are ok if infrequent -
+ we will just log them twice.
+ */
+ template<int Prime>
+ class Already : boost::noncopyable {
+ public:
+ Already() { clear(); }
+ void clear() { memset(this, 0, sizeof(*this)); }
+
+ /* see if we have Already recorded/indicated our write intent for this region of memory.
+ automatically upgrades the length if the length was shorter previously.
+ @return true if already indicated.
+ */
+ bool checkAndSet(void* p, int len) {
+ unsigned x = mongoutils::hashPointer(p);
+ pair<void*, int>& nd = nodes[x % N];
+ if( nd.first == p ) {
+ if( nd.second < len ) {
+ nd.second = len;
+ return false; // haven't indicated this len yet
+ }
+ return true; // already indicated
+ }
+ nd.first = p;
+ nd.second = len;
+ return false; // a new set
+ }
+
+ private:
+ enum { N = Prime }; // this should be small; the idea is that it fits in the cpu cache easily
+ pair<void*,int> nodes[N];
+ };
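+
+ // illustrative behavior for a given pointer p:
+ // already.checkAndSet(p, 8) -> false (first sighting, recorded)
+ // already.checkAndSet(p, 8) -> true (already indicated)
+ // already.checkAndSet(p, 16) -> false (length upgraded, must re-note)
+ // a different pointer hashing to the same slot simply evicts the entry (a false negative).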
+
+ /** our record of pending/uncommitted write intents */
+ class Writes : boost::noncopyable {
+ struct D {
+ void *p;
+ unsigned len;
+ static void go(const D& d);
+ };
+ public:
+ TaskQueue<D> _deferred;
+ Already<127> _alreadyNoted;
+ set<WriteIntent> _writes;
+ vector< shared_ptr<DurOp> > _ops; // all the ops other than basic writes
+ bool _drained; // _deferred is drained? for asserting/testing
+
+ /** reset the Writes structure (empties all the above) */
+ void clear();
+
+ /** merges into set (ie non-deferred version) */
+ void _insertWriteIntent(void* p, int len);
+
+ void insertWriteIntent(void* p, int len) {
+#if defined(DEBUG_WRITE_INTENT)
+ if( _debug[p] < len )
+ _debug[p] = len;
+#endif
+ D d;
+ d.p = p;
+ d.len = len;
+ _deferred.defer(d);
+ }
+
+#ifdef _DEBUG
+ WriteIntent _last;
+#endif
+#if defined(DEBUG_WRITE_INTENT)
+ map<void*,int> _debug;
+#endif
+ };
+
+#if defined(DEBUG_WRITE_INTENT)
+ void assertAlreadyDeclared(void *, int len);
+#else
+ inline void assertAlreadyDeclared(void *, int len) { }
+#endif
+
+ /** A commit job object for a group commit. Currently there is one instance of this object.
+
+ concurrency: assumption is caller is appropriately locking.
+ for example note() invocations are from the write lock.
+ other uses are in a read lock from a single thread (durThread)
+ */
+ class CommitJob : boost::noncopyable {
+ public:
+ AlignedBuilder _ab; // for direct i/o writes to journal
+
+ CommitJob();
+
+ ~CommitJob(){ assert(!"shouldn't destroy CommitJob!"); }
+
+ /** record/note an intent to write */
+ void note(void* p, int len);
+
+ /** note an operation other than a "basic write" */
+ void noteOp(shared_ptr<DurOp> p);
+
+ set<WriteIntent>& writes() {
+ if( !_wi._drained ) {
+ // generally, you don't want to use the set until it is prepared (after deferred ops are applied);
+ // hence this assert.
+ assert(false);
+ }
+ return _wi._writes;
+ }
+
+ vector< shared_ptr<DurOp> >& ops() { return _wi._ops; }
+
+ /** this method is safe to call outside of locks. when hasWritten is false we don't do any group commit and avoid even
+ trying to acquire a lock, which might be helpful at times.
+ */
+ bool hasWritten() const { return _hasWritten; }
+
+ /** we use the commitjob object over and over, calling reset() rather than reconstructing */
+ void reset();
+
+ void beginCommit();
+
+ /** the commit code calls this when data reaches the journal (on disk) */
+ void notifyCommitted() { _notify.notifyAll(_commitNumber); }
+
+ /** we check how much has been written, and if it is getting to be a lot, we commit sooner. */
+ size_t bytes() const { return _bytes; }
+
+#if defined(_DEBUG)
+ const WriteIntent& lastWrite() const { return _wi._last; }
+#endif
+
+ Writes& wi() { return _wi; }
+ private:
+ NotifyAll::When _commitNumber;
+ bool _hasWritten;
+ Writes _wi; // todo: fix name
+ size_t _bytes;
+ public:
+ NotifyAll _notify; // for getlasterror fsync:true acknowledgements
+ unsigned _nSinceCommitIfNeededCall;
+ };
+
+ extern CommitJob& commitJob;
+
+ }
+}
diff --git a/src/mongo/db/dur_journal.cpp b/src/mongo/db/dur_journal.cpp
new file mode 100644
index 00000000000..6a6609f55ee
--- /dev/null
+++ b/src/mongo/db/dur_journal.cpp
@@ -0,0 +1,748 @@
+// @file dur_journal.cpp writing to the writeahead logging journal
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "client.h"
+#include "namespace.h"
+#include "dur_journal.h"
+#include "dur_journalformat.h"
+#include "dur_stats.h"
+#include "../util/logfile.h"
+#include "../util/timer.h"
+#include "../util/alignedbuilder.h"
+#include "../util/net/listen.h" // getelapsedtimemillis
+#include <boost/static_assert.hpp>
+#include <boost/filesystem.hpp>
+#undef assert
+#define assert MONGO_assert
+#include "../util/mongoutils/str.h"
+#include "dur_journalimpl.h"
+#include "../util/file.h"
+#include "../util/checksum.h"
+#include "../util/concurrency/race.h"
+#include "../util/compress.h"
+#include "../server.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ class AlignedBuilder;
+
+ unsigned goodRandomNumberSlow();
+
+ namespace dur {
+ // Rotate after reaching this data size in a journal (j._<n>) file
+ // We use a smaller size for 32 bit as the journal is mmapped during recovery (only)
+ // Note if you take a set of datafiles, including journal files, from 32->64 or vice-versa, it must
+ // work. (and should as-is)
+ // --smallfiles makes the limit small.
+
+#if defined(_DEBUG)
+ unsigned long long DataLimitPerJournalFile = 128 * 1024 * 1024;
+#elif defined(__APPLE__)
+ // assuming a developer box if OS X
+ unsigned long long DataLimitPerJournalFile = 256 * 1024 * 1024;
+#else
+ unsigned long long DataLimitPerJournalFile = (sizeof(void*)==4) ? 256 * 1024 * 1024 : 1 * 1024 * 1024 * 1024;
+#endif
+
+ BOOST_STATIC_ASSERT( sizeof(Checksum) == 16 );
+ BOOST_STATIC_ASSERT( sizeof(JHeader) == 8192 );
+ BOOST_STATIC_ASSERT( sizeof(JSectHeader) == 20 );
+ BOOST_STATIC_ASSERT( sizeof(JSectFooter) == 32 );
+ BOOST_STATIC_ASSERT( sizeof(JEntry) == 12 );
+ BOOST_STATIC_ASSERT( sizeof(LSNFile) == 88 );
+
+ bool usingPreallocate = false;
+
+ void removeOldJournalFile(path p);
+
+ boost::filesystem::path getJournalDir() {
+ boost::filesystem::path p(dbpath);
+ p /= "journal";
+ return p;
+ }
+
+ path lsnPath() {
+ return getJournalDir()/"lsn";
+ }
+
+ /** this should be called when something really bad happens so that we can flag appropriately
+ */
+ void journalingFailure(const char *msg) {
+ /** todo:
+ (1) don't log too much
+ (2) make an indicator in the journal dir that something bad happened.
+ (2b) refuse to do a recovery startup if that is there without manual override.
+ */
+ log() << "journaling failure/error: " << msg << endl;
+ assert(false);
+ }
+
+ JSectFooter::JSectFooter() {
+ memset(this, 0, sizeof(*this));
+ sentinel = JEntry::OpCode_Footer;
+ }
+
+ JSectFooter::JSectFooter(const void* begin, int len) { // needs buffer to compute hash
+ sentinel = JEntry::OpCode_Footer;
+ reserved = 0;
+ magic[0] = magic[1] = magic[2] = magic[3] = '\n';
+
+ Checksum c;
+ c.gen(begin, (unsigned) len);
+ memcpy(hash, c.bytes, sizeof(hash));
+ }
+
+ bool JSectFooter::checkHash(const void* begin, int len) const {
+ if( !magicOk() ) {
+ log() << "journal footer not valid" << endl;
+ return false;
+ }
+ Checksum c;
+ c.gen(begin, len);
+ DEV log() << "checkHash len:" << len << " hash:" << toHex(hash, 16) << " current:" << toHex(c.bytes, 16) << endl;
+ if( memcmp(hash, c.bytes, sizeof(hash)) == 0 )
+ return true;
+ log() << "journal checkHash mismatch, got: " << toHex(c.bytes, 16) << " expected: " << toHex(hash,16) << endl;
+ return false;
+ }
+
+ JHeader::JHeader(string fname) {
+ magic[0] = 'j'; magic[1] = '\n';
+ _version = CurrentVersion;
+ memset(ts, 0, sizeof(ts));
+ time_t t = time(0);
+ strncpy(ts, time_t_to_String_short(t).c_str(), sizeof(ts)-1);
+ memset(dbpath, 0, sizeof(dbpath));
+ strncpy(dbpath, fname.c_str(), sizeof(dbpath)-1);
+ {
+ fileId = t&0xffffffff;
+ fileId |= ((unsigned long long)goodRandomNumberSlow()) << 32;
+ }
+ memset(reserved3, 0, sizeof(reserved3));
+ txt2[0] = txt2[1] = '\n';
+ n1 = n2 = n3 = n4 = '\n';
+ }
+
+ Journal j;
+
+ const unsigned long long LsnShutdownSentinel = ~((unsigned long long)0);
+
+ Journal::Journal() :
+ _curLogFileMutex("JournalLfMutex") {
+ _ageOut = true;
+ _written = 0;
+ _nextFileNumber = 0;
+ _curLogFile = 0;
+ _curFileId = 0;
+ _preFlushTime = 0;
+ _lastFlushTime = 0;
+ _writeToLSNNeeded = false;
+ }
+
+ path Journal::getFilePathFor(int filenumber) const {
+ boost::filesystem::path p(dir);
+ p /= string(str::stream() << "j._" << filenumber);
+ return p;
+ }
+
+ /** never throws
+ @return true if journal dir is not empty
+ */
+ bool haveJournalFiles() {
+ try {
+ for ( boost::filesystem::directory_iterator i( getJournalDir() );
+ i != boost::filesystem::directory_iterator();
+ ++i ) {
+ string fileName = boost::filesystem::path(*i).leaf();
+ if( str::startsWith(fileName, "j._") )
+ return true;
+ }
+ }
+ catch(...) { }
+ return false;
+ }
+
+ /** throws */
+ void removeJournalFiles() {
+ log() << "removeJournalFiles" << endl;
+ try {
+ for ( boost::filesystem::directory_iterator i( getJournalDir() );
+ i != boost::filesystem::directory_iterator();
+ ++i ) {
+ string fileName = boost::filesystem::path(*i).leaf();
+ if( str::startsWith(fileName, "j._") ) {
+ try {
+ removeOldJournalFile(*i);
+ }
+ catch(std::exception& e) {
+ log() << "couldn't remove " << fileName << ' ' << e.what() << endl;
+ throw;
+ }
+ }
+ }
+ try {
+ boost::filesystem::remove(lsnPath());
+ }
+ catch(...) {
+ log() << "couldn't remove " << lsnPath().string() << endl;
+ throw;
+ }
+ }
+ catch( std::exception& e ) {
+ log() << "error removing journal files " << e.what() << endl;
+ throw;
+ }
+ assert(!haveJournalFiles());
+
+ flushMyDirectory(getJournalDir() / "file"); // flushes parent of argument (in this case journal dir)
+
+ log(1) << "removeJournalFiles end" << endl;
+ }
+
+ /** at clean shutdown */
+ bool okToCleanUp = false; // successful recovery would set this to true
+ void Journal::cleanup(bool _log) {
+ if( !okToCleanUp )
+ return;
+
+ if( _log )
+ log() << "journalCleanup..." << endl;
+ try {
+ SimpleMutex::scoped_lock lk(_curLogFileMutex);
+ closeCurrentJournalFile();
+ removeJournalFiles();
+ }
+ catch(std::exception& e) {
+ log() << "error couldn't remove journal file during shutdown " << e.what() << endl;
+ throw;
+ }
+ }
+ void journalCleanup(bool log) { j.cleanup(log); }
+
+ bool _preallocateIsFaster() {
+ bool faster = false;
+ boost::filesystem::path p = getJournalDir() / "tempLatencyTest";
+ try { remove(p); } catch(...) { }
+ try {
+ AlignedBuilder b(8192);
+ int millis[2];
+ const int N = 50;
+ for( int pass = 0; pass < 2; pass++ ) {
+ LogFile f(p.string());
+ Timer t;
+ for( int i = 0 ; i < N; i++ ) {
+ f.synchronousAppend(b.buf(), 8192);
+ }
+ millis[pass] = t.millis();
+ // the second time through, the file exists and it is the prealloc case
+ }
+ int diff = millis[0] - millis[1];
+ if( diff > 2 * N ) {
+ // at least 2ms faster for prealloc case?
+ faster = true;
+ log() << "preallocateIsFaster=true " << diff / (1.0*N) << endl;
+ }
+ }
+ catch(...) {
+ log() << "info preallocateIsFaster couldn't run; returning false" << endl;
+ }
+ try { remove(p); } catch(...) { }
+ return faster;
+ }
+ bool preallocateIsFaster() {
+ Timer t;
+ bool res = false;
+ if( _preallocateIsFaster() && _preallocateIsFaster() ) {
+ // maybe system is just super busy at the moment? sleep a second to let it calm down.
+ // deciding to prealloc is a medium-big decision:
+ sleepsecs(1);
+ res = _preallocateIsFaster();
+ }
+ if( t.millis() > 3000 )
+ log() << "preallocateIsFaster check took " << t.millis()/1000.0 << " secs" << endl;
+ return res;
+ }
+
+ // throws
+ void preallocateFile(boost::filesystem::path p, unsigned long long len) {
+ if( exists(p) )
+ return;
+
+ log() << "preallocating a journal file " << p.string() << endl;
+
+ const unsigned BLKSZ = 1024 * 1024;
+ assert( len % BLKSZ == 0 );
+
+ AlignedBuilder b(BLKSZ);
+ memset((void*)b.buf(), 0, BLKSZ);
+
+ ProgressMeter m(len, 3/*secs*/, 10/*hits between time check (once every 6.4MB)*/);
+
+ File f;
+ f.open( p.string().c_str() , /*read-only*/false , /*direct-io*/false );
+ assert( f.is_open() );
+ fileofs loc = 0;
+ while ( loc < len ) {
+ f.write( loc , b.buf() , BLKSZ );
+ loc += BLKSZ;
+ m.hit(BLKSZ);
+ }
+ assert( loc == len );
+ f.fsync();
+ }
+
+ const int NUM_PREALLOC_FILES = 3;
+ inline boost::filesystem::path preallocPath(int n) {
+ assert(n >= 0);
+ assert(n < NUM_PREALLOC_FILES);
+ string fn = str::stream() << "prealloc." << n;
+ return getJournalDir() / fn;
+ }
+
+ // throws
+ void _preallocateFiles() {
+ for( int i = 0; i < NUM_PREALLOC_FILES; i++ ) {
+ boost::filesystem::path filepath = preallocPath(i);
+
+ unsigned long long limit = DataLimitPerJournalFile;
+ if( debug && i == 1 ) {
+ // when moving 32->64, the prealloc files would be short. that is "ok", but we want to exercise that
+ // case, so when _DEBUG is set we force it here by arbitrarily stopping prealloc at a low
+ // limit for a file. also, this lets us change the constant in the future without a lot of
+ // work.
+ limit = 16 * 1024 * 1024;
+ }
+ preallocateFile(filepath, limit);
+ }
+ }
+
+ void checkFreeSpace() {
+ unsigned long long spaceNeeded = static_cast<unsigned long long>(3 * DataLimitPerJournalFile * 1.1); // add 10% for headroom
+ unsigned long long freeSpace = File::freeSpace(getJournalDir().string());
+ unsigned long long prealloced = 0;
+ for( int i = 0; i < NUM_PREALLOC_FILES; i++ ) {
+ boost::filesystem::path filepath = preallocPath(i);
+ if (exists(filepath))
+ prealloced += file_size(filepath);
+ }
+
+ if (freeSpace + prealloced < spaceNeeded) {
+ log() << endl;
+ error() << "Insufficient free space for journals." << endl;
+ log() << "Please make at least " << spaceNeeded/(1024*1024) << "MB available in " << getJournalDir().string() << endl;
+ log() << endl;
+ throw UserException(15926, "Insufficient free space for journals");
+ }
+ }
+
+ void preallocateFiles() {
+ if (! (cmdLine.durOptions & CmdLine::DurNoCheckSpace))
+ checkFreeSpace();
+
+ if( exists(preallocPath(0)) || // if enabled previously, keep using
+ exists(preallocPath(1)) ||
+ ( cmdLine.preallocj && preallocateIsFaster() ) ) {
+ usingPreallocate = true;
+ try {
+ _preallocateFiles();
+ }
+ catch(...) {
+ log() << "warning caught exception in preallocateFiles, continuing" << endl;
+ }
+ }
+ j.open();
+ }
+
+ void removeOldJournalFile(path p) {
+ if( usingPreallocate ) {
+ try {
+ for( int i = 0; i < NUM_PREALLOC_FILES; i++ ) {
+ boost::filesystem::path filepath = preallocPath(i);
+ if( !boost::filesystem::exists(filepath) ) {
+ // we can recycle this file into this prealloc file location
+ boost::filesystem::path temppath = filepath.string() + ".temp";
+ boost::filesystem::rename(p, temppath);
+ {
+ // zero the header
+ File f;
+ f.open(temppath.string().c_str(), false, false);
+ char buf[8192];
+ memset(buf, 0, 8192);
+ f.write(0, buf, 8192);
+ f.truncate(DataLimitPerJournalFile);
+ f.fsync();
+ }
+ boost::filesystem::rename(temppath, filepath);
+ return;
+ }
+ }
+ } catch(...) {
+ log() << "warning exception in dur::removeOldJournalFile " << p.string() << endl;
+ // fall through and try to delete the file
+ }
+ }
+
+ // already have 3 prealloc files, so delete this file
+ try {
+ boost::filesystem::remove(p);
+ }
+ catch(...) {
+ log() << "warning exception removing " << p.string() << endl;
+ }
+ }
+
+ // find a prealloc.<n> file, presumably to take and use
+ path findPrealloced() {
+ try {
+ for( int i = 0; i < NUM_PREALLOC_FILES; i++ ) {
+ boost::filesystem::path filepath = preallocPath(i);
+ if( boost::filesystem::exists(filepath) )
+ return filepath;
+ }
+ } catch(...) {
+ log() << "warning exception in dur::findPrealloced()" << endl;
+ }
+ return path();
+ }
+
+ /** assure journal/ dir exists. throws. call during startup. */
+ void journalMakeDir() {
+ j.init();
+
+ boost::filesystem::path p = getJournalDir();
+ j.dir = p.string();
+ log() << "journal dir=" << j.dir << endl;
+ if( !exists(j.dir) ) {
+ try {
+ create_directory(j.dir);
+ }
+ catch(std::exception& e) {
+ log() << "error creating directory " << j.dir << ' ' << e.what() << endl;
+ throw;
+ }
+ }
+ }
+
+ void Journal::_open() {
+ _curFileId = 0;
+ assert( _curLogFile == 0 );
+ path fname = getFilePathFor(_nextFileNumber);
+
+ // if we have a prealloced file, use it
+ {
+ path p = findPrealloced();
+ if( !p.empty() ) {
+ try {
+ {
+ // JHeader::fileId must be updated before renaming to be race-safe
+ LogFile f(p.string());
+ JHeader h(p.string());
+ AlignedBuilder b(8192);
+ b.appendStruct(h);
+ f.synchronousAppend(b.buf(), b.len());
+ }
+ boost::filesystem::rename(p, fname);
+ }
+ catch(...) {
+ log() << "warning couldn't write to / rename file " << p.string() << endl;
+ }
+ }
+ }
+
+ _curLogFile = new LogFile(fname.string());
+ _nextFileNumber++;
+ {
+ JHeader h(fname.string());
+ _curFileId = h.fileId;
+ assert(_curFileId);
+ AlignedBuilder b(8192);
+ b.appendStruct(h);
+ _curLogFile->synchronousAppend(b.buf(), b.len());
+ }
+ }
+
+ void Journal::init() {
+ assert( _curLogFile == 0 );
+ MongoFile::notifyPreFlush = preFlush;
+ MongoFile::notifyPostFlush = postFlush;
+ }
+
+ void Journal::open() {
+ assert( MongoFile::notifyPreFlush == preFlush );
+ SimpleMutex::scoped_lock lk(_curLogFileMutex);
+ _open();
+ }
+
+ void LSNFile::set(unsigned long long x) {
+ memset(this, 0, sizeof(*this));
+ lsn = x;
+ checkbytes = ~x;
+ }
+
+ /** if anything surprising is found in the LSNFile, logs details of the situation and returns 0;
+ if something highly surprising is found, throws to abort
+ */
+ unsigned long long LSNFile::get() {
+ uassert(13614, str::stream() << "unexpected version number of lsn file in journal/ directory got: " << ver , ver == 0);
+ if( ~lsn != checkbytes ) {
+ log() << "lsnfile not valid. recovery will be from log start. lsn: " << hex << lsn << " checkbytes: " << hex << checkbytes << endl;
+ return 0;
+ }
+ return lsn;
+ }
+
+ /** called during recovery (the error message text below assumes that)
+ */
+ unsigned long long journalReadLSN() {
+ if( !MemoryMappedFile::exists(lsnPath()) ) {
+ log() << "info no lsn file in journal/ directory" << endl;
+ return 0;
+ }
+
+ try {
+ // os can flush as it likes. if it flushes slowly, we will just do extra work on recovery.
+ // however, given we actually close the file when writing, that seems unlikely.
+ LSNFile L;
+ File f;
+ f.open(lsnPath().string().c_str());
+ assert(f.is_open());
+ if( f.len() == 0 ) {
+ // this could be 'normal' if we crashed at the right moment
+ log() << "info lsn file is zero bytes long" << endl;
+ return 0;
+ }
+ f.read(0,(char*)&L, sizeof(L));
+ unsigned long long lsn = L.get();
+ return lsn;
+ }
+ catch(std::exception& e) {
+ uasserted(13611, str::stream() << "can't read lsn file in journal directory : " << e.what());
+ }
+ return 0;
+ }
+
+ unsigned long long getLastDataFileFlushTime() {
+ return j.lastFlushTime();
+ }
+
+ /** remember "last sequence number" to speed recoveries
+ concurrency: called by durThread only.
+ */
+ void Journal::updateLSNFile() {
+ RACECHECK
+ if( !_writeToLSNNeeded )
+ return;
+ _writeToLSNNeeded = false;
+ try {
+ // os can flush as it likes. if it flushes slowly, we will just do extra work on recovery.
+ // however, given we actually close the file, that seems unlikely.
+ File f;
+ f.open(lsnPath().string().c_str());
+ if( !f.is_open() ) {
+ // open can fail (e.g., on an i/o error)
+ log() << "warning: open of lsn file failed" << endl;
+ return;
+ }
+ LOG(1) << "lsn set " << _lastFlushTime << endl;
+ LSNFile lsnf;
+ lsnf.set(_lastFlushTime);
+ f.write(0, (char*)&lsnf, sizeof(lsnf));
+ // do we want to fsync here? if we do it probably needs to be async so the durthread
+ // is not delayed.
+ }
+ catch(std::exception& e) {
+ log() << "warning: write to lsn file failed " << e.what() << endl;
+ // keep running (ignore the error). recovery will be slow.
+ }
+ }
+
+ void Journal::preFlush() {
+ j._preFlushTime = Listener::getElapsedTimeMillis();
+ }
+
+ void Journal::postFlush() {
+ j._lastFlushTime = j._preFlushTime;
+ j._writeToLSNNeeded = true;
+ }
+
+ // call from within _curLogFileMutex
+ void Journal::closeCurrentJournalFile() {
+ if (!_curLogFile)
+ return;
+
+ JFile jf;
+ jf.filename = _curLogFile->_name;
+ jf.lastEventTimeMs = Listener::getElapsedTimeMillis();
+ _oldJournalFiles.push_back(jf);
+
+ delete _curLogFile; // close
+ _curLogFile = 0;
+ _written = 0;
+ }
+
+ /** remove older journal files.
+ caller must hold _curLogFileMutex but not dbMutex
+ */
+ void Journal::removeUnneededJournalFiles() {
+ while( !_oldJournalFiles.empty() ) {
+ JFile f = _oldJournalFiles.front();
+
+ if( f.lastEventTimeMs < _lastFlushTime + ExtraKeepTimeMs ) {
+ // eligible for deletion
+ path p( f.filename );
+ log() << "old journal file will be removed: " << f.filename << endl;
+ removeOldJournalFile(p);
+ }
+ else {
+ break;
+ }
+
+ _oldJournalFiles.pop_front();
+ }
+ }
+
+ /*int getAgeOutJournalFiles() {
+ mutex::try_lock lk(j._curLogFileMutex, 4000);
+ if( !lk.ok )
+ return -1;
+ return j._ageOut ? 1 : 0;
+ }*/
+ void setAgeOutJournalFiles(bool a) {
+ SimpleMutex::scoped_lock lk(j._curLogFileMutex);
+ j._ageOut = a;
+ }
+
+ void Journal::_rotate() {
+ if( d.dbMutex.atLeastReadLocked() ) {
+ LOGSOME << "info journal _rotate called insider dbMutex - ok but should be somewhat rare" << endl;
+ }
+
+ RACECHECK;
+
+ _curLogFileMutex.dassertLocked();
+
+ if ( inShutdown() || !_curLogFile )
+ return;
+
+ j.updateLSNFile();
+
+ if( _curLogFile && _written < DataLimitPerJournalFile )
+ return;
+
+ if( _curLogFile ) {
+ _curLogFile->truncate();
+ closeCurrentJournalFile();
+ removeUnneededJournalFiles();
+ }
+
+ try {
+ Timer t;
+ _open();
+ int ms = t.millis();
+ if( ms >= 200 ) {
+ log() << "DR101 latency warning on journal file open " << ms << "ms" << endl;
+ }
+ }
+ catch(std::exception& e) {
+ log() << "warning exception opening journal file " << e.what() << endl;
+ throw;
+ }
+ }
+
+ /** write (append) the buffer we have built to the journal and fsync it.
+ done outside of the dbMutex lock as this could be slow.
+ @param uncompressed - a buffer that will be written to the journal after compression.
+ will not return until the data is on disk.
+ */
+ void WRITETOJOURNAL(JSectHeader h, AlignedBuilder& uncompressed) {
+ Timer t;
+ j.journal(h, uncompressed);
+ stats.curr->_writeToJournalMicros += t.micros();
+ }
+ void Journal::journal(const JSectHeader& h, const AlignedBuilder& uncompressed) {
+ RACECHECK
+ static AlignedBuilder b(32*1024*1024);
+ /* buffer to journal will be
+ JSectHeader
+ compressed operations
+ JSectFooter
+ */
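+ // sizes (cf. the static asserts earlier in this file): JSectHeader is 20 bytes
+ // and JSectFooter is 32 bytes; the finished section is padded up to the
+ // 8192-byte Alignment before being appended to the journal file.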
+ const unsigned headTailSize = sizeof(JSectHeader) + sizeof(JSectFooter);
+ const unsigned max = maxCompressedLength(uncompressed.len()) + headTailSize;
+ b.reset(max);
+
+ {
+ dassert( h.sectionLen() == (unsigned) 0xffffffff ); // we will backfill later
+ b.appendStruct(h);
+ }
+
+ size_t compressedLength = 0;
+ rawCompress(uncompressed.buf(), uncompressed.len(), b.cur(), &compressedLength);
+ assert( compressedLength < 0xffffffff );
+ assert( compressedLength < max );
+ b.skip(compressedLength);
+
+ // footer
+ unsigned L = 0xffffffff;
+ {
+ // pad to alignment, and set the total section length in the JSectHeader
+ assert( 0xffffe000 == (~(Alignment-1)) );
+ unsigned lenUnpadded = b.len() + sizeof(JSectFooter);
+ L = (lenUnpadded + Alignment-1) & (~(Alignment-1));
+ dassert( L >= lenUnpadded );
+
+ ((JSectHeader*)b.atOfs(0))->setSectionLen(lenUnpadded);
+
+ JSectFooter f(b.buf(), b.len()); // computes checksum
+ b.appendStruct(f);
+ dassert( b.len() == lenUnpadded );
+
+ b.skip(L - lenUnpadded);
+ dassert( b.len() % Alignment == 0 );
+ }
+
+ try {
+ SimpleMutex::scoped_lock lk(_curLogFileMutex);
+
+ // must already be open -- so that _curFileId is correct for previous buffer building
+ assert( _curLogFile );
+
+ stats.curr->_uncompressedBytes += b.len();
+ unsigned w = b.len();
+ _written += w;
+ assert( w <= L );
+ stats.curr->_journaledBytes += L;
+ _curLogFile->synchronousAppend((const void *) b.buf(), L);
+ _rotate();
+ }
+ catch(std::exception& e) {
+ log() << "error exception in dur::journal " << e.what() << endl;
+ throw;
+ }
+ }
+
+ }
+}
+
+/* todo
+ test (and handle) disk full on journal append. best quick thing to do is to terminate.
+ if we roll back operations, there are nuances, such as: is ReplSetImpl::lastOpTimeWritten then too new in ram?
+*/
diff --git a/src/mongo/db/dur_journal.h b/src/mongo/db/dur_journal.h
new file mode 100644
index 00000000000..664f63942e0
--- /dev/null
+++ b/src/mongo/db/dur_journal.h
@@ -0,0 +1,68 @@
+// @file dur_journal.h
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+ class AlignedBuilder;
+
+ namespace dur {
+
+ /** true if ok to clean up journal files at termination. otherwise, journal files will be retained.
+ */
+ extern bool okToCleanUp;
+
+ /** at termination, after db files are closed & fsynced;
+ also after recovery.
+ closes and removes journal files
+ @param log report in log that we are cleaning up if we actually do any work
+ */
+ void journalCleanup(bool log = false);
+
+ /** assure journal/ dir exists. throws */
+ void journalMakeDir();
+
+ /** check if time to rotate files; assure a file is open.
+ done separately from the journal() call as we can do this part
+ outside of lock.
+ only called by durThread.
+ */
+ void journalRotate();
+
+ /** flag that something has gone wrong during writing to the journal
+ (not for recovery mode)
+ */
+ void journalingFailure(const char *msg);
+
+ /** read lsn from disk from the last run before doing recovery */
+ unsigned long long journalReadLSN();
+
+ unsigned long long getLastDataFileFlushTime();
+
+ /** never throws.
+ @return true if there are any journal files in the journal dir.
+ */
+ bool haveJournalFiles();
+
+ // in case disk controller buffers writes
+ const long long ExtraKeepTimeMs = 10000;
+
+ const unsigned JournalCommitIntervalDefault = 100;
+
+ }
+}
diff --git a/src/mongo/db/dur_journalformat.h b/src/mongo/db/dur_journalformat.h
new file mode 100644
index 00000000000..10ed8487b71
--- /dev/null
+++ b/src/mongo/db/dur_journalformat.h
@@ -0,0 +1,174 @@
+// @file dur_journalformat.h The format of our journal files.
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ namespace dur {
+
+ const unsigned Alignment = 8192;
+
+#pragma pack(1)
+ /** beginning header for a journal/j._<n> file
+ there is nothing important in this header at this time, except perhaps the version #.
+ */
+ struct JHeader {
+ JHeader() { }
+ JHeader(string fname);
+
+ char magic[2]; // "j\n". j means journal, then a linefeed, fwiw if you were to run "less" on the file or something...
+
+ // 0x4142 is ascii-readable if you look at the file with head/less -- thus the starting values were near
+ // that. simply incrementing the version # is safe on a forward basis.
+#if defined(_NOCOMPRESS)
+ enum { CurrentVersion = 0x4148 };
+#else
+ enum { CurrentVersion = 0x4149 };
+#endif
+ unsigned short _version;
+
+ // these are just for diagnostic ease (make header more useful as plain text)
+ char n1; // '\n'
+ char ts[20]; // ascii timestamp of file generation. for user reading, not used by code.
+ char n2; // '\n'
+ char dbpath[128]; // path/filename of this file for human reading and diagnostics. not used by code.
+ char n3, n4; // '\n', '\n'
+
+ unsigned long long fileId; // unique identifier that will be in each JSectHeader. important as we recycle prealloced files
+
+ char reserved3[8026]; // 8KB total for the file header
+ char txt2[2]; // "\n\n" at the end
+
+ bool versionOk() const { return _version == CurrentVersion; }
+ bool valid() const { return magic[0] == 'j' && txt2[1] == '\n' && fileId; }
+ };
+
+ /** "Section" header. A section corresponds to a group commit.
+ len is length of the entire section including header and footer.
+ header and footer are not compressed, just the stuff in between.
+ */
+ struct JSectHeader {
+ private:
+ unsigned _sectionLen; // unpadded length in bytes of the whole section
+ public:
+ unsigned long long seqNumber; // sequence number that can be used on recovery to not do too much work
+ unsigned long long fileId; // matches JHeader::fileId
+ unsigned sectionLen() const { return _sectionLen; }
+
+ // we store the unpadded length so we can use that when we uncompress. to
+ // get the true total size this must be rounded up to the Alignment.
+ void setSectionLen(unsigned lenUnpadded) { _sectionLen = lenUnpadded; }
+
+ unsigned sectionLenWithPadding() const {
+ unsigned x = (sectionLen() + (Alignment-1)) & (~(Alignment-1));
+ dassert( x % Alignment == 0 );
+ return x;
+ }
+ };
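+
+ // e.g. with Alignment == 8192, an unpadded section length of 12000 bytes gives
+ // sectionLenWithPadding() == 16384, i.e. (12000 + 8191) & ~8191.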
+
+ /** an individual write operation within a group commit section. Either the entire section should
+ be applied, or nothing. (We check the md5 for the whole section before doing anything on recovery.)
+ */
+ struct JEntry {
+ enum OpCodes {
+ OpCode_Footer = 0xffffffff,
+ OpCode_DbContext = 0xfffffffe,
+ OpCode_FileCreated = 0xfffffffd,
+ OpCode_DropDb = 0xfffffffc,
+ OpCode_Min = 0xfffff000
+ };
+ union {
+ unsigned len; // length in bytes of the data of the JEntry. does not include the JEntry header
+ OpCodes opcode;
+ };
+
+ unsigned ofs; // offset in file
+
+ // sentinel and masks for _fileNo
+ enum {
+ DotNsSuffix = 0x7fffffff, // ".ns" file
+ LocalDbBit = 0x80000000 // assuming "local" db instead of using the JDbContext
+ };
+ int _fileNo; // high bit is set to indicate it should be the <dbpath>/local database
+ // char data[len] follows
+
+ const char * srcData() const {
+ const int *i = &_fileNo;
+ return (const char *) (i+1);
+ }
+
+ int getFileNo() const { return _fileNo & (~LocalDbBit); }
+ void setFileNo(int f) { _fileNo = f; }
+ bool isNsSuffix() const { return getFileNo() == DotNsSuffix; }
+
+ void setLocalDbContextBit() { _fileNo |= LocalDbBit; }
+ bool isLocalDbContext() const { return _fileNo & LocalDbBit; }
+ void clearLocalDbContextBit() { _fileNo = getFileNo(); }
+
+ static string suffix(int fileno) {
+ if( fileno == DotNsSuffix ) return "ns";
+ stringstream ss;
+ ss << fileno;
+ return ss.str();
+ }
+ };
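+
+ // e.g. for an entry touching <dbpath>/local.3: setFileNo(3) then setLocalDbContextBit();
+ // getFileNo() masks the bit back off and returns 3, and isLocalDbContext() reports
+ // that the entry refers to the "local" database.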
+
+ /** group commit section footer. md5 is a key field. */
+ struct JSectFooter {
+ JSectFooter();
+ JSectFooter(const void* begin, int len); // needs buffer to compute hash
+ unsigned sentinel;
+ unsigned char hash[16];
+ unsigned long long reserved;
+ char magic[4]; // "\n\n\n\n"
+
+ /** used by recovery to see if buffer is valid
+ @param begin the buffer
+ @param len buffer len
+ @return true if buffer looks valid
+ */
+ bool checkHash(const void* begin, int len) const;
+
+ bool magicOk() const { return *((unsigned*)magic) == 0x0a0a0a0a; }
+ };
+
+ /** declares "the next entry(s) are for this database / file path prefix" */
+ struct JDbContext {
+ JDbContext() : sentinel(JEntry::OpCode_DbContext) { }
+ const unsigned sentinel; // compare to JEntry::len -- zero is our sentinel
+ //char dbname[];
+ };
+
+ /** "last sequence number" */
+ struct LSNFile {
+ unsigned ver;
+ unsigned reserved2;
+ unsigned long long lsn;
+ unsigned long long checkbytes;
+ unsigned long long reserved[8];
+
+ void set(unsigned long long lsn);
+ unsigned long long get();
+ };
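+
+ // e.g. set(0x1234) stores lsn = 0x1234 and checkbytes = ~0x1234ULL; get() returns
+ // the lsn only if that relationship still holds, otherwise recovery starts from
+ // the beginning of the journal.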
+
+#pragma pack()
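+    // overall journal file layout implied by the structs above (summary, not
+    // normative): a JHeader, then one section per group commit, each padded out to
+    // Alignment: JSectHeader | compressed run of JDbContext/JEntry records | JSectFooter.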
+
+ }
+
+}
diff --git a/src/mongo/db/dur_journalimpl.h b/src/mongo/db/dur_journalimpl.h
new file mode 100644
index 00000000000..8aad70b0e5c
--- /dev/null
+++ b/src/mongo/db/dur_journalimpl.h
@@ -0,0 +1,103 @@
+// @file dur_journalimpl.h
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "dur_journalformat.h"
+#include "../util/logfile.h"
+
+namespace mongo {
+ namespace dur {
+
+ /** the writeahead journal for durability */
+ class Journal {
+ public:
+ string dir; // set by journalMakeDir() during initialization
+
+ Journal();
+
+ /** call during startup by journalMakeDir() */
+ void init();
+
+ /** check if time to rotate files. assure a file is open.
+ done separately from the journal() call as we can do this part
+ outside of lock.
+ thread: durThread()
+ */
+ void rotate();
+
+ /** append to the journal file
+ */
+ void journal(const JSectHeader& h, const AlignedBuilder& b);
+
+ boost::filesystem::path getFilePathFor(int filenumber) const;
+
+ unsigned long long lastFlushTime() const { return _lastFlushTime; }
+ void cleanup(bool log); // closes and removes journal files
+
+ unsigned long long curFileId() const { return _curFileId; }
+
+ void assureLogFileOpen() {
+ SimpleMutex::scoped_lock lk(_curLogFileMutex);
+ if( _curLogFile == 0 )
+ _open();
+ }
+
+ /** open a journal file to journal operations to. */
+ void open();
+
+ private:
+ /** check if time to rotate files. assure a file is open.
+ * internally called with every commit
+ */
+ void _rotate();
+
+ void _open();
+ void closeCurrentJournalFile();
+ void removeUnneededJournalFiles();
+
+ unsigned long long _written; // bytes written so far to the current journal (log) file
+ unsigned _nextFileNumber;
+ public:
+ SimpleMutex _curLogFileMutex;
+ bool _ageOut;
+ private:
+
+ LogFile *_curLogFile; // use _curLogFileMutex
+ unsigned long long _curFileId; // current file id see JHeader::fileId
+
+ struct JFile {
+ string filename;
+ unsigned long long lastEventTimeMs;
+ };
+
+ // files which have been closed but not unlinked (rotated out) yet
+ // ordered oldest to newest
+ list<JFile> _oldJournalFiles; // use _curLogFileMutex
+
+ // lsn related
+ static void preFlush();
+ static void postFlush();
+ unsigned long long _preFlushTime;
+ unsigned long long _lastFlushTime; // data < this time is fsynced in the datafiles (unless hard drive controller is caching)
+ bool _writeToLSNNeeded;
+ void updateLSNFile();
+ };
+
+ }
+}
diff --git a/src/mongo/db/dur_preplogbuffer.cpp b/src/mongo/db/dur_preplogbuffer.cpp
new file mode 100644
index 00000000000..10b63c0e549
--- /dev/null
+++ b/src/mongo/db/dur_preplogbuffer.cpp
@@ -0,0 +1,177 @@
+// @file dur_preplogbuffer.cpp
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ PREPLOGBUFFER
+    we will build an output buffer ourselves and then use O_DIRECT
+ we could be in read lock for this
+ for very large objects write directly to redo log in situ?
+ @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc
+*/
+
+#include "pch.h"
+#include "cmdline.h"
+#include "dur.h"
+#include "dur_journal.h"
+#include "dur_journalimpl.h"
+#include "dur_commitjob.h"
+#include "../util/mongoutils/hash.h"
+#include "../util/mongoutils/str.h"
+#include "../util/alignedbuilder.h"
+#include "../util/timer.h"
+#include "dur_stats.h"
+#include "../server.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+ namespace dur {
+
+ extern Journal j;
+
+ RelativePath local = RelativePath::fromRelativePath("local");
+
+ static MongoMMF* findMMF_inlock(void *ptr, size_t &ofs) {
+ MongoMMF *f = privateViews.find_inlock(ptr, ofs);
+ if( f == 0 ) {
+ error() << "findMMF_inlock failed " << privateViews.numberOfViews_inlock() << endl;
+ printStackTrace(); // we want a stack trace and the assert below didn't print a trace once in the real world - not sure why
+ stringstream ss;
+ ss << "view pointer cannot be resolved " << hex << (size_t) ptr;
+ journalingFailure(ss.str().c_str()); // asserts, which then abends
+ }
+ return f;
+ }
+
+ /** put the basic write operation into the buffer (bb) to be journaled */
+ static void prepBasicWrite_inlock(AlignedBuilder&bb, const WriteIntent *i, RelativePath& lastDbPath) {
+ size_t ofs = 1;
+ MongoMMF *mmf = findMMF_inlock(i->start(), /*out*/ofs);
+
+ if( unlikely(!mmf->willNeedRemap()) ) {
+                // tag this mmf as needing a remap of its private view later.
+ // usually it will already be dirty/already set, so we do the if above first
+ // to avoid possibility of cpu cache line contention
+ mmf->willNeedRemap() = true;
+ }
+
+ // since we have already looked up the mmf, we go ahead and remember the write view location
+ // so we don't have to find the MongoMMF again later in WRITETODATAFILES()
+ //
+ // this was for WRITETODATAFILES_Impl2 so commented out now
+ //
+ /*
+ dassert( i->w_ptr == 0 );
+ i->w_ptr = ((char*)mmf->view_write()) + ofs;
+ */
+
+ JEntry e;
+            e.len = min(i->length(), (unsigned)(mmf->length() - ofs)); // don't write past end of file
+ assert( ofs <= 0x80000000 );
+ e.ofs = (unsigned) ofs;
+ e.setFileNo( mmf->fileSuffixNo() );
+ if( mmf->relativePath() == local ) {
+ e.setLocalDbContextBit();
+ }
+ else if( mmf->relativePath() != lastDbPath ) {
+ lastDbPath = mmf->relativePath();
+ JDbContext c;
+ bb.appendStruct(c);
+ bb.appendStr(lastDbPath.toString());
+ }
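+            // e.g. a group commit touching databases a, a, b, a journals roughly:
+            //   JDbContext("a") JEntry JEntry JDbContext("b") JEntry JDbContext("a") JEntry
+            // while writes to the "local" db set LocalDbBit instead of emitting a context.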
+ bb.appendStruct(e);
+#if defined(_EXPERIMENTAL)
+ i->ofsInJournalBuffer = bb.len();
+#endif
+ bb.appendBuf(i->start(), e.len);
+
+ if (unlikely(e.len != (unsigned)i->length())) {
+ log() << "journal info splitting prepBasicWrite at boundary" << endl;
+
+ // This only happens if we write to the last byte in a file and
+                // the first byte in another file that is mapped adjacently. I
+ // think most OSs leave at least a one page gap between
+ // mappings, but better to be safe.
+
+ WriteIntent next ((char*)i->start() + e.len, i->length() - e.len);
+ prepBasicWrite_inlock(bb, &next, lastDbPath);
+ }
+ }
+
+        /** basic write ops / write intents. note there is no particular order to these: if we have
+            two writes to the same location during the group commit interval, it is likely
+            (although not assured) that it is journaled here only once.
+ */
+ static void prepBasicWrites(AlignedBuilder& bb) {
+ scoped_lock lk(privateViews._mutex());
+
+ // each time events switch to a different database we journal a JDbContext
+ RelativePath lastDbPath;
+
+ for( set<WriteIntent>::iterator i = commitJob.writes().begin(); i != commitJob.writes().end(); i++ ) {
+ prepBasicWrite_inlock(bb, &(*i), lastDbPath);
+ }
+ }
+
+ static void resetLogBuffer(/*out*/JSectHeader& h, AlignedBuilder& bb) {
+ bb.reset();
+
+ h.setSectionLen(0xffffffff); // total length, will fill in later
+ h.seqNumber = getLastDataFileFlushTime();
+ h.fileId = j.curFileId();
+ }
+
+        /** we will build an output buffer ourselves and then use O_DIRECT
+ we could be in read lock for this
+ caller handles locking
+ @return partially populated sectheader and _ab set
+ */
+ static void _PREPLOGBUFFER(JSectHeader& h) {
+ assert( cmdLine.dur );
+
+ {
+ // now that we are locked, fully drain deferred notes of write intents
+ DEV d.dbMutex.assertAtLeastReadLocked();
+ Writes& writes = commitJob.wi();
+ writes._deferred.invoke();
+ writes._drained = true;
+ }
+
+ AlignedBuilder& bb = commitJob._ab;
+ resetLogBuffer(h, bb); // adds JSectHeader
+
+ // ops other than basic writes (DurOp's)
+ {
+ for( vector< shared_ptr<DurOp> >::iterator i = commitJob.ops().begin(); i != commitJob.ops().end(); ++i ) {
+ (*i)->serialize(bb);
+ }
+ }
+
+ prepBasicWrites(bb);
+
+ return;
+ }
+ void PREPLOGBUFFER(/*out*/ JSectHeader& h) {
+ Timer t;
+ j.assureLogFileOpen(); // so fileId is set
+ _PREPLOGBUFFER(h);
+ stats.curr->_prepLogBufferMicros += t.micros();
+ }
+
+ }
+}
diff --git a/src/mongo/db/dur_recover.cpp b/src/mongo/db/dur_recover.cpp
new file mode 100644
index 00000000000..a0a8843572c
--- /dev/null
+++ b/src/mongo/db/dur_recover.cpp
@@ -0,0 +1,544 @@
+// @file dur_recover.cpp crash recovery via the journal
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "dur.h"
+#include "dur_stats.h"
+#include "dur_recover.h"
+#include "dur_journal.h"
+#include "dur_journalformat.h"
+#include "durop.h"
+#include "namespace.h"
+#include "../util/mongoutils/str.h"
+#include "../util/bufreader.h"
+#include "../util/concurrency/race.h"
+#include "pdfile.h"
+#include "database.h"
+#include "db.h"
+#include "../util/unittest.h"
+#include "../util/checksum.h"
+#include "cmdline.h"
+#include "curop.h"
+#include "mongommf.h"
+#include "../util/compress.h"
+
+#include <sys/stat.h>
+#include <fcntl.h>
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ namespace dur {
+
+ struct ParsedJournalEntry { /*copyable*/
+ ParsedJournalEntry() : e(0) { }
+
+ // relative path of database for the operation.
+ // might be a pointer into mmaped Journal file
+ const char *dbName;
+
+            // these are pointers into the memory mapped journal file
+ const JEntry *e; // local db sentinel is already parsed out here into dbName
+
+ // if not one of the two simple JEntry's above, this is the operation:
+ shared_ptr<DurOp> op;
+ };
+
+ void removeJournalFiles();
+ path getJournalDir();
+
+ /** get journal filenames, in order. throws if unexpected content found */
+ static void getFiles(path dir, vector<path>& files) {
+ map<unsigned,path> m;
+ for ( boost::filesystem::directory_iterator i( dir );
+ i != boost::filesystem::directory_iterator();
+ ++i ) {
+ boost::filesystem::path filepath = *i;
+ string fileName = boost::filesystem::path(*i).leaf();
+ if( str::startsWith(fileName, "j._") ) {
+ unsigned u = str::toUnsigned( str::after(fileName, '_') );
+ if( m.count(u) ) {
+ uasserted(13531, str::stream() << "unexpected files in journal directory " << dir.string() << " : " << fileName);
+ }
+ m.insert( pair<unsigned,path>(u,filepath) );
+ }
+ }
+ for( map<unsigned,path>::iterator i = m.begin(); i != m.end(); ++i ) {
+ if( i != m.begin() && m.count(i->first - 1) == 0 ) {
+ uasserted(13532,
+ str::stream() << "unexpected file in journal directory " << dir.string()
+                              << " : " << boost::filesystem::path(i->second).leaf() << " : can't find its preceding file");
+ }
+ files.push_back(i->second);
+ }
+ }
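+        // e.g. a directory containing j._0, j._1, j._2 yields the files in that order;
+        // j._0 plus j._2 without j._1 trips assertion 13532 above, and two names that
+        // parse to the same number (say j._1 and j._01) trip 13531.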
+
+ /** read through the memory mapped data of a journal file (journal/j._<n> file)
+ throws
+ */
+ class JournalSectionIterator : boost::noncopyable {
+ auto_ptr<BufReader> _entries;
+ const JSectHeader _h;
+ const char *_lastDbName; // pointer into mmaped journal file
+ const bool _doDurOps;
+ string _uncompressed;
+ public:
+ JournalSectionIterator(const JSectHeader& h, const void *compressed, unsigned compressedLen, bool doDurOpsRecovering) :
+ _h(h),
+ _lastDbName(0)
+ , _doDurOps(doDurOpsRecovering)
+ {
+ assert( doDurOpsRecovering );
+ bool ok = uncompress((const char *)compressed, compressedLen, &_uncompressed);
+ if( !ok ) {
+ // it should always be ok (i think?) as there is a previous check to see that the JSectFooter is ok
+ log() << "couldn't uncompress journal section" << endl;
+ msgasserted(15874, "couldn't uncompress journal section");
+ }
+ const char *p = _uncompressed.c_str();
+ assert( compressedLen == _h.sectionLen() - sizeof(JSectFooter) - sizeof(JSectHeader) );
+ _entries = auto_ptr<BufReader>( new BufReader(p, _uncompressed.size()) );
+ }
+
+ // we work with the uncompressed buffer when doing a WRITETODATAFILES (for speed)
+ JournalSectionIterator(const JSectHeader &h, const void *p, unsigned len) :
+ _entries( new BufReader((const char *) p, len) ),
+ _h(h),
+ _lastDbName(0)
+ , _doDurOps(false)
+
+ { }
+
+ bool atEof() const { return _entries->atEof(); }
+
+ unsigned long long seqNumber() const { return _h.seqNumber; }
+
+ /** get the next entry from the log. this function parses and combines JDbContext and JEntry's.
+ * throws on premature end of section.
+ */
+ void next(ParsedJournalEntry& e) {
+ unsigned lenOrOpCode;
+ _entries->read(lenOrOpCode);
+
+ if (lenOrOpCode > JEntry::OpCode_Min) {
+ switch( lenOrOpCode ) {
+
+ case JEntry::OpCode_Footer: {
+ assert( false );
+ }
+
+ case JEntry::OpCode_FileCreated:
+ case JEntry::OpCode_DropDb: {
+ e.dbName = 0;
+ boost::shared_ptr<DurOp> op = DurOp::read(lenOrOpCode, *_entries);
+ if (_doDurOps) {
+ e.op = op;
+ }
+ return;
+ }
+
+ case JEntry::OpCode_DbContext: {
+ _lastDbName = (const char*) _entries->pos();
+ const unsigned limit = std::min((unsigned)Namespace::MaxNsLen, _entries->remaining());
+ const unsigned len = strnlen(_lastDbName, limit);
+ massert(13533, "problem processing journal file during recovery", _lastDbName[len] == '\0');
+ _entries->skip(len+1); // skip '\0' too
+ _entries->read(lenOrOpCode); // read this for the fall through
+ }
+ // fall through as a basic operation always follows jdbcontext, and we don't have anything to return yet
+
+ default:
+ // fall through
+ ;
+ }
+ }
+
+ // JEntry - a basic write
+ assert( lenOrOpCode && lenOrOpCode < JEntry::OpCode_Min );
+ _entries->rewind(4);
+ e.e = (JEntry *) _entries->skip(sizeof(JEntry));
+ e.dbName = e.e->isLocalDbContext() ? "local" : _lastDbName;
+ assert( e.e->len == lenOrOpCode );
+ _entries->skip(e.e->len);
+ }
+
+ };
+
+ static string fileName(const char* dbName, int fileNo) {
+ stringstream ss;
+ ss << dbName << '.';
+ assert( fileNo >= 0 );
+ if( fileNo == JEntry::DotNsSuffix )
+ ss << "ns";
+ else
+ ss << fileNo;
+
+ // relative name -> full path name
+ path full(dbpath);
+ full /= ss.str();
+ return full.string();
+ }
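+        // e.g. with dbpath "/data/db": fileName("test", 3) -> "/data/db/test.3" and
+        // fileName("test", JEntry::DotNsSuffix) -> "/data/db/test.ns".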
+
+ RecoveryJob::~RecoveryJob() {
+ DESTRUCTOR_GUARD(
+ if( !_mmfs.empty() )
+ close();
+ )
+ }
+
+ void RecoveryJob::close() {
+ scoped_lock lk(_mx);
+ _close();
+ }
+
+ void RecoveryJob::_close() {
+ MongoFile::flushAll(true);
+ _mmfs.clear();
+ }
+
+ void RecoveryJob::write(const ParsedJournalEntry& entry) {
+ //TODO(mathias): look into making some of these dasserts
+ assert(entry.e);
+ assert(entry.dbName);
+ assert(strnlen(entry.dbName, MaxDatabaseNameLen) < MaxDatabaseNameLen);
+
+ const string fn = fileName(entry.dbName, entry.e->getFileNo());
+ MongoFile* file;
+ {
+ MongoFileFinder finder; // must release lock before creating new MongoMMF
+ file = finder.findByPath(fn);
+ }
+
+ MongoMMF* mmf;
+ if (file) {
+ assert(file->isMongoMMF());
+ mmf = (MongoMMF*)file;
+ }
+ else {
+ if( !_recovering ) {
+ log() << "journal error applying writes, file " << fn << " is not open" << endl;
+ assert(false);
+ }
+ boost::shared_ptr<MongoMMF> sp (new MongoMMF);
+ assert(sp->open(fn, false));
+ _mmfs.push_back(sp);
+ mmf = sp.get();
+ }
+
+ if ((entry.e->ofs + entry.e->len) <= mmf->length()) {
+ assert(mmf->view_write());
+ assert(entry.e->srcData());
+
+ void* dest = (char*)mmf->view_write() + entry.e->ofs;
+ memcpy(dest, entry.e->srcData(), entry.e->len);
+ stats.curr->_writeToDataFilesBytes += entry.e->len;
+ }
+ else {
+ massert(13622, "Trying to write past end of file in WRITETODATAFILES", _recovering);
+ }
+ }
+
+ void RecoveryJob::applyEntry(const ParsedJournalEntry& entry, bool apply, bool dump) {
+ if( entry.e ) {
+ if( dump ) {
+ stringstream ss;
+ ss << " BASICWRITE " << setw(20) << entry.dbName << '.';
+ if( entry.e->isNsSuffix() )
+ ss << "ns";
+ else
+ ss << setw(2) << entry.e->getFileNo();
+ ss << ' ' << setw(6) << entry.e->len << ' ' << /*hex << setw(8) << (size_t) fqe.srcData << dec <<*/
+ " " << hexdump(entry.e->srcData(), entry.e->len);
+ log() << ss.str() << endl;
+ }
+ if( apply ) {
+ write(entry);
+ }
+ }
+ else if(entry.op) {
+ // a DurOp subclass operation
+ if( dump ) {
+ log() << " OP " << entry.op->toString() << endl;
+ }
+ if( apply ) {
+ if( entry.op->needFilesClosed() ) {
+ _close(); // locked in processSection
+ }
+ entry.op->replay();
+ }
+ }
+ }
+
+ void RecoveryJob::applyEntries(const vector<ParsedJournalEntry> &entries) {
+ bool apply = (cmdLine.durOptions & CmdLine::DurScanOnly) == 0;
+ bool dump = cmdLine.durOptions & CmdLine::DurDumpJournal;
+ if( dump )
+ log() << "BEGIN section" << endl;
+
+ for( vector<ParsedJournalEntry>::const_iterator i = entries.begin(); i != entries.end(); ++i ) {
+ applyEntry(*i, apply, dump);
+ }
+
+ if( dump )
+ log() << "END section" << endl;
+ }
+
+ void RecoveryJob::processSection(const JSectHeader *h, const void *p, unsigned len, const JSectFooter *f) {
+ scoped_lock lk(_mx);
+ RACECHECK
+
+            /** todo: we should really verify the checksum to see that seqNumber is ok.
+                that is expensive, though; perhaps there could be some sort of checksum
+                of just the header within the header itself.
+            */
+ if( _recovering && _lastDataSyncedFromLastRun > h->seqNumber + ExtraKeepTimeMs ) {
+ if( h->seqNumber != _lastSeqMentionedInConsoleLog ) {
+ static int n;
+ if( ++n < 10 ) {
+ log() << "recover skipping application of section seq:" << h->seqNumber << " < lsn:" << _lastDataSyncedFromLastRun << endl;
+ }
+ else if( n == 10 ) {
+ log() << "recover skipping application of section more..." << endl;
+ }
+ _lastSeqMentionedInConsoleLog = h->seqNumber;
+ }
+ return;
+ }
+
+ auto_ptr<JournalSectionIterator> i;
+ if( _recovering ) {
+ i = auto_ptr<JournalSectionIterator>(new JournalSectionIterator(*h, p, len, _recovering));
+ }
+ else {
+ i = auto_ptr<JournalSectionIterator>(new JournalSectionIterator(*h, /*after header*/p, /*w/out header*/len));
+ }
+
+ // we use a static so that we don't have to reallocate every time through. occasionally we
+ // go back to a small allocation so that if there were a spiky growth it won't stick forever.
+ static vector<ParsedJournalEntry> entries;
+ entries.clear();
+/** TEMP uncomment
+ RARELY OCCASIONALLY {
+ if( entries.capacity() > 2048 ) {
+ entries.shrink_to_fit();
+ entries.reserve(2048);
+ }
+ }
+*/
+
+ // first read all entries to make sure this section is valid
+ ParsedJournalEntry e;
+ while( !i->atEof() ) {
+ i->next(e);
+ entries.push_back(e);
+ }
+
+ // after the entries check the footer checksum
+ if( _recovering ) {
+ assert( ((const char *)h) + sizeof(JSectHeader) == p );
+ if( !f->checkHash(h, len + sizeof(JSectHeader)) ) {
+ msgasserted(13594, "journal checksum doesn't match");
+ }
+ }
+
+ // got all the entries for one group commit. apply them:
+ applyEntries(entries);
+ }
+
+        /** apply a specific journal file that is already mmap'd
+ @param p start of the memory mapped file
+ @return true if this is detected to be the last file (ends abruptly)
+ */
+ bool RecoveryJob::processFileBuffer(const void *p, unsigned len) {
+ try {
+ unsigned long long fileId;
+ BufReader br(p,len);
+
+ {
+ // read file header
+ JHeader h;
+ br.read(h);
+
+ /* [dm] not automatically handled. we should eventually handle this automatically. i think:
+ (1) if this is the final journal file
+ (2) and the file size is just the file header in length (or less) -- this is a bit tricky to determine if prealloced
+ then can just assume recovery ended cleanly and not error out (still should log).
+ */
+ uassert(13537,
+ "journal file header invalid. This could indicate corruption in a journal file, or perhaps a crash where sectors in file header were in flight written out of order at time of crash (unlikely but possible).",
+ h.valid());
+
+ if( !h.versionOk() ) {
+ log() << "journal file version number mismatch got:" << hex << h._version
+ << " expected:" << hex << (unsigned) JHeader::CurrentVersion
+ << ". if you have just upgraded, recover with old version of mongod, terminate cleanly, then upgrade."
+ << endl;
+ uasserted(13536, str::stream() << "journal version number mismatch " << h._version);
+ }
+ fileId = h.fileId;
+ if(cmdLine.durOptions & CmdLine::DurDumpJournal) {
+ log() << "JHeader::fileId=" << fileId << endl;
+ }
+ }
+
+ // read sections
+ while ( !br.atEof() ) {
+ JSectHeader h;
+ br.peek(h);
+ if( h.fileId != fileId ) {
+ if( debug || (cmdLine.durOptions & CmdLine::DurDumpJournal) ) {
+ log() << "Ending processFileBuffer at differing fileId want:" << fileId << " got:" << h.fileId << endl;
+ log() << " sect len:" << h.sectionLen() << " seqnum:" << h.seqNumber << endl;
+ }
+ return true;
+ }
+ unsigned slen = h.sectionLen();
+ unsigned dataLen = slen - sizeof(JSectHeader) - sizeof(JSectFooter);
+ const char *hdr = (const char *) br.skip(h.sectionLenWithPadding());
+ const char *data = hdr + sizeof(JSectHeader);
+ const char *footer = data + dataLen;
+ processSection((const JSectHeader*) hdr, data, dataLen, (const JSectFooter*) footer);
+
+ // ctrl c check
+ killCurrentOp.checkForInterrupt(false);
+ }
+ }
+ catch( BufReader::eof& ) {
+ if( cmdLine.durOptions & CmdLine::DurDumpJournal )
+ log() << "ABRUPT END" << endl;
+ return true; // abrupt end
+ }
+
+ return false; // non-abrupt end
+ }
+
+ /** apply a specific journal file */
+ bool RecoveryJob::processFile(path journalfile) {
+ log() << "recover " << journalfile.string() << endl;
+
+ try {
+ if( boost::filesystem::file_size( journalfile.string() ) == 0 ) {
+ log() << "recover info " << journalfile.string() << " has zero length" << endl;
+ return true;
+ }
+ } catch(...) {
+            // if something weird happens, like a permissions problem, keep going so the massert below can fire (presumably)
+ log() << "recover exception checking filesize" << endl;
+ }
+
+ MemoryMappedFile f;
+ void *p = f.mapWithOptions(journalfile.string().c_str(), MongoFile::READONLY | MongoFile::SEQUENTIAL);
+ massert(13544, str::stream() << "recover error couldn't open " << journalfile.string(), p);
+ return processFileBuffer(p, (unsigned) f.length());
+ }
+
+ /** @param files all the j._0 style files we need to apply for recovery */
+ void RecoveryJob::go(vector<path>& files) {
+ log() << "recover begin" << endl;
+ _recovering = true;
+
+ // load the last sequence number synced to the datafiles on disk before the last crash
+ _lastDataSyncedFromLastRun = journalReadLSN();
+ log() << "recover lsn: " << _lastDataSyncedFromLastRun << endl;
+
+ for( unsigned i = 0; i != files.size(); ++i ) {
+ bool abruptEnd = processFile(files[i]);
+ if( abruptEnd && i+1 < files.size() ) {
+ log() << "recover error: abrupt end to file " << files[i].string() << ", yet it isn't the last journal file" << endl;
+ close();
+ uasserted(13535, "recover abrupt journal file end");
+ }
+ }
+
+ close();
+
+ if( cmdLine.durOptions & CmdLine::DurScanOnly ) {
+ uasserted(13545, str::stream() << "--durOptions " << (int) CmdLine::DurScanOnly << " (scan only) specified");
+ }
+
+ log() << "recover cleaning up" << endl;
+ removeJournalFiles();
+ log() << "recover done" << endl;
+ okToCleanUp = true;
+ _recovering = false;
+ }
+
+ void _recover() {
+ assert( cmdLine.dur );
+
+ boost::filesystem::path p = getJournalDir();
+ if( !exists(p) ) {
+ log() << "directory " << p.string() << " does not exist, there will be no recovery startup step" << endl;
+ okToCleanUp = true;
+ return;
+ }
+
+ vector<path> journalFiles;
+ getFiles(p, journalFiles);
+
+ if( journalFiles.empty() ) {
+ log() << "recover : no journal files present, no recovery needed" << endl;
+ okToCleanUp = true;
+ return;
+ }
+
+ RecoveryJob::get().go(journalFiles);
+ }
+
+ extern mutex groupCommitMutex;
+
+ /** recover from a crash
+ called during startup
+ throws on error
+ */
+ void recover() {
+ // we use a lock so that exitCleanly will wait for us
+ // to finish (or at least to notice what is up and stop)
+ writelock lk;
+
+ // this is so the mutexdebugger doesn't get confused. we are actually single threaded
+ // at this point in the program so it wouldn't have been a true problem (I think)
+ scoped_lock lk2(groupCommitMutex);
+
+ _recover(); // throws on interruption
+ }
+
+ struct BufReaderY { int a,b; };
+ class BufReaderUnitTest : public UnitTest {
+ public:
+ void run() {
+ BufReader r((void*) "abcdabcdabcd", 12);
+ char x;
+ BufReaderY y;
+ r.read(x); //cout << x; // a
+ assert( x == 'a' );
+ r.read(y);
+ r.read(x);
+ assert( x == 'b' );
+ }
+ } brunittest;
+
+ // can't free at termination because order of destruction of global vars is arbitrary
+ RecoveryJob &RecoveryJob::_instance = *(new RecoveryJob());
+
+ } // namespace dur
+
+} // namespace mongo
+
diff --git a/src/mongo/db/dur_recover.h b/src/mongo/db/dur_recover.h
new file mode 100644
index 00000000000..955e730ea05
--- /dev/null
+++ b/src/mongo/db/dur_recover.h
@@ -0,0 +1,50 @@
+// @file dur_recover.h durability recovery support
+
+#pragma once
+
+#include "dur_journalformat.h"
+#include "../util/concurrency/mutex.h"
+#include "../util/file.h"
+
+namespace mongo {
+ class MongoMMF;
+
+ namespace dur {
+ struct ParsedJournalEntry;
+
+ /** call go() to execute a recovery from existing journal files.
+ */
+ class RecoveryJob : boost::noncopyable {
+ public:
+ RecoveryJob() : _lastDataSyncedFromLastRun(0),
+ _mx("recovery"), _recovering(false) { _lastSeqMentionedInConsoleLog = 1; }
+ void go(vector<path>& files);
+ ~RecoveryJob();
+
+ /** @param data data between header and footer. compressed if recovering. */
+ void processSection(const JSectHeader *h, const void *data, unsigned len, const JSectFooter *f);
+
+ void close(); // locks and calls _close()
+
+ static RecoveryJob & get() { return _instance; }
+ private:
+ void write(const ParsedJournalEntry& entry); // actually writes to the file
+ void applyEntry(const ParsedJournalEntry& entry, bool apply, bool dump);
+ void applyEntries(const vector<ParsedJournalEntry> &entries);
+ bool processFileBuffer(const void *, unsigned len);
+ bool processFile(path journalfile);
+ void _close(); // doesn't lock
+
+ list<boost::shared_ptr<MongoMMF> > _mmfs;
+
+ unsigned long long _lastDataSyncedFromLastRun;
+ unsigned long long _lastSeqMentionedInConsoleLog;
+ public:
+ mongo::mutex _mx; // protects _mmfs; see setNoJournal() too
+ private:
+ bool _recovering; // are we in recovery or WRITETODATAFILES
+
+ static RecoveryJob &_instance;
+ };
+ }
+}
diff --git a/src/mongo/db/dur_stats.h b/src/mongo/db/dur_stats.h
new file mode 100644
index 00000000000..50a26d1f215
--- /dev/null
+++ b/src/mongo/db/dur_stats.h
@@ -0,0 +1,49 @@
+// @file dur_stats.h
+
+namespace mongo {
+ namespace dur {
+
+ /** journaling stats. the model here is that the commit thread is the only writer, and that reads are
+ uncommon (from a serverStatus command and such). Thus, there should not be multicore chatter overhead.
+ */
+ struct Stats {
+ Stats();
+ void rotate();
+ BSONObj asObj();
+ unsigned _intervalMicros;
+ struct S {
+ BSONObj _asObj();
+ string _asCSV();
+ string _CSVHeader();
+ void reset();
+
+ unsigned _commits;
+ unsigned _earlyCommits; // count of early commits from commitIfNeeded() or from getDur().commitNow()
+ unsigned long long _journaledBytes;
+ unsigned long long _uncompressedBytes;
+ unsigned long long _writeToDataFilesBytes;
+
+ unsigned long long _prepLogBufferMicros;
+ unsigned long long _writeToJournalMicros;
+ unsigned long long _writeToDataFilesMicros;
+ unsigned long long _remapPrivateViewMicros;
+
+            // undesirable to be in write lock for the group commit (it can be done in a read lock), so it is
+            // good to have visibility when this happens. it can happen for a few reasons:
+ // - read lock starvation
+ // - file being closed
+ // - data being written faster than the normal group commit interval
+ unsigned _commitsInWriteLock;
+
+ unsigned _dtMillis;
+ };
+ S *curr;
+ private:
+ S _a,_b;
+ unsigned long long _lastRotate;
+ S* other();
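+            // _a and _b presumably double-buffer the counters: curr points at the
+            // interval being accumulated while other() holds the last completed one,
+            // and rotate() swaps them (an inference; the definitions are not in this
+            // header).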
+ };
+ extern Stats stats;
+
+ }
+}
diff --git a/src/mongo/db/dur_writetodatafiles.cpp b/src/mongo/db/dur_writetodatafiles.cpp
new file mode 100644
index 00000000000..d77b0482c20
--- /dev/null
+++ b/src/mongo/db/dur_writetodatafiles.cpp
@@ -0,0 +1,94 @@
+// @file dur_writetodatafiles.cpp apply the writes back to the non-private MMF after they are safely in the redo log
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "dur_commitjob.h"
+#include "dur_stats.h"
+#include "dur_recover.h"
+#include "../util/timer.h"
+
+namespace mongo {
+ namespace dur {
+
+ void debugValidateAllMapsMatch();
+
+ static void WRITETODATAFILES_Impl1(const JSectHeader& h, AlignedBuilder& uncompressed) {
+ LockMongoFilesShared lk;
+ LOG(3) << "journal WRITETODATAFILES 1" << endl;
+ RecoveryJob::get().processSection(&h, uncompressed.buf(), uncompressed.len(), 0);
+ LOG(3) << "journal WRITETODATAFILES 2" << endl;
+ }
+
+#if 0
+ // the old implementation. doesn't work with groupCommitWithLimitedLocks()
+ void WRITETODATAFILES_Impl2() {
+ /* we go backwards as what is at the end is most likely in the cpu cache. it won't be much, but we'll take it. */
+ for( set<WriteIntent>::const_iterator it(commitJob.writes().begin()), end(commitJob.writes().end()); it != end; ++it ) {
+ const WriteIntent& intent = *it;
+ stats.curr->_writeToDataFilesBytes += intent.length();
+ dassert(intent.w_ptr);
+ memcpy(intent.w_ptr, intent.start(), intent.length());
+ }
+ }
+#endif
+
+#if defined(_EXPERIMENTAL)
+ // doesn't work with groupCommitWithLimitedLocks()
+ void WRITETODATAFILES_Impl3() {
+ /* we go backwards as what is at the end is most likely in the cpu cache. it won't be much, but we'll take it. */
+ for( set<WriteIntent>::const_iterator it(commitJob.writes().begin()), end(commitJob.writes().end()); it != end; ++it ) {
+ const WriteIntent& intent = *it;
+ stats.curr->_writeToDataFilesBytes += intent.length();
+ dassert(intent.w_ptr);
+ memcpy(intent.w_ptr,
+ commitJob._ab.atOfs(intent.ofsInJournalBuffer),
+ intent.length());
+ }
+ }
+#endif
+
+        /** apply the writes back to the non-private MMF after they are safely in the redo log
+
+ (1) todo we don't need to write back everything every group commit. we MUST write back
+            that which is going to be remapped on its private view - but that might not be all
+ views.
+
+ (2) todo should we do this using N threads? would be quite easy
+ see Hackenberg paper table 5 and 6. 2 threads might be a good balance.
+
+ (3) with enough work, we could do this outside the read lock. it's a bit tricky though.
+ - we couldn't do it from the private views then as they may be changing. would have to then
+ be from the journal alignedbuffer.
+ - we need to be careful the file isn't unmapped on us -- perhaps a mutex or something
+ with MongoMMF on closes or something to coordinate that.
+
+ concurrency: in mmmutex, not necessarily in dbMutex
+
+ @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc&hl=en
+ */
+
+ void WRITETODATAFILES(const JSectHeader& h, AlignedBuilder& uncompressed) {
+ Timer t;
+ WRITETODATAFILES_Impl1(h, uncompressed);
+ unsigned long long m = t.micros();
+ stats.curr->_writeToDataFilesMicros += m;
+ LOG(2) << "journal WRITETODATAFILES " << m / 1000.0 << "ms" << endl;
+ }
+
+ }
+}
diff --git a/src/mongo/db/durop.cpp b/src/mongo/db/durop.cpp
new file mode 100644
index 00000000000..80ee5043410
--- /dev/null
+++ b/src/mongo/db/durop.cpp
@@ -0,0 +1,161 @@
+// @file durop.cpp
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "concurrency.h"
+#include "../util/alignedbuilder.h"
+#include "../util/mongoutils/str.h"
+#include "../util/file.h"
+#include "mongommf.h"
+#include "durop.h"
+#include "../util/file_allocator.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ extern string dbpath; // --dbpath parm
+
+ void _deleteDataFiles(const char *);
+
+ namespace dur {
+
+ /** read a durop from journal file referenced by br.
+ @param opcode the opcode which has already been written from the bufreader
+ */
+ shared_ptr<DurOp> DurOp::read(unsigned opcode, BufReader& br) {
+ shared_ptr<DurOp> op;
+ switch( opcode ) {
+ case JEntry::OpCode_FileCreated:
+ op = shared_ptr<DurOp>( new FileCreatedOp(br) );
+ break;
+ case JEntry::OpCode_DropDb:
+ op = shared_ptr<DurOp>( new DropDbOp(br) );
+ break;
+ default:
+ massert(13546, (str::stream() << "journal recover: unrecognized opcode in journal " << opcode), false);
+ }
+ return op;
+ }
+
+ void DurOp::serialize(AlignedBuilder& ab) {
+ ab.appendNum(_opcode);
+ _serialize(ab);
+ }
+
+ DropDbOp::DropDbOp(BufReader& log) : DurOp(JEntry::OpCode_DropDb) {
+ unsigned long long reserved;
+ log.read(reserved);
+ log.read(reserved);
+ log.readStr(_db);
+ string reservedStr;
+ log.readStr(reservedStr);
+ }
+
+ void DropDbOp::_serialize(AlignedBuilder& ab) {
+ ab.appendNum((unsigned long long) 0); // reserved for future use
+ ab.appendNum((unsigned long long) 0); // reserved for future use
+ ab.appendStr(_db);
+ ab.appendStr(""); // reserved
+ }
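+        // resulting journal payload for a DropDbOp, after the 4-byte opcode that
+        // DurOp::serialize() prepends: 8 reserved bytes, 8 reserved bytes, the db
+        // name string, then an empty reserved string -- matching the reads in the
+        // BufReader constructor above.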
+
+ /** throws */
+ void DropDbOp::replay() {
+ log() << "recover replay drop db " << _db << endl;
+ _deleteDataFiles(_db.c_str());
+ }
+
+ FileCreatedOp::FileCreatedOp(string f, unsigned long long l) :
+ DurOp(JEntry::OpCode_FileCreated) {
+ _p = RelativePath::fromFullPath(f);
+ _len = l;
+ }
+
+ FileCreatedOp::FileCreatedOp(BufReader& log) : DurOp(JEntry::OpCode_FileCreated) {
+ unsigned long long reserved;
+ log.read(reserved);
+ log.read(reserved);
+ log.read(_len); // size of file, not length of name
+ string s;
+ log.readStr(s);
+ _p._p = s;
+ }
+
+ void FileCreatedOp::_serialize(AlignedBuilder& ab) {
+ ab.appendNum((unsigned long long) 0); // reserved for future use
+ ab.appendNum((unsigned long long) 0); // reserved for future use
+ ab.appendNum(_len);
+ ab.appendStr(_p.toString());
+ }
+
+ string FileCreatedOp::toString() {
+ return str::stream() << "FileCreatedOp " << _p.toString() << ' ' << _len/1024.0/1024.0 << "MB";
+ }
+
+ // if an operation deletes or creates a file (or moves etc.), it may need files closed.
+ bool FileCreatedOp::needFilesClosed() {
+ return exists( _p.asFullPath() );
+ }
+
+ void FileCreatedOp::replay() {
+            // i believe the code assumes new files are filled with zeros. thus we have to recreate the file,
+            // or at least rewrite it, even if it were already the right length. perhaps one day we should
+            // change that, although it is easier to avoid defects if we assume it is zeros.
+ string full = _p.asFullPath();
+ if( exists(full) ) {
+ try {
+ remove(full);
+ }
+ catch(std::exception& e) {
+                log(1) << "recover info FileCreatedOp::replay unlink " << e.what() << endl;
+ }
+ }
+
+ log() << "recover create file " << full << ' ' << _len/1024.0/1024.0 << "MB" << endl;
+ if( MemoryMappedFile::exists(full) ) {
+ // first delete if exists.
+ try {
+ remove(full);
+ }
+ catch(...) {
+ log() << "warning could not delete file " << full << endl;
+ }
+ }
+ ensureParentDirCreated(full);
+ File f;
+ f.open(full.c_str());
+ massert(13547, str::stream() << "recover couldn't create file " << full, f.is_open());
+ unsigned long long left = _len;
+ const unsigned blksz = 64 * 1024;
+ scoped_array<char> v( new char[blksz] );
+ memset( v.get(), 0, blksz );
+ fileofs ofs = 0;
+ while( left ) {
+ unsigned long long w = left < blksz ? left : blksz;
+ f.write(ofs, v.get(), (unsigned) w);
+ left -= w;
+ ofs += w;
+ }
+ f.fsync();
+ flushMyDirectory(full);
+ massert(13628, str::stream() << "recover failure writing file " << full, !f.bad() );
+ }
+
+ }
+
+}
diff --git a/src/mongo/db/durop.h b/src/mongo/db/durop.h
new file mode 100644
index 00000000000..9ab1bfcbede
--- /dev/null
+++ b/src/mongo/db/durop.h
@@ -0,0 +1,109 @@
+// @file durop.h class DurOp and descendants
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "dur_journalformat.h"
+#include "../util/bufreader.h"
+#include "../util/paths.h"
+
+namespace mongo {
+
+ class AlignedBuilder;
+
+ namespace dur {
+
+ /** DurOp - Operations we journal that aren't just basic writes.
+ *
+ * Basic writes are logged as JEntry's, and indicated in ram temporarily as struct dur::WriteIntent.
+ * We don't make WriteIntent inherit from DurOp to keep it as lean as possible as there will be millions of
+ * them (we don't want a vtable for example there).
+ *
+ * For each op we want to journal, we define a subclass.
+ */
+ class DurOp { /* copyable */
+ public:
+ // @param opcode a sentinel value near max unsigned which uniquely identifies the operation.
+ // @see dur::JEntry
+ DurOp(unsigned opcode) : _opcode(opcode) { }
+
+ virtual ~DurOp() { }
+
+ /** serialize the op out to a builder which will then be written (presumably) to the journal */
+ void serialize(AlignedBuilder& ab);
+
+ /** read a durop from journal file referenced by br.
+ @param opcode the opcode which has already been written from the bufreader
+ */
+ static shared_ptr<DurOp> read(unsigned opcode, BufReader& br);
+
+ /** replay the operation (during recovery)
+ throws
+
+ For now, these are not replayed during the normal WRITETODATAFILES phase, since these
+ operations are handled in other parts of the code. At some point this may change.
+ */
+ virtual void replay() = 0;
+
+ virtual string toString() = 0;
+
+            /** if the op requires all files to be closed before doing its work, returns true. */
+ virtual bool needFilesClosed() { return false; }
+
+ protected:
+ /** DurOp will have already written the opcode for you */
+ virtual void _serialize(AlignedBuilder& ab) = 0;
+
+ private:
+ const unsigned _opcode;
+ };
+
+ /** indicates creation of a new file */
+ class FileCreatedOp : public DurOp {
+ public:
+ FileCreatedOp(BufReader& log);
+            /** @param f filename to create, with path */
+ FileCreatedOp(string f, unsigned long long l);
+ virtual void replay();
+ virtual string toString();
+ virtual bool needFilesClosed();
+ protected:
+ virtual void _serialize(AlignedBuilder& ab);
+ private:
+ RelativePath _p;
+ unsigned long long _len; // size of file, not length of name
+ };
+
+ /** record drop of a database */
+ class DropDbOp : public DurOp {
+ public:
+ DropDbOp(BufReader& log);
+ DropDbOp(string db) :
+ DurOp(JEntry::OpCode_DropDb), _db(db) { }
+ virtual void replay();
+ virtual string toString() { return string("DropDbOp ") + _db; }
+ virtual bool needFilesClosed() { return true; }
+ protected:
+ virtual void _serialize(AlignedBuilder& ab);
+ private:
+ string _db;
+ };
+
+ }
+
+}
diff --git a/src/mongo/db/extsort.cpp b/src/mongo/db/extsort.cpp
new file mode 100644
index 00000000000..06a9756cc0a
--- /dev/null
+++ b/src/mongo/db/extsort.cpp
@@ -0,0 +1,245 @@
+// extsort.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "extsort.h"
+#include "namespace-inl.h"
+#include "../util/file.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+namespace mongo {
+
+ IndexInterface *BSONObjExternalSorter::extSortIdxInterface;
+ Ordering BSONObjExternalSorter::extSortOrder( Ordering::make(BSONObj()) );
+ unsigned long long BSONObjExternalSorter::_compares = 0;
+
+ BSONObjExternalSorter::BSONObjExternalSorter( IndexInterface &i, const BSONObj & order , long maxFileSize )
+ : _idxi(i), _order( order.getOwned() ) , _maxFilesize( maxFileSize ) ,
+ _arraySize(1000000), _cur(0), _curSizeSoFar(0), _sorted(0) {
+
+ stringstream rootpath;
+ rootpath << dbpath;
+ if ( dbpath[dbpath.size()-1] != '/' )
+ rootpath << "/";
+ rootpath << "_tmp/esort." << time(0) << "." << rand() << "/";
+ _root = rootpath.str();
+
+ log(1) << "external sort root: " << _root.string() << endl;
+
+ create_directories( _root );
+ _compares = 0;
+ }
+
+ BSONObjExternalSorter::~BSONObjExternalSorter() {
+ if ( _cur ) {
+ delete _cur;
+ _cur = 0;
+ }
+ unsigned long removed = remove_all( _root );
+ wassert( removed == 1 + _files.size() );
+ }
+
+ void BSONObjExternalSorter::_sortInMem() {
+        // extSortComp needs to use globals
+        // qsort_r only seems to be available on bsd, which is what i'd really want to use
+ dblock l;
+ extSortIdxInterface = &_idxi;
+ extSortOrder = Ordering::make(_order);
+ _cur->sort( BSONObjExternalSorter::extSortComp );
+ }
+
+ void BSONObjExternalSorter::sort() {
+ uassert( 10048 , "already sorted" , ! _sorted );
+
+ _sorted = true;
+
+ if ( _cur && _files.size() == 0 ) {
+ _sortInMem();
+ log(1) << "\t\t not using file. size:" << _curSizeSoFar << " _compares:" << _compares << endl;
+ return;
+ }
+
+ if ( _cur ) {
+ finishMap();
+ }
+
+ if ( _cur ) {
+ delete _cur;
+ _cur = 0;
+ }
+
+ if ( _files.size() == 0 )
+ return;
+
+ }
+
+ void BSONObjExternalSorter::add( const BSONObj& o , const DiskLoc & loc ) {
+ uassert( 10049 , "sorted already" , ! _sorted );
+
+ if ( ! _cur ) {
+ _cur = new InMemory( _arraySize );
+ }
+
+ Data& d = _cur->getNext();
+ d.first = o.getOwned();
+ d.second = loc;
+
+ long size = o.objsize();
+ _curSizeSoFar += size + sizeof( DiskLoc ) + sizeof( BSONObj );
+
+ if ( _cur->hasSpace() == false || _curSizeSoFar > _maxFilesize ) {
+ finishMap();
+ log(1) << "finishing map" << endl;
+ }
+
+ }
+
+ void BSONObjExternalSorter::finishMap() {
+ uassert( 10050 , "bad" , _cur );
+
+ _curSizeSoFar = 0;
+ if ( _cur->size() == 0 )
+ return;
+
+ _sortInMem();
+
+ stringstream ss;
+ ss << _root.string() << "/file." << _files.size();
+ string file = ss.str();
+
+ // todo: it may make sense to fadvise that this not be cached so that building the index doesn't
+ // eject other things the db is using from the file system cache. while we will soon be reading
+ // this back, if it fit in ram, there wouldn't have been a need for an external sort in the first
+ // place.
+
+ ofstream out;
+ out.open( file.c_str() , ios_base::out | ios_base::binary );
+ assertStreamGood( 10051 , (string)"couldn't open file: " + file , out );
+
+ int num = 0;
+ for ( InMemory::iterator i=_cur->begin(); i != _cur->end(); ++i ) {
+ Data p = *i;
+ out.write( p.first.objdata() , p.first.objsize() );
+ out.write( (char*)(&p.second) , sizeof( DiskLoc ) );
+ num++;
+ }
+
+ _cur->clear();
+
+ _files.push_back( file );
+ out.close();
+
+        log(2) << "Added file: " << file << " with " << num << " objects for external sort" << endl;
+ }
+
+ // ---------------------------------
+
+ BSONObjExternalSorter::Iterator::Iterator( BSONObjExternalSorter * sorter ) :
+ _cmp( sorter->_idxi, sorter->_order ) , _in( 0 ) {
+
+ for ( list<string>::iterator i=sorter->_files.begin(); i!=sorter->_files.end(); i++ ) {
+ _files.push_back( new FileIterator( *i ) );
+ _stash.push_back( pair<Data,bool>( Data( BSONObj() , DiskLoc() ) , false ) );
+ }
+
+ if ( _files.size() == 0 && sorter->_cur ) {
+ _in = sorter->_cur;
+ _it = sorter->_cur->begin();
+ }
+ }
+
+ BSONObjExternalSorter::Iterator::~Iterator() {
+ for ( vector<FileIterator*>::iterator i=_files.begin(); i!=_files.end(); i++ )
+ delete *i;
+ _files.clear();
+ }
+
+ bool BSONObjExternalSorter::Iterator::more() {
+
+ if ( _in )
+ return _it != _in->end();
+
+ for ( vector<FileIterator*>::iterator i=_files.begin(); i!=_files.end(); i++ )
+ if ( (*i)->more() )
+ return true;
+ for ( vector< pair<Data,bool> >::iterator i=_stash.begin(); i!=_stash.end(); i++ )
+ if ( i->second )
+ return true;
+ return false;
+ }
+
+ BSONObjExternalSorter::Data BSONObjExternalSorter::Iterator::next() {
+
+ if ( _in ) {
+ Data& d = *_it;
+ ++_it;
+ return d;
+ }
+
+ Data best;
+ int slot = -1;
+
+ for ( unsigned i=0; i<_stash.size(); i++ ) {
+
+ if ( ! _stash[i].second ) {
+ if ( _files[i]->more() )
+ _stash[i] = pair<Data,bool>( _files[i]->next() , true );
+ else
+ continue;
+ }
+
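+                // MyCmp returns true when its first argument sorts before the second,
+                // so a false (0) result here means best does not sort before
+                // _stash[i].first; _stash[i] then becomes the new minimum -- one
+                // selection step of a k-way merge across the spill files.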
+ if ( slot == -1 || _cmp( best , _stash[i].first ) == 0 ) {
+ best = _stash[i].first;
+ slot = i;
+ }
+
+ }
+
+ assert( slot >= 0 );
+ _stash[slot].second = false;
+
+ return best;
+ }
+
+ // -----------------------------------
+
+ BSONObjExternalSorter::FileIterator::FileIterator( string file ) {
+ unsigned long long length;
+ _buf = (char*)_file.map( file.c_str() , length , MemoryMappedFile::SEQUENTIAL );
+ massert( 10308 , "mmap failed" , _buf );
+ assert( length == (unsigned long long) file_size( file ) );
+ _end = _buf + length;
+ }
+ BSONObjExternalSorter::FileIterator::~FileIterator() {}
+
+ bool BSONObjExternalSorter::FileIterator::more() {
+ return _buf < _end;
+ }
+
+ BSONObjExternalSorter::Data BSONObjExternalSorter::FileIterator::next() {
+ BSONObj o( _buf );
+ _buf += o.objsize();
+        DiskLoc * l = (DiskLoc*)_buf;
+        _buf += 8; // sizeof(DiskLoc)
+ return Data( o , *l );
+ }
+
+}
diff --git a/src/mongo/db/extsort.h b/src/mongo/db/extsort.h
new file mode 100644
index 00000000000..15a6d441849
--- /dev/null
+++ b/src/mongo/db/extsort.h
@@ -0,0 +1,150 @@
+// extsort.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "jsobj.h"
+#include "namespace-inl.h"
+#include "curop-inl.h"
+#include "../util/array.h"
+
+namespace mongo {
+
+ /**
+ for external (disk) sorting by BSONObj and attaching a value
+ */
+ class BSONObjExternalSorter : boost::noncopyable {
+ public:
+ BSONObjExternalSorter( IndexInterface &i, const BSONObj & order = BSONObj() , long maxFileSize = 1024 * 1024 * 100 );
+ ~BSONObjExternalSorter();
+ typedef pair<BSONObj,DiskLoc> Data;
+
+ private:
+ IndexInterface& _idxi;
+
+ static int _compare(IndexInterface& i, const Data& l, const Data& r, const Ordering& order) {
+ RARELY killCurrentOp.checkForInterrupt();
+ _compares++;
+ int x = i.keyCompare(l.first, r.first, order);
+ if ( x )
+ return x;
+ return l.second.compare( r.second );
+ }
+
+ class MyCmp {
+ public:
+ MyCmp( IndexInterface& i, BSONObj order = BSONObj() ) : _i(i), _order( Ordering::make(order) ) {}
+ bool operator()( const Data &l, const Data &r ) const {
+ return _compare(_i, l, r, _order) < 0;
+ };
+ private:
+ IndexInterface& _i;
+ const Ordering _order;
+ };
+
+ static IndexInterface *extSortIdxInterface;
+ static Ordering extSortOrder;
+ static int extSortComp( const void *lv, const void *rv ) {
+ DEV RARELY {
+ d.dbMutex.assertWriteLocked(); // must be as we use a global var
+ }
+ Data * l = (Data*)lv;
+ Data * r = (Data*)rv;
+ return _compare(*extSortIdxInterface, *l, *r, extSortOrder);
+ };
+
+ class FileIterator : boost::noncopyable {
+ public:
+ FileIterator( string file );
+ ~FileIterator();
+ bool more();
+ Data next();
+ private:
+ MemoryMappedFile _file;
+ char * _buf;
+ char * _end;
+ };
+
+ public:
+
+ typedef FastArray<Data> InMemory;
+
+ class Iterator : boost::noncopyable {
+ public:
+
+ Iterator( BSONObjExternalSorter * sorter );
+ ~Iterator();
+ bool more();
+ Data next();
+
+ private:
+ MyCmp _cmp;
+ vector<FileIterator*> _files;
+ vector< pair<Data,bool> > _stash;
+
+ InMemory * _in;
+ InMemory::iterator _it;
+
+ };
+
+ void add( const BSONObj& o , const DiskLoc & loc );
+ void add( const BSONObj& o , int a , int b ) {
+ add( o , DiskLoc( a , b ) );
+ }
+
+ /* call after adding values, and before fetching the iterator */
+ void sort();
+
+ auto_ptr<Iterator> iterator() {
+ uassert( 10052 , "not sorted" , _sorted );
+ return auto_ptr<Iterator>( new Iterator( this ) );
+ }
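+
+        // typical usage, sketched (names such as 'idxi' and 'obj' are assumptions):
+        //   BSONObjExternalSorter sorter( idxi, BSON( "a" << 1 ) );
+        //   sorter.add( obj, DiskLoc(0,0) ); // add() spills to _tmp/esort.* when the in-memory buffer fills
+        //   sorter.sort();
+        //   auto_ptr<Iterator> it = sorter.iterator();
+        //   while( it->more() ) { Data d = it->next(); } // yields keys in 'a' order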
+
+ int numFiles() {
+ return _files.size();
+ }
+
+ long getCurSizeSoFar() { return _curSizeSoFar; }
+
+ void hintNumObjects( long long numObjects ) {
+ if ( numObjects < _arraySize )
+ _arraySize = (int)(numObjects + 100);
+ }
+
+ private:
+
+ void _sortInMem();
+
+ void sort( string file );
+ void finishMap();
+
+ BSONObj _order;
+ long _maxFilesize;
+ path _root;
+
+ int _arraySize;
+ InMemory * _cur;
+ long _curSizeSoFar;
+
+ list<string> _files;
+ bool _sorted;
+
+ static unsigned long long _compares;
+ };
+}
diff --git a/src/mongo/db/filever.h b/src/mongo/db/filever.h
new file mode 100644
index 00000000000..e89a8243dcf
--- /dev/null
+++ b/src/mongo/db/filever.h
@@ -0,0 +1,30 @@
+/* filever.h */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ inline void checkDataFileVersion(NamespaceDetails& d) {
+ }
+
+ inline void checkIndexFileVersion(NamespaceDetails& d) {
+ }
+
+}
+
diff --git a/src/mongo/db/flushtest.cpp b/src/mongo/db/flushtest.cpp
new file mode 100644
index 00000000000..2009d922950
--- /dev/null
+++ b/src/mongo/db/flushtest.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include <stdio.h>
+#include "../util/goodies.h"
+#include <fcntl.h>
+
+namespace mongo {
+
+#if defined(F_FULLFSYNC)
+ void fullsync(int f) {
+ fcntl( f, F_FULLFSYNC );
+ }
+#else
+ void fullsync(int f) {
+ fdatasync(f);
+ }
+#endif
+
+ int main(int argc, char* argv[], char *envp[] ) {
+ cout << "hello" << endl;
+
+ FILE *f = fopen("/data/db/temptest", "a");
+
+ if ( f == 0 ) {
+ cout << "can't open file\n";
+ return 1;
+ }
+
+ {
+ Timer t;
+ for ( int i = 0; i < 50000; i++ )
+ fwrite("abc", 3, 1, f);
+ cout << "small writes: " << t.millis() << "ms" << endl;
+ }
+
+ {
+ Timer t;
+ for ( int i = 0; i < 10000; i++ ) {
+ fwrite("abc", 3, 1, f);
+ fflush(f);
+ fsync( fileno( f ) );
+ }
+ int ms = t.millis();
+ cout << "flush: " << ms << "ms, " << ms / 10000.0 << "ms/request" << endl;
+ }
+
+ {
+ Timer t;
+ for ( int i = 0; i < 500; i++ ) {
+ fwrite("abc", 3, 1, f);
+ fflush(f);
+ fsync( fileno( f ) );
+ sleepmillis(2);
+ }
+ int ms = t.millis() - 500 * 2;
+ cout << "flush with sleeps: " << ms << "ms, " << ms / 500.0 << "ms/request" << endl;
+ }
+
+ char buf[8192];
+ for ( int pass = 0; pass < 2; pass++ ) {
+ cout << "pass " << pass << endl;
+ {
+ Timer t;
+ int n = 500;
+ for ( int i = 0; i < n; i++ ) {
+ if ( pass == 0 )
+ fwrite("abc", 3, 1, f);
+ else
+ fwrite(buf, 8192, 1, f);
+ buf[0]++;
+ fflush(f);
+ fullsync(fileno(f));
+ }
+ int ms = t.millis();
+ cout << "fullsync: " << ms << "ms, " << ms / ((double) n) << "ms/request" << endl;
+ }
+
+ {
+ Timer t;
+ for ( int i = 0; i < 500; i++ ) {
+ if ( pass == 0 )
+ fwrite("abc", 3, 1, f);
+ else
+ fwrite(buf, 8192, 1, f);
+ buf[0]++;
+ fflush(f);
+ fullsync(fileno(f));
+ sleepmillis(2);
+ }
+ int ms = t.millis() - 2 * 500;
+ cout << "fullsync with sleeps: " << ms << "ms, " << ms / 500.0 << "ms/request" << endl;
+ }
+ }
+
+ // without growing
+ {
+ fclose(f);
+ /* try from beginning of the file, where we aren't appending and changing the file length,
+ to see if this is faster as the directory entry then doesn't have to be flushed (if noatime in effect).
+ */
+ f = fopen("/data/db/temptest", "r+");
+ Timer t;
+ int n = 500;
+ for ( int i = 0; i < n; i++ ) {
+ fwrite("xyz", 3, 1, f);
+ fflush(f);
+ fullsync(fileno(f));
+ }
+ int ms = t.millis();
+ cout << "fullsync without growing: " << ms << "ms, " << ms / ((double) n) << "ms/request" << endl;
+ }
+
+ // without growing, with delay
+ {
+ fclose(f);
+ /* try from beginning of the file, where we aren't appending and changing the file length,
+ to see if this is faster as the directory entry then doesn't have to be flushed (if noatime in effect).
+ */
+ f = fopen("/data/db/temptest", "r+");
+ Timer t;
+ int n = 500;
+ for ( int i = 0; i < n; i++ ) {
+ fwrite("xyz", 3, 1, f);
+ fflush(f);
+ fullsync(fileno(f));
+ sleepmillis(2);
+ }
+ int ms = t.millis() - 2 * 500;
+ cout << "fullsync without growing with sleeps: " << ms << "ms, " << ms / ((double) n) << "ms/request" << endl;
+ }
+
+ return 0;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/geo/2d.cpp b/src/mongo/db/geo/2d.cpp
new file mode 100644
index 00000000000..f05ce4315b2
--- /dev/null
+++ b/src/mongo/db/geo/2d.cpp
@@ -0,0 +1,3289 @@
+// geo2d.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../namespace-inl.h"
+#include "../jsobj.h"
+#include "../index.h"
+#include "../../util/unittest.h"
+#include "../commands.h"
+#include "../pdfile.h"
+#include "../btree.h"
+#include "../curop-inl.h"
+#include "../matcher.h"
+#include "../queryutil.h"
+#include "core.h"
+#include "../../util/timer.h"
+
+// Note: we use IndexInterface herein to talk to the btree code. In the future it would be nice
+// to be able to use the V1 key class (see key.h) instead of toBson(), which has some cost.
+// toBson() is new with V1, so this path could be slower than it used to be; a quick profiling
+// pass might make sense.
+
+namespace mongo {
+
+ class GeoKeyNode {
+ GeoKeyNode();
+ public:
+ GeoKeyNode( DiskLoc bucket, int keyOfs, DiskLoc r, BSONObj k) : _bucket( bucket ), _keyOfs( keyOfs ), recordLoc(r), _key(k) { }
+ const DiskLoc _bucket;
+ const int _keyOfs;
+ const DiskLoc recordLoc;
+ const BSONObj _key;
+ };
+
+ // just use old indexes for geo for now. todo.
+// typedef BtreeBucket<V0> GeoBtreeBucket;
+// typedef GeoBtreeBucket::KeyNode GeoKeyNode;
+
+//#define BTREE btree<V0>
+
+#if 0
+# define CDEBUG -1
+#else
+# define CDEBUG 10
+#endif
+
+#if 0
+# define GEODEBUGGING
+# define GEODEBUG(x) cout << x << endl;
+# define GEODEBUGPRINT(x) PRINT(x)
+ inline void PREFIXDEBUG(GeoHash prefix, const GeoConvert* g) {
+ if (!prefix.constrains()) {
+ cout << "\t empty prefix" << endl;
+ return ;
+ }
+
+ Point ll (g, prefix); // lower left
+ prefix.move(1,1);
+ Point tr (g, prefix); // top right
+
+ Point center ( (ll._x+tr._x)/2, (ll._y+tr._y)/2 );
+ double radius = fabs(ll._x - tr._x) / 2;
+
+ cout << "\t ll: " << ll.toString() << " tr: " << tr.toString()
+ << " center: " << center.toString() << " radius: " << radius << endl;
+
+ }
+#else
+# define GEODEBUG(x)
+# define GEODEBUGPRINT(x)
+# define PREFIXDEBUG(x, y)
+#endif
+
+ const double EARTH_RADIUS_KM = 6371;
+ const double EARTH_RADIUS_MILES = EARTH_RADIUS_KM * 0.621371192;
+
+ enum GeoDistType {
+ GEO_PLAIN,
+ GEO_SPHERE
+ };
+
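+    // Widens a y (latitude) scan distance into the x (longitude) scan distance needed at
+    // that latitude: a degree of longitude spans roughly cos(latitude) times the ground
+    // distance of a degree of latitude, so we divide by the smallest cosine in the
+    // scanned band (clamped at +/-89 degrees to avoid blowing up near the poles).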
+ inline double computeXScanDistance(double y, double maxDistDegrees) {
+        // TODO: this overestimates for large maxDistDegrees far from the equator
+ return maxDistDegrees / min(cos(deg2rad(min(+89.0, y + maxDistDegrees))),
+ cos(deg2rad(max(-89.0, y - maxDistDegrees))));
+ }
+
+ GeoBitSets geoBitSets;
+
+ const string GEO2DNAME = "2d";
+
+ class Geo2dType : public IndexType , public GeoConvert {
+ public:
+ virtual ~Geo2dType() { }
+
+ Geo2dType( const IndexPlugin * plugin , const IndexSpec* spec )
+ : IndexType( plugin , spec ) {
+
+ BSONObjBuilder orderBuilder;
+
+ BSONObjIterator i( spec->keyPattern );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.type() == String && GEO2DNAME == e.valuestr() ) {
+                    uassert( 13022 , "can't have 2 geo fields" , _geo.size() == 0 );
+ uassert( 13023 , "2d has to be first in index" , _other.size() == 0 );
+ _geo = e.fieldName();
+ }
+ else {
+ _other.push_back( e.fieldName() );
+ }
+ orderBuilder.append( "" , 1 );
+ }
+
+ uassert( 13024 , "no geo field specified" , _geo.size() );
+
+ double bits = _configval( spec , "bits" , 26 ); // for lat/long, ~ 1ft
+
+ uassert( 13028 , "bits in geo index must be between 1 and 32" , bits > 0 && bits <= 32 );
+
+ _bits = (unsigned) bits;
+
+ _max = _configval( spec , "max" , 180.0 );
+ _min = _configval( spec , "min" , -180.0 );
+
+ double numBuckets = (1024 * 1024 * 1024 * 4.0);
+
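+            // numBuckets is 2^32, so _scaling maps a coordinate offset within
+            // [_min, _max) onto the full unsigned 32-bit range (see _convert() below)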
+ _scaling = numBuckets / ( _max - _min );
+
+ _order = orderBuilder.obj();
+
+ GeoHash a(0, 0, _bits);
+ GeoHash b = a;
+ b.move(1, 1);
+
+ // Epsilon is 1/100th of a bucket size
+ // TODO: Can we actually find error bounds for the sqrt function?
+ double epsilon = 0.001 / _scaling;
+ _error = distance(a, b) + epsilon;
+
+ // Error in radians
+ _errorSphere = deg2rad( _error );
+
+ }
+
+ double _configval( const IndexSpec* spec , const string& name , double def ) {
+ BSONElement e = spec->info[name];
+ if ( e.isNumber() ) {
+ return e.numberDouble();
+ }
+ return def;
+ }
+
+ virtual BSONObj fixKey( const BSONObj& in ) {
+ if ( in.firstElement().type() == BinData )
+ return in;
+
+ BSONObjBuilder b(in.objsize()+16);
+
+ if ( in.firstElement().isABSONObj() )
+ _hash( in.firstElement().embeddedObject() ).append( b , "" );
+ else if ( in.firstElement().type() == String )
+ GeoHash( in.firstElement().valuestr() ).append( b , "" );
+ else if ( in.firstElement().type() == RegEx )
+ GeoHash( in.firstElement().regex() ).append( b , "" );
+ else
+ return in;
+
+ BSONObjIterator i(in);
+ i.next();
+ while ( i.more() )
+ b.append( i.next() );
+ return b.obj();
+ }
+
+ /** Finds the key objects to put in an index */
+ virtual void getKeys( const BSONObj& obj, BSONObjSet& keys ) const {
+ getKeys( obj, &keys, NULL );
+ }
+
+ /** Finds all locations in a geo-indexed object */
+ // TODO: Can we just return references to the locs, if they won't change?
+ void getKeys( const BSONObj& obj, vector< BSONObj >& locs ) const {
+ getKeys( obj, NULL, &locs );
+ }
+
+ /** Finds the key objects and/or locations for a geo-indexed object */
+ void getKeys( const BSONObj &obj, BSONObjSet* keys, vector< BSONObj >* locs ) const {
+
+ BSONElementMSet bSet;
+
+ // Get all the nested location fields, but don't return individual elements from
+ // the last array, if it exists.
+ obj.getFieldsDotted(_geo.c_str(), bSet, false);
+
+ if( bSet.empty() )
+ return;
+
+ for( BSONElementMSet::iterator setI = bSet.begin(); setI != bSet.end(); ++setI ) {
+
+ BSONElement geo = *setI;
+
+ GEODEBUG( "Element " << geo << " found for query " << _geo.c_str() );
+
+ if ( geo.eoo() || ! geo.isABSONObj() )
+ continue;
+
+ //
+ // Grammar for location lookup:
+ // locs ::= [loc,loc,...,loc]|{<k>:loc,<k>:loc,...,<k>:loc}|loc
+ // loc ::= { <k1> : #, <k2> : # }|[#, #]|{}
+ //
+ // Empty locations are ignored, preserving single-location semantics
+ //
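+                // For example (with a hypothetical field name "loc"), all of these parse:
+                //   { loc : [ 40.1, -73.9 ] }                        // single location
+                //   { loc : { x : 40.1, y : -73.9 } }                // single location, named fields
+                //   { loc : [ [ 40.1, -73.9 ], [ 41.2, -70.0 ] ] }   // array of locations
+                //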
+
+ BSONObj embed = geo.embeddedObject();
+ if ( embed.isEmpty() )
+ continue;
+
+ // Differentiate between location arrays and locations
+ // by seeing if the first element value is a number
+ bool singleElement = embed.firstElement().isNumber();
+
+ BSONObjIterator oi(embed);
+
+ while( oi.more() ) {
+
+ BSONObj locObj;
+
+ if( singleElement ) locObj = embed;
+ else {
+ BSONElement locElement = oi.next();
+
+ uassert( 13654, str::stream() << "location object expected, location array not in correct format",
+ locElement.isABSONObj() );
+
+ locObj = locElement.embeddedObject();
+
+ if( locObj.isEmpty() )
+ continue;
+ }
+
+ BSONObjBuilder b(64);
+
+ // Remember the actual location object if needed
+ if( locs )
+ locs->push_back( locObj );
+
+ // Stop if we don't need to get anything but location objects
+ if( ! keys ) {
+ if( singleElement ) break;
+ else continue;
+ }
+
+ _hash( locObj ).append( b , "" );
+
+ // Go through all the other index keys
+ for ( vector<string>::const_iterator i = _other.begin(); i != _other.end(); ++i ) {
+
+ // Get *all* fields for the index key
+ BSONElementSet eSet;
+ obj.getFieldsDotted( *i, eSet );
+
+
+ if ( eSet.size() == 0 )
+ b.appendAs( _spec->missingField(), "" );
+ else if ( eSet.size() == 1 )
+ b.appendAs( *(eSet.begin()), "" );
+ else {
+
+ // If we have more than one key, store as an array of the objects
+
+ BSONArrayBuilder aBuilder;
+
+ for( BSONElementSet::iterator ei = eSet.begin(); ei != eSet.end(); ++ei ) {
+ aBuilder.append( *ei );
+ }
+
+ BSONArray arr = aBuilder.arr();
+
+ b.append( "", arr );
+
+ }
+
+ }
+
+ keys->insert( b.obj() );
+
+ if( singleElement ) break;
+
+ }
+ }
+
+ }
+
+ BSONObj _fromBSONHash( const BSONElement& e ) const {
+ return _unhash( _tohash( e ) );
+ }
+
+ BSONObj _fromBSONHash( const BSONObj& o ) const {
+ return _unhash( _tohash( o.firstElement() ) );
+ }
+
+ GeoHash _tohash( const BSONElement& e ) const {
+ if ( e.isABSONObj() )
+ return _hash( e.embeddedObject() );
+
+ return GeoHash( e , _bits );
+ }
+
+ GeoHash _hash( const BSONObj& o ) const {
+ BSONObjIterator i(o);
+ uassert( 13067 , "geo field is empty" , i.more() );
+ BSONElement x = i.next();
+ uassert( 13068 , "geo field only has 1 element" , i.more() );
+ BSONElement y = i.next();
+
+ uassert( 13026 , "geo values have to be numbers: " + o.toString() , x.isNumber() && y.isNumber() );
+
+ return hash( x.number() , y.number() );
+ }
+
+ GeoHash hash( const Point& p ) const {
+ return hash( p._x, p._y );
+ }
+
+ GeoHash hash( double x , double y ) const {
+ return GeoHash( _convert(x), _convert(y) , _bits );
+ }
+
+ BSONObj _unhash( const GeoHash& h ) const {
+ unsigned x , y;
+ h.unhash( x , y );
+ BSONObjBuilder b;
+ b.append( "x" , _unconvert( x ) );
+ b.append( "y" , _unconvert( y ) );
+ return b.obj();
+ }
+
+ unsigned _convert( double in ) const {
+ uassert( 13027 , str::stream() << "point not in interval of [ " << _min << ", " << _max << " )", in < _max && in >= _min );
+ in -= _min;
+ assert( in >= 0 );
+ return (unsigned)(in * _scaling);
+ }
+
+ double _unconvert( unsigned in ) const {
+ double x = in;
+ x /= _scaling;
+ x += _min;
+ return x;
+ }
+
+ void unhash( const GeoHash& h , double& x , double& y ) const {
+ unsigned a,b;
+ h.unhash(a,b);
+ x = _unconvert( a );
+ y = _unconvert( b );
+ }
+
+ double distance( const GeoHash& a , const GeoHash& b ) const {
+ double ax,ay,bx,by;
+ unhash( a , ax , ay );
+ unhash( b , bx , by );
+
+ double dx = bx - ax;
+ double dy = by - ay;
+
+ return sqrt( ( dx * dx ) + ( dy * dy ) );
+ }
+
+ double sizeDiag( const GeoHash& a ) const {
+ GeoHash b = a;
+ b.move( 1 , 1 );
+ return distance( a , b );
+ }
+
+ double sizeEdge( const GeoHash& a ) const {
+
+ if( ! a.constrains() )
+ return _max - _min;
+
+ double ax,ay,bx,by;
+ GeoHash b = a;
+ b.move( 1 , 1 );
+ unhash( a, ax, ay );
+ unhash( b, bx, by );
+
+ // _min and _max are a singularity
+ if (bx == _min)
+ bx = _max;
+
+ return (fabs(ax-bx));
+ }
+
+ const IndexDetails* getDetails() const {
+ return _spec->getDetails();
+ }
+
+ virtual shared_ptr<Cursor> newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const;
+
+ virtual IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const {
+ BSONElement e = query.getFieldDotted(_geo.c_str());
+ switch ( e.type() ) {
+ case Object: {
+ BSONObj sub = e.embeddedObject();
+ switch ( sub.firstElement().getGtLtOp() ) {
+ case BSONObj::opNEAR:
+ case BSONObj::opWITHIN:
+ return OPTIMAL;
+ default:
+                    // We can try to match if there's no other index available;
+                    // the value is assumed to be a point
+ return HELPFUL;
+ }
+ }
+ case Array:
+            // We can try to match if there's no other index available;
+            // the value is assumed to be a point
+ return HELPFUL;
+ default:
+ return USELESS;
+ }
+ }
+
+ string _geo;
+ vector<string> _other;
+
+ unsigned _bits;
+ double _max;
+ double _min;
+ double _scaling;
+
+ BSONObj _order;
+ double _error;
+ double _errorSphere;
+ };
+
+ class Box {
+ public:
+
+ Box( const Geo2dType * g , const GeoHash& hash )
+ : _min( g , hash ) ,
+ _max( _min._x + g->sizeEdge( hash ) , _min._y + g->sizeEdge( hash ) ) {
+ }
+
+ Box( double x , double y , double size )
+ : _min( x , y ) ,
+ _max( x + size , y + size ) {
+ }
+
+ Box( Point min , Point max )
+ : _min( min ) , _max( max ) {
+ }
+
+ Box() {}
+
+ BSONArray toBSON() const {
+ return BSON_ARRAY( BSON_ARRAY( _min._x << _min._y ) << BSON_ARRAY( _max._x << _max._y ) );
+ }
+
+ string toString() const {
+ StringBuilder buf(64);
+ buf << _min.toString() << " -->> " << _max.toString();
+ return buf.str();
+ }
+
+ bool between( double min , double max , double val , double fudge=0) const {
+ return val + fudge >= min && val <= max + fudge;
+ }
+
+ bool onBoundary( double bound, double val, double fudge = 0 ) const {
+ return ( val >= bound - fudge && val <= bound + fudge );
+ }
+
+ bool mid( double amin , double amax , double bmin , double bmax , bool min , double& res ) const {
+ assert( amin <= amax );
+ assert( bmin <= bmax );
+
+ if ( amin < bmin ) {
+ if ( amax < bmin )
+ return false;
+ res = min ? bmin : amax;
+ return true;
+ }
+ if ( amin > bmax )
+ return false;
+ res = min ? amin : bmax;
+ return true;
+ }
+
+ double intersects( const Box& other ) const {
+
+ Point boundMin(0,0);
+ Point boundMax(0,0);
+
+ if ( mid( _min._x , _max._x , other._min._x , other._max._x , true , boundMin._x ) == false ||
+ mid( _min._x , _max._x , other._min._x , other._max._x , false , boundMax._x ) == false ||
+ mid( _min._y , _max._y , other._min._y , other._max._y , true , boundMin._y ) == false ||
+ mid( _min._y , _max._y , other._min._y , other._max._y , false , boundMax._y ) == false )
+ return 0;
+
+ Box intersection( boundMin , boundMax );
+
+ return intersection.area() / area();
+ }
+
+ double area() const {
+ return ( _max._x - _min._x ) * ( _max._y - _min._y );
+ }
+
+ double maxDim() const {
+ return max( _max._x - _min._x, _max._y - _min._y );
+ }
+
+ Point center() const {
+ return Point( ( _min._x + _max._x ) / 2 ,
+ ( _min._y + _max._y ) / 2 );
+ }
+
+ void truncate( const Geo2dType* g ) {
+ if( _min._x < g->_min ) _min._x = g->_min;
+ if( _min._y < g->_min ) _min._y = g->_min;
+ if( _max._x > g->_max ) _max._x = g->_max;
+ if( _max._y > g->_max ) _max._y = g->_max;
+ }
+
+ void fudge( const Geo2dType* g ) {
+ _min._x -= g->_error;
+ _min._y -= g->_error;
+ _max._x += g->_error;
+ _max._y += g->_error;
+ }
+
+ bool onBoundary( Point p, double fudge = 0 ) {
+ return onBoundary( _min._x, p._x, fudge ) ||
+ onBoundary( _max._x, p._x, fudge ) ||
+ onBoundary( _min._y, p._y, fudge ) ||
+ onBoundary( _max._y, p._y, fudge );
+ }
+
+ bool inside( Point p , double fudge = 0 ) {
+ bool res = inside( p._x , p._y , fudge );
+ //cout << "is : " << p.toString() << " in " << toString() << " = " << res << endl;
+ return res;
+ }
+
+ bool inside( double x , double y , double fudge = 0 ) {
+ return
+ between( _min._x , _max._x , x , fudge ) &&
+ between( _min._y , _max._y , y , fudge );
+ }
+
+ bool contains(const Box& other, double fudge=0) {
+ return inside(other._min, fudge) && inside(other._max, fudge);
+ }
+
+ Point _min;
+ Point _max;
+ };
+
+
+ class Polygon {
+ public:
+
+ Polygon( void ) : _centroidCalculated( false ) {}
+
+ Polygon( vector<Point> points ) : _centroidCalculated( false ),
+ _points( points ) { }
+
+ void add( Point p ) {
+ _centroidCalculated = false;
+ _points.push_back( p );
+ }
+
+ int size( void ) const {
+ return _points.size();
+ }
+
+ /**
+ * Determine if the point supplied is contained by the current polygon.
+ *
+ * The algorithm uses a ray casting method.
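+         *
+         * The idea: cast a ray from the point and count how many polygon edges it
+         * crosses; an odd count means the point is inside.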
+ */
+ bool contains( const Point& p ) const {
+ return contains( p, 0 ) > 0;
+ }
+
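+        /**
+         * Returns 1 if the point is inside the polygon (or exactly on an edge or
+         * vertex), -1 if it is outside, and 0 if the point's fudge-sized error box
+         * straddles an edge, meaning an exact document check is needed to decide.
+         */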
+ int contains( const Point &p, double fudge ) const {
+
+ Box fudgeBox( Point( p._x - fudge, p._y - fudge ), Point( p._x + fudge, p._y + fudge ) );
+
+ int counter = 0;
+ Point p1 = _points[0];
+ for ( int i = 1; i <= size(); i++ ) {
+ Point p2 = _points[i % size()];
+
+ GEODEBUG( "Doing intersection check of " << fudgeBox.toString() << " with seg " << p1.toString() << " to " << p2.toString() );
+
+ // We need to check whether or not this segment intersects our error box
+ if( fudge > 0 &&
+ // Points not too far below box
+ fudgeBox._min._y <= std::max( p1._y, p2._y ) &&
+ // Points not too far above box
+ fudgeBox._max._y >= std::min( p1._y, p2._y ) &&
+ // Points not too far to left of box
+ fudgeBox._min._x <= std::max( p1._x, p2._x ) &&
+ // Points not too far to right of box
+ fudgeBox._max._x >= std::min( p1._x, p2._x ) ) {
+
+ GEODEBUG( "Doing detailed check" );
+
+ // If our box contains one or more of these points, we need to do an exact check.
+ if( fudgeBox.inside(p1) ) {
+ GEODEBUG( "Point 1 inside" );
+ return 0;
+ }
+ if( fudgeBox.inside(p2) ) {
+ GEODEBUG( "Point 2 inside" );
+ return 0;
+ }
+
+ // Do intersection check for vertical sides
+ if ( p1._y != p2._y ) {
+
+ double invSlope = ( p2._x - p1._x ) / ( p2._y - p1._y );
+
+ double xintersT = ( fudgeBox._max._y - p1._y ) * invSlope + p1._x;
+ if( fudgeBox._min._x <= xintersT && fudgeBox._max._x >= xintersT ) {
+ GEODEBUG( "Top intersection @ " << xintersT );
+ return 0;
+ }
+
+ double xintersB = ( fudgeBox._min._y - p1._y ) * invSlope + p1._x;
+ if( fudgeBox._min._x <= xintersB && fudgeBox._max._x >= xintersB ) {
+ GEODEBUG( "Bottom intersection @ " << xintersB );
+ return 0;
+ }
+
+ }
+
+ // Do intersection check for horizontal sides
+ if( p1._x != p2._x ) {
+
+ double slope = ( p2._y - p1._y ) / ( p2._x - p1._x );
+
+ double yintersR = ( p1._x - fudgeBox._max._x ) * slope + p1._y;
+ if( fudgeBox._min._y <= yintersR && fudgeBox._max._y >= yintersR ) {
+ GEODEBUG( "Right intersection @ " << yintersR );
+ return 0;
+ }
+
+ double yintersL = ( p1._x - fudgeBox._min._x ) * slope + p1._y;
+ if( fudgeBox._min._y <= yintersL && fudgeBox._max._y >= yintersL ) {
+ GEODEBUG( "Left intersection @ " << yintersL );
+ return 0;
+ }
+
+ }
+
+ }
+ else if( fudge == 0 ){
+
+ // If this is an exact vertex, we won't intersect, so check this
+ if( p._y == p1._y && p._x == p1._x ) return 1;
+ else if( p._y == p2._y && p._x == p2._x ) return 1;
+
+ // If this is a horizontal line we won't intersect, so check this
+ if( p1._y == p2._y && p._y == p1._y ){
+ // Check that the x-coord lies in the line
+ if( p._x >= std::min( p1._x, p2._x ) && p._x <= std::max( p1._x, p2._x ) ) return 1;
+ }
+
+ }
+
+ // Normal intersection test.
+ // TODO: Invert these for clearer logic?
+ if ( p._y > std::min( p1._y, p2._y ) ) {
+ if ( p._y <= std::max( p1._y, p2._y ) ) {
+ if ( p._x <= std::max( p1._x, p2._x ) ) {
+ if ( p1._y != p2._y ) {
+ double xinters = (p._y-p1._y)*(p2._x-p1._x)/(p2._y-p1._y)+p1._x;
+ // Special case of point on vertical line
+ if ( p1._x == p2._x && p._x == p1._x ){
+
+ // Need special case for the vertical edges, for example:
+ // 1) \e pe/----->
+ // vs.
+ // 2) \ep---e/----->
+ //
+ // if we count exact as intersection, then 1 is in but 2 is out
+                                // if we count exact as no intersection, then 1 is out but 2 is in.
+
+ return 1;
+ }
+ else if( p1._x == p2._x || p._x <= xinters ) {
+ counter++;
+ }
+ }
+ }
+ }
+ }
+
+ p1 = p2;
+ }
+
+ if ( counter % 2 == 0 ) {
+ return -1;
+ }
+ else {
+ return 1;
+ }
+ }
+
+ /**
+ * Calculate the centroid, or center of mass of the polygon object.
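+         *
+         * This is the standard shoelace-based formula: with cross terms
+         * a_i = x_i * y_{i+1} - x_{i+1} * y_i and signed area A = (1/2) * sum(a_i),
+         * Cx = sum( (x_i + x_{i+1}) * a_i ) / (6A) and
+         * Cy = sum( (y_i + y_{i+1}) * a_i ) / (6A).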
+ */
+ Point centroid( void ) {
+
+            /* The centroid is cached; add() invalidates it when the points change */
+ if ( _centroidCalculated ) {
+ return _centroid;
+ }
+
+ Point cent;
+ double signedArea = 0.0;
+ double area = 0.0; // Partial signed area
+
+ /// For all vertices except last
+ int i = 0;
+ for ( i = 0; i < size() - 1; ++i ) {
+ area = _points[i]._x * _points[i+1]._y - _points[i+1]._x * _points[i]._y ;
+ signedArea += area;
+ cent._x += ( _points[i]._x + _points[i+1]._x ) * area;
+ cent._y += ( _points[i]._y + _points[i+1]._y ) * area;
+ }
+
+ // Do last vertex
+ area = _points[i]._x * _points[0]._y - _points[0]._x * _points[i]._y;
+ cent._x += ( _points[i]._x + _points[0]._x ) * area;
+ cent._y += ( _points[i]._y + _points[0]._y ) * area;
+ signedArea += area;
+ signedArea *= 0.5;
+ cent._x /= ( 6 * signedArea );
+ cent._y /= ( 6 * signedArea );
+
+ _centroidCalculated = true;
+ _centroid = cent;
+
+ return cent;
+ }
+
+ Box bounds( void ) {
+
+ // TODO: Cache this
+
+ _bounds._max = _points[0];
+ _bounds._min = _points[0];
+
+ for ( int i = 1; i < size(); i++ ) {
+
+ _bounds._max._x = max( _bounds._max._x, _points[i]._x );
+ _bounds._max._y = max( _bounds._max._y, _points[i]._y );
+ _bounds._min._x = min( _bounds._min._x, _points[i]._x );
+ _bounds._min._y = min( _bounds._min._y, _points[i]._y );
+
+ }
+
+ return _bounds;
+
+ }
+
+ private:
+
+ bool _centroidCalculated;
+ Point _centroid;
+
+ Box _bounds;
+
+ vector<Point> _points;
+ };
+
+ class Geo2dPlugin : public IndexPlugin {
+ public:
+ Geo2dPlugin() : IndexPlugin( GEO2DNAME ) {
+ }
+
+ virtual IndexType* generate( const IndexSpec* spec ) const {
+ return new Geo2dType( this , spec );
+ }
+ } geo2dplugin;
+
+ void __forceLinkGeoPlugin() {
+ geo2dplugin.getName();
+ }
+
+
+
+ class GeoHopper;
+
+ class GeoPoint {
+ public:
+
+ GeoPoint() : _distance( -1 ), _exact( false ), _dirty( false )
+ {}
+
+ //// Distance not used ////
+
+ GeoPoint( const GeoKeyNode& node )
+ : _key( node._key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ), _distance( -1 ) , _exact( false ), _dirty( false ), _bucket( node._bucket ), _pos( node._keyOfs ) {
+ }
+
+ //// Immediate initialization of distance ////
+
+ GeoPoint( const GeoKeyNode& node, double distance, bool exact )
+ : _key( node._key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ), _distance( distance ), _exact( exact ), _dirty( false ) {
+ }
+
+ GeoPoint( const GeoPoint& pt, double distance, bool exact )
+ : _key( pt.key() ) , _loc( pt.loc() ) , _o( pt.obj() ), _distance( distance ), _exact( exact ), _dirty( false ) {
+ }
+
+ bool operator<( const GeoPoint& other ) const {
+ if( _distance != other._distance ) return _distance < other._distance;
+ if( _exact != other._exact ) return _exact < other._exact;
+ return _loc < other._loc;
+ }
+
+ double distance() const {
+ return _distance;
+ }
+
+ bool isExact() const {
+ return _exact;
+ }
+
+ BSONObj key() const {
+ return _key;
+ }
+
+        bool hasLoc() const {
+            return ! _loc.isNull();
+        }
+
+ DiskLoc loc() const {
+ assert( ! _dirty );
+ return _loc;
+ }
+
+ BSONObj obj() const {
+ return _o;
+ }
+
+ BSONObj pt() const {
+ return _pt;
+ }
+
+ bool isEmpty() {
+ return _o.isEmpty();
+ }
+
+ bool isCleanAndEmpty() {
+ return isEmpty() && ! isDirty();
+ }
+
+ string toString() const {
+ return str::stream() << "Point from " << _key << " - " << _o << " dist : " << _distance << ( _exact ? " (ex)" : " (app)" );
+ }
+
+
+ // TODO: Recover from yield by finding all the changed disk locs here, modifying the _seenPts array.
+ // Not sure yet the correct thing to do about _seen.
+ // Definitely need to re-find our current max/min locations too
+ bool unDirty( const Geo2dType* g, DiskLoc& oldLoc ){
+
+ assert( _dirty );
+ assert( ! _id.isEmpty() );
+
+ oldLoc = _loc;
+ _loc = DiskLoc();
+
+ // Fast undirty
+ IndexInterface& ii = g->getDetails()->idxInterface();
+ // Check this position and the one immediately preceding
+ for( int i = 0; i < 2; i++ ){
+ if( _pos - i < 0 ) continue;
+
+ // log() << "bucket : " << _bucket << " pos " << _pos << endl;
+
+ BSONObj key;
+ DiskLoc loc;
+ ii.keyAt( _bucket, _pos - i, key, loc );
+
+ // log() << "Loc: " << loc << " Key : " << key << endl;
+
+ if( loc.isNull() ) continue;
+
+ if( key.binaryEqual( _key ) && loc.obj()["_id"].wrap( "" ).binaryEqual( _id ) ){
+ _pos = _pos - i;
+ _loc = loc;
+ _dirty = false;
+ _o = loc.obj();
+ return true;
+ }
+ }
+
+ // Slow undirty
+ scoped_ptr<BtreeCursor> cursor( BtreeCursor::make( nsdetails( g->getDetails()->parentNS().c_str() ),
+ *( g->getDetails() ), _key, _key, true, 1 ) );
+
+ int count = 0;
+ while( cursor->ok() ){
+ count++;
+ if( cursor->current()["_id"].wrap( "" ).binaryEqual( _id ) ){
+ _bucket = cursor->getBucket();
+ _pos = cursor->getKeyOfs();
+ _loc = cursor->currLoc();
+ _o = _loc.obj();
+ break;
+ }
+ else{
+ LOG( CDEBUG + 1 ) << "Key doesn't match : " << cursor->current()["_id"] << " saved : " << _id << endl;
+ }
+ cursor->advance();
+ }
+
+ if( ! count ) { LOG( CDEBUG ) << "No key found for " << _key << endl; }
+
+ _dirty = false;
+
+ return _loc == oldLoc;
+ }
+
+ bool isDirty(){
+ return _dirty;
+ }
+
+ bool makeDirty(){
+ if( ! _dirty ){
+ assert( ! obj()["_id"].eoo() );
+ assert( ! _bucket.isNull() );
+ assert( _pos >= 0 );
+
+ if( _id.isEmpty() ){
+ _id = obj()["_id"].wrap( "" ).getOwned();
+ }
+ _o = BSONObj();
+ _key = _key.getOwned();
+ _pt = _pt.getOwned();
+ _dirty = true;
+
+ return true;
+ }
+
+ return false;
+ }
+
+ BSONObj _key;
+ DiskLoc _loc;
+ BSONObj _o;
+ BSONObj _pt;
+
+ double _distance;
+ bool _exact;
+
+ BSONObj _id;
+ bool _dirty;
+ DiskLoc _bucket;
+ int _pos;
+ };
+
+ // GeoBrowse subclasses this
+ class GeoAccumulator {
+ public:
+ GeoAccumulator( const Geo2dType * g , const BSONObj& filter, bool uniqueDocs, bool needDistance )
+ : _g(g) ,
+ _lookedAt(0) ,
+ _matchesPerfd(0) ,
+ _objectsLoaded(0) ,
+ _pointsLoaded(0) ,
+ _found(0) ,
+ _uniqueDocs( uniqueDocs ) ,
+ _needDistance( needDistance )
+ {
+ if ( ! filter.isEmpty() ) {
+ _matcher.reset( new CoveredIndexMatcher( filter , g->keyPattern() ) );
+ GEODEBUG( "Matcher is now " << _matcher->docMatcher().toString() );
+ }
+ }
+
+ virtual ~GeoAccumulator() { }
+
+ enum KeyResult { BAD, BORDER, GOOD };
+
+ virtual void add( const GeoKeyNode& node ) {
+
+ GEODEBUG( "\t\t\t\t checking key " << node._key.toString() )
+
+ _lookedAt++;
+
+ ////
+ // Approximate distance check using key data
+ ////
+ double keyD = 0;
+ Point keyP( _g, GeoHash( node._key.firstElement(), _g->_bits ) );
+ KeyResult keyOk = approxKeyCheck( keyP, keyD );
+ if ( keyOk == BAD ) {
+ GEODEBUG( "\t\t\t\t bad distance : " << node.recordLoc.obj() << "\t" << keyD );
+ return;
+ }
+ GEODEBUG( "\t\t\t\t good distance : " << node.recordLoc.obj() << "\t" << keyD );
+
+ ////
+ // Check for match using other key (and potentially doc) criteria
+ ////
+ // Remember match results for each object
+ map<DiskLoc, bool>::iterator match = _matched.find( node.recordLoc );
+ bool newDoc = match == _matched.end();
+ if( newDoc ) {
+
+ GEODEBUG( "\t\t\t\t matching new doc with " << (_matcher ? _matcher->docMatcher().toString() : "(empty)" ) );
+
+ // matcher
+ MatchDetails details;
+ if ( _matcher.get() ) {
+ bool good = _matcher->matchesWithSingleKeyIndex( node._key , node.recordLoc , &details );
+
+ _matchesPerfd++;
+
+ if ( details._loadedObject )
+ _objectsLoaded++;
+
+ if ( ! good ) {
+ GEODEBUG( "\t\t\t\t didn't match : " << node.recordLoc.obj()["_id"] );
+ _matched[ node.recordLoc ] = false;
+ return;
+ }
+ }
+
+ _matched[ node.recordLoc ] = true;
+
+ if ( ! details._loadedObject ) // don't double count
+ _objectsLoaded++;
+
+ }
+ else if( !((*match).second) ) {
+ GEODEBUG( "\t\t\t\t previously didn't match : " << node.recordLoc.obj()["_id"] );
+ return;
+ }
+
+ ////
+ // Exact check with particular data fields
+ ////
+ // Can add multiple points
+ int diff = addSpecific( node , keyP, keyOk == BORDER, keyD, newDoc );
+ if( diff > 0 ) _found += diff;
+ else _found -= -diff;
+
+ }
+
+ virtual void getPointsFor( const BSONObj& key, const BSONObj& obj, vector< BSONObj >& locsForNode, bool allPoints = false ){
+
+ // Find all the location objects from the keys
+ vector< BSONObj > locs;
+ _g->getKeys( obj, allPoints ? locsForNode : locs );
+ _pointsLoaded++;
+
+ if( allPoints ) return;
+ if( locs.size() == 1 ){
+ locsForNode.push_back( locs[0] );
+ return;
+ }
+
+ // Find the particular location we want
+ GeoHash keyHash( key.firstElement(), _g->_bits );
+
+ // log() << "Hash: " << node.key << " and " << keyHash.getHash() << " unique " << _uniqueDocs << endl;
+ for( vector< BSONObj >::iterator i = locs.begin(); i != locs.end(); ++i ) {
+
+ // Ignore all locations not hashed to the key's hash, since we may see
+ // those later
+ if( _g->_hash( *i ) != keyHash ) continue;
+
+ locsForNode.push_back( *i );
+
+ }
+
+ }
+
+ virtual int addSpecific( const GeoKeyNode& node, const Point& p , bool inBounds, double d, bool newDoc ) = 0;
+ virtual KeyResult approxKeyCheck( const Point& p , double& keyD ) = 0;
+ virtual bool exactDocCheck( const Point& p , double& d ) = 0;
+ virtual bool expensiveExactCheck(){ return false; }
+
+
+ long long found() const {
+ return _found;
+ }
+
+ const Geo2dType * _g;
+ map<DiskLoc, bool> _matched;
+ shared_ptr<CoveredIndexMatcher> _matcher;
+
+ long long _lookedAt;
+ long long _matchesPerfd;
+ long long _objectsLoaded;
+ long long _pointsLoaded;
+ long long _found;
+
+ bool _uniqueDocs;
+ bool _needDistance;
+
+ };
+
+
+ struct BtreeLocation {
+ BtreeLocation() { }
+
+ scoped_ptr<BtreeCursor> _cursor;
+ scoped_ptr<FieldRangeSet> _frs;
+ scoped_ptr<IndexSpec> _spec;
+
+ BSONObj key() {
+ return _cursor->currKey();
+ }
+
+ bool hasPrefix( const GeoHash& hash ) {
+ BSONObj k = key();
+ BSONElement e = k.firstElement();
+ if ( e.eoo() )
+ return false;
+ return GeoHash( e ).hasPrefix( hash );
+ }
+
+ bool checkAndAdvance( const GeoHash& hash, int& totalFound, GeoAccumulator* all ){
+ if( ! _cursor->ok() || ! hasPrefix( hash ) ) return false;
+
+ if( all ){
+ totalFound++;
+ GeoKeyNode n( _cursor->getBucket(), _cursor->getKeyOfs(), _cursor->currLoc(), _cursor->currKey() );
+ all->add( n );
+ }
+ _cursor->advance();
+
+ return true;
+ }
+
+ void save(){
+ _cursor->noteLocation();
+ }
+
+ void restore(){
+ _cursor->checkLocation();
+ }
+
+ string toString() {
+ stringstream ss;
+ ss << "bucket: " << _cursor->getBucket().toString() << " pos: " << _cursor->getKeyOfs() <<
+ ( _cursor->ok() ? ( str::stream() << " k: " << _cursor->currKey() << " o : " << _cursor->current()["_id"] ) : (string)"[none]" ) << endl;
+ return ss.str();
+ }
+
+ // Returns the min and max keys which bound a particular location.
+        // The only time these may be equal is when a key exactly equals the location
+        // itself; otherwise our expanding algorithm will fail.
+ static bool initial( const IndexDetails& id , const Geo2dType * spec ,
+ BtreeLocation& min , BtreeLocation& max ,
+ GeoHash start ,
+ int & found , GeoAccumulator * hopper ) {
+
+ //Ordering ordering = Ordering::make(spec->_order);
+
+ // Would be nice to build this directly, but bug in max/min queries SERVER-3766 and lack of interface
+ // makes this easiest for now.
+ BSONObj minQuery = BSON( spec->_geo << BSON( "$gt" << MINKEY << start.wrap( "$lte" ).firstElement() ) );
+ BSONObj maxQuery = BSON( spec->_geo << BSON( "$lt" << MAXKEY << start.wrap( "$gt" ).firstElement() ) );
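+        // For illustration (with a hypothetical geo field "loc" and h = start):
+        //   minQuery: { loc : { $gt : MinKey, $lte : h } }
+        //   maxQuery: { loc : { $lt : MaxKey, $gt : h } }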
+
+ // log() << "MinQuery: " << minQuery << endl;
+ // log() << "MaxQuery: " << maxQuery << endl;
+
+ min._frs.reset( new FieldRangeSet( spec->getDetails()->parentNS().c_str(),
+ minQuery,
+ true,
+ false ) );
+
+ max._frs.reset( new FieldRangeSet( spec->getDetails()->parentNS().c_str(),
+ maxQuery,
+ true,
+ false ) );
+
+
+ BSONObjBuilder bob;
+ bob.append( spec->_geo, 1 );
+ for( vector<string>::const_iterator i = spec->_other.begin(); i != spec->_other.end(); i++ ){
+ bob.append( *i, 1 );
+ }
+ BSONObj iSpec = bob.obj();
+
+ min._spec.reset( new IndexSpec( iSpec ) );
+ max._spec.reset( new IndexSpec( iSpec ) );
+
+ shared_ptr<FieldRangeVector> frvMin( new FieldRangeVector( *(min._frs), *(min._spec), -1 ) );
+ shared_ptr<FieldRangeVector> frvMax( new FieldRangeVector( *(max._frs), *(max._spec), 1 ) );
+
+ min._cursor.reset(
+ BtreeCursor::make( nsdetails( spec->getDetails()->parentNS().c_str() ), *( spec->getDetails() ),
+ frvMin, -1 )
+ );
+
+ max._cursor.reset(
+ BtreeCursor::make( nsdetails( spec->getDetails()->parentNS().c_str() ), *( spec->getDetails() ),
+ frvMax, 1 )
+ );
+
+ // if( hopper ) min.checkCur( found, hopper );
+ // if( hopper ) max.checkCur( found, hopper );
+
+ return min._cursor->ok() || max._cursor->ok();
+ }
+ };
+
+
+ class GeoCursorBase : public Cursor {
+ public:
+
+ static const shared_ptr< CoveredIndexMatcher > emptyMatcher;
+
+ GeoCursorBase( const Geo2dType * spec )
+ : _spec( spec ), _id( _spec->getDetails() ) {
+
+ }
+
+ virtual DiskLoc refLoc() { return DiskLoc(); }
+
+ virtual BSONObj indexKeyPattern() {
+ return _spec->keyPattern();
+ }
+
+ virtual void noteLocation() {
+ // no-op since these are meant to be safe
+ }
+
+ /* called before query getmore block is iterated */
+ virtual void checkLocation() {
+ // no-op since these are meant to be safe
+ }
+
+ virtual bool supportGetMore() { return false; }
+ virtual bool supportYields() { return false; }
+
+ virtual bool getsetdup(DiskLoc loc) { return false; }
+ virtual bool modifiedKeys() const { return true; }
+ virtual bool isMultiKey() const { return false; }
+
+ virtual bool autoDedup() const { return false; }
+
+ const Geo2dType * _spec;
+ const IndexDetails * _id;
+ };
+
+ const shared_ptr< CoveredIndexMatcher > GeoCursorBase::emptyMatcher( new CoveredIndexMatcher( BSONObj(), BSONObj(), false ) );
+
+ // TODO: Pull out the cursor bit from the browse, have GeoBrowse as field of cursor to clean up
+ // this hierarchy a bit. Also probably useful to look at whether GeoAccumulator can be a member instead
+ // of a superclass.
+ class GeoBrowse : public GeoCursorBase , public GeoAccumulator {
+ public:
+
+ // The max points which should be added to an expanding box at one time
+ static const int maxPointsHeuristic = 50;
+
+ // Expand states
+ enum State {
+ START ,
+ DOING_EXPAND ,
+ DONE_NEIGHBOR ,
+ DONE
+ } _state;
+
+ GeoBrowse( const Geo2dType * g , string type , BSONObj filter = BSONObj(), bool uniqueDocs = true, bool needDistance = false )
+ : GeoCursorBase( g ), GeoAccumulator( g , filter, uniqueDocs, needDistance ) ,
+ _type( type ) , _filter( filter ) , _firstCall(true), _noted( false ), _nscanned(), _nDirtied(0), _nChangedOnYield(0), _nRemovedOnYield(0), _centerPrefix(0, 0, 0) {
+
+ // Set up the initial expand state
+ _state = START;
+ _neighbor = -1;
+ _foundInExp = 0;
+
+ }
+
+ virtual string toString() {
+ return (string)"GeoBrowse-" + _type;
+ }
+
+ virtual bool ok() {
+
+ bool filled = false;
+
+ LOG( CDEBUG ) << "Checking cursor, in state " << (int) _state << ", first call " << _firstCall <<
+ ", empty : " << _cur.isEmpty() << ", dirty : " << _cur.isDirty() << ", stack : " << _stack.size() << endl;
+
+ bool first = _firstCall;
+ if ( _firstCall ) {
+ fillStack( maxPointsHeuristic );
+ filled = true;
+ _firstCall = false;
+ }
+ if ( ! _cur.isCleanAndEmpty() || _stack.size() ) {
+ if ( first ) {
+ ++_nscanned;
+ }
+
+ if( _noted && filled ) noteLocation();
+ return true;
+ }
+
+ while ( moreToDo() ) {
+
+ LOG( CDEBUG ) << "Refilling stack..." << endl;
+
+ fillStack( maxPointsHeuristic );
+ filled = true;
+
+ if ( ! _cur.isCleanAndEmpty() ) {
+ if ( first ) {
+ ++_nscanned;
+ }
+
+ if( _noted && filled ) noteLocation();
+ return true;
+ }
+ }
+
+ if( _noted && filled ) noteLocation();
+ return false;
+ }
+
+ virtual bool advance() {
+ _cur._o = BSONObj();
+
+ if ( _stack.size() ) {
+ _cur = _stack.front();
+ _stack.pop_front();
+ ++_nscanned;
+ return true;
+ }
+
+ if ( ! moreToDo() )
+ return false;
+
+ bool filled = false;
+ while ( _cur.isCleanAndEmpty() && moreToDo() ){
+ fillStack( maxPointsHeuristic );
+ filled = true;
+ }
+
+ if( _noted && filled ) noteLocation();
+ return ! _cur.isCleanAndEmpty() && ++_nscanned;
+ }
+
+ virtual void noteLocation() {
+ _noted = true;
+
+ LOG( CDEBUG ) << "Noting location with " << _stack.size() << ( _cur.isEmpty() ? "" : " + 1 " ) << " points " << endl;
+
+ // Make sure we advance past the point we're at now,
+ // since the current location may move on an update/delete
+ // if( _state == DOING_EXPAND ){
+ // if( _min.hasPrefix( _prefix ) ){ _min.advance( -1, _foundInExp, this ); }
+ // if( _max.hasPrefix( _prefix ) ){ _max.advance( 1, _foundInExp, this ); }
+ // }
+
+ // Remember where our _max, _min are
+ _min.save();
+ _max.save();
+
+ LOG( CDEBUG ) << "Min " << _min.toString() << endl;
+ LOG( CDEBUG ) << "Max " << _max.toString() << endl;
+
+ // Dirty all our queued stuff
+ for( list<GeoPoint>::iterator i = _stack.begin(); i != _stack.end(); i++ ){
+
+                LOG( CDEBUG ) << "Dirtying stack point with id " << i->_id << endl;
+
+ if( i->makeDirty() ) _nDirtied++;
+ assert( i->isDirty() );
+ }
+
+ // Check current item
+ if( ! _cur.isEmpty() ){
+ if( _cur.makeDirty() ) _nDirtied++;
+ }
+
+ // Our cached matches become invalid now
+ _matched.clear();
+ }
+
+ void fixMatches( DiskLoc oldLoc, DiskLoc newLoc ){
+ map<DiskLoc, bool>::iterator match = _matched.find( oldLoc );
+ if( match != _matched.end() ){
+ bool val = match->second;
+ _matched.erase( oldLoc );
+ _matched[ newLoc ] = val;
+ }
+ }
+
+ /* called before query getmore block is iterated */
+ virtual void checkLocation() {
+
+ LOG( CDEBUG ) << "Restoring location with " << _stack.size() << ( ! _cur.isDirty() ? "" : " + 1 " ) << " points " << endl;
+
+ // We can assume an error was thrown earlier if this database somehow disappears
+
+ // Recall our _max, _min
+ _min.restore();
+ _max.restore();
+
+ LOG( CDEBUG ) << "Min " << _min.toString() << endl;
+ LOG( CDEBUG ) << "Max " << _max.toString() << endl;
+
+ // If the current key moved, we may have been advanced past the current point - need to check this
+ // if( _state == DOING_EXPAND ){
+ // if( _min.hasPrefix( _prefix ) ){ _min.advance( -1, _foundInExp, this ); }
+ // if( _max.hasPrefix( _prefix ) ){ _max.advance( 1, _foundInExp, this ); }
+ //}
+
+            // Undirty all the queued stuff
+ list<GeoPoint>::iterator i = _stack.begin();
+ while( i != _stack.end() ){
+
+ LOG( CDEBUG ) << "Undirtying stack point with id " << i->_id << endl;
+
+ DiskLoc oldLoc;
+ if( i->unDirty( _spec, oldLoc ) ){
+ // Document is in same location
+ LOG( CDEBUG ) << "Undirtied " << oldLoc << endl;
+
+ i++;
+ }
+ else if( ! i->loc().isNull() ){
+
+ // Re-found document somewhere else
+ LOG( CDEBUG ) << "Changed location of " << i->_id << " : " << i->loc() << " vs " << oldLoc << endl;
+
+ _nChangedOnYield++;
+ fixMatches( oldLoc, i->loc() );
+ i++;
+ }
+ else {
+
+ // Can't re-find document
+ LOG( CDEBUG ) << "Removing document " << i->_id << endl;
+
+ _nRemovedOnYield++;
+ _found--;
+ assert( _found >= 0 );
+
+ // Can't find our key again, remove
+ i = _stack.erase( i );
+ }
+ }
+
+ if( _cur.isDirty() ){
+ LOG( CDEBUG ) << "Undirtying cur point with id : " << _cur._id << endl;
+ }
+
+ // Check current item
+ DiskLoc oldLoc;
+ if( _cur.isDirty() && ! _cur.unDirty( _spec, oldLoc ) ){
+ if( _cur.loc().isNull() ){
+
+ // Document disappeared!
+ LOG( CDEBUG ) << "Removing cur point " << _cur._id << endl;
+
+ _nRemovedOnYield++;
+ advance();
+ }
+ else{
+
+ // Document moved
+ LOG( CDEBUG ) << "Changed location of cur point " << _cur._id << " : " << _cur.loc() << " vs " << oldLoc << endl;
+
+ _nChangedOnYield++;
+ fixMatches( oldLoc, _cur.loc() );
+ }
+ }
+
+ _noted = false;
+ }
+
+ virtual Record* _current() { assert(ok()); LOG( CDEBUG + 1 ) << "_current " << _cur._loc.obj()["_id"] << endl; return _cur._loc.rec(); }
+ virtual BSONObj current() { assert(ok()); LOG( CDEBUG + 1 ) << "current " << _cur._o << endl; return _cur._o; }
+ virtual DiskLoc currLoc() { assert(ok()); LOG( CDEBUG + 1 ) << "currLoc " << _cur._loc << endl; return _cur._loc; }
+ virtual BSONObj currKey() const { return _cur._key; }
+
+ virtual CoveredIndexMatcher* matcher() const {
+ if( _matcher.get() ) return _matcher.get();
+ else return GeoCursorBase::emptyMatcher.get();
+ }
+
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const {
+ if( _matcher.get() ) return _matcher;
+ else return GeoCursorBase::emptyMatcher;
+ }
+
+ // Are we finished getting points?
+ virtual bool moreToDo() {
+ return _state != DONE;
+ }
+
+ virtual bool supportGetMore() { return true; }
+
+ // Fills the stack, but only checks a maximum number of maxToCheck points at a time.
+ // Further calls to this function will continue the expand/check neighbors algorithm.
+ virtual void fillStack( int maxToCheck, int maxToAdd = -1, bool onlyExpand = false ) {
+
+#ifdef GEODEBUGGING
+ log() << "Filling stack with maximum of " << maxToCheck << ", state : " << (int) _state << endl;
+#endif
+
+ if( maxToAdd < 0 ) maxToAdd = maxToCheck;
+ int maxFound = _foundInExp + maxToCheck;
+ assert( maxToCheck > 0 );
+ assert( maxFound > 0 );
+ assert( _found <= 0x7fffffff ); // conversion to int
+ int maxAdded = static_cast<int>(_found) + maxToAdd;
+ assert( maxAdded >= 0 ); // overflow check
+
+ bool isNeighbor = _centerPrefix.constrains();
+
+ // Starting a box expansion
+ if ( _state == START ) {
+
+ // Get the very first hash point, if required
+ if( ! isNeighbor )
+ _prefix = expandStartHash();
+
+ GEODEBUG( "initializing btree" );
+
+#ifdef GEODEBUGGING
+ log() << "Initializing from b-tree with hash of " << _prefix << " @ " << Box( _g, _prefix ) << endl;
+#endif
+
+ if ( ! BtreeLocation::initial( *_id , _spec , _min , _max , _prefix , _foundInExp , this ) )
+ _state = isNeighbor ? DONE_NEIGHBOR : DONE;
+ else {
+ _state = DOING_EXPAND;
+ _lastPrefix.reset();
+ }
+
+            GEODEBUG( (_state == DONE_NEIGHBOR || _state == DONE ? "not initialized" : "initialized") );
+
+ }
+
+ // Doing the actual box expansion
+ if ( _state == DOING_EXPAND ) {
+
+ while ( true ) {
+
+ GEODEBUG( "box prefix [" << _prefix << "]" );
+#ifdef GEODEBUGGING
+ if( _prefix.constrains() ) {
+ log() << "current expand box : " << Box( _g, _prefix ).toString() << endl;
+ }
+ else {
+ log() << "max expand box." << endl;
+ }
+#endif
+
+ GEODEBUG( "expanding box points... ");
+
+ // Record the prefix we're actively exploring...
+ _expPrefix.reset( new GeoHash( _prefix ) );
+
+ // Find points inside this prefix
+ while ( _min.checkAndAdvance( _prefix, _foundInExp, this ) && _foundInExp < maxFound && _found < maxAdded );
+ while ( _max.checkAndAdvance( _prefix, _foundInExp, this ) && _foundInExp < maxFound && _found < maxAdded );
+
+#ifdef GEODEBUGGING
+
+ log() << "finished expand, checked : " << ( maxToCheck - ( maxFound - _foundInExp ) )
+ << " found : " << ( maxToAdd - ( maxAdded - _found ) )
+ << " max : " << maxToCheck << " / " << maxToAdd << endl;
+
+#endif
+
+ GEODEBUG( "finished expand, found : " << ( maxToAdd - ( maxAdded - _found ) ) );
+ if( _foundInExp >= maxFound || _found >= maxAdded ) return;
+
+ // We've searched this prefix fully, remember
+ _lastPrefix.reset( new GeoHash( _prefix ));
+
+ // If we've searched the entire space, we're finished.
+ if ( ! _prefix.constrains() ) {
+ GEODEBUG( "box exhausted" );
+ _state = DONE;
+ notePrefix();
+ return;
+ }
+
+ // If we won't fit in the box, and we're not doing a sub-scan, increase the size
+ if ( ! fitsInBox( _g->sizeEdge( _prefix ) ) && _fringe.size() == 0 ) {
+
+                    // The current box is still smaller than our search area, so expand again
+ // TODO: Is there an advantage to scanning prior to expanding?
+ _prefix = _prefix.up();
+ continue;
+
+ }
+
+ // log() << "finished box prefix [" << _prefix << "]" << endl;
+
+ // We're done and our size is large enough
+ _state = DONE_NEIGHBOR;
+
+ // Go to the next sub-box, if applicable
+ if( _fringe.size() > 0 ) _fringe.pop_back();
+ // Go to the next neighbor if this was the last sub-search
+ if( _fringe.size() == 0 ) _neighbor++;
+
+ break;
+
+ }
+
+ notePrefix();
+ }
+
+        // If we're only expanding, don't examine the neighbor boxes
+ if( onlyExpand ) return;
+
+ // If we're done expanding the current box...
+ if( _state == DONE_NEIGHBOR ) {
+
+ // Iterate to the next neighbor
+ // Loop is useful for cases where we want to skip over boxes entirely,
+ // otherwise recursion increments the neighbors.
+ for ( ; _neighbor < 9; _neighbor++ ) {
+
+ // If we have no fringe for the neighbor, make sure we have the default fringe
+ if( _fringe.size() == 0 ) _fringe.push_back( "" );
+
+ if( ! isNeighbor ) {
+ _centerPrefix = _prefix;
+ _centerBox = Box( _g, _centerPrefix );
+ isNeighbor = true;
+ }
+
+ int i = (_neighbor / 3) - 1;
+ int j = (_neighbor % 3) - 1;
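+                // _neighbor runs 0..8 and maps onto the 3x3 grid of boxes around the
+                // center: (i,j) in {-1,0,1}^2, with (0,0), the center itself, skipped below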
+
+ if ( ( i == 0 && j == 0 ) ||
+ ( i < 0 && _centerPrefix.atMinX() ) ||
+ ( i > 0 && _centerPrefix.atMaxX() ) ||
+ ( j < 0 && _centerPrefix.atMinY() ) ||
+ ( j > 0 && _centerPrefix.atMaxY() ) ) {
+
+ //log() << "not moving to neighbor " << _neighbor << " @ " << i << " , " << j << " fringe : " << _fringe.size() << " " << _centerPrefix << endl;
+ //log() << _centerPrefix.atMinX() << " "
+ // << _centerPrefix.atMinY() << " "
+ // << _centerPrefix.atMaxX() << " "
+ // << _centerPrefix.atMaxY() << " " << endl;
+
+ continue; // main box or wrapped edge
+ // TODO: We may want to enable wrapping in future, probably best as layer on top of
+ // this search.
+ }
+
+ // Make sure we've got a reasonable center
+ assert( _centerPrefix.constrains() );
+
+ GeoHash _neighborPrefix = _centerPrefix;
+ _neighborPrefix.move( i, j );
+
+ //log() << "moving to neighbor " << _neighbor << " @ " << i << " , " << j << " fringe : " << _fringe.size() << " " << _centerPrefix << " " << _neighborPrefix << endl;
+
+ GEODEBUG( "moving to neighbor " << _neighbor << " @ " << i << " , " << j << " fringe : " << _fringe.size() );
+ PREFIXDEBUG( _centerPrefix, _g );
+ PREFIXDEBUG( _neighborPrefix , _g );
+ while( _fringe.size() > 0 ) {
+
+ _prefix = _neighborPrefix + _fringe.back();
+ Box cur( _g , _prefix );
+
+ PREFIXDEBUG( _prefix, _g );
+
+ double intAmt = intersectsBox( cur );
+
+ // No intersection
+ if( intAmt <= 0 ) {
+ GEODEBUG( "skipping box" << cur.toString() );
+ _fringe.pop_back();
+ continue;
+ }
+ // Small intersection, refine search
+ else if( intAmt < 0.5 && _prefix.canRefine() && _fringe.back().size() < 4 /* two bits */ ) {
+
+ GEODEBUG( "Intersection small : " << intAmt << ", adding to fringe: " << _fringe.back() << " curr prefix : " << _prefix << " bits : " << _prefix.getBits() );
+
+ // log() << "Diving to level : " << ( _fringe.back().size() / 2 + 1 ) << endl;
+
+ string lastSuffix = _fringe.back();
+ _fringe.pop_back();
+ _fringe.push_back( lastSuffix + "00" );
+ _fringe.push_back( lastSuffix + "01" );
+ _fringe.push_back( lastSuffix + "11" );
+ _fringe.push_back( lastSuffix + "10" );
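+                        // Each appended two-bit suffix names one quadrant of the current
+                        // box, so the fringe now holds its four sub-boxes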
+
+ continue;
+ }
+
+                    // Restart our search from a different box.
+ _state = START;
+
+ assert( ! onlyExpand );
+
+ assert( _found <= 0x7fffffff );
+ fillStack( maxFound - _foundInExp, maxAdded - static_cast<int>(_found) );
+
+ // When we return from the recursive fillStack call, we'll either have checked enough points or
+ // be entirely done. Max recurse depth is < 8 * 16.
+
+ // If we're maxed out on points, return
+ if( _foundInExp >= maxFound || _found >= maxAdded ) {
+ // Make sure we'll come back to add more points
+ assert( _state == DOING_EXPAND );
+ return;
+ }
+
+                    // Otherwise we must be entirely done
+ assert( _state == DONE );
+ return;
+
+ }
+
+ }
+
+ // Finished with neighbors
+ _state = DONE;
+ }
+
+ }
+
+ // The initial geo hash box for our first expansion
+ virtual GeoHash expandStartHash() = 0;
+
+ // Whether the current box width is big enough for our search area
+ virtual bool fitsInBox( double width ) = 0;
+
+ // The amount the current box overlaps our search area
+ virtual double intersectsBox( Box& cur ) = 0;
+
+ bool remembered( BSONObj o ){
+ BSONObj seenId = o["_id"].wrap("").getOwned();
+ if( _seenIds.find( seenId ) != _seenIds.end() ){
+ LOG( CDEBUG + 1 ) << "Object " << o["_id"] << " already seen." << endl;
+ return true;
+ }
+ else{
+ _seenIds.insert( seenId );
+ LOG( CDEBUG + 1 ) << "Object " << o["_id"] << " remembered." << endl;
+ return false;
+ }
+ }
+
+ virtual int addSpecific( const GeoKeyNode& node , const Point& keyP , bool onBounds , double keyD , bool potentiallyNewDoc ) {
+
+ int found = 0;
+
+            // We need to handle every possible point in this method, even those not in the key
+            // value, to avoid having to track which hashes we've already seen.
+ if( ! potentiallyNewDoc ){
+ // log() << "Already handled doc!" << endl;
+ return 0;
+ }
+
+ // Final check for new doc
+ // OK to touch, since we're probably returning this object now
+ if( remembered( node.recordLoc.obj() ) ) return 0;
+
+ if( _uniqueDocs && ! onBounds ) {
+ //log() << "Added ind to " << _type << endl;
+ _stack.push_front( GeoPoint( node ) );
+ found++;
+ }
+ else {
+ // We now handle every possible point in the document, even those not in the key value,
+ // since we're iterating through them anyway - prevents us from having to save the hashes
+ // we've seen per-doc
+
+ // If we're filtering by hash, get the original
+ bool expensiveExact = expensiveExactCheck();
+
+ vector< BSONObj > locs;
+ getPointsFor( node._key, node.recordLoc.obj(), locs, true );
+ for( vector< BSONObj >::iterator i = locs.begin(); i != locs.end(); ++i ){
+
+ double d = -1;
+ Point p( *i );
+
+ // We can avoid exact document checks by redoing approx checks,
+ // if the exact checks are more expensive.
+ bool needExact = true;
+ if( expensiveExact ){
+ assert( false );
+ KeyResult result = approxKeyCheck( p, d );
+ if( result == BAD ) continue;
+ else if( result == GOOD ) needExact = false;
+ }
+
+ if( ! needExact || exactDocCheck( p, d ) ){
+ //log() << "Added mult to " << _type << endl;
+ _stack.push_front( GeoPoint( node ) );
+ found++;
+ // If returning unique, just exit after first point is added
+ if( _uniqueDocs ) break;
+ }
+ }
+ }
+
+ while( _cur.isCleanAndEmpty() && _stack.size() > 0 ){
+ _cur = _stack.front();
+ _stack.pop_front();
+ }
+
+ return found;
+ }
+
+ virtual long long nscanned() {
+ if ( _firstCall ) {
+ ok();
+ }
+ return _nscanned;
+ }
+
+ virtual void explainDetails( BSONObjBuilder& b ){
+ b << "lookedAt" << _lookedAt;
+ b << "matchesPerfd" << _matchesPerfd;
+ b << "objectsLoaded" << _objectsLoaded;
+ b << "pointsLoaded" << _pointsLoaded;
+ b << "pointsSavedForYield" << _nDirtied;
+ b << "pointsChangedOnYield" << _nChangedOnYield;
+ b << "pointsRemovedOnYield" << _nRemovedOnYield;
+ }
+
+ virtual BSONObj prettyIndexBounds() const {
+
+ vector<GeoHash>::const_iterator i = _expPrefixes.end();
+ if( _expPrefixes.size() > 0 && *(--i) != *( _expPrefix.get() ) )
+ _expPrefixes.push_back( *( _expPrefix.get() ) );
+
+ BSONObjBuilder bob;
+ BSONArrayBuilder bab;
+ for( i = _expPrefixes.begin(); i != _expPrefixes.end(); ++i ){
+ bab << Box( _g, *i ).toBSON();
+ }
+ bob << _g->_geo << bab.arr();
+
+ return bob.obj();
+
+ }
+
+ void notePrefix() {
+ _expPrefixes.push_back( _prefix );
+ }
+
+ string _type;
+ BSONObj _filter;
+ list<GeoPoint> _stack;
+ set<BSONObj> _seenIds;
+
+ GeoPoint _cur;
+ bool _firstCall;
+ bool _noted;
+
+ long long _nscanned;
+ long long _nDirtied;
+ long long _nChangedOnYield;
+ long long _nRemovedOnYield;
+
+ // The current box we're expanding (-1 is first/center box)
+ int _neighbor;
+
+ // The points we've found so far
+ // TODO: Long long?
+ int _foundInExp;
+
+ // The current hash prefix we're expanding and the center-box hash prefix
+ GeoHash _prefix;
+ shared_ptr<GeoHash> _lastPrefix;
+ GeoHash _centerPrefix;
+ list<string> _fringe;
+ int recurseDepth;
+ Box _centerBox;
+
+ // Start and end of our search range in the current box
+ BtreeLocation _min;
+ BtreeLocation _max;
+
+ shared_ptr<GeoHash> _expPrefix;
+ mutable vector<GeoHash> _expPrefixes;
+
+ };
+
+
+ class GeoHopper : public GeoBrowse {
+ public:
+ typedef multiset<GeoPoint> Holder;
+
+ GeoHopper( const Geo2dType * g , unsigned max , const Point& n , const BSONObj& filter = BSONObj() , double maxDistance = numeric_limits<double>::max() , GeoDistType type=GEO_PLAIN, bool uniqueDocs = false, bool needDistance = true )
+ : GeoBrowse( g, "search", filter, uniqueDocs, needDistance ), _max( max ) , _near( n ), _maxDistance( maxDistance ), _type( type ), _distError( type == GEO_PLAIN ? g->_error : g->_errorSphere ), _farthest(0)
+ {}
+
+ virtual KeyResult approxKeyCheck( const Point& p, double& d ) {
+
+ // Always check approximate distance, since it lets us avoid doing
+ // checks of the rest of the object if it succeeds
+
+ switch (_type) {
+ case GEO_PLAIN:
+ d = _near.distance( p );
+ break;
+ case GEO_SPHERE:
+ checkEarthBounds( p );
+ d = spheredist_deg( _near, p );
+ break;
+ default: assert( false );
+ }
+ assert( d >= 0 );
+
+ GEODEBUG( "\t\t\t\t\t\t\t checkDistance " << _near.toString()
+ << "\t" << p.toString() << "\t" << d
+ << " farthest: " << farthest() );
+
+            // Border distance: if we still need points, anything within _maxDistance
+            // qualifies; otherwise only points nearer than the current farthest one do
+ double borderDist = ( _points.size() < _max ? _maxDistance : farthest() );
+
+ if( d >= borderDist - 2 * _distError && d <= borderDist + 2 * _distError ) return BORDER;
+ else return d < borderDist ? GOOD : BAD;
+
+ }
+
+ virtual bool exactDocCheck( const Point& p, double& d ){
+
+ bool within = false;
+
+ // Get the appropriate distance for the type
+ switch ( _type ) {
+ case GEO_PLAIN:
+ d = _near.distance( p );
+ within = _near.distanceWithin( p, _maxDistance );
+ break;
+ case GEO_SPHERE:
+ checkEarthBounds( p );
+ d = spheredist_deg( _near, p );
+ within = ( d <= _maxDistance );
+ break;
+ default: assert( false );
+ }
+
+ return within;
+ }
+
+ // Always in distance units, whether radians or normal
+ double farthest() const {
+ return _farthest;
+ }
+
+ virtual int addSpecific( const GeoKeyNode& node, const Point& keyP, bool onBounds, double keyD, bool potentiallyNewDoc ) {
+
+ // Unique documents
+
+ GeoPoint newPoint( node, keyD, false );
+
+ int prevSize = _points.size();
+
+ // STEP 1 : Remove old duplicate points from the set if needed
+ if( _uniqueDocs ){
+
+ // Lookup old point with same doc
+ map< DiskLoc , Holder::iterator >::iterator oldPointIt = _seenPts.find( newPoint.loc() );
+
+ if( oldPointIt != _seenPts.end() ){
+ const GeoPoint& oldPoint = *(oldPointIt->second);
+ // We don't need to care if we've already seen this same approx pt or better,
+ // or we've already gone to disk once for the point
+ if( oldPoint < newPoint ){
+ GEODEBUG( "\t\tOld point closer than new point" );
+ return 0;
+ }
+ GEODEBUG( "\t\tErasing old point " << oldPointIt->first.obj() );
+ _points.erase( oldPointIt->second );
+ }
+ }
+
+ Holder::iterator newIt = _points.insert( newPoint );
+ if( _uniqueDocs ) _seenPts[ newPoint.loc() ] = newIt;
+
+ GEODEBUG( "\t\tInserted new point " << newPoint.toString() << " approx : " << keyD );
+
+ assert( _max > 0 );
+
+ Holder::iterator lastPtIt = _points.end();
+ lastPtIt--;
+ _farthest = lastPtIt->distance() + 2 * _distError;
+
+ return _points.size() - prevSize;
+
+ }
+
+        // Removes extra points from the end of the _points set.
+        // The check can be a bit costly if we have lots of exact points near borders,
+        // so we only do this every once in a while.
+ void processExtraPoints(){
+
+ if( _points.size() == 0 ) return;
+
+ int prevSize = _points.size();
+
+ // Erase all points from the set with a position >= _max *and*
+ // whose distance isn't close to the _max - 1 position distance
+
+ int numToErase = _points.size() - _max;
+ if( numToErase < 0 ) numToErase = 0;
+
+ // Get the first point definitely in the _points array
+ Holder::iterator startErase = _points.end();
+ for( int i = 0; i < numToErase + 1; i++ ) startErase--;
+ _farthest = startErase->distance() + 2 * _distError;
+
+ GEODEBUG( "\t\tPotentially erasing " << numToErase << " points, " << " size : " << _points.size() << " max : " << _max << " dist : " << startErase->distance() << " farthest dist : " << _farthest << " from error : " << _distError );
+
+ startErase++;
+ while( numToErase > 0 && startErase->distance() <= _farthest ){
+ GEODEBUG( "\t\tNot erasing point " << startErase->toString() );
+ numToErase--;
+ startErase++;
+ assert( startErase != _points.end() || numToErase == 0 );
+ }
+
+ if( _uniqueDocs ){
+ for( Holder::iterator i = startErase; i != _points.end(); ++i )
+ _seenPts.erase( i->loc() );
+ }
+
+ _points.erase( startErase, _points.end() );
+
+ int diff = _points.size() - prevSize;
+ if( diff > 0 ) _found += diff;
+ else _found -= -diff;
+
+ }
+
+ unsigned _max;
+ Point _near;
+ Holder _points;
+ double _maxDistance;
+ GeoDistType _type;
+ double _distError;
+ double _farthest;
+
+ // Safe to use currently since we don't yield in $near searches. If we do start to yield, we may need to
+ // replace dirtied disklocs in our holder / ensure our logic is correct.
+ map< DiskLoc , Holder::iterator > _seenPts;
+
+ };
+
+
+
+ class GeoSearch : public GeoHopper {
+ public:
+ GeoSearch( const Geo2dType * g , const Point& startPt , int numWanted=100 , BSONObj filter=BSONObj() , double maxDistance = numeric_limits<double>::max() , GeoDistType type=GEO_PLAIN, bool uniqueDocs = false, bool needDistance = false )
+ : GeoHopper( g , numWanted , startPt , filter , maxDistance, type, uniqueDocs, needDistance ),
+ _start( g->hash( startPt._x, startPt._y ) ),
+ // TODO: Remove numWanted...
+ _numWanted( numWanted ),
+ _type(type)
+ {
+
+ assert( g->getDetails() );
+ _nscanned = 0;
+ _found = 0;
+
+ if( _maxDistance < 0 ){
+ _scanDistance = numeric_limits<double>::max();
+ }
+ else if (type == GEO_PLAIN) {
+ _scanDistance = maxDistance + _spec->_error;
+ }
+ else if (type == GEO_SPHERE) {
+ checkEarthBounds( startPt );
+ // TODO: consider splitting into x and y scan distances
+ _scanDistance = computeXScanDistance( startPt._y, rad2deg( _maxDistance ) + _spec->_error );
+ }
+
+ assert( _scanDistance > 0 );
+
+ }
+
+
+ /** Check if we've already looked at a key. ALSO marks as seen, anticipating a follow-up call
+ to add(). This is broken out to avoid some work extracting the key bson if it's an
+ already seen point.
+ */
+ private:
+ set< pair<DiskLoc,int> > _seen;
+ public:
+
+ void exec() {
+
+ if( _numWanted == 0 ) return;
+
+ /*
+ * Search algorithm
+ * 1) use geohash prefix to find X items
+             * 2) compute the max distance from the search point to an item found
+             * 3) find the optimal set of boxes that complete the circle
+ * 4) use regular btree cursors to scan those boxes
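+             *
+             * ("Part 1" below performs step 1; "Part 2" performs steps 2-4.)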
+ */
+
+#ifdef GEODEBUGGING
+
+ log() << "start near search for " << _numWanted << " points near " << _near << " (max dist " << _maxDistance << ")" << endl;
+
+#endif
+
+ // Part 1
+ {
+ do {
+ long long f = found();
+ assert( f <= 0x7fffffff );
+ fillStack( maxPointsHeuristic, _numWanted - static_cast<int>(f) , true );
+ processExtraPoints();
+ } while( _state != DONE && _state != DONE_NEIGHBOR &&
+ found() < _numWanted &&
+ (! _prefix.constrains() || _g->sizeEdge( _prefix ) <= _scanDistance ) );
+
+ // If we couldn't scan or scanned everything, we're done
+ if( _state == DONE ){
+ expandEndPoints();
+ return;
+ }
+ }
+
+#ifdef GEODEBUGGING
+
+ log() << "part 1 of near search completed, found " << found() << " points (out of " << _foundInExp << " scanned)"
+ << " in expanded region " << _prefix << " @ " << Box( _g, _prefix )
+ << " with furthest distance " << farthest() << endl;
+
+#endif
+
+ // Part 2
+ {
+
+ // Find farthest distance for completion scan
+ double farDist = farthest();
+ if( found() < _numWanted ) {
+ // Not enough found in Phase 1
+ farDist = _scanDistance;
+ }
+ else if ( _type == GEO_PLAIN ) {
+ // Enough found, but need to search neighbor boxes
+ farDist += _spec->_error;
+ }
+ else if ( _type == GEO_SPHERE ) {
+ // Enough found, but need to search neighbor boxes
+ farDist = std::min( _scanDistance, computeXScanDistance( _near._y, rad2deg( farDist ) ) + 2 * _spec->_error );
+ }
+ assert( farDist >= 0 );
+ GEODEBUGPRINT( farDist );
+
+ // Find the box that includes all the points we need to return
+ _want = Box( _near._x - farDist , _near._y - farDist , farDist * 2 );
+ GEODEBUGPRINT( _want.toString() );
+
+ // log() << "Found : " << found() << " wanted : " << _numWanted << " Far distance : " << farDist << " box : " << _want << endl;
+
+ // Remember the far distance for further scans
+ _scanDistance = farDist;
+
+ // Reset the search, our distances have probably changed
+ if( _state == DONE_NEIGHBOR ){
+ _state = DOING_EXPAND;
+ _neighbor = -1;
+ }
+
+#ifdef GEODEBUGGING
+
+ log() << "resetting search with start at " << _start << " (edge length " << _g->sizeEdge( _start ) << ")" << endl;
+
+#endif
+
+ // Do regular search in the full region
+ do {
+ fillStack( maxPointsHeuristic );
+ processExtraPoints();
+ }
+ while( _state != DONE );
+
+ }
+
+ GEODEBUG( "done near search with " << _points.size() << " points " );
+
+ expandEndPoints();
+
+ }
+
+ void addExactPoints( const GeoPoint& pt, Holder& points, bool force ){
+ int before, after;
+ addExactPoints( pt, points, before, after, force );
+ }
+
+ void addExactPoints( const GeoPoint& pt, Holder& points, int& before, int& after, bool force ){
+
+ before = 0;
+ after = 0;
+
+ GEODEBUG( "Adding exact points for " << pt.toString() );
+
+ if( pt.isExact() ){
+ if( force ) points.insert( pt );
+ return;
+ }
+
+ vector<BSONObj> locs;
+ getPointsFor( pt.key(), pt.obj(), locs, _uniqueDocs );
+
+ GeoPoint nearestPt( pt, -1, true );
+
+ for( vector<BSONObj>::iterator i = locs.begin(); i != locs.end(); i++ ){
+
+ Point loc( *i );
+
+ double d;
+ if( ! exactDocCheck( loc, d ) ) continue;
+
+ if( _uniqueDocs && ( nearestPt.distance() < 0 || d < nearestPt.distance() ) ){
+ nearestPt._distance = d;
+ nearestPt._pt = *i;
+ continue;
+ }
+ else if( ! _uniqueDocs ){
+ GeoPoint exactPt( pt, d, true );
+ exactPt._pt = *i;
+ GEODEBUG( "Inserting exact pt " << exactPt.toString() << " for " << pt.toString() << " exact : " << d << " is less? " << ( exactPt < pt ) << " bits : " << _g->_bits );
+ points.insert( exactPt );
+ exactPt < pt ? before++ : after++;
+ }
+
+ }
+
+ if( _uniqueDocs && nearestPt.distance() >= 0 ){
+ GEODEBUG( "Inserting unique exact pt " << nearestPt.toString() << " for " << pt.toString() << " exact : " << nearestPt.distance() << " is less? " << ( nearestPt < pt ) << " bits : " << _g->_bits );
+ points.insert( nearestPt );
+ if( nearestPt < pt ) before++;
+ else after++;
+ }
+
+ }
+
+ // TODO: Refactor this back into holder class, allow to run periodically when we are seeing a lot of pts
+ void expandEndPoints( bool finish = true ){
+
+ processExtraPoints();
+
+ // All points in the set *could* be within maxDistance
+
+ // Step 1 : Trim points to max size
+ // TODO: This check will do little for now, but is skeleton for future work in incremental $near
+ // searches
+ if( _max > 0 ){
+
+ int numToErase = _points.size() - _max;
+
+ if( numToErase > 0 ){
+
+ Holder tested;
+
+ // Work backward through all points we're not sure belong in the set
+ Holder::iterator maybePointIt = _points.end();
+ maybePointIt--;
+ double approxMin = maybePointIt->distance() - 2 * _distError;
+
+ GEODEBUG( "\t\tNeed to erase " << numToErase << " max : " << _max << " min dist " << approxMin << " error : " << _distError << " starting from : " << (*maybePointIt).toString() );
+
+ // Expand each uncertain point exactly, collecting the results in 'tested'
+ int erased = 0;
+ while( _points.size() > 0 && ( maybePointIt->distance() >= approxMin || erased < numToErase ) ){
+
+ Holder::iterator current = maybePointIt--;
+
+ addExactPoints( *current, tested, true );
+ _points.erase( current );
+ erased++;
+
+ if( tested.size() )
+ approxMin = tested.begin()->distance() - 2 * _distError;
+
+ }
+
+ GEODEBUG( "\t\tEnding search at point " << ( _points.size() == 0 ? "(beginning)" : maybePointIt->toString() ) );
+
+ int numToAddBack = erased - numToErase;
+ assert( numToAddBack >= 0 );
+
+ GEODEBUG( "\t\tNum tested valid : " << tested.size() << " erased : " << erased << " added back : " << numToAddBack );
+
+#ifdef GEODEBUGGING
+ for( Holder::iterator it = tested.begin(); it != tested.end(); it++ ){
+ log() << "Tested Point: " << *it << endl;
+ }
+#endif
+ Holder::iterator testedIt = tested.begin();
+ for( int i = 0; i < numToAddBack && testedIt != tested.end(); i++ ){
+ _points.insert( *testedIt );
+ testedIt++;
+ }
+ }
+ }
+
+#ifdef GEODEBUGGING
+ for( Holder::iterator it = _points.begin(); it != _points.end(); it++ ){
+ log() << "Point: " << *it << endl;
+ }
+#endif
+ // We've now trimmed first set of unneeded points
+
+ GEODEBUG( "\t\t Start expanding, num points : " << _points.size() << " max : " << _max );
+
+ // Step 2: iterate through all points and add as needed
+
+ unsigned expandedPoints = 0;
+ Holder::iterator it = _points.begin();
+ double expandWindowEnd = -1;
+ while( it != _points.end() ){
+ const GeoPoint& currPt = *it;
+
+ // TODO: If one point is exact, maybe not 2 * _distError
+
+ // See if we're in an expand window
+ bool inWindow = currPt.distance() <= expandWindowEnd;
+ // If we're not, and we're done with points, break
+ if( ! inWindow && expandedPoints >= _max ) break;
+
+ bool expandApprox = ! currPt.isExact() && ( ! _uniqueDocs || ( finish && _needDistance ) || inWindow );
+
+ if( expandApprox ){
+
+ // Add new point(s)
+ // These will only be added in a radius of 2 * _distError around the current point,
+ // so should not affect previously valid points.
+ int before, after;
+ addExactPoints( currPt, _points, before, after, false );
+ expandedPoints += before;
+
+ if( _max > 0 && expandedPoints < _max )
+ expandWindowEnd = currPt.distance() + 2 * _distError;
+
+ // Iterate to the next point
+ Holder::iterator current = it++;
+ // Erase the current point
+ _points.erase( current );
+
+ }
+ else{
+ expandedPoints++;
+ it++;
+ }
+ }
+
+ GEODEBUG( "\t\tFinished expanding, num points : " << _points.size() << " max : " << _max );
+
+ // Finish
+ // TODO: Don't really need to trim?
+ for( ; expandedPoints > _max; expandedPoints-- ) it--;
+ _points.erase( it, _points.end() );
+
+#ifdef GEODEBUGGING
+ for( Holder::iterator it = _points.begin(); it != _points.end(); it++ ){
+ log() << "Point: " << *it << endl;
+ }
+#endif
+ }
+
+ virtual GeoHash expandStartHash(){
+ return _start;
+ }
+
+ // Whether the current box width is big enough for our search area
+ virtual bool fitsInBox( double width ){
+ return width >= _scanDistance;
+ }
+
+ // Whether the current box overlaps our search area
+ virtual double intersectsBox( Box& cur ){
+ return cur.intersects( _want );
+ }
+
+ GeoHash _start;
+ int _numWanted;
+ double _scanDistance;
+
+ long long _nscanned;
+ int _found;
+ GeoDistType _type;
+
+ Box _want;
+ };
+
+ class GeoSearchCursor : public GeoCursorBase {
+ public:
+
+ GeoSearchCursor( shared_ptr<GeoSearch> s )
+ : GeoCursorBase( s->_spec ) ,
+ _s( s ) , _cur( s->_points.begin() ) , _end( s->_points.end() ), _nscanned() {
+ if ( _cur != _end ) {
+ ++_nscanned;
+ }
+ }
+
+ virtual ~GeoSearchCursor() {}
+
+ virtual bool ok() {
+ return _cur != _end;
+ }
+
+ virtual Record* _current() { assert(ok()); return _cur->_loc.rec(); }
+ virtual BSONObj current() { assert(ok()); return _cur->_o; }
+ virtual DiskLoc currLoc() { assert(ok()); return _cur->_loc; }
+ virtual bool advance() {
+ if( ok() ){
+ _cur++;
+ incNscanned();
+ return ok();
+ }
+ return false;
+ }
+ virtual BSONObj currKey() const { return _cur->_key; }
+
+ virtual string toString() {
+ return "GeoSearchCursor";
+ }
+
+
+ virtual BSONObj prettyStartKey() const {
+ return BSON( _s->_g->_geo << _s->_prefix.toString() );
+ }
+ virtual BSONObj prettyEndKey() const {
+ GeoHash temp = _s->_prefix;
+ temp.move( 1 , 1 );
+ return BSON( _s->_g->_geo << temp.toString() );
+ }
+
+ virtual long long nscanned() { return _nscanned; }
+
+ virtual CoveredIndexMatcher* matcher() const {
+ if( _s->_matcher.get() ) return _s->_matcher.get();
+ else return emptyMatcher.get();
+ }
+
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const {
+ if( _s->_matcher.get() ) return _s->_matcher;
+ else return emptyMatcher;
+ }
+
+ shared_ptr<GeoSearch> _s;
+ GeoHopper::Holder::iterator _cur;
+ GeoHopper::Holder::iterator _end;
+
+ void incNscanned() { if ( ok() ) { ++_nscanned; } }
+ long long _nscanned;
+ };
+
+ class GeoCircleBrowse : public GeoBrowse {
+ public:
+
+ GeoCircleBrowse( const Geo2dType * g , const BSONObj& circle , BSONObj filter = BSONObj() , const string& type="$center", bool uniqueDocs = true )
+ : GeoBrowse( g , "circle" , filter, uniqueDocs ) {
+
+ uassert( 13060 , "$center needs 2 fields (middle,max distance)" , circle.nFields() == 2 );
+
+ BSONObjIterator i(circle);
+ BSONElement center = i.next();
+
+ uassert( 13656 , "the first field of $center object must be a location object" , center.isABSONObj() );
+
+ // Get geohash and exact center point
+ // TODO: For wrapping search, may be useful to allow center points outside-of-bounds here.
+ // Calculating the nearest point as a hash start inside the region would then be required.
+ _start = g->_tohash(center);
+ _startPt = Point(center);
+
+ _maxDistance = i.next().numberDouble();
+ uassert( 13061 , "need a max distance >= 0 " , _maxDistance >= 0 );
+
+ if (type == "$center") {
+ // Look in box with bounds of maxDistance in either direction
+ _type = GEO_PLAIN;
+ _xScanDistance = _maxDistance + _g->_error;
+ _yScanDistance = _maxDistance + _g->_error;
+ }
+ else if (type == "$centerSphere") {
+ // Same, but compute maxDistance using spherical transform
+
+ uassert(13461, "Spherical MaxDistance > PI. Are you sure you are using radians?", _maxDistance < M_PI);
+ checkEarthBounds( _startPt );
+
+ _type = GEO_SPHERE;
+ _yScanDistance = rad2deg( _maxDistance ) + _g->_error;
+ _xScanDistance = computeXScanDistance(_startPt._y, _yScanDistance);
+
+ uassert(13462, "Spherical distance would require wrapping, which isn't implemented yet",
+ (_startPt._x + _xScanDistance < 180) && (_startPt._x - _xScanDistance > -180) &&
+ (_startPt._y + _yScanDistance < 90) && (_startPt._y - _yScanDistance > -90));
+ }
+ else {
+ uassert(13460, "invalid $center query type: " + type, false);
+ }
+
+ // Bounding box includes fudge factor.
+ // TODO: Is this correct, since fudge factor may be spherically transformed?
+ _bBox._min = Point( _startPt._x - _xScanDistance, _startPt._y - _yScanDistance );
+ _bBox._max = Point( _startPt._x + _xScanDistance, _startPt._y + _yScanDistance );
+
+ GEODEBUG( "Bounding box for circle query : " << _bBox.toString() << " (max distance : " << _maxDistance << ")" << " starting from " << _startPt.toString() );
+
+ ok();
+ }
+
+ virtual GeoHash expandStartHash() {
+ return _start;
+ }
+
+ virtual bool fitsInBox( double width ) {
+ return width >= std::max(_xScanDistance, _yScanDistance);
+ }
+
+ virtual double intersectsBox( Box& cur ) {
+ return cur.intersects( _bBox );
+ }
+
+ virtual KeyResult approxKeyCheck( const Point& p, double& d ) {
+
+ // Inexact hash distance checks.
+ double error = 0;
+ switch (_type) {
+ case GEO_PLAIN:
+ d = _startPt.distance( p );
+ error = _g->_error;
+ break;
+ case GEO_SPHERE: {
+ checkEarthBounds( p );
+ d = spheredist_deg( _startPt, p );
+ error = _g->_errorSphere;
+ break;
+ }
+ default: assert( false );
+ }
+
+ // If our distance is in the error bounds...
+ if( d >= _maxDistance - error && d <= _maxDistance + error ) return BORDER;
+ return d > _maxDistance ? BAD : GOOD;
+ }
+
+ virtual bool exactDocCheck( const Point& p, double& d ){
+
+ switch (_type) {
+ case GEO_PLAIN: {
+ if( _startPt.distanceWithin( p, _maxDistance ) ) return true;
+ break;
+ }
+ case GEO_SPHERE:
+ checkEarthBounds( p );
+ if( spheredist_deg( _startPt , p ) <= _maxDistance ) return true;
+ break;
+ default: assert( false );
+ }
+
+ return false;
+ }
+
+ GeoDistType _type;
+ GeoHash _start;
+ Point _startPt;
+ double _maxDistance; // user input
+ double _xScanDistance; // affected by GeoDistType
+ double _yScanDistance; // affected by GeoDistType
+ Box _bBox;
+
+ };
+
+ class GeoBoxBrowse : public GeoBrowse {
+ public:
+
+ GeoBoxBrowse( const Geo2dType * g , const BSONObj& box , BSONObj filter = BSONObj(), bool uniqueDocs = true )
+ : GeoBrowse( g , "box" , filter, uniqueDocs ) {
+
+ uassert( 13063 , "$box needs 2 fields (bottomLeft,topRight)" , box.nFields() == 2 );
+
+ // Initialize an *exact* box from the given obj.
+ BSONObjIterator i(box);
+ _want._min = Point( i.next() );
+ _want._max = Point( i.next() );
+
+ _wantRegion = _want;
+ _wantRegion.fudge( g ); // Need to make sure we're checking regions within error bounds of where we want
+ fixBox( g, _wantRegion );
+ fixBox( g, _want );
+
+ uassert( 13064 , "need an area > 0 " , _want.area() > 0 );
+
+ Point center = _want.center();
+ _start = _g->hash( center._x , center._y );
+
+ GEODEBUG( "center : " << center.toString() << "\t" << _prefix );
+
+ _fudge = _g->_error;
+ _wantLen = _fudge +
+ std::max( ( _want._max._x - _want._min._x ) ,
+ ( _want._max._y - _want._min._y ) ) / 2;
+
+ ok();
+ }
+
+ void fixBox( const Geo2dType* g, Box& box ) {
+ if( box._min._x > box._max._x )
+ swap( box._min._x, box._max._x );
+ if( box._min._y > box._max._y )
+ swap( box._min._y, box._max._y );
+
+ double gMin = g->_min;
+ double gMax = g->_max;
+
+ if( box._min._x < gMin ) box._min._x = gMin;
+ if( box._min._y < gMin ) box._min._y = gMin;
+ if( box._max._x > gMax) box._max._x = gMax;
+ if( box._max._y > gMax ) box._max._y = gMax;
+ }
+
+ void swap( double& a, double& b ) {
+ double swap = a;
+ a = b;
+ b = swap;
+ }
+
+ virtual GeoHash expandStartHash() {
+ return _start;
+ }
+
+ virtual bool fitsInBox( double width ) {
+ return width >= _wantLen;
+ }
+
+ virtual double intersectsBox( Box& cur ) {
+ return cur.intersects( _wantRegion );
+ }
+
+ virtual KeyResult approxKeyCheck( const Point& p, double& d ) {
+ if( _want.onBoundary( p, _fudge ) ) return BORDER;
+ else return _want.inside( p, _fudge ) ? GOOD : BAD;
+
+ }
+
+ virtual bool exactDocCheck( const Point& p, double& d ){
+ return _want.inside( p );
+ }
+
+ Box _want;
+ Box _wantRegion;
+ double _wantLen;
+ double _fudge;
+
+ GeoHash _start;
+
+ };
+
+ class GeoPolygonBrowse : public GeoBrowse {
+ public:
+
+ GeoPolygonBrowse( const Geo2dType* g , const BSONObj& polyPoints ,
+ BSONObj filter = BSONObj(), bool uniqueDocs = true ) : GeoBrowse( g , "polygon" , filter, uniqueDocs ) {
+
+ GEODEBUG( "In Polygon" )
+
+ BSONObjIterator i( polyPoints );
+ BSONElement first = i.next();
+ _poly.add( Point( first ) );
+
+ while ( i.more() ) {
+ _poly.add( Point( i.next() ) );
+ }
+
+ uassert( 14030, "polygon must be defined by three points or more", _poly.size() >= 3 );
+
+ _bounds = _poly.bounds();
+ _bounds.fudge( g ); // We need to check regions within the error bounds of these bounds
+ _bounds.truncate( g ); // We don't need to look anywhere outside the space
+
+ _maxDim = _g->_error + _bounds.maxDim() / 2;
+
+ ok();
+ }
+
+ // The initial geo hash box for our first expansion
+ virtual GeoHash expandStartHash() {
+ return _g->hash( _bounds.center() );
+ }
+
+ // Whether the current box width is big enough for our search area
+ virtual bool fitsInBox( double width ) {
+ return _maxDim <= width;
+ }
+
+ // Whether the current box overlaps our search area
+ virtual double intersectsBox( Box& cur ) {
+ return cur.intersects( _bounds );
+ }
+
+ virtual KeyResult approxKeyCheck( const Point& p, double& d ) {
+
+ int in = _poly.contains( p, _g->_error );
+
+ if( in == 0 ) return BORDER;
+ else return in > 0 ? GOOD : BAD;
+
+ }
+
+ virtual bool exactDocCheck( const Point& p, double& d ){
+ return _poly.contains( p );
+ }
+
+ private:
+
+ Polygon _poly;
+ Box _bounds;
+ double _maxDim;
+
+ GeoHash _start;
+ };
+
+ shared_ptr<Cursor> Geo2dType::newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const {
+ if ( numWanted < 0 )
+ numWanted = numWanted * -1;
+ else if ( numWanted == 0 )
+ numWanted = 100;
+
+ BSONObjIterator i(query);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+
+ if ( _geo != e.fieldName() )
+ continue;
+
+ if ( e.type() == Array ) {
+ // If we get an array query, assume it is a location, and do a $within { $center : [[x, y], 0] } search
+ shared_ptr<Cursor> c( new GeoCircleBrowse( this , BSON( "0" << e.embeddedObjectUserCheck() << "1" << 0 ), query.filterFieldsUndotted( BSON( _geo << "" ), false ), "$center", true ) );
+ return c;
+ }
+ else if ( e.type() == Object ) {
+
+ // TODO: Filter out _geo : { $special... } field so it doesn't get matched accidentally,
+ // if matcher changes
+
+ switch ( e.embeddedObject().firstElement().getGtLtOp() ) {
+ case BSONObj::opNEAR: {
+ BSONObj n = e.embeddedObject();
+ e = n.firstElement();
+
+ const char* suffix = e.fieldName() + 5; // strlen("$near") == 5;
+ GeoDistType type;
+ if (suffix[0] == '\0') {
+ type = GEO_PLAIN;
+ }
+ else if (strcmp(suffix, "Sphere") == 0) {
+ type = GEO_SPHERE;
+ }
+ else {
+ uassert(13464, string("invalid $near search type: ") + e.fieldName(), false);
+ type = GEO_PLAIN; // prevents uninitialized warning
+ }
+
+ double maxDistance = numeric_limits<double>::max();
+ if ( e.isABSONObj() && e.embeddedObject().nFields() > 2 ) {
+ BSONObjIterator i(e.embeddedObject());
+ i.next();
+ i.next();
+ BSONElement e = i.next();
+ if ( e.isNumber() )
+ maxDistance = e.numberDouble();
+ }
+ {
+ BSONElement e = n["$maxDistance"];
+ if ( e.isNumber() )
+ maxDistance = e.numberDouble();
+ }
+
+ bool uniqueDocs = false;
+ if( ! n["$uniqueDocs"].eoo() ) uniqueDocs = n["$uniqueDocs"].trueValue();
+
+ shared_ptr<GeoSearch> s( new GeoSearch( this , Point( e ) , numWanted , query , maxDistance, type, uniqueDocs ) );
+ s->exec();
+ shared_ptr<Cursor> c;
+ c.reset( new GeoSearchCursor( s ) );
+ return c;
+ }
+ case BSONObj::opWITHIN: {
+
+ e = e.embeddedObject().firstElement();
+ uassert( 13057 , "$within has to take an object or array" , e.isABSONObj() );
+
+ BSONObj context = e.embeddedObject();
+ e = e.embeddedObject().firstElement();
+ string type = e.fieldName();
+
+ bool uniqueDocs = true;
+ if( ! context["$uniqueDocs"].eoo() ) uniqueDocs = context["$uniqueDocs"].trueValue();
+
+ if ( startsWith(type, "$center") ) {
+ uassert( 13059 , "$center has to take an object or array" , e.isABSONObj() );
+ shared_ptr<Cursor> c( new GeoCircleBrowse( this , e.embeddedObjectUserCheck() , query , type, uniqueDocs ) );
+ return c;
+ }
+ else if ( type == "$box" ) {
+ uassert( 13065 , "$box has to take an object or array" , e.isABSONObj() );
+ shared_ptr<Cursor> c( new GeoBoxBrowse( this , e.embeddedObjectUserCheck() , query, uniqueDocs ) );
+ return c;
+ }
+ else if ( startsWith( type, "$poly" ) ) {
+ uassert( 14029 , "$polygon has to take an object or array" , e.isABSONObj() );
+ shared_ptr<Cursor> c( new GeoPolygonBrowse( this , e.embeddedObjectUserCheck() , query, uniqueDocs ) );
+ return c;
+ }
+ throw UserException( 13058 , str::stream() << "unknown $within information : " << context << ", a shape must be specified." );
+ }
+ default:
+ // Otherwise... assume the object defines a point, and we want to do a zero-radius $within $center
+ shared_ptr<Cursor> c( new GeoCircleBrowse( this , BSON( "0" << e.embeddedObjectUserCheck() << "1" << 0 ), query.filterFieldsUndotted( BSON( _geo << "" ), false ) ) );
+ return c;
+ }
+ }
+ }
+
+ throw UserException( 13042 , (string)"missing geo field (" + _geo + ") in : " + query.toString() );
+ }
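+
+ // The query shapes dispatched above, sketched for reference (illustrative only,
+ // excluded from the build; the field name "loc" and all values are placeholders):
+#if 0
+ void exampleGeoQueries() {
+ // a bare array is treated as a point: zero-radius $within $center
+ BSONObj q1 = BSON( "loc" << BSON_ARRAY( 5 << 5 ) );
+ // $near / $nearSphere, optionally with $maxDistance
+ BSONObj q2 = BSON( "loc" << BSON( "$near" << BSON_ARRAY( 5 << 5 ) << "$maxDistance" << 2 ) );
+ // $within with a single shape: $center / $centerSphere, $box, or $polygon
+ BSONObj q3 = BSON( "loc" << BSON( "$within" << BSON( "$center" << BSON_ARRAY( BSON_ARRAY( 5 << 5 ) << 2 ) ) ) );
+ }
+#endif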
+
+ // ------
+ // commands
+ // ------
+
+ class Geo2dFindNearCmd : public Command {
+ public:
+ Geo2dFindNearCmd() : Command( "geoNear" ) {}
+ virtual LockType locktype() const { return READ; }
+ bool slaveOk() const { return true; }
+ void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/Geospatial+Indexing#GeospatialIndexing-geoNearCommand"; }
+ bool slaveOverrideOk() { return true; }
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string ns = dbname + "." + cmdObj.firstElement().valuestr();
+
+ NamespaceDetails * d = nsdetails( ns.c_str() );
+ if ( ! d ) {
+ errmsg = "can't find ns";
+ return false;
+ }
+
+ vector<int> idxs;
+ d->findIndexByType( GEO2DNAME , idxs );
+
+ if ( idxs.size() > 1 ) {
+ errmsg = "more than 1 geo indexes :(";
+ return false;
+ }
+
+ if ( idxs.size() == 0 ) {
+ errmsg = "no geo index :(";
+ return false;
+ }
+
+ int geoIdx = idxs[0];
+
+ result.append( "ns" , ns );
+
+ IndexDetails& id = d->idx( geoIdx );
+ Geo2dType * g = (Geo2dType*)id.getSpec().getType();
+ assert( &id == g->getDetails() );
+
+ int numWanted = 100;
+ if ( cmdObj["num"].isNumber() ) {
+ numWanted = cmdObj["num"].numberInt();
+ assert( numWanted >= 0 );
+ }
+
+ bool uniqueDocs = false;
+ if( ! cmdObj["uniqueDocs"].eoo() ) uniqueDocs = cmdObj["uniqueDocs"].trueValue();
+
+ bool includeLocs = false;
+ if( ! cmdObj["includeLocs"].eoo() ) includeLocs = cmdObj["includeLocs"].trueValue();
+
+ uassert(13046, "'near' param missing/invalid", !cmdObj["near"].eoo());
+ const Point n( cmdObj["near"] );
+ result.append( "near" , g->_tohash( cmdObj["near"] ).toString() );
+
+ BSONObj filter;
+ if ( cmdObj["query"].type() == Object )
+ filter = cmdObj["query"].embeddedObject();
+
+ double maxDistance = numeric_limits<double>::max();
+ if ( cmdObj["maxDistance"].isNumber() )
+ maxDistance = cmdObj["maxDistance"].number();
+
+ GeoDistType type = GEO_PLAIN;
+ if ( cmdObj["spherical"].trueValue() )
+ type = GEO_SPHERE;
+
+ GeoSearch gs( g , n , numWanted , filter , maxDistance , type, uniqueDocs, true );
+
+ if ( cmdObj["start"].type() == String) {
+ GeoHash start ((string) cmdObj["start"].valuestr());
+ gs._start = start;
+ }
+
+ gs.exec();
+
+ double distanceMultiplier = 1;
+ if ( cmdObj["distanceMultiplier"].isNumber() )
+ distanceMultiplier = cmdObj["distanceMultiplier"].number();
+
+ double totalDistance = 0;
+
+ BSONObjBuilder arr( result.subarrayStart( "results" ) );
+ int x = 0;
+ for ( GeoHopper::Holder::iterator i=gs._points.begin(); i!=gs._points.end(); i++ ) {
+
+ const GeoPoint& p = *i;
+ double dis = distanceMultiplier * p.distance();
+ totalDistance += dis;
+
+ BSONObjBuilder bb( arr.subobjStart( BSONObjBuilder::numStr( x++ ) ) );
+ bb.append( "dis" , dis );
+ if( includeLocs ){
+ if( p._pt.couldBeArray() ) bb.append( "loc", BSONArray( p._pt ) );
+ else bb.append( "loc" , p._pt );
+ }
+ bb.append( "obj" , p._o );
+ bb.done();
+
+ if ( arr.len() > BSONObjMaxUserSize ) {
+ warning() << "Too many results to fit in single document. Truncating..." << endl;
+ break;
+ }
+ }
+ arr.done();
+
+ BSONObjBuilder stats( result.subobjStart( "stats" ) );
+ stats.append( "time" , cc().curop()->elapsedMillis() );
+ stats.appendNumber( "btreelocs" , gs._nscanned );
+ stats.appendNumber( "nscanned" , gs._lookedAt );
+ stats.appendNumber( "objectsLoaded" , gs._objectsLoaded );
+ stats.append( "avgDistance" , totalDistance / x );
+ stats.append( "maxDistance" , gs.farthest() );
+ stats.done();
+
+ return true;
+ }
+
+ } geo2dFindNearCmd;
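+
+ // For reference, a command document in the shape run() above parses (illustrative
+ // only, excluded from the build; the collection name and values are placeholders):
+#if 0
+ BSONObj makeGeoNearCmd() {
+ return BSON( "geoNear" << "places" // collection within the current db
+ << "near" << BSON_ARRAY( -73.97 << 40.77 )
+ << "num" << 10 // numWanted
+ << "maxDistance" << 0.04
+ << "spherical" << true // compute GEO_SPHERE distances
+ << "distanceMultiplier" << EARTH_RADIUS_MILES );
+ }
+#endif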
+
+ class GeoWalkCmd : public Command {
+ public:
+ GeoWalkCmd() : Command( "geoWalk" ) {}
+ virtual LockType locktype() const { return READ; }
+ bool slaveOk() const { return true; }
+ bool slaveOverrideOk() { return true; }
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string ns = dbname + "." + cmdObj.firstElement().valuestr();
+
+ NamespaceDetails * d = nsdetails( ns.c_str() );
+ if ( ! d ) {
+ errmsg = "can't find ns";
+ return false;
+ }
+
+ int geoIdx = -1;
+ {
+ NamespaceDetails::IndexIterator ii = d->ii();
+ while ( ii.more() ) {
+ IndexDetails& id = ii.next();
+ if ( id.getSpec().getTypeName() == GEO2DNAME ) {
+ if ( geoIdx >= 0 ) {
+ errmsg = "2 geo indexes :(";
+ return false;
+ }
+ geoIdx = ii.pos() - 1;
+ }
+ }
+ }
+
+ if ( geoIdx < 0 ) {
+ errmsg = "no geo index :(";
+ return false;
+ }
+
+
+ IndexDetails& id = d->idx( geoIdx );
+ Geo2dType * g = (Geo2dType*)id.getSpec().getType();
+ assert( &id == g->getDetails() );
+
+ int max = 100000;
+
+ auto_ptr<BtreeCursor> bc( BtreeCursor::make( d , geoIdx , id , BSONObj() , BSONObj() , true , 1 ) );
+ BtreeCursor &c = *bc;
+ while ( c.ok() && max-- ) {
+ GeoHash h( c.currKey().firstElement() );
+ int len;
+ cout << "\t" << h.toString()
+ << "\t" << c.current()[g->_geo]
+ << "\t" << hex << h.getHash()
+ << "\t" << hex << ((long long*)c.currKey().firstElement().binData(len))[0]
+ << "\t" << c.current()["_id"]
+ << endl;
+ c.advance();
+ }
+
+ return true;
+ }
+
+ } geoWalkCmd;
+
+ struct GeoUnitTest : public UnitTest {
+
+ int round( double d ) {
+ // not a true rounding: scales by 1000, adds .5, and truncates, so it
+ // compares doubles to roughly three decimal places (exact only for positive values)
+ return (int)(.5+(d*1000));
+ }
+
+#define GEOHEQ(a,b) if ( a.toString() != b ){ cout << "[" << a.toString() << "] != [" << b << "]" << endl; assert( a == GeoHash(b) ); }
+
+ void run() {
+ assert( ! GeoHash::isBitSet( 0 , 0 ) );
+ assert( ! GeoHash::isBitSet( 0 , 31 ) );
+ assert( GeoHash::isBitSet( 1 , 31 ) );
+
+ IndexSpec i( BSON( "loc" << "2d" ) );
+ Geo2dType g( &geo2dplugin , &i );
+ {
+ double x = 73.01212;
+ double y = 41.352964;
+ BSONObj in = BSON( "x" << x << "y" << y );
+ GeoHash h = g._hash( in );
+ BSONObj out = g._unhash( h );
+ assert( round(x) == round( out["x"].number() ) );
+ assert( round(y) == round( out["y"].number() ) );
+ assert( round( in["x"].number() ) == round( out["x"].number() ) );
+ assert( round( in["y"].number() ) == round( out["y"].number() ) );
+ }
+
+ {
+ double x = -73.01212;
+ double y = 41.352964;
+ BSONObj in = BSON( "x" << x << "y" << y );
+ GeoHash h = g._hash( in );
+ BSONObj out = g._unhash( h );
+ assert( round(x) == round( out["x"].number() ) );
+ assert( round(y) == round( out["y"].number() ) );
+ assert( round( in["x"].number() ) == round( out["x"].number() ) );
+ assert( round( in["y"].number() ) == round( out["y"].number() ) );
+ }
+
+ {
+ GeoHash h( "0000" );
+ h.move( 0 , 1 );
+ GEOHEQ( h , "0001" );
+ h.move( 0 , -1 );
+ GEOHEQ( h , "0000" );
+
+ h.init( "0001" );
+ h.move( 0 , 1 );
+ GEOHEQ( h , "0100" );
+ h.move( 0 , -1 );
+ GEOHEQ( h , "0001" );
+
+
+ h.init( "0000" );
+ h.move( 1 , 0 );
+ GEOHEQ( h , "0010" );
+ }
+
+ {
+ Box b( 5 , 5 , 2 );
+ assert( "(5,5) -->> (7,7)" == b.toString() );
+ }
+
+ {
+ GeoHash a = g.hash( 1 , 1 );
+ GeoHash b = g.hash( 4 , 5 );
+ assert( 5 == (int)(g.distance( a , b ) ) );
+ a = g.hash( 50 , 50 );
+ b = g.hash( 42 , 44 );
+ assert( round(10) == round(g.distance( a , b )) );
+ }
+
+ {
+ GeoHash x("0000");
+ assert( 0 == x.getHash() );
+ x.init( 0 , 1 , 32 );
+ GEOHEQ( x , "0000000000000000000000000000000000000000000000000000000000000001" )
+
+ assert( GeoHash( "1100").hasPrefix( GeoHash( "11" ) ) );
+ assert( ! GeoHash( "1000").hasPrefix( GeoHash( "11" ) ) );
+ }
+
+ {
+ GeoHash x("1010");
+ GEOHEQ( x , "1010" );
+ GeoHash y = x + "01";
+ GEOHEQ( y , "101001" );
+ }
+
+ {
+
+ GeoHash a = g.hash( 5 , 5 );
+ GeoHash b = g.hash( 5 , 7 );
+ GeoHash c = g.hash( 100 , 100 );
+ /*
+ cout << "a: " << a << endl;
+ cout << "b: " << b << endl;
+ cout << "c: " << c << endl;
+
+ cout << "a: " << a.toStringHex1() << endl;
+ cout << "b: " << b.toStringHex1() << endl;
+ cout << "c: " << c.toStringHex1() << endl;
+ */
+ BSONObj oa = a.wrap();
+ BSONObj ob = b.wrap();
+ BSONObj oc = c.wrap();
+ /*
+ cout << "a: " << oa.hexDump() << endl;
+ cout << "b: " << ob.hexDump() << endl;
+ cout << "c: " << oc.hexDump() << endl;
+ */
+ assert( oa.woCompare( ob ) < 0 );
+ assert( oa.woCompare( oc ) < 0 );
+
+ }
+
+ {
+ GeoHash x( "000000" );
+ x.move( -1 , 0 );
+ GEOHEQ( x , "101010" );
+ x.move( 1 , -1 );
+ GEOHEQ( x , "010101" );
+ x.move( 0 , 1 );
+ GEOHEQ( x , "000000" );
+ }
+
+ {
+ GeoHash prefix( "110011000000" );
+ GeoHash entry( "1100110000011100000111000001110000011100000111000001000000000000" );
+ assert( ! entry.hasPrefix( prefix ) );
+
+ entry = GeoHash("1100110000001100000111000001110000011100000111000001000000000000");
+ assert( entry.toString().find( prefix.toString() ) == 0 );
+ assert( entry.hasPrefix( GeoHash( "1100" ) ) );
+ assert( entry.hasPrefix( prefix ) );
+ }
+
+ {
+ GeoHash a = g.hash( 50 , 50 );
+ GeoHash b = g.hash( 48 , 54 );
+ assert( round( 4.47214 ) == round( g.distance( a , b ) ) );
+ }
+
+
+ {
+ Box b( Point( 29.762283 , -95.364271 ) , Point( 29.764283000000002 , -95.36227099999999 ) );
+ assert( b.inside( 29.763 , -95.363 ) );
+ assert( ! b.inside( 32.9570255 , -96.1082497 ) );
+ assert( ! b.inside( 32.9570255 , -96.1082497 , .01 ) );
+ }
+
+ {
+ GeoHash a( "11001111" );
+ assert( GeoHash( "11" ) == a.commonPrefix( GeoHash("11") ) );
+ assert( GeoHash( "11" ) == a.commonPrefix( GeoHash("11110000") ) );
+ }
+
+ {
+ int N = 10000;
+ {
+ Timer t;
+ for ( int i=0; i<N; i++ ) {
+ unsigned x = (unsigned)rand();
+ unsigned y = (unsigned)rand();
+ GeoHash h( x , y );
+ unsigned a,b;
+ h.unhash_slow( a,b );
+ assert( a == x );
+ assert( b == y );
+ }
+ //cout << "slow: " << t.millis() << endl;
+ }
+
+ {
+ Timer t;
+ for ( int i=0; i<N; i++ ) {
+ unsigned x = (unsigned)rand();
+ unsigned y = (unsigned)rand();
+ GeoHash h( x , y );
+ unsigned a,b;
+ h.unhash_fast( a,b );
+ assert( a == x );
+ assert( b == y );
+ }
+ //cout << "fast: " << t.millis() << endl;
+ }
+
+ }
+
+ {
+ // see http://en.wikipedia.org/wiki/Great-circle_distance#Worked_example
+
+ {
+ Point BNA (-86.67, 36.12);
+ Point LAX (-118.40, 33.94);
+
+ double dist1 = spheredist_deg(BNA, LAX);
+ double dist2 = spheredist_deg(LAX, BNA);
+
+ // target is 0.45306
+ assert( 0.45305 <= dist1 && dist1 <= 0.45307 );
+ assert( 0.45305 <= dist2 && dist2 <= 0.45307 );
+ }
+ {
+ Point BNA (-1.5127, 0.6304);
+ Point LAX (-2.0665, 0.5924);
+
+ double dist1 = spheredist_rad(BNA, LAX);
+ double dist2 = spheredist_rad(LAX, BNA);
+
+ // target is 0.45306
+ assert( 0.45305 <= dist1 && dist1 <= 0.45307 );
+ assert( 0.45305 <= dist2 && dist2 <= 0.45307 );
+ }
+ {
+ Point JFK (-73.77694444, 40.63861111 );
+ Point LAX (-118.40, 33.94);
+
+ double dist = spheredist_deg(JFK, LAX) * EARTH_RADIUS_MILES;
+ assert( dist > 2469 && dist < 2470 );
+ }
+
+ {
+ Point BNA (-86.67, 36.12);
+ Point LAX (-118.40, 33.94);
+ Point JFK (-73.77694444, 40.63861111 );
+ assert( spheredist_deg(BNA, BNA) < 1e-6);
+ assert( spheredist_deg(LAX, LAX) < 1e-6);
+ assert( spheredist_deg(JFK, JFK) < 1e-6);
+
+ Point zero (0, 0);
+ Point antizero (0,-180);
+
+ // these were known to cause NaN
+ assert( spheredist_deg(zero, zero) < 1e-6);
+ assert( fabs(M_PI-spheredist_deg(zero, antizero)) < 1e-6);
+ assert( fabs(M_PI-spheredist_deg(antizero, zero)) < 1e-6);
+ }
+ }
+ }
+ } geoUnitTest;
+
+
+}
+
diff --git a/src/mongo/db/geo/core.h b/src/mongo/db/geo/core.h
new file mode 100644
index 00000000000..c49131e0162
--- /dev/null
+++ b/src/mongo/db/geo/core.h
@@ -0,0 +1,550 @@
+// core.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../../pch.h"
+#include "../jsobj.h"
+
+#include <cmath>
+
+#ifndef M_PI
+# define M_PI 3.14159265358979323846
+#endif
+
+namespace mongo {
+
+ class GeoBitSets {
+ public:
+ GeoBitSets() {
+ for ( int i=0; i<32; i++ ) {
+ masks32[i] = ( 1 << ( 31 - i ) );
+ }
+ for ( int i=0; i<64; i++ ) {
+ masks64[i] = ( 1LL << ( 63 - i ) );
+ }
+
+ for ( unsigned i=0; i<16; i++ ) {
+ unsigned fixed = 0;
+ for ( int j=0; j<4; j++ ) {
+ if ( i & ( 1 << j ) )
+ fixed |= ( 1 << ( j * 2 ) );
+ }
+ hashedToNormal[fixed] = i;
+ }
+
+ long long currAllX = 0, currAllY = 0;
+ for ( int i = 0; i < 64; i++ ){
+ if( i % 2 == 0 ){
+ allX[ i / 2 ] = currAllX;
+ currAllX = currAllX + ( 1LL << ( 63 - i ) );
+ }
+ else{
+ allY[ i / 2 ] = currAllY;
+ currAllY = currAllY + ( 1LL << ( 63 - i ) );
+ }
+ }
+ }
+ int masks32[32];
+ long long masks64[64];
+ long long allX[32];
+ long long allY[32];
+
+ unsigned hashedToNormal[256];
+ };
+
+ extern GeoBitSets geoBitSets;
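+
+ // e.g. geoBitSets.hashedToNormal[0x11] == 0x5 : the table inverts the spreading of
+ // a 4-bit value onto the even bit positions, which GeoHash::unhash_fast (below)
+ // uses to de-interleave a hash one byte at a time.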
+
+ class GeoHash {
+ public:
+
+ GeoHash()
+ : _hash(0),_bits(0) {
+ }
+
+ explicit GeoHash( const char * hash ) {
+ init( hash );
+ }
+
+ explicit GeoHash( const string& hash ) {
+ init( hash );
+ }
+
+ static GeoHash makeFromBinData(const char *bindata, unsigned bits) {
+ GeoHash h;
+ h._bits = bits;
+ h._copy( (char*)&h._hash , bindata );
+ h._fix();
+ return h;
+ }
+
+ explicit GeoHash( const BSONElement& e , unsigned bits=32 ) {
+ _bits = bits;
+ if ( e.type() == BinData ) {
+ int len = 0;
+ _copy( (char*)&_hash , e.binData( len ) );
+ assert( len == 8 );
+ _bits = bits;
+ }
+ else {
+ cout << "GeoHash bad element: " << e << endl;
+ uassert(13047,"wrong type for geo index. if you're using a pre-release version, need to rebuild index",0);
+ }
+ _fix();
+ }
+
+ GeoHash( unsigned x , unsigned y , unsigned bits=32) {
+ init( x , y , bits );
+ }
+
+ GeoHash( const GeoHash& old ) {
+ _hash = old._hash;
+ _bits = old._bits;
+ }
+
+ GeoHash( long long hash , unsigned bits )
+ : _hash( hash ) , _bits( bits ) {
+ _fix();
+ }
+
+ void init( unsigned x , unsigned y , unsigned bits ) {
+ assert( bits <= 32 );
+ _hash = 0;
+ _bits = bits;
+ for ( unsigned i=0; i<bits; i++ ) {
+ if ( isBitSet( x , i ) ) _hash |= geoBitSets.masks64[i*2];
+ if ( isBitSet( y , i ) ) _hash |= geoBitSets.masks64[(i*2)+1];
+ }
+ }
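+
+ // An illustrative round-trip of the interleaving above (excluded from the
+ // build): bits of x land at even positions counting from the high end of
+ // the hash, bits of y at odd positions.
+#if 0
+ void interleaveExample() {
+ GeoHash h( 0xFFFFFFFFu /*x*/ , 0u /*y*/ , 32 );
+ // h.toString() is "10" repeated 32 times: even bits from x, odd bits from y
+ unsigned x , y;
+ h.unhash_slow( x , y );
+ assert( x == 0xFFFFFFFFu && y == 0u ); // unhash inverts the interleave
+ }
+#endif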
+
+ void unhash_fast( unsigned& x , unsigned& y ) const {
+ x = 0;
+ y = 0;
+ char * c = (char*)(&_hash);
+ for ( int i=0; i<8; i++ ) {
+ unsigned t = (unsigned)(c[i]) & 0x55;
+ y |= ( geoBitSets.hashedToNormal[t] << (4*(i)) );
+
+ t = ( (unsigned)(c[i]) >> 1 ) & 0x55;
+ x |= ( geoBitSets.hashedToNormal[t] << (4*(i)) );
+ }
+ }
+
+ void unhash_slow( unsigned& x , unsigned& y ) const {
+ x = 0;
+ y = 0;
+ for ( unsigned i=0; i<_bits; i++ ) {
+ if ( getBitX(i) )
+ x |= geoBitSets.masks32[i];
+ if ( getBitY(i) )
+ y |= geoBitSets.masks32[i];
+ }
+ }
+
+ void unhash( unsigned& x , unsigned& y ) const {
+ unhash_fast( x , y );
+ }
+
+ /**
+ * @param bit the bit position within val, where bit 0 is the highest-order bit
+ */
+ static bool isBitSet( unsigned val , unsigned bit ) {
+ return geoBitSets.masks32[bit] & val;
+ }
+
+ GeoHash up() const {
+ return GeoHash( _hash , _bits - 1 );
+ }
+
+ bool hasPrefix( const GeoHash& other ) const {
+ assert( other._bits <= _bits );
+ if ( other._bits == 0 )
+ return true;
+ long long x = other._hash ^ _hash;
+ x = x >> (64-(other._bits*2));
+ return x == 0;
+ }
+
+
+ string toString() const {
+ StringBuilder buf( _bits * 2 );
+ for ( unsigned x=0; x<_bits*2; x++ )
+ buf.append( _hash & geoBitSets.masks64[x] ? "1" : "0" );
+ return buf.str();
+ }
+
+ string toStringHex1() const {
+ stringstream ss;
+ ss << hex << _hash;
+ return ss.str();
+ }
+
+ void init( const string& s ) {
+ _hash = 0;
+ _bits = s.size() / 2;
+ for ( unsigned pos=0; pos<s.size(); pos++ )
+ if ( s[pos] == '1' )
+ setBit( pos , 1 );
+ }
+
+ void setBit( unsigned pos , bool one ) {
+ assert( pos < _bits * 2 );
+ if ( one )
+ _hash |= geoBitSets.masks64[pos];
+ else if ( _hash & geoBitSets.masks64[pos] )
+ _hash &= ~geoBitSets.masks64[pos];
+ }
+
+ bool getBit( unsigned pos ) const {
+ return _hash & geoBitSets.masks64[pos];
+ }
+
+ bool getBitX( unsigned pos ) const {
+ assert( pos < 32 );
+ return getBit( pos * 2 );
+ }
+
+ bool getBitY( unsigned pos ) const {
+ assert( pos < 32 );
+ return getBit( ( pos * 2 ) + 1 );
+ }
+
+ BSONObj wrap( const char* name = "" ) const {
+ BSONObjBuilder b(20);
+ append( b , name );
+ BSONObj o = b.obj();
+ if( ! strlen( name ) ) assert( o.objsize() == 20 );
+ return o;
+ }
+
+ bool constrains() const {
+ return _bits > 0;
+ }
+
+ bool canRefine() const {
+ return _bits < 32;
+ }
+
+ bool atMinX() const {
+ return ( _hash & geoBitSets.allX[ _bits ] ) == 0;
+ }
+
+ bool atMinY() const {
+ //log() << " MinY : " << hex << (unsigned long long) _hash << " " << _bits << " " << hex << (unsigned long long) geoBitSets.allY[ _bits ] << endl;
+ return ( _hash & geoBitSets.allY[ _bits ] ) == 0;
+ }
+
+ bool atMaxX() const {
+ return ( _hash & geoBitSets.allX[ _bits ] ) == geoBitSets.allX[ _bits ];
+ }
+
+ bool atMaxY() const {
+ return ( _hash & geoBitSets.allY[ _bits ] ) == geoBitSets.allY[ _bits ];
+ }
+
+ void move( int x , int y ) {
+ assert( _bits );
+ _move( 0 , x );
+ _move( 1 , y );
+ }
+
+ void _move( unsigned offset , int d ) {
+ if ( d == 0 )
+ return;
+ assert( d <= 1 && d>= -1 ); // TEMP
+
+ bool from, to;
+ if ( d > 0 ) {
+ from = 0;
+ to = 1;
+ }
+ else {
+ from = 1;
+ to = 0;
+ }
+
+ unsigned pos = ( _bits * 2 ) - 1;
+ if ( offset == 0 )
+ pos--;
+ while ( true ) {
+ if ( getBit(pos) == from ) {
+ setBit( pos , to );
+ return;
+ }
+
+ if ( pos < 2 ) {
+ // overflow
+ for ( ; pos < ( _bits * 2 ) ; pos += 2 ) {
+ setBit( pos , from );
+ }
+ return;
+ }
+
+ setBit( pos , from );
+ pos -= 2;
+ }
+
+ assert(0);
+ }
+
+ GeoHash& operator=(const GeoHash& h) {
+ _hash = h._hash;
+ _bits = h._bits;
+ return *this;
+ }
+
+ bool operator==(const GeoHash& h ) const {
+ return _hash == h._hash && _bits == h._bits;
+ }
+
+ bool operator!=(const GeoHash& h ) const {
+ return !( *this == h );
+ }
+
+ bool operator<(const GeoHash& h ) const {
+ if( _hash != h._hash ) return _hash < h._hash;
+ return _bits < h._bits;
+ }
+
+ GeoHash& operator+=( const char * s ) {
+ unsigned pos = _bits * 2;
+ _bits += strlen(s) / 2;
+ assert( _bits <= 32 );
+ while ( s[0] ) {
+ if ( s[0] == '1' )
+ setBit( pos , 1 );
+ pos++;
+ s++;
+ }
+
+ return *this;
+ }
+
+ GeoHash operator+( const char * s ) const {
+ GeoHash n = *this;
+ n+=s;
+ return n;
+ }
+
+ GeoHash operator+( string s ) const {
+ return operator+( s.c_str() );
+ }
+
+ void _fix() {
+ static long long FULL = 0xFFFFFFFFFFFFFFFFLL;
+ if ( _bits == 0 ) { _hash = 0; return; } // shifting by 64 below would be undefined
+ long long mask = FULL << ( 64 - ( _bits * 2 ) );
+ _hash &= mask;
+ }
+
+ void append( BSONObjBuilder& b , const char * name ) const {
+ char buf[8];
+ _copy( buf , (char*)&_hash );
+ b.appendBinData( name , 8 , bdtCustom , buf );
+ }
+
+ long long getHash() const {
+ return _hash;
+ }
+
+ unsigned getBits() const {
+ return _bits;
+ }
+
+ GeoHash commonPrefix( const GeoHash& other ) const {
+ unsigned i=0;
+ for ( ; i<_bits && i<other._bits; i++ ) {
+ if ( getBitX( i ) == other.getBitX( i ) &&
+ getBitY( i ) == other.getBitY( i ) )
+ continue;
+ break;
+ }
+ return GeoHash(_hash,i);
+ }
+
+ private:
+
+ static void _copy( char * dst , const char * src ) {
+ for ( unsigned a=0; a<8; a++ ) {
+ dst[a] = src[7-a];
+ }
+ }
+
+ long long _hash;
+ unsigned _bits; // bits per field, so 0 to 32 (0 means unconstrained)
+ };
+
+ inline ostream& operator<<( ostream &s, const GeoHash &h ) {
+ s << h.toString();
+ return s;
+ }
+
+ class GeoConvert {
+ public:
+ virtual ~GeoConvert() {}
+
+ virtual void unhash( const GeoHash& h , double& x , double& y ) const = 0;
+ virtual GeoHash hash( double x , double y ) const = 0;
+ };
+
+ class Point {
+ public:
+
+ Point( const GeoConvert * g , const GeoHash& hash ) {
+ g->unhash( hash , _x , _y );
+ }
+
+ explicit Point( const BSONElement& e ) {
+ BSONObjIterator i(e.Obj());
+ _x = i.next().number();
+ _y = i.next().number();
+ }
+
+ explicit Point( const BSONObj& o ) {
+ BSONObjIterator i(o);
+ _x = i.next().number();
+ _y = i.next().number();
+ }
+
+ Point( double x , double y )
+ : _x( x ) , _y( y ) {
+ }
+
+ Point() : _x(0),_y(0) {
+ }
+
+ GeoHash hash( const GeoConvert * g ) {
+ return g->hash( _x , _y );
+ }
+
+ double distance( const Point& p ) const {
+ double a = _x - p._x;
+ double b = _y - p._y;
+
+ // Avoid numerical error if possible...
+ if( a == 0 ) return abs( _y - p._y );
+ if( b == 0 ) return abs( _x - p._x );
+
+ return sqrt( ( a * a ) + ( b * b ) );
+ }
+
+ /**
+ * Distance method that compares x or y coords when other direction is zero,
+ * avoids numerical error when distances are very close to radius but axis-aligned.
+ *
+ * An example of the problem is:
+ * (52.0 - 51.9999) - 0.0001 = 3.31965e-15 and 52.0 - 51.9999 > 0.0001 in double arithmetic
+ * but:
+ * 51.9999 + 0.0001 <= 52.0
+ *
+ * This avoids some (but not all!) surprising results in $center queries where points are
+ * ( radius + center.x, center.y ) or vice-versa.
+ */
+ bool distanceWithin( const Point& p, double radius ) const {
+ double a = _x - p._x;
+ double b = _y - p._y;
+
+ if( a == 0 ) {
+ //
+ // Note: For some unknown reason, when a 32-bit g++ optimizes this call, the sum is
+ // calculated imprecisely. We need to force the compiler to always evaluate it correctly,
+ // hence the weirdness.
+ //
+ // On some 32-bit linux machines, removing the volatile keyword or calculating the sum inline
+ // will make certain geo tests fail. Of course this check will force volatile for all 32-bit systems,
+ // not just affected systems.
+ if( sizeof(void*) <= 4 ){
+ volatile double sum = _y > p._y ? p._y + radius : _y + radius;
+ return _y > p._y ? sum >= _y : sum >= p._y;
+ }
+ else {
+ // Original math, correct for most systems
+ return _y > p._y ? p._y + radius >= _y : _y + radius >= p._y;
+ }
+ }
+ if( b == 0 ) {
+ if( sizeof(void*) <= 4 ){
+ volatile double sum = _x > p._x ? p._x + radius : _x + radius;
+ return _x > p._x ? sum >= _x : sum >= p._x;
+ }
+ else {
+ return _x > p._x ? p._x + radius >= _x : _x + radius >= p._x;
+ }
+ }
+
+ return sqrt( ( a * a ) + ( b * b ) ) <= radius;
+ }
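+
+ // A sketch of the failure mode handled above, following the worked example in
+ // the comment (illustrative only, excluded from the build):
+#if 0
+ void borderExample() {
+ Point center( 51.9999 , 0 );
+ Point query( 52.0 , 0 );
+ // a subtract-then-compare check fails here, since ( 52.0 - 51.9999 ) > 0.0001
+ // in double arithmetic; the add-then-compare form above keeps the point inside
+ bool in = query.distanceWithin( center , 0.0001 ); // true
+ }
+#endif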
+
+ string toString() const {
+ StringBuilder buf(32);
+ buf << "(" << _x << "," << _y << ")";
+ return buf.str();
+
+ }
+
+ double _x;
+ double _y;
+ };
+
+
+ extern const double EARTH_RADIUS_KM;
+ extern const double EARTH_RADIUS_MILES;
+
+ // Technically lat/long bounds, not really tied to earth radius.
+ inline void checkEarthBounds( Point p ) {
+ uassert( 14808, str::stream() << "point " << p.toString() << " must be in earth-like bounds of long : [-180, 180), lat : [-90, 90] ",
+ p._x >= -180 && p._x < 180 && p._y >= -90 && p._y <= 90 );
+ }
+
+ inline double deg2rad(double deg) { return deg * (M_PI/180); }
+ inline double rad2deg(double rad) { return rad * (180/M_PI); }
+
+ // WARNING: _x and _y MUST be longitude and latitude in that order
+ // note: multiply by earth radius for distance
+ inline double spheredist_rad( const Point& p1, const Point& p2 ) {
+ // this uses the n-vector formula: http://en.wikipedia.org/wiki/N-vector
+ // If you try to match the code to the formula, note that I inline the cross-product.
+ // TODO: optimize with SSE
+
+ double sin_x1(sin(p1._x)), cos_x1(cos(p1._x));
+ double sin_y1(sin(p1._y)), cos_y1(cos(p1._y));
+ double sin_x2(sin(p2._x)), cos_x2(cos(p2._x));
+ double sin_y2(sin(p2._y)), cos_y2(cos(p2._y));
+
+ double cross_prod =
+ (cos_y1*cos_x1 * cos_y2*cos_x2) +
+ (cos_y1*sin_x1 * cos_y2*sin_x2) +
+ (sin_y1 * sin_y2);
+
+ if (cross_prod >= 1 || cross_prod <= -1) {
+ // fun with floats
+ assert( fabs(cross_prod)-1 < 1e-6 );
+ return cross_prod > 0 ? 0 : M_PI;
+ }
+
+ return acos(cross_prod);
+ }
+
+ // note: return is still in radians as that can be multiplied by radius to get arc length
+ inline double spheredist_deg( const Point& p1, const Point& p2 ) {
+ return spheredist_rad(
+ Point( deg2rad(p1._x), deg2rad(p1._y) ),
+ Point( deg2rad(p2._x), deg2rad(p2._y) )
+ );
+ }
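+
+ // Usage sketch: spheredist_* return an angle in radians, so scale by an earth
+ // radius for a surface distance (illustrative only, excluded from the build):
+#if 0
+ inline double milesBetween( const Point& a , const Point& b ) {
+ return spheredist_deg( a , b ) * EARTH_RADIUS_MILES;
+ }
+ // e.g. JFK (-73.77694444, 40.63861111) to LAX (-118.40, 33.94) is ~2469 miles
+#endif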
+
+}
diff --git a/src/mongo/db/geo/haystack.cpp b/src/mongo/db/geo/haystack.cpp
new file mode 100644
index 00000000000..104665087f6
--- /dev/null
+++ b/src/mongo/db/geo/haystack.cpp
@@ -0,0 +1,318 @@
+// db/geo/haystack.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../namespace-inl.h"
+#include "../jsobj.h"
+#include "../index.h"
+#include "../../util/unittest.h"
+#include "../commands.h"
+#include "../pdfile.h"
+#include "../btree.h"
+#include "../curop-inl.h"
+#include "../matcher.h"
+#include "core.h"
+#include "../../util/timer.h"
+
+#define GEOQUADDEBUG(x)
+//#define GEOQUADDEBUG(x) cout << x << endl
+
+/**
+ * This is a geo-based search piece, which is different from the regular geo lookup.
+ * It is useful when you want to look for something within a region where the match ratio is low.
+ * It works well for searching for restaurants within 25 miles with a certain name;
+ * it should not be used for finding the closest restaurants that are open.
+ */
+namespace mongo {
+
+ string GEOSEARCHNAME = "geoHaystack";
+
+ class GeoHaystackSearchHopper {
+ public:
+ GeoHaystackSearchHopper( const BSONObj& n , double maxDistance , unsigned limit , const string& geoField )
+ : _near( n ) , _maxDistance( maxDistance ) , _limit( limit ) , _geoField(geoField) {
+
+ }
+
+ void got( const DiskLoc& loc ) {
+ Point p( loc.obj().getFieldDotted( _geoField ) );
+ if ( _near.distance( p ) > _maxDistance )
+ return;
+ _locs.push_back( loc );
+ }
+
+ int append( BSONArrayBuilder& b ) {
+ for ( unsigned i=0; i<_locs.size() && i<_limit; i++ )
+ b.append( _locs[i].obj() );
+ return _locs.size();
+ }
+
+ Point _near;
+ double _maxDistance;
+ unsigned _limit;
+ string _geoField;
+
+ vector<DiskLoc> _locs;
+ };
+
+ class GeoHaystackSearchIndex : public IndexType {
+
+ public:
+
+ GeoHaystackSearchIndex( const IndexPlugin* plugin , const IndexSpec* spec )
+ : IndexType( plugin , spec ) {
+
+ BSONElement e = spec->info["bucketSize"];
+ uassert( 13321 , "need bucketSize" , e.isNumber() );
+ _bucketSize = e.numberDouble();
+
+ BSONObjBuilder orderBuilder;
+
+ BSONObjIterator i( spec->keyPattern );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.type() == String && GEOSEARCHNAME == e.valuestr() ) {
+ uassert( 13314 , "can't have 2 geo fields" , _geo.size() == 0 );
+ uassert( 13315 , "2d has to be first in index" , _other.size() == 0 );
+ _geo = e.fieldName();
+ }
+ else {
+ _other.push_back( e.fieldName() );
+ }
+ orderBuilder.append( "" , 1 );
+ }
+
+ uassert( 13316 , "no geo field specified" , _geo.size() );
+ uassert( 13317 , "no other fields specified" , _other.size() );
+ uassert( 13326 , "quadrant search can only have 1 other field for now" , _other.size() == 1 );
+ _order = orderBuilder.obj();
+ }
+
+ int hash( const BSONElement& e ) const {
+ uassert( 13322 , "not a number" , e.isNumber() );
+ return hash( e.numberDouble() );
+ }
+
+ int hash( double d ) const {
+ d += 180;
+ d /= _bucketSize;
+ return (int)d;
+ }
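+
+ // e.g. with bucketSize == 1 : hash(-179.9) == 0 and hash(0.0) == 180, so
+ // coordinates falling in the same bucketSize-wide band share a hash value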
+
+ string makeString( int hashedX , int hashedY ) const {
+ stringstream ss;
+ ss << hashedX << "_" << hashedY;
+ return ss.str();
+ }
+
+ void _add( const BSONObj& obj, const string& root , const BSONElement& e , BSONObjSet& keys ) const {
+ BSONObjBuilder buf;
+ buf.append( "" , root );
+ if ( e.eoo() )
+ buf.appendNull( "" );
+ else
+ buf.appendAs( e , "" );
+
+ BSONObj key = buf.obj();
+ GEOQUADDEBUG( obj << "\n\t" << root << "\n\t" << key );
+ keys.insert( key );
+ }
+
+ void getKeys( const BSONObj &obj, BSONObjSet &keys ) const {
+
+ BSONElement loc = obj.getFieldDotted( _geo );
+ if ( loc.eoo() )
+ return;
+
+ uassert( 13323 , "latlng not an array" , loc.isABSONObj() );
+ string root;
+ {
+ BSONObjIterator i( loc.Obj() );
+ BSONElement x = i.next();
+ BSONElement y = i.next();
+ root = makeString( hash(x) , hash(y) );
+ }
+
+
+ assert( _other.size() == 1 );
+
+ BSONElementSet all;
+ obj.getFieldsDotted( _other[0] , all );
+
+ if ( all.size() == 0 ) {
+ _add( obj , root , BSONElement() , keys );
+ }
+ else {
+ for ( BSONElementSet::iterator i=all.begin(); i!=all.end(); ++i ) {
+ _add( obj , root , *i , keys );
+ }
+ }
+
+ }
+
+ shared_ptr<Cursor> newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const {
+ shared_ptr<Cursor> c;
+ assert(0);
+ return c;
+ }
+
+ void searchCommand( NamespaceDetails* nsd , int idxNo ,
+ const BSONObj& n /*near*/ , double maxDistance , const BSONObj& search ,
+ BSONObjBuilder& result , unsigned limit ) {
+
+ Timer t;
+
+ log(1) << "SEARCH near:" << n << " maxDistance:" << maxDistance << " search: " << search << endl;
+ int x,y;
+ {
+ BSONObjIterator i( n );
+ x = hash( i.next() );
+ y = hash( i.next() );
+ }
+ int scale = (int)ceil( maxDistance / _bucketSize );
+
+ GeoHaystackSearchHopper hopper(n,maxDistance,limit,_geo);
+
+ long long btreeMatches = 0;
+
+ for ( int a=-scale; a<=scale; a++ ) {
+ for ( int b=-scale; b<=scale; b++ ) {
+
+ BSONObjBuilder bb;
+ bb.append( "" , makeString( x + a , y + b ) );
+ for ( unsigned i=0; i<_other.size(); i++ ) {
+ BSONElement e = search.getFieldDotted( _other[i] );
+ if ( e.eoo() )
+ bb.appendNull( "" );
+ else
+ bb.appendAs( e , "" );
+ }
+
+ BSONObj key = bb.obj();
+
+ GEOQUADDEBUG( "KEY: " << key );
+
+ set<DiskLoc> thisPass;
+ scoped_ptr<BtreeCursor> cursor( BtreeCursor::make( nsd , idxNo , *getDetails() , key , key , true , 1 ) );
+ while ( cursor->ok() ) {
+ pair<set<DiskLoc>::iterator, bool> p = thisPass.insert( cursor->currLoc() );
+ if ( p.second ) {
+ hopper.got( cursor->currLoc() );
+ GEOQUADDEBUG( "\t" << cursor->current() );
+ btreeMatches++;
+ }
+ cursor->advance();
+ }
+ }
+
+ }
+
+ BSONArrayBuilder arr( result.subarrayStart( "results" ) );
+ int num = hopper.append( arr );
+ arr.done();
+
+ {
+ BSONObjBuilder b( result.subobjStart( "stats" ) );
+ b.append( "time" , t.millis() );
+ b.appendNumber( "btreeMatches" , btreeMatches );
+ b.append( "n" , num );
+ b.done();
+ }
+ }
+
+ const IndexDetails* getDetails() const {
+ return _spec->getDetails();
+ }
+
+ string _geo;
+ vector<string> _other;
+
+ BSONObj _order;
+
+ double _bucketSize;
+ };
+
+ class GeoHaystackSearchIndexPlugin : public IndexPlugin {
+ public:
+ GeoHaystackSearchIndexPlugin() : IndexPlugin( GEOSEARCHNAME ) {
+ }
+
+ virtual IndexType* generate( const IndexSpec* spec ) const {
+ return new GeoHaystackSearchIndex( this , spec );
+ }
+
+ } nameIndexPlugin;
+
+
+ class GeoHaystackSearchCommand : public Command {
+ public:
+ GeoHaystackSearchCommand() : Command( "geoSearch" ) {}
+ virtual LockType locktype() const { return READ; }
+ bool slaveOk() const { return true; }
+ bool slaveOverrideOk() const { return true; }
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+
+ string ns = dbname + "." + cmdObj.firstElement().valuestr();
+
+ NamespaceDetails * d = nsdetails( ns.c_str() );
+ if ( ! d ) {
+ errmsg = "can't find ns";
+ return false;
+ }
+
+ vector<int> idxs;
+ d->findIndexByType( GEOSEARCHNAME , idxs );
+ if ( idxs.size() == 0 ) {
+ errmsg = "no geoSearch index";
+ return false;
+ }
+ if ( idxs.size() > 1 ) {
+ errmsg = "more than 1 geosearch index";
+ return false;
+ }
+
+ int idxNum = idxs[0];
+
+ IndexDetails& id = d->idx( idxNum );
+ GeoHaystackSearchIndex * si = (GeoHaystackSearchIndex*)id.getSpec().getType();
+ assert( &id == si->getDetails() );
+
+ BSONElement n = cmdObj["near"];
+ BSONElement maxDistance = cmdObj["maxDistance"];
+ BSONElement search = cmdObj["search"];
+
+ uassert( 13318 , "near needs to be an array" , n.isABSONObj() );
+ uassert( 13319 , "maxDistance needs a number" , maxDistance.isNumber() );
+ uassert( 13320 , "search needs to be an object" , search.type() == Object );
+
+ unsigned limit = 50;
+ if ( cmdObj["limit"].isNumber() )
+ limit = (unsigned)cmdObj["limit"].numberInt();
+
+ si->searchCommand( d , idxNum , n.Obj() , maxDistance.numberDouble() , search.Obj() , result , limit );
+
+ return true;
+ }
+
+ } nameSearchCommand;
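+
+ // For reference, a command document in the shape run() above parses (illustrative
+ // only, excluded from the build; the collection name and values are placeholders):
+#if 0
+ BSONObj makeGeoSearchCmd() {
+ return BSON( "geoSearch" << "places"
+ << "near" << BSON_ARRAY( -73.97 << 40.77 )
+ << "maxDistance" << 10 // same units as the indexed coordinates
+ << "search" << BSON( "type" << "restaurant" ) // matched against the non-geo field
+ << "limit" << 30 );
+ }
+#endif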
+
+
+
+
+
+}
diff --git a/src/mongo/db/globals.h b/src/mongo/db/globals.h
new file mode 100644
index 00000000000..093bec76a0e
--- /dev/null
+++ b/src/mongo/db/globals.h
@@ -0,0 +1,54 @@
+// @file globals.h
+// grouping of global variables to make concurrency work clearer
+
+#pragma once
+
+namespace mongo {
+
+ void assertStartingUp();
+
+ // this is a prototype for now; we'll see if it is helpful
+
+ /** "value is Const After Server Init" helper
+ *
+ * Example:
+ *
+ * casi<int> foo = 3;
+ * foo.ref() = 4; // asserts if not still in server init
+ * int x = foo+1; // ok anytime
+ *
+ */
+ template< class T >
+ class casi : boost::noncopyable {
+ T val;
+ public:
+ casi(const T& t) : val(t) {
+ DEV assertStartingUp();
+ }
+ operator const T& () { return val; }
+ T& ref() {
+ DEV assertStartingUp();
+ return val;
+ }
+ };
+
+ /** partially specialized for cases where our global variable is a pointer -- we want the value
+ * pointed at to be constant, not just the pointer itself
+ */
+ template< typename T >
+ class casi<T*> : boost::noncopyable {
+ T * val;
+ void operator=(T*);
+ public:
+ casi(T* t) : val(t) {
+ DEV assertStartingUp();
+ }
+ operator const T* () { return val; }
+ const T* get() { return val; }
+ T*& ref() {
+ DEV assertStartingUp();
+ return val;
+ }
+ };
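+
+ /** usage sketch for the pointer form ("Settings" here is illustrative):
+ *
+ * casi<Settings*> theSettings = new Settings();
+ * theSettings.ref() = other; // asserts if not still in server init
+ * const Settings* s = theSettings; // read access ok anytime
+ */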
+
+}
diff --git a/src/mongo/db/helpers/dblogger.h b/src/mongo/db/helpers/dblogger.h
new file mode 100644
index 00000000000..4d6ee6d78c4
--- /dev/null
+++ b/src/mongo/db/helpers/dblogger.h
@@ -0,0 +1,31 @@
+// @file dblogger.h
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+namespace mongo {
+
+ /** helper to write (and read back) a log stored as a capped collection in the database */
+ class DBLogger {
+ bool _inited;
+ public:
+ const string _ns;
+ DBLogger(string ns) : _inited(false), _ns(ns) { }
+ };
+
+}
diff --git a/src/mongo/db/index.cpp b/src/mongo/db/index.cpp
new file mode 100644
index 00000000000..5eaeab551df
--- /dev/null
+++ b/src/mongo/db/index.cpp
@@ -0,0 +1,446 @@
+/** @file index.cpp */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "namespace-inl.h"
+#include "index.h"
+#include "btree.h"
+#include "background.h"
+#include "repl/rs.h"
+#include "ops/delete.h"
+
+
+namespace mongo {
+
+ template< class V >
+ class IndexInterfaceImpl : public IndexInterface {
+ public:
+ typedef typename V::KeyOwned KeyOwned;
+ typedef Continuation<V> Cont;
+ virtual int keyCompare(const BSONObj& l,const BSONObj& r, const Ordering &ordering);
+
+ Cont *c[NamespaceDetails::NIndexesMax];
+ int n;
+
+ public:
+ IndexInterfaceImpl() { n = 0; }
+
+        /* TODO CONCURRENCY: lacking write concurrency support, this handles only one writer at a time */
+ void _phasedBegin() {
+ // we do this here as phasedFinish can throw exceptions (we could catch there, but just as easy to do here)
+ for( int i = 0; i < n; i++ ) {
+ delete c[i];
+ c[i] = 0; // defensive
+ }
+ n = 0;
+ }
+ void phasedQueueItemToInsert(
+ int idxNo,
+ DiskLoc thisLoc, DiskLoc _recordLoc, const BSONObj &_key,
+ const Ordering& _order, IndexDetails& _idx, bool dupsAllowed)
+ {
+ if( idxNo >= n )
+ n = idxNo + 1;
+ Cont *C = c[idxNo] = new Cont(thisLoc, _recordLoc, _key, _order, _idx);
+ thisLoc.btree<V>()->twoStepInsert(thisLoc, *C, dupsAllowed);
+ }
+ void _phasedFinish() {
+ for( int i = 0; i < n; i++ ) {
+                // nulls can be present in the list, but only when v0 and v1 indexes are mixed
+ if( c[i] ) {
+ c[i]->stepTwo();
+ }
+ }
+ }
+
+ virtual long long fullValidate(const DiskLoc& thisLoc, const BSONObj &order) {
+ return thisLoc.btree<V>()->fullValidate(thisLoc, order);
+ }
+ virtual DiskLoc findSingle(const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key) const {
+ return thisLoc.btree<V>()->findSingle(indexdetails,thisLoc,key);
+ }
+ virtual bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const {
+ return thisLoc.btree<V>()->unindex(thisLoc, id, key, recordLoc);
+ }
+ virtual int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const BSONObj& key, const Ordering &order, bool dupsAllowed,
+ IndexDetails& idx, bool toplevel = true) const {
+ return thisLoc.btree<V>()->bt_insert(thisLoc, recordLoc, key, order, dupsAllowed, idx, toplevel);
+ }
+ virtual DiskLoc addBucket(const IndexDetails& id) {
+ return BtreeBucket<V>::addBucket(id);
+ }
+ virtual void uassertIfDups(IndexDetails& idx, vector<BSONObj*>& addedKeys, DiskLoc head, DiskLoc self, const Ordering& ordering) {
+ const BtreeBucket<V> *h = head.btree<V>();
+ for( vector<BSONObj*>::iterator i = addedKeys.begin(); i != addedKeys.end(); i++ ) {
+ KeyOwned k(**i);
+ bool dup = h->wouldCreateDup(idx, head, k, ordering, self);
+ uassert( 11001 , h->dupKeyError( idx , k ) , !dup);
+ }
+ }
+
+ // for geo:
+ virtual bool isUsed(DiskLoc thisLoc, int pos) { return thisLoc.btree<V>()->isUsed(pos); }
+ virtual void keyAt(DiskLoc thisLoc, int pos, BSONObj& key, DiskLoc& recordLoc) {
+ recordLoc = DiskLoc();
+ const BtreeBucket<V>* bucket = thisLoc.btree<V>();
+ int n = bucket->nKeys();
+
+ if( pos < 0 || pos >= n || n == 0xffff /* bucket deleted */ || ! bucket->isUsed( pos ) ){
+ // log() << "Pos: " << pos << " n " << n << endl;
+ return;
+ }
+
+ typename BtreeBucket<V>::KeyNode kn = bucket->keyNode(pos);
+ key = kn.key.toBson();
+ recordLoc = kn.recordLoc;
+ }
+ virtual BSONObj keyAt(DiskLoc thisLoc, int pos) {
+ return thisLoc.btree<V>()->keyAt(pos).toBson();
+ }
+ virtual DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
+ int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) {
+ return thisLoc.btree<V>()->locate(idx, thisLoc, key, order, pos, found, recordLoc, direction);
+ }
+ virtual DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) {
+ return thisLoc.btree<V>()->advance(thisLoc,keyOfs,direction,caller);
+ }
+ };
+
+ int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o); // key.cpp
+
+ template <>
+ int IndexInterfaceImpl< V0 >::keyCompare(const BSONObj& l, const BSONObj& r, const Ordering &ordering) {
+ return oldCompare(l, r, ordering);
+ }
+
+ template <>
+ int IndexInterfaceImpl< V1 >::keyCompare(const BSONObj& l, const BSONObj& r, const Ordering &ordering) {
+ return l.woCompare(r, ordering, /*considerfieldname*/false);
+ }
+
+ IndexInterfaceImpl<V0> iii_v0;
+ IndexInterfaceImpl<V1> iii_v1;
+
+ IndexInterface *IndexDetails::iis[] = { &iii_v0, &iii_v1 };
+
+ void IndexInterface::phasedBegin() {
+ iii_v0._phasedBegin();
+ iii_v1._phasedBegin();
+ }
+ void IndexInterface::phasedFinish() {
+ iii_v0._phasedFinish();
+ iii_v1._phasedFinish();
+ }
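+
+    /* illustrative call sequence for the phased (two-step) insert above -- a sketch
+       of the intended protocol, not an actual call site:
+
+         IndexInterface::phasedBegin();    // discards any leftover continuations
+         ii.phasedQueueItemToInsert(idxNo, thisLoc, recordLoc, key, order, idx, dupsAllowed); // step one, per index
+         IndexInterface::phasedFinish();   // runs stepTwo() on each queued insert
+    */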
+
+ int removeFromSysIndexes(const char *ns, const char *idxName) {
+ string system_indexes = cc().database()->name + ".system.indexes";
+ BSONObjBuilder b;
+ b.append("ns", ns);
+ b.append("name", idxName); // e.g.: { name: "ts_1", ns: "foo.coll" }
+ BSONObj cond = b.done();
+ return (int) deleteObjects(system_indexes.c_str(), cond, false, false, true);
+ }
+
+    /* this is just an attempt to clean up old orphaned entries on a "delete all
+       indexes" call.  repair database is the clean solution, but this gives a
+       lighter-weight partial option.  see dropIndexes()
+    */
+ void assureSysIndexesEmptied(const char *ns, IndexDetails *idIndex) {
+ string system_indexes = cc().database()->name + ".system.indexes";
+ BSONObjBuilder b;
+ b.append("ns", ns);
+ if( idIndex ) {
+ b.append("name", BSON( "$ne" << idIndex->indexName().c_str() ));
+ }
+ BSONObj cond = b.done();
+ int n = (int) deleteObjects(system_indexes.c_str(), cond, false, false, true);
+ if( n ) {
+ log() << "info: assureSysIndexesEmptied cleaned up " << n << " entries" << endl;
+ }
+ }
+
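+    // example (a sketch): with keyPattern { lastname:1, firstname:1 },
+    // keyPatternOffset("firstname") == 1 and keyPatternOffset("age") == -1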
+ int IndexDetails::keyPatternOffset( const string& key ) const {
+ BSONObjIterator i( keyPattern() );
+ int n = 0;
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( key == e.fieldName() )
+ return n;
+ n++;
+ }
+ return -1;
+ }
+
+ const IndexSpec& IndexDetails::getSpec() const {
+ SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
+ return NamespaceDetailsTransient::get_inlock( info.obj()["ns"].valuestr() ).getIndexSpec( this );
+ }
+
+ /* delete this index. does NOT clean up the system catalog
+ (system.indexes or system.namespaces) -- only NamespaceIndex.
+ */
+ void IndexDetails::kill_idx() {
+ string ns = indexNamespace(); // e.g. foo.coll.$ts_1
+ try {
+
+ string pns = parentNS(); // note we need a copy, as parentNS() won't work after the drop() below
+
+ // clean up parent namespace index cache
+ NamespaceDetailsTransient::get( pns.c_str() ).deletedIndex();
+
+ string name = indexName();
+
+ /* important to catch exception here so we can finish cleanup below. */
+ try {
+ dropNS(ns.c_str());
+ }
+ catch(DBException& ) {
+ log(2) << "IndexDetails::kill(): couldn't drop ns " << ns << endl;
+ }
+ head.setInvalid();
+ info.setInvalid();
+
+ // clean up in system.indexes. we do this last on purpose.
+ int n = removeFromSysIndexes(pns.c_str(), name.c_str());
+ wassert( n == 1 );
+
+ }
+ catch ( DBException &e ) {
+ log() << "exception in kill_idx: " << e << ", ns: " << ns << endl;
+ }
+ }
+
+ void IndexDetails::getKeysFromObject( const BSONObj& obj, BSONObjSet& keys) const {
+ getSpec().getKeys( obj, keys );
+ }
+
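+    /* computes the keys in l that are not in r.  illustrative example:
+       l = { {"":1}, {"":2}, {"":3} }, r = { {"":2} }  =>  diff = [ {"":1}, {"":3} ] */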
+ void setDifference(BSONObjSet &l, BSONObjSet &r, vector<BSONObj*> &diff) {
+ // l and r must use the same ordering spec.
+ verify( 14819, l.key_comp().order() == r.key_comp().order() );
+ BSONObjSet::iterator i = l.begin();
+ BSONObjSet::iterator j = r.begin();
+ while ( 1 ) {
+ if ( i == l.end() )
+ break;
+ while ( j != r.end() && j->woCompare( *i ) < 0 )
+ j++;
+ if ( j == r.end() || i->woCompare(*j) != 0 ) {
+ const BSONObj *jo = &*i;
+ diff.push_back( (BSONObj *) jo );
+ }
+ i++;
+ }
+ }
+
+ void getIndexChanges(vector<IndexChanges>& v, NamespaceDetails& d, BSONObj newObj, BSONObj oldObj, bool &changedId) {
+ int z = d.nIndexesBeingBuilt();
+ v.resize(z);
+ for( int i = 0; i < z; i++ ) {
+ IndexDetails& idx = d.idx(i);
+ BSONObj idxKey = idx.info.obj().getObjectField("key"); // eg { ts : 1 }
+ IndexChanges& ch = v[i];
+ idx.getKeysFromObject(oldObj, ch.oldkeys);
+ idx.getKeysFromObject(newObj, ch.newkeys);
+ if( ch.newkeys.size() > 1 )
+ d.setIndexIsMultikey(i);
+ setDifference(ch.oldkeys, ch.newkeys, ch.removed);
+ setDifference(ch.newkeys, ch.oldkeys, ch.added);
+ if ( ch.removed.size() > 0 && ch.added.size() > 0 && idx.isIdIndex() ) {
+ changedId = true;
+ }
+ }
+ }
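+    // illustrative example: with an index on { x : 1 }, updating { x : 1 } to
+    // { x : 2 } yields ch.removed = [ key(1) ] and ch.added = [ key(2) ]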
+
+ void dupCheck(vector<IndexChanges>& v, NamespaceDetails& d, DiskLoc curObjLoc) {
+ int z = d.nIndexesBeingBuilt();
+ for( int i = 0; i < z; i++ ) {
+ IndexDetails& idx = d.idx(i);
+ v[i].dupCheck(idx, curObjLoc);
+ }
+ }
+
+    // a valid key pattern is { <fieldname> : <simple type, e.g. 1 or -1>, ... } -- Object and Array values are rejected
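+    //   e.g., accepted: { x : 1, y : -1 }    rejected: { x : { y : 1 } } or { x : [ 1 ] }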
+ static bool validKeyPattern(BSONObj kp) {
+ BSONObjIterator i(kp);
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if( e.type() == Object || e.type() == Array )
+ return false;
+ }
+ return true;
+ }
+
+ /* Prepare to build an index. Does not actually build it (except for a special _id case).
+    - validates that the params are good
+    - checks that the index does not already exist
+    - creates the source collection if it does not exist
+
+ example of 'io':
+ { ns : 'test.foo', name : 'z', key : { z : 1 } }
+
+ throws DBException
+
+ @param sourceNS - source NS we are indexing
+ @param sourceCollection - its details ptr
+ @return true if ok to continue. when false we stop/fail silently (index already exists)
+ */
+ bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject ) {
+ sourceCollection = 0;
+
+ // logical name of the index. todo: get rid of the name, we don't need it!
+ const char *name = io.getStringField("name");
+ uassert(12523, "no index name specified", *name);
+
+ // the collection for which we are building an index
+ sourceNS = io.getStringField("ns");
+ uassert(10096, "invalid ns to index", sourceNS.find( '.' ) != string::npos);
+ uassert(10097, "bad table to index name on add index attempt",
+ cc().database()->name == nsToDatabase(sourceNS.c_str()));
+
+ BSONObj key = io.getObjectField("key");
+ uassert(12524, "index key pattern too large", key.objsize() <= 2048);
+ if( !validKeyPattern(key) ) {
+ string s = string("bad index key pattern ") + key.toString();
+ uasserted(10098 , s.c_str());
+ }
+
+ if ( sourceNS.empty() || key.isEmpty() ) {
+ log(2) << "bad add index attempt name:" << (name?name:"") << "\n ns:" <<
+ sourceNS << "\n idxobj:" << io.toString() << endl;
+ string s = "bad add index attempt " + sourceNS + " key:" + key.toString();
+ uasserted(12504, s);
+ }
+
+ sourceCollection = nsdetails(sourceNS.c_str());
+ if( sourceCollection == 0 ) {
+ // try to create it
+ string err;
+ if ( !userCreateNS(sourceNS.c_str(), BSONObj(), err, false) ) {
+ problem() << "ERROR: failed to create collection while adding its index. " << sourceNS << endl;
+ return false;
+ }
+ sourceCollection = nsdetails(sourceNS.c_str());
+ tlog() << "info: creating collection " << sourceNS << " on add index" << endl;
+ assert( sourceCollection );
+ }
+
+ if ( sourceCollection->findIndexByName(name) >= 0 ) {
+ // index already exists.
+ return false;
+ }
+ if( sourceCollection->findIndexByKeyPattern(key) >= 0 ) {
+ log(2) << "index already exists with diff name " << name << ' ' << key.toString() << endl;
+ return false;
+ }
+
+ if ( sourceCollection->nIndexes >= NamespaceDetails::NIndexesMax ) {
+ stringstream ss;
+ ss << "add index fails, too many indexes for " << sourceNS << " key:" << key.toString();
+ string s = ss.str();
+ log() << s << '\n';
+ uasserted(12505,s);
+ }
+
+ /* we can't build a new index for the ns if a build is already in progress in the background -
+ EVEN IF this is a foreground build.
+ */
+ uassert(12588, "cannot add index with a background operation in progress",
+ !BackgroundOperation::inProgForNs(sourceNS.c_str()));
+
+ /* this is because we want key patterns like { _id : 1 } and { _id : <someobjid> } to
+ all be treated as the same pattern.
+ */
+ if ( IndexDetails::isIdIndexPattern(key) ) {
+ if( !god ) {
+ ensureHaveIdIndex( sourceNS.c_str() );
+ return false;
+ }
+ }
+ else {
+ /* is buildIndexes:false set for this replica set member?
+ if so we don't build any indexes except _id
+ */
+ if( theReplSet && !theReplSet->buildIndexes() )
+ return false;
+ }
+
+ string pluginName = IndexPlugin::findPluginName( key );
+ IndexPlugin * plugin = pluginName.size() ? IndexPlugin::get( pluginName ) : 0;
+
+
+ {
+ BSONObj o = io;
+ if ( plugin ) {
+ o = plugin->adjustIndexSpec(o);
+ }
+ BSONObjBuilder b;
+ int v = DefaultIndexVersionNumber;
+ if( !o["v"].eoo() ) {
+ double vv = o["v"].Number();
+                // note: one day we may be able to fresh-build fewer index versions than we can use;
+                // isASupportedIndexVersionNumber() governs which versions we can use
+ uassert(14803, str::stream() << "this version of mongod cannot build new indexes of version number " << vv,
+ vv == 0 || vv == 1);
+ v = (int) vv;
+ }
+            // the idea is to put frequently used fields earlier in the object
+ b.append("v", v);
+ b.append(o["key"]);
+ if( o["unique"].trueValue() )
+ b.appendBool("unique", true); // normalize to bool true in case was int 1 or something...
+ b.append(o["ns"]);
+
+ {
+                // copy the remaining fields, stripping _id and the fields already appended above
+ BSONObjIterator i(o);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ string s = e.fieldName();
+ if( s != "_id" && s != "v" && s != "ns" && s != "unique" && s != "key" )
+ b.append(e);
+ }
+ }
+
+ fixedIndexObject = b.obj();
+ }
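+
+        /* net effect (a sketch): an io of
+               { name : 'z', ns : 'test.foo', key : { z : 1 }, unique : 1 }
+           is normalized to
+               { v : 1, key : { z : 1 }, unique : true, ns : 'test.foo', name : 'z' }
+        */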
+
+ return true;
+ }
+
+ void IndexSpec::reset( const IndexDetails * details ) {
+ _details = details;
+ reset( details->info );
+ }
+
+ void IndexSpec::reset( const BSONObj& _info ) {
+ info = _info;
+ keyPattern = info["key"].embeddedObjectUserCheck();
+ if ( keyPattern.objsize() == 0 ) {
+ out() << info.toString() << endl;
+ assert(false);
+ }
+ _init();
+ }
+
+}
diff --git a/src/mongo/db/index.h b/src/mongo/db/index.h
new file mode 100644
index 00000000000..d297f8a4ca1
--- /dev/null
+++ b/src/mongo/db/index.h
@@ -0,0 +1,237 @@
+// index.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "diskloc.h"
+#include "jsobj.h"
+#include "indexkey.h"
+#include "key.h"
+
+namespace mongo {
+
+ class IndexInterface {
+ protected:
+ virtual ~IndexInterface() { }
+ public:
+ static void phasedBegin();
+ virtual void phasedQueueItemToInsert(
+ int idxNo,
+ DiskLoc thisLoc, DiskLoc _recordLoc, const BSONObj &_key,
+ const Ordering& _order, IndexDetails& _idx, bool dupsAllowed) = 0;
+ static void phasedFinish();
+
+ virtual int keyCompare(const BSONObj& l,const BSONObj& r, const Ordering &ordering) = 0;
+ virtual long long fullValidate(const DiskLoc& thisLoc, const BSONObj &order) = 0;
+ virtual DiskLoc findSingle(const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key) const = 0;
+ virtual bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const = 0;
+ virtual int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const BSONObj& key, const Ordering &order, bool dupsAllowed,
+ IndexDetails& idx, bool toplevel = true) const = 0;
+ virtual DiskLoc addBucket(const IndexDetails&) = 0;
+ virtual void uassertIfDups(IndexDetails& idx, vector<BSONObj*>& addedKeys, DiskLoc head,
+ DiskLoc self, const Ordering& ordering) = 0;
+
+ // these are for geo
+ virtual bool isUsed(DiskLoc thisLoc, int pos) = 0;
+ virtual void keyAt(DiskLoc thisLoc, int pos, BSONObj&, DiskLoc& recordLoc) = 0;
+ virtual BSONObj keyAt(DiskLoc thisLoc, int pos) = 0;
+ virtual DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
+ int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) = 0;
+ virtual DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) = 0;
+ };
+
+ /* Details about a particular index. There is one of these effectively for each object in
+ system.namespaces (although this also includes the head pointer, which is not in that
+ collection).
+
+ ** MemoryMapped Record ** (i.e., this is on disk data)
+ */
+ class IndexDetails {
+ public:
+ /**
+ * btree head disk location
+ * TODO We should make this variable private, since btree operations
+ * may change its value and we don't want clients to rely on an old
+ * value. If we create a btree class, we can provide a btree object
+ * to clients instead of 'head'.
+ */
+ DiskLoc head;
+
+ /* Location of index info object. Format:
+
+ { name:"nameofindex", ns:"parentnsname", key: {keypattobject}
+ [, unique: <bool>, background: <bool>, v:<version>]
+ }
+
+ This object is in the system.indexes collection. Note that since we
+ have a pointer to the object here, the object in system.indexes MUST NEVER MOVE.
+ */
+ DiskLoc info;
+
+ /* extract key value from the query object
+ e.g., if key() == { x : 1 },
+ { x : 70, y : 3 } -> { x : 70 }
+ */
+ BSONObj getKeyFromQuery(const BSONObj& query) const {
+ BSONObj k = keyPattern();
+ BSONObj res = query.extractFieldsUnDotted(k);
+ return res;
+ }
+
+ /* pull out the relevant key objects from obj, so we
+ can index them. Note that the set is multiple elements
+ only when it's a "multikey" array.
+ keys will be left empty if key not found in the object.
+ */
+ void getKeysFromObject( const BSONObj& obj, BSONObjSet& keys) const;
+
+ /* get the key pattern for this object.
+ e.g., { lastname:1, firstname:1 }
+ */
+ BSONObj keyPattern() const {
+ return info.obj().getObjectField("key");
+ }
+
+ /**
+ * @return offset into keyPattern for key
+ -1 if doesn't exist
+ */
+ int keyPatternOffset( const string& key ) const;
+ bool inKeyPattern( const string& key ) const { return keyPatternOffset( key ) >= 0; }
+
+ /* true if the specified key is in the index */
+ bool hasKey(const BSONObj& key);
+
+ // returns name of this index's storage area
+ // database.table.$index
+ string indexNamespace() const {
+ BSONObj io = info.obj();
+ string s;
+ s.reserve(Namespace::MaxNsLen);
+ s = io.getStringField("ns");
+ assert( !s.empty() );
+ s += ".$";
+ s += io.getStringField("name");
+ return s;
+ }
+
+ string indexName() const { // e.g. "ts_1"
+ BSONObj io = info.obj();
+ return io.getStringField("name");
+ }
+
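+        // e.g., { _id : 1 } is an _id index pattern;
+        // { _id : 1, x : 1 } and { x : 1 } are not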
+ static bool isIdIndexPattern( const BSONObj &pattern ) {
+ BSONObjIterator i(pattern);
+ BSONElement e = i.next();
+ if( strcmp(e.fieldName(), "_id") != 0 ) return false;
+ return i.next().eoo();
+ }
+
+ /* returns true if this is the _id index. */
+ bool isIdIndex() const {
+ return isIdIndexPattern( keyPattern() );
+ }
+
+        /* returns not this index's own namespace (use indexNamespace() for that),
+           but the name of the collection we index.
+        */
+ string parentNS() const {
+ BSONObj io = info.obj();
+ return io.getStringField("ns");
+ }
+
+ static int versionForIndexObj( const BSONObj &obj ) {
+ BSONElement e = obj["v"];
+ if( e.type() == NumberInt )
+ return e._numberInt();
+ // should normally be an int. this is for backward compatibility
+ int v = e.numberInt();
+ uassert(14802, "index v field should be Integer type", v == 0);
+ return v;
+ }
+
+ int version() const {
+ return versionForIndexObj( info.obj() );
+ }
+
+ /** @return true if index has unique constraint */
+ bool unique() const {
+ BSONObj io = info.obj();
+ return io["unique"].trueValue() ||
+                   /* temp: can we just make unique:true always be there for _id and get rid of this? */
+ isIdIndex();
+ }
+
+ /** return true if dropDups was set when building index (if any duplicates, dropdups drops the duplicating objects) */
+ bool dropDups() const {
+ return info.obj().getBoolField( "dropDups" );
+ }
+
+ /** delete this index. does NOT clean up the system catalog
+ (system.indexes or system.namespaces) -- only NamespaceIndex.
+ */
+ void kill_idx();
+
+ const IndexSpec& getSpec() const;
+
+ string toString() const {
+ return info.obj().toString();
+ }
+
+ /** @return true if supported. supported means we can use the index, including adding new keys.
+ it may not mean we can build the index version in question: we may not maintain building
+ of indexes in old formats in the future.
+ */
+ static bool isASupportedIndexVersionNumber(int v) { return (v&1)==v; } // v == 0 || v == 1
+
+        /** @return the implementation interface for this index, which varies with the index version.
+            used for backward compatibility of index versions/formats.
+        */
+ IndexInterface& idxInterface() const {
+ int v = version();
+ dassert( isASupportedIndexVersionNumber(v) );
+ return *iis[v&1];
+ }
+
+ static IndexInterface *iis[];
+ };
+
+ struct IndexChanges { /*on an update*/
+ BSONObjSet oldkeys;
+ BSONObjSet newkeys;
+ vector<BSONObj*> removed; // these keys were removed as part of the change
+ vector<BSONObj*> added; // these keys were added as part of the change
+
+        /** @param curObjLoc - location of the object we want to add.  it may already be
+            in the index; that is allowed here (for the background indexing case).
+        */
+ void dupCheck(IndexDetails& idx, DiskLoc curObjLoc) {
+ if( added.empty() || !idx.unique() )
+ return;
+ const Ordering ordering = Ordering::make(idx.keyPattern());
+ idx.idxInterface().uassertIfDups(idx, added, idx.head, curObjLoc, ordering); // "E11001 duplicate key on update"
+ }
+ };
+
+ class NamespaceDetails;
+ // changedId should be initialized to false
+    void getIndexChanges(vector<IndexChanges>& v, NamespaceDetails& d, BSONObj newObj, BSONObj oldObj, bool &changedId);
+ void dupCheck(vector<IndexChanges>& v, NamespaceDetails& d, DiskLoc curObjLoc);
+} // namespace mongo
diff --git a/src/mongo/db/indexkey.cpp b/src/mongo/db/indexkey.cpp
new file mode 100644
index 00000000000..18dfcb079b9
--- /dev/null
+++ b/src/mongo/db/indexkey.cpp
@@ -0,0 +1,462 @@
+// index_key.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "namespace-inl.h"
+#include "index.h"
+#include "btree.h"
+#include "ops/query.h"
+#include "background.h"
+#include "../util/text.h"
+
+namespace mongo {
+
+    /** index version numbers: 0 is the old (<= v1.8) format; 1 is the current format
+     */
+ const int DefaultIndexVersionNumber = 1;
+
+ map<string,IndexPlugin*> * IndexPlugin::_plugins;
+
+ IndexType::IndexType( const IndexPlugin * plugin , const IndexSpec * spec )
+ : _plugin( plugin ) , _spec( spec ) {
+
+ }
+
+ IndexType::~IndexType() {
+ }
+
+ const BSONObj& IndexType::keyPattern() const {
+ return _spec->keyPattern;
+ }
+
+ IndexPlugin::IndexPlugin( const string& name )
+ : _name( name ) {
+ if ( ! _plugins )
+ _plugins = new map<string,IndexPlugin*>();
+ (*_plugins)[name] = this;
+ }
+
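+    // e.g., a key pattern of { loc : "2d" } yields the plugin name "2d";
+    // a plain pattern like { x : 1 } yields "" (no plugin)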
+ string IndexPlugin::findPluginName( const BSONObj& keyPattern ) {
+ string pluginName = "";
+
+ BSONObjIterator i( keyPattern );
+
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.type() != String )
+ continue;
+
+ uassert( 13007 , "can only have 1 index plugin / bad index key pattern" , pluginName.size() == 0 || pluginName == e.String() );
+ pluginName = e.String();
+ }
+
+ return pluginName;
+ }
+
+ int IndexType::compare( const BSONObj& l , const BSONObj& r ) const {
+ return l.woCompare( r , _spec->keyPattern );
+ }
+
+ void IndexSpec::_init() {
+ assert( keyPattern.objsize() );
+
+ // some basics
+ _nFields = keyPattern.nFields();
+ _sparse = info["sparse"].trueValue();
+ uassert( 13529 , "sparse only works for single field keys" , ! _sparse || _nFields );
+
+
+ {
+ // build _nullKey
+
+ BSONObjBuilder b;
+ BSONObjIterator i( keyPattern );
+
+ while( i.more() ) {
+ BSONElement e = i.next();
+ _fieldNames.push_back( e.fieldName() );
+ _fixed.push_back( BSONElement() );
+ b.appendNull( "" );
+ }
+ _nullKey = b.obj();
+ }
+
+ {
+ // _nullElt
+ BSONObjBuilder b;
+ b.appendNull( "" );
+ _nullObj = b.obj();
+ _nullElt = _nullObj.firstElement();
+ }
+
+ {
+ // _undefinedElt
+ BSONObjBuilder b;
+ b.appendUndefined( "" );
+ _undefinedObj = b.obj();
+ _undefinedElt = _undefinedObj.firstElement();
+ }
+
+ {
+ // handle plugins
+ string pluginName = IndexPlugin::findPluginName( keyPattern );
+ if ( pluginName.size() ) {
+ IndexPlugin * plugin = IndexPlugin::get( pluginName );
+ if ( ! plugin ) {
+ log() << "warning: can't find plugin [" << pluginName << "]" << endl;
+ }
+ else {
+ _indexType.reset( plugin->generate( this ) );
+ }
+ }
+ }
+
+ _finishedInit = true;
+ }
+
+ void assertParallelArrays( const char *first, const char *second ) {
+ stringstream ss;
+ ss << "cannot index parallel arrays [" << first << "] [" << second << "]";
+ uasserted( ParallelArraysCode , ss.str() );
+ }
+
+ class KeyGeneratorV0 {
+ public:
+ KeyGeneratorV0( const IndexSpec &spec ) : _spec( spec ) {}
+
+ void getKeys( const BSONObj &obj, BSONObjSet &keys ) const {
+ if ( _spec._indexType.get() ) { //plugin (eg geo)
+ _spec._indexType->getKeys( obj , keys );
+ return;
+ }
+ vector<const char*> fieldNames( _spec._fieldNames );
+ vector<BSONElement> fixed( _spec._fixed );
+ _getKeys( fieldNames , fixed , obj, keys );
+ if ( keys.empty() && ! _spec._sparse )
+ keys.insert( _spec._nullKey );
+ }
+
+ private:
+ void _getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSet &keys ) const {
+ BSONElement arrElt;
+ unsigned arrIdx = ~0;
+ int numNotFound = 0;
+
+ for( unsigned i = 0; i < fieldNames.size(); ++i ) {
+ if ( *fieldNames[ i ] == '\0' )
+ continue;
+
+ BSONElement e = obj.getFieldDottedOrArray( fieldNames[ i ] );
+
+ if ( e.eoo() ) {
+ e = _spec._nullElt; // no matching field
+ numNotFound++;
+ }
+
+ if ( e.type() != Array )
+ fieldNames[ i ] = ""; // no matching field or non-array match
+
+ if ( *fieldNames[ i ] == '\0' )
+ fixed[ i ] = e; // no need for further object expansion (though array expansion still possible)
+
+ if ( e.type() == Array && arrElt.eoo() ) { // we only expand arrays on a single path -- track the path here
+ arrIdx = i;
+ arrElt = e;
+ }
+
+ // enforce single array path here
+ if ( e.type() == Array && e.rawdata() != arrElt.rawdata() ) {
+ assertParallelArrays( e.fieldName(), arrElt.fieldName() );
+ }
+ }
+
+ bool allFound = true; // have we found elements for all field names in the key spec?
+ for( vector<const char*>::const_iterator i = fieldNames.begin(); i != fieldNames.end(); ++i ) {
+ if ( **i != '\0' ) {
+ allFound = false;
+ break;
+ }
+ }
+
+ if ( _spec._sparse && numNotFound == _spec._nFields ) {
+ // we didn't find any fields
+ // so we're not going to index this document
+ return;
+ }
+
+ bool insertArrayNull = false;
+
+ if ( allFound ) {
+ if ( arrElt.eoo() ) {
+ // no terminal array element to expand
+ BSONObjBuilder b(_spec._sizeTracker);
+ for( vector< BSONElement >::iterator i = fixed.begin(); i != fixed.end(); ++i )
+ b.appendAs( *i, "" );
+ keys.insert( b.obj() );
+ }
+ else {
+ // terminal array element to expand, so generate all keys
+ BSONObjIterator i( arrElt.embeddedObject() );
+ if ( i.more() ) {
+ while( i.more() ) {
+ BSONObjBuilder b(_spec._sizeTracker);
+ for( unsigned j = 0; j < fixed.size(); ++j ) {
+ if ( j == arrIdx )
+ b.appendAs( i.next(), "" );
+ else
+ b.appendAs( fixed[ j ], "" );
+ }
+ keys.insert( b.obj() );
+ }
+ }
+ else if ( fixed.size() > 1 ) {
+ insertArrayNull = true;
+ }
+ }
+ }
+ else {
+ // nonterminal array element to expand, so recurse
+ assert( !arrElt.eoo() );
+ BSONObjIterator i( arrElt.embeddedObject() );
+ if ( i.more() ) {
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.type() == Object ) {
+ _getKeys( fieldNames, fixed, e.embeddedObject(), keys );
+ }
+ }
+ }
+ else {
+ insertArrayNull = true;
+ }
+ }
+
+ if ( insertArrayNull ) {
+ // x : [] - need to insert undefined
+ BSONObjBuilder b(_spec._sizeTracker);
+ for( unsigned j = 0; j < fixed.size(); ++j ) {
+ if ( j == arrIdx ) {
+ b.appendUndefined( "" );
+ }
+ else {
+ BSONElement e = fixed[j];
+ if ( e.eoo() )
+ b.appendNull( "" );
+ else
+ b.appendAs( e , "" );
+ }
+ }
+ keys.insert( b.obj() );
+ }
+ }
+
+ const IndexSpec &_spec;
+ };
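+
+    /* v0 key generation sketch (per the logic above): for a spec on { a : 1, b : 1 }
+       and doc { a : [ 1, 2 ], b : 3 }, the generated keys are { "" : 1, "" : 3 } and
+       { "" : 2, "" : 3 }.  a doc missing both fields is indexed with nulls unless
+       the index is sparse.
+    */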
+
+ class KeyGeneratorV1 {
+ public:
+ KeyGeneratorV1( const IndexSpec &spec ) : _spec( spec ) {}
+
+ void getKeys( const BSONObj &obj, BSONObjSet &keys ) const {
+ if ( _spec._indexType.get() ) { //plugin (eg geo)
+ _spec._indexType->getKeys( obj , keys );
+ return;
+ }
+ vector<const char*> fieldNames( _spec._fieldNames );
+ vector<BSONElement> fixed( _spec._fixed );
+ _getKeys( fieldNames , fixed , obj, keys );
+ if ( keys.empty() && ! _spec._sparse )
+ keys.insert( _spec._nullKey );
+ }
+
+ private:
+ /**
+ * @param arrayNestedArray - set if the returned element is an array nested directly within arr.
+ */
+ BSONElement extractNextElement( const BSONObj &obj, const BSONObj &arr, const char *&field, bool &arrayNestedArray ) const {
+ string firstField = mongoutils::str::before( field, '.' );
+ bool haveObjField = !obj.getField( firstField ).eoo();
+ BSONElement arrField = arr.getField( firstField );
+ bool haveArrField = !arrField.eoo();
+
+ // An index component field name cannot exist in both a document array and one of that array's children.
+ uassert( 15855 , str::stream() << "Ambiguous field name found in array (do not use numeric field names in embedded elements in an array), field: '" << arrField.fieldName() << "' for array: " << arr, !haveObjField || !haveArrField );
+
+ arrayNestedArray = false;
+ if ( haveObjField ) {
+ return obj.getFieldDottedOrArray( field );
+ }
+ else if ( haveArrField ) {
+ if ( arrField.type() == Array ) {
+ arrayNestedArray = true;
+ }
+ return arr.getFieldDottedOrArray( field );
+ }
+ return BSONElement();
+ }
+
+ void _getKeysArrEltFixed( vector<const char*> &fieldNames , vector<BSONElement> &fixed , const BSONElement &arrEntry, BSONObjSet &keys, int numNotFound, const BSONElement &arrObjElt, const set< unsigned > &arrIdxs, bool mayExpandArrayUnembedded ) const {
+ // set up any terminal array values
+ for( set<unsigned>::const_iterator j = arrIdxs.begin(); j != arrIdxs.end(); ++j ) {
+ if ( *fieldNames[ *j ] == '\0' ) {
+ fixed[ *j ] = mayExpandArrayUnembedded ? arrEntry : arrObjElt;
+ }
+ }
+ // recurse
+ _getKeys( fieldNames, fixed, ( arrEntry.type() == Object ) ? arrEntry.embeddedObject() : BSONObj(), keys, numNotFound, arrObjElt.embeddedObject() );
+ }
+
+ /**
+ * @param fieldNames - fields to index, may be postfixes in recursive calls
+ * @param fixed - values that have already been identified for their index fields
+ * @param obj - object from which keys should be extracted, based on names in fieldNames
+ * @param keys - set where index keys are written
+ * @param numNotFound - number of index fields that have already been identified as missing
+ * @param array - array from which keys should be extracted, based on names in fieldNames
+ * If obj and array are both nonempty, obj will be one of the elements of array.
+ */
+ void _getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSet &keys, int numNotFound = 0, const BSONObj &array = BSONObj() ) const {
+ BSONElement arrElt;
+ set<unsigned> arrIdxs;
+ bool mayExpandArrayUnembedded = true;
+ for( unsigned i = 0; i < fieldNames.size(); ++i ) {
+ if ( *fieldNames[ i ] == '\0' ) {
+ continue;
+ }
+
+ bool arrayNestedArray;
+ // Extract element matching fieldName[ i ] from object xor array.
+ BSONElement e = extractNextElement( obj, array, fieldNames[ i ], arrayNestedArray );
+
+ if ( e.eoo() ) {
+ // if field not present, set to null
+ fixed[ i ] = _spec._nullElt;
+ // done expanding this field name
+ fieldNames[ i ] = "";
+ numNotFound++;
+ }
+ else if ( e.type() == Array ) {
+ arrIdxs.insert( i );
+ if ( arrElt.eoo() ) {
+ // we only expand arrays on a single path -- track the path here
+ arrElt = e;
+ }
+ else if ( e.rawdata() != arrElt.rawdata() ) {
+ // enforce single array path here
+ assertParallelArrays( e.fieldName(), arrElt.fieldName() );
+ }
+ if ( arrayNestedArray ) {
+ mayExpandArrayUnembedded = false;
+ }
+ }
+ else {
+ // not an array - no need for further expansion
+ fixed[ i ] = e;
+ }
+ }
+
+ if ( arrElt.eoo() ) {
+ // No array, so generate a single key.
+ if ( _spec._sparse && numNotFound == _spec._nFields ) {
+ return;
+ }
+ BSONObjBuilder b(_spec._sizeTracker);
+ for( vector< BSONElement >::iterator i = fixed.begin(); i != fixed.end(); ++i ) {
+ b.appendAs( *i, "" );
+ }
+ keys.insert( b.obj() );
+ }
+ else if ( arrElt.embeddedObject().firstElement().eoo() ) {
+ // Empty array, so set matching fields to undefined.
+ _getKeysArrEltFixed( fieldNames, fixed, _spec._undefinedElt, keys, numNotFound, arrElt, arrIdxs, true );
+ }
+ else {
+ // Non empty array that can be expanded, so generate a key for each member.
+ BSONObj arrObj = arrElt.embeddedObject();
+ BSONObjIterator i( arrObj );
+ while( i.more() ) {
+ _getKeysArrEltFixed( fieldNames, fixed, i.next(), keys, numNotFound, arrElt, arrIdxs, mayExpandArrayUnembedded );
+ }
+ }
+ }
+
+ const IndexSpec &_spec;
+ };
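+
+    /* two v1 edge cases handled above (illustrative): indexing { a : 1, b : 1 } on
+       { a : [ 1 ], b : [ 2 ] } uasserts with ParallelArraysCode, since only one array
+       path may be expanded; and { a : [] } is indexed with an undefined value for the
+       array field rather than producing no key at all.
+    */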
+
+ void IndexSpec::getKeys( const BSONObj &obj, BSONObjSet &keys ) const {
+ switch( indexVersion() ) {
+ case 0: {
+ KeyGeneratorV0 g( *this );
+ g.getKeys( obj, keys );
+ break;
+ }
+ case 1: {
+ KeyGeneratorV1 g( *this );
+ g.getKeys( obj, keys );
+ break;
+ }
+ default:
+ massert( 15869, "Invalid index version for key generation.", false );
+ }
+ }
+
+ bool anyElementNamesMatch( const BSONObj& a , const BSONObj& b ) {
+ BSONObjIterator x(a);
+ while ( x.more() ) {
+ BSONElement e = x.next();
+ BSONObjIterator y(b);
+ while ( y.more() ) {
+ BSONElement f = y.next();
+ FieldCompareResult res = compareDottedFieldNames( e.fieldName() , f.fieldName() );
+ if ( res == SAME || res == LEFT_SUBFIELD || res == RIGHT_SUBFIELD )
+ return true;
+ }
+ }
+ return false;
+ }
+
+ IndexSuitability IndexSpec::suitability( const BSONObj& query , const BSONObj& order ) const {
+ if ( _indexType.get() )
+ return _indexType->suitability( query , order );
+ return _suitability( query , order );
+ }
+
+ IndexSuitability IndexSpec::_suitability( const BSONObj& query , const BSONObj& order ) const {
+ // TODO: optimize
+        if ( !anyElementNamesMatch( keyPattern , query ) && !anyElementNamesMatch( keyPattern , order ) )
+ return USELESS;
+ return HELPFUL;
+ }
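+    // e.g. (a sketch): keyPattern { a : 1 } with query { b : 2 } and an empty order
+    // spec shares no field names, so the index is USELESS; query { a : 2 } is HELPFUL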
+
+ IndexSuitability IndexType::suitability( const BSONObj& query , const BSONObj& order ) const {
+ return _spec->_suitability( query , order );
+ }
+
+ int IndexSpec::indexVersion() const {
+ if ( !info.hasField( "v" ) ) {
+ return DefaultIndexVersionNumber;
+ }
+ return IndexDetails::versionForIndexObj( info );
+ }
+
+ bool IndexType::scanAndOrderRequired( const BSONObj& query , const BSONObj& order ) const {
+ return ! order.isEmpty();
+ }
+
+}
diff --git a/src/mongo/db/indexkey.h b/src/mongo/db/indexkey.h
new file mode 100644
index 00000000000..12cd755e8a0
--- /dev/null
+++ b/src/mongo/db/indexkey.h
@@ -0,0 +1,198 @@
+// index_key.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "diskloc.h"
+#include "jsobj.h"
+#include <map>
+
+namespace mongo {
+
+ extern const int DefaultIndexVersionNumber;
+
+ const int ParallelArraysCode = 10088;
+
+ class Cursor;
+ class IndexSpec;
+ class IndexType; // TODO: this name sucks
+ class IndexPlugin;
+ class IndexDetails;
+
+ enum IndexSuitability { USELESS = 0 , HELPFUL = 1 , OPTIMAL = 2 };
+
+ /**
+     * this represents an instance of an index plugin.
+     * done this way so parsing, etc., can be cached.
+     * e.g., if there is an FTS IndexPlugin, there will be 1 of these for each
+     * index using FTS, and it can hold pre-parsed state, etc.
+ */
+ class IndexType : boost::noncopyable {
+ public:
+ IndexType( const IndexPlugin * plugin , const IndexSpec * spec );
+ virtual ~IndexType();
+
+ virtual void getKeys( const BSONObj &obj, BSONObjSet &keys ) const = 0;
+ virtual shared_ptr<Cursor> newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const = 0;
+
+ /** optional op : changes query to match what's in the index */
+ virtual BSONObj fixKey( const BSONObj& in ) { return in; }
+
+ /** optional op : compare 2 objects with regards to this index */
+ virtual int compare( const BSONObj& l , const BSONObj& r ) const;
+
+ /** @return plugin */
+ const IndexPlugin * getPlugin() const { return _plugin; }
+
+ const BSONObj& keyPattern() const;
+
+ virtual IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const ;
+
+ virtual bool scanAndOrderRequired( const BSONObj& query , const BSONObj& order ) const ;
+
+ protected:
+ const IndexPlugin * _plugin;
+ const IndexSpec * _spec;
+ };
+
+ /**
+     * this represents a plugin type.
+     * a plugin could be something like full text search, a sparse index, etc.
+     * 1 of these exists per type of index per server;
+     * 1 IndexType is created per index using this plugin
+ */
+ class IndexPlugin : boost::noncopyable {
+ public:
+ IndexPlugin( const string& name );
+ virtual ~IndexPlugin() {}
+
+ virtual IndexType* generate( const IndexSpec * spec ) const = 0;
+
+ string getName() const { return _name; }
+
+ /**
+ * @return new keyPattern
+ * if nothing changes, should return keyPattern
+ */
+ virtual BSONObj adjustIndexSpec( const BSONObj& spec ) const { return spec; }
+
+ // ------- static below -------
+
+ static IndexPlugin* get( const string& name ) {
+ if ( ! _plugins )
+ return 0;
+ map<string,IndexPlugin*>::iterator i = _plugins->find( name );
+ if ( i == _plugins->end() )
+ return 0;
+ return i->second;
+ }
+
+ /**
+ * @param keyPattern { x : "fts" }
+ * @return "" or the name
+ */
+ static string findPluginName( const BSONObj& keyPattern );
+
+ private:
+ string _name;
+ static map<string,IndexPlugin*> * _plugins;
+ };
+
+ /* precomputed details about an index, used for inserting keys on updates
+ stored/cached in NamespaceDetailsTransient, or can be used standalone
+ */
+ class IndexSpec {
+ public:
+ BSONObj keyPattern; // e.g., { name : 1 }
+ BSONObj info; // this is the same as IndexDetails::info.obj()
+
+ IndexSpec()
+ : _details(0) , _finishedInit(false) {
+ }
+
+ explicit IndexSpec( const BSONObj& k , const BSONObj& m = BSONObj() )
+ : keyPattern(k) , info(m) , _details(0) , _finishedInit(false) {
+ _init();
+ }
+
+ /**
+           loc is the DiskLoc of an IndexDetails info object;
+           it should have a key field
+ */
+ explicit IndexSpec( const DiskLoc& loc ) {
+ reset( loc );
+ }
+
+ void reset( const BSONObj& info );
+ void reset( const DiskLoc& infoLoc ) { reset(infoLoc.obj()); }
+ void reset( const IndexDetails * details );
+
+ void getKeys( const BSONObj &obj, BSONObjSet &keys ) const;
+
+ BSONElement missingField() const { return _nullElt; }
+
+ string getTypeName() const {
+ if ( _indexType.get() )
+ return _indexType->getPlugin()->getName();
+ return "";
+ }
+
+ IndexType* getType() const {
+ return _indexType.get();
+ }
+
+ const IndexDetails * getDetails() const {
+ return _details;
+ }
+
+ IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const ;
+
+ protected:
+
+ int indexVersion() const;
+
+ IndexSuitability _suitability( const BSONObj& query , const BSONObj& order ) const ;
+
+ BSONSizeTracker _sizeTracker;
+ vector<const char*> _fieldNames;
+ vector<BSONElement> _fixed;
+
+ BSONObj _nullKey; // a full key with all fields null
+ BSONObj _nullObj; // only used for _nullElt
+ BSONElement _nullElt; // jstNull
+
+ BSONObj _undefinedObj; // only used for _undefinedElt
+ BSONElement _undefinedElt; // undefined
+
+ int _nFields; // number of fields in the index
+ bool _sparse; // if the index is sparse
+ shared_ptr<IndexType> _indexType;
+ const IndexDetails * _details;
+
+ void _init();
+
+ friend class IndexType;
+ friend class KeyGeneratorV0;
+ friend class KeyGeneratorV1;
+ public:
+ bool _finishedInit;
+ };
+
+
+} // namespace mongo
diff --git a/src/mongo/db/instance.cpp b/src/mongo/db/instance.cpp
new file mode 100644
index 00000000000..c8f8c6ea85b
--- /dev/null
+++ b/src/mongo/db/instance.cpp
@@ -0,0 +1,1148 @@
+// instance.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "db.h"
+#include "../bson/util/atomic_int.h"
+#include "introspect.h"
+#include "repl.h"
+#include "dbmessage.h"
+#include "instance.h"
+#include "lasterror.h"
+#include "security.h"
+#include "json.h"
+#include "replutil.h"
+#include "../s/d_logic.h"
+#include "../util/file_allocator.h"
+#include "../util/goodies.h"
+#include "cmdline.h"
+#if !defined(_WIN32)
+#include <sys/file.h>
+#endif
+#include "stats/counters.h"
+#include "background.h"
+#include "dur_journal.h"
+#include "dur_recover.h"
+#include "d_concurrency.h"
+#include "ops/count.h"
+#include "ops/delete.h"
+#include "ops/query.h"
+#include "ops/update.h"
+#include "pagefault.h"
+
+namespace mongo {
+
+ // "diaglog"
+ inline void opread(Message& m) { if( _diaglog.getLevel() & 2 ) _diaglog.readop((char *) m.singleData(), m.header()->len); }
+ inline void opwrite(Message& m) { if( _diaglog.getLevel() & 1 ) _diaglog.write((char *) m.singleData(), m.header()->len); }
+
+ void receivedKillCursors(Message& m);
+ void receivedUpdate(Message& m, CurOp& op);
+ void receivedDelete(Message& m, CurOp& op);
+ void receivedInsert(Message& m, CurOp& op);
+ bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop );
+
+ int nloggedsome = 0;
+#define LOGWITHRATELIMIT if( ++nloggedsome < 1000 || nloggedsome % 100 == 0 )
+
+ string dbExecCommand;
+
+ DiagLog _diaglog;
+
+ bool useCursors = true;
+ bool useHints = true;
+
+ KillCurrentOp killCurrentOp;
+
+ int lockFile = 0;
+#ifdef _WIN32
+ HANDLE lockFileHandle;
+#endif
+
+ // see FSyncCommand:
+ extern bool lockedForWriting;
+
+ OpTime OpTime::now() {
+ DEV d.dbMutex.assertWriteLocked();
+ return now_inlock();
+ }
+ OpTime OpTime::last_inlock(){
+ DEV d.dbMutex.assertAtLeastReadLocked();
+ return last;
+ }
+
+ // OpTime::now() uses dbMutex, thus it is in this file not in the cpp files used by drivers and such
+ void BSONElementManipulator::initTimestamp() {
+ massert( 10332 , "Expected CurrentTime type", _element.type() == Timestamp );
+ unsigned long long &timestamp = *( reinterpret_cast< unsigned long long* >( value() ) );
+ if ( timestamp == 0 )
+ timestamp = OpTime::now().asDate();
+ }
+ void BSONElementManipulator::SetNumber(double d) {
+ if ( _element.type() == NumberDouble )
+ *getDur().writing( reinterpret_cast< double * >( value() ) ) = d;
+ else if ( _element.type() == NumberInt )
+ *getDur().writing( reinterpret_cast< int * >( value() ) ) = (int) d;
+ else assert(0);
+ }
+ void BSONElementManipulator::SetLong(long long n) {
+ assert( _element.type() == NumberLong );
+ *getDur().writing( reinterpret_cast< long long * >(value()) ) = n;
+ }
+ void BSONElementManipulator::SetInt(int n) {
+ assert( _element.type() == NumberInt );
+ getDur().writingInt( *reinterpret_cast< int * >( value() ) ) = n;
+ }
+ /* dur:: version */
+ void BSONElementManipulator::ReplaceTypeAndValue( const BSONElement &e ) {
+ char *d = data();
+ char *v = value();
+ int valsize = e.valuesize();
+ int ofs = (int) (v-d);
+ dassert( ofs > 0 );
+ char *p = (char *) getDur().writingPtr(d, valsize + ofs);
+ *p = e.type();
+ memcpy( p + ofs, e.value(), valsize );
+ }
+
+ void inProgCmd( Message &m, DbResponse &dbresponse ) {
+ BSONObjBuilder b;
+
+ if( ! cc().isAdmin() ) {
+ b.append("err", "unauthorized");
+ }
+ else {
+ DbMessage d(m);
+ QueryMessage q(d);
+ bool all = q.query["$all"].trueValue();
+ vector<BSONObj> vals;
+ {
+ Client& me = cc();
+ scoped_lock bl(Client::clientsMutex);
+ auto_ptr<Matcher> m(new Matcher(q.query));
+ for( set<Client*>::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) {
+ Client *c = *i;
+ assert( c );
+ CurOp* co = c->curop();
+ if ( c == &me && !co ) {
+ continue;
+ }
+ assert( co );
+ if( all || co->active() ) {
+ BSONObj info = co->infoNoauth();
+ if ( all || m->matches( info )) {
+ vals.push_back( info );
+ }
+ }
+ }
+ }
+ b.append("inprog", vals);
+ unsigned x = lockedForWriting;
+ if( x ) {
+ b.append("fsyncLock", x);
+ b.append("info", "use db.fsyncUnlock() to terminate the fsync write/snapshot lock");
+ }
+ }
+
+ replyToQuery(0, m, dbresponse, b.obj());
+ }
+
+ void killOp( Message &m, DbResponse &dbresponse ) {
+ BSONObj obj;
+ if( ! cc().isAdmin() ) {
+ obj = fromjson("{\"err\":\"unauthorized\"}");
+ }
+ /*else if( !dbMutexInfo.isLocked() )
+ obj = fromjson("{\"info\":\"no op in progress/not locked\"}");
+ */
+ else {
+ DbMessage d(m);
+ QueryMessage q(d);
+ BSONElement e = q.query.getField("op");
+ if( !e.isNumber() ) {
+ obj = fromjson("{\"err\":\"no op number field specified?\"}");
+ }
+ else {
+ log() << "going to kill op: " << e << endl;
+ obj = fromjson("{\"info\":\"attempting to kill op\"}");
+ killCurrentOp.kill( (unsigned) e.number() );
+ }
+ }
+ replyToQuery(0, m, dbresponse, obj);
+ }
+
+ void unlockFsyncAndWait();
+ void unlockFsync(const char *ns, Message& m, DbResponse &dbresponse) {
+ BSONObj obj;
+ if ( ! cc().isAdmin() ) { // checks auth
+ obj = fromjson("{\"err\":\"unauthorized\"}");
+ }
+ else if (strncmp(ns, "admin.", 6) != 0 ) {
+ obj = fromjson("{\"err\":\"unauthorized - this command must be run against the admin DB\"}");
+ }
+ else {
+ if( lockedForWriting ) {
+ log() << "command: unlock requested" << endl;
+ obj = fromjson("{ok:1,\"info\":\"unlock completed\"}");
+ unlockFsyncAndWait();
+ }
+ else {
+ obj = fromjson("{ok:0,\"errmsg\":\"not locked\"}");
+ }
+ }
+ replyToQuery(0, m, dbresponse, obj);
+ }
+
+ static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ) {
+ bool ok = true;
+ MSGID responseTo = m.header()->id;
+
+ DbMessage d(m);
+ QueryMessage q(d);
+ auto_ptr< Message > resp( new Message() );
+
+ CurOp& op = *(c.curop());
+
+ shared_ptr<AssertionException> ex;
+
+ try {
+ dbresponse.exhaust = runQuery(m, q, op, *resp);
+ assert( !resp->empty() );
+ }
+ catch ( SendStaleConfigException& e ){
+ ex.reset( new SendStaleConfigException( e.getns(), e.getInfo().msg ) );
+ ok = false;
+ }
+ catch ( AssertionException& e ) {
+ ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) );
+ ok = false;
+ }
+
+ if( ex ){
+
+ op.debug().exceptionInfo = ex->getInfo();
+ LOGWITHRATELIMIT {
+ log() << "assertion " << ex->toString() << " ns:" << q.ns << " query:" <<
+ (q.query.valid() ? q.query.toString() : "query object is corrupt") << endl;
+ if( q.ntoskip || q.ntoreturn )
+ log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << endl;
+ }
+
+ SendStaleConfigException* scex = NULL;
+ if ( ex->getCode() == SendStaleConfigCode ) scex = static_cast<SendStaleConfigException*>( ex.get() );
+
+ BSONObjBuilder err;
+ ex->getInfo().append( err );
+ if( scex ) err.append( "ns", scex->getns() );
+ BSONObj errObj = err.done();
+
+ log() << errObj << endl;
+
+ BufBuilder b;
+ b.skip(sizeof(QueryResult));
+ b.appendBuf((void*) errObj.objdata(), errObj.objsize());
+
+ // todo: call replyToQuery() from here instead of this!!! see dbmessage.h
+ QueryResult * msgdata = (QueryResult *) b.buf();
+ b.decouple();
+ QueryResult *qr = msgdata;
+ qr->_resultFlags() = ResultFlag_ErrSet;
+ if( scex ) qr->_resultFlags() |= ResultFlag_ShardConfigStale;
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->cursorId = 0;
+ qr->startingFrom = 0;
+ qr->nReturned = 1;
+ resp.reset( new Message() );
+ resp->setData( msgdata, true );
+
+ }
+
+ op.debug().responseLength = resp->header()->dataLen();
+
+ dbresponse.response = resp.release();
+ dbresponse.responseTo = responseTo;
+
+ return ok;
+ }
+
+ void (*reportEventToSystem)(const char *msg) = 0;
+
+ void mongoAbort(const char *msg) {
+ if( reportEventToSystem )
+ reportEventToSystem(msg);
+ rawOut(msg);
+ ::abort();
+ }
+
+    // processes a single request message and fills in dbresponse with any reply
+ void _assembleResponse( Message &m, DbResponse &dbresponse, const HostAndPort& remote ) {
+
+ // before we lock...
+ int op = m.operation();
+ bool isCommand = false;
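+        // the message body begins with a 4-byte int (flags/reserved, depending on
+        // the op), followed by the ns cstring -- hence the +4 below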
+ const char *ns = m.singleData()->_data + 4;
+ if ( op == dbQuery ) {
+ if( strstr(ns, ".$cmd") ) {
+ isCommand = true;
+ opwrite(m);
+ if( strstr(ns, ".$cmd.sys.") ) {
+ if( strstr(ns, "$cmd.sys.inprog") ) {
+ inProgCmd(m, dbresponse);
+ return;
+ }
+ if( strstr(ns, "$cmd.sys.killop") ) {
+ killOp(m, dbresponse);
+ return;
+ }
+ if( strstr(ns, "$cmd.sys.unlock") ) {
+ unlockFsync(ns, m, dbresponse);
+ return;
+ }
+ }
+ }
+ else {
+ opread(m);
+ }
+ }
+ else if( op == dbGetMore ) {
+ opread(m);
+ }
+ else {
+ opwrite(m);
+ }
+
+ globalOpCounters.gotOp( op , isCommand );
+
+ Client& c = cc();
+
+ auto_ptr<CurOp> nestedOp;
+ CurOp* currentOpP = c.curop();
+ if ( currentOpP->active() ) {
+ nestedOp.reset( new CurOp( &c , currentOpP ) );
+ currentOpP = nestedOp.get();
+ }
+ CurOp& currentOp = *currentOpP;
+ currentOp.reset(remote,op);
+
+ OpDebug& debug = currentOp.debug();
+ debug.op = op;
+
+ int logThreshold = cmdLine.slowMS;
+ bool log = logLevel >= 1;
+
+ if ( op == dbQuery ) {
+ if ( handlePossibleShardedMessage( m , &dbresponse ) )
+ return;
+ receivedQuery(c , dbresponse, m );
+ }
+ else if ( op == dbGetMore ) {
+ if ( ! receivedGetMore(dbresponse, m, currentOp) )
+ log = true;
+ }
+ else if ( op == dbMsg ) {
+ // deprecated - replaced by commands
+ char *p = m.singleData()->_data;
+ int len = strlen(p);
+ if ( len > 400 )
+ out() << curTimeMillis64() % 10000 <<
+ " long msg received, len:" << len << endl;
+
+ Message *resp = new Message();
+ if ( strcmp( "end" , p ) == 0 )
+ resp->setData( opReply , "dbMsg end no longer supported" );
+ else
+ resp->setData( opReply , "i am fine - dbMsg deprecated");
+
+ dbresponse.response = resp;
+ dbresponse.responseTo = m.header()->id;
+ }
+ else {
+ const char *ns = m.singleData()->_data + 4;
+ char cl[256];
+ nsToDatabase(ns, cl);
+ if( ! c.getAuthenticationInfo()->isAuthorized(cl) ) {
+ uassert_nothrow("unauthorized");
+ }
+ else {
+ try {
+ if ( op == dbInsert ) {
+ receivedInsert(m, currentOp);
+ }
+ else if ( op == dbUpdate ) {
+ receivedUpdate(m, currentOp);
+ }
+ else if ( op == dbDelete ) {
+ receivedDelete(m, currentOp);
+ }
+ else if ( op == dbKillCursors ) {
+ currentOp.ensureStarted();
+ logThreshold = 10;
+ receivedKillCursors(m);
+ }
+ else {
+ mongo::log() << " operation isn't supported: " << op << endl;
+ currentOp.done();
+ log = true;
+ }
+ }
+ catch ( UserException& ue ) {
+ tlog(3) << " Caught Assertion in " << opToString(op) << ", continuing " << ue.toString() << endl;
+ debug.exceptionInfo = ue.getInfo();
+ }
+ catch ( AssertionException& e ) {
+ tlog(3) << " Caught Assertion in " << opToString(op) << ", continuing " << e.toString() << endl;
+ debug.exceptionInfo = e.getInfo();
+ log = true;
+ }
+ }
+ }
+ currentOp.ensureStarted();
+ currentOp.done();
+ debug.executionTime = currentOp.totalTimeMillis();
+
+ //DEV log = true;
+ if ( log || debug.executionTime > logThreshold ) {
+ if( logLevel < 3 && op == dbGetMore && strstr(ns, ".oplog.") && debug.executionTime < 4300 && !log ) {
+ /* it's normal for getMore on the oplog to be slow because of use of awaitdata flag. */
+ }
+ else {
+ mongo::tlog() << debug << endl;
+ }
+ }
+
+ if ( currentOp.shouldDBProfile( debug.executionTime ) ) {
+ // performance profiling is on
+ if ( d.dbMutex.getState() < 0 ) {
+ mongo::log(1) << "note: not profiling because recursive read lock" << endl;
+ }
+ else {
+ writelock lk;
+ if ( dbHolder()._isLoaded( nsToDatabase( currentOp.getNS() ) , dbpath ) ) {
+ Client::Context cx( currentOp.getNS() );
+ profile(c , currentOp );
+ }
+ else {
+ mongo::log() << "note: not profiling because db went away - probably a close on: " << currentOp.getNS() << endl;
+ }
+ }
+ }
+
+ debug.reset();
+ } /* _assembleResponse() */
+
+ void assembleResponse( Message &m, DbResponse &dbresponse, const HostAndPort& remote ) {
+ PageFaultRetryableSection s;
+ while( 1 ) {
+ try {
+ _assembleResponse( m, dbresponse, remote );
+ break;
+ }
+ catch( PageFaultException& e ) {
+ DEV log() << "TEMP PageFaultException touch and retry" << endl;
+ e.touch();
+ }
+ }
+ }
+
+ void receivedKillCursors(Message& m) {
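+        // wire format (per the parsing below): int32 reserved, int32 n, then n
+        // int64 cursor ids -- hence dataSize must equal 8 + 8*n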
+ int *x = (int *) m.singleData()->_data;
+ x++; // reserved
+ int n = *x++;
+
+ uassert( 13659 , "sent 0 cursors to kill" , n != 0 );
+ massert( 13658 , str::stream() << "bad kill cursors size: " << m.dataSize() , m.dataSize() == 8 + ( 8 * n ) );
+ uassert( 13004 , str::stream() << "sent negative cursors to kill: " << n , n >= 1 );
+
+ if ( n > 2000 ) {
+ log( n < 30000 ? LL_WARNING : LL_ERROR ) << "receivedKillCursors, n=" << n << endl;
+ assert( n < 30000 );
+ }
+
+ int found = ClientCursor::erase(n, (long long *) x);
+
+ if ( logLevel > 0 || found != n ) {
+ log( found == n ) << "killcursors: found " << found << " of " << n << endl;
+ }
+
+ }
+
+ /* db - database name
+ path - db directory
+ */
+ /*static*/ void Database::closeDatabase( const char *db, const string& path ) {
+ assertInWriteLock();
+
+ Client::Context * ctx = cc().getContext();
+ assert( ctx );
+ assert( ctx->inDB( db , path ) );
+ Database *database = ctx->db();
+ assert( database->name == db );
+
+ oplogCheckCloseDatabase( database ); // oplog caches some things, dirty its caches
+
+ if( BackgroundOperation::inProgForDb(db) ) {
+ log() << "warning: bg op in prog during close db? " << db << endl;
+ }
+
+ /* important: kill all open cursors on the database */
+ string prefix(db);
+ prefix += '.';
+ ClientCursor::invalidate(prefix.c_str());
+
+ NamespaceDetailsTransient::clearForPrefix( prefix.c_str() );
+
+ dbHolderW().erase( db, path );
+ ctx->_clear();
+ delete database; // closes files
+ }
+
+ void receivedUpdate(Message& m, CurOp& op) {
+ DbMessage d(m);
+ const char *ns = d.getns();
+ op.debug().ns = ns;
+ int flags = d.pullInt();
+ BSONObj query = d.nextJsObj();
+
+ assert( d.moreJSObjs() );
+ assert( query.objsize() < m.header()->dataLen() );
+ BSONObj toupdate = d.nextJsObj();
+ uassert( 10055 , "update object too large", toupdate.objsize() <= BSONObjMaxUserSize);
+ assert( toupdate.objsize() < m.header()->dataLen() );
+ assert( query.objsize() + toupdate.objsize() < m.header()->dataLen() );
+ bool upsert = flags & UpdateOption_Upsert;
+ bool multi = flags & UpdateOption_Multi;
+ bool broadcast = flags & UpdateOption_Broadcast;
+
+ op.debug().query = query;
+ op.setQuery(query);
+
+ writelock lk;
+
+        // ReplSetImpl::relinquish() uses the big write lock, so
+        // this is synchronized with it, given our lock above.
+ uassert( 10054 , "not master", isMasterNs( ns ) );
+
+ // if this ever moves to outside of lock, need to adjust check Client::Context::_finishInit
+ if ( ! broadcast && handlePossibleShardedMessage( m , 0 ) )
+ return;
+
+ Client::Context ctx( ns );
+
+ UpdateResult res = updateObjects(ns, toupdate, query, upsert, multi, true, op.debug() );
+ lastError.getSafe()->recordUpdate( res.existing , res.num , res.upserted ); // for getlasterror
+ }
+
+ void receivedDelete(Message& m, CurOp& op) {
+ DbMessage d(m);
+ const char *ns = d.getns();
+ op.debug().ns = ns;
+ int flags = d.pullInt();
+ bool justOne = flags & RemoveOption_JustOne;
+ bool broadcast = flags & RemoveOption_Broadcast;
+ assert( d.moreJSObjs() );
+ BSONObj pattern = d.nextJsObj();
+
+ op.debug().query = pattern;
+ op.setQuery(pattern);
+
+ writelock lk(ns);
+
+ // writelock is used to synchronize stepdowns w/ writes
+ uassert( 10056 , "not master", isMasterNs( ns ) );
+
+        // if this check ever moves outside of the lock, the check in Client::Context::_finishInit needs adjusting
+ if ( ! broadcast && handlePossibleShardedMessage( m , 0 ) )
+ return;
+
+ Client::Context ctx(ns);
+
+ long long n = deleteObjects(ns, pattern, justOne, true);
+ lastError.getSafe()->recordDelete( n );
+ }
+
+ QueryResult* emptyMoreResult(long long);
+
+ void OpTime::waitForDifferent(unsigned millis){
+ DEV d.dbMutex.assertAtLeastReadLocked();
+
+ if (*this != last) return; // check early
+
+ boost::xtime timeout;
+ boost::xtime_get(&timeout, boost::TIME_UTC);
+
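+        // add the wait interval, carrying nanosecond overflow into the seconds field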
+ timeout.nsec += millis * 1000*1000;
+ if (timeout.nsec >= 1000*1000*1000){
+ timeout.nsec -= 1000*1000*1000;
+ timeout.sec += 1;
+ }
+
+ do {
+ dbtemprelease tmp;
+ boost::mutex::scoped_lock lk(notifyMutex());
+ if (!notifier().timed_wait(lk, timeout))
+ return; // timed out
+ } while (*this != last);
+ }
+
+ bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) {
+ bool ok = true;
+
+ DbMessage d(m);
+
+ const char *ns = d.getns();
+ int ntoreturn = d.pullInt();
+ long long cursorid = d.pullInt64();
+
+ curop.debug().ns = ns;
+ curop.debug().ntoreturn = ntoreturn;
+ curop.debug().cursorid = cursorid;
+
+ time_t start = 0;
+ int pass = 0;
+ bool exhaust = false;
+ QueryResult* msgdata;
+ OpTime last;
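+        // processGetMore may return 0 (no data yet); in that case we sleep briefly
+        // and retry, giving up after ~4 seconds so a slave can checkpoint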
+ while( 1 ) {
+ try {
+ Client::ReadContext ctx(ns);
+ if (str::startsWith(ns, "local.oplog.")){
+ if (pass == 0)
+ last = OpTime::last_inlock();
+ else
+ last.waitForDifferent(1000/*ms*/);
+ }
+ msgdata = processGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust);
+ }
+ catch ( AssertionException& e ) {
+ exhaust = false;
+ curop.debug().exceptionInfo = e.getInfo();
+ msgdata = emptyMoreResult(cursorid);
+ ok = false;
+ }
+ if (msgdata == 0) {
+ exhaust = false;
+ massert(13073, "shutting down", !inShutdown() );
+ if( pass == 0 ) {
+ start = time(0);
+ }
+ else {
+ if( time(0) - start >= 4 ) {
+                        // after about 4 seconds, give up and return (pass normally stops at 1000);
+                        // we want to return occasionally so a slave can checkpoint.
+ pass = 10000;
+ }
+ }
+ pass++;
+ if (debug)
+ sleepmillis(20);
+ else
+ sleepmillis(2);
+ continue;
+ }
+ break;
+        }
+
+ Message *resp = new Message();
+ resp->setData(msgdata, true);
+ curop.debug().responseLength = resp->header()->dataLen();
+ curop.debug().nreturned = msgdata->nReturned;
+
+ dbresponse.response = resp;
+ dbresponse.responseTo = m.header()->id;
+
+ if( exhaust ) {
+ curop.debug().exhaust = true;
+ dbresponse.exhaust = ns;
+ }
+
+ return ok;
+ }
+
+ void checkAndInsert(const char *ns, /*modifies*/BSONObj& js) {
+ uassert( 10059 , "object to insert too large", js.objsize() <= BSONObjMaxUserSize);
+ {
+ // check no $ modifiers. note we only check top level. (scanning deep would be quite expensive)
+ BSONObjIterator i( js );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ uassert( 13511 , "document to insert can't have $ fields" , e.fieldName()[0] != '$' );
+ }
+ }
+ theDataFileMgr.insertWithObjMod(ns, js, false); // js may be modified in the call to add an _id field.
+ logOp("i", ns, js);
+ }
+
+ NOINLINE_DECL void insertMulti(bool keepGoing, const char *ns, vector<BSONObj>& objs) {
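+        // insert the objects one at a time; a failure aborts the batch unless
+        // keepGoing (ContinueOnError) is set, and a failure on the last object
+        // is always rethrown. the insert opcounter is bumped once, with i.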
+ size_t i;
+ for (i=0; i<objs.size(); i++){
+ try {
+ checkAndInsert(ns, objs[i]);
+ getDur().commitIfNeeded();
+ } catch (const UserException&) {
+ if (!keepGoing || i == objs.size()-1){
+ globalOpCounters.incInsertInWriteLock(i);
+ throw;
+ }
+ // otherwise ignore and keep going
+ }
+ }
+
+ globalOpCounters.incInsertInWriteLock(i);
+ }
+
+ void receivedInsert(Message& m, CurOp& op) {
+ DbMessage d(m);
+ const char *ns = d.getns();
+ op.debug().ns = ns;
+
+ if( !d.moreJSObjs() ) {
+ // strange. should we complain?
+ return;
+ }
+ BSONObj first = d.nextJsObj();
+
+ vector<BSONObj> multi;
+ while (d.moreJSObjs()){
+ if (multi.empty()) // first pass
+ multi.push_back(first);
+ multi.push_back( d.nextJsObj() );
+ }
+
+ writelock lk(ns);
+ //LockCollectionExclusively lk(ns);
+
+        // CONCURRENCY TODO: is holding the big lock in read mode sufficient here?
+ // writelock is used to synchronize stepdowns w/ writes
+ uassert( 10058 , "not master", isMasterNs(ns) );
+
+ if ( handlePossibleShardedMessage( m , 0 ) )
+ return;
+
+ Client::Context ctx(ns);
+
+ if( !multi.empty() ) {
+ const bool keepGoing = d.reservedField() & InsertOption_ContinueOnError;
+ insertMulti(keepGoing, ns, multi);
+ return;
+ }
+
+ checkAndInsert(ns, first);
+ globalOpCounters.incInsertInWriteLock(1);
+ }
+
+ void getDatabaseNames( vector< string > &names , const string& usePath ) {
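+        // two on-disk layouts: with --directoryperdb each database has its own
+        // directory containing <db>.ns; otherwise <db>.ns files live directly
+        // under the db path.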
+ boost::filesystem::path path( usePath );
+ for ( boost::filesystem::directory_iterator i( path );
+ i != boost::filesystem::directory_iterator(); ++i ) {
+ if ( directoryperdb ) {
+ boost::filesystem::path p = *i;
+ string dbName = p.leaf();
+ p /= ( dbName + ".ns" );
+ if ( MMF::exists( p ) )
+ names.push_back( dbName );
+ }
+ else {
+ string fileName = boost::filesystem::path(*i).leaf();
+ if ( fileName.length() > 3 && fileName.substr( fileName.length() - 3, 3 ) == ".ns" )
+ names.push_back( fileName.substr( 0, fileName.length() - 3 ) );
+ }
+ }
+ }
+
+ /* returns true if there is data on this server. useful when starting replication.
+ local database does NOT count except for rsoplog collection.
+ used to set the hasData field on replset heartbeat command response
+ */
+ bool replHasDatabases() {
+ vector<string> names;
+ getDatabaseNames(names);
+ if( names.size() >= 2 ) return true;
+ if( names.size() == 1 ) {
+ if( names[0] != "local" )
+ return true;
+ // we have a local database. return true if oplog isn't empty
+ {
+ readlock lk(rsoplog);
+ BSONObj o;
+ if( Helpers::getFirst(rsoplog, o) )
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool DBDirectClient::call( Message &toSend, Message &response, bool assertOk , string * actualServer ) {
+ if ( lastError._get() )
+ lastError.startRequest( toSend, lastError._get() );
+ DbResponse dbResponse;
+ assembleResponse( toSend, dbResponse , _clientHost );
+ assert( dbResponse.response );
+ dbResponse.response->concat(); // can get rid of this if we make response handling smarter
+ response = *dbResponse.response;
+ getDur().commitIfNeeded();
+ return true;
+ }
+
+ void DBDirectClient::say( Message &toSend, bool isRetry ) {
+ if ( lastError._get() )
+ lastError.startRequest( toSend, lastError._get() );
+ DbResponse dbResponse;
+ assembleResponse( toSend, dbResponse , _clientHost );
+ getDur().commitIfNeeded();
+ }
+
+ auto_ptr<DBClientCursor> DBDirectClient::query(const string &ns, Query query, int nToReturn , int nToSkip ,
+ const BSONObj *fieldsToReturn , int queryOptions ) {
+
+ //if ( ! query.obj.isEmpty() || nToReturn != 0 || nToSkip != 0 || fieldsToReturn || queryOptions )
+ return DBClientBase::query( ns , query , nToReturn , nToSkip , fieldsToReturn , queryOptions );
+ //
+ //assert( query.obj.isEmpty() );
+ //throw UserException( (string)"yay:" + ns );
+ }
+
+ void DBDirectClient::killCursor( long long id ) {
+ ClientCursor::erase( id );
+ }
+
+ HostAndPort DBDirectClient::_clientHost = HostAndPort( "0.0.0.0" , 0 );
+
+ unsigned long long DBDirectClient::count(const string &ns, const BSONObj& query, int options, int limit, int skip ) {
+ LockCollectionForReading lk( ns );
+ string errmsg;
+ long long res = runCount( ns.c_str() , _countCmd( ns , query , options , limit , skip ) , errmsg );
+ if ( res == -1 )
+ return 0;
+ uassert( 13637 , str::stream() << "count failed in DBDirectClient: " << errmsg , res >= 0 );
+ return (unsigned long long )res;
+ }
+
+ DBClientBase * createDirectClient() {
+ return new DBDirectClient();
+ }
+
+ mongo::mutex exitMutex("exit");
+ AtomicUInt numExitCalls = 0;
+
+ bool inShutdown() {
+ return numExitCalls > 0;
+ }
+
+ void tryToOutputFatal( const string& s ) {
+ try {
+ rawOut( s );
+ return;
+ }
+ catch ( ... ) {}
+
+ try {
+ cerr << s << endl;
+ return;
+ }
+ catch ( ... ) {}
+
+        // uh-oh, not sure there is anything else we can do...
+ }
+
+ /** also called by ntservice.cpp */
+ void shutdownServer() {
+
+ log() << "shutdown: going to close listening sockets..." << endl;
+ ListeningSockets::get()->closeAll();
+
+ log() << "shutdown: going to flush diaglog..." << endl;
+ _diaglog.flush();
+
+ /* must do this before unmapping mem or you may get a seg fault */
+ log() << "shutdown: going to close sockets..." << endl;
+ boost::thread close_socket_thread( boost::bind(MessagingPort::closeAllSockets, 0) );
+
+ // wait until file preallocation finishes
+ // we would only hang here if the file_allocator code generates a
+ // synchronous signal, which we don't expect
+ log() << "shutdown: waiting for fs preallocator..." << endl;
+ FileAllocator::get()->waitUntilFinished();
+
+ if( cmdLine.dur ) {
+ log() << "shutdown: lock for final commit..." << endl;
+ {
+ int n = 10;
+ while( 1 ) {
+ // we may already be in a read lock from earlier in the call stack, so do read lock here
+ // to be consistent with that.
+ readlocktry w("", 20000);
+ if( w.got() ) {
+ log() << "shutdown: final commit..." << endl;
+ getDur().commitNow();
+ break;
+ }
+ if( --n <= 0 ) {
+ log() << "shutdown: couldn't acquire write lock, aborting" << endl;
+ mongoAbort("couldn't acquire write lock");
+ }
+ log() << "shutdown: waiting for write lock..." << endl;
+ }
+ }
+ MemoryMappedFile::flushAll(true);
+ }
+
+ log() << "shutdown: closing all files..." << endl;
+ stringstream ss3;
+ MemoryMappedFile::closeAllFiles( ss3 );
+ log() << ss3.str() << endl;
+
+ if( cmdLine.dur ) {
+ dur::journalCleanup(true);
+ }
+
+#if !defined(__sunos__)
+ if ( lockFile ) {
+ log() << "shutdown: removing fs lock..." << endl;
+ /* This ought to be an unlink(), but Eliot says the last
+ time that was attempted, there was a race condition
+ with acquirePathLock(). */
+#ifdef _WIN32
+ if( _chsize( lockFile , 0 ) )
+ log() << "couldn't remove fs lock " << WSAGetLastError() << endl;
+ CloseHandle(lockFileHandle);
+#else
+ if( ftruncate( lockFile , 0 ) )
+ log() << "couldn't remove fs lock " << errnoWithDescription() << endl;
+ flock( lockFile, LOCK_UN );
+#endif
+ }
+#endif
+ }
+
+ void exitCleanly( ExitCode code ) {
+ killCurrentOp.killAll();
+ {
+ dblock lk;
+ log() << "now exiting" << endl;
+ dbexit( code );
+ }
+ }
+
+
+ namespace dur {
+ extern mutex groupCommitMutex;
+ }
+
+ /* not using log() herein in case we are already locked */
+ NOINLINE_DECL void dbexit( ExitCode rc, const char *why, bool tryToGetLock ) {
+
+ auto_ptr<writelocktry> wlt;
+ if ( tryToGetLock ) {
+ wlt.reset( new writelocktry( "" , 2 * 60 * 1000 ) );
+ uassert( 13455 , "dbexit timed out getting lock" , wlt->got() );
+ }
+
+ Client * c = currentClient.get();
+ {
+ scoped_lock lk( exitMutex );
+ if ( numExitCalls++ > 0 ) {
+ if ( numExitCalls > 5 ) {
+ // this means something horrible has happened
+ ::_exit( rc );
+ }
+ stringstream ss;
+ ss << "dbexit: " << why << "; exiting immediately";
+ tryToOutputFatal( ss.str() );
+ if ( c ) c->shutdown();
+ ::exit( rc );
+ }
+ }
+
+ {
+ stringstream ss;
+ ss << "dbexit: " << why;
+ tryToOutputFatal( ss.str() );
+ }
+
+ try {
+ shutdownServer(); // gracefully shutdown instance
+ }
+ catch ( ... ) {
+ tryToOutputFatal( "shutdown failed with exception" );
+ }
+
+#if defined(_DEBUG)
+ try {
+ mutexDebugger.programEnding();
+ }
+ catch (...) { }
+#endif
+
+ // block the dur thread from doing any work for the rest of the run
+ log(2) << "shutdown: groupCommitMutex" << endl;
+ scoped_lock lk(dur::groupCommitMutex);
+
+#ifdef _WIN32
+ // Windows Service Controller wants to be told when we are down,
+ // so don't call ::exit() yet, or say "really exiting now"
+ //
+ if ( rc == EXIT_WINDOWS_SERVICE_STOP ) {
+ if ( c ) c->shutdown();
+ return;
+ }
+#endif
+ tryToOutputFatal( "dbexit: really exiting now" );
+ if ( c ) c->shutdown();
+ ::exit(rc);
+ }
+
+#if !defined(__sunos__)
+ void writePid(int fd) {
+ stringstream ss;
+ ss << getpid() << endl;
+ string s = ss.str();
+ const char * data = s.c_str();
+#ifdef _WIN32
+ assert ( _write( fd, data, strlen( data ) ) );
+#else
+ assert ( write( fd, data, strlen( data ) ) );
+#endif
+ }
+
+ void acquirePathLock(bool doingRepair) {
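+        // the lock file is <dbpath>/mongod.lock; a pre-existing non-empty file
+        // indicates an unclean shutdown, handled below once we hold the lock.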
+ string name = ( boost::filesystem::path( dbpath ) / "mongod.lock" ).native_file_string();
+
+ bool oldFile = false;
+
+ if ( boost::filesystem::exists( name ) && boost::filesystem::file_size( name ) > 0 ) {
+ oldFile = true;
+ }
+
+#ifdef _WIN32
+ lockFileHandle = CreateFileA( name.c_str(), GENERIC_READ | GENERIC_WRITE,
+ 0 /* do not allow anyone else access */, NULL,
+ OPEN_ALWAYS /* success if fh can open */, 0, NULL );
+
+ if (lockFileHandle == INVALID_HANDLE_VALUE) {
+ DWORD code = GetLastError();
+ char *msg;
+ FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM,
+ NULL, code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+ (LPSTR)&msg, 0, NULL);
+ string m = msg;
+ str::stripTrailing(m, "\r\n");
+ uasserted( 13627 , str::stream() << "Unable to create/open lock file: " << name << ' ' << m << ". Is a mongod instance already running?" );
+ }
+ lockFile = _open_osfhandle((intptr_t)lockFileHandle, 0);
+#else
+ lockFile = open( name.c_str(), O_RDWR | O_CREAT , S_IRWXU | S_IRWXG | S_IRWXO );
+ if( lockFile <= 0 ) {
+ uasserted( 10309 , str::stream() << "Unable to create/open lock file: " << name << ' ' << errnoWithDescription() << " Is a mongod instance already running?" );
+ }
+ if (flock( lockFile, LOCK_EX | LOCK_NB ) != 0) {
+ close ( lockFile );
+ lockFile = 0;
+ uassert( 10310 , "Unable to lock file: " + name + ". Is a mongod instance already running?", 0 );
+ }
+#endif
+
+ if ( oldFile ) {
+            // we check the old-file case only now because we first want to see
+            // whether we can get the lock; if we can't, it's probably just another mongod running
+
+ string errmsg;
+ if (cmdLine.dur) {
+ if (!dur::haveJournalFiles()) {
+
+ vector<string> dbnames;
+ getDatabaseNames( dbnames );
+
+                    if ( dbnames.size() == 0 ) {
+                        // mongod crashed between initial startup and journal
+                        // initialization; it is safe to continue
+                    }
+ else {
+ errmsg = str::stream()
+ << "************** \n"
+ << "old lock file: " << name << ". probably means unclean shutdown,\n"
+ << "but there are no journal files to recover.\n"
+ << "this is likely human error or filesystem corruption.\n"
+ << "found " << dbnames.size() << " dbs.\n"
+ << "see: http://dochub.mongodb.org/core/repair for more information\n"
+ << "*************";
+ }
+
+
+ }
+ }
+ else {
+ if (!dur::haveJournalFiles() && !doingRepair) {
+ errmsg = str::stream()
+ << "************** \n"
+ << "Unclean shutdown detected.\n"
+ << "Please visit http://dochub.mongodb.org/core/repair for recovery instructions.\n"
+ << "*************";
+ }
+ }
+
+ if (!errmsg.empty()) {
+ cout << errmsg << endl;
+#ifdef _WIN32
+ CloseHandle( lockFileHandle );
+#else
+ close ( lockFile );
+#endif
+ lockFile = 0;
+ uassert( 12596 , "old lock file" , 0 );
+ }
+ }
+
+ // Not related to lock file, but this is where we handle unclean shutdown
+ if( !cmdLine.dur && dur::haveJournalFiles() ) {
+ cout << "**************" << endl;
+ cout << "Error: journal files are present in journal directory, yet starting without journaling enabled." << endl;
+ cout << "It is recommended that you start with journaling enabled so that recovery may occur." << endl;
+ cout << "**************" << endl;
+ uasserted(13597, "can't start without --journal enabled when journal/ files are present");
+ }
+
+#ifdef _WIN32
+ uassert( 13625, "Unable to truncate lock file", _chsize(lockFile, 0) == 0);
+ writePid( lockFile );
+ _commit( lockFile );
+#else
+ uassert( 13342, "Unable to truncate lock file", ftruncate(lockFile, 0) == 0);
+ writePid( lockFile );
+ fsync( lockFile );
+ flushMyDirectory(name);
+#endif
+ }
+#else
+ void acquirePathLock(bool) {
+        // TODO - it is bad that the lock-file handling above does not run here.
+
+ // Not related to lock file, but this is where we handle unclean shutdown
+ if( !cmdLine.dur && dur::haveJournalFiles() ) {
+ cout << "**************" << endl;
+ cout << "Error: journal files are present in journal directory, yet starting without --journal enabled." << endl;
+ cout << "It is recommended that you start with journaling enabled so that recovery may occur." << endl;
+ cout << "Alternatively (not recommended), you can backup everything, then delete the journal files, and run --repair" << endl;
+ cout << "**************" << endl;
+ uasserted(13618, "can't start without --journal enabled when journal/ files are present");
+ }
+ }
+#endif
+
+} // namespace mongo
diff --git a/src/mongo/db/instance.h b/src/mongo/db/instance.h
new file mode 100644
index 00000000000..9dde729997d
--- /dev/null
+++ b/src/mongo/db/instance.h
@@ -0,0 +1,174 @@
+// instance.h : Global state functions.
+//
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+
+#include "../client/dbclient.h"
+#include "curop-inl.h"
+#include "security.h"
+#include "cmdline.h"
+#include "client.h"
+
+namespace mongo {
+
+ extern string dbExecCommand;
+
+ /** a high level recording of operations to the database - sometimes used for diagnostics
+ and debugging.
+ */
+ class DiagLog {
+ ofstream *f; // note this is never freed
+        /* level is a bitmask: 1 = log writes, 2 = log reads,
+           4 = log only an occasional read rather than every one.
+           so: 0 = off; 1 = writes; 2 = reads; 3 = both;
+           7 = log a few reads, and all writes.
+        */
+ int level;
+ mongo::mutex mutex;
+ void openFile() {
+ assert( f == 0 );
+ stringstream ss;
+ ss << dbpath << "/diaglog." << hex << time(0);
+ string name = ss.str();
+ f = new ofstream(name.c_str(), ios::out | ios::binary);
+ if ( ! f->good() ) {
+ problem() << "diagLogging couldn't open " << name << endl;
+                // TODO: decide what this should throw; a bare int is dubious:
+                throw 1717;
+ }
+ else {
+ log() << "diagLogging using file " << name << endl;
+ }
+ }
+ public:
+ DiagLog() : f(0) , level(0), mutex("DiagLog") { }
+ int getLevel() const { return level; }
+ /**
+ * @return old
+ */
+ int setLevel( int newLevel ) {
+ scoped_lock lk(mutex);
+ int old = level;
+ log() << "diagLogging level=" << newLevel << endl;
+ if( f == 0 ) {
+ openFile();
+ }
+ level = newLevel; // must be done AFTER f is set
+ return old;
+ }
+ void flush() {
+ if ( level ) {
+ log() << "flushing diag log" << endl;
+ scoped_lock lk(mutex);
+ f->flush();
+ }
+ }
+ void write(char *data,int len) {
+ if ( level & 1 ) {
+ scoped_lock lk(mutex);
+ f->write(data,len);
+ }
+ }
+ void readop(char *data, int len) {
+ if ( level & 2 ) {
+ bool log = (level & 4) == 0;
+ OCCASIONALLY log = true;
+ if ( log ) {
+ scoped_lock lk(mutex);
+ assert( f );
+ f->write(data,len);
+ }
+ }
+ }
+ };
+
+ extern DiagLog _diaglog;
+
+ /* we defer response until we unlock. don't want a blocked socket to
+ keep things locked.
+ */
+ struct DbResponse {
+ Message *response;
+ MSGID responseTo;
+ const char *exhaust; /* points to ns if exhaust mode. 0=normal mode*/
+ DbResponse(Message *r, MSGID rt) : response(r), responseTo(rt), exhaust(0) { }
+ DbResponse() {
+ response = 0;
+ exhaust = 0;
+ }
+ ~DbResponse() { delete response; }
+ };
+
+ void assembleResponse( Message &m, DbResponse &dbresponse, const HostAndPort &client );
+
+ void getDatabaseNames( vector< string > &names , const string& usePath = dbpath );
+
+    /* returns true if there is data on this server. useful when starting replication.
+       local database does NOT count.
+    */
+ bool replHasDatabases();
+
+ /** "embedded" calls to the local server directly.
+ Caller does not need to lock, that is handled within.
+ */
+ class DBDirectClient : public DBClientBase {
+ public:
+ virtual auto_ptr<DBClientCursor> query(const string &ns, Query query, int nToReturn = 0, int nToSkip = 0,
+ const BSONObj *fieldsToReturn = 0, int queryOptions = 0);
+
+ virtual bool isFailed() const {
+ return false;
+ }
+ virtual string toString() {
+ return "DBDirectClient";
+ }
+ virtual string getServerAddress() const {
+ return "localhost"; // TODO: should this have the port?
+ }
+ virtual bool call( Message &toSend, Message &response, bool assertOk=true , string * actualServer = 0 );
+ virtual void say( Message &toSend, bool isRetry = false );
+ virtual void sayPiggyBack( Message &toSend ) {
+ // don't need to piggy back when connected locally
+ return say( toSend );
+ }
+
+ virtual void killCursor( long long cursorID );
+
+ virtual bool callRead( Message& toSend , Message& response ) {
+ return call( toSend , response );
+ }
+
+ virtual unsigned long long count(const string &ns, const BSONObj& query = BSONObj(), int options=0, int limit=0, int skip=0 );
+
+ virtual ConnectionString::ConnectionType type() const { return ConnectionString::MASTER; }
+
+ double getSoTimeout() const { return 0; }
+
+ virtual bool lazySupported() const { return true; }
+ private:
+ static HostAndPort _clientHost;
+ };
+
+ extern int lockFile;
+#ifdef _WIN32
+ extern HANDLE lockFileHandle;
+#endif
+ void acquirePathLock(bool doingRepair=false); // if doingRepair=true don't consider unclean shutdown an error
+ void maybeCreatePidFile();
+
+} // namespace mongo
diff --git a/src/mongo/db/introspect.cpp b/src/mongo/db/introspect.cpp
new file mode 100644
index 00000000000..7e1d19ce2f3
--- /dev/null
+++ b/src/mongo/db/introspect.cpp
@@ -0,0 +1,88 @@
+// introspect.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "introspect.h"
+#include "../bson/util/builder.h"
+#include "../util/goodies.h"
+#include "pdfile.h"
+#include "jsobj.h"
+#include "curop.h"
+
+namespace mongo {
+
+ BufBuilder profileBufBuilder; // reused, instead of allocated every time - avoids a malloc/free cycle
+
+ void profile( const Client& c , CurOp& currentOp ) {
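+        // writes a profiling record for currentOp into the db's profile
+        // collection; entries over 100KB are abbreviated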
+ assertInWriteLock();
+
+ Database *db = c.database();
+ DEV assert( db );
+ const char *ns = db->profileName.c_str();
+
+ // build object
+ profileBufBuilder.reset();
+ BSONObjBuilder b(profileBufBuilder);
+ b.appendDate("ts", jsTime());
+ currentOp.debug().append( currentOp , b );
+
+ b.append("client", c.clientAddress() );
+
+ if ( c.getAuthenticationInfo() )
+ b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) );
+
+ BSONObj p = b.done();
+
+ if (p.objsize() > 100*1024){
+ string small = p.toString(/*isArray*/false, /*full*/false);
+
+ warning() << "can't add full line to system.profile: " << small;
+
+ // rebuild with limited info
+ BSONObjBuilder b(profileBufBuilder);
+ b.appendDate("ts", jsTime());
+ b.append("client", c.clientAddress() );
+ if ( c.getAuthenticationInfo() )
+ b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) );
+
+ b.append("err", "profile line too large (max is 100KB)");
+ if (small.size() < 100*1024){ // should be much smaller but if not don't break anything
+ b.append("abbreviated", small);
+ }
+
+ p = b.done();
+ }
+
+ // write: not replicated
+ NamespaceDetails *d = db->namespaceIndex.details(ns);
+ if( d ) {
+ int len = p.objsize();
+ Record *r = theDataFileMgr.fast_oplog_insert(d, ns, len);
+ memcpy(getDur().writingPtr(r->data, len), p.objdata(), len);
+ }
+ else {
+ static time_t last;
+ if( time(0) > last+10 ) {
+ log() << "profile: warning ns " << ns << " does not exist" << endl;
+ last = time(0);
+ }
+ }
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/introspect.h b/src/mongo/db/introspect.h
new file mode 100644
index 00000000000..209eeacab7c
--- /dev/null
+++ b/src/mongo/db/introspect.h
@@ -0,0 +1,34 @@
+// introspect.h
+// system management stuff.
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "jsobj.h"
+#include "pdfile.h"
+
+namespace mongo {
+
+    /* --- profiling --------------------------------------------
+       invoked when database->profile is set
+    */
+
+ void profile( const Client& c , CurOp& currentOp );
+
+} // namespace mongo
diff --git a/src/mongo/db/javatest.cpp b/src/mongo/db/javatest.cpp
new file mode 100644
index 00000000000..22f2bdf8d3c
--- /dev/null
+++ b/src/mongo/db/javatest.cpp
@@ -0,0 +1,24 @@
+// javatest.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "javajs.h"
+
+int main() {
+ JavaJS = new JavaJSImpl();
+ javajstest();
+}
diff --git a/src/mongo/db/jsobj.cpp b/src/mongo/db/jsobj.cpp
new file mode 100644
index 00000000000..1e850982396
--- /dev/null
+++ b/src/mongo/db/jsobj.cpp
@@ -0,0 +1,1268 @@
+/** @file jsobj.cpp - BSON implementation
+ http://www.mongodb.org/display/DOCS/BSON
+*/
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "../bson/oid.h"
+#include "jsobj.h"
+#include "nonce.h"
+#include "../bson/util/atomic_int.h"
+#include "../util/base64.h"
+#include "../util/md5.hpp"
+#include <limits>
+#include <cmath>
+#include "../util/unittest.h"
+#include "../util/embedded_builder.h"
+#include "../util/stringutils.h"
+#include "../util/mongoutils/str.h"
+#include "json.h"
+#include "jsobjmanipulator.h"
+#include "../util/optime.h"
+#include <boost/static_assert.hpp>
+#undef assert
+#define assert MONGO_assert
+
+// make sure our assumptions are valid
+BOOST_STATIC_ASSERT( sizeof(short) == 2 );
+BOOST_STATIC_ASSERT( sizeof(int) == 4 );
+BOOST_STATIC_ASSERT( sizeof(long long) == 8 );
+BOOST_STATIC_ASSERT( sizeof(double) == 8 );
+BOOST_STATIC_ASSERT( sizeof(mongo::Date_t) == 8 );
+BOOST_STATIC_ASSERT( sizeof(mongo::OID) == 12 );
+
+namespace mongo {
+
+ BSONElement eooElement;
+
+ GENOIDLabeler GENOID;
+
+ DateNowLabeler DATENOW;
+ NullLabeler BSONNULL;
+
+ MinKeyLabeler MINKEY;
+ MaxKeyLabeler MAXKEY;
+
+ // need to move to bson/, but has dependency on base64 so move that to bson/util/ first.
+ inline string BSONElement::jsonString( JsonStringFormat format, bool includeFieldNames, int pretty ) const {
+ BSONType t = type();
+ int sign;
+ if ( t == Undefined )
+ return "undefined";
+
+ stringstream s;
+ if ( includeFieldNames )
+ s << '"' << escape( fieldName() ) << "\" : ";
+ switch ( type() ) {
+ case mongo::String:
+ case Symbol:
+ s << '"' << escape( string(valuestr(), valuestrsize()-1) ) << '"';
+ break;
+ case NumberLong:
+ s << _numberLong();
+ break;
+ case NumberInt:
+ case NumberDouble:
+ if ( number() >= -numeric_limits< double >::max() &&
+ number() <= numeric_limits< double >::max() ) {
+ s.precision( 16 );
+ s << number();
+ }
+ else if ( mongo::isNaN(number()) ) {
+ s << "NaN";
+ }
+ else if ( mongo::isInf(number(), &sign) ) {
+ s << ( sign == 1 ? "Infinity" : "-Infinity");
+ }
+ else {
+ StringBuilder ss;
+ ss << "Number " << number() << " cannot be represented in JSON";
+ string message = ss.str();
+ massert( 10311 , message.c_str(), false );
+ }
+ break;
+ case mongo::Bool:
+ s << ( boolean() ? "true" : "false" );
+ break;
+ case jstNULL:
+ s << "null";
+ break;
+ case Object:
+ s << embeddedObject().jsonString( format, pretty );
+ break;
+ case mongo::Array: {
+ if ( embeddedObject().isEmpty() ) {
+ s << "[]";
+ break;
+ }
+ s << "[ ";
+ BSONObjIterator i( embeddedObject() );
+ BSONElement e = i.next();
+ if ( !e.eoo() ) {
+ int count = 0;
+ while ( 1 ) {
+ if( pretty ) {
+ s << '\n';
+ for( int x = 0; x < pretty; x++ )
+ s << " ";
+ }
+
+ if (strtol(e.fieldName(), 0, 10) > count) {
+ s << "undefined";
+ }
+ else {
+ s << e.jsonString( format, false, pretty?pretty+1:0 );
+ e = i.next();
+ }
+ count++;
+ if ( e.eoo() )
+ break;
+ s << ", ";
+ }
+ }
+ s << " ]";
+ break;
+ }
+ case DBRef: {
+ mongo::OID *x = (mongo::OID *) (valuestr() + valuestrsize());
+ if ( format == TenGen )
+ s << "Dbref( ";
+ else
+ s << "{ \"$ref\" : ";
+ s << '"' << valuestr() << "\", ";
+ if ( format != TenGen )
+ s << "\"$id\" : ";
+ s << '"' << *x << "\" ";
+ if ( format == TenGen )
+ s << ')';
+ else
+ s << '}';
+ break;
+ }
+ case jstOID:
+ if ( format == TenGen ) {
+ s << "ObjectId( ";
+ }
+ else {
+ s << "{ \"$oid\" : ";
+ }
+ s << '"' << __oid() << '"';
+ if ( format == TenGen ) {
+ s << " )";
+ }
+ else {
+ s << " }";
+ }
+ break;
+ case BinData: {
+ int len = *(int *)( value() );
+ BinDataType type = BinDataType( *(char *)( (int *)( value() ) + 1 ) );
+ s << "{ \"$binary\" : \"";
+ char *start = ( char * )( value() ) + sizeof( int ) + 1;
+ base64::encode( s , start , len );
+ s << "\", \"$type\" : \"" << hex;
+ s.width( 2 );
+ s.fill( '0' );
+ s << type << dec;
+ s << "\" }";
+ break;
+ }
+ case mongo::Date:
+ if ( format == Strict )
+ s << "{ \"$date\" : ";
+ else
+ s << "Date( ";
+ if( pretty ) {
+ Date_t d = date();
+ if( d == 0 ) s << '0';
+ else
+ s << '"' << date().toString() << '"';
+ }
+ else
+ s << date();
+ if ( format == Strict )
+ s << " }";
+ else
+ s << " )";
+ break;
+ case RegEx:
+ if ( format == Strict ) {
+ s << "{ \"$regex\" : \"" << escape( regex() );
+ s << "\", \"$options\" : \"" << regexFlags() << "\" }";
+ }
+ else {
+ s << "/" << escape( regex() , true ) << "/";
+ // FIXME Worry about alpha order?
+ for ( const char *f = regexFlags(); *f; ++f ) {
+ switch ( *f ) {
+ case 'g':
+ case 'i':
+ case 'm':
+ s << *f;
+ default:
+ break;
+ }
+ }
+ }
+ break;
+
+ case CodeWScope: {
+ BSONObj scope = codeWScopeObject();
+ if ( ! scope.isEmpty() ) {
+ s << "{ \"$code\" : " << _asCode() << " , "
+ << " \"$scope\" : " << scope.jsonString() << " }";
+ break;
+ }
+ }
+
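+            // fall through: a CodeWScope with an empty scope prints the same as Code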
+ case Code:
+ s << _asCode();
+ break;
+
+ case Timestamp:
+ s << "{ \"t\" : " << timestampTime() << " , \"i\" : " << timestampInc() << " }";
+ break;
+
+ case MinKey:
+ s << "{ \"$minKey\" : 1 }";
+ break;
+
+ case MaxKey:
+ s << "{ \"$maxKey\" : 1 }";
+ break;
+
+ default:
+ StringBuilder ss;
+ ss << "Cannot create a properly formatted JSON string with "
+ << "element: " << toString() << " of type: " << type();
+ string message = ss.str();
+ massert( 10312 , message.c_str(), false );
+ }
+ return s.str();
+ }
+
+ int BSONElement::getGtLtOp( int def ) const {
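+        // decode a $-operator field name ($gt, $gte, $lt, $lte, $ne, $near, $mod,
+        // $type, $in, $nin, $all, $size, $exists, $elemMatch, $regex, $options,
+        // $within, $maxDistance) by matching it character by character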
+ const char *fn = fieldName();
+ if ( fn[0] == '$' && fn[1] ) {
+ if ( fn[2] == 't' ) {
+ if ( fn[1] == 'g' ) {
+ if ( fn[3] == 0 ) return BSONObj::GT;
+ else if ( fn[3] == 'e' && fn[4] == 0 ) return BSONObj::GTE;
+ }
+ else if ( fn[1] == 'l' ) {
+ if ( fn[3] == 0 ) return BSONObj::LT;
+ else if ( fn[3] == 'e' && fn[4] == 0 ) return BSONObj::LTE;
+ }
+ }
+ else if ( fn[1] == 'n' && fn[2] == 'e' ) {
+ if ( fn[3] == 0 )
+ return BSONObj::NE;
+ if ( fn[3] == 'a' && fn[4] == 'r') // matches anything with $near prefix
+ return BSONObj::opNEAR;
+ }
+ else if ( fn[1] == 'm' ) {
+ if ( fn[2] == 'o' && fn[3] == 'd' && fn[4] == 0 )
+ return BSONObj::opMOD;
+ if ( fn[2] == 'a' && fn[3] == 'x' && fn[4] == 'D' && fn[5] == 'i' && fn[6] == 's' && fn[7] == 't' && fn[8] == 'a' && fn[9] == 'n' && fn[10] == 'c' && fn[11] == 'e' && fn[12] == 0 )
+ return BSONObj::opMAX_DISTANCE;
+ }
+ else if ( fn[1] == 't' && fn[2] == 'y' && fn[3] == 'p' && fn[4] == 'e' && fn[5] == 0 )
+ return BSONObj::opTYPE;
+ else if ( fn[1] == 'i' && fn[2] == 'n' && fn[3] == 0 )
+ return BSONObj::opIN;
+ else if ( fn[1] == 'n' && fn[2] == 'i' && fn[3] == 'n' && fn[4] == 0 )
+ return BSONObj::NIN;
+ else if ( fn[1] == 'a' && fn[2] == 'l' && fn[3] == 'l' && fn[4] == 0 )
+ return BSONObj::opALL;
+ else if ( fn[1] == 's' && fn[2] == 'i' && fn[3] == 'z' && fn[4] == 'e' && fn[5] == 0 )
+ return BSONObj::opSIZE;
+ else if ( fn[1] == 'e' ) {
+ if ( fn[2] == 'x' && fn[3] == 'i' && fn[4] == 's' && fn[5] == 't' && fn[6] == 's' && fn[7] == 0 )
+ return BSONObj::opEXISTS;
+ if ( fn[2] == 'l' && fn[3] == 'e' && fn[4] == 'm' && fn[5] == 'M' && fn[6] == 'a' && fn[7] == 't' && fn[8] == 'c' && fn[9] == 'h' && fn[10] == 0 )
+ return BSONObj::opELEM_MATCH;
+ }
+ else if ( fn[1] == 'r' && fn[2] == 'e' && fn[3] == 'g' && fn[4] == 'e' && fn[5] == 'x' && fn[6] == 0 )
+ return BSONObj::opREGEX;
+ else if ( fn[1] == 'o' && fn[2] == 'p' && fn[3] == 't' && fn[4] == 'i' && fn[5] == 'o' && fn[6] == 'n' && fn[7] == 's' && fn[8] == 0 )
+ return BSONObj::opOPTIONS;
+ else if ( fn[1] == 'w' && fn[2] == 'i' && fn[3] == 't' && fn[4] == 'h' && fn[5] == 'i' && fn[6] == 'n' && fn[7] == 0 )
+ return BSONObj::opWITHIN;
+ }
+ return def;
+ }
+
+ /* Matcher --------------------------------------*/
+
+// If the element is something like:
+// a : { $gt : 3 }
+// we append
+// a : 3
+// else we just append the element.
+//
+ void appendElementHandlingGtLt(BSONObjBuilder& b, const BSONElement& e) {
+ if ( e.type() == Object ) {
+ BSONElement fe = e.embeddedObject().firstElement();
+ const char *fn = fe.fieldName();
+ if ( fn[0] == '$' && fn[1] && fn[2] == 't' ) {
+ b.appendAs(fe, e.fieldName());
+ return;
+ }
+ }
+ b.append(e);
+ }
+
+ int getGtLtOp(const BSONElement& e) {
+ if ( e.type() != Object )
+ return BSONObj::Equality;
+
+ BSONElement fe = e.embeddedObject().firstElement();
+ return fe.getGtLtOp();
+ }
+
+ FieldCompareResult compareDottedFieldNames( const string& l , const string& r ) {
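+        // compare the two dotted names one '.'-separated component at a time with
+        // lexNumCmp; e.g. "a.b" vs "a" matches on "a" and then returns LEFT_SUBFIELD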
+ static int maxLoops = 1024 * 1024;
+
+ size_t lstart = 0;
+ size_t rstart = 0;
+
+ for ( int i=0; i<maxLoops; i++ ) {
+
+ size_t a = l.find( '.' , lstart );
+ size_t b = r.find( '.' , rstart );
+
+ size_t lend = a == string::npos ? l.size() : a;
+ size_t rend = b == string::npos ? r.size() : b;
+
+ const string& c = l.substr( lstart , lend - lstart );
+ const string& d = r.substr( rstart , rend - rstart );
+
+ int x = lexNumCmp( c.c_str(), d.c_str() );
+
+ if ( x < 0 )
+ return LEFT_BEFORE;
+ if ( x > 0 )
+ return RIGHT_BEFORE;
+
+ lstart = lend + 1;
+ rstart = rend + 1;
+
+ if ( lstart >= l.size() ) {
+ if ( rstart >= r.size() )
+ return SAME;
+ return RIGHT_SUBFIELD;
+ }
+ if ( rstart >= r.size() )
+ return LEFT_SUBFIELD;
+ }
+
+ log() << "compareDottedFieldNames ERROR l: " << l << " r: " << r << " TOO MANY LOOPS" << endl;
+ assert(0);
+ return SAME; // will never get here
+ }
+
+ /* BSONObj ------------------------------------------------------------*/
+
+ string BSONObj::md5() const {
+ md5digest d;
+ md5_state_t st;
+ md5_init(&st);
+ md5_append( &st , (const md5_byte_t*)_objdata , objsize() );
+ md5_finish(&st, d);
+ return digestToString( d );
+ }
+
+ string BSONObj::jsonString( JsonStringFormat format, int pretty ) const {
+
+ if ( isEmpty() ) return "{}";
+
+ StringBuilder s;
+ s << "{ ";
+ BSONObjIterator i(*this);
+ BSONElement e = i.next();
+ if ( !e.eoo() )
+ while ( 1 ) {
+ s << e.jsonString( format, true, pretty?pretty+1:0 );
+ e = i.next();
+ if ( e.eoo() )
+ break;
+ s << ",";
+ if ( pretty ) {
+ s << '\n';
+ for( int x = 0; x < pretty; x++ )
+ s << " ";
+ }
+ else {
+ s << " ";
+ }
+ }
+ s << " }";
+ return s.str();
+ }
+
+ bool BSONObj::valid() const {
+ try {
+ BSONObjIterator it(*this);
+ while( it.moreWithEOO() ) {
+ // both throw exception on failure
+ BSONElement e = it.next(true);
+ e.validate();
+
+ if (e.eoo()) {
+ if (it.moreWithEOO())
+ return false;
+ return true;
+ }
+ else if (e.isABSONObj()) {
+ if(!e.embeddedObject().valid())
+ return false;
+ }
+ else if (e.type() == CodeWScope) {
+ if(!e.codeWScopeObject().valid())
+ return false;
+ }
+ }
+ }
+ catch (...) {
+ }
+ return false;
+ }
+
+ int BSONObj::woCompare(const BSONObj& r, const Ordering &o, bool considerFieldName) const {
+ if ( isEmpty() )
+ return r.isEmpty() ? 0 : -1;
+ if ( r.isEmpty() )
+ return 1;
+
+ BSONObjIterator i(*this);
+ BSONObjIterator j(r);
+ unsigned mask = 1;
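+            // mask selects this key's direction bit in the Ordering; o.descending(mask)
+            // flips the comparison for descending keys. shifted left once per field.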
+ while ( 1 ) {
+ // so far, equal...
+
+ BSONElement l = i.next();
+ BSONElement r = j.next();
+ if ( l.eoo() )
+ return r.eoo() ? 0 : -1;
+ if ( r.eoo() )
+ return 1;
+
+ int x;
+ {
+ x = l.woCompare( r, considerFieldName );
+ if( o.descending(mask) )
+ x = -x;
+ }
+ if ( x != 0 )
+ return x;
+ mask <<= 1;
+ }
+ return -1;
+ }
+
+ /* well ordered compare */
+ int BSONObj::woCompare(const BSONObj &r, const BSONObj &idxKey,
+ bool considerFieldName) const {
+ if ( isEmpty() )
+ return r.isEmpty() ? 0 : -1;
+ if ( r.isEmpty() )
+ return 1;
+
+ bool ordered = !idxKey.isEmpty();
+
+ BSONObjIterator i(*this);
+ BSONObjIterator j(r);
+ BSONObjIterator k(idxKey);
+ while ( 1 ) {
+ // so far, equal...
+
+ BSONElement l = i.next();
+ BSONElement r = j.next();
+ BSONElement o;
+ if ( ordered )
+ o = k.next();
+ if ( l.eoo() )
+ return r.eoo() ? 0 : -1;
+ if ( r.eoo() )
+ return 1;
+
+ int x;
+ /*
+ if( ordered && o.type() == String && strcmp(o.valuestr(), "ascii-proto") == 0 &&
+ l.type() == String && r.type() == String ) {
+ // note: no negative support yet, as this is just sort of a POC
+ x = _stricmp(l.valuestr(), r.valuestr());
+ }
+ else*/ {
+ x = l.woCompare( r, considerFieldName );
+ if ( ordered && o.number() < 0 )
+ x = -x;
+ }
+ if ( x != 0 )
+ return x;
+ }
+ return -1;
+ }
+
+ BSONObj staticNull = fromjson( "{'':null}" );
+ BSONObj makeUndefined() {
+ BSONObjBuilder b;
+ b.appendUndefined( "" );
+ return b.obj();
+ }
+ BSONObj staticUndefined = makeUndefined();
+
+ /* well ordered compare */
+ int BSONObj::woSortOrder(const BSONObj& other, const BSONObj& sortKey , bool useDotted ) const {
+ if ( isEmpty() )
+ return other.isEmpty() ? 0 : -1;
+ if ( other.isEmpty() )
+ return 1;
+
+ uassert( 10060 , "woSortOrder needs a non-empty sortKey" , ! sortKey.isEmpty() );
+
+ BSONObjIterator i(sortKey);
+ while ( 1 ) {
+ BSONElement f = i.next();
+ if ( f.eoo() )
+ return 0;
+
+ BSONElement l = useDotted ? getFieldDotted( f.fieldName() ) : getField( f.fieldName() );
+ if ( l.eoo() )
+ l = staticNull.firstElement();
+ BSONElement r = useDotted ? other.getFieldDotted( f.fieldName() ) : other.getField( f.fieldName() );
+ if ( r.eoo() )
+ r = staticNull.firstElement();
+
+ int x = l.woCompare( r, false );
+ if ( f.number() < 0 )
+ x = -x;
+ if ( x != 0 )
+ return x;
+ }
+ return -1;
+ }
+
+ template <typename BSONElementColl>
+ void _getFieldsDotted( const BSONObj* obj, const StringData& name, BSONElementColl &ret, bool expandLastArray ) {
+ BSONElement e = obj->getField( name );
+
+ if ( e.eoo() ) {
+ const char *p = strchr(name.data(), '.');
+ if ( p ) {
+ string left(name.data(), p-name.data());
+ const char* next = p+1;
+ BSONElement e = obj->getField( left.c_str() );
+
+ if (e.type() == Object) {
+ e.embeddedObject().getFieldsDotted(next, ret, expandLastArray );
+ }
+ else if (e.type() == Array) {
+ bool allDigits = false;
+ if ( isdigit( *next ) ) {
+ const char * temp = next + 1;
+ while ( isdigit( *temp ) )
+ temp++;
+ allDigits = (*temp == '.' || *temp == '\0');
+ }
+ if (allDigits) {
+ e.embeddedObject().getFieldsDotted(next, ret, expandLastArray );
+ }
+ else {
+ BSONObjIterator i(e.embeddedObject());
+ while ( i.more() ) {
+ BSONElement e2 = i.next();
+ if (e2.type() == Object || e2.type() == Array)
+ e2.embeddedObject().getFieldsDotted(next, ret, expandLastArray );
+ }
+ }
+ }
+ else {
+ // do nothing: no match
+ }
+ }
+ }
+ else {
+ if (e.type() == Array && expandLastArray) {
+ BSONObjIterator i(e.embeddedObject());
+ while ( i.more() )
+ ret.insert(i.next());
+ }
+ else {
+ ret.insert(e);
+ }
+ }
+ }
+
+ void BSONObj::getFieldsDotted(const StringData& name, BSONElementSet &ret, bool expandLastArray ) const {
+ _getFieldsDotted( this, name, ret, expandLastArray );
+ }
+ void BSONObj::getFieldsDotted(const StringData& name, BSONElementMSet &ret, bool expandLastArray ) const {
+ _getFieldsDotted( this, name, ret, expandLastArray );
+ }
+
+ BSONElement BSONObj::getFieldDottedOrArray(const char *&name) const {
+ const char *p = strchr(name, '.');
+
+ BSONElement sub;
+
+ if ( p ) {
+ sub = getField( string(name, p-name) );
+ name = p + 1;
+ }
+ else {
+ sub = getField( name );
+ name = name + strlen(name);
+ }
+
+ if ( sub.eoo() )
+ return eooElement;
+ else if ( sub.type() == Array || name[0] == '\0' )
+ return sub;
+ else if ( sub.type() == Object )
+ return sub.embeddedObject().getFieldDottedOrArray( name );
+ else
+ return eooElement;
+ }
+
+    /**
+     sets element field names to the empty string.
+     if a field in pattern is missing, it is omitted from the
+     returned object.
+     */
+ BSONObj BSONObj::extractFieldsUnDotted(BSONObj pattern) const {
+ BSONObjBuilder b;
+ BSONObjIterator i(pattern);
+ while ( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ BSONElement x = getField(e.fieldName());
+ if ( !x.eoo() )
+ b.appendAs(x, "");
+ }
+ return b.obj();
+ }
+
+ BSONObj BSONObj::extractFields(const BSONObj& pattern , bool fillWithNull ) const {
+ BSONObjBuilder b(32); // scanandorder.h can make a zillion of these, so we start the allocation very small
+ BSONObjIterator i(pattern);
+ while ( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ BSONElement x = getFieldDotted(e.fieldName());
+ if ( ! x.eoo() )
+ b.appendAs( x, e.fieldName() );
+ else if ( fillWithNull )
+ b.appendNull( e.fieldName() );
+ }
+ return b.obj();
+ }
+
+ BSONObj BSONObj::filterFieldsUndotted( const BSONObj &filter, bool inFilter ) const {
+ BSONObjBuilder b;
+ BSONObjIterator i( *this );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ BSONElement x = filter.getField( e.fieldName() );
+ if ( ( x.eoo() && !inFilter ) ||
+ ( !x.eoo() && inFilter ) )
+ b.append( e );
+ }
+ return b.obj();
+ }
+
+ BSONElement BSONObj::getFieldUsingIndexNames(const char *fieldName, const BSONObj &indexKey) const {
+ BSONObjIterator i( indexKey );
+ int j = 0;
+ while( i.moreWithEOO() ) {
+ BSONElement f = i.next();
+ if ( f.eoo() )
+ return BSONElement();
+ if ( strcmp( f.fieldName(), fieldName ) == 0 )
+ break;
+ ++j;
+ }
+ BSONObjIterator k( *this );
+ while( k.moreWithEOO() ) {
+ BSONElement g = k.next();
+ if ( g.eoo() )
+ return BSONElement();
+ if ( j == 0 ) {
+ return g;
+ }
+ --j;
+ }
+ return BSONElement();
+ }
+
+ /* grab names of all the fields in this object */
+ int BSONObj::getFieldNames(set<string>& fields) const {
+ int n = 0;
+ BSONObjIterator i(*this);
+ while ( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ fields.insert(e.fieldName());
+ n++;
+ }
+ return n;
+ }
+
+    /* note: addFields always adds _id even if not specified.
+       returns the number of fields added, not counting _id unless it was requested.
+    */
+ int BSONObj::addFields(BSONObj& from, set<string>& fields) {
+ assert( isEmpty() && !isOwned() ); /* partial implementation for now... */
+
+ BSONObjBuilder b;
+
+ int N = fields.size();
+ int n = 0;
+ BSONObjIterator i(from);
+ bool gotId = false;
+ while ( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ const char *fname = e.fieldName();
+ if ( fields.count(fname) ) {
+ b.append(e);
+ ++n;
+ gotId = gotId || strcmp(fname, "_id")==0;
+ if ( n == N && gotId )
+ break;
+ }
+ else if ( strcmp(fname, "_id")==0 ) {
+ b.append(e);
+ gotId = true;
+ if ( n == N && gotId )
+ break;
+ }
+ }
+
+ if ( n ) {
+ *this = b.obj();
+ }
+
+ return n;
+ }
+
+ bool BSONObj::couldBeArray() const {
+ BSONObjIterator i( *this );
+ int index = 0;
+ while( i.moreWithEOO() ){
+ BSONElement e = i.next();
+ if( e.eoo() ) break;
+
+ // TODO: If actually important, may be able to do int->char* much faster
+ if( strcmp( e.fieldName(), ((string)( mongoutils::str::stream() << index )).c_str() ) != 0 )
+ return false;
+ index++;
+ }
+ return true;
+ }
+
+ BSONObj BSONObj::clientReadable() const {
+ BSONObjBuilder b;
+ BSONObjIterator i( *this );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ switch( e.type() ) {
+ case MinKey: {
+ BSONObjBuilder m;
+ m.append( "$minElement", 1 );
+ b.append( e.fieldName(), m.done() );
+ break;
+ }
+ case MaxKey: {
+ BSONObjBuilder m;
+ m.append( "$maxElement", 1 );
+ b.append( e.fieldName(), m.done() );
+ break;
+ }
+ default:
+ b.append( e );
+ }
+ }
+ return b.obj();
+ }
+
+ BSONObj BSONObj::replaceFieldNames( const BSONObj &names ) const {
+ BSONObjBuilder b;
+ BSONObjIterator i( *this );
+ BSONObjIterator j( names );
+ BSONElement f = j.moreWithEOO() ? j.next() : BSONObj().firstElement();
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ if ( !f.eoo() ) {
+ b.appendAs( e, f.fieldName() );
+ f = j.next();
+ }
+ else {
+ b.append( e );
+ }
+ }
+ return b.obj();
+ }
+
+ bool BSONObj::okForStorage() const {
+ BSONObjIterator i( *this );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ const char * name = e.fieldName();
+
+ if ( strchr( name , '.' ) ||
+ strchr( name , '$' ) ) {
+ return
+ strcmp( name , "$ref" ) == 0 ||
+ strcmp( name , "$id" ) == 0
+ ;
+ }
+
+ if ( e.mayEncapsulate() ) {
+ switch ( e.type() ) {
+ case Object:
+ case Array:
+ if ( ! e.embeddedObject().okForStorage() )
+ return false;
+ break;
+ case CodeWScope:
+ if ( ! e.codeWScopeObject().okForStorage() )
+ return false;
+ break;
+ default:
+ uassert( 12579, "unhandled cases in BSONObj okForStorage" , 0 );
+ }
+
+ }
+ }
+ return true;
+ }
+
+ void BSONObj::dump() const {
+ out() << hex;
+ const char *p = objdata();
+ for ( int i = 0; i < objsize(); i++ ) {
+ out() << i << '\t' << ( 0xff & ( (unsigned) *p ) );
+ if ( *p >= 'A' && *p <= 'z' )
+ out() << '\t' << *p;
+ out() << endl;
+ p++;
+ }
+ }
+
+ void nested2dotted(BSONObjBuilder& b, const BSONObj& obj, const string& base) {
+ BSONObjIterator it(obj);
+ while (it.more()) {
+ BSONElement e = it.next();
+ if (e.type() == Object) {
+ string newbase = base + e.fieldName() + ".";
+ nested2dotted(b, e.embeddedObject(), newbase);
+ }
+ else {
+ string newbase = base + e.fieldName();
+ b.appendAs(e, newbase);
+ }
+ }
+ }
+
+ void dotted2nested(BSONObjBuilder& b, const BSONObj& obj) {
+ //use map to sort fields
+ BSONMap sorted = bson2map(obj);
+ EmbeddedBuilder eb(&b);
+ for(BSONMap::const_iterator it=sorted.begin(); it!=sorted.end(); ++it) {
+ eb.appendAs(it->second, it->first);
+ }
+ eb.done();
+ }
+
+ /*-- test things ----------------------------------------------------*/
+
+#pragma pack(1)
+ struct MaxKeyData {
+ MaxKeyData() {
+ totsize=7;
+ maxkey=MaxKey;
+ name=0;
+ eoo=EOO;
+ }
+ int totsize;
+ char maxkey;
+ char name;
+ char eoo;
+ } maxkeydata;
+ BSONObj maxKey((const char *) &maxkeydata);
+
+ struct MinKeyData {
+ MinKeyData() {
+ totsize=7;
+ minkey=MinKey;
+ name=0;
+ eoo=EOO;
+ }
+ int totsize;
+ char minkey;
+ char name;
+ char eoo;
+ } minkeydata;
+ BSONObj minKey((const char *) &minkeydata);
+
+ /*
+ struct JSObj0 {
+ JSObj0() {
+ totsize = 5;
+ eoo = EOO;
+ }
+ int totsize;
+ char eoo;
+ } js0;
+ */
+#pragma pack()
+
+ struct BsonUnitTest : public UnitTest {
+ void testRegex() {
+
+ BSONObjBuilder b;
+ b.appendRegex("x", "foo");
+ BSONObj o = b.done();
+
+ BSONObjBuilder c;
+ c.appendRegex("x", "goo");
+ BSONObj p = c.done();
+
+ assert( !o.binaryEqual( p ) );
+ assert( o.woCompare( p ) < 0 );
+
+ }
+ void testoid() {
+ OID id;
+ id.init();
+ // sleepsecs(3);
+
+ OID b;
+ // goes with sleep above...
+ // b.init();
+ // assert( memcmp(id.getData(), b.getData(), 12) < 0 );
+
+ b.init( id.str() );
+ assert( b == id );
+ }
+
+ void testbounds() {
+ BSONObj l , r;
+ {
+ BSONObjBuilder b;
+ b.append( "x" , numeric_limits<long long>::max() );
+ l = b.obj();
+ }
+ {
+ BSONObjBuilder b;
+ b.append( "x" , numeric_limits<double>::max() );
+ r = b.obj();
+ }
+ assert( l.woCompare( r ) < 0 );
+ assert( r.woCompare( l ) > 0 );
+ {
+ BSONObjBuilder b;
+ b.append( "x" , numeric_limits<int>::max() );
+ l = b.obj();
+ }
+ assert( l.woCompare( r ) < 0 );
+ assert( r.woCompare( l ) > 0 );
+ }
+
+ void testorder() {
+ {
+ BSONObj x,y,z;
+ { BSONObjBuilder b; b.append( "x" , (long long)2 ); x = b.obj(); }
+ { BSONObjBuilder b; b.append( "x" , (int)3 ); y = b.obj(); }
+ { BSONObjBuilder b; b.append( "x" , (long long)4 ); z = b.obj(); }
+ assert( x.woCompare( y ) < 0 );
+ assert( x.woCompare( z ) < 0 );
+ assert( y.woCompare( x ) > 0 );
+ assert( z.woCompare( x ) > 0 );
+ assert( y.woCompare( z ) < 0 );
+ assert( z.woCompare( y ) > 0 );
+ }
+
+ {
+ BSONObj ll,d,i,n,u;
+ { BSONObjBuilder b; b.append( "x" , (long long)2 ); ll = b.obj(); }
+ { BSONObjBuilder b; b.append( "x" , (double)2 ); d = b.obj(); }
+ { BSONObjBuilder b; b.append( "x" , (int)2 ); i = b.obj(); }
+ { BSONObjBuilder b; b.appendNull( "x" ); n = b.obj(); }
+ { BSONObjBuilder b; u = b.obj(); }
+
+ assert( ll.woCompare( u ) == d.woCompare( u ) );
+ assert( ll.woCompare( u ) == i.woCompare( u ) );
+ BSONObj k = BSON( "x" << 1 );
+ assert( ll.woCompare( u , k ) == d.woCompare( u , k ) );
+ assert( ll.woCompare( u , k ) == i.woCompare( u , k ) );
+
+ assert( u.woCompare( ll ) == u.woCompare( d ) );
+ assert( u.woCompare( ll ) == u.woCompare( i ) );
+ assert( u.woCompare( ll , k ) == u.woCompare( d , k ) );
+                assert( u.woCompare( ll , k ) == u.woCompare( i , k ) );
+
+ assert( i.woCompare( n ) == d.woCompare( n ) );
+
+ assert( ll.woCompare( n ) == d.woCompare( n ) );
+ assert( ll.woCompare( n ) == i.woCompare( n ) );
+ assert( ll.woCompare( n , k ) == d.woCompare( n , k ) );
+ assert( ll.woCompare( n , k ) == i.woCompare( n , k ) );
+
+ assert( n.woCompare( ll ) == n.woCompare( d ) );
+ assert( n.woCompare( ll ) == n.woCompare( i ) );
+ assert( n.woCompare( ll , k ) == n.woCompare( d , k ) );
+                assert( n.woCompare( ll , k ) == n.woCompare( i , k ) );
+ }
+
+ {
+ BSONObj l,r;
+ { BSONObjBuilder b; b.append( "x" , "eliot" ); l = b.obj(); }
+ { BSONObjBuilder b; b.appendSymbol( "x" , "eliot" ); r = b.obj(); }
+ assert( l.woCompare( r ) == 0 );
+ assert( r.woCompare( l ) == 0 );
+ }
+ }
+
+ void run() {
+ testRegex();
+ BSONObjBuilder A,B,C;
+ A.append("x", 2);
+ B.append("x", 2.0);
+ C.append("x", 2.1);
+ BSONObj a = A.done();
+ BSONObj b = B.done();
+ BSONObj c = C.done();
+            assert( !a.binaryEqual( b ) ); // int 2 vs double 2.0: not binary equal (see comments on operator==)
+ int cmp = a.woCompare(b);
+ assert( cmp == 0 );
+ cmp = a.woCompare(c);
+ assert( cmp < 0 );
+ testoid();
+ testbounds();
+ testorder();
+ }
+ } bson_unittest;
+
+ Labeler::Label GT( "$gt" );
+ Labeler::Label GTE( "$gte" );
+ Labeler::Label LT( "$lt" );
+ Labeler::Label LTE( "$lte" );
+ Labeler::Label NE( "$ne" );
+ Labeler::Label SIZE( "$size" );
+
+ void BSONObjBuilder::appendMinForType( const StringData& fieldName , int t ) {
+ switch ( t ) {
+
+ // Shared canonical types
+ case NumberInt:
+ case NumberDouble:
+ case NumberLong:
+ append( fieldName , - numeric_limits<double>::max() ); return;
+ case Symbol:
+ case String:
+ append( fieldName , "" ); return;
+ case Date:
+ // min varies with V0 and V1 indexes, so we go one type lower.
+ appendBool(fieldName, true);
+ //appendDate( fieldName , numeric_limits<long long>::min() );
+ return;
+ case Timestamp: // TODO integrate with Date SERVER-3304
+ appendTimestamp( fieldName , 0 ); return;
+ case Undefined: // shared with EOO
+ appendUndefined( fieldName ); return;
+
+ // Separate canonical types
+ case MinKey:
+ appendMinKey( fieldName ); return;
+ case MaxKey:
+ appendMaxKey( fieldName ); return;
+ case jstOID: {
+ OID o;
+ memset(&o, 0, sizeof(o));
+ appendOID( fieldName , &o);
+ return;
+ }
+ case Bool:
+ appendBool( fieldName , false); return;
+ case jstNULL:
+ appendNull( fieldName ); return;
+ case Object:
+ append( fieldName , BSONObj() ); return;
+ case Array:
+ appendArray( fieldName , BSONObj() ); return;
+ case BinData:
+ appendBinData( fieldName , 0 , BinDataGeneral , (const char *) 0 ); return;
+ case RegEx:
+ appendRegex( fieldName , "" ); return;
+ case DBRef: {
+ OID o;
+ memset(&o, 0, sizeof(o));
+ appendDBRef( fieldName , "" , o );
+ return;
+ }
+ case Code:
+ appendCode( fieldName , "" ); return;
+ case CodeWScope:
+ appendCodeWScope( fieldName , "" , BSONObj() ); return;
+        }
+ log() << "type not supported for appendMinElementForType: " << t << endl;
+ uassert( 10061 , "type not supported for appendMinElementForType" , false );
+ }
+
+ void BSONObjBuilder::appendMaxForType( const StringData& fieldName , int t ) {
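+        // for the separate canonical types below, the max for a type is represented
+        // as the min of the next type in canonical sort order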
+ switch ( t ) {
+
+ // Shared canonical types
+ case NumberInt:
+ case NumberDouble:
+ case NumberLong:
+ append( fieldName , numeric_limits<double>::max() ); return;
+ case Symbol:
+ case String:
+ appendMinForType( fieldName, Object ); return;
+ case Date:
+ appendDate( fieldName , numeric_limits<long long>::max() ); return;
+ case Timestamp: // TODO integrate with Date SERVER-3304
+ appendTimestamp( fieldName , numeric_limits<unsigned long long>::max() ); return;
+ case Undefined: // shared with EOO
+ appendUndefined( fieldName ); return;
+
+ // Separate canonical types
+ case MinKey:
+ appendMinKey( fieldName ); return;
+ case MaxKey:
+ appendMaxKey( fieldName ); return;
+ case jstOID: {
+ OID o;
+ memset(&o, 0xFF, sizeof(o));
+ appendOID( fieldName , &o);
+ return;
+ }
+ case Bool:
+ appendBool( fieldName , true ); return;
+ case jstNULL:
+ appendNull( fieldName ); return;
+ case Object:
+ appendMinForType( fieldName, Array ); return;
+ case Array:
+ appendMinForType( fieldName, BinData ); return;
+ case BinData:
+ appendMinForType( fieldName, jstOID ); return;
+ case RegEx:
+ appendMinForType( fieldName, DBRef ); return;
+ case DBRef:
+ appendMinForType( fieldName, Code ); return;
+ case Code:
+ appendMinForType( fieldName, CodeWScope ); return;
+ case CodeWScope:
+ // This upper bound may change if a new bson type is added.
+ appendMinForType( fieldName , MaxKey ); return;
+ }
+ log() << "type not supported for appendMaxElementForType: " << t << endl;
+ uassert( 14853 , "type not supported for appendMaxElementForType" , false );
+ }
+
+ int BSONElementFieldSorter( const void * a , const void * b ) {
+ const char * x = *((const char**)a);
+ const char * y = *((const char**)b);
+        x++; y++; // skip the leading type byte; x and y now point at the field names
+ return lexNumCmp( x , y );
+ }
+
+ bool fieldsMatch(const BSONObj& lhs, const BSONObj& rhs) {
+ BSONObjIterator l(lhs);
+ BSONObjIterator r(rhs);
+
+ while (l.more() && r.more()){
+ if (strcmp(l.next().fieldName(), r.next().fieldName())) {
+ return false;
+ }
+ }
+
+ return !(l.more() || r.more()); // false if lhs and rhs have diff nFields()
+ }
+
+ BSONObjIteratorSorted::BSONObjIteratorSorted( const BSONObj& o ) {
+ _nfields = o.nFields();
+ _fields = new const char*[_nfields];
+ int x = 0;
+ BSONObjIterator i( o );
+ while ( i.more() ) {
+ _fields[x++] = i.next().rawdata();
+ assert( _fields[x-1] );
+ }
+ assert( x == _nfields );
+ qsort( _fields , _nfields , sizeof(char*) , BSONElementFieldSorter );
+ _cur = 0;
+ }
+
+ bool BSONObjBuilder::appendAsNumber( const StringData& fieldName , const string& data ) {
+ if ( data.size() == 0 || data == "-" || data == ".")
+ return false;
+
+ unsigned int pos=0;
+ if ( data[0] == '-' )
+ pos++;
+
+ bool hasDec = false;
+
+ for ( ; pos<data.size(); pos++ ) {
+ if ( isdigit(data[pos]) )
+ continue;
+
+ if ( data[pos] == '.' ) {
+ if ( hasDec )
+ return false;
+ hasDec = true;
+ continue;
+ }
+
+ return false;
+ }
+
+ if ( hasDec ) {
+ double d = atof( data.c_str() );
+ append( fieldName , d );
+ return true;
+ }
+
+ if ( data.size() < 8 ) {
+ append( fieldName , atoi( data.c_str() ) );
+ return true;
+ }
+
+ try {
+ long long num = boost::lexical_cast<long long>( data );
+ append( fieldName , num );
+ return true;
+ }
+ catch(bad_lexical_cast &) {
+ return false;
+ }
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/jsobj.h b/src/mongo/db/jsobj.h
new file mode 100644
index 00000000000..ae039529fbf
--- /dev/null
+++ b/src/mongo/db/jsobj.h
@@ -0,0 +1,47 @@
+/** @file jsobj.h
+ BSON classes
+*/
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ BSONObj and its helpers
+
+ "BSON" stands for "binary JSON" -- ie a binary way to represent objects that would be
+ represented in JSON (plus a few extensions useful for databases & other languages).
+
+ http://www.bsonspec.org/
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "../bson/util/builder.h"
+#include "../util/optime.h"
+//#include "boost/utility.hpp"
+//#include <set>
+#include "../bson/bsontypes.h"
+#include "../bson/oid.h"
+#include "../bson/bsonelement.h"
+#include "../bson/bsonobj.h"
+#include "../bson/bsonmisc.h"
+#include "../bson/bsonobjbuilder.h"
+#include "../bson/bsonobjiterator.h"
+#include "../bson/bson-inl.h"
+#include "../bson/ordering.h"
+#include "../bson/stringdata.h"
+#include "../bson/bson_db.h"
+
diff --git a/src/mongo/db/jsobjmanipulator.h b/src/mongo/db/jsobjmanipulator.h
new file mode 100644
index 00000000000..860e575940e
--- /dev/null
+++ b/src/mongo/db/jsobjmanipulator.h
@@ -0,0 +1,94 @@
+/** jsobjManipulator.h */
+
+/**
+ * Copyright (C) 2009 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+//#include "dur.h"
+
+namespace mongo {
+
+ /** Manipulate the binary representation of a BSONElement in-place.
+ Careful, this casts away const.
+ */
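+    /* Usage sketch (illustrative only):
+           BSONObj o = BSON( "x" << 1.0 );
+           BSONElementManipulator( o["x"] ).setNumber( 2.0 );
+           // o's buffer now reads { x: 2.0 } -- the value was patched in place
+    */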
+ class BSONElementManipulator {
+ public:
+ BSONElementManipulator( const BSONElement &element ) :
+ _element( element ) {
+ assert( !_element.eoo() );
+ }
+ /** Replace a Timestamp type with a Date type initialized to
+ OpTime::now().asDate()
+ */
+ void initTimestamp();
+
+        // Note: the variants whose names start with a capital letter journal the write via getDur().writing
+
+ /** Change the value, in place, of the number. */
+ void setNumber(double d) {
+ if ( _element.type() == NumberDouble ) *reinterpret_cast< double * >( value() ) = d;
+ else if ( _element.type() == NumberInt ) *reinterpret_cast< int * >( value() ) = (int) d;
+ else assert(0);
+ }
+ void SetNumber(double d);
+ void setLong(long long n) {
+ assert( _element.type() == NumberLong );
+ *reinterpret_cast< long long * >( value() ) = n;
+ }
+ void SetLong(long long n);
+ void setInt(int n) {
+ assert( _element.type() == NumberInt );
+ *reinterpret_cast< int * >( value() ) = n;
+ }
+ void SetInt(int n);
+
+ /** Replace the type and value of the element with the type and value of e,
+ preserving the original fieldName */
+ void replaceTypeAndValue( const BSONElement &e ) {
+ *data() = e.type();
+ memcpy( value(), e.value(), e.valuesize() );
+ }
+
+ /* dur:: version */
+ void ReplaceTypeAndValue( const BSONElement &e );
+
+ static void lookForTimestamps( const BSONObj& obj ) {
+            // If there is a Timestamp field as the first or second element,
+ // update it to a Date field set to OpTime::now().asDate(). The
+ // replacement policy is a work in progress.
+
+ BSONObjIterator i( obj );
+ for( int j = 0; i.moreWithEOO() && j < 2; ++j ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ if ( e.type() == Timestamp ) {
+ BSONElementManipulator( e ).initTimestamp();
+ break;
+ }
+ }
+ }
+ private:
+ char *data() { return nonConst( _element.rawdata() ); }
+ char *value() { return nonConst( _element.value() ); }
+ static char *nonConst( const char *s ) { return const_cast< char * >( s ); }
+
+ const BSONElement _element;
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/json.cpp b/src/mongo/db/json.cpp
new file mode 100644
index 00000000000..73457a2bfbb
--- /dev/null
+++ b/src/mongo/db/json.cpp
@@ -0,0 +1,651 @@
+// json.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#define BOOST_SPIRIT_THREADSAFE
+#if BOOST_VERSION >= 103800
+#define BOOST_SPIRIT_USE_OLD_NAMESPACE
+#include <boost/spirit/include/classic_core.hpp>
+#include <boost/spirit/include/classic_loops.hpp>
+#include <boost/spirit/include/classic_lists.hpp>
+#else
+#include <boost/spirit/core.hpp>
+#include <boost/spirit/utility/loops.hpp>
+#include <boost/spirit/utility/lists.hpp>
+#endif
+#undef assert
+#define assert MONGO_assert
+
+#include "json.h"
+#include "../bson/util/builder.h"
+#include "../util/base64.h"
+#include "../util/hex.h"
+
+
+using namespace boost::spirit;
+
+namespace mongo {
+
+ struct ObjectBuilder : boost::noncopyable {
+ ~ObjectBuilder() {
+ unsigned i = builders.size();
+ if ( i ) {
+ i--;
+ for ( ; i>=1; i-- ) {
+ if ( builders[i] ) {
+ builders[i]->done();
+ }
+ }
+ }
+ }
+ BSONObjBuilder *back() {
+ return builders.back().get();
+ }
+ // Storage for field names of elements within builders.back().
+ const char *fieldName() {
+ return fieldNames.back().c_str();
+ }
+ bool empty() const {
+ return builders.size() == 0;
+ }
+ void init() {
+ boost::shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ builders.push_back( b );
+ fieldNames.push_back( "" );
+ indexes.push_back( 0 );
+ }
+ void pushObject( const char *fieldName ) {
+ boost::shared_ptr< BSONObjBuilder > b( new BSONObjBuilder( builders.back()->subobjStart( fieldName ) ) );
+ builders.push_back( b );
+ fieldNames.push_back( "" );
+ indexes.push_back( 0 );
+ }
+ void pushArray( const char *fieldName ) {
+ boost::shared_ptr< BSONObjBuilder > b( new BSONObjBuilder( builders.back()->subarrayStart( fieldName ) ) );
+ builders.push_back( b );
+ fieldNames.push_back( "" );
+ indexes.push_back( 0 );
+ }
+ BSONObj pop() {
+ BSONObj ret;
+ if ( back()->owned() )
+ ret = back()->obj();
+ else
+ ret = back()->done();
+ builders.pop_back();
+ fieldNames.pop_back();
+ indexes.pop_back();
+ return ret;
+ }
+ void nameFromIndex() {
+ fieldNames.back() = BSONObjBuilder::numStr( indexes.back() );
+ }
+ string popString() {
+ string ret = ss.str();
+ ss.str( "" );
+ return ret;
+ }
+ // Cannot use auto_ptr because its copy constructor takes a non const reference.
+ vector< boost::shared_ptr< BSONObjBuilder > > builders;
+ vector< string > fieldNames;
+ vector< int > indexes;
+ stringstream ss;
+ string ns;
+ OID oid;
+ string binData;
+ BinDataType binDataType;
+ string regex;
+ string regexOptions;
+ Date_t date;
+ OpTime timestamp;
+ };
+
+ struct objectStart {
+ objectStart( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char &c ) const {
+ if ( b.empty() )
+ b.init();
+ else
+ b.pushObject( b.fieldName() );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct arrayStart {
+ arrayStart( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char &c ) const {
+ b.pushArray( b.fieldName() );
+ b.nameFromIndex();
+ }
+ ObjectBuilder &b;
+ };
+
+ struct arrayNext {
+ arrayNext( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char &c ) const {
+ ++b.indexes.back();
+ b.nameFromIndex();
+ }
+ ObjectBuilder &b;
+ };
+
+ struct ch {
+ ch( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char c ) const {
+ b.ss << c;
+ }
+ ObjectBuilder &b;
+ };
+
+ struct chE {
+ chE( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char c ) const {
+ char o = '\0';
+ switch ( c ) {
+ case '\"':
+ o = '\"';
+ break;
+ case '\'':
+ o = '\'';
+ break;
+ case '\\':
+ o = '\\';
+ break;
+ case '/':
+ o = '/';
+ break;
+ case 'b':
+ o = '\b';
+ break;
+ case 'f':
+ o = '\f';
+ break;
+ case 'n':
+ o = '\n';
+ break;
+ case 'r':
+ o = '\r';
+ break;
+ case 't':
+ o = '\t';
+ break;
+ case 'v':
+ o = '\v';
+ break;
+ default:
+ assert( false );
+ }
+ b.ss << o;
+ }
+ ObjectBuilder &b;
+ };
+
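+    // Decodes a JSON \uXXXX escape into UTF-8. For example (illustrative),
+    // \u00e9 yields first=0x00, second=0xE9, emitted as the two-byte UTF-8
+    // sequence 0xC3 0xA9; code points above U+07FF take the three-byte branch.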
+ struct chU {
+ chU( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ unsigned char first = fromHex( start );
+ unsigned char second = fromHex( start + 2 );
+ if ( first == 0 && second < 0x80 )
+ b.ss << second;
+ else if ( first < 0x08 ) {
+ b.ss << char( 0xc0 | ( ( first << 2 ) | ( second >> 6 ) ) );
+ b.ss << char( 0x80 | ( ~0xc0 & second ) );
+ }
+ else {
+ b.ss << char( 0xe0 | ( first >> 4 ) );
+ b.ss << char( 0x80 | ( ~0xc0 & ( ( first << 2 ) | ( second >> 6 ) ) ) );
+ b.ss << char( 0x80 | ( ~0xc0 & second ) );
+ }
+ }
+ ObjectBuilder &b;
+ };
+
+ struct chClear {
+ chClear( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char c ) const {
+ b.popString();
+ }
+ ObjectBuilder &b;
+ };
+
+ struct fieldNameEnd {
+ fieldNameEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ string name = b.popString();
+ massert( 10338 , "Invalid use of reserved field name: " + name,
+ name != "$oid" &&
+ name != "$binary" &&
+ name != "$type" &&
+ name != "$date" &&
+ name != "$timestamp" &&
+ name != "$regex" &&
+ name != "$options" );
+ b.fieldNames.back() = name;
+ }
+ ObjectBuilder &b;
+ };
+
+ struct unquotedFieldNameEnd {
+ unquotedFieldNameEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ string name( start, end );
+ b.fieldNames.back() = name;
+ }
+ ObjectBuilder &b;
+ };
+
+ struct stringEnd {
+ stringEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->append( b.fieldName(), b.popString() );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct numberValue {
+ numberValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ string raw(start);
+ double val;
+
+            // strtod isn't able to deal with NaN and inf in a portable way,
+            // so we perform those conversions explicitly.
+
+ if ( ! raw.compare(0, 3, "NaN" ) ) {
+ val = std::numeric_limits<double>::quiet_NaN();
+ }
+ else if ( ! raw.compare(0, 8, "Infinity" ) ) {
+ val = std::numeric_limits<double>::infinity();
+ }
+ else if ( ! raw.compare(0, 9, "-Infinity" ) ) {
+ val = -std::numeric_limits<double>::infinity();
+ }
+ else {
+ // We re-parse the numeric string here because spirit parsing of strings
+ // to doubles produces different results from strtod in some cases and
+ // we want to use strtod to ensure consistency with other string to
+ // double conversions in our code.
+
+ val = strtod( start, 0 );
+ }
+
+ b.back()->append( b.fieldName(), val );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct intValue {
+ intValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( long long num ) const {
+ if (num >= numeric_limits<int>::min() && num <= numeric_limits<int>::max())
+ b.back()->append( b.fieldName(), (int)num );
+ else
+ b.back()->append( b.fieldName(), num );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct subobjectEnd {
+ subobjectEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.pop();
+ }
+ ObjectBuilder &b;
+ };
+
+ struct arrayEnd {
+ arrayEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.pop();
+ }
+ ObjectBuilder &b;
+ };
+
+ struct trueValue {
+ trueValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendBool( b.fieldName(), true );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct falseValue {
+ falseValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendBool( b.fieldName(), false );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct nullValue {
+ nullValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendNull( b.fieldName() );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct undefinedValue {
+ undefinedValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendUndefined( b.fieldName() );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct dbrefNS {
+ dbrefNS( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.ns = b.popString();
+ }
+ ObjectBuilder &b;
+ };
+
+// NOTE s must be 24 characters.
+ OID stringToOid( const char *s ) {
+ OID oid;
+ char *oidP = (char *)( &oid );
+ for ( int i = 0; i < 12; ++i )
+ oidP[ i ] = fromHex( s + ( i * 2 ) );
+ return oid;
+ }
+
+ struct oidValue {
+ oidValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.oid = stringToOid( start );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct dbrefEnd {
+ dbrefEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendDBRef( b.fieldName(), b.ns, b.oid );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct oidEnd {
+ oidEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendOID( b.fieldName(), &b.oid );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct timestampEnd {
+ timestampEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendTimestamp( b.fieldName(), b.timestamp.asDate() );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct binDataBinary {
+ binDataBinary( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ massert( 10339 , "Badly formatted bindata", ( end - start ) % 4 == 0 );
+ string encoded( start, end );
+ b.binData = base64::decode( encoded );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct binDataType {
+ binDataType( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.binDataType = BinDataType( fromHex( start ) );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct binDataEnd {
+ binDataEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendBinData( b.fieldName(), b.binData.length(),
+ b.binDataType, b.binData.data() );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct timestampSecs {
+ timestampSecs( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( unsigned long long x) const {
+ b.timestamp = OpTime( (unsigned) (x/1000) , 0);
+ }
+ ObjectBuilder &b;
+ };
+
+ struct timestampInc {
+ timestampInc( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( unsigned x) const {
+ b.timestamp = OpTime(b.timestamp.getSecs(), x);
+ }
+ ObjectBuilder &b;
+ };
+
+ struct dateValue {
+ dateValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( Date_t v ) const {
+ b.date = v;
+ }
+ ObjectBuilder &b;
+ };
+
+ struct dateEnd {
+ dateEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendDate( b.fieldName(), b.date );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct regexValue {
+ regexValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.regex = b.popString();
+ }
+ ObjectBuilder &b;
+ };
+
+ struct regexOptions {
+ regexOptions( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.regexOptions = string( start, end );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct regexEnd {
+ regexEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendRegex( b.fieldName(), b.regex, b.regexOptions );
+ }
+ ObjectBuilder &b;
+ };
+
+// One gotcha with this parsing library is probably best illustrated with an
+// example. Say we have a production like this:
+// z = ( ch_p( 'a' )[ foo ] >> ch_p( 'b' ) ) | ( ch_p( 'a' )[ foo ] >> ch_p( 'c' ) );
+// On input "ac", action foo() will be called twice -- once as the parser tries
+// to match "ab", again as the parser successfully matches "ac". Sometimes
+// the grammar can be modified to eliminate these situations. Here, for example:
+// z = ch_p( 'a' )[ foo ] >> ( ch_p( 'b' ) | ch_p( 'c' ) );
+// However, this is not always possible. In my implementation I've tried to
+// stick to the following pattern: store fields fed to action callbacks
+// temporarily as ObjectBuilder members, then append to a BSONObjBuilder once
+// the parser has completely matched a nonterminal and won't backtrack. It's
+// worth noting here that this parser follows a short-circuit convention. So,
+// in the original z example on line 3, if the input was "ab", foo() would only
+// be called once.
+ struct JsonGrammar : public grammar< JsonGrammar > {
+ public:
+ JsonGrammar( ObjectBuilder &_b ) : b( _b ) {}
+
+ template < typename ScannerT >
+ struct definition {
+ definition( JsonGrammar const &self ) {
+ object = ch_p( '{' )[ objectStart( self.b ) ] >> !members >> '}';
+ members = list_p((fieldName >> ':' >> value) , ',');
+ fieldName =
+ str[ fieldNameEnd( self.b ) ] |
+ singleQuoteStr[ fieldNameEnd( self.b ) ] |
+ unquotedFieldName[ unquotedFieldNameEnd( self.b ) ];
+ array = ch_p( '[' )[ arrayStart( self.b ) ] >> !elements >> ']';
+ elements = list_p(value, ch_p(',')[arrayNext( self.b )]);
+ value =
+ str[ stringEnd( self.b ) ] |
+ number[ numberValue( self.b ) ] |
+ integer |
+ array[ arrayEnd( self.b ) ] |
+ lexeme_d[ str_p( "true" ) ][ trueValue( self.b ) ] |
+ lexeme_d[ str_p( "false" ) ][ falseValue( self.b ) ] |
+ lexeme_d[ str_p( "null" ) ][ nullValue( self.b ) ] |
+ lexeme_d[ str_p( "undefined" ) ][ undefinedValue( self.b ) ] |
+ singleQuoteStr[ stringEnd( self.b ) ] |
+ date[ dateEnd( self.b ) ] |
+ oid[ oidEnd( self.b ) ] |
+ bindata[ binDataEnd( self.b ) ] |
+ dbref[ dbrefEnd( self.b ) ] |
+ timestamp[ timestampEnd( self.b ) ] |
+ regex[ regexEnd( self.b ) ] |
+ object[ subobjectEnd( self.b ) ] ;
+ // NOTE lexeme_d and rules don't mix well, so we have this mess.
+ // NOTE We use range_p rather than cntrl_p, because the latter is locale dependent.
+ str = lexeme_d[ ch_p( '"' )[ chClear( self.b ) ] >>
+ *( ( ch_p( '\\' ) >>
+ (
+ ch_p( 'b' )[ chE( self.b ) ] |
+ ch_p( 'f' )[ chE( self.b ) ] |
+ ch_p( 'n' )[ chE( self.b ) ] |
+ ch_p( 'r' )[ chE( self.b ) ] |
+ ch_p( 't' )[ chE( self.b ) ] |
+ ch_p( 'v' )[ chE( self.b ) ] |
+ ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) |
+ ( ~ch_p('x') & (~range_p('0','9'))[ ch( self.b ) ] ) // hex and octal aren't supported
+ )
+ ) |
+ ( ~range_p( 0x00, 0x1f ) & ~ch_p( '"' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> '"' ];
+
+ singleQuoteStr = lexeme_d[ ch_p( '\'' )[ chClear( self.b ) ] >>
+ *( ( ch_p( '\\' ) >>
+ (
+ ch_p( 'b' )[ chE( self.b ) ] |
+ ch_p( 'f' )[ chE( self.b ) ] |
+ ch_p( 'n' )[ chE( self.b ) ] |
+ ch_p( 'r' )[ chE( self.b ) ] |
+ ch_p( 't' )[ chE( self.b ) ] |
+ ch_p( 'v' )[ chE( self.b ) ] |
+ ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) |
+ ( ~ch_p('x') & (~range_p('0','9'))[ ch( self.b ) ] ) // hex and octal aren't supported
+ )
+ ) |
+ ( ~range_p( 0x00, 0x1f ) & ~ch_p( '\'' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> '\'' ];
+
+ // real_p accepts numbers with nonsignificant zero prefixes, which
+ // aren't allowed in JSON. Oh well.
+ number = strict_real_p | str_p( "NaN" ) | str_p( "Infinity" ) | str_p( "-Infinity" );
+
+ static int_parser<long long, 10, 1, numeric_limits<long long>::digits10 + 1> long_long_p;
+ integer = long_long_p[ intValue(self.b) ];
+
+ // We allow a subset of valid js identifier names here.
+ unquotedFieldName = lexeme_d[ ( alpha_p | ch_p( '$' ) | ch_p( '_' ) ) >> *( ( alnum_p | ch_p( '$' ) | ch_p( '_' )) ) ];
+
+ dbref = dbrefS | dbrefT;
+ dbrefS = ch_p( '{' ) >> "\"$ref\"" >> ':' >>
+ str[ dbrefNS( self.b ) ] >> ',' >> "\"$id\"" >> ':' >> quotedOid >> '}';
+ dbrefT = str_p( "Dbref" ) >> '(' >> str[ dbrefNS( self.b ) ] >> ',' >>
+ quotedOid >> ')';
+
+ timestamp = ch_p( '{' ) >> "\"$timestamp\"" >> ':' >> '{' >>
+ "\"t\"" >> ':' >> uint_parser<unsigned long long, 10, 1, -1>()[ timestampSecs(self.b) ] >> ',' >>
+ "\"i\"" >> ':' >> uint_parser<unsigned int, 10, 1, -1>()[ timestampInc(self.b) ] >> '}' >>'}';
+
+ oid = oidS | oidT;
+ oidS = ch_p( '{' ) >> "\"$oid\"" >> ':' >> quotedOid >> '}';
+ oidT = str_p( "ObjectId" ) >> '(' >> quotedOid >> ')';
+
+ quotedOid = lexeme_d[ '"' >> ( repeat_p( 24 )[ xdigit_p ] )[ oidValue( self.b ) ] >> '"' ];
+
+ bindata = ch_p( '{' ) >> "\"$binary\"" >> ':' >>
+ lexeme_d[ '"' >> ( *( range_p( 'A', 'Z' ) | range_p( 'a', 'z' ) | range_p( '0', '9' ) | ch_p( '+' ) | ch_p( '/' ) ) >> *ch_p( '=' ) )[ binDataBinary( self.b ) ] >> '"' ] >> ',' >> "\"$type\"" >> ':' >>
+ lexeme_d[ '"' >> ( repeat_p( 2 )[ xdigit_p ] )[ binDataType( self.b ) ] >> '"' ] >> '}';
+
+ // TODO: this will need to use a signed parser at some point
+ date = dateS | dateT;
+ dateS = ch_p( '{' ) >> "\"$date\"" >> ':' >> uint_parser< Date_t >()[ dateValue( self.b ) ] >> '}';
+ dateT = !str_p("new") >> str_p( "Date" ) >> '(' >> uint_parser< Date_t >()[ dateValue( self.b ) ] >> ')';
+
+ regex = regexS | regexT;
+ regexS = ch_p( '{' ) >> "\"$regex\"" >> ':' >> str[ regexValue( self.b ) ] >> ',' >> "\"$options\"" >> ':' >> lexeme_d[ '"' >> ( *( alpha_p ) )[ regexOptions( self.b ) ] >> '"' ] >> '}';
+ // FIXME Obviously it would be nice to unify this with str.
+ regexT = lexeme_d[ ch_p( '/' )[ chClear( self.b ) ] >>
+ *( ( ch_p( '\\' ) >>
+ ( ch_p( '"' )[ chE( self.b ) ] |
+ ch_p( '\\' )[ chE( self.b ) ] |
+ ch_p( '/' )[ chE( self.b ) ] |
+ ch_p( 'b' )[ chE( self.b ) ] |
+ ch_p( 'f' )[ chE( self.b ) ] |
+ ch_p( 'n' )[ chE( self.b ) ] |
+ ch_p( 'r' )[ chE( self.b ) ] |
+ ch_p( 't' )[ chE( self.b ) ] |
+ ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) ) ) |
+ ( ~range_p( 0x00, 0x1f ) & ~ch_p( '/' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> str_p( "/" )[ regexValue( self.b ) ]
+ >> ( *( ch_p( 'i' ) | ch_p( 'g' ) | ch_p( 'm' ) ) )[ regexOptions( self.b ) ] ];
+ }
+ rule< ScannerT > object, members, array, elements, value, str, number, integer,
+ dbref, dbrefS, dbrefT, timestamp, timestampS, timestampT, oid, oidS, oidT,
+ bindata, date, dateS, dateT, regex, regexS, regexT, quotedOid, fieldName,
+ unquotedFieldName, singleQuoteStr;
+ const rule< ScannerT > &start() const {
+ return object;
+ }
+ };
+ ObjectBuilder &b;
+ };
+
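+    /* Usage sketch (illustrative):
+           int len;
+           BSONObj o = fromjson( "{ a : 1, b : 'hi' }", &len );
+           // o.toString() -> { a: 1, b: "hi" }; len holds the chars consumed
+    */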
+ BSONObj fromjson( const char *str , int* len) {
+ if ( str[0] == '\0' ) {
+ if (len) *len = 0;
+ return BSONObj();
+ }
+
+ ObjectBuilder b;
+ JsonGrammar parser( b );
+ parse_info<> result = parse( str, parser, space_p );
+ if (len) {
+ *len = result.stop - str;
+ }
+ else if ( !result.full ) {
+ int limit = strnlen(result.stop , 10);
+ if (limit == -1) limit = 10;
+ msgasserted(10340, "Failure parsing JSON string near: " + string( result.stop, limit ));
+ }
+ BSONObj ret = b.pop();
+ assert( b.empty() );
+ return ret;
+ }
+
+ BSONObj fromjson( const string &str ) {
+ return fromjson( str.c_str() );
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/json.h b/src/mongo/db/json.h
new file mode 100644
index 00000000000..68dae042574
--- /dev/null
+++ b/src/mongo/db/json.h
@@ -0,0 +1,41 @@
+/** @file json.h */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "jsobj.h"
+
+namespace mongo {
+
+    /** Create a BSONObj from a JSON <http://www.json.org> string. In addition
+        to the JSON extensions described here
+        <http://mongodb.onconfluence.com/display/DOCS/Mongo+Extended+JSON>,
+        this function accepts certain unquoted field names and allows single
+        quotes, instead of double quotes, around field names and string values.
+        JSON Unicode escape sequences (of the form \uXXXX) are converted to
+        UTF-8.
+ \throws MsgAssertionException if parsing fails. The message included with
+ this assertion includes a rough indication of where parsing failed.
+ */
+ BSONObj fromjson(const string &str);
+
+    /** If len is not NULL, *len is set to the length, in characters, of the JSON text parsed. */
+ BSONObj fromjson(const char *str, int* len=NULL);
+
+} // namespace mongo
diff --git a/src/mongo/db/key.cpp b/src/mongo/db/key.cpp
new file mode 100644
index 00000000000..47449986d21
--- /dev/null
+++ b/src/mongo/db/key.cpp
@@ -0,0 +1,678 @@
+// @file key.cpp
+
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "key.h"
+#include "../util/unittest.h"
+
+namespace mongo {
+
+ extern const Ordering nullOrdering = Ordering::make(BSONObj());
+
+ // KeyBson is for V0 (version #0) indexes
+
+ int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o);
+
+ // "old" = pre signed dates & such; i.e. btree V0
+ /* must be same canon type when called */
+ int oldCompareElementValues(const BSONElement& l, const BSONElement& r) {
+ dassert( l.canonicalType() == r.canonicalType() );
+ int f;
+ double x;
+
+ switch ( l.type() ) {
+ case EOO:
+ case Undefined: // EOO and Undefined are same canonicalType
+ case jstNULL:
+ case MaxKey:
+ case MinKey:
+ return 0;
+ case Bool:
+ return *l.value() - *r.value();
+ case Timestamp:
+ case Date:
+ // unsigned dates for old version
+ if ( l.date() < r.date() )
+ return -1;
+ return l.date() == r.date() ? 0 : 1;
+ case NumberLong:
+ if( r.type() == NumberLong ) {
+ long long L = l._numberLong();
+ long long R = r._numberLong();
+ if( L < R ) return -1;
+ if( L == R ) return 0;
+ return 1;
+ }
+ // else fall through
+ case NumberInt:
+ case NumberDouble: {
+ double left = l.number();
+ double right = r.number();
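+            // lNan/rNan detect NaN via the failed range checks below; note that
+            // +/-Inf also fails them, so infinities compare like NaN here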
+ bool lNan = !( left <= numeric_limits< double >::max() &&
+ left >= -numeric_limits< double >::max() );
+ bool rNan = !( right <= numeric_limits< double >::max() &&
+ right >= -numeric_limits< double >::max() );
+ if ( lNan ) {
+ if ( rNan ) {
+ return 0;
+ }
+ else {
+ return -1;
+ }
+ }
+ else if ( rNan ) {
+ return 1;
+ }
+ x = left - right;
+ if ( x < 0 ) return -1;
+ return x == 0 ? 0 : 1;
+ }
+ case jstOID:
+ return memcmp(l.value(), r.value(), 12);
+ case Code:
+ case Symbol:
+ case String:
+ // nulls not allowed in the middle of strings in the old version
+ return strcmp(l.valuestr(), r.valuestr());
+ case Object:
+ case Array:
+ return oldCompare(l.embeddedObject(), r.embeddedObject(), nullOrdering);
+ case DBRef: {
+ int lsz = l.valuesize();
+ int rsz = r.valuesize();
+ if ( lsz - rsz != 0 ) return lsz - rsz;
+ return memcmp(l.value(), r.value(), lsz);
+ }
+ case BinData: {
+ int lsz = l.objsize(); // our bin data size in bytes, not including the subtype byte
+ int rsz = r.objsize();
+ if ( lsz - rsz != 0 ) return lsz - rsz;
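+            // lsz+1 so the subtype byte is compared along with the data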
+ return memcmp(l.value()+4, r.value()+4, lsz+1);
+ }
+ case RegEx: {
+ int c = strcmp(l.regex(), r.regex());
+ if ( c )
+ return c;
+ return strcmp(l.regexFlags(), r.regexFlags());
+ }
+ case CodeWScope : {
+ f = l.canonicalType() - r.canonicalType();
+ if ( f )
+ return f;
+ f = strcmp( l.codeWScopeCode() , r.codeWScopeCode() );
+ if ( f )
+ return f;
+ f = strcmp( l.codeWScopeScopeData() , r.codeWScopeScopeData() );
+ if ( f )
+ return f;
+ return 0;
+ }
+ default:
+ out() << "oldCompareElementValues: bad type " << (int) l.type() << endl;
+ assert(false);
+ }
+ return -1;
+ }
+
+ int oldElemCompare(const BSONElement&l , const BSONElement& r) {
+ int lt = (int) l.canonicalType();
+ int rt = (int) r.canonicalType();
+ int x = lt - rt;
+ if( x )
+ return x;
+ return oldCompareElementValues(l, r);
+ }
+
+ // pre signed dates & such
+ int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o) {
+ BSONObjIterator i(l);
+ BSONObjIterator j(r);
+ unsigned mask = 1;
+ while ( 1 ) {
+ // so far, equal...
+
+ BSONElement l = i.next();
+ BSONElement r = j.next();
+ if ( l.eoo() )
+ return r.eoo() ? 0 : -1;
+ if ( r.eoo() )
+ return 1;
+
+ int x;
+ {
+ x = oldElemCompare(l, r);
+ if( o.descending(mask) )
+ x = -x;
+ }
+ if ( x != 0 )
+ return x;
+ mask <<= 1;
+ }
+ return -1;
+ }
+
+ /* old style compares:
+ - dates are unsigned
+ - strings no nulls
+ */
+ int KeyBson::woCompare(const KeyBson& r, const Ordering &o) const {
+ return oldCompare(_o, r._o, o);
+ }
+
+ // woEqual could be made faster than woCompare but this is for backward compatibility so not worth a big effort
+ bool KeyBson::woEqual(const KeyBson& r) const {
+ return oldCompare(_o, r._o, nullOrdering) == 0;
+ }
+
+ // [ ][HASMORE][x][y][canontype_4bits]
+ enum CanonicalsEtc {
+ cminkey=1,
+ cnull=2,
+ cdouble=4,
+ cstring=6,
+ cbindata=7,
+ coid=8,
+ cfalse=10,
+ ctrue=11,
+ cdate=12,
+ cmaxkey=14,
+ cCANONTYPEMASK = 0xf,
+ cY = 0x10,
+ cint = cY | cdouble,
+ cX = 0x20,
+ clong = cX | cdouble,
+ cHASMORE = 0x40,
+ cNOTUSED = 0x80 // but see IsBSON sentinel - this bit not usable without great care
+ };
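+    // Worked example (illustrative): a NumberInt key field is encoded as the
+    // type byte cint (cY|cdouble == 0x14) followed by the value as an 8-byte
+    // double; if more fields follow, cHASMORE is or'ed in, giving 0x54.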
+
+ // bindata bson type
+ const unsigned BinDataLenMask = 0xf0; // lengths are powers of 2 of this value
+ const unsigned BinDataTypeMask = 0x0f; // 0-7 as you would expect, 8-15 are 128+value. see BinDataType.
+ const int BinDataLenMax = 32;
+ const int BinDataLengthToCode[] = {
+ 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, -1/*9*/, 0x90/*10*/, -1/*11*/, 0xa0/*12*/, -1/*13*/, 0xb0/*14*/, -1/*15*/,
+ 0xc0/*16*/, -1, -1, -1, 0xd0/*20*/, -1, -1, -1,
+ 0xe0/*24*/, -1, -1, -1, -1, -1, -1, -1,
+ 0xf0/*32*/
+ };
+ const int BinDataCodeToLength[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 32
+ };
+
+ int binDataCodeToLength(int codeByte) {
+ return BinDataCodeToLength[codeByte >> 4];
+ }
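+    // e.g. binDataCodeToLength(0x90) == BinDataCodeToLength[9] == 10, the
+    // inverse of BinDataLengthToCode[10] == 0x90 (the low nibble carries the subtype)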
+
+    /** The object cannot be represented in the compact format, so store it in
+        traditional BSON format with a leading sentinel byte IsBSON to indicate
+        it's in that format.
+
+        Given that the KeyV1Owned constructor already grabbed a bufbuilder, we reuse it here
+        so that we don't have to do an extra malloc.
+    */
+ void KeyV1Owned::traditional(const BSONObj& obj) {
+ b.reset();
+ b.appendUChar(IsBSON);
+ b.appendBuf(obj.objdata(), obj.objsize());
+ _keyData = (const unsigned char *) b.buf();
+ }
+
+ KeyV1Owned::KeyV1Owned(const KeyV1& rhs) {
+ b.appendBuf( rhs.data(), rhs.dataSize() );
+ _keyData = (const unsigned char *) b.buf();
+ dassert( b.len() == dataSize() ); // check datasize method is correct
+ dassert( (*_keyData & cNOTUSED) == 0 );
+ }
+
+ // fromBSON to Key format
+ KeyV1Owned::KeyV1Owned(const BSONObj& obj) {
+ BSONObj::iterator i(obj);
+ unsigned char bits = 0;
+ while( 1 ) {
+ BSONElement e = i.next();
+ if( i.more() )
+ bits |= cHASMORE;
+ switch( e.type() ) {
+ case MinKey:
+ b.appendUChar(cminkey|bits);
+ break;
+ case jstNULL:
+ b.appendUChar(cnull|bits);
+ break;
+ case MaxKey:
+ b.appendUChar(cmaxkey|bits);
+ break;
+ case Bool:
+ b.appendUChar( (e.boolean()?ctrue:cfalse) | bits );
+ break;
+ case jstOID:
+ b.appendUChar(coid|bits);
+ b.appendBuf(&e.__oid(), sizeof(OID));
+ break;
+ case BinData:
+ {
+ int t = e.binDataType();
+ // 0-7 and 0x80 to 0x87 are supported by Key
+ if( (t & 0x78) == 0 && t != ByteArrayDeprecated ) {
+ int len;
+ const char * d = e.binData(len);
+ if( len <= BinDataLenMax ) {
+ int code = BinDataLengthToCode[len];
+ if( code >= 0 ) {
+ if( t >= 128 )
+ t = (t-128) | 0x08;
+ dassert( (code&t) == 0 );
+ b.appendUChar( cbindata|bits );
+ b.appendUChar( code | t );
+ b.appendBuf(d, len);
+ break;
+ }
+ }
+ }
+ traditional(obj);
+ return;
+ }
+ case Date:
+ b.appendUChar(cdate|bits);
+ b.appendStruct(e.date());
+ break;
+ case String:
+ {
+ b.appendUChar(cstring|bits);
+ // note we do not store the terminating null, to save space.
+ unsigned x = (unsigned) e.valuestrsize() - 1;
+ if( x > 255 ) {
+ traditional(obj);
+ return;
+ }
+ b.appendUChar(x);
+ b.appendBuf(e.valuestr(), x);
+ break;
+ }
+ case NumberInt:
+ b.appendUChar(cint|bits);
+ b.appendNum((double) e._numberInt());
+ break;
+ case NumberLong:
+ {
+ long long n = e._numberLong();
+ long long m = 2LL << 52;
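+                // m == 2^53; integers with |n| < m are exactly representable as doubles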
+ DEV {
+ long long d = m-1;
+ assert( ((long long) ((double) -d)) == -d );
+ }
+ if( n >= m || n <= -m ) {
+ // can't represent exactly as a double
+ traditional(obj);
+ return;
+ }
+ b.appendUChar(clong|bits);
+ b.appendNum((double) n);
+ break;
+ }
+ case NumberDouble:
+ {
+ double d = e._numberDouble();
+ if( isNaN(d) ) {
+ traditional(obj);
+ return;
+ }
+ b.appendUChar(cdouble|bits);
+ b.appendNum(d);
+ break;
+ }
+ default:
+ // if other types involved, store as traditional BSON
+ traditional(obj);
+ return;
+ }
+ if( !i.more() )
+ break;
+ bits = 0;
+ }
+ _keyData = (const unsigned char *) b.buf();
+ dassert( b.len() == dataSize() ); // check datasize method is correct
+ dassert( (*_keyData & cNOTUSED) == 0 );
+ }
+
+ BSONObj KeyV1::toBson() const {
+ assert( _keyData != 0 );
+ if( !isCompactFormat() )
+ return bson();
+
+ BSONObjBuilder b(512);
+ const unsigned char *p = _keyData;
+ while( 1 ) {
+ unsigned bits = *p++;
+
+ switch( bits & 0x3f ) {
+ case cminkey: b.appendMinKey(""); break;
+ case cnull: b.appendNull(""); break;
+ case cfalse: b.appendBool("", false); break;
+ case ctrue: b.appendBool("", true); break;
+ case cmaxkey:
+ b.appendMaxKey("");
+ break;
+ case cstring:
+ {
+ unsigned sz = *p++;
+ // we build the element ourself as we have to null terminate it
+ BufBuilder &bb = b.bb();
+ bb.appendNum((char) String);
+ bb.appendUChar(0); // fieldname ""
+ bb.appendNum(sz+1);
+ bb.appendBuf(p, sz);
+ bb.appendUChar(0); // null char at end of string
+ p += sz;
+ break;
+ }
+ case coid:
+ b.appendOID("", (OID *) p);
+ p += sizeof(OID);
+ break;
+ case cbindata:
+ {
+ int len = binDataCodeToLength(*p);
+ int subtype = (*p) & BinDataTypeMask;
+ if( subtype & 0x8 ) {
+ subtype = (subtype & 0x7) | 0x80;
+ }
+ b.appendBinData("", len, (BinDataType) subtype, ++p);
+ p += len;
+ break;
+ }
+ case cdate:
+ b.appendDate("", (Date_t&) *p);
+ p += 8;
+ break;
+ case cdouble:
+ b.append("", (double&) *p);
+ p += sizeof(double);
+ break;
+ case cint:
+ b.append("", (int) ((double&) *p));
+ p += sizeof(double);
+ break;
+ case clong:
+ b.append("", (long long) ((double&) *p));
+ p += sizeof(double);
+ break;
+ default:
+ assert(false);
+ }
+
+ if( (bits & cHASMORE) == 0 )
+ break;
+ }
+ return b.obj();
+ }
+
+ static int compare(const unsigned char *&l, const unsigned char *&r) {
+ int lt = (*l & cCANONTYPEMASK);
+ int rt = (*r & cCANONTYPEMASK);
+ int x = lt - rt;
+ if( x )
+ return x;
+
+ l++; r++;
+
+ // same type
+ switch( lt ) {
+ case cdouble:
+ {
+ double L = *((double *) l);
+ double R = *((double *) r);
+ if( L < R )
+ return -1;
+ if( L != R )
+ return 1;
+ l += 8; r += 8;
+ break;
+ }
+ case cstring:
+ {
+ int lsz = *l;
+ int rsz = *r;
+ int common = min(lsz, rsz);
+ l++; r++; // skip the size byte
+ // use memcmp as we (will) allow zeros in UTF8 strings
+ int res = memcmp(l, r, common);
+ if( res )
+ return res;
+ // longer string is the greater one
+ int diff = lsz-rsz;
+ if( diff )
+ return diff;
+ l += lsz; r += lsz;
+ break;
+ }
+ case cbindata:
+ {
+ int L = *l;
+ int R = *r;
+ int llen = binDataCodeToLength(L);
+ int diff = L-R; // checks length and subtype simultaneously
+ if( diff ) {
+                    // unfortunately the nibbles are ordered backwards for checking subtype and len in one comparison (we could bit-swap...)
+ int rlen = binDataCodeToLength(R);
+ if( llen != rlen )
+ return llen - rlen;
+ return diff;
+ }
+ // same length, same type
+ l++; r++;
+ int res = memcmp(l, r, llen);
+ if( res )
+ return res;
+ l += llen; r += llen;
+ break;
+ }
+ case cdate:
+ {
+ long long L = *((long long *) l);
+ long long R = *((long long *) r);
+ if( L < R )
+ return -1;
+ if( L > R )
+ return 1;
+ l += 8; r += 8;
+ break;
+ }
+ case coid:
+ {
+ int res = memcmp(l, r, sizeof(OID));
+ if( res )
+ return res;
+ l += 12; r += 12;
+ break;
+ }
+ default:
+ // all the others are a match -- e.g. null == null
+ ;
+ }
+
+ return 0;
+ }
+
+ // at least one of this and right are traditional BSON format
+ int NOINLINE_DECL KeyV1::compareHybrid(const KeyV1& right, const Ordering& order) const {
+ BSONObj L = toBson();
+ BSONObj R = right.toBson();
+ return L.woCompare(R, order, /*considerfieldname*/false);
+ }
+
+ int KeyV1::woCompare(const KeyV1& right, const Ordering &order) const {
+ const unsigned char *l = _keyData;
+ const unsigned char *r = right._keyData;
+
+ if( (*l|*r) == IsBSON ) // only can do this if cNOTUSED maintained
+ return compareHybrid(right, order);
+
+ unsigned mask = 1;
+ while( 1 ) {
+ char lval = *l;
+ char rval = *r;
+ {
+ int x = compare(l, r); // updates l and r pointers
+ if( x ) {
+ if( order.descending(mask) )
+ x = -x;
+ return x;
+ }
+ }
+
+ {
+ int x = ((int)(lval & cHASMORE)) - ((int)(rval & cHASMORE));
+ if( x )
+ return x;
+ if( (lval & cHASMORE) == 0 )
+ break;
+ }
+
+ mask <<= 1;
+ }
+
+ return 0;
+ }
+
+ static unsigned sizes[] = {
+ 0,
+ 1, //cminkey=1,
+ 1, //cnull=2,
+ 0,
+ 9, //cdouble=4,
+ 0,
+ 0, //cstring=6,
+ 0,
+ 13, //coid=8,
+ 0,
+ 1, //cfalse=10,
+ 1, //ctrue=11,
+ 9, //cdate=12,
+ 0,
+ 1, //cmaxkey=14,
+ 0
+ };
+
+ inline unsigned sizeOfElement(const unsigned char *p) {
+ unsigned type = *p & cCANONTYPEMASK;
+ unsigned sz = sizes[type];
+ if( sz == 0 ) {
+ if( type == cstring ) {
+ sz = ((unsigned) p[1]) + 2;
+ }
+ else {
+ assert( type == cbindata );
+ sz = binDataCodeToLength(p[1]) + 2;
+ }
+ }
+ return sz;
+ }
+
+ int KeyV1::dataSize() const {
+ const unsigned char *p = _keyData;
+ if( !isCompactFormat() ) {
+ return bson().objsize() + 1;
+ }
+
+ bool more;
+ do {
+ unsigned z = sizeOfElement(p);
+ more = (*p & cHASMORE) != 0;
+ p += z;
+ } while( more );
+ return p - _keyData;
+ }
+
+ bool KeyV1::woEqual(const KeyV1& right) const {
+ const unsigned char *l = _keyData;
+ const unsigned char *r = right._keyData;
+
+ if( (*l|*r) == IsBSON ) {
+ return toBson().equal(right.toBson());
+ }
+
+ while( 1 ) {
+ char lval = *l;
+ char rval = *r;
+ if( (lval&(cCANONTYPEMASK|cHASMORE)) != (rval&(cCANONTYPEMASK|cHASMORE)) )
+ return false;
+ l++; r++;
+ switch( lval&cCANONTYPEMASK ) {
+ case coid:
+ if( *((unsigned*) l) != *((unsigned*) r) )
+ return false;
+ l += 4; r += 4;
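+                // no break: fall through to compare the remaining 8 of the OID's 12 bytes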
+ case cdate:
+ if( *((unsigned long long *) l) != *((unsigned long long *) r) )
+ return false;
+ l += 8; r += 8;
+ break;
+ case cdouble:
+ if( *((double *) l) != *((double *) r) )
+ return false;
+ l += 8; r += 8;
+ break;
+ case cstring:
+ {
+ if( *l != *r )
+ return false; // not same length
+ unsigned sz = ((unsigned) *l) + 1;
+ if( memcmp(l, r, sz) )
+ return false;
+ l += sz; r += sz;
+ break;
+ }
+ case cbindata:
+ {
+ if( *l != *r )
+ return false; // len or subtype mismatch
+ int len = binDataCodeToLength(*l) + 1;
+ if( memcmp(l, r, len) )
+ return false;
+ l += len; r += len;
+ break;
+ }
+ case cminkey:
+ case cnull:
+ case cfalse:
+ case ctrue:
+ case cmaxkey:
+ break;
+ default:
+ assert(false);
+ }
+ if( (lval&cHASMORE) == 0 )
+ break;
+ }
+ return true;
+ }
+
+ struct CmpUnitTest : public UnitTest {
+ void run() {
+ char a[2];
+ char b[2];
+ a[0] = -3;
+ a[1] = 0;
+ b[0] = 3;
+ b[1] = 0;
+ assert( strcmp(a,b)>0 && memcmp(a,b,2)>0 );
+ }
+ } cunittest;
+
+}
diff --git a/src/mongo/db/key.h b/src/mongo/db/key.h
new file mode 100644
index 00000000000..9284cdc7422
--- /dev/null
+++ b/src/mongo/db/key.h
@@ -0,0 +1,115 @@
+// @file key.h class(es) representing individual keys in a btree
+
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "jsobj.h"
+
+namespace mongo {
+
+ /** Key class for precomputing a small format index key that is denser than a traditional BSONObj.
+
+ KeyBson is a legacy wrapper implementation for old BSONObj style keys for v:0 indexes.
+
+ KeyV1 is the new implementation.
+ */
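+    /* For illustration: a key holding a single ObjectId occupies 13 bytes in
+       KeyV1 format (one type byte + 12 OID bytes) versus 19 bytes as BSON
+       (4-byte size, type byte, empty field name, 12 OID bytes, EOO byte). */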
+ class KeyBson /* "KeyV0" */ {
+ public:
+ KeyBson() { }
+ explicit KeyBson(const char *keyData) : _o(keyData) { }
+ explicit KeyBson(const BSONObj& obj) : _o(obj) { }
+ int woCompare(const KeyBson& r, const Ordering &o) const;
+ BSONObj toBson() const { return _o; }
+ string toString() const { return _o.toString(); }
+ int dataSize() const { return _o.objsize(); }
+ const char * data() const { return _o.objdata(); }
+ BSONElement _firstElement() const { return _o.firstElement(); }
+ bool isCompactFormat() const { return false; }
+ bool woEqual(const KeyBson& r) const;
+ void assign(const KeyBson& rhs) { *this = rhs; }
+ private:
+ BSONObj _o;
+ };
+
+ class KeyV1Owned;
+
+ // corresponding to BtreeData_V1
+ class KeyV1 {
+ void operator=(const KeyV1&); // disallowed just to make people be careful as we don't own the buffer
+        KeyV1(const KeyV1Owned&);     // disallowed: the KeyV1Owned's buffer would likely go out of scope while this KeyV1 still pointed into it
+ public:
+ KeyV1() { _keyData = 0; }
+ ~KeyV1() { DEV _keyData = (const unsigned char *) 1; }
+
+ KeyV1(const KeyV1& rhs) : _keyData(rhs._keyData) {
+ dassert( _keyData > (const unsigned char *) 1 );
+ }
+
+ // explicit version of operator= to be safe
+ void assign(const KeyV1& rhs) {
+ _keyData = rhs._keyData;
+ }
+
+        /** @param keyData can be a buffer containing data in either BSON format, OR in KeyV1 format.
+            When BSON, this object is just a wrapper around the buffer.
+        */
+ explicit KeyV1(const char *keyData) : _keyData((unsigned char *) keyData) { }
+
+ int woCompare(const KeyV1& r, const Ordering &o) const;
+ bool woEqual(const KeyV1& r) const;
+ BSONObj toBson() const;
+ string toString() const { return toBson().toString(); }
+
+ /** get the key data we want to store in the btree bucket */
+ const char * data() const { return (const char *) _keyData; }
+
+ /** @return size of data() */
+ int dataSize() const;
+
+ /** only used by geo, which always has bson keys */
+ BSONElement _firstElement() const { return bson().firstElement(); }
+ bool isCompactFormat() const { return *_keyData != IsBSON; }
+ protected:
+ enum { IsBSON = 0xff };
+ const unsigned char *_keyData;
+ BSONObj bson() const {
+ dassert( !isCompactFormat() );
+ return BSONObj((const char *) _keyData+1);
+ }
+ private:
+ int compareHybrid(const KeyV1& right, const Ordering& order) const;
+ };
+
+ class KeyV1Owned : public KeyV1 {
+ void operator=(const KeyV1Owned&);
+ public:
+        /** @param obj a BSON object to be translated to KeyV1 format. If the object isn't
+ representable in KeyV1 format (which happens, intentionally, at times)
+ it will stay as bson herein.
+ */
+ KeyV1Owned(const BSONObj& obj);
+
+ /** makes a copy (memcpy's the whole thing) */
+ KeyV1Owned(const KeyV1& rhs);
+
+ private:
+ StackBufBuilder b;
+ void traditional(const BSONObj& obj); // store as traditional bson not as compact format
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/lasterror.cpp b/src/mongo/db/lasterror.cpp
new file mode 100644
index 00000000000..4ed4dfb0571
--- /dev/null
+++ b/src/mongo/db/lasterror.cpp
@@ -0,0 +1,142 @@
+// lasterror.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+
+#include "../util/unittest.h"
+#include "../util/net/message.h"
+
+
+#include "lasterror.h"
+#include "jsobj.h"
+
+namespace mongo {
+
+ LastError LastError::noError;
+ LastErrorHolder lastError;
+
+ bool isShell = false;
+ void raiseError(int code , const char *msg) {
+ LastError *le = lastError.get();
+ if ( le == 0 ) {
+ /* might be intentional (non-user thread) */
+ DEV {
+ static unsigned n;
+ if( ++n < 4 && !isShell ) log() << "dev: lastError==0 won't report:" << msg << endl;
+ }
+ }
+ else if ( le->disabled ) {
+ log() << "lastError disabled, can't report: " << code << ":" << msg << endl;
+ }
+ else {
+ le->raiseError(code, msg);
+ }
+ }
+
+ bool LastError::appendSelf( BSONObjBuilder &b , bool blankErr ) {
+ if ( !valid ) {
+ if ( blankErr )
+ b.appendNull( "err" );
+ b.append( "n", 0 );
+ return false;
+ }
+
+ if ( msg.empty() ) {
+ if ( blankErr ) {
+ b.appendNull( "err" );
+ }
+ }
+ else {
+ b.append( "err", msg );
+ }
+
+ if ( code )
+ b.append( "code" , code );
+ if ( updatedExisting != NotUpdate )
+ b.appendBool( "updatedExisting", updatedExisting == True );
+ if ( upsertedId.isSet() )
+ b.append( "upserted" , upsertedId );
+ if ( writebackId.isSet() ) {
+ b.append( "writeback" , writebackId );
+ b.append( "instanceIdent" , prettyHostName() ); // this can be any unique string
+ }
+ b.appendNumber( "n", nObjects );
+
+ return ! msg.empty();
+ }
+
+ LastErrorHolder::~LastErrorHolder() {
+ }
+
+
+ LastError * LastErrorHolder::disableForCommand() {
+ LastError *le = _get();
+ uassert(13649, "no operation yet", le);
+ le->disabled = true;
+ le->nPrev--; // caller is a command that shouldn't count as an operation
+ return le;
+ }
+
+ LastError * LastErrorHolder::get( bool create ) {
+ LastError *ret = _get( create );
+ if ( ret && !ret->disabled )
+ return ret;
+ return 0;
+ }
+
+ LastError * LastErrorHolder::_get( bool create ) {
+ LastError * le = _tl.get();
+ if ( ! le && create ) {
+ le = new LastError();
+ _tl.reset( le );
+ }
+ return le;
+ }
+
+ void LastErrorHolder::release() {
+ _tl.release();
+ }
+
+ /** ok to call more than once. */
+ void LastErrorHolder::initThread() {
+ if( ! _tl.get() )
+ _tl.reset( new LastError() );
+ }
+
+ void LastErrorHolder::reset( LastError * le ) {
+ _tl.reset( le );
+ }
+
+ void prepareErrForNewRequest( Message &m, LastError * err ) {
+ // a killCursors message shouldn't affect last error
+ assert( err );
+ if ( m.operation() == dbKillCursors ) {
+ err->disabled = true;
+ }
+ else {
+ err->disabled = false;
+ err->nPrev++;
+ }
+ }
+
+ LastError * LastErrorHolder::startRequest( Message& m , LastError * le ) {
+ assert( le );
+ prepareErrForNewRequest( m, le );
+ return le;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/lasterror.h b/src/mongo/db/lasterror.h
new file mode 100644
index 00000000000..86250e496a8
--- /dev/null
+++ b/src/mongo/db/lasterror.h
@@ -0,0 +1,146 @@
+// lasterror.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../bson/oid.h"
+
+namespace mongo {
+ class BSONObjBuilder;
+ class Message;
+
+ struct LastError {
+ int code;
+ string msg;
+ enum UpdatedExistingType { NotUpdate, True, False } updatedExisting;
+ OID upsertedId;
+ OID writebackId;
+ long long nObjects;
+ int nPrev;
+ bool valid;
+ bool disabled;
+ void writeback( OID& oid ) {
+ reset( true );
+ writebackId = oid;
+ }
+ void raiseError(int _code , const char *_msg) {
+ reset( true );
+ code = _code;
+ msg = _msg;
+ }
+ void recordUpdate( bool _updateObjects , long long _nObjects , OID _upsertedId ) {
+ reset( true );
+ nObjects = _nObjects;
+ updatedExisting = _updateObjects ? True : False;
+ if ( _upsertedId.isSet() )
+ upsertedId = _upsertedId;
+
+ }
+ void recordDelete( long long nDeleted ) {
+ reset( true );
+ nObjects = nDeleted;
+ }
+ LastError() {
+ reset();
+ }
+ void reset( bool _valid = false ) {
+ code = 0;
+ msg.clear();
+ updatedExisting = NotUpdate;
+ nObjects = 0;
+ nPrev = 1;
+ valid = _valid;
+ disabled = false;
+ upsertedId.clear();
+ writebackId.clear();
+ }
+
+ /**
+ * @return if there is an err
+ */
+ bool appendSelf( BSONObjBuilder &b , bool blankErr = true );
+
+ struct Disabled : boost::noncopyable {
+ Disabled( LastError * le ) {
+ _le = le;
+ if ( _le ) {
+ _prev = _le->disabled;
+ _le->disabled = true;
+ }
+ else {
+ _prev = false;
+ }
+ }
+
+ ~Disabled() {
+ if ( _le )
+ _le->disabled = _prev;
+ }
+
+ LastError * _le;
+ bool _prev;
+ };
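+        // Typical use (illustrative): construct on the stack to suppress
+        // lastError reporting for a sub-operation; the prior disabled state
+        // is restored on destruction:
+        //     LastError::Disabled ignore( lastError.get() );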
+
+ static LastError noError;
+ };
+
+ extern class LastErrorHolder {
+ public:
+ LastErrorHolder(){}
+ ~LastErrorHolder();
+
+ LastError * get( bool create = false );
+ LastError * getSafe() {
+ LastError * le = get(false);
+ if ( ! le ) {
+ error() << " no LastError!" << endl;
+ assert( le );
+ }
+ return le;
+ }
+
+ LastError * _get( bool create = false ); // may return a disabled LastError
+
+ void reset( LastError * le );
+
+ /** ok to call more than once. */
+ void initThread();
+
+ int getID();
+
+ void release();
+
+ /** when db receives a message/request, call this */
+ LastError * startRequest( Message& m , LastError * connectionOwned );
+
+ void disconnect( int clientId );
+
+ // used to disable lastError reporting while processing a killCursors message
+ // disable causes get() to return 0.
+ LastError *disableForCommand(); // only call once per command invocation!
+ private:
+ boost::thread_specific_ptr<LastError> _tl;
+
+ struct Status {
+ time_t time;
+ LastError *lerr;
+ };
+ } lastError;
+
+ void raiseError(int code , const char *msg);
+
+} // namespace mongo
diff --git a/src/mongo/db/matcher.cpp b/src/mongo/db/matcher.cpp
new file mode 100755
index 00000000000..2631845a757
--- /dev/null
+++ b/src/mongo/db/matcher.cpp
@@ -0,0 +1,1128 @@
+// matcher.cpp
+
+/* Matcher is our boolean expression evaluator for "where" clauses */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "matcher.h"
+#include "../util/goodies.h"
+#include "../util/unittest.h"
+#include "diskloc.h"
+#include "../scripting/engine.h"
+#include "db.h"
+#include "queryutil.h"
+#include "client.h"
+
+#include "pdfile.h"
+
+namespace {
+ inline pcrecpp::RE_Options flags2options(const char* flags) {
+ pcrecpp::RE_Options options;
+ options.set_utf8(true);
+ while ( flags && *flags ) {
+ if ( *flags == 'i' )
+ options.set_caseless(true);
+ else if ( *flags == 'm' )
+ options.set_multiline(true);
+ else if ( *flags == 'x' )
+ options.set_extended(true);
+ else if ( *flags == 's' )
+ options.set_dotall(true);
+ flags++;
+ }
+ return options;
+ }
+}
+
+//#define DEBUGMATCHER(x) cout << x << endl;
+#define DEBUGMATCHER(x)
+
+namespace mongo {
+
+ extern BSONObj staticNull;
+
+ class Where {
+ public:
+ Where() {
+ jsScope = 0;
+ func = 0;
+ }
+ ~Where() {
+
+ if ( scope.get() ){
+ try {
+ scope->execSetup( "_mongo.readOnly = false;" , "make not read only" );
+ }
+ catch( DBException& e ){
+ warning() << "javascript scope cleanup interrupted" << causedBy( e ) << endl;
+ }
+ }
+
+ if ( jsScope ) {
+ delete jsScope;
+ jsScope = 0;
+ }
+ func = 0;
+ }
+
+ auto_ptr<Scope> scope;
+ ScriptingFunction func;
+ BSONObj *jsScope;
+
+ void setFunc(const char *code) {
+ massert( 10341 , "scope has to be created first!" , scope.get() );
+ func = scope->createFunction( code );
+ }
+
+ };
+
+ Matcher::~Matcher() {
+ delete _where;
+ _where = 0;
+ }
+
+ ElementMatcher::ElementMatcher( BSONElement e , int op, bool isNot )
+ : _toMatch( e ) , _compareOp( op ), _isNot( isNot ), _subMatcherOnPrimitives(false) {
+ if ( op == BSONObj::opMOD ) {
+ BSONObj o = e.embeddedObject();
+ _mod = o["0"].numberInt();
+ _modm = o["1"].numberInt();
+
+ uassert( 10073 , "mod can't be 0" , _mod );
+ }
+ else if ( op == BSONObj::opTYPE ) {
+ _type = (BSONType)(e.numberInt());
+ }
+ else if ( op == BSONObj::opELEM_MATCH ) {
+ BSONElement m = e;
+ uassert( 12517 , "$elemMatch needs an Object" , m.type() == Object );
+ BSONObj x = m.embeddedObject();
+ if ( x.firstElement().getGtLtOp() == 0 ) {
+ _subMatcher.reset( new Matcher( x ) );
+ _subMatcherOnPrimitives = false;
+ }
+ else {
+ // meant to act on primitives
+ _subMatcher.reset( new Matcher( BSON( "" << x ) ) );
+ _subMatcherOnPrimitives = true;
+ }
+ }
+ }
+
+ ElementMatcher::ElementMatcher( BSONElement e , int op , const BSONObj& array, bool isNot )
+ : _toMatch( e ) , _compareOp( op ), _isNot( isNot ), _subMatcherOnPrimitives(false) {
+
+ _myset.reset( new set<BSONElement,element_lt>() );
+
+ BSONObjIterator i( array );
+ while ( i.more() ) {
+ BSONElement ie = i.next();
+ if ( op == BSONObj::opALL && ie.type() == Object && ie.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ) {
+ shared_ptr<Matcher> s;
+ s.reset( new Matcher( ie.embeddedObject().firstElement().embeddedObjectUserCheck() ) );
+ _allMatchers.push_back( s );
+ }
+ else if ( ie.type() == RegEx ) {
+ if ( !_myregex.get() ) {
+ _myregex.reset( new vector< RegexMatcher >() );
+ }
+ _myregex->push_back( RegexMatcher() );
+ RegexMatcher &rm = _myregex->back();
+ rm._re.reset( new pcrecpp::RE( ie.regex(), flags2options( ie.regexFlags() ) ) );
+ rm._fieldName = 0; // no need for field name
+ rm._regex = ie.regex();
+ rm._flags = ie.regexFlags();
+ rm._isNot = false;
+ bool purePrefix;
+ string prefix = simpleRegex(rm._regex, rm._flags, &purePrefix);
+ if (purePrefix)
+ rm._prefix = prefix;
+ }
+ else {
+ uassert( 15882, "$elemMatch not allowed within $in",
+ ie.type() != Object ||
+ ie.embeddedObject().firstElement().getGtLtOp() != BSONObj::opELEM_MATCH );
+ _myset->insert(ie);
+ }
+ }
+
+ if ( _allMatchers.size() ) {
+ uassert( 13020 , "with $all, can't mix $elemMatch and others" , _myset->size() == 0 && !_myregex.get());
+ }
+
+ }
+
+ int ElementMatcher::inverseOfNegativeCompareOp() const {
+ verify( 15892, negativeCompareOp() );
+ return _compareOp == BSONObj::NE ? BSONObj::Equality : BSONObj::opIN;
+ }
+
+ bool ElementMatcher::negativeCompareOpContainsNull() const {
+ verify( 15893, negativeCompareOp() );
+ return (_compareOp == BSONObj::NE && _toMatch.type() != jstNULL) ||
+ (_compareOp == BSONObj::NIN && _myset->count( staticNull.firstElement()) == 0 );
+ }
+
+ void Matcher::addRegex(const char *fieldName, const char *regex, const char *flags, bool isNot) {
+
+ RegexMatcher rm;
+ rm._re.reset( new pcrecpp::RE(regex, flags2options(flags)) );
+ rm._fieldName = fieldName;
+ rm._regex = regex;
+ rm._flags = flags;
+ rm._isNot = isNot;
+
+ if (!isNot) { //TODO something smarter
+ bool purePrefix;
+ string prefix = simpleRegex(regex, flags, &purePrefix);
+ if (purePrefix)
+ rm._prefix = prefix;
+ }
+
+ // push only after _prefix is set: push_back copies rm, so assigning
+ // _prefix afterwards would never reach the stored matcher
+ _regexs.push_back(rm);
+ }
+
+ bool Matcher::addOp( const BSONElement &e, const BSONElement &fe, bool isNot, const char *& regex, const char *&flags ) {
+ const char *fn = fe.fieldName();
+ int op = fe.getGtLtOp( -1 );
+ if ( op == -1 ) {
+ if ( !isNot && fn[1] == 'r' && fn[2] == 'e' && fn[3] == 'f' && fn[4] == 0 ) {
+ return false; // { $ref : xxx } - treat as normal object
+ }
+ uassert( 10068 , (string)"invalid operator: " + fn , op != -1 );
+ }
+
+ switch ( op ) {
+ case BSONObj::GT:
+ case BSONObj::GTE:
+ case BSONObj::LT:
+ case BSONObj::LTE: {
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ addBasic(b->done().firstElement(), op, isNot);
+ break;
+ }
+ case BSONObj::NE: {
+ _haveNeg = true;
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ addBasic(b->done().firstElement(), BSONObj::NE, isNot);
+ break;
+ }
+ case BSONObj::opALL:
+ _all = true;
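+ // fall through -- $all reuses opIN's array parsing below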
+ case BSONObj::opIN: {
+ uassert( 13276 , "$in needs an array" , fe.isABSONObj() );
+ _basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) );
+ BSONObjIterator i( fe.embeddedObject() );
+ while( i.more() ) {
+ if ( i.next().type() == Array ) {
+ _hasArray = true;
+ }
+ }
+ break;
+ }
+ case BSONObj::NIN:
+ uassert( 13277 , "$nin needs an array" , fe.isABSONObj() );
+ _haveNeg = true;
+ _basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) );
+ break;
+ case BSONObj::opMOD:
+ case BSONObj::opTYPE:
+ case BSONObj::opELEM_MATCH: {
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ // these are types where ElementMatcher has all the info
+ _basics.push_back( ElementMatcher( b->done().firstElement() , op, isNot ) );
+ break;
+ }
+ case BSONObj::opSIZE: {
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ addBasic(b->done().firstElement(), BSONObj::opSIZE, isNot);
+ _haveSize = true;
+ break;
+ }
+ case BSONObj::opEXISTS: {
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ addBasic(b->done().firstElement(), BSONObj::opEXISTS, isNot);
+ break;
+ }
+ case BSONObj::opREGEX: {
+ uassert( 13032, "can't use $not with $regex, use BSON regex type instead", !isNot );
+ if ( fe.type() == RegEx ) {
+ regex = fe.regex();
+ flags = fe.regexFlags();
+ }
+ else {
+ regex = fe.valuestrsafe();
+ }
+ break;
+ }
+ case BSONObj::opOPTIONS: {
+ uassert( 13029, "can't use $not with $options, use BSON regex type instead", !isNot );
+ flags = fe.valuestrsafe();
+ break;
+ }
+ case BSONObj::opNEAR:
+ case BSONObj::opWITHIN:
+ case BSONObj::opMAX_DISTANCE:
+ break;
+ default:
+ uassert( 10069 , (string)"BUG - can't find operator for: " + fn , 0 );
+ }
+ return true;
+ }
+
+ void Matcher::parseExtractedClause( const BSONElement &e, list< shared_ptr< Matcher > > &matchers ) {
+ uassert( 13086, "$and/$or/$nor must be a nonempty array", e.type() == Array && e.embeddedObject().nFields() > 0 );
+ BSONObjIterator j( e.embeddedObject() );
+ while( j.more() ) {
+ BSONElement f = j.next();
+ uassert( 13087, "$and/$or/$nor match element must be an object", f.type() == Object );
+ matchers.push_back( shared_ptr< Matcher >( new Matcher( f.embeddedObject(), true ) ) );
+ }
+ }
+
+ bool Matcher::parseClause( const BSONElement &e ) {
+ const char *ef = e.fieldName();
+
+ if ( ef[ 0 ] != '$' )
+ return false;
+
+ // $and
+ if ( ef[ 1 ] == 'a' && ef[ 2 ] == 'n' && ef[ 3 ] == 'd' ) {
+ parseExtractedClause( e, _andMatchers );
+ return true;
+ }
+
+ // $or
+ if ( ef[ 1 ] == 'o' && ef[ 2 ] == 'r' && ef[ 3 ] == 0 ) {
+ parseExtractedClause( e, _orMatchers );
+ return true;
+ }
+
+ // $nor
+ if ( ef[ 1 ] == 'n' && ef[ 2 ] == 'o' && ef[ 3 ] == 'r' && ef[ 4 ] == 0 ) {
+ parseExtractedClause( e, _norMatchers );
+ return true;
+ }
+
+ // $comment
+ if ( ef[ 1 ] == 'c' && ef[ 2 ] == 'o' && ef[ 3 ] == 'm' && str::equals( ef , "$comment" ) ) {
+ return true;
+ }
+
+ return false;
+ }
+
+ // $where: function()...
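+ // e.g. { $where : "this.a > this.b" }; the expression may arrive as a
+ // plain string, Code, or CodeWScope element (checked below).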
+ NOINLINE_DECL void Matcher::parseWhere( const BSONElement &e ) {
+ uassert(15902 , "$where expression has an unexpected type", e.type() == String || e.type() == CodeWScope || e.type() == Code );
+ uassert( 10066 , "$where may only appear once in query", _where == 0 );
+ uassert( 10067 , "$where query, but no script engine", globalScriptEngine );
+ massert( 13089 , "need a current client for $where" , haveClient() );
+ _where = new Where();
+ _where->scope = globalScriptEngine->getPooledScope( cc().ns() );
+ _where->scope->localConnect( cc().database()->name.c_str() );
+
+ if ( e.type() == CodeWScope ) {
+ _where->setFunc( e.codeWScopeCode() );
+ _where->jsScope = new BSONObj( e.codeWScopeScopeData() );
+ }
+ else {
+ const char *code = e.valuestr();
+ _where->setFunc(code);
+ }
+
+ _where->scope->execSetup( "_mongo.readOnly = true;" , "make read only" );
+ }
+
+ void Matcher::parseMatchExpressionElement( const BSONElement &e, bool nested ) {
+
+ uassert( 13629 , "can't have undefined in a query expression" , e.type() != Undefined );
+
+ if ( parseClause( e ) ) {
+ return;
+ }
+
+ const char *fn = e.fieldName();
+ if ( str::equals(fn, "$where") ) {
+ parseWhere(e);
+ return;
+ }
+
+ if ( e.type() == RegEx ) {
+ addRegex( fn, e.regex(), e.regexFlags() );
+ return;
+ }
+
+ // greater than / less than...
+ // e.g., e == { a : { $gt : 3 } }
+ // or
+ // { a : { $in : [1,2,3] } }
+ if ( e.type() == Object ) {
+ // support {$regex:"a|b", $options:"imx"}
+ const char* regex = NULL;
+ const char* flags = "";
+
+ // e.g., fe == { $gt : 3 }
+ BSONObjIterator j(e.embeddedObject());
+ bool isOperator = false;
+ while ( j.more() ) {
+ BSONElement fe = j.next();
+ const char *fn = fe.fieldName();
+
+ if ( fn[0] == '$' && fn[1] ) {
+ isOperator = true;
+
+ if ( fn[1] == 'n' && fn[2] == 'o' && fn[3] == 't' && fn[4] == 0 ) {
+ _haveNeg = true;
+ switch( fe.type() ) {
+ case Object: {
+ BSONObjIterator k( fe.embeddedObject() );
+ uassert( 13030, "$not cannot be empty", k.more() );
+ while( k.more() ) {
+ addOp( e, k.next(), true, regex, flags );
+ }
+ break;
+ }
+ case RegEx:
+ addRegex( e.fieldName(), fe.regex(), fe.regexFlags(), true );
+ break;
+ default:
+ uassert( 13031, "invalid use of $not", false );
+ }
+ }
+ else {
+ if ( !addOp( e, fe, false, regex, flags ) ) {
+ isOperator = false;
+ break;
+ }
+ }
+ }
+ else {
+ isOperator = false;
+ break;
+ }
+ }
+ if (regex) {
+ addRegex(e.fieldName(), regex, flags);
+ }
+ if ( isOperator )
+ return;
+ }
+
+ if ( e.type() == Array ) {
+ _hasArray = true;
+ }
+ else if( *fn == '$' ) {
+ if( str::equals(fn, "$atomic") || str::equals(fn, "$isolated") ) {
+ uassert( 14844, "$atomic specifier must be a top level field", !nested );
+ _atomic = e.trueValue();
+ return;
+ }
+ }
+
+ // normal, simple case e.g. { a : "foo" }
+ addBasic(e, BSONObj::Equality, false);
+ }
+
+ /* _jsobj - the query pattern
+ */
+ Matcher::Matcher(const BSONObj &jsobj, bool nested) :
+ _where(0), _jsobj(jsobj), _haveSize(), _all(), _hasArray(0), _haveNeg(), _atomic(false) {
+
+ BSONObjIterator i(_jsobj);
+ while ( i.more() ) {
+ parseMatchExpressionElement( i.next(), nested );
+ }
+ }
+
+ Matcher::Matcher( const Matcher &docMatcher, const BSONObj &key ) :
+ _where(0), _constrainIndexKey( key ), _haveSize(), _all(), _hasArray(0), _haveNeg(), _atomic(false) {
+ // Filter out match components that will provide an incorrect result
+ // given a key from a single key index.
+ for( vector< ElementMatcher >::const_iterator i = docMatcher._basics.begin(); i != docMatcher._basics.end(); ++i ) {
+ if ( key.hasField( i->_toMatch.fieldName() ) ) {
+ switch( i->_compareOp ) {
+ case BSONObj::opSIZE:
+ case BSONObj::opALL:
+ case BSONObj::NE:
+ case BSONObj::NIN:
+ case BSONObj::opEXISTS: // We can't match on index in this case.
+ case BSONObj::opTYPE: // For $type:10 (null), a null key could be a missing field or a null value field.
+ break;
+ case BSONObj::opIN: {
+ bool inContainsArray = false;
+ for( set<BSONElement,element_lt>::const_iterator j = i->_myset->begin(); j != i->_myset->end(); ++j ) {
+ if ( j->type() == Array ) {
+ inContainsArray = true;
+ break;
+ }
+ }
+ // Can't match an array to its first indexed element.
+ if ( !i->_isNot && !inContainsArray ) {
+ _basics.push_back( *i );
+ }
+ break;
+ }
+ default: {
+ // Can't match an array to its first indexed element.
+ if ( !i->_isNot && i->_toMatch.type() != Array ) {
+ _basics.push_back( *i );
+ }
+ }
+ }
+ }
+ }
+ for( vector<RegexMatcher>::const_iterator it = docMatcher._regexs.begin();
+ it != docMatcher._regexs.end();
+ ++it) {
+ if ( !it->_isNot && key.hasField( it->_fieldName ) ) {
+ _regexs.push_back(*it);
+ }
+ }
+ // Recursively filter match components for and and or matchers.
+ for( list< shared_ptr< Matcher > >::const_iterator i = docMatcher._andMatchers.begin(); i != docMatcher._andMatchers.end(); ++i ) {
+ _andMatchers.push_back( shared_ptr< Matcher >( new Matcher( **i, key ) ) );
+ }
+ for( list< shared_ptr< Matcher > >::const_iterator i = docMatcher._orMatchers.begin(); i != docMatcher._orMatchers.end(); ++i ) {
+ _orMatchers.push_back( shared_ptr< Matcher >( new Matcher( **i, key ) ) );
+ }
+ }
+
+ inline bool regexMatches(const RegexMatcher& rm, const BSONElement& e) {
+ switch (e.type()) {
+ case String:
+ case Symbol:
+ if (rm._prefix.empty())
+ return rm._re->PartialMatch(e.valuestr());
+ else
+ return !strncmp(e.valuestr(), rm._prefix.c_str(), rm._prefix.size());
+ case RegEx:
+ return !strcmp(rm._regex, e.regex()) && !strcmp(rm._flags, e.regexFlags());
+ default:
+ return false;
+ }
+ }
+
+ inline int Matcher::valuesMatch(const BSONElement& l, const BSONElement& r, int op, const ElementMatcher& bm) const {
+ assert( op != BSONObj::NE && op != BSONObj::NIN );
+
+ if ( op == BSONObj::Equality ) {
+ return l.valuesEqual(r);
+ }
+
+ if ( op == BSONObj::opIN ) {
+ // { $in : [1,2,3] }
+ int count = bm._myset->count(l);
+ if ( count )
+ return count;
+ if ( bm._myregex.get() ) {
+ for( vector<RegexMatcher>::const_iterator i = bm._myregex->begin(); i != bm._myregex->end(); ++i ) {
+ if ( regexMatches( *i, l ) ) {
+ return true;
+ }
+ }
+ }
+ }
+
+ if ( op == BSONObj::opSIZE ) {
+ if ( l.type() != Array )
+ return 0;
+ int count = 0;
+ BSONObjIterator i( l.embeddedObject() );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ ++count;
+ }
+ return count == r.number();
+ }
+
+ if ( op == BSONObj::opMOD ) {
+ if ( ! l.isNumber() )
+ return false;
+
+ return l.numberLong() % bm._mod == bm._modm;
+ }
+
+ if ( op == BSONObj::opTYPE ) {
+ return bm._type == l.type();
+ }
+
+ /* check LT, GTE, ... */
+ if ( l.canonicalType() != r.canonicalType() )
+ return false;
+ int c = compareElementValues(l, r);
+ if ( c < -1 ) c = -1;
+ if ( c > 1 ) c = 1;
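+ // GT/GTE/LT/LTE are encoded as bit masks (less-than: 0x1, equal: 0x2,
+ // greater-than: 0x4; e.g. LTE == 0x3), so mapping c in {-1,0,1} to the
+ // corresponding bit and AND-ing against op tests the requested relation.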
+ int z = 1 << (c+1);
+ return (op & z);
+ }
+
+ int Matcher::inverseMatch(const char *fieldName, const BSONElement &toMatch, const BSONObj &obj, const ElementMatcher& bm , MatchDetails * details ) const {
+ int inverseRet = matchesDotted( fieldName, toMatch, obj, bm.inverseOfNegativeCompareOp(), bm , false , details );
+ if ( bm.negativeCompareOpContainsNull() ) {
+ return ( inverseRet <= 0 ) ? 1 : 0;
+ }
+ return -inverseRet;
+ }
+
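+ // for $exists, a field that was found matches iff the $exists spec is
+ // true; the missing-field case is handled by negating this in matches().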
+ int retExistsFound( const ElementMatcher &bm ) {
+ return bm._toMatch.trueValue() ? 1 : -1;
+ }
+
+ /* Check if a particular field matches.
+
+ fieldName - field to match "a.b" if we are reaching into an embedded object.
+ toMatch - element we want to match.
+ obj - database object to check against
+ compareOp - Equality, LT, GT, etc. This may differ from, and supersedes, the compare op in em.
+ isArr - true if obj is an array encountered while resolving fieldName
+
+ Special forms:
+
+ { "a.b" : 3 } means obj.a.b == 3
+ { a : { $lt : 3 } } means obj.a < 3
+ { a : { $in : [1,2] } } means [1,2].contains(obj.a)
+
+ return value
+ -1 mismatch
+ 0 missing element
+ 1 match
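+
+ Note that arrays are expanded while matching, so for example
+ { "a.b" : 3 } also matches { a : [ { b : 1 }, { b : 3 } ] }
+ (see the isArr handling below).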
+ */
+ int Matcher::matchesDotted(const char *fieldName, const BSONElement& toMatch, const BSONObj& obj, int compareOp, const ElementMatcher& em , bool isArr, MatchDetails * details ) const {
+ DEBUGMATCHER( "\t matchesDotted : " << fieldName << " hasDetails: " << ( details ? "yes" : "no" ) );
+
+ if ( compareOp == BSONObj::opALL ) {
+
+ if ( em._allMatchers.size() ) {
+ // $all query matching will not be performed against indexes, so the field
+ // to match is always extracted from the full document.
+ BSONElement e = obj.getFieldDotted( fieldName );
+ // The $all/$elemMatch operator only matches arrays.
+ if ( e.type() != Array ) {
+ return -1;
+ }
+
+ for ( unsigned i=0; i<em._allMatchers.size(); i++ ) {
+ bool found = false;
+ BSONObjIterator x( e.embeddedObject() );
+ while ( x.more() ) {
+ BSONElement f = x.next();
+
+ if ( f.type() != Object )
+ continue;
+ if ( em._allMatchers[i]->matches( f.embeddedObject() ) ) {
+ found = true;
+ break;
+ }
+ }
+
+ if ( ! found )
+ return -1;
+ }
+
+ return 1;
+ }
+
+ if ( em._myset->size() == 0 && !em._myregex.get() )
+ return -1; // is this desired?
+
+ BSONElementSet myValues;
+ obj.getFieldsDotted( fieldName , myValues );
+
+ for( set< BSONElement, element_lt >::const_iterator i = em._myset->begin(); i != em._myset->end(); ++i ) {
+ // ignore nulls
+ if ( i->type() == jstNULL )
+ continue;
+
+ if ( myValues.count( *i ) == 0 )
+ return -1;
+ }
+
+ if ( !em._myregex.get() )
+ return 1;
+
+ for( vector< RegexMatcher >::const_iterator i = em._myregex->begin(); i != em._myregex->end(); ++i ) {
+ bool match = false;
+ for( BSONElementSet::const_iterator j = myValues.begin(); j != myValues.end(); ++j ) {
+ if ( regexMatches( *i, *j ) ) {
+ match = true;
+ break;
+ }
+ }
+ if ( !match )
+ return -1;
+ }
+
+ return 1;
+ } // end opALL
+
+ if ( compareOp == BSONObj::NE || compareOp == BSONObj::NIN ) {
+ return inverseMatch( fieldName, toMatch, obj, em , details );
+ }
+
+ BSONElement e;
+ bool indexed = !_constrainIndexKey.isEmpty();
+ if ( indexed ) {
+ e = obj.getFieldUsingIndexNames(fieldName, _constrainIndexKey);
+ if( e.eoo() ) {
+ cout << "obj: " << obj << endl;
+ cout << "fieldName: " << fieldName << endl;
+ cout << "_constrainIndexKey: " << _constrainIndexKey << endl;
+ assert( !e.eoo() );
+ }
+ }
+ else {
+
+ const char *p = strchr(fieldName, '.');
+ if ( p ) {
+ string left(fieldName, p-fieldName);
+
+ BSONElement se = obj.getField(left.c_str());
+ if ( se.eoo() )
+ ;
+ else if ( se.type() != Object && se.type() != Array )
+ ;
+ else {
+ BSONObj eo = se.embeddedObject();
+ return matchesDotted(p+1, toMatch, eo, compareOp, em, se.type() == Array , details );
+ }
+ }
+
+ // An array was encountered while scanning for components of the field name.
+ if ( isArr ) {
+ DEBUGMATCHER( "\t\t isArr 1 : obj : " << obj );
+ BSONObjIterator ai(obj);
+ bool found = false;
+ while ( ai.moreWithEOO() ) {
+ BSONElement z = ai.next();
+
+ if( strcmp(z.fieldName(),fieldName) == 0 ) {
+ if ( compareOp == BSONObj::opEXISTS ) {
+ return retExistsFound( em );
+ }
+ if (valuesMatch(z, toMatch, compareOp, em) ) {
+ // "field.<n>" array notation was used
+ if ( details )
+ details->_elemMatchKey = z.fieldName();
+ return 1;
+ }
+ }
+
+ if ( z.type() == Object ) {
+ BSONObj eo = z.embeddedObject();
+ int cmp = matchesDotted(fieldName, toMatch, eo, compareOp, em, false, details );
+ if ( cmp > 0 ) {
+ if ( details )
+ details->_elemMatchKey = z.fieldName();
+ return 1;
+ }
+ else if ( cmp < 0 ) {
+ found = true;
+ }
+ }
+ }
+ return found ? -1 : 0;
+ }
+
+ if( p ) {
+ // Left portion of field name was not found or wrong type.
+ return 0;
+ }
+ else {
+ e = obj.getField(fieldName);
+ }
+ }
+
+ if ( compareOp == BSONObj::opEXISTS ) {
+ if( e.eoo() ) {
+ return 0;
+ } else {
+ return retExistsFound( em );
+ }
+ }
+ else if ( ( e.type() != Array || indexed || compareOp == BSONObj::opSIZE ) &&
+ valuesMatch(e, toMatch, compareOp, em ) ) {
+ return 1;
+ }
+ else if ( e.type() == Array && compareOp != BSONObj::opSIZE ) {
+ BSONObjIterator ai(e.embeddedObject());
+
+ while ( ai.moreWithEOO() ) {
+ BSONElement z = ai.next();
+
+ if ( compareOp == BSONObj::opELEM_MATCH ) {
+ if ( z.type() == Object ) {
+ if ( em._subMatcher->matches( z.embeddedObject() ) ) {
+ if ( details )
+ details->_elemMatchKey = z.fieldName();
+ return 1;
+ }
+ }
+ else if ( em._subMatcherOnPrimitives ) {
+ if ( z.type() && em._subMatcher->matches( z.wrap( "" ) ) ) {
+ if ( details )
+ details->_elemMatchKey = z.fieldName();
+ return 1;
+ }
+ }
+ }
+ else {
+ if ( valuesMatch( z, toMatch, compareOp, em) ) {
+ if ( details )
+ details->_elemMatchKey = z.fieldName();
+ return 1;
+ }
+ }
+
+ }
+
+ // match an entire array to itself
+ if ( compareOp == BSONObj::Equality && e.woCompare( toMatch , false ) == 0 ) {
+ return 1;
+ }
+ if ( compareOp == BSONObj::opIN && valuesMatch( e, toMatch, compareOp, em ) ) {
+ return 1;
+ }
+ }
+ else if ( e.eoo() ) {
+ return 0;
+ }
+ return -1;
+ }
+
+ extern int dump;
+
+ /* See if an object matches the query.
+ */
+ bool Matcher::matches(const BSONObj& jsobj , MatchDetails * details ) const {
+ LOG(5) << "Matcher::matches() " << jsobj.toString() << endl;
+
+ /* assuming there is usually only one thing to match. if more this
+ could be slow sometimes. */
+
+ // check normal non-regex cases:
+ for ( unsigned i = 0; i < _basics.size(); i++ ) {
+ const ElementMatcher& bm = _basics[i];
+ const BSONElement& m = bm._toMatch;
+ // -1=mismatch. 0=missing element. 1=match
+ int cmp = matchesDotted(m.fieldName(), m, jsobj, bm._compareOp, bm , false , details );
+ if ( cmp == 0 && bm._compareOp == BSONObj::opEXISTS ) {
+ // If missing, match cmp is opposite of $exists spec.
+ cmp = -retExistsFound(bm);
+ }
+ if ( bm._isNot )
+ cmp = -cmp;
+ if ( cmp < 0 )
+ return false;
+ if ( cmp == 0 ) {
+ /* missing is ok iff we were looking for null */
+ if ( m.type() == jstNULL || m.type() == Undefined ||
+ ( ( bm._compareOp == BSONObj::opIN || bm._compareOp == BSONObj::NIN ) && bm._myset->count( staticNull.firstElement() ) > 0 ) ) {
+ if ( bm.negativeCompareOp() ^ bm._isNot ) {
+ return false;
+ }
+ }
+ else {
+ if ( !bm._isNot ) {
+ return false;
+ }
+ }
+ }
+ }
+
+ for (vector<RegexMatcher>::const_iterator it = _regexs.begin();
+ it != _regexs.end();
+ ++it) {
+ BSONElementSet s;
+ if ( !_constrainIndexKey.isEmpty() ) {
+ BSONElement e = jsobj.getFieldUsingIndexNames(it->_fieldName, _constrainIndexKey);
+
+ // Should only have keys nested one deep here, for geo-indices
+ // TODO: future indices may nest deeper?
+ if( e.type() == Array ){
+ BSONObjIterator i( e.Obj() );
+ while( i.more() ){
+ s.insert( i.next() );
+ }
+ }
+ else if ( !e.eoo() )
+ s.insert( e );
+
+ }
+ else {
+ jsobj.getFieldsDotted( it->_fieldName, s );
+ }
+ bool match = false;
+ for( BSONElementSet::const_iterator i = s.begin(); i != s.end(); ++i )
+ if ( regexMatches(*it, *i) )
+ match = true;
+ if ( !match ^ it->_isNot )
+ return false;
+ }
+
+ if ( _orDedupConstraints.size() > 0 ) {
+ for( vector< shared_ptr< FieldRangeVector > >::const_iterator i = _orDedupConstraints.begin();
+ i != _orDedupConstraints.end(); ++i ) {
+ if ( (*i)->matches( jsobj ) ) {
+ return false;
+ }
+ }
+ }
+
+ if ( _andMatchers.size() > 0 ) {
+ for( list< shared_ptr< Matcher > >::const_iterator i = _andMatchers.begin();
+ i != _andMatchers.end(); ++i ) {
+ // SERVER-3192 Track field matched using details the same as for
+ // top level fields, at least for now.
+ if ( !(*i)->matches( jsobj, details ) ) {
+ return false;
+ }
+ }
+ }
+
+ if ( _orMatchers.size() > 0 ) {
+ bool match = false;
+ for( list< shared_ptr< Matcher > >::const_iterator i = _orMatchers.begin();
+ i != _orMatchers.end(); ++i ) {
+ // SERVER-205 don't submit details - we don't want to track field
+ // matched within $or
+ if ( (*i)->matches( jsobj ) ) {
+ match = true;
+ break;
+ }
+ }
+ if ( !match ) {
+ return false;
+ }
+ }
+
+ if ( _norMatchers.size() > 0 ) {
+ for( list< shared_ptr< Matcher > >::const_iterator i = _norMatchers.begin();
+ i != _norMatchers.end(); ++i ) {
+ // SERVER-205 don't submit details - we don't want to track field
+ // matched within $nor
+ if ( (*i)->matches( jsobj ) ) {
+ return false;
+ }
+ }
+ }
+
+ if ( _where ) {
+ if ( _where->func == 0 ) {
+ uassert( 10070 , "$where compile error", false);
+ return false; // didn't compile
+ }
+
+ if ( _where->jsScope ) {
+ _where->scope->init( _where->jsScope );
+ }
+ _where->scope->setObject( "obj", const_cast< BSONObj & >( jsobj ) );
+ _where->scope->setBoolean( "fullObject" , true ); // this is a hack b/c fullObject used to be relevant
+
+ int err = _where->scope->invoke( _where->func , 0, &jsobj , 1000 * 60 , false );
+ if ( err == -3 ) { // INVOKE_ERROR
+ stringstream ss;
+ ss << "error on invocation of $where function:\n"
+ << _where->scope->getError();
+ uassert( 10071 , ss.str(), false);
+ return false;
+ }
+ else if ( err != 0 ) { // ! INVOKE_SUCCESS
+ uassert( 10072 , "unknown error in invocation of $where function", false);
+ return false;
+ }
+ return _where->scope->getBoolean( "return" ) != 0;
+
+ }
+
+ return true;
+ }
+
+ bool Matcher::keyMatch( const Matcher &docMatcher ) const {
+ // Quick check certain non key match cases.
+ if ( docMatcher._all
+ || docMatcher._haveSize
+ || docMatcher._hasArray // We can't match an array to its first indexed element using keymatch
+ || docMatcher._haveNeg ) {
+ return false;
+ }
+
+ // Check that all match components are available in the index matcher.
+ if ( !( _basics.size() == docMatcher._basics.size() && _regexs.size() == docMatcher._regexs.size() && !docMatcher._where ) ) {
+ return false;
+ }
+ if ( _andMatchers.size() != docMatcher._andMatchers.size() ) {
+ return false;
+ }
+ if ( _orMatchers.size() != docMatcher._orMatchers.size() ) {
+ return false;
+ }
+ if ( docMatcher._norMatchers.size() > 0 ) {
+ return false;
+ }
+ if ( docMatcher._orDedupConstraints.size() > 0 ) {
+ return false;
+ }
+
+ // Recursively check that all submatchers support key match.
+ {
+ list< shared_ptr< Matcher > >::const_iterator i = _andMatchers.begin();
+ list< shared_ptr< Matcher > >::const_iterator j = docMatcher._andMatchers.begin();
+ while( i != _andMatchers.end() ) {
+ if ( !(*i)->keyMatch( **j ) ) {
+ return false;
+ }
+ ++i; ++j;
+ }
+ }
+ {
+ list< shared_ptr< Matcher > >::const_iterator i = _orMatchers.begin();
+ list< shared_ptr< Matcher > >::const_iterator j = docMatcher._orMatchers.begin();
+ while( i != _orMatchers.end() ) {
+ if ( !(*i)->keyMatch( **j ) ) {
+ return false;
+ }
+ ++i; ++j;
+ }
+ }
+ // Nor matchers and or dedup constraints aren't created for index matchers,
+ // so no need to check those here.
+ return true;
+ }
+
+
+ /*- just for testing -- */
+#pragma pack(1)
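+ // hand-built BSON bytes for { abcd: 3.1, abcdef: "123456789" }:
+ // int32 total size, then per element a type byte, a NUL-terminated
+ // field name, and the value; the object ends with an EOO byte.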
+ struct JSObj1 {
+ JSObj1() {
+ totsize=sizeof(JSObj1);
+ n = NumberDouble;
+ strcpy_s(nname, 5, "abcd");
+ N = 3.1;
+ s = String;
+ strcpy_s(sname, 7, "abcdef");
+ slen = 10;
+ strcpy_s(sval, 10, "123456789");
+ eoo = EOO;
+ }
+ unsigned totsize;
+
+ char n;
+ char nname[5];
+ double N;
+
+ char s;
+ char sname[7];
+ unsigned slen;
+ char sval[10];
+
+ char eoo;
+ };
+#pragma pack()
+
+ struct JSObj1 js1;
+
+#pragma pack(1)
+ struct JSObj2 {
+ JSObj2() {
+ totsize=sizeof(JSObj2);
+ s = String;
+ strcpy_s(sname, 7, "abcdef");
+ slen = 10;
+ strcpy_s(sval, 10, "123456789");
+ eoo = EOO;
+ }
+ unsigned totsize;
+ char s;
+ char sname[7];
+ unsigned slen;
+ char sval[10];
+ char eoo;
+ } js2;
+
+ struct JSUnitTest : public UnitTest {
+ void run() {
+
+ BSONObj j1((const char *) &js1);
+ BSONObj j2((const char *) &js2);
+ Matcher m(j2);
+ assert( m.matches(j1) );
+ js2.sval[0] = 'z';
+ assert( !m.matches(j1) );
+ Matcher n(j1);
+ assert( n.matches(j1) );
+ assert( !n.matches(j2) );
+
+ BSONObj j0 = BSONObj();
+// BSONObj j0((const char *) &js0);
+ Matcher p(j0);
+ assert( p.matches(j1) );
+ assert( p.matches(j2) );
+ }
+ } jsunittest;
+
+#pragma pack()
+
+ struct RXTest : public UnitTest {
+
+ RXTest() {
+ }
+
+ void run() {
+ /*
+ static const boost::regex e("(\\d{4}[- ]){3}\\d{4}");
+ static const boost::regex b(".....");
+ out() << "regex result: " << regex_match("hello", e) << endl;
+ out() << "regex result: " << regex_match("abcoo", b) << endl;
+ */
+
+ int ret = 0;
+
+ pcre_config( PCRE_CONFIG_UTF8 , &ret );
+ massert( 10342 , "pcre not compiled with utf8 support" , ret );
+
+ pcrecpp::RE re1(")({a}h.*o");
+ pcrecpp::RE re("h.llo");
+ assert( re.FullMatch("hello") );
+ assert( !re1.FullMatch("hello") );
+
+
+ pcrecpp::RE_Options options;
+ options.set_utf8(true);
+ pcrecpp::RE part("dwi", options);
+ assert( part.PartialMatch("dwight") );
+
+ pcre_config( PCRE_CONFIG_UNICODE_PROPERTIES , &ret );
+ if ( ! ret )
+ cout << "warning: some regex utf8 things will not work. pcre build doesn't have --enable-unicode-properties" << endl;
+
+ }
+ } rxtest;
+
+} // namespace mongo
diff --git a/src/mongo/db/matcher.h b/src/mongo/db/matcher.h
new file mode 100644
index 00000000000..b6994a79229
--- /dev/null
+++ b/src/mongo/db/matcher.h
@@ -0,0 +1,276 @@
+// matcher.h
+
+/* Matcher is our boolean expression evaluator for "where" clauses */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "jsobj.h"
+#include "pcrecpp.h"
+
+namespace mongo {
+
+ class Cursor;
+ class CoveredIndexMatcher;
+ class Matcher;
+ class FieldRangeVector;
+
+ class RegexMatcher {
+ public:
+ const char *_fieldName;
+ const char *_regex;
+ const char *_flags;
+ string _prefix;
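+ // set only when the regex reduces to a simple anchored prefix (see
+ // simpleRegex); regexMatches() then uses strncmp instead of running pcre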
+ shared_ptr< pcrecpp::RE > _re;
+ bool _isNot;
+ RegexMatcher() : _isNot() {}
+ };
+
+ struct element_lt {
+ bool operator()(const BSONElement& l, const BSONElement& r) const {
+ int x = (int) l.canonicalType() - (int) r.canonicalType();
+ if ( x < 0 ) return true;
+ else if ( x > 0 ) return false;
+ return compareElementValues(l,r) < 0;
+ }
+ };
+
+
+ class ElementMatcher {
+ public:
+
+ ElementMatcher() {
+ }
+
+ ElementMatcher( BSONElement e , int op, bool isNot );
+
+ ElementMatcher( BSONElement e , int op , const BSONObj& array, bool isNot );
+
+ ~ElementMatcher() { }
+
+ bool negativeCompareOp() const { return _compareOp == BSONObj::NE || _compareOp == BSONObj::NIN; }
+ int inverseOfNegativeCompareOp() const;
+ bool negativeCompareOpContainsNull() const;
+
+ BSONElement _toMatch;
+ int _compareOp;
+ bool _isNot;
+ shared_ptr< set<BSONElement,element_lt> > _myset;
+ shared_ptr< vector<RegexMatcher> > _myregex;
+
+ // these are for specific operators
+ int _mod;
+ int _modm;
+ BSONType _type;
+
+ shared_ptr<Matcher> _subMatcher;
+ bool _subMatcherOnPrimitives ;
+
+ vector< shared_ptr<Matcher> > _allMatchers;
+ };
+
+ class Where; // used for $where javascript eval
+ class DiskLoc;
+
+ struct MatchDetails {
+ MatchDetails() {
+ reset();
+ }
+
+ void reset() {
+ _loadedObject = false;
+ _elemMatchKey = 0;
+ }
+
+ string toString() const {
+ stringstream ss;
+ ss << "loadedObject: " << _loadedObject << " ";
+ ss << "elemMatchKey: " << ( _elemMatchKey ? _elemMatchKey : "NULL" ) << " ";
+ return ss.str();
+ }
+
+ bool _loadedObject;
+ const char * _elemMatchKey; // warning, this may go out of scope if matched object does
+ };
+
+ /* Match BSON objects against a query pattern.
+
+ e.g.
+ db.foo.find( { a : 3 } );
+
+ { a : 3 } is the pattern object. See wiki documentation for full info.
+
+ GT/LT:
+ { a : { $gt : 3 } }
+ Not equal:
+ { a : { $ne : 3 } }
+
+ TODO: we should rewrite the matcher to be more an AST style.
+ */
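+ /* illustrative usage (a sketch):
+ Matcher m( BSON( "a" << BSON( "$gt" << 3 ) ) );
+ bool b = m.matches( BSON( "a" << 5 ) ); // true
+ */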
+ class Matcher : boost::noncopyable {
+ int matchesDotted(
+ const char *fieldName,
+ const BSONElement& toMatch, const BSONObj& obj,
+ int compareOp, const ElementMatcher& bm, bool isArr , MatchDetails * details ) const;
+
+ /**
+ * Perform a NE or NIN match by returning the inverse of the opposite matching operation.
+ * Missing values are considered matches unless the match must not equal null.
+ */
+ int inverseMatch(
+ const char *fieldName,
+ const BSONElement &toMatch, const BSONObj &obj,
+ const ElementMatcher&bm, MatchDetails * details ) const;
+
+ public:
+ static int opDirection(int op) {
+ return op <= BSONObj::LTE ? -1 : 1;
+ }
+
+ Matcher(const BSONObj &pattern, bool nested=false);
+
+ ~Matcher();
+
+ bool matches(const BSONObj& j, MatchDetails * details = 0 ) const;
+
+ bool atomic() const { return _atomic; }
+
+ string toString() const {
+ return _jsobj.toString();
+ }
+
+ void addOrDedupConstraint( const shared_ptr< FieldRangeVector > &frv ) {
+ _orDedupConstraints.push_back( frv );
+ }
+
+ void popOrClause() {
+ _orMatchers.pop_front();
+ }
+
+ /**
+ * @return true if this key matcher will return the same true/false
+ * value as the provided doc matcher.
+ */
+ bool keyMatch( const Matcher &docMatcher ) const;
+
+ bool singleSimpleCriterion() const {
+ return false; // TODO SERVER-958
+// // TODO Really check, especially if all basics are ok.
+// // $all, etc
+// // _orConstraints?
+// return ( ( basics.size() + nRegex ) < 2 ) && !where && !_orMatchers.size() && !_norMatchers.size();
+ }
+
+ const BSONObj *getQuery() const { return &_jsobj; };
+
+ private:
+ /**
+ * Generate a matcher for the provided index key format using the
+ * provided full doc matcher.
+ */
+ Matcher( const Matcher &docMatcher, const BSONObj &constrainIndexKey );
+
+ void addBasic(const BSONElement &e, int c, bool isNot) {
+ // TODO May want to selectively ignore these element types based on op type.
+ if ( e.type() == MinKey || e.type() == MaxKey )
+ return;
+ _basics.push_back( ElementMatcher( e , c, isNot ) );
+ }
+
+ void addRegex(const char *fieldName, const char *regex, const char *flags, bool isNot = false);
+ bool addOp( const BSONElement &e, const BSONElement &fe, bool isNot, const char *& regex, const char *&flags );
+
+ int valuesMatch(const BSONElement& l, const BSONElement& r, int op, const ElementMatcher& bm) const;
+
+ bool parseClause( const BSONElement &e );
+ void parseExtractedClause( const BSONElement &e, list< shared_ptr< Matcher > > &matchers );
+
+ void parseWhere( const BSONElement &e );
+ void parseMatchExpressionElement( const BSONElement &e, bool nested );
+
+ Where *_where; // set if query uses $where
+ BSONObj _jsobj; // the query pattern. e.g., { name: "joe" }
+ BSONObj _constrainIndexKey;
+ vector<ElementMatcher> _basics;
+ bool _haveSize;
+ bool _all;
+ bool _hasArray;
+ bool _haveNeg;
+
+ /* $atomic - if true, a multi document operation (some removes, updates)
+ should be done atomically. in that case, we do not yield -
+ i.e. we stay locked the whole time.
+ http://www.mongodb.org/display/DOCS/Removing
+ */
+ bool _atomic;
+
+ vector<RegexMatcher> _regexs;
+
+ // so we delete the mem when we're done:
+ vector< shared_ptr< BSONObjBuilder > > _builders;
+ list< shared_ptr< Matcher > > _andMatchers;
+ list< shared_ptr< Matcher > > _orMatchers;
+ list< shared_ptr< Matcher > > _norMatchers;
+ vector< shared_ptr< FieldRangeVector > > _orDedupConstraints;
+
+ friend class CoveredIndexMatcher;
+ };
+
+ // If match succeeds on index key, then attempt to match full document.
+ class CoveredIndexMatcher : boost::noncopyable {
+ public:
+ CoveredIndexMatcher(const BSONObj &pattern, const BSONObj &indexKeyPattern , bool alwaysUseRecord=false );
+ bool matches(const BSONObj &o) { return _docMatcher->matches( o ); }
+ bool matchesWithSingleKeyIndex(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details = 0 ) {
+ return matches( key, recLoc, details, true );
+ }
+ /**
+ * This is the preferred method for matching against a cursor, as it
+ * can handle both multi and single key cursors.
+ */
+ bool matchesCurrent( Cursor * cursor , MatchDetails * details = 0 );
+ bool needRecord() { return _needRecord; }
+
+ Matcher& docMatcher() { return *_docMatcher; }
+
+ // once this is called, shouldn't use this matcher for matching any more
+ void advanceOrClause( const shared_ptr< FieldRangeVector > &frv ) {
+ _docMatcher->addOrDedupConstraint( frv );
+ // TODO this is not yet optimal. Since we could skip an entire
+ // or clause (if a match is impossible) between calls to advanceOrClause()
+ // we may not pop all the clauses we can.
+ _docMatcher->popOrClause();
+ }
+
+ CoveredIndexMatcher *nextClauseMatcher( const BSONObj &indexKeyPattern, bool alwaysUseRecord=false ) {
+ return new CoveredIndexMatcher( _docMatcher, indexKeyPattern, alwaysUseRecord );
+ }
+
+ string toString() const;
+
+ private:
+ bool matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details = 0 , bool keyUsable = true );
+ CoveredIndexMatcher(const shared_ptr< Matcher > &docMatcher, const BSONObj &indexKeyPattern , bool alwaysUseRecord=false );
+ void init( bool alwaysUseRecord );
+ shared_ptr< Matcher > _docMatcher;
+ Matcher _keyMatcher;
+
+ bool _needRecord; // if the key itself isn't good enough to determine a positive match
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/matcher_covered.cpp b/src/mongo/db/matcher_covered.cpp
new file mode 100644
index 00000000000..c6c89d03007
--- /dev/null
+++ b/src/mongo/db/matcher_covered.cpp
@@ -0,0 +1,101 @@
+// matcher_covered.cpp
+
+/* Matcher is our boolean expression evaluator for "where" clauses */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "matcher.h"
+#include "../util/goodies.h"
+#include "../util/unittest.h"
+#include "diskloc.h"
+#include "../scripting/engine.h"
+#include "db.h"
+#include "client.h"
+
+#include "pdfile.h"
+
+namespace mongo {
+
+ CoveredIndexMatcher::CoveredIndexMatcher( const BSONObj &jsobj, const BSONObj &indexKeyPattern, bool alwaysUseRecord) :
+ _docMatcher( new Matcher( jsobj ) ),
+ _keyMatcher( *_docMatcher, indexKeyPattern ) {
+ init( alwaysUseRecord );
+ }
+
+ CoveredIndexMatcher::CoveredIndexMatcher( const shared_ptr< Matcher > &docMatcher, const BSONObj &indexKeyPattern , bool alwaysUseRecord ) :
+ _docMatcher( docMatcher ),
+ _keyMatcher( *_docMatcher, indexKeyPattern ) {
+ init( alwaysUseRecord );
+ }
+
+ void CoveredIndexMatcher::init( bool alwaysUseRecord ) {
+ _needRecord =
+ alwaysUseRecord ||
+ !_keyMatcher.keyMatch( *_docMatcher );
+ }
+
+ bool CoveredIndexMatcher::matchesCurrent( Cursor * cursor , MatchDetails * details ) {
+ // bool keyUsable = ! cursor->isMultiKey() && check for $orish like conditions in matcher SERVER-1264
+ return matches( cursor->currKey() , cursor->currLoc() , details ,
+ !cursor->indexKeyPattern().isEmpty() // key unusable if the cursor is unindexed...
+ && !cursor->isMultiKey() // ...or multikey
+ );
+ }
+
+ bool CoveredIndexMatcher::matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details , bool keyUsable ) {
+
+ LOG(5) << "CoveredIndexMatcher::matches() " << key.toString() << ' ' << recLoc.toString() << ' ' << keyUsable << endl;
+
+ dassert( key.isValid() );
+
+ if ( details )
+ details->reset();
+
+ if ( keyUsable ) {
+ if ( !_keyMatcher.matches(key, details ) ) {
+ return false;
+ }
+ if ( ! _needRecord ) {
+ return true;
+ }
+ }
+
+ if ( details )
+ details->_loadedObject = true;
+
+ bool res = _docMatcher->matches(recLoc.obj() , details );
+ LOG(5) << "CoveredIndexMatcher _docMatcher->matches() returns " << res << endl;
+ return res;
+ }
+
+ string CoveredIndexMatcher::toString() const {
+ StringBuilder buf;
+ buf << "(CoveredIndexMatcher ";
+
+ if ( _needRecord )
+ buf << "needRecord ";
+
+ buf << "keyMatcher: " << _keyMatcher.toString() << " ";
+
+ if ( _docMatcher )
+ buf << "docMatcher: " << _docMatcher->toString() << " ";
+
+ buf << ")";
+ return buf.str();
+ }
+}
diff --git a/src/mongo/db/minilex.h b/src/mongo/db/minilex.h
new file mode 100644
index 00000000000..677514aa47c
--- /dev/null
+++ b/src/mongo/db/minilex.h
@@ -0,0 +1,164 @@
+// minilex.h
+// mini js lexical analyzer. idea is to be dumb and fast.
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#error does anything use this?
+
+namespace mongo {
+
+#if defined(_WIN32)
+
+} // namespace mongo
+
+#include <hash_map>
+using namespace stdext;
+
+namespace mongo {
+
+ typedef const char * MyStr;
+ struct less_str {
+ bool operator()(const MyStr & x, const MyStr & y) const {
+ if ( strcmp(x, y) > 0)
+ return true;
+
+ return false;
+ }
+ };
+
+ typedef hash_map<const char*, int, hash_compare<const char *, less_str> > strhashmap;
+
+#else
+
+} // namespace mongo
+
+#include <ext/hash_map>
+
+namespace mongo {
+
+ using namespace __gnu_cxx;
+
+ typedef const char * MyStr;
+ struct eq_str {
+ bool operator()(const MyStr & x, const MyStr & y) const {
+ if ( strcmp(x, y) == 0)
+ return true;
+
+ return false;
+ }
+ };
+
+ typedef hash_map<const char*, int, hash<const char *>, eq_str > strhashmap;
+
+#endif
+
+ /*
+ struct MiniLexNotUsed {
+ strhashmap reserved;
+ bool ic[256]; // ic=Identifier Character
+ bool starter[256];
+
+ // dm: very dumb about comments and escaped quotes -- but at least we are fast,
+ // albeit returning too much (which is ok for jsbobj current usage).
+ void grabVariables(char *code , strhashmap& vars) { // 'code' modified and must stay in scope
+ char *p = code;
+ char last = 0;
+ while ( *p ) {
+ if ( starter[*p] ) {
+ char *q = p+1;
+ while ( *q && ic[*q] ) q++;
+ const char *identifier = p;
+ bool done = *q == 0;
+ *q = 0;
+ if ( !reserved.count(identifier) ) {
+ // we try to be smart about 'obj' but have to be careful as obj.obj
+ // can happen; this is so that nFields is right for simplistic where cases
+ // so we can stop scanning in jsobj when we find the field of interest.
+ if ( strcmp(identifier,"obj")==0 && p>code && p[-1] != '.' )
+ ;
+ else
+ vars[identifier] = 1;
+ }
+ if ( done )
+ break;
+ p = q + 1;
+ continue;
+ }
+
+ if ( *p == '\'' ) {
+ p++;
+ while ( *p && *p != '\'' ) p++;
+ }
+ else if ( *p == '"' ) {
+ p++;
+ while ( *p && *p != '"' ) p++;
+ }
+ p++;
+ }
+}
+
+MiniLex() {
+ strhashmap atest;
+ atest["foo"] = 3;
+ assert( atest.count("bar") == 0 );
+ assert( atest.count("foo") == 1 );
+ assert( atest["foo"] == 3 );
+
+ for ( int i = 0; i < 256; i++ ) {
+ ic[i] = starter[i] = false;
+ }
+ for ( int i = 'a'; i <= 'z'; i++ )
+ ic[i] = starter[i] = true;
+ for ( int i = 'A'; i <= 'Z'; i++ )
+ ic[i] = starter[i] = true;
+ for ( int i = '0'; i <= '9'; i++ )
+ ic[i] = true;
+ for ( int i = 128; i < 256; i++ )
+ ic[i] = starter[i] = true;
+ ic['$'] = starter['$'] = true;
+ ic['_'] = starter['_'] = true;
+
+ reserved["break"] = true;
+ reserved["case"] = true;
+ reserved["catch"] = true;
+ reserved["continue"] = true;
+ reserved["default"] = true;
+ reserved["delete"] = true;
+ reserved["do"] = true;
+ reserved["else"] = true;
+ reserved["finally"] = true;
+ reserved["for"] = true;
+ reserved["function"] = true;
+ reserved["if"] = true;
+ reserved["in"] = true;
+ reserved["instanceof"] = true;
+ reserved["new"] = true;
+ reserved["return"] = true;
+ reserved["switch"] = true;
+ reserved["this"] = true;
+ reserved["throw"] = true;
+ reserved["try"] = true;
+ reserved["typeof"] = true;
+ reserved["var"] = true;
+ reserved["void"] = true;
+ reserved["while"] = true;
+ reserved["with "] = true;
+}
+};
+*/
+
+} // namespace mongo
diff --git a/src/mongo/db/module.cpp b/src/mongo/db/module.cpp
new file mode 100644
index 00000000000..4269c5e99a0
--- /dev/null
+++ b/src/mongo/db/module.cpp
@@ -0,0 +1,68 @@
+// module.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "pch.h"
+#include "module.h"
+
+namespace mongo {
+
+ std::list<Module*> * Module::_all;
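+ // _all is heap allocated lazily (see the constructor) rather than being a
+ // static object, so Module instances constructed before this file's
+ // statics are initialized can still register themselves safely.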
+
+ Module::Module( const string& name )
+ : _name( name ) , _options( (string)"Module " + name + " options" ) {
+ if ( ! _all )
+ _all = new list<Module*>();
+ _all->push_back( this );
+ }
+
+ Module::~Module() {}
+
+ void Module::addOptions( boost::program_options::options_description& options ) {
+ if ( ! _all ) {
+ return;
+ }
+ for ( list<Module*>::iterator i=_all->begin(); i!=_all->end(); i++ ) {
+ Module* m = *i;
+ options.add( m->_options );
+ }
+ }
+
+ void Module::configAll( boost::program_options::variables_map& params ) {
+ if ( ! _all ) {
+ return;
+ }
+ for ( list<Module*>::iterator i=_all->begin(); i!=_all->end(); i++ ) {
+ Module* m = *i;
+ m->config( params );
+ }
+
+ }
+
+
+ void Module::initAll() {
+ if ( ! _all ) {
+ return;
+ }
+ for ( list<Module*>::iterator i=_all->begin(); i!=_all->end(); i++ ) {
+ Module* m = *i;
+ m->init();
+ }
+
+ }
+
+}
diff --git a/src/mongo/db/module.h b/src/mongo/db/module.h
new file mode 100644
index 00000000000..71f276e0585
--- /dev/null
+++ b/src/mongo/db/module.h
@@ -0,0 +1,70 @@
+// module.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include <boost/program_options.hpp>
+#include <list>
+
+namespace mongo {
+
+ /**
+ * Module is the base class for adding modules to MongoDB
+ * modules allow adding hooks and features to mongo
+ * the idea is to add hooks into the main code for module support where needed
+ * some ideas are: monitoring, indexes, full text search
+ */
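+ /* a minimal module sketch (illustrative only; HelloModule is hypothetical):
+ class HelloModule : public Module {
+ public:
+ HelloModule() : Module( "hello" ) {}
+ virtual void config( boost::program_options::variables_map& params ) {}
+ virtual void init() {}
+ virtual void shutdown() {}
+ } helloModule; // a static instance self-registers via Module's constructor
+ */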
+ class Module {
+ public:
+ Module( const string& name );
+ virtual ~Module();
+
+ boost::program_options::options_description_easy_init add_options() {
+ return _options.add_options();
+ }
+
+ /**
+ * read config from command line
+ */
+ virtual void config( boost::program_options::variables_map& params ) = 0;
+
+ /**
+ * called after configuration when the server is ready start
+ */
+ virtual void init() = 0;
+
+ /**
+ * called when the database is about to shutdown
+ */
+ virtual void shutdown() = 0;
+
+ const string& getName() { return _name; }
+
+ // --- static things
+
+ static void addOptions( boost::program_options::options_description& options );
+ static void configAll( boost::program_options::variables_map& params );
+ static void initAll();
+
+ private:
+ static std::list<Module*> * _all;
+ string _name;
+ boost::program_options::options_description _options;
+ };
+}
diff --git a/src/mongo/db/modules/mms.cpp b/src/mongo/db/modules/mms.cpp
new file mode 100644
index 00000000000..418a553f283
--- /dev/null
+++ b/src/mongo/db/modules/mms.cpp
@@ -0,0 +1,170 @@
+// @file mms.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "pch.h"
+#include "../db.h"
+#include "../instance.h"
+#include "../module.h"
+#include "../../util/net/httpclient.h"
+#include "../../util/background.h"
+#include "../commands.h"
+
+namespace po = boost::program_options;
+
+namespace mongo {
+
+ /** Mongo Monitoring Service
+ if enabled, this runs in the background and pings mms
+ */
+ class MMS : public BackgroundJob , Module {
+ public:
+
+ MMS()
+ : Module( "mms" ) , _baseurl( "" ) ,
+ _secsToSleep(1) , _token( "" ) , _name( "" ) {
+
+ add_options()
+ ( "mms-url" , po::value<string>()->default_value("http://mms.10gen.com/ping") , "url for mongo monitoring server" )
+ ( "mms-token" , po::value<string>() , "account token for mongo monitoring server" )
+ ( "mms-name" , po::value<string>() , "server name for mongo monitoring server" )
+ ( "mms-interval" , po::value<int>()->default_value(30) , "ping interval (in seconds) for mongo monitoring server" )
+ ;
+ }
+
+ ~MMS() {}
+
+ void config( boost::program_options::variables_map& params ) {
+ _baseurl = params["mms-url"].as<string>();
+ if ( params.count( "mms-token" ) ) {
+ _token = params["mms-token"].as<string>();
+ }
+ if ( params.count( "mms-name" ) ) {
+ _name = params["mms-name"].as<string>();
+ }
+ _secsToSleep = params["mms-interval"].as<int>();
+ }
+
+ void run() {
+ if ( _token.size() == 0 && _name.size() == 0 ) {
+ log(1) << "mms not configured" << endl;
+ return;
+ }
+
+ if ( _token.size() == 0 ) {
+ log() << "no token for mms - not running" << endl;
+ return;
+ }
+
+ if ( _name.size() == 0 ) {
+ log() << "no name for mms - not running" << endl;
+ return;
+ }
+
+ log() << "mms monitor staring... token:" << _token << " name:" << _name << " interval: " << _secsToSleep << endl;
+ Client::initThread( "mms" );
+ Client& c = cc();
+
+
+ // TODO: using direct client is bad, but easy for now
+
+ while ( ! inShutdown() ) {
+ sleepsecs( _secsToSleep );
+
+ try {
+ stringstream url;
+ url << _baseurl << "?"
+ << "token=" << _token << "&"
+ << "name=" << _name << "&"
+ << "ts=" << time(0)
+ ;
+
+ BSONObjBuilder bb;
+ // duplicated so the post has everything
+ bb.append( "token" , _token );
+ bb.append( "name" , _name );
+ bb.appendDate( "ts" , jsTime() );
+
+ // any commands
+ _add( bb , "buildinfo" );
+ _add( bb , "serverStatus" );
+
+ BSONObj postData = bb.obj();
+
+ log(1) << "mms url: " << url.str() << "\n\t post: " << postData << endl;;
+
+ HttpClient c;
+ HttpClient::Result r;
+ int rc = c.post( url.str() , postData.jsonString() , &r );
+ log(1) << "\t response code: " << rc << endl;
+ if ( rc != 200 ) {
+ log() << "mms error response code:" << rc << endl;
+ log(1) << "mms error body:" << r.getEntireResponse() << endl;
+ }
+ }
+ catch ( std::exception& e ) {
+ log() << "mms exception: " << e.what() << endl;
+ }
+ }
+
+ c.shutdown();
+ }
+
+ void _add( BSONObjBuilder& postData , const char* cmd ) {
+ Command * c = Command::findCommand( cmd );
+ if ( ! c ) {
+ log() << "MMS can't find command: " << cmd << endl;
+ postData.append( cmd , "can't find command" );
+ return;
+ }
+
+ if ( c->locktype() ) {
+ log() << "MMS can only use noLocking commands not: " << cmd << endl;
+ postData.append( cmd , "not noLocking" );
+ return;
+ }
+
+ BSONObj co = BSON( cmd << 1 );
+
+ string errmsg;
+ BSONObjBuilder sub;
+ if ( ! c->run( "admin.$cmd" , co , 0 , errmsg , sub , false ) )
+ postData.append( cmd , errmsg );
+ else
+ postData.append( cmd , sub.obj() );
+ }
+
+
+ void init() { go(); }
+
+ void shutdown() {
+ // TODO
+ }
+
+ private:
+ string _baseurl;
+ int _secsToSleep;
+
+ string _token;
+ string _name;
+
+ } /*mms*/ ;
+
+}
+
+
+
diff --git a/src/mongo/db/mongo.ico b/src/mongo/db/mongo.ico
new file mode 100755
index 00000000000..5258b6e0446
--- /dev/null
+++ b/src/mongo/db/mongo.ico
Binary files differ
diff --git a/src/mongo/db/mongommf.cpp b/src/mongo/db/mongommf.cpp
new file mode 100644
index 00000000000..af2e822404e
--- /dev/null
+++ b/src/mongo/db/mongommf.cpp
@@ -0,0 +1,339 @@
+// @file mongommf.cpp
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* this module adds some of our layers atop memory mapped files - specifically our handling of private views & such
+ if you don't care about journaling/durability (temp sort files & such) use MemoryMappedFile class, not this.
+*/
+
+#include "pch.h"
+#include "cmdline.h"
+#include "mongommf.h"
+#include "dur.h"
+#include "dur_journalformat.h"
+#include "../util/mongoutils/str.h"
+#include "mongomutex.h"
+#include "d_globals.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+#if defined(_WIN32)
+ extern mutex mapViewMutex;
+
+ __declspec(noinline) void makeChunkWritable(size_t chunkno) {
+ scoped_lock lk(mapViewMutex);
+
+ if( writable.get(chunkno) ) // double check lock
+ return;
+
+ // remap all maps in this chunk. common case is a single map, but could have more than one with smallfiles or .ns files
+ size_t chunkStart = chunkno * MemoryMappedFile::ChunkSize;
+ size_t chunkNext = chunkStart + MemoryMappedFile::ChunkSize;
+
+ scoped_lock lk2(privateViews._mutex());
+ map<void*,MongoMMF*>::iterator i = privateViews.finditer_inlock((void*) (chunkNext-1));
+ while( 1 ) {
+ const pair<void*,MongoMMF*> x = *(--i);
+ MongoMMF *mmf = x.second;
+ if( mmf == 0 )
+ break;
+
+ size_t viewStart = (size_t) x.first;
+ size_t viewEnd = (size_t) (viewStart + mmf->length());
+ if( viewEnd <= chunkStart )
+ break;
+
+ size_t protectStart = max(viewStart, chunkStart);
+ dassert(protectStart<chunkNext);
+
+ size_t protectEnd = min(viewEnd, chunkNext);
+ size_t protectSize = protectEnd - protectStart;
+ dassert(protectSize>0&&protectSize<=MemoryMappedFile::ChunkSize);
+
+ DWORD old;
+ bool ok = VirtualProtect((void*)protectStart, protectSize, PAGE_WRITECOPY, &old);
+ if( !ok ) {
+ DWORD e = GetLastError();
+ log() << "VirtualProtect failed (mcw) " << mmf->filename() << ' ' << chunkno << hex << protectStart << ' ' << protectSize << ' ' << errnoWithDescription(e) << endl;
+ assert(false);
+ }
+ }
+
+ writable.set(chunkno);
+ }
+
+ void* MemoryMappedFile::createPrivateMap() {
+ assert( maphandle );
+ scoped_lock lk(mapViewMutex);
+ void *p = MapViewOfFile(maphandle, FILE_MAP_READ, 0, 0, 0);
+ if ( p == 0 ) {
+ DWORD e = GetLastError();
+ log() << "createPrivateMap failed " << filename() << " " <<
+ errnoWithDescription(e) << " filelen:" << len <<
+ ((sizeof(void*) == 4 ) ? " (32 bit build)" : "") <<
+ endl;
+ }
+ else {
+ clearWritableBits(p);
+ views.push_back(p);
+ }
+ return p;
+ }
+
+ void* MemoryMappedFile::remapPrivateView(void *oldPrivateAddr) {
+ d.dbMutex.assertWriteLocked(); // short window where we are unmapped so must be exclusive
+
+ // the mapViewMutex is to assure we get the same address on the remap
+ scoped_lock lk(mapViewMutex);
+
+ clearWritableBits(oldPrivateAddr);
+#if 1
+ // https://jira.mongodb.org/browse/SERVER-2942
+ DWORD old;
+ bool ok = VirtualProtect(oldPrivateAddr, (SIZE_T) len, PAGE_READONLY, &old);
+ if( !ok ) {
+ DWORD e = GetLastError();
+ log() << "VirtualProtect failed in remapPrivateView " << filename() << hex << oldPrivateAddr << ' ' << len << ' ' << errnoWithDescription(e) << endl;
+ assert(false);
+ }
+ return oldPrivateAddr;
+#else
+ if( !UnmapViewOfFile(oldPrivateAddr) ) {
+ DWORD e = GetLastError();
+ log() << "UnMapViewOfFile failed " << filename() << ' ' << errnoWithDescription(e) << endl;
+ assert(false);
+ }
+
+ // we want the new address to be the same as the old address in case things keep pointers around (as namespaceindex does).
+ void *p = MapViewOfFileEx(maphandle, FILE_MAP_READ, 0, 0,
+ /*dwNumberOfBytesToMap 0 means to eof*/0 /*len*/,
+ oldPrivateAddr);
+
+ if ( p == 0 ) {
+ DWORD e = GetLastError();
+ log() << "MapViewOfFileEx failed " << filename() << " " << errnoWithDescription(e) << endl;
+ assert(p);
+ }
+ assert(p == oldPrivateAddr);
+ return p;
+#endif
+ }
+#endif
+
+ void MongoMMF::remapThePrivateView() {
+ assert( cmdLine.dur );
+
+ // todo 1.9 : it turns out we require that we always remap to the same address.
+ // so the remove / add isn't necessary and can be removed
+ privateViews.remove(_view_private);
+ _view_private = remapPrivateView(_view_private);
+ privateViews.add(_view_private, this);
+ }
+
+ /** register view. threadsafe */
+ void PointerToMMF::add(void *view, MongoMMF *f) {
+ assert(view);
+ assert(f);
+ mutex::scoped_lock lk(_m);
+ _views.insert( pair<void*,MongoMMF*>(view,f) );
+ }
+
+ /** de-register view. threadsafe */
+ void PointerToMMF::remove(void *view) {
+ if( view ) {
+ mutex::scoped_lock lk(_m);
+ _views.erase(view);
+ }
+ }
+
+ PointerToMMF::PointerToMMF() : _m("PointerToMMF") {
+#if defined(SIZE_MAX)
+ size_t max = SIZE_MAX;
+#else
+ size_t max = ~((size_t)0);
+#endif
+ assert( max > (size_t) this ); // just checking that no one redef'd SIZE_MAX and that it is sane
+
+ // this way we don't need any boundary checking in _find()
+ _views.insert( pair<void*,MongoMMF*>((void*)0,(MongoMMF*)0) );
+ _views.insert( pair<void*,MongoMMF*>((void*)max,(MongoMMF*)0) );
+ }
+
+ /** underscore version of find is for when you are already locked
+ @param ofs out return our offset in the view
+ @return the MongoMMF to which this pointer belongs
+ */
+ MongoMMF* PointerToMMF::find_inlock(void *p, /*out*/ size_t& ofs) {
+ //
+ // .................memory..........................
+ // v1 p v2
+ // [--------------------] [-------]
+ //
+ // e.g., _find(p) == v1
+ //
+ const pair<void*,MongoMMF*> x = *(--_views.upper_bound(p));
+ MongoMMF *mmf = x.second;
+ if( mmf ) {
+ size_t o = ((char *)p) - ((char*)x.first);
+ if( o < mmf->length() ) {
+ ofs = o;
+ return mmf;
+ }
+ }
+ return 0;
+ }
+
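+ /* Illustrative walk-through (not part of the original source) of how the
+    two sentinel entries inserted by the constructor make the lookup above
+    safe without boundary checks. Suppose one view is registered at
+    address 0x1000 with length 0x100:
+        _views == { 0 -> 0, 0x1000 -> mmf, SIZE_MAX -> 0 }
+        find_inlock(0x1050) : --upper_bound == (0x1000,mmf), ofs 0x50, hit
+        find_inlock(0x2000) : --upper_bound == (0x1000,mmf), ofs >= length, miss
+        find_inlock(0x0500) : --upper_bound == (0,0), null mmf, miss
+    without the sentinels, the decrement could run off either end of the map.
+ */
+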
+ /** find associated MMF object for a given pointer.
+ threadsafe
+ @param ofs out returns offset into the view of the pointer, if found.
+ @return the MongoMMF to which this pointer belongs. null if not found.
+ */
+ MongoMMF* PointerToMMF::find(void *p, /*out*/ size_t& ofs) {
+ mutex::scoped_lock lk(_m);
+ return find_inlock(p, ofs);
+ }
+
+ PointerToMMF privateViews;
+
+ /* void* MongoMMF::switchToPrivateView(void *readonly_ptr) {
+ assert( cmdLine.dur );
+ assert( testIntent );
+
+ void *p = readonly_ptr;
+
+ {
+ size_t ofs=0;
+ MongoMMF *mmf = ourReadViews.find(p, ofs);
+ if( mmf ) {
+ void *res = ((char *)mmf->_view_private) + ofs;
+ return res;
+ }
+ }
+
+ {
+ size_t ofs=0;
+ MongoMMF *mmf = privateViews.find(p, ofs);
+ if( mmf ) {
+ log() << "dur: perf warning p=" << p << " is already in the writable view of " << mmf->filename() << endl;
+ return p;
+ }
+ }
+
+ // did you call writing() with a pointer that isn't into a datafile?
+ log() << "dur error switchToPrivateView " << p << endl;
+ return p;
+ }*/
+
+ /* switch to _view_write. normally, this is a bad idea since your changes will not
+ show up in _view_private if there have been changes there; thus the leading underscore
+ as a tad of a "warning". but useful when done with some care, such as during
+ initialization.
+ */
+ void* MongoMMF::_switchToWritableView(void *p) {
+ size_t ofs;
+ MongoMMF *f = privateViews.find(p, ofs);
+ assert( f );
+ return (((char *)f->_view_write)+ofs);
+ }
+
+ extern string dbpath;
+
+ // here so that it is precomputed...
+ void MongoMMF::setPath(string f) {
+ string suffix;
+ string prefix;
+ bool ok = str::rSplitOn(f, '.', prefix, suffix);
+ uassert(13520, str::stream() << "MongoMMF only supports filenames in a certain format " << f, ok);
+ if( suffix == "ns" )
+ _fileSuffixNo = dur::JEntry::DotNsSuffix;
+ else
+ _fileSuffixNo = (int) str::toUnsigned(suffix);
+
+ _p = RelativePath::fromFullPath(prefix);
+ }
+
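+ /* Worked example (illustrative, not in the original source): for
+    setPath("/data/db/foo.3") the rightmost '.' splits the name into
+    prefix "/data/db/foo" and suffix "3", so _fileSuffixNo == 3 and _p
+    becomes the RelativePath derived from the prefix. For
+    "/data/db/foo.ns" the suffix is "ns" and _fileSuffixNo is set to
+    dur::JEntry::DotNsSuffix.
+ */
+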
+ bool MongoMMF::open(string fname, bool sequentialHint) {
+ LOG(3) << "mmf open " << fname << endl;
+ setPath(fname);
+ _view_write = mapWithOptions(fname.c_str(), sequentialHint ? SEQUENTIAL : 0);
+ return finishOpening();
+ }
+
+ bool MongoMMF::create(string fname, unsigned long long& len, bool sequentialHint) {
+ LOG(3) << "mmf create " << fname << endl;
+ setPath(fname);
+ _view_write = map(fname.c_str(), len, sequentialHint ? SEQUENTIAL : 0);
+ return finishOpening();
+ }
+
+ bool MongoMMF::finishOpening() {
+ LOG(3) << "mmf finishOpening " << (void*) _view_write << ' ' << filename() << " len:" << length() << endl;
+ if( _view_write ) {
+ if( cmdLine.dur ) {
+ _view_private = createPrivateMap();
+ if( _view_private == 0 ) {
+ msgasserted(13636, str::stream() << "file " << filename() << " open/create failed in createPrivateMap (look in log for more information)");
+ }
+ privateViews.add(_view_private, this); // note that testIntent builds use this, even though it points to view_write then...
+ }
+ else {
+ _view_private = _view_write;
+ }
+ return true;
+ }
+ return false;
+ }
+
+ MongoMMF::MongoMMF() : _willNeedRemap(false) {
+ _view_write = _view_private = 0;
+ }
+
+ MongoMMF::~MongoMMF() {
+ try {
+ close();
+ }
+ catch(...) { error() << "exception in ~MongoMMF" << endl; }
+ }
+
+ namespace dur {
+ void closingFileNotification();
+ }
+
+ /*virtual*/ void MongoMMF::close() {
+ LOG(3) << "mmf close " << filename() << endl;
+
+ if( view_write() /*actually was opened*/ ) {
+ if( cmdLine.dur ) {
+ dur::closingFileNotification();
+ }
+ if( !d.dbMutex.isWriteLocked() ) {
+ assert( inShutdown() );
+ DEV {
+ log() << "is it really ok to close a mongommf outside a write lock? dbmutex status:" << d.dbMutex.getState() << " file:" << filename() << endl;
+ }
+ }
+ }
+
+ LockMongoFilesExclusive lk;
+ privateViews.remove(_view_private);
+ _view_write = _view_private = 0;
+ MemoryMappedFile::close();
+ }
+
+}
diff --git a/src/mongo/db/mongommf.h b/src/mongo/db/mongommf.h
new file mode 100644
index 00000000000..62a6cdfd3fd
--- /dev/null
+++ b/src/mongo/db/mongommf.h
@@ -0,0 +1,145 @@
+/** @file mongommf.h
+*
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../util/mmap.h"
+#include "../util/paths.h"
+
+namespace mongo {
+
+ /** MongoMMF adds some layers atop memory mapped files - specifically our handling of private views & such.
+ if you don't care about journaling/durability (temp sort files & such) use MemoryMappedFile class,
+ not this.
+ */
+ class MongoMMF : private MemoryMappedFile {
+ protected:
+ virtual void* viewForFlushing() { return _view_write; }
+
+ public:
+ MongoMMF();
+ virtual ~MongoMMF();
+ virtual void close();
+
+ /** @return true if opened ok. */
+ bool open(string fname, bool sequentialHint /*typically we open with this false*/);
+
+ /** @return file length */
+ unsigned long long length() const { return MemoryMappedFile::length(); }
+
+ string filename() const { return MemoryMappedFile::filename(); }
+
+ void flush(bool sync) { MemoryMappedFile::flush(sync); }
+
+ /* Creates the file with the given length if it does not exist; otherwise
+    opens the existing file and returns its length via the passed len reference.
+ @param sequentialHint if true will be sequentially accessed
+ @return true for ok
+ */
+ bool create(string fname, unsigned long long& len, bool sequentialHint);
+
+ /* Get the "standard" view (which is the private one).
+ @return the private view.
+ */
+ void* getView() const { return _view_private; }
+
+ /* Get the "write" view (which is required for writing).
+ @return the write view.
+ */
+ void* view_write() const { return _view_write; }
+
+
+ /* switch to _view_write. normally, this is a bad idea since your changes will not
+ show up in _view_private if there have been changes there; thus the leading underscore
+ as a tad of a "warning". but useful when done with some care, such as during
+ initialization.
+ */
+ static void* _switchToWritableView(void *private_ptr);
+
+ /** for a filename a/b/c.3
+ filePath() is "a/b/c"
+ fileSuffixNo() is 3
+ if the suffix is "ns", fileSuffixNo -1
+ */
+ const RelativePath& relativePath() const {
+ DEV assert( !_p._p.empty() );
+ return _p;
+ }
+
+ int fileSuffixNo() const { return _fileSuffixNo; }
+
+ /** true if we have written.
+ set in PREPLOGBUFFER, it is NOT set immediately on write intent declaration.
+ reset to false in REMAPPRIVATEVIEW
+ */
+ bool& willNeedRemap() { return _willNeedRemap; }
+
+ void remapThePrivateView();
+
+ virtual bool isMongoMMF() { return true; }
+
+ private:
+
+ void *_view_write;
+ void *_view_private;
+ bool _willNeedRemap;
+ RelativePath _p; // e.g. "somepath/dbname"
+ int _fileSuffixNo; // e.g. 3. -1="ns"
+
+ void setPath(string pathAndFileName);
+ bool finishOpening();
+ };
+
+ /** for durability support we want to be able to map pointers to specific MongoMMF objects.
+ */
+ class PointerToMMF : boost::noncopyable {
+ public:
+ PointerToMMF();
+
+ /** register view.
+ threadsafe
+ */
+ void add(void *view, MongoMMF *f);
+
+ /** de-register view.
+ threadsafe
+ */
+ void remove(void *view);
+
+ /** find associated MMF object for a given pointer.
+ threadsafe
+ @param ofs out returns offset into the view of the pointer, if found.
+ @return the MongoMMF to which this pointer belongs. null if not found.
+ */
+ MongoMMF* find(void *p, /*out*/ size_t& ofs);
+
+ /** for doing many finds in a row with one lock operation */
+ mutex& _mutex() { return _m; }
+ MongoMMF* find_inlock(void *p, /*out*/ size_t& ofs);
+
+ map<void*,MongoMMF*>::iterator finditer_inlock(void *p) { return _views.upper_bound(p); }
+
+ unsigned numberOfViews_inlock() const { return _views.size(); }
+
+ private:
+ mutex _m;
+ map<void*, MongoMMF*> _views;
+ };
+
+ // allows a pointer into any private view of a MongoMMF to be resolved to the MongoMMF object
+ extern PointerToMMF privateViews;
+}
diff --git a/src/mongo/db/mongomutex.h b/src/mongo/db/mongomutex.h
new file mode 100644
index 00000000000..08b091cae9c
--- /dev/null
+++ b/src/mongo/db/mongomutex.h
@@ -0,0 +1,388 @@
+// @file mongomutex.h
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* Mutex heirarchy (1 = "leaf")
+ name level
+ Logstream::mutex 1
+ ClientCursor::ccmutex 2
+ dblock 3
+
+ End func name with _inlock to indicate "caller must lock before calling".
+*/
+
+#pragma once
+
+#include "../util/concurrency/rwlock.h"
+#include "../util/mmap.h"
+#include "../util/time_support.h"
+#include "d_globals.h"
+
+namespace mongo {
+
+ class Client;
+ Client* curopWaitingForLock( int type );
+ void curopGotLock(Client*);
+
+ /* mongomutex time stats */
+ class MutexInfo {
+ unsigned long long enter, timeLocked; // microseconds
+ int locked;
+ unsigned long long start; // last as we touch this least often
+ public:
+ MutexInfo() : timeLocked(0) , locked(0) {
+ start = curTimeMicros64();
+ }
+ void entered() {
+ if ( locked == 0 )
+ enter = curTimeMicros64();
+ locked++;
+ assert( locked >= 1 );
+ }
+ void leaving() {
+ locked--;
+ assert( locked >= 0 );
+ if ( locked == 0 )
+ timeLocked += curTimeMicros64() - enter;
+ }
+ int isLocked() const { return locked; }
+ void getTimingInfo(unsigned long long &s, unsigned long long &tl) const {
+ s = start;
+ tl = timeLocked;
+ }
+ unsigned long long getTimeLocked() const { return timeLocked; }
+ };
+
+ /** the 'big lock'. a read/write lock.
+ there is one of these, d.dbMutex.
+
+ generally if you need to declare a mutex use the right primitive class, not this.
+
+ use readlock and writelock classes for scoped locks on this rather than direct
+ manipulation.
+ */
+ class MongoMutex {
+ public:
+ MongoMutex(const char * name);
+
+ /** @return
+ * > 0 write lock
+ * = 0 no lock
+ * < 0 read lock
+ */
+ int getState() const { return _state.get(); }
+
+ bool atLeastReadLocked() const { return _state.get() != 0; }
+ void assertAtLeastReadLocked() const { assert(atLeastReadLocked()); }
+ bool isWriteLocked/*by our thread*/() const { return getState() > 0; }
+ void assertWriteLocked() const {
+ assert( getState() > 0 );
+ DEV assert( !_releasedEarly.get() );
+ }
+
+ // write lock. use the writelock scoped lock class, not this directly.
+ void lock() {
+ if ( _writeLockedAlready() )
+ return;
+
+ _state.set(1);
+
+ curopWaitingForLock( 1 ); // stats
+ _m.lock();
+ MongoFile::markAllWritable(); // for _DEBUG validation -- a no op for release build
+ _acquiredWriteLock();
+ }
+
+ // try write lock
+ bool lock_try( int millis ) {
+ if ( _writeLockedAlready() ) // adjusts _state
+ return true;
+
+ curopWaitingForLock( 1 );
+ bool got = _m.lock_try( millis );
+
+ if ( got ) {
+ _state.set(1);
+ MongoFile::markAllWritable(); // for _DEBUG validation -- a no op for release build
+ _acquiredWriteLock();
+ }
+
+ return got;
+ }
+
+ // un write lock
+ void unlock() {
+ int s = _state.get();
+ if( s > 1 ) {
+ _state.set(s-1); // recursive lock case
+ return;
+ }
+ if( s != 1 ) {
+ if( _releasedEarly.get() ) {
+ _releasedEarly.set(false);
+ return;
+ }
+ massert( 12599, "internal error: attempt to unlock when wasn't in a write lock", false);
+ }
+ _releasingWriteLock();
+ MongoFile::unmarkAllWritable(); // _DEBUG validation
+ _state.set(0);
+ _m.unlock();
+ }
+
+ /* unlock (write lock), and when unlock() is called later,
+ be smart then and don't unlock it again.
+ */
+ void releaseEarly() {
+ assert( getState() == 1 ); // must not be recursive
+ assert( !_releasedEarly.get() );
+ _releasedEarly.set(true);
+ unlock();
+ }
+
+ // read lock. don't call directly, use readlock.
+ void lock_shared() {
+ int s = _state.get();
+ if( s ) {
+ if( s > 0 ) {
+ // already in write lock - just be recursive and stay write locked
+ _state.set(s+1);
+ }
+ else {
+ // already in read lock - recurse
+ _state.set(s-1);
+ }
+ }
+ else {
+ _state.set(-1);
+ Client *c = curopWaitingForLock( -1 );
+ _m.lock_shared();
+ curopGotLock(c);
+ }
+ }
+
+ // try read lock
+ bool lock_shared_try( int millis ) {
+ int s = _state.get();
+ if ( s ) {
+ // we already have a lock, so no need to try
+ lock_shared();
+ return true;
+ }
+
+ /* [dm] should there be
+ Client *c = curopWaitingForLock( 1 );
+ here? i think so. seems to be missing.
+ */
+ bool got = _m.lock_shared_try( millis );
+ if ( got )
+ _state.set(-1);
+ return got;
+ }
+
+ void unlock_shared() {
+ int s = _state.get();
+ if( s > 0 ) {
+ wassert( s > 1 ); /* we must have done a lock write first to have s > 1 */
+ _state.set(s-1);
+ return;
+ }
+ if( s < -1 ) {
+ _state.set(s+1);
+ return;
+ }
+ wassert( s == -1 );
+ _state.set(0);
+ _m.unlock_shared();
+ }
+
+ MutexInfo& info() { return _minfo; }
+
+ private:
+ void lockedExclusively();
+ void unlockingExclusively();
+ void _acquiredWriteLock();
+ void _releasingWriteLock();
+
+ /* @return true if was already write locked. increments recursive lock count. */
+ bool _writeLockedAlready();
+
+ RWLock _m;
+
+ /* > 0 write lock with recurse count
+ < 0 read lock
+ */
+ ThreadLocalValue<int> _state;
+
+ MutexInfo _minfo;
+
+ public:
+ // indicates we need to call dur::REMAPPRIVATEVIEW on the next write lock
+ bool _remapPrivateViewRequested;
+
+ private:
+ /* See the releaseEarly() method.
+ we use a separate TLS value for releasedEarly - that is ok as
+ our normal/common code path, we never even touch it */
+ ThreadLocalValue<bool> _releasedEarly;
+
+ /* this is for fsyncAndLock command. otherwise write lock's greediness will
+ make us block on any attempted write lock behind the fsync's lock.
+ */
+ //volatile bool _blockWrites;
+ };
+
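+ /* Illustrative walk-through (not part of the original source) of the
+    thread-local _state encoding used by MongoMutex:
+        lock();          // _state: 0 -> 1   (exclusive lock acquired)
+        lock();          // _state: 1 -> 2   (recursive write lock)
+        lock_shared();   // _state: 2 -> 3   (read inside write lock stays write)
+        unlock_shared(); // _state: 3 -> 2
+        unlock();        // _state: 2 -> 1
+        unlock();        // _state: 1 -> 0   (really releases _m)
+    and for a pure reader:
+        lock_shared();   // _state: 0 -> -1
+        lock_shared();   // _state: -1 -> -2 (recursive read lock)
+        unlock_shared(); // _state: -2 -> -1
+        unlock_shared(); // _state: -1 -> 0  (really releases _m)
+ */
+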
+ namespace dur {
+ void REMAPPRIVATEVIEW();
+ void releasingWriteLock(); // because it's hard to include dur.h here
+ }
+
+ inline void MongoMutex::_releasingWriteLock() {
+ dur::releasingWriteLock();
+ unlockingExclusively();
+ }
+
+ inline void MongoMutex::_acquiredWriteLock() {
+ lockedExclusively();
+ if( _remapPrivateViewRequested ) {
+ dur::REMAPPRIVATEVIEW();
+ dassert( !_remapPrivateViewRequested );
+ }
+ }
+
+ string sayClientState();
+
+ /* @return true if was already write locked. increments recursive lock count. */
+ inline bool MongoMutex::_writeLockedAlready() {
+ int s = _state.get();
+ if( s > 0 ) {
+ _state.set(s+1);
+ return true;
+ }
+ massert( 10293 , string("internal error: locks are not upgradeable: ") + sayClientState() , s == 0 );
+ return false;
+ }
+
+ struct writelock {
+ writelock() { d.dbMutex.lock(); }
+ writelock(const string& ns) { d.dbMutex.lock(); }
+ ~writelock() {
+ DESTRUCTOR_GUARD(
+ d.dbMutex.unlock();
+ );
+ }
+ };
+
+ struct readlock {
+ readlock(const string& ns) {
+ d.dbMutex.lock_shared();
+ }
+ readlock() { d.dbMutex.lock_shared(); }
+ ~readlock() {
+ DESTRUCTOR_GUARD(
+ d.dbMutex.unlock_shared();
+ );
+ }
+ };
+ struct readlocktry {
+ readlocktry( const string&ns , int tryms ) {
+ _got = d.dbMutex.lock_shared_try( tryms );
+ }
+ ~readlocktry() {
+ if ( _got ) {
+ d.dbMutex.unlock_shared();
+ }
+ }
+ bool got() const { return _got; }
+ private:
+ bool _got;
+ };
+
+ struct writelocktry {
+ writelocktry( const string&ns , int tryms ) {
+ _got = d.dbMutex.lock_try( tryms );
+ }
+ ~writelocktry() {
+ if ( _got ) {
+ d.dbMutex.unlock();
+ }
+ }
+ bool got() const { return _got; }
+ private:
+ bool _got;
+ };
+
+ struct readlocktryassert : public readlocktry {
+ readlocktryassert(const string& ns, int tryms) :
+ readlocktry(ns,tryms) {
+ uassert(13142, "timeout getting readlock", got());
+ }
+ };
+
+ /** assure we have at least a read lock - the key point being that
+    if you already have a write lock, that's ok too.
+ */
+ struct atleastreadlock {
+ atleastreadlock( const string& ns = "" ) {
+ _prev = d.dbMutex.getState();
+ if ( _prev == 0 )
+ d.dbMutex.lock_shared();
+ }
+ ~atleastreadlock() {
+ if ( _prev == 0 )
+ d.dbMutex.unlock_shared();
+ }
+ private:
+ int _prev;
+ };
+
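+ /* Usage sketch (illustrative; touchNs/readNs are hypothetical callers,
+    not part of the original source):
+        void touchNs(const string& ns) {
+            writelock lk(ns);         // scoped write lock on d.dbMutex
+            // ... mutate data; released by ~writelock even if we throw
+        }
+        void readNs(const string& ns) {
+            readlocktry lk(ns, 1000); // wait at most 1000 ms
+            if( !lk.got() )
+                return;               // timed out -- must not touch data
+            // ... read data
+        }
+ */
+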
+ /* parameterized choice of read or write locking
+ use readlock and writelock instead of this when statically known which you want
+ */
+ class mongolock {
+ bool _writelock;
+ public:
+ mongolock(bool write) : _writelock(write) {
+ if( _writelock ) {
+ d.dbMutex.lock();
+ }
+ else
+ d.dbMutex.lock_shared();
+ }
+ ~mongolock() {
+ DESTRUCTOR_GUARD(
+ if( _writelock ) {
+ d.dbMutex.unlock();
+ }
+ else {
+ d.dbMutex.unlock_shared();
+ }
+ );
+ }
+ /* this unlocks, does NOT upgrade. that works for our current usage */
+ //void releaseAndWriteLock();
+ };
+
+ /* deprecated - use writelock and readlock instead */
+ struct dblock : public writelock {
+ dblock() : writelock("") { }
+ };
+
+ // eliminate this - we should just type "d.dbMutex.assertWriteLocked();" instead
+ inline void assertInWriteLock() { d.dbMutex.assertWriteLocked(); }
+
+}
diff --git a/src/mongo/db/namespace-inl.h b/src/mongo/db/namespace-inl.h
new file mode 100644
index 00000000000..a621a229546
--- /dev/null
+++ b/src/mongo/db/namespace-inl.h
@@ -0,0 +1,132 @@
+// @file namespace-inl.h
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "namespace.h"
+
+namespace mongo {
+
+ inline Namespace& Namespace::operator=(const char *ns) {
+ // we fill the remaining space with all zeroes here. as the full Namespace struct is in
+ // the datafiles (the .ns files specifically), that is helpful as then they are deterministic
+ // in the bytes they have for a given sequence of operations. that makes testing and debugging
+ // the data files easier.
+ //
+ // if profiling indicates this method is a significant bottleneck, we could have a version we
+ // use for reads which does not fill with zeroes, and keep the zeroing behavior on writes.
+ //
+ unsigned len = strlen(ns);
+ uassert( 10080 , "ns name too long, max size is 128", len < MaxNsLen);
+ memset(buf, 0, MaxNsLen);
+ memcpy(buf, ns, len);
+ return *this;
+ }
+
+ inline string Namespace::extraName(int i) const {
+ char ex[] = "$extra";
+ ex[5] += i;
+ string s = string(buf) + ex;
+ massert( 10348 , "$extra: ns name too long", s.size() < MaxNsLen);
+ return s;
+ }
+
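+ /* Illustrative note (not in the original source): ex[5] is the final 'a'
+    of "$extra", so extraName(0) appends "$extra" and extraName(1) appends
+    "$extrb". isExtra() below matches exactly these six-character suffixes:
+    p[5] is the last letter and the p[6] == 0 test rejects names such as
+    "$extra_1" that merely contain the prefix.
+ */
+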
+ inline bool Namespace::isExtra() const {
+ const char *p = strstr(buf, "$extr");
+ return p && p[5] && p[6] == 0; //==0 important in case an index uses name "$extra_1" for example
+ }
+
+ inline int Namespace::hash() const {
+ unsigned x = 0;
+ const char *p = buf;
+ while ( *p ) {
+ x = x * 131 + *p;
+ p++;
+ }
+ return (x & 0x7fffffff) | 0x8000000; // must be > 0
+ }
+
+ /* future : this doesn't need to be an inline. */
+ inline string Namespace::getSisterNS( const char * local ) const {
+ assert( local && local[0] != '.' );
+ string old(buf);
+ if ( old.find( "." ) != string::npos )
+ old = old.substr( 0 , old.find( "." ) );
+ return old + "." + local;
+ }
+
+ inline IndexDetails& NamespaceDetails::idx(int idxNo, bool missingExpected ) {
+ if( idxNo < NIndexesBase ) {
+ IndexDetails& id = _indexes[idxNo];
+ return id;
+ }
+ Extra *e = extra();
+ if ( ! e ) {
+ if ( missingExpected )
+ throw MsgAssertionException( 13283 , "Missing Extra" );
+ massert(14045, "missing Extra", e);
+ }
+ int i = idxNo - NIndexesBase;
+ if( i >= NIndexesExtra ) {
+ e = e->next(this);
+ if ( ! e ) {
+ if ( missingExpected )
+ throw MsgAssertionException( 14823 , "missing extra" );
+ massert(14824, "missing Extra", e);
+ }
+ i -= NIndexesExtra;
+ }
+ return e->details[i];
+ }
+
+ inline int NamespaceDetails::idxNo(IndexDetails& idx) {
+ IndexIterator i = ii();
+ while( i.more() ) {
+ if( &i.next() == &idx )
+ return i.pos()-1;
+ }
+ massert( 10349 , "E12000 idxNo fails", false);
+ return -1;
+ }
+
+ inline int NamespaceDetails::findIndexByKeyPattern(const BSONObj& keyPattern) {
+ IndexIterator i = ii();
+ while( i.more() ) {
+ if( i.next().keyPattern() == keyPattern )
+ return i.pos()-1;
+ }
+ return -1;
+ }
+
+ // @return offset in indexes[]
+ inline int NamespaceDetails::findIndexByName(const char *name) {
+ IndexIterator i = ii();
+ while( i.more() ) {
+ if ( strcmp(i.next().info.obj().getStringField("name"),name) == 0 )
+ return i.pos()-1;
+ }
+ return -1;
+ }
+
+ inline NamespaceDetails::IndexIterator::IndexIterator(NamespaceDetails *_d) {
+ d = _d;
+ i = 0;
+ n = d->nIndexes;
+ }
+
+}
diff --git a/src/mongo/db/namespace.cpp b/src/mongo/db/namespace.cpp
new file mode 100644
index 00000000000..af8b5694248
--- /dev/null
+++ b/src/mongo/db/namespace.cpp
@@ -0,0 +1,800 @@
+// namespace.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "pdfile.h"
+#include "db.h"
+#include "mongommf.h"
+#include "../util/hashtab.h"
+#include "../scripting/engine.h"
+#include "btree.h"
+#include <algorithm>
+#include <list>
+#include "queryutil.h"
+#include "json.h"
+#include "ops/delete.h"
+#include "ops/query.h"
+
+namespace mongo {
+
+ BOOST_STATIC_ASSERT( sizeof(Namespace) == 128 );
+
+ BSONObj idKeyPattern = fromjson("{\"_id\":1}");
+
+ /* deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various sizes
+ so you can look for a deleterecord about the right size.
+ */
+ int bucketSizes[] = {
+ 32, 64, 128, 256, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000,
+ 0x8000, 0x10000, 0x20000, 0x40000, 0x80000, 0x100000, 0x200000,
+ 0x400000, 0x800000
+ };
+
+ NamespaceDetails::NamespaceDetails( const DiskLoc &loc, bool _capped ) {
+ /* be sure to initialize new fields here -- doesn't default to zeroes the way we use it */
+ firstExtent = lastExtent = capExtent = loc;
+ stats.datasize = stats.nrecords = 0;
+ lastExtentSize = 0;
+ nIndexes = 0;
+ capped = _capped;
+ max = 0x7fffffff;
+ paddingFactor = 1.0;
+ flags = 0;
+ capFirstNewRecord = DiskLoc();
+ // Signal that we are on first allocation iteration through extents.
+ capFirstNewRecord.setInvalid();
+ // For capped case, signal that we are doing initial extent allocation.
+ if ( capped )
+ cappedLastDelRecLastExtent().setInvalid();
+ assert( sizeof(dataFileVersion) == 2 );
+ dataFileVersion = 0;
+ indexFileVersion = 0;
+ multiKeyIndexBits = 0;
+ reservedA = 0;
+ extraOffset = 0;
+ indexBuildInProgress = 0;
+ reservedB = 0;
+ capped2.cc2_ptr = 0;
+ capped2.fileNumber = 0;
+ memset(reserved, 0, sizeof(reserved));
+ }
+
+ bool NamespaceIndex::exists() const {
+ return !MMF::exists(path());
+ }
+
+ boost::filesystem::path NamespaceIndex::path() const {
+ boost::filesystem::path ret( dir_ );
+ if ( directoryperdb )
+ ret /= database_;
+ ret /= ( database_ + ".ns" );
+ return ret;
+ }
+
+ void NamespaceIndex::maybeMkdir() const {
+ if ( !directoryperdb )
+ return;
+ boost::filesystem::path dir( dir_ );
+ dir /= database_;
+ if ( !boost::filesystem::exists( dir ) )
+ MONGO_BOOST_CHECK_EXCEPTION_WITH_MSG( boost::filesystem::create_directory( dir ), "create dir for db " );
+ }
+
+ unsigned lenForNewNsFiles = 16 * 1024 * 1024;
+
+#if defined(_DEBUG)
+ void NamespaceDetails::dump(const Namespace& k) {
+ if( !cmdLine.dur )
+ cout << "ns offsets which follow will not display correctly with --journal disabled" << endl;
+
+ size_t ofs = 1; // 1 is sentinel that the find call below failed
+ privateViews.find(this, /*out*/ofs);
+
+ cout << "ns" << hex << setw(8) << ofs << ' ';
+ cout << k.toString() << '\n';
+
+ if( k.isExtra() ) {
+ cout << "ns\t extra" << endl;
+ return;
+ }
+
+ cout << "ns " << firstExtent.toString() << ' ' << lastExtent.toString() << " nidx:" << nIndexes << '\n';
+ cout << "ns " << stats.datasize << ' ' << stats.nrecords << ' ' << nIndexes << '\n';
+ cout << "ns " << capped << ' ' << paddingFactor << ' ' << flags << ' ' << dataFileVersion << '\n';
+ cout << "ns " << multiKeyIndexBits << ' ' << indexBuildInProgress << '\n';
+ cout << "ns " << (int) reserved[0] << ' ' << (int) reserved[59];
+ cout << endl;
+ }
+#endif
+
+ void NamespaceDetails::onLoad(const Namespace& k) {
+
+ if( k.isExtra() ) {
+ /* overflow storage for indexes - so don't treat as a NamespaceDetails object. */
+ return;
+ }
+
+ if( indexBuildInProgress || capped2.cc2_ptr ) {
+ assertInWriteLock();
+ if( indexBuildInProgress ) {
+ log() << "indexBuildInProgress was " << indexBuildInProgress << " for " << k << ", indicating an abnormal db shutdown" << endl;
+ getDur().writingInt( indexBuildInProgress ) = 0;
+ }
+ if( capped2.cc2_ptr )
+ *getDur().writing(&capped2.cc2_ptr) = 0;
+ }
+ }
+
+ static void namespaceOnLoadCallback(const Namespace& k, NamespaceDetails& v) {
+ v.onLoad(k);
+ }
+
+ bool checkNsFilesOnLoad = true;
+
+ NOINLINE_DECL void NamespaceIndex::_init() {
+ assert( !ht );
+
+ d.dbMutex.assertWriteLocked();
+
+ /* if someone manually deleted the datafiles for a database,
+ we need to be sure to clear any cached info for the database in
+ local.*.
+ */
+ /*
+ if ( "local" != database_ ) {
+ DBInfo i(database_.c_str());
+ i.dbDropped();
+ }
+ */
+
+ unsigned long long len = 0;
+ boost::filesystem::path nsPath = path();
+ string pathString = nsPath.string();
+ void *p = 0;
+ if( MMF::exists(nsPath) ) {
+ if( f.open(pathString, true) ) {
+ len = f.length();
+ if ( len % (1024*1024) != 0 ) {
+ log() << "bad .ns file: " << pathString << endl;
+ uassert( 10079 , "bad .ns file length, cannot open database", len % (1024*1024) == 0 );
+ }
+ p = f.getView();
+ }
+ }
+ else {
+ // use lenForNewNsFiles, we are making a new database
+ massert( 10343, "bad lenForNewNsFiles", lenForNewNsFiles >= 1024*1024 );
+ maybeMkdir();
+ unsigned long long l = lenForNewNsFiles;
+ if( f.create(pathString, l, true) ) {
+ getDur().createdFile(pathString, l); // always a new file
+ len = l;
+ assert( len == lenForNewNsFiles );
+ p = f.getView();
+ }
+ }
+
+ if ( p == 0 ) {
+ /** TODO: this shouldn't terminate? */
+ log() << "error couldn't open file " << pathString << " terminating" << endl;
+ dbexit( EXIT_FS );
+ }
+
+
+ assert( len <= 0x7fffffff );
+ ht = new HashTable<Namespace,NamespaceDetails>(p, (int) len, "namespace index");
+ if( checkNsFilesOnLoad )
+ ht->iterAll(namespaceOnLoadCallback);
+ }
+
+ static void namespaceGetNamespacesCallback( const Namespace& k , NamespaceDetails& v , void * extra ) {
+ list<string> * l = (list<string>*)extra;
+ if ( ! k.hasDollarSign() )
+ l->push_back( (string)k );
+ }
+ void NamespaceIndex::getNamespaces( list<string>& tofill , bool onlyCollections ) const {
+ assert( onlyCollections ); // TODO: need to implement this
+ // need boost::bind or something to make this less ugly
+
+ if ( ht )
+ ht->iterAll( namespaceGetNamespacesCallback , (void*)&tofill );
+ }
+
+ void NamespaceDetails::addDeletedRec(DeletedRecord *d, DiskLoc dloc) {
+ BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) <= sizeof(NamespaceDetails) );
+
+ {
+ Record *r = (Record *) getDur().writingPtr(d, sizeof(Record));
+ d = &r->asDeleted();
+ // defensive code: try to make us notice if we reference a deleted record
+ (unsigned&) (r->data) = 0xeeeeeeee;
+ }
+ DEBUGGING log() << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs << endl;
+ if ( capped ) {
+ if ( !cappedLastDelRecLastExtent().isValid() ) {
+ // Initial extent allocation. Insert at end.
+ d->nextDeleted = DiskLoc();
+ if ( cappedListOfAllDeletedRecords().isNull() )
+ getDur().writingDiskLoc( cappedListOfAllDeletedRecords() ) = dloc;
+ else {
+ DiskLoc i = cappedListOfAllDeletedRecords();
+ for (; !i.drec()->nextDeleted.isNull(); i = i.drec()->nextDeleted )
+ ;
+ i.drec()->nextDeleted.writing() = dloc;
+ }
+ }
+ else {
+ d->nextDeleted = cappedFirstDeletedInCurExtent();
+ getDur().writingDiskLoc( cappedFirstDeletedInCurExtent() ) = dloc;
+ // always compact() after this so order doesn't matter
+ }
+ }
+ else {
+ int b = bucket(d->lengthWithHeaders);
+ DiskLoc& list = deletedList[b];
+ DiskLoc oldHead = list;
+ getDur().writingDiskLoc(list) = dloc;
+ d->nextDeleted = oldHead;
+ }
+ }
+
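+ /* Illustrative note (not in the original source): for a non-capped
+    collection addDeletedRec() above pushes the record onto the front of its
+    size bucket's free list, deletedList[bucket(d->lengthWithHeaders)], with
+    the old head chained behind it via nextDeleted -- a simple push-front.
+    Capped collections instead maintain per-extent ordering as shown above.
+ */
+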
+ /* predetermine location of the next alloc without actually doing it.
+ if cannot predetermine returns null (so still call alloc() then)
+ */
+ DiskLoc NamespaceDetails::allocWillBeAt(const char *ns, int lenToAlloc) {
+ if ( !capped ) {
+ lenToAlloc = (lenToAlloc + 3) & 0xfffffffc;
+ return __stdAlloc(lenToAlloc, true);
+ }
+ return DiskLoc();
+ }
+
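+ /* Illustrative note (not in the original source): (len + 3) & 0xfffffffc
+    rounds the request up to the next multiple of 4 bytes, e.g. 37 -> 40
+    and 40 -> 40; alloc() below applies the same quantization before
+    searching the deleted lists.
+ */
+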
+ /** allocate space for a new record from deleted lists.
+ @param lenToAlloc is WITH header
+ @param extentLoc OUT returns the extent location
+ @return null diskloc if no room - allocate a new extent then
+ */
+ DiskLoc NamespaceDetails::alloc(const char *ns, int lenToAlloc, DiskLoc& extentLoc) {
+ {
+ // align very slightly.
+ // note that if doing more coarse-grained quantization (really just if it isn't always
+ // a constant amount but if it varied by record size) then that quantization should
+ // NOT be done here but rather in __stdAlloc so that we can grab a deletedrecord that
+ // is just big enough if we happen to run into one.
+ lenToAlloc = (lenToAlloc + 3) & 0xfffffffc;
+ }
+
+ DiskLoc loc = _alloc(ns, lenToAlloc);
+ if ( loc.isNull() )
+ return loc;
+
+ const DeletedRecord *r = loc.drec();
+ //r = getDur().writing(r);
+
+ /* note we want to grab from the front so our next pointers on disk tend
+ to go in a forward direction which is important for performance. */
+ int regionlen = r->lengthWithHeaders;
+ extentLoc.set(loc.a(), r->extentOfs);
+ assert( r->extentOfs < loc.getOfs() );
+
+ DEBUGGING out() << "TEMP: alloc() returns " << loc.toString() << ' ' << ns << " lentoalloc:" << lenToAlloc << " ext:" << extentLoc.toString() << endl;
+
+ int left = regionlen - lenToAlloc;
+ if ( capped == 0 ) {
+ if ( left < 24 || left < (lenToAlloc >> 3) ) {
+ // you get the whole thing.
+ return loc;
+ }
+ }
+
+ /* split off some for further use. */
+ getDur().writingInt(r->lengthWithHeaders) = lenToAlloc;
+ DiskLoc newDelLoc = loc;
+ newDelLoc.inc(lenToAlloc);
+ DeletedRecord *newDel = DataFileMgr::makeDeletedRecord(newDelLoc, left);
+ DeletedRecord *newDelW = getDur().writing(newDel);
+ newDelW->extentOfs = r->extentOfs;
+ newDelW->lengthWithHeaders = left;
+ newDelW->nextDeleted.Null();
+
+ addDeletedRec(newDel, newDelLoc);
+
+ return loc;
+ }
+
+ /* for non-capped collections.
+ @param peekOnly just look up where and don't reserve
+ returned item is out of the deleted list upon return
+ */
+ DiskLoc NamespaceDetails::__stdAlloc(int len, bool peekOnly) {
+ DiskLoc *prev;
+ DiskLoc *bestprev = 0;
+ DiskLoc bestmatch;
+ int bestmatchlen = 0x7fffffff;
+ int b = bucket(len);
+ DiskLoc cur = deletedList[b];
+ prev = &deletedList[b];
+ int extra = 5; // look for a better fit, a little.
+ int chain = 0;
+ while ( 1 ) {
+ {
+ int a = cur.a();
+ if ( a < -1 || a >= 100000 ) {
+ problem() << "~~ Assertion - cur out of range in _alloc() " << cur.toString() <<
+ " a:" << a << " b:" << b << " chain:" << chain << '\n';
+ sayDbContext();
+ if ( cur == *prev )
+ prev->Null();
+ cur.Null();
+ }
+ }
+ if ( cur.isNull() ) {
+ // move to next bucket. if we were doing "extra", just break
+ if ( bestmatchlen < 0x7fffffff )
+ break;
+ b++;
+ if ( b > MaxBucket ) {
+ // out of space. alloc a new extent.
+ return DiskLoc();
+ }
+ cur = deletedList[b];
+ prev = &deletedList[b];
+ continue;
+ }
+ DeletedRecord *r = cur.drec();
+ if ( r->lengthWithHeaders >= len &&
+ r->lengthWithHeaders < bestmatchlen ) {
+ bestmatchlen = r->lengthWithHeaders;
+ bestmatch = cur;
+ bestprev = prev;
+ }
+ if ( bestmatchlen < 0x7fffffff && --extra <= 0 )
+ break;
+ if ( ++chain > 30 && b < MaxBucket ) {
+ // too slow, force move to next bucket to grab a big chunk
+ //b++;
+ chain = 0;
+ cur.Null();
+ }
+ else {
+ /*this defensive check only made sense for the mmap storage engine:
+ if ( r->nextDeleted.getOfs() == 0 ) {
+ problem() << "~~ Assertion - bad nextDeleted " << r->nextDeleted.toString() <<
+ " b:" << b << " chain:" << chain << ", fixing.\n";
+ r->nextDeleted.Null();
+ }*/
+ cur = r->nextDeleted;
+ prev = &r->nextDeleted;
+ }
+ }
+
+ /* unlink ourself from the deleted list */
+ if( !peekOnly ) {
+ const DeletedRecord *bmr = bestmatch.drec();
+ *getDur().writing(bestprev) = bmr->nextDeleted;
+ bmr->nextDeleted.writing().setInvalid(); // defensive.
+ assert(bmr->extentOfs < bestmatch.getOfs());
+ }
+
+ return bestmatch;
+ }
+
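+ /* Illustrative example (not in the original source): a 100-byte request
+    starts at bucket(100) == 2; the loop keeps the smallest candidate of at
+    least 100 bytes, stops after examining a few (up to 'extra' == 5) more
+    records once a fit is found, advances to the next bucket when a chain is
+    exhausted, and abandons any chain longer than ~30 entries so the scan
+    stays cheap.
+ */
+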
+ void NamespaceDetails::dumpDeleted(set<DiskLoc> *extents) {
+ for ( int i = 0; i < Buckets; i++ ) {
+ DiskLoc dl = deletedList[i];
+ while ( !dl.isNull() ) {
+ DeletedRecord *r = dl.drec();
+ DiskLoc extLoc(dl.a(), r->extentOfs);
+ if ( extents == 0 || extents->count(extLoc) <= 0 ) {
+ out() << " bucket " << i << endl;
+ out() << " " << dl.toString() << " ext:" << extLoc.toString();
+ if ( extents && extents->count(extLoc) <= 0 )
+ out() << '?';
+ out() << " len:" << r->lengthWithHeaders << endl;
+ }
+ dl = r->nextDeleted;
+ }
+ }
+ }
+
+ DiskLoc NamespaceDetails::firstRecord( const DiskLoc &startExtent ) const {
+ for (DiskLoc i = startExtent.isNull() ? firstExtent : startExtent;
+ !i.isNull(); i = i.ext()->xnext ) {
+ if ( !i.ext()->firstRecord.isNull() )
+ return i.ext()->firstRecord;
+ }
+ return DiskLoc();
+ }
+
+ DiskLoc NamespaceDetails::lastRecord( const DiskLoc &startExtent ) const {
+ for (DiskLoc i = startExtent.isNull() ? lastExtent : startExtent;
+ !i.isNull(); i = i.ext()->xprev ) {
+ if ( !i.ext()->lastRecord.isNull() )
+ return i.ext()->lastRecord;
+ }
+ return DiskLoc();
+ }
+
+ int n_complaints_cap = 0;
+ void NamespaceDetails::maybeComplain( const char *ns, int len ) const {
+ if ( ++n_complaints_cap < 8 ) {
+ out() << "couldn't make room for new record (len: " << len << ") in capped ns " << ns << '\n';
+ int i = 0;
+ for ( DiskLoc e = firstExtent; !e.isNull(); e = e.ext()->xnext, ++i ) {
+ out() << " Extent " << i;
+ if ( e == capExtent )
+ out() << " (capExtent)";
+ out() << '\n';
+ out() << " magic: " << hex << e.ext()->magic << dec << " extent->ns: " << e.ext()->nsDiagnostic.toString() << '\n';
+ out() << " fr: " << e.ext()->firstRecord.toString() <<
+ " lr: " << e.ext()->lastRecord.toString() << " extent->len: " << e.ext()->length << '\n';
+ }
+ assert( len * 5 > lastExtentSize ); // assume it is an unusually large record; if not, something is broken
+ }
+ }
+
+ /* alloc with capped table handling. */
+ DiskLoc NamespaceDetails::_alloc(const char *ns, int len) {
+ if ( !capped )
+ return __stdAlloc(len, false);
+
+ return cappedAlloc(ns,len);
+ }
+
+ void NamespaceIndex::kill_ns(const char *ns) {
+ d.dbMutex.assertWriteLocked();
+ if ( !ht )
+ return;
+ Namespace n(ns);
+ ht->kill(n);
+
+ for( int i = 0; i<=1; i++ ) {
+ try {
+ Namespace extra(n.extraName(i).c_str());
+ ht->kill(extra);
+ }
+ catch(DBException&) {
+ dlog(3) << "caught exception in kill_ns" << endl;
+ }
+ }
+ }
+
+ void NamespaceIndex::add_ns(const char *ns, DiskLoc& loc, bool capped) {
+ NamespaceDetails details( loc, capped );
+ add_ns( ns, details );
+ }
+ void NamespaceIndex::add_ns( const char *ns, const NamespaceDetails &details ) {
+ d.dbMutex.assertWriteLocked();
+ init();
+ Namespace n(ns);
+ uassert( 10081 , "too many namespaces/collections", ht->put(n, details));
+ }
+
+ /* extra space for indexes when more than 10 */
+ NamespaceDetails::Extra* NamespaceIndex::newExtra(const char *ns, int i, NamespaceDetails *d) {
+ mongo::d.dbMutex.assertWriteLocked();
+ assert( i >= 0 && i <= 1 );
+ Namespace n(ns);
+ Namespace extra(n.extraName(i).c_str()); // throws userexception if ns name too long
+
+ massert( 10350 , "allocExtra: base ns missing?", d );
+ massert( 10351 , "allocExtra: extra already exists", ht->get(extra) == 0 );
+
+ NamespaceDetails::Extra temp;
+ temp.init();
+ uassert( 10082 , "allocExtra: too many namespaces/collections", ht->put(extra, (NamespaceDetails&) temp));
+ NamespaceDetails::Extra *e = (NamespaceDetails::Extra *) ht->get(extra);
+ return e;
+ }
+ NamespaceDetails::Extra* NamespaceDetails::allocExtra(const char *ns, int nindexessofar) {
+ NamespaceIndex *ni = nsindex(ns);
+ int i = (nindexessofar - NIndexesBase) / NIndexesExtra;
+ Extra *e = ni->newExtra(ns, i, this);
+ long ofs = e->ofsFrom(this);
+ if( i == 0 ) {
+ assert( extraOffset == 0 );
+ *getDur().writing(&extraOffset) = ofs;
+ assert( extra() == e );
+ }
+ else {
+ Extra *hd = extra();
+ assert( hd->next(this) == 0 );
+ hd->setNext(ofs);
+ }
+ return e;
+ }
+
+ /* you MUST call when adding an index. see pdfile.cpp */
+ IndexDetails& NamespaceDetails::addIndex(const char *thisns, bool resetTransient) {
+ IndexDetails *id;
+ try {
+ id = &idx(nIndexes,true);
+ }
+ catch(DBException&) {
+ allocExtra(thisns, nIndexes);
+ id = &idx(nIndexes,false);
+ }
+
+ (*getDur().writing(&nIndexes))++;
+ if ( resetTransient )
+ NamespaceDetailsTransient::get(thisns).addedIndex();
+ return *id;
+ }
+
+ // must be called when renaming a NS to fix up extra
+ void NamespaceDetails::copyingFrom(const char *thisns, NamespaceDetails *src) {
+ extraOffset = 0; // we are a copy -- the old value is wrong. fixing it up below.
+ Extra *se = src->extra();
+ int n = NIndexesBase;
+ if( se ) {
+ Extra *e = allocExtra(thisns, n);
+ while( 1 ) {
+ n += NIndexesExtra;
+ e->copy(this, *se);
+ se = se->next(src);
+ if( se == 0 ) break;
+ Extra *nxt = allocExtra(thisns, n);
+ e->setNext( nxt->ofsFrom(this) );
+ e = nxt;
+ }
+ assert( extraOffset );
+ }
+ }
+
+ /* returns index of the first index in which the field is present. -1 if not present.
+ (aug08 - this method not currently used)
+ */
+ int NamespaceDetails::fieldIsIndexed(const char *fieldName) {
+ massert( 10346 , "not implemented", false);
+ /*
+ for ( int i = 0; i < nIndexes; i++ ) {
+ IndexDetails& idx = indexes[i];
+ BSONObj idxKey = idx.info.obj().getObjectField("key"); // e.g., { ts : -1 }
+ if ( !idxKey.getField(fieldName).eoo() )
+ return i;
+ }*/
+ return -1;
+ }
+
+ long long NamespaceDetails::storageSize( int * numExtents , BSONArrayBuilder * extentInfo ) const {
+ Extent * e = firstExtent.ext();
+ assert( e );
+
+ long long total = 0;
+ int n = 0;
+ while ( e ) {
+ total += e->length;
+ n++;
+
+ if ( extentInfo ) {
+ extentInfo->append( BSON( "len" << e->length << "loc: " << e->myLoc.toBSONObj() ) );
+ }
+
+ e = e->getNextExtent();
+ }
+
+ if ( numExtents )
+ *numExtents = n;
+
+ return total;
+ }
+
+ NamespaceDetails *NamespaceDetails::writingWithExtra() {
+ vector< pair< long long, unsigned > > writeRanges;
+ writeRanges.push_back( make_pair( 0, sizeof( NamespaceDetails ) ) );
+ for( Extra *e = extra(); e; e = e->next( this ) ) {
+ writeRanges.push_back( make_pair( (char*)e - (char*)this, sizeof( Extra ) ) );
+ }
+ return reinterpret_cast< NamespaceDetails* >( getDur().writingRangesAtOffsets( this, writeRanges ) );
+ }
+
+ /* ------------------------------------------------------------------------- */
+
+ SimpleMutex NamespaceDetailsTransient::_qcMutex("qc");
+ SimpleMutex NamespaceDetailsTransient::_isMutex("is");
+ map< string, shared_ptr< NamespaceDetailsTransient > > NamespaceDetailsTransient::_nsdMap;
+ typedef map< string, shared_ptr< NamespaceDetailsTransient > >::iterator ouriter;
+
+ void NamespaceDetailsTransient::reset() {
+ DEV assertInWriteLock();
+ clearQueryCache();
+ _keysComputed = false;
+ _indexSpecs.clear();
+ }
+
+ /*static*/ NOINLINE_DECL NamespaceDetailsTransient& NamespaceDetailsTransient::make_inlock(const char *ns) {
+ shared_ptr< NamespaceDetailsTransient > &t = _nsdMap[ ns ];
+ assert( t.get() == 0 );
+ Database *database = cc().database();
+ assert( database );
+ if( _nsdMap.size() % 20000 == 10000 ) {
+ // so we notice if insanely large #s
+ log() << "opening namespace " << ns << endl;
+ log() << _nsdMap.size() << " namespaces in nsdMap" << endl;
+ }
+ t.reset( new NamespaceDetailsTransient(database, ns) );
+ return *t;
+ }
+
+ // note with repair there could be two databases with the same ns name.
+ // that is NOT handled here yet! TODO
+ // repair may not use nsdt, though we are not sure. anyway, this requires work.
+ NamespaceDetailsTransient::NamespaceDetailsTransient(Database *db, const char *ns) :
+ _ns(ns), _keysComputed(false), _qcWriteCount()
+ {
+ dassert(db);
+ }
+
+ NamespaceDetailsTransient::~NamespaceDetailsTransient() {
+ }
+
+ void NamespaceDetailsTransient::clearForPrefix(const char *prefix) {
+ assertInWriteLock();
+ vector< string > found;
+ for( ouriter i = _nsdMap.begin(); i != _nsdMap.end(); ++i )
+ if ( strncmp( i->first.c_str(), prefix, strlen( prefix ) ) == 0 )
+ found.push_back( i->first );
+ for( vector< string >::iterator i = found.begin(); i != found.end(); ++i ) {
+ _nsdMap[ *i ].reset();
+ }
+ }
+
+ void NamespaceDetailsTransient::eraseForPrefix(const char *prefix) {
+ assertInWriteLock();
+ vector< string > found;
+ for( ouriter i = _nsdMap.begin(); i != _nsdMap.end(); ++i )
+ if ( strncmp( i->first.c_str(), prefix, strlen( prefix ) ) == 0 )
+ found.push_back( i->first );
+ for( vector< string >::iterator i = found.begin(); i != found.end(); ++i ) {
+ _nsdMap.erase(*i);
+ }
+ }
+
+ void NamespaceDetailsTransient::computeIndexKeys() {
+ _keysComputed = true;
+ _indexKeys.clear();
+ NamespaceDetails *d = nsdetails(_ns.c_str());
+ if ( ! d )
+ return;
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() )
+ i.next().keyPattern().getFieldNames(_indexKeys);
+ }
+
+
+ /* ------------------------------------------------------------------------- */
+
+ /* add a new namespace to the system catalog (<dbname>.system.namespaces).
+ options: { capped : ..., size : ... }
+ */
+ void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0) {
+ LOG(1) << "New namespace: " << ns << endl;
+ if ( strstr(ns, "system.namespaces") ) {
+ // system.namespaces holds all the others, so it is not explicitly listed in the catalog.
+ // TODO: fix above should not be strstr!
+ return;
+ }
+
+ {
+ BSONObjBuilder b;
+ b.append("name", ns);
+ if ( options )
+ b.append("options", *options);
+ BSONObj j = b.done();
+ char database[256];
+ nsToDatabase(ns, database);
+ string s = database;
+ if( cmdLine.configsvr && (s != "config" && s != "admin") ) {
+ uasserted(14037, "can't create user databases on a --configsvr instance");
+ }
+ s += ".system.namespaces";
+ theDataFileMgr.insert(s.c_str(), j.objdata(), j.objsize(), true);
+ }
+ }
+
+ void renameNamespace( const char *from, const char *to ) {
+ NamespaceIndex *ni = nsindex( from );
+ assert( ni );
+ assert( ni->details( from ) );
+ assert( ! ni->details( to ) );
+
+ // Our namespace and index details will move to a different
+ // memory location. The only references to namespace and
+ // index details across commands are in cursors and nsd
+ // transient (including query cache) so clear these.
+ ClientCursor::invalidate( from );
+ NamespaceDetailsTransient::eraseForPrefix( from );
+
+ NamespaceDetails *details = ni->details( from );
+ ni->add_ns( to, *details );
+ NamespaceDetails *todetails = ni->details( to );
+ try {
+ todetails->copyingFrom(to, details); // fixes extraOffset
+ }
+ catch( DBException& ) {
+ // could end up here if .ns is full - if so try to clean up / roll back a little
+ ni->kill_ns(to);
+ throw;
+ }
+ ni->kill_ns( from );
+ details = todetails;
+
+ BSONObj oldSpec;
+ char database[MaxDatabaseNameLen];
+ nsToDatabase(from, database);
+ string s = database;
+ s += ".system.namespaces";
+ assert( Helpers::findOne( s.c_str(), BSON( "name" << from ), oldSpec ) );
+
+ BSONObjBuilder newSpecB;
+ BSONObjIterator i( oldSpec.getObjectField( "options" ) );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( strcmp( e.fieldName(), "create" ) != 0 )
+ newSpecB.append( e );
+ else
+ newSpecB << "create" << to;
+ }
+ BSONObj newSpec = newSpecB.done();
+ addNewNamespaceToCatalog( to, newSpec.isEmpty() ? 0 : &newSpec );
+
+ deleteObjects( s.c_str(), BSON( "name" << from ), false, false, true );
+ // oldSpec variable no longer valid memory
+
+ BSONObj oldIndexSpec;
+ s = database;
+ s += ".system.indexes";
+ while( Helpers::findOne( s.c_str(), BSON( "ns" << from ), oldIndexSpec ) ) {
+ BSONObjBuilder newIndexSpecB;
+ BSONObjIterator i( oldIndexSpec );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( strcmp( e.fieldName(), "ns" ) != 0 )
+ newIndexSpecB.append( e );
+ else
+ newIndexSpecB << "ns" << to;
+ }
+ BSONObj newIndexSpec = newIndexSpecB.done();
+ DiskLoc newIndexSpecLoc = theDataFileMgr.insert( s.c_str(), newIndexSpec.objdata(), newIndexSpec.objsize(), true, false );
+ int indexI = details->findIndexByName( oldIndexSpec.getStringField( "name" ) );
+ IndexDetails &indexDetails = details->idx(indexI);
+ string oldIndexNs = indexDetails.indexNamespace();
+ indexDetails.info = newIndexSpecLoc;
+ string newIndexNs = indexDetails.indexNamespace();
+
+ renameIndexNamespace( oldIndexNs.c_str(), newIndexNs.c_str() );
+ deleteObjects( s.c_str(), oldIndexSpec.getOwned(), true, false, true );
+ }
+ }
+
+ bool legalClientSystemNS( const string& ns , bool write ) {
+ if( ns == "local.system.replset" ) return true;
+
+ if ( ns.find( ".system.users" ) != string::npos )
+ return true;
+
+ if ( ns.find( ".system.js" ) != string::npos ) {
+ if ( write )
+ Scope::storedFuncMod();
+ return true;
+ }
+
+ return false;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/namespace.h b/src/mongo/db/namespace.h
new file mode 100644
index 00000000000..9ceb6a6f4e9
--- /dev/null
+++ b/src/mongo/db/namespace.h
@@ -0,0 +1,629 @@
+// namespace.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "namespacestring.h"
+#include "jsobj.h"
+#include "querypattern.h"
+#include "diskloc.h"
+#include "../util/hashtab.h"
+#include "mongommf.h"
+#include "d_concurrency.h"
+
+namespace mongo {
+
+ class Database;
+
+#pragma pack(1)
+ /* This helper class is used as the key of the HashTable below in NamespaceIndex, e.g. see the line:
+ HashTable<Namespace,NamespaceDetails> *ht;
+ */
+ class Namespace {
+ public:
+ explicit Namespace(const char *ns) { *this = ns; }
+ Namespace& operator=(const char *ns);
+
+ bool hasDollarSign() const { return strchr( buf , '$' ) != 0; }
+ void kill() { buf[0] = 0x7f; }
+ bool operator==(const char *r) const { return strcmp(buf, r) == 0; }
+ bool operator==(const Namespace& r) const { return strcmp(buf, r.buf) == 0; }
+ int hash() const; // value returned is always > 0
+
+ size_t size() const { return strlen( buf ); }
+
+ string toString() const { return (string) buf; }
+ operator string() const { return (string) buf; }
+
+ /* NamespaceDetails::Extra was added after the fact to allow chaining of data blocks to support more than 10 indexes
+    (more than 10 IndexDetails). It's a bit hacky because it was added late while keeping
+    backward file compatibility. */
+ string extraName(int i) const;
+ bool isExtra() const; /* ends with $extr... -- when true this is an extra block, not a normal NamespaceDetails block */
+
+ /** ( foo.bar ).getSisterNS( "blah" ) == foo.blah
+ perhaps this should move to the NamespaceString helper?
+ */
+ string getSisterNS( const char * local ) const;
+
+ enum MaxNsLenValue { MaxNsLen = 128 };
+ private:
+ char buf[MaxNsLen];
+ };
+#pragma pack()
+
+} // namespace mongo
+
+#include "index.h"
+
+namespace mongo {
+
+ /** @return true if a client can modify this namespace even though it is under ".system."
+ For example <dbname>.system.users is ok for regular clients to update.
+ @param write true if the access is a write; used for .system.js
+ */
+ bool legalClientSystemNS( const string& ns , bool write );
+
+ /* deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various sizes
+ so you can look for a deleterecord about the right size.
+ */
+ const int Buckets = 19;
+ const int MaxBucket = 18;
+
+ extern int bucketSizes[];
+
+#pragma pack(1)
+ /* NamespaceDetails : this is the "header" for a collection that has all its details.
+ It's in the .ns file and this is a memory mapped region (thus the pack pragma above).
+ */
+ class NamespaceDetails {
+ public:
+ enum { NIndexesMax = 64, NIndexesExtra = 30, NIndexesBase = 10 };
+
+ /*-------- data fields, as present on disk : */
+ DiskLoc firstExtent;
+ DiskLoc lastExtent;
+ /* NOTE: capped collections v1 override the meaning of deletedList.
+ deletedList[0] points to a list of free records (DeletedRecord's) for all extents in
+ the capped namespace.
+ deletedList[1] points to the last record in the prev extent. When the "current extent"
+ changes, this value is updated. !deletedList[1].isValid() when this value is not
+ yet computed.
+ */
+ DiskLoc deletedList[Buckets];
+ // ofs 168 (8 byte aligned)
+ struct Stats {
+ // datasize and nrecords MUST be adjacent; code assumes this!
+ long long datasize; // this includes padding, but not record headers
+ long long nrecords;
+ } stats;
+ int lastExtentSize;
+ int nIndexes;
+ private:
+ // ofs 192
+ IndexDetails _indexes[NIndexesBase];
+ public:
+ // ofs 352 (16 byte aligned)
+ int capped;
+ int max; // max # of objects for a capped table. TODO: should this be 64 bit?
+ double paddingFactor; // 1.0 = no padding.
+ // ofs 386 (16)
+ int flags;
+ DiskLoc capExtent;
+ DiskLoc capFirstNewRecord;
+ unsigned short dataFileVersion; // NamespaceDetails version. So we can do backward compatibility in the future. See filever.h
+ unsigned short indexFileVersion;
+ unsigned long long multiKeyIndexBits;
+ private:
+ // ofs 400 (16)
+ unsigned long long reservedA;
+ long long extraOffset; // where the $extra info is located (bytes relative to this)
+ public:
+ int indexBuildInProgress; // 1 if in prog
+ unsigned reservedB;
+ // ofs 424 (8)
+ struct Capped2 {
+ unsigned long long cc2_ptr; // see capped.cpp
+ unsigned fileNumber;
+ } capped2;
+ char reserved[60];
+ /*-------- end data 496 bytes */
+
+ explicit NamespaceDetails( const DiskLoc &loc, bool _capped );
+
+ class Extra {
+ long long _next;
+ public:
+ IndexDetails details[NIndexesExtra];
+ private:
+ unsigned reserved2;
+ unsigned reserved3;
+ Extra(const Extra&) { assert(false); }
+ Extra& operator=(const Extra& r) { assert(false); return *this; }
+ public:
+ Extra() { }
+ long ofsFrom(NamespaceDetails *d) {
+ return ((char *) this) - ((char *) d);
+ }
+ void init() { memset(this, 0, sizeof(Extra)); }
+ Extra* next(NamespaceDetails *d) {
+ if( _next == 0 ) return 0;
+ return (Extra*) (((char *) d) + _next);
+ }
+ void setNext(long ofs) { *getDur().writing(&_next) = ofs; }
+ void copy(NamespaceDetails *d, const Extra& e) {
+ memcpy(this, &e, sizeof(Extra));
+ _next = 0;
+ }
+ };
+ Extra* extra() {
+ if( extraOffset == 0 ) return 0;
+ return (Extra *) (((char *) this) + extraOffset);
+ }
+ /* add extra space for indexes when more than 10 */
+ Extra* allocExtra(const char *ns, int nindexessofar);
+ void copyingFrom(const char *thisns, NamespaceDetails *src); // must be called when renaming a NS to fix up extra
+
+ /* called when loaded from disk */
+ void onLoad(const Namespace& k);
+
+ /* dump info on this namespace. for debugging. */
+ void dump(const Namespace& k);
+
+ /* dump info on all extents for this namespace. for debugging. */
+ void dumpExtents();
+
+ private:
+ Extent *theCapExtent() const { return capExtent.ext(); }
+ void advanceCapExtent( const char *ns );
+ DiskLoc __capAlloc(int len);
+ DiskLoc cappedAlloc(const char *ns, int len);
+ DiskLoc &cappedFirstDeletedInCurExtent();
+ bool nextIsInCapExtent( const DiskLoc &dl ) const;
+
+ public:
+ DiskLoc& cappedListOfAllDeletedRecords() { return deletedList[0]; }
+ DiskLoc& cappedLastDelRecLastExtent() { return deletedList[1]; }
+ void cappedDumpDelInfo();
+ bool capLooped() const { return capped && capFirstNewRecord.isValid(); }
+ bool inCapExtent( const DiskLoc &dl ) const;
+ void cappedCheckMigrate();
+ /**
+ * Truncate documents newer than the document at 'end' from the capped
+ * collection. The collection cannot be completely emptied using this
+ * function. An assertion will be thrown if that is attempted.
+ * @param inclusive - Truncate 'end' as well iff true
+ */
+ void cappedTruncateAfter(const char *ns, DiskLoc end, bool inclusive);
+ /** Remove all documents from the capped collection */
+ void emptyCappedCollection(const char *ns);
+
+ /* when a background index build is in progress, we don't count the index in nIndexes until
+ complete, yet need to still use it in _indexRecord() - thus we use this function for that.
+ */
+ int nIndexesBeingBuilt() const { return nIndexes + indexBuildInProgress; }
+
+ /* NOTE: be careful with flags. are we manipulating them in read locks? if so,
+ this isn't thread safe. TODO
+ */
+ enum NamespaceFlags {
+ Flag_HaveIdIndex = 1 << 0 // set when we have _id index (ONLY if ensureIdIndex was called -- 0 if that has never been called)
+ };
+
+ IndexDetails& idx(int idxNo, bool missingExpected = false );
+
+ /** get the IndexDetails for the index currently being built in the background. (there is at most one) */
+ IndexDetails& inProgIdx() {
+ DEV assert(indexBuildInProgress);
+ return idx(nIndexes);
+ }
+
+ class IndexIterator {
+ public:
+ int pos() { return i; } // note this is the next one to come
+ bool more() { return i < n; }
+ IndexDetails& next() { return d->idx(i++); }
+ private:
+ friend class NamespaceDetails;
+ int i, n;
+ NamespaceDetails *d;
+ IndexIterator(NamespaceDetails *_d);
+ };
+
+ IndexIterator ii() { return IndexIterator(this); }
+
+ /* hackish - find our index # in the indexes array */
+ int idxNo(IndexDetails& idx);
+
+ /* multikey indexes are indexes where there is more than one key in the index
+    for a single document. see multikey in the wiki.
+ for these, we have to do some dedup work on queries.
+ */
+ bool isMultikey(int i) const { return (multiKeyIndexBits & (((unsigned long long) 1) << i)) != 0; }
+ void setIndexIsMultikey(int i) {
+ dassert( i < NIndexesMax );
+ unsigned long long x = ((unsigned long long) 1) << i;
+ if( multiKeyIndexBits & x ) return;
+ *getDur().writing(&multiKeyIndexBits) |= x;
+ }
+ void clearIndexIsMultikey(int i) {
+ dassert( i < NIndexesMax );
+ unsigned long long x = ((unsigned long long) 1) << i;
+ if( (multiKeyIndexBits & x) == 0 ) return;
+ *getDur().writing(&multiKeyIndexBits) &= ~x;
+ }
+
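+ /* Illustrative example (not in the original source): setIndexIsMultikey(3)
+    journals setting bit 3 (mask 1ULL << 3 == 0x8) of multiKeyIndexBits, so
+    isMultikey(3) then returns true; clearIndexIsMultikey(3) clears the same
+    bit. Both return early when the bit already has the desired value, which
+    avoids journaling a redundant write.
+ */
+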
+ /* add a new index. does not add to system.indexes etc. - just to NamespaceDetails.
+ caller must populate returned object.
+ */
+ IndexDetails& addIndex(const char *thisns, bool resetTransient=true);
+
+ void aboutToDeleteAnIndex() {
+ *getDur().writing(&flags) = flags & ~Flag_HaveIdIndex;
+ }
+
+ /* returns index of the first index in which the field is present. -1 if not present. */
+ int fieldIsIndexed(const char *fieldName);
+
+ /* called to indicate that an update fit in place.
+ paddingFits() is also called on an insert -- the idea there is that if you had some mix and then went to
+ pure inserts it would adapt and PF would trend to 1.0. note update calls insert on a move
+ so there is a double count there that must be adjusted for below.
+
+ todo: greater sophistication could be helpful and added later. for example the absolute
+ size of documents might be considered -- in some cases smaller ones are more likely
+ to grow than larger ones in the same collection? (not always)
+ */
+ void paddingFits() {
+ MONGO_SOMETIMES(sometimes, 4) { // do this on a sampled basis to journal less
+ double x = paddingFactor - 0.001;
+ if ( x >= 1.0 ) {
+ *getDur().writing(&paddingFactor) = x;
+ //getDur().setNoJournal(&paddingFactor, &x, sizeof(x));
+ }
+ }
+ }
+ void paddingTooSmall() {
+ MONGO_SOMETIMES(sometimes, 4) { // do this on a sampled basis to journal less
+ /* the more indexes we have, the higher the cost of a move. so we take that into
+ account herein. note on a move that insert() calls paddingFits(), thus
+ here for example with no inserts and nIndexes = 1 we have
+ .001*4-.001 or a 3:1 ratio to non moves -> 75% nonmoves. an insert-heavy
+ workload can push this down considerably. further tweaking will be a good
+ idea, but this should be an adequate starting point.
+ */
+ double N = min(nIndexes,7) + 3;
+ double x = paddingFactor + (0.001 * N);
+ if ( x <= 2.0 ) {
+ *getDur().writing(&paddingFactor) = x;
+ //getDur().setNoJournal(&paddingFactor, &x, sizeof(x));
+ }
+ }
+ }
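+ /* worked example of the ratio above (illustrative): with nIndexes = 1,
+ N = min(1,7) + 3 = 4, so a move costs +0.004 here and then -0.001 when
+ insert() calls paddingFits(), net +0.003; a fit-in-place costs -0.001.
+ paddingFactor is stable when moves * 0.003 == fits * 0.001, i.e. a
+ 3:1 fit:move ratio -> ~75% non-moves. */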
+
+ // @return offset in indexes[]
+ int findIndexByName(const char *name);
+
+ // @return offset in indexes[]
+ int findIndexByKeyPattern(const BSONObj& keyPattern);
+
+ void findIndexByType( const string& name , vector<int>& matches ) {
+ IndexIterator i = ii();
+ while ( i.more() ) {
+ if ( i.next().getSpec().getTypeName() == name )
+ matches.push_back( i.pos() - 1 );
+ }
+ }
+
+ /* @return -1 = not found
+ generally the _id index is first, so this is not that expensive an operation (assuming present).
+ */
+ int findIdIndex() {
+ IndexIterator i = ii();
+ while( i.more() ) {
+ if( i.next().isIdIndex() )
+ return i.pos()-1;
+ }
+ return -1;
+ }
+
+ bool haveIdIndex() {
+ return (flags & NamespaceDetails::Flag_HaveIdIndex) || findIdIndex() >= 0;
+ }
+
+ /* return which "deleted bucket" for this size object */
+ static int bucket(int n) {
+ for ( int i = 0; i < Buckets; i++ )
+ if ( bucketSizes[i] > n )
+ return i;
+ return Buckets-1;
+ }
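+ /* e.g., assuming hypothetical bucketSizes of { 32, 64, 128, ... }:
+ bucket(31) == 0 // 32 is the first size > 31
+ bucket(32) == 1 // 32 is not > 32, so we move up a bucket
+ anything at least as large as the biggest size lands in Buckets-1. */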
+
+ /* predetermine location of the next alloc without actually doing it.
+ if cannot predetermine returns null (so still call alloc() then)
+ */
+ DiskLoc allocWillBeAt(const char *ns, int lenToAlloc);
+
+ /* allocate a new record. lenToAlloc includes headers. */
+ DiskLoc alloc(const char *ns, int lenToAlloc, DiskLoc& extentLoc);
+
+ /* add a given record to the deleted chains for this NS */
+ void addDeletedRec(DeletedRecord *d, DiskLoc dloc);
+ void dumpDeleted(set<DiskLoc> *extents = 0);
+ // Start from firstExtent by default.
+ DiskLoc firstRecord( const DiskLoc &startExtent = DiskLoc() ) const;
+ // Start from lastExtent by default.
+ DiskLoc lastRecord( const DiskLoc &startExtent = DiskLoc() ) const;
+ long long storageSize( int * numExtents = 0 , BSONArrayBuilder * extentInfo = 0 ) const;
+
+ int averageObjectSize() {
+ if ( stats.nrecords == 0 )
+ return 5;
+ return (int) (stats.datasize / stats.nrecords);
+ }
+
+ NamespaceDetails *writingWithoutExtra() {
+ return ( NamespaceDetails* ) getDur().writingPtr( this, sizeof( NamespaceDetails ) );
+ }
+ /** Make all linked Extra objects writeable as well */
+ NamespaceDetails *writingWithExtra();
+
+ private:
+ DiskLoc _alloc(const char *ns, int len);
+ void maybeComplain( const char *ns, int len ) const;
+ DiskLoc __stdAlloc(int len, bool willBeAt);
+ void compact(); // combine adjacent deleted records
+ friend class NamespaceIndex;
+ struct ExtraOld {
+ // note we could use this field for more chaining later, so don't waste it:
+ unsigned long long reserved1;
+ IndexDetails details[NIndexesExtra];
+ unsigned reserved2;
+ unsigned reserved3;
+ };
+ /** Update cappedLastDelRecLastExtent() after capExtent changed in cappedTruncateAfter() */
+ void cappedTruncateLastDelUpdate();
+ BOOST_STATIC_ASSERT( NIndexesMax <= NIndexesBase + NIndexesExtra*2 );
+ BOOST_STATIC_ASSERT( NIndexesMax <= 64 ); // multiKey bits
+ BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::ExtraOld) == 496 );
+ BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) == 496 );
+ }; // NamespaceDetails
+#pragma pack()
+
+ /* NamespaceDetailsTransient
+
+ these are things we know / compute about a namespace that are transient -- things
+ we don't actually store in the .ns file. so mainly caching of frequently used
+ information.
+
+ CAUTION: Are you maintaining this properly on a collection drop()? A dropdatabase()? Be careful.
+ The current field "allIndexKeys" may have too many keys in it on such an occurrence;
+ as currently used that does not cause anything terrible to happen.
+
+ todo: cleanup code, need abstractions and separation
+ */
+ // todo: multiple db's with the same name (repairDatabase) is not handled herein. that may be
+ // the way to go, if not used by repair, but need some sort of enforcement / asserts.
+ class NamespaceDetailsTransient : boost::noncopyable {
+ BOOST_STATIC_ASSERT( sizeof(NamespaceDetails) == 496 );
+
+ //Database *database;
+ const string _ns;
+ void reset();
+ static std::map< string, shared_ptr< NamespaceDetailsTransient > > _nsdMap;
+
+ NamespaceDetailsTransient(Database*,const char *ns);
+ public:
+ ~NamespaceDetailsTransient();
+ void addedIndex() { assertInWriteLock(); reset(); }
+ void deletedIndex() { assertInWriteLock(); reset(); }
+ /* Drop cached information on all namespaces beginning with the specified prefix.
+ Can be useful as index namespaces share the same start as the regular collection.
+ SLOW - sequential scan of all NamespaceDetailsTransient objects */
+ static void clearForPrefix(const char *prefix);
+ static void eraseForPrefix(const char *prefix);
+
+ /**
+ * @return a cursor interface to the query optimizer. The implementation may
+ * utilize a single query plan or interleave results from multiple query
+ * plans before settling on a single query plan. Note that the schema of
+ * currKey() documents, the matcher(), and the isMultiKey() nature of the
+ * cursor may change over the course of iteration.
+ *
+ * @param query - Query used to select indexes and populate matchers.
+ *
+ * @param order - Required ordering spec for documents produced by this cursor,
+ * empty object default indicates no order requirement. If no index exists that
+ * satisfies the required sort order, an empty shared_ptr is returned.
+ *
+ * @param requireIndex - If true, no unindexed (ie collection scan) cursors are
+ * used to generate the returned cursor. If an unindexed cursor is required, an
+ * assertion is raised by the cursor during iteration.
+ *
+ * @param simpleEqualityMatch - Set to true for certain simple queries -
+ * see queryoptimizer.cpp.
+ *
+ * The returned cursor may @throw inside of advance() or recoverFromYield() in
+ * certain error cases, for example if a capped overrun occurred during a yield.
+ * This indicates that the cursor was unable to perform a complete scan.
+ *
+ * This is a work in progress. Partial list of features not yet implemented:
+ * - covered indexes
+ * - in memory sorting
+ */
+ static shared_ptr<Cursor> getCursor( const char *ns, const BSONObj &query,
+ const BSONObj &order = BSONObj(), bool requireIndex = false,
+ bool *simpleEqualityMatch = 0 );
+
+ /* indexKeys() cache ---------------------------------------------------- */
+ /* assumed to be in write lock for this */
+ private:
+ bool _keysComputed;
+ set<string> _indexKeys;
+ void computeIndexKeys();
+ public:
+ /* get set of index keys for this namespace. handy to quickly check if a given
+ field is indexed (Note it might be a secondary component of a compound index.)
+ */
+ set<string>& indexKeys() {
+ DEV assertInWriteLock();
+ if ( !_keysComputed )
+ computeIndexKeys();
+ return _indexKeys;
+ }
+
+ /* IndexSpec caching */
+ private:
+ map<const IndexDetails*,IndexSpec> _indexSpecs;
+ static SimpleMutex _isMutex;
+ public:
+ const IndexSpec& getIndexSpec( const IndexDetails * details ) {
+ IndexSpec& spec = _indexSpecs[details];
+ if ( ! spec._finishedInit ) {
+ SimpleMutex::scoped_lock lk(_isMutex);
+ if ( ! spec._finishedInit ) {
+ spec.reset( details );
+ assert( spec._finishedInit );
+ }
+ }
+ return spec;
+ }
+
+ /* query cache (for query optimizer) ------------------------------------- */
+ private:
+ int _qcWriteCount;
+ map< QueryPattern, pair< BSONObj, long long > > _qcCache;
+ static NamespaceDetailsTransient& make_inlock(const char *ns);
+ public:
+ static SimpleMutex _qcMutex;
+
+ /* you must be in the qcMutex when calling this.
+ A NamespaceDetailsTransient object will not go out of scope on you if you are
+ d.dbMutex.atLeastReadLocked(), so you don't have to stay locked.
+ Creates a NamespaceDetailsTransient before returning if one DNE.
+ todo: avoid creating too many on erroneous ns queries.
+ */
+ static NamespaceDetailsTransient& get_inlock(const char *ns);
+
+ static NamespaceDetailsTransient& get(const char *ns) {
+ SimpleMutex::scoped_lock lk(_qcMutex);
+ return get_inlock(ns);
+ }
+
+ void clearQueryCache() { // public for unit tests
+ _qcCache.clear();
+ _qcWriteCount = 0;
+ }
+ /* you must notify the cache if you are doing writes, as query plan optimality will change */
+ void notifyOfWriteOp() {
+ if ( _qcCache.empty() )
+ return;
+ if ( ++_qcWriteCount >= 100 )
+ clearQueryCache();
+ }
+ BSONObj indexForPattern( const QueryPattern &pattern ) {
+ return _qcCache[ pattern ].first;
+ }
+ long long nScannedForPattern( const QueryPattern &pattern ) {
+ return _qcCache[ pattern ].second;
+ }
+ void registerIndexForPattern( const QueryPattern &pattern, const BSONObj &indexKey, long long nScanned ) {
+ _qcCache[ pattern ] = make_pair( indexKey, nScanned );
+ }
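+ /* illustrative use by a hypothetical caller in the query optimizer
+ (names other than the methods above are assumptions):
+ SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
+ NamespaceDetailsTransient& t = NamespaceDetailsTransient::get_inlock(ns);
+ if ( t.indexForPattern( pattern ).isEmpty() ) {
+ // no cached plan for this query shape: try candidates, then
+ // remember the winner and its nscanned for future queries.
+ t.registerIndexForPattern( pattern, winningIndexKey, nScanned );
+ }
+ */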
+
+ }; /* NamespaceDetailsTransient */
+
+ inline NamespaceDetailsTransient& NamespaceDetailsTransient::get_inlock(const char *ns) {
+ std::map< string, shared_ptr< NamespaceDetailsTransient > >::iterator i = _nsdMap.find(ns);
+ if( i != _nsdMap.end() &&
+ i->second.get() ) { // could be null ptr from clearForPrefix
+ return *i->second;
+ }
+ return make_inlock(ns);
+ }
+
+ /* NamespaceIndex is the ".ns" file you see in the data directory. It is the "system catalog"
+ if you will: at least the core parts. (Additional info in system.* collections.)
+ */
+ class NamespaceIndex {
+ public:
+ NamespaceIndex(const string &dir, const string &database) :
+ ht( 0 ), dir_( dir ), database_( database ) {}
+
+ /* returns true if new db will be created if we init lazily */
+ bool exists() const;
+
+ void init() {
+ if( !ht )
+ _init();
+ }
+
+ void add_ns(const char *ns, DiskLoc& loc, bool capped);
+ void add_ns( const char *ns, const NamespaceDetails &details );
+
+ NamespaceDetails* details(const char *ns) {
+ if ( !ht )
+ return 0;
+ Namespace n(ns);
+ NamespaceDetails *d = ht->get(n);
+ if ( d && d->capped )
+ d->cappedCheckMigrate();
+ return d;
+ }
+
+ void kill_ns(const char *ns);
+
+ bool find(const char *ns, DiskLoc& loc) {
+ NamespaceDetails *l = details(ns);
+ if ( l ) {
+ loc = l->firstExtent;
+ return true;
+ }
+ return false;
+ }
+
+ bool allocated() const { return ht != 0; }
+
+ void getNamespaces( list<string>& tofill , bool onlyCollections = true ) const;
+
+ NamespaceDetails::Extra* newExtra(const char *ns, int n, NamespaceDetails *d);
+
+ boost::filesystem::path path() const;
+
+ unsigned long long fileLength() const { return f.length(); }
+
+ private:
+ void _init();
+ void maybeMkdir() const;
+
+ MongoMMF f;
+ HashTable<Namespace,NamespaceDetails> *ht;
+ string dir_;
+ string database_;
+ };
+
+ extern string dbpath; // --dbpath parm
+ extern bool directoryperdb;
+
+ // Rename a namespace within current 'client' db.
+ // (Arguments should include db name)
+ void renameNamespace( const char *from, const char *to );
+
+
+} // namespace mongo
diff --git a/src/mongo/db/namespacestring.h b/src/mongo/db/namespacestring.h
new file mode 100644
index 00000000000..d982c5fff75
--- /dev/null
+++ b/src/mongo/db/namespacestring.h
@@ -0,0 +1,147 @@
+// @file namespacestring.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <string>
+
+namespace mongo {
+
+ using std::string;
+
+ /* in the mongo source code, "client" means "database". */
+
+ const int MaxDatabaseNameLen = 256; // max str len for the db name, including null char
+
+ /* e.g.
+ NamespaceString ns("acme.orders");
+ cout << ns.coll; // "orders"
+ */
+ class NamespaceString {
+ public:
+ string db;
+ string coll; // note collection names can have periods in them for organizing purposes (e.g. "system.indexes")
+
+ NamespaceString( const char * ns ) { init(ns); }
+ NamespaceString( const string& ns ) { init(ns.c_str()); }
+
+ string ns() const { return db + '.' + coll; }
+
+ bool isSystem() const { return strncmp(coll.c_str(), "system.", 7) == 0; }
+ bool isCommand() const { return coll == "$cmd"; }
+
+ operator string() const { return ns(); }
+
+ bool operator==( const string& nsIn ) const { return nsIn == ns(); }
+ bool operator==( const char* nsIn ) const { return (string)nsIn == ns(); }
+ bool operator==( const NamespaceString& nsIn ) const { return nsIn.db == db && nsIn.coll == coll; }
+
+ bool operator!=( const string& nsIn ) const { return nsIn != ns(); }
+ bool operator!=( const char* nsIn ) const { return (string)nsIn != ns(); }
+ bool operator!=( const NamespaceString& nsIn ) const { return nsIn.db != db || nsIn.coll != coll; }
+
+ string toString() const { return ns(); }
+
+ /**
+ * @return true if ns is 'normal'. A '$' is used in namespaces for collections holding index data, which do not contain BSON objects in their records.
+ * local.oplog.$main is special-cased -- naming it as such was a mistake.
+ */
+ static bool normal(const char* ns) {
+ const char *p = strchr(ns, '$');
+ if( p == 0 )
+ return true;
+ return strcmp( ns, "local.oplog.$main" ) == 0;
+ }
+
+ static bool special(const char *ns) {
+ return !normal(ns) || strstr(ns, ".system.");
+ }
+
+ /**
+ * samples:
+ * good:
+ * foo
+ * bar
+ * foo-bar
+ * bad:
+ * foo bar
+ * foo.bar
+ * foo"bar
+ *
+ * @param db - a possible database name
+ * @return if db is an allowed database name
+ */
+ static bool validDBName( const string& db ) {
+ if ( db.size() == 0 || db.size() > 64 )
+ return false;
+ size_t good = strcspn( db.c_str() , "/\\. \"" );
+ return good == db.size();
+ }
+
+ /**
+ * samples:
+ * good:
+ * foo.bar
+ * bad:
+ * foo.
+ *
+ * @param dbcoll - a possible collection name of the form db.coll
+ * @return if db.coll is an allowed collection name
+ */
+ static bool validCollectionName(const char* dbcoll){
+ const char *c = strchr( dbcoll, '.' ); // null if there is no '.' at all
+ return normal(dbcoll) && c && c[1]; // require a nonempty collection part
+ }
+
+ private:
+ void init(const char *ns) {
+ const char *p = strchr(ns, '.');
+ if( p == 0 ) return;
+ db = string(ns, p - ns);
+ coll = p + 1;
+ }
+ };
+
+ // "database.a.b.c" -> "database"
+ inline void nsToDatabase(const char *ns, char *database) {
+ const char *p = ns;
+ char *q = database;
+ while ( *p != '.' ) {
+ if ( *p == 0 )
+ break;
+ *q++ = *p++;
+ }
+ *q = 0;
+ if (q-database>=MaxDatabaseNameLen) {
+ log() << "nsToDatabase: ns too long. terminating, buf overrun condition" << endl;
+ dbexit( EXIT_POSSIBLE_CORRUPTION );
+ }
+ }
+ inline string nsToDatabase(const char *ns) {
+ char buf[MaxDatabaseNameLen];
+ nsToDatabase(ns, buf);
+ return buf;
+ }
+ inline string nsToDatabase(const string& ns) {
+ size_t i = ns.find( '.' );
+ if ( i == string::npos )
+ return ns;
+ return ns.substr( 0 , i );
+ }
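+ /* e.g. (illustrative):
+ nsToDatabase("foo.bar.baz") == "foo" // everything before the first '.'
+ nsToDatabase("foo") == "foo" // no '.' -> the whole string is the db name
+ */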
+
+}
diff --git a/src/mongo/db/nonce.cpp b/src/mongo/db/nonce.cpp
new file mode 100644
index 00000000000..379e88f116d
--- /dev/null
+++ b/src/mongo/db/nonce.cpp
@@ -0,0 +1,95 @@
+// nonce.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "nonce.h"
+#include "../util/time_support.h"
+
+extern int do_md5_test(void);
+
+namespace mongo {
+
+ BOOST_STATIC_ASSERT( sizeof(nonce64) == 8 );
+
+ static Security security; // needs to be static so _initialized is preset to false (see initsafe below)
+
+ Security::Security() {
+ static int n;
+ massert( 10352 , "Security is a singleton class", ++n == 1);
+ init();
+ }
+
+ NOINLINE_DECL void Security::init() {
+ if( _initialized ) return;
+ _initialized = true;
+
+#if defined(__linux__) || defined(__sunos__) || defined(__APPLE__)
+ _devrandom = new ifstream("/dev/urandom", ios::binary|ios::in);
+ massert( 10353 , "can't open /dev/urandom", _devrandom->is_open() );
+#elif defined(_WIN32)
+ srand(curTimeMicros()); // perhaps not relevant for rand_s but we might want elsewhere anyway
+#else
+ srandomdev();
+#endif
+
+#ifndef NDEBUG
+ if ( do_md5_test() )
+ massert( 10354 , "md5 unit test fails", false);
+#endif
+ }
+
+ nonce64 Security::__getNonce() {
+ dassert( _initialized );
+ nonce64 n;
+#if defined(__linux__) || defined(__sunos__) || defined(__APPLE__)
+ _devrandom->read((char*)&n, sizeof(n));
+ massert(10355 , "devrandom failed", !_devrandom->fail());
+#elif defined(_WIN32)
+ unsigned a=0, b=0;
+ assert( rand_s(&a) == 0 );
+ assert( rand_s(&b) == 0 );
+ n = (((unsigned long long)a)<<32) | b;
+#else
+ n = (((unsigned long long)random())<<32) | random();
+#endif
+ return n;
+ }
+
+ SimpleMutex nonceMutex("nonce");
+ nonce64 Security::_getNonce() {
+ // not good this is a static as gcc will mutex protect it which costs time
+ SimpleMutex::scoped_lock lk(nonceMutex);
+ if( !_initialized )
+ init();
+ return __getNonce();
+ }
+
+ nonce64 Security::getNonceDuringInit() {
+ // the mutex might not be inited yet. init phase should be one thread anyway (hopefully we don't spawn threads therein)
+ if( !security._initialized )
+ security.init();
+ return security.__getNonce();
+ }
+
+ nonce64 Security::getNonce() {
+ return security._getNonce();
+ }
+
+ // name warns us this might be a little slow (see code above)
+ unsigned goodRandomNumberSlow() { return (unsigned) Security::getNonce(); }
+
+} // namespace mongo
diff --git a/src/mongo/db/nonce.h b/src/mongo/db/nonce.h
new file mode 100644
index 00000000000..d6a147ab1c0
--- /dev/null
+++ b/src/mongo/db/nonce.h
@@ -0,0 +1,36 @@
+// @file nonce.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace mongo {
+
+ typedef unsigned long long nonce64;
+
+ struct Security {
+ Security();
+ static nonce64 getNonce();
+ static nonce64 getNonceDuringInit(); // use this version during global var constructors
+ private:
+ nonce64 _getNonce();
+ nonce64 __getNonce();
+ ifstream *_devrandom;
+ bool _initialized;
+ void init(); // can call more than once
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/oplog.cpp b/src/mongo/db/oplog.cpp
new file mode 100644
index 00000000000..342f362a28f
--- /dev/null
+++ b/src/mongo/db/oplog.cpp
@@ -0,0 +1,872 @@
+// @file oplog.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "oplog.h"
+#include "repl_block.h"
+#include "repl.h"
+#include "commands.h"
+#include "repl/rs.h"
+#include "stats/counters.h"
+#include "../util/file.h"
+#include "../util/unittest.h"
+#include "queryoptimizer.h"
+#include "ops/update.h"
+#include "ops/delete.h"
+#include "ops/query.h"
+
+namespace mongo {
+
+ void logOpForSharding( const char * opstr , const char * ns , const BSONObj& obj , BSONObj * patt );
+
+ int __findingStartInitialTimeout = 5; // configurable for testing
+
+ // cached copies of these...so don't rename them, drop them, etc.!!!
+ static NamespaceDetails *localOplogMainDetails = 0;
+ static Database *localDB = 0;
+ static NamespaceDetails *rsOplogDetails = 0;
+ void oplogCheckCloseDatabase( Database * db ) {
+ localDB = 0;
+ localOplogMainDetails = 0;
+ rsOplogDetails = 0;
+ resetSlaveCache();
+ }
+
+ static void _logOpUninitialized(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) {
+ uassert(13288, "replSet error write op to db before replSet initialized", str::startsWith(ns, "local.") || *opstr == 'n');
+ }
+
+ /** write an op to the oplog that is already built.
+ todo : make _logOpRS() call this so we don't repeat ourselves?
+ */
+ void _logOpObjRS(const BSONObj& op) {
+ DEV assertInWriteLock();
+
+ const OpTime ts = op["ts"]._opTime();
+ long long h = op["h"].numberLong();
+
+ {
+ const char *logns = rsoplog;
+ if ( rsOplogDetails == 0 ) {
+ Client::Context ctx( logns , dbpath, false);
+ localDB = ctx.db();
+ assert( localDB );
+ rsOplogDetails = nsdetails(logns);
+ massert(13389, "local.oplog.rs missing. did you drop it? if so restart server", rsOplogDetails);
+ }
+ Client::Context ctx( logns , localDB, false );
+ {
+ int len = op.objsize();
+ Record *r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len);
+ memcpy(getDur().writingPtr(r->data, len), op.objdata(), len);
+ }
+ /* todo: now() has code to handle clock skew. but if the server-to-server skew is large it will get unhappy.
+ this code (or code in now() maybe) should be improved.
+ */
+ if( theReplSet ) {
+ if( !(theReplSet->lastOpTimeWritten<ts) ) {
+ log() << "replSet error possible failover clock skew issue? " << theReplSet->lastOpTimeWritten.toString() << ' ' << endl;
+ }
+ theReplSet->lastOpTimeWritten = ts;
+ theReplSet->lastH = h;
+ ctx.getClient()->setLastOp( ts );
+ }
+ }
+ }
+
+ /** given a BSON object, create a new one at dst which is the existing (partial) object
+ with a new object element appended at the end with fieldname "o".
+
+ @param partial already build object with everything except the o member. e.g. something like:
+ { ts:..., ns:..., os2:... }
+ @param o a bson object to be added with fieldname "o"
+ @dst where to put the newly built combined object. e.g. ends up as something like:
+ { ts:..., ns:..., os2:..., o:... }
+ */
+ void append_O_Obj(char *dst, const BSONObj& partial, const BSONObj& o) {
+ const int size1 = partial.objsize() - 1; // less the EOO char
+ const int oOfs = size1+3; // 3 = byte BSONOBJTYPE + byte 'o' + byte \0
+
+ void *p = getDur().writingPtr(dst, oOfs+o.objsize()+1);
+
+ memcpy(p, partial.objdata(), size1);
+
+ // adjust overall bson object size for the o: field
+ *(static_cast<unsigned*>(p)) += o.objsize() + 1/*fieldtype byte*/ + 2/*"o" fieldname*/;
+
+ char *b = static_cast<char *>(p);
+ b += size1;
+ *b++ = (char) Object;
+ *b++ = 'o'; // { o : ... }
+ *b++ = 0; // null terminate "o" fieldname
+ memcpy(b, o.objdata(), o.objsize());
+ b += o.objsize();
+ *b = EOO;
+ }
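+ /* the size arithmetic above, spelled out (illustrative): the combined
+ object is partial's bytes minus its trailing EOO (size1), plus one
+ type byte, plus two bytes for the "o\0" fieldname, plus o.objsize()
+ bytes, plus a new trailing EOO -- hence the objsize adjustment of
+ o.objsize() + 1 + 2 and the total write of oOfs + o.objsize() + 1. */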
+
+ // global is safe as we are in write lock. we put the static outside the function to avoid the implicit mutex
+ // the compiler would use if inside the function. the reason this is static is to avoid a malloc/free for this
+ // on every logop call.
+ static BufBuilder logopbufbuilder(8*1024);
+ static void _logOpRS(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) {
+ DEV assertInWriteLock();
+
+ if ( strncmp(ns, "local.", 6) == 0 ) {
+ if ( strncmp(ns, "local.slaves", 12) == 0 )
+ resetSlaveCache();
+ return;
+ }
+
+ const OpTime ts = OpTime::now();
+ long long hashNew;
+ if( theReplSet ) {
+ massert(13312, "replSet error : logOp() but not primary?", theReplSet->box.getState().primary());
+ hashNew = (theReplSet->lastH * 131 + ts.asLL()) * 17 + theReplSet->selfId();
+ }
+ else {
+ // must be initiation
+ assert( *ns == 0 );
+ hashNew = 0;
+ }
+
+ /* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
+ instead we do a single copy to the destination position in the memory mapped file.
+ */
+
+ logopbufbuilder.reset();
+ BSONObjBuilder b(logopbufbuilder);
+ b.appendTimestamp("ts", ts.asDate());
+ b.append("h", hashNew);
+ b.append("op", opstr);
+ b.append("ns", ns);
+ if ( bb )
+ b.appendBool("b", *bb);
+ if ( o2 )
+ b.append("o2", *o2);
+ BSONObj partial = b.done();
+ int posz = partial.objsize();
+ int len = posz + obj.objsize() + 1 + 2 /*o:*/;
+
+ Record *r;
+ DEV assert( logNS == 0 );
+ {
+ const char *logns = rsoplog;
+ if ( rsOplogDetails == 0 ) {
+ Client::Context ctx( logns , dbpath, false);
+ localDB = ctx.db();
+ assert( localDB );
+ rsOplogDetails = nsdetails(logns);
+ massert(13347, "local.oplog.rs missing. did you drop it? if so restart server", rsOplogDetails);
+ }
+ Client::Context ctx( logns , localDB, false );
+ r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len);
+ /* todo: now() has code to handle clock skew. but if the server-to-server skew is large it will get unhappy.
+ this code (or code in now() maybe) should be improved.
+ */
+ if( theReplSet ) {
+ if( !(theReplSet->lastOpTimeWritten<ts) ) {
+ log() << "replSet ERROR possible failover clock skew issue? " << theReplSet->lastOpTimeWritten << ' ' << ts << rsLog;
+ log() << "replSet " << theReplSet->isPrimary() << rsLog;
+ }
+ theReplSet->lastOpTimeWritten = ts;
+ theReplSet->lastH = hashNew;
+ ctx.getClient()->setLastOp( ts );
+ }
+ }
+
+ append_O_Obj(r->data, partial, obj);
+
+ if ( logLevel >= 6 ) {
+ BSONObj temp(r);
+ log( 6 ) << "logOp:" << temp << endl;
+ }
+ }
+
+ /* we write to local.oplog.$main:
+ { ts : ..., op: ..., ns: ..., o: ... }
+ ts: an OpTime timestamp
+ op:
+ "i" insert
+ "u" update
+ "d" delete
+ "c" db cmd
+ "db" declares presence of a database (ns is set to the db name + '.')
+ "n" no op
+ logNS - where to log it. 0/null means "local.oplog.$main".
+ bb:
+ if not null, specifies a boolean to pass along to the other side as b: param.
+ used for "justOne" or "upsert" flags on 'd', 'u'
+ first: true
+ when set, indicates this is the first thing we have logged for this database.
+ thus, the slave does not need to copy down all the data when it sees this.
+
+ note this is used for single collection logging even when --replSet is enabled.
+ */
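+ /* e.g., an insert op as it might appear in the oplog (illustrative values):
+ { ts: Timestamp(1315000000, 1), op: "i", ns: "test.foo",
+ o: { _id: ObjectId("..."), x: 1 } }
+ */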
+ static void _logOpOld(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) {
+ DEV assertInWriteLock();
+ static BufBuilder bufbuilder(8*1024);
+
+ if ( strncmp(ns, "local.", 6) == 0 ) {
+ if ( strncmp(ns, "local.slaves", 12) == 0 ) {
+ resetSlaveCache();
+ }
+ return;
+ }
+
+ const OpTime ts = OpTime::now();
+ Client::Context context("",0,false);
+
+ /* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
+ instead we do a single copy to the destination position in the memory mapped file.
+ */
+
+ bufbuilder.reset();
+ BSONObjBuilder b(bufbuilder);
+ b.appendTimestamp("ts", ts.asDate());
+ b.append("op", opstr);
+ b.append("ns", ns);
+ if ( bb )
+ b.appendBool("b", *bb);
+ if ( o2 )
+ b.append("o2", *o2);
+ BSONObj partial = b.done(); // partial is everything except the o:... part.
+
+ int po_sz = partial.objsize();
+ int len = po_sz + obj.objsize() + 1 + 2 /*o:*/;
+
+ Record *r;
+ if( logNS == 0 ) {
+ logNS = "local.oplog.$main";
+ if ( localOplogMainDetails == 0 ) {
+ Client::Context ctx( logNS , dbpath, false);
+ localDB = ctx.db();
+ assert( localDB );
+ localOplogMainDetails = nsdetails(logNS);
+ assert( localOplogMainDetails );
+ }
+ Client::Context ctx( logNS , localDB, false );
+ r = theDataFileMgr.fast_oplog_insert(localOplogMainDetails, logNS, len);
+ }
+ else {
+ Client::Context ctx( logNS, dbpath, false );
+ assert( nsdetails( logNS ) );
+ // first we allocate the space, then we fill it below.
+ r = theDataFileMgr.fast_oplog_insert( nsdetails( logNS ), logNS, len);
+ }
+
+ append_O_Obj(r->data, partial, obj);
+
+ context.getClient()->setLastOp( ts );
+
+ if ( logLevel >= 6 ) {
+ BSONObj temp(r);
+ log( 6 ) << "logging op:" << temp << endl;
+ }
+
+ }
+
+ static void (*_logOp)(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) = _logOpOld;
+ void newReplUp() {
+ replSettings.master = true;
+ _logOp = _logOpRS;
+ }
+ void newRepl() {
+ replSettings.master = true;
+ _logOp = _logOpUninitialized;
+ }
+ void oldRepl() { _logOp = _logOpOld; }
+
+ void logKeepalive() {
+ _logOp("n", "", 0, BSONObj(), 0, 0);
+ }
+ void logOpComment(const BSONObj& obj) {
+ _logOp("n", "", 0, obj, 0, 0);
+ }
+ void logOpInitiate(const BSONObj& obj) {
+ _logOpRS("n", "", 0, obj, 0, 0);
+ }
+
+ /** @param opstr:
+ c userCreateNS
+ i insert
+ n no-op / keepalive
+ d delete / remove
+ u update
+ */
+ void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt, bool *b) {
+ if ( replSettings.master ) {
+ _logOp(opstr, ns, 0, obj, patt, b);
+ }
+
+ logOpForSharding( opstr , ns , obj , patt );
+ }
+
+ void createOplog() {
+ dblock lk;
+
+ const char * ns = "local.oplog.$main";
+
+ bool rs = !cmdLine._replSet.empty();
+ if( rs )
+ ns = rsoplog;
+
+ Client::Context ctx(ns);
+
+ NamespaceDetails * nsd = nsdetails( ns );
+
+ if ( nsd ) {
+
+ if ( cmdLine.oplogSize != 0 ) {
+ int o = (int)(nsd->storageSize() / ( 1024 * 1024 ) );
+ int n = (int)(cmdLine.oplogSize / ( 1024 * 1024 ) );
+ if ( n != o ) {
+ stringstream ss;
+ ss << "cmdline oplogsize (" << n << ") different than existing (" << o << ") see: http://dochub.mongodb.org/core/increase-oplog";
+ log() << ss.str() << endl;
+ throw UserException( 13257 , ss.str() );
+ }
+ }
+
+ if( rs ) return;
+
+ DBDirectClient c;
+ BSONObj lastOp = c.findOne( ns, Query().sort(reverseNaturalObj) );
+ if ( !lastOp.isEmpty() ) {
+ OpTime::setLast( lastOp[ "ts" ].date() );
+ }
+ return;
+ }
+
+ /* create an oplog collection, if it doesn't yet exist. */
+ BSONObjBuilder b;
+ double sz;
+ if ( cmdLine.oplogSize != 0 )
+ sz = (double)cmdLine.oplogSize;
+ else {
+ /* not specified. pick a default size */
+ sz = 50.0 * 1000 * 1000;
+ if ( sizeof(int *) >= 8 ) {
+#if defined(__APPLE__)
+ // typically these are desktops (dev machines), so keep it smallish
+ sz = (256-64) * 1000 * 1000;
+#else
+ sz = 990.0 * 1000 * 1000;
+ boost::intmax_t free = File::freeSpace(dbpath); //-1 if call not supported.
+ double fivePct = free * 0.05;
+ if ( fivePct > sz )
+ sz = fivePct;
+#endif
+ }
+ }
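+ /* illustrative sizing on a 64-bit non-Apple build: start at 990MB and take
+ 5% of free disk space if that is larger, e.g. with 100GB free,
+ sz = max(990MB, 5GB) = 5GB. if freeSpace() is unsupported it returns -1,
+ fivePct goes negative, and the 990MB default stands. */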
+
+ log() << "******" << endl;
+ log() << "creating replication oplog of size: " << (int)( sz / ( 1024 * 1024 ) ) << "MB..." << endl;
+
+ b.append("size", sz);
+ b.appendBool("capped", 1);
+ b.appendBool("autoIndexId", false);
+
+ string err;
+ BSONObj o = b.done();
+ userCreateNS(ns, o, err, false);
+ if( !rs )
+ logOp( "n", "", BSONObj() );
+
+ /* sync here so we don't get any surprising lag later when we try to sync */
+ MemoryMappedFile::flushAll(true);
+ log() << "******" << endl;
+ }
+
+ // -------------------------------------
+
+ FindingStartCursor::FindingStartCursor( const QueryPlan & qp ) :
+ _qp( qp ),
+ _findingStart( true ),
+ _findingStartMode()
+ { init(); }
+
+ void FindingStartCursor::next() {
+ if ( !_findingStartCursor || !_findingStartCursor->ok() ) {
+ _findingStart = false;
+ _c = _qp.newCursor(); // on error, start from beginning
+ destroyClientCursor();
+ return;
+ }
+ switch( _findingStartMode ) {
+ // Initial mode: scan backwards from end of collection
+ case Initial: {
+ if ( !_matcher->matchesCurrent( _findingStartCursor->c() ) ) {
+ _findingStart = false; // found first record out of query range, so scan normally
+ _c = _qp.newCursor( _findingStartCursor->currLoc() );
+ destroyClientCursor();
+ return;
+ }
+ _findingStartCursor->advance();
+ RARELY {
+ if ( _findingStartTimer.seconds() >= __findingStartInitialTimeout ) {
+ // If we've scanned enough, switch to find extent mode.
+ createClientCursor( extentFirstLoc( _findingStartCursor->currLoc() ) );
+ _findingStartMode = FindExtent;
+ return;
+ }
+ }
+ return;
+ }
+ // FindExtent mode: moving backwards through extents, check first
+ // document of each extent.
+ case FindExtent: {
+ if ( !_matcher->matchesCurrent( _findingStartCursor->c() ) ) {
+ _findingStartMode = InExtent;
+ return;
+ }
+ DiskLoc prev = prevExtentFirstLoc( _findingStartCursor->currLoc() );
+ if ( prev.isNull() ) { // hit beginning, so start scanning from here
+ createClientCursor();
+ _findingStartMode = InExtent;
+ return;
+ }
+ // There might be a more efficient implementation than creating new cursor & client cursor each time,
+ // not worrying about that for now
+ createClientCursor( prev );
+ return;
+ }
+ // InExtent mode: once an extent is chosen, find starting doc in the extent.
+ case InExtent: {
+ if ( _matcher->matchesCurrent( _findingStartCursor->c() ) ) {
+ _findingStart = false; // found first record in query range, so scan normally
+ _c = _qp.newCursor( _findingStartCursor->currLoc() );
+ destroyClientCursor();
+ return;
+ }
+ _findingStartCursor->advance();
+ return;
+ }
+ default: {
+ massert( 14038, "invalid _findingStartMode", false );
+ }
+ }
+ }
+
+ DiskLoc FindingStartCursor::extentFirstLoc( const DiskLoc &rec ) {
+ Extent *e = rec.rec()->myExtent( rec );
+ if ( !_qp.nsd()->capLooped() || ( e->myLoc != _qp.nsd()->capExtent ) )
+ return e->firstRecord;
+ // Likely we are on the fresh side of capExtent, so return first fresh record.
+ // If we are on the stale side of capExtent, then the collection is small and it
+ // doesn't matter if we start the extent scan with capFirstNewRecord.
+ return _qp.nsd()->capFirstNewRecord;
+ }
+
+ void wassertExtentNonempty( const Extent *e ) {
+ // TODO ensure this requirement is clearly enforced, or fix.
+ wassert( !e->firstRecord.isNull() );
+ }
+
+ DiskLoc FindingStartCursor::prevExtentFirstLoc( const DiskLoc &rec ) {
+ Extent *e = rec.rec()->myExtent( rec );
+ if ( _qp.nsd()->capLooped() ) {
+ if ( e->xprev.isNull() ) {
+ e = _qp.nsd()->lastExtent.ext();
+ }
+ else {
+ e = e->xprev.ext();
+ }
+ if ( e->myLoc != _qp.nsd()->capExtent ) {
+ wassertExtentNonempty( e );
+ return e->firstRecord;
+ }
+ }
+ else {
+ if ( !e->xprev.isNull() ) {
+ e = e->xprev.ext();
+ wassertExtentNonempty( e );
+ return e->firstRecord;
+ }
+ }
+ return DiskLoc(); // reached beginning of collection
+ }
+
+ void FindingStartCursor::createClientCursor( const DiskLoc &startLoc ) {
+ shared_ptr<Cursor> c = _qp.newCursor( startLoc );
+ _findingStartCursor.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns()) );
+ }
+
+ bool FindingStartCursor::firstDocMatchesOrEmpty() const {
+ shared_ptr<Cursor> c = _qp.newCursor();
+ return !c->ok() || _matcher->matchesCurrent( c.get() );
+ }
+
+ void FindingStartCursor::init() {
+ BSONElement tsElt = _qp.originalQuery()[ "ts" ];
+ massert( 13044, "no ts field in query", !tsElt.eoo() );
+ BSONObjBuilder b;
+ b.append( tsElt );
+ BSONObj tsQuery = b.obj();
+ _matcher.reset(new CoveredIndexMatcher(tsQuery, _qp.indexKey()));
+ if ( firstDocMatchesOrEmpty() ) {
+ _c = _qp.newCursor();
+ _findingStart = false;
+ return;
+ }
+ // Use a ClientCursor here so we can release db mutex while scanning
+ // oplog (can take quite a while with large oplogs).
+ shared_ptr<Cursor> c = _qp.newReverseCursor();
+ _findingStartCursor.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns(), BSONObj()) );
+ _findingStartTimer.reset();
+ _findingStartMode = Initial;
+ }
+
+ // -------------------------------------
+
+ struct TestOpTime : public UnitTest {
+ void run() {
+ OpTime t;
+ for ( int i = 0; i < 10; i++ ) {
+ OpTime s = OpTime::now_inlock();
+ assert( s != t );
+ t = s;
+ }
+ OpTime q = t;
+ assert( q == t );
+ assert( !(q != t) );
+ }
+ } testoptime;
+
+ int _dummy_z;
+
+ void pretouchN(vector<BSONObj>& v, unsigned a, unsigned b) {
+ DEV assert( !d.dbMutex.isWriteLocked() );
+
+ Client *c = currentClient.get();
+ if( c == 0 ) {
+ Client::initThread("pretouchN");
+ c = &cc();
+ }
+
+ readlock lk("");
+ for( unsigned i = a; i <= b; i++ ) {
+ const BSONObj& op = v[i];
+ const char *which = "o";
+ const char *opType = op.getStringField("op");
+ if ( *opType == 'i' )
+ ;
+ else if( *opType == 'u' )
+ which = "o2";
+ else
+ continue;
+ /* todo : other operations */
+
+ try {
+ BSONObj o = op.getObjectField(which);
+ BSONElement _id;
+ if( o.getObjectID(_id) ) {
+ const char *ns = op.getStringField("ns");
+ BSONObjBuilder b;
+ b.append(_id);
+ BSONObj result;
+ Client::Context ctx( ns );
+ if( Helpers::findById(cc(), ns, b.done(), result) )
+ _dummy_z += result.objsize(); // touch
+ }
+ }
+ catch( DBException& e ) {
+ log() << "ignoring assertion in pretouchN() " << a << ' ' << b << ' ' << i << ' ' << e.toString() << endl;
+ }
+ }
+ }
+
+ void pretouchOperation(const BSONObj& op) {
+
+ if( d.dbMutex.isWriteLocked() )
+ return; // no point pretouching if write locked. not sure if this will ever fire, but just in case.
+
+ const char *which = "o";
+ const char *opType = op.getStringField("op");
+ if ( *opType == 'i' )
+ ;
+ else if( *opType == 'u' )
+ which = "o2";
+ else
+ return;
+ /* todo : other operations */
+
+ try {
+ BSONObj o = op.getObjectField(which);
+ BSONElement _id;
+ if( o.getObjectID(_id) ) {
+ const char *ns = op.getStringField("ns");
+ BSONObjBuilder b;
+ b.append(_id);
+ BSONObj result;
+ readlock lk(ns);
+ Client::Context ctx( ns );
+ if( Helpers::findById(cc(), ns, b.done(), result) )
+ _dummy_z += result.objsize(); // touch
+ }
+ }
+ catch( DBException& ) {
+ log() << "ignoring assertion in pretouchOperation()" << endl;
+ }
+ }
+
+ BSONObj Sync::getMissingDoc(const BSONObj& o) {
+ OplogReader missingObjReader;
+
+ uassert(15916, str::stream() << "Can no longer connect to initial sync source: " << hn, missingObjReader.connect(hn));
+
+ const char *ns = o.getStringField("ns");
+ // might be more than just _id in the update criteria
+ BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj();
+ BSONObj missingObj;
+ try {
+ missingObj = missingObjReader.findOne(ns, query);
+ } catch(DBException& e) {
+ log() << "replication assertion fetching missing object: " << e.what() << endl;
+ throw;
+ }
+
+ return missingObj;
+ }
+
+ bool Sync::shouldRetry(const BSONObj& o) {
+ // we don't have the object yet, which is possible on initial sync. get it.
+ log() << "replication info adding missing object" << endl; // rare enough we can log
+
+ BSONObj missingObj = getMissingDoc(o);
+
+ if( missingObj.isEmpty() ) {
+ log() << "replication missing object not found on source. presumably deleted later in oplog" << endl;
+ log() << "replication o2: " << o.getObjectField("o2").toString() << endl;
+ log() << "replication o firstfield: " << o.getObjectField("o").firstElementFieldName() << endl;
+
+ return false;
+ }
+ else {
+ const char *ns = o.getStringField("ns");
+ Client::Context ctx(ns);
+ DiskLoc d = theDataFileMgr.insert(ns, (void*) missingObj.objdata(), missingObj.objsize());
+ uassert(15917, "Got bad disk location when attempting to insert", !d.isNull());
+
+ return true;
+ }
+ }
+
+ /** @param fromRepl false if from ApplyOpsCmd
+ @return true if it was an update that should have happened but the document DNE. see replset initial sync code.
+ */
+ bool applyOperation_inlock(const BSONObj& op , bool fromRepl ) {
+ assertInWriteLock();
+ LOG(6) << "applying op: " << op << endl;
+ bool failedUpdate = false;
+
+ OpCounters * opCounters = fromRepl ? &replOpCounters : &globalOpCounters;
+
+ const char *names[] = { "o", "ns", "op", "b" };
+ BSONElement fields[4];
+ op.getFields(4, names, fields);
+
+ BSONObj o;
+ if( fields[0].isABSONObj() )
+ o = fields[0].embeddedObject();
+
+ const char *ns = fields[1].valuestrsafe();
+
+ // operation type -- see logOp() comments for types
+ const char *opType = fields[2].valuestrsafe();
+
+ if ( *opType == 'i' ) {
+ opCounters->gotInsert();
+
+ const char *p = strchr(ns, '.');
+ if ( p && strcmp(p, ".system.indexes") == 0 ) {
+ // updates aren't allowed for indexes -- so we will do a regular insert. if index already
+ // exists, that is ok.
+ theDataFileMgr.insert(ns, (void*) o.objdata(), o.objsize());
+ }
+ else {
+ // do upserts for inserts as we might get replayed more than once
+ OpDebug debug;
+ BSONElement _id;
+ if( !o.getObjectID(_id) ) {
+ /* No _id. This will be very slow. */
+ Timer t;
+ updateObjects(ns, o, o, true, false, false, debug );
+ if( t.millis() >= 2 ) {
+ RARELY OCCASIONALLY log() << "warning, repl doing slow updates (no _id field) for " << ns << endl;
+ }
+ }
+ else {
+ /* erh 10/16/2009 - this is probably not relevant any more since it's auto-created, but not worth removing */
+ RARELY ensureHaveIdIndex(ns); // otherwise updates will be slow
+
+ /* todo : it may be better to do an insert here, and then catch the dup key exception and do update
+ then. very few upserts will not be inserts...
+ */
+ BSONObjBuilder b;
+ b.append(_id);
+ updateObjects(ns, o, b.done(), true, false, false , debug );
+ }
+ }
+ }
+ else if ( *opType == 'u' ) {
+ opCounters->gotUpdate();
+ // dm do we create this for a capped collection?
+ // - if not, updates would be slow
+ // - but if were by id would be slow on primary too so maybe ok
+ // - if on primary was by another key and there are other indexes, this could be very bad w/out an index
+ // - if do create, odd to have on secondary but not primary. also can cause secondary to block for
+ // quite a while on creation.
+ RARELY ensureHaveIdIndex(ns); // otherwise updates will be super slow
+ OpDebug debug;
+ BSONObj updateCriteria = op.getObjectField("o2");
+ bool upsert = fields[3].booleanSafe();
+ UpdateResult ur = updateObjects(ns, o, updateCriteria, upsert, /*multi*/ false, /*logop*/ false , debug );
+ if( ur.num == 0 ) {
+ if( ur.mod ) {
+ if( updateCriteria.nFields() == 1 ) {
+ // was a simple { _id : ... } update criteria
+ failedUpdate = true;
+ // todo: probably should assert in these failedUpdate cases if not in initialSync
+ }
+ // need to check to see if it isn't present so we can set failedUpdate correctly.
+ // note this adds some overhead for the extra check in some cases, such as an updateCriteria
+ // of the form
+ // { _id:..., x : { $size:... } }
+ // thus this is not ideal.
+ else {
+ NamespaceDetails *nsd = nsdetails(ns);
+
+ if (nsd == NULL ||
+ (nsd->findIdIndex() >= 0 && Helpers::findById(nsd, updateCriteria).isNull()) ||
+ // capped collections won't have an _id index
+ (nsd->findIdIndex() < 0 && Helpers::findOne(ns, updateCriteria, false).isNull())) {
+ failedUpdate = true;
+ }
+
+ // Otherwise, it's present; zero objects were updated because of additional specifiers
+ // in the query for idempotence
+ }
+ }
+ else {
+ // this could happen benignly on a duplicate oplog replay of an upsert
+ // (because we are idempotent);
+ // if a regular non-mod update fails, the item is (presumably) missing.
+ if( !upsert ) {
+ failedUpdate = true;
+ }
+ }
+ }
+ }
+ else if ( *opType == 'd' ) {
+ opCounters->gotDelete();
+ if ( opType[1] == 0 )
+ deleteObjects(ns, o, /*justOne*/ fields[3].booleanSafe());
+ else
+ assert( opType[1] == 'b' ); // "db" advertisement
+ }
+ else if ( *opType == 'c' ) {
+ opCounters->gotCommand();
+ BufBuilder bb;
+ BSONObjBuilder ob;
+ _runCommands(ns, o, bb, ob, true, 0);
+ }
+ else if ( *opType == 'n' ) {
+ // no op
+ }
+ else {
+ throw MsgAssertionException( 14825 , ErrorMsg("error in applyOperation : unknown opType ", *opType) );
+ }
+ return failedUpdate;
+ }
+
+ class ApplyOpsCmd : public Command {
+ public:
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ ApplyOpsCmd() : Command( "applyOps" ) {}
+ virtual void help( stringstream &help ) const {
+ help << "internal (sharding)\n{ applyOps : [ ] , preCondition : [ { ns : ... , q : ... , res : ... } ] }";
+ }
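+ /* illustrative invocation from the shell (hypothetical values):
+ db.runCommand( { applyOps: [ { op: "u", ns: "test.foo",
+ o2: { _id: 1 }, o: { _id: 1, x: 2 } } ],
+ preCondition: [ { ns: "test.foo", q: { _id: 1 },
+ res: { x: 1 } } ] } )
+ // fails with "pre-condition failed" unless the document matching
+ // { _id: 1 } in test.foo currently matches { x: 1 }.
+ */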
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+
+ if ( cmdObj.firstElement().type() != Array ) {
+ errmsg = "ops has to be an array";
+ return false;
+ }
+
+ BSONObj ops = cmdObj.firstElement().Obj();
+
+ {
+ // check input
+ BSONObjIterator i( ops );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.type() == Object )
+ continue;
+ errmsg = "op not an object: ";
+ errmsg += e.fieldName();
+ return false;
+ }
+ }
+
+ if ( cmdObj["preCondition"].type() == Array ) {
+ BSONObjIterator i( cmdObj["preCondition"].Obj() );
+ while ( i.more() ) {
+ BSONObj f = i.next().Obj();
+
+ BSONObj realres = db.findOne( f["ns"].String() , f["q"].Obj() );
+
+ Matcher m( f["res"].Obj() );
+ if ( ! m.matches( realres ) ) {
+ result.append( "got" , realres );
+ result.append( "whatFailed" , f );
+ errmsg = "pre-condition failed";
+ return false;
+ }
+ }
+ }
+
+ // apply
+ int num = 0;
+ BSONObjIterator i( ops );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ // todo SERVER-4259 ?
+ applyOperation_inlock( e.Obj() , false );
+ num++;
+ }
+
+ result.append( "applied" , num );
+
+ if ( ! fromRepl ) {
+ // We want this applied atomically on slaves
+ // so we re-wrap without the pre-condition for speed
+
+ string tempNS = str::stream() << dbname << ".$cmd";
+
+ logOp( "c" , tempNS.c_str() , cmdObj.firstElement().wrap() );
+ }
+
+ return true;
+ }
+
+ DBDirectClient db;
+
+ } applyOpsCmd;
+
+}
diff --git a/src/mongo/db/oplog.h b/src/mongo/db/oplog.h
new file mode 100644
index 00000000000..6c1644fe3ab
--- /dev/null
+++ b/src/mongo/db/oplog.h
@@ -0,0 +1,149 @@
+// oplog.h - writing to and reading from oplog
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* local.oplog.$main is the default */
+
+#pragma once
+
+#include "pdfile.h"
+#include "db.h"
+#include "dbhelpers.h"
+#include "clientcursor.h"
+#include "../client/dbclient.h"
+#include "../util/optime.h"
+#include "../util/timer.h"
+
+namespace mongo {
+
+ void createOplog();
+
+ void _logOpObjRS(const BSONObj& op);
+
+ /** Write operation to the log (local.oplog.$main)
+
+ @param opstr
+ "i" insert
+ "u" update
+ "d" delete
+ "c" db cmd
+ "n" no-op
+ "db" declares presence of a database (ns is set to the db name + '.')
+
+ See _logOp() in oplog.cpp for more details.
+ */
+ void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt = 0, bool *b = 0);
+
+ void logKeepalive();
+
+ /** puts obj in the oplog as a comment (a no-op). Just for diags.
+ convention is
+ { msg : "text", ... }
+ */
+ void logOpComment(const BSONObj& obj);
+
+ void oplogCheckCloseDatabase( Database * db );
+
+ extern int __findingStartInitialTimeout; // configurable for testing
+
+ class QueryPlan;
+
+ /** Implements an optimized procedure for finding the first op in the oplog. */
+ class FindingStartCursor {
+ public:
+
+ /**
+ * The cursor will attempt to find the first op in the oplog matching the
+ * 'ts' field of the qp's query.
+ */
+ FindingStartCursor( const QueryPlan & qp );
+
+ /** @return true if the first matching op in the oplog has been found. */
+ bool done() const { return !_findingStart; }
+
+ /** @return cursor pointing to the first matching op, if done(). */
+ shared_ptr<Cursor> cursor() { verify( 14835, done() ); return _c; }
+
+ /** Iterate the cursor, to continue trying to find matching op. */
+ void next();
+
+ /** Yield cursor, if not done(). */
+ bool prepareToYield() {
+ if ( _findingStartCursor ) {
+ return _findingStartCursor->prepareToYield( _yieldData );
+ }
+ return false;
+ }
+
+ /** Recover from cursor yield. */
+ void recoverFromYield() {
+ if ( _findingStartCursor ) {
+ if ( !ClientCursor::recoverFromYield( _yieldData ) ) {
+ _findingStartCursor.reset( 0 );
+ msgassertedNoTrace( 15889, "FindingStartCursor::recoverFromYield() failed to recover" );
+ }
+ }
+ }
+ private:
+ enum FindingStartMode { Initial, FindExtent, InExtent };
+ const QueryPlan &_qp;
+ bool _findingStart;
+ FindingStartMode _findingStartMode;
+ auto_ptr< CoveredIndexMatcher > _matcher;
+ Timer _findingStartTimer;
+ ClientCursor::CleanupPointer _findingStartCursor;
+ shared_ptr<Cursor> _c;
+ ClientCursor::YieldData _yieldData;
+ DiskLoc extentFirstLoc( const DiskLoc &rec );
+
+ DiskLoc prevExtentFirstLoc( const DiskLoc &rec );
+ void createClientCursor( const DiskLoc &startLoc = DiskLoc() );
+ void destroyClientCursor() {
+ _findingStartCursor.reset( 0 );
+ }
+ void init();
+ bool firstDocMatchesOrEmpty() const;
+ };
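+ /* illustrative driver loop for the class above (hypothetical caller):
+ FindingStartCursor fsc( qp );
+ while ( !fsc.done() ) {
+ fsc.next(); // a real caller would interleave prepareToYield() /
+ // recoverFromYield() here for long scans
+ }
+ shared_ptr<Cursor> c = fsc.cursor(); // positioned at the first matching op
+ */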
+
+ class Sync {
+ protected:
+ string hn;
+ public:
+ Sync(const string& hostname) : hn(hostname) {}
+ virtual ~Sync() {}
+ virtual BSONObj getMissingDoc(const BSONObj& o);
+
+ /**
+ * If applyOperation_inlock should be called again after an update fails.
+ */
+ virtual bool shouldRetry(const BSONObj& o);
+ };
+
+ void pretouchOperation(const BSONObj& op);
+ void pretouchN(vector<BSONObj>&, unsigned a, unsigned b);
+
+ /**
+ * take an op and apply locally
+ * used for applying from an oplog
+ * @param fromRepl really from replication or for testing/internal/command/etc...
+ * @return true if the op was an update that could not be applied (failure)
+ */
+ bool applyOperation_inlock(const BSONObj& op , bool fromRepl = true );
+}
diff --git a/src/mongo/db/oplogreader.h b/src/mongo/db/oplogreader.h
new file mode 100644
index 00000000000..6efd1469c01
--- /dev/null
+++ b/src/mongo/db/oplogreader.h
@@ -0,0 +1,121 @@
+/** @file oplogreader.h */
+
+#pragma once
+
+#include "../client/dbclient.h"
+#include "../client/constants.h"
+#include "dbhelpers.h"
+
+namespace mongo {
+
+ /* started abstracting out the querying of the primary/master's oplog.
+ still fairly awkward, but a start.
+ */
+ class OplogReader {
+ shared_ptr<DBClientConnection> _conn;
+ shared_ptr<DBClientCursor> cursor;
+ public:
+ OplogReader() { }
+ ~OplogReader() { }
+ void resetCursor() { cursor.reset(); }
+ void resetConnection() {
+ cursor.reset();
+ _conn.reset();
+ }
+ DBClientConnection* conn() { return _conn.get(); }
+ BSONObj findOne(const char *ns, const Query& q) {
+ return conn()->findOne(ns, q, 0, QueryOption_SlaveOk);
+ }
+ BSONObj getLastOp(const char *ns) {
+ return findOne(ns, Query().sort(reverseNaturalObj));
+ }
+
+ /* ok to call if already connected */
+ bool connect(string hostname);
+
+ bool connect(const BSONObj& rid, const int from, const string& to);
+
+ void tailCheck() {
+ if( cursor.get() && cursor->isDead() ) {
+ log() << "repl: old cursor isDead, will initiate a new one" << endl;
+ resetCursor();
+ }
+ }
+
+ bool haveCursor() { return cursor.get() != 0; }
+
+ /** this is ok but commented out: when used, one should consider whether QueryOption_OplogReplay
+ is needed; if not, fine, but if so, this needs to change.
+ *//*
+ void query(const char *ns, const BSONObj& query) {
+ assert( !haveCursor() );
+ cursor.reset( _conn->query(ns, query, 0, 0, 0, QueryOption_SlaveOk).release() );
+ }*/
+
+ /** this can be used; it is commented out as it does not indicate
+ QueryOption_OplogReplay and that is likely important. could be uncommented
+ just need to add that.
+ */
+ /*
+ void queryGTE(const char *ns, OpTime t) {
+ BSONObjBuilder q;
+ q.appendDate("$gte", t.asDate());
+ BSONObjBuilder q2;
+ q2.append("ts", q.done());
+ query(ns, q2.done());
+ }
+ */
+
+ void tailingQuery(const char *ns, const BSONObj& query, const BSONObj* fields=0) {
+ assert( !haveCursor() );
+ log(2) << "repl: " << ns << ".find(" << query.toString() << ')' << endl;
+ cursor.reset( _conn->query( ns, query, 0, 0, fields,
+ QueryOption_CursorTailable | QueryOption_SlaveOk | QueryOption_OplogReplay |
+ /* TODO: slaveOk maybe shouldn't use? */
+ QueryOption_AwaitData
+ ).release() );
+ }
+
+ void tailingQueryGTE(const char *ns, OpTime t, const BSONObj* fields=0) {
+ BSONObjBuilder q;
+ q.appendDate("$gte", t.asDate());
+ BSONObjBuilder query;
+ query.append("ts", q.done());
+ tailingQuery(ns, query.done(), fields);
+ }
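+ /* e.g., tailingQueryGTE("local.oplog.rs", t) issues, illustratively,
+ find({ ts: { $gte: t } }) with the tailable / slaveOk / oplogReplay /
+ awaitData options set by tailingQuery() above. */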
+
+ /* Do a tailing query, but only send the ts field back. */
+ void ghostQueryGTE(const char *ns, OpTime t) {
+ const BSONObj fields = BSON("ts" << 1 << "_id" << 0);
+ return tailingQueryGTE(ns, t, &fields);
+ }
+
+ bool more() {
+ uassert( 15910, "Doesn't have cursor for reading oplog", cursor.get() );
+ return cursor->more();
+ }
+
+ bool moreInCurrentBatch() {
+ uassert( 15911, "Doesn't have cursor for reading oplog", cursor.get() );
+ return cursor->moreInCurrentBatch();
+ }
+
+ /* old mongod's can't do the await flag... */
+ bool awaitCapable() {
+ return cursor->hasResultFlag(ResultFlag_AwaitCapable);
+ }
+
+ void peek(vector<BSONObj>& v, int n) {
+ if( cursor.get() )
+ cursor->peek(v,n);
+ }
+ BSONObj nextSafe() { return cursor->nextSafe(); }
+ BSONObj next() { return cursor->next(); }
+ void putBack(BSONObj op) { cursor->putBack(op); }
+
+ private:
+ bool commonConnect(const string& hostName);
+ bool passthroughHandshake(const BSONObj& rid, const int f);
+ };
+
+}
diff --git a/src/mongo/db/ops/count.cpp b/src/mongo/db/ops/count.cpp
new file mode 100644
index 00000000000..3c183596b9d
--- /dev/null
+++ b/src/mongo/db/ops/count.cpp
@@ -0,0 +1,103 @@
+// count.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "count.h"
+
+#include "../client.h"
+#include "../clientcursor.h"
+#include "../namespace.h"
+#include "../queryutil.h"
+
+namespace mongo {
+
+ long long runCount( const char *ns, const BSONObj &cmd, string &err ) {
+ Client::Context cx(ns);
+ NamespaceDetails *d = nsdetails( ns );
+ if ( !d ) {
+ err = "ns missing";
+ return -1;
+ }
+ BSONObj query = cmd.getObjectField("query");
+
+ // count of all objects
+ if ( query.isEmpty() ) {
+ return applySkipLimit( d->stats.nrecords , cmd );
+ }
+
+ string exceptionInfo;
+ long long count = 0;
+ long long skip = cmd["skip"].numberLong();
+ long long limit = cmd["limit"].numberLong();
+ bool simpleEqualityMatch;
+ shared_ptr<Cursor> cursor = NamespaceDetailsTransient::getCursor( ns, query, BSONObj(), false, &simpleEqualityMatch );
+ ClientCursor::CleanupPointer ccPointer;
+ ElapsedTracker timeToStartYielding( 256, 20 );
+ try {
+ while( cursor->ok() ) {
+ if ( !ccPointer ) {
+ if ( timeToStartYielding.intervalHasElapsed() ) {
+ // Lazily construct a ClientCursor, avoiding a performance regression when scanning a very
+ // small number of documents.
+ ccPointer.reset( new ClientCursor( QueryOption_NoCursorTimeout, cursor, ns ) );
+ }
+ }
+ else if ( !ccPointer->yieldSometimes( simpleEqualityMatch ? ClientCursor::DontNeed : ClientCursor::MaybeCovered ) ||
+ !cursor->ok() ) {
+ break;
+ }
+
+ // With simple equality matching there is no need to use the matcher because the bounds
+ // are enforced by the FieldRangeVectorIterator and only key fields have constraints. There
+ // is no need to do key deduping because an exact value is specified in the query for all key
+ // fields and duplicate keys are not allowed per document.
+ // NOTE In the distant past we used a min/max bounded BtreeCursor with a shallow
+ // equality comparison to check for matches in the simple match case. That may be
+ // more performant, but I don't think we've measured the performance.
+ if ( simpleEqualityMatch ||
+ ( cursor->currentMatches() && !cursor->getsetdup( cursor->currLoc() ) ) ) {
+
+ if ( skip > 0 ) {
+ --skip;
+ }
+ else {
+ ++count;
+ if ( limit > 0 && count >= limit ) {
+ break;
+ }
+ }
+ }
+ cursor->advance();
+ }
+ ccPointer.reset();
+ return count;
+
+ } catch ( const DBException &e ) {
+ exceptionInfo = e.toString();
+ } catch ( const std::exception &e ) {
+ exceptionInfo = e.what();
+ } catch ( ... ) {
+ exceptionInfo = "unknown exception";
+ }
+ // Historically we have returned zero in many count assertion cases - see SERVER-2291.
+ log() << "Count with ns: " << ns << " and query: " << query
+ << " failed with exception: " << exceptionInfo
+ << endl;
+ return 0;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/ops/count.h b/src/mongo/db/ops/count.h
new file mode 100644
index 00000000000..807741e1253
--- /dev/null
+++ b/src/mongo/db/ops/count.h
@@ -0,0 +1,30 @@
+// count.h
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "../jsobj.h"
+#include "../diskloc.h"
+
+namespace mongo {
+
+ /**
+ * { count: "collectionname"[, query: <query>] }
+     * @return -1 if the ns does not exist, 0 on other errors, otherwise the match count.
+ */
+ long long runCount(const char *ns, const BSONObj& cmd, string& err);
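+
+    /* A minimal sketch of the command document runCount() consumes, assuming a
+       collection test.foo (the skip and limit fields are optional):
+
+           BSONObj cmd = BSON( "count" << "foo"
+                               << "query" << BSON( "x" << 1 )
+                               << "skip" << 10
+                               << "limit" << 100 );
+           string err;
+           long long n = runCount( "test.foo", cmd, err );
+    */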
+
+} // namespace mongo
diff --git a/src/mongo/db/ops/delete.cpp b/src/mongo/db/ops/delete.cpp
new file mode 100644
index 00000000000..e33611c151e
--- /dev/null
+++ b/src/mongo/db/ops/delete.cpp
@@ -0,0 +1,158 @@
+// delete.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "delete.h"
+#include "../queryoptimizer.h"
+#include "../oplog.h"
+
+namespace mongo {
+
+ /* ns: namespace, e.g. <database>.<collection>
+ pattern: the "where" clause / criteria
+ justOne: stop after 1 match
+ god: allow access to system namespaces, and don't yield
+ */
+ long long deleteObjects(const char *ns, BSONObj pattern, bool justOneOrig, bool logop, bool god, RemoveSaver * rs ) {
+ if( !god ) {
+ if ( strstr(ns, ".system.") ) {
+ /* note a delete from system.indexes would corrupt the db
+ if done here, as there are pointers into those objects in
+ NamespaceDetails.
+ */
+ uassert(12050, "cannot delete from system namespace", legalClientSystemNS( ns , true ) );
+ }
+ if ( strchr( ns , '$' ) ) {
+ log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
+ uassert( 10100 , "cannot delete from collection with reserved $ in name", strchr(ns, '$') == 0 );
+ }
+ }
+
+ {
+ NamespaceDetails *d = nsdetails( ns );
+ if ( ! d )
+ return 0;
+ uassert( 10101 , "can't remove from a capped collection" , ! d->capped );
+ }
+
+ long long nDeleted = 0;
+
+ shared_ptr< Cursor > creal = NamespaceDetailsTransient::getCursor( ns, pattern, BSONObj(), false, 0 );
+
+ if( !creal->ok() )
+ return nDeleted;
+
+ shared_ptr< Cursor > cPtr = creal;
+ auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout, cPtr, ns) );
+ cc->setDoingDeletes( true );
+
+ CursorId id = cc->cursorid();
+
+ bool justOne = justOneOrig;
+ bool canYield = !god && !(creal->matcher() && creal->matcher()->docMatcher().atomic());
+
+ do {
+ // TODO: we can generalize this I believe
+ //
+ bool willNeedRecord = (creal->matcher() && creal->matcher()->needRecord()) || pattern.isEmpty() || isSimpleIdQuery( pattern );
+ if ( ! willNeedRecord ) {
+            // TODO: this is a total hack right now
+            // check whether the index fully encompasses the query
+
+ if ( pattern.nFields() == 1 &&
+ str::equals( pattern.firstElement().fieldName() , creal->indexKeyPattern().firstElement().fieldName() ) )
+ willNeedRecord = true;
+ }
+
+ if ( canYield && ! cc->yieldSometimes( willNeedRecord ? ClientCursor::WillNeed : ClientCursor::MaybeCovered ) ) {
+ cc.release(); // has already been deleted elsewhere
+ // TODO should we assert or something?
+ break;
+ }
+ if ( !cc->ok() ) {
+ break; // if we yielded, could have hit the end
+ }
+
+ // this way we can avoid calling updateLocation() every time (expensive)
+ // as well as some other nuances handled
+ cc->setDoingDeletes( true );
+
+ DiskLoc rloc = cc->currLoc();
+ BSONObj key = cc->currKey();
+
+ bool match = creal->currentMatches();
+ bool dup = cc->c()->getsetdup(rloc);
+
+ if ( ! cc->advance() )
+ justOne = true;
+
+ if ( ! match )
+ continue;
+
+ assert( !dup ); // can't be a dup, we deleted it!
+
+ if ( !justOne ) {
+                /* NOTE: this is SLOW, which is not good; noteLocation() was designed to be
+                   called across getMore blocks, but here we might call it millions of times,
+                   which would be bad.
+                   */
+ cc->c()->prepareToTouchEarlierIterate();
+ }
+
+ if ( logop ) {
+ BSONElement e;
+ if( BSONObj( rloc.rec() ).getObjectID( e ) ) {
+ BSONObjBuilder b;
+ b.append( e );
+ bool replJustOne = true;
+ logOp( "d", ns, b.done(), 0, &replJustOne );
+ }
+ else {
+ problem() << "deleted object without id, not logging" << endl;
+ }
+ }
+
+ if ( rs )
+ rs->goingToDelete( rloc.obj() /*cc->c->current()*/ );
+
+ theDataFileMgr.deleteRecord(ns, rloc.rec(), rloc);
+ nDeleted++;
+ if ( justOne ) {
+ break;
+ }
+ cc->c()->recoverFromTouchingEarlierIterate();
+
+ if( !god )
+ getDur().commitIfNeeded();
+
+ if( debug && god && nDeleted == 100 )
+ log() << "warning high number of deletes with god=true which could use significant memory" << endl;
+ }
+ while ( cc->ok() );
+
+ if ( cc.get() && ClientCursor::find( id , false ) == 0 ) {
+ // TODO: remove this and the id declaration above if this doesn't trigger
+            // if it does, then I'm very confused (ERH 06/2011)
+ error() << "this should be impossible" << endl;
+ printStackTrace();
+ cc.release();
+ }
+
+ return nDeleted;
+ }
+
+}
diff --git a/src/mongo/db/ops/delete.h b/src/mongo/db/ops/delete.h
new file mode 100644
index 00000000000..a74b7a664bc
--- /dev/null
+++ b/src/mongo/db/ops/delete.h
@@ -0,0 +1,33 @@
+// delete.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../../pch.h"
+#include "../jsobj.h"
+#include "../clientcursor.h"
+
+namespace mongo {
+
+ class RemoveSaver;
+
+    // Deletes documents in ns matching pattern; if justOne is true, stops after the first match.
+    // Returns the number of documents deleted.
+ long long deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop = false, bool god=false, RemoveSaver * rs=0);
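+
+    /* A minimal usage sketch, assuming the caller already holds the write lock for
+       the namespace (the namespace and pattern here are illustrative):
+
+           // remove at most one matching document, logging the delete to the oplog
+           long long nDeleted = deleteObjects( "test.foo", BSON( "x" << 1 ),
+                                               true /*justOne*/, true /*logop*/ );
+    */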
+
+
+}
diff --git a/src/mongo/db/ops/query.cpp b/src/mongo/db/ops/query.cpp
new file mode 100644
index 00000000000..15e3ed9053f
--- /dev/null
+++ b/src/mongo/db/ops/query.cpp
@@ -0,0 +1,870 @@
+// query.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "query.h"
+#include "../pdfile.h"
+#include "../jsobjmanipulator.h"
+#include "../../bson/util/builder.h"
+#include <time.h>
+#include "../introspect.h"
+#include "../btree.h"
+#include "../../util/lruishmap.h"
+#include "../json.h"
+#include "../repl.h"
+#include "../replutil.h"
+#include "../scanandorder.h"
+#include "../security.h"
+#include "../curop-inl.h"
+#include "../commands.h"
+#include "../queryoptimizer.h"
+#include "../lasterror.h"
+#include "../../s/d_logic.h"
+#include "../repl_block.h"
+#include "../../server.h"
+#include "../d_concurrency.h"
+
+namespace mongo {
+
+    /* We cut off further objects once we cross this threshold; thus, you might get
+       a little bit more than this, as it is a threshold rather than a limit.
+    */
+ const int MaxBytesToReturnToClientAtOnce = 4 * 1024 * 1024;
+
+ //ns->query->DiskLoc
+// LRUishMap<BSONObj,DiskLoc,5> lrutest(123);
+
+ extern bool useCursors;
+ extern bool useHints;
+
+ bool runCommands(const char *ns, BSONObj& jsobj, CurOp& curop, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) {
+ try {
+ return _runCommands(ns, jsobj, b, anObjBuilder, fromRepl, queryOptions);
+ }
+ catch( SendStaleConfigException& ){
+ throw;
+ }
+ catch ( AssertionException& e ) {
+ assert( e.getCode() != SendStaleConfigCode && e.getCode() != RecvStaleConfigCode );
+
+ e.getInfo().append( anObjBuilder , "assertion" , "assertionCode" );
+ curop.debug().exceptionInfo = e.getInfo();
+ }
+ anObjBuilder.append("errmsg", "db assertion failure");
+ anObjBuilder.append("ok", 0.0);
+ BSONObj x = anObjBuilder.done();
+ b.appendBuf((void*) x.objdata(), x.objsize());
+ return true;
+ }
+
+
+ BSONObj id_obj = fromjson("{\"_id\":1}");
+ BSONObj empty_obj = fromjson("{}");
+
+
+ //int dump = 0;
+
+ /* empty result for error conditions */
+ QueryResult* emptyMoreResult(long long cursorid) {
+ BufBuilder b(32768);
+ b.skip(sizeof(QueryResult));
+ QueryResult *qr = (QueryResult *) b.buf();
+ qr->cursorId = 0; // 0 indicates no more data to retrieve.
+ qr->startingFrom = 0;
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->initializeResultFlags();
+ qr->nReturned = 0;
+ b.decouple();
+ return qr;
+ }
+
+ QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& curop, int pass, bool& exhaust ) {
+ exhaust = false;
+ ClientCursor::Pointer p(cursorid);
+ ClientCursor *cc = p.c();
+
+ int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce;
+
+ BufBuilder b( bufSize );
+ b.skip(sizeof(QueryResult));
+ int resultFlags = ResultFlag_AwaitCapable;
+ int start = 0;
+ int n = 0;
+
+ if ( unlikely(!cc) ) {
+ LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl;
+ cursorid = 0;
+ resultFlags = ResultFlag_CursorNotFound;
+ }
+ else {
+ // check for spoofing of the ns such that it does not match the one originally there for the cursor
+ uassert(14833, "auth error", str::equals(ns, cc->ns().c_str()));
+
+ if ( pass == 0 )
+ cc->updateSlaveLocation( curop );
+
+ int queryOptions = cc->queryOptions();
+
+ curop.debug().query = cc->query();
+
+ start = cc->pos();
+ Cursor *c = cc->c();
+ c->checkLocation();
+ DiskLoc last;
+
+ scoped_ptr<Projection::KeyOnly> keyFieldsOnly;
+ if ( cc->modifiedKeys() == false && cc->isMultiKey() == false && cc->fields )
+ keyFieldsOnly.reset( cc->fields->checkKey( cc->indexKeyPattern() ) );
+
+ // This manager may be stale, but it's the state of chunking when the cursor was created.
+ ShardChunkManagerPtr manager = cc->getChunkManager();
+
+ while ( 1 ) {
+ if ( !c->ok() ) {
+ if ( c->tailable() ) {
+                        /* when a tailable cursor hits "EOF", ok() goes false, and current() is null. however,
+                           advance() can still be retried as a reactivation attempt. when there is new data, it
+                           will return true. that's what we are doing here.
+                           */
+ if ( c->advance() )
+ continue;
+
+ if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) {
+ return 0;
+ }
+
+ break;
+ }
+ p.release();
+ bool ok = ClientCursor::erase(cursorid);
+ assert(ok);
+ cursorid = 0;
+ cc = 0;
+ break;
+ }
+
+ // in some cases (clone collection) there won't be a matcher
+ if ( c->matcher() && !c->matcher()->matchesCurrent( c ) ) {
+ }
+ else if ( manager && ! manager->belongsToMe( cc ) ){
+ LOG(2) << "cursor skipping document in un-owned chunk: " << c->current() << endl;
+ }
+ else {
+ if( c->getsetdup(c->currLoc()) ) {
+ //out() << " but it's a dup \n";
+ }
+ else {
+ last = c->currLoc();
+ n++;
+
+ if ( keyFieldsOnly ) {
+ fillQueryResultFromObj(b, 0, keyFieldsOnly->hydrate( c->currKey() ) );
+ }
+ else {
+ BSONObj js = c->current();
+ // show disk loc should be part of the main query, not in an $or clause, so this should be ok
+ fillQueryResultFromObj(b, cc->fields.get(), js, ( cc->pq.get() && cc->pq->showDiskLoc() ? &last : 0));
+ }
+
+ if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) {
+ c->advance();
+ cc->incPos( n );
+ break;
+ }
+ }
+ }
+ c->advance();
+
+ if ( ! cc->yieldSometimes( ClientCursor::MaybeCovered ) ) {
+ ClientCursor::erase(cursorid);
+ cursorid = 0;
+ cc = 0;
+ p.deleted();
+ break;
+ }
+ }
+
+ if ( cc ) {
+ cc->updateLocation();
+ cc->mayUpgradeStorage();
+ cc->storeOpForSlave( last );
+ exhaust = cc->queryOptions() & QueryOption_Exhaust;
+ }
+ }
+
+ QueryResult *qr = (QueryResult *) b.buf();
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->_resultFlags() = resultFlags;
+ qr->cursorId = cursorid;
+ qr->startingFrom = start;
+ qr->nReturned = n;
+ b.decouple();
+
+ return qr;
+ }
+
+ class ExplainBuilder {
+ // Note: by default we filter out allPlans and oldPlan in the shell's
+ // explain() function. If you add any recursive structures, make sure to
+ // edit the JS to make sure everything gets filtered.
+ public:
+ ExplainBuilder() : _i() {}
+ void ensureStartScan() {
+ if ( !_a.get() ) {
+ _a.reset( new BSONArrayBuilder() );
+ }
+ }
+ void noteCursor( Cursor *c ) {
+ BSONObjBuilder b( _a->subobjStart() );
+ b << "cursor" << c->toString() << "indexBounds" << c->prettyIndexBounds();
+ b.done();
+ }
+ void noteScan( Cursor *c, long long nscanned, long long nscannedObjects, int n, bool scanAndOrder,
+ int millis, bool hint, int nYields , int nChunkSkips , bool indexOnly ) {
+ if ( _i == 1 ) {
+ _c.reset( new BSONArrayBuilder() );
+ *_c << _b->obj();
+ }
+ if ( _i == 0 ) {
+ _b.reset( new BSONObjBuilder() );
+ }
+ else {
+ _b.reset( new BSONObjBuilder( _c->subobjStart() ) );
+ }
+ *_b << "cursor" << c->toString();
+ _b->appendNumber( "nscanned", nscanned );
+ _b->appendNumber( "nscannedObjects", nscannedObjects );
+ *_b << "n" << n;
+
+ if ( scanAndOrder )
+ *_b << "scanAndOrder" << true;
+
+ *_b << "millis" << millis;
+
+ *_b << "nYields" << nYields;
+ *_b << "nChunkSkips" << nChunkSkips;
+ *_b << "isMultiKey" << c->isMultiKey();
+ *_b << "indexOnly" << indexOnly;
+
+ *_b << "indexBounds" << c->prettyIndexBounds();
+
+ c->explainDetails( *_b );
+
+ if ( !hint ) {
+ *_b << "allPlans" << _a->arr();
+ }
+ if ( _i != 0 ) {
+ _b->done();
+ }
+ _a.reset( 0 );
+ ++_i;
+ }
+ BSONObj finishWithSuffix( long long nscanned, long long nscannedObjects, int n, int millis, const BSONObj &suffix ) {
+ if ( _i > 1 ) {
+ BSONObjBuilder b;
+ b << "clauses" << _c->arr();
+ b.appendNumber( "nscanned", nscanned );
+ b.appendNumber( "nscannedObjects", nscannedObjects );
+ b << "n" << n;
+ b << "millis" << millis;
+ b.appendElements( suffix );
+ return b.obj();
+ }
+ else {
+ stringstream host;
+ host << getHostNameCached() << ":" << cmdLine.port;
+ *_b << "server" << host.str();
+ _b->appendElements( suffix );
+ return _b->obj();
+ }
+ }
+ private:
+ auto_ptr< BSONArrayBuilder > _a;
+ auto_ptr< BSONObjBuilder > _b;
+ auto_ptr< BSONArrayBuilder > _c;
+ int _i;
+ };
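+
+    /* Rough shape of a single-clause explain document assembled above (all values
+       illustrative):
+
+           { cursor: "BtreeCursor x_1", nscanned: 50, nscannedObjects: 50,
+             n: 10, millis: 5, nYields: 0, nChunkSkips: 0, isMultiKey: false,
+             indexOnly: false, indexBounds: { x: [ [ 1, 1 ] ] },
+             allPlans: [ ... ], server: "host:27017" }
+
+       With multiple $or clauses, the per-clause documents are collected under a
+       top-level "clauses" array and aggregate totals are appended alongside it.
+    */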
+
+ // Implements database 'query' requests using the query optimizer's QueryOp interface
+ class UserQueryOp : public QueryOp {
+ public:
+
+ UserQueryOp( const ParsedQuery& pq, Message &response, ExplainBuilder &eb, CurOp &curop ) :
+ _buf( 32768 ) , // TODO be smarter here
+ _pq( pq ) ,
+ _ntoskip( pq.getSkip() ) ,
+ _nscanned(0), _oldNscanned(0), _nscannedObjects(0), _oldNscannedObjects(0),
+ _n(0),
+ _oldN(0),
+ _nYields(),
+ _nChunkSkips(),
+ _chunkManager( shardingState.needShardChunkManager(pq.ns()) ?
+ shardingState.getShardChunkManager(pq.ns()) : ShardChunkManagerPtr() ),
+ _inMemSort(false),
+ _capped(false),
+ _saveClientCursor(false),
+ _wouldSaveClientCursor(false),
+ _oplogReplay( pq.hasOption( QueryOption_OplogReplay) ),
+ _response( response ),
+ _eb( eb ),
+ _curop( curop ),
+ _yieldRecoveryFailed()
+ {}
+
+ virtual void _init() {
+ // only need to put the QueryResult fields there if we're building the first buffer in the message.
+ if ( _response.empty() ) {
+ _buf.skip( sizeof( QueryResult ) );
+ }
+
+ if ( _oplogReplay ) {
+ _findingStartCursor.reset( new FindingStartCursor( qp() ) );
+ _capped = true;
+ }
+ else {
+ _c = qp().newCursor( DiskLoc() , _pq.getNumToReturn() + _pq.getSkip() );
+ _capped = _c->capped();
+
+ // setup check for if we can only use index to extract
+ if ( _c->modifiedKeys() == false && _c->isMultiKey() == false && _pq.getFields() ) {
+ _keyFieldsOnly.reset( _pq.getFields()->checkKey( _c->indexKeyPattern() ) );
+ }
+ }
+
+ if ( qp().scanAndOrderRequired() ) {
+ _inMemSort = true;
+ _so.reset( new ScanAndOrder( _pq.getSkip() , _pq.getNumToReturn() , _pq.getOrder(), qp().multikeyFrs() ) );
+ }
+
+ if ( _pq.isExplain() ) {
+ _eb.noteCursor( _c.get() );
+ }
+
+ }
+
+ virtual bool prepareToYield() {
+ if ( _findingStartCursor.get() ) {
+ return _findingStartCursor->prepareToYield();
+ }
+ else {
+ if ( _c && !_cc ) {
+ _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , _pq.ns() ) );
+ }
+ if ( _cc ) {
+ return _cc->prepareToYield( _yieldData );
+ }
+ }
+ // no active cursor - ok to yield
+ return true;
+ }
+
+ virtual void recoverFromYield() {
+ _nYields++;
+
+ if ( _findingStartCursor.get() ) {
+ _findingStartCursor->recoverFromYield();
+ }
+ else if ( _cc && !ClientCursor::recoverFromYield( _yieldData ) ) {
+ _yieldRecoveryFailed = true;
+ _c.reset();
+ _cc.reset();
+ _so.reset();
+
+ if ( _capped ) {
+ msgassertedNoTrace( 13338, str::stream() << "capped cursor overrun during query: " << _pq.ns() );
+ }
+ else if ( qp().mustAssertOnYieldFailure() ) {
+ msgassertedNoTrace( 15890, str::stream() << "UserQueryOp::recoverFromYield() failed to recover: " << _pq.ns() );
+ }
+ else {
+ // we don't fail query since we're fine with returning partial data if collection dropped
+
+ // todo: this is wrong. the cursor could be gone if closeAllDatabases command just ran
+ }
+
+ }
+ }
+
+ virtual long long nscanned() {
+ if ( _findingStartCursor.get() ) {
+ return 0; // should only be one query plan, so value doesn't really matter.
+ }
+ return _c.get() ? _c->nscanned() : _nscanned;
+ }
+
+ virtual void next() {
+ if ( _findingStartCursor.get() ) {
+ if ( !_findingStartCursor->done() ) {
+ _findingStartCursor->next();
+ }
+ if ( _findingStartCursor->done() ) {
+ _c = _findingStartCursor->cursor();
+ _findingStartCursor.reset( 0 );
+ }
+ _capped = true;
+ return;
+ }
+
+ if ( !_c || !_c->ok() ) {
+ finish( false );
+ return;
+ }
+
+ bool mayCreateCursor1 = _pq.wantMore() && ! _inMemSort && _pq.getNumToReturn() != 1 && useCursors;
+
+ if( 0 ) {
+ cout << "SCANNING this: " << this << " key: " << _c->currKey() << " obj: " << _c->current() << endl;
+ }
+
+ if ( _pq.getMaxScan() && _nscanned >= _pq.getMaxScan() ) {
+ finish( true ); //?
+ return;
+ }
+
+ _nscanned = _c->nscanned();
+ if ( !matcher( _c )->matchesCurrent(_c.get() , &_details ) ) {
+ // not a match, continue onward
+ if ( _details._loadedObject )
+ _nscannedObjects++;
+ }
+ else {
+ _nscannedObjects++;
+ DiskLoc cl = _c->currLoc();
+ if ( _chunkManager && ! _chunkManager->belongsToMe( cl.obj() ) ) { // TODO: should make this covered at some point
+ _nChunkSkips++;
+ // log() << "TEMP skipping un-owned chunk: " << _c->current() << endl;
+ }
+ else if( _c->getsetdup(cl) ) {
+ // dup
+ }
+ else {
+ // got a match.
+
+ if ( _inMemSort ) {
+ // note: no cursors for non-indexed, ordered results. results must be fairly small.
+ _so->add( _pq.returnKey() ? _c->currKey() : _c->current(), _pq.showDiskLoc() ? &cl : 0 );
+ }
+ else if ( _ntoskip > 0 ) {
+ _ntoskip--;
+ }
+ else {
+ if ( _pq.isExplain() ) {
+ _n++;
+ if ( n() >= _pq.getNumToReturn() && !_pq.wantMore() ) {
+ // .limit() was used, show just that much.
+ finish( true ); //?
+ return;
+ }
+ }
+ else {
+
+ if ( _pq.returnKey() ) {
+ BSONObjBuilder bb( _buf );
+ bb.appendKeys( _c->indexKeyPattern() , _c->currKey() );
+ bb.done();
+ }
+ else if ( _keyFieldsOnly ) {
+ fillQueryResultFromObj( _buf , 0 , _keyFieldsOnly->hydrate( _c->currKey() ) );
+ }
+ else {
+ BSONObj js = _c->current();
+ assert( js.isValid() );
+
+ if ( _oplogReplay ) {
+ BSONElement e = js["ts"];
+ if ( e.type() == Date || e.type() == Timestamp )
+ _slaveReadTill = e._opTime();
+ }
+
+ fillQueryResultFromObj( _buf , _pq.getFields() , js , (_pq.showDiskLoc() ? &cl : 0));
+ }
+ _n++;
+ if ( ! _c->supportGetMore() ) {
+ if ( _pq.enough( n() ) || _buf.len() >= MaxBytesToReturnToClientAtOnce ) {
+ finish( true );
+ return;
+ }
+ }
+ else if ( _pq.enoughForFirstBatch( n() , _buf.len() ) ) {
+ /* if only 1 requested, no cursor saved for efficiency...we assume it is findOne() */
+ if ( mayCreateCursor1 ) {
+ _wouldSaveClientCursor = true;
+ if ( _c->advance() ) {
+ // more...so save a cursor
+ _saveClientCursor = true;
+ }
+ }
+ finish( true );
+ return;
+ }
+ }
+ }
+ }
+ }
+ _c->advance();
+ }
+
+ // this plan won, so set data for response broadly
+ void finish( bool stop ) {
+ massert( 13638, "client cursor dropped during explain query yield", !_pq.isExplain() || _c.get() );
+
+ if ( _pq.isExplain() ) {
+ _n = _inMemSort ? _so->size() : _n;
+ }
+ else if ( _inMemSort ) {
+ if( _so.get() )
+ _so->fill( _buf, _pq.getFields() , _n );
+ }
+
+ if ( _c.get() ) {
+ _nscanned = _c->nscanned();
+
+ if ( _pq.hasOption( QueryOption_CursorTailable ) && _pq.getNumToReturn() != 1 )
+ _c->setTailable();
+
+ // If the tailing request succeeded.
+ if ( _c->tailable() )
+ _saveClientCursor = true;
+ }
+
+ if ( _pq.isExplain() ) {
+ _eb.noteScan( _c.get(), _nscanned, _nscannedObjects, _n, scanAndOrderRequired(),
+ _curop.elapsedMillis(), useHints && !_pq.getHint().eoo(), _nYields ,
+                        _nChunkSkips, _keyFieldsOnly.get() != 0 );
+ }
+ else {
+ if ( _buf.len() ) {
+ _response.appendData( _buf.buf(), _buf.len() );
+ _buf.decouple();
+ }
+ }
+
+ if ( stop ) {
+ setStop();
+ }
+ else {
+ setComplete();
+ }
+
+ }
+
+ void finishExplain( const BSONObj &suffix ) {
+ BSONObj obj = _eb.finishWithSuffix( totalNscanned(), nscannedObjects(), n(), _curop.elapsedMillis(), suffix);
+ fillQueryResultFromObj(_buf, 0, obj);
+ _n = 1;
+ _oldN = 0;
+ _response.appendData( _buf.buf(), _buf.len() );
+ _buf.decouple();
+ }
+
+ virtual bool mayRecordPlan() const {
+ return !_yieldRecoveryFailed && ( _pq.getNumToReturn() != 1 ) && ( ( _n > _pq.getNumToReturn() / 2 ) || ( complete() && !stopRequested() ) );
+ }
+
+ virtual QueryOp *_createChild() const {
+ if ( _pq.isExplain() ) {
+ _eb.ensureStartScan();
+ }
+ UserQueryOp *ret = new UserQueryOp( _pq, _response, _eb, _curop );
+ ret->_oldN = n();
+ ret->_oldNscanned = totalNscanned();
+ ret->_oldNscannedObjects = nscannedObjects();
+ ret->_ntoskip = _ntoskip;
+ return ret;
+ }
+
+ bool scanAndOrderRequired() const { return _inMemSort; }
+ shared_ptr<Cursor> cursor() { return _c; }
+ int n() const { return _oldN + _n; }
+ long long totalNscanned() const { return _nscanned + _oldNscanned; }
+ long long nscannedObjects() const { return _nscannedObjects + _oldNscannedObjects; }
+ bool saveClientCursor() const { return _saveClientCursor; }
+ bool wouldSaveClientCursor() const { return _wouldSaveClientCursor; }
+
+ void finishForOplogReplay( ClientCursor * cc ) {
+ if ( _oplogReplay && ! _slaveReadTill.isNull() )
+ cc->slaveReadTill( _slaveReadTill );
+
+ }
+
+ ShardChunkManagerPtr getChunkManager(){ return _chunkManager; }
+
+ private:
+ BufBuilder _buf;
+ const ParsedQuery& _pq;
+ scoped_ptr<Projection::KeyOnly> _keyFieldsOnly;
+
+ long long _ntoskip;
+ long long _nscanned;
+ long long _oldNscanned;
+ long long _nscannedObjects;
+ long long _oldNscannedObjects;
+ int _n; // found so far
+ int _oldN;
+
+ int _nYields;
+ int _nChunkSkips;
+
+ MatchDetails _details;
+
+ ShardChunkManagerPtr _chunkManager;
+
+ bool _inMemSort;
+ auto_ptr< ScanAndOrder > _so;
+
+ shared_ptr<Cursor> _c;
+ ClientCursor::CleanupPointer _cc;
+ ClientCursor::YieldData _yieldData;
+
+ bool _capped;
+ bool _saveClientCursor;
+ bool _wouldSaveClientCursor;
+ bool _oplogReplay;
+ auto_ptr< FindingStartCursor > _findingStartCursor;
+
+ Message &_response;
+ ExplainBuilder &_eb;
+ CurOp &_curop;
+ OpTime _slaveReadTill;
+
+ bool _yieldRecoveryFailed;
+ };
+
+    /* run a query -- includes checking for and running a Command
+       @return pointer to the ns if in exhaust mode, 0 in normal mode
+    */
+ const char *runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
+ shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) );
+ ParsedQuery& pq( *pq_shared );
+ int ntoskip = q.ntoskip;
+ BSONObj jsobj = q.query;
+ int queryOptions = q.queryOptions;
+ const char *ns = q.ns;
+
+ if( logLevel >= 2 )
+ log() << "runQuery called " << ns << " " << jsobj << endl;
+
+ curop.debug().ns = ns;
+ curop.debug().ntoreturn = pq.getNumToReturn();
+ curop.setQuery(jsobj);
+
+ if ( pq.couldBeCommand() ) {
+ BufBuilder bb;
+ bb.skip(sizeof(QueryResult));
+ BSONObjBuilder cmdResBuf;
+ if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) {
+ curop.debug().iscommand = true;
+ curop.debug().query = jsobj;
+ curop.markCommand();
+
+ auto_ptr< QueryResult > qr;
+ qr.reset( (QueryResult *) bb.buf() );
+ bb.decouple();
+ qr->setResultFlagsToOk();
+ qr->len = bb.len();
+ curop.debug().responseLength = bb.len();
+ qr->setOperation(opReply);
+ qr->cursorId = 0;
+ qr->startingFrom = 0;
+ qr->nReturned = 1;
+ result.setData( qr.release(), true );
+ }
+ else {
+ uasserted(13530, "bad or malformed command request?");
+ }
+ return 0;
+ }
+
+ /* --- regular query --- */
+
+ int n = 0;
+ BSONElement hint = useHints ? pq.getHint() : BSONElement();
+ bool explain = pq.isExplain();
+ bool snapshot = pq.isSnapshot();
+ BSONObj order = pq.getOrder();
+ BSONObj query = pq.getFilter();
+
+        /* The ElemIter will not be happy if this isn't really an object, so throw an
+           exception here in that case.
+           (This may indicate bad data from the client.)
+        */
+ if ( query.objsize() == 0 ) {
+ out() << "Bad query object?\n jsobj:";
+ out() << jsobj.toString() << "\n query:";
+ out() << query.toString() << endl;
+ uassert( 10110 , "bad query object", false);
+ }
+
+ Client::ReadContext ctx( ns , dbpath ); // read locks
+
+ replVerifyReadsOk(pq);
+
+ if ( pq.hasOption( QueryOption_CursorTailable ) ) {
+ NamespaceDetails *d = nsdetails( ns );
+ uassert( 13051, "tailable cursor requested on non capped collection", d && d->capped );
+ const BSONObj nat1 = BSON( "$natural" << 1 );
+ if ( order.isEmpty() ) {
+ order = nat1;
+ }
+ else {
+ uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 );
+ }
+ }
+
+ BSONObj snapshotHint; // put here to keep the data in scope
+ if( snapshot ) {
+ NamespaceDetails *d = nsdetails(ns);
+ if ( d ) {
+ int i = d->findIdIndex();
+ if( i < 0 ) {
+ if ( strstr( ns , ".system." ) == 0 )
+ log() << "warning: no _id index on $snapshot query, ns:" << ns << endl;
+ }
+ else {
+                    /* [dm] the name of an _id index tends to vary, so we build the hint the hard way here.
+                       probably need a better way to specify "use the _id index" as a hint. if someone is
+                       working in the query optimizer, please fix this!
+                    */
+ BSONObjBuilder b;
+ b.append("$hint", d->idx(i).indexName());
+ snapshotHint = b.obj();
+ hint = snapshotHint.firstElement();
+ }
+ }
+ }
+
+ if ( ! (explain || pq.showDiskLoc()) && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) {
+
+ bool nsFound = false;
+ bool indexFound = false;
+
+ BSONObj resObject;
+ Client& c = cc();
+ bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound );
+ if ( nsFound == false || indexFound == true ) {
+ BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32);
+ bb.skip(sizeof(QueryResult));
+
+ curop.debug().idhack = true;
+ if ( found ) {
+ n = 1;
+ fillQueryResultFromObj( bb , pq.getFields() , resObject );
+ }
+ auto_ptr< QueryResult > qr;
+ qr.reset( (QueryResult *) bb.buf() );
+ bb.decouple();
+ qr->setResultFlagsToOk();
+ qr->len = bb.len();
+
+ curop.debug().responseLength = bb.len();
+ qr->setOperation(opReply);
+ qr->cursorId = 0;
+ qr->startingFrom = 0;
+ qr->nReturned = n;
+ result.setData( qr.release(), true );
+ return NULL;
+ }
+ }
+
+ // regular, not QO bypass query
+
+ BSONObj oldPlan;
+ if ( explain && ! pq.hasIndexSpecifier() ) {
+ MultiPlanScanner mps( ns, query, order );
+ if ( mps.usingCachedPlan() )
+ oldPlan = mps.oldExplain();
+ }
+ auto_ptr< MultiPlanScanner > mps( new MultiPlanScanner( ns, query, order, &hint, !explain, pq.getMin(), pq.getMax(), false, true ) );
+ BSONObj explainSuffix;
+ if ( explain ) {
+ BSONObjBuilder bb;
+ if ( !oldPlan.isEmpty() )
+ bb.append( "oldPlan", oldPlan.firstElement().embeddedObject().firstElement().embeddedObject() );
+ explainSuffix = bb.obj();
+ }
+ ExplainBuilder eb;
+ UserQueryOp original( pq, result, eb, curop );
+ shared_ptr< UserQueryOp > o = mps->runOp( original );
+ UserQueryOp &dqo = *o;
+ if ( ! dqo.complete() )
+ throw MsgAssertionException( dqo.exception() );
+ if ( explain ) {
+ dqo.finishExplain( explainSuffix );
+ }
+ n = dqo.n();
+ long long nscanned = dqo.totalNscanned();
+ curop.debug().scanAndOrder = dqo.scanAndOrderRequired();
+
+ shared_ptr<Cursor> cursor = dqo.cursor();
+ if( logLevel >= 5 )
+ log() << " used cursor: " << cursor.get() << endl;
+ long long cursorid = 0;
+ const char * exhaust = 0;
+ if ( dqo.saveClientCursor() || ( dqo.wouldSaveClientCursor() && mps->mayRunMore() ) ) {
+ ClientCursor *cc;
+ bool moreClauses = mps->mayRunMore();
+ if ( moreClauses ) {
+ // this MultiCursor will use a dumb NoOp to advance(), so no need to specify mayYield
+ shared_ptr< Cursor > multi( new MultiCursor( mps, cursor, dqo.matcher( cursor ), dqo ) );
+ cc = new ClientCursor(queryOptions, multi, ns, jsobj.getOwned());
+ }
+ else {
+ if( ! cursor->matcher() ) cursor->setMatcher( dqo.matcher( cursor ) );
+ cc = new ClientCursor( queryOptions, cursor, ns, jsobj.getOwned() );
+ }
+
+ cc->setChunkManager( dqo.getChunkManager() );
+
+ cursorid = cc->cursorid();
+ DEV tlog(2) << "query has more, cursorid: " << cursorid << endl;
+ cc->setPos( n );
+ cc->pq = pq_shared;
+ cc->fields = pq.getFieldPtr();
+ cc->originalMessage = m;
+ cc->updateLocation();
+ if ( !cc->ok() && cc->c()->tailable() )
+ DEV tlog() << "query has no more but tailable, cursorid: " << cursorid << endl;
+ if( queryOptions & QueryOption_Exhaust ) {
+ exhaust = ns;
+ curop.debug().exhaust = true;
+ }
+ dqo.finishForOplogReplay(cc);
+ }
+
+ QueryResult *qr = (QueryResult *) result.header();
+ qr->cursorId = cursorid;
+ qr->setResultFlagsToOk();
+ // qr->len is updated automatically by appendData()
+ curop.debug().responseLength = qr->len;
+ qr->setOperation(opReply);
+ qr->startingFrom = 0;
+ qr->nReturned = n;
+
+ int duration = curop.elapsedMillis();
+ bool dbprofile = curop.shouldDBProfile( duration );
+ if ( dbprofile || duration >= cmdLine.slowMS ) {
+ curop.debug().nscanned = (int) nscanned;
+ curop.debug().ntoskip = ntoskip;
+ }
+ curop.debug().nreturned = n;
+ return exhaust;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/ops/query.h b/src/mongo/db/ops/query.h
new file mode 100644
index 00000000000..3324b75fe16
--- /dev/null
+++ b/src/mongo/db/ops/query.h
@@ -0,0 +1,248 @@
+// query.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../../pch.h"
+#include "../../util/net/message.h"
+#include "../dbmessage.h"
+#include "../jsobj.h"
+#include "../diskloc.h"
+#include "../projection.h"
+
+// struct QueryOptions, QueryResult, QueryResultFlags in:
+#include "../../client/dbclient.h"
+
+namespace mongo {
+
+ extern const int MaxBytesToReturnToClientAtOnce;
+
+ QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& op, int pass, bool& exhaust);
+
+ const char * runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result);
+
+ /* This is for languages whose "objects" are not well ordered (JSON is well ordered).
+ [ { a : ... } , { b : ... } ] -> { a : ..., b : ... }
+ */
+ inline BSONObj transformOrderFromArrayFormat(BSONObj order) {
+ /* note: this is slow, but that is ok as order will have very few pieces */
+ BSONObjBuilder b;
+ char p[2] = "0";
+
+ while ( 1 ) {
+ BSONObj j = order.getObjectField(p);
+ if ( j.isEmpty() )
+ break;
+ BSONElement e = j.firstElement();
+ uassert( 10102 , "bad order array", !e.eoo());
+ uassert( 10103 , "bad order array [2]", e.isNumber());
+ b.append(e);
+ (*p)++;
+ uassert( 10104 , "too many ordering elements", *p <= '9');
+ }
+
+ return b.obj();
+ }
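+
+    /* For example, an order expressed in array format arrives as
+           { "0" : { a : 1 }, "1" : { b : -1 } }
+       and the loop above flattens it into
+           { a : 1, b : -1 }
+    */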
+
+ /**
+ * this represents a total user query
+ * includes fields from the query message, both possible query levels
+ * parses everything up front
+ */
+ class ParsedQuery : boost::noncopyable {
+ public:
+ ParsedQuery( QueryMessage& qm )
+ : _ns( qm.ns ) , _ntoskip( qm.ntoskip ) , _ntoreturn( qm.ntoreturn ) , _options( qm.queryOptions ) {
+ init( qm.query );
+ initFields( qm.fields );
+ }
+ ParsedQuery( const char* ns , int ntoskip , int ntoreturn , int queryoptions , const BSONObj& query , const BSONObj& fields )
+ : _ns( ns ) , _ntoskip( ntoskip ) , _ntoreturn( ntoreturn ) , _options( queryoptions ) {
+ init( query );
+ initFields( fields );
+ }
+
+ const char * ns() const { return _ns; }
+ bool isLocalDB() const { return strncmp(_ns, "local.", 6) == 0; }
+
+ const BSONObj& getFilter() const { return _filter; }
+ Projection* getFields() const { return _fields.get(); }
+ shared_ptr<Projection> getFieldPtr() const { return _fields; }
+
+ int getSkip() const { return _ntoskip; }
+ int getNumToReturn() const { return _ntoreturn; }
+ bool wantMore() const { return _wantMore; }
+ int getOptions() const { return _options; }
+ bool hasOption( int x ) const { return x & _options; }
+
+ bool isExplain() const { return _explain; }
+ bool isSnapshot() const { return _snapshot; }
+ bool returnKey() const { return _returnKey; }
+ bool showDiskLoc() const { return _showDiskLoc; }
+
+ const BSONObj& getMin() const { return _min; }
+ const BSONObj& getMax() const { return _max; }
+ const BSONObj& getOrder() const { return _order; }
+ const BSONElement& getHint() const { return _hint; }
+ int getMaxScan() const { return _maxScan; }
+
+ bool couldBeCommand() const {
+ /* we assume you are using findOne() for running a cmd... */
+ return _ntoreturn == 1 && strstr( _ns , ".$cmd" );
+ }
+
+ bool hasIndexSpecifier() const {
+ return ! _hint.eoo() || ! _min.isEmpty() || ! _max.isEmpty();
+ }
+
+ /* if ntoreturn is zero, we return up to 101 objects. on the subsequent getmore, there
+           is only a size limit.  The idea is that on a find() where one doesn't use many results,
+ we don't return much, but once getmore kicks in, we start pushing significant quantities.
+
+ The n limit (vs. size) is important when someone fetches only one small field from big
+ objects, which causes massive scanning server-side.
+ */
+ bool enoughForFirstBatch( int n , int len ) const {
+ if ( _ntoreturn == 0 )
+ return ( len > 1024 * 1024 ) || n >= 101;
+ return n >= _ntoreturn || len > MaxBytesToReturnToClientAtOnce;
+ }
+
+ bool enough( int n ) const {
+ if ( _ntoreturn == 0 )
+ return false;
+ return n >= _ntoreturn;
+ }
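+
+        /* Worked example: with the default _ntoreturn == 0, the first batch closes
+           after 101 documents, or sooner if it exceeds 1MB; later getMore batches
+           are bounded only by size (MaxBytesToReturnToClientAtOnce, 4MB).
+        */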
+
+ private:
+ void init( const BSONObj& q ) {
+ _reset();
+ uassert( 10105 , "bad skip value in query", _ntoskip >= 0);
+
+ if ( _ntoreturn < 0 ) {
+ /* _ntoreturn greater than zero is simply a hint on how many objects to send back per
+ "cursor batch".
+ A negative number indicates a hard limit.
+ */
+ _wantMore = false;
+ _ntoreturn = -_ntoreturn;
+ }
+
+
+ BSONElement e = q["query"];
+ if ( ! e.isABSONObj() )
+ e = q["$query"];
+
+ if ( e.isABSONObj() ) {
+ _filter = e.embeddedObject();
+ _initTop( q );
+ }
+ else {
+ _filter = q;
+ }
+ }
+
+ void _reset() {
+ _wantMore = true;
+ _explain = false;
+ _snapshot = false;
+ _returnKey = false;
+ _showDiskLoc = false;
+ _maxScan = 0;
+ }
+
+ void _initTop( const BSONObj& top ) {
+ BSONObjIterator i( top );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ const char * name = e.fieldName();
+
+ if ( strcmp( "$orderby" , name ) == 0 ||
+ strcmp( "orderby" , name ) == 0 ) {
+ if ( e.type() == Object ) {
+ _order = e.embeddedObject();
+ }
+ else if ( e.type() == Array ) {
+                        _order = transformOrderFromArrayFormat( e.embeddedObject() );
+ }
+ else {
+ uasserted(13513, "sort must be an object or array");
+ }
+ continue;
+ }
+
+ if( *name == '$' ) {
+ name++;
+ if ( strcmp( "explain" , name ) == 0 )
+ _explain = e.trueValue();
+ else if ( strcmp( "snapshot" , name ) == 0 )
+ _snapshot = e.trueValue();
+ else if ( strcmp( "min" , name ) == 0 )
+ _min = e.embeddedObject();
+ else if ( strcmp( "max" , name ) == 0 )
+ _max = e.embeddedObject();
+ else if ( strcmp( "hint" , name ) == 0 )
+ _hint = e;
+ else if ( strcmp( "returnKey" , name ) == 0 )
+ _returnKey = e.trueValue();
+ else if ( strcmp( "maxScan" , name ) == 0 )
+ _maxScan = e.numberInt();
+ else if ( strcmp( "showDiskLoc" , name ) == 0 )
+ _showDiskLoc = e.trueValue();
+ else if ( strcmp( "comment" , name ) == 0 ) {
+ ; // no-op
+ }
+ }
+ }
+
+ if ( _snapshot ) {
+ uassert( 12001 , "E12001 can't sort with $snapshot", _order.isEmpty() );
+ uassert( 12002 , "E12002 can't use hint with $snapshot", _hint.eoo() );
+ }
+
+ }
+
+ void initFields( const BSONObj& fields ) {
+ if ( fields.isEmpty() )
+ return;
+ _fields.reset( new Projection() );
+ _fields->init( fields );
+ }
+
+ const char * const _ns;
+ const int _ntoskip;
+ int _ntoreturn;
+ BSONObj _filter;
+ BSONObj _order;
+ const int _options;
+ shared_ptr< Projection > _fields;
+ bool _wantMore;
+ bool _explain;
+ bool _snapshot;
+ bool _returnKey;
+ bool _showDiskLoc;
+ BSONObj _min;
+ BSONObj _max;
+ BSONElement _hint;
+ int _maxScan;
+ };
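+
+    /* Sketch of a wrapped query as consumed by init() and _initTop() above (all
+       values illustrative):
+
+           { $query : { x : 1 },
+             $orderby : { y : -1 },
+             $hint : { x : 1 },
+             $maxScan : 1000,
+             $explain : true }
+    */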
+
+
+} // namespace mongo
+
+
diff --git a/src/mongo/db/ops/update.cpp b/src/mongo/db/ops/update.cpp
new file mode 100644
index 00000000000..2abc6987218
--- /dev/null
+++ b/src/mongo/db/ops/update.cpp
@@ -0,0 +1,1308 @@
+// update.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "query.h"
+#include "../pdfile.h"
+#include "../jsobjmanipulator.h"
+#include "../queryoptimizer.h"
+#include "../repl.h"
+#include "../btree.h"
+#include "../../util/stringutils.h"
+#include "update.h"
+
+//#define DEBUGUPDATE(x) cout << x << endl;
+#define DEBUGUPDATE(x)
+
+namespace mongo {
+
+ const char* Mod::modNames[] = { "$inc", "$set", "$push", "$pushAll", "$pull", "$pullAll" , "$pop", "$unset" ,
+ "$bitand" , "$bitor" , "$bit" , "$addToSet", "$rename", "$rename"
+ };
+ unsigned Mod::modNamesNum = sizeof(Mod::modNames)/sizeof(char*);
+
+ bool Mod::_pullElementMatch( BSONElement& toMatch ) const {
+
+ if ( elt.type() != Object ) {
+ // if elt isn't an object, then comparison will work
+ return toMatch.woCompare( elt , false ) == 0;
+ }
+
+ if ( matcherOnPrimitive )
+ return matcher->matches( toMatch.wrap( "" ) );
+
+ if ( toMatch.type() != Object ) {
+ // looking for an object, so this can't match
+ return false;
+ }
+
+ // now we have an object on both sides
+ return matcher->matches( toMatch.embeddedObject() );
+ }
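+
+    /* For example: with { $pull : { scores : 5 } } the mod's elt is a primitive,
+       so plain woCompare equality decides each array element; with
+       { $pull : { scores : { $gt : 5 } } } the elt is an object, so a matcher is
+       consulted instead. (Examples illustrative.)
+    */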
+
+ template< class Builder >
+ void Mod::appendIncremented( Builder& bb , const BSONElement& in, ModState& ms ) const {
+ BSONType a = in.type();
+ BSONType b = elt.type();
+
+ if ( a == NumberDouble || b == NumberDouble ) {
+ ms.incType = NumberDouble;
+ ms.incdouble = elt.numberDouble() + in.numberDouble();
+ }
+ else if ( a == NumberLong || b == NumberLong ) {
+ ms.incType = NumberLong;
+ ms.inclong = elt.numberLong() + in.numberLong();
+ }
+ else {
+ int x = elt.numberInt() + in.numberInt();
+ if ( x < 0 && elt.numberInt() > 0 && in.numberInt() > 0 ) {
+ // overflow
+ ms.incType = NumberLong;
+ ms.inclong = elt.numberLong() + in.numberLong();
+ }
+ else {
+ ms.incType = NumberInt;
+ ms.incint = elt.numberInt() + in.numberInt();
+ }
+ }
+
+ ms.appendIncValue( bb , false );
+ }
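+
+    /* Promotion examples for the logic above (illustrative): int + int stays an
+       int unless the sum overflows, in which case it becomes a long; a double on
+       either side yields a double; otherwise a long on either side yields a long.
+
+           { a : 1 }      $inc { a : 1 }      ->  { a : 2 }   (NumberInt)
+           { a : 1 }      $inc { a : 1.5 }    ->  { a : 2.5 } (NumberDouble)
+           { a : 1<<30 }  $inc { a : 1<<30 }  ->  NumberLong( 2147483648 )
+    */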
+
+ template< class Builder >
+ void appendUnset( Builder &b ) {
+ }
+
+ template<>
+ void appendUnset( BSONArrayBuilder &b ) {
+ b.appendNull();
+ }
+
+ template< class Builder >
+ void Mod::apply( Builder& b , BSONElement in , ModState& ms ) const {
+ if ( ms.dontApply ) {
+ return;
+ }
+
+ switch ( op ) {
+
+ case INC: {
+ appendIncremented( b , in , ms );
+ break;
+ }
+
+ case SET: {
+ _checkForAppending( elt );
+ b.appendAs( elt , shortFieldName );
+ break;
+ }
+
+ case UNSET: {
+ appendUnset( b );
+ break;
+ }
+
+ case PUSH: {
+ uassert( 10131 , "$push can only be applied to an array" , in.type() == Array );
+ BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
+ BSONObjIterator i( in.embeddedObject() );
+ int n=0;
+ while ( i.more() ) {
+ bb.append( i.next() );
+ n++;
+ }
+
+ ms.pushStartSize = n;
+
+ bb.appendAs( elt , bb.numStr( n ) );
+ bb.done();
+ break;
+ }
+
+ case ADDTOSET: {
+ uassert( 12592 , "$addToSet can only be applied to an array" , in.type() == Array );
+ BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
+
+ BSONObjIterator i( in.embeddedObject() );
+ int n=0;
+
+ if ( isEach() ) {
+
+ BSONElementSet toadd;
+ parseEach( toadd );
+
+ while ( i.more() ) {
+ BSONElement cur = i.next();
+ bb.append( cur );
+ n++;
+ toadd.erase( cur );
+ }
+
+ {
+ BSONObjIterator i( getEach() );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( toadd.count(e) ) {
+ bb.appendAs( e , BSONObjBuilder::numStr( n++ ) );
+ toadd.erase( e );
+ }
+ }
+ }
+
+ }
+ else {
+
+ bool found = false;
+
+ while ( i.more() ) {
+ BSONElement cur = i.next();
+ bb.append( cur );
+ n++;
+ if ( elt.woCompare( cur , false ) == 0 )
+ found = true;
+ }
+
+ if ( ! found )
+ bb.appendAs( elt , bb.numStr( n ) );
+
+ }
+
+ bb.done();
+ break;
+ }
+
+
+
+ case PUSH_ALL: {
+ uassert( 10132 , "$pushAll can only be applied to an array" , in.type() == Array );
+            uassert( 10133 , "$pushAll has to be passed an array" , elt.type() == Array );
+
+ BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
+
+ BSONObjIterator i( in.embeddedObject() );
+ int n=0;
+ while ( i.more() ) {
+ bb.append( i.next() );
+ n++;
+ }
+
+ ms.pushStartSize = n;
+
+ i = BSONObjIterator( elt.embeddedObject() );
+ while ( i.more() ) {
+ bb.appendAs( i.next() , bb.numStr( n++ ) );
+ }
+
+ bb.done();
+ break;
+ }
+
+ case PULL:
+ case PULL_ALL: {
+ uassert( 10134 , "$pull/$pullAll can only be applied to an array" , in.type() == Array );
+ BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
+
+ int n = 0;
+
+ BSONObjIterator i( in.embeddedObject() );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ bool allowed = true;
+
+ if ( op == PULL ) {
+ allowed = ! _pullElementMatch( e );
+ }
+ else {
+ BSONObjIterator j( elt.embeddedObject() );
+ while( j.more() ) {
+ BSONElement arrJ = j.next();
+ if ( e.woCompare( arrJ, false ) == 0 ) {
+ allowed = false;
+ break;
+ }
+ }
+ }
+
+ if ( allowed )
+ bb.appendAs( e , bb.numStr( n++ ) );
+ }
+
+ bb.done();
+ break;
+ }
+
+ case POP: {
+ uassert( 10135 , "$pop can only be applied to an array" , in.type() == Array );
+ BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
+
+ int n = 0;
+
+ BSONObjIterator i( in.embeddedObject() );
+ if ( elt.isNumber() && elt.number() < 0 ) {
+ // pop from front
+ if ( i.more() ) {
+ i.next();
+ n++;
+ }
+
+ while( i.more() ) {
+ bb.appendAs( i.next() , bb.numStr( n - 1 ) );
+ n++;
+ }
+ }
+ else {
+ // pop from back
+ while( i.more() ) {
+ n++;
+ BSONElement arrI = i.next();
+ if ( i.more() ) {
+ bb.append( arrI );
+ }
+ }
+ }
+
+ ms.pushStartSize = n;
+ assert( ms.pushStartSize == in.embeddedObject().nFields() );
+ bb.done();
+ break;
+ }
+
+ case BIT: {
+            uassert( 10136 , "$bit needs an object" , elt.type() == Object );
+ uassert( 10137 , "$bit can only be applied to numbers" , in.isNumber() );
+ uassert( 10138 , "$bit cannot update a value of type double" , in.type() != NumberDouble );
+
+ int x = in.numberInt();
+ long long y = in.numberLong();
+
+ BSONObjIterator it( elt.embeddedObject() );
+ while ( it.more() ) {
+ BSONElement e = it.next();
+ uassert( 10139 , "$bit field must be number" , e.isNumber() );
+ if ( str::equals(e.fieldName(), "and") ) {
+ switch( in.type() ) {
+ case NumberInt: x = x&e.numberInt(); break;
+ case NumberLong: y = y&e.numberLong(); break;
+ default: assert( 0 );
+ }
+ }
+ else if ( str::equals(e.fieldName(), "or") ) {
+ switch( in.type() ) {
+ case NumberInt: x = x|e.numberInt(); break;
+ case NumberLong: y = y|e.numberLong(); break;
+ default: assert( 0 );
+ }
+ }
+ else {
+ uasserted(9016, str::stream() << "unknown $bit operation: " << e.fieldName());
+ }
+ }
+
+ switch( in.type() ) {
+ case NumberInt: b.append( shortFieldName , x ); break;
+ case NumberLong: b.append( shortFieldName , y ); break;
+ default: assert( 0 );
+ }
+
+ break;
+ }
+
+ case RENAME_FROM: {
+ break;
+ }
+
+ case RENAME_TO: {
+ ms.handleRename( b, shortFieldName );
+ break;
+ }
+
+ default:
+ stringstream ss;
+ ss << "Mod::apply can't handle type: " << op;
+ throw UserException( 9017, ss.str() );
+ }
+ }
+
+    // Return value:
+    //   -1 : the path passes through a non-object (which could be an array)
+    //    0 : the path is missing
+    //    1 : the path is found
+ int validRenamePath( BSONObj obj, const char *path ) {
+ while( const char *p = strchr( path, '.' ) ) {
+ string left( path, p - path );
+ BSONElement e = obj.getField( left );
+ if ( e.eoo() ) {
+ return 0;
+ }
+ if ( e.type() != Object ) {
+ return -1;
+ }
+ obj = e.embeddedObject();
+ path = p + 1;
+ }
+ return !obj.getField( path ).eoo();
+ }
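+
+    /* Examples, for obj = { a : { b : 1 }, c : [ 1 ] }:
+           validRenamePath( obj, "a.b" )   // 1, found
+           validRenamePath( obj, "a.x" )   // 0, missing
+           validRenamePath( obj, "c.0" )   // -1, path passes through a non-object
+    */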
+
+ auto_ptr<ModSetState> ModSet::prepare(const BSONObj &obj) const {
+ DEBUGUPDATE( "\t start prepare" );
+ auto_ptr<ModSetState> mss( new ModSetState( obj ) );
+
+
+ // Perform this check first, so that we don't leave a partially modified object on uassert.
+ for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); ++i ) {
+ DEBUGUPDATE( "\t\t prepare : " << i->first );
+ ModState& ms = mss->_mods[i->first];
+
+ const Mod& m = i->second;
+ BSONElement e = obj.getFieldDotted(m.fieldName);
+
+ ms.m = &m;
+ ms.old = e;
+
+ if ( m.op == Mod::RENAME_FROM ) {
+ int source = validRenamePath( obj, m.fieldName );
+ uassert( 13489, "$rename source field invalid", source != -1 );
+ if ( source != 1 ) {
+ ms.dontApply = true;
+ }
+ continue;
+ }
+
+ if ( m.op == Mod::RENAME_TO ) {
+ int source = validRenamePath( obj, m.renameFrom() );
+ if ( source == 1 ) {
+ int target = validRenamePath( obj, m.fieldName );
+ uassert( 13490, "$rename target field invalid", target != -1 );
+ ms.newVal = obj.getFieldDotted( m.renameFrom() );
+ mss->amIInPlacePossible( false );
+ }
+ else {
+ ms.dontApply = true;
+ }
+ continue;
+ }
+
+ if ( e.eoo() ) {
+ mss->amIInPlacePossible( m.op == Mod::UNSET );
+ continue;
+ }
+
+ switch( m.op ) {
+ case Mod::INC:
+ uassert( 10140 , "Cannot apply $inc modifier to non-number", e.isNumber() || e.eoo() );
+ if ( mss->amIInPlacePossible( e.isNumber() ) ) {
+ // check more typing info here
+ if ( m.elt.type() != e.type() ) {
+                    // if I'm incrementing with a double, then the storage has to be a double
+ mss->amIInPlacePossible( m.elt.type() != NumberDouble );
+ }
+
+ // check for overflow
+ if ( e.type() == NumberInt && e.numberLong() + m.elt.numberLong() > numeric_limits<int>::max() ) {
+ mss->amIInPlacePossible( false );
+ }
+ }
+ break;
+
+ case Mod::SET:
+ mss->amIInPlacePossible( m.elt.type() == e.type() &&
+ m.elt.valuesize() == e.valuesize() );
+ break;
+
+ case Mod::PUSH:
+ case Mod::PUSH_ALL:
+ uassert( 10141 , "Cannot apply $push/$pushAll modifier to non-array", e.type() == Array || e.eoo() );
+ mss->amIInPlacePossible( false );
+ break;
+
+ case Mod::PULL:
+ case Mod::PULL_ALL: {
+ uassert( 10142 , "Cannot apply $pull/$pullAll modifier to non-array", e.type() == Array || e.eoo() );
+ BSONObjIterator i( e.embeddedObject() );
+ while( mss->_inPlacePossible && i.more() ) {
+ BSONElement arrI = i.next();
+ if ( m.op == Mod::PULL ) {
+ mss->amIInPlacePossible( ! m._pullElementMatch( arrI ) );
+ }
+ else if ( m.op == Mod::PULL_ALL ) {
+ BSONObjIterator j( m.elt.embeddedObject() );
+ while( mss->_inPlacePossible && j.moreWithEOO() ) {
+ BSONElement arrJ = j.next();
+ if ( arrJ.eoo() )
+ break;
+ mss->amIInPlacePossible( arrI.woCompare( arrJ, false ) );
+ }
+ }
+ }
+ break;
+ }
+
+ case Mod::POP: {
+ uassert( 10143 , "Cannot apply $pop modifier to non-array", e.type() == Array || e.eoo() );
+ mss->amIInPlacePossible( e.embeddedObject().isEmpty() );
+ break;
+ }
+
+ case Mod::ADDTOSET: {
+ uassert( 12591 , "Cannot apply $addToSet modifier to non-array", e.type() == Array || e.eoo() );
+
+ BSONObjIterator i( e.embeddedObject() );
+ if ( m.isEach() ) {
+ BSONElementSet toadd;
+ m.parseEach( toadd );
+ while( i.more() ) {
+ BSONElement arrI = i.next();
+ toadd.erase( arrI );
+ }
+ mss->amIInPlacePossible( toadd.size() == 0 );
+ }
+ else {
+ bool found = false;
+ while( i.more() ) {
+ BSONElement arrI = i.next();
+ if ( arrI.woCompare( m.elt , false ) == 0 ) {
+ found = true;
+ break;
+ }
+ }
+ mss->amIInPlacePossible( found );
+ }
+ break;
+ }
+
+ default:
+ // mods we don't know about shouldn't be done in place
+ mss->amIInPlacePossible( false );
+ }
+ }
+
+ DEBUGUPDATE( "\t mss\n" << mss->toString() << "\t--" );
+
+ return mss;
+ }
+
+ void ModState::appendForOpLog( BSONObjBuilder& b ) const {
+ if ( dontApply ) {
+ return;
+ }
+
+ if ( incType ) {
+ DEBUGUPDATE( "\t\t\t\t\t appendForOpLog inc fieldname: " << m->fieldName << " short:" << m->shortFieldName );
+ BSONObjBuilder bb( b.subobjStart( "$set" ) );
+ appendIncValue( bb , true );
+ bb.done();
+ return;
+ }
+
+ if ( m->op == Mod::RENAME_FROM ) {
+ DEBUGUPDATE( "\t\t\t\t\t appendForOpLog RENAME_FROM fieldName:" << m->fieldName );
+ BSONObjBuilder bb( b.subobjStart( "$unset" ) );
+ bb.append( m->fieldName, 1 );
+ bb.done();
+ return;
+ }
+
+ if ( m->op == Mod::RENAME_TO ) {
+ DEBUGUPDATE( "\t\t\t\t\t appendForOpLog RENAME_TO fieldName:" << m->fieldName );
+ BSONObjBuilder bb( b.subobjStart( "$set" ) );
+ bb.appendAs( newVal, m->fieldName );
+ return;
+ }
+
+ const char * name = fixedOpName ? fixedOpName : Mod::modNames[op()];
+
+ DEBUGUPDATE( "\t\t\t\t\t appendForOpLog name:" << name << " fixed: " << fixed << " fn: " << m->fieldName );
+
+ BSONObjBuilder bb( b.subobjStart( name ) );
+ if ( fixed ) {
+ bb.appendAs( *fixed , m->fieldName );
+ }
+ else {
+ bb.appendAs( m->elt , m->fieldName );
+ }
+ bb.done();
+ }
+
+ string ModState::toString() const {
+ stringstream ss;
+ if ( fixedOpName )
+ ss << " fixedOpName: " << fixedOpName;
+ if ( fixed )
+ ss << " fixed: " << fixed;
+ return ss.str();
+ }
+
+ template< class Builder >
+ void ModState::handleRename( Builder &newObjBuilder, const char *shortFieldName ) {
+ newObjBuilder.appendAs( newVal , shortFieldName );
+ BSONObjBuilder b;
+ b.appendAs( newVal, shortFieldName );
+ assert( _objData.isEmpty() );
+ _objData = b.obj();
+ newVal = _objData.firstElement();
+ }
+
+ void ModSetState::applyModsInPlace( bool isOnDisk ) {
+        // TODO: I think this assert means that we can get rid of the isOnDisk param
+        // and just use isOwned as the determination
+ DEV assert( isOnDisk == ! _obj.isOwned() );
+
+ for ( ModStateHolder::iterator i = _mods.begin(); i != _mods.end(); ++i ) {
+ ModState& m = i->second;
+
+ if ( m.dontApply ) {
+ continue;
+ }
+
+ switch ( m.m->op ) {
+ case Mod::UNSET:
+ case Mod::ADDTOSET:
+ case Mod::RENAME_FROM:
+ case Mod::RENAME_TO:
+ // this should have been handled by prepare
+ break;
+ case Mod::PULL:
+ case Mod::PULL_ALL:
+ // this should have been handled by prepare
+ break;
+ case Mod::POP:
+ assert( m.old.eoo() || ( m.old.isABSONObj() && m.old.Obj().isEmpty() ) );
+ break;
+ // [dm] the BSONElementManipulator statements below are for replication (correct?)
+ case Mod::INC:
+ if ( isOnDisk )
+ m.m->IncrementMe( m.old );
+ else
+ m.m->incrementMe( m.old );
+ m.fixedOpName = "$set";
+ m.fixed = &(m.old);
+ break;
+ case Mod::SET:
+ if ( isOnDisk )
+ BSONElementManipulator( m.old ).ReplaceTypeAndValue( m.m->elt );
+ else
+ BSONElementManipulator( m.old ).replaceTypeAndValue( m.m->elt );
+ break;
+ default:
+ uassert( 13478 , "can't apply mod in place - shouldn't have gotten here" , 0 );
+ }
+ }
+ }
+
+ void ModSet::extractFields( map< string, BSONElement > &fields, const BSONElement &top, const string &base ) {
+ if ( top.type() != Object ) {
+ fields[ base + top.fieldName() ] = top;
+ return;
+ }
+ BSONObjIterator i( top.embeddedObject() );
+ bool empty = true;
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ extractFields( fields, e, base + top.fieldName() + "." );
+ empty = false;
+ }
+ if ( empty )
+ fields[ base + top.fieldName() ] = top;
+ }
+
+ template< class Builder >
+ void ModSetState::_appendNewFromMods( const string& root , ModState& m , Builder& b , set<string>& onedownseen ) {
+ const char * temp = m.fieldName();
+ temp += root.size();
+ const char * dot = strchr( temp , '.' );
+ if ( dot ) {
+ string nr( m.fieldName() , 0 , 1 + ( dot - m.fieldName() ) );
+ string nf( temp , 0 , dot - temp );
+ if ( onedownseen.count( nf ) )
+ return;
+ onedownseen.insert( nf );
+ BSONObjBuilder bb ( b.subobjStart( nf ) );
+ createNewFromMods( nr , bb , BSONObj() ); // don't infer an array from name
+ bb.done();
+ }
+ else {
+ appendNewFromMod( m , b );
+ }
+
+ }
+
+ template< class Builder >
+ void ModSetState::createNewFromMods( const string& root , Builder& b , const BSONObj &obj ) {
+ DEBUGUPDATE( "\t\t createNewFromMods root: " << root );
+ BSONObjIteratorSorted es( obj );
+ BSONElement e = es.next();
+
+ ModStateHolder::iterator m = _mods.lower_bound( root );
+ StringBuilder buf(root.size() + 2 );
+ buf << root << (char)255;
+ ModStateHolder::iterator mend = _mods.lower_bound( buf.str() );
+
+ set<string> onedownseen;
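+ // merge the document's elements (sorted) with the mods (sorted), emitting in field order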
+
+ while ( e.type() && m != mend ) {
+ string field = root + e.fieldName();
+ FieldCompareResult cmp = compareDottedFieldNames( m->second.m->fieldName , field );
+
+ DEBUGUPDATE( "\t\t\t field:" << field << "\t mod:" << m->second.m->fieldName << "\t cmp:" << cmp << "\t short: " << e.fieldName() );
+
+ switch ( cmp ) {
+
+ case LEFT_SUBFIELD: { // Mod is embedded under this element
+ uassert( 10145 , str::stream() << "LEFT_SUBFIELD only supports Object: " << field << " not: " << e.type() , e.type() == Object || e.type() == Array );
+ if ( onedownseen.count( e.fieldName() ) == 0 ) {
+ onedownseen.insert( e.fieldName() );
+ if ( e.type() == Object ) {
+ BSONObjBuilder bb( b.subobjStart( e.fieldName() ) );
+ stringstream nr; nr << root << e.fieldName() << ".";
+ createNewFromMods( nr.str() , bb , e.embeddedObject() );
+ bb.done();
+ }
+ else {
+ BSONArrayBuilder ba( b.subarrayStart( e.fieldName() ) );
+ stringstream nr; nr << root << e.fieldName() << ".";
+ createNewFromMods( nr.str() , ba , e.embeddedObject() );
+ ba.done();
+ }
+ // inc both as we handled both
+ e = es.next();
+ m++;
+ }
+ else {
+ // this is a very weird case: we have seen it in production
+ // but cannot reproduce it. the assert prevents an infinite
+ // loop, but is likely not the correct fix.
+ assert(0);
+ }
+ continue;
+ }
+ case LEFT_BEFORE: // Mod on a field that doesn't exist
+ DEBUGUPDATE( "\t\t\t\t creating new field for: " << m->second.m->fieldName );
+ _appendNewFromMods( root , m->second , b , onedownseen );
+ m++;
+ continue;
+ case SAME:
+ DEBUGUPDATE( "\t\t\t\t applying mod on: " << m->second.m->fieldName );
+ m->second.apply( b , e );
+ e = es.next();
+ m++;
+ continue;
+ case RIGHT_BEFORE: // field that doesn't have a MOD
+ DEBUGUPDATE( "\t\t\t\t just copying" );
+ b.append( e ); // if array, ignore field name
+ e = es.next();
+ continue;
+ case RIGHT_SUBFIELD:
+ massert( 10399 , "ModSet::createNewFromMods - RIGHT_SUBFIELD should be impossible" , 0 );
+ break;
+ default:
+ massert( 10400 , "unhandled case" , 0 );
+ }
+ }
+
+ // finished looping the mods, just adding the rest of the elements
+ while ( e.type() ) {
+ DEBUGUPDATE( "\t\t\t copying: " << e.fieldName() );
+ b.append( e ); // if array, ignore field name
+ e = es.next();
+ }
+
+ // do mods that don't have fields already
+ for ( ; m != mend; m++ ) {
+ DEBUGUPDATE( "\t\t\t\t appending from mod at end: " << m->second.m->fieldName );
+ _appendNewFromMods( root , m->second , b , onedownseen );
+ }
+ }
+
+ BSONObj ModSetState::createNewFromMods() {
+ BSONObjBuilder b( (int)(_obj.objsize() * 1.1) );
+ createNewFromMods( "" , b , _obj );
+ return _newFromMods = b.obj();
+ }
+
+ string ModSetState::toString() const {
+ stringstream ss;
+ for ( ModStateHolder::const_iterator i=_mods.begin(); i!=_mods.end(); ++i ) {
+ ss << "\t\t" << i->first << "\t" << i->second.toString() << "\n";
+ }
+ return ss.str();
+ }
+
+ bool ModSetState::FieldCmp::operator()( const string &l, const string &r ) const {
+ return lexNumCmp( l.c_str(), r.c_str() ) < 0;
+ }
+
+ BSONObj ModSet::createNewFromQuery( const BSONObj& query ) {
+ BSONObj newObj;
+
+ {
+ BSONObjBuilder bb;
+ EmbeddedBuilder eb( &bb );
+ BSONObjIteratorSorted i( query );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.fieldName()[0] == '$' ) // for $atomic and anything else we add
+ continue;
+
+ if ( e.type() == Object && e.embeddedObject().firstElementFieldName()[0] == '$' ) {
+ // this is a $gt-style filter clause, so don't make it part of the new object
+ continue;
+ }
+
+ eb.appendAs( e , e.fieldName() );
+ }
+ eb.done();
+ newObj = bb.obj();
+ }
+
+ auto_ptr<ModSetState> mss = prepare( newObj );
+
+ if ( mss->canApplyInPlace() )
+ mss->applyModsInPlace( false );
+ else
+ newObj = mss->createNewFromMods();
+
+ return newObj;
+ }
+
+ /* get special operations like $inc
+ { $inc: { a:1, b:1 } }
+ { $set: { a:77 } }
+ { $push: { a:55 } }
+ { $pushAll: { a:[77,88] } }
+ { $pull: { a:66 } }
+ { $pullAll : { a:[99,1010] } }
+ NOTE: modifies the source 'from' object!
+ */
+ ModSet::ModSet(
+ const BSONObj &from ,
+ const set<string>& idxKeys,
+ const set<string> *backgroundKeys)
+ : _isIndexed(0) , _hasDynamicArray( false ) {
+
+ BSONObjIterator it(from);
+
+ while ( it.more() ) {
+ BSONElement e = it.next();
+ const char *fn = e.fieldName();
+
+ uassert( 10147 , "Invalid modifier specified: " + string( fn ), e.type() == Object );
+ BSONObj j = e.embeddedObject();
+ DEBUGUPDATE( "\t" << j );
+
+ BSONObjIterator jt(j);
+ Mod::Op op = opFromStr( fn );
+
+ while ( jt.more() ) {
+ BSONElement f = jt.next(); // x:44
+
+ const char * fieldName = f.fieldName();
+
+ uassert( 15896 , "Modified field name may not start with $", fieldName[0] != '$' || op == Mod::UNSET ); // allow removal of an invalid field name in case it was inserted before this check was added (~ version 2.1)
+ uassert( 10148 , "Mod on _id not allowed", strcmp( fieldName, "_id" ) != 0 );
+ uassert( 10149 , "Invalid mod field name, may not end in a period", fieldName[ strlen( fieldName ) - 1 ] != '.' );
+ uassert( 10150 , "Field name duplication not allowed with modifiers", ! haveModForField( fieldName ) );
+ uassert( 10151 , "have conflicting mods in update" , ! haveConflictingMod( fieldName ) );
+ uassert( 10152 , "Modifier $inc allowed for numbers only", f.isNumber() || op != Mod::INC );
+ uassert( 10153 , "Modifier $pushAll/pullAll allowed for arrays only", f.type() == Array || ( op != Mod::PUSH_ALL && op != Mod::PULL_ALL ) );
+
+ if ( op == Mod::RENAME_TO ) {
+ uassert( 13494, "$rename target must be a string", f.type() == String );
+ const char *target = f.valuestr();
+ uassert( 13495, "$rename source must differ from target", strcmp( fieldName, target ) != 0 );
+ uassert( 13496, "invalid mod field name, source may not be empty", fieldName[0] );
+ uassert( 13479, "invalid mod field name, target may not be empty", target[0] );
+ uassert( 13480, "invalid mod field name, source may not begin or end in period", fieldName[0] != '.' && fieldName[ strlen( fieldName ) - 1 ] != '.' );
+ uassert( 13481, "invalid mod field name, target may not begin or end in period", target[0] != '.' && target[ strlen( target ) - 1 ] != '.' );
+ uassert( 13482, "$rename affecting _id not allowed", !( fieldName[0] == '_' && fieldName[1] == 'i' && fieldName[2] == 'd' && ( !fieldName[3] || fieldName[3] == '.' ) ) );
+ uassert( 13483, "$rename affecting _id not allowed", !( target[0] == '_' && target[1] == 'i' && target[2] == 'd' && ( !target[3] || target[3] == '.' ) ) );
+ uassert( 13484, "field name duplication not allowed with $rename target", !haveModForField( target ) );
+ uassert( 13485, "conflicting mods not allowed with $rename target", !haveConflictingMod( target ) );
+ uassert( 13486, "$rename target may not be a parent of source", !( strncmp( fieldName, target, strlen( target ) ) == 0 && fieldName[ strlen( target ) ] == '.' ) );
+ uassert( 13487, "$rename source may not be dynamic array", strstr( fieldName , ".$" ) == 0 );
+ uassert( 13488, "$rename target may not be dynamic array", strstr( target , ".$" ) == 0 );
+
+ Mod from;
+ from.init( Mod::RENAME_FROM, f );
+ from.setFieldName( fieldName );
+ updateIsIndexed( from, idxKeys, backgroundKeys );
+ _mods[ from.fieldName ] = from;
+
+ Mod to;
+ to.init( Mod::RENAME_TO, f );
+ to.setFieldName( target );
+ updateIsIndexed( to, idxKeys, backgroundKeys );
+ _mods[ to.fieldName ] = to;
+
+ DEBUGUPDATE( "\t\t " << fieldName << "\t" << from.fieldName << "\t" << to.fieldName );
+ continue;
+ }
+
+ _hasDynamicArray = _hasDynamicArray || strstr( fieldName , ".$" ) != 0;
+
+ Mod m;
+ m.init( op , f );
+ m.setFieldName( f.fieldName() );
+ updateIsIndexed( m, idxKeys, backgroundKeys );
+ _mods[m.fieldName] = m;
+
+ DEBUGUPDATE( "\t\t " << fieldName << "\t" << m.fieldName << "\t" << _hasDynamicArray );
+ }
+ }
+
+ }
+
+ ModSet * ModSet::fixDynamicArray( const char * elemMatchKey ) const {
+ ModSet * n = new ModSet();
+ n->_isIndexed = _isIndexed;
+ n->_hasDynamicArray = _hasDynamicArray;
+ for ( ModHolder::const_iterator i=_mods.begin(); i!=_mods.end(); i++ ) {
+ string s = i->first;
+ size_t idx = s.find( ".$" );
+ if ( idx == string::npos ) {
+ n->_mods[s] = i->second;
+ continue;
+ }
+ StringBuilder buf(s.size()+strlen(elemMatchKey));
+ buf << s.substr(0,idx+1) << elemMatchKey << s.substr(idx+2);
+ string fixed = buf.str();
+ DEBUGUPDATE( "fixed dynamic: " << s << " -->> " << fixed );
+ n->_mods[fixed] = i->second;
+ ModHolder::iterator temp = n->_mods.find( fixed );
+ temp->second.setFieldName( temp->first.c_str() );
+ }
+ return n;
+ }
+
+ void checkNoMods( BSONObj o ) {
+ BSONObjIterator i( o );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ uassert( 10154 , "Modifiers and non-modifiers cannot be mixed", e.fieldName()[ 0 ] != '$' );
+ }
+ }
+
+ static void checkTooLarge(const BSONObj& newObj) {
+ uassert( 12522 , "$ operator made object too large" , newObj.objsize() <= BSONObjMaxUserSize );
+ }
+
+ /* note: this is only (as-is) called when:
+
+ - not multi
+ - mods are not indexed
+ - not upsert
+ */
+ static UpdateResult _updateById(bool isOperatorUpdate, int idIdxNo, ModSet *mods, int profile, NamespaceDetails *d,
+ NamespaceDetailsTransient *nsdt,
+ bool god, const char *ns,
+ const BSONObj& updateobj, BSONObj patternOrig, bool logop, OpDebug& debug) {
+
+ DiskLoc loc;
+ {
+ IndexDetails& i = d->idx(idIdxNo);
+ BSONObj key = i.getKeyFromQuery( patternOrig );
+ loc = i.idxInterface().findSingle(i, i.head, key);
+ if( loc.isNull() ) {
+ // no upsert support in _updateById yet, so we are done.
+ return UpdateResult(0, 0, 0);
+ }
+ }
+ Record *r = loc.rec();
+
+ if ( ! r->likelyInPhysicalMemory() ) {
+ {
+ scoped_ptr<LockMongoFilesShared> lk( new LockMongoFilesShared() );
+ dbtempreleasewritelock t;
+ r->touch();
+ lk.reset(0); // we have to release mmmutex before we can re-acquire dbmutex
+ }
+
+ {
+ // we need to re-find in case something changed
+ d = nsdetails( ns );
+ if ( ! d ) {
+ // dropped
+ return UpdateResult(0, 0, 0);
+ }
+ nsdt = &NamespaceDetailsTransient::get(ns);
+ IndexDetails& i = d->idx(idIdxNo);
+ BSONObj key = i.getKeyFromQuery( patternOrig );
+ loc = i.idxInterface().findSingle(i, i.head, key);
+ if( loc.isNull() ) {
+ // no upsert support in _updateById yet, so we are done.
+ return UpdateResult(0, 0, 0);
+ }
+
+ r = loc.rec();
+ }
+ }
+
+ /* look for $inc etc. note that, as written, all fields in the update must use
+ operator mods; you can't also set regular fields at the moment. */
+ if ( isOperatorUpdate ) {
+ const BSONObj& onDisk = loc.obj();
+ auto_ptr<ModSetState> mss = mods->prepare( onDisk );
+
+ if( mss->canApplyInPlace() ) {
+ mss->applyModsInPlace(true);
+ DEBUGUPDATE( "\t\t\t updateById doing in place update" );
+ }
+ else {
+ BSONObj newObj = mss->createNewFromMods();
+ checkTooLarge(newObj);
+ assert(nsdt);
+ theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug);
+ }
+
+ if ( logop ) {
+ DEV assert( mods->size() );
+
+ BSONObj pattern = patternOrig;
+ if ( mss->haveArrayDepMod() ) {
+ BSONObjBuilder patternBuilder;
+ patternBuilder.appendElements( pattern );
+ mss->appendSizeSpecForArrayDepMods( patternBuilder );
+ pattern = patternBuilder.obj();
+ }
+
+ if( mss->needOpLogRewrite() ) {
+ DEBUGUPDATE( "\t rewrite update: " << mss->getOpLogRewrite() );
+ logOp("u", ns, mss->getOpLogRewrite() , &pattern );
+ }
+ else {
+ logOp("u", ns, updateobj, &pattern );
+ }
+ }
+ return UpdateResult( 1 , 1 , 1);
+ } // end $operator update
+
+ // regular update
+ BSONElementManipulator::lookForTimestamps( updateobj );
+ checkNoMods( updateobj );
+ assert(nsdt);
+ theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug );
+ if ( logop ) {
+ logOp("u", ns, updateobj, &patternOrig );
+ }
+ return UpdateResult( 1 , 0 , 1 );
+ }
+
+ UpdateResult _updateObjects(bool god, const char *ns, const BSONObj& updateobj, BSONObj patternOrig, bool upsert, bool multi, bool logop , OpDebug& debug, RemoveSaver* rs ) {
+ DEBUGUPDATE( "update: " << ns << " update: " << updateobj << " query: " << patternOrig << " upsert: " << upsert << " multi: " << multi );
+ Client& client = cc();
+ int profile = client.database()->profile;
+
+ debug.updateobj = updateobj;
+
+ // the idea with these here is to make them loop-invariant for multi updates, and thus a bit faster for that case
+ // The pointers may be left invalid on a failed or terminal yield recovery.
+ NamespaceDetails *d = nsdetails(ns); // can be null if an upsert...
+ NamespaceDetailsTransient *nsdt = &NamespaceDetailsTransient::get(ns);
+
+ auto_ptr<ModSet> mods;
+ bool isOperatorUpdate = updateobj.firstElementFieldName()[0] == '$';
+ int modsIsIndexed = 0; // really the # of indexes
+ if ( isOperatorUpdate ) {
+ if( d && d->indexBuildInProgress ) {
+ set<string> bgKeys;
+ d->inProgIdx().keyPattern().getFieldNames(bgKeys);
+ mods.reset( new ModSet(updateobj, nsdt->indexKeys(), &bgKeys) );
+ }
+ else {
+ mods.reset( new ModSet(updateobj, nsdt->indexKeys()) );
+ }
+ modsIsIndexed = mods->isIndexed();
+ }
+
+ if( !multi && isSimpleIdQuery(patternOrig) && d && !modsIsIndexed ) {
+ int idxNo = d->findIdIndex();
+ if( idxNo >= 0 ) {
+ debug.idhack = true;
+ UpdateResult result = _updateById(isOperatorUpdate, idxNo, mods.get(), profile, d, nsdt, god, ns, updateobj, patternOrig, logop, debug);
+ if ( result.existing || ! upsert ) {
+ return result;
+ }
+ else if ( upsert && ! isOperatorUpdate && ! logop) {
+ // this handles repl inserts
+ checkNoMods( updateobj );
+ debug.upsert = true;
+ BSONObj no = updateobj;
+ theDataFileMgr.insertWithObjMod(ns, no, god);
+ return UpdateResult( 0 , 0 , 1 , no );
+ }
+ }
+ }
+
+ int numModded = 0;
+ long long nscanned = 0;
+ shared_ptr< Cursor > c = NamespaceDetailsTransient::getCursor( ns, patternOrig );
+
+ d = nsdetails(ns);
+ nsdt = &NamespaceDetailsTransient::get(ns);
+ bool autoDedup = c->autoDedup();
+
+ if( c->ok() ) {
+ set<DiskLoc> seenObjects;
+ MatchDetails details;
+ auto_ptr<ClientCursor> cc;
+ do {
+ nscanned++;
+
+ bool atomic = c->matcher() && c->matcher()->docMatcher().atomic();
+
+ if ( !atomic ) {
+ // *****************
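+ // yield periodically so a long-running (multi) update doesn't hold the write lock indefinitely;
+ // after a yield, the namespace metadata pointers (d, nsdt) must be re-fetched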
+ if ( cc.get() == 0 ) {
+ shared_ptr< Cursor > cPtr = c;
+ cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) );
+ }
+
+ bool didYield;
+ if ( ! cc->yieldSometimes( ClientCursor::WillNeed, &didYield ) ) {
+ cc.release();
+ break;
+ }
+ if ( !c->ok() ) {
+ break;
+ }
+
+ if ( didYield ) {
+ d = nsdetails(ns);
+ nsdt = &NamespaceDetailsTransient::get(ns);
+ }
+ // *****************
+ }
+
+ if ( !c->currentMatches( &details ) ) {
+ c->advance();
+
+ if ( nscanned % 256 == 0 && ! atomic ) {
+ if ( cc.get() == 0 ) {
+ shared_ptr< Cursor > cPtr = c;
+ cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) );
+ }
+ if ( ! cc->yield() ) {
+ cc.release();
+ // TODO should we assert or something?
+ break;
+ }
+ if ( !c->ok() ) {
+ break;
+ }
+ d = nsdetails(ns);
+ nsdt = &NamespaceDetailsTransient::get(ns);
+ }
+ continue;
+ }
+
+ Record *r = c->_current();
+ DiskLoc loc = c->currLoc();
+
+ // TODO Maybe this is unnecessary since we have seenObjects
+ if ( c->getsetdup( loc ) && autoDedup ) {
+ c->advance();
+ continue;
+ }
+
+ BSONObj js(r);
+
+ BSONObj pattern = patternOrig;
+
+ if ( logop ) {
+ BSONObjBuilder idPattern;
+ BSONElement id;
+ // NOTE: If the matching object lacks an id, we'll log
+ // with the original pattern. This isn't replay-safe.
+ // It might make sense to suppress the log instead
+ // if there's no id.
+ if ( js.getObjectID( id ) ) {
+ idPattern.append( id );
+ pattern = idPattern.obj();
+ }
+ else {
+ uassert( 10157 , "multi-update requires all modified objects to have an _id" , ! multi );
+ }
+ }
+
+ if ( profile && !multi )
+ debug.nscanned = (int) nscanned;
+
+ /* look for $inc etc. note that, as written, all fields in the update must use
+ operator mods; you can't also set regular fields at the moment. */
+ if ( isOperatorUpdate ) {
+
+ if ( multi ) {
+ c->advance(); // go to next record in case this one moves
+ if ( autoDedup && seenObjects.count( loc ) )
+ continue;
+ }
+
+ const BSONObj& onDisk = loc.obj();
+
+ ModSet * useMods = mods.get();
+ bool forceRewrite = false;
+
+ auto_ptr<ModSet> mymodset;
+ if ( details._elemMatchKey && mods->hasDynamicArray() ) {
+ useMods = mods->fixDynamicArray( details._elemMatchKey );
+ mymodset.reset( useMods );
+ forceRewrite = true;
+ }
+
+ auto_ptr<ModSetState> mss = useMods->prepare( onDisk );
+
+ bool willAdvanceCursor = multi && c->ok() && ( modsIsIndexed || ! mss->canApplyInPlace() );
+
+ if ( willAdvanceCursor ) {
+ if ( cc.get() ) {
+ cc->setDoingDeletes( true );
+ }
+ c->prepareToTouchEarlierIterate();
+ }
+
+ if ( modsIsIndexed <= 0 && mss->canApplyInPlace() ) {
+ mss->applyModsInPlace( true );// const_cast<BSONObj&>(onDisk) );
+
+ DEBUGUPDATE( "\t\t\t doing in place update" );
+ if ( profile && !multi )
+ debug.fastmod = true;
+
+ if ( modsIsIndexed ) {
+ seenObjects.insert( loc );
+ }
+
+ d->paddingFits();
+ }
+ else {
+ if ( rs )
+ rs->goingToDelete( onDisk );
+
+ BSONObj newObj = mss->createNewFromMods();
+ checkTooLarge(newObj);
+ DiskLoc newLoc = theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug);
+ if ( newLoc != loc || modsIsIndexed ){
+ // log() << "Moved obj " << newLoc.obj()["_id"] << " from " << loc << " to " << newLoc << endl;
+ // object moved; make sure we don't process it again
+ seenObjects.insert( newLoc );
+ }
+
+ }
+
+ if ( logop ) {
+ DEV assert( mods->size() );
+
+ if ( mss->haveArrayDepMod() ) {
+ BSONObjBuilder patternBuilder;
+ patternBuilder.appendElements( pattern );
+ mss->appendSizeSpecForArrayDepMods( patternBuilder );
+ pattern = patternBuilder.obj();
+ }
+
+ if ( forceRewrite || mss->needOpLogRewrite() ) {
+ DEBUGUPDATE( "\t rewrite update: " << mss->getOpLogRewrite() );
+ logOp("u", ns, mss->getOpLogRewrite() , &pattern );
+ }
+ else {
+ logOp("u", ns, updateobj, &pattern );
+ }
+ }
+ numModded++;
+ if ( ! multi )
+ return UpdateResult( 1 , 1 , numModded );
+ if ( willAdvanceCursor )
+ c->recoverFromTouchingEarlierIterate();
+
+ if ( nscanned % 64 == 0 && ! atomic ) {
+ if ( cc.get() == 0 ) {
+ shared_ptr< Cursor > cPtr = c;
+ cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) );
+ }
+ if ( ! cc->yield() ) {
+ cc.release();
+ break;
+ }
+ if ( !c->ok() ) {
+ break;
+ }
+ d = nsdetails(ns);
+ nsdt = &NamespaceDetailsTransient::get(ns);
+ }
+
+ getDur().commitIfNeeded();
+
+ continue;
+ }
+
+ uassert( 10158 , "multi update only works with $ operators" , ! multi );
+
+ BSONElementManipulator::lookForTimestamps( updateobj );
+ checkNoMods( updateobj );
+ theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug, god);
+ if ( logop ) {
+ DEV wassert( !god ); // god doesn't get logged, this would be bad.
+ logOp("u", ns, updateobj, &pattern );
+ }
+ return UpdateResult( 1 , 0 , 1 );
+ } while ( c->ok() );
+ } // endif
+
+ if ( numModded )
+ return UpdateResult( 1 , 1 , numModded );
+
+ // todo: is the "if ( profile )" guard needed here? it probably just makes things slower.
+ if ( profile )
+ debug.nscanned = (int) nscanned;
+
+ if ( upsert ) {
+ if ( updateobj.firstElementFieldName()[0] == '$' ) {
+ // upsert of an $operation. build a default object
+ BSONObj newObj = mods->createNewFromQuery( patternOrig );
+ checkNoMods( newObj );
+ debug.fastmodinsert = true;
+ theDataFileMgr.insertWithObjMod(ns, newObj, god);
+ if ( logop )
+ logOp( "i", ns, newObj );
+
+ return UpdateResult( 0 , 1 , 1 , newObj );
+ }
+ uassert( 10159 , "multi update only works with $ operators" , ! multi );
+ checkNoMods( updateobj );
+ debug.upsert = true;
+ BSONObj no = updateobj;
+ theDataFileMgr.insertWithObjMod(ns, no, god);
+ if ( logop )
+ logOp( "i", ns, no );
+ return UpdateResult( 0 , 0 , 1 , no );
+ }
+
+ return UpdateResult( 0 , isOperatorUpdate , 0 );
+ }
+
+ UpdateResult updateObjects(const char *ns, const BSONObj& updateobj, BSONObj patternOrig, bool upsert, bool multi, bool logop , OpDebug& debug ) {
+ uassert( 10155 , "cannot update reserved $ collection", strchr(ns, '$') == 0 );
+ if ( strstr(ns, ".system.") ) {
+ /* dm: it's very important that system.indexes is never updated as IndexDetails has pointers into it */
+ uassert( 10156 , str::stream() << "cannot update system collection: " << ns << " q: " << patternOrig << " u: " << updateobj , legalClientSystemNS( ns , true ) );
+ }
+ return _updateObjects(false, ns, updateobj, patternOrig, upsert, multi, logop, debug);
+ }
+
+}
diff --git a/src/mongo/db/ops/update.h b/src/mongo/db/ops/update.h
new file mode 100644
index 00000000000..9446db06d36
--- /dev/null
+++ b/src/mongo/db/ops/update.h
@@ -0,0 +1,700 @@
+// update.h
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "../../pch.h"
+#include "../jsobj.h"
+#include "../../util/embedded_builder.h"
+#include "../matcher.h"
+
+namespace mongo {
+
+ // ---------- public -------------
+
+ struct UpdateResult {
+ const bool existing; // if existing objects were modified
+ const bool mod; // was this a $ mod
+ const long long num; // how many objects touched
+ OID upserted; // if something was upserted, the new _id of the object
+
+ UpdateResult( bool e, bool m, unsigned long long n , const BSONObj& upsertedObject = BSONObj() )
+ : existing(e) , mod(m), num(n) {
+ upserted.clear();
+ BSONElement id = upsertedObject["_id"];
+ if ( ! e && n == 1 && id.type() == jstOID ) {
+ upserted = id.OID();
+ }
+ }
+ };
+
+ class RemoveSaver;
+
+ /* returns true if an existing object was updated, false if no existing object was found.
+ multi - update multiple objects - mostly useful with things like $set
+ god - allow access to system namespaces
+ */
+ UpdateResult updateObjects(const char *ns, const BSONObj& updateobj, BSONObj pattern, bool upsert, bool multi , bool logop , OpDebug& debug );
+ UpdateResult _updateObjects(bool god, const char *ns, const BSONObj& updateobj, BSONObj pattern,
+ bool upsert, bool multi , bool logop , OpDebug& debug , RemoveSaver * rs = 0 );
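+
+    /* illustrative sketch (not part of the original source): apply { $inc : { x : 1 } }
+       to the document matching { _id : 1 } in test.foo, logging the op:
+
+           OpDebug debug;
+           UpdateResult res = updateObjects( "test.foo",
+                                             BSON( "$inc" << BSON( "x" << 1 ) ),
+                                             BSON( "_id" << 1 ),
+                                             false, false, true, debug );
+    */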
+
+
+
+ // ---------- private -------------
+
+ class ModState;
+ class ModSetState;
+
+ /* Used for modifiers such as $inc, $set, $push, ...
+ * stores the info about a single operation
+ * once created should never be modified
+ */
+ struct Mod {
+ // See opFromStr below
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13
+ enum Op { INC, SET, PUSH, PUSH_ALL, PULL, PULL_ALL , POP, UNSET, BITAND, BITOR , BIT , ADDTOSET, RENAME_FROM, RENAME_TO } op;
+
+ static const char* modNames[];
+ static unsigned modNamesNum;
+
+ const char *fieldName;
+ const char *shortFieldName;
+
+ BSONElement elt; // x:5 note: this is the actual element from the updateobj
+ boost::shared_ptr<Matcher> matcher;
+ bool matcherOnPrimitive;
+
+ void init( Op o , BSONElement& e ) {
+ op = o;
+ elt = e;
+ if ( op == PULL && e.type() == Object ) {
+ BSONObj t = e.embeddedObject();
+ if ( t.firstElement().getGtLtOp() == 0 ) {
+ matcher.reset( new Matcher( t ) );
+ matcherOnPrimitive = false;
+ }
+ else {
+ matcher.reset( new Matcher( BSON( "" << t ) ) );
+ matcherOnPrimitive = true;
+ }
+ }
+ }
+
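+ // e.g. setFieldName( "a.b.c" ) leaves fieldName = "a.b.c" and shortFieldName = "c"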
+ void setFieldName( const char * s ) {
+ fieldName = s;
+ shortFieldName = strrchr( fieldName , '.' );
+ if ( shortFieldName )
+ shortFieldName++;
+ else
+ shortFieldName = fieldName;
+ }
+
+ /**
+ * @param in increments the actual value inside in
+ */
+ void incrementMe( BSONElement& in ) const {
+ BSONElementManipulator manip( in );
+ switch ( in.type() ) {
+ case NumberDouble:
+ manip.setNumber( elt.numberDouble() + in.numberDouble() );
+ break;
+ case NumberLong:
+ manip.setLong( elt.numberLong() + in.numberLong() );
+ break;
+ case NumberInt:
+ manip.setInt( elt.numberInt() + in.numberInt() );
+ break;
+ default:
+ assert(0);
+ }
+ }
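+ /** as incrementMe, but through the durable (journaled) write path;
+ applyModsInPlace uses this variant when the target object is on disk */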
+ void IncrementMe( BSONElement& in ) const {
+ BSONElementManipulator manip( in );
+ switch ( in.type() ) {
+ case NumberDouble:
+ manip.SetNumber( elt.numberDouble() + in.numberDouble() );
+ break;
+ case NumberLong:
+ manip.SetLong( elt.numberLong() + in.numberLong() );
+ break;
+ case NumberInt:
+ manip.SetInt( elt.numberInt() + in.numberInt() );
+ break;
+ default:
+ assert(0);
+ }
+ }
+
+ template< class Builder >
+ void appendIncremented( Builder& bb , const BSONElement& in, ModState& ms ) const;
+
+ bool operator<( const Mod &other ) const {
+ return strcmp( fieldName, other.fieldName ) < 0;
+ }
+
+ bool arrayDep() const {
+ switch (op) {
+ case PUSH:
+ case PUSH_ALL:
+ case POP:
+ return true;
+ default:
+ return false;
+ }
+ }
+
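+ // a mod on "a.b" counts as indexed if idxKeys contains a parent ("a"), the field itself ("a.b"), or a child ("a.b.c")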
+ static bool isIndexed( const string& fullName , const set<string>& idxKeys ) {
+ const char * fieldName = fullName.c_str();
+ // check if there is an index key that is a parent of mod
+ for( const char *dot = strchr( fieldName, '.' ); dot; dot = strchr( dot + 1, '.' ) )
+ if ( idxKeys.count( string( fieldName, dot - fieldName ) ) )
+ return true;
+
+ // check if there is an index key equal to mod
+ if ( idxKeys.count(fullName) )
+ return true;
+ // check if there is an index key that is a child of mod
+ set< string >::const_iterator j = idxKeys.upper_bound( fullName );
+ if ( j != idxKeys.end() && j->find( fullName ) == 0 && (*j)[fullName.size()] == '.' )
+ return true;
+
+ return false;
+ }
+
+ bool isIndexed( const set<string>& idxKeys ) const {
+ string fullName = fieldName;
+
+ if ( isIndexed( fullName , idxKeys ) )
+ return true;
+
+ if ( strstr( fieldName , "." ) ) {
+ // check for a.0.1
+ StringBuilder buf( fullName.size() + 1 );
+ for ( size_t i=0; i<fullName.size(); i++ ) {
+ char c = fullName[i];
+
+ if ( c == '$' &&
+ i > 0 && fullName[i-1] == '.' &&
+ i+1<fullName.size() &&
+ fullName[i+1] == '.' ) {
+ i++;
+ continue;
+ }
+
+ buf << c;
+
+ if ( c != '.' )
+ continue;
+
+ if ( ! isdigit( fullName[i+1] ) )
+ continue;
+
+ bool possible = true;
+ size_t j=i+2;
+ for ( ; j<fullName.size(); j++ ) {
+ char d = fullName[j];
+ if ( d == '.' )
+ break;
+ if ( isdigit( d ) )
+ continue;
+ possible = false;
+ break;
+ }
+
+ if ( possible )
+ i = j;
+ }
+ string x = buf.str();
+ if ( isIndexed( x , idxKeys ) )
+ return true;
+ }
+
+ return false;
+ }
+
+ template< class Builder >
+ void apply( Builder& b , BSONElement in , ModState& ms ) const;
+
+ /**
+ * @return true iff toMatch should be removed from the array
+ */
+ bool _pullElementMatch( BSONElement& toMatch ) const;
+
+ void _checkForAppending( const BSONElement& e ) const {
+ if ( e.type() == Object ) {
+ // this is a tiny bit slow, but rare and important
+ // only when setting something TO an object, not setting something in an object
+ // and it checks for { $set : { x : { 'a.b' : 1 } } }
+ // which we feel has been a common mistake
+ uassert( 12527 , "not okForStorage" , e.embeddedObject().okForStorage() );
+ }
+ }
+
+ bool isEach() const {
+ if ( elt.type() != Object )
+ return false;
+ BSONElement e = elt.embeddedObject().firstElement();
+ if ( e.type() != Array )
+ return false;
+ return strcmp( e.fieldName() , "$each" ) == 0;
+ }
+
+ BSONObj getEach() const {
+ return elt.embeddedObjectUserCheck().firstElement().embeddedObjectUserCheck();
+ }
+
+ void parseEach( BSONElementSet& s ) const {
+ BSONObjIterator i(getEach());
+ while ( i.more() ) {
+ s.insert( i.next() );
+ }
+ }
+
+ const char *renameFrom() const {
+ massert( 13492, "mod must be RENAME_TO type", op == Mod::RENAME_TO );
+ return elt.fieldName();
+ }
+ };
+
+ /**
+ * stores a set of Mods
+ * once created, should never be changed
+ */
+ class ModSet : boost::noncopyable {
+ typedef map<string,Mod> ModHolder;
+ ModHolder _mods;
+ int _isIndexed;
+ bool _hasDynamicArray;
+
+ static void extractFields( map< string, BSONElement > &fields, const BSONElement &top, const string &base );
+
+ FieldCompareResult compare( const ModHolder::iterator &m, map< string, BSONElement >::iterator &p, const map< string, BSONElement >::iterator &pEnd ) const {
+ bool mDone = ( m == _mods.end() );
+ bool pDone = ( p == pEnd );
+ assert( ! mDone );
+ assert( ! pDone );
+ if ( mDone && pDone )
+ return SAME;
+ // If one iterator is done we want to read from the other one, so say the other one is lower.
+ if ( mDone )
+ return RIGHT_BEFORE;
+ if ( pDone )
+ return LEFT_BEFORE;
+
+ return compareDottedFieldNames( m->first, p->first.c_str() );
+ }
+
+ bool mayAddEmbedded( map< string, BSONElement > &existing, string right ) {
+ for( string left = EmbeddedBuilder::splitDot( right );
+ left.length() > 0 && left[ left.length() - 1 ] != '.';
+ left += "." + EmbeddedBuilder::splitDot( right ) ) {
+ if ( existing.count( left ) > 0 && existing[ left ].type() != Object )
+ return false;
+ if ( haveModForField( left.c_str() ) )
+ return false;
+ }
+ return true;
+ }
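+ // map a modifier name to its Op via hand-rolled prefix matching, e.g. "$inc" -> INC, "$pushAll" -> PUSH_ALL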
+ static Mod::Op opFromStr( const char *fn ) {
+ assert( fn[0] == '$' );
+ switch( fn[1] ) {
+ case 'i': {
+ if ( fn[2] == 'n' && fn[3] == 'c' && fn[4] == 0 )
+ return Mod::INC;
+ break;
+ }
+ case 's': {
+ if ( fn[2] == 'e' && fn[3] == 't' && fn[4] == 0 )
+ return Mod::SET;
+ break;
+ }
+ case 'p': {
+ if ( fn[2] == 'u' ) {
+ if ( fn[3] == 's' && fn[4] == 'h' ) {
+ if ( fn[5] == 0 )
+ return Mod::PUSH;
+ if ( fn[5] == 'A' && fn[6] == 'l' && fn[7] == 'l' && fn[8] == 0 )
+ return Mod::PUSH_ALL;
+ }
+ else if ( fn[3] == 'l' && fn[4] == 'l' ) {
+ if ( fn[5] == 0 )
+ return Mod::PULL;
+ if ( fn[5] == 'A' && fn[6] == 'l' && fn[7] == 'l' && fn[8] == 0 )
+ return Mod::PULL_ALL;
+ }
+ }
+ else if ( fn[2] == 'o' && fn[3] == 'p' && fn[4] == 0 )
+ return Mod::POP;
+ break;
+ }
+ case 'u': {
+ if ( fn[2] == 'n' && fn[3] == 's' && fn[4] == 'e' && fn[5] == 't' && fn[6] == 0 )
+ return Mod::UNSET;
+ break;
+ }
+ case 'b': {
+ if ( fn[2] == 'i' && fn[3] == 't' ) {
+ if ( fn[4] == 0 )
+ return Mod::BIT;
+ if ( fn[4] == 'a' && fn[5] == 'n' && fn[6] == 'd' && fn[7] == 0 )
+ return Mod::BITAND;
+ if ( fn[4] == 'o' && fn[5] == 'r' && fn[6] == 0 )
+ return Mod::BITOR;
+ }
+ break;
+ }
+ case 'a': {
+ if ( fn[2] == 'd' && fn[3] == 'd' ) {
+ // add
+ if ( fn[4] == 'T' && fn[5] == 'o' && fn[6] == 'S' && fn[7] == 'e' && fn[8] == 't' && fn[9] == 0 )
+ return Mod::ADDTOSET;
+
+ }
+ break;
+ }
+ case 'r': {
+ if ( fn[2] == 'e' && fn[3] == 'n' && fn[4] == 'a' && fn[5] == 'm' && fn[6] =='e' ) {
+ return Mod::RENAME_TO; // with this return code we handle both RENAME_TO and RENAME_FROM
+ }
+ break;
+ }
+ default: break;
+ }
+ uassert( 10161 , "Invalid modifier specified " + string( fn ), false );
+ return Mod::INC;
+ }
+
+ ModSet() {}
+
+ void updateIsIndexed( const Mod &m, const set<string> &idxKeys, const set<string> *backgroundKeys ) {
+ if ( m.isIndexed( idxKeys ) ||
+ (backgroundKeys && m.isIndexed(*backgroundKeys)) ) {
+ _isIndexed++;
+ }
+ }
+
+ public:
+
+ ModSet( const BSONObj &from ,
+ const set<string>& idxKeys = set<string>(),
+ const set<string>* backgroundKeys = 0
+ );
+
+ // TODO: this is inefficient - should probably just handle this while iterating
+ ModSet * fixDynamicArray( const char * elemMatchKey ) const;
+
+ bool hasDynamicArray() const { return _hasDynamicArray; }
+
+ /**
+ * creates a ModSetState suitable for operation on obj
+ * doesn't change or modify this ModSet or any underlying Mod
+ */
+ auto_ptr<ModSetState> prepare( const BSONObj& obj ) const;
+
+ /**
+ * given a query pattern, builds an object suitable for an upsert
+ * will take the query spec and combine all $ operators
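+ * e.g. query { a : 1, b : { $gt : 3 } } with mods { $set : { c : 2 } } yields { a : 1, c : 2 }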
+ */
+ BSONObj createNewFromQuery( const BSONObj& query );
+
+ /**
+ * @return the number of mods that touch an indexed field (0 if none)
+ */
+ int isIndexed() const {
+ return _isIndexed;
+ }
+
+ unsigned size() const { return _mods.size(); }
+
+ bool haveModForField( const char *fieldName ) const {
+ return _mods.find( fieldName ) != _mods.end();
+ }
+
+ bool haveConflictingMod( const string& fieldName ) {
+ size_t idx = fieldName.find( '.' );
+ if ( idx == string::npos )
+ idx = fieldName.size();
+
+ ModHolder::const_iterator start = _mods.lower_bound(fieldName.substr(0,idx));
+ for ( ; start != _mods.end(); start++ ) {
+ FieldCompareResult r = compareDottedFieldNames( fieldName , start->first );
+ switch ( r ) {
+ case LEFT_SUBFIELD: return true;
+ case LEFT_BEFORE: return false;
+ case SAME: return true;
+ case RIGHT_BEFORE: return false;
+ case RIGHT_SUBFIELD: return true;
+ }
+ }
+ return false;
+ }
+
+ };
+
+ /**
+ * stores any information about a single Mod operating on a single Object
+ */
+ class ModState {
+ public:
+ const Mod * m;
+ BSONElement old;
+ BSONElement newVal;
+ BSONObj _objData;
+
+ const char * fixedOpName;
+ BSONElement * fixed;
+ int pushStartSize;
+
+ BSONType incType;
+ int incint;
+ double incdouble;
+ long long inclong;
+
+ bool dontApply;
+
+ ModState() {
+ fixedOpName = 0;
+ fixed = 0;
+ pushStartSize = -1;
+ incType = EOO;
+ dontApply = false;
+ }
+
+ Mod::Op op() const {
+ return m->op;
+ }
+
+ const char * fieldName() const {
+ return m->fieldName;
+ }
+
+ bool needOpLogRewrite() const {
+ if ( dontApply )
+ return false;
+
+ if ( fixed || fixedOpName || incType )
+ return true;
+
+ switch( op() ) {
+ case Mod::RENAME_FROM:
+ case Mod::RENAME_TO:
+ return true;
+ case Mod::BIT:
+ case Mod::BITAND:
+ case Mod::BITOR:
+ // TODO: should we convert this to $set?
+ return false;
+ default:
+ return false;
+ }
+ }
+
+ void appendForOpLog( BSONObjBuilder& b ) const;
+
+ template< class Builder >
+ void apply( Builder& b , BSONElement in ) {
+ m->apply( b , in , *this );
+ }
+
+ template< class Builder >
+ void appendIncValue( Builder& b , bool useFullName ) const {
+ const char * n = useFullName ? m->fieldName : m->shortFieldName;
+
+ switch ( incType ) {
+ case NumberDouble:
+ b.append( n , incdouble ); break;
+ case NumberLong:
+ b.append( n , inclong ); break;
+ case NumberInt:
+ b.append( n , incint ); break;
+ default:
+ assert(0);
+ }
+ }
+
+ string toString() const;
+
+ template< class Builder >
+ void handleRename( Builder &newObjBuilder, const char *shortFieldName );
+ };
+
+ /**
+ * holds state and metadata while applying a ModSet to a BSONObj
+ * the goal is to keep ModSet const so it's re-usable
+ */
+ class ModSetState : boost::noncopyable {
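+ // FieldCmp orders dotted names with lexNumCmp, so numeric path components compare numerically (e.g. "a.2" before "a.10")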
+ struct FieldCmp {
+ bool operator()( const string &l, const string &r ) const;
+ };
+ typedef map<string,ModState,FieldCmp> ModStateHolder;
+ const BSONObj& _obj;
+ ModStateHolder _mods;
+ bool _inPlacePossible;
+ BSONObj _newFromMods; // keep this data alive, as oplog generation may depend on it
+
+ ModSetState( const BSONObj& obj )
+ : _obj( obj ) , _inPlacePossible(true) {
+ }
+
+ /**
+ * @return if in place is still possible
+ */
+ bool amIInPlacePossible( bool inPlacePossible ) {
+ if ( ! inPlacePossible )
+ _inPlacePossible = false;
+ return _inPlacePossible;
+ }
+
+ template< class Builder >
+ void createNewFromMods( const string& root , Builder& b , const BSONObj &obj );
+
+ template< class Builder >
+ void _appendNewFromMods( const string& root , ModState& m , Builder& b , set<string>& onedownseen );
+
+ template< class Builder >
+ void appendNewFromMod( ModState& ms , Builder& b ) {
+ if ( ms.dontApply ) {
+ return;
+ }
+
+ //const Mod& m = *(ms.m); // HACK
+ Mod& m = *((Mod*)(ms.m)); // HACK
+
+ switch ( m.op ) {
+
+ case Mod::PUSH: {
+ if ( m.isEach() ) {
+ b.appendArray( m.shortFieldName, m.getEach() );
+ } else {
+ BSONObjBuilder arr( b.subarrayStart( m.shortFieldName ) );
+ arr.appendAs( m.elt, "0" );
+ arr.done();
+ }
+ break;
+ }
+ case Mod::ADDTOSET: {
+ if ( m.isEach() ) {
+ // Remove any duplicates in given array
+ BSONObjBuilder arr( b.subarrayStart( m.shortFieldName ) );
+ BSONElementSet toadd;
+ m.parseEach( toadd );
+ BSONObjIterator i( m.getEach() );
+ int n = 0;
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( toadd.count(e) ) {
+ arr.appendAs( e , BSONObjBuilder::numStr( n++ ) );
+ toadd.erase( e );
+ }
+ }
+ arr.done();
+ }
+ else {
+ BSONObjBuilder arr( b.subarrayStart( m.shortFieldName ) );
+ arr.appendAs( m.elt, "0" );
+ arr.done();
+ }
+ break;
+ }
+
+ case Mod::PUSH_ALL: {
+ b.appendAs( m.elt, m.shortFieldName );
+ break;
+ }
+
+ case Mod::UNSET:
+ case Mod::PULL:
+ case Mod::PULL_ALL:
+ // no-op b/c unset/pull of nothing does nothing
+ break;
+
+ case Mod::INC:
+ ms.fixedOpName = "$set";
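+ // fall through: for a field that doesn't exist yet, $inc acts like a $set of the increment amount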
+ case Mod::SET: {
+ m._checkForAppending( m.elt );
+ b.appendAs( m.elt, m.shortFieldName );
+ break;
+ }
+ // shouldn't see RENAME_FROM here
+ case Mod::RENAME_TO:
+ ms.handleRename( b, m.shortFieldName );
+ break;
+ default:
+ stringstream ss;
+ ss << "unknown mod in appendNewFromMod: " << m.op;
+ throw UserException( 9015, ss.str() );
+ }
+
+ }
+
+ public:
+
+ bool canApplyInPlace() const {
+ return _inPlacePossible;
+ }
+
+ /**
+ * modifies the underlying _obj
+ * @param isOnDisk - true means this is an on disk object, and this update needs to be made durable
+ */
+ void applyModsInPlace( bool isOnDisk );
+
+ BSONObj createNewFromMods();
+
+ // re-writing for oplog
+
+ bool needOpLogRewrite() const {
+ for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
+ if ( i->second.needOpLogRewrite() )
+ return true;
+ return false;
+ }
+
+ BSONObj getOpLogRewrite() const {
+ BSONObjBuilder b;
+ for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
+ i->second.appendForOpLog( b );
+ return b.obj();
+ }
+
+ bool haveArrayDepMod() const {
+ for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
+ if ( i->second.m->arrayDep() )
+ return true;
+ return false;
+ }
+
+ void appendSizeSpecForArrayDepMods( BSONObjBuilder &b ) const {
+ for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ) {
+ const ModState& m = i->second;
+ if ( m.m->arrayDep() ) {
+ if ( m.pushStartSize == -1 )
+ b.appendNull( m.fieldName() );
+ else
+ b << m.fieldName() << BSON( "$size" << m.pushStartSize );
+ }
+ }
+ }
+
+ string toString() const;
+
+ friend class ModSet;
+ };
+
+}
+
diff --git a/src/mongo/db/pagefault.cpp b/src/mongo/db/pagefault.cpp
new file mode 100644
index 00000000000..4b9b1b23e02
--- /dev/null
+++ b/src/mongo/db/pagefault.cpp
@@ -0,0 +1,55 @@
+// @file pagefault.cpp
+
+#include "pch.h"
+#include "diskloc.h"
+#include "pagefault.h"
+#include "client.h"
+#include "pdfile.h"
+#include "server.h"
+
+namespace mongo {
+
+ PageFaultException::PageFaultException(Record *_r)
+ {
+ assert( cc()._pageFaultRetryableSection != 0 );
+ cc()._pageFaultRetryableSection->_laps++;
+ assert( cc()._pageFaultRetryableSection->_laps < 1000 );
+ r = _r;
+ era = LockMongoFilesShared::getEra();
+ }
+
+ void PageFaultException::touch() {
+ assert( !d.dbMutex.atLeastReadLocked() );
+ LockMongoFilesShared lk;
+ if( LockMongoFilesShared::getEra() != era ) {
+ // files were opened and closed in the interim. we don't try to handle that; we just bail out.
+ // this is much simpler, less error prone, and saves us from taking a dbmutex readlock.
+ dlog(2) << "era changed" << endl;
+ return;
+ }
+ r->touch();
+ }
+
+ PageFaultRetryableSection::~PageFaultRetryableSection() {
+ cc()._pageFaultRetryableSection = old;
+ }
+ PageFaultRetryableSection::PageFaultRetryableSection() {
+ _laps = 0;
+ old = cc()._pageFaultRetryableSection;
+ if( d.dbMutex.atLeastReadLocked() ) {
+ cc()._pageFaultRetryableSection = 0;
+ if( debug || logLevel > 2 ) {
+ LOGSOME << "info PageFaultRetryableSection will not yield, already locked upon reaching" << endl;
+ }
+ }
+ else if( cc()._pageFaultRetryableSection ) {
+ cc()._pageFaultRetryableSection = 0;
+ dlog(2) << "info nested PageFaultRetryableSection will not yield on fault" << endl;
+ }
+ else {
+ cc()._pageFaultRetryableSection = this;
+ cc()._hasWrittenThisPass = false;
+ }
+ }
+
+}
diff --git a/src/mongo/db/pagefault.h b/src/mongo/db/pagefault.h
new file mode 100644
index 00000000000..8bbf4ecab52
--- /dev/null
+++ b/src/mongo/db/pagefault.h
@@ -0,0 +1,46 @@
+// @file pagefault.h
+
+// define this : _PAGEFAULTEXCEPTION
+
+#pragma once
+
+namespace mongo {
+
+ class Record;
+
+ class PageFaultException /*: public DBException*/ {
+ unsigned era;
+ Record *r;
+ public:
+ PageFaultException(const PageFaultException& rhs) : era(rhs.era), r(rhs.r) { }
+ explicit PageFaultException(Record*);
+ void touch();
+ };
+
+ class PageFaultRetryableSection : boost::noncopyable {
+ PageFaultRetryableSection *old;
+ public:
+ unsigned _laps;
+ PageFaultRetryableSection();
+ ~PageFaultRetryableSection();
+ };
+#if 0
+ inline void how_to_use_example() {
+ // ...
+ {
+ PageFaultRetryableSection s;
+ while( 1 ) {
+ try {
+ writelock lk; // or readlock
+ // do work
+ break;
+ }
+ catch( PageFaultException& e ) {
+ e.touch();
+ }
+ }
+ }
+ // ...
+ }
+#endif
+}
diff --git a/src/mongo/db/pcre.txt b/src/mongo/db/pcre.txt
new file mode 100644
index 00000000000..3e21047eabc
--- /dev/null
+++ b/src/mongo/db/pcre.txt
@@ -0,0 +1,15 @@
+
+
+You need to install pcre.
+
+This could be scripted:
+
+cd /tmp
+curl -O ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-7.4.tar.gz
+tar -xzf pcre-7.4.tar.gz
+cd pcre-7.4
+./configure --enable-utf8 --with-match-limit=200000 --with-match-limit-recursion=4000
+make
+make install
+
+
+At that point it will be installed in /usr/*. The version in p/pcre-7.4 is for VC++.
diff --git a/src/mongo/db/pdfile.cpp b/src/mongo/db/pdfile.cpp
new file mode 100644
index 00000000000..069eeadec37
--- /dev/null
+++ b/src/mongo/db/pdfile.cpp
@@ -0,0 +1,2425 @@
+// pdfile.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+todo:
+_ table scans must be sequential, not next/prev pointers
+_ coalesce deleted
+_ disallow system* manipulations from the database.
+*/
+
+#include "pch.h"
+#include "pdfile.h"
+#include "db.h"
+#include "../util/mmap.h"
+#include "../util/hashtab.h"
+#include "../util/file_allocator.h"
+#include "../util/processinfo.h"
+#include "../util/file.h"
+#include "btree.h"
+#include "btreebuilder.h"
+#include <algorithm>
+#include <list>
+#include "repl.h"
+#include "dbhelpers.h"
+#include "namespace-inl.h"
+#include "queryutil.h"
+#include "extsort.h"
+#include "curop-inl.h"
+#include "background.h"
+#include "compact.h"
+#include "ops/delete.h"
+#include "instance.h"
+#include "replutil.h"
+
+namespace mongo {
+
+ BOOST_STATIC_ASSERT( sizeof(Extent)-4 == 48+128 );
+ BOOST_STATIC_ASSERT( sizeof(DataFileHeader)-4 == 8192 );
+
+ void printMemInfo( const char * where ) {
+ cout << "mem info: ";
+ if ( where )
+ cout << where << " ";
+ ProcessInfo pi;
+ if ( ! pi.supported() ) {
+ cout << " not supported" << endl;
+ return;
+ }
+
+ cout << "vsize: " << pi.getVirtualMemorySize() << " resident: " << pi.getResidentSize() << " mapped: " << ( MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ) ) << endl;
+ }
+
+ bool isValidNS( const StringData& ns ) {
+ // TODO: should check for invalid characters
+
+ const char * x = strchr( ns.data() , '.' );
+ if ( ! x )
+ return false;
+
+ x++;
+ return *x > 0;
+ }
+
+ bool inDBRepair = false;
+ struct doingRepair {
+ doingRepair() {
+ assert( ! inDBRepair );
+ inDBRepair = true;
+ }
+ ~doingRepair() {
+ inDBRepair = false;
+ }
+ };
+
+ map<string, unsigned> BackgroundOperation::dbsInProg;
+ set<string> BackgroundOperation::nsInProg;
+
+ bool BackgroundOperation::inProgForDb(const char *db) {
+ assertInWriteLock();
+ return dbsInProg[db] != 0;
+ }
+
+ bool BackgroundOperation::inProgForNs(const char *ns) {
+ assertInWriteLock();
+ return nsInProg.count(ns) != 0;
+ }
+
+ void BackgroundOperation::assertNoBgOpInProgForDb(const char *db) {
+ uassert(12586, "cannot perform operation: a background operation is currently running for this database",
+ !inProgForDb(db));
+ }
+
+ void BackgroundOperation::assertNoBgOpInProgForNs(const char *ns) {
+ uassert(12587, "cannot perform operation: a background operation is currently running for this collection",
+ !inProgForNs(ns));
+ }
+
+ BackgroundOperation::BackgroundOperation(const char *ns) : _ns(ns) {
+ assertInWriteLock();
+ dbsInProg[_ns.db]++;
+ assert( nsInProg.count(_ns.ns()) == 0 );
+ nsInProg.insert(_ns.ns());
+ }
+
+ BackgroundOperation::~BackgroundOperation() {
+ wassert( d.dbMutex.isWriteLocked() );
+ dbsInProg[_ns.db]--;
+ nsInProg.erase(_ns.ns());
+ }
+
+ void BackgroundOperation::dump(stringstream& ss) {
+ if( nsInProg.size() ) {
+ ss << "\n<b>Background Jobs in Progress</b>\n";
+ for( set<string>::iterator i = nsInProg.begin(); i != nsInProg.end(); i++ )
+ ss << " " << *i << '\n';
+ }
+ for( map<string,unsigned>::iterator i = dbsInProg.begin(); i != dbsInProg.end(); i++ ) {
+ if( i->second )
+ ss << "database " << i->first << ": " << i->second << '\n';
+ }
+ }
+
+ /* ----------------------------------------- */
+
+ string dbpath = "/data/db/";
+ const char FREELIST_NS[] = ".$freelist";
+ bool directoryperdb = false;
+ string repairpath;
+ string pidfilepath;
+
+ DataFileMgr theDataFileMgr;
+ DatabaseHolder _dbHolder;
+ int MAGIC = 0x1000;
+
+ DatabaseHolder& dbHolderUnchecked() {
+ return _dbHolder;
+ }
+
+ void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0);
+ void ensureIdIndexForNewNs(const char *ns) {
+ if ( ( strstr( ns, ".system." ) == 0 || legalClientSystemNS( ns , false ) ) &&
+ strstr( ns, FREELIST_NS ) == 0 ) {
+ log( 1 ) << "adding _id index for collection " << ns << endl;
+ ensureHaveIdIndex( ns );
+ }
+ }
+
+ string getDbContext() {
+ stringstream ss;
+ Client * c = currentClient.get();
+ if ( c ) {
+ Client::Context * cx = c->getContext();
+ if ( cx ) {
+ Database *database = cx->db();
+ if ( database ) {
+ ss << database->name << ' ';
+ ss << cx->ns() << ' ';
+ }
+ }
+ }
+ return ss.str();
+ }
+
+ /*---------------------------------------------------------------------*/
+
+ // inheritable class to implement an operation that may be applied to all
+ // files in a database using _applyOpToDataFiles()
+ class FileOp {
+ public:
+ virtual ~FileOp() {}
+ // Return true if file exists and operation successful
+ virtual bool apply( const boost::filesystem::path &p ) = 0;
+ virtual const char * op() const = 0;
+ };
+
+ void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator = false, const string& path = dbpath );
+
+ void _deleteDataFiles(const char *database) {
+ if ( directoryperdb ) {
+ FileAllocator::get()->waitUntilFinished();
+ MONGO_BOOST_CHECK_EXCEPTION_WITH_MSG( boost::filesystem::remove_all( boost::filesystem::path( dbpath ) / database ), "delete data files with a directoryperdb" );
+ return;
+ }
+ class : public FileOp {
+ virtual bool apply( const boost::filesystem::path &p ) {
+ return boost::filesystem::remove( p );
+ }
+ virtual const char * op() const {
+ return "remove";
+ }
+ } deleter;
+ _applyOpToDataFiles( database, deleter, true );
+ }
+
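+ // choose the first extent size from the initial record length: len*64 for small records (len < 1000), otherwise len*16, capped at ~1GB and truncated down to a 256-byte boundary (e.g. initialSize(100) == 6400)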
+ int Extent::initialSize(int len) {
+ long long sz = len * 16;
+ if ( len < 1000 ) sz = len * 64;
+ if ( sz > 1000000000 )
+ sz = 1000000000;
+ int z = ((int)sz) & 0xffffff00;
+ assert( z > len );
+ return z;
+ }
+
+ bool _userCreateNS(const char *ns, const BSONObj& options, string& err, bool *deferIdIndex) {
+ if ( nsdetails(ns) ) {
+ err = "collection already exists";
+ return false;
+ }
+
+ log(1) << "create collection " << ns << ' ' << options << endl;
+
+ /* todo: do this only when we have allocated space successfully? or we could insert with a { ok: 0 } field
+ and then go back and set to ok : 1 after we are done.
+ */
+ bool isFreeList = strstr(ns, FREELIST_NS) != 0;
+ if( !isFreeList )
+ addNewNamespaceToCatalog(ns, options.isEmpty() ? 0 : &options);
+
+ long long size = Extent::initialSize(128);
+ {
+ BSONElement e = options.getField("size");
+ if ( e.isNumber() ) {
+ size = e.numberLong();
+ size += 256;
+ size &= 0xffffffffffffff00LL;
+ }
+ }
+
+ uassert( 10083 , "create collection invalid size spec", size > 0 );
+
+ bool newCapped = false;
+ int mx = 0;
+ if( options["capped"].trueValue() ) {
+ newCapped = true;
+ BSONElement e = options.getField("max");
+ if ( e.isNumber() ) {
+ mx = e.numberInt();
+ }
+ }
+
+ // $nExtents just for debug/testing.
+ BSONElement e = options.getField( "$nExtents" );
+ Database *database = cc().database();
+ if ( e.type() == Array ) {
+ // We create one extent per array entry, with size specified
+ // by the array value.
+ BSONObjIterator i( e.embeddedObject() );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ int size = int( e.number() );
+ assert( size <= 0x7fffffff );
+ // $nExtents is just for testing - always allocate new extents
+ // rather than reuse existing extents so we have some predictability
+ // in the extent size used by our tests
+ database->suitableFile( ns, (int) size, false, false )->createExtent( ns, (int) size, newCapped );
+ }
+ }
+ else if ( int( e.number() ) > 0 ) {
+ // We create '$nExtents' extents, each of size 'size'.
+ int nExtents = int( e.number() );
+ assert( size <= 0x7fffffff );
+ for ( int i = 0; i < nExtents; ++i ) {
+ assert( size <= 0x7fffffff );
+ // $nExtents is just for testing - always allocate new extents
+ // rather than reuse existing extents so we have some predictability
+ // in the extent size used by our tests
+ database->suitableFile( ns, (int) size, false, false )->createExtent( ns, (int) size, newCapped );
+ }
+ }
+ else {
+ // This is the non test case, where we don't have a $nExtents spec.
+ while ( size > 0 ) {
+ int max = MongoDataFile::maxSize() - DataFileHeader::HeaderSize;
+ int desiredExtentSize = (int) (size > max ? max : size);
+ if ( desiredExtentSize < Extent::minSize() ) {
+ desiredExtentSize = Extent::minSize();
+ }
+ desiredExtentSize &= 0xffffff00;
+ Extent *e = database->allocExtent( ns, desiredExtentSize, newCapped, true );
+ size -= e->length;
+ }
+ }
+
+ NamespaceDetails *d = nsdetails(ns);
+ assert(d);
+
+ bool ensure = false;
+ if ( options.getField( "autoIndexId" ).type() ) {
+ if ( options["autoIndexId"].trueValue() ) {
+ ensure = true;
+ }
+ }
+ else {
+ if ( !newCapped ) {
+ ensure=true;
+ }
+ }
+ if( ensure ) {
+ if( deferIdIndex )
+ *deferIdIndex = true;
+ else
+ ensureIdIndexForNewNs( ns );
+ }
+
+ if ( mx > 0 )
+ getDur().writingInt( d->max ) = mx;
+
+ return true;
+ }
+
+ /** { ..., capped: true, size: ..., max: ... }
+ @param deferIdIndex - if not null, defers id index creation; sets the bool value to true if we wanted to create the id index.
+ @return true if successful
+ */
+ bool userCreateNS(const char *ns, BSONObj options, string& err, bool logForReplication, bool *deferIdIndex) {
+ const char *coll = strchr( ns, '.' ) + 1;
+ massert( 10356 , str::stream() << "invalid ns: " << ns , NamespaceString::validCollectionName(ns));
+ char cl[ 256 ];
+ nsToDatabase( ns, cl );
+ bool ok = _userCreateNS(ns, options, err, deferIdIndex);
+ if ( logForReplication && ok ) {
+ if ( options.getField( "create" ).eoo() ) {
+ BSONObjBuilder b;
+ b << "create" << coll;
+ b.appendElements( options );
+ options = b.obj();
+ }
+ string logNs = string( cl ) + ".$cmd";
+ logOp("c", logNs.c_str(), options);
+ }
+ return ok;
+ }
+
+ /*---------------------------------------------------------------------*/
+
+ int MongoDataFile::maxSize() {
+ if ( sizeof( int* ) == 4 ) {
+ return 512 * 1024 * 1024;
+ }
+ else if ( cmdLine.smallfiles ) {
+ return 0x7ff00000 >> 2;
+ }
+ else {
+ return 0x7ff00000;
+ }
+ }
+
+ NOINLINE_DECL void MongoDataFile::badOfs2(int ofs) const {
+ stringstream ss;
+ ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database";
+ uasserted(13441, ss.str());
+ }
+
+ NOINLINE_DECL void MongoDataFile::badOfs(int ofs) const {
+ stringstream ss;
+ ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database";
+ uasserted(13440, ss.str());
+ }
+
+ int MongoDataFile::defaultSize( const char *filename ) const {
+ int size;
+ if ( fileNo <= 4 )
+ size = (64*1024*1024) << fileNo;
+ else
+ size = 0x7ff00000;
+ if ( cmdLine.smallfiles ) {
+ size = size >> 2;
+ }
+ return size;
+ }
+
+ static void check(void *_mb) {
+ if( sizeof(char *) == 4 )
+ uassert( 10084 , "can't map file memory - mongo requires 64 bit build for larger datasets", _mb != 0);
+ else
+ uassert( 10085 , "can't map file memory", _mb != 0);
+ }
+
+ /** @return true if found and opened. if uninitialized (prealloc only) does not open. */
+ bool MongoDataFile::openExisting( const char *filename ) {
+ assert( _mb == 0 );
+ if( !exists(filename) )
+ return false;
+ if( !mmf.open(filename,false) ) {
+ dlog(2) << "info couldn't open " << filename << " probably end of datafile list" << endl;
+ return false;
+ }
+ _mb = mmf.getView(); assert(_mb);
+ unsigned long long sz = mmf.length();
+ assert( sz <= 0x7fffffff );
+ assert( sz % 4096 == 0 );
+ if( sz < 64*1024*1024 && !cmdLine.smallfiles ) {
+ if( sz >= 16*1024*1024 && sz % (1024*1024) == 0 ) {
+ log() << "info openExisting file size " << sz << " but cmdLine.smallfiles=false" << endl;
+ }
+ else {
+ log() << "openExisting size " << sz << " less then minimum file size expectation " << filename << endl;
+ assert(false);
+ }
+ }
+ check(_mb);
+ if( header()->uninitialized() )
+ return false;
+ return true;
+ }
+
+ void MongoDataFile::open( const char *filename, int minSize, bool preallocateOnly ) {
+ long size = defaultSize( filename );
+ while ( size < minSize ) {
+ if ( size < maxSize() / 2 )
+ size *= 2;
+ else {
+ size = maxSize();
+ break;
+ }
+ }
+ if ( size > maxSize() )
+ size = maxSize();
+
+ assert( size >= 64*1024*1024 || cmdLine.smallfiles );
+ assert( size % 4096 == 0 );
+
+ if ( preallocateOnly ) {
+ if ( cmdLine.prealloc ) {
+ FileAllocator::get()->requestAllocation( filename, size );
+ }
+ return;
+ }
+
+ {
+ assert( _mb == 0 );
+ unsigned long long sz = size;
+ if( mmf.create(filename, sz, false) )
+ _mb = mmf.getView();
+ assert( sz <= 0x7fffffff );
+ size = (int) sz;
+ }
+ check(_mb);
+ header()->init(fileNo, size, filename);
+ }
+
+ void MongoDataFile::flush( bool sync ) {
+ mmf.flush( sync );
+ }
+
+ void addNewExtentToNamespace(const char *ns, Extent *e, DiskLoc eloc, DiskLoc emptyLoc, bool capped) {
+ NamespaceIndex *ni = nsindex(ns);
+ NamespaceDetails *details = ni->details(ns);
+ if ( details ) {
+ assert( !details->lastExtent.isNull() );
+ assert( !details->firstExtent.isNull() );
+ getDur().writingDiskLoc(e->xprev) = details->lastExtent;
+ getDur().writingDiskLoc(details->lastExtent.ext()->xnext) = eloc;
+ assert( !eloc.isNull() );
+ getDur().writingDiskLoc(details->lastExtent) = eloc;
+ }
+ else {
+ ni->add_ns(ns, eloc, capped);
+ details = ni->details(ns);
+ }
+
+ {
+ NamespaceDetails *dw = details->writingWithoutExtra();
+ dw->lastExtentSize = e->length;
+ }
+ details->addDeletedRec(emptyLoc.drec(), emptyLoc);
+ }
+
+ Extent* MongoDataFile::createExtent(const char *ns, int approxSize, bool newCapped, int loops) {
+ {
+ // make sizes align with VM page size
+ int newSize = (approxSize + 0xfff) & 0xfffff000;
+ assert( newSize >= 0 );
+ if( newSize < Extent::maxSize() )
+ approxSize = newSize;
+ }
+ massert( 10357 , "shutdown in progress", ! inShutdown() );
+ massert( 10358 , "bad new extent size", approxSize >= Extent::minSize() && approxSize <= Extent::maxSize() );
+ massert( 10359 , "header==0 on new extent: 32 bit mmap space exceeded?", header() ); // null if file open failed
+ int ExtentSize = min(header()->unusedLength, approxSize);
+ DiskLoc loc;
+ if ( ExtentSize < Extent::minSize() ) {
+ /* note there could be a lot of looping here if the db just started and
+ no files are open yet. we might want to do something about that. */
+ if ( loops > 8 ) {
+ assert( loops < 10000 );
+ out() << "warning: loops=" << loops << " fileno:" << fileNo << ' ' << ns << '\n';
+ }
+ log() << "newExtent: " << ns << " file " << fileNo << " full, adding a new file\n";
+ return cc().database()->addAFile( 0, true )->createExtent(ns, approxSize, newCapped, loops+1);
+ }
+ int offset = header()->unused.getOfs();
+
+ DataFileHeader *h = header();
+ h->unused.writing().set( fileNo, offset + ExtentSize );
+ getDur().writingInt(h->unusedLength) = h->unusedLength - ExtentSize;
+ loc.set(fileNo, offset);
+ Extent *e = _getExtent(loc);
+ DiskLoc emptyLoc = getDur().writing(e)->init(ns, ExtentSize, fileNo, offset, newCapped);
+
+ addNewExtentToNamespace(ns, e, loc, emptyLoc, newCapped);
+
+ DEV tlog(1) << "new extent " << ns << " size: 0x" << hex << ExtentSize << " loc: 0x" << hex << offset
+ << " emptyLoc:" << hex << emptyLoc.getOfs() << dec << endl;
+ return e;
+ }
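+
+ // note on the rounding at the top of createExtent: (approxSize + 0xfff) & 0xfffff000
+ // rounds up to a 4KB boundary, e.g. a request of 5000 bytes becomes 0x2000 = 8192.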
+
+ Extent* DataFileMgr::allocFromFreeList(const char *ns, int approxSize, bool capped) {
+ string s = cc().database()->name + FREELIST_NS;
+ NamespaceDetails *f = nsdetails(s.c_str());
+ if( f ) {
+ int low, high;
+ if( capped ) {
+ // be strict about the size
+ low = approxSize;
+ if( low > 2048 ) low -= 256;
+ high = (int) (approxSize * 1.05) + 256;
+ }
+ else {
+ low = (int) (approxSize * 0.8);
+ high = (int) (approxSize * 1.4);
+ }
+ if( high <= 0 ) {
+ // overflowed
+ high = max(approxSize, Extent::maxSize());
+ }
+ int n = 0;
+ Extent *best = 0;
+ int bestDiff = 0x7fffffff;
+ {
+ Timer t;
+ DiskLoc L = f->firstExtent;
+ while( !L.isNull() ) {
+ Extent * e = L.ext();
+ if( e->length >= low && e->length <= high ) {
+ int diff = abs(e->length - approxSize);
+ if( diff < bestDiff ) {
+ bestDiff = diff;
+ best = e;
+ if( ((double) diff) / approxSize < 0.1 ) {
+ // close enough
+ break;
+ }
+ if( t.seconds() >= 2 ) {
+ // have spent lots of time in write lock, and we are in [low,high], so close enough
+ // could come into play if extent freelist is very long
+ break;
+ }
+ }
+ else {
+ OCCASIONALLY {
+ if( high < 64 * 1024 && t.seconds() >= 2 ) {
+ // be less picky if it is taking a long time
+ high = 64 * 1024;
+ }
+ }
+ }
+ }
+ L = e->xnext;
+ ++n;
+ }
+ if( t.seconds() >= 10 ) {
+ log() << "warning: slow scan in allocFromFreeList (in write lock)" << endl;
+ }
+ }
+
+ if( n > 128 ) log( n < 512 ) << "warning: newExtent " << n << " scanned\n";
+
+ if( best ) {
+ Extent *e = best;
+ // remove from the free list
+ if( !e->xprev.isNull() )
+ e->xprev.ext()->xnext.writing() = e->xnext;
+ if( !e->xnext.isNull() )
+ e->xnext.ext()->xprev.writing() = e->xprev;
+ if( f->firstExtent == e->myLoc )
+ f->firstExtent.writing() = e->xnext;
+ if( f->lastExtent == e->myLoc )
+ f->lastExtent.writing() = e->xprev;
+
+ // use it
+ OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n";
+ DiskLoc emptyLoc = e->reuse(ns, capped);
+ addNewExtentToNamespace(ns, e, e->myLoc, emptyLoc, capped);
+ return e;
+ }
+ }
+
+ return 0;
+ // return createExtent(ns, approxSize, capped);
+ }
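+
+ // illustrative window math for the free list scan above: a non-capped request of
+ // approxSize=100000 searches [low,high] = [80000,140000]; for capped collections the
+ // window is much tighter, [approxSize-256, approxSize*1.05+256] once approxSize > 2048.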
+
+ /*---------------------------------------------------------------------*/
+
+ void Extent::markEmpty() {
+ xnext.Null();
+ xprev.Null();
+ firstRecord.Null();
+ lastRecord.Null();
+ }
+
+ DiskLoc Extent::reuse(const char *nsname, bool capped) {
+ return getDur().writing(this)->_reuse(nsname, capped);
+ }
+
+ void getEmptyLoc(const char *ns, const DiskLoc extentLoc, int extentLength, bool capped, /*out*/DiskLoc& emptyLoc, /*out*/int& delRecLength) {
+ emptyLoc = extentLoc;
+ emptyLoc.inc( Extent::HeaderSize() );
+ delRecLength = extentLength - Extent::HeaderSize();
+ if( delRecLength >= 32*1024 && str::contains(ns, '$') && !capped ) {
+ // probably an index. so skip forward to keep its records page aligned
+ int& ofs = emptyLoc.GETOFS();
+ int newOfs = (ofs + 0xfff) & ~0xfff;
+ delRecLength -= (newOfs-ofs);
+ dassert( delRecLength > 0 );
+ ofs = newOfs;
+ }
+ }
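+
+ // alignment example for the index ($-containing ns) case above: an offset of 0x12345
+ // is rounded up to 0x13000 via (ofs + 0xfff) & ~0xfff, and delRecLength shrinks by the
+ // same 0xcbb bytes so the deleted record still ends at the extent boundary.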
+
+ DiskLoc Extent::_reuse(const char *nsname, bool capped) {
+ LOG(3) << "reset extent was:" << nsDiagnostic.toString() << " now:" << nsname << '\n';
+ massert( 10360 , "Extent::reset bad magic value", magic == 0x41424344 );
+ nsDiagnostic = nsname;
+ markEmpty();
+
+ DiskLoc emptyLoc;
+ int delRecLength;
+ getEmptyLoc(nsname, myLoc, length, capped, emptyLoc, delRecLength);
+
+ // todo: some dup code here and below in Extent::init
+ DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength);
+ empty = getDur().writing(empty);
+ empty->lengthWithHeaders = delRecLength;
+ empty->extentOfs = myLoc.getOfs();
+ empty->nextDeleted.Null();
+
+ return emptyLoc;
+ }
+
+ /* assumes already zeroed -- insufficient for block 'reuse' perhaps */
+ DiskLoc Extent::init(const char *nsname, int _length, int _fileNo, int _offset, bool capped) {
+ magic = 0x41424344;
+ myLoc.set(_fileNo, _offset);
+ xnext.Null();
+ xprev.Null();
+ nsDiagnostic = nsname;
+ length = _length;
+ firstRecord.Null();
+ lastRecord.Null();
+
+ DiskLoc emptyLoc;
+ int delRecLength;
+ getEmptyLoc(nsname, myLoc, _length, capped, emptyLoc, delRecLength);
+
+ DeletedRecord *empty = getDur().writing( DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength) );
+ empty->lengthWithHeaders = delRecLength;
+ empty->extentOfs = myLoc.getOfs();
+
+ return emptyLoc;
+ }
+
+ /*
+ Record* Extent::newRecord(int len) {
+ if( firstEmptyRegion.isNull() )
+ return 0;
+
+ assert(len > 0);
+ int newRecSize = len + Record::HeaderSize;
+ DiskLoc newRecordLoc = firstEmptyRegion;
+ Record *r = getRecord(newRecordLoc);
+ int left = r->netLength() - len;
+ if( left < 0 ) {
+ //
+ firstEmptyRegion.Null();
+ return 0;
+ }
+
+ DiskLoc nextEmpty = r->next.getNextEmpty(firstEmptyRegion);
+ r->lengthWithHeaders = newRecSize;
+ r->next.markAsFirstOrLastInExtent(this); // we're now last in the extent
+ if( !lastRecord.isNull() ) {
+ assert(getRecord(lastRecord)->next.lastInExtent()); // it was the last one
+ getRecord(lastRecord)->next.set(newRecordLoc); // until now
+ r->prev.set(lastRecord);
+ }
+ else {
+ r->prev.markAsFirstOrLastInExtent(this); // we are the first in the extent
+ assert( firstRecord.isNull() );
+ firstRecord = newRecordLoc;
+ }
+ lastRecord = newRecordLoc;
+
+ if( left < Record::HeaderSize + 32 ) {
+ firstEmptyRegion.Null();
+ }
+ else {
+ firstEmptyRegion.inc(newRecSize);
+ Record *empty = getRecord(firstEmptyRegion);
+ empty->next.set(nextEmpty); // note: for empty records, unlike in-use records, next and prev can be null.
+ empty->prev.Null();
+ empty->lengthWithHeaders = left;
+ }
+
+ return r;
+ }
+ */
+
+ int Extent::maxSize() {
+ int maxExtentSize = 0x7ff00000;
+ if ( cmdLine.smallfiles ) {
+ maxExtentSize >>= 2;
+ }
+ return maxExtentSize;
+ }
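+
+ // i.e. the maximum extent is 0x7ff00000 bytes (~2047MB), or a quarter of that
+ // (~512MB) when --smallfiles is set.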
+
+ /*---------------------------------------------------------------------*/
+
+ shared_ptr<Cursor> DataFileMgr::findAll(const char *ns, const DiskLoc &startLoc) {
+ NamespaceDetails * d = nsdetails( ns );
+ if ( ! d )
+ return shared_ptr<Cursor>(new BasicCursor(DiskLoc()));
+
+ DiskLoc loc = d->firstExtent;
+ Extent *e = getExtent(loc);
+
+ DEBUGGING {
+ out() << "listing extents for " << ns << endl;
+ DiskLoc tmp = loc;
+ set<DiskLoc> extents;
+
+ while ( 1 ) {
+ Extent *f = getExtent(tmp);
+ out() << "extent: " << tmp.toString() << endl;
+ extents.insert(tmp);
+ tmp = f->xnext;
+ if ( tmp.isNull() )
+ break;
+ f = f->getNextExtent();
+ }
+
+ out() << endl;
+ d->dumpDeleted(&extents);
+ }
+
+ if ( d->capped )
+ return shared_ptr<Cursor>( new ForwardCappedCursor( d , startLoc ) );
+
+ if ( !startLoc.isNull() )
+ return shared_ptr<Cursor>(new BasicCursor( startLoc ));
+
+ while ( e->firstRecord.isNull() && !e->xnext.isNull() ) {
+ /* todo: if extent is empty, free it for reuse elsewhere.
+ that is a bit complicated; we would have to clean up the freelists.
+ */
+ RARELY out() << "info DFM::findAll(): extent " << loc.toString() << " was empty, skipping ahead. ns:" << ns << endl;
+ // find a nonempty extent
+ // it might be nice to free the whole extent here! but have to clean up free recs then.
+ e = e->getNextExtent();
+ }
+ return shared_ptr<Cursor>(new BasicCursor( e->firstRecord ));
+ }
+
+ /* get a table scan cursor, but can be forward or reverse direction.
+ order.$natural - if set, > 0 means forward (asc), < 0 backward (desc).
+ */
+ shared_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc) {
+ BSONElement el = order.getField("$natural"); // e.g., { $natural : -1 }
+
+ if ( el.number() >= 0 )
+ return DataFileMgr::findAll(ns, startLoc);
+
+ // "reverse natural order"
+ NamespaceDetails *d = nsdetails(ns);
+
+ if ( !d )
+ return shared_ptr<Cursor>(new BasicCursor(DiskLoc()));
+
+ if ( !d->capped ) {
+ if ( !startLoc.isNull() )
+ return shared_ptr<Cursor>(new ReverseCursor( startLoc ));
+ Extent *e = d->lastExtent.ext();
+ while ( e->lastRecord.isNull() && !e->xprev.isNull() ) {
+ OCCASIONALLY out() << " findTableScan: extent empty, skipping ahead" << endl;
+ e = e->getPrevExtent();
+ }
+ return shared_ptr<Cursor>(new ReverseCursor( e->lastRecord ));
+ }
+ else {
+ return shared_ptr<Cursor>( new ReverseCappedCursor( d, startLoc ) );
+ }
+ }
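+
+ // hypothetical call sketch (not in this file): a reverse table scan would be requested
+ // as findTableScan(ns, BSON("$natural" << -1), DiskLoc()); an absent or non-negative
+ // $natural falls through to the forward DataFileMgr::findAll path above.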
+
+ void printFreeList() {
+ string s = cc().database()->name + FREELIST_NS;
+ log() << "dump freelist " << s << endl;
+ NamespaceDetails *freeExtents = nsdetails(s.c_str());
+ if( freeExtents == 0 ) {
+ log() << " freeExtents==0" << endl;
+ return;
+ }
+ DiskLoc a = freeExtents->firstExtent;
+ while( !a.isNull() ) {
+ Extent *e = a.ext();
+ log() << " extent " << a.toString() << " len:" << e->length << " prev:" << e->xprev.toString() << endl;
+ a = e->xnext;
+ }
+
+ log() << "end freelist" << endl;
+ }
+
+ /** free a list of extents that are no longer in use. this is a double linked list of extents
+ (could be just one in the list)
+ */
+ void freeExtents(DiskLoc firstExt, DiskLoc lastExt) {
+ {
+ assert( !firstExt.isNull() && !lastExt.isNull() );
+ Extent *f = firstExt.ext();
+ Extent *l = lastExt.ext();
+ assert( f->xprev.isNull() );
+ assert( l->xnext.isNull() );
+ assert( f==l || !f->xnext.isNull() );
+ assert( f==l || !l->xprev.isNull() );
+ }
+
+ string s = cc().database()->name + FREELIST_NS;
+ NamespaceDetails *freeExtents = nsdetails(s.c_str());
+ if( freeExtents == 0 ) {
+ string err;
+ _userCreateNS(s.c_str(), BSONObj(), err, 0); // todo: this actually allocates an extent, which is bad!
+ freeExtents = nsdetails(s.c_str());
+ massert( 10361 , "can't create .$freelist", freeExtents);
+ }
+ if( freeExtents->firstExtent.isNull() ) {
+ freeExtents->firstExtent.writing() = firstExt;
+ freeExtents->lastExtent.writing() = lastExt;
+ }
+ else {
+ DiskLoc a = freeExtents->firstExtent;
+ assert( a.ext()->xprev.isNull() );
+ getDur().writingDiskLoc( a.ext()->xprev ) = lastExt;
+ getDur().writingDiskLoc( lastExt.ext()->xnext ) = a;
+ getDur().writingDiskLoc( freeExtents->firstExtent ) = firstExt;
+ }
+
+ //printFreeList();
+ }
+
+ /* drop a collection/namespace */
+ void dropNS(const string& nsToDrop) {
+ NamespaceDetails* d = nsdetails(nsToDrop.c_str());
+ uassert( 10086 , (string)"ns not found: " + nsToDrop , d );
+
+ BackgroundOperation::assertNoBgOpInProgForNs(nsToDrop.c_str());
+
+ NamespaceString s(nsToDrop);
+ assert( s.db == cc().database()->name );
+ if( s.isSystem() ) {
+ if( s.coll == "system.profile" )
+ uassert( 10087 , "turn off profiling before dropping system.profile collection", cc().database()->profile == 0 );
+ else
+ uasserted( 12502, "can't drop system ns" );
+ }
+
+ {
+ // remove from the system catalog
+ BSONObj cond = BSON( "name" << nsToDrop ); // { name: "colltodropname" }
+ string system_namespaces = cc().database()->name + ".system.namespaces";
+ /*int n = */ deleteObjects(system_namespaces.c_str(), cond, false, false, true);
+ // no check of return code as this ns won't exist for some of the new storage engines
+ }
+
+ // free extents
+ if( !d->firstExtent.isNull() ) {
+ freeExtents(d->firstExtent, d->lastExtent);
+ getDur().writingDiskLoc( d->firstExtent ).setInvalid();
+ getDur().writingDiskLoc( d->lastExtent ).setInvalid();
+ }
+
+ // remove from the catalog hashtable
+ cc().database()->namespaceIndex.kill_ns(nsToDrop.c_str());
+ }
+
+ void dropCollection( const string &name, string &errmsg, BSONObjBuilder &result ) {
+ log(1) << "dropCollection: " << name << endl;
+ NamespaceDetails *d = nsdetails(name.c_str());
+ if( d == 0 )
+ return;
+
+ BackgroundOperation::assertNoBgOpInProgForNs(name.c_str());
+
+ if ( d->nIndexes != 0 ) {
+ try {
+ assert( dropIndexes(d, name.c_str(), "*", errmsg, result, true) );
+ }
+ catch( DBException& e ) {
+ stringstream ss;
+ ss << "drop: dropIndexes for collection failed - consider trying repair ";
+ ss << " cause: " << e.what();
+ uasserted(12503,ss.str());
+ }
+ assert( d->nIndexes == 0 );
+ }
+ log(1) << "\t dropIndexes done" << endl;
+ result.append("ns", name.c_str());
+ ClientCursor::invalidate(name.c_str());
+ Top::global.collectionDropped( name );
+ NamespaceDetailsTransient::eraseForPrefix( name.c_str() );
+ dropNS(name);
+ }
+
+ /* unindex all keys in index for this record. */
+ static void _unindexRecord(IndexDetails& id, BSONObj& obj, const DiskLoc& dl, bool logMissing = true) {
+ BSONObjSet keys;
+ id.getKeysFromObject(obj, keys);
+ IndexInterface& ii = id.idxInterface();
+ for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) {
+ BSONObj j = *i;
+
+ bool ok = false;
+ try {
+ ok = ii.unindex(id.head, id, j, dl);
+ }
+ catch (AssertionException& e) {
+ problem() << "Assertion failure: _unindex failed " << id.indexNamespace() << endl;
+ out() << "Assertion failure: _unindex failed: " << e.what() << '\n';
+ out() << " obj:" << obj.toString() << '\n';
+ out() << " key:" << j.toString() << '\n';
+ out() << " dl:" << dl.toString() << endl;
+ sayDbContext();
+ }
+
+ if ( !ok && logMissing ) {
+ log() << "unindex failed (key too big?) " << id.indexNamespace() << " key: " << j << " " << obj["_id"] << endl;
+ }
+ }
+ }
+ /* unindex all keys in all indexes for this record. */
+ static void unindexRecord(NamespaceDetails *d, Record *todelete, const DiskLoc& dl, bool noWarn = false) {
+ BSONObj obj(todelete);
+ int n = d->nIndexes;
+ for ( int i = 0; i < n; i++ )
+ _unindexRecord(d->idx(i), obj, dl, !noWarn);
+ if( d->indexBuildInProgress ) { // background index
+ // always pass nowarn here, as this one may be missing for valid reasons as we are concurrently building it
+ _unindexRecord(d->idx(n), obj, dl, false);
+ }
+ }
+
+ /* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc.
+ caller must check if capped
+ */
+ void DataFileMgr::_deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl) {
+ /* remove ourself from the record next/prev chain */
+ {
+ if ( todelete->prevOfs != DiskLoc::NullOfs )
+ getDur().writingInt( todelete->getPrev(dl).rec()->nextOfs ) = todelete->nextOfs;
+ if ( todelete->nextOfs != DiskLoc::NullOfs )
+ getDur().writingInt( todelete->getNext(dl).rec()->prevOfs ) = todelete->prevOfs;
+ }
+
+ /* remove ourself from extent pointers */
+ {
+ Extent *e = getDur().writing( todelete->myExtent(dl) );
+ if ( e->firstRecord == dl ) {
+ if ( todelete->nextOfs == DiskLoc::NullOfs )
+ e->firstRecord.Null();
+ else
+ e->firstRecord.set(dl.a(), todelete->nextOfs);
+ }
+ if ( e->lastRecord == dl ) {
+ if ( todelete->prevOfs == DiskLoc::NullOfs )
+ e->lastRecord.Null();
+ else
+ e->lastRecord.set(dl.a(), todelete->prevOfs);
+ }
+ }
+
+ /* add to the free list */
+ {
+ {
+ NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+ s->datasize -= todelete->netLength();
+ s->nrecords--;
+ }
+
+ if ( strstr(ns, ".system.indexes") ) {
+ /* temp: if in system.indexes, don't reuse, and zero out: we want to be
+ careful until validated more, as IndexDetails has pointers
+ to this disk location. so an incorrectly done remove would cause
+ a lot of problems.
+ */
+ memset(getDur().writingPtr(todelete, todelete->lengthWithHeaders), 0, todelete->lengthWithHeaders);
+ }
+ else {
+ DEV {
+ unsigned long long *p = (unsigned long long *) todelete->data;
+ *getDur().writing(p) = 0;
+ //DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse.
+ }
+ d->addDeletedRec((DeletedRecord*)todelete, dl);
+ }
+ }
+ }
+
+ void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn, bool doLog ) {
+ dassert( todelete == dl.rec() );
+
+ NamespaceDetails* d = nsdetails(ns);
+ if ( d->capped && !cappedOK ) {
+ out() << "failing remove on a capped ns " << ns << endl;
+ uassert( 10089 , "can't remove from a capped collection" , 0 );
+ return;
+ }
+
+ BSONObj toDelete;
+ if ( doLog ) {
+ BSONElement e = dl.obj()["_id"];
+ if ( e.type() ) {
+ toDelete = e.wrap();
+ }
+ }
+
+ /* check if any cursors point to us. if so, advance them. */
+ ClientCursor::aboutToDelete(dl);
+
+ unindexRecord(d, todelete, dl, noWarn);
+
+ _deleteRecord(d, ns, todelete, dl);
+ NamespaceDetailsTransient::get( ns ).notifyOfWriteOp();
+
+ if ( ! toDelete.isEmpty() ) {
+ logOp( "d" , ns , toDelete );
+ }
+ }
+
+
+ /** Note: if the object shrinks a lot, we don't free up space; we leave the extra at the end of the record.
+ */
+ const DiskLoc DataFileMgr::updateRecord(
+ const char *ns,
+ NamespaceDetails *d,
+ NamespaceDetailsTransient *nsdt,
+ Record *toupdate, const DiskLoc& dl,
+ const char *_buf, int _len, OpDebug& debug, bool god) {
+
+ dassert( toupdate == dl.rec() );
+
+ BSONObj objOld(toupdate);
+ BSONObj objNew(_buf);
+ DEV assert( objNew.objsize() == _len );
+ DEV assert( objNew.objdata() == _buf );
+
+ if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) {
+ /* add back the old _id value if the update removes it. Note this implementation is slow
+ (copies entire object multiple times), but this shouldn't happen often, so going for simple
+ code, not speed.
+ */
+ BSONObjBuilder b;
+ BSONElement e;
+ assert( objOld.getObjectID(e) );
+ b.append(e); // put _id first, for best performance
+ b.appendElements(objNew);
+ objNew = b.obj();
+ }
+
+ /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
+ below. that is suboptimal, but it's pretty complicated to do it the other way without rollbacks...
+ */
+ vector<IndexChanges> changes;
+ bool changedId = false;
+ getIndexChanges(changes, *d, objNew, objOld, changedId);
+ uassert( 13596 , str::stream() << "cannot change _id of a document old:" << objOld << " new:" << objNew , ! changedId );
+ dupCheck(changes, *d, dl);
+
+ if ( toupdate->netLength() < objNew.objsize() ) {
+ // doesn't fit. reallocate -----------------------------------------------------
+ uassert( 10003 , "failing update: objects in a capped ns cannot grow", !(d && d->capped));
+ d->paddingTooSmall();
+ debug.moved = true;
+ deleteRecord(ns, toupdate, dl);
+ return insert(ns, objNew.objdata(), objNew.objsize(), god);
+ }
+
+ nsdt->notifyOfWriteOp();
+ d->paddingFits();
+
+ /* have any index keys changed? */
+ {
+ int keyUpdates = 0;
+ int z = d->nIndexesBeingBuilt();
+ for ( int x = 0; x < z; x++ ) {
+ IndexDetails& idx = d->idx(x);
+ IndexInterface& ii = idx.idxInterface();
+ for ( unsigned i = 0; i < changes[x].removed.size(); i++ ) {
+ try {
+ bool found = ii.unindex(idx.head, idx, *changes[x].removed[i], dl);
+ if ( ! found ) {
+ RARELY warning() << "ns: " << ns << " couldn't unindex key: " << *changes[x].removed[i]
+ << " for doc: " << objOld["_id"] << endl;
+ }
+ }
+ catch (AssertionException&) {
+ debug.extra << " exception update unindex ";
+ problem() << " caught assertion update unindex " << idx.indexNamespace() << endl;
+ }
+ }
+ assert( !dl.isNull() );
+ BSONObj idxKey = idx.info.obj().getObjectField("key");
+ Ordering ordering = Ordering::make(idxKey);
+ keyUpdates += changes[x].added.size();
+ for ( unsigned i = 0; i < changes[x].added.size(); i++ ) {
+ try {
+ /* we did the dupCheck() above. so we don't have to worry about it here. */
+ ii.bt_insert(
+ idx.head,
+ dl, *changes[x].added[i], ordering, /*dupsAllowed*/true, idx);
+ }
+ catch (AssertionException& e) {
+ debug.extra << " exception update index ";
+ problem() << " caught assertion update index " << idx.indexNamespace() << " " << e << " " << objNew["_id"] << endl;
+ }
+ }
+ }
+
+ debug.keyUpdates = keyUpdates;
+ }
+
+ // update in place
+ int sz = objNew.objsize();
+ memcpy(getDur().writingPtr(toupdate->data, sz), objNew.objdata(), sz);
+ return dl;
+ }
+
+ int Extent::followupSize(int len, int lastExtentLen) {
+ assert( len < Extent::maxSize() );
+ int x = initialSize(len);
+ // changed from 1.20 to 1.35 in v2.1.x to get to larger extent size faster
+ int y = (int) (lastExtentLen < 4000000 ? lastExtentLen * 4.0 : lastExtentLen * 1.35);
+ int sz = y > x ? y : x;
+
+ if ( sz < lastExtentLen ) {
+ // this means there was an int overflow
+ // so we should turn it into maxSize
+ sz = Extent::maxSize();
+ }
+ else if ( sz > Extent::maxSize() ) {
+ sz = Extent::maxSize();
+ }
+
+ sz = ((int)sz) & 0xffffff00;
+ assert( sz > len );
+
+ return sz;
+ }
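+
+ // worked example for followupSize: lastExtentLen=1,000,000 gives y=4,000,000;
+ // lastExtentLen=8,000,000 (>= 4MB) gives y=10,800,000. the larger of x and y is
+ // clamped to maxSize() and then masked to a 256 byte boundary (& 0xffffff00).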
+
+ /* step one of adding keys to index idxNo for a new record.
+ computes the key set (returned via the out parameter) and queues the first key
+ for phased insertion; for multikey sets the caller must finish the remaining keys.
+ */
+ static void _addKeysToIndexStepOneOfTwo(BSONObjSet & /*out*/keys, NamespaceDetails *d, int idxNo, BSONObj& obj, DiskLoc recordLoc, IndexDetails& idx) {
+ idx.getKeysFromObject(obj, keys);
+ if( keys.empty() )
+ return;
+ bool dupsAllowed = !idx.unique();
+ BSONObj order = idx.keyPattern();
+ IndexInterface& ii = idx.idxInterface();
+ Ordering ordering = Ordering::make(order);
+
+ assert( !recordLoc.isNull() );
+
+ try {
+ // we can't do the two step method with multi keys, as insertion of one key changes the index's
+ // structure. however we can do the first key of the set, so we go ahead and do that FWIW
+ ii.phasedQueueItemToInsert(idxNo, idx.head, recordLoc, *keys.begin(), ordering, idx, dupsAllowed);
+ }
+ catch (AssertionException& e) {
+ if( e.getCode() == 10287 && idxNo == d->nIndexes ) {
+ DEV log() << "info: caught key already in index on bg indexing (ok)" << endl;
+ }
+ else {
+ throw;
+ }
+ }
+ }
+
+ namespace dur {
+ extern unsigned notesThisLock;
+ }
+
+ void upgradeToWritable(bool shouldBeUnlocked) {
+ // todo upgrade!
+ DEV {
+ // verify we haven't written yet (usually)
+
+ // the test binary does special things, so this would assert there; don't check in that case
+ if( shouldBeUnlocked && !cmdLine.binaryName.empty() && cmdLine.binaryName != "test" ) {
+ static unsigned long long zeroes;
+ static unsigned long long tot;
+ tot++;
+ if( dur::notesThisLock == 0 )
+ zeroes++;
+ if( tot > 1000 ) {
+ static int n;
+ DEV if( n++ == 0 )
+ log() << "warning upgradeToWritable: already in writable too often" << endl;
+ }
+ }
+ }
+ }
+
+ /** add index keys for a newly inserted record
+ done in two steps/phases to defer write lock portion
+ */
+ static void indexRecordUsingTwoSteps(NamespaceDetails *d, BSONObj obj, DiskLoc loc, bool shouldBeUnlocked) {
+ vector<int> multi;
+ vector<BSONObjSet> multiKeys;
+
+ IndexInterface::phasedBegin();
+
+ int n = d->nIndexesBeingBuilt();
+ {
+ BSONObjSet keys;
+ for ( int i = 0; i < n; i++ ) {
+ IndexDetails& idx = d->idx(i);
+ // this call throws on unique constraint violation. we haven't done any writes yet so that is fine.
+ _addKeysToIndexStepOneOfTwo(/*out*/keys, d, i, obj, loc, idx);
+ if( keys.size() > 1 ) {
+ multi.push_back(i);
+ multiKeys.push_back(BSONObjSet());
+ multiKeys[multiKeys.size()-1].swap(keys);
+ }
+ keys.clear();
+ }
+ }
+
+ // update lock to writable here. TODO
+
+ upgradeToWritable(shouldBeUnlocked);
+
+ IndexInterface::phasedFinish(); // step 2
+
+ // now finish adding multikeys
+ for( unsigned j = 0; j < multi.size(); j++ ) {
+ unsigned i = multi[j];
+ BSONObjSet& keys = multiKeys[j];
+ IndexDetails& idx = d->idx(i);
+ IndexInterface& ii = idx.idxInterface();
+ Ordering ordering = Ordering::make(idx.keyPattern());
+ d->setIndexIsMultikey(i);
+ for( BSONObjSet::iterator k = ++keys.begin()/*skip 1*/; k != keys.end(); k++ ) {
+ try {
+ ii.bt_insert(idx.head, loc, *k, ordering, !idx.unique(), idx);
+ } catch (AssertionException& e) {
+ if( e.getCode() == 10287 && (int) i == d->nIndexes ) {
+ DEV log() << "info: caught key already in index on bg indexing (ok)" << endl;
+ }
+ else {
+ /* roll back previously added index entries.
+ note we must include this index itself, as it is multikey and could require some cleanup of its own.
+ */
+ for( int j = 0; j < n; j++ ) {
+ try {
+ _unindexRecord(d->idx(j), obj, loc, false);
+ }
+ catch(...) {
+ log(3) << "unindex fails on rollback after unique key constraint prevented insert\n";
+ }
+ }
+ throw;
+ }
+ }
+ }
+ }
+ }
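+
+ // summary of the two steps above: step one computes all key sets and queues the first
+ // key per index via the phased interface; phasedFinish() then performs the queued
+ // btree inserts, and any remaining multikey entries are inserted individually, with
+ // all indexes unindexed for rollback if a unique constraint fails.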
+
+ /* add keys to index idxNo for a new record */
+ static void addKeysToIndex(NamespaceDetails *d, int idxNo, BSONObj& obj, DiskLoc recordLoc, bool dupsAllowed) {
+ IndexDetails& idx = d->idx(idxNo);
+ BSONObjSet keys;
+ idx.getKeysFromObject(obj, keys);
+ if( keys.empty() )
+ return;
+ BSONObj order = idx.keyPattern();
+ IndexInterface& ii = idx.idxInterface();
+ Ordering ordering = Ordering::make(order);
+ int n = 0;
+ for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) {
+ if( ++n == 2 ) {
+ d->setIndexIsMultikey(idxNo);
+ }
+ assert( !recordLoc.isNull() );
+ try {
+ ii.bt_insert(idx.head, recordLoc, *i, ordering, dupsAllowed, idx);
+ }
+ catch (AssertionException& e) {
+ if( e.getCode() == 10287 && idxNo == d->nIndexes ) {
+ DEV log() << "info: caught key already in index on bg indexing (ok)" << endl;
+ continue;
+ }
+ if( !dupsAllowed ) {
+ // dup key exception, presumably.
+ throw;
+ }
+ problem() << " caught assertion addKeysToIndex " << idx.indexNamespace() << " " << obj["_id"] << endl;
+ }
+ }
+ }
+
+#if 0
+ void testSorting() {
+ BSONObjBuilder b;
+ b.appendNull("");
+ BSONObj x = b.obj();
+
+ BSONObjExternalSorter sorter(*IndexDetails::iis[1]);
+
+ sorter.add(x, DiskLoc(3,7));
+ sorter.add(x, DiskLoc(4,7));
+ sorter.add(x, DiskLoc(2,7));
+ sorter.add(x, DiskLoc(1,7));
+ sorter.add(x, DiskLoc(3,77));
+
+ sorter.sort();
+
+ auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
+ while( i->more() ) {
+ BSONObjExternalSorter::Data d = i->next();
+ /*cout << d.second.toString() << endl;
+ cout << d.first.objsize() << endl;
+ cout<<"SORTER next:" << d.first.toString() << endl;*/
+ }
+ }
+#endif
+
+ SortPhaseOne *precalced = 0;
+
+ template< class V >
+ void buildBottomUpPhases2And3(bool dupsAllowed, IndexDetails& idx, BSONObjExternalSorter& sorter,
+ bool dropDups, list<DiskLoc> &dupsToDrop, CurOp * op, SortPhaseOne *phase1, ProgressMeterHolder &pm,
+ Timer& t
+ )
+ {
+ BtreeBuilder<V> btBuilder(dupsAllowed, idx);
+ BSONObj keyLast;
+ auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
+ assert( pm == op->setMessage( "index: (2/3) btree bottom up" , phase1->nkeys , 10 ) );
+ while( i->more() ) {
+ RARELY killCurrentOp.checkForInterrupt();
+ BSONObjExternalSorter::Data d = i->next();
+
+ try {
+ if ( !dupsAllowed && dropDups ) {
+ LastError::Disabled led( lastError.get() );
+ btBuilder.addKey(d.first, d.second);
+ }
+ else {
+ btBuilder.addKey(d.first, d.second);
+ }
+ }
+ catch( AssertionException& e ) {
+ if ( dupsAllowed ) {
+ // unknown exception??
+ throw;
+ }
+
+ if( e.interrupted() ) {
+ killCurrentOp.checkForInterrupt();
+ }
+
+ if ( ! dropDups )
+ throw;
+
+ /* we could queue these on disk, but normally there are very few dups, so instead we
+ keep them in RAM and enforce a limit.
+ */
+ dupsToDrop.push_back(d.second);
+ uassert( 10092 , "too may dups on index build with dropDups=true", dupsToDrop.size() < 1000000 );
+ }
+ pm.hit();
+ }
+ pm.finished();
+ op->setMessage( "index: (3/3) btree-middle" );
+ log(t.seconds() > 10 ? 0 : 1 ) << "\t done building bottom layer, going to commit" << endl;
+ btBuilder.commit();
+ if ( btBuilder.getn() != phase1->nkeys && ! dropDups ) {
+ warning() << "not all entries were added to the index, probably some keys were too large" << endl;
+ }
+ }
+
+ // throws DBException
+ unsigned long long fastBuildIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
+ CurOp * op = cc().curop();
+
+ Timer t;
+
+ tlog(1) << "fastBuildIndex " << ns << " idxNo:" << idxNo << ' ' << idx.info.obj().toString() << endl;
+
+ bool dupsAllowed = !idx.unique();
+ bool dropDups = idx.dropDups() || inDBRepair;
+ BSONObj order = idx.keyPattern();
+
+ getDur().writingDiskLoc(idx.head).Null();
+
+ if ( logLevel > 1 ) printMemInfo( "before index start" );
+
+ /* get and sort all the keys ----- */
+ ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->stats.nrecords , 10 ) );
+ SortPhaseOne _ours;
+ SortPhaseOne *phase1 = precalced;
+ if( phase1 == 0 ) {
+ phase1 = &_ours;
+ SortPhaseOne& p1 = *phase1;
+ shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
+ p1.sorter.reset( new BSONObjExternalSorter(idx.idxInterface(), order) );
+ p1.sorter->hintNumObjects( d->stats.nrecords );
+ const IndexSpec& spec = idx.getSpec();
+ while ( c->ok() ) {
+ BSONObj o = c->current();
+ DiskLoc loc = c->currLoc();
+ p1.addKeys(spec, o, loc);
+ c->advance();
+ pm.hit();
+ if ( logLevel > 1 && p1.n % 10000 == 0 ) {
+ printMemInfo( "\t iterating objects" );
+ }
+ }
+ }
+ pm.finished();
+
+ BSONObjExternalSorter& sorter = *(phase1->sorter);
+
+ if( phase1->multi )
+ d->setIndexIsMultikey(idxNo);
+
+ if ( logLevel > 1 ) printMemInfo( "before final sort" );
+ phase1->sorter->sort();
+ if ( logLevel > 1 ) printMemInfo( "after final sort" );
+
+ log(t.seconds() > 5 ? 0 : 1) << "\t external sort used : " << sorter.numFiles() << " files " << " in " << t.seconds() << " secs" << endl;
+
+ list<DiskLoc> dupsToDrop;
+
+ /* build index --- */
+ if( idx.version() == 0 )
+ buildBottomUpPhases2And3<V0>(dupsAllowed, idx, sorter, dropDups, dupsToDrop, op, phase1, pm, t);
+ else if( idx.version() == 1 )
+ buildBottomUpPhases2And3<V1>(dupsAllowed, idx, sorter, dropDups, dupsToDrop, op, phase1, pm, t);
+ else
+ assert(false);
+
+ log(1) << "\t fastBuildIndex dupsToDrop:" << dupsToDrop.size() << endl;
+
+ for( list<DiskLoc>::iterator i = dupsToDrop.begin(); i != dupsToDrop.end(); i++ ){
+ theDataFileMgr.deleteRecord( ns, i->rec(), *i, false /* cappedOk */ , true /* noWarn */ , isMaster( ns ) /* logOp */ );
+ getDur().commitIfNeeded();
+ }
+
+ return phase1->n;
+ }
+
+ class BackgroundIndexBuildJob : public BackgroundOperation {
+
+ unsigned long long addExistingToIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
+ bool dupsAllowed = !idx.unique();
+ bool dropDups = idx.dropDups();
+
+ ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->stats.nrecords );
+
+ unsigned long long n = 0;
+ auto_ptr<ClientCursor> cc;
+ {
+ shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
+ cc.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, ns) );
+ }
+ CursorId id = cc->cursorid();
+
+ while ( cc->ok() ) {
+ BSONObj js = cc->current();
+ try {
+ {
+ if ( !dupsAllowed && dropDups ) {
+ LastError::Disabled led( lastError.get() );
+ addKeysToIndex(d, idxNo, js, cc->currLoc(), dupsAllowed);
+ }
+ else {
+ addKeysToIndex(d, idxNo, js, cc->currLoc(), dupsAllowed);
+ }
+ }
+ cc->advance();
+ }
+ catch( AssertionException& e ) {
+ if( e.interrupted() ) {
+ killCurrentOp.checkForInterrupt();
+ }
+
+ if ( dropDups ) {
+ DiskLoc toDelete = cc->currLoc();
+ bool ok = cc->advance();
+ cc->updateLocation();
+ theDataFileMgr.deleteRecord( ns, toDelete.rec(), toDelete, false, true , true );
+ if( ClientCursor::find(id, false) == 0 ) {
+ cc.release();
+ if( !ok ) {
+ /* we were already at the end. normal. */
+ }
+ else {
+ uasserted(12585, "cursor gone during bg index; dropDups");
+ }
+ break;
+ }
+ }
+ else {
+ log() << "background addExistingToIndex exception " << e.what() << endl;
+ throw;
+ }
+ }
+ n++;
+ progress.hit();
+
+ getDur().commitIfNeeded();
+
+ if ( cc->yieldSometimes( ClientCursor::WillNeed ) ) {
+ progress.setTotalWhileRunning( d->stats.nrecords );
+ }
+ else {
+ cc.release();
+ uasserted(12584, "cursor gone during bg index");
+ break;
+ }
+ }
+ progress.finished();
+ return n;
+ }
+
+ /* we do set a flag in the namespace for quick checking, but this is our authoritative info -
+ that way on a crash/restart, we don't think we are still building one. */
+ set<NamespaceDetails*> bgJobsInProgress;
+
+ void prep(const char *ns, NamespaceDetails *d) {
+ assertInWriteLock();
+ uassert( 13130 , "can't start bg index b/c in recursive lock (db.eval?)" , mongo::d.dbMutex.getState() == 1 );
+ bgJobsInProgress.insert(d);
+ }
+ void done(const char *ns, NamespaceDetails *d) {
+ NamespaceDetailsTransient::get(ns).addedIndex(); // clear query optimizer cache
+ assertInWriteLock();
+ }
+
+ public:
+ BackgroundIndexBuildJob(const char *ns) : BackgroundOperation(ns) { }
+
+ unsigned long long go(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
+ unsigned long long n = 0;
+
+ prep(ns.c_str(), d);
+ assert( idxNo == d->nIndexes );
+ try {
+ idx.head.writing() = idx.idxInterface().addBucket(idx);
+ n = addExistingToIndex(ns.c_str(), d, idx, idxNo);
+ }
+ catch(...) {
+ if( cc().database() && nsdetails(ns.c_str()) == d ) {
+ assert( idxNo == d->nIndexes );
+ done(ns.c_str(), d);
+ }
+ else {
+ log() << "ERROR: db gone during bg index?" << endl;
+ }
+ throw;
+ }
+ assert( idxNo == d->nIndexes );
+ done(ns.c_str(), d);
+ return n;
+ }
+ };
+
+ /**
+ * For the lifetime of this object, an index build is indicated on the specified
+ * namespace and the newest index is marked as absent. This simplifies
+ * the cleanup required on recovery.
+ */
+ class RecoverableIndexState {
+ public:
+ RecoverableIndexState( NamespaceDetails *d ) : _d( d ) {
+ indexBuildInProgress() = 1;
+ nIndexes()--;
+ }
+ ~RecoverableIndexState() {
+ DESTRUCTOR_GUARD (
+ nIndexes()++;
+ indexBuildInProgress() = 0;
+ )
+ }
+ private:
+ int &nIndexes() { return getDur().writingInt( _d->nIndexes ); }
+ int &indexBuildInProgress() { return getDur().writingInt( _d->indexBuildInProgress ); }
+ NamespaceDetails *_d;
+ };
+
+ // throws DBException
+ static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) {
+ tlog() << "build index " << ns << ' ' << idx.keyPattern() << ( background ? " background" : "" ) << endl;
+ Timer t;
+ unsigned long long n;
+
+ assert( !BackgroundOperation::inProgForNs(ns.c_str()) ); // should have been checked earlier, better not be...
+ assert( d->indexBuildInProgress == 0 );
+ assertInWriteLock();
+ RecoverableIndexState recoverable( d );
+
+ // Build index spec here in case the collection is empty and the index details are invalid
+ idx.getSpec();
+
+ if( inDBRepair || !background ) {
+ n = fastBuildIndex(ns.c_str(), d, idx, idxNo);
+ assert( !idx.head.isNull() );
+ }
+ else {
+ BackgroundIndexBuildJob j(ns.c_str());
+ n = j.go(ns, d, idx, idxNo);
+ }
+ tlog() << "build index done " << n << " records " << t.millis() / 1000.0 << " secs" << endl;
+ }
+
+ /* add keys to indexes for a new record */
+#if 0
+ static void oldIndexRecord__notused(NamespaceDetails *d, BSONObj obj, DiskLoc loc) {
+ int n = d->nIndexesBeingBuilt();
+ for ( int i = 0; i < n; i++ ) {
+ try {
+ bool unique = d->idx(i).unique();
+ addKeysToIndex(d, i, obj, loc, /*dupsAllowed*/!unique);
+ }
+ catch( DBException& ) {
+ /* try to roll back previously added index entries
+ note <= i (not < i) is important here, as the index we just attempted
+ may be multikey and require some cleanup.
+ */
+ for( int j = 0; j <= i; j++ ) {
+ try {
+ _unindexRecord(d->idx(j), obj, loc, false);
+ }
+ catch(...) {
+ log(3) << "unindex fails on rollback after unique failure\n";
+ }
+ }
+ throw;
+ }
+ }
+ }
+#endif
+
+ extern BSONObj id_obj; // { _id : 1 }
+
+ void ensureHaveIdIndex(const char *ns) {
+ NamespaceDetails *d = nsdetails(ns);
+ if ( d == 0 || (d->flags & NamespaceDetails::Flag_HaveIdIndex) )
+ return;
+
+ *getDur().writing(&d->flags) |= NamespaceDetails::Flag_HaveIdIndex;
+
+ {
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ if( i.next().isIdIndex() )
+ return;
+ }
+ }
+
+ string system_indexes = cc().database()->name + ".system.indexes";
+
+ BSONObjBuilder b;
+ b.append("name", "_id_");
+ b.append("ns", ns);
+ b.append("key", id_obj);
+ BSONObj o = b.done();
+
+ /* edge case: note the insert could fail if we have hit maxindexes already */
+ theDataFileMgr.insert(system_indexes.c_str(), o.objdata(), o.objsize(), true);
+ }
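+
+ // the object inserted above is the standard _id index spec, e.g. for ns "test.foo":
+ // { name: "_id_", ns: "test.foo", key: { _id: 1 } }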
+
+#pragma pack(1)
+ struct IDToInsert_ {
+ char type;
+ char _id[4];
+ OID oid;
+ IDToInsert_() {
+ type = (char) jstOID;
+ strcpy(_id, "_id");
+ assert( sizeof(IDToInsert_) == 17 );
+ }
+ } idToInsert_;
+ struct IDToInsert : public BSONElement {
+ IDToInsert() : BSONElement( ( char * )( &idToInsert_ ) ) {}
+ } idToInsert;
+#pragma pack()
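+
+ // layout note: sizeof(IDToInsert_) == 17 is 1 byte of type (jstOID) + 4 bytes of
+ // field name ("_id" plus its null terminator) + 12 bytes of OID, i.e. exactly the
+ // BSONElement bytes spliced into the record in insert() below.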
+
+ void DataFileMgr::insertAndLog( const char *ns, const BSONObj &o, bool god ) {
+ BSONObj tmp = o;
+ insertWithObjMod( ns, tmp, god );
+ logOp( "i", ns, tmp );
+ }
+
+ /** @param o the object to insert. can be modified to add _id and thus be an in/out param
+ */
+ DiskLoc DataFileMgr::insertWithObjMod(const char *ns, BSONObj &o, bool god) {
+ bool addedID = false;
+ DiskLoc loc = insert( ns, o.objdata(), o.objsize(), god, true, &addedID );
+ if( addedID && !loc.isNull() )
+ o = BSONObj( loc.rec() );
+ return loc;
+ }
+
+ bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject );
+
+ // We are now doing two btree scans for all unique indexes (one here, and one when we've
+ // written the record to the collection). This could be made more efficient by inserting
+ // dummy data here, keeping pointers to the btree nodes holding the dummy data, and then
+ // updating the dummy data with the DiskLoc of the real record.
+ void checkNoIndexConflicts( NamespaceDetails *d, const BSONObj &obj ) {
+ for ( int idxNo = 0; idxNo < d->nIndexes; idxNo++ ) {
+ if( d->idx(idxNo).unique() ) {
+ IndexDetails& idx = d->idx(idxNo);
+ BSONObjSet keys;
+ idx.getKeysFromObject(obj, keys);
+ BSONObj order = idx.keyPattern();
+ IndexInterface& ii = idx.idxInterface();
+ for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) {
+ // WARNING: findSingle may not be compound index safe. this may need to change. see notes in
+ // findSingle code.
+ uassert( 12582, "duplicate key insert for unique index of capped collection",
+ ii.findSingle(idx, idx.head, *i ).isNull() );
+ }
+ }
+ }
+ }
+
+ /** add a record to the end of the linked list chain within this extent.
+ require: you must have already declared write intent for the record header.
+ */
+ void addRecordToRecListInExtent(Record *r, DiskLoc loc) {
+ dassert( loc.rec() == r );
+ Extent *e = r->myExtent(loc);
+ if ( e->lastRecord.isNull() ) {
+ Extent::FL *fl = getDur().writing(e->fl());
+ fl->firstRecord = fl->lastRecord = loc;
+ r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
+ }
+ else {
+ Record *oldlast = e->lastRecord.rec();
+ r->prevOfs = e->lastRecord.getOfs();
+ r->nextOfs = DiskLoc::NullOfs;
+ getDur().writingInt(oldlast->nextOfs) = loc.getOfs();
+ getDur().writingDiskLoc(e->lastRecord) = loc;
+ }
+ }
+
+ NOINLINE_DECL DiskLoc outOfSpace(const char *ns, NamespaceDetails *d, int lenWHdr, bool god, DiskLoc extentLoc) {
+ DiskLoc loc;
+ if ( d->capped == 0 ) { // size capped doesn't grow
+ log(1) << "allocating new extent for " << ns << " padding:" << d->paddingFactor << " lenWHdr: " << lenWHdr << endl;
+ cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false, !god);
+ loc = d->alloc(ns, lenWHdr, extentLoc);
+ if ( loc.isNull() ) {
+ log() << "warning: alloc() failed after allocating new extent. lenWHdr: " << lenWHdr << " last extent size:" << d->lastExtentSize << "; trying again\n";
+ for ( int z=0; z<10 && lenWHdr > d->lastExtentSize; z++ ) {
+ log() << "try #" << z << endl;
+ cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false, !god);
+ loc = d->alloc(ns, lenWHdr, extentLoc);
+ if ( ! loc.isNull() )
+ break;
+ }
+ }
+ }
+ return loc;
+ }
+
+ /** used by insert and also compact
+ * @return null loc if out of space
+ */
+ DiskLoc allocateSpaceForANewRecord(const char *ns, NamespaceDetails *d, int lenWHdr, bool god) {
+ DiskLoc extentLoc;
+ DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc);
+ if ( loc.isNull() ) {
+ loc = outOfSpace(ns, d, lenWHdr, god, extentLoc);
+ }
+ return loc;
+ }
+
+ bool NOINLINE_DECL insert_checkSys(const char *sys, const char *ns, bool& wouldAddIndex, const void *obuf, bool god) {
+ uassert( 10095 , "attempt to insert in reserved database name 'system'", sys != ns);
+ if ( strstr(ns, ".system.") ) {
+ // later:check for dba-type permissions here if have that at some point separate
+ if ( strstr(ns, ".system.indexes" ) )
+ wouldAddIndex = true;
+ else if ( legalClientSystemNS( ns , true ) ) {
+ if ( obuf && strstr( ns , ".system.users" ) ) {
+ BSONObj t( reinterpret_cast<const char *>( obuf ) );
+ uassert( 14051 , "system.user entry needs 'user' field to be a string" , t["user"].type() == String );
+ uassert( 14052 , "system.user entry needs 'pwd' field to be a string" , t["pwd"].type() == String );
+ uassert( 14053 , "system.user entry needs 'user' field to be non-empty" , t["user"].String().size() );
+ uassert( 14054 , "system.user entry needs 'pwd' field to be non-empty" , t["pwd"].String().size() );
+ }
+ }
+ else if ( !god ) {
+ // todo: this should probably uassert rather than doing this:
+ log() << "ERROR: attempt to insert in system namespace " << ns << endl;
+ return false;
+ }
+ }
+ return true;
+ }
+
+ NOINLINE_DECL NamespaceDetails* insert_newNamespace(const char *ns, int len, bool god) {
+ addNewNamespaceToCatalog(ns);
+ /* todo: shouldn't be in the namespace catalog until after the allocations here work.
+ also if this is an addIndex, those checks should happen before this!
+ */
+ // This may create first file in the database.
+ int ies = Extent::initialSize(len);
+ if( str::contains(ns, '$') && len + Record::HeaderSize >= BtreeData_V1::BucketSize - 256 && len + Record::HeaderSize <= BtreeData_V1::BucketSize + 256 ) {
+ // probably an index. so we pick a value here for the first extent instead of using initialExtentSize() which is more
+ // for user collections. TODO: we could look at the # of records in the parent collection to be smarter here.
+ ies = (32+4) * 1024;
+ }
+ cc().database()->allocExtent(ns, ies, false, false);
+ NamespaceDetails *d = nsdetails(ns);
+ if ( !god )
+ ensureIdIndexForNewNs(ns);
+ return d;
+ }
+
+ void NOINLINE_DECL insert_makeIndex(NamespaceDetails *tableToIndex, const string& tabletoidxns, const DiskLoc& loc) {
+ uassert( 13143 , "can't create index on system.indexes" , tabletoidxns.find( ".system.indexes" ) == string::npos );
+
+ BSONObj info = loc.obj();
+ bool background = info["background"].trueValue();
+ // if this is not readable, let's move things along
+ if (background && ((!theReplSet && cc().isSyncThread()) || (theReplSet && !theReplSet->isSecondary()))) {
+ log() << "info: indexing in foreground on this replica; was a background index build on the primary" << endl;
+ background = false;
+ }
+
+ int idxNo = tableToIndex->nIndexes;
+ IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str(), !background); // clear transient info caches so they refresh; increments nIndexes
+ getDur().writingDiskLoc(idx.info) = loc;
+ try {
+ buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo, background);
+ }
+ catch( DBException& e ) {
+ // save our error msg string from the exception now, or dropIndexes will overwrite our message
+ LastError *le = lastError.get();
+ int savecode = 0;
+ string saveerrmsg;
+ if ( le ) {
+ savecode = le->code;
+ saveerrmsg = le->msg;
+ }
+ else {
+ savecode = e.getCode();
+ saveerrmsg = e.what();
+ }
+
+ // roll back this index
+ string name = idx.indexName();
+ BSONObjBuilder b;
+ string errmsg;
+ bool ok = dropIndexes(tableToIndex, tabletoidxns.c_str(), name.c_str(), errmsg, b, true);
+ if( !ok ) {
+ log() << "failed to drop index after a unique key error building it: " << errmsg << ' ' << tabletoidxns << ' ' << name << endl;
+ }
+
+ assert( le && !saveerrmsg.empty() );
+ raiseError(savecode,saveerrmsg.c_str());
+ throw;
+ }
+ }
+
+ /* if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc
+ after the call -- that will prevent a double buffer copy in some cases (btree.cpp).
+
+ @param mayAddIndex almost always true, except for invocation from rename namespace command.
+ @param addedID if not null, set to true if adding _id element. you must assure false before calling
+ if using.
+ */
+
+ DiskLoc DataFileMgr::insert(const char *ns, const void *obuf, int len, bool god, bool mayAddIndex, bool *addedID) {
+ bool wouldAddIndex = false;
+ massert( 10093 , "cannot insert into reserved $ collection", god || NamespaceString::normal( ns ) );
+ uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) );
+ {
+ const char *sys = strstr(ns, "system.");
+ if ( sys && !insert_checkSys(sys, ns, wouldAddIndex, obuf, god) )
+ return DiskLoc();
+ }
+ bool addIndex = wouldAddIndex && mayAddIndex;
+
+ NamespaceDetails *d = nsdetails(ns);
+ if ( d == 0 ) {
+ d = insert_newNamespace(ns, len, god);
+ }
+
+ NamespaceDetails *tableToIndex = 0;
+
+ string tabletoidxns;
+ BSONObj fixedIndexObject;
+ if ( addIndex ) {
+ assert( obuf );
+ BSONObj io((const char *) obuf);
+ if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex, fixedIndexObject ) ) {
+ // prepare creates the _id index itself, or this return indicates we should fail the
+ // build silently (e.g., if the index already exists)
+ return DiskLoc();
+ }
+ if ( ! fixedIndexObject.isEmpty() ) {
+ obuf = fixedIndexObject.objdata();
+ len = fixedIndexObject.objsize();
+ }
+ }
+
+ int addID = 0; // 0 if not adding _id; if adding, the length of that new element
+ if( !god ) {
+ /* Check if we have an _id field. If we don't, we'll add it.
+ Note that btree buckets which we insert aren't BSONObj's, but in that case god==true.
+ */
+ BSONObj io((const char *) obuf);
+ BSONElement idField = io.getField( "_id" );
+ uassert( 10099 , "_id cannot be an array", idField.type() != Array );
+ // we don't add _id for capped collections as they don't have an _id index
+ if( idField.eoo() && !wouldAddIndex && strstr(ns, ".local.") == 0 && d->haveIdIndex() ) {
+ if( addedID )
+ *addedID = true;
+ addID = len;
+ idToInsert_.oid.init();
+ len += idToInsert.size();
+ }
+
+ BSONElementManipulator::lookForTimestamps( io );
+ }
+
+ int lenWHdr = len + Record::HeaderSize;
+ lenWHdr = (int) (lenWHdr * d->paddingFactor);
+ if ( lenWHdr == 0 ) {
+ // old datafiles, backward compatible here.
+ assert( d->paddingFactor == 0 );
+ *getDur().writing(&d->paddingFactor) = 1.0;
+ lenWHdr = len + Record::HeaderSize;
+ }
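+
+ // e.g., with paddingFactor 1.5, a record whose data plus header is 1016 bytes reserves
+ // lenWHdr = 1524 bytes; the unused tail absorbs later in-place growth of the object.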
+
+ // If the collection is capped, check if the new object will violate a unique index
+ // constraint before allocating space.
+ if ( d->nIndexes && d->capped && !god ) {
+ checkNoIndexConflicts( d, BSONObj( reinterpret_cast<const char *>( obuf ) ) );
+ }
+
+ bool earlyIndex = true;
+ DiskLoc loc;
+ if( addID || tableToIndex || d->capped ) {
+ // if we need to add an _id, we don't do the early indexing. this is not the common case, so that is acceptable
+ earlyIndex = false;
+ loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god);
+ }
+ else {
+ loc = d->allocWillBeAt(ns, lenWHdr);
+ if( loc.isNull() ) {
+ // need to get a new extent so we have to do the true alloc now (not common case)
+ earlyIndex = false;
+ loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god);
+ }
+ }
+ if ( loc.isNull() ) {
+ log() << "insert: couldn't alloc space for object ns:" << ns << " capped:" << d->capped << endl;
+ assert(d->capped);
+ return DiskLoc();
+ }
+
+ if( earlyIndex ) {
+ // add record to indexes using two step method so we can do the reading outside a write lock
+ if ( d->nIndexes ) {
+ assert( obuf );
+ BSONObj obj((const char *) obuf);
+ try {
+ indexRecordUsingTwoSteps(d, obj, loc, true);
+ }
+ catch( AssertionException& ) {
+ // should be a dup key error on _id index
+ dassert( !tableToIndex && !d->capped );
+ // no need to delete/rollback the record as it was not added yet
+ throw;
+ }
+ }
+ // really allocate now
+ DiskLoc real = allocateSpaceForANewRecord(ns, d, lenWHdr, god);
+ assert( real == loc );
+ }
+
+ Record *r = loc.rec();
+ {
+ assert( r->lengthWithHeaders >= lenWHdr );
+ r = (Record*) getDur().writingPtr(r, lenWHdr);
+ if( addID ) {
+ /* a little effort was made here to avoid a double copy when we add an ID */
+ ((int&)*r->data) = *((int*) obuf) + idToInsert.size();
+ memcpy(r->data+4, idToInsert.rawdata(), idToInsert.size());
+ memcpy(r->data+4+idToInsert.size(), ((char *)obuf)+4, addID-4);
+ }
+ else {
+ if( obuf ) // obuf can be null from internal callers
+ memcpy(r->data, obuf, len);
+ }
+ }
+
+ addRecordToRecListInExtent(r, loc);
+
+ /* durability todo : this could be a bit annoying / slow to record constantly */
+ {
+ NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+ s->datasize += r->netLength();
+ s->nrecords++;
+ }
+
+ // we don't bother resetting query optimizer stats for the god tables - also god is true when adding a btree bucket
+ if ( !god )
+ NamespaceDetailsTransient::get( ns ).notifyOfWriteOp();
+
+ if ( tableToIndex ) {
+ insert_makeIndex(tableToIndex, tabletoidxns, loc);
+ }
+
+ /* add this record to our indexes */
+ if ( !earlyIndex && d->nIndexes ) {
+ try {
+ BSONObj obj(r->data);
+ // not sure which of these is better -- either can be used. oldIndexRecord may be faster,
+ // but twosteps handles dup key errors more efficiently.
+ //oldIndexRecord(d, obj, loc);
+ indexRecordUsingTwoSteps(d, obj, loc, false);
+
+ }
+ catch( AssertionException& e ) {
+ // should be a dup key error on _id index
+ if( tableToIndex || d->capped ) {
+ massert( 12583, "unexpected index insertion failure on capped collection", !d->capped );
+ string s = e.toString();
+ s += " : on addIndex/capped - collection and its index will not match";
+ uassert_nothrow(s.c_str());
+ error() << s << endl;
+ }
+ else {
+ // normal case -- we can roll back
+ _deleteRecord(d, ns, r, loc);
+ throw;
+ }
+ }
+ }
+
+ d->paddingFits();
+
+ return loc;
+ }
+
+ /* special version of insert for transaction logging -- streamlined a bit.
+ assumes ns is capped and no indexes
+ */
+ Record* DataFileMgr::fast_oplog_insert(NamespaceDetails *d, const char *ns, int len) {
+ assert( d );
+ RARELY assert( d == nsdetails(ns) );
+ DEV assert( d == nsdetails(ns) );
+
+ DiskLoc extentLoc;
+ int lenWHdr = len + Record::HeaderSize;
+ DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc);
+ assert( !loc.isNull() );
+
+ Record *r = loc.rec();
+ assert( r->lengthWithHeaders >= lenWHdr );
+
+ Extent *e = r->myExtent(loc);
+ if ( e->lastRecord.isNull() ) {
+ Extent::FL *fl = getDur().writing( e->fl() );
+ fl->firstRecord = fl->lastRecord = loc;
+
+ Record::NP *np = getDur().writing(r->np());
+ np->nextOfs = np->prevOfs = DiskLoc::NullOfs;
+ }
+ else {
+ Record *oldlast = e->lastRecord.rec();
+ Record::NP *np = getDur().writing(r->np());
+ np->prevOfs = e->lastRecord.getOfs();
+ np->nextOfs = DiskLoc::NullOfs;
+ getDur().writingInt( oldlast->nextOfs ) = loc.getOfs();
+ e->lastRecord.writing() = loc;
+ }
+
+ /* todo: don't update for oplog? seems wasteful. */
+ {
+ NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+ s->datasize += r->netLength();
+ s->nrecords++;
+ }
+
+ return r;
+ }
+
+} // namespace mongo
+
+#include "clientcursor.h"
+
+namespace mongo {
+
+ void dropAllDatabasesExceptLocal() {
+ writelock lk("");
+
+ vector<string> n;
+ getDatabaseNames(n);
+ if( n.size() == 0 ) return;
+ log() << "dropAllDatabasesExceptLocal " << n.size() << endl;
+ for( vector<string>::iterator i = n.begin(); i != n.end(); i++ ) {
+ if( *i != "local" ) {
+ Client::Context ctx(*i);
+ dropDatabase(*i);
+ }
+ }
+ }
+
+ void dropDatabase(string db) {
+ log(1) << "dropDatabase " << db << endl;
+ Database *d = cc().database();
+ assert( d );
+ assert( d->name == db );
+
+ BackgroundOperation::assertNoBgOpInProgForDb(d->name.c_str());
+
+ mongo::d.dbMutex.assertWriteLocked();
+
+ // Not sure we need this here, so removed. If we do, we need to move it down
+ // within other calls both (1) as they could be called from elsewhere and
+ // (2) to keep the lock order right - groupcommitmutex must be locked before
+ // mmmutex (if both are locked).
+ //
+ // RWLockRecursive::Exclusive lk(MongoFile::mmmutex);
+
+ getDur().syncDataAndTruncateJournal();
+
+ Database::closeDatabase( d->name.c_str(), d->path );
+ d = 0; // d is now deleted
+
+ _deleteDataFiles( db.c_str() );
+ }
+
+ typedef boost::filesystem::path Path;
+
+ void boostRenameWrapper( const Path &from, const Path &to ) {
+ try {
+ boost::filesystem::rename( from, to );
+ }
+ catch ( const boost::filesystem::filesystem_error & ) {
+ // boost rename doesn't work across partitions
+ boost::filesystem::copy_file( from, to);
+ boost::filesystem::remove( from );
+ }
+ }
+
+ // back up original database files to 'temp' dir
+ void _renameForBackup( const char *database, const Path &reservedPath ) {
+ Path newPath( reservedPath );
+ if ( directoryperdb )
+ newPath /= database;
+ class Renamer : public FileOp {
+ public:
+ Renamer( const Path &newPath ) : newPath_( newPath ) {}
+ private:
+ const boost::filesystem::path &newPath_;
+ virtual bool apply( const Path &p ) {
+ if ( !boost::filesystem::exists( p ) )
+ return false;
+ boostRenameWrapper( p, newPath_ / ( p.leaf() + ".bak" ) );
+ return true;
+ }
+ virtual const char * op() const {
+ return "renaming";
+ }
+ } renamer( newPath );
+ _applyOpToDataFiles( database, renamer, true );
+ }
+
+ // move temp files to standard data dir
+ void _replaceWithRecovered( const char *database, const char *reservedPathString ) {
+ Path newPath( dbpath );
+ if ( directoryperdb )
+ newPath /= database;
+ class Replacer : public FileOp {
+ public:
+ Replacer( const Path &newPath ) : newPath_( newPath ) {}
+ private:
+ const boost::filesystem::path &newPath_;
+ virtual bool apply( const Path &p ) {
+ if ( !boost::filesystem::exists( p ) )
+ return false;
+ boostRenameWrapper( p, newPath_ / p.leaf() );
+ return true;
+ }
+ virtual const char * op() const {
+ return "renaming";
+ }
+ } replacer( newPath );
+ _applyOpToDataFiles( database, replacer, true, reservedPathString );
+ }
+
+ // generate a directory name for storing temp data files
+ Path uniqueReservedPath( const char *prefix ) {
+ Path repairPath = Path( repairpath );
+ Path reservedPath;
+ int i = 0;
+ bool exists = false;
+ do {
+ stringstream ss;
+ ss << prefix << "_repairDatabase_" << i++;
+ reservedPath = repairPath / ss.str();
+ BOOST_CHECK_EXCEPTION( exists = boost::filesystem::exists( reservedPath ) );
+ }
+ while ( exists );
+ return reservedPath;
+ }
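+
+ // e.g., uniqueReservedPath("backup") yields <repairpath>/backup_repairDatabase_0,
+ // bumping the trailing counter until an unused directory name is found.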
+
+ boost::intmax_t dbSize( const char *database ) {
+ class SizeAccumulator : public FileOp {
+ public:
+ SizeAccumulator() : totalSize_( 0 ) {}
+ boost::intmax_t size() const {
+ return totalSize_;
+ }
+ private:
+ virtual bool apply( const boost::filesystem::path &p ) {
+ if ( !boost::filesystem::exists( p ) )
+ return false;
+ totalSize_ += boost::filesystem::file_size( p );
+ return true;
+ }
+ virtual const char *op() const {
+ return "checking size";
+ }
+ boost::intmax_t totalSize_;
+ };
+ SizeAccumulator sa;
+ _applyOpToDataFiles( database, sa );
+ return sa.size();
+ }
+
+ bool repairDatabase( string dbNameS , string &errmsg,
+ bool preserveClonedFilesOnFailure, bool backupOriginalFiles ) {
+ doingRepair dr;
+ dbNameS = nsToDatabase( dbNameS );
+ const char * dbName = dbNameS.c_str();
+
+ stringstream ss;
+ ss << "localhost:" << cmdLine.port;
+ string localhost = ss.str();
+
+ problem() << "repairDatabase " << dbName << endl;
+ assert( cc().database()->name == dbName );
+ assert( cc().database()->path == dbpath );
+
+ BackgroundOperation::assertNoBgOpInProgForDb(dbName);
+
+ getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
+ boost::intmax_t totalSize = dbSize( dbName );
+ boost::intmax_t freeSize = File::freeSpace(repairpath);
+ if ( freeSize > -1 && freeSize < totalSize ) {
+ stringstream ss;
+ ss << "Cannot repair database " << dbName << " having size: " << totalSize
+ << " (bytes) because free disk space is: " << freeSize << " (bytes)";
+ errmsg = ss.str();
+ problem() << errmsg << endl;
+ return false;
+ }
+
+ Path reservedPath =
+ uniqueReservedPath( ( preserveClonedFilesOnFailure || backupOriginalFiles ) ?
+ "backup" : "_tmp" );
+ BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( reservedPath ) );
+ string reservedPathString = reservedPath.native_directory_string();
+
+ bool res;
+ {
+ // clone to temp location, which effectively does repair
+ Client::Context ctx( dbName, reservedPathString );
+ assert( ctx.justCreated() );
+
+ res = cloneFrom(localhost.c_str(), errmsg, dbName,
+ /*logForReplication=*/false, /*slaveOk*/false, /*replauth*/false,
+ /*snapshot*/false, /*mayYield*/false, /*mayBeInterrupted*/true);
+ Database::closeDatabase( dbName, reservedPathString.c_str() );
+ }
+
+ if ( !res ) {
+ errmsg = str::stream() << "clone failed for " << dbName << " with error: " << errmsg;
+ problem() << errmsg << endl;
+
+ if ( !preserveClonedFilesOnFailure )
+ BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );
+
+ getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
+ return false;
+ }
+
+ MongoFile::flushAll(true);
+
+ Client::Context ctx( dbName );
+ Database::closeDatabase( dbName, dbpath );
+
+ if ( backupOriginalFiles ) {
+ _renameForBackup( dbName, reservedPath );
+ }
+ else {
+ _deleteDataFiles( dbName );
+ BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( Path( dbpath ) / dbName ) );
+ }
+
+ _replaceWithRecovered( dbName, reservedPathString.c_str() );
+
+ if ( !backupOriginalFiles )
+ BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );
+
+ getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
+ return true;
+ }
+
+ void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator, const string& path ) {
+ if ( afterAllocator )
+ FileAllocator::get()->waitUntilFinished();
+ string c = database;
+ c += '.';
+ boost::filesystem::path p(path);
+ if ( directoryperdb )
+ p /= database;
+ boost::filesystem::path q;
+ q = p / (c+"ns");
+ bool ok = false;
+ BOOST_CHECK_EXCEPTION( ok = fo.apply( q ) );
+ if ( ok )
+ log(2) << fo.op() << " file " << q.string() << endl;
+ int i = 0;
+ int extra = 10; // should not be necessary, this is defensive in case there are missing files
+ while ( 1 ) {
+ assert( i <= DiskLoc::MaxFiles );
+ stringstream ss;
+ ss << c << i;
+ q = p / ss.str();
+ BOOST_CHECK_EXCEPTION( ok = fo.apply(q) );
+ if ( ok ) {
+ if ( extra != 10 ) {
+ log(1) << fo.op() << " file " << q.string() << endl;
+ log() << " _applyOpToDataFiles() warning: extra == " << extra << endl;
+ }
+ }
+ else if ( --extra <= 0 )
+ break;
+ i++;
+ }
+ }
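+
+    // Hedged illustration of _applyOpToDataFiles() above: for database "foo"
+    // it applies fo to foo.ns, then foo.0, foo.1, ..., stopping only after
+    // `extra` consecutive candidate files are found missing.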
+
+ NamespaceDetails* nsdetails_notinline(const char *ns) { return nsdetails(ns); }
+
+ bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ) {
+ log() << "DatabaseHolder::closeAll path:" << path << endl;
+ d.dbMutex.assertWriteLocked();
+
+ map<string,Database*>& m = _paths[path];
+ _size -= m.size();
+
+ set< string > dbs;
+ for ( map<string,Database*>::iterator i = m.begin(); i != m.end(); i++ ) {
+ wassert( i->second->path == path );
+ dbs.insert( i->first );
+ }
+
+ currentClient.get()->getContext()->_clear();
+
+ BSONObjBuilder bb( result.subarrayStart( "dbs" ) );
+ int n = 0;
+ int nNotClosed = 0;
+ for( set< string >::iterator i = dbs.begin(); i != dbs.end(); ++i ) {
+ string name = *i;
+ log(2) << "DatabaseHolder::closeAll path:" << path << " name:" << name << endl;
+ Client::Context ctx( name , path );
+ if( !force && BackgroundOperation::inProgForDb(name.c_str()) ) {
+ log() << "WARNING: can't close database " << name << " because a bg job is in progress - try killOp command" << endl;
+ nNotClosed++;
+ }
+ else {
+ Database::closeDatabase( name.c_str() , path );
+ bb.append( bb.numStr( n++ ) , name );
+ }
+ }
+ bb.done();
+ if( nNotClosed )
+ result.append("nNotClosed", nNotClosed);
+ else {
+ ClientCursor::assertNoCursors();
+ }
+
+ return true;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/pdfile.h b/src/mongo/db/pdfile.h
new file mode 100644
index 00000000000..cd6062b1a48
--- /dev/null
+++ b/src/mongo/db/pdfile.h
@@ -0,0 +1,546 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* pdfile.h
+
+ Files:
+ database.ns - namespace index
+ database.1 - data files
+ database.2
+ ...
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "../util/mmap.h"
+#include "diskloc.h"
+#include "jsobjmanipulator.h"
+#include "namespace-inl.h"
+#include "client.h"
+#include "mongommf.h"
+
+namespace mongo {
+
+ class DataFileHeader;
+ class Extent;
+ class Record;
+ class Cursor;
+ class OpDebug;
+
+ void dropDatabase(string db);
+ bool repairDatabase(string db, string &errmsg, bool preserveClonedFilesOnFailure = false, bool backupOriginalFiles = false);
+
+ /* low level - only drops this ns */
+ void dropNS(const string& dropNs);
+
+ /* deletes this ns, indexes and cursors */
+ void dropCollection( const string &name, string &errmsg, BSONObjBuilder &result );
+ bool userCreateNS(const char *ns, BSONObj j, string& err, bool logForReplication, bool *deferIdIndex = 0);
+ shared_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc=DiskLoc());
+
+ bool isValidNS( const StringData& ns );
+
+ /*---------------------------------------------------------------------*/
+
+ class MongoDataFile {
+ friend class DataFileMgr;
+ friend class BasicCursor;
+ public:
+ MongoDataFile(int fn) : _mb(0), fileNo(fn) { }
+
+        /** @return true if found and opened. If the file is uninitialized (prealloc only), it is not opened. */
+ bool openExisting( const char *filename );
+
+ /** creates if DNE */
+ void open(const char *filename, int requestedDataSize = 0, bool preallocateOnly = false);
+
+ /* allocate a new extent from this datafile.
+ @param capped - true if capped collection
+ @param loops is our recursion check variable - you want to pass in zero
+ */
+ Extent* createExtent(const char *ns, int approxSize, bool capped = false, int loops = 0);
+
+ DataFileHeader *getHeader() { return header(); }
+
+ unsigned long long length() const { return mmf.length(); }
+
+ /* return max size an extent may be */
+ static int maxSize();
+
+ /** fsync */
+ void flush( bool sync );
+
+        /** only use for debugging */
+ Extent* debug_getExtent(DiskLoc loc) { return _getExtent( loc ); }
+ private:
+ void badOfs(int) const;
+ void badOfs2(int) const;
+ int defaultSize( const char *filename ) const;
+
+ Extent* getExtent(DiskLoc loc) const;
+ Extent* _getExtent(DiskLoc loc) const;
+ Record* recordAt(DiskLoc dl);
+ Record* makeRecord(DiskLoc dl, int size);
+ void grow(DiskLoc dl, int size);
+
+ char* p() const { return (char *) _mb; }
+ DataFileHeader* header() { return (DataFileHeader*) _mb; }
+
+ MongoMMF mmf;
+ void *_mb; // the memory mapped view
+ int fileNo;
+ };
+
+ class DataFileMgr {
+ friend class BasicCursor;
+ public:
+ void init(const string& path );
+
+ /* see if we can find an extent of the right size in the freelist. */
+ static Extent* allocFromFreeList(const char *ns, int approxSize, bool capped = false);
+
+ /** @return DiskLoc where item ends up */
+ // changedId should be initialized to false
+ const DiskLoc updateRecord(
+ const char *ns,
+ NamespaceDetails *d,
+ NamespaceDetailsTransient *nsdt,
+ Record *toupdate, const DiskLoc& dl,
+ const char *buf, int len, OpDebug& debug, bool god=false);
+
+ // The object o may be updated if modified on insert.
+ void insertAndLog( const char *ns, const BSONObj &o, bool god = false );
+
+        /** insert() will add an _id to the object if one is not present.  If you would like to see
+            the final object after such an addition, use this method.
+            @param o both an in and out param
+            */
+ DiskLoc insertWithObjMod(const char *ns, BSONObj & /*out*/o, bool god = false);
+
+        /** @param o in value only for this version. */
+ void insertNoReturnVal(const char *ns, BSONObj o, bool god = false);
+
+ DiskLoc insert(const char *ns, const void *buf, int len, bool god = false, bool mayAddIndex = true, bool *addedID = 0);
+ static shared_ptr<Cursor> findAll(const char *ns, const DiskLoc &startLoc = DiskLoc());
+
+ /* special version of insert for transaction logging -- streamlined a bit.
+ assumes ns is capped and no indexes
+ no _id field check
+ */
+ Record* fast_oplog_insert(NamespaceDetails *d, const char *ns, int len);
+
+ static Extent* getExtent(const DiskLoc& dl);
+ static Record* getRecord(const DiskLoc& dl);
+ static DeletedRecord* makeDeletedRecord(const DiskLoc& dl, int len);
+
+ void deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK = false, bool noWarn = false, bool logOp=false);
+
+ /* does not clean up indexes, etc. : just deletes the record in the pdfile. use deleteRecord() to unindex */
+ void _deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl);
+
+ private:
+ vector<MongoDataFile *> files;
+ };
+
+ extern DataFileMgr theDataFileMgr;
+
+#pragma pack(1)
+
+ class DeletedRecord {
+ public:
+ int lengthWithHeaders;
+ int extentOfs;
+ DiskLoc nextDeleted;
+ DiskLoc myExtentLoc(const DiskLoc& myLoc) const {
+ return DiskLoc(myLoc.a(), extentOfs);
+ }
+ Extent* myExtent(const DiskLoc& myLoc) {
+ return DataFileMgr::getExtent(DiskLoc(myLoc.a(), extentOfs));
+ }
+ };
+
+ /* Record is a record in a datafile. DeletedRecord is similar but for deleted space.
+
+    (11:03:20 AM) dm10gen: regarding extentOfs...
+    (11:03:42 AM) dm10gen: an extent is a contiguous disk area, which contains many Records and DeletedRecords
+    (11:03:56 AM) dm10gen: a DiskLoc has two pieces, the fileno and ofs.  (64 bit total)
+    (11:04:16 AM) dm10gen: to keep the headers small, instead of storing a 64 bit ptr to the full extent address, we keep just the offset
+    (11:04:29 AM) dm10gen: we can do this as we know the record's address, and it has the same fileNo
+    (11:04:33 AM) dm10gen: see class DiskLoc for more info
+    (11:04:43 AM) dm10gen: so that is how Record::myExtent() works
+    (11:04:53 AM) dm10gen: on an alloc(), when we build a new Record, we must populate its extentOfs then
+ */
+ class Record {
+ public:
+ enum HeaderSizeValue { HeaderSize = 16 };
+ int lengthWithHeaders;
+ int extentOfs;
+ int nextOfs;
+ int prevOfs;
+
+        /** be careful when referencing this: make sure your write intent was declared first */
+ char data[4];
+
+ int netLength() {
+ return lengthWithHeaders - HeaderSize;
+ }
+ //void setNewLength(int netlen) { lengthWithHeaders = netlen + HeaderSize; }
+
+ /* use this when a record is deleted. basically a union with next/prev fields */
+ DeletedRecord& asDeleted() { return *((DeletedRecord*) this); }
+
+ Extent* myExtent(const DiskLoc& myLoc) { return DataFileMgr::getExtent(DiskLoc(myLoc.a(), extentOfs)); }
+
+ /* get the next record in the namespace, traversing extents as necessary */
+ DiskLoc getNext(const DiskLoc& myLoc);
+ DiskLoc getPrev(const DiskLoc& myLoc);
+
+ DiskLoc nextInExtent(const DiskLoc& myLoc) {
+ if ( nextOfs == DiskLoc::NullOfs )
+ return DiskLoc();
+ assert( nextOfs );
+ return DiskLoc(myLoc.a(), nextOfs);
+ }
+
+ struct NP {
+ int nextOfs;
+ int prevOfs;
+ };
+ NP* np() { return (NP*) &nextOfs; }
+
+ // ---------------------
+ // memory cache
+ // ---------------------
+
+        /**
+         * touches the data so that it is in physical memory
+         * @param entireRecord if false, only the header and first byte are touched;
+         * if true, the entire record is touched
+         */
+        void touch( bool entireRecord = false );
+
+        /**
+         * @return true if this record is likely in physical memory;
+         * not guaranteed, because the page could be swapped out in a very unlucky window
+         */
+ bool likelyInPhysicalMemory();
+
+ /**
+ * tell the cache this Record was accessed
+ * @return this, for simple chaining
+ */
+ Record* accessed();
+
+ static bool MemoryTrackingEnabled;
+ };
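+
+    /* A minimal usage sketch of the memory-cache helpers above (hedged; dl is
+       some valid DiskLoc):
+           Record *r = DataFileMgr::getRecord(dl)->accessed(); // note the access
+           r->touch();                                         // fault in the header
+    */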
+
+ /* extents are datafile regions where all the records within the region
+ belong to the same namespace.
+
+ (11:12:35 AM) dm10gen: when the extent is allocated, all its empty space is stuck into one big DeletedRecord
+ (11:12:55 AM) dm10gen: and that is placed on the free list
+ */
+ class Extent {
+ public:
+ unsigned magic;
+ DiskLoc myLoc;
+ DiskLoc xnext, xprev; /* next/prev extent for this namespace */
+
+ /* which namespace this extent is for. this is just for troubleshooting really
+ and won't even be correct if the collection were renamed!
+ */
+ Namespace nsDiagnostic;
+
+ int length; /* size of the extent, including these fields */
+ DiskLoc firstRecord;
+ DiskLoc lastRecord;
+ char _extentData[4];
+
+ static int HeaderSize() { return sizeof(Extent)-4; }
+
+ bool validates() {
+ return !(firstRecord.isNull() ^ lastRecord.isNull()) &&
+ length >= 0 && !myLoc.isNull();
+ }
+
+ BSONObj dump() {
+ return BSON( "loc" << myLoc.toString() << "xnext" << xnext.toString() << "xprev" << xprev.toString()
+ << "nsdiag" << nsDiagnostic.toString()
+ << "size" << length << "firstRecord" << firstRecord.toString() << "lastRecord" << lastRecord.toString());
+ }
+
+ void dump(iostream& s) {
+ s << " loc:" << myLoc.toString() << " xnext:" << xnext.toString() << " xprev:" << xprev.toString() << '\n';
+ s << " nsdiag:" << nsDiagnostic.toString() << '\n';
+ s << " size:" << length << " firstRecord:" << firstRecord.toString() << " lastRecord:" << lastRecord.toString() << '\n';
+ }
+
+        /* assumes the extent is already zeroed -- perhaps insufficient for block 'reuse'.
+           Returns a DeletedRecord location whose data is the extent's space, ready for use.
+           The caller will need to add that to the freelist structure in NamespaceDetails.
+        */
+ DiskLoc init(const char *nsname, int _length, int _fileNo, int _offset, bool capped);
+
+ /* like init(), but for a reuse case */
+ DiskLoc reuse(const char *nsname, bool newUseIsAsCapped);
+
+ bool isOk() const { return magic == 0x41424344; }
+ void assertOk() const { assert(isOk()); }
+
+ Record* newRecord(int len);
+
+ Record* getRecord(DiskLoc dl) {
+ assert( !dl.isNull() );
+ assert( dl.sameFile(myLoc) );
+ int x = dl.getOfs() - myLoc.getOfs();
+ assert( x > 0 );
+ return (Record *) (((char *) this) + x);
+ }
+
+ Extent* getNextExtent() { return xnext.isNull() ? 0 : DataFileMgr::getExtent(xnext); }
+ Extent* getPrevExtent() { return xprev.isNull() ? 0 : DataFileMgr::getExtent(xprev); }
+
+ static int maxSize();
+ static int minSize() { return 0x100; }
+ /**
+         * @param len length of the record we need
+         * @param lastExtentLen size of the last extent, which is a factor in the next extent's size
+ */
+ static int followupSize(int len, int lastExtentLen);
+
+ /** get a suggested size for the first extent in a namespace
+ * @param len length of record we need to insert
+ */
+ static int initialSize(int len);
+
+ struct FL {
+ DiskLoc firstRecord;
+ DiskLoc lastRecord;
+ };
+ /** often we want to update just the firstRecord and lastRecord fields.
+ this helper is for that -- for use with getDur().writing() method
+ */
+ FL* fl() { return (FL*) &firstRecord; }
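+
+        /* A minimal usage sketch (hedged), mirroring the getDur().writing()
+           pattern used in DataFileHeader::init() below:
+               Extent::FL *p = getDur().writing(e->fl());
+               p->firstRecord = newFirst;
+               p->lastRecord = newLast;
+        */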
+
+ /** caller must declare write intent first */
+ void markEmpty();
+ private:
+ DiskLoc _reuse(const char *nsname, bool newUseIsAsCapped); // recycle an extent and reuse it for a different ns
+ };
+
+ /* a datafile - i.e. the "dbname.<#>" files :
+
+ ----------------------
+ DataFileHeader
+ ----------------------
+ Extent (for a particular namespace)
+ Record
+ ...
+ Record (some chained for unused space)
+ ----------------------
+ more Extents...
+ ----------------------
+ */
+ class DataFileHeader {
+ public:
+ int version;
+ int versionMinor;
+ int fileLength;
+ DiskLoc unused; /* unused is the portion of the file that doesn't belong to any allocated extents. -1 = no more */
+ int unusedLength;
+ char reserved[8192 - 4*4 - 8];
+
+ char data[4]; // first extent starts here
+
+ enum { HeaderSize = 8192 };
+
+ bool isCurrentVersion() const { return ( version == PDFILE_VERSION ) && ( versionMinor == PDFILE_VERSION_MINOR ); }
+
+ bool uninitialized() const { return version == 0; }
+
+ void init(int fileno, int filelength, const char* filename) {
+ if ( uninitialized() ) {
+ DEV log() << "datafileheader::init initializing " << filename << " n:" << fileno << endl;
+ if( !(filelength > 32768 ) ) {
+ massert(13640, str::stream() << "DataFileHeader looks corrupt at file open filelength:" << filelength << " fileno:" << fileno, false);
+ }
+
+ {
+ if( !d.dbMutex.isWriteLocked() ) {
+ log() << "*** TEMP NOT INITIALIZING FILE " << filename << ", not in a write lock." << endl;
+ log() << "temp bypass until more elaborate change - case that is manifesting is benign anyway" << endl;
+ return;
+/**
+ log() << "ERROR can't create outside a write lock" << endl;
+ printStackTrace();
+ ::abort();
+**/
+ }
+ }
+
+ getDur().createdFile(filename, filelength);
+ assert( HeaderSize == 8192 );
+ DataFileHeader *h = getDur().writing(this);
+ h->fileLength = filelength;
+ h->version = PDFILE_VERSION;
+ h->versionMinor = PDFILE_VERSION_MINOR;
+ h->unused.set( fileno, HeaderSize );
+ assert( (data-(char*)this) == HeaderSize );
+ h->unusedLength = fileLength - HeaderSize - 16;
+ }
+ }
+
+ bool isEmpty() const {
+ return uninitialized() || ( unusedLength == fileLength - HeaderSize - 16 );
+ }
+ };
+
+#pragma pack()
+
+ inline Extent* MongoDataFile::_getExtent(DiskLoc loc) const {
+ loc.assertOk();
+ Extent *e = (Extent *) (p()+loc.getOfs());
+ return e;
+ }
+
+ inline Extent* MongoDataFile::getExtent(DiskLoc loc) const {
+ Extent *e = _getExtent(loc);
+ e->assertOk();
+ return e;
+ }
+
+} // namespace mongo
+
+#include "cursor.h"
+
+namespace mongo {
+
+ inline Record* MongoDataFile::recordAt(DiskLoc dl) {
+ int ofs = dl.getOfs();
+ if( ofs < DataFileHeader::HeaderSize ) badOfs(ofs); // will uassert - external call to keep out of the normal code path
+ return (Record*) (p()+ofs);
+ }
+
+ inline Record* MongoDataFile::makeRecord(DiskLoc dl, int size) {
+ int ofs = dl.getOfs();
+ if( ofs < DataFileHeader::HeaderSize ) badOfs(ofs); // will uassert - external call to keep out of the normal code path
+ return (Record*) (p()+ofs);
+ }
+
+ inline DiskLoc Record::getNext(const DiskLoc& myLoc) {
+ if ( nextOfs != DiskLoc::NullOfs ) {
+ /* defensive */
+ if ( nextOfs >= 0 && nextOfs < 10 ) {
+ sayDbContext("Assertion failure - Record::getNext() referencing a deleted record?");
+ return DiskLoc();
+ }
+
+ return DiskLoc(myLoc.a(), nextOfs);
+ }
+ Extent *e = myExtent(myLoc);
+ while ( 1 ) {
+ if ( e->xnext.isNull() )
+ return DiskLoc(); // end of table.
+ e = e->xnext.ext();
+ if ( !e->firstRecord.isNull() )
+ break;
+ // entire extent could be empty, keep looking
+ }
+ return e->firstRecord;
+ }
+ inline DiskLoc Record::getPrev(const DiskLoc& myLoc) {
+ if ( prevOfs != DiskLoc::NullOfs )
+ return DiskLoc(myLoc.a(), prevOfs);
+ Extent *e = myExtent(myLoc);
+ if ( e->xprev.isNull() )
+ return DiskLoc();
+ return e->xprev.ext()->lastRecord;
+ }
+
+ inline BSONObj DiskLoc::obj() const {
+ return BSONObj(rec()->accessed());
+ }
+ inline DeletedRecord* DiskLoc::drec() const {
+ assert( _a != -1 );
+ return (DeletedRecord*) rec();
+ }
+ inline Extent* DiskLoc::ext() const {
+ return DataFileMgr::getExtent(*this);
+ }
+
+ template< class V >
+ inline
+ const BtreeBucket<V> * DiskLoc::btree() const {
+ assert( _a != -1 );
+ return (const BtreeBucket<V> *) rec()->data;
+ }
+
+} // namespace mongo
+
+#include "database.h"
+
+namespace mongo {
+
+ boost::intmax_t dbSize( const char *database );
+
+ inline NamespaceIndex* nsindex(const char *ns) {
+ Database *database = cc().database();
+ assert( database );
+ DEV {
+ char buf[256];
+ nsToDatabase(ns, buf);
+ if ( database->name != buf ) {
+ out() << "ERROR: attempt to write to wrong database\n";
+ out() << " ns:" << ns << '\n';
+ out() << " database->name:" << database->name << endl;
+ assert( database->name == buf );
+ }
+ }
+ return &database->namespaceIndex;
+ }
+
+ inline NamespaceDetails* nsdetails(const char *ns) {
+ // if this faults, did you set the current db first? (Client::Context + dblock)
+ return nsindex(ns)->details(ns);
+ }
+
+ inline Extent* DataFileMgr::getExtent(const DiskLoc& dl) {
+ assert( dl.a() != -1 );
+ return cc().database()->getFile(dl.a())->getExtent(dl);
+ }
+
+ inline Record* DataFileMgr::getRecord(const DiskLoc& dl) {
+ assert( dl.a() != -1 );
+ return cc().database()->getFile(dl.a())->recordAt(dl);
+ }
+
+ BOOST_STATIC_ASSERT( 16 == sizeof(DeletedRecord) );
+
+ inline DeletedRecord* DataFileMgr::makeDeletedRecord(const DiskLoc& dl, int len) {
+ assert( dl.a() != -1 );
+ return (DeletedRecord*) cc().database()->getFile(dl.a())->makeRecord(dl, sizeof(DeletedRecord));
+ }
+
+ void ensureHaveIdIndex(const char *ns);
+
+ bool dropIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool maydeleteIdIndex );
+
+ inline BSONObj::BSONObj(const Record *r) {
+ init(r->data);
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/pipeline/accumulator.cpp b/src/mongo/db/pipeline/accumulator.cpp
new file mode 100755
index 00000000000..9ef8aa39470
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator.cpp
@@ -0,0 +1,92 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/accumulator.h"
+
+#include "db/jsobj.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+ using namespace mongoutils;
+
+ void Accumulator::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ uassert(15943, str::stream() << "group accumulator " <<
+ getOpName() << " only accepts one operand",
+ vpOperand.size() < 1);
+
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ Accumulator::Accumulator():
+ ExpressionNary() {
+ }
+
+ void Accumulator::opToBson(
+ BSONObjBuilder *pBuilder, string opName,
+ string fieldName, unsigned depth) const {
+ assert(vpOperand.size() == 1);
+ BSONObjBuilder builder;
+ vpOperand[0]->addToBsonObj(&builder, opName, depth);
+ pBuilder->append(fieldName, builder.done());
+ }
+
+ void Accumulator::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+ opToBson(pBuilder, getOpName(), fieldName, depth);
+ }
+
+ void Accumulator::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ assert(false); // these can't appear in arrays
+ }
+
+ void agg_framework_reservedErrors() {
+ uassert(16017, "reserved error", false);
+ uassert(16018, "reserved error", false);
+ uassert(16019, "reserved error", false);
+ uassert(16020, "reserved error", false);
+ uassert(16021, "reserved error", false);
+ uassert(16022, "reserved error", false);
+ uassert(16023, "reserved error", false);
+ uassert(16024, "reserved error", false);
+ uassert(16025, "reserved error", false);
+ uassert(16026, "reserved error", false);
+ uassert(16027, "reserved error", false);
+ uassert(16028, "reserved error", false);
+ uassert(16029, "reserved error", false);
+ uassert(16030, "reserved error", false);
+ uassert(16031, "reserved error", false);
+ uassert(16032, "reserved error", false);
+ uassert(16033, "reserved error", false);
+
+ uassert(16036, "reserved error", false);
+ uassert(16037, "reserved error", false);
+ uassert(16038, "reserved error", false);
+ uassert(16039, "reserved error", false);
+ uassert(16040, "reserved error", false);
+ uassert(16041, "reserved error", false);
+ uassert(16042, "reserved error", false);
+ uassert(16043, "reserved error", false);
+ uassert(16044, "reserved error", false);
+ uassert(16045, "reserved error", false);
+ uassert(16046, "reserved error", false);
+ uassert(16047, "reserved error", false);
+ uassert(16048, "reserved error", false);
+ uassert(16049, "reserved error", false);
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator.h b/src/mongo/db/pipeline/accumulator.h
new file mode 100755
index 00000000000..a75b2c9abaa
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator.h
@@ -0,0 +1,259 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include <boost/unordered_set.hpp>
+#include "db/pipeline/value.h"
+#include "db/pipeline/expression.h"
+#include "bson/bsontypes.h"
+
+namespace mongo {
+ class ExpressionContext;
+
+ class Accumulator :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ Get the accumulated value.
+
+ @returns the accumulated value
+ */
+ virtual intrusive_ptr<const Value> getValue() const = 0;
+
+ protected:
+ Accumulator();
+
+        /*
+          Convenience method for implementing addToBsonObj() for accumulators.
+          The pattern is always the same, so a common implementation works, but
+          it requires knowing the operator name.
+
+          @param pBuilder the builder to add to
+          @param opName the operator name
+          @param fieldName the projected name
+        */
+        void opToBson(
+            BSONObjBuilder *pBuilder, string opName, string fieldName,
+            unsigned depth) const;
+ };
+
+
+ class AccumulatorAddToSet :
+ public Accumulator {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual intrusive_ptr<const Value> getValue() const;
+ virtual const char *getOpName() const;
+
+ /*
+          Create a set-building accumulator.
+
+ @param pCtx the expression context
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ AccumulatorAddToSet(const intrusive_ptr<ExpressionContext> &pTheCtx);
+ typedef boost::unordered_set<intrusive_ptr<const Value>, Value::Hash > SetType;
+ mutable SetType set;
+ mutable SetType::iterator itr;
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+
+ /*
+ This isn't a finished accumulator, but rather a convenient base class
+ for others such as $first, $last, $max, $min, and similar. It just
+ provides a holder for a single Value, and the getter for that. The
+ holder is protected so derived classes can manipulate it.
+ */
+ class AccumulatorSingleValue :
+ public Accumulator {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> getValue() const;
+
+ protected:
+ AccumulatorSingleValue();
+
+ mutable intrusive_ptr<const Value> pValue; /* current min/max */
+ };
+
+
+ class AccumulatorFirst :
+ public AccumulatorSingleValue {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create the accumulator.
+
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ AccumulatorFirst();
+ };
+
+
+ class AccumulatorLast :
+ public AccumulatorSingleValue {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create the accumulator.
+
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ AccumulatorLast();
+ };
+
+
+ class AccumulatorSum :
+ public Accumulator {
+ public:
+ // virtuals from Accumulator
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual intrusive_ptr<const Value> getValue() const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create a summing accumulator.
+
+ @param pCtx the expression context
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ protected: /* reused by AccumulatorAvg */
+ AccumulatorSum();
+
+ mutable BSONType totalType;
+ mutable long long longTotal;
+ mutable double doubleTotal;
+ };
+
+
+ class AccumulatorMinMax :
+ public AccumulatorSingleValue {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create either the max or min accumulator.
+
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> createMin(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+ static intrusive_ptr<Accumulator> createMax(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ AccumulatorMinMax(int theSense);
+
+ int sense; /* 1 for min, -1 for max; used to "scale" comparison */
+ };
+
+
+ class AccumulatorPush :
+ public Accumulator {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual intrusive_ptr<const Value> getValue() const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create an appending accumulator.
+
+ @param pCtx the expression context
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ AccumulatorPush(const intrusive_ptr<ExpressionContext> &pTheCtx);
+
+ mutable vector<intrusive_ptr<const Value> > vpValue;
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+
+ class AccumulatorAvg :
+ public AccumulatorSum {
+ typedef AccumulatorSum Super;
+ public:
+ // virtuals from Accumulator
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual intrusive_ptr<const Value> getValue() const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create an averaging accumulator.
+
+ @param pCtx the expression context
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ static const char subTotalName[];
+ static const char countName[];
+
+ AccumulatorAvg(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ mutable long long count;
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
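+
+    /* A minimal usage sketch of the Accumulator interface (hedged; pCtx is an
+       ExpressionContext and pExpr is the operand Expression, e.g. a field
+       reference -- their construction is outside this header):
+           intrusive_ptr<Accumulator> pSum(AccumulatorSum::create(pCtx));
+           pSum->addOperand(pExpr);
+           pSum->evaluate(pDoc);   // once per grouped input document
+           intrusive_ptr<const Value> pTotal(pSum->getValue());
+    */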
+
+}
diff --git a/src/mongo/db/pipeline/accumulator_add_to_set.cpp b/src/mongo/db/pipeline/accumulator_add_to_set.cpp
new file mode 100755
index 00000000000..94df0293de4
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_add_to_set.cpp
@@ -0,0 +1,79 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+ intrusive_ptr<const Value> AccumulatorAddToSet::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+ intrusive_ptr<const Value> prhs(vpOperand[0]->evaluate(pDocument));
+
+ if (prhs->getType() == Undefined)
+ ; /* nothing to add to the array */
+ else if (!pCtx->getInRouter())
+ set.insert(prhs);
+ else {
+ /*
+ If we're in the router, we need to take apart the arrays we
+ receive and put their elements into the array we are collecting.
+ If we didn't, then we'd get an array of arrays, with one array
+ from each shard that responds.
+ */
+ assert(prhs->getType() == Array);
+
+ intrusive_ptr<ValueIterator> pvi(prhs->getArray());
+ while(pvi->more()) {
+ intrusive_ptr<const Value> pElement(pvi->next());
+ set.insert(pElement);
+ }
+ }
+
+ return Value::getNull();
+ }
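+
+    // Hedged illustration of the router branch above: if two shards return
+    // ["a","b"] and ["b","c"], their elements are inserted individually, so
+    // the merged result is the set {"a","b","c"}, not an array of arrays.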
+
+ intrusive_ptr<const Value> AccumulatorAddToSet::getValue() const {
+ vector<intrusive_ptr<const Value> > valVec;
+
+ for (itr = set.begin(); itr != set.end(); ++itr) {
+ valVec.push_back(*itr);
+ }
+ /* there is no issue of scope since createArray copy constructs */
+ return Value::createArray(valVec);
+ }
+
+ AccumulatorAddToSet::AccumulatorAddToSet(
+ const intrusive_ptr<ExpressionContext> &pTheCtx):
+ Accumulator(),
+ set(),
+ pCtx(pTheCtx) {
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorAddToSet::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorAddToSet> pAccumulator(
+ new AccumulatorAddToSet(pCtx));
+ return pAccumulator;
+ }
+
+ const char *AccumulatorAddToSet::getOpName() const {
+ return "$addToSet";
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator_avg.cpp b/src/mongo/db/pipeline/accumulator_avg.cpp
new file mode 100755
index 00000000000..9f18b1820c8
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_avg.cpp
@@ -0,0 +1,123 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ const char AccumulatorAvg::subTotalName[] = "subTotal";
+ const char AccumulatorAvg::countName[] = "count";
+
+ intrusive_ptr<const Value> AccumulatorAvg::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ if (!pCtx->getInRouter()) {
+ Super::evaluate(pDocument);
+ ++count;
+ }
+ else {
+            /*
+              If we're in the router, we expect an object that contains
+              both a subtotal and a count.  This is what getValue() (below)
+              produces on each shard.
+            */
+ intrusive_ptr<const Value> prhs(
+ vpOperand[0]->evaluate(pDocument));
+ assert(prhs->getType() == Object);
+ intrusive_ptr<Document> pShardDoc(prhs->getDocument());
+
+ intrusive_ptr<const Value> pSubTotal(
+ pShardDoc->getValue(subTotalName));
+ assert(pSubTotal.get());
+ BSONType subTotalType = pSubTotal->getType();
+ if ((totalType == NumberLong) || (subTotalType == NumberLong))
+ totalType = NumberLong;
+ if ((totalType == NumberDouble) || (subTotalType == NumberDouble))
+ totalType = NumberDouble;
+
+ if (subTotalType == NumberInt) {
+ int v = pSubTotal->getInt();
+ longTotal += v;
+ doubleTotal += v;
+ }
+ else if (subTotalType == NumberLong) {
+ long long v = pSubTotal->getLong();
+ longTotal += v;
+ doubleTotal += v;
+ }
+ else {
+ double v = pSubTotal->getDouble();
+ doubleTotal += v;
+ }
+
+ intrusive_ptr<const Value> pCount(pShardDoc->getValue(countName));
+ count += pCount->getLong();
+ }
+
+ return Value::getZero();
+ }
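+
+    // For reference (hedged): the shard-side getValue() below emits a partial
+    // result shaped like { subTotal: <numeric total>, count: <long> }, which
+    // the router branch above consumes and re-aggregates.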
+
+ intrusive_ptr<Accumulator> AccumulatorAvg::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorAvg> pA(new AccumulatorAvg(pCtx));
+ return pA;
+ }
+
+ intrusive_ptr<const Value> AccumulatorAvg::getValue() const {
+ if (!pCtx->getInShard()) {
+ double avg = 0;
+ if (count) {
+                if (totalType != NumberDouble)
+                    avg = static_cast<double>(longTotal) / count; /* cast before dividing to avoid integer truncation */
+                else
+                    avg = doubleTotal / count;
+ }
+
+ return Value::createDouble(avg);
+ }
+
+ intrusive_ptr<Document> pDocument(Document::create());
+
+ intrusive_ptr<const Value> pSubTotal;
+ if (totalType == NumberInt)
+ pSubTotal = Value::createInt((int)longTotal);
+ else if (totalType == NumberLong)
+ pSubTotal = Value::createLong(longTotal);
+ else
+ pSubTotal = Value::createDouble(doubleTotal);
+ pDocument->addField(subTotalName, pSubTotal);
+
+ intrusive_ptr<const Value> pCount(Value::createLong(count));
+ pDocument->addField(countName, pCount);
+
+ return Value::createDocument(pDocument);
+ }
+
+ AccumulatorAvg::AccumulatorAvg(
+ const intrusive_ptr<ExpressionContext> &pTheCtx):
+ AccumulatorSum(),
+ count(0),
+ pCtx(pTheCtx) {
+ }
+
+ const char *AccumulatorAvg::getOpName() const {
+ return "$avg";
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator_first.cpp b/src/mongo/db/pipeline/accumulator_first.cpp
new file mode 100755
index 00000000000..c947aa83996
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_first.cpp
@@ -0,0 +1,49 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ intrusive_ptr<const Value> AccumulatorFirst::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+
+ /* only remember the first value seen */
+ if (!pValue.get())
+ pValue = vpOperand[0]->evaluate(pDocument);
+
+ return pValue;
+ }
+
+ AccumulatorFirst::AccumulatorFirst():
+ AccumulatorSingleValue() {
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorFirst::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorFirst> pAccumulator(
+ new AccumulatorFirst());
+ return pAccumulator;
+ }
+
+ const char *AccumulatorFirst::getOpName() const {
+ return "$first";
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator_last.cpp b/src/mongo/db/pipeline/accumulator_last.cpp
new file mode 100755
index 00000000000..c134fc83159
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_last.cpp
@@ -0,0 +1,48 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ intrusive_ptr<const Value> AccumulatorLast::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+
+ /* always remember the last value seen */
+ pValue = vpOperand[0]->evaluate(pDocument);
+
+ return pValue;
+ }
+
+ AccumulatorLast::AccumulatorLast():
+ AccumulatorSingleValue() {
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorLast::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorLast> pAccumulator(
+ new AccumulatorLast());
+ return pAccumulator;
+ }
+
+ const char *AccumulatorLast::getOpName() const {
+ return "$last";
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator_min_max.cpp b/src/mongo/db/pipeline/accumulator_min_max.cpp
new file mode 100755
index 00000000000..6f078187b44
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_min_max.cpp
@@ -0,0 +1,67 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ intrusive_ptr<const Value> AccumulatorMinMax::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+ intrusive_ptr<const Value> prhs(vpOperand[0]->evaluate(pDocument));
+
+ /* if this is the first value, just use it */
+ if (!pValue.get())
+ pValue = prhs;
+ else {
+ /* compare with the current value; swap if appropriate */
+ int cmp = Value::compare(pValue, prhs) * sense;
+ if (cmp > 0)
+ pValue = prhs;
+ }
+
+ return pValue;
+ }
+
+ AccumulatorMinMax::AccumulatorMinMax(int theSense):
+ AccumulatorSingleValue(),
+ sense(theSense) {
+ assert((sense == 1) || (sense == -1));
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorMinMax::createMin(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorMinMax> pAccumulator(
+ new AccumulatorMinMax(1));
+ return pAccumulator;
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorMinMax::createMax(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorMinMax> pAccumulator(
+ new AccumulatorMinMax(-1));
+ return pAccumulator;
+ }
+
+ const char *AccumulatorMinMax::getOpName() const {
+ if (sense == 1)
+ return "$min";
+ return "$max";
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator_push.cpp b/src/mongo/db/pipeline/accumulator_push.cpp
new file mode 100755
index 00000000000..2640bc4ecfd
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_push.cpp
@@ -0,0 +1,73 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+ intrusive_ptr<const Value> AccumulatorPush::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+ intrusive_ptr<const Value> prhs(vpOperand[0]->evaluate(pDocument));
+
+ if (prhs->getType() == Undefined)
+ ; /* nothing to add to the array */
+ else if (!pCtx->getInRouter())
+ vpValue.push_back(prhs);
+ else {
+ /*
+ If we're in the router, we need to take apart the arrays we
+ receive and put their elements into the array we are collecting.
+ If we didn't, then we'd get an array of arrays, with one array
+ from each shard that responds.
+ */
+ assert(prhs->getType() == Array);
+
+ intrusive_ptr<ValueIterator> pvi(prhs->getArray());
+ while(pvi->more()) {
+ intrusive_ptr<const Value> pElement(pvi->next());
+ vpValue.push_back(pElement);
+ }
+ }
+
+ return Value::getNull();
+ }
+
+ intrusive_ptr<const Value> AccumulatorPush::getValue() const {
+ return Value::createArray(vpValue);
+ }
+
+ AccumulatorPush::AccumulatorPush(
+ const intrusive_ptr<ExpressionContext> &pTheCtx):
+ Accumulator(),
+ vpValue(),
+ pCtx(pTheCtx) {
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorPush::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorPush> pAccumulator(
+ new AccumulatorPush(pCtx));
+ return pAccumulator;
+ }
+
+ const char *AccumulatorPush::getOpName() const {
+ return "$push";
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator_single_value.cpp b/src/mongo/db/pipeline/accumulator_single_value.cpp
new file mode 100755
index 00000000000..bfec80387d3
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_single_value.cpp
@@ -0,0 +1,32 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ intrusive_ptr<const Value> AccumulatorSingleValue::getValue() const {
+ return pValue;
+ }
+
+ AccumulatorSingleValue::AccumulatorSingleValue():
+ pValue(intrusive_ptr<const Value>()) {
+ }
+
+}
diff --git a/src/mongo/db/pipeline/accumulator_sum.cpp b/src/mongo/db/pipeline/accumulator_sum.cpp
new file mode 100755
index 00000000000..e6526ac254a
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_sum.cpp
@@ -0,0 +1,74 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ intrusive_ptr<const Value> AccumulatorSum::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+ intrusive_ptr<const Value> prhs(vpOperand[0]->evaluate(pDocument));
+
+ /* upgrade to the widest type required to hold the result */
+ totalType = Value::getWidestNumeric(totalType, prhs->getType());
+
+ if (totalType == NumberInt) {
+ int v = prhs->coerceToInt();
+ longTotal += v;
+ doubleTotal += v;
+ }
+ else if (totalType == NumberLong) {
+ long long v = prhs->coerceToLong();
+ longTotal += v;
+ doubleTotal += v;
+ }
+ else { /* (totalType == NumberDouble) */
+ double v = prhs->coerceToDouble();
+ doubleTotal += v;
+ }
+
+ return Value::getZero();
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorSum::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorSum> pSummer(new AccumulatorSum());
+ return pSummer;
+ }
+
+ intrusive_ptr<const Value> AccumulatorSum::getValue() const {
+ if (totalType == NumberInt)
+ return Value::createInt((int)longTotal);
+ if (totalType == NumberLong)
+ return Value::createLong(longTotal);
+ return Value::createDouble(doubleTotal);
+ }
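+
+    // Hedged illustration of the widening in evaluate() above: summing the
+    // ints 1 and 2 with the double 0.5 promotes totalType to NumberDouble,
+    // so getValue() returns Value::createDouble(3.5).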
+
+ AccumulatorSum::AccumulatorSum():
+ Accumulator(),
+ totalType(NumberInt),
+ longTotal(0),
+ doubleTotal(0) {
+ }
+
+ const char *AccumulatorSum::getOpName() const {
+ return "$sum";
+ }
+}
diff --git a/src/mongo/db/pipeline/builder.cpp b/src/mongo/db/pipeline/builder.cpp
new file mode 100755
index 00000000000..cbde3705656
--- /dev/null
+++ b/src/mongo/db/pipeline/builder.cpp
@@ -0,0 +1,117 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/builder.h"
+
+
+namespace mongo {
+
+ void BuilderObj::append() {
+ pBuilder->appendNull(fieldName);
+ }
+
+ void BuilderObj::append(bool b) {
+ pBuilder->append(fieldName, b);
+ }
+
+ void BuilderObj::append(int i) {
+ pBuilder->append(fieldName, i);
+ }
+
+ void BuilderObj::append(long long ll) {
+ pBuilder->append(fieldName, ll);
+ }
+
+ void BuilderObj::append(double d) {
+ pBuilder->append(fieldName, d);
+ }
+
+ void BuilderObj::append(string s) {
+ pBuilder->append(fieldName, s);
+ }
+
+ void BuilderObj::append(const OID &o) {
+ pBuilder->append(fieldName, o);
+ }
+
+ void BuilderObj::append(const Date_t &d) {
+ pBuilder->append(fieldName, d);
+ }
+
+ void BuilderObj::append(BSONObjBuilder *pDone) {
+ pBuilder->append(fieldName, pDone->done());
+ }
+
+ void BuilderObj::append(BSONArrayBuilder *pDone) {
+ pBuilder->append(fieldName, pDone->arr());
+ }
+
+ BuilderObj::BuilderObj(
+ BSONObjBuilder *pObjBuilder, string theFieldName):
+ pBuilder(pObjBuilder),
+ fieldName(theFieldName) {
+ }
+
+
+ void BuilderArray::append() {
+ pBuilder->appendNull();
+ }
+
+ void BuilderArray::append(bool b) {
+ pBuilder->append(b);
+ }
+
+ void BuilderArray::append(int i) {
+ pBuilder->append(i);
+ }
+
+ void BuilderArray::append(long long ll) {
+ pBuilder->append(ll);
+ }
+
+ void BuilderArray::append(double d) {
+ pBuilder->append(d);
+ }
+
+ void BuilderArray::append(string s) {
+ pBuilder->append(s);
+ }
+
+ void BuilderArray::append(const OID &o) {
+ pBuilder->append(o);
+ }
+
+ void BuilderArray::append(const Date_t &d) {
+ pBuilder->append(d);
+ }
+
+ void BuilderArray::append(BSONObjBuilder *pDone) {
+ pBuilder->append(pDone->done());
+ }
+
+ void BuilderArray::append(BSONArrayBuilder *pDone) {
+ pBuilder->append(pDone->arr());
+ }
+
+ BuilderArray::BuilderArray(
+ BSONArrayBuilder *pArrayBuilder):
+ pBuilder(pArrayBuilder) {
+ }
+
+}
diff --git a/src/mongo/db/pipeline/builder.h b/src/mongo/db/pipeline/builder.h
new file mode 100755
index 00000000000..bdf71cd784c
--- /dev/null
+++ b/src/mongo/db/pipeline/builder.h
@@ -0,0 +1,95 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+namespace mongo {
+
+ class BSONArrayBuilder;
+ class BSONObjBuilder;
+
+ /*
+ Generic Builder.
+
+ The methods to append items to an object (on BSONObjBuilder) and an array
+ (on BSONArrayBuilder) differ only by their inclusion of a field name.
+ For more complicated implementations of addToBsonObj() and
+ addToBsonArray(), it makes sense to abstract that out and use
+ this generic builder that always looks the same, and then implement
+ addToBsonObj() and addToBsonArray() by using a common method.
+ */
+ class Builder :
+ boost::noncopyable {
+ public:
+        virtual ~Builder() {}
+
+ virtual void append() = 0; // append a null
+ virtual void append(bool b) = 0;
+ virtual void append(int i) = 0;
+ virtual void append(long long ll) = 0;
+ virtual void append(double d) = 0;
+ virtual void append(string s) = 0;
+ virtual void append(const OID &o) = 0;
+ virtual void append(const Date_t &d) = 0;
+ virtual void append(BSONObjBuilder *pDone) = 0;
+ virtual void append(BSONArrayBuilder *pDone) = 0;
+ };
+
+ class BuilderObj :
+ public Builder {
+ public:
+ // virtuals from Builder
+ virtual void append();
+ virtual void append(bool b);
+ virtual void append(int i);
+ virtual void append(long long ll);
+ virtual void append(double d);
+ virtual void append(string s);
+ virtual void append(const OID &o);
+ virtual void append(const Date_t &d);
+ virtual void append(BSONObjBuilder *pDone);
+ virtual void append(BSONArrayBuilder *pDone);
+
+ BuilderObj(BSONObjBuilder *pBuilder, string fieldName);
+
+ private:
+ BSONObjBuilder *pBuilder;
+ string fieldName;
+ };
+
+ class BuilderArray :
+ public Builder {
+ public:
+ // virtuals from Builder
+ virtual void append();
+ virtual void append(bool b);
+ virtual void append(int i);
+ virtual void append(long long ll);
+ virtual void append(double d);
+ virtual void append(string s);
+ virtual void append(const OID &o);
+ virtual void append(const Date_t &d);
+ virtual void append(BSONObjBuilder *pDone);
+ virtual void append(BSONArrayBuilder *pDone);
+
+ BuilderArray(BSONArrayBuilder *pBuilder);
+
+ private:
+ BSONArrayBuilder *pBuilder;
+ };
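+
+    /* A minimal sketch (hedged; appendMaybe is a hypothetical helper) of the
+       intended pattern -- one rendering routine serves both output shapes:
+           static void appendMaybe(Builder *pB, bool have, int v) {
+               if (have)
+                   pB->append(v);
+               else
+                   pB->append(); // append a null
+           }
+           // object:  BuilderObj b(pObjBuilder, fieldName); appendMaybe(&b, have, v);
+           // array:   BuilderArray b(pArrayBuilder);        appendMaybe(&b, have, v);
+    */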
+}
diff --git a/src/mongo/db/pipeline/doc_mem_monitor.cpp b/src/mongo/db/pipeline/doc_mem_monitor.cpp
new file mode 100755
index 00000000000..ffbe9c88854
--- /dev/null
+++ b/src/mongo/db/pipeline/doc_mem_monitor.cpp
@@ -0,0 +1,68 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/doc_mem_monitor.h"
+#include "util/systeminfo.h"
+
+namespace mongo {
+
+ DocMemMonitor::DocMemMonitor(StringWriter *pW) {
+        /*
+          Use the default values.
+
+          Currently, we log a warning at 5% of physical RAM, and assert at 10%.
+        */
+ size_t errorRam = SystemInfo::getPhysicalRam() / 10;
+ size_t warnRam = errorRam / 2;
+
+ init(pW, warnRam, errorRam);
+ }
+
+ DocMemMonitor::DocMemMonitor(StringWriter *pW,
+ size_t warnLimit, size_t errorLimit) {
+ init(pW, warnLimit, errorLimit);
+ }
+
+ void DocMemMonitor::addToTotal(size_t amount) {
+ totalUsed += amount;
+
+ if (!warned) {
+ if (warnLimit && (totalUsed > warnLimit)) {
+ stringstream ss;
+ ss << "warning, 5% of physical RAM used for ";
+ pWriter->writeString(ss);
+ ss << endl;
+ warning() << ss.str();
+ warned = true;
+ }
+ }
+
+ if (errorLimit) {
+ uassert(15944, "terminating request: request heap use exceeded 10% of physical RAM", (totalUsed <= errorLimit));
+ }
+ }
+
+ void DocMemMonitor::init(StringWriter *pW,
+ size_t warnLimit, size_t errorLimit) {
+ this->pWriter = pW;
+ this->warnLimit = warnLimit;
+ this->errorLimit = errorLimit;
+
+ warned = false;
+ totalUsed = 0;
+ }
+}
diff --git a/src/mongo/db/pipeline/doc_mem_monitor.h b/src/mongo/db/pipeline/doc_mem_monitor.h
new file mode 100755
index 00000000000..e368acc906a
--- /dev/null
+++ b/src/mongo/db/pipeline/doc_mem_monitor.h
@@ -0,0 +1,94 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+#include "util/string_writer.h"
+
+
+namespace mongo {
+
+ /*
+ This utility class provides an easy way to total up, monitor, warn, and
+ signal an error when the amount of memory used for an operation exceeds
+ given thresholds.
+
+ Create a local instance of this class, and then inform it of any memory
+ that you consume using addToTotal().
+
+ Warnings or errors are issued as usage exceeds certain fractions of
+ physical memory on the host, as determined by SystemInfo.
+
+ This class is not guaranteed to warn or signal errors if the host system
+ does not support the ability to report its memory, as per the warnings
+ for SystemInfo in systeminfo.h.
+ */
+ class DocMemMonitor {
+ public:
+ /*
+ Constructor.
+
+ Uses default limits for warnings and errors.
+
+ The StringWriter parameter must outlive the DocMemMonitor instance.
+
+ @param pWriter string writer that provides information about the
+ operation being monitored
+ */
+ DocMemMonitor(StringWriter *pWriter);
+
+ /*
+ Constructor.
+
+ This variant allows explicit selection of the limits. Note that
+ limits of zero are treated as infinite.
+
+ The StringWriter parameter must outlive the DocMemMonitor instance.
+
+ @param pWriter string writer that provides information about the
+ operation being monitored
+          @param warnLimit the amount of RAM to issue (log) a warning for
+          @param errorLimit the amount of RAM to throw an error for
+ */
+ DocMemMonitor(StringWriter *pWriter, size_t warnLimit,
+ size_t errorLimit);
+
+ /*
+ Increment the total amount of memory used by the given amount. If
+ the warning threshold is exceeded, a warning will be logged. If the
+ error threshold is exceeded, an error will be thrown.
+
+ @param amount the amount of memory to add to the current total
+ */
+ void addToTotal(size_t amount);
+
+ private:
+ /*
+ Real constructor body.
+
+ Provides common construction for all the variant constructors.
+ */
+ void init(StringWriter *pW, size_t warnLimit, size_t errorLimit);
+
+ bool warned;
+ size_t totalUsed;
+ size_t warnLimit;
+ size_t errorLimit;
+ StringWriter *pWriter;
+ };
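+
+    /*
+      A minimal usage sketch (illustrative; assumes pWriter is a
+      StringWriter describing the monitored operation, and that the byte
+      counts come from the caller):
+
+          DocMemMonitor memMonitor(pWriter); // default 5%/10% limits
+          while (hasMoreWork) {
+              // ... allocate memory for the operation ...
+              memMonitor.addToTotal(bytesJustAllocated);
+          }
+
+      addToTotal() logs a warning once when warnLimit is crossed, and
+      uasserts when errorLimit is exceeded.
+    */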
+
+}
diff --git a/src/mongo/db/pipeline/document.cpp b/src/mongo/db/pipeline/document.cpp
new file mode 100755
index 00000000000..a49c7e303c1
--- /dev/null
+++ b/src/mongo/db/pipeline/document.cpp
@@ -0,0 +1,219 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/value.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+ using namespace mongoutils;
+
+ string Document::idName("_id");
+
+ intrusive_ptr<Document> Document::createFromBsonObj(BSONObj *pBsonObj) {
+ intrusive_ptr<Document> pDocument(new Document(pBsonObj));
+ return pDocument;
+ }
+
+ Document::Document(BSONObj *pBsonObj):
+ vFieldName(),
+ vpValue() {
+ BSONObjIterator bsonIterator(pBsonObj->begin());
+ while(bsonIterator.more()) {
+ BSONElement bsonElement(bsonIterator.next());
+ string fieldName(bsonElement.fieldName());
+ intrusive_ptr<const Value> pValue(
+ Value::createFromBsonElement(&bsonElement));
+
+ vFieldName.push_back(fieldName);
+ vpValue.push_back(pValue);
+ }
+ }
+
+ void Document::toBson(BSONObjBuilder *pBuilder) {
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i)
+ vpValue[i]->addToBsonObj(pBuilder, vFieldName[i]);
+ }
+
+ intrusive_ptr<Document> Document::create(size_t sizeHint) {
+ intrusive_ptr<Document> pDocument(new Document(sizeHint));
+ return pDocument;
+ }
+
+ Document::Document(size_t sizeHint):
+ vFieldName(),
+ vpValue() {
+ if (sizeHint) {
+ vFieldName.reserve(sizeHint);
+ vpValue.reserve(sizeHint);
+ }
+ }
+
+ intrusive_ptr<Document> Document::clone() {
+ const size_t n = vFieldName.size();
+ intrusive_ptr<Document> pNew(Document::create(n));
+ for(size_t i = 0; i < n; ++i)
+ pNew->addField(vFieldName[i], vpValue[i]);
+
+ return pNew;
+ }
+
+ Document::~Document() {
+ }
+
+ FieldIterator *Document::createFieldIterator() {
+ return new FieldIterator(intrusive_ptr<Document>(this));
+ }
+
+ intrusive_ptr<const Value> Document::getValue(const string &fieldName) {
+ /*
+ For now, assume the number of fields is small enough that iteration
+ is ok. Later, if this gets large, we can create a map into the
+ vector for these lookups.
+
+ Note that because of the schema-less nature of this data, we always
+ have to look, and can't assume that the requested field is always
+ in a particular place as we would with a statically compilable
+ reference.
+ */
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ if (fieldName.compare(vFieldName[i]) == 0)
+ return vpValue[i];
+ }
+
+ return(intrusive_ptr<const Value>());
+ }
+
+ void Document::addField(const string &fieldName,
+ const intrusive_ptr<const Value> &pValue) {
+ uassert(15945, str::stream() << "cannot add undefined field " <<
+ fieldName << " to document", pValue->getType() != Undefined);
+
+ vFieldName.push_back(fieldName);
+ vpValue.push_back(pValue);
+ }
+
+ void Document::setField(size_t index,
+ const string &fieldName,
+ const intrusive_ptr<const Value> &pValue) {
+ /* special case: should this field be removed? */
+ if (!pValue.get()) {
+ vFieldName.erase(vFieldName.begin() + index);
+ vpValue.erase(vpValue.begin() + index);
+ return;
+ }
+
+ /* make sure we have a valid value */
+ uassert(15968, str::stream() << "cannot set undefined field " <<
+ fieldName << " to document", pValue->getType() != Undefined);
+
+ /* set the indicated field */
+ vFieldName[index] = fieldName;
+ vpValue[index] = pValue;
+ }
+
+ intrusive_ptr<const Value> Document::getField(const string &fieldName) const {
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ if (fieldName.compare(vFieldName[i]) == 0)
+ return vpValue[i];
+ }
+
+ /* if we got here, there's no such field */
+ return intrusive_ptr<const Value>();
+ }
+
+ size_t Document::getApproximateSize() const {
+ size_t size = sizeof(Document);
+ const size_t n = vpValue.size();
+ for(size_t i = 0; i < n; ++i)
+ size += vpValue[i]->getApproximateSize();
+
+ return size;
+ }
+
+ size_t Document::getFieldIndex(const string &fieldName) const {
+ const size_t n = vFieldName.size();
+ size_t i = 0;
+ for(; i < n; ++i) {
+ if (fieldName.compare(vFieldName[i]) == 0)
+ break;
+ }
+
+ return i;
+ }
+
+ void Document::hash_combine(size_t &seed) const {
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ boost::hash_combine(seed, vFieldName[i]);
+ vpValue[i]->hash_combine(seed);
+ }
+ }
+
+ int Document::compare(const intrusive_ptr<Document> &rL,
+ const intrusive_ptr<Document> &rR) {
+ const size_t lSize = rL->vFieldName.size();
+ const size_t rSize = rR->vFieldName.size();
+
+ for(size_t i = 0; true; ++i) {
+ if (i >= lSize) {
+ if (i >= rSize)
+ return 0; // documents are the same length
+
+ return -1; // left document is shorter
+ }
+
+ if (i >= rSize)
+ return 1; // right document is shorter
+
+ const int nameCmp = rL->vFieldName[i].compare(rR->vFieldName[i]);
+ if (nameCmp)
+ return nameCmp; // field names are unequal
+
+ const int valueCmp = Value::compare(rL->vpValue[i], rR->vpValue[i]);
+ if (valueCmp)
+ return valueCmp; // fields are unequal
+ }
+
+ /* NOTREACHED */
+ assert(false);
+ return 0;
+ }
+
+ /* ----------------------- FieldIterator ------------------------------- */
+
+ FieldIterator::FieldIterator(const intrusive_ptr<Document> &pTheDocument):
+ pDocument(pTheDocument),
+ index(0) {
+ }
+
+ bool FieldIterator::more() const {
+ return (index < pDocument->vFieldName.size());
+ }
+
+ pair<string, intrusive_ptr<const Value> > FieldIterator::next() {
+ assert(more());
+ pair<string, intrusive_ptr<const Value> > result(
+ pDocument->vFieldName[index], pDocument->vpValue[index]);
+ ++index;
+ return result;
+ }
+}
diff --git a/src/mongo/db/pipeline/document.h b/src/mongo/db/pipeline/document.h
new file mode 100755
index 00000000000..f11a825151e
--- /dev/null
+++ b/src/mongo/db/pipeline/document.h
@@ -0,0 +1,246 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include "util/intrusive_counter.h"
+
+namespace mongo {
+ class BSONObj;
+ class FieldIterator;
+ class Value;
+
+ class Document :
+ public IntrusiveCounterUnsigned {
+ public:
+ ~Document();
+
+ /*
+ Create a new Document from the given BSONObj.
+
+ Document field values may be pointed to in the BSONObj, so it
+ must live at least as long as the resulting Document.
+
+ @returns shared pointer to the newly created Document
+ */
+ static intrusive_ptr<Document> createFromBsonObj(BSONObj *pBsonObj);
+
+ /*
+ Create a new empty Document.
+
+ @param sizeHint a hint at what the number of fields will be; if
+ known, this can be used to increase memory allocation efficiency
+ @returns shared pointer to the newly created Document
+ */
+ static intrusive_ptr<Document> create(size_t sizeHint = 0);
+
+ /*
+ Clone a document.
+
+ The new document shares all the fields' values with the original.
+
+ This is not a deep copy. Only the fields on the top-level document
+ are cloned.
+
+ @returns the shallow clone of the document
+ */
+ intrusive_ptr<Document> clone();
+
+ /*
+ Add this document to the BSONObj under construction with the
+ given BSONObjBuilder.
+ */
+ void toBson(BSONObjBuilder *pBsonObjBuilder);
+
+ /*
+ Create a new FieldIterator that can be used to examine the
+ Document's fields.
+ */
+ FieldIterator *createFieldIterator();
+
+ /*
+ Get the value of the specified field.
+
+ @param fieldName the name of the field
+          @returns pointer to the requested field's Value; empty if
+            there is no such field
+ */
+ intrusive_ptr<const Value> getValue(const string &fieldName);
+
+ /*
+ Add the given field to the Document.
+
+ BSON documents' fields are ordered; the new Field will be
+          appended to the current list of fields.
+
+ It is an error to add a field that has the same name as another
+ field.
+ */
+ void addField(const string &fieldName,
+ const intrusive_ptr<const Value> &pValue);
+
+ /*
+ Set the given field to be at the specified position in the
+ Document. This will replace any field that is currently in that
+ position. The index must be within the current range of field
+ indices.
+
+ pValue.get() may be NULL, in which case the field will be
+ removed. fieldName is ignored in this case.
+
+ @param index the field index in the list of fields
+ @param fieldName the new field name
+ @param pValue the new Value
+ */
+ void setField(size_t index,
+ const string &fieldName,
+ const intrusive_ptr<const Value> &pValue);
+
+ /*
+ Convenience type for dealing with fields.
+ */
+ typedef pair<string, intrusive_ptr<const Value> > FieldPair;
+
+ /*
+ Get the indicated field.
+
+ @param index the field index in the list of fields
+ @returns the field name and value of the field
+ */
+ FieldPair getField(size_t index) const;
+
+ /*
+ Get the number of fields in the Document.
+
+ @returns the number of fields in the Document
+ */
+ size_t getFieldCount() const;
+
+ /*
+ Get the index of the given field.
+
+ @param fieldName the name of the field
+ @returns the index of the field, or if it does not exist, the number
+ of fields (getFieldCount())
+ */
+ size_t getFieldIndex(const string &fieldName) const;
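+
+        /*
+          For example (illustrative), an existence check looks like:
+
+              size_t i = pDoc->getFieldIndex("x");
+              if (i < pDoc->getFieldCount()) { ... field exists ... }
+        */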
+
+ /*
+ Get a field by name.
+
+ @param fieldName the name of the field
+ @returns the value of the field
+ */
+ intrusive_ptr<const Value> getField(const string &fieldName) const;
+
+ /*
+ Get the approximate storage size of the document, in bytes.
+
+ Under the assumption that field name strings are shared, they are
+ not included in the total.
+
+          @returns the approximate storage size, in bytes
+ */
+ size_t getApproximateSize() const;
+
+ /*
+ Compare two documents.
+
+ BSON document field order is significant, so this just goes through
+ the fields in order. The comparison is done in roughly the same way
+ as strings are compared, but comparing one field at a time instead
+ of one character at a time.
+ */
+ static int compare(const intrusive_ptr<Document> &rL,
+ const intrusive_ptr<Document> &rR);
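+
+        /*
+          For example (illustrative): comparing {a: 1} with {a: 1, b: 1}
+          returns a negative value, because the left document is a prefix
+          of the right one; comparing {a: 1} with {b: 1} is decided by
+          the field-name comparison of "a" vs. "b" before any values are
+          examined.
+        */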
+
+ static string idName; // shared "_id"
+
+ /*
+ Calculate a hash value.
+
+ Meant to be used to create composite hashes suitable for
+ boost classes such as unordered_map<>.
+
+ @param seed value to augment with this' hash
+ */
+ void hash_combine(size_t &seed) const;
+
+ private:
+ friend class FieldIterator;
+
+ Document(size_t sizeHint);
+ Document(BSONObj *pBsonObj);
+
+ /* these two vectors parallel each other */
+ vector<string> vFieldName;
+ vector<intrusive_ptr<const Value> > vpValue;
+ };
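+
+    /*
+      A minimal usage sketch (illustrative; the Values and the
+      BSONObjBuilder are assumed to come from the caller):
+
+          intrusive_ptr<Document> pDoc(Document::create(2)); // 2 fields
+          pDoc->addField("a", pValueA);
+          pDoc->addField("b", pValueB);
+
+          FieldIterator *pIter = pDoc->createFieldIterator();
+          while (pIter->more()) {
+              Document::FieldPair field(pIter->next());
+              // field.first is the name, field.second the Value
+          }
+          delete pIter;
+
+          pDoc->toBson(pBuilder); // serialize back to BSON
+    */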
+
+
+ class FieldIterator :
+ boost::noncopyable {
+ public:
+ /*
+ Ask if there are more fields to return.
+
+ @return true if there are more fields, false otherwise
+ */
+ bool more() const;
+
+ /*
+ Move the iterator to point to the next field and return it.
+
+ @return the next field's <name, Value>
+ */
+ Document::FieldPair next();
+
+ private:
+ friend class Document;
+
+ /*
+ Constructor.
+
+ @param pDocument points to the document whose fields are being
+ iterated
+ */
+ FieldIterator(const intrusive_ptr<Document> &pDocument);
+
+ /*
+ We'll hang on to the original document to ensure we keep the
+          field name and value vectors alive.
+ */
+ intrusive_ptr<Document> pDocument;
+ size_t index; // current field in iteration
+ };
+}
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline size_t Document::getFieldCount() const {
+ return vFieldName.size();
+ }
+
+ inline Document::FieldPair Document::getField(size_t index) const {
+ assert( index < vFieldName.size() );
+ return FieldPair(vFieldName[index], vpValue[index]);
+ }
+
+}
diff --git a/src/mongo/db/pipeline/document_source.cpp b/src/mongo/db/pipeline/document_source.cpp
new file mode 100755
index 00000000000..813852e35c6
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source.cpp
@@ -0,0 +1,52 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+namespace mongo {
+ DocumentSource::~DocumentSource() {
+ }
+
+ void DocumentSource::setSource(
+ const intrusive_ptr<DocumentSource> &pTheSource) {
+ assert(!pSource.get());
+ pSource = pTheSource;
+ }
+
+ bool DocumentSource::coalesce(
+ const intrusive_ptr<DocumentSource> &pNextSource) {
+ return false;
+ }
+
+ void DocumentSource::optimize() {
+ }
+
+ void DocumentSource::addToBsonArray(BSONArrayBuilder *pBuilder) const {
+ BSONObjBuilder insides;
+ sourceToBson(&insides);
+ pBuilder->append(insides.done());
+ }
+
+ void DocumentSource::writeString(stringstream &ss) const {
+ BSONArrayBuilder bab;
+ addToBsonArray(&bab);
+ BSONArray ba(bab.arr());
+ ss << ba.toString(/* isArray */true);
+        // TODO: our toString should use standard string types
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source.h b/src/mongo/db/pipeline/document_source.h
new file mode 100755
index 00000000000..8d5f0f70847
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source.h
@@ -0,0 +1,985 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include <boost/unordered_map.hpp>
+#include "util/intrusive_counter.h"
+#include "client/parallel.h"
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/value.h"
+#include "util/string_writer.h"
+
+namespace mongo {
+ class Accumulator;
+ class Cursor;
+ class Document;
+ class Expression;
+ class ExpressionContext;
+ class ExpressionFieldPath;
+ class ExpressionObject;
+ class Matcher;
+
+ class DocumentSource :
+ public IntrusiveCounterUnsigned,
+ public StringWriter {
+ public:
+ virtual ~DocumentSource();
+
+ // virtuals from StringWriter
+ /*
+ Write out a string representation of this pipeline operator.
+
+ @param ss string stream to write the string representation to
+ */
+ virtual void writeString(stringstream &ss) const;
+
+
+ /*
+ Is the source at EOF?
+
+ @returns true if the source has no more Documents to return.
+ */
+ virtual bool eof() = 0;
+
+ /*
+ Advance the state of the DocumentSource so that it will return the
+ next Document.
+
+ @returns whether there is another document to fetch, i.e., whether or
+ not getCurrent() will succeed.
+ */
+ virtual bool advance() = 0;
+
+ /*
+          Get the Document the source currently points at.
+
+          @returns the current Document
+          TODO define the behavior (e.g., throw) when there are no more
+          Documents to return
+ */
+ virtual intrusive_ptr<Document> getCurrent() = 0;
+
+ /*
+ Set the underlying source this source should use to get Documents
+ from.
+
+ It is an error to set the source more than once. This is to
+ prevent changing sources once the original source has been started;
+ this could break the state maintained by the DocumentSource.
+
+ @param pSource the underlying source to use
+ */
+ virtual void setSource(const intrusive_ptr<DocumentSource> &pSource);
+
+ /*
+ Attempt to coalesce this DocumentSource with its successor in the
+ document processing pipeline. If successful, the successor
+ DocumentSource should be removed from the pipeline and discarded.
+
+ If successful, this operation can be applied repeatedly, in an
+ attempt to coalesce several sources together.
+
+ The default implementation is to do nothing, and return false.
+
+ @param pNextSource the next source in the document processing chain.
+          @returns whether the attempt to coalesce was successful; if it
+            was not, nothing has been changed
+ */
+ virtual bool coalesce(const intrusive_ptr<DocumentSource> &pNextSource);
+
+ /*
+ Optimize the pipeline operation, if possible. This is a local
+ optimization that only looks within this DocumentSource. For best
+ results, first coalesce compatible sources using coalesce().
+
+          This is intended for operations that include expressions, and
+          provides a hook for them to optimize those expressions.
+
+ The default implementation is to do nothing.
+ */
+ virtual void optimize();
+
+ /*
+ Add the DocumentSource to the array builder.
+
+ The default implementation calls sourceToBson() in order to
+ convert the inner part of the object which will be added to the
+ array being built here.
+
+ @param pBuilder the array builder to add the operation to.
+ */
+ virtual void addToBsonArray(BSONArrayBuilder *pBuilder) const;
+
+ protected:
+ /*
+ Create an object that represents the document source. The object
+ will have a single field whose name is the source's name. This
+ will be used by the default implementation of addToBsonArray()
+ to add this object to a pipeline being represented in BSON.
+
+ @param pBuilder a blank object builder to write to
+ */
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const = 0;
+
+ /*
+ Most DocumentSources have an underlying source they get their data
+ from. This is a convenience for them.
+
+ The default implementation of setSource() sets this; if you don't
+ need a source, override that to assert(). The default is to
+ assert() if this has already been set.
+ */
+ intrusive_ptr<DocumentSource> pSource;
+ };
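+
+    /*
+      A typical consumption loop over any DocumentSource looks like this
+      (illustrative sketch; note that getCurrent() already yields the
+      first Document before any advance() call):
+
+          for(bool hasDoc = !pSource->eof(); hasDoc;
+                  hasDoc = pSource->advance()) {
+              intrusive_ptr<Document> pDoc(pSource->getCurrent());
+              // ... process pDoc ...
+          }
+    */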
+
+
+ class DocumentSourceBsonArray :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceBsonArray();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+ virtual void setSource(const intrusive_ptr<DocumentSource> &pSource);
+
+ /*
+ Create a document source based on a BSON array.
+
+ This is usually put at the beginning of a chain of document sources
+ in order to fetch data from the database.
+
+ CAUTION: the BSON is not read until the source is used. Any
+ elements that appear after these documents must not be read until
+ this source is exhausted.
+
+ @param pBsonElement the BSON array to treat as a document source
+ @returns the newly created document source
+ */
+ static intrusive_ptr<DocumentSourceBsonArray> create(
+ BSONElement *pBsonElement);
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceBsonArray(BSONElement *pBsonElement);
+
+ BSONObj embeddedObject;
+ BSONObjIterator arrayIterator;
+ BSONElement currentElement;
+ bool haveCurrent;
+ };
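+
+    /*
+      Illustrative sketch: given a command reply shaped like
+      { result: [ {...}, {...} ], ok: 1 }, the "result" element can seed
+      this source; this mirrors how DocumentSourceCommandFutures consumes
+      shard replies:
+
+          BSONElement arrayElement(replyObj["result"]);
+          intrusive_ptr<DocumentSourceBsonArray> pSource(
+              DocumentSourceBsonArray::create(&arrayElement));
+    */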
+
+
+ class DocumentSourceCommandFutures :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceCommandFutures();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+ virtual void setSource(const intrusive_ptr<DocumentSource> &pSource);
+
+ /* convenient shorthand for a commonly used type */
+ typedef list<shared_ptr<Future::CommandResult> > FuturesList;
+
+ /*
+          Create a DocumentSource that wraps a list of Future::CommandResults.
+
+ @param errmsg place to write error messages to; must exist for the
+ lifetime of the created DocumentSourceCommandFutures
+ @param pList the list of futures
+ */
+ static intrusive_ptr<DocumentSourceCommandFutures> create(
+ string &errmsg, FuturesList *pList);
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceCommandFutures(string &errmsg, FuturesList *pList);
+
+ /*
+ Advance to the next document, setting pCurrent appropriately.
+
+ Adjusts pCurrent, pBsonSource, and iterator, as needed. On exit,
+ pCurrent is the Document to return, or NULL. If NULL, this
+ indicates there is nothing more to return.
+ */
+ void getNextDocument();
+
+ bool newSource; // set to true for the first item of a new source
+ intrusive_ptr<DocumentSourceBsonArray> pBsonSource;
+ intrusive_ptr<Document> pCurrent;
+ FuturesList::iterator iterator;
+ FuturesList::iterator listEnd;
+ string &errmsg;
+ };
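+
+    /*
+      Illustrative sketch (the per-shard commands are assumed to have
+      been scheduled elsewhere; note that errmsg must outlive the
+      source):
+
+          string errmsg;
+          DocumentSourceCommandFutures::FuturesList futures;
+          // ... push_back one Future::CommandResult per shard ...
+          intrusive_ptr<DocumentSourceCommandFutures> pMerger(
+              DocumentSourceCommandFutures::create(errmsg, &futures));
+    */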
+
+
+ class DocumentSourceCursor :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceCursor();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+ virtual void setSource(const intrusive_ptr<DocumentSource> &pSource);
+
+ /*
+ Create a document source based on a cursor.
+
+ This is usually put at the beginning of a chain of document sources
+ in order to fetch data from the database.
+
+ @param pCursor the cursor to use to fetch data
+ */
+ static intrusive_ptr<DocumentSourceCursor> create(
+ const shared_ptr<Cursor> &pCursor);
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceCursor(const shared_ptr<Cursor> &pTheCursor);
+
+ void findNext();
+ shared_ptr<Cursor> pCursor;
+ intrusive_ptr<Document> pCurrent;
+ };
+
+
+ /*
+ This contains all the basic mechanics for filtering a stream of
+ Documents, except for the actual predicate evaluation itself. This was
+ factored out so we could create DocumentSources that use both Matcher
+ style predicates as well as full Expressions.
+ */
+ class DocumentSourceFilterBase :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceFilterBase();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a BSONObj suitable for Matcher construction.
+
+ This is used after filter analysis has moved as many filters to
+ as early a point as possible in the document processing pipeline.
+ See db/Matcher.h and the associated wiki documentation for the
+ format. This conversion is used to move back to the low-level
+ find() Cursor mechanism.
+
+ @param pBuilder the builder to write to
+ */
+ virtual void toMatcherBson(BSONObjBuilder *pBuilder) const = 0;
+
+ protected:
+ DocumentSourceFilterBase();
+
+ /*
+ Test the given document against the predicate and report if it
+ should be accepted or not.
+
+ @param pDocument the document to test
+ @returns true if the document matches the filter, false otherwise
+ */
+ virtual bool accept(const intrusive_ptr<Document> &pDocument) const = 0;
+
+ private:
+
+ void findNext();
+
+ bool unstarted;
+ bool hasNext;
+ intrusive_ptr<Document> pCurrent;
+ };
+
+
+ class DocumentSourceFilter :
+ public DocumentSourceFilterBase {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceFilter();
+ virtual bool coalesce(const intrusive_ptr<DocumentSource> &pNextSource);
+ virtual void optimize();
+
+ /*
+ Create a filter.
+
+ @param pBsonElement the raw BSON specification for the filter
+ @returns the filter
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Create a filter.
+
+ @param pFilter the expression to use to filter
+ @returns the filter
+ */
+ static intrusive_ptr<DocumentSourceFilter> create(
+ const intrusive_ptr<Expression> &pFilter);
+
+ /*
+ Create a BSONObj suitable for Matcher construction.
+
+ This is used after filter analysis has moved as many filters to
+ as early a point as possible in the document processing pipeline.
+ See db/Matcher.h and the associated wiki documentation for the
+ format. This conversion is used to move back to the low-level
+ find() Cursor mechanism.
+
+ @param pBuilder the builder to write to
+ */
+ void toMatcherBson(BSONObjBuilder *pBuilder) const;
+
+ static const char filterName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ // virtuals from DocumentSourceFilterBase
+ virtual bool accept(const intrusive_ptr<Document> &pDocument) const;
+
+ private:
+ DocumentSourceFilter(const intrusive_ptr<Expression> &pFilter);
+
+ intrusive_ptr<Expression> pFilter;
+ };
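+
+    /*
+      Illustrative sketch of coalescing adjacent filters (pExprA and
+      pExprB are assumed to be previously parsed Expressions):
+
+          intrusive_ptr<DocumentSourceFilter> pF1(
+              DocumentSourceFilter::create(pExprA));
+          intrusive_ptr<DocumentSourceFilter> pF2(
+              DocumentSourceFilter::create(pExprB));
+          if (pF1->coalesce(pF2)) {
+              // pF2 can be dropped from the pipeline; pF1 now filters
+              // on the conjunction of the two predicates
+          }
+    */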
+
+
+ class DocumentSourceGroup :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceGroup();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a new grouping DocumentSource.
+
+ @param pCtx the expression context
+ @returns the DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceGroup> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Set the Id Expression.
+
+          Documents that pass through this grouping source are grouped
+          according to this key. This will generate the _id field in the
+          result documents.
+
+ @param pExpression the group key
+ */
+ void setIdExpression(const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Add an accumulator.
+
+ Accumulators become fields in the Documents that result from
+          grouping. Each unique group document must have its own
+ accumulator; the accumulator factory is used to create that.
+
+ @param fieldName the name the accumulator result will have in the
+ result documents
+ @param pAccumulatorFactory used to create the accumulator for the
+ group field
+ */
+ void addAccumulator(string fieldName,
+ intrusive_ptr<Accumulator> (*pAccumulatorFactory)(
+ const intrusive_ptr<ExpressionContext> &),
+ const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Create a grouping DocumentSource from BSON.
+
+ This is a convenience method that uses the above, and operates on
+          a BSONElement that has been determined to be an Object with an
+          element named $group.
+
+          @param pBsonElement the BSONElement that defines the group
+ @param pCtx the expression context
+ @returns the grouping DocumentSource
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+
+ /*
+ Create a unifying group that can be used to combine group results
+ from shards.
+
+ @returns the grouping DocumentSource
+ */
+ intrusive_ptr<DocumentSource> createMerger();
+
+ static const char groupName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceGroup(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Before returning anything, this source must fetch everything from
+ the underlying source and group it. populate() is used to do that
+ on the first call to any method on this source. The populated
+ boolean indicates that this has been done.
+ */
+ void populate();
+ bool populated;
+
+ intrusive_ptr<Expression> pIdExpression;
+
+ typedef boost::unordered_map<intrusive_ptr<const Value>,
+ vector<intrusive_ptr<Accumulator> >, Value::Hash> GroupsType;
+ GroupsType groups;
+
+ /*
+ The field names for the result documents and the accumulator
+ factories for the result documents. The Expressions are the
+ common expressions used by each instance of each accumulator
+ in order to find the right-hand side of what gets added to the
+ accumulator. Note that each of those is the same for each group,
+ so we can share them across all groups by adding them to the
+ accumulators after we use the factories to make a new set of
+ accumulators for each new group.
+
+ These three vectors parallel each other.
+ */
+ vector<string> vFieldName;
+ vector<intrusive_ptr<Accumulator> (*)(
+ const intrusive_ptr<ExpressionContext> &)> vpAccumulatorFactory;
+ vector<intrusive_ptr<Expression> > vpExpression;
+
+
+ intrusive_ptr<Document> makeDocument(
+ const GroupsType::iterator &rIter);
+
+ GroupsType::iterator groupsIterator;
+ intrusive_ptr<Document> pCurrent;
+
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
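+
+    /*
+      Illustrative sketch of building a group by hand, roughly the
+      equivalent of { $group: { _id: ..., total: { $sum: ... } } }
+      (pIdExpr and pAmountExpr are assumed to be previously parsed
+      Expressions):
+
+          intrusive_ptr<DocumentSourceGroup> pGroup(
+              DocumentSourceGroup::create(pCtx));
+          pGroup->setIdExpression(pIdExpr);
+          pGroup->addAccumulator(
+              "total", AccumulatorSum::create, pAmountExpr);
+          pGroup->setSource(pUpstream);
+    */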
+
+
+ class DocumentSourceMatch :
+ public DocumentSourceFilterBase {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceMatch();
+
+ /*
+ Create a filter.
+
+ @param pBsonElement the raw BSON specification for the filter
+ @returns the filter
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Create a BSONObj suitable for Matcher construction.
+
+ This is used after filter analysis has moved as many filters to
+ as early a point as possible in the document processing pipeline.
+ See db/Matcher.h and the associated wiki documentation for the
+ format. This conversion is used to move back to the low-level
+ find() Cursor mechanism.
+
+ @param pBuilder the builder to write to
+ */
+ void toMatcherBson(BSONObjBuilder *pBuilder) const;
+
+ static const char matchName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ // virtuals from DocumentSourceFilterBase
+ virtual bool accept(const intrusive_ptr<Document> &pDocument) const;
+
+ private:
+ DocumentSourceMatch(const BSONObj &query);
+
+ Matcher matcher;
+ };
+
+
+ class DocumentSourceOut :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceOut();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a document source for output and pass-through.
+
+ This can be put anywhere in a pipeline and will store content as
+ well as pass it on.
+
+ @returns the newly created document source
+ */
+ static intrusive_ptr<DocumentSourceOut> createFromBson(
+ BSONElement *pBsonElement);
+
+ static const char outName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceOut(BSONElement *pBsonElement);
+ };
+
+
+ class DocumentSourceProject :
+ public DocumentSource,
+ public boost::enable_shared_from_this<DocumentSourceProject> {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceProject();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+ virtual void optimize();
+
+ /*
+ Create a new DocumentSource that can implement projection.
+
+ @returns the projection DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceProject> create();
+
+ /*
+ Include a field path in a projection.
+
+ @param fieldPath the path of the field to include
+ */
+ void includePath(const string &fieldPath);
+
+ /*
+ Exclude a field path from the projection.
+
+ @param fieldPath the path of the field to exclude
+ */
+ void excludePath(const string &fieldPath);
+
+ /*
+          Add an output Expression to the projection.
+
+ BSON document fields are ordered, so the new field will be
+ appended to the existing set.
+
+ @param fieldName the name of the field as it will appear
+ @param pExpression the expression used to compute the field
+ */
+ void addField(const string &fieldName,
+ const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Create a new projection DocumentSource from BSON.
+
+ This is a convenience for directly handling BSON, and relies on the
+ above methods.
+
+ @param pBsonElement the BSONElement with an object named $project
+ @returns the created projection
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ static const char projectName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceProject();
+
+ // configuration state
+ bool excludeId;
+ intrusive_ptr<ExpressionObject> pEO;
+ };
+
+
+ class DocumentSourceSort :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceSort();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+ /*
+ TODO
+ Adjacent sorts should reduce to the last sort.
+ virtual bool coalesce(const intrusive_ptr<DocumentSource> &pNextSource);
+ */
+
+ /*
+ Create a new sorting DocumentSource.
+
+ @param pCtx the expression context
+ @returns the DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceSort> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Add sort key field.
+
+ Adds a sort key field to the key being built up. A concatenated
+ key is built up by calling this repeatedly.
+
+ @param fieldPath the field path to the key component
+ @param ascending if true, use the key for an ascending sort,
+ otherwise, use it for descending
+ */
+ void addKey(const string &fieldPath, bool ascending);
+
+ /*
+ Write out an object whose contents are the sort key.
+
+ @param pBuilder initialized object builder.
+          @param usePrefix whether or not to include the field prefix
+ */
+ void sortKeyToBson(BSONObjBuilder *pBuilder, bool usePrefix) const;
+
+ /*
+ Create a sorting DocumentSource from BSON.
+
+ This is a convenience method that uses the above, and operates on
+          a BSONElement that has been determined to be an Object with an
+          element named $sort.
+
+          @param pBsonElement the BSONElement that defines the sort
+          @param pCtx the expression context
+          @returns the sorting DocumentSource
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+
+ static const char sortName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceSort(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Before returning anything, this source must fetch everything from
+ the underlying source and group it. populate() is used to do that
+ on the first call to any method on this source. The populated
+ boolean indicates that this has been done.
+ */
+ void populate();
+ bool populated;
+ long long count;
+
+ /* these two parallel each other */
+ vector<intrusive_ptr<ExpressionFieldPath> > vSortKey;
+ vector<bool> vAscending;
+
+ class Carrier {
+ public:
+ /*
+ We need access to the key for compares, so we have to carry
+ this around.
+ */
+ DocumentSourceSort *pSort;
+
+ intrusive_ptr<Document> pDocument;
+
+ Carrier(DocumentSourceSort *pSort,
+ const intrusive_ptr<Document> &pDocument);
+
+ static bool lessThan(const Carrier &rL, const Carrier &rR);
+ };
+
+ /*
+ Compare two documents according to the specified sort key.
+
+          @param pL reference to the left document
+          @param pR reference to the right document
+ @returns a number less than, equal to, or greater than zero,
+ indicating pL < pR, pL == pR, or pL > pR, respectively
+ */
+ int compare(const intrusive_ptr<Document> &pL,
+ const intrusive_ptr<Document> &pR);
+
+ typedef list<Carrier> ListType;
+ ListType documents;
+
+ ListType::iterator listIterator;
+ intrusive_ptr<Document> pCurrent;
+
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
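+
+    /*
+      Illustrative sketch: a compound key, roughly the equivalent of
+      { $sort: { a: 1, b: -1 } }, is built by repeated addKey() calls:
+
+          intrusive_ptr<DocumentSourceSort> pSort(
+              DocumentSourceSort::create(pCtx));
+          pSort->addKey("a", true);   // ascending
+          pSort->addKey("b", false);  // descending
+          pSort->setSource(pUpstream);
+    */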
+
+
+ class DocumentSourceLimit :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceLimit();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a new limiting DocumentSource.
+
+ @param pCtx the expression context
+ @returns the DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceLimit> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Create a limiting DocumentSource from BSON.
+
+ This is a convenience method that uses the above, and operates on
+          a BSONElement that has been determined to be an Object with an
+          element named $limit.
+
+          @param pBsonElement the BSONElement that defines the limit
+          @param pCtx the expression context
+          @returns the limiting DocumentSource
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+
+ static const char limitName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceLimit(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ long long limit;
+ long long count;
+ intrusive_ptr<Document> pCurrent;
+
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+ class DocumentSourceSkip :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceSkip();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a new skipping DocumentSource.
+
+ @param pCtx the expression context
+ @returns the DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceSkip> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Create a skipping DocumentSource from BSON.
+
+ This is a convenience method that uses the above, and operates on
+          a BSONElement that has been determined to be an Object with an
+          element named $skip.
+
+          @param pBsonElement the BSONElement that defines the skip
+          @param pCtx the expression context
+          @returns the skipping DocumentSource
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+
+ static const char skipName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceSkip(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Skips initial documents.
+ */
+ void skipper();
+
+ long long skip;
+ long long count;
+ intrusive_ptr<Document> pCurrent;
+
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+
+ class DocumentSourceUnwind :
+ public DocumentSource,
+ public boost::enable_shared_from_this<DocumentSourceUnwind> {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceUnwind();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a new DocumentSource that can implement unwind.
+
+          @returns the unwind DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceUnwind> create();
+
+ /*
+ Specify the field to unwind. There must be exactly one before
+ the pipeline begins execution.
+
+ @param rFieldPath - path to the field to unwind
+ */
+ void unwindField(const FieldPath &rFieldPath);
+
+ /*
+          Create a new unwind DocumentSource from BSON.
+
+          This is a convenience for directly handling BSON, and relies on
+          the above methods.
+
+          @param pBsonElement the BSONElement with an object named $unwind
+          @returns the created unwind DocumentSource
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ static const char unwindName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceUnwind();
+
+ // configuration state
+ FieldPath unwindPath;
+
+ vector<int> fieldIndex; /* for the current document, the indices
+ leading down to the field being unwound */
+
+ // iteration state
+ intrusive_ptr<Document> pNoUnwindDocument;
+ // document to return, pre-unwind
+ intrusive_ptr<const Value> pUnwindArray; // field being unwound
+ intrusive_ptr<ValueIterator> pUnwinder; // iterator used for unwinding
+ intrusive_ptr<const Value> pUnwindValue; // current value
+
+ /*
+ Clear all the state related to unwinding an array.
+ */
+ void resetArray();
+
+ /*
+ Clone the current document being unwound.
+
+ This is a partial deep clone. Because we're going to replace the
+ value at the end, we have to replace everything along the path
+ leading to that in order to not share that change with any other
+ clones (or the original) that we've made.
+
+ This expects pUnwindValue to have been set by a prior call to
+ advance(). However, pUnwindValue may also be NULL, in which case
+ the field will be removed -- this is the action for an empty
+ array.
+
+ @returns a partial deep clone of pNoUnwindDocument
+ */
+ intrusive_ptr<Document> clonePath() const;
+
+ };
+
+}
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline void DocumentSourceGroup::setIdExpression(
+ const intrusive_ptr<Expression> &pExpression) {
+ pIdExpression = pExpression;
+ }
+
+ inline void DocumentSourceUnwind::resetArray() {
+ pNoUnwindDocument.reset();
+ pUnwindArray.reset();
+ pUnwinder.reset();
+ pUnwindValue.reset();
+ }
+
+ inline DocumentSourceSort::Carrier::Carrier(
+ DocumentSourceSort *pTheSort,
+ const intrusive_ptr<Document> &pTheDocument):
+ pSort(pTheSort),
+ pDocument(pTheDocument) {
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_bson_array.cpp b/src/mongo/db/pipeline/document_source_bson_array.cpp
new file mode 100755
index 00000000000..5d187b03ef9
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_bson_array.cpp
@@ -0,0 +1,83 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/pipeline/document.h"
+
+namespace mongo {
+
+ DocumentSourceBsonArray::~DocumentSourceBsonArray() {
+ }
+
+ bool DocumentSourceBsonArray::eof() {
+ return !haveCurrent;
+ }
+
+ bool DocumentSourceBsonArray::advance() {
+ if (eof())
+ return false;
+
+ if (!arrayIterator.more()) {
+ haveCurrent = false;
+ return false;
+ }
+
+ currentElement = arrayIterator.next();
+ return true;
+ }
+
+ intrusive_ptr<Document> DocumentSourceBsonArray::getCurrent() {
+ assert(haveCurrent);
+ BSONObj documentObj(currentElement.Obj());
+ intrusive_ptr<Document> pDocument(
+ Document::createFromBsonObj(&documentObj));
+ return pDocument;
+ }
+
+ void DocumentSourceBsonArray::setSource(
+ const intrusive_ptr<DocumentSource> &pSource) {
+ /* this doesn't take a source */
+ assert(false);
+ }
+
+ DocumentSourceBsonArray::DocumentSourceBsonArray(
+ BSONElement *pBsonElement):
+ embeddedObject(pBsonElement->embeddedObject()),
+ arrayIterator(embeddedObject),
+ haveCurrent(false) {
+ if (arrayIterator.more()) {
+ currentElement = arrayIterator.next();
+ haveCurrent = true;
+ }
+ }
+
+ intrusive_ptr<DocumentSourceBsonArray> DocumentSourceBsonArray::create(
+ BSONElement *pBsonElement) {
+
+ assert(pBsonElement->type() == Array);
+ intrusive_ptr<DocumentSourceBsonArray> pSource(
+ new DocumentSourceBsonArray(pBsonElement));
+
+ return pSource;
+ }
+
+ void DocumentSourceBsonArray::sourceToBson(BSONObjBuilder *pBuilder) const {
+ assert(false); // this has no analog in the BSON world
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_command_futures.cpp b/src/mongo/db/pipeline/document_source_command_futures.cpp
new file mode 100755
index 00000000000..61a257cf16f
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_command_futures.cpp
@@ -0,0 +1,132 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+namespace mongo {
+
+ DocumentSourceCommandFutures::~DocumentSourceCommandFutures() {
+ }
+
+ bool DocumentSourceCommandFutures::eof() {
+ /* if we haven't even started yet, do so */
+ if (!pCurrent.get())
+ getNextDocument();
+
+ return (pCurrent.get() == NULL);
+ }
+
+ bool DocumentSourceCommandFutures::advance() {
+ if (eof())
+ return false;
+
+ /* advance */
+ getNextDocument();
+
+ return (pCurrent.get() != NULL);
+ }
+
+ intrusive_ptr<Document> DocumentSourceCommandFutures::getCurrent() {
+ assert(!eof());
+ return pCurrent;
+ }
+
+ void DocumentSourceCommandFutures::setSource(
+ const intrusive_ptr<DocumentSource> &pSource) {
+ /* this doesn't take a source */
+ assert(false);
+ }
+
+ void DocumentSourceCommandFutures::sourceToBson(
+ BSONObjBuilder *pBuilder) const {
+ /* this has no BSON equivalent */
+ assert(false);
+ }
+
+ DocumentSourceCommandFutures::DocumentSourceCommandFutures(
+ string &theErrmsg, FuturesList *pList):
+ newSource(false),
+ pBsonSource(),
+ pCurrent(),
+ iterator(pList->begin()),
+ listEnd(pList->end()),
+ errmsg(theErrmsg) {
+ }
+
+ intrusive_ptr<DocumentSourceCommandFutures>
+ DocumentSourceCommandFutures::create(
+ string &errmsg, FuturesList *pList) {
+ intrusive_ptr<DocumentSourceCommandFutures> pSource(
+ new DocumentSourceCommandFutures(errmsg, pList));
+ return pSource;
+ }
+
+ void DocumentSourceCommandFutures::getNextDocument() {
+ while(true) {
+ if (!pBsonSource.get()) {
+ /* if there aren't any more futures, we're done */
+ if (iterator == listEnd) {
+ pCurrent.reset();
+ return;
+ }
+
+ /* grab the next command result */
+ shared_ptr<Future::CommandResult> pResult(*iterator);
+ ++iterator;
+
+ /* try to wait for it */
+ if (!pResult->join()) {
+ error() << "sharded pipeline failed on shard: " <<
+ pResult->getServer() << " error: " <<
+ pResult->result() << endl;
+ errmsg += "-- mongod pipeline failed: ";
+ errmsg += pResult->result().toString();
+
+ /* move on to the next command future */
+ continue;
+ }
+
+ /* grab the result array out of the shard server's response */
+ BSONObj shardResult(pResult->result());
+ BSONObjIterator objIterator(shardResult);
+ while(objIterator.more()) {
+ BSONElement element(objIterator.next());
+ const char *pFieldName = element.fieldName();
+
+ /* find the result array and quit this loop */
+ if (strcmp(pFieldName, "result") == 0) {
+ pBsonSource = DocumentSourceBsonArray::create(&element);
+ newSource = true;
+ break;
+ }
+ }
+ }
+
+ /* if we're done with this shard's results, try the next */
+ if (pBsonSource->eof() ||
+ (!newSource && !pBsonSource->advance())) {
+ pBsonSource.reset();
+ continue;
+ }
+
+ pCurrent = pBsonSource->getCurrent();
+ newSource = false;
+ return;
+ }
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_filter.cpp b/src/mongo/db/pipeline/document_source_filter.cpp
new file mode 100755
index 00000000000..66e57ba2e93
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_filter.cpp
@@ -0,0 +1,98 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ const char DocumentSourceFilter::filterName[] = "$filter";
+
+ DocumentSourceFilter::~DocumentSourceFilter() {
+ }
+
+ bool DocumentSourceFilter::coalesce(
+ const intrusive_ptr<DocumentSource> &pNextSource) {
+
+ /* we only know how to coalesce other filters */
+ DocumentSourceFilter *pDocFilter =
+ dynamic_cast<DocumentSourceFilter *>(pNextSource.get());
+ if (!pDocFilter)
+ return false;
+
+ /*
+ Two adjacent filters can be combined by creating a conjunction of
+ their predicates.
+ */
+ intrusive_ptr<ExpressionNary> pAnd(ExpressionAnd::create());
+ pAnd->addOperand(pFilter);
+ pAnd->addOperand(pDocFilter->pFilter);
+ pFilter = pAnd;
+
+ return true;
+ }
+
+ void DocumentSourceFilter::optimize() {
+ pFilter = pFilter->optimize();
+ }
+
+ void DocumentSourceFilter::sourceToBson(BSONObjBuilder *pBuilder) const {
+ pFilter->addToBsonObj(pBuilder, filterName, 0);
+ }
+
+ bool DocumentSourceFilter::accept(
+ const intrusive_ptr<Document> &pDocument) const {
+ intrusive_ptr<const Value> pValue(pFilter->evaluate(pDocument));
+ return pValue->coerceToBool();
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceFilter::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15946, "a document filter expression must be an object",
+ pBsonElement->type() == Object);
+
+ Expression::ObjectCtx oCtx(0);
+ intrusive_ptr<Expression> pExpression(
+ Expression::parseObject(pBsonElement, &oCtx));
+ intrusive_ptr<DocumentSourceFilter> pFilter(
+ DocumentSourceFilter::create(pExpression));
+
+ return pFilter;
+ }
+
+ intrusive_ptr<DocumentSourceFilter> DocumentSourceFilter::create(
+ const intrusive_ptr<Expression> &pFilter) {
+ intrusive_ptr<DocumentSourceFilter> pSource(
+ new DocumentSourceFilter(pFilter));
+ return pSource;
+ }
+
+ DocumentSourceFilter::DocumentSourceFilter(
+ const intrusive_ptr<Expression> &pTheFilter):
+ DocumentSourceFilterBase(),
+ pFilter(pTheFilter) {
+ }
+
+ void DocumentSourceFilter::toMatcherBson(BSONObjBuilder *pBuilder) const {
+ pFilter->toMatcherBson(pBuilder, 0);
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_filter_base.cpp b/src/mongo/db/pipeline/document_source_filter_base.cpp
new file mode 100755
index 00000000000..dbda34b7151
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_filter_base.cpp
@@ -0,0 +1,85 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ DocumentSourceFilterBase::~DocumentSourceFilterBase() {
+ }
+
+ void DocumentSourceFilterBase::findNext() {
+ /* only do this the first time */
+ if (unstarted) {
+ hasNext = !pSource->eof();
+ unstarted = false;
+ }
+
+ while(hasNext) {
+ boost::intrusive_ptr<Document> pDocument(pSource->getCurrent());
+ hasNext = pSource->advance();
+
+ if (accept(pDocument)) {
+ pCurrent = pDocument;
+ return;
+ }
+ }
+
+ pCurrent.reset();
+ }
+
+ bool DocumentSourceFilterBase::eof() {
+ if (unstarted)
+ findNext();
+
+ return (pCurrent.get() == NULL);
+ }
+
+ bool DocumentSourceFilterBase::advance() {
+ if (unstarted)
+ findNext();
+
+ /*
+ This looks weird after the above, but is correct. Note that calling
+ getCurrent() when first starting already yields the first document
+ in the collection. Calling advance() without using getCurrent()
+ first will skip over the first item.
+ */
+ findNext();
+
+ return (pCurrent.get() != NULL);
+ }
+
+ boost::intrusive_ptr<Document> DocumentSourceFilterBase::getCurrent() {
+ if (unstarted)
+ findNext();
+
+ assert(pCurrent.get() != NULL);
+ return pCurrent;
+ }
+
+ DocumentSourceFilterBase::DocumentSourceFilterBase():
+ unstarted(true),
+ hasNext(false),
+ pCurrent() {
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_group.cpp b/src/mongo/db/pipeline/document_source_group.cpp
new file mode 100755
index 00000000000..244561589da
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_group.cpp
@@ -0,0 +1,391 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/accumulator.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+ const char DocumentSourceGroup::groupName[] = "$group";
+
+ DocumentSourceGroup::~DocumentSourceGroup() {
+ }
+
+ bool DocumentSourceGroup::eof() {
+ if (!populated)
+ populate();
+
+ return (groupsIterator == groups.end());
+ }
+
+ bool DocumentSourceGroup::advance() {
+ if (!populated)
+ populate();
+
+ assert(groupsIterator != groups.end());
+
+ ++groupsIterator;
+ if (groupsIterator == groups.end()) {
+ pCurrent.reset();
+ return false;
+ }
+
+ pCurrent = makeDocument(groupsIterator);
+ return true;
+ }
+
+ intrusive_ptr<Document> DocumentSourceGroup::getCurrent() {
+ if (!populated)
+ populate();
+
+ return pCurrent;
+ }
+
+ void DocumentSourceGroup::sourceToBson(BSONObjBuilder *pBuilder) const {
+ BSONObjBuilder insides;
+
+ /* add the _id */
+ pIdExpression->addToBsonObj(&insides, Document::idName.c_str(), 0);
+
+ /* add the remaining fields */
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<Accumulator> pA((*vpAccumulatorFactory[i])(pCtx));
+ pA->addOperand(vpExpression[i]);
+ pA->addToBsonObj(&insides, vFieldName[i], 0);
+ }
+
+ pBuilder->append(groupName, insides.done());
+ }
+
+ intrusive_ptr<DocumentSourceGroup> DocumentSourceGroup::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<DocumentSourceGroup> pSource(
+ new DocumentSourceGroup(pCtx));
+ return pSource;
+ }
+
+ DocumentSourceGroup::DocumentSourceGroup(
+ const intrusive_ptr<ExpressionContext> &pTheCtx):
+ populated(false),
+ pIdExpression(),
+ groups(),
+ vFieldName(),
+ vpAccumulatorFactory(),
+ vpExpression(),
+ pCtx(pTheCtx) {
+ }
+
+ void DocumentSourceGroup::addAccumulator(
+ string fieldName,
+ intrusive_ptr<Accumulator> (*pAccumulatorFactory)(
+ const intrusive_ptr<ExpressionContext> &),
+ const intrusive_ptr<Expression> &pExpression) {
+ vFieldName.push_back(fieldName);
+ vpAccumulatorFactory.push_back(pAccumulatorFactory);
+ vpExpression.push_back(pExpression);
+ }
+
+
+ struct GroupOpDesc {
+ const char *pName;
+ intrusive_ptr<Accumulator> (*pFactory)(
+ const intrusive_ptr<ExpressionContext> &);
+ };
+
+ static int GroupOpDescCmp(const void *pL, const void *pR) {
+ return strcmp(((const GroupOpDesc *)pL)->pName,
+ ((const GroupOpDesc *)pR)->pName);
+ }
+
+ /*
+ Keep these sorted alphabetically so we can bsearch() them using
+ GroupOpDescCmp() above.
+ */
+ static const GroupOpDesc GroupOpTable[] = {
+ {"$addToSet", AccumulatorAddToSet::create},
+ {"$avg", AccumulatorAvg::create},
+ {"$first", AccumulatorFirst::create},
+ {"$last", AccumulatorLast::create},
+ {"$max", AccumulatorMinMax::createMax},
+ {"$min", AccumulatorMinMax::createMin},
+ {"$push", AccumulatorPush::create},
+ {"$sum", AccumulatorSum::create},
+ };
+
+ static const size_t NGroupOp = sizeof(GroupOpTable)/sizeof(GroupOpTable[0]);
+
+ intrusive_ptr<DocumentSource> DocumentSourceGroup::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15947, "a group's fields must be specified in an object",
+ pBsonElement->type() == Object);
+
+ intrusive_ptr<DocumentSourceGroup> pGroup(
+ DocumentSourceGroup::create(pCtx));
+ bool idSet = false;
+
+ BSONObj groupObj(pBsonElement->Obj());
+ BSONObjIterator groupIterator(groupObj);
+ while(groupIterator.more()) {
+ BSONElement groupField(groupIterator.next());
+ const char *pFieldName = groupField.fieldName();
+
+ if (strcmp(pFieldName, Document::idName.c_str()) == 0) {
+ uassert(15948, "a group's _id may only be specified once",
+ !idSet);
+
+ BSONType groupType = groupField.type();
+
+ if (groupType == Object) {
+ /*
+ Use the projection-like set of field paths to create the
+ group-by key.
+ */
+ Expression::ObjectCtx oCtx(
+ Expression::ObjectCtx::DOCUMENT_OK);
+ intrusive_ptr<Expression> pId(
+ Expression::parseObject(&groupField, &oCtx));
+
+ pGroup->setIdExpression(pId);
+ idSet = true;
+ }
+ else if (groupType == String) {
+ string groupString(groupField.String());
+ const char *pGroupString = groupString.c_str();
+ if ((groupString.length() == 0) ||
+ (pGroupString[0] != '$'))
+ goto StringConstantId;
+
+ string pathString(
+ Expression::removeFieldPrefix(groupString));
+ intrusive_ptr<ExpressionFieldPath> pFieldPath(
+ ExpressionFieldPath::create(pathString));
+ pGroup->setIdExpression(pFieldPath);
+ idSet = true;
+ }
+ else {
+ /* pick out the constant types that are allowed */
+ switch(groupType) {
+ case NumberDouble:
+ case String:
+ case Object:
+ case Array:
+ case jstOID:
+ case Bool:
+ case Date:
+ case NumberInt:
+ case Timestamp:
+ case NumberLong:
+ case jstNULL:
+ StringConstantId: // from string case above
+ {
+ intrusive_ptr<const Value> pValue(
+ Value::createFromBsonElement(&groupField));
+ intrusive_ptr<ExpressionConstant> pConstant(
+ ExpressionConstant::create(pValue));
+ pGroup->setIdExpression(pConstant);
+ idSet = true;
+ break;
+ }
+
+ default:
+ uassert(15949, str::stream() <<
+ "a group's _id may not include fields of BSON type " << groupType,
+ false);
+ }
+ }
+ }
+ else {
+ /*
+ Treat as a projection field with the additional ability to
+ add aggregation operators.
+ */
+                uassert(15950, str::stream() <<
+                        "the group aggregate field name \"" <<
+                        pFieldName << "\" cannot be an operator name",
+                        *pFieldName != '$');
+
+                uassert(15951, str::stream() <<
+                        "the group aggregate field \"" << pFieldName <<
+                        "\" must be defined as an expression inside an object",
+                        groupField.type() == Object);
+
+ BSONObj subField(groupField.Obj());
+ BSONObjIterator subIterator(subField);
+ size_t subCount = 0;
+ for(; subIterator.more(); ++subCount) {
+ BSONElement subElement(subIterator.next());
+
+ /* look for the specified operator */
+ GroupOpDesc key;
+ key.pName = subElement.fieldName();
+ const GroupOpDesc *pOp =
+ (const GroupOpDesc *)bsearch(
+ &key, GroupOpTable, NGroupOp, sizeof(GroupOpDesc),
+ GroupOpDescCmp);
+
+ uassert(15952, str::stream() <<
+ "unknown group operator \"" <<
+ key.pName << "\"",
+ pOp);
+
+ intrusive_ptr<Expression> pGroupExpr;
+
+ BSONType elementType = subElement.type();
+ if (elementType == Object) {
+ Expression::ObjectCtx oCtx(
+ Expression::ObjectCtx::DOCUMENT_OK);
+ pGroupExpr = Expression::parseObject(
+ &subElement, &oCtx);
+ }
+ else if (elementType == Array) {
+ uassert(15953, str::stream() <<
+ "aggregating group operators are unary (" <<
+ key.pName << ")", false);
+ }
+                else { /* assume it's an atomic single operand */
+ pGroupExpr = Expression::parseOperand(&subElement);
+ }
+
+ pGroup->addAccumulator(
+ pFieldName, pOp->pFactory, pGroupExpr);
+ }
+
+ uassert(15954, str::stream() <<
+ "the computed aggregate \"" <<
+ pFieldName << "\" must specify exactly one operator",
+ subCount == 1);
+ }
+ }
+
+ uassert(15955, "a group specification must include an _id", idSet);
+
+ return pGroup;
+ }
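+
+    /*
+      For reference, a $group specification this parser accepts looks
+      like (field names illustrative only):
+
+        { $group: {
+            _id: "$author",
+            docsPerAuthor: { $sum: 1 },
+            titles: { $push: "$title" }
+        } }
+
+      "_id" may be a field path (as here), an allowed constant, or a
+      document of expressions; every other field must be an object with
+      exactly one accumulator from GroupOpTable.
+    */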
+
+ void DocumentSourceGroup::populate() {
+ for(bool hasNext = !pSource->eof(); hasNext;
+ hasNext = pSource->advance()) {
+ intrusive_ptr<Document> pDocument(pSource->getCurrent());
+
+ /* get the _id document */
+ intrusive_ptr<const Value> pId(pIdExpression->evaluate(pDocument));
+ uassert(15956, "the _id field for a group must not be undefined",
+ pId->getType() != Undefined);
+
+ /*
+ Look for the _id value in the map; if it's not there, add a
+ new entry with a blank accumulator.
+ */
+ vector<intrusive_ptr<Accumulator> > *pGroup;
+ GroupsType::iterator it(groups.find(pId));
+ if (it != groups.end()) {
+ /* point at the existing accumulators */
+ pGroup = &it->second;
+ }
+ else {
+ /* insert a new group into the map */
+ groups.insert(it,
+ pair<intrusive_ptr<const Value>,
+ vector<intrusive_ptr<Accumulator> > >(
+ pId, vector<intrusive_ptr<Accumulator> >()));
+
+ /* find the accumulator vector (the map value) */
+ it = groups.find(pId);
+ pGroup = &it->second;
+
+ /* add the accumulators */
+ const size_t n = vpAccumulatorFactory.size();
+ pGroup->reserve(n);
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<Accumulator> pAccumulator(
+ (*vpAccumulatorFactory[i])(pCtx));
+ pAccumulator->addOperand(vpExpression[i]);
+ pGroup->push_back(pAccumulator);
+ }
+ }
+
+ /* point at the existing key */
+ // unneeded atm // pId = it.first;
+
+ /* tickle all the accumulators for the group we found */
+ const size_t n = pGroup->size();
+ for(size_t i = 0; i < n; ++i)
+ (*pGroup)[i]->evaluate(pDocument);
+ }
+
+ /* start the group iterator */
+ groupsIterator = groups.begin();
+ if (groupsIterator != groups.end())
+ pCurrent = makeDocument(groupsIterator);
+ populated = true;
+ }
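+
+    /*
+      Illustrative result: grouping { a: 1, b: 1 }, { a: 2, b: 2 },
+      { a: 1, b: 3 } with _id: "$a" and total: { $sum: "$b" } leaves
+      "groups" holding two entries,
+        1 -> [ $sum accumulator holding 4 ]
+        2 -> [ $sum accumulator holding 2 ]
+      with groupsIterator positioned at the first of them.
+    */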
+
+ intrusive_ptr<Document> DocumentSourceGroup::makeDocument(
+ const GroupsType::iterator &rIter) {
+ vector<intrusive_ptr<Accumulator> > *pGroup = &rIter->second;
+ const size_t n = vFieldName.size();
+ intrusive_ptr<Document> pResult(Document::create(1 + n));
+
+ /* add the _id field */
+ pResult->addField(Document::idName, rIter->first);
+
+ /* add the rest of the fields */
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue((*pGroup)[i]->getValue());
+ if (pValue->getType() != Undefined)
+ pResult->addField(vFieldName[i], pValue);
+ }
+
+ return pResult;
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceGroup::createMerger() {
+ intrusive_ptr<DocumentSourceGroup> pMerger(
+ DocumentSourceGroup::create(pCtx));
+
+ /* the merger will use the same grouping key */
+ pMerger->setIdExpression(ExpressionFieldPath::create(
+ Document::idName.c_str()));
+
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ /*
+ The merger's output field names will be the same, as will the
+ accumulator factories. However, for some accumulators, the
+ expression to be accumulated will be different. The original
+ accumulator may be collecting an expression based on a field
+ expression or constant. Here, we accumulate the output of the
+ same name from the prior group.
+ */
+ pMerger->addAccumulator(
+ vFieldName[i], vpAccumulatorFactory[i],
+ ExpressionFieldPath::create(vFieldName[i]));
+ }
+
+ return pMerger;
+ }
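+
+    /*
+      Illustrative example: if this group was
+        { $group: { _id: "$author", n: { $sum: 1 } } },
+      the merger built here behaves like
+        { $group: { _id: "$_id", n: { $sum: "$n" } } },
+      re-grouping the shards' partial results by their already-computed
+      _id values and accumulating the partial sums.
+    */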
+}
+
+
diff --git a/src/mongo/db/pipeline/document_source_limit.cpp b/src/mongo/db/pipeline/document_source_limit.cpp
new file mode 100644
index 00000000000..a73d4da2005
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_limit.cpp
@@ -0,0 +1,83 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+ const char DocumentSourceLimit::limitName[] = "$limit";
+
+ DocumentSourceLimit::DocumentSourceLimit(const intrusive_ptr<ExpressionContext> &pTheCtx):
+ limit(0),
+ count(0),
+ pCtx(pTheCtx) {
+ }
+
+ DocumentSourceLimit::~DocumentSourceLimit() {
+ }
+
+ bool DocumentSourceLimit::eof() {
+ return pSource->eof() || count >= limit;
+ }
+
+ bool DocumentSourceLimit::advance() {
+ ++count;
+ if (count >= limit) {
+ pCurrent.reset();
+ return false;
+ }
+ pCurrent = pSource->getCurrent();
+ return pSource->advance();
+ }
+
+ intrusive_ptr<Document> DocumentSourceLimit::getCurrent() {
+ return pSource->getCurrent();
+ }
+
+ void DocumentSourceLimit::sourceToBson(BSONObjBuilder *pBuilder) const {
+ pBuilder->append("$limit", limit);
+ }
+
+ intrusive_ptr<DocumentSourceLimit> DocumentSourceLimit::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<DocumentSourceLimit> pSource(
+ new DocumentSourceLimit(pCtx));
+ return pSource;
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceLimit::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15957, "the limit must be specified as a number",
+ pBsonElement->isNumber());
+
+ intrusive_ptr<DocumentSourceLimit> pLimit(
+ DocumentSourceLimit::create(pCtx));
+
+ pLimit->limit = (int)pBsonElement->numberLong();
+ uassert(15958, "the limit must be positive",
+ pLimit->limit > 0);
+
+ return pLimit;
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_match.cpp b/src/mongo/db/pipeline/document_source_match.cpp
new file mode 100755
index 00000000000..bedac3ef717
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_match.cpp
@@ -0,0 +1,80 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/matcher.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+
+namespace mongo {
+
+ const char DocumentSourceMatch::matchName[] = "$match";
+
+ DocumentSourceMatch::~DocumentSourceMatch() {
+ }
+
+ void DocumentSourceMatch::sourceToBson(BSONObjBuilder *pBuilder) const {
+ const BSONObj *pQuery = matcher.getQuery();
+ pBuilder->append(matchName, *pQuery);
+ }
+
+ bool DocumentSourceMatch::accept(
+ const intrusive_ptr<Document> &pDocument) const {
+
+ /*
+ The matcher only takes BSON documents, so we have to make one.
+
+ LATER
+ We could optimize this by making a document with only the
+ fields referenced by the Matcher. We could do this by looking inside
+ the Matcher's BSON before it is created, and recording those. The
+ easiest implementation might be to hold onto an ExpressionDocument
+          in here, give it pDocument to create a subset document containing
+          only those fields, and then convert that instead.
+ */
+ BSONObjBuilder objBuilder;
+ pDocument->toBson(&objBuilder);
+ BSONObj obj(objBuilder.done());
+
+ return matcher.matches(obj);
+ }
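+
+    /*
+      Illustrative round trip: for { $match: { a: { $gt: 5 } } }, each
+      pipeline Document is serialized back to BSON and handed to the
+      Matcher, which evaluates it exactly as the query { a: { $gt: 5 } }
+      would be evaluated against a stored document.
+    */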
+
+ intrusive_ptr<DocumentSource> DocumentSourceMatch::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15959, "the match filter must be an expression in an object",
+ pBsonElement->type() == Object);
+
+ intrusive_ptr<DocumentSourceMatch> pMatcher(
+ new DocumentSourceMatch(pBsonElement->Obj()));
+
+ return pMatcher;
+ }
+
+ void DocumentSourceMatch::toMatcherBson(BSONObjBuilder *pBuilder) const {
+ const BSONObj *pQuery = matcher.getQuery();
+ pBuilder->appendElements(*pQuery);
+ }
+
+ DocumentSourceMatch::DocumentSourceMatch(const BSONObj &query):
+ DocumentSourceFilterBase(),
+ matcher(query) {
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_out.cpp b/src/mongo/db/pipeline/document_source_out.cpp
new file mode 100755
index 00000000000..5a30342d25c
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_out.cpp
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+
+namespace mongo {
+
+ const char DocumentSourceOut::outName[] = "$out";
+
+ DocumentSourceOut::~DocumentSourceOut() {
+ }
+
+ bool DocumentSourceOut::eof() {
+ return pSource->eof();
+ }
+
+ bool DocumentSourceOut::advance() {
+ return pSource->advance();
+ }
+
+ boost::intrusive_ptr<Document> DocumentSourceOut::getCurrent() {
+ return pSource->getCurrent();
+ }
+
+ DocumentSourceOut::DocumentSourceOut(BSONElement *pBsonElement) {
+ assert(false && "unimplemented");
+ }
+
+ intrusive_ptr<DocumentSourceOut> DocumentSourceOut::createFromBson(
+ BSONElement *pBsonElement) {
+ intrusive_ptr<DocumentSourceOut> pSource(
+ new DocumentSourceOut(pBsonElement));
+
+ return pSource;
+ }
+
+ void DocumentSourceOut::sourceToBson(BSONObjBuilder *pBuilder) const {
+ assert(false); // CW TODO
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_project.cpp b/src/mongo/db/pipeline/document_source_project.cpp
new file mode 100755
index 00000000000..bb7a0b5a6d9
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_project.cpp
@@ -0,0 +1,201 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ const char DocumentSourceProject::projectName[] = "$project";
+
+ DocumentSourceProject::~DocumentSourceProject() {
+ }
+
+ DocumentSourceProject::DocumentSourceProject():
+ excludeId(false),
+ pEO(ExpressionObject::create()) {
+ }
+
+ bool DocumentSourceProject::eof() {
+ return pSource->eof();
+ }
+
+ bool DocumentSourceProject::advance() {
+ return pSource->advance();
+ }
+
+ intrusive_ptr<Document> DocumentSourceProject::getCurrent() {
+ intrusive_ptr<Document> pInDocument(pSource->getCurrent());
+
+ /* create the result document */
+ const size_t sizeHint =
+ pEO->getSizeHint(pInDocument) + (excludeId ? 0 : 1);
+ intrusive_ptr<Document> pResultDocument(Document::create(sizeHint));
+
+ if (!excludeId) {
+ intrusive_ptr<const Value> pId(
+ pInDocument->getField(Document::idName));
+ pResultDocument->addField(Document::idName, pId);
+ }
+
+ /* use the ExpressionObject to create the base result */
+ pEO->addToDocument(pResultDocument, pInDocument);
+
+ return pResultDocument;
+ }
+
+ void DocumentSourceProject::optimize() {
+ intrusive_ptr<Expression> pE(pEO->optimize());
+ pEO = dynamic_pointer_cast<ExpressionObject>(pE);
+ }
+
+ void DocumentSourceProject::sourceToBson(BSONObjBuilder *pBuilder) const {
+ BSONObjBuilder insides;
+ if (excludeId)
+ insides.append(Document::idName, false);
+ pEO->documentToBson(&insides, 0);
+ pBuilder->append(projectName, insides.done());
+ }
+
+ intrusive_ptr<DocumentSourceProject> DocumentSourceProject::create() {
+ intrusive_ptr<DocumentSourceProject> pSource(
+ new DocumentSourceProject());
+ return pSource;
+ }
+
+ void DocumentSourceProject::addField(
+ const string &fieldName, const intrusive_ptr<Expression> &pExpression) {
+ uassert(15960,
+ "projection fields must be defined by non-empty expressions",
+ pExpression);
+
+ pEO->addField(fieldName, pExpression);
+ }
+
+ void DocumentSourceProject::includePath(const string &fieldPath) {
+ if (Document::idName.compare(fieldPath) == 0) {
+ uassert(15961, str::stream() << projectName <<
+ ": _id cannot be included once it has been excluded",
+ !excludeId);
+
+ return;
+ }
+
+ pEO->includePath(fieldPath);
+ }
+
+ void DocumentSourceProject::excludePath(const string &fieldPath) {
+ if (Document::idName.compare(fieldPath) == 0) {
+ excludeId = true;
+ return;
+ }
+
+ pEO->excludePath(fieldPath);
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceProject::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ /* validate */
+ uassert(15969, str::stream() << projectName <<
+ " specification must be an object",
+ pBsonElement->type() == Object);
+
+ /* chain the projection onto the original source */
+ intrusive_ptr<DocumentSourceProject> pProject(
+ DocumentSourceProject::create());
+
+ /*
+ Pull out the $project object. This should just be a list of
+ field inclusion or exclusion specifications. Note you can't do
+ both, except for the case of _id.
+ */
+ BSONObj projectObj(pBsonElement->Obj());
+ BSONObjIterator fieldIterator(projectObj);
+ Expression::ObjectCtx objectCtx(
+ Expression::ObjectCtx::DOCUMENT_OK);
+ while(fieldIterator.more()) {
+ BSONElement outFieldElement(fieldIterator.next());
+ string outFieldPath(outFieldElement.fieldName());
+ string inFieldName(outFieldPath);
+ BSONType specType = outFieldElement.type();
+ int fieldInclusion = -1;
+
+ switch(specType) {
+ case NumberDouble: {
+ double inclusion = outFieldElement.numberDouble();
+ fieldInclusion = static_cast<int>(inclusion);
+ goto IncludeExclude;
+ }
+
+ case NumberInt:
+ /* just a plain integer include/exclude specification */
+ fieldInclusion = outFieldElement.numberInt();
+
+IncludeExclude:
+ uassert(15970, str::stream() <<
+ "field inclusion or exclusion specification for \"" <<
+ outFieldPath <<
+ "\" must be true, 1, false, or zero",
+ ((fieldInclusion == 0) || (fieldInclusion == 1)));
+
+ if (fieldInclusion == 0)
+ pProject->excludePath(outFieldPath);
+ else
+ pProject->includePath(outFieldPath);
+ break;
+
+ case Bool:
+ /* just a plain boolean include/exclude specification */
+ fieldInclusion = (outFieldElement.Bool() ? 1 : 0);
+ goto IncludeExclude;
+
+ case String:
+ /* include a field, with rename */
+ fieldInclusion = 1;
+ inFieldName = outFieldElement.String();
+ pProject->addField(
+ outFieldPath,
+ ExpressionFieldPath::create(
+ Expression::removeFieldPrefix(inFieldName)));
+ break;
+
+ case Object: {
+ intrusive_ptr<Expression> pDocument(
+ Expression::parseObject(&outFieldElement, &objectCtx));
+
+                /* add the document expression to the projection */
+ pProject->addField(outFieldPath, pDocument);
+ break;
+ }
+
+ default:
+ uassert(15971, str::stream() <<
+ "invalid BSON type (" << specType <<
+ ") for " << projectName <<
+ " field " << outFieldPath, false);
+ }
+
+ }
+
+ return pProject;
+ }
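+
+    /*
+      For reference, a $project specification mixing the forms handled
+      above (field names illustrative only):
+
+        { $project: {
+            _id: 0,                             // exclusion (_id only)
+            title: 1,                           // inclusion
+            author: "$meta.author",             // inclusion with rename
+            pages: { $add: ["$pageCount", 1] }  // computed field
+        } }
+    */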
+}
diff --git a/src/mongo/db/pipeline/document_source_skip.cpp b/src/mongo/db/pipeline/document_source_skip.cpp
new file mode 100644
index 00000000000..74bf2360ce9
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_skip.cpp
@@ -0,0 +1,99 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+ const char DocumentSourceSkip::skipName[] = "$skip";
+
+ DocumentSourceSkip::DocumentSourceSkip(const intrusive_ptr<ExpressionContext> &pTheCtx):
+ skip(0),
+ count(0),
+ pCtx(pTheCtx) {
+ }
+
+ DocumentSourceSkip::~DocumentSourceSkip() {
+ }
+
+ void DocumentSourceSkip::skipper() {
+ if (count == 0) {
+ while (!pSource->eof() && count++ < skip) {
+ pSource->advance();
+ }
+ }
+
+ if (pSource->eof()) {
+ pCurrent.reset();
+ return;
+ }
+
+ pCurrent = pSource->getCurrent();
+ }
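+
+    /*
+      Illustrative trace for { $skip: 2 } over documents d0, d1, d2, ...:
+      the first call advances past d0 and d1, after which pCurrent is d2
+      and the usual eof()/advance()/getCurrent() protocol resumes there.
+    */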
+
+ bool DocumentSourceSkip::eof() {
+ skipper();
+ return pSource->eof();
+ }
+
+ bool DocumentSourceSkip::advance() {
+ if (eof()) {
+ pCurrent.reset();
+ return false;
+ }
+
+ pCurrent = pSource->getCurrent();
+ return pSource->advance();
+ }
+
+ intrusive_ptr<Document> DocumentSourceSkip::getCurrent() {
+ skipper();
+ return pCurrent;
+ }
+
+ void DocumentSourceSkip::sourceToBson(BSONObjBuilder *pBuilder) const {
+ pBuilder->append("$skip", skip);
+ }
+
+ intrusive_ptr<DocumentSourceSkip> DocumentSourceSkip::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<DocumentSourceSkip> pSource(
+ new DocumentSourceSkip(pCtx));
+ return pSource;
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceSkip::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15972, str::stream() << "the value to " <<
+ skipName << " must be a number", pBsonElement->isNumber());
+
+ intrusive_ptr<DocumentSourceSkip> pSkip(
+ DocumentSourceSkip::create(pCtx));
+
+ pSkip->skip = (int)pBsonElement->numberLong();
+ assert(pSkip->skip > 0); // CW TODO error code
+
+ return pSkip;
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_sort.cpp b/src/mongo/db/pipeline/document_source_sort.cpp
new file mode 100755
index 00000000000..bf4739af7d1
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_sort.cpp
@@ -0,0 +1,216 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/doc_mem_monitor.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+
+namespace mongo {
+ const char DocumentSourceSort::sortName[] = "$sort";
+
+ DocumentSourceSort::~DocumentSourceSort() {
+ }
+
+ bool DocumentSourceSort::eof() {
+ if (!populated)
+ populate();
+
+ return (listIterator == documents.end());
+ }
+
+ bool DocumentSourceSort::advance() {
+ if (!populated)
+ populate();
+
+ assert(listIterator != documents.end());
+
+ ++listIterator;
+ if (listIterator == documents.end()) {
+ pCurrent.reset();
+ return false;
+ }
+ pCurrent = listIterator->pDocument;
+
+ return true;
+ }
+
+ intrusive_ptr<Document> DocumentSourceSort::getCurrent() {
+ if (!populated)
+ populate();
+
+ return pCurrent;
+ }
+
+ void DocumentSourceSort::sourceToBson(BSONObjBuilder *pBuilder) const {
+ BSONObjBuilder insides;
+ sortKeyToBson(&insides, false);
+ pBuilder->append(sortName, insides.done());
+ }
+
+ intrusive_ptr<DocumentSourceSort> DocumentSourceSort::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<DocumentSourceSort> pSource(
+ new DocumentSourceSort(pCtx));
+ return pSource;
+ }
+
+ DocumentSourceSort::DocumentSourceSort(
+ const intrusive_ptr<ExpressionContext> &pTheCtx):
+ populated(false),
+ pCtx(pTheCtx) {
+ }
+
+ void DocumentSourceSort::addKey(const string &fieldPath, bool ascending) {
+ intrusive_ptr<ExpressionFieldPath> pE(
+ ExpressionFieldPath::create(fieldPath));
+ vSortKey.push_back(pE);
+ vAscending.push_back(ascending);
+ }
+
+ void DocumentSourceSort::sortKeyToBson(
+ BSONObjBuilder *pBuilder, bool usePrefix) const {
+ /* add the key fields */
+ const size_t n = vSortKey.size();
+ for(size_t i = 0; i < n; ++i) {
+ /* create the "field name" */
+ stringstream ss;
+ vSortKey[i]->writeFieldPath(ss, usePrefix);
+
+ /* append a named integer based on the sort order */
+ pBuilder->append(ss.str(), (vAscending[i] ? 1 : -1));
+ }
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceSort::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15973, str::stream() << " the " <<
+ sortName << " key specification must be an object",
+ pBsonElement->type() == Object);
+
+ intrusive_ptr<DocumentSourceSort> pSort(
+ DocumentSourceSort::create(pCtx));
+
+        /* check for, then iterate over, the sort object */
+ size_t sortKeys = 0;
+ for(BSONObjIterator keyIterator(pBsonElement->Obj().begin());
+ keyIterator.more();) {
+ BSONElement keyField(keyIterator.next());
+ const char *pKeyFieldName = keyField.fieldName();
+ int sortOrder = 0;
+
+ uassert(15974, str::stream() << sortName <<
+ " key ordering must be specified using a number",
+ keyField.isNumber());
+ sortOrder = (int)keyField.numberInt();
+
+ uassert(15975, str::stream() << sortName <<
+ " key ordering must be 1 (for ascending) or -1 (for descending",
+ ((sortOrder == 1) || (sortOrder == -1)));
+
+ pSort->addKey(pKeyFieldName, (sortOrder > 0));
+ ++sortKeys;
+ }
+
+ uassert(15976, str::stream() << sortName <<
+ " must have at least one sort key", (sortKeys > 0));
+
+ return pSort;
+ }
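+
+    /*
+      For reference (illustrative): { $sort: { age: -1, posts: 1 } }
+      sorts descending by "age", breaking ties ascending by "posts";
+      each key's value must be exactly 1 or -1.
+    */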
+
+ void DocumentSourceSort::populate() {
+ /* make sure we've got a sort key */
+ assert(vSortKey.size());
+
+ /* track and warn about how much physical memory has been used */
+ DocMemMonitor dmm(this);
+
+ /* pull everything from the underlying source */
+ for(bool hasNext = !pSource->eof(); hasNext;
+ hasNext = pSource->advance()) {
+ intrusive_ptr<Document> pDocument(pSource->getCurrent());
+ documents.push_back(Carrier(this, pDocument));
+
+ dmm.addToTotal(pDocument->getApproximateSize());
+ }
+
+ /* sort the list */
+ documents.sort(Carrier::lessThan);
+
+ /* start the sort iterator */
+ listIterator = documents.begin();
+
+ if (listIterator != documents.end())
+ pCurrent = listIterator->pDocument;
+ populated = true;
+ }
+
+ int DocumentSourceSort::compare(
+ const intrusive_ptr<Document> &pL, const intrusive_ptr<Document> &pR) {
+
+ /*
+ populate() already checked that there is a non-empty sort key,
+ so we shouldn't have to worry about that here.
+
+          However, the tricky part is what to do if none of the sort keys
+          are present. In this case, consider the document less.
+ */
+ const size_t n = vSortKey.size();
+ for(size_t i = 0; i < n; ++i) {
+ /* evaluate the sort keys */
+ ExpressionFieldPath *pE = vSortKey[i].get();
+ intrusive_ptr<const Value> pLeft(pE->evaluate(pL));
+ intrusive_ptr<const Value> pRight(pE->evaluate(pR));
+
+ /*
+ Compare the two values; if they differ, return. If they are
+ the same, move on to the next key.
+ */
+ int cmp = Value::compare(pLeft, pRight);
+ if (cmp) {
+ /* if necessary, adjust the return value by the key ordering */
+ if (!vAscending[i])
+ cmp = -cmp;
+
+ return cmp;
+ }
+ }
+
+ /*
+ If we got here, everything matched (or didn't exist), so we'll
+ consider the documents equal for purposes of this sort.
+ */
+ return 0;
+ }
+
+ bool DocumentSourceSort::Carrier::lessThan(
+ const Carrier &rL, const Carrier &rR) {
+ /* make sure these aren't from different lists */
+ assert(rL.pSort == rR.pSort);
+
+ /* compare the documents according to the sort key */
+ return (rL.pSort->compare(rL.pDocument, rR.pDocument) < 0);
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_unwind.cpp b/src/mongo/db/pipeline/document_source_unwind.cpp
new file mode 100755
index 00000000000..bb231451113
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_unwind.cpp
@@ -0,0 +1,234 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ const char DocumentSourceUnwind::unwindName[] = "$unwind";
+
+ DocumentSourceUnwind::~DocumentSourceUnwind() {
+ }
+
+ DocumentSourceUnwind::DocumentSourceUnwind():
+ unwindPath(),
+ pNoUnwindDocument(),
+ pUnwindArray(),
+ pUnwinder(),
+ pUnwindValue() {
+ }
+
+ bool DocumentSourceUnwind::eof() {
+ /*
+ If we're unwinding an array, and there are more elements, then we
+ can return more documents.
+ */
+ if (pUnwinder.get() && pUnwinder->more())
+ return false;
+
+ return pSource->eof();
+ }
+
+ bool DocumentSourceUnwind::advance() {
+ if (pUnwinder.get() && pUnwinder->more()) {
+ pUnwindValue = pUnwinder->next();
+ return true;
+ }
+
+ /* release the last document and advance */
+ resetArray();
+ return pSource->advance();
+ }
+
+ intrusive_ptr<Document> DocumentSourceUnwind::getCurrent() {
+ if (!pNoUnwindDocument.get()) {
+ intrusive_ptr<Document> pInDocument(pSource->getCurrent());
+
+ /* create the result document */
+ pNoUnwindDocument = pInDocument;
+ fieldIndex.clear();
+
+ /*
+ First we'll look to see if the path is there. If it isn't,
+ we'll pass this document through. If it is, we record the
+ indexes of the fields down the field path so that we can
+ quickly replace them as we clone the documents along the
+ field path.
+
+ We have to clone all the documents along the field path so
+ that we don't share the end value across documents that have
+ come out of this pipeline operator.
+ */
+ intrusive_ptr<Document> pCurrent(pInDocument);
+ const size_t pathLength = unwindPath.getPathLength();
+ for(size_t i = 0; i < pathLength; ++i) {
+ size_t idx = pCurrent->getFieldIndex(
+ unwindPath.getFieldName(i));
+ if (idx == pCurrent->getFieldCount() ) {
+ /* this document doesn't contain the target field */
+ resetArray();
+ return pInDocument;
+ }
+
+ fieldIndex.push_back(idx);
+ Document::FieldPair fp(pCurrent->getField(idx));
+ intrusive_ptr<const Value> pPathValue(fp.second);
+ if (i < pathLength - 1) {
+ if (pPathValue->getType() != Object) {
+ /* can't walk down the field path */
+ resetArray();
+ uassert(15977, str::stream() << unwindName <<
+ ": cannot traverse field path past scalar value for \"" <<
+ fp.first << "\"", false);
+ break;
+ }
+
+ /* move down the object tree */
+ pCurrent = pPathValue->getDocument();
+ }
+ else /* (i == pathLength - 1) */ {
+ if (pPathValue->getType() != Array) {
+ /* last item on path must be an array to unwind */
+ resetArray();
+ uassert(15978, str::stream() << unwindName <<
+ ": value at end of field path must be an array",
+ false);
+ break;
+ }
+
+ /* keep track of the array we're unwinding */
+ pUnwindArray = pPathValue;
+ if (pUnwindArray->getArrayLength() == 0) {
+ /*
+ The $unwind of an empty array is a NULL value. If we
+ encounter this, use the non-unwind path, but replace
+                          the unwound field with a null.
+
+ Make sure unwind value is clear so the array is
+ removed.
+ */
+ pUnwindValue.reset();
+ intrusive_ptr<Document> pClone(clonePath());
+ resetArray();
+ return pClone;
+ }
+
+ /* get the iterator we'll use to unwind the array */
+ pUnwinder = pUnwindArray->getArray();
+ assert(pUnwinder->more()); // we just checked above...
+ pUnwindValue = pUnwinder->next();
+ }
+ }
+ }
+
+ /*
+ If we're unwinding a field, create an alternate document. In the
+ alternate (clone), replace the unwound array field with the element
+ at the appropriate index.
+ */
+ if (pUnwindArray.get()) {
+ /* clone the document with an array we're unwinding */
+ intrusive_ptr<Document> pUnwindDocument(clonePath());
+
+ return pUnwindDocument;
+ }
+
+ return pNoUnwindDocument;
+ }
+
+ intrusive_ptr<Document> DocumentSourceUnwind::clonePath() const {
+ /*
+ For this to be valid, we must already have pNoUnwindDocument set,
+ and have set up the vector of indices for that document in fieldIndex.
+ */
+ assert(pNoUnwindDocument.get());
+ assert(pUnwinder.get());
+
+ intrusive_ptr<Document> pClone(pNoUnwindDocument->clone());
+ intrusive_ptr<Document> pCurrent(pClone);
+ const size_t n = fieldIndex.size();
+ assert(n);
+ for(size_t i = 0; i < n; ++i) {
+ const size_t fi = fieldIndex[i];
+ Document::FieldPair fp(pCurrent->getField(fi));
+ if (i + 1 < n) {
+ /*
+ For every object in the path but the last, clone it and
+ continue on down.
+ */
+ intrusive_ptr<Document> pNext(
+ fp.second->getDocument()->clone());
+ pCurrent->setField(fi, fp.first, Value::createDocument(pNext));
+ pCurrent = pNext;
+ }
+ else {
+                /* for the last, substitute the next unwound value */
+ pCurrent->setField(fi, fp.first, pUnwindValue);
+ }
+ }
+
+ return pClone;
+ }
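+
+    /*
+      Illustrative example: unwinding "a.b" over
+        { _id: 1, a: { b: [1, 2] }, c: 3 }
+      yields
+        { _id: 1, a: { b: 1 }, c: 3 }
+        { _id: 1, a: { b: 2 }, c: 3 }
+      Only the documents along the "a.b" path are cloned; the values of
+      sibling fields such as "c" are shared with the source document.
+    */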
+
+ void DocumentSourceUnwind::sourceToBson(BSONObjBuilder *pBuilder) const {
+ pBuilder->append(unwindName, unwindPath.getPath(true));
+ }
+
+ intrusive_ptr<DocumentSourceUnwind> DocumentSourceUnwind::create() {
+ intrusive_ptr<DocumentSourceUnwind> pSource(
+ new DocumentSourceUnwind());
+ return pSource;
+ }
+
+ void DocumentSourceUnwind::unwindField(const FieldPath &rFieldPath) {
+ /* can't set more than one unwind field */
+ uassert(15979, str::stream() << unwindName <<
+ "can't unwind more than one path at once",
+ !unwindPath.getPathLength());
+
+ uassert(15980, "the path of the field to unwind cannot be empty",
+                rFieldPath.getPathLength());
+
+ /* record the field path */
+ unwindPath = rFieldPath;
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceUnwind::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ /*
+ The value of $unwind should just be a field path.
+ */
+ uassert(15981, str::stream() << "the " << unwindName <<
+ " field path must be specified as a string",
+ pBsonElement->type() == String);
+
+ string prefixedPathString(pBsonElement->String());
+ string pathString(Expression::removeFieldPrefix(prefixedPathString));
+ intrusive_ptr<DocumentSourceUnwind> pUnwind(
+ DocumentSourceUnwind::create());
+ pUnwind->unwindPath = FieldPath(pathString);
+
+ return pUnwind;
+ }
+}
diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp
new file mode 100755
index 00000000000..b3caefcf899
--- /dev/null
+++ b/src/mongo/db/pipeline/expression.cpp
@@ -0,0 +1,2815 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/expression.h"
+
+#include <cstdio>
+#include "db/jsobj.h"
+#include "db/pipeline/builder.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+ using namespace mongoutils;
+
+ /* --------------------------- Expression ------------------------------ */
+
+ void Expression::toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const {
+ assert(false && "Expression::toMatcherBson()");
+ }
+
+ Expression::ObjectCtx::ObjectCtx(int theOptions):
+ options(theOptions),
+ unwindField() {
+ }
+
+ void Expression::ObjectCtx::unwind(string fieldName) {
+ assert(unwindOk());
+ assert(!unwindUsed());
+ assert(fieldName.size());
+ unwindField = fieldName;
+ }
+
+ bool Expression::ObjectCtx::documentOk() const {
+ return ((options & DOCUMENT_OK) != 0);
+ }
+
+ const char Expression::unwindName[] = "$unwind";
+
+ string Expression::removeFieldPrefix(const string &prefixedField) {
+ const char *pPrefixedField = prefixedField.c_str();
+ uassert(15982, str::stream() <<
+ "field path references must be prefixed with a '$' (\"" <<
+ prefixedField << "\"", pPrefixedField[0] == '$');
+
+ return string(pPrefixedField + 1);
+ }
+
+ intrusive_ptr<Expression> Expression::parseObject(
+ BSONElement *pBsonElement, ObjectCtx *pCtx) {
+ /*
+ An object expression can take any of the following forms:
+
+ f0: {f1: ..., f2: ..., f3: ...}
+ f0: {$operator:[operand1, operand2, ...]}
+ f0: {$unwind:"fieldpath"}
+
+ We handle $unwind as a special case, because this is done by the
+ projection source. For any other expression, we hand over control to
+ code that parses the expression and returns an expression.
+ */
+
+ intrusive_ptr<Expression> pExpression; // the result
+ intrusive_ptr<ExpressionObject> pExpressionObject; // alt result
+ int isOp = -1; /* -1 -> unknown, 0 -> not an operator, 1 -> operator */
+ enum { UNKNOWN, NOTOPERATOR, OPERATOR } kind = UNKNOWN;
+
+ BSONObj obj(pBsonElement->Obj());
+ BSONObjIterator iter(obj);
+ for(size_t fieldCount = 0; iter.more(); ++fieldCount) {
+ BSONElement fieldElement(iter.next());
+ const char *pFieldName = fieldElement.fieldName();
+
+ if (pFieldName[0] == '$') {
+ uassert(15983, str::stream() <<
+ "the operator must be the only field in a pipeline object (at \""
+ << pFieldName << "\"",
+ fieldCount == 0);
+
+ /* we've determined this "object" is an operator expression */
+ isOp = 1;
+ kind = OPERATOR;
+
+ if (strcmp(pFieldName, unwindName) != 0) {
+ pExpression = parseExpression(pFieldName, &fieldElement);
+ }
+ else {
+ assert(pCtx->unwindOk());
+ // CW TODO error: it's not OK to unwind in this context
+
+ assert(!pCtx->unwindUsed());
+ // CW TODO error: this projection already has an unwind
+
+ assert(fieldElement.type() == String);
+ // CW TODO $unwind operand must be single field name
+
+ string fieldPath(removeFieldPrefix(fieldElement.String()));
+ pExpression = ExpressionFieldPath::create(fieldPath);
+ pCtx->unwind(fieldPath);
+ }
+ }
+ else {
+ uassert(15984, str::stream() << "this object is already an operator expression, and can't be used as a document expression (at \"" <<
+ pFieldName << "\")",
+ isOp != 1);
+ uassert(15990, str::stream() << "this object is already an operator expression, and can't be used as a document expression (at \"" <<
+ pFieldName << "\")",
+ kind != OPERATOR);
+
+ /* if it's our first time, create the document expression */
+ if (!pExpression.get()) {
+ assert(pCtx->documentOk());
+ // CW TODO error: document not allowed in this context
+
+ pExpressionObject = ExpressionObject::create();
+ pExpression = pExpressionObject;
+
+ /* this "object" is not an operator expression */
+ isOp = 0;
+ kind = NOTOPERATOR;
+ }
+
+ BSONType fieldType = fieldElement.type();
+ string fieldName(pFieldName);
+ if (fieldType == Object) {
+ /* it's a nested document */
+ ObjectCtx oCtx(
+ (pCtx->documentOk() ? ObjectCtx::DOCUMENT_OK : 0));
+ intrusive_ptr<Expression> pNested(
+ parseObject(&fieldElement, &oCtx));
+ pExpressionObject->addField(fieldName, pNested);
+ }
+ else if (fieldType == String) {
+ /* it's a renamed field */
+ // CW TODO could also be a constant
+ intrusive_ptr<Expression> pPath(
+ ExpressionFieldPath::create(
+ removeFieldPrefix(fieldElement.String())));
+ pExpressionObject->addField(fieldName, pPath);
+ }
+ else if (fieldType == NumberDouble) {
+ /* it's an inclusion specification */
+ int inclusion = static_cast<int>(fieldElement.Double());
+ if (inclusion == 0)
+ pExpressionObject->excludePath(fieldName);
+ else if (inclusion == 1)
+ pExpressionObject->includePath(fieldName);
+ else
+ uassert(15991, str::stream() <<
+ "\"" << fieldName <<
+ "\" numeric inclusion or exclusion must be 1 or 0 (or boolean)",
+ false);
+ }
+ else if (fieldType == Bool) {
+ bool inclusion = fieldElement.Bool();
+ if (!inclusion)
+ pExpressionObject->excludePath(fieldName);
+ else
+ pExpressionObject->includePath(fieldName);
+ }
+ else { /* nothing else is allowed */
+ uassert(15992, str::stream() <<
+ "disallowed field type " << fieldType <<
+ " in object expression (at \"" <<
+ fieldName << "\")", false);
+ }
+ }
+ }
+
+ return pExpression;
+ }
+
+
+ struct OpDesc {
+ const char *pName;
+ intrusive_ptr<ExpressionNary> (*pFactory)(void);
+ };
+
+ static int OpDescCmp(const void *pL, const void *pR) {
+ return strcmp(((const OpDesc *)pL)->pName, ((const OpDesc *)pR)->pName);
+ }
+
+ /*
+ Keep these sorted alphabetically so we can bsearch() them using
+ OpDescCmp() above.
+ */
+ static const OpDesc OpTable[] = {
+ {"$add", ExpressionAdd::create},
+ {"$and", ExpressionAnd::create},
+ {"$cmp", ExpressionCompare::createCmp},
+ {"$cond", ExpressionCond::create},
+ {"$const", ExpressionNoOp::create},
+ {"$dayOfMonth", ExpressionDayOfMonth::create},
+ {"$dayOfWeek", ExpressionDayOfWeek::create},
+ {"$dayOfYear", ExpressionDayOfYear::create},
+ {"$divide", ExpressionDivide::create},
+ {"$eq", ExpressionCompare::createEq},
+ {"$gt", ExpressionCompare::createGt},
+ {"$gte", ExpressionCompare::createGte},
+ {"$hour", ExpressionHour::create},
+ {"$ifNull", ExpressionIfNull::create},
+ {"$lt", ExpressionCompare::createLt},
+ {"$lte", ExpressionCompare::createLte},
+ {"$minute", ExpressionMinute::create},
+ {"$mod", ExpressionMod::create},
+ {"$month", ExpressionMonth::create},
+ {"$multiply", ExpressionMultiply::create},
+ {"$ne", ExpressionCompare::createNe},
+ {"$not", ExpressionNot::create},
+ {"$or", ExpressionOr::create},
+ {"$second", ExpressionSecond::create},
+ {"$strcasecmp", ExpressionStrcasecmp::create},
+ {"$substr", ExpressionSubstr::create},
+ {"$subtract", ExpressionSubtract::create},
+ {"$toLower", ExpressionToLower::create},
+ {"$toUpper", ExpressionToUpper::create},
+ {"$week", ExpressionWeek::create},
+ {"$year", ExpressionYear::create},
+ };
+
+ static const size_t NOp = sizeof(OpTable)/sizeof(OpTable[0]);
+
+ intrusive_ptr<Expression> Expression::parseExpression(
+ const char *pOpName, BSONElement *pBsonElement) {
+ /* look for the specified operator */
+ OpDesc key;
+ key.pName = pOpName;
+ const OpDesc *pOp = (const OpDesc *)bsearch(
+ &key, OpTable, NOp, sizeof(OpDesc), OpDescCmp);
+
+ uassert(15999, str::stream() << "invalid operator \"" <<
+ pOpName << "\"", pOp);
+
+ /* make the expression node */
+ intrusive_ptr<ExpressionNary> pExpression((*pOp->pFactory)());
+
+ /* add the operands to the expression node */
+ BSONType elementType = pBsonElement->type();
+ if (elementType == Object) {
+ /* the operator must be unary and accept an object argument */
+ BSONObj objOperand(pBsonElement->Obj());
+ ObjectCtx oCtx(ObjectCtx::DOCUMENT_OK);
+ intrusive_ptr<Expression> pOperand(
+ Expression::parseObject(pBsonElement, &oCtx));
+ pExpression->addOperand(pOperand);
+ }
+ else if (elementType == Array) {
+ /* multiple operands - an n-ary operator */
+ vector<BSONElement> bsonArray(pBsonElement->Array());
+ const size_t n = bsonArray.size();
+ for(size_t i = 0; i < n; ++i) {
+ BSONElement *pBsonOperand = &bsonArray[i];
+ intrusive_ptr<Expression> pOperand(
+ Expression::parseOperand(pBsonOperand));
+ pExpression->addOperand(pOperand);
+ }
+ }
+ else { /* assume it's an atomic operand */
+ intrusive_ptr<Expression> pOperand(
+ Expression::parseOperand(pBsonElement));
+ pExpression->addOperand(pOperand);
+ }
+
+ return pExpression;
+ }
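+
+    /*
+      Illustrative operand shapes, matching the branches above:
+        { $toUpper: "$a" }               -- atomic operand
+        { $add: ["$a", 1] }              -- array: n-ary operand list
+        { $not: { $and: ["$a", "$b"] } } -- object operand (unary)
+    */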
+
+ intrusive_ptr<Expression> Expression::parseOperand(BSONElement *pBsonElement) {
+ BSONType type = pBsonElement->type();
+
+ switch(type) {
+ case String: {
+ /*
+ This could be a field path, or it could be a constant
+ string.
+
+ We make a copy of the BSONElement reader so we can read its
+ value without advancing its state, in case we need to read it
+ again in the constant code path.
+ */
+ BSONElement opCopy(*pBsonElement);
+ string value(opCopy.String());
+
+ /* check for a field path */
+ if (value[0] != '$')
+ goto ExpectConstant; // assume plain string constant
+
+ /* if we got here, this is a field path expression */
+ string fieldPath(removeFieldPrefix(value));
+ intrusive_ptr<Expression> pFieldExpr(
+ ExpressionFieldPath::create(fieldPath));
+ return pFieldExpr;
+ }
+
+ case Object: {
+ ObjectCtx oCtx(ObjectCtx::DOCUMENT_OK);
+ intrusive_ptr<Expression> pSubExpression(
+ Expression::parseObject(pBsonElement, &oCtx));
+ return pSubExpression;
+ }
+
+ default:
+ ExpectConstant: {
+ intrusive_ptr<Expression> pOperand(
+ ExpressionConstant::createFromBsonElement(pBsonElement));
+ return pOperand;
+ }
+
+ } // switch(type)
+
+ /* NOTREACHED */
+ assert(false);
+ return intrusive_ptr<Expression>();
+ }
+
+ /* ------------------------- ExpressionAdd ----------------------------- */
+
+ ExpressionAdd::~ExpressionAdd() {
+ }
+
+ intrusive_ptr<Expression> ExpressionAdd::optimize() {
+ intrusive_ptr<Expression> pE(ExpressionNary::optimize());
+ ExpressionAdd *pA = dynamic_cast<ExpressionAdd *>(pE.get());
+ if (pA) {
+ /* don't create a circular reference */
+ if (pA != this)
+ pA->pAdd = this;
+ }
+
+ return pE;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionAdd::create() {
+ intrusive_ptr<ExpressionAdd> pExpression(new ExpressionAdd());
+ return pExpression;
+ }
+
+ ExpressionAdd::ExpressionAdd():
+ ExpressionNary(),
+ useOriginal(false) {
+ }
+
+ intrusive_ptr<const Value> ExpressionAdd::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ unsigned stringCount = 0;
+ unsigned nonConstStringCount = 0;
+ unsigned dateCount = 0;
+ const size_t n = vpOperand.size();
+ vector<intrusive_ptr<const Value> > vpValue; /* evaluated operands */
+
+ /* use the original, if we've been told to do so */
+ if (useOriginal) {
+ return pAdd->evaluate(pDocument);
+ }
+
+ for (size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(
+ vpOperand[i]->evaluate(pDocument));
+ vpValue.push_back(pValue);
+
+ BSONType valueType = pValue->getType();
+ if (valueType == String) {
+ ++stringCount;
+ if (!dynamic_cast<ExpressionConstant *>(vpOperand[i].get()))
+ ++nonConstStringCount;
+ }
+ else if (valueType == Date)
+ ++dateCount;
+ }
+
+ /*
+ We don't allow adding two dates because it doesn't make sense
+ especially since they are in epoch time. However, if there is a
+          string present, then we would be appending the dates to a string,
+          so having many would not be a problem.
+ */
+ if ((dateCount > 1) && !stringCount) {
+ uassert(16000, "can't add two dates together", false);
+ return Value::getNull();
+ }
+
+ /*
+ If there are non-constant strings, and we've got a copy of the
+ original, then use that from this point forward. This is necessary
+ to keep the order of strings the same for string concatenation;
+ constant-folding would violate the order preservation.
+
+ This is a one-way conversion we do if we see one of these. It is
+ possible that these could vary from document to document, but any
+ sane schema probably isn't going to do that, so once we see a string,
+ we can probably assume they're going to be strings all the way down.
+ */
+ if (nonConstStringCount && pAdd.get()) {
+ useOriginal = true;
+ return pAdd->evaluate(pDocument);
+ }
+
+ if (stringCount) {
+ stringstream stringTotal;
+ for (size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpValue[i]);
+ stringTotal << pValue->coerceToString();
+ }
+
+ return Value::createString(stringTotal.str());
+ }
+
+ if (dateCount) {
+ long long dateTotal = 0;
+ for (size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpValue[i]);
+ if (pValue->getType() == Date)
+ dateTotal += pValue->coerceToDate();
+ else
+ dateTotal += static_cast<long long>(pValue->coerceToDouble()*24*60*60*1000);
+ }
+
+ return Value::createDate(Date_t(dateTotal));
+ }
+
+ /*
+ We'll try to return the narrowest possible result value. To do that
+ without creating intermediate Values, do the arithmetic for double
+ and integral types in parallel, tracking the current narrowest
+ type.
+ */
+ double doubleTotal = 0;
+ long long longTotal = 0;
+ BSONType totalType = NumberInt;
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpValue[i]);
+
+ totalType = Value::getWidestNumeric(totalType, pValue->getType());
+ doubleTotal += pValue->coerceToDouble();
+ longTotal += pValue->coerceToLong();
+ }
+
+ if (totalType == NumberDouble)
+ return Value::createDouble(doubleTotal);
+ if (totalType == NumberLong)
+ return Value::createLong(longTotal);
+ return Value::createInt((int)longTotal);
+ }
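+
+    /*
+      Illustrative results under the rules above:
+        $add: [1, 2]              -> NumberInt 3
+        $add: [1, 2.5]            -> NumberDouble 3.5
+        $add: [1, NumberLong(2)]  -> NumberLong 3
+        $add: ["a", 1, "b"]       -> String "a1b"
+        $add: [aDate, 1]          -> Date one day later (non-date addends
+                                     are treated as days)
+      where "aDate" stands for any Date-typed operand.
+    */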
+
+ const char *ExpressionAdd::getOpName() const {
+ return "$add";
+ }
+
+ intrusive_ptr<ExpressionNary> (*ExpressionAdd::getFactory() const)() {
+ return ExpressionAdd::create;
+ }
+
+ void ExpressionAdd::toBson(
+ BSONObjBuilder *pBuilder, const char *pOpName, unsigned depth) const {
+
+ if (pAdd)
+ pAdd->toBson(pBuilder, pOpName, depth);
+ else
+ ExpressionNary::toBson(pBuilder, pOpName, depth);
+ }
+
+
+ /* ------------------------- ExpressionAnd ----------------------------- */
+
+ ExpressionAnd::~ExpressionAnd() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionAnd::create() {
+ intrusive_ptr<ExpressionNary> pExpression(new ExpressionAnd());
+ return pExpression;
+ }
+
+ ExpressionAnd::ExpressionAnd():
+ ExpressionNary() {
+ }
+
+ intrusive_ptr<Expression> ExpressionAnd::optimize() {
+ /* optimize the conjunction as much as possible */
+ intrusive_ptr<Expression> pE(ExpressionNary::optimize());
+
+ /* if the result isn't a conjunction, we can't do anything */
+ ExpressionAnd *pAnd = dynamic_cast<ExpressionAnd *>(pE.get());
+ if (!pAnd)
+ return pE;
+
+ /*
+ Check the last argument on the result; if it's not constant (as
+          promised by ExpressionNary::optimize()), then there's nothing
+ we can do.
+ */
+ const size_t n = pAnd->vpOperand.size();
+ intrusive_ptr<Expression> pLast(pAnd->vpOperand[n - 1]);
+ const ExpressionConstant *pConst =
+ dynamic_cast<ExpressionConstant *>(pLast.get());
+ if (!pConst)
+ return pE;
+
+ /*
+ Evaluate and coerce the last argument to a boolean. If it's false,
+ then we can replace this entire expression.
+ */
+ bool last = pLast->evaluate(intrusive_ptr<Document>())->coerceToBool();
+ if (!last) {
+ intrusive_ptr<ExpressionConstant> pFinal(
+ ExpressionConstant::create(Value::getFalse()));
+ return pFinal;
+ }
+
+ /*
+ If we got here, the final operand was true, so we don't need it
+ anymore. If there was only one other operand, we don't need the
+ conjunction either. Note we still need to keep the promise that
+ the result will be a boolean.
+ */
+ if (n == 2) {
+ intrusive_ptr<Expression> pFinal(
+ ExpressionCoerceToBool::create(pAnd->vpOperand[0]));
+ return pFinal;
+ }
+
+ /*
+ Remove the final "true" value, and return the new expression.
+
+ CW TODO:
+ Note that because of any implicit conversions, we may need to
+ apply an implicit boolean conversion.
+ */
+ pAnd->vpOperand.resize(n - 1);
+ return pE;
+ }
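+
+    /*
+      Illustrative foldings (assuming ExpressionNary::optimize() has
+      already folded constants to the end of the operand list):
+        { $and: ["$a", false] }       -> constant false
+        { $and: ["$a", true] }        -> coerce "$a" to bool
+        { $and: ["$a", "$b", true] }  -> { $and: ["$a", "$b"] }
+    */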
+
+ intrusive_ptr<const Value> ExpressionAnd::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ const size_t n = vpOperand.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpOperand[i]->evaluate(pDocument));
+ if (!pValue->coerceToBool())
+ return Value::getFalse();
+ }
+
+ return Value::getTrue();
+ }
+
+ const char *ExpressionAnd::getOpName() const {
+ return "$and";
+ }
+
+ void ExpressionAnd::toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const {
+ /*
+ There are two patterns we can handle:
+ (1) one or two comparisons on the same field: { a:{$gte:3, $lt:7} }
+ (2) multiple field comparisons: {a:7, b:{$lte:6}, c:2}
+ This can be recognized as a conjunction of a set of range
+ expressions. Direct equality is a degenerate range expression;
+ range expressions can be open-ended.
+ */
+ assert(false && "unimplemented");
+ }
+
+ intrusive_ptr<ExpressionNary> (*ExpressionAnd::getFactory() const)() {
+ return ExpressionAnd::create;
+ }
+
+ /* -------------------- ExpressionCoerceToBool ------------------------- */
+
+ ExpressionCoerceToBool::~ExpressionCoerceToBool() {
+ }
+
+ intrusive_ptr<ExpressionCoerceToBool> ExpressionCoerceToBool::create(
+ const intrusive_ptr<Expression> &pExpression) {
+ intrusive_ptr<ExpressionCoerceToBool> pNew(
+ new ExpressionCoerceToBool(pExpression));
+ return pNew;
+ }
+
+ ExpressionCoerceToBool::ExpressionCoerceToBool(
+ const intrusive_ptr<Expression> &pTheExpression):
+ Expression(),
+ pExpression(pTheExpression) {
+ }
+
+ intrusive_ptr<Expression> ExpressionCoerceToBool::optimize() {
+ /* optimize the operand */
+ pExpression = pExpression->optimize();
+
+ /* if the operand already produces a boolean, then we don't need this */
+ /* LATER - Expression to support a "typeof" query? */
+ Expression *pE = pExpression.get();
+ if (dynamic_cast<ExpressionAnd *>(pE) ||
+ dynamic_cast<ExpressionOr *>(pE) ||
+ dynamic_cast<ExpressionNot *>(pE) ||
+ dynamic_cast<ExpressionCoerceToBool *>(pE))
+ return pExpression;
+
+ return intrusive_ptr<Expression>(this);
+ }
+
+ intrusive_ptr<const Value> ExpressionCoerceToBool::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+
+ intrusive_ptr<const Value> pResult(pExpression->evaluate(pDocument));
+ bool b = pResult->coerceToBool();
+ if (b)
+ return Value::getTrue();
+ return Value::getFalse();
+ }
+
+ void ExpressionCoerceToBool::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+ assert(false && "not possible"); // no equivalent of this
+ }
+
+ void ExpressionCoerceToBool::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ assert(false && "not possible"); // no equivalent of this
+ }
+
+ /* ----------------------- ExpressionCompare --------------------------- */
+
+ ExpressionCompare::~ExpressionCompare() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createEq() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(EQ));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createNe() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(NE));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createGt() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(GT));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createGte() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(GTE));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createLt() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(LT));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createLte() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(LTE));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createCmp() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(CMP));
+ return pExpression;
+ }
+
+ ExpressionCompare::ExpressionCompare(CmpOp theCmpOp):
+ ExpressionNary(),
+ cmpOp(theCmpOp) {
+ }
+
+ void ExpressionCompare::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ /*
+ Lookup table for truth value returns
+ */
+ struct CmpLookup {
+ bool truthValue[3]; /* truth value for -1, 0, 1 */
+ Expression::CmpOp reverse; /* comparison operator with operands swapped */
+ char name[5]; /* string name (w/trailing '\0') */
+ };
+ static const CmpLookup cmpLookup[7] = {
+ /* -1 0 1 reverse name */
+ /* EQ */ { { false, true, false }, Expression::EQ, "$eq" },
+ /* NE */ { { true, false, true }, Expression::NE, "$ne" },
+ /* GT */ { { false, false, true }, Expression::LT, "$gt" },
+ /* GTE */ { { false, true, true }, Expression::LTE, "$gte" },
+ /* LT */ { { true, false, false }, Expression::GT, "$lt" },
+ /* LTE */ { { true, true, false }, Expression::GTE, "$lte" },
+ /* CMP */ { { false, false, false }, Expression::CMP, "$cmp" },
+ };
+
+ intrusive_ptr<Expression> ExpressionCompare::optimize() {
+ /* first optimize the comparison operands */
+ intrusive_ptr<Expression> pE(ExpressionNary::optimize());
+
+ /*
+ If the result of optimization is no longer a comparison, there's
+ nothing more we can do.
+ */
+ ExpressionCompare *pCmp = dynamic_cast<ExpressionCompare *>(pE.get());
+ if (!pCmp)
+ return pE;
+
+ /* check whether the comparison operator can be reversed */
+ CmpOp newOp = pCmp->cmpOp;
+ if (newOp == CMP)
+ return pE; // not reversible: there's nothing more we can do
+
+ /*
+ There's one localized optimization we recognize: a comparison
+ between a field and a constant. If we recognize that pattern,
+ replace it with an ExpressionFieldRange.
+
+ When looking for this pattern, note that the operands could appear
+ in any order. If we need to reverse the sense of the comparison to
+ put it into the required canonical form, do so.
+ */
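+ /*
+ Example: { $gt: [ "$a", 5 ] } becomes the range test "$a" > 5,
+ while { $gt: [ 5, "$a" ] } has its operands swapped and becomes
+ the equivalent range test "$a" < 5.
+ */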
+ intrusive_ptr<Expression> pLeft(pCmp->vpOperand[0]);
+ intrusive_ptr<Expression> pRight(pCmp->vpOperand[1]);
+ intrusive_ptr<ExpressionFieldPath> pFieldPath(
+ dynamic_pointer_cast<ExpressionFieldPath>(pLeft));
+ intrusive_ptr<ExpressionConstant> pConstant;
+ if (pFieldPath.get()) {
+ pConstant = dynamic_pointer_cast<ExpressionConstant>(pRight);
+ if (!pConstant.get())
+ return pE; // there's nothing more we can do
+ }
+ else {
+ /* if the first operand wasn't a path, see if it's a constant */
+ pConstant = dynamic_pointer_cast<ExpressionConstant>(pLeft);
+ if (!pConstant.get())
+ return pE; // there's nothing more we can do
+
+ /* the left operand was a constant; see if the right is a path */
+ pFieldPath = dynamic_pointer_cast<ExpressionFieldPath>(pRight);
+ if (!pFieldPath.get())
+ return pE; // there's nothing more we can do
+
+ /* these were not in canonical order, so reverse the sense */
+ newOp = cmpLookup[newOp].reverse;
+ }
+
+ return ExpressionFieldRange::create(
+ pFieldPath, newOp, pConstant->getValue());
+ }
+
+ intrusive_ptr<const Value> ExpressionCompare::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(2);
+ intrusive_ptr<const Value> pLeft(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pRight(vpOperand[1]->evaluate(pDocument));
+
+ BSONType leftType = pLeft->getType();
+ BSONType rightType = pRight->getType();
+ uassert(15994, str::stream() << getOpName() <<
+ ": no automatic conversion for types " <<
+ leftType << " and " << rightType,
+ leftType == rightType);
+ // CW TODO: for now, require matching types; later, handle automatic conversions
+
+ int cmp = 0;
+ switch(leftType) {
+ case NumberDouble: {
+ double left = pLeft->getDouble();
+ double right = pRight->getDouble();
+
+ if (left < right)
+ cmp = -1;
+ else if (left > right)
+ cmp = 1;
+ break;
+ }
+
+ case NumberInt: {
+ int left = pLeft->getInt();
+ int right = pRight->getInt();
+
+ if (left < right)
+ cmp = -1;
+ else if (left > right)
+ cmp = 1;
+ break;
+ }
+
+ case String: {
+ string left(pLeft->getString());
+ string right(pRight->getString());
+ cmp = signum(left.compare(right));
+ break;
+ }
+
+ default:
+ uassert(15995, str::stream() <<
+ "can't compare values of type " << leftType, false);
+ break;
+ }
+
+ if (cmpOp == CMP) {
+ switch(cmp) {
+ case -1:
+ return Value::getMinusOne();
+ case 0:
+ return Value::getZero();
+ case 1:
+ return Value::getOne();
+
+ default:
+ assert(false); // CW TODO internal error
+ return Value::getNull();
+ }
+ }
+
+ bool returnValue = cmpLookup[cmpOp].truthValue[cmp + 1];
+ if (returnValue)
+ return Value::getTrue();
+ return Value::getFalse();
+ }
+
+ const char *ExpressionCompare::getOpName() const {
+ return cmpLookup[cmpOp].name;
+ }
+
+ /* ----------------------- ExpressionCond ------------------------------ */
+
+ ExpressionCond::~ExpressionCond() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCond::create() {
+ intrusive_ptr<ExpressionCond> pExpression(new ExpressionCond());
+ return pExpression;
+ }
+
+ ExpressionCond::ExpressionCond():
+ ExpressionNary() {
+ }
+
+ void ExpressionCond::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(3);
+ ExpressionNary::addOperand(pExpression);
+ }
+
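+ /*
+ $cond takes [ <predicate>, <then>, <else> ]: evaluate the
+ predicate, then evaluate and return exactly one of the two
+ branches.
+ */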
+ intrusive_ptr<const Value> ExpressionCond::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(3);
+ intrusive_ptr<const Value> pCond(vpOperand[0]->evaluate(pDocument));
+ int idx = pCond->coerceToBool() ? 1 : 2;
+ return vpOperand[idx]->evaluate(pDocument);
+ }
+
+ const char *ExpressionCond::getOpName() const {
+ return "$cond";
+ }
+
+ /* ---------------------- ExpressionConstant --------------------------- */
+
+ ExpressionConstant::~ExpressionConstant() {
+ }
+
+ intrusive_ptr<ExpressionConstant> ExpressionConstant::createFromBsonElement(
+ BSONElement *pBsonElement) {
+ intrusive_ptr<ExpressionConstant> pEC(
+ new ExpressionConstant(pBsonElement));
+ return pEC;
+ }
+
+ ExpressionConstant::ExpressionConstant(BSONElement *pBsonElement):
+ pValue(Value::createFromBsonElement(pBsonElement)) {
+ }
+
+ intrusive_ptr<ExpressionConstant> ExpressionConstant::create(
+ const intrusive_ptr<const Value> &pValue) {
+ intrusive_ptr<ExpressionConstant> pEC(new ExpressionConstant(pValue));
+ return pEC;
+ }
+
+ ExpressionConstant::ExpressionConstant(
+ const intrusive_ptr<const Value> &pTheValue):
+ pValue(pTheValue) {
+ }
+
+
+ intrusive_ptr<Expression> ExpressionConstant::optimize() {
+ /* nothing to do */
+ return intrusive_ptr<Expression>(this);
+ }
+
+ intrusive_ptr<const Value> ExpressionConstant::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ return pValue;
+ }
+
+ void ExpressionConstant::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+
+ /*
+ For depth greater than one, do the regular thing.
+
+ Depth is one only when the constant itself is the top-level
+ value of a field; ordinarily a top-level expression is an
+ operator node, so by the time we get down to an expression
+ constant we're at a depth greater than one (counting up as we
+ go down the expression tree).
+
+ See the comment below for more on why the top level is special.
+ */
+ if (depth > 1) {
+ pValue->addToBsonObj(pBuilder, fieldName);
+ return;
+ }
+
+ /*
+ If this happens at the top level, we don't have any direct way
+ to express it. However, we may need to if constant folding
+ reduced expressions to constants, and we need to re-materialize
+ the pipeline in order to ship it to a shard server. This has
+ forced the introduction of {$const: ...}.
+ */
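+ /*
+ Example: a field that constant-folded to the number 3 is
+ re-materialized as { <fieldName>: { $const: 3 } }.
+ */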
+ BSONObjBuilder constBuilder;
+ pValue->addToBsonObj(&constBuilder, "$const");
+ pBuilder->append(fieldName, constBuilder.done());
+ }
+
+ void ExpressionConstant::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ pValue->addToBsonArray(pBuilder);
+ }
+
+ const char *ExpressionConstant::getOpName() const {
+ assert(false); // this has no name
+ return NULL;
+ }
+
+ /* ---------------------- ExpressionDayOfMonth ------------------------- */
+
+ ExpressionDayOfMonth::~ExpressionDayOfMonth() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionDayOfMonth::create() {
+ intrusive_ptr<ExpressionDayOfMonth> pExpression(new ExpressionDayOfMonth());
+ return pExpression;
+ }
+
+ ExpressionDayOfMonth::ExpressionDayOfMonth():
+ ExpressionNary() {
+ }
+
+ void ExpressionDayOfMonth::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionDayOfMonth::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_mday);
+ }
+
+ const char *ExpressionDayOfMonth::getOpName() const {
+ return "$dayOfMonth";
+ }
+
+ /* ------------------------- ExpressionDayOfWeek ----------------------------- */
+
+ ExpressionDayOfWeek::~ExpressionDayOfWeek() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionDayOfWeek::create() {
+ intrusive_ptr<ExpressionDayOfWeek> pExpression(new ExpressionDayOfWeek());
+ return pExpression;
+ }
+
+ ExpressionDayOfWeek::ExpressionDayOfWeek():
+ ExpressionNary() {
+ }
+
+ void ExpressionDayOfWeek::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionDayOfWeek::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_wday+1); // tm_wday is 0-6 (Sunday == 0); $dayOfWeek returns 1-7, as in MySQL
+ }
+
+ const char *ExpressionDayOfWeek::getOpName() const {
+ return "$dayOfWeek";
+ }
+
+ /* ------------------------- ExpressionDayOfYear ----------------------------- */
+
+ ExpressionDayOfYear::~ExpressionDayOfYear() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionDayOfYear::create() {
+ intrusive_ptr<ExpressionDayOfYear> pExpression(new ExpressionDayOfYear());
+ return pExpression;
+ }
+
+ ExpressionDayOfYear::ExpressionDayOfYear():
+ ExpressionNary() {
+ }
+
+ void ExpressionDayOfYear::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionDayOfYear::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_yday+1); // tm_yday is 0-365; $dayOfYear returns 1-366, as in MySQL
+ }
+
+ const char *ExpressionDayOfYear::getOpName() const {
+ return "$dayOfYear";
+ }
+
+ /* ----------------------- ExpressionDivide ---------------------------- */
+
+ ExpressionDivide::~ExpressionDivide() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionDivide::create() {
+ intrusive_ptr<ExpressionDivide> pExpression(new ExpressionDivide());
+ return pExpression;
+ }
+
+ ExpressionDivide::ExpressionDivide():
+ ExpressionNary() {
+ }
+
+ void ExpressionDivide::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionDivide::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(2);
+ intrusive_ptr<const Value> pLeft(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pRight(vpOperand[1]->evaluate(pDocument));
+
+ double right = pRight->coerceToDouble();
+ if (right == 0)
+ return Value::getUndefined();
+
+ double left = pLeft->coerceToDouble();
+
+ return Value::createDouble(left / right);
+ }
+
+ const char *ExpressionDivide::getOpName() const {
+ return "$divide";
+ }
+
+ /* ---------------------- ExpressionObject --------------------------- */
+
+ ExpressionObject::~ExpressionObject() {
+ }
+
+ intrusive_ptr<ExpressionObject> ExpressionObject::create() {
+ intrusive_ptr<ExpressionObject> pExpression(new ExpressionObject());
+ return pExpression;
+ }
+
+ ExpressionObject::ExpressionObject():
+ excludePaths(false),
+ path(),
+ vFieldName(),
+ vpExpression() {
+ }
+
+ intrusive_ptr<Expression> ExpressionObject::optimize() {
+ const size_t n = vpExpression.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<Expression> pE(vpExpression[i]->optimize());
+ vpExpression[i] = pE;
+ }
+
+ return intrusive_ptr<Expression>(this);
+ }
+
+ void ExpressionObject::addToDocument(
+ const intrusive_ptr<Document> &pResult,
+ const intrusive_ptr<Document> &pDocument) const {
+ const size_t pathSize = path.size();
+ set<string>::const_iterator end(path.end());
+
+ /*
+ Take care of inclusions or exclusions. Note that _id is special:
+ it is always included unless it is specifically excluded. We use
+ excludeId for that when excludePaths is false, which means we
+ are including paths.
+ */
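+ /*
+ Example: the projection { a: 1, "b.c": 1 } arrives here as the
+ inclusion path set { "a", "b" }, with a nested ExpressionObject
+ for "b" that in turn includes "c".
+ */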
+ if (pathSize) {
+ auto_ptr<FieldIterator> pIter(pDocument->createFieldIterator());
+ if (excludePaths) {
+ while(pIter->more()) {
+ pair<string, intrusive_ptr<const Value> > field(pIter->next());
+
+ /*
+ If the field in the document is not in the exclusion set,
+ add it to the result document.
+
+ Note that exclusions are only allowed on leaves, so we
+ can assume we don't have to descend recursively here.
+ */
+ if (path.find(field.first) != end)
+ continue; // we found it, so don't add it
+
+ pResult->addField(field.first, field.second);
+ }
+ }
+ else { /* !excludePaths */
+ while(pIter->more()) {
+ pair<string, intrusive_ptr<const Value> > field(
+ pIter->next());
+ /*
+ If the field in the document is in the inclusion set,
+ add it to the result document. Or, if we're not
+ excluding _id, and it is _id, include it.
+
+ Note that this could be an inclusion along a pathway,
+ so we look for an ExpressionObject in vpExpression; when
+ we find one, we populate the result with the evaluation
+ of that on the nested object, yielding relative paths.
+ This also allows us to handle intermediate arrays; if we
+ encounter one, we repeat this for each array element.
+ */
+ if (path.find(field.first) != end) {
+ /* find the Expression */
+ const size_t n = vFieldName.size();
+ size_t i;
+ Expression *pE = NULL;
+ for(i = 0; i < n; ++i) {
+ if (field.first.compare(vFieldName[i]) == 0) {
+ pE = vpExpression[i].get();
+ break;
+ }
+ }
+
+ /*
+ If we didn't find an expression, it's the last path
+ element to include.
+ */
+ if (!pE) {
+ pResult->addField(field.first, field.second);
+ continue;
+ }
+
+ ExpressionObject *pChild =
+ dynamic_cast<ExpressionObject *>(pE);
+ assert(pChild);
+
+ /*
+ Check on the type of the result object. If it's an
+ object, just walk down into that recursively, and
+ add it to the result.
+ */
+ BSONType valueType = field.second->getType();
+ if (valueType == Object) {
+ intrusive_ptr<Document> pD(
+ pChild->evaluateDocument(
+ field.second->getDocument()));
+ pResult->addField(vFieldName[i],
+ Value::createDocument(pD));
+ }
+ else if (valueType == Array) {
+ /*
+ If it's an array, we have to do the same thing,
+ but to each array element. Then, add the array
+ of results to the current document.
+ */
+ vector<intrusive_ptr<const Value> > result;
+ intrusive_ptr<ValueIterator> pVI(
+ field.second->getArray());
+ while(pVI->more()) {
+ intrusive_ptr<Document> pD(
+ pChild->evaluateDocument(
+ pVI->next()->getDocument()));
+ result.push_back(Value::createDocument(pD));
+ }
+
+ pResult->addField(vFieldName[i],
+ Value::createArray(result));
+ }
+ }
+ }
+ }
+ }
+
+ /* add any remaining fields we haven't already taken care of */
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ string fieldName(vFieldName[i]);
+
+ /* if we've already dealt with this field, above, do nothing */
+ if (path.find(fieldName) != end)
+ continue;
+
+ intrusive_ptr<const Value> pValue(
+ vpExpression[i]->evaluate(pDocument));
+
+ /*
+ Don't add non-existent values (note: different from NULL);
+ this is consistent with the existing selection syntax, which
+ doesn't force the appearance of non-existent fields.
+ */
+ if (pValue->getType() == Undefined)
+ continue;
+
+ pResult->addField(fieldName, pValue);
+ }
+ }
+
+ size_t ExpressionObject::getSizeHint(
+ const intrusive_ptr<Document> &pDocument) const {
+ size_t sizeHint = pDocument->getFieldCount();
+ const size_t pathSize = path.size();
+ if (!excludePaths)
+ sizeHint += pathSize;
+ else {
+ size_t excludeCount = pathSize;
+ if (sizeHint > excludeCount)
+ sizeHint -= excludeCount;
+ else
+ sizeHint = 0;
+ }
+
+ /* account for the additional computed fields */
+ sizeHint += vFieldName.size();
+
+ return sizeHint;
+ }
+
+ intrusive_ptr<Document> ExpressionObject::evaluateDocument(
+ const intrusive_ptr<Document> &pDocument) const {
+ /* create and populate the result */
+ intrusive_ptr<Document> pResult(
+ Document::create(getSizeHint(pDocument)));
+ addToDocument(pResult, pDocument);
+ return pResult;
+ }
+
+ intrusive_ptr<const Value> ExpressionObject::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ return Value::createDocument(evaluateDocument(pDocument));
+ }
+
+ void ExpressionObject::addField(const string &fieldName,
+ const intrusive_ptr<Expression> &pExpression) {
+ /* must have an expression */
+ assert(pExpression.get());
+
+ /* parse the field path */
+ FieldPath fieldPath(fieldName);
+ uassert(16008, str::stream() <<
+ "an expression object's field names cannot be field paths (at \"" <<
+ fieldName << "\")", fieldPath.getPathLength() == 1);
+
+ /* make sure it isn't a name we've included or excluded */
+ set<string>::iterator ex(path.find(fieldName));
+ uassert(16009, str::stream() <<
+ "can't add a field to an object expression that has already been excluded (at \"" <<
+ fieldName << "\")", ex == path.end());
+
+ /* make sure it isn't a name we've already got */
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ uassert(16010, str::stream() <<
+ "can't add the same field to an object expression more than once (at \"" <<
+ fieldName << "\")",
+ fieldName.compare(vFieldName[i]) != 0);
+ }
+
+ vFieldName.push_back(fieldName);
+ vpExpression.push_back(pExpression);
+ }
+
+ void ExpressionObject::includePath(
+ const FieldPath *pPath, size_t pathi, size_t pathn, bool excludeLast) {
+
+ /* get the current path field name */
+ string fieldName(pPath->getFieldName(pathi));
+ uassert(16011,
+ "an object expression can't include an empty field-name",
+ fieldName.length());
+
+ const size_t pathCount = path.size();
+
+ /* if this is the leaf-most object, stop */
+ if (pathi == pathn - 1) {
+ /*
+ Make sure the exclusion configuration of this node matches
+ the requested result. Or, that this is the first (determining)
+ specification.
+ */
+ uassert(16012, str::stream() <<
+ "incompatible exclusion for \"" <<
+ pPath->getPath(false) <<
+ "\" because of a prior inclusion that includes a common sub-path",
+ ((excludePaths == excludeLast) || !pathCount));
+
+ excludePaths = excludeLast; // if (!pathCount), set this
+ path.insert(fieldName);
+ return;
+ }
+
+ /* this level had better be about inclusions */
+ uassert(16013, str::stream() <<
+ "incompatible inclusion for \"" << pPath->getPath(false) <<
+ "\" because of a prior exclusion that includes a common sub-path",
+ !excludePaths);
+
+ /* see if we already know about this field */
+ const size_t n = vFieldName.size();
+ size_t i;
+ for(i = 0; i < n; ++i) {
+ if (fieldName.compare(vFieldName[i]) == 0)
+ break;
+ }
+
+ /* find the right object, and continue */
+ ExpressionObject *pChild;
+ if (i < n) {
+ /* the intermediate child already exists */
+ pChild = dynamic_cast<ExpressionObject *>(vpExpression[i].get());
+ assert(pChild);
+ }
+ else {
+ /*
+ If we get here, the intervening child isn't already there,
+ so create it.
+ */
+ intrusive_ptr<ExpressionObject> pSharedChild(
+ ExpressionObject::create());
+ path.insert(fieldName);
+ vFieldName.push_back(fieldName);
+ vpExpression.push_back(pSharedChild);
+ pChild = pSharedChild.get();
+ }
+
+ // LATER CW TODO turn this into a loop
+ pChild->includePath(pPath, pathi + 1, pathn, excludeLast);
+ }
+
+ void ExpressionObject::includePath(const string &theFieldPath) {
+ /* parse the field path */
+ FieldPath fieldPath(theFieldPath);
+ includePath(&fieldPath, 0, fieldPath.getPathLength(), false);
+ }
+
+ void ExpressionObject::excludePath(const string &theFieldPath) {
+ /* parse the field path */
+ FieldPath fieldPath(theFieldPath);
+ includePath(&fieldPath, 0, fieldPath.getPathLength(), true);
+ }
+
+ intrusive_ptr<Expression> ExpressionObject::getField(
+ const string &fieldName) const {
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ if (fieldName.compare(vFieldName[i]) == 0)
+ return vpExpression[i];
+ }
+
+ /* if we got here, we didn't find it */
+ return intrusive_ptr<Expression>();
+ }
+
+ void ExpressionObject::emitPaths(
+ BSONObjBuilder *pBuilder, vector<string> *pvPath) const {
+ if (!path.size())
+ return;
+
+ /* we use these for loops */
+ const size_t nField = vFieldName.size();
+ const size_t nPath = pvPath->size();
+
+ /*
+ We can iterate over the inclusion/exclusion paths in the set's
+ own (sorted) order because that order doesn't affect the order
+ in which fields are listed in the result. That comes from the
+ underlying Document they are fetched from.
+ */
+ for(set<string>::const_iterator end(path.end()),
+ iter(path.begin()); iter != end; ++iter) {
+
+ /* find the matching field description */
+ size_t iField = 0;
+ for(; iField < nField; ++iField) {
+ if (iter->compare(vFieldName[iField]) == 0)
+ break;
+ }
+
+ if (iField == nField) {
+ /*
+ If we didn't find a matching field description, this is the
+ leaf, so add the path.
+ */
+ stringstream ss;
+
+ for(size_t iPath = 0; iPath < nPath; ++iPath)
+ ss << (*pvPath)[iPath] << ".";
+ ss << *iter;
+
+ pBuilder->append(ss.str(), !excludePaths);
+ }
+ else {
+ /*
+ If we found a matching field description, then we need to
+ descend into the next level.
+ */
+ Expression *pE = vpExpression[iField].get();
+ ExpressionObject *pEO = dynamic_cast<ExpressionObject *>(pE);
+ assert(pEO);
+
+ /*
+ Add the current field name to the path being built up,
+ then go down into the next level.
+ */
+ PathPusher pathPusher(pvPath, vFieldName[iField]);
+ pEO->emitPaths(pBuilder, pvPath);
+ }
+ }
+ }
+
+ void ExpressionObject::documentToBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const {
+
+ /* emit any inclusion/exclusion paths */
+ vector<string> vPath;
+ emitPaths(pBuilder, &vPath);
+
+ /* then add any expressions */
+ const size_t nField = vFieldName.size();
+ const set<string>::const_iterator pathEnd(path.end());
+ for(size_t iField = 0; iField < nField; ++iField) {
+ string fieldName(vFieldName[iField]);
+
+ /* if we already took care of this, don't repeat it */
+ if (path.find(fieldName) != pathEnd)
+ continue;
+
+ vpExpression[iField]->addToBsonObj(pBuilder, fieldName, depth + 1);
+ }
+ }
+
+ void ExpressionObject::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+
+ BSONObjBuilder objBuilder;
+ documentToBson(&objBuilder, depth);
+ pBuilder->append(fieldName, objBuilder.done());
+ }
+
+ void ExpressionObject::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+
+ BSONObjBuilder objBuilder;
+ documentToBson(&objBuilder, depth);
+ pBuilder->append(objBuilder.done());
+ }
+
+ /* --------------------- ExpressionFieldPath --------------------------- */
+
+ ExpressionFieldPath::~ExpressionFieldPath() {
+ }
+
+ intrusive_ptr<ExpressionFieldPath> ExpressionFieldPath::create(
+ const string &fieldPath) {
+ intrusive_ptr<ExpressionFieldPath> pExpression(
+ new ExpressionFieldPath(fieldPath));
+ return pExpression;
+ }
+
+ ExpressionFieldPath::ExpressionFieldPath(
+ const string &theFieldPath):
+ fieldPath(theFieldPath) {
+ }
+
+ intrusive_ptr<Expression> ExpressionFieldPath::optimize() {
+ /* nothing can be done for these */
+ return intrusive_ptr<Expression>(this);
+ }
+
+ intrusive_ptr<const Value> ExpressionFieldPath::evaluatePath(
+ size_t index, const size_t pathLength,
+ intrusive_ptr<Document> pDocument) const {
+ intrusive_ptr<const Value> pValue; /* the return value */
+
+ pValue = pDocument->getValue(fieldPath.getFieldName(index));
+
+ /* if the field doesn't exist, quit with an undefined value */
+ if (!pValue.get())
+ return Value::getUndefined();
+
+ /* if we've hit the end of the path, stop */
+ ++index;
+ if (index >= pathLength)
+ return pValue;
+
+ /*
+ We're diving deeper. If the value was null, return null.
+ */
+ BSONType type = pValue->getType();
+ if ((type == Undefined) || (type == jstNULL))
+ return Value::getUndefined();
+
+ if (type == Object) {
+ /* extract from the next level down */
+ return evaluatePath(index, pathLength, pValue->getDocument());
+ }
+
+ if (type == Array) {
+ /*
+ We're going to repeat this for each member of the array,
+ building up a new array as we go.
+ */
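+ /*
+ Example: for the path "a.b" over { a: [ { b: 1 }, { b: 2 } ] },
+ this produces the array [ 1, 2 ].
+ */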
+ vector<intrusive_ptr<const Value> > result;
+ intrusive_ptr<ValueIterator> pIter(pValue->getArray());
+ while(pIter->more()) {
+ intrusive_ptr<const Value> pItem(pIter->next());
+ BSONType iType = pItem->getType();
+ if ((iType == Undefined) || (iType == jstNULL)) {
+ result.push_back(pItem);
+ continue;
+ }
+
+ uassert(16014, str::stream() <<
+ "the element \"" << fieldPath.getFieldName(index) <<
+ "\" along the dotted path \"" <<
+ fieldPath.getPath(false) <<
+ "\" is not an object, and cannot be navigated",
+ iType == Object);
+ intrusive_ptr<const Value> itemResult(
+ evaluatePath(index, pathLength, pItem->getDocument()));
+ result.push_back(itemResult);
+ }
+
+ return Value::createArray(result);
+ }
+
+ uassert(16015, str::stream() <<
+ "can't navigate into value of type " << type <<
+ " at \"" << fieldPath.getFieldName(index) <<
+ "\" in dotted path \"" << fieldPath.getPath(false) << "\"",
+ false);
+ return intrusive_ptr<const Value>();
+ }
+
+ intrusive_ptr<const Value> ExpressionFieldPath::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ return evaluatePath(0, fieldPath.getPathLength(), pDocument);
+ }
+
+ void ExpressionFieldPath::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+ pBuilder->append(fieldName, fieldPath.getPath(true));
+ }
+
+ void ExpressionFieldPath::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ pBuilder->append(getFieldPath(true));
+ }
+
+ /* -------------------- ExpressionFieldRange --------------------------- */
+
+ ExpressionFieldRange::~ExpressionFieldRange() {
+ }
+
+ intrusive_ptr<Expression> ExpressionFieldRange::optimize() {
+ /* if there is no range to match, this will never evaluate true */
+ if (!pRange.get())
+ return ExpressionConstant::create(Value::getFalse());
+
+ /*
+ If we ended up with a range that is unbounded on both ends,
+ anything matches. I don't know how that can happen, given
+ intersect()'s interface, but here it is, just in case.
+ */
+ if (!pRange->pBottom.get() && !pRange->pTop.get())
+ return ExpressionConstant::create(Value::getTrue());
+
+ /*
+ In all other cases, we have to test candidate values. The
+ intersect() method has already optimized those tests, so there
+ aren't any more optimizations to look for here.
+ */
+ return intrusive_ptr<Expression>(this);
+ }
+
+ intrusive_ptr<const Value> ExpressionFieldRange::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ /* if there's no range, there can't be a match */
+ if (!pRange.get())
+ return Value::getFalse();
+
+ /* get the value of the specified field */
+ intrusive_ptr<const Value> pValue(pFieldPath->evaluate(pDocument));
+
+ /* see if it fits within any of the ranges */
+ if (pRange->contains(pValue))
+ return Value::getTrue();
+
+ return Value::getFalse();
+ }
+
+ void ExpressionFieldRange::addToBson(
+ Builder *pBuilder, unsigned depth) const {
+ if (!pRange.get()) {
+ /* nothing will satisfy this predicate */
+ pBuilder->append(false);
+ return;
+ }
+
+ if (!pRange->pTop.get() && !pRange->pBottom.get()) {
+ /* any value will satisfy this predicate */
+ pBuilder->append(true);
+ return;
+ }
+
+ if (pRange->pTop.get() == pRange->pBottom.get()) {
+ BSONArrayBuilder operands;
+ pFieldPath->addToBsonArray(&operands, depth);
+ pRange->pTop->addToBsonArray(&operands);
+
+ BSONObjBuilder equals;
+ equals.append("$eq", operands.arr());
+ pBuilder->append(&equals);
+ return;
+ }
+
+ BSONObjBuilder leftOperator;
+ if (pRange->pBottom.get()) {
+ BSONArrayBuilder leftOperands;
+ pFieldPath->addToBsonArray(&leftOperands, depth);
+ pRange->pBottom->addToBsonArray(&leftOperands);
+ leftOperator.append(
+ (pRange->bottomOpen ? "$gt" : "$gte"),
+ leftOperands.arr());
+
+ if (!pRange->pTop.get()) {
+ pBuilder->append(&leftOperator);
+ return;
+ }
+ }
+
+ BSONObjBuilder rightOperator;
+ if (pRange->pTop.get()) {
+ BSONArrayBuilder rightOperands;
+ pFieldPath->addToBsonArray(&rightOperands, depth);
+ pRange->pTop->addToBsonArray(&rightOperands);
+ rightOperator.append(
+ (pRange->topOpen ? "$lt" : "$lte"),
+ rightOperands.arr());
+
+ if (!pRange->pBottom.get()) {
+ pBuilder->append(&rightOperator);
+ return;
+ }
+ }
+
+ BSONArrayBuilder andOperands;
+ andOperands.append(leftOperator.done());
+ andOperands.append(rightOperator.done());
+ BSONObjBuilder andOperator;
+ andOperator.append("$and", andOperands.arr());
+ pBuilder->append(&andOperator);
+ }
+
+ void ExpressionFieldRange::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+ BuilderObj builder(pBuilder, fieldName);
+ addToBson(&builder, depth);
+ }
+
+ void ExpressionFieldRange::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ BuilderArray builder(pBuilder);
+ addToBson(&builder, depth);
+ }
+
+ void ExpressionFieldRange::toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const {
+ assert(pRange.get()); // otherwise, we can't do anything
+
+ /* if there are no endpoints, then every value is accepted */
+ if (!pRange->pBottom.get() && !pRange->pTop.get())
+ return; // nothing to add to the predicate
+
+ /* we're going to need the field path */
+ string fieldPath(pFieldPath->getFieldPath(false));
+
+ BSONObjBuilder range;
+ if (pRange->pBottom.get()) {
+ /* the test for equality doesn't generate a subobject */
+ if (pRange->pBottom.get() == pRange->pTop.get()) {
+ pRange->pBottom->addToBsonObj(pBuilder, fieldPath);
+ return;
+ }
+
+ pRange->pBottom->addToBsonObj(
+ &range, (pRange->bottomOpen ? "$gt" : "$gte"));
+ }
+
+ if (pRange->pTop.get()) {
+ pRange->pTop->addToBsonObj(
+ &range, (pRange->topOpen ? "$lt" : "$lte"));
+ }
+
+ pBuilder->append(fieldPath, range.done());
+ }
+
+ intrusive_ptr<ExpressionFieldRange> ExpressionFieldRange::create(
+ const intrusive_ptr<ExpressionFieldPath> &pFieldPath, CmpOp cmpOp,
+ const intrusive_ptr<const Value> &pValue) {
+ intrusive_ptr<ExpressionFieldRange> pE(
+ new ExpressionFieldRange(pFieldPath, cmpOp, pValue));
+ return pE;
+ }
+
+ ExpressionFieldRange::ExpressionFieldRange(
+ const intrusive_ptr<ExpressionFieldPath> &pTheFieldPath, CmpOp cmpOp,
+ const intrusive_ptr<const Value> &pValue):
+ pFieldPath(pTheFieldPath),
+ pRange(new Range(cmpOp, pValue)) {
+ }
+
+ void ExpressionFieldRange::intersect(
+ CmpOp cmpOp, const intrusive_ptr<const Value> &pValue) {
+
+ /* create the new range */
+ scoped_ptr<Range> pNew(new Range(cmpOp, pValue));
+
+ /*
+ Intersect the new range with the existing one. If the two
+ overlap, the result is the narrower overlapping region; if they
+ are disjoint, intersect() returns NULL, and nothing can match.
+ */
+ pRange.reset(pRange->intersect(pNew.get()));
+ }
+
+ ExpressionFieldRange::Range::Range(
+ CmpOp cmpOp, const intrusive_ptr<const Value> &pValue):
+ bottomOpen(false),
+ topOpen(false),
+ pBottom(),
+ pTop() {
+ switch(cmpOp) {
+ case NE:
+ bottomOpen = topOpen = true;
+ /* FALLTHROUGH */
+ case EQ:
+ pBottom = pTop = pValue;
+ break;
+
+ case GT:
+ bottomOpen = true;
+ /* FALLTHROUGH */
+ case GTE:
+ topOpen = true;
+ pBottom = pValue;
+ break;
+
+ case LT:
+ topOpen = true;
+ /* FALLTHROUGH */
+ case LTE:
+ bottomOpen = true;
+ pTop = pValue;
+ break;
+
+ case CMP:
+ assert(false); // not allowed
+ break;
+ }
+ }
+
+ ExpressionFieldRange::Range::Range(const Range &rRange):
+ bottomOpen(rRange.bottomOpen),
+ topOpen(rRange.topOpen),
+ pBottom(rRange.pBottom),
+ pTop(rRange.pTop) {
+ }
+
+ ExpressionFieldRange::Range::Range(
+ const intrusive_ptr<const Value> &pTheBottom, bool theBottomOpen,
+ const intrusive_ptr<const Value> &pTheTop, bool theTopOpen):
+ bottomOpen(theBottomOpen),
+ topOpen(theTopOpen),
+ pBottom(pTheBottom),
+ pTop(pTheTop) {
+ }
+
+ ExpressionFieldRange::Range *ExpressionFieldRange::Range::intersect(
+ const Range *pRange) const {
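+ /*
+ Example: intersecting [3, inf) (from $gte) with (-inf, 7) (from
+ $lt) yields [3, 7); intersecting [3, inf) with (-inf, 1) yields
+ an empty intersection, so this returns NULL.
+ */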
+ /*
+ Find the max of the bottom end of the ranges.
+
+ Start by assuming the maximum is from pRange. Then, if we have
+ values of our own, see if they're greater.
+ */
+ intrusive_ptr<const Value> pMaxBottom(pRange->pBottom);
+ bool maxBottomOpen = pRange->bottomOpen;
+ if (pBottom.get()) {
+ if (!pRange->pBottom.get()) {
+ pMaxBottom = pBottom;
+ maxBottomOpen = bottomOpen;
+ }
+ else {
+ const int cmp = Value::compare(pBottom, pRange->pBottom);
+ if (cmp == 0)
+ maxBottomOpen = bottomOpen || pRange->bottomOpen;
+ else if (cmp > 0) {
+ pMaxBottom = pBottom;
+ maxBottomOpen = bottomOpen;
+ }
+ }
+ }
+
+ /*
+ Find the minimum of the tops of the ranges.
+
+ Start by assuming the minimum is from pRange. Then, if we have
+ values of our own, see if they are less.
+ */
+ intrusive_ptr<const Value> pMinTop(pRange->pTop);
+ bool minTopOpen = pRange->topOpen;
+ if (pTop.get()) {
+ if (!pRange->pTop.get()) {
+ pMinTop = pTop;
+ minTopOpen = topOpen;
+ }
+ else {
+ const int cmp = Value::compare(pTop, pRange->pTop);
+ if (cmp == 0)
+ minTopOpen = topOpen || pRange->topOpen;
+ else if (cmp < 0) {
+ pMinTop = pTop;
+ minTopOpen = topOpen;
+ }
+ }
+ }
+
+ /*
+ If the intersections didn't create a disjoint set, create the
+ new range.
+ */
+ if (Value::compare(pMaxBottom, pMinTop) <= 0)
+ return new Range(pMaxBottom, maxBottomOpen, pMinTop, minTopOpen);
+
+ /* if we got here, the intersection is empty */
+ return NULL;
+ }
+
+ bool ExpressionFieldRange::Range::contains(
+ const intrusive_ptr<const Value> &pValue) const {
+ if (pBottom.get()) {
+ const int cmp = Value::compare(pValue, pBottom);
+ if (cmp < 0)
+ return false;
+ if (bottomOpen && (cmp == 0))
+ return false;
+ }
+
+ if (pTop.get()) {
+ const int cmp = Value::compare(pValue, pTop);
+ if (cmp > 0)
+ return false;
+ if (topOpen && (cmp == 0))
+ return false;
+ }
+
+ return true;
+ }
+
+ /* ------------------------- ExpressionMinute ----------------------------- */
+
+ ExpressionMinute::~ExpressionMinute() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionMinute::create() {
+ intrusive_ptr<ExpressionMinute> pExpression(new ExpressionMinute());
+ return pExpression;
+ }
+
+ ExpressionMinute::ExpressionMinute():
+ ExpressionNary() {
+ }
+
+ void ExpressionMinute::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionMinute::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_min);
+ }
+
+ const char *ExpressionMinute::getOpName() const {
+ return "$minute";
+ }
+
+ /* ----------------------- ExpressionMod ---------------------------- */
+
+ ExpressionMod::~ExpressionMod() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionMod::create() {
+ intrusive_ptr<ExpressionMod> pExpression(new ExpressionMod());
+ return pExpression;
+ }
+
+ ExpressionMod::ExpressionMod():
+ ExpressionNary() {
+ }
+
+ void ExpressionMod::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionMod::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ BSONType productType;
+ checkArgCount(2);
+ intrusive_ptr<const Value> pLeft(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pRight(vpOperand[1]->evaluate(pDocument));
+
+ productType = Value::getWidestNumeric(pRight->getType(), pLeft->getType());
+
+ long long right = pRight->coerceToLong();
+ if (right == 0)
+ return Value::getUndefined();
+
+ long long left = pLeft->coerceToLong();
+ if (productType == NumberLong)
+ return Value::createLong(left % right);
+ return Value::createInt((int)(left % right));
+ }
+
+ const char *ExpressionMod::getOpName() const {
+ return "$mod";
+ }
+
+ /* ------------------------- ExpressionMonth ----------------------------- */
+
+ ExpressionMonth::~ExpressionMonth() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionMonth::create() {
+ intrusive_ptr<ExpressionMonth> pExpression(new ExpressionMonth());
+ return pExpression;
+ }
+
+ ExpressionMonth::ExpressionMonth():
+ ExpressionNary() {
+ }
+
+ void ExpressionMonth::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionMonth::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_mon+1); // tm_mon is 0-11; $month returns 1-12, as in MySQL
+ }
+
+ const char *ExpressionMonth::getOpName() const {
+ return "$month";
+ }
+
+ /* ------------------------- ExpressionMultiply ----------------------------- */
+
+ ExpressionMultiply::~ExpressionMultiply() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionMultiply::create() {
+ intrusive_ptr<ExpressionMultiply> pExpression(new ExpressionMultiply());
+ return pExpression;
+ }
+
+ ExpressionMultiply::ExpressionMultiply():
+ ExpressionNary() {
+ }
+
+ intrusive_ptr<const Value> ExpressionMultiply::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ /*
+ We'll try to return the narrowest possible result value. To do that
+ without creating intermediate Values, do the arithmetic for double
+ and integral types in parallel, tracking the current narrowest
+ type.
+ */
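+ /*
+ Example: { $multiply: [ 2, 3 ] } stays NumberInt and yields 6,
+ while { $multiply: [ 2, 3.5 ] } widens to NumberDouble and
+ yields 7.0.
+ */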
+ double doubleProduct = 1;
+ long long longProduct = 1;
+ BSONType productType = NumberInt;
+
+ const size_t n = vpOperand.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpOperand[i]->evaluate(pDocument));
+
+ productType = Value::getWidestNumeric(productType, pValue->getType());
+ doubleProduct *= pValue->coerceToDouble();
+ longProduct *= pValue->coerceToLong();
+ }
+
+ if (productType == NumberDouble)
+ return Value::createDouble(doubleProduct);
+ if (productType == NumberLong)
+ return Value::createLong(longProduct);
+ return Value::createInt((int)longProduct);
+ }
+
+ const char *ExpressionMultiply::getOpName() const {
+ return "$multiply";
+ }
+
+ intrusive_ptr<ExpressionNary> (*ExpressionMultiply::getFactory() const)() {
+ return ExpressionMultiply::create;
+ }
+
+ /* ------------------------- ExpressionHour ----------------------------- */
+
+ ExpressionHour::~ExpressionHour() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionHour::create() {
+ intrusive_ptr<ExpressionHour> pExpression(new ExpressionHour());
+ return pExpression;
+ }
+
+ ExpressionHour::ExpressionHour():
+ ExpressionNary() {
+ }
+
+ void ExpressionHour::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionHour::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_hour);
+ }
+
+ const char *ExpressionHour::getOpName() const {
+ return "$hour";
+ }
+
+ /* ----------------------- ExpressionIfNull ---------------------------- */
+
+ ExpressionIfNull::~ExpressionIfNull() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionIfNull::create() {
+ intrusive_ptr<ExpressionIfNull> pExpression(new ExpressionIfNull());
+ return pExpression;
+ }
+
+ ExpressionIfNull::ExpressionIfNull():
+ ExpressionNary() {
+ }
+
+ void ExpressionIfNull::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
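+ /*
+ $ifNull returns its first operand unless that evaluates to
+ undefined or null, in which case it returns the second operand.
+ */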
+ intrusive_ptr<const Value> ExpressionIfNull::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(2);
+ intrusive_ptr<const Value> pLeft(vpOperand[0]->evaluate(pDocument));
+ BSONType leftType = pLeft->getType();
+
+ if ((leftType != Undefined) && (leftType != jstNULL))
+ return pLeft;
+
+ intrusive_ptr<const Value> pRight(vpOperand[1]->evaluate(pDocument));
+ return pRight;
+ }
+
+ const char *ExpressionIfNull::getOpName() const {
+ return "$ifNull";
+ }
+
+ /* ------------------------ ExpressionNary ----------------------------- */
+
+ ExpressionNary::ExpressionNary():
+ vpOperand() {
+ }
+
+ intrusive_ptr<Expression> ExpressionNary::optimize() {
+ unsigned constCount = 0; // count of constant operands
+ unsigned stringCount = 0; // count of constant string operands
+ const size_t n = vpOperand.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<Expression> pNew(vpOperand[i]->optimize());
+
+ /* substitute the optimized expression */
+ vpOperand[i] = pNew;
+
+ /* check to see if the result was a constant */
+ const ExpressionConstant *pConst =
+ dynamic_cast<ExpressionConstant *>(pNew.get());
+ if (pConst) {
+ ++constCount;
+ if (pConst->getValue()->getType() == String)
+ ++stringCount;
+ }
+ }
+
+ /*
+ If all the operands are constant, we can replace this expression
+ with a constant. We can find the value by evaluating this
+ expression over a NULL Document because evaluating the
+ ExpressionConstant never refers to the argument Document.
+ */
+ if (constCount == n) {
+ intrusive_ptr<const Value> pResult(
+ evaluate(intrusive_ptr<Document>()));
+ intrusive_ptr<Expression> pReplacement(
+ ExpressionConstant::create(pResult));
+ return pReplacement;
+ }
+
+ /*
+ If there are any strings, we can't re-arrange anything, so stop
+ now.
+
+ LATER: we could concatenate adjacent strings as a special case.
+ */
+ if (stringCount)
+ return intrusive_ptr<Expression>(this);
+
+ /*
+ If there's no more than one constant, then we can't do any
+ constant folding, so don't bother going any further.
+ */
+ if (constCount <= 1)
+ return intrusive_ptr<Expression>(this);
+
+ /*
+ If the operator isn't commutative or associative, there's nothing
+ more we can do. We test that by seeing if we can get a factory;
+ if we can, we can use it to construct a temporary expression which
+ we'll evaluate to collapse as many constants as we can down to
+ a single one.
+ */
+ intrusive_ptr<ExpressionNary> (*const pFactory)() = getFactory();
+ if (!pFactory)
+ return intrusive_ptr<Expression>(this);
+
+ /*
+ Create a new Expression that will be the replacement for this one.
+ We actually create two: one to hold constant expressions, and
+ one to hold non-constants. Once we've got these, we evaluate
+ the constant expression to produce a single value, as above.
+ We then add this operand to the end of the non-constant expression,
+ and return that.
+ */
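+ /*
+ Example: { $add: [ "$a", 1, 2, { $add: [ "$b", 3 ] } ] } is
+ flattened to the operands "$a" and "$b" plus the constants
+ 1, 2, and 3; the constants then fold to 6, leaving
+ { $add: [ "$a", "$b", 6 ] }.
+ */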
+ intrusive_ptr<ExpressionNary> pNew((*pFactory)());
+ intrusive_ptr<ExpressionNary> pConst((*pFactory)());
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<Expression> pE(vpOperand[i]);
+ if (dynamic_cast<ExpressionConstant *>(pE.get()))
+ pConst->addOperand(pE);
+ else {
+ /*
+ If the child operand is the same type as this, then we can
+ extract its operands and inline them here because we already
+ know this is commutative and associative because it has a
+ factory. We can detect sameness of the child operator by
+ checking for equality of the factory pointers.
+
+ Note we don't have to do this recursively, because we
+ called optimize() on all the children first thing in
+ this call to optimize().
+ */
+ ExpressionNary *pNary =
+ dynamic_cast<ExpressionNary *>(pE.get());
+ if (!pNary)
+ pNew->addOperand(pE);
+ else {
+ intrusive_ptr<ExpressionNary> (*const pChildFactory)() =
+ pNary->getFactory();
+ if (pChildFactory != pFactory)
+ pNew->addOperand(pE);
+ else {
+ /* same factory, so flatten */
+ size_t nChild = pNary->vpOperand.size();
+ for(size_t iChild = 0; iChild < nChild; ++iChild) {
+ intrusive_ptr<Expression> pCE(
+ pNary->vpOperand[iChild]);
+ if (dynamic_cast<ExpressionConstant *>(pCE.get()))
+ pConst->addOperand(pCE);
+ else
+ pNew->addOperand(pCE);
+ }
+ }
+ }
+ }
+ }
+
+ /*
+ If there was only one constant, add it to the end of the expression
+ operand vector.
+ */
+ if (pConst->vpOperand.size() == 1)
+ pNew->addOperand(pConst->vpOperand[0]);
+ else if (pConst->vpOperand.size() > 1) {
+ /*
+ If there was more than one constant, collapse all the constants
+ together before adding the result to the end of the expression
+ operand vector.
+ */
+ intrusive_ptr<const Value> pResult(
+ pConst->evaluate(intrusive_ptr<Document>()));
+ pNew->addOperand(ExpressionConstant::create(pResult));
+ }
+
+ return pNew;
+ }
+
+ void ExpressionNary::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ vpOperand.push_back(pExpression);
+ }
+
+ intrusive_ptr<ExpressionNary> (*ExpressionNary::getFactory() const)() {
+ return NULL;
+ }
+
+ void ExpressionNary::toBson(
+ BSONObjBuilder *pBuilder, const char *pOpName, unsigned depth) const {
+ const size_t nOperand = vpOperand.size();
+ assert(nOperand > 0);
+ if (nOperand == 1) {
+ vpOperand[0]->addToBsonObj(pBuilder, pOpName, depth + 1);
+ return;
+ }
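+ /*
+ Example: a single operand serializes without an array, as in
+ { $not: "$a" }; multiple operands serialize as an array, as in
+ { $add: [ "$a", 1 ] }.
+ */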
+
+ /* build up the array */
+ BSONArrayBuilder arrBuilder;
+ for(size_t i = 0; i < nOperand; ++i)
+ vpOperand[i]->addToBsonArray(&arrBuilder, depth + 1);
+
+ pBuilder->append(pOpName, arrBuilder.arr());
+ }
+
+ void ExpressionNary::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+ BSONObjBuilder exprBuilder;
+ toBson(&exprBuilder, getOpName(), depth);
+ pBuilder->append(fieldName, exprBuilder.done());
+ }
+
+ void ExpressionNary::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ BSONObjBuilder exprBuilder;
+ toBson(&exprBuilder, getOpName(), depth);
+ pBuilder->append(exprBuilder.done());
+ }
+
+ void ExpressionNary::checkArgLimit(unsigned maxArgs) const {
+ uassert(15993, str::stream() << getOpName() <<
+ " only takes " << maxArgs <<
+ " operand" << (maxArgs == 1 ? "" : "s"),
+ vpOperand.size() < maxArgs);
+ }
+
+ void ExpressionNary::checkArgCount(unsigned reqArgs) const {
+ uassert(15997, str::stream() << getOpName() <<
+ ": insufficient operands; " << reqArgs <<
+ " required, only got " << vpOperand.size(),
+ vpOperand.size() == reqArgs);
+ }
+
+ /* ----------------------- ExpressionNoOp ------------------------------ */
+
+ ExpressionNoOp::~ExpressionNoOp() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionNoOp::create() {
+ intrusive_ptr<ExpressionNoOp> pExpression(new ExpressionNoOp());
+ return pExpression;
+ }
+
+ intrusive_ptr<Expression> ExpressionNoOp::optimize() {
+ checkArgCount(1);
+ intrusive_ptr<Expression> pR(vpOperand[0]->optimize());
+ return pR;
+ }
+
+ ExpressionNoOp::ExpressionNoOp():
+ ExpressionNary() {
+ }
+
+ void ExpressionNoOp::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionNoOp::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pValue(vpOperand[0]->evaluate(pDocument));
+ return pValue;
+ }
+
+ const char *ExpressionNoOp::getOpName() const {
+ return "$noOp";
+ }
+
+ /* ------------------------- ExpressionNot ----------------------------- */
+
+ ExpressionNot::~ExpressionNot() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionNot::create() {
+ intrusive_ptr<ExpressionNot> pExpression(new ExpressionNot());
+ return pExpression;
+ }
+
+ ExpressionNot::ExpressionNot():
+ ExpressionNary() {
+ }
+
+ void ExpressionNot::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionNot::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pOp(vpOperand[0]->evaluate(pDocument));
+
+ bool b = pOp->coerceToBool();
+ if (b)
+ return Value::getFalse();
+ return Value::getTrue();
+ }
+
+ const char *ExpressionNot::getOpName() const {
+ return "$not";
+ }
+
+ /* -------------------------- ExpressionOr ----------------------------- */
+
+ ExpressionOr::~ExpressionOr() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionOr::create() {
+ intrusive_ptr<ExpressionNary> pExpression(new ExpressionOr());
+ return pExpression;
+ }
+
+ ExpressionOr::ExpressionOr():
+ ExpressionNary() {
+ }
+
+ intrusive_ptr<const Value> ExpressionOr::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ const size_t n = vpOperand.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpOperand[i]->evaluate(pDocument));
+ if (pValue->coerceToBool())
+ return Value::getTrue();
+ }
+
+ return Value::getFalse();
+ }
+
+ void ExpressionOr::toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const {
+ /* matcher $or takes an array of sub-predicates, so build one
+ object per operand and collect them into an array */
+ BSONArrayBuilder opArray;
+ const size_t n = vpOperand.size();
+ for(size_t i = 0; i < n; ++i) {
+ BSONObjBuilder opBuilder;
+ vpOperand[i]->toMatcherBson(&opBuilder, depth + 1);
+ opArray.append(opBuilder.done());
+ }
+
+ pBuilder->append("$or", opArray.arr());
+ }
+
+ intrusive_ptr<ExpressionNary> (*ExpressionOr::getFactory() const)() {
+ return ExpressionOr::create;
+ }
+
+ intrusive_ptr<Expression> ExpressionOr::optimize() {
+ /* optimize the disjunction as much as possible */
+ intrusive_ptr<Expression> pE(ExpressionNary::optimize());
+
+ /* if the result isn't a disjunction, we can't do anything */
+ ExpressionOr *pOr = dynamic_cast<ExpressionOr *>(pE.get());
+ if (!pOr)
+ return pE;
+
+ /*
+ Check the last argument on the result; if it's not constant (as
+ promised by ExpressionNary::optimize()), then there's nothing
+ we can do.
+ */
+ const size_t n = pOr->vpOperand.size();
+ intrusive_ptr<Expression> pLast(pOr->vpOperand[n - 1]);
+ const ExpressionConstant *pConst =
+ dynamic_cast<ExpressionConstant *>(pLast.get());
+ if (!pConst)
+ return pE;
+
+ /*
+ Evaluate and coerce the last argument to a boolean. If it's true,
+ then we can replace this entire expression.
+ */
+ bool last = pLast->evaluate(intrusive_ptr<Document>())->coerceToBool();
+ if (last) {
+ intrusive_ptr<ExpressionConstant> pFinal(
+ ExpressionConstant::create(Value::getTrue()));
+ return pFinal;
+ }
+
+ /*
+ If we got here, the final operand was false, so we don't need it
+ anymore. If there was only one other operand, we don't need the
+ disjunction either. Note we still need to keep the promise that
+ the result will be a boolean.
+ */
+ if (n == 2) {
+ intrusive_ptr<Expression> pFinal(
+ ExpressionCoerceToBool::create(pOr->vpOperand[0]));
+ return pFinal;
+ }
+
+ /*
+ Remove the final "false" value, and return the new expression.
+ */
+ pOr->vpOperand.resize(n - 1);
+ return pE;
+ }
+
+ const char *ExpressionOr::getOpName() const {
+ return "$or";
+ }
+
+ /* ------------------------- ExpressionSecond ----------------------------- */
+
+ ExpressionSecond::~ExpressionSecond() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionSecond::create() {
+ intrusive_ptr<ExpressionSecond> pExpression(new ExpressionSecond());
+ return pExpression;
+ }
+
+ ExpressionSecond::ExpressionSecond():
+ ExpressionNary() {
+ }
+
+ void ExpressionSecond::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionSecond::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_sec);
+ }
+
+ const char *ExpressionSecond::getOpName() const {
+ return "$second";
+ }
+
+ /* ----------------------- ExpressionStrcasecmp ---------------------------- */
+
+ ExpressionStrcasecmp::~ExpressionStrcasecmp() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionStrcasecmp::create() {
+ intrusive_ptr<ExpressionStrcasecmp> pExpression(new ExpressionStrcasecmp());
+ return pExpression;
+ }
+
+ ExpressionStrcasecmp::ExpressionStrcasecmp():
+ ExpressionNary() {
+ }
+
+ void ExpressionStrcasecmp::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionStrcasecmp::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(2);
+ intrusive_ptr<const Value> pString1(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pString2(vpOperand[1]->evaluate(pDocument));
+
+ /* boost::iequals only tests equality (it returns a bool, not an ordering), so allocate upper-case copies and compare those */
+ string str1 = boost::to_upper_copy( pString1->coerceToString() );
+ string str2 = boost::to_upper_copy( pString2->coerceToString() );
+ int result = str1.compare(str2);
+
+ if (result == 0)
+ return Value::getZero();
+ if (result > 0)
+ return Value::getOne();
+ return Value::getMinusOne();
+ }
+
+ const char *ExpressionStrcasecmp::getOpName() const {
+ return "$strcasecmp";
+ }
+
+ /* ----------------------- ExpressionSubstr ---------------------------- */
+
+ ExpressionSubstr::~ExpressionSubstr() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionSubstr::create() {
+ intrusive_ptr<ExpressionSubstr> pExpression(new ExpressionSubstr());
+ return pExpression;
+ }
+
+ ExpressionSubstr::ExpressionSubstr():
+ ExpressionNary() {
+ }
+
+ void ExpressionSubstr::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(3);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionSubstr::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(3);
+ intrusive_ptr<const Value> pString(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pLower(vpOperand[1]->evaluate(pDocument));
+ intrusive_ptr<const Value> pLength(vpOperand[2]->evaluate(pDocument));
+
+ string str = pString->coerceToString();
+ uassert(16034, str::stream() << getOpName() <<
+ ": starting index must be a numeric type (is BSON type " <<
+ pLower->getType() << ")",
+ (pLower->getType() == NumberInt
+ || pLower->getType() == NumberLong
+ || pLower->getType() == NumberDouble));
+ uassert(16035, str::stream() << getOpName() <<
+ ": length must be a numeric type (is BSON type " <<
+ pLength->getType() << ")",
+ (pLength->getType() == NumberInt
+ || pLength->getType() == NumberLong
+ || pLength->getType() == NumberDouble));
+ string::size_type lower = static_cast< string::size_type >( pLower->coerceToLong() );
+ string::size_type length = static_cast< string::size_type >( pLength->coerceToLong() );
+ return Value::createString( str.substr(lower, length) );
+ }
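+
+ /*
+ For example (a sketch, not from the original source):
+ {$substr: ["$desc", 0, 5]} returns the first five bytes of the "desc"
+ field. The offsets are byte indexes into the coerced std::string, so
+ a multi-byte UTF-8 character can be split in the middle.
+ */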
+
+ const char *ExpressionSubstr::getOpName() const {
+ return "$substr";
+ }
+
+ /* ----------------------- ExpressionSubtract ---------------------------- */
+
+ ExpressionSubtract::~ExpressionSubtract() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionSubtract::create() {
+ intrusive_ptr<ExpressionSubtract> pExpression(new ExpressionSubtract());
+ return pExpression;
+ }
+
+ ExpressionSubtract::ExpressionSubtract():
+ ExpressionNary() {
+ }
+
+ void ExpressionSubtract::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionSubtract::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(2);
+ intrusive_ptr<const Value> pLeft(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pRight(vpOperand[1]->evaluate(pDocument));
+ if (pLeft->getType() == Date) {
+ long long right;
+ long long left = pLeft->coerceToDate();
+ if (pRight->getType() == Date)
+ right = pRight->coerceToDate();
+ else
+ /* a numeric operand is interpreted as a number of days */
+ right = static_cast<long long>(pRight->coerceToDouble()*24*60*60*1000);
+ return Value::createDate(Date_t(left-right));
+ }
+
+ uassert(15996, "cannot subtract a date from a non-date",
+ pRight->getType() != Date);
+
+ const BSONType productType = Value::getWidestNumeric(
+ pRight->getType(), pLeft->getType());
+
+ if (productType == NumberDouble) {
+ double right = pRight->coerceToDouble();
+ double left = pLeft->coerceToDouble();
+ return Value::createDouble(left - right);
+ }
+
+ long long right = pRight->coerceToLong();
+ long long left = pLeft->coerceToLong();
+ if (productType == NumberLong)
+ return Value::createLong(left - right);
+ return Value::createInt((int)(left - right));
+ }
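+
+ /*
+ A note on the date cases above (a sketch, not from the original
+ source): Date - Date yields a Date whose value is the difference in
+ milliseconds, while Date - number treats the number as a count of
+ days, so {$subtract: ["$ts", 1]} moves "$ts" back by
+ 24*60*60*1000 milliseconds.
+ */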
+
+ const char *ExpressionSubtract::getOpName() const {
+ return "$subtract";
+ }
+
+ /* ------------------------- ExpressionToLower ----------------------------- */
+
+ ExpressionToLower::~ExpressionToLower() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionToLower::create() {
+ intrusive_ptr<ExpressionToLower> pExpression(new ExpressionToLower());
+ return pExpression;
+ }
+
+ ExpressionToLower::ExpressionToLower():
+ ExpressionNary() {
+ }
+
+ void ExpressionToLower::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionToLower::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pString(vpOperand[0]->evaluate(pDocument));
+ string str = pString->coerceToString();
+ boost::to_lower(str);
+ return Value::createString(str);
+ }
+
+ const char *ExpressionToLower::getOpName() const {
+ return "$toLower";
+ }
+
+ /* ------------------------- ExpressionToUpper -------------------------- */
+
+ ExpressionToUpper::~ExpressionToUpper() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionToUpper::create() {
+ intrusive_ptr<ExpressionToUpper> pExpression(new ExpressionToUpper());
+ return pExpression;
+ }
+
+ ExpressionToUpper::ExpressionToUpper():
+ ExpressionNary() {
+ }
+
+ void ExpressionToUpper::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionToUpper::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pString(vpOperand[0]->evaluate(pDocument));
+ string str(pString->coerceToString());
+ boost::to_upper(str);
+ return Value::createString(str);
+ }
+
+ const char *ExpressionToUpper::getOpName() const {
+ return "$toUpper";
+ }
+
+ /* ------------------------- ExpressionWeek ----------------------------- */
+
+ ExpressionWeek::~ExpressionWeek() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionWeek::create() {
+ intrusive_ptr<ExpressionWeek> pExpression(new ExpressionWeek());
+ return pExpression;
+ }
+
+ ExpressionWeek::ExpressionWeek():
+ ExpressionNary() {
+ }
+
+ void ExpressionWeek::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionWeek::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ /* tm_wday runs 0 (Sunday) through 6; tm_yday runs 0 through 365 */
+ const int dayOfWeek = date.tm_wday + 1;
+ const int dayOfYear = date.tm_yday;
+
+ /* 1-based day of the week that January 1st fell on */
+ const int janFirst = dayOfWeek - dayOfYear % 7;
+
+ /* 0-based day of the week (0 == Sunday) of January 1st; adding this
+ aligns week boundaries to Sundays, so days before the first Sunday
+ of the year fall into week 0 */
+ const int offset = (janFirst + 6) % 7;
+ const int week = (dayOfYear + offset) / 7;
+ return Value::createInt(week);
+ }
+
+ const char *ExpressionWeek::getOpName() const {
+ return "$week";
+ }
+
+ /* ------------------------- ExpressionYear ----------------------------- */
+
+ ExpressionYear::~ExpressionYear() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionYear::create() {
+ intrusive_ptr<ExpressionYear> pExpression(new ExpressionYear());
+ return pExpression;
+ }
+
+ ExpressionYear::ExpressionYear():
+ ExpressionNary() {
+ }
+
+ void ExpressionYear::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionYear::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_year+1900); // tm_year is years since 1900
+ }
+
+ const char *ExpressionYear::getOpName() const {
+ return "$year";
+ }
+}
diff --git a/src/mongo/db/pipeline/expression.h b/src/mongo/db/pipeline/expression.h
new file mode 100755
index 00000000000..c49e385a3c7
--- /dev/null
+++ b/src/mongo/db/pipeline/expression.h
@@ -0,0 +1,1223 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include "db/pipeline/field_path.h"
+#include "util/intrusive_counter.h"
+
+
+namespace mongo {
+ class BSONArrayBuilder;
+ class BSONElement;
+ class BSONObjBuilder;
+ class Builder;
+ class Document;
+ class ExpressionContext;
+ class Value;
+
+ class Expression :
+ public IntrusiveCounterUnsigned {
+ public:
+ virtual ~Expression() {};
+
+ /*
+ Optimize the Expression.
+
+ This provides an opportunity to do constant folding, or to
+ collapse nested operators that have the same precedence, such as
+ $add, $and, or $or.
+
+ The Expression should be replaced with the return value, which may
+ or may not be the same object. In the case of constant folding,
+ a computed expression may be replaced by a constant.
+
+ @returns the optimized Expression
+ */
+ virtual intrusive_ptr<Expression> optimize() = 0;
+
+ /*
+ Evaluate the Expression using the given document as input.
+
+ @returns the computed value
+ */
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const = 0;
+
+ /*
+ Add the Expression (and any descendant Expressions) into a BSON
+ object that is under construction.
+
+ Unevaluated Expressions always materialize as objects. Evaluation
+ may produce a scalar or another object, either of which will be
+ substituted inline.
+
+ @param pBuilder the builder to add the expression to
+ @param fieldName the name the object should be given
+ */
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName,
+ unsigned depth) const = 0;
+
+ /*
+ Add the Expression (and any descendant Expressions) into a BSON
+ array that is under construction.
+
+ Unevaluated Expressions always materialize as objects. Evaluation
+ may produce a scalar or another object, either of which will be
+ substituted inline.
+
+ @param pBuilder the builder to add the expression to
+ */
+ virtual void addToBsonArray(BSONArrayBuilder *pBuilder,
+ unsigned depth) const = 0;
+
+ /*
+ Convert the expression into a BSONObj that corresponds to the
+ db.collection.find() predicate language. This is intended for
+ use by DocumentSourceFilter.
+
+ This is more limited than the full expression language supported
+ by all available expressions in a DocumentSource processing
+ pipeline, and will fail with an assertion if an attempt is made
+ to go outside the bounds of the recognized patterns, which don't
+ include full computed expressions. There are other methods available
+ on DocumentSourceFilter which can be used to analyze a filter
+ predicate and break it up into appropriate expressions which can
+ be translated within these constraints. As a result, the default
+ implementation is to fail with an assertion; only a subset of
+ operators will be able to fulfill this request.
+
+ @param pBuilder the builder to add the expression to.
+ */
+ virtual void toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ Utility class for parseObject() below.
+
+ Only one array can be unwound in a processing pipeline. If the
+ UNWIND_OK option is used, unwindOk() will return true, and a field
+ can be declared as unwound using unwind(), after which unwindUsed()
+ will return true. Only specify UNWIND_OK if it is OK to unwind an
+ array in the current context.
+
+ DOCUMENT_OK indicates that it is OK to use a Document in the current
+ context.
+ */
+ class ObjectCtx {
+ public:
+ ObjectCtx(int options);
+ static const int UNWIND_OK = 0x0001;
+ static const int DOCUMENT_OK = 0x0002;
+
+ bool unwindOk() const;
+ bool unwindUsed() const;
+ void unwind(string fieldName);
+
+ bool documentOk() const;
+
+ private:
+ int options;
+ string unwindField;
+ };
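+
+ /*
+ A usage sketch (not from the original source): a caller that permits
+ both unwinding and nested documents would construct
+
+     Expression::ObjectCtx objCtx(
+         Expression::ObjectCtx::UNWIND_OK |
+         Expression::ObjectCtx::DOCUMENT_OK);
+
+ and pass &objCtx to parseObject() below.
+ */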
+
+ /*
+ Parse a BSONElement Object. The object could represent a functional
+ expression or a Document expression.
+
+ @param pBsonElement the element representing the object
+ @param pCtx an ObjectCtx representing the options above
+ @returns the parsed Expression
+ */
+ static intrusive_ptr<Expression> parseObject(
+ BSONElement *pBsonElement, ObjectCtx *pCtx);
+
+ static const char unwindName[];
+
+ /*
+ Parse a BSONElement Object which has already been determined to be a
+ functional expression.
+
+ @param pOpName the name of the (prefix) operator
+ @param pBsonElement the BSONElement to parse
+ @returns the parsed Expression
+ */
+ static intrusive_ptr<Expression> parseExpression(
+ const char *pOpName, BSONElement *pBsonElement);
+
+
+ /*
+ Parse a BSONElement which is an operand in an Expression.
+
+ @param pBsonElement the expected operand's BSONElement
+ @returns the parsed operand, as an Expression
+ */
+ static intrusive_ptr<Expression> parseOperand(
+ BSONElement *pBsonElement);
+
+ /*
+ Produce a field path string with the field prefix removed.
+
+ Throws an error if the field prefix is not present.
+
+ @param prefixedField the prefixed field
+ @returns the field path with the prefix removed
+ */
+ static string removeFieldPrefix(const string &prefixedField);
+
+ /*
+ Enumeration of comparison operators. These are shared between a
+ few expression implementations, so they are factored out here.
+
+ Any changes to these values require adjustment of the lookup
+ table in the implementation.
+ */
+ enum CmpOp {
+ EQ = 0, // return true for a == b, false otherwise
+ NE = 1, // return true for a != b, false otherwise
+ GT = 2, // return true for a > b, false otherwise
+ GTE = 3, // return true for a >= b, false otherwise
+ LT = 4, // return true for a < b, false otherwise
+ LTE = 5, // return true for a <= b, false otherwise
+ CMP = 6, // return -1, 0, 1 for a < b, a == b, a > b
+ };
+
+ static int signum(int i);
+ };
+
+
+ class ExpressionNary :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionNary> {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<Expression> optimize();
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ Add an operand to the n-ary expression.
+
+ @param pExpression the expression to add
+ */
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Return a factory function that will make Expression nodes of
+ the same type as this. This will be used to create constant
+ expressions for constant folding for optimize(). Only return
+ a factory function if this operator is both associative and
+ commutative. The default implementation returns NULL; optimize()
+ will recognize that and stop.
+
+ Note that ExpressionNary::optimize() promises that if it uses this
+ to fold constants, then if optimize() returns an ExpressionNary,
+ any remaining constant will be the last one in vpOperand. Derived
+ classes may take advantage of this to do further optimizations in
+ their optimize().
+
+ @returns pointer to a factory function or NULL
+ */
+ virtual intrusive_ptr<ExpressionNary> (*getFactory() const)();
+
+ /*
+ Get the name of the operator.
+
+ @returns the name of the operator; this string belongs to the class
+ implementation, and should not be deleted
+ */
+ virtual const char *getOpName() const = 0;
+
+ protected:
+ ExpressionNary();
+
+ vector<intrusive_ptr<Expression> > vpOperand;
+
+ /*
+ Add the expression to the builder.
+
+ If there is only one operand (a unary operator), then the operand
+ is added directly, without an array. For more than one operand,
+ a named array is created. In both cases, the result is an object.
+
+ @param pBuilder the (blank) builder to add the expression to
+ @param pOpName the name of the operator
+ */
+ virtual void toBson(BSONObjBuilder *pBuilder,
+ const char *pOpName, unsigned depth) const;
+
+ /*
+ Checks the current size of vpOperand; if the size is equal to or
+ greater than maxArgs, fires a user assertion indicating that this
+ operator cannot have this many arguments.
+
+ The equality check is there because this is intended to be used in
+ addOperand() to check for the limit *before* adding the requested
+ argument.
+
+ @param maxArgs the maximum number of arguments the operator accepts
+ */
+ void checkArgLimit(unsigned maxArgs) const;
+
+ /*
+ Checks the current size of vpOperand; if the size is not equal to
+ reqArgs, fires a user assertion indicating that this must have
+ exactly reqArgs arguments.
+
+ This is meant to be used in evaluate(), *before* the evaluation
+ takes place.
+
+ @param reqArgs the number of arguments this operator requires
+ */
+ void checkArgCount(unsigned reqArgs) const;
+ };
+
+
+ class ExpressionAdd :
+ public ExpressionNary {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionAdd();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+
+ // virtuals from ExpressionNary
+ virtual intrusive_ptr<ExpressionNary> (*getFactory() const)();
+
+ /*
+ Create an expression that finds the sum of n operands.
+
+ @returns addition expression
+ */
+ static intrusive_ptr<ExpressionNary> create();
+
+ protected:
+ // virtuals from ExpressionNary
+ virtual void toBson(BSONObjBuilder *pBuilder,
+ const char *pOpName, unsigned depth) const;
+
+ private:
+ ExpressionAdd();
+
+ /*
+ If the operator can be optimized, we save the original here.
+
+ This is necessary because addition must follow its original operand
+ ordering strictly if a string is detected, otherwise string
+ concatenation may appear to have re-ordered the operands.
+ */
+ intrusive_ptr<ExpressionAdd> pAdd;
+ mutable bool useOriginal;
+ };
+
+
+ class ExpressionAnd :
+ public ExpressionNary {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionAnd();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const;
+
+ // virtuals from ExpressionNary
+ virtual intrusive_ptr<ExpressionNary> (*getFactory() const)();
+
+ /*
+ Create an expression that finds the conjunction of n operands.
+ The conjunction uses short-circuit logic; the expressions are
+ evaluated in the order they were added to the conjunction, and
+ the evaluation stops and returns false on the first operand that
+ evaluates to false.
+
+ @returns conjunction expression
+ */
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionAnd();
+ };
+
+
+ class ExpressionCoerceToBool :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionCoerceToBool> {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionCoerceToBool();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ static intrusive_ptr<ExpressionCoerceToBool> create(
+ const intrusive_ptr<Expression> &pExpression);
+
+ private:
+ ExpressionCoerceToBool(const intrusive_ptr<Expression> &pExpression);
+
+ intrusive_ptr<Expression> pExpression;
+ };
+
+
+ class ExpressionCompare :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionCompare();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Shorthands for creating various comparison expressions.
+ These are provided for conformance with the uniform function
+ pointer signature required for parsing.
+
+ These create a particular comparison expression, without any
+ operands. Those must be added via ExpressionNary::addOperand().
+ */
+ static intrusive_ptr<ExpressionNary> createCmp();
+ static intrusive_ptr<ExpressionNary> createEq();
+ static intrusive_ptr<ExpressionNary> createNe();
+ static intrusive_ptr<ExpressionNary> createGt();
+ static intrusive_ptr<ExpressionNary> createGte();
+ static intrusive_ptr<ExpressionNary> createLt();
+ static intrusive_ptr<ExpressionNary> createLte();
+
+ private:
+ friend class ExpressionFieldRange;
+ ExpressionCompare(CmpOp cmpOp);
+
+ CmpOp cmpOp;
+ };
+
+
+ class ExpressionCond :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionCond();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionCond();
+ };
+
+
+ class ExpressionConstant :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionConstant> {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionConstant();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ static intrusive_ptr<ExpressionConstant> createFromBsonElement(
+ BSONElement *pBsonElement);
+ static intrusive_ptr<ExpressionConstant> create(
+ const intrusive_ptr<const Value> &pValue);
+
+ /*
+ Get the constant value represented by this Expression.
+
+ @returns the value
+ */
+ intrusive_ptr<const Value> getValue() const;
+
+ private:
+ ExpressionConstant(BSONElement *pBsonElement);
+ ExpressionConstant(const intrusive_ptr<const Value> &pValue);
+
+ intrusive_ptr<const Value> pValue;
+ };
+
+
+ class ExpressionDayOfMonth :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionDayOfMonth();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionDayOfMonth();
+ };
+
+
+ class ExpressionDayOfWeek :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionDayOfWeek();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionDayOfWeek();
+ };
+
+
+ class ExpressionDayOfYear :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionDayOfYear();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionDayOfYear();
+ };
+
+
+ class ExpressionDivide :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionDivide();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionDivide();
+ };
+
+
+ class ExpressionFieldPath :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionFieldPath> {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionFieldPath();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ Create a field path expression.
+
+ Evaluation will extract the value associated with the given field
+ path from the source document.
+
+ @param fieldPath the field path string, without any leading document
+ indicator
+ @returns the newly created field path expression
+ */
+ static intrusive_ptr<ExpressionFieldPath> create(
+ const string &fieldPath);
+
+ /*
+ Return a string representation of the field path.
+
+ @param fieldPrefix whether or not to include the document field
+ indicator prefix
+ @returns the dot-delimited field path
+ */
+ string getFieldPath(bool fieldPrefix) const;
+
+ /*
+ Write a string representation of the field path to a stream.
+
+ @param outStream the stream to write to
+ @param fieldPrefix whether or not to include the document field
+ indicator prefix
+ */
+ void writeFieldPath(ostream &outStream, bool fieldPrefix) const;
+
+ private:
+ ExpressionFieldPath(const string &fieldPath);
+
+ /*
+ Internal implementation of evaluate(), used recursively.
+
+ The internal implementation doesn't just use a loop because of
+ the possibility that we need to skip over an array. If the path
+ is "a.b.c", and a is an array, then we fan out from there, and
+ traverse "b.c" for each element of a:[...]. This requires that
+ a be an array of objects in order to navigate more deeply.
+
+ @param index current path field index to extract
+ @param pathLength maximum number of fields on field path
+ @param pDocument current document traversed to (not the top-level one)
+ @returns the field found; could be an array
+ */
+ intrusive_ptr<const Value> evaluatePath(
+ size_t index, const size_t pathLength,
+ intrusive_ptr<Document> pDocument) const;
+
+ FieldPath fieldPath;
+ };
+
+
+ class ExpressionFieldRange :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionFieldRange> {
+ public:
+ // virtuals from expression
+ virtual ~ExpressionFieldRange();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+ virtual void toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ Create a field range expression.
+
+ Field ranges are meant to match up with classic Matcher semantics,
+ and therefore are conjunctions. For example, these appear in
+ mongo shell predicates in one of these forms:
+ { a : C } -> (a == C) // degenerate "point" range
+ { a : { $lt : C } } -> (a < C) // open range
+ { a : { $gt : C1, $lte : C2 } } -> ((a > C1) && (a <= C2)) // closed
+
+ When initially created, a field range only includes one end of
+ the range. Additional points may be added via intersect().
+
+ Note that NE and CMP are not supported.
+
+ @param pFieldPath the field path for extracting the field value
+ @param cmpOp the comparison operator
+ @param pValue the value to compare against
+ @returns the newly created field range expression
+ */
+ static intrusive_ptr<ExpressionFieldRange> create(
+ const intrusive_ptr<ExpressionFieldPath> &pFieldPath,
+ CmpOp cmpOp, const intrusive_ptr<const Value> &pValue);
+
+ /*
+ Add an intersecting range.
+
+ This can be done any number of times after creation. The
+ range is internally optimized for each new addition. If the new
+ intersection narrows the set of values within the range, the
+ internal representation is adjusted to reflect that.
+
+ Note that NE and CMP are not supported.
+
+ @param cmpOp the comparison operator
+ @param pValue the value to compare against
+ */
+ void intersect(CmpOp cmpOp, const intrusive_ptr<const Value> &pValue);
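+
+ /*
+ A construction sketch (not from the original source) for the closed
+ range ((a > C1) && (a <= C2)) shown above, where pFieldPathA, pC1,
+ and pC2 stand in for the parsed field path and constant values:
+
+     intrusive_ptr<ExpressionFieldRange> pFieldRange(
+         ExpressionFieldRange::create(pFieldPathA, GT, pC1));
+     pFieldRange->intersect(LTE, pC2);
+ */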
+
+ private:
+ ExpressionFieldRange(const intrusive_ptr<ExpressionFieldPath> &pFieldPath,
+ CmpOp cmpOp,
+ const intrusive_ptr<const Value> &pValue);
+
+ intrusive_ptr<ExpressionFieldPath> pFieldPath;
+
+ class Range {
+ public:
+ Range(CmpOp cmpOp, const intrusive_ptr<const Value> &pValue);
+ Range(const Range &rRange);
+
+ Range *intersect(const Range *pRange) const;
+ bool contains(const intrusive_ptr<const Value> &pValue) const;
+
+ Range(const intrusive_ptr<const Value> &pBottom, bool bottomOpen,
+ const intrusive_ptr<const Value> &pTop, bool topOpen);
+
+ bool bottomOpen;
+ bool topOpen;
+ intrusive_ptr<const Value> pBottom;
+ intrusive_ptr<const Value> pTop;
+ };
+
+ scoped_ptr<Range> pRange;
+
+ /*
+ Add to a generic Builder.
+
+ The methods to append items to an object and an array differ by
+ their inclusion of a field name. For more complicated objects,
+ it makes sense to abstract that out and use a generic builder that
+ always looks the same, and then implement addToBsonObj() and
+ addToBsonArray() by using the common method.
+ */
+ void addToBson(Builder *pBuilder, unsigned depth) const;
+ };
+
+
+ class ExpressionHour :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionHour();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionHour();
+ };
+
+
+ class ExpressionIfNull :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionIfNull();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionIfNull();
+ };
+
+
+ class ExpressionMinute :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionMinute();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionMinute();
+ };
+
+
+ class ExpressionMod :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionMod();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionMod();
+ };
+
+
+ class ExpressionMultiply :
+ public ExpressionNary {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionMultiply();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+
+ // virtuals from ExpressionNary
+ virtual intrusive_ptr<ExpressionNary> (*getFactory() const)();
+
+ /*
+ Create an expression that finds the product of n operands.
+
+ @returns multiplication expression
+ */
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionMultiply();
+ };
+
+
+ class ExpressionMonth :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionMonth();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionMonth();
+ };
+
+
+ class ExpressionNoOp :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionNoOp();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionNoOp();
+ };
+
+
+ class ExpressionNot :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionNot();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionNot();
+ };
+
+
+ class ExpressionObject :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionObject> {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionObject();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ evaluate(), but return a Document instead of a Value-wrapped
+ Document.
+
+ @param pDocument the input Document
+ @returns the result document
+ */
+ intrusive_ptr<Document> evaluateDocument(
+ const intrusive_ptr<Document> &pDocument) const;
+
+ /*
+ evaluate(), but add the evaluated fields to a given document
+ instead of creating a new one.
+
+ @param pResult the Document to add the evaluated expressions to
+ @param pDocument the input Document
+ */
+ void addToDocument(const intrusive_ptr<Document> &pResult,
+ const intrusive_ptr<Document> &pDocument) const;
+
+ /*
+ Estimate the number of fields that will result from evaluating
+ this over pDocument. Does not include _id. This is an estimate
+ (really an upper bound) because we can't account for undefined
+ fields without actually doing the evaluation. But this is still
+ useful as an argument to Document::create(), if you plan to use
+ addToDocument().
+
+ @param pDocument the input document
+ @returns estimated number of fields that will result
+ */
+ size_t getSizeHint(const intrusive_ptr<Document> &pDocument) const;
+
+ /*
+ Create an empty expression. Until fields are added, this
+ will evaluate to an empty document (object).
+ */
+ static intrusive_ptr<ExpressionObject> create();
+
+ /*
+ Add a field to the document expression.
+
+ @param fieldPath the path the evaluated expression will have in the
+ result Document
+ @param pExpression the expression to evaluate to obtain this field's
+ Value in the result Document
+ */
+ void addField(const string &fieldPath,
+ const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Add a field path to the set of those to be included.
+
+ Note that including a nested field implies including everything on
+ the path leading down to it.
+
+ @param fieldPath the path of the field to be included
+ */
+ void includePath(const string &fieldPath);
+
+ /*
+ Add a field path to the set of those to be excluded.
+
+ Note that excluding a nested field implies including everything on
+ the path leading down to it (because you're stating you want to see
+ all the other fields that aren't being excluded).
+
+ @param fieldPath the path of the field to be excluded
+ */
+ void excludePath(const string &fieldPath);
+
+ /*
+ Return the expression for a field.
+
+ @param fieldName the field name for the expression to return
+ @returns the expression used to compute the field, if it is present,
+ otherwise NULL.
+ */
+ intrusive_ptr<Expression> getField(const string &fieldName) const;
+
+ /*
+ Get a count of the added fields.
+
+ @returns how many fields have been added
+ */
+ size_t getFieldCount() const;
+
+ /*
+ Get a count of the exclusions.
+
+ @returns how many fields have been excluded.
+ */
+ size_t getExclusionCount() const;
+
+ /*
+ Specialized BSON conversion that allows for writing out a
+ $project specification. This creates a standalone object, which must
+ be added to a containing object with a name.
+
+ @param pBuilder where to write the object to
+ */
+ void documentToBson(BSONObjBuilder *pBuilder, unsigned depth) const;
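+
+ /*
+ A construction sketch (not from the original source) for a
+ $project-style specification {a: 1, total: "$b"}:
+
+     intrusive_ptr<ExpressionObject> pObj(ExpressionObject::create());
+     pObj->includePath("a");
+     pObj->addField("total", ExpressionFieldPath::create("b"));
+
+ evaluateDocument() then produces a document containing the included
+ "a" field and the computed "total" field.
+ */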
+
+ private:
+ ExpressionObject();
+
+ void includePath(
+ const FieldPath *pPath, size_t pathi, size_t pathn,
+ bool excludeLast);
+
+ bool excludePaths;
+ set<string> path;
+
+ /* these two vectors are maintained in parallel */
+ vector<string> vFieldName;
+ vector<intrusive_ptr<Expression> > vpExpression;
+
+ /*
+ Utility function used by documentToBson(). Emits inclusion
+ and exclusion paths by recursively walking down the nested
+ ExpressionObject trees these have created.
+
+ @param pBuilder the builder to write boolean valued path "fields" to
+ @param pvPath pointer to a vector of strings describing the path on
+ descent; the top-level call should pass an empty vector
+ */
+ void emitPaths(BSONObjBuilder *pBuilder, vector<string> *pvPath) const;
+
+ /* utility class used by emitPaths() */
+ class PathPusher :
+ boost::noncopyable {
+ public:
+ PathPusher(vector<string> *pvPath, const string &s);
+ ~PathPusher();
+
+ private:
+ vector<string> *pvPath;
+ };
+ };
+
+
+ class ExpressionOr :
+ public ExpressionNary {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionOr();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const;
+
+ // virtuals from ExpressionNary
+ virtual intrusive_ptr<ExpressionNary> (*getFactory() const)();
+
+ /*
+ Create an expression that finds the disjunction of n operands.
+ The disjunction uses short-circuit logic; the expressions are
+ evaluated in the order they were added to the disjunction, and
+ the evaluation stops and returns true on the first operand that
+ evaluates to true.
+
+ @returns disjunction expression
+ */
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionOr();
+ };
+
+
+ class ExpressionSecond :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionSecond();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionSecond();
+ };
+
+
+ class ExpressionStrcasecmp :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionStrcasecmp();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionStrcasecmp();
+ };
+
+
+ class ExpressionSubstr :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionSubstr();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionSubstr();
+ };
+
+
+ class ExpressionSubtract :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionSubtract();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionSubtract();
+ };
+
+
+ class ExpressionToLower :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionToLower();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionToLower();
+ };
+
+
+ class ExpressionToUpper :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionToUpper();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionToUpper();
+ };
+
+
+ class ExpressionWeek :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionWeek();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionWeek();
+ };
+
+
+ class ExpressionYear :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionYear();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionYear();
+ };
+}
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline bool Expression::ObjectCtx::unwindOk() const {
+ return ((options & UNWIND_OK) != 0);
+ }
+
+ inline bool Expression::ObjectCtx::unwindUsed() const {
+ return (unwindField.size() != 0);
+ }
+
+ inline int Expression::signum(int i) {
+ if (i < 0)
+ return -1;
+ if (i > 0)
+ return 1;
+ return 0;
+ }
+
+ inline intrusive_ptr<const Value> ExpressionConstant::getValue() const {
+ return pValue;
+ }
+
+ inline string ExpressionFieldPath::getFieldPath(bool fieldPrefix) const {
+ return fieldPath.getPath(fieldPrefix);
+ }
+
+ inline void ExpressionFieldPath::writeFieldPath(
+ ostream &outStream, bool fieldPrefix) const {
+ return fieldPath.writePath(outStream, fieldPrefix);
+ }
+
+ inline size_t ExpressionObject::getFieldCount() const {
+ return vFieldName.size();
+ }
+
+ inline ExpressionObject::PathPusher::PathPusher(
+ vector<string> *pTheVPath, const string &s):
+ pvPath(pTheVPath) {
+ pvPath->push_back(s);
+ }
+
+ inline ExpressionObject::PathPusher::~PathPusher() {
+ pvPath->pop_back();
+ }
+
+}
diff --git a/src/mongo/db/pipeline/expression_context.cpp b/src/mongo/db/pipeline/expression_context.cpp
new file mode 100755
index 00000000000..4835dcfa5a9
--- /dev/null
+++ b/src/mongo/db/pipeline/expression_context.cpp
@@ -0,0 +1,35 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/pipeline/expression_context.h"
+
+namespace mongo {
+
+ ExpressionContext::~ExpressionContext() {
+ }
+
+ ExpressionContext::ExpressionContext():
+ inShard(false),
+ inRouter(false) {
+ }
+
+ ExpressionContext *ExpressionContext::create() {
+ return new ExpressionContext();
+ }
+
+}
diff --git a/src/mongo/db/pipeline/expression_context.h b/src/mongo/db/pipeline/expression_context.h
new file mode 100755
index 00000000000..0277039c80b
--- /dev/null
+++ b/src/mongo/db/pipeline/expression_context.h
@@ -0,0 +1,67 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include "util/intrusive_counter.h"
+
+namespace mongo {
+
+ class ExpressionContext :
+ public IntrusiveCounterUnsigned {
+ public:
+ virtual ~ExpressionContext();
+
+ void setInShard(bool b);
+ void setInRouter(bool b);
+
+ bool getInShard() const;
+ bool getInRouter() const;
+
+ static ExpressionContext *create();
+
+ private:
+ ExpressionContext();
+
+ bool inShard;
+ bool inRouter;
+ };
+}
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline void ExpressionContext::setInShard(bool b) {
+ inShard = b;
+ }
+
+ inline void ExpressionContext::setInRouter(bool b) {
+ inRouter = b;
+ }
+
+ inline bool ExpressionContext::getInShard() const {
+ return inShard;
+ }
+
+ inline bool ExpressionContext::getInRouter() const {
+ return inRouter;
+ }
+
+}
diff --git a/src/mongo/db/pipeline/field_path.cpp b/src/mongo/db/pipeline/field_path.cpp
new file mode 100755
index 00000000000..96e1fc92f83
--- /dev/null
+++ b/src/mongo/db/pipeline/field_path.cpp
@@ -0,0 +1,87 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/field_path.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+ using namespace mongoutils;
+
+ FieldPath::~FieldPath() {
+ }
+
+ FieldPath::FieldPath():
+ vFieldName() {
+ }
+
+ FieldPath::FieldPath(const string &fieldPath):
+ vFieldName() {
+ /*
+ The field path could be using dot notation.
+ Break the field path up by peeling off successive pieces.
+ */
+ size_t startpos = 0;
+ while(true) {
+ /* find the next dot */
+ const size_t dotpos = fieldPath.find('.', startpos);
+
+ /* use the string up to the dot, or the remainder if there is none */
+ const bool lastPiece = (dotpos == fieldPath.npos);
+ const size_t length =
+ (lastPiece ? fieldPath.length() : dotpos) - startpos;
+ uassert(15998, str::stream() <<
+ "field names cannot be zero length (in path \"" <<
+ fieldPath << "\")",
+ length > 0);
+
+ vFieldName.push_back(fieldPath.substr(startpos, length));
+ if (lastPiece)
+ break;
+
+ /* next time, search starting one spot after that */
+ startpos = dotpos + 1;
+ }
+ }
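+
+ /*
+ For example (a sketch, not from the original source):
+ FieldPath("a.b.c") splits into the elements "a", "b", and "c";
+ getPath(true) reassembles it as "$a.b.c", and getPath(false) as
+ "a.b.c".
+ */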
+
+ string FieldPath::getPath(bool fieldPrefix) const {
+ stringstream ss;
+ writePath(ss, fieldPrefix);
+ return ss.str();
+ }
+
+ void FieldPath::writePath(ostream &outStream, bool fieldPrefix) const {
+ if (fieldPrefix)
+ outStream << "$";
+
+ outStream << vFieldName[0];
+
+ const size_t n = vFieldName.size();
+ for(size_t i = 1; i < n; ++i)
+ outStream << "." << vFieldName[i];
+ }
+
+ FieldPath &FieldPath::operator=(const FieldPath &rRHS) {
+ if (this != &rRHS) {
+ vFieldName = rRHS.vFieldName;
+ }
+
+ return *this;
+ }
+
+}
diff --git a/src/mongo/db/pipeline/field_path.h b/src/mongo/db/pipeline/field_path.h
new file mode 100755
index 00000000000..810c5d0c7ea
--- /dev/null
+++ b/src/mongo/db/pipeline/field_path.h
@@ -0,0 +1,82 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+namespace mongo {
+
+ class FieldPath {
+ public:
+ virtual ~FieldPath();
+
+ FieldPath(const string &fieldPath);
+ FieldPath();
+
+ /*
+ Get the number of path elements in the field path.
+
+ @returns the number of path elements
+ */
+ size_t getPathLength() const;
+
+ /*
+ Get a particular path element from the path.
+
+ @param i the index of the path element
+ @returns the path element
+ */
+ string getFieldName(size_t i) const;
+
+ /*
+ Get the full path.
+
+ @param fieldPrefix whether or not to include the field prefix
+ @returns the complete field path
+ */
+ string getPath(bool fieldPrefix) const;
+
+ /*
+ Write the full path.
+
+ @param outStream where to write the path to
+ @param fieldPrefix whether or not to include the field prefix
+ */
+ void writePath(ostream &outStream, bool fieldPrefix) const;
+
+ FieldPath &operator=(const FieldPath &rRHS);
+
+ private:
+ vector<string> vFieldName;
+ };
+}
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline size_t FieldPath::getPathLength() const {
+ return vFieldName.size();
+ }
+
+ inline string FieldPath::getFieldName(size_t i) const {
+ return vFieldName[i];
+ }
+
+}
+
diff --git a/src/mongo/db/pipeline/value.cpp b/src/mongo/db/pipeline/value.cpp
new file mode 100755
index 00000000000..b83dec359cf
--- /dev/null
+++ b/src/mongo/db/pipeline/value.cpp
@@ -0,0 +1,1034 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/value.h"
+
+#include <boost/functional/hash.hpp>
+#include "db/jsobj.h"
+#include "db/pipeline/builder.h"
+#include "db/pipeline/document.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+ using namespace mongoutils;
+
+ const intrusive_ptr<const Value> Value::pFieldUndefined(
+ new ValueStatic(Undefined));
+ const intrusive_ptr<const Value> Value::pFieldNull(new ValueStatic());
+ const intrusive_ptr<const Value> Value::pFieldTrue(new ValueStatic(true));
+ const intrusive_ptr<const Value> Value::pFieldFalse(new ValueStatic(false));
+ const intrusive_ptr<const Value> Value::pFieldMinusOne(new ValueStatic(-1));
+ const intrusive_ptr<const Value> Value::pFieldZero(new ValueStatic(0));
+ const intrusive_ptr<const Value> Value::pFieldOne(new ValueStatic(1));
+
+ Value::~Value() {
+ }
+
+ Value::Value():
+ type(jstNULL),
+ oidValue(),
+ dateValue(),
+ stringValue(),
+ pDocumentValue(),
+ vpValue() {
+ }
+
+ Value::Value(BSONType theType):
+ type(theType),
+ oidValue(),
+ dateValue(),
+ stringValue(),
+ pDocumentValue(),
+ vpValue() {
+ switch(type) {
+ case Undefined:
+ case jstNULL:
+ case Object: // empty
+ case Array: // empty
+ break;
+
+ case NumberDouble:
+ simple.doubleValue = 0;
+ break;
+
+ case Bool:
+ simple.boolValue = false;
+ break;
+
+ case NumberInt:
+ simple.intValue = 0;
+ break;
+
+ case Timestamp:
+ simple.timestampValue = 0;
+ break;
+
+ case NumberLong:
+ simple.longValue = 0;
+ break;
+
+ default:
+ // nothing else is allowed
+ uassert(16001, str::stream() <<
+ "can't create empty Value of type " << type, false);
+ break;
+ }
+ }
+
+ Value::Value(bool boolValue):
+ type(Bool),
+ pDocumentValue(),
+ vpValue() {
+ simple.boolValue = boolValue;
+ }
+
+ intrusive_ptr<const Value> Value::createFromBsonElement(
+ BSONElement *pBsonElement) {
+ intrusive_ptr<const Value> pValue(new Value(pBsonElement));
+ return pValue;
+ }
+
+ Value::Value(BSONElement *pBsonElement):
+ type(pBsonElement->type()),
+ pDocumentValue(),
+ vpValue() {
+ switch(type) {
+ case NumberDouble:
+ simple.doubleValue = pBsonElement->Double();
+ break;
+
+ case String:
+ stringValue = pBsonElement->String();
+ break;
+
+ case Object: {
+ BSONObj document(pBsonElement->embeddedObject());
+ pDocumentValue = Document::createFromBsonObj(&document);
+ break;
+ }
+
+ case Array: {
+ vector<BSONElement> vElement(pBsonElement->Array());
+ const size_t n = vElement.size();
+
+ vpValue.reserve(n); // save on realloc()ing
+
+ for(size_t i = 0; i < n; ++i) {
+ vpValue.push_back(
+ Value::createFromBsonElement(&vElement[i]));
+ }
+ break;
+ }
+
+ case jstOID:
+ oidValue = pBsonElement->OID();
+ break;
+
+ case Bool:
+ simple.boolValue = pBsonElement->Bool();
+ break;
+
+ case Date:
+ dateValue = pBsonElement->Date();
+ break;
+
+ case RegEx:
+ stringValue = pBsonElement->regex();
+ // TODO pBsonElement->regexFlags();
+ break;
+
+ case NumberInt:
+ simple.intValue = pBsonElement->numberInt();
+ break;
+
+ case Timestamp:
+ dateValue = pBsonElement->timestampTime();
+ break;
+
+ case NumberLong:
+ simple.longValue = pBsonElement->numberLong();
+ break;
+
+ case jstNULL:
+ break;
+
+ case BinData:
+ case Symbol:
+ case CodeWScope:
+ uassert(16002, str::stream() <<
+ "can't create Value of type " << type, false);
+ break;
+
+ /* these shouldn't happen in this context */
+ case MinKey:
+ case EOO:
+ case Undefined:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ break;
+ }
+ }
+
+ Value::Value(int intValue):
+ type(NumberInt),
+ pDocumentValue(),
+ vpValue() {
+ simple.intValue = intValue;
+ }
+
+ intrusive_ptr<const Value> Value::createInt(int value) {
+ intrusive_ptr<const Value> pValue(new Value(value));
+ return pValue;
+ }
+
+ Value::Value(long long longValue):
+ type(NumberLong),
+ pDocumentValue(),
+ vpValue() {
+ simple.longValue = longValue;
+ }
+
+ intrusive_ptr<const Value> Value::createLong(long long value) {
+ intrusive_ptr<const Value> pValue(new Value(value));
+ return pValue;
+ }
+
+ Value::Value(double value):
+ type(NumberDouble),
+ pDocumentValue(),
+ vpValue() {
+ simple.doubleValue = value;
+ }
+
+ intrusive_ptr<const Value> Value::createDouble(double value) {
+ intrusive_ptr<const Value> pValue(new Value(value));
+ return pValue;
+ }
+
+ Value::Value(const Date_t &value):
+ type(Date),
+ pDocumentValue(),
+ vpValue() {
+ dateValue = value;
+ }
+
+ intrusive_ptr<const Value> Value::createDate(const Date_t &value) {
+ intrusive_ptr<const Value> pValue(new Value(value));
+ return pValue;
+ }
+
+ Value::Value(const string &value):
+ type(String),
+ pDocumentValue(),
+ vpValue() {
+ stringValue = value;
+ }
+
+ intrusive_ptr<const Value> Value::createString(const string &value) {
+ intrusive_ptr<const Value> pValue(new Value(value));
+ return pValue;
+ }
+
+ Value::Value(const intrusive_ptr<Document> &pDocument):
+ type(Object),
+ pDocumentValue(pDocument),
+ vpValue() {
+ }
+
+ intrusive_ptr<const Value> Value::createDocument(
+ const intrusive_ptr<Document> &pDocument) {
+ intrusive_ptr<const Value> pValue(new Value(pDocument));
+ return pValue;
+ }
+
+ Value::Value(const vector<intrusive_ptr<const Value> > &thevpValue):
+ type(Array),
+ pDocumentValue(),
+ vpValue(thevpValue) {
+ }
+
+ intrusive_ptr<const Value> Value::createArray(
+ const vector<intrusive_ptr<const Value> > &vpValue) {
+ intrusive_ptr<const Value> pValue(new Value(vpValue));
+ return pValue;
+ }
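+
+ /*
+ A construction sketch (not from the original source): building the
+ array value [1, "two"] from the factories above:
+
+     vector<intrusive_ptr<const Value> > vpElems;
+     vpElems.push_back(Value::createInt(1));
+     vpElems.push_back(Value::createString("two"));
+     intrusive_ptr<const Value> pArray(Value::createArray(vpElems));
+ */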
+
+ double Value::getDouble() const {
+ BSONType type = getType();
+ if (type == NumberInt)
+ return simple.intValue;
+ if (type == NumberLong)
+ return static_cast< double >( simple.longValue );
+
+ assert(type == NumberDouble);
+ return simple.doubleValue;
+ }
+
+ string Value::getString() const {
+ assert(getType() == String);
+ return stringValue;
+ }
+
+ intrusive_ptr<Document> Value::getDocument() const {
+ assert(getType() == Object);
+ return pDocumentValue;
+ }
+
+ ValueIterator::~ValueIterator() {
+ }
+
+ Value::vi::~vi() {
+ }
+
+ bool Value::vi::more() const {
+ return (nextIndex < size);
+ }
+
+ intrusive_ptr<const Value> Value::vi::next() {
+ assert(more());
+ return (*pvpValue)[nextIndex++];
+ }
+
+    Value::vi::vi(const intrusive_ptr<const Value> &pValue,
+                  const vector<intrusive_ptr<const Value> > *thepvpValue):
+        size(thepvpValue->size()),
+        nextIndex(0),
+        pvpValue(thepvpValue),
+        pSource(pValue) {
+    }
+
+ intrusive_ptr<ValueIterator> Value::getArray() const {
+ assert(getType() == Array);
+ intrusive_ptr<ValueIterator> pVI(
+ new vi(intrusive_ptr<const Value>(this), &vpValue));
+ return pVI;
+ }
+
+ OID Value::getOid() const {
+ assert(getType() == jstOID);
+ return oidValue;
+ }
+
+ bool Value::getBool() const {
+ assert(getType() == Bool);
+ return simple.boolValue;
+ }
+
+ Date_t Value::getDate() const {
+ assert(getType() == Date);
+ return dateValue;
+ }
+
+ string Value::getRegex() const {
+ assert(getType() == RegEx);
+ return stringValue;
+ }
+
+ string Value::getSymbol() const {
+ assert(getType() == Symbol);
+ return stringValue;
+ }
+
+ int Value::getInt() const {
+ assert(getType() == NumberInt);
+ return simple.intValue;
+ }
+
+ unsigned long long Value::getTimestamp() const {
+ assert(getType() == Timestamp);
+ return dateValue;
+ }
+
+ long long Value::getLong() const {
+ BSONType type = getType();
+ if (type == NumberInt)
+ return simple.intValue;
+
+ assert(type == NumberLong);
+ return simple.longValue;
+ }
+
+ void Value::addToBson(Builder *pBuilder) const {
+ switch(getType()) {
+ case NumberDouble:
+ pBuilder->append(getDouble());
+ break;
+
+ case String:
+ pBuilder->append(getString());
+ break;
+
+ case Object: {
+ intrusive_ptr<Document> pDocument(getDocument());
+ BSONObjBuilder subBuilder;
+ pDocument->toBson(&subBuilder);
+ subBuilder.done();
+ pBuilder->append(&subBuilder);
+ break;
+ }
+
+ case Array: {
+ const size_t n = vpValue.size();
+ BSONArrayBuilder arrayBuilder(n);
+ for(size_t i = 0; i < n; ++i) {
+ vpValue[i]->addToBsonArray(&arrayBuilder);
+ }
+
+ pBuilder->append(&arrayBuilder);
+ break;
+ }
+
+ case BinData:
+ // pBuilder->appendBinData(fieldName, ...);
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case jstOID:
+ pBuilder->append(getOid());
+ break;
+
+ case Bool:
+ pBuilder->append(getBool());
+ break;
+
+ case Date:
+ pBuilder->append(getDate());
+ break;
+
+ case RegEx:
+ pBuilder->append(getRegex());
+ break;
+
+ case Symbol:
+ pBuilder->append(getSymbol());
+ break;
+
+ case CodeWScope:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case NumberInt:
+ pBuilder->append(getInt());
+ break;
+
+ case Timestamp:
+ pBuilder->append((long long)getTimestamp());
+ break;
+
+ case NumberLong:
+ pBuilder->append(getLong());
+ break;
+
+ case jstNULL:
+ pBuilder->append();
+ break;
+
+ /* these shouldn't appear in this context */
+ case MinKey:
+ case EOO:
+ case Undefined:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ break;
+ }
+ }
+
+ void Value::addToBsonObj(BSONObjBuilder *pBuilder, string fieldName) const {
+ BuilderObj objBuilder(pBuilder, fieldName);
+ addToBson(&objBuilder);
+ }
+
+ void Value::addToBsonArray(BSONArrayBuilder *pBuilder) const {
+ BuilderArray arrBuilder(pBuilder);
+ addToBson(&arrBuilder);
+ }
+
+ bool Value::coerceToBool() const {
+ BSONType type = getType();
+ switch(type) {
+ case NumberDouble:
+ if (simple.doubleValue != 0)
+ return true;
+ break;
+
+ case String:
+ case Object:
+ case Array:
+ case BinData:
+ case jstOID:
+ case Date:
+ case RegEx:
+ case Symbol:
+ case Timestamp:
+ return true;
+
+ case Bool:
+ if (simple.boolValue)
+ return true;
+ break;
+
+ case CodeWScope:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case NumberInt:
+ if (simple.intValue != 0)
+ return true;
+ break;
+
+ case NumberLong:
+ if (simple.longValue != 0)
+ return true;
+ break;
+
+ case jstNULL:
+ case Undefined:
+ /* nothing to do */
+ break;
+
+ /* these shouldn't happen in this context */
+ case MinKey:
+ case EOO:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ break;
+ }
+
+ return false;
+ }
+
+ intrusive_ptr<const Value> Value::coerceToBoolean() const {
+ bool result = coerceToBool();
+
+ /* always normalize to the singletons */
+ if (result)
+ return Value::getTrue();
+ return Value::getFalse();
+ }
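+
+    /*
+      Illustrative examples of the JSON-style truthiness that
+      coerceToBool() implements (a sketch, not executed here):
+
+        Value::createInt(0)->coerceToBool();       // false
+        Value::createDouble(0.5)->coerceToBool();  // true
+        Value::createString("")->coerceToBool();   // true - any string is true
+        Value::getNull()->coerceToBool();          // false
+    */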
+
+ int Value::coerceToInt() const {
+ switch(type) {
+ case NumberDouble:
+ return (int)simple.doubleValue;
+
+ case NumberInt:
+ return simple.intValue;
+
+ case NumberLong:
+ return (int)simple.longValue;
+
+ case jstNULL:
+ case Undefined:
+ break;
+
+ case String:
+ default:
+ uassert(16003, str::stream() <<
+ "can't convert from BSON type " << type <<
+ " to int",
+ false);
+ } // switch(type)
+
+ return (int)0;
+ }
+
+ long long Value::coerceToLong() const {
+ switch(type) {
+ case NumberDouble:
+ return (long long)simple.doubleValue;
+
+ case NumberInt:
+ return simple.intValue;
+
+ case NumberLong:
+ return simple.longValue;
+
+ case jstNULL:
+ case Undefined:
+ break;
+
+ case String:
+ default:
+ uassert(16004, str::stream() <<
+ "can't convert from BSON type " << type <<
+ " to long",
+ false);
+ } // switch(type)
+
+ return (long long)0;
+ }
+
+ double Value::coerceToDouble() const {
+ switch(type) {
+ case NumberDouble:
+ return simple.doubleValue;
+
+ case NumberInt:
+ return (double)simple.intValue;
+
+ case NumberLong:
+ return (double)simple.longValue;
+
+ case jstNULL:
+ case Undefined:
+ break;
+
+ case String:
+ default:
+ uassert(16005, str::stream() <<
+ "can't convert from BSON type " << type <<
+ " to double",
+ false);
+ } // switch(type)
+
+ return (double)0;
+ }
+
+ Date_t Value::coerceToDate() const {
+ switch(type) {
+
+ case Date:
+ return dateValue;
+
+ case jstNULL:
+ case Undefined:
+ break;
+
+ default:
+ uassert(16006, str::stream() <<
+ "can't convert from BSON type " << type <<
+                    " to date",
+ false);
+ } // switch(type)
+
+ assert(false); // CW TODO no conversion available
+        return Date_t();
+ }
+
+ string Value::coerceToString() const {
+ stringstream ss;
+ switch(type) {
+ case NumberDouble:
+ ss << simple.doubleValue;
+ return ss.str();
+
+ case NumberInt:
+ ss << simple.intValue;
+ return ss.str();
+
+ case NumberLong:
+ ss << simple.longValue;
+ return ss.str();
+
+ case String:
+ return stringValue;
+
+ case Date:
+ return dateValue.toString();
+
+ case jstNULL:
+ case Undefined:
+ break;
+
+ default:
+ uassert(16007, str::stream() <<
+ "can't convert from BSON type " << type <<
+                    " to string",
+ false);
+ } // switch(type)
+
+ return "";
+ }
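+
+    /*
+      A sketch of the coercions above; the results in the comments are
+      what the switch statements produce:
+
+        Value::createDouble(2.9)->coerceToInt();   // 2 (truncation)
+        Value::createLong(5)->coerceToDouble();    // 5.0
+        Value::createInt(42)->coerceToString();    // "42"
+        Value::getNull()->coerceToInt();           // 0
+        Value::createString("x")->coerceToInt();   // uasserts (16003)
+    */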
+
+ int Value::compare(const intrusive_ptr<const Value> &rL,
+ const intrusive_ptr<const Value> &rR) {
+ BSONType lType = rL->getType();
+ BSONType rType = rR->getType();
+
+ /*
+ Special handling for Undefined and NULL values; these are types,
+ so it's easier to handle them here before we go below to handle
+ values of the same types. This allows us to compare Undefined and
+ NULL values with everything else. As coded now:
+ (*) Undefined is less than everything except itself (which is equal)
+ (*) NULL is less than everything except Undefined and itself
+ */
+ if (lType == Undefined) {
+ if (rType == Undefined)
+ return 0;
+
+ /* if rType is anything else, the left value is less */
+ return -1;
+ }
+
+ if (lType == jstNULL) {
+ if (rType == Undefined)
+ return 1;
+ if (rType == jstNULL)
+ return 0;
+
+ return -1;
+ }
+
+ if ((rType == Undefined) || (rType == jstNULL)) {
+ /*
+ We know the left value isn't Undefined, because of the above.
+ Count a NULL value as greater than an undefined one.
+ */
+ return 1;
+ }
+
+ // CW TODO for now, only compare like values
+ uassert(16016, str::stream() <<
+ "can't compare values of BSON types " << lType <<
+ " and " << rType,
+ lType == rType);
+
+ switch(lType) {
+ case NumberDouble:
+ if (rL->simple.doubleValue < rR->simple.doubleValue)
+ return -1;
+ if (rL->simple.doubleValue > rR->simple.doubleValue)
+ return 1;
+ return 0;
+
+ case String:
+ return rL->stringValue.compare(rR->stringValue);
+
+ case Object:
+ return Document::compare(rL->getDocument(), rR->getDocument());
+
+ case Array: {
+ intrusive_ptr<ValueIterator> pli(rL->getArray());
+ intrusive_ptr<ValueIterator> pri(rR->getArray());
+
+ while(true) {
+ /* have we run out of left array? */
+ if (!pli->more()) {
+ if (!pri->more())
+ return 0; // the arrays are the same length
+
+ return -1; // the left array is shorter
+ }
+
+ /* have we run out of right array? */
+ if (!pri->more())
+ return 1; // the right array is shorter
+
+ /* compare the two corresponding elements */
+ intrusive_ptr<const Value> plv(pli->next());
+ intrusive_ptr<const Value> prv(pri->next());
+ const int cmp = Value::compare(plv, prv);
+ if (cmp)
+ return cmp; // values are unequal
+ }
+
+ /* NOTREACHED */
+ assert(false);
+ break;
+ }
+
+ case BinData:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case jstOID:
+ if (rL->oidValue < rR->oidValue)
+ return -1;
+ if (rL->oidValue == rR->oidValue)
+ return 0;
+ return 1;
+
+ case Bool:
+ if (rL->simple.boolValue == rR->simple.boolValue)
+ return 0;
+ if (rL->simple.boolValue)
+ return 1;
+ return -1;
+
+ case Date:
+ if (rL->dateValue < rR->dateValue)
+ return -1;
+ if (rL->dateValue > rR->dateValue)
+ return 1;
+ return 0;
+
+ case RegEx:
+ return rL->stringValue.compare(rR->stringValue);
+
+ case Symbol:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case CodeWScope:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case NumberInt:
+ if (rL->simple.intValue < rR->simple.intValue)
+ return -1;
+ if (rL->simple.intValue > rR->simple.intValue)
+ return 1;
+ return 0;
+
+ case Timestamp:
+ if (rL->dateValue < rR->dateValue)
+ return -1;
+ if (rL->dateValue > rR->dateValue)
+ return 1;
+ return 0;
+
+ case NumberLong:
+ if (rL->simple.longValue < rR->simple.longValue)
+ return -1;
+ if (rL->simple.longValue > rR->simple.longValue)
+ return 1;
+ return 0;
+
+ case Undefined:
+ case jstNULL:
+ return 0; // treat two Undefined or NULL values as equal
+
+ /* these shouldn't happen in this context */
+ case MinKey:
+ case EOO:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ break;
+ } // switch(lType)
+
+ /* NOTREACHED */
+ return 0;
+ }
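+
+    /*
+      Ordering sketch for compare() above (illustrative only):
+
+        compare(Value::getUndefined(), Value::getNull());    // -1
+        compare(Value::getNull(), Value::createInt(0));      // -1
+        compare(Value::createInt(1), Value::createInt(2));   // -1
+        compare(Value::createInt(1), Value::createLong(1));  // uasserts (16016);
+                                       // for now, only like types compare
+    */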
+
+ void Value::hash_combine(size_t &seed) const {
+ BSONType type = getType();
+ boost::hash_combine(seed, (int)type);
+
+ switch(type) {
+ case NumberDouble:
+ boost::hash_combine(seed, simple.doubleValue);
+ break;
+
+ case String:
+ boost::hash_combine(seed, stringValue);
+ break;
+
+ case Object:
+ getDocument()->hash_combine(seed);
+ break;
+
+ case Array: {
+ intrusive_ptr<ValueIterator> pIter(getArray());
+ while(pIter->more()) {
+ intrusive_ptr<const Value> pValue(pIter->next());
+ pValue->hash_combine(seed);
+            }
+ break;
+ }
+
+ case BinData:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case jstOID:
+ oidValue.hash_combine(seed);
+ break;
+
+ case Bool:
+ boost::hash_combine(seed, simple.boolValue);
+ break;
+
+ case Date:
+ boost::hash_combine(seed, (unsigned long long)dateValue);
+ break;
+
+ case RegEx:
+ boost::hash_combine(seed, stringValue);
+ break;
+
+ case Symbol:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case CodeWScope:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case NumberInt:
+ boost::hash_combine(seed, simple.intValue);
+ break;
+
+ case Timestamp:
+ boost::hash_combine(seed, (unsigned long long)dateValue);
+ break;
+
+ case NumberLong:
+ boost::hash_combine(seed, simple.longValue);
+ break;
+
+ case Undefined:
+ case jstNULL:
+ break;
+
+ /* these shouldn't happen in this context */
+ case MinKey:
+ case EOO:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ break;
+ } // switch(type)
+ }
+
+ BSONType Value::getWidestNumeric(BSONType lType, BSONType rType) {
+ if (lType == NumberDouble) {
+ switch(rType) {
+ case NumberDouble:
+ case NumberLong:
+ case NumberInt:
+ case jstNULL:
+ case Undefined:
+ return NumberDouble;
+
+ default:
+ break;
+ }
+ }
+ else if (lType == NumberLong) {
+ switch(rType) {
+ case NumberDouble:
+ return NumberDouble;
+
+ case NumberLong:
+ case NumberInt:
+ case jstNULL:
+ case Undefined:
+ return NumberLong;
+
+ default:
+ break;
+ }
+ }
+ else if (lType == NumberInt) {
+ switch(rType) {
+ case NumberDouble:
+ return NumberDouble;
+
+ case NumberLong:
+ return NumberLong;
+
+ case NumberInt:
+ case jstNULL:
+ case Undefined:
+ return NumberInt;
+
+ default:
+ break;
+ }
+ }
+ else if ((lType == jstNULL) || (lType == Undefined)) {
+ switch(rType) {
+ case NumberDouble:
+ return NumberDouble;
+
+ case NumberLong:
+ return NumberLong;
+
+ case NumberInt:
+ return NumberInt;
+
+ default:
+ break;
+ }
+ }
+
+        /* non-numeric operands (e.g. String) fall through to here */
+        return Undefined;
+ }
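+
+    /*
+      Examples of the widening rules above:
+
+        getWidestNumeric(NumberInt, NumberLong);    // NumberLong
+        getWidestNumeric(NumberLong, NumberDouble); // NumberDouble
+        getWidestNumeric(jstNULL, NumberInt);       // NumberInt
+        getWidestNumeric(String, NumberInt);        // Undefined (non-numeric)
+    */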
+
+ size_t Value::getApproximateSize() const {
+ switch(type) {
+ case String:
+ return sizeof(Value) + stringValue.length();
+
+ case Object:
+ return sizeof(Value) + pDocumentValue->getApproximateSize();
+
+ case Array: {
+ size_t size = sizeof(Value);
+ const size_t n = vpValue.size();
+ for(size_t i = 0; i < n; ++i) {
+ size += vpValue[i]->getApproximateSize();
+ }
+ return size;
+ }
+
+ case NumberDouble:
+ case BinData:
+ case jstOID:
+ case Bool:
+ case Date:
+ case RegEx:
+ case Symbol:
+ case CodeWScope:
+ case NumberInt:
+ case Timestamp:
+ case NumberLong:
+ case jstNULL:
+ case Undefined:
+ return sizeof(Value);
+
+ /* these shouldn't happen in this context */
+ case MinKey:
+ case EOO:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ return sizeof(Value);
+ }
+
+ /*
+ We shouldn't get here. In order to make the implementor think about
+ these cases, they are all listed explicitly, above. The compiler
+ should complain if they aren't all listed, because there's no
+          default. However, not all compilers seem to do that. Therefore,
+ this final catch-all is here.
+ */
+ assert(false);
+ return sizeof(Value);
+ }
+
+
+ void ValueStatic::addRef() const {
+ }
+
+ void ValueStatic::release() const {
+ }
+
+}
diff --git a/src/mongo/db/pipeline/value.h b/src/mongo/db/pipeline/value.h
new file mode 100755
index 00000000000..8bd1bcbbbfd
--- /dev/null
+++ b/src/mongo/db/pipeline/value.h
@@ -0,0 +1,468 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+#include "bson/bsontypes.h"
+#include "util/intrusive_counter.h"
+
+namespace mongo {
+ class BSONElement;
+ class Builder;
+ class Document;
+ class Value;
+
+ class ValueIterator :
+ public IntrusiveCounterUnsigned {
+ public:
+ virtual ~ValueIterator();
+
+ /*
+          Ask if there are more values to return.
+
+          @returns true if there are more values, false otherwise
+ */
+ virtual bool more() const = 0;
+
+ /*
+          Advance the iterator and return the next value.
+
+          @returns the next Value
+ */
+ virtual intrusive_ptr<const Value> next() = 0;
+ };
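+
+    /*
+      Iteration sketch (illustrative; pValue is assumed to be an
+      array-valued Value):
+
+        intrusive_ptr<ValueIterator> pIter(pValue->getArray());
+        while(pIter->more()) {
+            intrusive_ptr<const Value> pItem(pIter->next());
+            // ... use pItem ...
+        }
+    */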
+
+
+ /*
+ Values are immutable, so these are passed around as
+ intrusive_ptr<const Value>.
+ */
+ class Value :
+ public IntrusiveCounterUnsigned {
+ public:
+ ~Value();
+
+ /*
+ Construct a Value from a BSONElement.
+
+ This ignores the name of the element, and only uses the value,
+ whatever type it is.
+
+ @returns a new Value initialized from the bsonElement
+ */
+ static intrusive_ptr<const Value> createFromBsonElement(
+ BSONElement *pBsonElement);
+
+ /*
+ Construct an integer-valued Value.
+
+ For commonly used values, consider using one of the singleton
+ instances defined below.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createInt(int value);
+
+ /*
+          Construct a long long-valued Value.
+
+ For commonly used values, consider using one of the singleton
+ instances defined below.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createLong(long long value);
+
+ /*
+ Construct a double-valued Value.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createDouble(double value);
+
+ /*
+ Construct a string-valued Value.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createString(const string &value);
+
+ /*
+ Construct a date-valued Value.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createDate(const Date_t &value);
+
+ /*
+ Construct a document-valued Value.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createDocument(
+ const intrusive_ptr<Document> &pDocument);
+
+ /*
+ Construct an array-valued Value.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createArray(
+ const vector<intrusive_ptr<const Value> > &vpValue);
+
+ /*
+          Get the BSON type of the value.
+
+          If the type is jstNULL, no value getter will work.
+
+          @return the BSON type of the value.
+ */
+ BSONType getType() const;
+
+ /*
+ Getters.
+
+ @returns the Value's value; asserts if the requested value type is
+ incorrect.
+ */
+ double getDouble() const;
+ string getString() const;
+ intrusive_ptr<Document> getDocument() const;
+ intrusive_ptr<ValueIterator> getArray() const;
+ OID getOid() const;
+ bool getBool() const;
+ Date_t getDate() const;
+ string getRegex() const;
+ string getSymbol() const;
+ int getInt() const;
+ unsigned long long getTimestamp() const;
+ long long getLong() const;
+
+ /*
+ Get the length of an array value.
+
+          @returns the length of the array, if this is array-valued;
+            otherwise asserts
+ */
+ size_t getArrayLength() const;
+
+ /*
+ Add this value to the BSON object under construction.
+ */
+ void addToBsonObj(BSONObjBuilder *pBuilder, string fieldName) const;
+
+ /*
+          Add this value to the BSON array under construction.
+
+          Values in an array have no field names, so none is used.
+ */
+ void addToBsonArray(BSONArrayBuilder *pBuilder) const;
+
+ /*
+ Get references to singleton instances of commonly used field values.
+ */
+ static intrusive_ptr<const Value> getUndefined();
+ static intrusive_ptr<const Value> getNull();
+ static intrusive_ptr<const Value> getTrue();
+ static intrusive_ptr<const Value> getFalse();
+ static intrusive_ptr<const Value> getMinusOne();
+ static intrusive_ptr<const Value> getZero();
+ static intrusive_ptr<const Value> getOne();
+
+ /*
+ Coerce (cast) a value to a native bool, using JSON rules.
+
+ @returns the bool value
+ */
+ bool coerceToBool() const;
+
+ /*
+ Coerce (cast) a value to a Boolean Value, using JSON rules.
+
+          @returns the Boolean Value
+ */
+ intrusive_ptr<const Value> coerceToBoolean() const;
+
+ /*
+ Coerce (cast) a value to an int, using JSON rules.
+
+ @returns the int value
+ */
+ int coerceToInt() const;
+
+ /*
+ Coerce (cast) a value to a long long, using JSON rules.
+
+ @returns the long value
+ */
+ long long coerceToLong() const;
+
+ /*
+ Coerce (cast) a value to a double, using JSON rules.
+
+ @returns the double value
+ */
+ double coerceToDouble() const;
+
+ /*
+ Coerce (cast) a value to a date, using JSON rules.
+
+ @returns the date value
+ */
+ Date_t coerceToDate() const;
+
+ /*
+ Coerce (cast) a value to a string, using JSON rules.
+
+          @returns the string value
+ */
+ string coerceToString() const;
+
+ /*
+ Compare two Values.
+
+ @param rL left value
+ @param rR right value
+ @returns an integer less than zero, zero, or an integer greater than
+ zero, depending on whether rL < rR, rL == rR, or rL > rR
+ */
+ static int compare(const intrusive_ptr<const Value> &rL,
+ const intrusive_ptr<const Value> &rR);
+
+
+ /*
+ Figure out what the widest of two numeric types is.
+
+ Widest can be thought of as "most capable," or "able to hold the
+ largest or most precise value." The progression is Int, Long, Double.
+
+          @param lType the left operand's BSON type
+          @param rType the right operand's BSON type
+          @returns a BSONType of NumberInt, NumberLong, or NumberDouble;
+            Undefined if either operand is non-numeric
+ */
+ static BSONType getWidestNumeric(BSONType lType, BSONType rType);
+
+ /*
+ Get the approximate storage size of the value, in bytes.
+
+ @returns approximate storage size of the value.
+ */
+ size_t getApproximateSize() const;
+
+ /*
+ Calculate a hash value.
+
+ Meant to be used to create composite hashes suitable for
+ boost classes such as unordered_map<>.
+
+          @param seed the value to augment with this Value's hash
+ */
+ void hash_combine(size_t &seed) const;
+
+ /*
+ struct Hash is defined to enable the use of Values as
+ keys in boost::unordered_map<>.
+
+ Values are always referenced as immutables in the form
+ intrusive_ptr<const Value>, so these operate on that construction.
+ */
+ struct Hash :
+ unary_function<intrusive_ptr<const Value>, size_t> {
+ size_t operator()(const intrusive_ptr<const Value> &rV) const;
+ };
+
+ protected:
+ Value(); // creates null value
+        Value(BSONType type); // creates an empty (uninitialized) value of the given type
+                              // mostly useful for Undefined
+ Value(bool boolValue);
+ Value(int intValue);
+
+ private:
+ Value(BSONElement *pBsonElement);
+
+ Value(long long longValue);
+ Value(double doubleValue);
+ Value(const Date_t &dateValue);
+ Value(const string &stringValue);
+ Value(const intrusive_ptr<Document> &pDocument);
+ Value(const vector<intrusive_ptr<const Value> > &vpValue);
+
+ void addToBson(Builder *pBuilder) const;
+
+ BSONType type;
+
+ /* store value in one of these */
+ union {
+ double doubleValue;
+ bool boolValue;
+ int intValue;
+ unsigned long long timestampValue;
+ long long longValue;
+
+ } simple; // values that don't need a ctor/dtor
+ OID oidValue;
+ Date_t dateValue;
+ string stringValue; // String, Regex, Symbol
+ intrusive_ptr<Document> pDocumentValue;
+ vector<intrusive_ptr<const Value> > vpValue; // for arrays
+
+
+ /*
+ These are often used as the result of boolean or comparison
+ expressions.
+
+ These are obtained via public static getters defined above.
+ */
+ static const intrusive_ptr<const Value> pFieldUndefined;
+ static const intrusive_ptr<const Value> pFieldNull;
+ static const intrusive_ptr<const Value> pFieldTrue;
+ static const intrusive_ptr<const Value> pFieldFalse;
+ static const intrusive_ptr<const Value> pFieldMinusOne;
+ static const intrusive_ptr<const Value> pFieldZero;
+ static const intrusive_ptr<const Value> pFieldOne;
+
+ /* this implementation is used for getArray() */
+ class vi :
+ public ValueIterator {
+ public:
+ // virtuals from ValueIterator
+ virtual ~vi();
+ virtual bool more() const;
+ virtual intrusive_ptr<const Value> next();
+
+ private:
+ friend class Value;
+ vi(const intrusive_ptr<const Value> &pSource,
+ const vector<intrusive_ptr<const Value> > *pvpValue);
+
+            size_t size;
+            size_t nextIndex;
+            const vector<intrusive_ptr<const Value> > *pvpValue;
+            // keep a reference to the owning Value so the underlying
+            // vector stays alive while this iterator is in use
+            intrusive_ptr<const Value> pSource;
+ }; /* class vi */
+
+ };
+
+ /*
+ Equality operator for values.
+
+ Useful for unordered_map<>, etc.
+ */
+ inline bool operator==(const intrusive_ptr<const Value> &v1,
+ const intrusive_ptr<const Value> &v2) {
+ return (Value::compare(v1, v2) == 0);
+ }
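+
+    /*
+      With Value::Hash and the equality operator above, immutable Values
+      can be used as hash keys; a sketch (assuming boost/unordered_map.hpp
+      is available; the mapped type here is arbitrary):
+
+        boost::unordered_map<intrusive_ptr<const Value>, long,
+                             Value::Hash> counts;
+        counts[Value::createInt(7)]++;
+    */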
+
+ /*
+ For performance reasons, there are various sharable static values
+ defined in class Value, obtainable by methods such as getUndefined(),
+ getTrue(), getOne(), etc. We don't want these to go away as they are
+ used by a multitude of threads evaluating pipelines. In order to avoid
+ having to use atomic integers in the intrusive reference counter, this
+ class overrides the reference counting methods to do nothing, making it
+ safe to use for static Values.
+
+ At this point, only the constructors necessary for the static Values in
+ common use have been defined. The remainder can be defined if necessary.
+ */
+ class ValueStatic :
+ public Value {
+ public:
+ // virtuals from IntrusiveCounterUnsigned
+ virtual void addRef() const;
+ virtual void release() const;
+
+ // constructors
+ ValueStatic();
+ ValueStatic(BSONType type);
+ ValueStatic(bool boolValue);
+ ValueStatic(int intValue);
+ };
+}
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline BSONType Value::getType() const {
+ return type;
+ }
+
+ inline size_t Value::getArrayLength() const {
+ assert(getType() == Array);
+ return vpValue.size();
+ }
+
+ inline intrusive_ptr<const Value> Value::getUndefined() {
+ return pFieldUndefined;
+ }
+
+ inline intrusive_ptr<const Value> Value::getNull() {
+ return pFieldNull;
+ }
+
+ inline intrusive_ptr<const Value> Value::getTrue() {
+ return pFieldTrue;
+ }
+
+ inline intrusive_ptr<const Value> Value::getFalse() {
+ return pFieldFalse;
+ }
+
+ inline intrusive_ptr<const Value> Value::getMinusOne() {
+ return pFieldMinusOne;
+ }
+
+ inline intrusive_ptr<const Value> Value::getZero() {
+ return pFieldZero;
+ }
+
+ inline intrusive_ptr<const Value> Value::getOne() {
+ return pFieldOne;
+ }
+
+ inline size_t Value::Hash::operator()(
+ const intrusive_ptr<const Value> &rV) const {
+ size_t seed = 0xf0afbeef;
+ rV->hash_combine(seed);
+ return seed;
+ }
+
+ inline ValueStatic::ValueStatic():
+ Value() {
+ }
+
+ inline ValueStatic::ValueStatic(BSONType type):
+ Value(type) {
+ }
+
+ inline ValueStatic::ValueStatic(bool boolValue):
+ Value(boolValue) {
+ }
+
+ inline ValueStatic::ValueStatic(int intValue):
+ Value(intValue) {
+ }
+
+}
diff --git a/src/mongo/db/projection.cpp b/src/mongo/db/projection.cpp
new file mode 100644
index 00000000000..d07e56527af
--- /dev/null
+++ b/src/mongo/db/projection.cpp
@@ -0,0 +1,301 @@
+// projection.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "projection.h"
+#include "../util/mongoutils/str.h"
+
+namespace mongo {
+
+ void Projection::init( const BSONObj& o ) {
+ massert( 10371 , "can only add to Projection once", _source.isEmpty());
+ _source = o;
+
+ BSONObjIterator i( o );
+ int true_false = -1;
+ while ( i.more() ) {
+ BSONElement e = i.next();
+
+ if ( ! e.isNumber() )
+ _hasNonSimple = true;
+
+ if (e.type() == Object) {
+ BSONObj obj = e.embeddedObject();
+ BSONElement e2 = obj.firstElement();
+ if ( strcmp(e2.fieldName(), "$slice") == 0 ) {
+ if (e2.isNumber()) {
+ int i = e2.numberInt();
+ if (i < 0)
+ add(e.fieldName(), i, -i); // limit is now positive
+ else
+ add(e.fieldName(), 0, i);
+
+ }
+ else if (e2.type() == Array) {
+ BSONObj arr = e2.embeddedObject();
+ uassert(13099, "$slice array wrong size", arr.nFields() == 2 );
+
+ BSONObjIterator it(arr);
+ int skip = it.next().numberInt();
+ int limit = it.next().numberInt();
+ uassert(13100, "$slice limit must be positive", limit > 0 );
+ add(e.fieldName(), skip, limit);
+
+ }
+ else {
+ uassert(13098, "$slice only supports numbers and [skip, limit] arrays", false);
+ }
+ }
+ else {
+ uassert(13097, string("Unsupported projection option: ") + obj.firstElementFieldName(), false);
+ }
+
+ }
+ else if (!strcmp(e.fieldName(), "_id") && !e.trueValue()) {
+ _includeID = false;
+
+ }
+ else {
+
+ add (e.fieldName(), e.trueValue());
+
+ // validate input
+ if (true_false == -1) {
+ true_false = e.trueValue();
+ _include = !e.trueValue();
+ }
+ else {
+ uassert( 10053 , "You cannot currently mix including and excluding fields. Contact us if this is an issue." ,
+ (bool)true_false == e.trueValue() );
+ }
+ }
+ }
+ }
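+
+    /*
+      Examples of specs handled by init() above (illustrative):
+
+        { "x" : 1, "a.y" : 1 }              // include x and a.y (and _id)
+        { "x" : 0 }                         // exclude x, include the rest
+        { "_id" : 0, "x" : 1 }              // include x, suppress _id
+        { "arr" : { "$slice" : -5 } }       // last 5 elements of arr
+        { "arr" : { "$slice" : [20, 10] } } // skip 20 elements, return 10
+    */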
+
+ void Projection::add(const string& field, bool include) {
+ if (field.empty()) { // this is the field the user referred to
+ _include = include;
+ }
+ else {
+ _include = !include;
+
+ const size_t dot = field.find('.');
+ const string subfield = field.substr(0,dot);
+ const string rest = (dot == string::npos ? "" : field.substr(dot+1,string::npos));
+
+ boost::shared_ptr<Projection>& fm = _fields[subfield];
+ if (!fm)
+ fm.reset(new Projection());
+
+ fm->add(rest, include);
+ }
+ }
+
+ void Projection::add(const string& field, int skip, int limit) {
+ _special = true; // can't include or exclude whole object
+
+ if (field.empty()) { // this is the field the user referred to
+ _skip = skip;
+ _limit = limit;
+ }
+ else {
+ const size_t dot = field.find('.');
+ const string subfield = field.substr(0,dot);
+ const string rest = (dot == string::npos ? "" : field.substr(dot+1,string::npos));
+
+ boost::shared_ptr<Projection>& fm = _fields[subfield];
+ if (!fm)
+ fm.reset(new Projection());
+
+ fm->add(rest, skip, limit);
+ }
+ }
+
+ void Projection::transform( const BSONObj& in , BSONObjBuilder& b ) const {
+ BSONObjIterator i(in);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( mongoutils::str::equals( "_id" , e.fieldName() ) ) {
+ if ( _includeID )
+ b.append( e );
+ }
+ else {
+ append( b , e );
+ }
+ }
+ }
+
+ BSONObj Projection::transform( const BSONObj& in ) const {
+ BSONObjBuilder b;
+ transform( in , b );
+ return b.obj();
+ }
+
+
+    // 'a' is the embedded object of an array-typed BSONElement; the filtered elements are appended to b
+ void Projection::appendArray( BSONObjBuilder& b , const BSONObj& a , bool nested) const {
+ int skip = nested ? 0 : _skip;
+ int limit = nested ? -1 : _limit;
+
+ if (skip < 0) {
+ skip = max(0, skip + a.nFields());
+ }
+
+ int i=0;
+ BSONObjIterator it(a);
+ while (it.more()) {
+ BSONElement e = it.next();
+
+ if (skip) {
+ skip--;
+ continue;
+ }
+
+ if (limit != -1 && (limit-- == 0)) {
+ break;
+ }
+
+ switch(e.type()) {
+ case Array: {
+ BSONObjBuilder subb;
+ appendArray(subb , e.embeddedObject(), true);
+ b.appendArray(b.numStr(i++), subb.obj());
+ break;
+ }
+ case Object: {
+ BSONObjBuilder subb;
+ BSONObjIterator jt(e.embeddedObject());
+ while (jt.more()) {
+ append(subb , jt.next());
+ }
+ b.append(b.numStr(i++), subb.obj());
+ break;
+ }
+ default:
+ if (_include)
+ b.appendAs(e, b.numStr(i++));
+ }
+ }
+ }
+
+ void Projection::append( BSONObjBuilder& b , const BSONElement& e ) const {
+ FieldMap::const_iterator field = _fields.find( e.fieldName() );
+
+ if (field == _fields.end()) {
+ if (_include)
+ b.append(e);
+ }
+ else {
+ Projection& subfm = *field->second;
+
+ if ((subfm._fields.empty() && !subfm._special) || !(e.type()==Object || e.type()==Array) ) {
+ if (subfm._include)
+ b.append(e);
+ }
+ else if (e.type() == Object) {
+ BSONObjBuilder subb;
+ BSONObjIterator it(e.embeddedObject());
+ while (it.more()) {
+ subfm.append(subb, it.next());
+ }
+ b.append(e.fieldName(), subb.obj());
+
+ }
+ else { //Array
+ BSONObjBuilder subb;
+ subfm.appendArray(subb, e.embeddedObject());
+ b.appendArray(e.fieldName(), subb.obj());
+ }
+ }
+ }
+
+ Projection::KeyOnly* Projection::checkKey( const BSONObj& keyPattern ) const {
+ if ( _include ) {
+ // if we default to including then we can't
+ // use an index because we don't know what we're missing
+ return 0;
+ }
+
+ if ( _hasNonSimple )
+ return 0;
+
+ if ( _includeID && keyPattern["_id"].eoo() )
+ return 0;
+
+        // at this point we know it's all { x : 1 } style
+
+ auto_ptr<KeyOnly> p( new KeyOnly() );
+
+ int got = 0;
+ BSONObjIterator i( keyPattern );
+ while ( i.more() ) {
+ BSONElement k = i.next();
+
+ if ( _source[k.fieldName()].type() ) {
+
+ if ( strchr( k.fieldName() , '.' ) ) {
+ // TODO we currently don't support dotted fields
+ // SERVER-2104
+ return 0;
+ }
+
+ if ( ! _includeID && mongoutils::str::equals( k.fieldName() , "_id" ) ) {
+ p->addNo();
+ }
+ else {
+ p->addYes( k.fieldName() );
+ got++;
+ }
+ }
+ else if ( mongoutils::str::equals( "_id" , k.fieldName() ) && _includeID ) {
+ p->addYes( "_id" );
+ }
+ else {
+ p->addNo();
+ }
+
+ }
+
+ int need = _source.nFields();
+ if ( ! _includeID )
+ need--;
+
+ if ( got == need )
+ return p.release();
+
+ return 0;
+ }
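+
+    /*
+      A checkKey() sketch: with spec { "a" : 1, "_id" : 0 } and key
+      pattern { "a" : 1 }, every projected field lives in the index key,
+      so a KeyOnly is returned and the covered documents never need to
+      be fetched (names here are illustrative):
+
+        Projection p;
+        p.init( BSON( "a" << 1 << "_id" << 0 ) );
+        auto_ptr<Projection::KeyOnly> keyOnly( p.checkKey( BSON( "a" << 1 ) ) );
+        // keyOnly.get() != 0; keyOnly->hydrate( indexKey ) rebuilds { a : ... }
+    */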
+
+ BSONObj Projection::KeyOnly::hydrate( const BSONObj& key ) const {
+ assert( _include.size() == _names.size() );
+
+ BSONObjBuilder b( key.objsize() + _stringSize + 16 );
+
+ BSONObjIterator i(key);
+ unsigned n=0;
+ while ( i.more() ) {
+ assert( n < _include.size() );
+ BSONElement e = i.next();
+ if ( _include[n] ) {
+ b.appendAs( e , _names[n] );
+ }
+ n++;
+ }
+
+ return b.obj();
+ }
+}
diff --git a/src/mongo/db/projection.h b/src/mongo/db/projection.h
new file mode 100644
index 00000000000..b5e0a0c4289
--- /dev/null
+++ b/src/mongo/db/projection.h
@@ -0,0 +1,129 @@
+// projection.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "pch.h"
+#include "jsobj.h"
+
+namespace mongo {
+
+ /**
+     * Given a projection specification, transforms documents accordingly.
+     * Currently supports field inclusion/exclusion and $slice.
+ */
+ class Projection {
+ public:
+
+ class KeyOnly {
+ public:
+
+ KeyOnly() : _stringSize(0) {}
+
+ BSONObj hydrate( const BSONObj& key ) const;
+
+ void addNo() { _add( false , "" ); }
+ void addYes( const string& name ) { _add( true , name ); }
+
+ private:
+
+ void _add( bool b , const string& name ) {
+ _include.push_back( b );
+ _names.push_back( name );
+ _stringSize += name.size();
+ }
+
+ vector<bool> _include; // one entry per field in key. true iff should be in output
+ vector<string> _names; // name of field since key doesn't have names
+
+ int _stringSize;
+ };
+
+ Projection() :
+ _include(true) ,
+ _special(false) ,
+ _includeID(true) ,
+ _skip(0) ,
+ _limit(-1) ,
+ _hasNonSimple(false) {
+ }
+
+ /**
+         * initializes from the projection spec; may only be called once
+         * e.g. { "x" : 1 , "a.y" : 1 }
+ */
+ void init( const BSONObj& spec );
+
+ /**
+ * @return the spec init was called with
+ */
+ BSONObj getSpec() const { return _source; }
+
+ /**
+ * transforms in according to spec
+ */
+ BSONObj transform( const BSONObj& in ) const;
+
+
+ /**
+ * transforms in according to spec
+ */
+ void transform( const BSONObj& in , BSONObjBuilder& b ) const;
+
+
+ /**
+         * @return a new KeyOnly if the keyPattern carries all the information
+         *         needed to satisfy the projection, otherwise null
+         * NOTE: a key may have modified the actual data,
+         *       which has to be handled above this layer (arrays, geo)
+ */
+ KeyOnly* checkKey( const BSONObj& keyPattern ) const;
+
+ bool includeID() const { return _includeID; }
+
+ private:
+
+ /**
+ * appends e to b if user wants it
+ * will descend into e if needed
+ */
+ void append( BSONObjBuilder& b , const BSONElement& e ) const;
+
+
+ void add( const string& field, bool include );
+ void add( const string& field, int skip, int limit );
+ void appendArray( BSONObjBuilder& b , const BSONObj& a , bool nested=false) const;
+
+ bool _include; // true if default at this level is to include
+ bool _special; // true if this level can't be skipped or included without recursing
+
+ //TODO: benchmark vector<pair> vs map
+ typedef map<string, boost::shared_ptr<Projection> > FieldMap;
+ FieldMap _fields;
+ BSONObj _source;
+ bool _includeID;
+
+ // used for $slice operator
+ int _skip;
+ int _limit;
+
+ bool _hasNonSimple;
+ };
+
+
+}
diff --git a/src/mongo/db/queryoptimizer.cpp b/src/mongo/db/queryoptimizer.cpp
new file mode 100644
index 00000000000..9d9040d51e2
--- /dev/null
+++ b/src/mongo/db/queryoptimizer.cpp
@@ -0,0 +1,1337 @@
+// @file queryoptimizer.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db.h"
+#include "btree.h"
+#include "pdfile.h"
+#include "queryoptimizer.h"
+#include "cmdline.h"
+#include "clientcursor.h"
+
+//#define DEBUGQO(x) cout << x << endl;
+#define DEBUGQO(x)
+
+namespace mongo {
+
+ void checkTableScanAllowed( const char * ns ) {
+ if ( ! cmdLine.noTableScan )
+ return;
+
+ if ( strstr( ns , ".system." ) ||
+ strstr( ns , "local." ) )
+ return;
+
+ if ( ! nsdetails( ns ) )
+ return;
+
+ uassert( 10111 , (string)"table scans not allowed:" + ns , ! cmdLine.noTableScan );
+ }
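+
+    /*
+      A sketch of checkTableScanAllowed() when mongod runs with
+      --notablescan (otherwise it is a no-op); assumes the first
+      collection exists:
+
+        checkTableScanAllowed( "test.foo" );          // uasserts 10111
+        checkTableScanAllowed( "local.oplog.$main" ); // ok - exempt
+        checkTableScanAllowed( "test.system.users" ); // ok - exempt
+    */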
+
+ double elementDirection( const BSONElement &e ) {
+ if ( e.isNumber() )
+ return e.number();
+ return 1;
+ }
+
+ QueryPlan::QueryPlan(
+ NamespaceDetails *d, int idxNo,
+ const FieldRangeSetPair &frsp, const FieldRangeSetPair *originalFrsp, const BSONObj &originalQuery, const BSONObj &order, bool mustAssertOnYieldFailure, const BSONObj &startKey, const BSONObj &endKey , string special ) :
+ _d(d), _idxNo(idxNo),
+ _frs( frsp.frsForIndex( _d, _idxNo ) ),
+ _frsMulti( frsp.frsForIndex( _d, -1 ) ),
+ _originalQuery( originalQuery ),
+ _order( order ),
+ _index( 0 ),
+ _optimal( false ),
+ _scanAndOrderRequired( true ),
+ _exactKeyMatch( false ),
+ _direction( 0 ),
+ _endKeyInclusive( endKey.isEmpty() ),
+ _unhelpful( false ),
+ _impossible( false ),
+ _special( special ),
+ _type(0),
+ _startOrEndSpec( !startKey.isEmpty() || !endKey.isEmpty() ),
+ _mustAssertOnYieldFailure( mustAssertOnYieldFailure ) {
+
+ BSONObj idxKey = _idxNo < 0 ? BSONObj() : d->idx( _idxNo ).keyPattern();
+
+ if ( !_frs.matchPossibleForIndex( idxKey ) ) {
+ _impossible = true;
+ _scanAndOrderRequired = false;
+ return;
+ }
+
+ if ( willScanTable() ) {
+ if ( _order.isEmpty() || !strcmp( _order.firstElementFieldName(), "$natural" ) )
+ _scanAndOrderRequired = false;
+ return;
+ }
+
+ _index = &d->idx(_idxNo);
+
+ // If the parsing or index indicates this is a special query, don't continue the processing
+ if ( _special.size() ||
+ ( _index->getSpec().getType() && _index->getSpec().getType()->suitability( originalQuery, order ) != USELESS ) ) {
+
+ if( _special.size() ) _optimal = true;
+
+ _type = _index->getSpec().getType();
+ if( !_special.size() ) _special = _index->getSpec().getType()->getPlugin()->getName();
+
+ massert( 13040 , (string)"no type for special: " + _special , _type );
+ // hopefully safe to use original query in these contexts - don't think we can mix special with $or clause separation yet
+ _scanAndOrderRequired = _type->scanAndOrderRequired( _originalQuery , order );
+ return;
+ }
+
+ const IndexSpec &idxSpec = _index->getSpec();
+ BSONObjIterator o( order );
+ BSONObjIterator k( idxKey );
+ if ( !o.moreWithEOO() )
+ _scanAndOrderRequired = false;
+ while( o.moreWithEOO() ) {
+ BSONElement oe = o.next();
+ if ( oe.eoo() ) {
+ _scanAndOrderRequired = false;
+ break;
+ }
+ if ( !k.moreWithEOO() )
+ break;
+ BSONElement ke;
+ while( 1 ) {
+ ke = k.next();
+ if ( ke.eoo() )
+ goto doneCheckOrder;
+ if ( strcmp( oe.fieldName(), ke.fieldName() ) == 0 )
+ break;
+ if ( !_frs.range( ke.fieldName() ).equality() )
+ goto doneCheckOrder;
+ }
+ int d = elementDirection( oe ) == elementDirection( ke ) ? 1 : -1;
+ if ( _direction == 0 )
+ _direction = d;
+ else if ( _direction != d )
+ break;
+ }
+doneCheckOrder:
+ if ( _scanAndOrderRequired )
+ _direction = 0;
+ BSONObjIterator i( idxKey );
+ int exactIndexedQueryCount = 0;
+ int optimalIndexedQueryCount = 0;
+ bool stillOptimalIndexedQueryCount = true;
+ set<string> orderFieldsUnindexed;
+ order.getFieldNames( orderFieldsUnindexed );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ const FieldRange &fr = _frs.range( e.fieldName() );
+ if ( stillOptimalIndexedQueryCount ) {
+ if ( fr.nontrivial() )
+ ++optimalIndexedQueryCount;
+ if ( !fr.equality() )
+ stillOptimalIndexedQueryCount = false;
+ }
+ else {
+ if ( fr.nontrivial() )
+ optimalIndexedQueryCount = -1;
+ }
+ if ( fr.equality() ) {
+ BSONElement e = fr.max();
+ if ( !e.isNumber() && !e.mayEncapsulate() && e.type() != RegEx )
+ ++exactIndexedQueryCount;
+ }
+ orderFieldsUnindexed.erase( e.fieldName() );
+ }
+ if ( !_scanAndOrderRequired &&
+ ( optimalIndexedQueryCount == _frs.nNontrivialRanges() ) )
+ _optimal = true;
+ if ( exactIndexedQueryCount == _frs.nNontrivialRanges() &&
+ orderFieldsUnindexed.size() == 0 &&
+ exactIndexedQueryCount == idxKey.nFields() &&
+ exactIndexedQueryCount == _originalQuery.nFields() ) {
+ _exactKeyMatch = true;
+ }
+ _frv.reset( new FieldRangeVector( _frs, idxSpec, _direction ) );
+ if ( originalFrsp ) {
+ _originalFrv.reset( new FieldRangeVector( originalFrsp->frsForIndex( _d, _idxNo ), idxSpec, _direction ) );
+ }
+ else {
+ _originalFrv = _frv;
+ }
+ if ( _startOrEndSpec ) {
+ if ( !startKey.isEmpty() )
+ _startKey = startKey;
+ else
+ _startKey = _frv->startKey();
+ if ( !endKey.isEmpty() )
+ _endKey = endKey;
+ else
+ _endKey = _frv->endKey();
+ }
+
+ if ( ( _scanAndOrderRequired || _order.isEmpty() ) &&
+ !_frs.range( idxKey.firstElementFieldName() ).nontrivial() ) {
+ _unhelpful = true;
+ }
+ }
+
+ shared_ptr<Cursor> QueryPlan::newCursor( const DiskLoc &startLoc , int numWanted ) const {
+
+ if ( _type ) {
+ // hopefully safe to use original query in these contexts - don't think we can mix type with $or clause separation yet
+ return _type->newCursor( _originalQuery , _order , numWanted );
+ }
+
+ if ( _impossible ) {
+ // TODO We might want to allow this dummy table scan even in no table
+ // scan mode, since it won't scan anything.
+ if ( _frs.nNontrivialRanges() )
+ checkTableScanAllowed( _frs.ns() );
+ return shared_ptr<Cursor>( new BasicCursor( DiskLoc() ) );
+ }
+
+ if ( willScanTable() ) {
+ if ( _frs.nNontrivialRanges() ) {
+ checkTableScanAllowed( _frs.ns() );
+
+ // if we are doing a table scan on _id
+ // and it's a capped collection
+ // we warn /*disallow*/ as it's a common user error
+ // .system. and local collections are exempt
+ if ( _d && _d->capped && _frs.range( "_id" ).nontrivial() ) {
+ if ( cc().isSyncThread() ||
+ str::contains( _frs.ns() , ".system." ) ||
+ str::startsWith( _frs.ns() , "local." ) ) {
+ // ok
+ }
+ else {
+                        warning() << "_id query on capped collection without an _id index; performance will be poor. collection: " << _frs.ns() << endl;
+ //uassert( 14820, str::stream() << "doing _id query on a capped collection without an index is not allowed: " << _frs.ns() ,
+ }
+ }
+ }
+ return findTableScan( _frs.ns(), _order, startLoc );
+ }
+
+ massert( 10363 , "newCursor() with start location not implemented for indexed plans", startLoc.isNull() );
+
+ if ( _startOrEndSpec ) {
+ // we are sure to spec _endKeyInclusive
+ return shared_ptr<Cursor>( BtreeCursor::make( _d, _idxNo, *_index, _startKey, _endKey, _endKeyInclusive, _direction >= 0 ? 1 : -1 ) );
+ }
+ else if ( _index->getSpec().getType() ) {
+ return shared_ptr<Cursor>( BtreeCursor::make( _d, _idxNo, *_index, _frv->startKey(), _frv->endKey(), true, _direction >= 0 ? 1 : -1 ) );
+ }
+ else {
+ return shared_ptr<Cursor>( BtreeCursor::make( _d, _idxNo, *_index, _frv, _direction >= 0 ? 1 : -1 ) );
+ }
+ }
+
+ shared_ptr<Cursor> QueryPlan::newReverseCursor() const {
+ if ( willScanTable() ) {
+ int orderSpec = _order.getIntField( "$natural" );
+ if ( orderSpec == INT_MIN )
+ orderSpec = 1;
+ return findTableScan( _frs.ns(), BSON( "$natural" << -orderSpec ) );
+ }
+ massert( 10364 , "newReverseCursor() not implemented for indexed plans", false );
+ return shared_ptr<Cursor>();
+ }
+
+ BSONObj QueryPlan::indexKey() const {
+ if ( !_index )
+ return BSON( "$natural" << 1 );
+ return _index->keyPattern();
+ }
+
+ void QueryPlan::registerSelf( long long nScanned ) const {
+ // Impossible query constraints can be detected before scanning, and we
+ // don't have a reserved pattern enum value for impossible constraints.
+ if ( _impossible ) {
+ return;
+ }
+
+ SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
+ NamespaceDetailsTransient::get_inlock( ns() ).registerIndexForPattern( _frs.pattern( _order ), indexKey(), nScanned );
+ }
+
+ /**
+ * @return a copy of the inheriting class, which will be run with its own
+ * query plan. If multiple plan sets are required for an $or query, the
+ * QueryOp of the winning plan from a given set will be cloned to generate
+ * QueryOps for the subsequent plan set. This function should only be called
+ * after the query op has completed executing.
+ */
+ QueryOp *QueryOp::createChild() {
+ if( _orConstraint.get() ) {
+ _matcher->advanceOrClause( _orConstraint );
+ _orConstraint.reset();
+ }
+ QueryOp *ret = _createChild();
+ ret->_oldMatcher = _matcher;
+ return ret;
+ }
+
+ bool QueryPlan::isMultiKey() const {
+ if ( _idxNo < 0 )
+ return false;
+ return _d->isMultikey( _idxNo );
+ }
+
+ void QueryOp::init() {
+ if ( _oldMatcher.get() ) {
+ _matcher.reset( _oldMatcher->nextClauseMatcher( qp().indexKey() ) );
+ }
+ else {
+ _matcher.reset( new CoveredIndexMatcher( qp().originalQuery(), qp().indexKey(), alwaysUseRecord() ) );
+ }
+ _init();
+ }
+
+ QueryPlanSet::QueryPlanSet( const char *ns, auto_ptr<FieldRangeSetPair> frsp, auto_ptr<FieldRangeSetPair> originalFrsp, const BSONObj &originalQuery, const BSONObj &order, bool mustAssertOnYieldFailure, const BSONElement *hint, bool honorRecordedPlan, const BSONObj &min, const BSONObj &max, bool bestGuessOnly, bool mayYield ) :
+ _ns(ns),
+ _originalQuery( originalQuery ),
+ _frsp( frsp ),
+ _originalFrsp( originalFrsp ),
+ _mayRecordPlan( false ),
+ _usingCachedPlan( false ),
+ _hint( BSONObj() ),
+ _order( order.getOwned() ),
+ _oldNScanned( 0 ),
+ _honorRecordedPlan( honorRecordedPlan ),
+ _min( min.getOwned() ),
+ _max( max.getOwned() ),
+ _bestGuessOnly( bestGuessOnly ),
+ _mayYield( mayYield ),
+ _yieldSometimesTracker( 256, 20 ),
+ _mustAssertOnYieldFailure( mustAssertOnYieldFailure ) {
+ if ( hint && !hint->eoo() ) {
+ _hint = hint->wrap();
+ }
+ init();
+ }
+
+ bool QueryPlanSet::modifiedKeys() const {
+ for( PlanSet::const_iterator i = _plans.begin(); i != _plans.end(); ++i )
+ if ( (*i)->isMultiKey() )
+ return true;
+ return false;
+ }
+
+ bool QueryPlanSet::hasMultiKey() const {
+ for( PlanSet::const_iterator i = _plans.begin(); i != _plans.end(); ++i )
+ if ( (*i)->isMultiKey() )
+ return true;
+ return false;
+ }
+
+
+ void QueryPlanSet::addHint( IndexDetails &id ) {
+ if ( !_min.isEmpty() || !_max.isEmpty() ) {
+ string errmsg;
+ BSONObj keyPattern = id.keyPattern();
+ // This reformats _min and _max to be used for index lookup.
+ massert( 10365 , errmsg, indexDetailsForRange( _frsp->ns(), errmsg, _min, _max, keyPattern ) );
+ }
+ NamespaceDetails *d = nsdetails(_ns);
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, d->idxNo(id), *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure, _min, _max ) ) );
+ }
+
+ // returns an IndexDetails * for a hint, 0 if hint is $natural.
+ // hint must not be eoo()
+ IndexDetails *parseHint( const BSONElement &hint, NamespaceDetails *d ) {
+ massert( 13292, "hint eoo", !hint.eoo() );
+ if( hint.type() == String ) {
+ string hintstr = hint.valuestr();
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& ii = i.next();
+ if ( ii.indexName() == hintstr ) {
+ return &ii;
+ }
+ }
+ }
+ else if( hint.type() == Object ) {
+ BSONObj hintobj = hint.embeddedObject();
+ uassert( 10112 , "bad hint", !hintobj.isEmpty() );
+ if ( !strcmp( hintobj.firstElementFieldName(), "$natural" ) ) {
+ return 0;
+ }
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& ii = i.next();
+ if( ii.keyPattern().woCompare(hintobj) == 0 ) {
+ return &ii;
+ }
+ }
+ }
+ uassert( 10113 , "bad hint", false );
+ return 0;
+ }
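+
+    /*
+      Hint values accepted by parseHint() above (illustrative):
+
+        "a_1"                // by index name
+        { "a" : 1 }          // by key pattern
+        { "$natural" : 1 }   // returns 0, forcing a table scan
+        anything else        // uasserts 10113 ("bad hint")
+    */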
+
+ void QueryPlanSet::init() {
+        DEBUGQO( "QueryPlanSet::init " << _ns << "\t" << _originalQuery );
+ _runner.reset();
+ _plans.clear();
+ _usingCachedPlan = false;
+
+ const char *ns = _frsp->ns();
+ NamespaceDetails *d = nsdetails( ns );
+ if ( !d || !_frsp->matchPossible() ) {
+ // Table scan plan, when no matches are possible
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ) );
+ return;
+ }
+
+ BSONElement hint = _hint.firstElement();
+ if ( !hint.eoo() ) {
+ IndexDetails *id = parseHint( hint, d );
+ if ( id ) {
+ addHint( *id );
+ }
+ else {
+ massert( 10366 , "natural order cannot be specified with $min/$max", _min.isEmpty() && _max.isEmpty() );
+ // Table scan plan
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ) );
+ }
+ return;
+ }
+
+ if ( !_min.isEmpty() || !_max.isEmpty() ) {
+ string errmsg;
+ BSONObj keyPattern;
+ IndexDetails *idx = indexDetailsForRange( ns, errmsg, _min, _max, keyPattern );
+ massert( 10367 , errmsg, idx );
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, d->idxNo(*idx), *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure, _min, _max ) ) );
+ return;
+ }
+
+ if ( isSimpleIdQuery( _originalQuery ) ) {
+ int idx = d->findIdIndex();
+ if ( idx >= 0 ) {
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d , idx , *_frsp , _originalFrsp.get() , _originalQuery, _order, _mustAssertOnYieldFailure ) ) );
+ return;
+ }
+ }
+
+ if ( _originalQuery.isEmpty() && _order.isEmpty() ) {
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ) );
+ return;
+ }
+
+ DEBUGQO( "\t special : " << _frsp->getSpecial() );
+ if ( _frsp->getSpecial().size() ) {
+ _special = _frsp->getSpecial();
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ int j = i.pos();
+ IndexDetails& ii = i.next();
+ const IndexSpec& spec = ii.getSpec();
+ if ( spec.getTypeName() == _special && spec.suitability( _originalQuery , _order ) ) {
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d , j , *_frsp , _originalFrsp.get() , _originalQuery, _order ,
+ _mustAssertOnYieldFailure , BSONObj() , BSONObj() , _special ) ) );
+ return;
+ }
+ }
+ uassert( 13038 , (string)"can't find special index: " + _special + " for: " + _originalQuery.toString() , 0 );
+ }
+
+ if ( _honorRecordedPlan ) {
+ pair< BSONObj, long long > best = QueryUtilIndexed::bestIndexForPatterns( *_frsp, _order );
+ BSONObj bestIndex = best.first;
+ long long oldNScanned = best.second;
+ if ( !bestIndex.isEmpty() ) {
+ QueryPlanPtr p;
+ _oldNScanned = oldNScanned;
+ if ( !strcmp( bestIndex.firstElementFieldName(), "$natural" ) ) {
+ // Table scan plan
+ p.reset( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) );
+ }
+
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ int j = i.pos();
+ IndexDetails& ii = i.next();
+ if( ii.keyPattern().woCompare(bestIndex) == 0 ) {
+ p.reset( new QueryPlan( d, j, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) );
+ }
+ }
+
+ massert( 10368 , "Unable to locate previously recorded index", p.get() );
+ if ( !( _bestGuessOnly && p->scanAndOrderRequired() ) ) {
+ _usingCachedPlan = true;
+ _plans.push_back( p );
+ return;
+ }
+ }
+ }
+
+ addOtherPlans( false );
+ }
+
+ void QueryPlanSet::addOtherPlans( bool checkFirst ) {
+ const char *ns = _frsp->ns();
+ NamespaceDetails *d = nsdetails( ns );
+ if ( !d )
+ return;
+
+ // If table scan is optimal or natural order requested or tailable cursor requested
+ if ( !_frsp->matchPossible() || ( _frsp->noNontrivialRanges() && _order.isEmpty() ) ||
+ ( !_order.isEmpty() && !strcmp( _order.firstElementFieldName(), "$natural" ) ) ) {
+ // Table scan plan
+ addPlan( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ), checkFirst );
+ return;
+ }
+
+ bool normalQuery = _hint.isEmpty() && _min.isEmpty() && _max.isEmpty();
+
+ PlanSet plans;
+ QueryPlanPtr optimalPlan;
+ QueryPlanPtr specialPlan;
+ for( int i = 0; i < d->nIndexes; ++i ) {
+ if ( normalQuery ) {
+ BSONObj keyPattern = d->idx( i ).keyPattern();
+ if ( !_frsp->matchPossibleForIndex( d, i, keyPattern ) ) {
+                    // If no match is possible, only generate a trivial plan that won't
+ // scan any documents.
+ QueryPlanPtr p( new QueryPlan( d, i, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) );
+ addPlan( p, checkFirst );
+ return;
+ }
+ if ( !QueryUtilIndexed::indexUseful( *_frsp, d, i, _order ) ) {
+ continue;
+ }
+ }
+
+ QueryPlanPtr p( new QueryPlan( d, i, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) );
+ if ( p->optimal() ) {
+ if ( !optimalPlan.get() ) {
+ optimalPlan = p;
+ }
+ }
+ else if ( !p->unhelpful() ) {
+ if ( p->special().empty() ) {
+ plans.push_back( p );
+ }
+ else {
+ specialPlan = p;
+ }
+ }
+ }
+ if ( optimalPlan.get() ) {
+ addPlan( optimalPlan, checkFirst );
+ return;
+ }
+ for( PlanSet::const_iterator i = plans.begin(); i != plans.end(); ++i ) {
+ addPlan( *i, checkFirst );
+ }
+
+ // Only add a special plan if no standard btree plans have been added. SERVER-4531
+ if ( plans.empty() && specialPlan ) {
+ addPlan( specialPlan, checkFirst );
+ return;
+ }
+
+ // Table scan plan
+ addPlan( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ), checkFirst );
+ _mayRecordPlan = true;
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::runOp( QueryOp &op ) {
+ if ( _usingCachedPlan ) {
+ Runner r( *this, op );
+ shared_ptr<QueryOp> res = r.runUntilFirstCompletes();
+ // _plans.size() > 1 if addOtherPlans was called in Runner::runUntilFirstCompletes().
+ if ( _bestGuessOnly || res->complete() || _plans.size() > 1 )
+ return res;
+ // A cached plan was used, so clear the plan for this query pattern and retry the query without a cached plan.
+            // Careful here, as the namespace may have been dropped.
+ QueryUtilIndexed::clearIndexesForPatterns( *_frsp, _order );
+ init();
+ }
+ Runner r( *this, op );
+ return r.runUntilFirstCompletes();
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::nextOp( QueryOp &originalOp, bool retried ) {
+ if ( !_runner ) {
+ _runner.reset( new Runner( *this, originalOp ) );
+ shared_ptr<QueryOp> op = _runner->init();
+ if ( op->complete() ) {
+ return op;
+ }
+ }
+ shared_ptr<QueryOp> op = _runner->nextNonError();
+ if ( !op->error() ) {
+ return op;
+ }
+ if ( !_usingCachedPlan || _bestGuessOnly || _plans.size() > 1 ) {
+ return op;
+ }
+
+ // Avoid an infinite loop here - this should never occur.
+ verify( 15878, !retried );
+
+ // A cached plan was used, so clear the plan for this query pattern and retry the query without a cached plan.
+ QueryUtilIndexed::clearIndexesForPatterns( *_frsp, _order );
+ init();
+ return nextOp( originalOp, true );
+ }
+
+ bool QueryPlanSet::prepareToYield() {
+ return _runner ? _runner->prepareToYield() : true;
+ }
+
+ void QueryPlanSet::recoverFromYield() {
+ if ( _runner ) {
+ _runner->recoverFromYield();
+ }
+ }
+
+ void QueryPlanSet::clearRunner() {
+ if ( _runner ) {
+ _runner.reset();
+ }
+ }
+
+ BSONObj QueryPlanSet::explain() const {
+ vector<BSONObj> arr;
+ for( PlanSet::const_iterator i = _plans.begin(); i != _plans.end(); ++i ) {
+ shared_ptr<Cursor> c = (*i)->newCursor();
+ BSONObjBuilder explain;
+ explain.append( "cursor", c->toString() );
+ explain.append( "indexBounds", c->prettyIndexBounds() );
+ arr.push_back( explain.obj() );
+ }
+ BSONObjBuilder b;
+ b.append( "allPlans", arr );
+ return b.obj();
+ }
+
+ QueryPlanSet::QueryPlanPtr QueryPlanSet::getBestGuess() const {
+ assert( _plans.size() );
+ if ( _plans[ 0 ]->scanAndOrderRequired() ) {
+ for ( unsigned i=1; i<_plans.size(); i++ ) {
+ if ( ! _plans[i]->scanAndOrderRequired() )
+ return _plans[i];
+ }
+
+ warning() << "best guess query plan requested, but scan and order are required for all plans "
+ << " query: " << _originalQuery
+ << " order: " << _order
+ << " choices: ";
+
+ for ( unsigned i=0; i<_plans.size(); i++ )
+ warning() << _plans[i]->indexKey() << " ";
+ warning() << endl;
+
+ return QueryPlanPtr();
+ }
+ return _plans[0];
+ }
+
+ QueryPlanSet::Runner::Runner( QueryPlanSet &plans, QueryOp &op ) :
+ _op( op ),
+ _plans( plans ) {
+ }
+
+ bool QueryPlanSet::Runner::prepareToYield() {
+ for( vector<shared_ptr<QueryOp> >::const_iterator i = _ops.begin(); i != _ops.end(); ++i ) {
+ if ( !prepareToYieldOp( **i ) ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ void QueryPlanSet::Runner::recoverFromYield() {
+ for( vector<shared_ptr<QueryOp> >::const_iterator i = _ops.begin(); i != _ops.end(); ++i ) {
+ recoverFromYieldOp( **i );
+ }
+ }
+
+ void QueryPlanSet::Runner::mayYield() {
+ if ( ! _plans._mayYield )
+ return;
+
+ if ( ! _plans._yieldSometimesTracker.intervalHasElapsed() )
+ return;
+
+ int micros = ClientCursor::suggestYieldMicros();
+ if ( micros <= 0 )
+ return;
+
+ if ( !prepareToYield() )
+ return;
+
+ ClientCursor::staticYield( micros , _plans._ns , 0 );
+ recoverFromYield();
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::Runner::init() {
+ massert( 10369 , "no plans", _plans._plans.size() > 0 );
+
+ if ( _plans._bestGuessOnly ) {
+ shared_ptr<QueryOp> op( _op.createChild() );
+ shared_ptr<QueryPlan> plan = _plans.getBestGuess();
+ massert( 15894, "no index matches QueryPlanSet's sort with _bestGuessOnly", plan.get() );
+ op->setQueryPlan( plan.get() );
+ _ops.push_back( op );
+ }
+ else {
+ if ( _plans._plans.size() > 1 )
+ log(1) << " running multiple plans" << endl;
+ for( PlanSet::iterator i = _plans._plans.begin(); i != _plans._plans.end(); ++i ) {
+ shared_ptr<QueryOp> op( _op.createChild() );
+ op->setQueryPlan( i->get() );
+ _ops.push_back( op );
+ }
+ }
+
+ // Initialize ops.
+ for( vector<shared_ptr<QueryOp> >::iterator i = _ops.begin(); i != _ops.end(); ++i ) {
+ initOp( **i );
+ if ( (*i)->complete() )
+ return *i;
+ }
+
+ // Put runnable ops in the priority queue.
+ for( vector<shared_ptr<QueryOp> >::iterator i = _ops.begin(); i != _ops.end(); ++i ) {
+ if ( !(*i)->error() ) {
+ _queue.push( *i );
+ }
+ }
+
+ return *_ops.begin();
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::Runner::nextNonError() {
+ if ( _queue.empty() ) {
+ return *_ops.begin();
+ }
+ shared_ptr<QueryOp> ret;
+ do {
+ ret = next();
+ } while( ret->error() && !_queue.empty() );
+ return ret;
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::Runner::next() {
+ mayYield();
+ dassert( !_queue.empty() );
+ OpHolder holder = _queue.pop();
+ QueryOp &op = *holder._op;
+ nextOp( op );
+ if ( op.complete() ) {
+ if ( _plans._mayRecordPlan && op.mayRecordPlan() ) {
+ op.qp().registerSelf( op.nscanned() );
+ }
+ return holder._op;
+ }
+ if ( op.error() ) {
+ return holder._op;
+ }
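+ // The cached plan has scanned ten times more than when it was recorded,
+ // so add the remaining candidate plans and let them compete.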
+ if ( !_plans._bestGuessOnly && _plans._usingCachedPlan && op.nscanned() > _plans._oldNScanned * 10 && _plans._special.empty() ) {
+ holder._offset = -op.nscanned();
+ _plans.addOtherPlans( /* avoid duplicating the initial plan */ true );
+ PlanSet::iterator i = _plans._plans.begin();
+ ++i;
+ for( ; i != _plans._plans.end(); ++i ) {
+ shared_ptr<QueryOp> op( _op.createChild() );
+ op->setQueryPlan( i->get() );
+ _ops.push_back( op );
+ initOp( *op );
+ if ( op->complete() )
+ return op;
+ _queue.push( op );
+ }
+ _plans._usingCachedPlan = false;
+ }
+ _queue.push( holder );
+ return holder._op;
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::Runner::runUntilFirstCompletes() {
+ shared_ptr<QueryOp> potentialFinisher = init();
+ if ( potentialFinisher->complete() ) {
+ return potentialFinisher;
+ }
+
+ while( !_queue.empty() ) {
+ shared_ptr<QueryOp> potentialFinisher = next();
+ if ( potentialFinisher->complete() ) {
+ return potentialFinisher;
+ }
+ }
+ return _ops[ 0 ];
+ }
+
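+ // Evaluate 'expression', converting any exception thrown into an error state on
+ // 'op' via setException() rather than propagating it, so a failure in one query
+ // plan's op does not abort the others.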
+#define GUARD_OP_EXCEPTION( op, expression ) \
+ try { \
+ expression; \
+ } \
+ catch ( DBException& e ) { \
+ op.setException( e.getInfo() ); \
+ } \
+ catch ( const std::exception &e ) { \
+ op.setException( ExceptionInfo( e.what() , 0 ) ); \
+ } \
+ catch ( ... ) { \
+ op.setException( ExceptionInfo( "Caught unknown exception" , 0 ) ); \
+ }
+
+
+ void QueryPlanSet::Runner::initOp( QueryOp &op ) {
+ GUARD_OP_EXCEPTION( op, op.init() );
+ }
+
+ void QueryPlanSet::Runner::nextOp( QueryOp &op ) {
+ GUARD_OP_EXCEPTION( op, if ( !op.error() ) { op.next(); } );
+ }
+
+ bool QueryPlanSet::Runner::prepareToYieldOp( QueryOp &op ) {
+ GUARD_OP_EXCEPTION( op,
+ if ( op.error() ) {
+ return true;
+ }
+ else {
+ return op.prepareToYield();
+ } );
+ return true;
+ }
+
+ void QueryPlanSet::Runner::recoverFromYieldOp( QueryOp &op ) {
+ GUARD_OP_EXCEPTION( op, if ( !op.error() ) { op.recoverFromYield(); } );
+ }
+
+ /**
+ * NOTE on our $or implementation: In our current query optimizer implementation we don't
+ * keep statistics on our data, but we can conceptualize the problem of
+ * selecting an index when statistics exist for all index ranges. The
+ * d-hitting set problem on k sets and n elements can be reduced to the
+ * problem of index selection on k $or clauses and n index ranges (where
+ * d is the max number of indexes, and the number of ranges n is unbounded).
+ * In light of the fact that d-hitting set is NP-complete, and we don't even
+ * track statistics (so cost calculations are expensive) our first
+ * implementation uses the following greedy approach: We take one $or clause
+ * at a time and treat each as a separate query for index selection purposes.
+ * But if an index range is scanned for a particular $or clause, we eliminate
+ * that range from all subsequent clauses. One could imagine an opposite
+ * implementation where we select indexes based on the union of index ranges
+ * for all $or clauses, but this can have much poorer worst case behavior.
+ * (An index range that suits one $or clause may not suit another, and this
+ * is worse than the typical case of index range choice staleness because
+ * with $or the clauses may likely be logically distinct.) The greedy
+ * implementation won't do any worse than all the $or clauses individually,
+ * and it can often do better. In the first cut we are intentionally using
+ * QueryPattern tracking to record successful plans on $or clauses for use by
+ * subsequent $or clauses, even though there may be a significant aggregate
+ * $nor component that would not be represented in QueryPattern.
+ */
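+
+ /**
+ * A hypothetical example of the greedy approach above: with indexes on
+ * { a : 1 } and { b : 1 } and the query
+ * { $or : [ { a : { $gt : 0 }, b : 5 }, { a : { $gt : 0 } } ] },
+ * suppose the first clause is satisfied by scanning the index range a > 0.
+ * That range is then eliminated from the second clause, which has nothing
+ * left to scan, so documents in the range are not examined or returned twice.
+ */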
+
+ MultiPlanScanner::MultiPlanScanner( const char *ns,
+ const BSONObj &query,
+ const BSONObj &order,
+ const BSONElement *hint,
+ bool honorRecordedPlan,
+ const BSONObj &min,
+ const BSONObj &max,
+ bool bestGuessOnly,
+ bool mayYield ) :
+ _ns( ns ),
+ _or( !query.getField( "$or" ).eoo() ),
+ _query( query.getOwned() ),
+ _i(),
+ _honorRecordedPlan( honorRecordedPlan ),
+ _bestGuessOnly( bestGuessOnly ),
+ _hint( ( hint && !hint->eoo() ) ? hint->wrap() : BSONObj() ),
+ _mayYield( mayYield ),
+ _tableScanned() {
+ if ( !order.isEmpty() || !min.isEmpty() || !max.isEmpty() ) {
+ _or = false;
+ }
+ if ( _or ) {
+ // Only construct an OrRangeGenerator if we may handle $or clauses.
+ _org.reset( new OrRangeGenerator( ns, _query ) );
+ if ( !_org->getSpecial().empty() ) {
+ _or = false;
+ }
+ else if ( uselessOr( _hint.firstElement() ) ) {
+ _or = false;
+ }
+ }
+ // If _or == false, don't use $or clauses for index selection.
+ if ( !_or ) {
+ auto_ptr<FieldRangeSetPair> frsp( new FieldRangeSetPair( ns, _query, true ) );
+ _currentQps.reset( new QueryPlanSet( ns, frsp, auto_ptr<FieldRangeSetPair>(), _query, order, false, hint, honorRecordedPlan, min, max, _bestGuessOnly, _mayYield ) );
+ }
+ else {
+ BSONElement e = _query.getField( "$or" );
+ massert( 13268, "invalid $or spec", e.type() == Array && e.embeddedObject().nFields() > 0 );
+ }
+ }
+
+ shared_ptr<QueryOp> MultiPlanScanner::runOpOnce( QueryOp &op ) {
+ assertMayRunMore();
+ if ( !_or ) {
+ ++_i;
+ return _currentQps->runOp( op );
+ }
+ ++_i;
+ auto_ptr<FieldRangeSetPair> frsp( _org->topFrsp() );
+ auto_ptr<FieldRangeSetPair> originalFrsp( _org->topFrspOriginal() );
+ BSONElement hintElt = _hint.firstElement();
+ _currentQps.reset( new QueryPlanSet( _ns, frsp, originalFrsp, _query, BSONObj(), true, &hintElt, _honorRecordedPlan, BSONObj(), BSONObj(), _bestGuessOnly, _mayYield ) );
+ shared_ptr<QueryOp> ret( _currentQps->runOp( op ) );
+ if ( ! ret->complete() )
+ throw MsgAssertionException( ret->exception() );
+ if ( ret->qp().willScanTable() ) {
+ _tableScanned = true;
+ } else {
+ // The full table was not scanned, so pop the $or clause handled by this plan.
+ _org->popOrClause( ret->qp().nsd(), ret->qp().idxNo(), ret->qp().indexed() ? ret->qp().indexKey() : BSONObj() );
+ }
+ return ret;
+ }
+
+ shared_ptr<QueryOp> MultiPlanScanner::runOp( QueryOp &op ) {
+ shared_ptr<QueryOp> ret = runOpOnce( op );
+ while( !ret->stopRequested() && mayRunMore() ) {
+ ret = runOpOnce( *ret );
+ }
+ return ret;
+ }
+
+ shared_ptr<QueryOp> MultiPlanScanner::nextOpHandleEndOfClause() {
+ shared_ptr<QueryOp> op = _currentQps->nextOp( *_baseOp );
+ if ( !op->complete() ) {
+ return op;
+ }
+ if ( op->qp().willScanTable() ) {
+ _tableScanned = true;
+ } else {
+ _org->popOrClause( op->qp().nsd(), op->qp().idxNo(), op->qp().indexed() ? op->qp().indexKey() : BSONObj() );
+ }
+ return op;
+ }
+
+ shared_ptr<QueryOp> MultiPlanScanner::nextOpBeginningClause() {
+ assertMayRunMore();
+ shared_ptr<QueryOp> op;
+ while( mayRunMore() ) {
+ ++_i;
+ auto_ptr<FieldRangeSetPair> frsp( _org->topFrsp() );
+ auto_ptr<FieldRangeSetPair> originalFrsp( _org->topFrspOriginal() );
+ BSONElement hintElt = _hint.firstElement();
+ _currentQps.reset( new QueryPlanSet( _ns, frsp, originalFrsp, _query, BSONObj(), true, &hintElt, _honorRecordedPlan, BSONObj(), BSONObj(), _bestGuessOnly, _mayYield ) );
+ op = nextOpHandleEndOfClause();
+ if ( !op->complete() ) {
+ return op;
+ }
+ _baseOp = op;
+ }
+ return op;
+ }
+
+ shared_ptr<QueryOp> MultiPlanScanner::nextOp() {
+ if ( !_or ) {
+ if ( _i == 0 ) {
+ assertMayRunMore();
+ ++_i;
+ }
+ return _currentQps->nextOp( *_baseOp );
+ }
+ if ( _i == 0 ) {
+ return nextOpBeginningClause();
+ }
+ shared_ptr<QueryOp> op = nextOpHandleEndOfClause();
+ if ( !op->complete() ) {
+ return op;
+ }
+ if ( !op->stopRequested() && mayRunMore() ) {
+ // Finished scanning the clause, but stop hasn't been requested.
+ // Start scanning the next clause.
+ _baseOp = op;
+ return nextOpBeginningClause();
+ }
+ return op;
+ }
+
+ bool MultiPlanScanner::prepareToYield() {
+ return _currentQps.get() ? _currentQps->prepareToYield() : true;
+ }
+
+ void MultiPlanScanner::recoverFromYield() {
+ if ( _currentQps.get() ) {
+ _currentQps->recoverFromYield();
+ }
+ }
+
+ void MultiPlanScanner::clearRunner() {
+ if ( _currentQps.get() ) {
+ _currentQps->clearRunner();
+ }
+ }
+
+ int MultiPlanScanner::currentNPlans() const {
+ return _currentQps.get() ? _currentQps->nPlans() : 0;
+ }
+
+ shared_ptr<Cursor> MultiPlanScanner::singleCursor() const {
+ const QueryPlan *qp = singlePlan();
+ if ( !qp ) {
+ return shared_ptr<Cursor>();
+ }
+ // If there is only one plan and it does not require an in memory
+ // sort, we do not expect its cursor op to throw an exception and
+ // so do not need a QueryOptimizerCursor to handle this case.
+ return qp->newCursor();
+ }
+
+ const QueryPlan *MultiPlanScanner::singlePlan() const {
+ if ( _or || _currentQps->nPlans() != 1 || _currentQps->firstPlan()->scanAndOrderRequired() || _currentQps->usingCachedPlan() ) {
+ return 0;
+ }
+ return _currentQps->firstPlan().get();
+ }
+
+ bool MultiPlanScanner::uselessOr( const BSONElement &hint ) const {
+ NamespaceDetails *nsd = nsdetails( _ns );
+ if ( !nsd ) {
+ return true;
+ }
+ if ( !hint.eoo() ) {
+ IndexDetails *id = parseHint( hint, nsd );
+ if ( !id ) {
+ return true;
+ }
+ return QueryUtilIndexed::uselessOr( *_org, nsd, nsd->idxNo( *id ) );
+ }
+ return QueryUtilIndexed::uselessOr( *_org, nsd, -1 );
+ }
+
+ MultiCursor::MultiCursor( const char *ns, const BSONObj &pattern, const BSONObj &order, shared_ptr<CursorOp> op, bool mayYield )
+ : _mps( new MultiPlanScanner( ns, pattern, order, 0, true, BSONObj(), BSONObj(), !op.get(), mayYield ) ), _nscanned() {
+ if ( op.get() ) {
+ _op = op;
+ }
+ else {
+ _op.reset( new NoOp() );
+ }
+ if ( _mps->mayRunMore() ) {
+ nextClause();
+ if ( !ok() ) {
+ advance();
+ }
+ }
+ else {
+ _c.reset( new BasicCursor( DiskLoc() ) );
+ }
+ }
+
+ MultiCursor::MultiCursor( auto_ptr<MultiPlanScanner> mps, const shared_ptr<Cursor> &c, const shared_ptr<CoveredIndexMatcher> &matcher, const QueryOp &op, long long nscanned )
+ : _op( new NoOp( op ) ), _c( c ), _mps( mps ), _matcher( matcher ), _nscanned( nscanned ) {
+ _mps->setBestGuessOnly();
+ _mps->mayYield( false ); // with a NoOp, there's no need to yield in QueryPlanSet
+ if ( !ok() ) {
+ // would have been advanced by UserQueryOp if possible
+ advance();
+ }
+ }
+
+ void MultiCursor::nextClause() {
+ if ( _nscanned >= 0 && _c.get() ) {
+ _nscanned += _c->nscanned();
+ }
+ shared_ptr<CursorOp> best = _mps->runOpOnce( *_op );
+ if ( ! best->complete() )
+ throw MsgAssertionException( best->exception() );
+ _c = best->newCursor();
+ _matcher = best->matcher( _c );
+ _op = best;
+ }
+
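+ /**
+ * @return true iff 'idxPattern' has exactly the same fields, in the same order,
+ * as 'sampleKey', and the index direction at 'firstSignificantField' is
+ * compatible with the requested scan 'direction'.
+ */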
+ bool indexWorks( const BSONObj &idxPattern, const BSONObj &sampleKey, int direction, int firstSignificantField ) {
+ BSONObjIterator p( idxPattern );
+ BSONObjIterator k( sampleKey );
+ int i = 0;
+ while( 1 ) {
+ BSONElement pe = p.next();
+ BSONElement ke = k.next();
+ if ( pe.eoo() && ke.eoo() )
+ return true;
+ if ( pe.eoo() || ke.eoo() )
+ return false;
+ if ( strcmp( pe.fieldName(), ke.fieldName() ) != 0 )
+ return false;
+ if ( ( i == firstSignificantField ) && !( ( direction > 0 ) == ( pe.number() > 0 ) ) )
+ return false;
+ ++i;
+ }
+ return false;
+ }
+
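+ /**
+ * @return a key consisting of MaxKey/MinKey values representing the extreme
+ * of 'idxPattern' in 'baseDirection', flipping the extreme for descending
+ * index fields.
+ */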
+ BSONObj extremeKeyForIndex( const BSONObj &idxPattern, int baseDirection ) {
+ BSONObjIterator i( idxPattern );
+ BSONObjBuilder b;
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ int idxDirection = e.number() >= 0 ? 1 : -1;
+ int direction = idxDirection * baseDirection;
+ switch( direction ) {
+ case 1:
+ b.appendMaxKey( e.fieldName() );
+ break;
+ case -1:
+ b.appendMinKey( e.fieldName() );
+ break;
+ default:
+ assert( false );
+ }
+ }
+ return b.obj();
+ }
+
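+ /**
+ * Compare 'min' and 'max' field by field. @return the scan direction implied
+ * by the first differing field along with that field's position, or
+ * ( -1, -1 ) if the two keys do not share the same field pattern.
+ */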
+ pair<int,int> keyAudit( const BSONObj &min, const BSONObj &max ) {
+ int direction = 0;
+ int firstSignificantField = 0;
+ BSONObjIterator i( min );
+ BSONObjIterator a( max );
+ while( 1 ) {
+ BSONElement ie = i.next();
+ BSONElement ae = a.next();
+ if ( ie.eoo() && ae.eoo() )
+ break;
+ if ( ie.eoo() || ae.eoo() || strcmp( ie.fieldName(), ae.fieldName() ) != 0 ) {
+ return make_pair( -1, -1 );
+ }
+ int cmp = ie.woCompare( ae );
+ if ( cmp < 0 )
+ direction = 1;
+ if ( cmp > 0 )
+ direction = -1;
+ if ( direction != 0 )
+ break;
+ ++firstSignificantField;
+ }
+ return make_pair( direction, firstSignificantField );
+ }
+
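+ /** As keyAudit(), but an empty 'min' or 'max' implies a forward scan with no significant field. */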
+ pair<int,int> flexibleKeyAudit( const BSONObj &min, const BSONObj &max ) {
+ if ( min.isEmpty() || max.isEmpty() ) {
+ return make_pair( 1, -1 );
+ }
+ else {
+ return keyAudit( min, max );
+ }
+ }
+
+ // NOTE min, max, and keyPattern will be updated to be consistent with the selected index.
+ IndexDetails *indexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern ) {
+ if ( min.isEmpty() && max.isEmpty() ) {
+ errmsg = "one of min or max must be specified";
+ return 0;
+ }
+
+ Client::Context ctx( ns );
+ IndexDetails *id = 0;
+ NamespaceDetails *d = nsdetails( ns );
+ if ( !d ) {
+ errmsg = "ns not found";
+ return 0;
+ }
+
+ pair<int,int> ret = flexibleKeyAudit( min, max );
+ if ( ret == make_pair( -1, -1 ) ) {
+ errmsg = "min and max keys do not share pattern";
+ return 0;
+ }
+ if ( keyPattern.isEmpty() ) {
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& ii = i.next();
+ if ( indexWorks( ii.keyPattern(), min.isEmpty() ? max : min, ret.first, ret.second ) ) {
+ if ( ii.getSpec().getType() == 0 ) {
+ id = &ii;
+ keyPattern = ii.keyPattern();
+ break;
+ }
+ }
+ }
+
+ }
+ else {
+ if ( !indexWorks( keyPattern, min.isEmpty() ? max : min, ret.first, ret.second ) ) {
+ errmsg = "requested keyPattern does not match specified keys";
+ return 0;
+ }
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& ii = i.next();
+ if( ii.keyPattern().woCompare(keyPattern) == 0 ) {
+ id = &ii;
+ break;
+ }
+ if ( keyPattern.nFields() == 1 && ii.keyPattern().nFields() == 1 &&
+ IndexDetails::isIdIndexPattern( keyPattern ) &&
+ ii.isIdIndex() ) {
+ id = &ii;
+ break;
+ }
+
+ }
+ }
+
+ if ( min.isEmpty() ) {
+ min = extremeKeyForIndex( keyPattern, -1 );
+ }
+ else if ( max.isEmpty() ) {
+ max = extremeKeyForIndex( keyPattern, 1 );
+ }
+
+ if ( !id ) {
+ errmsg = str::stream() << "no index found for specified keyPattern: " << keyPattern.toString()
+ << " min: " << min << " max: " << max;
+ return 0;
+ }
+
+ min = min.extractFieldsUnDotted( keyPattern );
+ max = max.extractFieldsUnDotted( keyPattern );
+
+ return id;
+ }
+
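+ /**
+ * @return true iff 'query' constrains exactly one field, '_id', to a simple
+ * value - e.g. { _id : 5 } or { _id : { a : 1 } }, but not
+ * { _id : { $gt : 5 } } or { _id : 5, x : 1 }.
+ */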
+ bool isSimpleIdQuery( const BSONObj& query ) {
+ BSONObjIterator i(query);
+
+ if( !i.more() )
+ return false;
+
+ BSONElement e = i.next();
+
+ if( i.more() )
+ return false;
+
+ if( strcmp("_id", e.fieldName()) != 0 )
+ return false;
+
+ if ( e.isSimpleType() ) // e.g. not something like { _id : { $gt : ...
+ return true;
+
+ if ( e.type() == Object )
+ return e.Obj().firstElementFieldName()[0] != '$';
+
+ return false;
+ }
+
+ shared_ptr<Cursor> bestGuessCursor( const char *ns, const BSONObj &query, const BSONObj &sort ) {
+ if( !query.getField( "$or" ).eoo() ) {
+ return shared_ptr<Cursor>( new MultiCursor( ns, query, sort ) );
+ }
+ else {
+ auto_ptr<FieldRangeSetPair> frsp( new FieldRangeSetPair( ns, query, true ) );
+ auto_ptr<FieldRangeSetPair> origFrsp( new FieldRangeSetPair( *frsp ) );
+
+ QueryPlanSet qps( ns, frsp, origFrsp, query, sort, false );
+ QueryPlanSet::QueryPlanPtr qpp = qps.getBestGuess();
+ if( ! qpp.get() ) return shared_ptr<Cursor>();
+
+ shared_ptr<Cursor> ret = qpp->newCursor();
+
+ // If we don't already have a matcher, supply one.
+ if ( !query.isEmpty() && ! ret->matcher() ) {
+ shared_ptr<CoveredIndexMatcher> matcher( new CoveredIndexMatcher( query, ret->indexKeyPattern() ) );
+ ret->setMatcher( matcher );
+ }
+ return ret;
+ }
+ }
+
+ bool QueryUtilIndexed::indexUseful( const FieldRangeSetPair &frsp, NamespaceDetails *d, int idxNo, const BSONObj &order ) {
+ DEV frsp.assertValidIndex( d, idxNo );
+ BSONObj keyPattern = d->idx( idxNo ).keyPattern();
+ if ( !frsp.matchPossibleForIndex( d, idxNo, keyPattern ) ) {
+ // No matches are possible using this index, so the index is useful:
+ // a trivial plan on it can quickly prove an empty result set.
+ return true;
+ }
+ return d->idx( idxNo ).getSpec().suitability( frsp.simplifiedQueryForIndex( d, idxNo, keyPattern ), order ) != USELESS;
+ }
+
+ void QueryUtilIndexed::clearIndexesForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order ) {
+ SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
+ NamespaceDetailsTransient& nsd = NamespaceDetailsTransient::get_inlock( frsp.ns() );
+ nsd.registerIndexForPattern( frsp._singleKey.pattern( order ), BSONObj(), 0 );
+ nsd.registerIndexForPattern( frsp._multiKey.pattern( order ), BSONObj(), 0 );
+ }
+
+ pair< BSONObj, long long > QueryUtilIndexed::bestIndexForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order ) {
+ SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
+ NamespaceDetailsTransient& nsd = NamespaceDetailsTransient::get_inlock( frsp.ns() );
+ // TODO Maybe it would make sense to return the index with the lowest
+ // nscanned if there are two possibilities.
+ if ( frsp._singleKey.matchPossible() ) {
+ QueryPattern pattern = frsp._singleKey.pattern( order );
+ BSONObj oldIdx = nsd.indexForPattern( pattern );
+ if ( !oldIdx.isEmpty() ) {
+ long long oldNScanned = nsd.nScannedForPattern( pattern );
+ return make_pair( oldIdx, oldNScanned );
+ }
+ }
+ if ( frsp._multiKey.matchPossible() ) {
+ QueryPattern pattern = frsp._multiKey.pattern( order );
+ BSONObj oldIdx = nsd.indexForPattern( pattern );
+ if ( !oldIdx.isEmpty() ) {
+ long long oldNScanned = nsd.nScannedForPattern( pattern );
+ return make_pair( oldIdx, oldNScanned );
+ }
+ }
+ return make_pair( BSONObj(), 0 );
+ }
+
+ bool QueryUtilIndexed::uselessOr( const OrRangeGenerator &org, NamespaceDetails *d, int hintIdx ) {
+ for( list<FieldRangeSetPair>::const_iterator i = org._originalOrSets.begin(); i != org._originalOrSets.end(); ++i ) {
+ if ( hintIdx != -1 ) {
+ if ( !indexUseful( *i, d, hintIdx, BSONObj() ) ) {
+ return true;
+ }
+ }
+ else {
+ bool useful = false;
+ for( int j = 0; j < d->nIndexes; ++j ) {
+ if ( indexUseful( *i, d, j, BSONObj() ) ) {
+ useful = true;
+ break;
+ }
+ }
+ if ( !useful ) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/queryoptimizer.h b/src/mongo/db/queryoptimizer.h
new file mode 100644
index 00000000000..297c6fe9505
--- /dev/null
+++ b/src/mongo/db/queryoptimizer.h
@@ -0,0 +1,599 @@
+// @file queryoptimizer.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "cursor.h"
+#include "jsobj.h"
+#include "queryutil.h"
+#include "matcher.h"
+#include "../util/net/listen.h"
+#include <queue>
+
+namespace mongo {
+
+ class IndexDetails;
+ class IndexType;
+ class ElapsedTracker;
+
+ /** A plan for executing a query using the given index spec and FieldRangeSet. */
+ class QueryPlan : boost::noncopyable {
+ public:
+
+ /**
+ * @param originalFrsp - original constraints for this query clause. If null, frsp will be used instead.
+ */
+ QueryPlan(NamespaceDetails *d,
+ int idxNo, // -1 = no index
+ const FieldRangeSetPair &frsp,
+ const FieldRangeSetPair *originalFrsp,
+ const BSONObj &originalQuery,
+ const BSONObj &order,
+ bool mustAssertOnYieldFailure = true,
+ const BSONObj &startKey = BSONObj(),
+ const BSONObj &endKey = BSONObj(),
+ string special="" );
+
+ /** @return true iff no other plans should be considered. */
+ bool optimal() const { return _optimal; }
+ /** @return true iff this plan should not be considered at all. */
+ bool unhelpful() const { return _unhelpful; }
+ /** @return true iff ScanAndOrder processing will be required for result set. */
+ bool scanAndOrderRequired() const { return _scanAndOrderRequired; }
+ /**
+ * @return true iff the index we are using has keys such that it can completely resolve the
+ * query expression to match by itself without ever checking the main object.
+ */
+ bool exactKeyMatch() const { return _exactKeyMatch; }
+ /** @return true iff this QueryPlan would perform an unindexed scan. */
+ bool willScanTable() const { return _idxNo < 0 && !_impossible; }
+ /** @return 'special' attribute of the plan, which was either set explicitly or generated from the index. */
+ const string &special() const { return _special; }
+
+ /** @return a new cursor based on this QueryPlan's index and FieldRangeSet. */
+ shared_ptr<Cursor> newCursor( const DiskLoc &startLoc = DiskLoc() , int numWanted=0 ) const;
+ /** @return a new reverse cursor if this is an unindexed plan. */
+ shared_ptr<Cursor> newReverseCursor() const;
+ /** Register this plan as a winner for its QueryPattern, with specified 'nscanned'. */
+ void registerSelf( long long nScanned ) const;
+
+ int direction() const { return _direction; }
+ BSONObj indexKey() const;
+ bool indexed() const { return _index; }
+ int idxNo() const { return _idxNo; }
+ const char *ns() const { return _frs.ns(); }
+ NamespaceDetails *nsd() const { return _d; }
+ BSONObj originalQuery() const { return _originalQuery; }
+ BSONObj simplifiedQuery( const BSONObj& fields = BSONObj() ) const { return _frs.simplifiedQuery( fields ); }
+ const FieldRange &range( const char *fieldName ) const { return _frs.range( fieldName ); }
+ shared_ptr<FieldRangeVector> originalFrv() const { return _originalFrv; }
+
+ const FieldRangeSet &multikeyFrs() const { return _frsMulti; }
+
+ bool mustAssertOnYieldFailure() const { return _mustAssertOnYieldFailure; }
+
+ /** The following member functions are just for testing. */
+
+ shared_ptr<FieldRangeVector> frv() const { return _frv; }
+ bool isMultiKey() const;
+
+ private:
+ NamespaceDetails * _d;
+ int _idxNo;
+ const FieldRangeSet &_frs;
+ const FieldRangeSet &_frsMulti;
+ const BSONObj &_originalQuery;
+ const BSONObj &_order;
+ const IndexDetails * _index;
+ bool _optimal;
+ bool _scanAndOrderRequired;
+ bool _exactKeyMatch;
+ int _direction;
+ shared_ptr<FieldRangeVector> _frv;
+ shared_ptr<FieldRangeVector> _originalFrv;
+ BSONObj _startKey;
+ BSONObj _endKey;
+ bool _endKeyInclusive;
+ bool _unhelpful;
+ bool _impossible;
+ string _special;
+ IndexType * _type;
+ bool _startOrEndSpec;
+ bool _mustAssertOnYieldFailure;
+ };
+
+ /**
+ * Inherit from this interface to implement a new query operation.
+ * The query optimizer will clone the QueryOp that is provided, giving
+ * each clone its own query plan.
+ *
+ * Normal sequence of events:
+ * 1) A new QueryOp is generated using createChild().
+ * 2) A QueryPlan is assigned to this QueryOp with setQueryPlan().
+ * 3) _init() is called on the QueryPlan.
+ * 4) next() is called repeatedly, with nscanned() checked after each call.
+ * 5) In one of these calls to next(), setComplete() is called.
+ * 6) The QueryPattern for the QueryPlan may be recorded as a winner.
+ */
+ class QueryOp {
+ public:
+ QueryOp() : _complete(), _stopRequested(), _qp(), _error() {}
+
+ /** Used when handing off from one QueryOp to another. */
+ QueryOp( const QueryOp &other ) :
+ _complete(), _stopRequested(), _qp(), _error(), _matcher( other._matcher ),
+ _orConstraint( other._orConstraint ) {}
+
+ virtual ~QueryOp() {}
+
+ /** @return QueryPlan assigned to this QueryOp by the query optimizer. */
+ const QueryPlan &qp() const { return *_qp; }
+
+ /** Advance to next potential matching document (eg using a cursor). */
+ virtual void next() = 0;
+ /**
+ * @return current 'nscanned' metric for this QueryOp. Used to compare
+ * cost to other QueryOps.
+ */
+ virtual long long nscanned() = 0;
+ /** Take any steps necessary before the db mutex is yielded. */
+ virtual bool prepareToYield() { massert( 13335, "yield not supported", false ); return false; }
+ /** Recover once the db mutex is regained. */
+ virtual void recoverFromYield() { massert( 13336, "yield not supported", false ); }
+
+ /**
+ * @return true iff the QueryPlan for this QueryOp may be registered
+ * as a winning plan.
+ */
+ virtual bool mayRecordPlan() const = 0;
+
+ /** @return true iff the implementation called setComplete() or setStop(). */
+ bool complete() const { return _complete; }
+ /** @return true iff the implementation called setStop(). */
+ bool stopRequested() const { return _stopRequested; }
+ /** @return true iff the implementation threw an exception. */
+ bool error() const { return _error; }
+ /** @return the exception thrown by implementation if one was thrown. */
+ ExceptionInfo exception() const { return _exception; }
+
+ /** To be called by QueryPlanSet::Runner only. */
+
+ QueryOp *createChild();
+ void setQueryPlan( const QueryPlan *qp ) { _qp = qp; assert( _qp != NULL ); }
+ void init();
+ void setException( const DBException &e ) {
+ _error = true;
+ _exception = e.getInfo();
+ }
+
+ shared_ptr<CoveredIndexMatcher> matcher( const shared_ptr<Cursor>& c ) const {
+ return matcher( c.get() );
+ }
+ shared_ptr<CoveredIndexMatcher> matcher( Cursor* c ) const {
+ if( ! c ) return _matcher;
+ return c->matcher() ? c->matcherPtr() : _matcher;
+ }
+
+ protected:
+ /** Call if all results have been found. */
+ void setComplete() {
+ _orConstraint = qp().originalFrv();
+ _complete = true;
+ }
+ /** Call if the scan is complete even if not all results have been found. */
+ void setStop() { setComplete(); _stopRequested = true; }
+
+ /** Handle initialization after a QueryPlan has been set. */
+ virtual void _init() = 0;
+
+ /** @return a copy of the inheriting class, which will be run with its own query plan. */
+ virtual QueryOp *_createChild() const = 0;
+
+ virtual bool alwaysUseRecord() const { return false; }
+
+ private:
+ bool _complete;
+ bool _stopRequested;
+ ExceptionInfo _exception;
+ const QueryPlan *_qp;
+ bool _error;
+ shared_ptr<CoveredIndexMatcher> _matcher;
+ shared_ptr<CoveredIndexMatcher> _oldMatcher;
+ shared_ptr<FieldRangeVector> _orConstraint;
+ };
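+
+ /**
+ * A minimal sketch of a QueryOp subclass following the sequence of events
+ * documented above. 'CountingOp' and its members are hypothetical names for
+ * illustration only; error handling is omitted and a matcher is assumed to
+ * be available:
+ *
+ * class CountingOp : public QueryOp {
+ * public:
+ * CountingOp() : _count() {}
+ * virtual void next() {
+ * if ( !_c->ok() ) { setComplete(); return; }
+ * if ( matcher( _c )->matchesCurrent( _c.get(), 0 ) ) ++_count;
+ * _c->advance();
+ * }
+ * virtual long long nscanned() { return _c ? _c->nscanned() : 0; }
+ * virtual bool mayRecordPlan() const { return complete(); }
+ * long long count() const { return _count; }
+ * protected:
+ * virtual void _init() { _c = qp().newCursor(); }
+ * virtual QueryOp *_createChild() const { return new CountingOp(); }
+ * private:
+ * shared_ptr<Cursor> _c;
+ * long long _count;
+ * };
+ */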
+
+ // Temporary: this class works even if T::operator< is variant (its ordering may
+ // change over time), unlike a regular STL priority queue. It is very slow; however,
+ // if v.size() is always very small it is fine, and maybe even faster than a smarter
+ // implementation that does more memory allocations.
+ template<class T>
+ class our_priority_queue : boost::noncopyable {
+ vector<T> v;
+ public:
+ our_priority_queue() {
+ v.reserve(4);
+ }
+ int size() const { return v.size(); }
+ bool empty() const { return v.empty(); }
+ void push(const T & x) {
+ v.push_back(x);
+ }
+ T pop() {
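+ // Linear scan for the greatest element according to operator<; OpHolder
+ // inverts its comparison, so for the Runner this yields the op with the
+ // fewest adjusted nscanned.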
+ size_t t = 0;
+ for( size_t i = 1; i < v.size(); i++ ) {
+ if( v[t] < v[i] )
+ t = i;
+ }
+ T ret = v[t];
+ v.erase(v.begin()+t);
+ return ret;
+ }
+ };
+
+ /**
+ * A set of candidate query plans for a query. This class can return a best guess plan or run a
+ * QueryOp on all the plans.
+ */
+ class QueryPlanSet {
+ public:
+
+ typedef boost::shared_ptr<QueryPlan> QueryPlanPtr;
+ typedef vector<QueryPlanPtr> PlanSet;
+
+ /**
+ * @param originalFrsp - original constraints for this query clause; if null, frsp will be used.
+ */
+ QueryPlanSet( const char *ns,
+ auto_ptr<FieldRangeSetPair> frsp,
+ auto_ptr<FieldRangeSetPair> originalFrsp,
+ const BSONObj &originalQuery,
+ const BSONObj &order,
+ bool mustAssertOnYieldFailure = true,
+ const BSONElement *hint = 0,
+ bool honorRecordedPlan = true,
+ const BSONObj &min = BSONObj(),
+ const BSONObj &max = BSONObj(),
+ bool bestGuessOnly = false,
+ bool mayYield = false);
+
+ /** @return number of candidate plans. */
+ int nPlans() const { return _plans.size(); }
+
+ /**
+ * Clone op for each query plan, and @return the first cloned op to call
+ * setComplete() or setStop().
+ */
+
+ shared_ptr<QueryOp> runOp( QueryOp &op );
+ template<class T>
+ shared_ptr<T> runOp( T &op ) {
+ return dynamic_pointer_cast<T>( runOp( static_cast<QueryOp&>( op ) ) );
+ }
+
+ /** Initialize or iterate a runner generated from @param originalOp. */
+ shared_ptr<QueryOp> nextOp( QueryOp &originalOp, bool retried = false );
+
+ /** Yield the runner member. */
+
+ bool prepareToYield();
+ void recoverFromYield();
+
+ /** Clear the runner member. */
+ void clearRunner();
+
+ QueryPlanPtr firstPlan() const { return _plans[ 0 ]; }
+
+ /** @return metadata about cursors and index bounds for all plans, suitable for explain output. */
+ BSONObj explain() const;
+ /** @return true iff a plan is selected based on previous success of this plan. */
+ bool usingCachedPlan() const { return _usingCachedPlan; }
+ /** @return a single plan that may work well for the specified query. */
+ QueryPlanPtr getBestGuess() const;
+
+ //for testing
+ const FieldRangeSetPair &frsp() const { return *_frsp; }
+ const FieldRangeSetPair *originalFrsp() const { return _originalFrsp.get(); }
+ bool modifiedKeys() const;
+ bool hasMultiKey() const;
+
+ private:
+ void addOtherPlans( bool checkFirst );
+ void addPlan( QueryPlanPtr plan, bool checkFirst ) {
+ if ( checkFirst && plan->indexKey().woCompare( _plans[ 0 ]->indexKey() ) == 0 )
+ return;
+ _plans.push_back( plan );
+ }
+ void init();
+ void addHint( IndexDetails &id );
+ class Runner {
+ public:
+ Runner( QueryPlanSet &plans, QueryOp &op );
+
+ /**
+ * Iterate interactively through candidate documents on all plans.
+ * QueryOp objects are returned at each interleaved step.
+ */
+
+ /** @return an op that has completed, otherwise an arbitrary op. */
+ shared_ptr<QueryOp> init();
+ /**
+ * Move the Runner forward one iteration, and @return the op for
+ * this iteration.
+ */
+ shared_ptr<QueryOp> next();
+ /** @return next non error op if there is one, otherwise an error op. */
+ shared_ptr<QueryOp> nextNonError();
+
+ bool prepareToYield();
+ void recoverFromYield();
+
+ /** Run until first op completes. */
+ shared_ptr<QueryOp> runUntilFirstCompletes();
+
+ void mayYield();
+ QueryOp &_op;
+ QueryPlanSet &_plans;
+ static void initOp( QueryOp &op );
+ static void nextOp( QueryOp &op );
+ static bool prepareToYieldOp( QueryOp &op );
+ static void recoverFromYieldOp( QueryOp &op );
+ private:
+ vector<shared_ptr<QueryOp> > _ops;
+ struct OpHolder {
+ OpHolder( const shared_ptr<QueryOp> &op ) : _op( op ), _offset() {}
+ shared_ptr<QueryOp> _op;
+ long long _offset;
+ bool operator<( const OpHolder &other ) const {
+ return _op->nscanned() + _offset > other._op->nscanned() + other._offset;
+ }
+ };
+ our_priority_queue<OpHolder> _queue;
+ };
+
+ const char *_ns;
+ BSONObj _originalQuery;
+ auto_ptr<FieldRangeSetPair> _frsp;
+ auto_ptr<FieldRangeSetPair> _originalFrsp;
+ PlanSet _plans;
+ bool _mayRecordPlan;
+ bool _usingCachedPlan;
+ BSONObj _hint;
+ BSONObj _order;
+ long long _oldNScanned;
+ bool _honorRecordedPlan;
+ BSONObj _min;
+ BSONObj _max;
+ string _special;
+ bool _bestGuessOnly;
+ bool _mayYield;
+ ElapsedTracker _yieldSometimesTracker;
+ shared_ptr<Runner> _runner;
+ bool _mustAssertOnYieldFailure;
+ };
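+
+ /**
+ * A minimal usage sketch ('MyOp' stands for any concrete QueryOp subclass;
+ * the setup mirrors the call sites in queryoptimizer.cpp):
+ *
+ * auto_ptr<FieldRangeSetPair> frsp( new FieldRangeSetPair( ns, query, true ) );
+ * QueryPlanSet qps( ns, frsp, auto_ptr<FieldRangeSetPair>(), query, order );
+ * MyOp original;
+ * shared_ptr<MyOp> winner = qps.runOp( original );
+ */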
+
+ /** Handles $or type queries by generating a QueryPlanSet for each $or clause. */
+ class MultiPlanScanner {
+ public:
+ MultiPlanScanner( const char *ns,
+ const BSONObj &query,
+ const BSONObj &order,
+ const BSONElement *hint = 0,
+ bool honorRecordedPlan = true,
+ const BSONObj &min = BSONObj(),
+ const BSONObj &max = BSONObj(),
+ bool bestGuessOnly = false,
+ bool mayYield = false);
+
+ /**
+ * Clone op for each query plan of a single $or clause, and @return the first cloned op
+ * to call setComplete() or setStop().
+ */
+
+ shared_ptr<QueryOp> runOpOnce( QueryOp &op );
+ template<class T>
+ shared_ptr<T> runOpOnce( T &op ) {
+ return dynamic_pointer_cast<T>( runOpOnce( static_cast<QueryOp&>( op ) ) );
+ }
+
+ /**
+ * For each $or clause, calls runOpOnce on the child QueryOp cloned from the winning QueryOp
+ * of the previous $or clause (or from the supplied 'op' for the first $or clause).
+ */
+
+ shared_ptr<QueryOp> runOp( QueryOp &op );
+ template<class T>
+ shared_ptr<T> runOp( T &op ) {
+ return dynamic_pointer_cast<T>( runOp( static_cast<QueryOp&>( op ) ) );
+ }
+
+ /** Initialize or iterate a runner generated from @param originalOp. */
+
+ void initialOp( const shared_ptr<QueryOp> &originalOp ) { _baseOp = originalOp; }
+ shared_ptr<QueryOp> nextOp();
+
+ /** Yield the runner member. */
+
+ bool prepareToYield();
+ void recoverFromYield();
+
+ /** Clear the runner member. */
+ void clearRunner();
+
+ int currentNPlans() const;
+
+ /**
+ * @return a single simple cursor if the scanner would run a single cursor
+ * for this query, otherwise return an empty shared_ptr.
+ */
+ shared_ptr<Cursor> singleCursor() const;
+
+ /**
+ * @return the query plan that would be used if the scanner were to run a single
+ * cursor for this query, otherwise 0. The returned plan is invalid if this
+ * MultiPlanScanner is destroyed, hence we return a raw pointer.
+ */
+ const QueryPlan *singlePlan() const;
+
+ /** @return true iff more $or clauses need to be scanned. */
+ bool mayRunMore() const { return _or ? ( !_tableScanned && !_org->orFinished() ) : _i == 0; }
+ /** @return non-$or version of explain output. */
+ BSONObj oldExplain() const { assertNotOr(); return _currentQps->explain(); }
+ /** @return true iff this is not a $or query and a plan is selected based on previous success of this plan. */
+ bool usingCachedPlan() const { return !_or && _currentQps->usingCachedPlan(); }
+ /** Don't attempt to scan multiple plans, just use the best guess. */
+ void setBestGuessOnly() { _bestGuessOnly = true; }
+ /** Yielding is allowed while running each QueryPlan. */
+ void mayYield( bool val ) { _mayYield = val; }
+ bool modifiedKeys() const { return _currentQps->modifiedKeys(); }
+ bool hasMultiKey() const { return _currentQps->hasMultiKey(); }
+
+ private:
+ void assertNotOr() const {
+ massert( 13266, "not implemented for $or query", !_or );
+ }
+ void assertMayRunMore() const {
+ massert( 13271, "can't run more ops", mayRunMore() );
+ }
+ shared_ptr<QueryOp> nextOpBeginningClause();
+ shared_ptr<QueryOp> nextOpHandleEndOfClause();
+ bool uselessOr( const BSONElement &hint ) const;
+ const char * _ns;
+ bool _or;
+ BSONObj _query;
+ shared_ptr<OrRangeGenerator> _org; // May be null in certain non $or query cases.
+ auto_ptr<QueryPlanSet> _currentQps;
+ int _i;
+ bool _honorRecordedPlan;
+ bool _bestGuessOnly;
+ BSONObj _hint;
+ bool _mayYield;
+ bool _tableScanned;
+ shared_ptr<QueryOp> _baseOp;
+ };
+
+ /** Provides a cursor interface for certain limited uses of a MultiPlanScanner. */
+ class MultiCursor : public Cursor {
+ public:
+ class CursorOp : public QueryOp {
+ public:
+ CursorOp() {}
+ CursorOp( const QueryOp &other ) : QueryOp( other ) {}
+ virtual shared_ptr<Cursor> newCursor() const = 0;
+ };
+ /** takes ownership of 'op' */
+ MultiCursor( const char *ns, const BSONObj &pattern, const BSONObj &order, shared_ptr<CursorOp> op = shared_ptr<CursorOp>(), bool mayYield = false );
+ /**
+ * Used:
+ * 1. To hand off a query to a getMore()
+ * 2. To hand off a QueryOptimizerCursor
+ * @param nscanned is an optional initial value, if not supplied nscanned()
+ * will always return -1
+ */
+ MultiCursor( auto_ptr<MultiPlanScanner> mps, const shared_ptr<Cursor> &c, const shared_ptr<CoveredIndexMatcher> &matcher, const QueryOp &op, long long nscanned = -1 );
+
+ virtual bool ok() { return _c->ok(); }
+ virtual Record* _current() { return _c->_current(); }
+ virtual BSONObj current() { return _c->current(); }
+ virtual DiskLoc currLoc() { return _c->currLoc(); }
+ virtual bool advance() {
+ _c->advance();
+ while( !ok() && _mps->mayRunMore() ) {
+ nextClause();
+ }
+ return ok();
+ }
+ virtual BSONObj currKey() const { return _c->currKey(); }
+ virtual DiskLoc refLoc() { return _c->refLoc(); }
+ virtual void noteLocation() { _c->noteLocation(); }
+ virtual void checkLocation() { _c->checkLocation(); }
+ virtual bool supportGetMore() { return true; }
+ virtual bool supportYields() { return _c->supportYields(); }
+ virtual BSONObj indexKeyPattern() { return _c->indexKeyPattern(); }
+
+ /**
+ * with update we could potentially get the same document on multiple
+ * indexes, but update appears to already handle this with seenObjects
+ * so we don't have to do anything special here.
+ */
+ virtual bool getsetdup(DiskLoc loc) { return _c->getsetdup( loc ); }
+
+ virtual bool autoDedup() const { return _c->autoDedup(); }
+
+ virtual bool modifiedKeys() const { return _mps->modifiedKeys(); }
+
+ virtual bool isMultiKey() const { return _mps->hasMultiKey(); }
+
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return _matcher; }
+ virtual CoveredIndexMatcher* matcher() const { return _matcher.get(); }
+
+ virtual bool capped() const { return _c->capped(); }
+
+ /** return -1 if we're a getmore handoff */
+ virtual long long nscanned() { return _nscanned >= 0 ? _nscanned + _c->nscanned() : _nscanned; }
+ /** just for testing */
+ shared_ptr<Cursor> sub_c() const { return _c; }
+ private:
+ class NoOp : public CursorOp {
+ public:
+ NoOp() {}
+ NoOp( const QueryOp &other ) : CursorOp( other ) {}
+ virtual void _init() { setComplete(); }
+ virtual void next() {}
+ virtual bool mayRecordPlan() const { return false; }
+ virtual QueryOp *_createChild() const { return new NoOp(); }
+ virtual shared_ptr<Cursor> newCursor() const { return qp().newCursor(); }
+ virtual long long nscanned() { assert( false ); return 0; }
+ };
+ void nextClause();
+ shared_ptr<CursorOp> _op;
+ shared_ptr<Cursor> _c;
+ auto_ptr<MultiPlanScanner> _mps;
+ shared_ptr<CoveredIndexMatcher> _matcher;
+ long long _nscanned;
+ };
+
+ /** NOTE min, max, and keyPattern will be updated to be consistent with the selected index. */
+ IndexDetails *indexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern );
+
+ bool isSimpleIdQuery( const BSONObj& query );
+
+ /**
+ * @return a single cursor that may work well for the given query.
+ * It is possible no cursor is returned if the sort is not supported by an index. Clients are responsible
+ * for checking this if they are not sure an index for a sort exists, and defaulting to a non-sort if
+ * no suitable indices exist.
+ */
+ shared_ptr<Cursor> bestGuessCursor( const char *ns, const BSONObj &query, const BSONObj &sort );
+
+ /**
+ * Add-on functionality for queryutil classes requiring access to indexing
+ * functionality not currently linked to mongos.
+ * TODO Clean this up a bit, possibly with separate sharded and non sharded
+ * implementations for the appropriate queryutil classes or by pulling index
+ * related functionality into separate wrapper classes.
+ */
+ struct QueryUtilIndexed {
+ /** @return true if the index may be useful according to its KeySpec. */
+ static bool indexUseful( const FieldRangeSetPair &frsp, NamespaceDetails *d, int idxNo, const BSONObj &order );
+ /** Clear any indexes recorded as the best for either the single or multi key pattern. */
+ static void clearIndexesForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order );
+ /** Return a recorded best index for the single or multi key pattern. */
+ static pair< BSONObj, long long > bestIndexForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order );
+ static bool uselessOr( const OrRangeGenerator& org, NamespaceDetails *d, int hintIdx );
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/queryoptimizercursor.cpp b/src/mongo/db/queryoptimizercursor.cpp
new file mode 100644
index 00000000000..07f8df12815
--- /dev/null
+++ b/src/mongo/db/queryoptimizercursor.cpp
@@ -0,0 +1,530 @@
+// @file queryoptimizercursor.cpp
+
+/**
+ * Copyright (C) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "queryoptimizer.h"
+#include "pdfile.h"
+#include "clientcursor.h"
+#include "btree.h"
+#include "queryoptimizercursor.h"
+
+namespace mongo {
+
+ static const int OutOfOrderDocumentsAssertionCode = 14810;
+
+ /**
+ * A QueryOp implementation utilized by the QueryOptimizerCursor
+ */
+ class QueryOptimizerCursorOp : public QueryOp {
+ public:
+ /**
+ * @param aggregateNscanned - shared long long counting total nscanned for
+ * query ops for all cursors.
+ * @param requireIndex - whether unindexed scans should be prohibited.
+ */
+ QueryOptimizerCursorOp( long long &aggregateNscanned, bool requireIndex, int cumulativeCount = 0 ) : _matchCounter( aggregateNscanned, cumulativeCount ), _countingMatches(), _mustAdvance(), _capped(), _yieldRecoveryFailed(), _requireIndex( requireIndex ) {}
+
+ virtual void _init() {
+ if ( qp().scanAndOrderRequired() ) {
+ throw MsgAssertionException( OutOfOrderDocumentsAssertionCode, "order spec cannot be satisfied with index" );
+ }
+ if ( _requireIndex && strcmp( qp().indexKey().firstElementFieldName(), "$natural" ) == 0 ) {
+ throw MsgAssertionException( 9011, "Not an index cursor" );
+ }
+ _c = qp().newCursor();
+
+ // The QueryOptimizerCursor::prepareToTouchEarlierIterate() implementation requires _c->prepareToYield() to work.
+ verify( 15940, _c->supportYields() );
+ _capped = _c->capped();
+
+ // TODO This violates the current Cursor interface abstraction, but for now it's simpler to keep our own set of
+ // dups rather than avoid poisoning the cursor's dup set with unreturned documents. Deduping documents
+ // matched in this QueryOptimizerCursorOp will run against the takeover cursor.
+ _matchCounter.setCheckDups( _c->isMultiKey() );
+
+ _matchCounter.updateNscanned( _c->nscanned() );
+ }
+
+ virtual long long nscanned() {
+ return _c ? _c->nscanned() : _matchCounter.nscanned();
+ }
+
+ virtual bool prepareToYield() {
+ if ( _c && !_cc ) {
+ _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , qp().ns() ) );
+ }
+ if ( _cc ) {
+ recordCursorLocation();
+ return _cc->prepareToYield( _yieldData );
+ }
+ // no active cursor - ok to yield
+ return true;
+ }
+
+ virtual void recoverFromYield() {
+ if ( _cc && !ClientCursor::recoverFromYield( _yieldData ) ) {
+ _yieldRecoveryFailed = true;
+ _c.reset();
+ _cc.reset();
+
+ if ( _capped ) {
+ msgassertedNoTrace( 13338, str::stream() << "capped cursor overrun: " << qp().ns() );
+ }
+ else if ( qp().mustAssertOnYieldFailure() ) {
+ msgassertedNoTrace( 15892, str::stream() << "QueryOptimizerCursorOp::recoverFromYield() failed to recover" );
+ }
+ else {
+ // we don't fail the query, since we're fine with returning partial data if the collection was dropped
+ // also, see SERVER-2454
+ }
+ }
+ else {
+ checkCursorAdvanced();
+ }
+ }
+
+ void prepareToTouchEarlierIterate() {
+ recordCursorLocation();
+ if ( _c ) {
+ _c->prepareToTouchEarlierIterate();
+ }
+ }
+
+ void recoverFromTouchingEarlierIterate() {
+ if ( _c ) {
+ _c->recoverFromTouchingEarlierIterate();
+ }
+ checkCursorAdvanced();
+ }
+
+ virtual void next() {
+ mayAdvance();
+
+ if ( _matchCounter.enoughCumulativeMatchesToChooseAPlan() ) {
+ setStop();
+ return;
+ }
+ if ( !_c || !_c->ok() ) {
+ setComplete();
+ return;
+ }
+
+ _mustAdvance = true;
+ }
+ virtual QueryOp *_createChild() const {
+ return new QueryOptimizerCursorOp( _matchCounter.aggregateNscanned(), _requireIndex, _matchCounter.cumulativeCount() );
+ }
+ DiskLoc currLoc() const { return _c ? _c->currLoc() : DiskLoc(); }
+ BSONObj currKey() const { return _c ? _c->currKey() : BSONObj(); }
+ bool currentMatches( MatchDetails *details ) {
+ bool ret = ( _c && _c->ok() ) ? matcher( _c.get() )->matchesCurrent( _c.get(), details ) : false;
+ // Cache the match, so we can count it in mayAdvance().
+ _matchCounter.setMatch( ret );
+ return ret;
+ }
+ virtual bool mayRecordPlan() const {
+ return !_yieldRecoveryFailed && complete() && ( !stopRequested() || _matchCounter.enoughMatchesToRecordPlan() );
+ }
+ shared_ptr<Cursor> cursor() const { return _c; }
+ private:
+ void mayAdvance() {
+ if ( !_c ) {
+ return;
+ }
+ if ( countingMatches() ) {
+ // Check match if not yet known.
+ if ( !_matchCounter.knowMatch() ) {
+ currentMatches( 0 );
+ }
+ _matchCounter.countMatch( currLoc() );
+ }
+ if ( _mustAdvance ) {
+ _c->advance();
+ handleCursorAdvanced();
+ }
+ _matchCounter.updateNscanned( _c->nscanned() );
+ }
+ // Don't count matches on the first call to next(), which occurs before the first result is returned.
+ bool countingMatches() {
+ if ( _countingMatches ) {
+ return true;
+ }
+ _countingMatches = true;
+ return false;
+ }
+
+ void recordCursorLocation() {
+ _posBeforeYield = currLoc();
+ }
+ void checkCursorAdvanced() {
+ // This check will not correctly determine if we are looking at a different document in
+ // all cases, but it is adequate for updating the query plan's match count (just used to pick
+ // plans, not returned to the client) and adjust iteration via _mustAdvance.
+ if ( _posBeforeYield != currLoc() ) {
+ // If the yield advanced our position, the next next() will be a no op.
+ handleCursorAdvanced();
+ }
+ }
+ void handleCursorAdvanced() {
+ _mustAdvance = false;
+ _matchCounter.resetMatch();
+ }
+
+ CachedMatchCounter _matchCounter;
+ bool _countingMatches;
+ bool _mustAdvance;
+ bool _capped;
+ shared_ptr<Cursor> _c;
+ ClientCursor::CleanupPointer _cc;
+ DiskLoc _posBeforeYield;
+ ClientCursor::YieldData _yieldData;
+ bool _yieldRecoveryFailed;
+ bool _requireIndex;
+ };
+
+ /**
+ * This cursor runs a MultiPlanScanner iteratively and returns results from
+ * the scanner's cursors as they become available. Once the scanner chooses
+ * a single plan, this cursor becomes a simple wrapper around that single
+ * plan's cursor (called the 'takeover' cursor).
+ */
+ class QueryOptimizerCursor : public Cursor {
+ public:
+ QueryOptimizerCursor( auto_ptr<MultiPlanScanner> &mps, bool requireIndex ) :
+ _mps( mps ),
+ _originalOp( new QueryOptimizerCursorOp( _nscanned, requireIndex ) ),
+ _currOp(),
+ _nscanned() {
+ _mps->initialOp( _originalOp );
+ shared_ptr<QueryOp> op = _mps->nextOp();
+ rethrowOnError( op );
+ if ( !op->complete() ) {
+ _currOp = dynamic_cast<QueryOptimizerCursorOp*>( op.get() );
+ }
+ }
+
+ virtual bool ok() { return _takeover ? _takeover->ok() : !currLoc().isNull(); }
+
+ virtual Record* _current() {
+ if ( _takeover ) {
+ return _takeover->_current();
+ }
+ assertOk();
+ return currLoc().rec();
+ }
+
+ virtual BSONObj current() {
+ if ( _takeover ) {
+ return _takeover->current();
+ }
+ assertOk();
+ return currLoc().obj();
+ }
+
+ virtual DiskLoc currLoc() { return _takeover ? _takeover->currLoc() : _currLoc(); }
+
+ DiskLoc _currLoc() const {
+ dassert( !_takeover );
+ return _currOp ? _currOp->currLoc() : DiskLoc();
+ }
+
+ virtual bool advance() {
+ return _advance( false );
+ }
+
+ virtual BSONObj currKey() const {
+ if ( _takeover ) {
+ return _takeover->currKey();
+ }
+ assertOk();
+ return _currOp->currKey();
+ }
+
+ /**
+ * When the return value isNull(), our cursor will be ignored for yielding by the client cursor implementation.
+ * In such cases, an internal ClientCursor will update the position of component cursors when necessary.
+ */
+ virtual DiskLoc refLoc() { return _takeover ? _takeover->refLoc() : DiskLoc(); }
+
+ virtual BSONObj indexKeyPattern() {
+ if ( _takeover ) {
+ return _takeover->indexKeyPattern();
+ }
+ assertOk();
+ return _currOp->cursor()->indexKeyPattern();
+ }
+
+ virtual bool supportGetMore() { return false; }
+
+ virtual bool supportYields() { return _takeover ? _takeover->supportYields() : true; }
+
+ virtual void prepareToTouchEarlierIterate() {
+ if ( _takeover ) {
+ _takeover->prepareToTouchEarlierIterate();
+ }
+ else if ( _currOp ) {
+ if ( _mps->currentNPlans() == 1 ) {
+ // This single plan version is a bit more performant, so we use it when possible.
+ _currOp->prepareToTouchEarlierIterate();
+ }
+ else {
+ // With multiple plans, the 'earlier iterate' could be the current iterate of one of
+ // the component plans. We do a full yield of all plans, using ClientCursors.
+ verify( 15941, _mps->prepareToYield() );
+ }
+ }
+ }
+
+ virtual void recoverFromTouchingEarlierIterate() {
+ if ( _takeover ) {
+ _takeover->recoverFromTouchingEarlierIterate();
+ }
+ else if ( _currOp ) {
+ if ( _mps->currentNPlans() == 1 ) {
+ _currOp->recoverFromTouchingEarlierIterate();
+ }
+ else {
+ recoverFromYield();
+ }
+ }
+ }
+
+ virtual bool prepareToYield() {
+ if ( _takeover ) {
+ return _takeover->prepareToYield();
+ }
+ else if ( _currOp ) {
+ return _mps->prepareToYield();
+ }
+ else {
+ // No state needs to be protected, so yielding is fine.
+ return true;
+ }
+ }
+
+ virtual void recoverFromYield() {
+ if ( _takeover ) {
+ _takeover->recoverFromYield();
+ return;
+ }
+ if ( _currOp ) {
+ _mps->recoverFromYield();
+ if ( _currOp->error() || !ok() ) {
+ // Advance to a non-error op if one of the ops errored out.
+ // Advance to a following $or clause if the $or clause returned all results.
+ _advance( true );
+ }
+ }
+ }
+
+ virtual string toString() { return "QueryOptimizerCursor"; }
+
+ virtual bool getsetdup(DiskLoc loc) {
+ if ( _takeover ) {
+ if ( getdupInternal( loc ) ) {
+ return true;
+ }
+ return _takeover->getsetdup( loc );
+ }
+ assertOk();
+ return getsetdupInternal( loc );
+ }
+
+ /** Matcher needs to know if the cursor being forwarded to is multikey. */
+ virtual bool isMultiKey() const {
+ if ( _takeover ) {
+ return _takeover->isMultiKey();
+ }
+ assertOk();
+ return _currOp->cursor()->isMultiKey();
+ }
+
+ virtual bool modifiedKeys() const { return true; }
+
+ /** Initial capped wrapping cases (before takeover) are handled internally by a component ClientCursor. */
+ virtual bool capped() const { return _takeover ? _takeover->capped() : false; }
+
+ virtual long long nscanned() { return _takeover ? _takeover->nscanned() : _nscanned; }
+
+ virtual shared_ptr<CoveredIndexMatcher> matcherPtr() const {
+ if ( _takeover ) {
+ return _takeover->matcherPtr();
+ }
+ assertOk();
+ return _currOp->matcher( _currOp->cursor() );
+ }
+
+ virtual CoveredIndexMatcher* matcher() const {
+ if ( _takeover ) {
+ return _takeover->matcher();
+ }
+ assertOk();
+ return _currOp->matcher( _currOp->cursor() ).get();
+ }
+
+ virtual bool currentMatches( MatchDetails *details = 0 ) {
+ if ( _takeover ) {
+ return _takeover->currentMatches( details );
+ }
+ assertOk();
+ return _currOp->currentMatches( details );
+ }
+
+ private:
+ /**
+ * Advances the QueryPlanSet::Runner.
+ * @param force - advance even if the current query op is not valid. The 'force' param should only be specified
+ * when there are plans left in the runner.
+ */
+ bool _advance( bool force ) {
+ if ( _takeover ) {
+ return _takeover->advance();
+ }
+
+ if ( !force && !ok() ) {
+ return false;
+ }
+
+ DiskLoc prevLoc = _currLoc();
+
+ _currOp = 0;
+ shared_ptr<QueryOp> op = _mps->nextOp();
+ rethrowOnError( op );
+
+ // Avoiding dynamic_cast here for performance. Soon we won't need to
+ // do a cast at all.
+ QueryOptimizerCursorOp *qocop = (QueryOptimizerCursorOp*)( op.get() );
+
+ if ( !op->complete() ) {
+ // The 'qocop' will be valid until we call _mps->nextOp() again. We return 'current' values from this op.
+ _currOp = qocop;
+ }
+ else if ( op->stopRequested() ) {
+ if ( qocop->cursor() ) {
+ // Ensure that prepareToTouchEarlierIterate() may be called safely when a BasicCursor takes over.
+ if ( !prevLoc.isNull() && prevLoc == qocop->currLoc() ) {
+ qocop->cursor()->advance();
+ }
+ // Clear the Runner and any unnecessary QueryOps and their ClientCursors.
+ _mps->clearRunner();
+ _takeover.reset( new MultiCursor( _mps,
+ qocop->cursor(),
+ op->matcher( qocop->cursor() ),
+ *op,
+ _nscanned - qocop->cursor()->nscanned() ) );
+ }
+ }
+
+ return ok();
+ }
+ /** Forward an exception when the runner errs out. */
+ void rethrowOnError( const shared_ptr< QueryOp > &op ) {
+ if ( op->error() ) {
+ throw MsgAssertionException( op->exception() );
+ }
+ }
+
+ void assertOk() const {
+ massert( 14809, "Invalid access for cursor that is not ok()", !_currLoc().isNull() );
+ }
+
+ /** Insert and check for dups before takeover occurs */
+ bool getsetdupInternal(const DiskLoc &loc) {
+ return _dups.getsetdup( loc );
+ }
+
+ /** Just check for dups - after takeover occurs */
+ bool getdupInternal(const DiskLoc &loc) {
+ dassert( _takeover );
+ return _dups.getdup( loc );
+ }
+
+ auto_ptr<MultiPlanScanner> _mps;
+ shared_ptr<QueryOptimizerCursorOp> _originalOp;
+ QueryOptimizerCursorOp *_currOp;
+ shared_ptr<Cursor> _takeover;
+ long long _nscanned;
+ // Using a SmallDupSet seems a bit hokey, but I've measured a 5% performance improvement with ~100 document non multi key scans.
+ SmallDupSet _dups;
+ };
+
+ shared_ptr<Cursor> newQueryOptimizerCursor( auto_ptr<MultiPlanScanner> mps, bool requireIndex ) {
+ try {
+ return shared_ptr<Cursor>( new QueryOptimizerCursor( mps, requireIndex ) );
+ } catch( const AssertionException &e ) {
+ if ( e.getCode() == OutOfOrderDocumentsAssertionCode ) {
+ // If no indexes follow the requested sort order, return an
+ // empty pointer. This is legacy behavior based on bestGuessCursor().
+ return shared_ptr<Cursor>();
+ }
+ throw;
+ }
+ return shared_ptr<Cursor>();
+ }
+
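+ /**
+ * Cursor selection policy, summarizing the implementation below: a full
+ * table scan for an empty query and order, a btree cursor on the _id index
+ * for a simple _id query, the single viable plan's cursor when the
+ * MultiPlanScanner yields exactly one candidate, and otherwise a
+ * QueryOptimizerCursor that interleaves the candidate plans.
+ */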
+ shared_ptr<Cursor> NamespaceDetailsTransient::getCursor( const char *ns, const BSONObj &query,
+ const BSONObj &order, bool requireIndex,
+ bool *simpleEqualityMatch ) {
+ if ( simpleEqualityMatch ) {
+ *simpleEqualityMatch = false;
+ }
+ if ( query.isEmpty() && order.isEmpty() && !requireIndex ) {
+ // TODO This will not use a covered index currently.
+ return theDataFileMgr.findAll( ns );
+ }
+ if ( isSimpleIdQuery( query ) ) {
+ Database *database = cc().database();
+ verify( 15985, database );
+ NamespaceDetails *d = database->namespaceIndex.details(ns);
+ if ( d ) {
+ int idxNo = d->findIdIndex();
+ if ( idxNo >= 0 ) {
+ IndexDetails& i = d->idx( idxNo );
+ BSONObj key = i.getKeyFromQuery( query );
+ return shared_ptr<Cursor>( BtreeCursor::make( d, idxNo, i, key, key, true, 1 ) );
+ }
+ }
+ }
+ auto_ptr<MultiPlanScanner> mps( new MultiPlanScanner( ns, query, order ) ); // mayYield == false
+ shared_ptr<Cursor> single = mps->singleCursor();
+ if ( single ) {
+ if ( !( requireIndex &&
+ dynamic_cast<BasicCursor*>( single.get() ) /* May not use an unindexed cursor */ ) ) {
+ if ( !query.isEmpty() && !single->matcher() ) {
+ shared_ptr<CoveredIndexMatcher> matcher( new CoveredIndexMatcher( query, single->indexKeyPattern() ) );
+ single->setMatcher( matcher );
+ }
+ if ( simpleEqualityMatch ) {
+ const QueryPlan *qp = mps->singlePlan();
+ if ( qp->exactKeyMatch() && !single->matcher()->needRecord() ) {
+ *simpleEqualityMatch = true;
+ }
+ }
+ return single;
+ }
+ }
+ return newQueryOptimizerCursor( mps, requireIndex );
+ }
+
+ /** This interface is provided for testing only. */
+ shared_ptr<Cursor> newQueryOptimizerCursor( const char *ns, const BSONObj &query, const BSONObj &order, bool requireIndex ) {
+ auto_ptr<MultiPlanScanner> mps( new MultiPlanScanner( ns, query, order ) ); // mayYield == false
+ return newQueryOptimizerCursor( mps, requireIndex );
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/queryoptimizercursor.h b/src/mongo/db/queryoptimizercursor.h
new file mode 100644
index 00000000000..ee5a1663370
--- /dev/null
+++ b/src/mongo/db/queryoptimizercursor.h
@@ -0,0 +1,150 @@
+// @file queryoptimizercursor.h
+
+/**
+ * Copyright (C) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+namespace mongo {
+
+ /** Helper class for caching and counting matches during execution of a QueryPlan. */
+ class CachedMatchCounter {
+ public:
+ /**
+ * @param aggregateNscanned - shared count of nscanned for this and other plans.
+ * @param cumulativeCount - starting point for accumulated count over a series of plans.
+ */
+ CachedMatchCounter( long long &aggregateNscanned, int cumulativeCount ) : _aggregateNscanned( aggregateNscanned ), _nscanned(), _cumulativeCount( cumulativeCount ), _count(), _checkDups(), _match( Unknown ), _counted() {}
+
+ /** Set whether dup checking is enabled when counting. */
+ void setCheckDups( bool checkDups ) { _checkDups = checkDups; }
+
+ /**
+ * Usual sequence of events:
+ * 1) resetMatch() - reset stored match value to Unknown.
+ * 2) setMatch() - set match value to a definite true/false value.
+ * 3) knowMatch() - check if setMatch() has been called.
+ * 4) countMatch() - increment count if match is true.
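+ *
+ * A minimal caller sketch (illustrative; 'documentMatches' and 'loc' are
+ * assumed to be computed by the caller):
+ * counter.resetMatch(); // new document encountered
+ * counter.setMatch( documentMatches );
+ * if ( counter.knowMatch() )
+ * counter.countMatch( loc );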
+ */
+
+ void resetMatch() {
+ _match = Unknown;
+ _counted = false;
+ }
+ void setMatch( bool match ) { _match = match ? True : False; }
+ bool knowMatch() const { return _match != Unknown; }
+ void countMatch( const DiskLoc &loc ) {
+ if ( !_counted && _match == True && !getsetdup( loc ) ) {
+ ++_cumulativeCount;
+ ++_count;
+ _counted = true;
+ }
+ }
+
+ bool enoughCumulativeMatchesToChooseAPlan() const {
+ // 101 is the default number of documents an initial query returns
+ // before switching to a getMore, and has historically served as the
+ // match count threshold for choosing a plan.
+ return _cumulativeCount >= 101;
+ }
+ bool enoughMatchesToRecordPlan() const {
+ // Recording a plan after 50 matches is a historical default (half of the 101 match limit above).
+ return _count > 50;
+ }
+
+ int cumulativeCount() const { return _cumulativeCount; }
+ int count() const { return _count; }
+
+ /** Update local and aggregate nscanned counts. */
+ void updateNscanned( long long nscanned ) {
+ _aggregateNscanned += ( nscanned - _nscanned );
+ _nscanned = nscanned;
+ }
+ long long nscanned() const { return _nscanned; }
+ long long &aggregateNscanned() const { return _aggregateNscanned; }
+ private:
+ bool getsetdup( const DiskLoc &loc ) {
+ if ( !_checkDups ) {
+ return false;
+ }
+ pair<set<DiskLoc>::iterator, bool> p = _dups.insert( loc );
+ return !p.second;
+ }
+ long long &_aggregateNscanned;
+ long long _nscanned;
+ int _cumulativeCount;
+ int _count;
+ bool _checkDups;
+ enum MatchState { Unknown, False, True };
+ MatchState _match;
+ bool _counted;
+ set<DiskLoc> _dups;
+ };
+
+ /** Duplicate tracking class, optimized for the common case of a small set with few initial reads: DiskLocs are kept in a linearly scanned vector until the set has been accessed more than 500 times, then migrated to a std::set. */
+ class SmallDupSet {
+ public:
+ SmallDupSet() : _accesses() {
+ _vec.reserve( 250 );
+ }
+ /** @return true if 'loc' was already present in the set; otherwise add it and return false. */
+ bool getsetdup( const DiskLoc &loc ) {
+ access();
+ return vec() ? getsetdupVec( loc ) : getsetdupSet( loc );
+ }
+ /** @return true if 'loc' is in the set. */
+ bool getdup( const DiskLoc &loc ) {
+ access();
+ return vec() ? getdupVec( loc ) : getdupSet( loc );
+ }
+ private:
+ void access() {
+ ++_accesses;
+ mayUpgrade();
+ }
+ void mayUpgrade() {
+ if ( vec() && _accesses > 500 ) {
+ _set.insert( _vec.begin(), _vec.end() );
+ }
+ }
+ bool vec() const {
+ return _set.size() == 0;
+ }
+ bool getsetdupVec( const DiskLoc &loc ) {
+ if ( getdupVec( loc ) ) {
+ return true;
+ }
+ _vec.push_back( loc );
+ return false;
+ }
+ bool getdupVec( const DiskLoc &loc ) const {
+ for( vector<DiskLoc>::const_iterator i = _vec.begin(); i != _vec.end(); ++i ) {
+ if ( *i == loc ) {
+ return true;
+ }
+ }
+ return false;
+ }
+ bool getsetdupSet( const DiskLoc &loc ) {
+ pair<set<DiskLoc>::iterator, bool> p = _set.insert(loc);
+ return !p.second;
+ }
+ bool getdupSet( const DiskLoc &loc ) {
+ return _set.count( loc ) > 0;
+ }
+ vector<DiskLoc> _vec;
+ set<DiskLoc> _set;
+ long long _accesses;
+ };
+} // namespace mongo
diff --git a/src/mongo/db/querypattern.cpp b/src/mongo/db/querypattern.cpp
new file mode 100644
index 00000000000..e20e2b6a6ae
--- /dev/null
+++ b/src/mongo/db/querypattern.cpp
@@ -0,0 +1,99 @@
+// @file querypattern.cpp - Query pattern matching for selecting similar plans given similar queries.
+
+/* Copyright 2011 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "querypattern.h"
+
+namespace mongo {
+
+ QueryPattern::QueryPattern( const FieldRangeSet &frs, const BSONObj &sort ) {
+ for( map<string,FieldRange>::const_iterator i = frs.ranges().begin(); i != frs.ranges().end(); ++i ) {
+ if ( i->second.equality() ) {
+ _fieldTypes[ i->first ] = QueryPattern::Equality;
+ }
+ else if ( i->second.empty() ) {
+ // This case generally results from an upper and lower bound that are inconsistent for a single key index.
+ _fieldTypes[ i->first ] = QueryPattern::UpperAndLowerBound;
+ }
+ else if ( i->second.nontrivial() ) {
+ bool upper = i->second.max().type() != MaxKey;
+ bool lower = i->second.min().type() != MinKey;
+ if ( upper && lower )
+ _fieldTypes[ i->first ] = QueryPattern::UpperAndLowerBound;
+ else if ( upper )
+ _fieldTypes[ i->first ] = QueryPattern::UpperBound;
+ else if ( lower )
+ _fieldTypes[ i->first ] = QueryPattern::LowerBound;
+ }
+ }
+ setSort( sort );
+ }
+
+ /** for testing only - speed unimportant */
+ bool QueryPattern::operator==( const QueryPattern &other ) const {
+ bool less = operator<( other );
+ bool more = other.operator<( *this );
+ assert( !( less && more ) );
+ return !( less || more );
+ }
+
+ /** for testing only - speed unimportant */
+ bool QueryPattern::operator!=( const QueryPattern &other ) const {
+ return !operator==( other );
+ }
+
+ string typeToString( enum QueryPattern::Type t ) {
+ switch (t) {
+ case QueryPattern::Equality:
+ return "Equality";
+ case QueryPattern::LowerBound:
+ return "LowerBound";
+ case QueryPattern::UpperBound:
+ return "UpperBound";
+ case QueryPattern::UpperAndLowerBound:
+ return "UpperAndLowerBound";
+ }
+ return "";
+ }
+
+ string QueryPattern::toString() const {
+ BSONObjBuilder b;
+ for( map<string,Type>::const_iterator i = _fieldTypes.begin(); i != _fieldTypes.end(); ++i ) {
+ b << i->first << typeToString( i->second );
+ }
+ return BSON( "query" << b.done() << "sort" << _sort ).toString();
+ }
+
+ void QueryPattern::setSort( const BSONObj sort ) {
+ _sort = normalizeSort( sort );
+ }
+
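+ /**
+ * Map a sort spec and its exact reverse to the same object, since a btree
+ * may be traversed in either direction. Illustrative example: {a:1, b:-1}
+ * and {a:-1, b:1} both normalize to {a:1, b:-1}.
+ */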
+ BSONObj QueryPattern::normalizeSort( const BSONObj &spec ) {
+ if ( spec.isEmpty() )
+ return spec;
+ int direction = ( spec.firstElement().number() >= 0 ) ? 1 : -1;
+ BSONObjIterator i( spec );
+ BSONObjBuilder b;
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ b.append( e.fieldName(), direction * ( ( e.number() >= 0 ) ? 1 : -1 ) );
+ }
+ return b.obj();
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/querypattern.h b/src/mongo/db/querypattern.h
new file mode 100644
index 00000000000..000c301a0de
--- /dev/null
+++ b/src/mongo/db/querypattern.h
@@ -0,0 +1,78 @@
+// @file querypattern.h - Query pattern matching for selecting similar plans given similar queries.
+
+/* Copyright 2011 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+#include "queryutil.h"
+
+namespace mongo {
+
+ /**
+ * Implements query pattern matching, used to determine if a query is
+ * similar to an earlier query and should use the same plan.
+ *
+ * Two queries will generate the same QueryPattern, and therefore match each
+ * other, if their fields have the same Types and they have the same sort
+ * spec.
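+ *
+ * Illustrative example: the queries {a:5, b:"x"} and {a:1, b:"y"}, each
+ * with sort {c:1}, generate the same pattern (a: Equality, b: Equality,
+ * sort {c:1}) and so may share a cached query plan.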
+ */
+ class QueryPattern {
+ public:
+ QueryPattern( const FieldRangeSet &frs, const BSONObj &sort );
+ enum Type {
+ Equality,
+ LowerBound,
+ UpperBound,
+ UpperAndLowerBound
+ };
+ bool operator<( const QueryPattern &other ) const;
+ /** for testing only */
+ bool operator==( const QueryPattern &other ) const;
+ /** for testing only */
+ bool operator!=( const QueryPattern &other ) const;
+ /** for development / debugging */
+ string toString() const;
+ private:
+ void setSort( const BSONObj sort );
+ static BSONObj normalizeSort( const BSONObj &spec );
+ map<string,Type> _fieldTypes;
+ BSONObj _sort;
+ };
+
+ inline bool QueryPattern::operator<( const QueryPattern &other ) const {
+ map<string,Type>::const_iterator i = _fieldTypes.begin();
+ map<string,Type>::const_iterator j = other._fieldTypes.begin();
+ while( i != _fieldTypes.end() ) {
+ if ( j == other._fieldTypes.end() )
+ return false;
+ if ( i->first < j->first )
+ return true;
+ else if ( i->first > j->first )
+ return false;
+ if ( i->second < j->second )
+ return true;
+ else if ( i->second > j->second )
+ return false;
+ ++i;
+ ++j;
+ }
+ if ( j != other._fieldTypes.end() )
+ return true;
+ return _sort.woCompare( other._sort ) < 0;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/queryutil-inl.h b/src/mongo/db/queryutil-inl.h
new file mode 100644
index 00000000000..08d3b1fac52
--- /dev/null
+++ b/src/mongo/db/queryutil-inl.h
@@ -0,0 +1,153 @@
+// @file queryutil-inl.h - Inline definitions for frequently called queryutil.h functions
+
+/* Copyright 2011 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace mongo {
+
+ inline bool FieldInterval::equality() const {
+ if ( _cachedEquality == -1 ) {
+ _cachedEquality = ( _lower._inclusive && _upper._inclusive && _lower._bound.woCompare( _upper._bound, false ) == 0 );
+ }
+ return _cachedEquality != 0;
+ }
+
+ inline bool FieldRange::equality() const {
+ return
+ !empty() &&
+ min().woCompare( max(), false ) == 0 &&
+ maxInclusive() &&
+ minInclusive();
+ }
+
+ inline bool FieldRange::inQuery() const {
+ if ( equality() ) {
+ return true;
+ }
+ for( vector<FieldInterval>::const_iterator i = _intervals.begin(); i != _intervals.end(); ++i ) {
+ if ( !i->equality() ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * TODO Assumes intervals are contiguous and minKey/maxKey will not be
+ * matched against.
+ */
+ inline bool FieldRange::nontrivial() const {
+ return
+ ! empty() &&
+ ( _intervals.size() != 1 ||
+ minKey.firstElement().woCompare( min(), false ) != 0 ||
+ maxKey.firstElement().woCompare( max(), false ) != 0 );
+ }
+
+ inline const FieldRange &FieldRangeSet::range( const char *fieldName ) const {
+ map<string,FieldRange>::const_iterator f = _ranges.find( fieldName );
+ if ( f == _ranges.end() )
+ return trivialRange();
+ return f->second;
+ }
+
+ inline FieldRange &FieldRangeSet::range( const char *fieldName ) {
+ map<string,FieldRange>::iterator f = _ranges.find( fieldName );
+ if ( f == _ranges.end() ) {
+ _ranges.insert( make_pair( string( fieldName ), trivialRange() ) );
+ return _ranges.find( fieldName )->second;
+ }
+ return f->second;
+ }
+
+ inline int FieldRangeSet::nNontrivialRanges() const {
+ int count = 0;
+ for( map<string,FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ if ( i->second.nontrivial() )
+ ++count;
+ }
+ return count;
+ }
+
+ inline bool FieldRangeSet::matchPossible() const {
+ for( map<string,FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ if ( i->second.empty() ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ inline bool FieldRangeSet::matchPossibleForIndex( const BSONObj &keyPattern ) const {
+ if ( !_singleKey ) {
+ return matchPossible();
+ }
+ BSONObjIterator i( keyPattern );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.fieldName() == string( "$natural" ) ) {
+ return true;
+ }
+ if ( range( e.fieldName() ).empty() ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ inline long long FieldRangeVector::size() {
+ long long ret = 1;
+ for( vector<FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ ret *= i->intervals().size();
+ }
+ return ret;
+ }
+
+ inline FieldRangeSetPair *OrRangeGenerator::topFrsp() const {
+ FieldRangeSetPair *ret = new FieldRangeSetPair( _baseSet );
+ if (_orSets.size()) {
+ *ret &= _orSets.front();
+ }
+ return ret;
+ }
+
+ inline FieldRangeSetPair *OrRangeGenerator::topFrspOriginal() const {
+ FieldRangeSetPair *ret = new FieldRangeSetPair( _baseSet );
+ if (_originalOrSets.size()) {
+ *ret &= _originalOrSets.front();
+ }
+ return ret;
+ }
+
+ inline bool FieldRangeSetPair::matchPossibleForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const {
+ assertValidIndexOrNoIndex( d, idxNo );
+ if ( !matchPossible() ) {
+ return false;
+ }
+ if ( idxNo < 0 ) {
+ // No index specified: the matchPossible() check above passed, so a match is possible.
+ return true;
+ }
+ return frsForIndex( d, idxNo ).matchPossibleForIndex( keyPattern );
+ }
+
+ inline void FieldRangeSetPair::assertValidIndexOrNoIndex( const NamespaceDetails *d, int idxNo ) const {
+ massert( 14049, "FieldRangeSetPair invalid index specified", idxNo >= -1 );
+ if ( idxNo >= 0 ) {
+ assertValidIndex( d, idxNo );
+ }
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/queryutil.cpp b/src/mongo/db/queryutil.cpp
new file mode 100644
index 00000000000..e6748c4bc2e
--- /dev/null
+++ b/src/mongo/db/queryutil.cpp
@@ -0,0 +1,1551 @@
+// @file queryutil.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+
+#include "btree.h"
+#include "matcher.h"
+#include "pdfile.h"
+#include "queryoptimizer.h"
+#include "../util/unittest.h"
+#include "dbmessage.h"
+#include "indexkey.h"
+#include "../util/mongoutils/str.h"
+
+namespace mongo {
+ extern BSONObj staticNull;
+ extern BSONObj staticUndefined;
+
+ /** returns a string that when used as a matcher, would match a super set of regex()
+ returns "" for complex regular expressions
+ used to optimize queries in some simple regex cases that start with '^'
+
+ if purePrefix != NULL, sets it to whether the regex can be converted to a range query
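+
+ Illustrative examples (derived from the rules below):
+ simpleRegex( "^foo", "", NULL ) -> "foo" (pure prefix)
+ simpleRegex( "^f.o", "", NULL ) -> "f" ('.' is a metacharacter)
+ simpleRegex( "foo", "", NULL ) -> "" (not anchored at the start)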
+ */
+ string simpleRegex(const char* regex, const char* flags, bool* purePrefix) {
+ string r = "";
+
+ if (purePrefix) *purePrefix = false;
+
+ bool multilineOK;
+ if ( regex[0] == '\\' && regex[1] == 'A') {
+ multilineOK = true;
+ regex += 2;
+ }
+ else if (regex[0] == '^') {
+ multilineOK = false;
+ regex += 1;
+ }
+ else {
+ return r;
+ }
+
+ bool extended = false;
+ while (*flags) {
+ switch (*(flags++)) {
+ case 'm': // multiline
+ if (multilineOK)
+ continue;
+ else
+ return r;
+ case 'x': // extended
+ extended = true;
+ break;
+ default:
+ return r; // cant use index
+ }
+ }
+
+ stringstream ss;
+
+ while(*regex) {
+ char c = *(regex++);
+ if ( c == '*' || c == '?' ) {
+ // These are the only two symbols that make the last char optional
+ r = ss.str();
+ r = r.substr( 0 , r.size() - 1 );
+ return r; //breaking here fails with /^a?/
+ }
+ else if (c == '|') {
+ // whole match so far is optional. Nothing we can do here.
+ return string();
+ }
+ else if (c == '\\') {
+ c = *(regex++);
+ if (c == 'Q'){
+ // \Q...\E quotes everything inside
+ while (*regex) {
+ c = (*regex++);
+ if (c == '\\' && (*regex == 'E')){
+ regex++; //skip the 'E'
+ break; // go back to start of outer loop
+ }
+ else {
+ ss << c; // character should match itself
+ }
+ }
+ }
+ else if ((c >= 'A' && c <= 'Z') ||
+ (c >= 'a' && c <= 'z') ||
+ (c >= '0' && c <= '9') ||
+ (c == '\0')) {
+ // don't know what to do with these
+ r = ss.str();
+ break;
+ }
+ else {
+ // slash followed by non-alphanumeric represents the following char
+ ss << c;
+ }
+ }
+ else if (strchr("^$.[()+{", c)) {
+ // list of "metacharacters" from man pcrepattern
+ r = ss.str();
+ break;
+ }
+ else if (extended && c == '#') {
+ // comment
+ r = ss.str();
+ break;
+ }
+ else if (extended && isspace(c)) {
+ continue;
+ }
+ else {
+ // self-matching char
+ ss << c;
+ }
+ }
+
+ if ( r.empty() && *regex == 0 ) {
+ r = ss.str();
+ if (purePrefix) *purePrefix = !r.empty();
+ }
+
+ return r;
+ }
+ inline string simpleRegex(const BSONElement& e) {
+ switch(e.type()) {
+ case RegEx:
+ return simpleRegex(e.regex(), e.regexFlags());
+ case Object: {
+ BSONObj o = e.embeddedObject();
+ return simpleRegex(o["$regex"].valuestrsafe(), o["$options"].valuestrsafe());
+ }
+ default: assert(false); return ""; //return squashes compiler warning
+ }
+ }
+
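+ /** Form an exclusive upper bound for a prefix range by incrementing the
+ last character, e.g. "abc" -> "abd", so "abc" <= x < "abd" covers every
+ string prefixed with "abc". */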
+ string simpleRegexEnd( string regex ) {
+ ++regex[ regex.length() - 1 ];
+ return regex;
+ }
+
+
+ FieldRange::FieldRange( const BSONElement &e, bool singleKey, bool isNot, bool optimize )
+ : _singleKey( singleKey ) {
+ int op = e.getGtLtOp();
+
+ // NOTE with $not, we could potentially form a complementary set of intervals.
+ if ( !isNot && !e.eoo() && e.type() != RegEx && op == BSONObj::opIN ) {
+ set<BSONElement,element_lt> vals;
+ vector<FieldRange> regexes;
+ uassert( 12580 , "invalid query" , e.isABSONObj() );
+ BSONObjIterator i( e.embeddedObject() );
+ while( i.more() ) {
+ BSONElement ie = i.next();
+ uassert( 15881, "$elemMatch not allowed within $in",
+ ie.type() != Object ||
+ ie.embeddedObject().firstElement().getGtLtOp() != BSONObj::opELEM_MATCH );
+ if ( ie.type() == RegEx ) {
+ regexes.push_back( FieldRange( ie, singleKey, false, optimize ) );
+ }
+ else {
+ // A document array may be indexed by its first element, by undefined
+ // if it is empty, or as a full array if it is embedded within another
+ // array.
+ vals.insert( ie );
+ if ( ie.type() == Array ) {
+ BSONElement temp = ie.embeddedObject().firstElement();
+ if ( temp.eoo() ) {
+ temp = staticUndefined.firstElement();
+ }
+ vals.insert( temp );
+ }
+ }
+ }
+
+ for( set<BSONElement,element_lt>::const_iterator i = vals.begin(); i != vals.end(); ++i )
+ _intervals.push_back( FieldInterval(*i) );
+
+ for( vector<FieldRange>::const_iterator i = regexes.begin(); i != regexes.end(); ++i )
+ *this |= *i;
+
+ return;
+ }
+
+ // A document array may be indexed by its first element, by undefined
+ // if it is empty, or as a full array if it is embedded within another
+ // array.
+ if ( e.type() == Array && op == BSONObj::Equality ) {
+
+ _intervals.push_back( FieldInterval(e) );
+ BSONElement temp = e.embeddedObject().firstElement();
+ if ( temp.eoo() ) {
+ temp = staticUndefined.firstElement();
+ }
+ if ( temp < e ) {
+ _intervals.insert( _intervals.begin() , temp );
+ }
+ else {
+ _intervals.push_back( FieldInterval(temp) );
+ }
+
+ return;
+ }
+
+ _intervals.push_back( FieldInterval() );
+ FieldInterval &initial = _intervals[ 0 ];
+ BSONElement &lower = initial._lower._bound;
+ bool &lowerInclusive = initial._lower._inclusive;
+ BSONElement &upper = initial._upper._bound;
+ bool &upperInclusive = initial._upper._inclusive;
+ lower = minKey.firstElement();
+ lowerInclusive = true;
+ upper = maxKey.firstElement();
+ upperInclusive = true;
+
+ if ( e.eoo() )
+ return;
+
+ bool existsSpec = false;
+ if ( op == BSONObj::opEXISTS ) {
+ existsSpec = e.trueValue();
+ }
+
+ if ( e.type() == RegEx
+ || (e.type() == Object && !e.embeddedObject()["$regex"].eoo())
+ ) {
+ uassert( 13454, "invalid regular expression operator", op == BSONObj::Equality || op == BSONObj::opREGEX );
+ if ( !isNot ) { // no optimization for negated regex - we could consider creating 2 intervals comprising all nonmatching prefixes
+ const string r = simpleRegex(e);
+ if ( r.size() ) {
+ lower = addObj( BSON( "" << r ) ).firstElement();
+ upper = addObj( BSON( "" << simpleRegexEnd( r ) ) ).firstElement();
+ upperInclusive = false;
+ }
+ else {
+ BSONObjBuilder b1(32), b2(32);
+ b1.appendMinForType( "" , String );
+ lower = addObj( b1.obj() ).firstElement();
+
+ b2.appendMaxForType( "" , String );
+ upper = addObj( b2.obj() ).firstElement();
+ upperInclusive = false; //MaxForType String is an empty Object
+ }
+
+ // regex matches self - regex type > string type
+ if (e.type() == RegEx) {
+ BSONElement re = addObj( BSON( "" << e ) ).firstElement();
+ _intervals.push_back( FieldInterval(re) );
+ }
+ else {
+ BSONObj orig = e.embeddedObject();
+ BSONObjBuilder b;
+ b.appendRegex("", orig["$regex"].valuestrsafe(), orig["$options"].valuestrsafe());
+ BSONElement re = addObj( b.obj() ).firstElement();
+ _intervals.push_back( FieldInterval(re) );
+ }
+
+ }
+ return;
+ }
+ if ( isNot ) {
+ switch( op ) {
+ case BSONObj::Equality:
+ return;
+// op = BSONObj::NE;
+// break;
+ case BSONObj::opALL:
+ case BSONObj::opMOD: // NOTE for mod and type, we could consider having 1-2 intervals comprising the complementary types (multiple intervals already possible with $in)
+ case BSONObj::opTYPE:
+ // no bound calculation
+ return;
+ case BSONObj::NE:
+ op = BSONObj::Equality;
+ break;
+ case BSONObj::LT:
+ op = BSONObj::GTE;
+ break;
+ case BSONObj::LTE:
+ op = BSONObj::GT;
+ break;
+ case BSONObj::GT:
+ op = BSONObj::LTE;
+ break;
+ case BSONObj::GTE:
+ op = BSONObj::LT;
+ break;
+ case BSONObj::opEXISTS:
+ existsSpec = !existsSpec;
+ break;
+ default: // otherwise doesn't matter
+ break;
+ }
+ }
+ switch( op ) {
+ case BSONObj::Equality:
+ lower = upper = e;
+ break;
+ case BSONObj::NE: {
+ // this will invalidate the upper/lower references above
+ _intervals.push_back( FieldInterval() );
+ // optimize doesn't make sense for negative ranges
+ _intervals[ 0 ]._upper._bound = e;
+ _intervals[ 0 ]._upper._inclusive = false;
+ _intervals[ 1 ]._lower._bound = e;
+ _intervals[ 1 ]._lower._inclusive = false;
+ _intervals[ 1 ]._upper._bound = maxKey.firstElement();
+ _intervals[ 1 ]._upper._inclusive = true;
+ optimize = false; // don't run optimize code below
+ break;
+ }
+ case BSONObj::LT:
+ upperInclusive = false;
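+ // fall through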
+ case BSONObj::LTE:
+ upper = e;
+ break;
+ case BSONObj::GT:
+ lowerInclusive = false;
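+ // fall through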
+ case BSONObj::GTE:
+ lower = e;
+ break;
+ case BSONObj::opALL: {
+ uassert( 10370 , "$all requires array", e.type() == Array );
+ BSONObjIterator i( e.embeddedObject() );
+ bool bound = false;
+ while ( i.more() ) {
+ BSONElement x = i.next();
+ if ( x.type() == Object && x.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ) {
+ // taken care of elsewhere
+ }
+ else if ( x.type() != RegEx ) {
+ lower = upper = x;
+ bound = true;
+ break;
+ }
+ }
+ if ( !bound ) { // if no good non regex bound found, try regex bounds
+ BSONObjIterator i( e.embeddedObject() );
+ while( i.more() ) {
+ BSONElement x = i.next();
+ if ( x.type() != RegEx )
+ continue;
+ string simple = simpleRegex( x.regex(), x.regexFlags() );
+ if ( !simple.empty() ) {
+ lower = addObj( BSON( "" << simple ) ).firstElement();
+ upper = addObj( BSON( "" << simpleRegexEnd( simple ) ) ).firstElement();
+ break;
+ }
+ }
+ }
+ break;
+ }
+ case BSONObj::opMOD: {
+ {
+ BSONObjBuilder b;
+ b.appendMinForType( "" , NumberDouble );
+ lower = addObj( b.obj() ).firstElement();
+ }
+ {
+ BSONObjBuilder b;
+ b.appendMaxForType( "" , NumberDouble );
+ upper = addObj( b.obj() ).firstElement();
+ }
+ break;
+ }
+ case BSONObj::opTYPE: {
+ BSONType t = (BSONType)e.numberInt();
+ {
+ BSONObjBuilder b;
+ b.appendMinForType( "" , t );
+ lower = addObj( b.obj() ).firstElement();
+ }
+ {
+ BSONObjBuilder b;
+ b.appendMaxForType( "" , t );
+ upper = addObj( b.obj() ).firstElement();
+ }
+
+ break;
+ }
+ case BSONObj::opREGEX:
+ case BSONObj::opOPTIONS:
+ // do nothing
+ break;
+ case BSONObj::opELEM_MATCH: {
+ log() << "warning: shouldn't get here?" << endl;
+ break;
+ }
+ case BSONObj::opNEAR:
+ case BSONObj::opWITHIN:
+ _special = "2d";
+ break;
+ case BSONObj::opEXISTS: {
+ if ( !existsSpec ) {
+ lower = upper = staticNull.firstElement();
+ }
+ optimize = false;
+ break;
+ }
+ default:
+ break;
+ }
+
+ if ( optimize ) {
+ if ( lower.type() != MinKey && upper.type() == MaxKey && lower.isSimpleType() ) { // TODO: get rid of isSimpleType
+ BSONObjBuilder b;
+ b.appendMaxForType( lower.fieldName() , lower.type() );
+ upper = addObj( b.obj() ).firstElement();
+ }
+ else if ( lower.type() == MinKey && upper.type() != MaxKey && upper.isSimpleType() ) { // TODO: get rid of isSimpleType
+ if( upper.type() == Date )
+ lowerInclusive = false;
+ BSONObjBuilder b;
+ b.appendMinForType( upper.fieldName() , upper.type() );
+ lower = addObj( b.obj() ).firstElement();
+ }
+ }
+
+ }
+
+ void FieldRange::finishOperation( const vector<FieldInterval> &newIntervals, const FieldRange &other ) {
+ _intervals = newIntervals;
+ for( vector<BSONObj>::const_iterator i = other._objData.begin(); i != other._objData.end(); ++i )
+ _objData.push_back( *i );
+ if ( _special.size() == 0 && other._special.size() )
+ _special = other._special;
+ }
+
+ // As called, these functions find the max/min of a bound in the
+ // opposite direction, so at equal values an inclusive bound is
+ // considered less extreme than an exclusive one.
+ FieldBound maxFieldBound( const FieldBound &a, const FieldBound &b ) {
+ int cmp = a._bound.woCompare( b._bound, false );
+ if ( ( cmp == 0 && !b._inclusive ) || cmp < 0 )
+ return b;
+ return a;
+ }
+
+ FieldBound minFieldBound( const FieldBound &a, const FieldBound &b ) {
+ int cmp = a._bound.woCompare( b._bound, false );
+ if ( ( cmp == 0 && !b._inclusive ) || cmp > 0 )
+ return b;
+ return a;
+ }
+
+ bool fieldIntervalOverlap( const FieldInterval &one, const FieldInterval &two, FieldInterval &result ) {
+ result._lower = maxFieldBound( one._lower, two._lower );
+ result._upper = minFieldBound( one._upper, two._upper );
+ return result.strictValid();
+ }
+
+ const FieldRange &FieldRange::operator&=( const FieldRange &other ) {
+ if ( !_singleKey && nontrivial() ) {
+ if ( other <= *this ) {
+ *this = other;
+ }
+ return *this;
+ }
+ vector<FieldInterval> newIntervals;
+ vector<FieldInterval>::const_iterator i = _intervals.begin();
+ vector<FieldInterval>::const_iterator j = other._intervals.begin();
+ while( i != _intervals.end() && j != other._intervals.end() ) {
+ FieldInterval overlap;
+ if ( fieldIntervalOverlap( *i, *j, overlap ) ) {
+ newIntervals.push_back( overlap );
+ }
+ if ( i->_upper == minFieldBound( i->_upper, j->_upper ) ) {
+ ++i;
+ }
+ else {
+ ++j;
+ }
+ }
+ finishOperation( newIntervals, other );
+ return *this;
+ }
+
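+ /**
+ * Helper for operator|= below: folds the interval 'lower' into the running
+ * union bound [low, high]. If 'lower' does not overlap [low, high], the
+ * accumulated interval is emitted to 'newIntervals' and [low, high]
+ * restarts at 'lower'; otherwise 'high' is replaced by 'lower's upper
+ * bound.
+ */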
+ void handleInterval( const FieldInterval &lower, FieldBound &low, FieldBound &high, vector<FieldInterval> &newIntervals ) {
+ if ( low._bound.eoo() ) {
+ low = lower._lower; high = lower._upper;
+ }
+ else {
+ int cmp = high._bound.woCompare( lower._lower._bound, false );
+ if ( ( cmp < 0 ) || ( cmp == 0 && !high._inclusive && !lower._lower._inclusive ) ) {
+ FieldInterval tmp;
+ tmp._lower = low;
+ tmp._upper = high;
+ newIntervals.push_back( tmp );
+ low = lower._lower; high = lower._upper;
+ }
+ else {
+ high = lower._upper;
+ }
+ }
+ }
+
+ const FieldRange &FieldRange::operator|=( const FieldRange &other ) {
+ vector<FieldInterval> newIntervals;
+ FieldBound low;
+ FieldBound high;
+ vector<FieldInterval>::const_iterator i = _intervals.begin();
+ vector<FieldInterval>::const_iterator j = other._intervals.begin();
+ while( i != _intervals.end() && j != other._intervals.end() ) {
+ int cmp = i->_lower._bound.woCompare( j->_lower._bound, false );
+ if ( ( cmp == 0 && i->_lower._inclusive ) || cmp < 0 ) {
+ handleInterval( *i, low, high, newIntervals );
+ ++i;
+ }
+ else {
+ handleInterval( *j, low, high, newIntervals );
+ ++j;
+ }
+ }
+ while( i != _intervals.end() ) {
+ handleInterval( *i, low, high, newIntervals );
+ ++i;
+ }
+ while( j != other._intervals.end() ) {
+ handleInterval( *j, low, high, newIntervals );
+ ++j;
+ }
+ FieldInterval tmp;
+ tmp._lower = low;
+ tmp._upper = high;
+ newIntervals.push_back( tmp );
+ finishOperation( newIntervals, other );
+ return *this;
+ }
+
+ const FieldRange &FieldRange::operator-=( const FieldRange &other ) {
+ vector<FieldInterval> newIntervals;
+ vector<FieldInterval>::iterator i = _intervals.begin();
+ vector<FieldInterval>::const_iterator j = other._intervals.begin();
+ while( i != _intervals.end() && j != other._intervals.end() ) {
+ int cmp = i->_lower._bound.woCompare( j->_lower._bound, false );
+ if ( cmp < 0 ||
+ ( cmp == 0 && i->_lower._inclusive && !j->_lower._inclusive ) ) {
+ int cmp2 = i->_upper._bound.woCompare( j->_lower._bound, false );
+ if ( cmp2 < 0 ) {
+ newIntervals.push_back( *i );
+ ++i;
+ }
+ else if ( cmp2 == 0 ) {
+ newIntervals.push_back( *i );
+ if ( newIntervals.back()._upper._inclusive && j->_lower._inclusive ) {
+ newIntervals.back()._upper._inclusive = false;
+ }
+ ++i;
+ }
+ else {
+ newIntervals.push_back( *i );
+ newIntervals.back()._upper = j->_lower;
+ newIntervals.back()._upper.flipInclusive();
+ int cmp3 = i->_upper._bound.woCompare( j->_upper._bound, false );
+ if ( cmp3 < 0 ||
+ ( cmp3 == 0 && ( !i->_upper._inclusive || j->_upper._inclusive ) ) ) {
+ ++i;
+ }
+ else {
+ i->_lower = j->_upper;
+ i->_lower.flipInclusive();
+ ++j;
+ }
+ }
+ }
+ else {
+ int cmp2 = i->_lower._bound.woCompare( j->_upper._bound, false );
+ if ( cmp2 > 0 ||
+ ( cmp2 == 0 && ( !i->_lower._inclusive || !j->_upper._inclusive ) ) ) {
+ ++j;
+ }
+ else {
+ int cmp3 = i->_upper._bound.woCompare( j->_upper._bound, false );
+ if ( cmp3 < 0 ||
+ ( cmp3 == 0 && ( !i->_upper._inclusive || j->_upper._inclusive ) ) ) {
+ ++i;
+ }
+ else {
+ i->_lower = j->_upper;
+ i->_lower.flipInclusive();
+ ++j;
+ }
+ }
+ }
+ }
+ while( i != _intervals.end() ) {
+ newIntervals.push_back( *i );
+ ++i;
+ }
+ finishOperation( newIntervals, other );
+ return *this;
+ }
+
+ // TODO write a proper implementation that doesn't do a full copy
+ bool FieldRange::operator<=( const FieldRange &other ) const {
+ FieldRange temp = *this;
+ temp -= other;
+ return temp.empty();
+ }
+
+ void FieldRange::setExclusiveBounds() {
+ for( vector<FieldInterval>::iterator i = _intervals.begin(); i != _intervals.end(); ++i ) {
+ i->_lower._inclusive = false;
+ i->_upper._inclusive = false;
+ }
+ }
+
+ void FieldRange::reverse( FieldRange &ret ) const {
+ assert( _special.empty() );
+ ret._intervals.clear();
+ ret._objData = _objData;
+ for( vector<FieldInterval>::const_reverse_iterator i = _intervals.rbegin(); i != _intervals.rend(); ++i ) {
+ FieldInterval fi;
+ fi._lower = i->_upper;
+ fi._upper = i->_lower;
+ ret._intervals.push_back( fi );
+ }
+ }
+
+ BSONObj FieldRange::addObj( const BSONObj &o ) {
+ _objData.push_back( o );
+ return o;
+ }
+
+ string FieldInterval::toString() const {
+ StringBuilder buf;
+ buf << ( _lower._inclusive ? "[" : "(" );
+ buf << _lower._bound;
+ buf << " , ";
+ buf << _upper._bound;
+ buf << ( _upper._inclusive ? "]" : ")" );
+ return buf.str();
+ }
+
+ string FieldRange::toString() const {
+ StringBuilder buf;
+ buf << "(FieldRange special: " << _special << " singleKey: " << _special << " intervals: ";
+ for( vector<FieldInterval>::const_iterator i = _intervals.begin(); i != _intervals.end(); ++i ) {
+ buf << i->toString();
+ }
+
+ buf << ")";
+ return buf.str();
+ }
+
+ string FieldRangeSet::getSpecial() const {
+ string s = "";
+ for ( map<string,FieldRange>::const_iterator i=_ranges.begin(); i!=_ranges.end(); i++ ) {
+ if ( i->second.getSpecial().size() == 0 )
+ continue;
+ uassert( 13033 , "can't have 2 special fields" , s.size() == 0 );
+ s = i->second.getSpecial();
+ }
+ return s;
+ }
+
+ /**
+ * Btree scanning for a multidimensional key range will yield a
+ * multidimensional box. The idea here is that if an 'other'
+ * multidimensional box contains the current box we don't have to scan
+ * the current box. If the 'other' box contains the current box in
+ * all dimensions but one, we can safely subtract the values of 'other'
+ * along that one dimension from the values for the current box on the
+ * same dimension. In other situations, subtracting the 'other'
+ * box from the current box yields a result that is not a box (but
+ * rather can be expressed as a union of boxes). We don't support
+ * such splitting currently in calculating index ranges. Note that
+ * where I have said 'box' above, I actually mean sets of boxes because
+ * a field range can consist of multiple intervals.
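+ *
+ * An illustrative example: if *this is { a: [1,5], b: [2,3] } and 'other'
+ * is { a: [2,3], b: [0,10] }, then b's range is contained in 'other' while
+ * a's is not, so we can subtract along the 'a' dimension alone, yielding
+ * { a: [1,2) U (3,5], b: [2,3] }.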
+ */
+ const FieldRangeSet &FieldRangeSet::operator-=( const FieldRangeSet &other ) {
+ int nUnincluded = 0;
+ string unincludedKey;
+ map<string,FieldRange>::iterator i = _ranges.begin();
+ map<string,FieldRange>::const_iterator j = other._ranges.begin();
+ while( nUnincluded < 2 && i != _ranges.end() && j != other._ranges.end() ) {
+ int cmp = i->first.compare( j->first );
+ if ( cmp == 0 ) {
+ if ( i->second <= j->second ) {
+ // nothing
+ }
+ else {
+ ++nUnincluded;
+ unincludedKey = i->first;
+ }
+ ++i;
+ ++j;
+ }
+ else if ( cmp < 0 ) {
+ ++i;
+ }
+ else {
+ // other has a bound we don't, nothing can be done
+ return *this;
+ }
+ }
+ if ( j != other._ranges.end() ) {
+ // other has a bound we don't, nothing can be done
+ return *this;
+ }
+ if ( nUnincluded > 1 ) {
+ return *this;
+ }
+ if ( nUnincluded == 0 ) {
+ makeEmpty();
+ return *this;
+ }
+ // nUnincluded == 1
+ range( unincludedKey.c_str() ) -= other.range( unincludedKey.c_str() );
+ appendQueries( other );
+ return *this;
+ }
+
+ const FieldRangeSet &FieldRangeSet::operator&=( const FieldRangeSet &other ) {
+ map<string,FieldRange>::iterator i = _ranges.begin();
+ map<string,FieldRange>::const_iterator j = other._ranges.begin();
+ while( i != _ranges.end() && j != other._ranges.end() ) {
+ int cmp = i->first.compare( j->first );
+ if ( cmp == 0 ) {
+ // Same field name, so find range intersection.
+ i->second &= j->second;
+ ++i;
+ ++j;
+ }
+ else if ( cmp < 0 ) {
+ // Field present in *this.
+ ++i;
+ }
+ else {
+ // Field not present in *this, so add it.
+ range( j->first.c_str() ) = j->second;
+ ++j;
+ }
+ }
+ while( j != other._ranges.end() ) {
+ // Field not present in *this, add it.
+ range( j->first.c_str() ) = j->second;
+ ++j;
+ }
+ appendQueries( other );
+ return *this;
+ }
+
+ void FieldRangeSet::appendQueries( const FieldRangeSet &other ) {
+ for( vector<BSONObj>::const_iterator i = other._queries.begin(); i != other._queries.end(); ++i ) {
+ _queries.push_back( *i );
+ }
+ }
+
+ void FieldRangeSet::makeEmpty() {
+ for( map<string,FieldRange>::iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ i->second.makeEmpty();
+ }
+ }
+
+ void FieldRangeSet::processOpElement( const char *fieldName, const BSONElement &f, bool isNot, bool optimize ) {
+ BSONElement g = f;
+ int op2 = g.getGtLtOp();
+ if ( op2 == BSONObj::opALL ) {
+ BSONElement h = g;
+ uassert( 13050 , "$all requires array", h.type() == Array );
+ BSONObjIterator i( h.embeddedObject() );
+ if( i.more() ) {
+ BSONElement x = i.next();
+ if ( x.type() == Object && x.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ) {
+ g = x.embeddedObject().firstElement();
+ op2 = g.getGtLtOp();
+ }
+ }
+ }
+ if ( op2 == BSONObj::opELEM_MATCH ) {
+ BSONObjIterator k( g.embeddedObjectUserCheck() );
+ while ( k.more() ) {
+ BSONElement h = k.next();
+ StringBuilder buf(32);
+ buf << fieldName << "." << h.fieldName();
+ string fullname = buf.str();
+
+ int op3 = getGtLtOp( h );
+ if ( op3 == BSONObj::Equality ) {
+ range( fullname.c_str() ) &= FieldRange( h , _singleKey , isNot , optimize );
+ }
+ else {
+ BSONObjIterator l( h.embeddedObject() );
+ while ( l.more() ) {
+ range( fullname.c_str() ) &= FieldRange( l.next() , _singleKey , isNot , optimize );
+ }
+ }
+ }
+ }
+ else {
+ range( fieldName ) &= FieldRange( f , _singleKey , isNot , optimize );
+ }
+ }
+
+ void FieldRangeSet::processQueryField( const BSONElement &e, bool optimize ) {
+ if ( e.fieldName()[ 0 ] == '$' ) {
+ if ( strcmp( e.fieldName(), "$and" ) == 0 ) {
+ uassert( 14816 , "$and expression must be a nonempty array" , e.type() == Array && e.embeddedObject().nFields() > 0 );
+ BSONObjIterator i( e.embeddedObject() );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ uassert( 14817 , "$and elements must be objects" , e.type() == Object );
+ BSONObjIterator j( e.embeddedObject() );
+ while( j.more() ) {
+ processQueryField( j.next(), optimize );
+ }
+ }
+ // The $and clause is fully handled above; don't fall through and
+ // treat "$and" itself as a literal field name.
+ return;
+ }
+
+ if ( strcmp( e.fieldName(), "$where" ) == 0 ) {
+ return;
+ }
+
+ if ( strcmp( e.fieldName(), "$or" ) == 0 ) {
+ return;
+ }
+
+ if ( strcmp( e.fieldName(), "$nor" ) == 0 ) {
+ return;
+ }
+ }
+
+ bool equality = ( getGtLtOp( e ) == BSONObj::Equality );
+ if ( equality && e.type() == Object ) {
+ equality = ( strcmp( e.embeddedObject().firstElementFieldName(), "$not" ) != 0 );
+ }
+
+ if ( equality || ( e.type() == Object && !e.embeddedObject()[ "$regex" ].eoo() ) ) {
+ range( e.fieldName() ) &= FieldRange( e , _singleKey , false , optimize );
+ }
+ if ( !equality ) {
+ BSONObjIterator j( e.embeddedObject() );
+ while( j.more() ) {
+ BSONElement f = j.next();
+ if ( strcmp( f.fieldName(), "$not" ) == 0 ) {
+ switch( f.type() ) {
+ case Object: {
+ BSONObjIterator k( f.embeddedObject() );
+ while( k.more() ) {
+ BSONElement g = k.next();
+ uassert( 13034, "invalid use of $not", g.getGtLtOp() != BSONObj::Equality );
+ processOpElement( e.fieldName(), g, true, optimize );
+ }
+ break;
+ }
+ case RegEx:
+ processOpElement( e.fieldName(), f, true, optimize );
+ break;
+ default:
+ uassert( 13041, "invalid use of $not", false );
+ }
+ }
+ else {
+ processOpElement( e.fieldName(), f, false, optimize );
+ }
+ }
+ }
+ }
+
+ FieldRangeSet::FieldRangeSet( const char *ns, const BSONObj &query, bool singleKey, bool optimize )
+ : _ns( ns ), _queries( 1, query.getOwned() ), _singleKey( singleKey ) {
+ BSONObjIterator i( _queries[ 0 ] );
+
+ while( i.more() ) {
+ processQueryField( i.next(), optimize );
+ }
+ }
+
+ FieldRangeVector::FieldRangeVector( const FieldRangeSet &frs, const IndexSpec &indexSpec, int direction )
+ :_indexSpec( indexSpec ), _direction( direction >= 0 ? 1 : -1 ) {
+ _queries = frs._queries;
+ BSONObjIterator i( _indexSpec.keyPattern );
+ set< string > baseObjectNontrivialPrefixes;
+ while( i.more() ) {
+ BSONElement e = i.next();
+ const FieldRange *range = &frs.range( e.fieldName() );
+ if ( !frs.singleKey() ) {
+ string prefix = str::before( e.fieldName(), '.' );
+ if ( baseObjectNontrivialPrefixes.count( prefix ) > 0 ) {
+ // A field with the same parent field has already been
+ // constrained, and with a multikey index we cannot
+ // constrain this field.
+ range = &frs.trivialRange();
+ } else {
+ if ( range->nontrivial() ) {
+ baseObjectNontrivialPrefixes.insert( prefix );
+ }
+ }
+ }
+ int number = (int) e.number(); // e.number() returns 0 if not numeric
+ bool forward = ( ( number >= 0 ? 1 : -1 ) * ( direction >= 0 ? 1 : -1 ) > 0 );
+ if ( forward ) {
+ _ranges.push_back( *range );
+ }
+ else {
+ _ranges.push_back( FieldRange( BSONObj().firstElement(), frs.singleKey(), false, true ) );
+ range->reverse( _ranges.back() );
+ }
+ assert( !_ranges.back().empty() );
+ }
+ uassert( 13385, "combinatorial limit of $in partitioning of result set exceeded", size() < 1000000 );
+ }
+
+ BSONObj FieldRangeVector::startKey() const {
+ BSONObjBuilder b;
+ for( vector<FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ const FieldInterval &fi = i->intervals().front();
+ b.appendAs( fi._lower._bound, "" );
+ }
+ return b.obj();
+ }
+
+ BSONObj FieldRangeVector::endKey() const {
+ BSONObjBuilder b;
+ for( vector<FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ const FieldInterval &fi = i->intervals().back();
+ b.appendAs( fi._upper._bound, "" );
+ }
+ return b.obj();
+ }
+
+ BSONObj FieldRangeVector::obj() const {
+ BSONObjBuilder b;
+ BSONObjIterator k( _indexSpec.keyPattern );
+ for( int i = 0; i < (int)_ranges.size(); ++i ) {
+ BSONArrayBuilder a( b.subarrayStart( k.next().fieldName() ) );
+ for( vector<FieldInterval>::const_iterator j = _ranges[ i ].intervals().begin();
+ j != _ranges[ i ].intervals().end(); ++j ) {
+ a << BSONArray( BSON_ARRAY( j->_lower._bound << j->_upper._bound ).clientReadable() );
+ }
+ a.done();
+ }
+ return b.obj();
+ }
+
+ FieldRange *FieldRangeSet::__singleKeyTrivialRange = 0;
+ FieldRange *FieldRangeSet::__multiKeyTrivialRange = 0;
+ const FieldRange &FieldRangeSet::trivialRange() const {
+ FieldRange *&ret = _singleKey ? __singleKeyTrivialRange : __multiKeyTrivialRange;
+ if ( ret == 0 ) {
+ ret = new FieldRange( BSONObj().firstElement(), _singleKey, false, true );
+ }
+ return *ret;
+ }
+
+ BSONObj FieldRangeSet::simplifiedQuery( const BSONObj &_fields ) const {
+ BSONObj fields = _fields;
+ if ( fields.isEmpty() ) {
+ BSONObjBuilder b;
+ for( map<string,FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ b.append( i->first, 1 );
+ }
+ fields = b.obj();
+ }
+ BSONObjBuilder b;
+ BSONObjIterator i( fields );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ const char *name = e.fieldName();
+ const FieldRange &eRange = range( name );
+ assert( !eRange.empty() );
+ if ( eRange.equality() )
+ b.appendAs( eRange.min(), name );
+ else if ( eRange.nontrivial() ) {
+ BSONObj o;
+ BSONObjBuilder c;
+ if ( eRange.min().type() != MinKey )
+ c.appendAs( eRange.min(), eRange.minInclusive() ? "$gte" : "$gt" );
+ if ( eRange.max().type() != MaxKey )
+ c.appendAs( eRange.max(), eRange.maxInclusive() ? "$lte" : "$lt" );
+ o = c.obj();
+ b.append( name, o );
+ }
+ }
+ return b.obj();
+ }
+
+ QueryPattern FieldRangeSet::pattern( const BSONObj &sort ) const {
+ return QueryPattern( *this, sort );
+ }
+
+ // TODO get rid of this
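+ // Illustrative example: for key pattern { a:1, b:1 } with a constrained to
+ // the points {1,2} and b to the interval [3,4], the result is one
+ // (start,end) key pair per $in partition:
+ // ( {"":1,"":3}, {"":1,"":4} ) and ( {"":2,"":3}, {"":2,"":4} ).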
+ BoundList FieldRangeSet::indexBounds( const BSONObj &keyPattern, int direction ) const {
+ typedef vector<pair<shared_ptr<BSONObjBuilder>, shared_ptr<BSONObjBuilder> > > BoundBuilders;
+ BoundBuilders builders;
+ builders.push_back( make_pair( shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ), shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ) ) );
+ BSONObjIterator i( keyPattern );
+ bool ineq = false; // until ineq is true, we are just dealing with equality and $in bounds
+ while( i.more() ) {
+ BSONElement e = i.next();
+ const FieldRange &fr = range( e.fieldName() );
+ int number = (int) e.number(); // e.number() returns 0 if not numeric
+ bool forward = ( ( number >= 0 ? 1 : -1 ) * ( direction >= 0 ? 1 : -1 ) > 0 );
+ if ( !ineq ) {
+ if ( fr.equality() ) {
+ for( BoundBuilders::const_iterator j = builders.begin(); j != builders.end(); ++j ) {
+ j->first->appendAs( fr.min(), "" );
+ j->second->appendAs( fr.min(), "" );
+ }
+ }
+ else {
+ if ( !fr.inQuery() ) {
+ ineq = true;
+ }
+ BoundBuilders newBuilders;
+ const vector<FieldInterval> &intervals = fr.intervals();
+ for( BoundBuilders::const_iterator i = builders.begin(); i != builders.end(); ++i ) {
+ BSONObj first = i->first->obj();
+ BSONObj second = i->second->obj();
+
+ const unsigned maxCombinations = 4000000;
+ if ( forward ) {
+ for( vector<FieldInterval>::const_iterator j = intervals.begin(); j != intervals.end(); ++j ) {
+ uassert( 13303, "combinatorial limit of $in partitioning of result set exceeded", newBuilders.size() < maxCombinations );
+ newBuilders.push_back( make_pair( shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ), shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ) ) );
+ newBuilders.back().first->appendElements( first );
+ newBuilders.back().second->appendElements( second );
+ newBuilders.back().first->appendAs( j->_lower._bound, "" );
+ newBuilders.back().second->appendAs( j->_upper._bound, "" );
+ }
+ }
+ else {
+ for( vector<FieldInterval>::const_reverse_iterator j = intervals.rbegin(); j != intervals.rend(); ++j ) {
+ uassert( 13304, "combinatorial limit of $in partitioning of result set exceeded", newBuilders.size() < maxCombinations );
+ newBuilders.push_back( make_pair( shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ), shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ) ) );
+ newBuilders.back().first->appendElements( first );
+ newBuilders.back().second->appendElements( second );
+ newBuilders.back().first->appendAs( j->_upper._bound, "" );
+ newBuilders.back().second->appendAs( j->_lower._bound, "" );
+ }
+ }
+ }
+ builders = newBuilders;
+ }
+ }
+ else {
+ for( BoundBuilders::const_iterator j = builders.begin(); j != builders.end(); ++j ) {
+ j->first->appendAs( forward ? fr.min() : fr.max(), "" );
+ j->second->appendAs( forward ? fr.max() : fr.min(), "" );
+ }
+ }
+ }
+ BoundList ret;
+ for( BoundBuilders::const_iterator i = builders.begin(); i != builders.end(); ++i )
+ ret.push_back( make_pair( i->first->obj(), i->second->obj() ) );
+ return ret;
+ }
+
+ FieldRangeSet *FieldRangeSet::subset( const BSONObj &fields ) const {
+ FieldRangeSet *ret = new FieldRangeSet( _ns, BSONObj(), _singleKey, true );
+ BSONObjIterator i( fields );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( range( e.fieldName() ).nontrivial() ) {
+ ret->range( e.fieldName() ) = range( e.fieldName() );
+ }
+ }
+ ret->_queries = _queries;
+ return ret;
+ }
+
+ bool FieldRangeSetPair::noNontrivialRanges() const {
+ return _singleKey.matchPossible() && _singleKey.nNontrivialRanges() == 0 &&
+ _multiKey.matchPossible() && _multiKey.nNontrivialRanges() == 0;
+ }
+
+ FieldRangeSetPair &FieldRangeSetPair::operator&=( const FieldRangeSetPair &other ) {
+ _singleKey &= other._singleKey;
+ _multiKey &= other._multiKey;
+ return *this;
+ }
+
+ FieldRangeSetPair &FieldRangeSetPair::operator-=( const FieldRangeSet &scanned ) {
+ _singleKey -= scanned;
+ _multiKey -= scanned;
+ return *this;
+ }
+
+ BSONObj FieldRangeSetPair::simplifiedQueryForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const {
+ return frsForIndex( d, idxNo ).simplifiedQuery( keyPattern );
+ }
+
+ void FieldRangeSetPair::assertValidIndex( const NamespaceDetails *d, int idxNo ) const {
+ massert( 14048, "FieldRangeSetPair invalid index specified", idxNo >= 0 && idxNo < d->nIndexes );
+ }
+
+ const FieldRangeSet &FieldRangeSetPair::frsForIndex( const NamespaceDetails* nsd, int idxNo ) const {
+ assertValidIndexOrNoIndex( nsd, idxNo );
+ if ( idxNo < 0 ) {
+ // An unindexed cursor cannot have a "single key" constraint.
+ return _multiKey;
+ }
+ return nsd->isMultikey( idxNo ) ? _multiKey : _singleKey;
+ }
+
+ bool FieldRangeVector::matchesElement( const BSONElement &e, int i, bool forward ) const {
+ bool eq;
+ int l = matchingLowElement( e, i, forward, eq );
+ return ( l % 2 == 0 ); // if we're inside an interval
+ }
+
+ // binary search for interval containing the specified element
+ // an even return value indicates that the element is contained within a valid interval
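+ // Illustrative example: with intervals [1,3] and [5,7] (bounds inclusive)
+ // the bound positions are 0:lower(1), 1:upper(3), 2:lower(5), 3:upper(7);
+ // an element of 2 returns 0 (even - inside the first interval), 4 returns 1
+ // (odd - between intervals), and 5 returns 2 with lowEquality set.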
+ int FieldRangeVector::matchingLowElement( const BSONElement &e, int i, bool forward, bool &lowEquality ) const {
+ lowEquality = false;
+ int l = -1;
+ int h = _ranges[ i ].intervals().size() * 2;
+ while( l + 1 < h ) {
+ int m = ( l + h ) / 2;
+ BSONElement toCmp;
+ bool toCmpInclusive;
+ const FieldInterval &interval = _ranges[ i ].intervals()[ m / 2 ];
+ if ( m % 2 == 0 ) {
+ toCmp = interval._lower._bound;
+ toCmpInclusive = interval._lower._inclusive;
+ }
+ else {
+ toCmp = interval._upper._bound;
+ toCmpInclusive = interval._upper._inclusive;
+ }
+ int cmp = toCmp.woCompare( e, false );
+ if ( !forward ) {
+ cmp = -cmp;
+ }
+ if ( cmp < 0 ) {
+ l = m;
+ }
+ else if ( cmp > 0 ) {
+ h = m;
+ }
+ else {
+ if ( m % 2 == 0 ) {
+ lowEquality = true;
+ }
+ int ret = m;
+ // if left match and inclusive, all good
+ // if left match and not inclusive, return right before left bound
+ // if right match and inclusive, return left bound
+ // if right match and not inclusive, return right bound
+ if ( ( m % 2 == 0 && !toCmpInclusive ) || ( m % 2 == 1 && toCmpInclusive ) ) {
+ --ret;
+ }
+ return ret;
+ }
+ }
+ assert( l + 1 == h );
+ return l;
+ }
+
+ bool FieldRangeVector::matchesKey( const BSONObj &key ) const {
+ BSONObjIterator j( key );
+ BSONObjIterator k( _indexSpec.keyPattern );
+ for( int l = 0; l < (int)_ranges.size(); ++l ) {
+ int number = (int) k.next().number();
+ bool forward = ( number >= 0 ? 1 : -1 ) * ( _direction >= 0 ? 1 : -1 ) > 0;
+ if ( !matchesElement( j.next(), l, forward ) ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool FieldRangeVector::matches( const BSONObj &obj ) const {
+
+ bool ok = false;
+
+ // TODO The representation of matching keys could potentially be optimized
+ // more for the case at hand. (For example, we can potentially consider
+ // fields individually instead of constructing several bson objects using
+ // multikey arrays.) But getKeys() canonically defines the key set for a
+ // given object and for now we are using it as is.
+ BSONObjSet keys;
+ _indexSpec.getKeys( obj, keys );
+ for( BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i ) {
+ if ( matchesKey( *i ) ) {
+ ok = true;
+ break;
+ }
+ }
+
+ LOG(5) << "FieldRangeVector::matches() returns " << ok << endl;
+
+ return ok;
+ }
+
+ BSONObj FieldRangeVector::firstMatch( const BSONObj &obj ) const {
+ // NOTE Only works in forward direction.
+ assert( _direction >= 0 );
+ BSONObjSet keys( BSONObjCmp( _indexSpec.keyPattern ) );
+ _indexSpec.getKeys( obj, keys );
+ for( BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i ) {
+ if ( matchesKey( *i ) ) {
+ return *i;
+ }
+ }
+ return BSONObj();
+ }
+
+ // TODO optimize more
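+ /**
+ * Return value contract, as read from the implementation below (a
+ * descriptive summary, not normative):
+ * -1 : 'curr' falls within the ranges on every field - the key matches.
+ * -2 : no key at or beyond 'curr' can match - iteration may stop.
+ * i>=0 : 'curr' does not match; advance the cursor using the first i
+ * fields of 'curr' together with the _cmp/_inc bounds, skipping
+ * past them when _after is set.
+ */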
+ int FieldRangeVectorIterator::advance( const BSONObj &curr ) {
+ BSONObjIterator j( curr );
+ BSONObjIterator o( _v._indexSpec.keyPattern );
+ // track first field for which we are not at the end of the valid values,
+ // since we may need to advance from the key prefix ending with this field
+ int latestNonEndpoint = -1;
+ // iterate over fields to determine appropriate advance method
+ for( int i = 0; i < (int)_i.size(); ++i ) {
+ if ( i > 0 && !_v._ranges[ i - 1 ].intervals()[ _i[ i - 1 ] ].equality() ) {
+ // if last bound was inequality, we don't know anything about where we are for this field
+ // TODO if possible avoid this certain cases when value in previous field of the previous
+ // key is the same as value of previous field in current key
+ setMinus( i );
+ }
+ bool eq = false;
+ BSONElement oo = o.next();
+ bool reverse = ( ( oo.number() < 0 ) ^ ( _v._direction < 0 ) );
+ BSONElement jj = j.next();
+ if ( _i[ i ] == -1 ) { // unknown position for this field, do binary search
+ bool lowEquality;
+ int l = _v.matchingLowElement( jj, i, !reverse, lowEquality );
+ if ( l % 2 == 0 ) { // we are in a valid range for this field
+ _i[ i ] = l / 2;
+ int diff = (int)_v._ranges[ i ].intervals().size() - _i[ i ];
+ if ( diff > 1 ) {
+ latestNonEndpoint = i;
+ }
+ else if ( diff == 1 ) {
+ int x = _v._ranges[ i ].intervals()[ _i[ i ] ]._upper._bound.woCompare( jj, false );
+ if ( x != 0 ) {
+ latestNonEndpoint = i;
+ }
+ }
+ continue;
+ }
+ else { // not in a valid range for this field - determine if and how to advance
+ // check if we're after the last interval for this field
+ if ( l == (int)_v._ranges[ i ].intervals().size() * 2 - 1 ) {
+ if ( latestNonEndpoint == -1 ) {
+ return -2;
+ }
+ setZero( latestNonEndpoint + 1 );
+ // skip to curr / latestNonEndpoint + 1 / superlative
+ _after = true;
+ return latestNonEndpoint + 1;
+ }
+ _i[ i ] = ( l + 1 ) / 2;
+ if ( lowEquality ) {
+ // skip to curr / i + 1 / superlative
+ _after = true;
+ return i + 1;
+ }
+ // skip to curr / i / nextbounds
+ _cmp[ i ] = &_v._ranges[ i ].intervals()[ _i[ i ] ]._lower._bound;
+ _inc[ i ] = _v._ranges[ i ].intervals()[ _i[ i ] ]._lower._inclusive;
+ for( int j = i + 1; j < (int)_i.size(); ++j ) {
+ _cmp[ j ] = &_v._ranges[ j ].intervals().front()._lower._bound;
+ _inc[ j ] = _v._ranges[ j ].intervals().front()._lower._inclusive;
+ }
+ _after = false;
+ return i;
+ }
+ }
+ bool first = true;
+ // _i[ i ] != -1, so we have a starting interval for this field
+ // which serves as a lower/equal bound on the first iteration -
+ // we advance from this interval to find a matching interval
+ while( _i[ i ] < (int)_v._ranges[ i ].intervals().size() ) {
+ // compare to current interval's upper bound
+ int x = _v._ranges[ i ].intervals()[ _i[ i ] ]._upper._bound.woCompare( jj, false );
+ if ( reverse ) {
+ x = -x;
+ }
+ if ( x == 0 && _v._ranges[ i ].intervals()[ _i[ i ] ]._upper._inclusive ) {
+ eq = true;
+ break;
+ }
+ // see if we're less than the upper bound
+ if ( x > 0 ) {
+ if ( i == 0 && first ) {
+ // the value of 1st field won't go backward, so don't check lower bound
+ // TODO maybe we can check first only?
+ break;
+ }
+ // if it's an equality interval, don't need to compare separately to lower bound
+ if ( !_v._ranges[ i ].intervals()[ _i[ i ] ].equality() ) {
+ // compare to current interval's lower bound
+ x = _v._ranges[ i ].intervals()[ _i[ i ] ]._lower._bound.woCompare( jj, false );
+ if ( reverse ) {
+ x = -x;
+ }
+ }
+ // if we're equal to the lower bound but it is not inclusive, advance
+ if ( ( x == 0 && !_v._ranges[ i ].intervals()[ _i[ i ] ]._lower._inclusive ) ) {
+ setZero( i + 1 );
+ // skip to curr / i + 1 / superlative
+ _after = true;
+ return i + 1;
+ }
+ // if we're less than the lower bound, advance
+ if ( x > 0 ) {
+ setZero( i + 1 );
+ // skip to curr / i / nextbounds
+ _cmp[ i ] = &_v._ranges[ i ].intervals()[ _i[ i ] ]._lower._bound;
+ _inc[ i ] = _v._ranges[ i ].intervals()[ _i[ i ] ]._lower._inclusive;
+ for( int j = i + 1; j < (int)_i.size(); ++j ) {
+ _cmp[ j ] = &_v._ranges[ j ].intervals().front()._lower._bound;
+ _inc[ j ] = _v._ranges[ j ].intervals().front()._lower._inclusive;
+ }
+ _after = false;
+ return i;
+ }
+ else {
+ break;
+ }
+ }
+ // we're above the upper bound, so try next interval and reset remaining fields
+ ++_i[ i ];
+ setZero( i + 1 );
+ first = false;
+ }
+ int diff = (int)_v._ranges[ i ].intervals().size() - _i[ i ];
+ if ( diff > 1 || ( !eq && diff == 1 ) ) {
+ // check if we're not at the end of valid values for this field
+ latestNonEndpoint = i;
+ }
+ else if ( diff == 0 ) { // check if we're past the last interval for this field
+ if ( latestNonEndpoint == -1 ) {
+ return -2;
+ }
+ // more values possible, skip...
+ setZero( latestNonEndpoint + 1 );
+ // skip to curr / latestNonEndpoint + 1 / superlative
+ _after = true;
+ return latestNonEndpoint + 1;
+ }
+ }
+ return -1;
+ }
+
+ void FieldRangeVectorIterator::prepDive() {
+ for( int j = 0; j < (int)_i.size(); ++j ) {
+ _cmp[ j ] = &_v._ranges[ j ].intervals().front()._lower._bound;
+ _inc[ j ] = _v._ranges[ j ].intervals().front()._lower._inclusive;
+ }
+ }
+
+ BSONObj FieldRangeVectorIterator::startKey() {
+ BSONObjBuilder b;
+ for( unsigned int i = 0; i < _i.size(); ++i ) {
+ const FieldInterval &fi = _v._ranges[ i ].intervals()[ _i[ i ] ];
+ b.appendAs( fi._lower._bound, "" );
+ }
+ return b.obj();
+ }
+
+ // temp
+ BSONObj FieldRangeVectorIterator::endKey() {
+ BSONObjBuilder b;
+ for( unsigned int i = 0; i < _i.size(); ++i ) {
+ const FieldInterval &fi = _v._ranges[ i ].intervals()[ _i[ i ] ];
+ b.appendAs( fi._upper._bound, "" );
+ }
+ return b.obj();
+ }
+
+ OrRangeGenerator::OrRangeGenerator( const char *ns, const BSONObj &query , bool optimize )
+ : _baseSet( ns, query, optimize ), _orFound() {
+
+ BSONObjIterator i( _baseSet.originalQuery() );
+
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( strcmp( e.fieldName(), "$or" ) == 0 ) {
+ uassert( 13262, "$or requires nonempty array", e.type() == Array && e.embeddedObject().nFields() > 0 );
+ BSONObjIterator j( e.embeddedObject() );
+ while( j.more() ) {
+ BSONElement f = j.next();
+ uassert( 13263, "$or array must contain objects", f.type() == Object );
+ _orSets.push_back( FieldRangeSetPair( ns, f.embeddedObject(), optimize ) );
+ uassert( 13291, "$or may not contain 'special' query", _orSets.back().getSpecial().empty() );
+ _originalOrSets.push_back( _orSets.back() );
+ }
+ _orFound = true;
+ continue;
+ }
+ }
+ }
+
+ void OrRangeGenerator::assertMayPopOrClause() {
+ massert( 13274, "no or clause to pop", !orFinished() );
+ }
+
+ void OrRangeGenerator::popOrClause( NamespaceDetails *nsd, int idxNo, const BSONObj &keyPattern ) {
+ assertMayPopOrClause();
+ auto_ptr<FieldRangeSet> holder;
+ const FieldRangeSet *toDiff = &_originalOrSets.front().frsForIndex( nsd, idxNo );
+ BSONObj indexSpec = keyPattern;
+ if ( !indexSpec.isEmpty() && toDiff->matchPossibleForIndex( indexSpec ) ) {
+ holder.reset( toDiff->subset( indexSpec ) );
+ toDiff = holder.get();
+ }
+ popOrClause( toDiff, nsd, idxNo, keyPattern );
+ }
+
+ void OrRangeGenerator::popOrClauseSingleKey() {
+ assertMayPopOrClause();
+ FieldRangeSet *toDiff = &_originalOrSets.front()._singleKey;
+ popOrClause( toDiff );
+ }
+
+ /**
+ * Removes the top $or clause, which would have been recently scanned, and
+ * removes the field ranges it covers from all subsequent $or clauses. As a
+ * side effect, this function may invalidate the return values of topFrsp()
+ * calls made before this function was called.
+ * @param keyPattern - Keys of the index that was used to satisfy the last $or
+ * clause.  Used to determine the range of keys that were scanned.  If
+ * empty we do not constrain the previous clause's ranges using index keys,
+ * which may reduce opportunities for range elimination.
+ */
+ void OrRangeGenerator::popOrClause( const FieldRangeSet *toDiff, NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) {
+ list<FieldRangeSetPair>::iterator i = _orSets.begin();
+ list<FieldRangeSetPair>::iterator j = _originalOrSets.begin();
+ ++i;
+ ++j;
+ while( i != _orSets.end() ) {
+ *i -= *toDiff;
+ // Check if match is possible at all, and if it is possible for the recently scanned index.
+ if( !i->matchPossible() || ( d && !i->matchPossibleForIndex( d, idxNo, keyPattern ) ) ) {
+ i = _orSets.erase( i );
+ j = _originalOrSets.erase( j );
+ }
+ else {
+ ++i;
+ ++j;
+ }
+ }
+ _oldOrSets.push_front( _orSets.front() );
+ _orSets.pop_front();
+ _originalOrSets.pop_front();
+ }
+
+ struct SimpleRegexUnitTest : UnitTest {
+ void run() {
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "^foo");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "foo" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "^f?oo");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "^fz?oo");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "f" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "^f", "");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "f" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "\\Af", "");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "f" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "^f", "m");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "\\Af", "m");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "f" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "\\Af", "mi");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "\\Af \t\vo\n\ro \\ \\# #comment", "mx");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "foo #" );
+ }
+ {
+ assert( simpleRegex("^\\Qasdf\\E", "", NULL) == "asdf" );
+ assert( simpleRegex("^\\Qasdf\\E.*", "", NULL) == "asdf" );
+ assert( simpleRegex("^\\Qasdf", "", NULL) == "asdf" ); // PCRE supports this
+ assert( simpleRegex("^\\Qasdf\\\\E", "", NULL) == "asdf\\" );
+ assert( simpleRegex("^\\Qas.*df\\E", "", NULL) == "as.*df" );
+ assert( simpleRegex("^\\Qas\\Q[df\\E", "", NULL) == "as\\Q[df" );
+ assert( simpleRegex("^\\Qas\\E\\\\E\\Q$df\\E", "", NULL) == "as\\E$df" ); // quoted string containing \E
+ }
+
+ }
+ } simple_regex_unittest;
+
+
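+ /* A worked example of the skip/limit arithmetic below (the values are
+    hypothetical): with num = 100 and cmd = { skip: 10, limit: 50 }, the
+    skip first clamps num to max( 0, 100 - 10 ) = 90, then the limit caps
+    the result at min( 90, 50 ) = 50. */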
+ long long applySkipLimit( long long num , const BSONObj& cmd ) {
+ BSONElement s = cmd["skip"];
+ BSONElement l = cmd["limit"];
+
+ if ( s.isNumber() ) {
+ num = num - s.numberLong();
+ if ( num < 0 ) {
+ num = 0;
+ }
+ }
+
+ if ( l.isNumber() ) {
+ long long limit = l.numberLong();
+ if ( limit < num ) {
+ num = limit;
+ }
+ }
+
+ return num;
+ }
+
+
+} // namespace mongo
diff --git a/src/mongo/db/queryutil.h b/src/mongo/db/queryutil.h
new file mode 100644
index 00000000000..aefef27cc8b
--- /dev/null
+++ b/src/mongo/db/queryutil.h
@@ -0,0 +1,443 @@
+// @file queryutil.h - Utility classes representing ranges of valid BSONElement values for a query.
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+#include "indexkey.h"
+
+namespace mongo {
+
+ /**
+ * One side of an interval of valid BSONElements, specified by a value and a
+ * boolean indicating whether the interval includes the value.
+ */
+ struct FieldBound {
+ BSONElement _bound;
+ bool _inclusive;
+ bool operator==( const FieldBound &other ) const {
+ return _bound.woCompare( other._bound ) == 0 &&
+ _inclusive == other._inclusive;
+ }
+ void flipInclusive() { _inclusive = !_inclusive; }
+ };
+
+ /** A closed interval composed of a lower and an upper FieldBound. */
+ struct FieldInterval {
+ FieldInterval() : _cachedEquality( -1 ) {}
+ FieldInterval( const BSONElement& e ) : _cachedEquality( -1 ) {
+ _lower._bound = _upper._bound = e;
+ _lower._inclusive = _upper._inclusive = true;
+ }
+ FieldBound _lower;
+ FieldBound _upper;
+ /** @return true iff at least one element can be contained in the interval, i.e. it is non-empty. */
+ bool strictValid() const {
+ int cmp = _lower._bound.woCompare( _upper._bound, false );
+ return ( cmp < 0 || ( cmp == 0 && _lower._inclusive && _upper._inclusive ) );
+ }
+ /** @return true iff the interval is an equality constraint. */
+ bool equality() const;
+ mutable int _cachedEquality;
+
+ string toString() const;
+ };
+
+ /**
+ * An ordered list of FieldIntervals expressing constraints on valid
+ * BSONElement values for a field.
+ */
+ class FieldRange {
+ public:
+ FieldRange( const BSONElement &e , bool singleKey , bool isNot=false , bool optimize=true );
+
+ /** @return Range intersection with 'other'. */
+ const FieldRange &operator&=( const FieldRange &other );
+ /** @return Range union with 'other'. */
+ const FieldRange &operator|=( const FieldRange &other );
+ /** @return Range of elements included in 'this' but not 'other'. */
+ const FieldRange &operator-=( const FieldRange &other );
+ /** @return true iff this range is a subset of 'other'. */
+ bool operator<=( const FieldRange &other ) const;
+
+ /**
+ * If there are any valid values for this range, the extreme values can
+ * be extracted.
+ */
+
+ BSONElement min() const { assert( !empty() ); return _intervals[ 0 ]._lower._bound; }
+ BSONElement max() const { assert( !empty() ); return _intervals[ _intervals.size() - 1 ]._upper._bound; }
+ bool minInclusive() const { assert( !empty() ); return _intervals[ 0 ]._lower._inclusive; }
+ bool maxInclusive() const { assert( !empty() ); return _intervals[ _intervals.size() - 1 ]._upper._inclusive; }
+
+ /** @return true iff this range expresses a single equality interval. */
+ bool equality() const;
+ /** @return true if all the intervals for this range are equalities */
+ bool inQuery() const;
+ /** @return true iff this range does not include every BSONElement */
+ bool nontrivial() const;
+ /** @return true iff this range matches no BSONElements. */
+ bool empty() const { return _intervals.empty(); }
+
+ /** Empty the range so it matches no BSONElements. */
+ void makeEmpty() { _intervals.clear(); }
+ const vector<FieldInterval> &intervals() const { return _intervals; }
+ string getSpecial() const { return _special; }
+ /** Make component intervals noninclusive. */
+ void setExclusiveBounds();
+ /**
+ * Constructs a range where all FieldIntervals and FieldBounds are in
+ * the opposite order of the current range.
+ * NOTE the resulting intervals might not be strictValid().
+ */
+ void reverse( FieldRange &ret ) const;
+
+ string toString() const;
+ private:
+ BSONObj addObj( const BSONObj &o );
+ void finishOperation( const vector<FieldInterval> &newIntervals, const FieldRange &other );
+ vector<FieldInterval> _intervals;
+ // Owns memory for our BSONElements.
+ vector<BSONObj> _objData;
+ string _special;
+ bool _singleKey;
+ };
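+
+ /*
+  * Illustration of the set algebra above (the query values are hypothetical):
+  * for { a: { $gt: 3 } } the range for 'a' is conceptually the single
+  * interval (3, max). Applying &= with the range for { a: { $lt: 7 } }
+  * narrows it to (3, 7), while applying |= with the point range for
+  * { a: 10 } afterwards would yield the two intervals (3, 7) and [10, 10].
+  */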
+
+ /**
+ * A BoundList contains intervals specified by inclusive start
+ * and end bounds. The intervals should be nonoverlapping and occur in
+ * the specified direction of traversal. For example, given a simple index {i:1}
+ * and direction +1, one valid BoundList is: (1, 2); (4, 6). The same BoundList
+ * would be valid for index {i:-1} with direction -1.
+ */
+ typedef vector<pair<BSONObj,BSONObj> > BoundList;
+
+ class QueryPattern;
+
+ /**
+ * A set of FieldRanges determined from constraints on the fields of a query,
+ * that may be used to determine index bounds.
+ */
+ class FieldRangeSet {
+ public:
+ friend class OrRangeGenerator;
+ friend class FieldRangeVector;
+ FieldRangeSet( const char *ns, const BSONObj &query , bool singleKey , bool optimize=true );
+
+ /** @return true if there is a nontrivial range for the given field. */
+ bool hasRange( const char *fieldName ) const {
+ map<string, FieldRange>::const_iterator f = _ranges.find( fieldName );
+ return f != _ranges.end();
+ }
+ /** @return range for the given field. */
+ const FieldRange &range( const char *fieldName ) const;
+ /** @return range for the given field. */
+ FieldRange &range( const char *fieldName );
+ /** @return the number of nontrivial ranges. */
+ int nNontrivialRanges() const;
+ /** @return the field ranges comprising this set. */
+ const map<string,FieldRange> &ranges() const { return _ranges; }
+ /**
+ * @return true if a match could be possible on every field. Generally this
+ * is not useful information for a single key FieldRangeSet and
+ * matchPossibleForIndex() should be used instead.
+ */
+ bool matchPossible() const;
+ /**
+ * @return true if a match could be possible given the value of _singleKey
+ * and index key 'keyPattern'.
+ * @param keyPattern May be {} or {$natural:1} for a non index scan.
+ */
+ bool matchPossibleForIndex( const BSONObj &keyPattern ) const;
+
+ const char *ns() const { return _ns; }
+
+ /**
+ * @return a simplified query from the extreme values of the nontrivial
+ * fields.
+ * @param fields If specified, the fields of the returned object are
+ * ordered to match those of 'fields'.
+ */
+ BSONObj simplifiedQuery( const BSONObj &fields = BSONObj() ) const;
+
+ QueryPattern pattern( const BSONObj &sort = BSONObj() ) const;
+ string getSpecial() const;
+
+ /**
+ * @return a FieldRangeSet approximation of the documents in 'this' but
+ * not in 'other'. The approximation will be a superset of the documents
+ * in 'this' but not 'other'.
+ */
+ const FieldRangeSet &operator-=( const FieldRangeSet &other );
+ /** @return intersection of 'this' with 'other'. */
+ const FieldRangeSet &operator&=( const FieldRangeSet &other );
+
+ /**
+ * @return an ordered list of bounds generated using an index key pattern
+ * and traversal direction.
+ *
+ * NOTE This function is deprecated in the query optimizer and only
+ * currently used by the sharding code.
+ */
+ BoundList indexBounds( const BSONObj &keyPattern, int direction ) const;
+
+ /**
+ * @return - A new FieldRangeSet based on this FieldRangeSet, but with only
+ * a subset of the fields.
+ * @param fields - Only fields which are represented as field names in this object
+ * will be included in the returned FieldRangeSet.
+ */
+ FieldRangeSet *subset( const BSONObj &fields ) const;
+
+ bool singleKey() const { return _singleKey; }
+
+ BSONObj originalQuery() const { return _queries[ 0 ]; }
+ private:
+ void appendQueries( const FieldRangeSet &other );
+ void makeEmpty();
+ void processQueryField( const BSONElement &e, bool optimize );
+ void processOpElement( const char *fieldName, const BSONElement &f, bool isNot, bool optimize );
+ static FieldRange *__singleKeyTrivialRange;
+ static FieldRange *__multiKeyTrivialRange;
+ const FieldRange &trivialRange() const;
+ map<string,FieldRange> _ranges;
+ const char *_ns;
+ // Owns memory for FieldRange BSONElements.
+ vector<BSONObj> _queries;
+ bool _singleKey;
+ };
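+
+ /*
+  * Usage sketch - the namespace and query here are hypothetical, and a
+  * single key index is assumed via the 'singleKey' constructor flag:
+  *
+  *   FieldRangeSet frs( "test.users", BSON( "age" << GTE << 21 ), true );
+  *   if ( frs.matchPossible() ) {
+  *       const FieldRange &r = frs.range( "age" ); // [21, max for the type]
+  *       BoundList b = frs.indexBounds( BSON( "age" << 1 ), 1 );
+  *   }
+  */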
+
+ class NamespaceDetails;
+
+ /**
+ * A pair of FieldRangeSets, one representing constraints for single key
+ * indexes and the other representing constraints for multi key indexes and
+ * unindexed scans. In several member functions the caller is asked to
+ * supply an index so that the implementation may utilize the proper
+ * FieldRangeSet and return results that are appropriate with respect to that
+ * supplied index.
+ */
+ class FieldRangeSetPair {
+ public:
+ FieldRangeSetPair( const char *ns, const BSONObj &query, bool optimize=true )
+ :_singleKey( ns, query, true, optimize ), _multiKey( ns, query, false, optimize ) {}
+
+ /**
+ * @return the appropriate single or multi key FieldRangeSet for the specified index.
+ * @param idxNo -1 for non index scan.
+ */
+ const FieldRangeSet &frsForIndex( const NamespaceDetails* nsd, int idxNo ) const;
+
+ /** @return a field range in the single key FieldRangeSet. */
+ const FieldRange &singleKeyRange( const char *fieldName ) const {
+ return _singleKey.range( fieldName );
+ }
+ /** @return true if the range limits are equivalent to an empty query. */
+ bool noNontrivialRanges() const;
+ /** @return false if a match is impossible regardless of index. */
+ bool matchPossible() const { return _multiKey.matchPossible(); }
+ /**
+ * @return false if a match is impossible on the specified index.
+ * @param idxNo -1 for non index scan.
+ */
+ bool matchPossibleForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const;
+
+ const char *ns() const { return _singleKey.ns(); }
+
+ string getSpecial() const { return _singleKey.getSpecial(); }
+
+ /** Intersect with another FieldRangeSetPair. */
+ FieldRangeSetPair &operator&=( const FieldRangeSetPair &other );
+ /**
+ * Subtract a FieldRangeSet, generally one expressing a range that has
+ * already been scanned.
+ */
+ FieldRangeSetPair &operator-=( const FieldRangeSet &scanned );
+
+ BoundList singleKeyIndexBounds( const BSONObj &keyPattern, int direction ) const {
+ return _singleKey.indexBounds( keyPattern, direction );
+ }
+
+ BSONObj originalQuery() const { return _singleKey.originalQuery(); }
+
+ private:
+ FieldRangeSetPair( const FieldRangeSet &singleKey, const FieldRangeSet &multiKey )
+ :_singleKey( singleKey ), _multiKey( multiKey ) {}
+ void assertValidIndex( const NamespaceDetails *d, int idxNo ) const;
+ void assertValidIndexOrNoIndex( const NamespaceDetails *d, int idxNo ) const;
+ /** matchPossibleForIndex() must be true. */
+ BSONObj simplifiedQueryForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const;
+ FieldRangeSet _singleKey;
+ FieldRangeSet _multiKey;
+ friend class OrRangeGenerator;
+ friend struct QueryUtilIndexed;
+ };
+
+ class IndexSpec;
+
+ /**
+ * An ordered list of fields and their FieldRanges, corresponding to valid
+ * index keys for a given index spec.
+ */
+ class FieldRangeVector {
+ public:
+ /**
+ * @param frs The valid ranges for all fields, as defined by the query spec
+ * @param indexSpec The index spec (key pattern and info)
+ * @param direction The direction of index traversal
+ */
+ FieldRangeVector( const FieldRangeSet &frs, const IndexSpec &indexSpec, int direction );
+
+ /** @return the number of index ranges represented by 'this' */
+ long long size();
+ /** @return starting point for an index traversal. */
+ BSONObj startKey() const;
+ /** @return end point for an index traversal. */
+ BSONObj endKey() const;
+ /** @return a client readable representation of 'this' */
+ BSONObj obj() const;
+
+ const IndexSpec& getSpec(){ return _indexSpec; }
+
+ /**
+ * @return true iff the provided document matches valid ranges on all
+ * of this FieldRangeVector's fields, which is the case iff this document
+ * would be returned while scanning the index corresponding to this
+ * FieldRangeVector. This function is used for $or clause deduping.
+ */
+ bool matches( const BSONObj &obj ) const;
+
+ /**
+ * @return first key of 'obj' that would be encountered by a forward
+ * index scan using this FieldRangeVector, BSONObj() if no such key.
+ */
+ BSONObj firstMatch( const BSONObj &obj ) const;
+
+ private:
+ int matchingLowElement( const BSONElement &e, int i, bool direction, bool &lowEquality ) const;
+ bool matchesElement( const BSONElement &e, int i, bool direction ) const;
+ bool matchesKey( const BSONObj &key ) const;
+ vector<FieldRange> _ranges;
+ const IndexSpec _indexSpec;
+ int _direction;
+ vector<BSONObj> _queries; // make sure mem owned
+ friend class FieldRangeVectorIterator;
+ };
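+
+ /*
+  * Sketch of intended use (the index spec and direction are hypothetical):
+  * given a FieldRangeSet 'frs' and an IndexSpec 'spec' for { a: 1, b: 1 },
+  *
+  *   FieldRangeVector frv( frs, spec, 1 );
+  *   BSONObj lo = frv.startKey(); // first index key a forward scan would visit
+  *   BSONObj hi = frv.endKey();   // last index key a forward scan would visit
+  *
+  * and frv.matches( doc ) reports whether 'doc' would be reached by that
+  * scan, which is what the $or clause deduping described above relies on.
+  */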
+
+ /**
+ * Helper class for iterating through an ordered representation of keys
+ * to find those keys that match a specified FieldRangeVector.
+ */
+ class FieldRangeVectorIterator {
+ public:
+ FieldRangeVectorIterator( const FieldRangeVector &v ) : _v( v ), _i( _v._ranges.size(), -1 ), _cmp( _v._ranges.size(), 0 ), _inc( _v._ranges.size(), false ), _after() {
+ }
+ static BSONObj minObject() {
+ BSONObjBuilder b; b.appendMinKey( "" );
+ return b.obj();
+ }
+ static BSONObj maxObject() {
+ BSONObjBuilder b; b.appendMaxKey( "" );
+ return b.obj();
+ }
+ /**
+ * @return Suggested advance method, based on current key.
+ * -2 Iteration is complete, no need to advance.
+ * -1 Advance to the next key, without skipping.
+ * >=0 Skip parameter. If @return is r, skip to the key comprised
+ * of the first r elements of curr followed by the (r+1)th and
+ * remaining elements of cmp() (with inclusivity specified by
+ * the (r+1)th and remaining elements of inc()). If after() is
+ * true, skip past this key not to it.
+ */
+ int advance( const BSONObj &curr );
+ const vector<const BSONElement *> &cmp() const { return _cmp; }
+ const vector<bool> &inc() const { return _inc; }
+ bool after() const { return _after; }
+ void prepDive();
+ void setZero( int i ) { for( int j = i; j < (int)_i.size(); ++j ) _i[ j ] = 0; }
+ void setMinus( int i ) { for( int j = i; j < (int)_i.size(); ++j ) _i[ j ] = -1; }
+ bool ok() { return _i[ 0 ] < (int)_v._ranges[ 0 ].intervals().size(); }
+ BSONObj startKey();
+ // temp
+ BSONObj endKey();
+ private:
+ const FieldRangeVector &_v;
+ vector<int> _i;
+ vector<const BSONElement*> _cmp;
+ vector<bool> _inc;
+ bool _after;
+ };
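+
+ /*
+  * Hypothetical caller loop illustrating the advance() protocol documented
+  * above; 'frv' is a FieldRangeVector built elsewhere, and the 'cursor'
+  * object and its methods are assumptions used only for illustration, not
+  * part of this interface:
+  *
+  *   FieldRangeVectorIterator it( frv );
+  *   it.prepDive();
+  *   while( cursor.ok() ) {
+  *       int r = it.advance( cursor.currKey() );
+  *       if ( r == -2 ) break;                        // iteration complete
+  *       if ( r == -1 ) { cursor.next(); continue; }  // plain advance
+  *       // r >= 0: skip to the key made of the first r elements of
+  *       // currKey() plus it.cmp() / it.inc() for the remaining fields,
+  *       // landing just past that key when it.after() is true
+  *       cursor.skipTo( r, it.cmp(), it.inc(), it.after() );
+  *   }
+  */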
+
+ /**
+ * As we iterate through $or clauses this class generates a FieldRangeSetPair
+ * for the current $or clause, in some cases by excluding ranges that were
+ * included in a previous clause.
+ */
+ class OrRangeGenerator {
+ public:
+ OrRangeGenerator( const char *ns, const BSONObj &query , bool optimize=true );
+
+ /**
+ * @return true iff we are done scanning $or clauses. If there's a
+ * useless $or clause, we won't use $or index ranges to help with scanning.
+ */
+ bool orFinished() const { return _orFound && _orSets.empty(); }
+ /** Iterates to the next $or clause by removing the current $or clause. */
+ void popOrClause( NamespaceDetails *nsd, int idxNo, const BSONObj &keyPattern );
+ void popOrClauseSingleKey();
+ /** @return FieldRangeSetPair for the current $or clause. */
+ FieldRangeSetPair *topFrsp() const;
+ /**
+ * @return original FieldRangeSetPair for the current $or clause. While the
+ * original bounds are looser, they are composed of fewer ranges and it
+ * is faster to do operations with them; when they can be used instead of
+ * more precise bounds, they should.
+ */
+ FieldRangeSetPair *topFrspOriginal() const;
+
+ string getSpecial() const { return _baseSet.getSpecial(); }
+
+ bool moreOrClauses() const { return !_orSets.empty(); }
+ private:
+ void assertMayPopOrClause();
+ void popOrClause( const FieldRangeSet *toDiff, NamespaceDetails *d = 0, int idxNo = -1, const BSONObj &keyPattern = BSONObj() );
+ FieldRangeSetPair _baseSet;
+ list<FieldRangeSetPair> _orSets;
+ list<FieldRangeSetPair> _originalOrSets;
+ // ensure memory is owned
+ list<FieldRangeSetPair> _oldOrSets;
+ bool _orFound;
+ friend struct QueryUtilIndexed;
+ };
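+
+ /*
+  * Sketch of the intended $or scanning loop; the method names are from this
+  * class, while 'nsd', 'idxNo' and 'keyPattern' stand for the index chosen
+  * by the caller and are assumptions here:
+  *
+  *   OrRangeGenerator org( ns, query );
+  *   while( org.moreOrClauses() ) {
+  *       FieldRangeSetPair *frsp = org.topFrsp();
+  *       // ... scan the chosen index using the ranges in *frsp ...
+  *       org.popOrClause( nsd, idxNo, keyPattern ); // shrink later clauses
+  *   }
+  */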
+
+ /** returns a string that, when used as a matcher, would match a superset of regex()
+ returns "" for complex regular expressions
+ used to optimize queries in some simple regex cases that start with '^'
+
+ if purePrefix != NULL, sets it to whether the regex can be converted to a range query
+ */
+ string simpleRegex(const char* regex, const char* flags, bool* purePrefix=NULL);
+
+ /** returns the upper bound of a query that matches prefix */
+ string simpleRegexEnd( string prefix );
+
+ long long applySkipLimit( long long num , const BSONObj& cmd );
+
+} // namespace mongo
+
+#include "queryutil-inl.h"
diff --git a/src/mongo/db/record.cpp b/src/mongo/db/record.cpp
new file mode 100644
index 00000000000..17987002efc
--- /dev/null
+++ b/src/mongo/db/record.cpp
@@ -0,0 +1,267 @@
+// record.cpp
+
+#include "pch.h"
+#include "pdfile.h"
+#include "../util/processinfo.h"
+#include "../util/net/listen.h"
+#include "pagefault.h"
+
+namespace mongo {
+
+ namespace ps {
+
+ enum State {
+ In , Out, Unk
+ };
+
+ enum Constants {
+ SliceSize = 65536 ,
+ MaxChain = 20 , // intentionally very low
+ NumSlices = 10 ,
+ RotateTimeSecs = 90
+ };
+
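+        // Map a memory region id to a bucket index in [0, SliceSize) by mixing
+        // its 16-bit halves with small prime offsets; collisions are resolved
+        // by the short chains in Slice::_get() below.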
+ int hash( size_t region ) {
+ return
+ abs( ( ( 7 + (int)(region & 0xFFFF) )
+ * ( 11 + (int)( ( region >> 16 ) & 0xFFFF ) )
+#if defined(_WIN64) || defined(__amd64__)
+ * ( 13 + (int)( ( region >> 32 ) & 0xFFFF ) )
+ * ( 17 + (int)( ( region >> 48 ) & 0xFFFF ) )
+#endif
+ ) % SliceSize );
+ }
+
+
+ /**
+ * simple hash map for region -> status
+ * this constitutes a single slice of time
+ * it does chaining, but only very short chains
+ */
+ class Slice {
+
+ struct Entry {
+ size_t region;
+ unsigned long long value;
+ };
+
+ public:
+
+ Slice() {
+ reset();
+ }
+
+ void reset() {
+ memset( _data , 0 , SliceSize * sizeof(Entry) );
+ }
+
+ State get( int regionHash , size_t region , short offset ) {
+ DEV assert( hash( region ) == regionHash );
+
+ Entry * e = _get( regionHash , region , false );
+ if ( ! e )
+ return Unk;
+
+ return ( e->value & ( ((unsigned long long)1) << offset ) ) ? In : Out;
+ }
+
+ /**
+ * @return true if added, false if full
+ */
+ bool in( int regionHash , size_t region , short offset ) {
+ DEV assert( hash( region ) == regionHash );
+
+ Entry * e = _get( regionHash , region , true );
+ if ( ! e )
+ return false;
+
+ e->value |= ((unsigned long long)1) << offset;
+ return true;
+ }
+
+ private:
+
+ Entry* _get( int start , size_t region , bool add ) {
+ for ( int i=0; i<MaxChain; i++ ) {
+
+ int bucket = ( start + i ) % SliceSize;
+
+ if ( _data[bucket].region == 0 ) {
+ if ( ! add )
+ return 0;
+
+ _data[bucket].region = region;
+ return &_data[bucket];
+ }
+
+ if ( _data[bucket].region == region ) {
+ return &_data[bucket];
+ }
+ }
+ return 0;
+ }
+
+ Entry _data[SliceSize];
+ };
+
+
+ /**
+ * this contains many slices of time
+ * the idea is that you put memory status in the current time slice,
+ * and then after a certain period of time it rolls off, so we check again
+ */
+ class Rolling {
+
+ public:
+ Rolling()
+ : _lock( "ps::Rolling" ){
+ _curSlice = 0;
+ _lastRotate = Listener::getElapsedTimeMillis();
+ }
+
+
+ /**
+ * after this call, we assume the page is in ram
+ * @param doHalf if this is a known good access, only consult the most recent half of the slices
+ * @return whether we know the page is in ram
+ */
+ bool access( size_t region , short offset , bool doHalf ) {
+ int regionHash = hash(region);
+
+ SimpleMutex::scoped_lock lk( _lock );
+
+ static int rarely_count = 0;
+ if ( rarely_count++ % 2048 == 0 ) {
+ long long now = Listener::getElapsedTimeMillis();
+ RARELY if ( now == 0 ) {
+ tlog() << "warning Listener::getElapsedTimeMillis returning 0ms" << endl;
+ }
+
+ if ( now - _lastRotate > ( 1000 * RotateTimeSecs ) ) {
+ _rotate();
+ }
+ }
+
+ for ( int i=0; i<NumSlices / ( doHalf ? 2 : 1 ); i++ ) {
+ int pos = (_curSlice+i)%NumSlices;
+ State s = _slices[pos].get( regionHash , region , offset );
+
+ if ( s == In )
+ return true;
+
+ if ( s == Out ) {
+ _slices[pos].in( regionHash , region , offset );
+ return false;
+ }
+ }
+
+ // we weren't in any slice
+ // so add to cur
+ if ( ! _slices[_curSlice].in( regionHash , region , offset ) ) {
+ _rotate();
+ _slices[_curSlice].in( regionHash , region , offset );
+ }
+ return false;
+ }
+
+ private:
+
+ void _rotate() {
+ _curSlice = ( _curSlice + 1 ) % NumSlices;
+ _slices[_curSlice].reset();
+ _lastRotate = Listener::getElapsedTimeMillis();
+ }
+
+ int _curSlice;
+ long long _lastRotate;
+ Slice _slices[NumSlices];
+
+ SimpleMutex _lock;
+ } rolling;
+
+ }
+
+ bool Record::MemoryTrackingEnabled = true;
+
+ volatile int __record_touch_dummy = 1; // this is used to make sure the compiler doesn't get too smart on us
+    void Record::touch( bool entireRecord ) {
+ if ( lengthWithHeaders > HeaderSize ) { // this also makes sure lengthWithHeaders is in memory
+ char * addr = data;
+ char * end = data + netLength();
+ for ( ; addr <= end ; addr += 2048 ) {
+ __record_touch_dummy += addr[0];
+
+ break; // TODO: remove this, pending SERVER-3711
+
+            // note if this is a touch of a DeletedRecord, we don't want to touch more than the first part. we may simply
+            // be updating the linked list, and a DeletedRecord could be gigantic.  a similar but less extreme circumstance
+            // exists for any record if we are just updating its header, say on a remove(); some sort of hints might be
+            // useful.
+
+            if ( ! entireRecord )
+ break;
+ }
+ }
+ }
+
+ const bool blockSupported = ProcessInfo::blockCheckSupported();
+
+ bool Record::likelyInPhysicalMemory() {
+ if ( ! MemoryTrackingEnabled )
+ return true;
+
+ const size_t page = (size_t)data >> 12;
+ const size_t region = page >> 6;
+ const size_t offset = page & 0x3f;
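+        // >> 12 selects the 4KB OS page; grouping 64 pages ( >> 6 ) gives the
+        // region, and the low 6 bits pick which bit of the region's 64-bit
+        // presence mask (Slice::Entry::value) stands for this page.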
+
+ if ( ps::rolling.access( region , offset , false ) )
+ return true;
+
+ if ( ! blockSupported ) {
+            // this means we don't fall back to the system call,
+            // and we assume things aren't in memory
+            // possibly we yield too much - but that's better than not yielding and then blocking on a fault
+ return false;
+ }
+
+ return ProcessInfo::blockInMemory( data );
+ }
+
+
+ Record* Record::accessed() {
+ const size_t page = (size_t)data >> 12;
+ const size_t region = page >> 6;
+ const size_t offset = page & 0x3f;
+ ps::rolling.access( region , offset , true );
+ return this;
+ }
+
+ Record* DiskLoc::rec() const {
+ Record *r = DataFileMgr::getRecord(*this);
+#if defined(_PAGEFAULTEXCEPTION)
+ DEV ONCE {
+ log() << "_DEBUG info _PAGEFAULTEXCEPTION is ON -- experimental at this time" << endl;
+ }
+ bool fault = !r->likelyInPhysicalMemory();
+ DEV if( rand() % 100 == 0 )
+ fault = true;
+ if( fault &&
+ !cc()._hasWrittenThisPass &&
+ cc()._pageFaultRetryableSection )
+ {
+ if( cc()._pageFaultRetryableSection->_laps > 100 ) {
+ log() << "info pagefaultexception _laps > 100" << endl;
+ }
+ else {
+ throw PageFaultException(r);
+ }
+ }
+#else
+ DEV ONCE {
+ log() << "_DEBUG info _PAGEFAULTEXCEPTION is off" << endl;
+ }
+#endif
+ return r;
+ }
+
+}
diff --git a/src/mongo/db/repl.cpp b/src/mongo/db/repl.cpp
new file mode 100644
index 00000000000..25ecb6b455f
--- /dev/null
+++ b/src/mongo/db/repl.cpp
@@ -0,0 +1,1516 @@
+// repl.cpp
+
+/* TODO
+ PAIRING
+ _ on a syncexception, don't allow going back to master state?
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* Collections we use:
+
+ local.sources - indicates what sources we pull from as a "slave", and the last update of each
+ local.oplog.$main - our op log as "master"
+ local.dbinfo.<dbname> - no longer used???
+ local.pair.startup - [deprecated] can contain a special value indicating for a pair that we have the master copy.
+ used when replacing other half of the pair which has permanently failed.
+ local.pair.sync - [deprecated] { initialsynccomplete: 1 }
+*/
+
+#include "pch.h"
+#include "jsobj.h"
+#include "../util/goodies.h"
+#include "repl.h"
+#include "../util/net/message.h"
+#include "../util/background.h"
+#include "../client/dbclient.h"
+#include "../client/connpool.h"
+#include "pdfile.h"
+#include "ops/query.h"
+#include "db.h"
+#include "commands.h"
+#include "security.h"
+#include "cmdline.h"
+#include "repl_block.h"
+#include "repl/rs.h"
+#include "replutil.h"
+#include "repl/connections.h"
+#include "ops/update.h"
+
+namespace mongo {
+
+ // our config from command line etc.
+ ReplSettings replSettings;
+
+ /* if 1 sync() is running */
+ volatile int syncing = 0;
+ static volatile int relinquishSyncingSome = 0;
+
+ /* "dead" means something really bad happened like replication falling completely out of sync.
+ when non-null, we are dead and the string is informational
+ */
+ const char *replAllDead = 0;
+
+ time_t lastForcedResync = 0;
+
+} // namespace mongo
+
+namespace mongo {
+
+ /* output by the web console */
+ const char *replInfo = "";
+ struct ReplInfo {
+ ReplInfo(const char *msg) {
+ replInfo = msg;
+ }
+ ~ReplInfo() {
+ replInfo = "?";
+ }
+ };
+
+ /* operator requested resynchronization of replication (on the slave). { resync : 1 } */
+ class CmdResync : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual bool adminOnly() const {
+ return true;
+ }
+ virtual bool logTheOp() { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ void help(stringstream&h) const { h << "resync (from scratch) an out of date replica slave.\nhttp://www.mongodb.org/display/DOCS/Master+Slave"; }
+ CmdResync() : Command("resync") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( cmdLine.usingReplSets() ) {
+ errmsg = "resync command not currently supported with replica sets. See RS102 info in the mongodb documentations";
+ result.append("info", "http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member");
+ return false;
+ }
+
+ if ( cmdObj.getBoolField( "force" ) ) {
+ if ( !waitForSyncToFinish( errmsg ) )
+ return false;
+ replAllDead = "resync forced";
+ }
+ if ( !replAllDead ) {
+ errmsg = "not dead, no need to resync";
+ return false;
+ }
+ if ( !waitForSyncToFinish( errmsg ) )
+ return false;
+
+ ReplSource::forceResyncDead( "client" );
+ result.append( "info", "triggered resync for all sources" );
+ return true;
+ }
+ bool waitForSyncToFinish( string &errmsg ) const {
+            // Wait for the slave thread to finish syncing, so sources will be
+            // reloaded with new saved state on the next pass.
+ Timer t;
+ while ( 1 ) {
+ if ( syncing == 0 || t.millis() > 30000 )
+ break;
+ {
+ dbtemprelease t;
+ relinquishSyncingSome = 1;
+ sleepmillis(1);
+ }
+ }
+ if ( syncing ) {
+ errmsg = "timeout waiting for sync() to finish";
+ return false;
+ }
+ return true;
+ }
+ } cmdResync;
+
+ bool anyReplEnabled() {
+ return replSettings.slave || replSettings.master || theReplSet;
+ }
+
+ bool replAuthenticate(DBClientBase *conn);
+
+ void appendReplicationInfo( BSONObjBuilder& result , bool authed , int level ) {
+
+ if ( replSet ) {
+ if( theReplSet == 0 ) {
+ result.append("ismaster", false);
+ result.append("secondary", false);
+ result.append("info", ReplSet::startupStatusMsg.get());
+ result.append( "isreplicaset" , true );
+ return;
+ }
+
+ theReplSet->fillIsMaster(result);
+ return;
+ }
+
+ if ( replAllDead ) {
+ result.append("ismaster", 0);
+ string s = string("dead: ") + replAllDead;
+ result.append("info", s);
+ }
+ else {
+ result.appendBool("ismaster", _isMaster() );
+ }
+
+ if ( level && replSet ) {
+ result.append( "info" , "is replica set" );
+ }
+ else if ( level ) {
+ BSONObjBuilder sources( result.subarrayStart( "sources" ) );
+
+ readlock lk( "local.sources" );
+ Client::Context ctx( "local.sources", dbpath, authed );
+ shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj());
+ int n = 0;
+ while ( c->ok() ) {
+ BSONObj s = c->current();
+
+ BSONObjBuilder bb;
+ bb.append( s["host"] );
+ string sourcename = s["source"].valuestr();
+ if ( sourcename != "main" )
+ bb.append( s["source"] );
+
+ {
+ BSONElement e = s["syncedTo"];
+ BSONObjBuilder t( bb.subobjStart( "syncedTo" ) );
+ t.appendDate( "time" , e.timestampTime() );
+ t.append( "inc" , e.timestampInc() );
+ t.done();
+ }
+
+ if ( level > 1 ) {
+ dbtemprelease unlock;
+                    // note: there is no SO_TIMEOUT-style socket timeout on this connection; perhaps we should have one.
+ ScopedDbConnection conn( s["host"].valuestr() );
+ DBClientConnection *cliConn = dynamic_cast< DBClientConnection* >( &conn.conn() );
+ if ( cliConn && replAuthenticate( cliConn ) ) {
+ BSONObj first = conn->findOne( (string)"local.oplog.$" + sourcename , Query().sort( BSON( "$natural" << 1 ) ) );
+ BSONObj last = conn->findOne( (string)"local.oplog.$" + sourcename , Query().sort( BSON( "$natural" << -1 ) ) );
+ bb.appendDate( "masterFirst" , first["ts"].timestampTime() );
+ bb.appendDate( "masterLast" , last["ts"].timestampTime() );
+ double lag = (double) (last["ts"].timestampTime() - s["syncedTo"].timestampTime());
+ bb.append( "lagSeconds" , lag / 1000 );
+ }
+ conn.done();
+ }
+
+ sources.append( BSONObjBuilder::numStr( n++ ) , bb.obj() );
+ c->advance();
+ }
+
+ sources.done();
+ }
+ }
+
+ class CmdIsMaster : public Command {
+ public:
+ virtual bool requiresAuth() { return false; }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual void help( stringstream &help ) const {
+ help << "Check if this server is primary for a replica pair/set; also if it is --master or --slave in simple master/slave setups.\n";
+ help << "{ isMaster : 1 }";
+ }
+ virtual LockType locktype() const { return NONE; }
+ CmdIsMaster() : Command("isMaster", true, "ismaster") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+        /* currently a request to an arbiter is (somewhat arbitrarily) an ismaster request that is not
+           authenticated.
+           we allow unauthenticated ismaster, but to be safe we aren't as informationally
+           verbose if the caller is not authenticated for the admin db.
+ */
+ bool authed = cc().getAuthenticationInfo()->isAuthorizedReads("admin");
+ appendReplicationInfo( result , authed );
+
+ result.appendNumber("maxBsonObjectSize", BSONObjMaxUserSize);
+ return true;
+ }
+ } cmdismaster;
+
+ ReplSource::ReplSource() {
+ nClonedThisPass = 0;
+ }
+
+ ReplSource::ReplSource(BSONObj o) : nClonedThisPass(0) {
+ only = o.getStringField("only");
+ hostName = o.getStringField("host");
+ _sourceName = o.getStringField("source");
+ uassert( 10118 , "'host' field not set in sources collection object", !hostName.empty() );
+ uassert( 10119 , "only source='main' allowed for now with replication", sourceName() == "main" );
+ BSONElement e = o.getField("syncedTo");
+ if ( !e.eoo() ) {
+ uassert( 10120 , "bad sources 'syncedTo' field value", e.type() == Date || e.type() == Timestamp );
+ OpTime tmp( e.date() );
+ syncedTo = tmp;
+ }
+
+ BSONObj dbsObj = o.getObjectField("dbsNextPass");
+ if ( !dbsObj.isEmpty() ) {
+ BSONObjIterator i(dbsObj);
+ while ( 1 ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ addDbNextPass.insert( e.fieldName() );
+ }
+ }
+
+ dbsObj = o.getObjectField("incompleteCloneDbs");
+ if ( !dbsObj.isEmpty() ) {
+ BSONObjIterator i(dbsObj);
+ while ( 1 ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ incompleteCloneDbs.insert( e.fieldName() );
+ }
+ }
+ }
+
+ /* Turn our C++ Source object into a BSONObj */
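+    /* e.g. (hypothetical values):
+       { host: "db1.example.com", source: "main",
+         syncedTo: <timestamp>, dbsNextPass: { mydb: true } } */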
+ BSONObj ReplSource::jsobj() {
+ BSONObjBuilder b;
+ b.append("host", hostName);
+ b.append("source", sourceName());
+ if ( !only.empty() )
+ b.append("only", only);
+ if ( !syncedTo.isNull() )
+ b.appendTimestamp("syncedTo", syncedTo.asDate());
+
+ BSONObjBuilder dbsNextPassBuilder;
+ int n = 0;
+ for ( set<string>::iterator i = addDbNextPass.begin(); i != addDbNextPass.end(); i++ ) {
+ n++;
+ dbsNextPassBuilder.appendBool(*i, 1);
+ }
+ if ( n )
+ b.append("dbsNextPass", dbsNextPassBuilder.done());
+
+ BSONObjBuilder incompleteCloneDbsBuilder;
+ n = 0;
+ for ( set<string>::iterator i = incompleteCloneDbs.begin(); i != incompleteCloneDbs.end(); i++ ) {
+ n++;
+ incompleteCloneDbsBuilder.appendBool(*i, 1);
+ }
+ if ( n )
+ b.append("incompleteCloneDbs", incompleteCloneDbsBuilder.done());
+
+ return b.obj();
+ }
+
+ void ReplSource::save() {
+ BSONObjBuilder b;
+ assert( !hostName.empty() );
+ b.append("host", hostName);
+ // todo: finish allowing multiple source configs.
+ // this line doesn't work right when source is null, if that is allowed as it is now:
+ //b.append("source", _sourceName);
+ BSONObj pattern = b.done();
+
+ BSONObj o = jsobj();
+ log( 1 ) << "Saving repl source: " << o << endl;
+
+ {
+ OpDebug debug;
+ Client::Context ctx("local.sources");
+ UpdateResult res = updateObjects("local.sources", o, pattern, true/*upsert for pair feature*/, false,false,debug);
+ assert( ! res.mod );
+ assert( res.num == 1 );
+ }
+ }
+
+ static void addSourceToList(ReplSource::SourceVector &v, ReplSource& s, ReplSource::SourceVector &old) {
+ if ( !s.syncedTo.isNull() ) { // Don't reuse old ReplSource if there was a forced resync.
+ for ( ReplSource::SourceVector::iterator i = old.begin(); i != old.end(); ) {
+ if ( s == **i ) {
+ v.push_back(*i);
+ old.erase(i);
+ return;
+ }
+ i++;
+ }
+ }
+
+ v.push_back( shared_ptr< ReplSource >( new ReplSource( s ) ) );
+ }
+
+ /* we reuse our existing objects so that we can keep our existing connection
+ and cursor in effect.
+ */
+ void ReplSource::loadAll(SourceVector &v) {
+ Client::Context ctx("local.sources");
+ SourceVector old = v;
+ v.clear();
+
+ if ( !cmdLine.source.empty() ) {
+ // --source <host> specified.
+ // check that no items are in sources other than that
+ // add if missing
+ shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj());
+ int n = 0;
+ while ( c->ok() ) {
+ n++;
+ ReplSource tmp(c->current());
+ if ( tmp.hostName != cmdLine.source ) {
+ log() << "repl: --source " << cmdLine.source << " != " << tmp.hostName << " from local.sources collection" << endl;
+ log() << "repl: for instructions on changing this slave's source, see:" << endl;
+ log() << "http://dochub.mongodb.org/core/masterslave" << endl;
+ log() << "repl: terminating mongod after 30 seconds" << endl;
+ sleepsecs(30);
+ dbexit( EXIT_REPLICATION_ERROR );
+ }
+ if ( tmp.only != cmdLine.only ) {
+ log() << "--only " << cmdLine.only << " != " << tmp.only << " from local.sources collection" << endl;
+ log() << "terminating after 30 seconds" << endl;
+ sleepsecs(30);
+ dbexit( EXIT_REPLICATION_ERROR );
+ }
+ c->advance();
+ }
+ uassert( 10002 , "local.sources collection corrupt?", n<2 );
+ if ( n == 0 ) {
+ // source missing. add.
+ ReplSource s;
+ s.hostName = cmdLine.source;
+ s.only = cmdLine.only;
+ s.save();
+ }
+ }
+ else {
+ try {
+ massert( 10384 , "--only requires use of --source", cmdLine.only.empty());
+ }
+ catch ( ... ) {
+ dbexit( EXIT_BADOPTIONS );
+ }
+ }
+
+ shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj());
+ while ( c->ok() ) {
+ ReplSource tmp(c->current());
+ if ( tmp.syncedTo.isNull() ) {
+ DBDirectClient c;
+ if ( c.exists( "local.oplog.$main" ) ) {
+ BSONObj op = c.findOne( "local.oplog.$main", QUERY( "op" << NE << "n" ).sort( BSON( "$natural" << -1 ) ) );
+ if ( !op.isEmpty() ) {
+ tmp.syncedTo = op[ "ts" ].date();
+ }
+ }
+ }
+ addSourceToList(v, tmp, old);
+ c->advance();
+ }
+ }
+
+ BSONObj opTimeQuery = fromjson("{\"getoptime\":1}");
+
+ bool ReplSource::throttledForceResyncDead( const char *requester ) {
+ if ( time( 0 ) - lastForcedResync > 600 ) {
+ forceResyncDead( requester );
+ lastForcedResync = time( 0 );
+ return true;
+ }
+ return false;
+ }
+
+ void ReplSource::forceResyncDead( const char *requester ) {
+ if ( !replAllDead )
+ return;
+ SourceVector sources;
+ ReplSource::loadAll(sources);
+ for( SourceVector::iterator i = sources.begin(); i != sources.end(); ++i ) {
+ log() << requester << " forcing resync from " << (*i)->hostName << endl;
+ (*i)->forceResync( requester );
+ }
+ replAllDead = 0;
+ }
+
+ void ReplSource::forceResync( const char *requester ) {
+ BSONObj info;
+ {
+ dbtemprelease t;
+ if (!oplogReader.connect(hostName)) {
+ msgassertedNoTrace( 14051 , "unable to connect to resync");
+ }
+ /* todo use getDatabaseNames() method here */
+ bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info );
+ massert( 10385 , "Unable to get database list", ok );
+ }
+ BSONObjIterator i( info.getField( "databases" ).embeddedObject() );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ string name = e.embeddedObject().getField( "name" ).valuestr();
+ if ( !e.embeddedObject().getBoolField( "empty" ) ) {
+ if ( name != "local" ) {
+ if ( only.empty() || only == name ) {
+ resyncDrop( name.c_str(), requester );
+ }
+ }
+ }
+ }
+ syncedTo = OpTime();
+ addDbNextPass.clear();
+ save();
+ }
+
+ string ReplSource::resyncDrop( const char *db, const char *requester ) {
+ log() << "resync: dropping database " << db << endl;
+ Client::Context ctx(db);
+ dropDatabase(db);
+ return db;
+ }
+
+ /* grab initial copy of a database from the master */
+ void ReplSource::resync(string db) {
+ string dummyNs = resyncDrop( db.c_str(), "internal" );
+ Client::Context ctx( dummyNs );
+ {
+ log() << "resync: cloning database " << db << " to get an initial copy" << endl;
+ ReplInfo r("resync: cloning a database");
+ string errmsg;
+ int errCode = 0;
+ bool ok = cloneFrom(hostName.c_str(), errmsg, cc().database()->name, false, /*slaveOk*/ true, /*replauth*/ true, /*snapshot*/false, /*mayYield*/true, /*mayBeInterrupted*/false, &errCode);
+ if ( !ok ) {
+ if ( errCode == DatabaseDifferCaseCode ) {
+ resyncDrop( db.c_str(), "internal" );
+ log() << "resync: database " << db << " not valid on the master due to a name conflict, dropping." << endl;
+ return;
+ }
+ else {
+ problem() << "resync of " << db << " from " << hostName << " failed " << errmsg << endl;
+ throw SyncException();
+ }
+ }
+ }
+
+ log() << "resync: done with initial clone for db: " << db << endl;
+
+ return;
+ }
+
+ DatabaseIgnorer ___databaseIgnorer;
+
+ void DatabaseIgnorer::doIgnoreUntilAfter( const string &db, const OpTime &futureOplogTime ) {
+ if ( futureOplogTime > _ignores[ db ] ) {
+ _ignores[ db ] = futureOplogTime;
+ }
+ }
+
+ bool DatabaseIgnorer::ignoreAt( const string &db, const OpTime &currentOplogTime ) {
+ if ( _ignores[ db ].isNull() ) {
+ return false;
+ }
+ if ( _ignores[ db ] >= currentOplogTime ) {
+ return true;
+ } else {
+ // The ignore state has expired, so clear it.
+ _ignores.erase( db );
+ return false;
+ }
+ }
+
+ bool ReplSource::handleDuplicateDbName( const BSONObj &op, const char *ns, const char *db ) {
+ if ( dbHolder()._isLoaded( ns, dbpath ) ) {
+ // Database is already present.
+ return true;
+ }
+ BSONElement ts = op.getField( "ts" );
+ if ( ( ts.type() == Date || ts.type() == Timestamp ) && ___databaseIgnorer.ignoreAt( db, ts.date() ) ) {
+ // Database is ignored due to a previous indication that it is
+ // missing from master after optime "ts".
+ return false;
+ }
+ if ( Database::duplicateUncasedName( false, db, dbpath ).empty() ) {
+ // No duplicate database names are present.
+ return true;
+ }
+
+ OpTime lastTime;
+ bool dbOk = false;
+ {
+ dbtemprelease release;
+
+ // We always log an operation after executing it (never before), so
+ // a database list will always be valid as of an oplog entry generated
+ // before it was retrieved.
+
+ BSONObj last = oplogReader.findOne( this->ns().c_str(), Query().sort( BSON( "$natural" << -1 ) ) );
+ if ( !last.isEmpty() ) {
+ BSONElement ts = last.getField( "ts" );
+ massert( 14032, "Invalid 'ts' in remote log", ts.type() == Date || ts.type() == Timestamp );
+ lastTime = OpTime( ts.date() );
+ }
+
+ BSONObj info;
+ bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info );
+ massert( 14033, "Unable to get database list", ok );
+ BSONObjIterator i( info.getField( "databases" ).embeddedObject() );
+ while( i.more() ) {
+ BSONElement e = i.next();
+
+ const char * name = e.embeddedObject().getField( "name" ).valuestr();
+ if ( strcasecmp( name, db ) != 0 )
+ continue;
+
+ if ( strcmp( name, db ) == 0 ) {
+ // The db exists on master, still need to check that no conflicts exist there.
+ dbOk = true;
+ continue;
+ }
+
+ // The master has a db name that conflicts with the requested name.
+ dbOk = false;
+ break;
+ }
+ }
+
+ if ( !dbOk ) {
+ ___databaseIgnorer.doIgnoreUntilAfter( db, lastTime );
+ incompleteCloneDbs.erase(db);
+ addDbNextPass.erase(db);
+ return false;
+ }
+
+ // Check for duplicates again, since we released the lock above.
+ set< string > duplicates;
+ Database::duplicateUncasedName( false, db, dbpath, &duplicates );
+
+ // The database is present on the master and no conflicting databases
+ // are present on the master. Drop any local conflicts.
+ for( set< string >::const_iterator i = duplicates.begin(); i != duplicates.end(); ++i ) {
+ ___databaseIgnorer.doIgnoreUntilAfter( *i, lastTime );
+ incompleteCloneDbs.erase(*i);
+ addDbNextPass.erase(*i);
+ Client::Context ctx(*i);
+ dropDatabase(*i);
+ }
+
+ massert( 14034, "Duplicate database names present after attempting to delete duplicates",
+ Database::duplicateUncasedName( false, db, dbpath ).empty() );
+ return true;
+ }
+
+ void ReplSource::applyOperation(const BSONObj& op) {
+ try {
+ bool failedUpdate = applyOperation_inlock( op );
+ if (failedUpdate) {
+ Sync sync(hostName);
+ if (sync.shouldRetry(op)) {
+ uassert(15914, "Failure retrying initial sync update", !applyOperation_inlock(op));
+ }
+ }
+ }
+ catch ( UserException& e ) {
+ log() << "sync: caught user assertion " << e << " while applying op: " << op << endl;;
+ }
+ catch ( DBException& e ) {
+ log() << "sync: caught db exception " << e << " while applying op: " << op << endl;;
+ }
+
+ }
+
+ /* local.$oplog.main is of the form:
+ { ts: ..., op: <optype>, ns: ..., o: <obj> , o2: <extraobj>, b: <boolflag> }
+ ...
+ see logOp() comments.
+
+ @param alreadyLocked caller already put us in write lock if true
+ */
+ void ReplSource::sync_pullOpLog_applyOperation(BSONObj& op, bool alreadyLocked) {
+        if( logLevel >= 6 ) // op.toString() is expensive so doing this check explicitly
+ log(6) << "processing op: " << op << endl;
+
+ if( op.getStringField("op")[0] == 'n' )
+ return;
+
+ char clientName[MaxDatabaseNameLen];
+ const char *ns = op.getStringField("ns");
+ nsToDatabase(ns, clientName);
+
+ if ( *ns == '.' ) {
+ problem() << "skipping bad op in oplog: " << op.toString() << endl;
+ return;
+ }
+ else if ( *ns == 0 ) {
+ /*if( op.getStringField("op")[0] != 'n' )*/ {
+ problem() << "halting replication, bad op in oplog:\n " << op.toString() << endl;
+ replAllDead = "bad object in oplog";
+ throw SyncException();
+ }
+ //ns = "local.system.x";
+ //nsToDatabase(ns, clientName);
+ }
+
+ if ( !only.empty() && only != clientName )
+ return;
+
+ if( cmdLine.pretouch && !alreadyLocked/*doesn't make sense if in write lock already*/ ) {
+ if( cmdLine.pretouch > 1 ) {
+ /* note: this is bad - should be put in ReplSource. but this is first test... */
+ static int countdown;
+ assert( countdown >= 0 );
+ if( countdown > 0 ) {
+ countdown--; // was pretouched on a prev pass
+ }
+ else {
+ const int m = 4;
+ if( tp.get() == 0 ) {
+ int nthr = min(8, cmdLine.pretouch);
+ nthr = max(nthr, 1);
+ tp.reset( new ThreadPool(nthr) );
+ }
+ vector<BSONObj> v;
+ oplogReader.peek(v, cmdLine.pretouch);
+ unsigned a = 0;
+ while( 1 ) {
+ if( a >= v.size() ) break;
+ unsigned b = a + m - 1; // v[a..b]
+ if( b >= v.size() ) b = v.size() - 1;
+ tp->schedule(pretouchN, v, a, b);
+ DEV cout << "pretouch task: " << a << ".." << b << endl;
+ a += m;
+ }
+ // we do one too...
+ pretouchOperation(op);
+ tp->join();
+ countdown = v.size();
+ }
+ }
+ else {
+ pretouchOperation(op);
+ }
+ }
+
+ scoped_ptr<writelock> lk( alreadyLocked ? 0 : new writelock() );
+
+ if ( replAllDead ) {
+ // hmmm why is this check here and not at top of this function? does it get set between top and here?
+ log() << "replAllDead, throwing SyncException: " << replAllDead << endl;
+ throw SyncException();
+ }
+
+ if ( !handleDuplicateDbName( op, ns, clientName ) ) {
+ return;
+ }
+
+ Client::Context ctx( ns );
+ ctx.getClient()->curop()->reset();
+
+ bool empty = ctx.db()->isEmpty();
+ bool incompleteClone = incompleteCloneDbs.count( clientName ) != 0;
+
+ if( logLevel >= 6 )
+ log(6) << "ns: " << ns << ", justCreated: " << ctx.justCreated() << ", empty: " << empty << ", incompleteClone: " << incompleteClone << endl;
+
+        // always apply admin commands
+ // this is a bit hacky -- the semantics of replication/commands aren't well specified
+ if ( strcmp( clientName, "admin" ) == 0 && *op.getStringField( "op" ) == 'c' ) {
+ applyOperation( op );
+ return;
+ }
+
+ if ( ctx.justCreated() || empty || incompleteClone ) {
+ // we must add to incomplete list now that setClient has been called
+ incompleteCloneDbs.insert( clientName );
+ if ( nClonedThisPass ) {
+                /* we only clone one database per pass, even if many need to be done.  This helps us
+ avoid overflowing the master's transaction log by doing too much work before going
+ back to read more transactions. (Imagine a scenario of slave startup where we try to
+ clone 100 databases in one pass.)
+ */
+ addDbNextPass.insert( clientName );
+ }
+ else {
+ if ( incompleteClone ) {
+ log() << "An earlier initial clone of '" << clientName << "' did not complete, now resyncing." << endl;
+ }
+ save();
+ Client::Context ctx(ns);
+ nClonedThisPass++;
+ resync(ctx.db()->name);
+ addDbNextPass.erase(clientName);
+ incompleteCloneDbs.erase( clientName );
+ }
+ save();
+ }
+ else {
+ applyOperation( op );
+ addDbNextPass.erase( clientName );
+ }
+ }
+
+ void ReplSource::syncToTailOfRemoteLog() {
+ string _ns = ns();
+ BSONObjBuilder b;
+ if ( !only.empty() ) {
+ b.appendRegex("ns", string("^") + only);
+ }
+ BSONObj last = oplogReader.findOne( _ns.c_str(), Query( b.done() ).sort( BSON( "$natural" << -1 ) ) );
+ if ( !last.isEmpty() ) {
+ BSONElement ts = last.getField( "ts" );
+ massert( 10386 , "non Date ts found: " + last.toString(), ts.type() == Date || ts.type() == Timestamp );
+ syncedTo = OpTime( ts.date() );
+ }
+ }
+
+ extern unsigned replApplyBatchSize;
+
+ /* slave: pull some data from the master's oplog
+ note: not yet in db mutex at this point.
+ @return -1 error
+ 0 ok, don't sleep
+ 1 ok, sleep
+ */
+ int ReplSource::sync_pullOpLog(int& nApplied) {
+ int okResultCode = 1;
+ string ns = string("local.oplog.$") + sourceName();
+ log(2) << "repl: sync_pullOpLog " << ns << " syncedTo:" << syncedTo.toStringLong() << '\n';
+
+ bool tailing = true;
+ oplogReader.tailCheck();
+
+ bool initial = syncedTo.isNull();
+
+ if ( !oplogReader.haveCursor() || initial ) {
+ if ( initial ) {
+ // Important to grab last oplog timestamp before listing databases.
+ syncToTailOfRemoteLog();
+ BSONObj info;
+ bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info );
+ massert( 10389 , "Unable to get database list", ok );
+ BSONObjIterator i( info.getField( "databases" ).embeddedObject() );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ string name = e.embeddedObject().getField( "name" ).valuestr();
+ if ( !e.embeddedObject().getBoolField( "empty" ) ) {
+ if ( name != "local" ) {
+ if ( only.empty() || only == name ) {
+ log( 2 ) << "adding to 'addDbNextPass': " << name << endl;
+ addDbNextPass.insert( name );
+ }
+ }
+ }
+ }
+ dblock lk;
+ save();
+ }
+
+ BSONObjBuilder q;
+ q.appendDate("$gte", syncedTo.asDate());
+ BSONObjBuilder query;
+ query.append("ts", q.done());
+ if ( !only.empty() ) {
+                // note we may here skip a LOT of data table scanning, which would be a lot of work for the master.
+ query.appendRegex("ns", string("^") + only); // maybe append "\\." here?
+ }
+ BSONObj queryObj = query.done();
+ // e.g. queryObj = { ts: { $gte: syncedTo } }
+
+ oplogReader.tailingQuery(ns.c_str(), queryObj);
+ tailing = false;
+ }
+ else {
+ log(2) << "repl: tailing=true\n";
+ }
+
+ if( !oplogReader.haveCursor() ) {
+ problem() << "repl: dbclient::query returns null (conn closed?)" << endl;
+ oplogReader.resetConnection();
+ return -1;
+ }
+
+        // replay any deferred database creates from a previous pass
+ {
+ set<string>::iterator i = addDbNextPass.begin();
+ if ( i != addDbNextPass.end() ) {
+ BSONObjBuilder b;
+ b.append("ns", *i + '.');
+ b.append("op", "db");
+ BSONObj op = b.done();
+ sync_pullOpLog_applyOperation(op, false);
+ }
+ }
+
+ if ( !oplogReader.more() ) {
+ if ( tailing ) {
+ log(2) << "repl: tailing & no new activity\n";
+ if( oplogReader.awaitCapable() )
+ okResultCode = 0; // don't sleep
+
+ }
+ else {
+ log() << "repl: " << ns << " oplog is empty\n";
+ }
+ {
+ dblock lk;
+ save();
+ }
+ return okResultCode;
+ }
+
+ OpTime nextOpTime;
+ {
+ BSONObj op = oplogReader.next();
+ BSONElement ts = op.getField("ts");
+ if ( ts.type() != Date && ts.type() != Timestamp ) {
+ string err = op.getStringField("$err");
+ if ( !err.empty() ) {
+ // 13051 is "tailable cursor requested on non capped collection"
+ if (op.getIntField("code") == 13051) {
+ problem() << "trying to slave off of a non-master" << '\n';
+ massert( 13344 , "trying to slave off of a non-master", false );
+ }
+ else {
+ problem() << "repl: $err reading remote oplog: " + err << '\n';
+ massert( 10390 , "got $err reading remote oplog", false );
+ }
+ }
+ else {
+ problem() << "repl: bad object read from remote oplog: " << op.toString() << '\n';
+ massert( 10391 , "repl: bad object read from remote oplog", false);
+ }
+ }
+
+ nextOpTime = OpTime( ts.date() );
+ log(2) << "repl: first op time received: " << nextOpTime.toString() << '\n';
+ if ( initial ) {
+ log(1) << "repl: initial run\n";
+ }
+ if( tailing ) {
+ if( !( syncedTo < nextOpTime ) ) {
+ log() << "repl ASSERTION failed : syncedTo < nextOpTime" << endl;
+ log() << "repl syncTo: " << syncedTo.toStringLong() << endl;
+ log() << "repl nextOpTime: " << nextOpTime.toStringLong() << endl;
+ assert(false);
+ }
+ oplogReader.putBack( op ); // op will be processed in the loop below
+ nextOpTime = OpTime(); // will reread the op below
+ }
+ else if ( nextOpTime != syncedTo ) { // didn't get what we queried for - error
+ Nullstream& l = log();
+ l << "repl: nextOpTime " << nextOpTime.toStringLong() << ' ';
+ if ( nextOpTime < syncedTo )
+ l << "<??";
+ else
+ l << ">";
+
+ l << " syncedTo " << syncedTo.toStringLong() << '\n';
+ log() << "repl: time diff: " << (nextOpTime.getSecs() - syncedTo.getSecs()) << "sec\n";
+ log() << "repl: tailing: " << tailing << '\n';
+ log() << "repl: data too stale, halting replication" << endl;
+ replInfo = replAllDead = "data too stale halted replication";
+ assert( syncedTo < nextOpTime );
+ throw SyncException();
+ }
+ else {
+ /* t == syncedTo, so the first op was applied previously or it is the first op of the initial query and need not be applied. */
+ }
+ }
+
+ // apply operations
+ {
+ int n = 0;
+ time_t saveLast = time(0);
+ while ( 1 ) {
+
+ bool moreInitialSyncsPending = !addDbNextPass.empty() && n; // we need "&& n" to ensure we actually process at least one op, so a sync point gets recorded in the first place.
+
+ if ( moreInitialSyncsPending || !oplogReader.more() ) {
+ dblock lk;
+
+ // NOTE aaron 2011-03-29 This block may be unnecessary, but I'm leaving it in place to avoid changing timing behavior.
+ {
+ dbtemprelease t;
+ if ( !moreInitialSyncsPending && oplogReader.more() ) {
+ continue;
+ }
+ // otherwise, break out of loop so we can set to completed or clone more dbs
+ }
+
+ if( oplogReader.awaitCapable() && tailing )
+ okResultCode = 0; // don't sleep
+ syncedTo = nextOpTime;
+ save(); // note how far we are synced up to now
+ log() << "repl: applied " << n << " operations" << endl;
+ nApplied = n;
+ log() << "repl: end sync_pullOpLog syncedTo: " << syncedTo.toStringLong() << endl;
+ break;
+ }
+
+ OCCASIONALLY if( n > 0 && ( n > 100000 || time(0) - saveLast > 60 ) ) {
+ // periodically note our progress, in case we are doing a lot of work and crash
+ dblock lk;
+ syncedTo = nextOpTime;
+ // can't update local log ts since there are pending operations from our peer
+ save();
+ log() << "repl: checkpoint applied " << n << " operations" << endl;
+ log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl;
+ saveLast = time(0);
+ n = 0;
+ }
+
+ BSONObj op = oplogReader.next();
+
+ unsigned b = replApplyBatchSize;
+ bool justOne = b == 1;
+ scoped_ptr<writelock> lk( justOne ? 0 : new writelock() );
+ while( 1 ) {
+
+ BSONElement ts = op.getField("ts");
+ if( !( ts.type() == Date || ts.type() == Timestamp ) ) {
+ log() << "sync error: problem querying remote oplog record" << endl;
+ log() << "op: " << op.toString() << endl;
+ log() << "halting replication" << endl;
+ replInfo = replAllDead = "sync error: no ts found querying remote oplog record";
+ throw SyncException();
+ }
+ OpTime last = nextOpTime;
+ nextOpTime = OpTime( ts.date() );
+ if ( !( last < nextOpTime ) ) {
+ log() << "sync error: last applied optime at slave >= nextOpTime from master" << endl;
+ log() << " last: " << last.toStringLong() << endl;
+ log() << " nextOpTime: " << nextOpTime.toStringLong() << endl;
+ log() << " halting replication" << endl;
+ replInfo = replAllDead = "sync error last >= nextOpTime";
+ uassert( 10123 , "replication error last applied optime at slave >= nextOpTime from master", false);
+ }
+ if ( replSettings.slavedelay && ( unsigned( time( 0 ) ) < nextOpTime.getSecs() + replSettings.slavedelay ) ) {
+ assert( justOne );
+ oplogReader.putBack( op );
+ _sleepAdviceTime = nextOpTime.getSecs() + replSettings.slavedelay + 1;
+ dblock lk;
+ if ( n > 0 ) {
+ syncedTo = last;
+ save();
+ }
+ log() << "repl: applied " << n << " operations" << endl;
+ log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl;
+ log() << "waiting until: " << _sleepAdviceTime << " to continue" << endl;
+ return okResultCode;
+ }
+
+ sync_pullOpLog_applyOperation(op, !justOne);
+ n++;
+
+ if( --b == 0 )
+ break;
+ // if we get here, we are doing multiple applications in a single write lock acquisition
+ if( !oplogReader.moreInCurrentBatch() ) {
+ // break if no more in batch so we release lock while reading from the master
+ break;
+ }
+ op = oplogReader.next();
+
+ getDur().commitIfNeeded();
+ }
+ }
+ }
+
+ return okResultCode;
+ }
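+
+ /* batching sketch for the loop above (with replApplyBatchSize == b):
+ b == 1 : op1 | op2 | op3 ... one write lock acquisition per op
+ b == 8 : op1 op2 ... op8 ... up to 8 ops applied per write lock acquisition,
+ stopping early when the current cursor batch runs dry so the lock is
+ released while we read more from the master. slavedelay forces b == 1.
+ */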
+
+ BSONObj userReplQuery = fromjson("{\"user\":\"repl\"}");
+
+ bool replAuthenticate(DBClientBase *conn) {
+ if( noauth ) {
+ return true;
+ }
+ if( ! cc().isAdmin() ) {
+ log() << "replauthenticate: requires admin permissions, failing\n";
+ return false;
+ }
+
+ string u;
+ string p;
+ if (internalSecurity.pwd.length() > 0) {
+ u = internalSecurity.user;
+ p = internalSecurity.pwd;
+ }
+ else {
+ BSONObj user;
+ {
+ dblock lk;
+ Client::Context ctxt("local.");
+ if( !Helpers::findOne("local.system.users", userReplQuery, user) ||
+ // try the first user in local
+ !Helpers::getSingleton("local.system.users", user) ) {
+ log() << "replauthenticate: no user in local.system.users to use for authentication\n";
+ return false;
+ }
+ }
+ u = user.getStringField("user");
+ p = user.getStringField("pwd");
+ massert( 10392 , "bad user object? [1]", !u.empty());
+ massert( 10393 , "bad user object? [2]", !p.empty());
+ }
+
+ string err;
+ if( !conn->auth("local", u.c_str(), p.c_str(), err, false) ) {
+ log() << "replauthenticate: can't authenticate to master server, user:" << u << endl;
+ return false;
+ }
+ return true;
+ }
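+
+ /* for reference, a minimal local.system.users document satisfying the lookup above
+ might look like (hypothetical values):
+ { user: "repl", pwd: "<md5 digest as stored by addUser>" }
+ note pwd is the stored digest, not the cleartext password.
+ */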
+
+ bool replHandshake(DBClientConnection *conn) {
+
+ string myname = getHostName();
+
+ BSONObj me;
+ {
+
+ dblock l;
+ // local.me is an identifier for a server for getLastError w:2+
+ if ( ! Helpers::getSingleton( "local.me" , me ) ||
+ ! me.hasField("host") ||
+ me["host"].String() != myname ) {
+
+ // clean out local.me
+ Helpers::emptyCollection("local.me");
+
+ // repopulate
+ BSONObjBuilder b;
+ b.appendOID( "_id" , 0 , true );
+ b.append( "host", myname );
+ me = b.obj();
+ Helpers::putSingleton( "local.me" , me );
+ }
+ }
+
+ BSONObjBuilder cmd;
+ cmd.appendAs( me["_id"] , "handshake" );
+ if (theReplSet) {
+ cmd.append("member", theReplSet->selfId());
+ }
+
+ BSONObj res;
+ bool ok = conn->runCommand( "admin" , cmd.obj() , res );
+ // ignoring the result on purpose for now, for compatibility with older versions
+ log(ok) << "replHandshake result: " << ok << " res: " << res << endl;
+ return true;
+ }
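+
+ /* the handshake command built above has the shape (illustrative ObjectId):
+ { handshake: ObjectId("4f0f2d8c..."), member: <replset member id, if any> }
+ the master uses the _id to track this slave for w:N getLastError accounting.
+ */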
+
+ bool OplogReader::commonConnect(const string& hostName) {
+ if( conn() == 0 ) {
+ _conn = shared_ptr<DBClientConnection>(new DBClientConnection( false, 0, 0 /* tcp timeout */));
+ string errmsg;
+ ReplInfo r("trying to connect to sync source");
+ if ( !_conn->connect(hostName.c_str(), errmsg) ||
+ (!noauth && !replAuthenticate(_conn.get())) ) {
+ resetConnection();
+ log() << "repl: " << errmsg << endl;
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool OplogReader::connect(string hostName) {
+ if (conn() != 0) {
+ return true;
+ }
+
+ if (commonConnect(hostName)) {
+ return replHandshake(_conn.get());
+ }
+ return false;
+ }
+
+ bool OplogReader::connect(const BSONObj& rid, const int from, const string& to) {
+ if (conn() != 0) {
+ return true;
+ }
+ if (commonConnect(to)) {
+ log() << "handshake between " << from << " and " << to << endl;
+ return passthroughHandshake(rid, from);
+ }
+ return false;
+ }
+
+ bool OplogReader::passthroughHandshake(const BSONObj& rid, const int f) {
+ BSONObjBuilder cmd;
+ cmd.appendAs( rid["_id"], "handshake" );
+ cmd.append( "member" , f );
+
+ BSONObj res;
+ return conn()->runCommand( "admin" , cmd.obj() , res );
+ }
+
+ /* note: not yet in mutex at this point.
+ returns >= 0 if ok. return -1 if you want to reconnect.
+ return value of zero indicates no sleep necessary before next call
+ */
+ int ReplSource::sync(int& nApplied) {
+ _sleepAdviceTime = 0;
+ ReplInfo r("sync");
+ if ( !cmdLine.quiet ) {
+ Nullstream& l = log();
+ l << "repl: syncing from ";
+ if( sourceName() != "main" ) {
+ l << "source:" << sourceName() << ' ';
+ }
+ l << "host:" << hostName << endl;
+ }
+ nClonedThisPass = 0;
+
+ // FIXME Handle cases where this db isn't on default port, or default port is spec'd in hostName.
+ if ( (string("localhost") == hostName || string("127.0.0.1") == hostName) && cmdLine.port == CmdLine::DefaultDBPort ) {
+ log() << "repl: can't sync from self (localhost). sources configuration may be wrong." << endl;
+ sleepsecs(5);
+ return -1;
+ }
+
+ if ( !oplogReader.connect(hostName) ) {
+ log(4) << "repl: can't connect to sync source" << endl;
+ return -1;
+ }
+
+ /*
+ // get current mtime at the server.
+ BSONObj o = conn->findOne("admin.$cmd", opTimeQuery);
+ BSONElement e = o.getField("optime");
+ if( e.eoo() ) {
+ log() << "repl: failed to get cur optime from master" << endl;
+ log() << " " << o.toString() << endl;
+ return false;
+ }
+ uassert( 10124 , e.type() == Date );
+ OpTime serverCurTime;
+ serverCurTime.asDate() = e.date();
+ */
+ return sync_pullOpLog(nApplied);
+ }
+
+ /* --------------------------------------------------------------*/
+
+ /*
+ TODO:
+ _ source has autoptr to the cursor
+ _ reuse that cursor when we can
+ */
+
+ /* returns: # of seconds to sleep before next pass
+ 0 = no sleep recommended
+ 1 = special sentinel indicating adaptive sleep recommended
+ */
+ int _replMain(ReplSource::SourceVector& sources, int& nApplied) {
+ {
+ ReplInfo r("replMain load sources");
+ dblock lk;
+ ReplSource::loadAll(sources);
+ replSettings.fastsync = false; // only need this param for initial reset
+ }
+
+ if ( sources.empty() ) {
+ /* replication is not yet configured (for --slave) in local.sources. Poll for
+ config every 20 seconds.
+ */
+ log() << "no source given, add a master to local.sources to start replication" << endl;
+ return 20;
+ }
+
+ int sleepAdvice = 1;
+ for ( ReplSource::SourceVector::iterator i = sources.begin(); i != sources.end(); i++ ) {
+ ReplSource *s = i->get();
+ int res = -1;
+ try {
+ res = s->sync(nApplied);
+ bool moreToSync = s->haveMoreDbsToSync();
+ if( res < 0 ) {
+ sleepAdvice = 3;
+ }
+ else if( moreToSync ) {
+ sleepAdvice = 0;
+ }
+ else if ( s->sleepAdvice() ) {
+ sleepAdvice = s->sleepAdvice();
+ }
+ else
+ sleepAdvice = res;
+ }
+ catch ( const SyncException& ) {
+ log() << "caught SyncException" << endl;
+ return 10;
+ }
+ catch ( AssertionException& e ) {
+ if ( e.severe() ) {
+ log() << "replMain AssertionException " << e.what() << endl;
+ return 60;
+ }
+ else {
+ log() << "repl: AssertionException " << e.what() << '\n';
+ }
+ replInfo = "replMain caught AssertionException";
+ }
+ catch ( const DBException& e ) {
+ log() << "repl: DBException " << e.what() << endl;
+ replInfo = "replMain caught DBException";
+ }
+ catch ( const std::exception &e ) {
+ log() << "repl: std::exception " << e.what() << endl;
+ replInfo = "replMain caught std::exception";
+ }
+ catch ( ... ) {
+ log() << "unexpected exception during replication. replication will halt" << endl;
+ replAllDead = "caught unexpected exception during replication";
+ }
+ if ( res < 0 )
+ s->oplogReader.resetConnection();
+ }
+ return sleepAdvice;
+ }
+
+ void replMain() {
+ ReplSource::SourceVector sources;
+ while ( 1 ) {
+ int s = 0;
+ {
+ dblock lk;
+ if ( replAllDead ) {
+ // throttledForceResyncDead can throw
+ if ( !replSettings.autoresync || !ReplSource::throttledForceResyncDead( "auto" ) ) {
+ log() << "all sources dead: " << replAllDead << ", sleeping for 5 seconds" << endl;
+ break;
+ }
+ }
+ assert( syncing == 0 ); // i.e., there is only one sync thread running. we will want to change/fix this.
+ syncing++;
+ }
+ try {
+ int nApplied = 0;
+ s = _replMain(sources, nApplied);
+ if( s == 1 ) {
+ if( nApplied == 0 ) s = 2;
+ else if( nApplied > 100 ) {
+ // sleep very little - just enough that we aren't truly hammering the master
+ sleepmillis(75);
+ s = 0;
+ }
+ }
+ }
+ catch (...) {
+ out() << "caught exception in _replMain" << endl;
+ s = 4;
+ }
+ {
+ dblock lk;
+ assert( syncing == 1 );
+ syncing--;
+ }
+
+ if( relinquishSyncingSome ) {
+ relinquishSyncingSome = 0;
+ s = 1; // sleep before going back into syncing=1
+ }
+
+ if ( s ) {
+ stringstream ss;
+ ss << "repl: sleep " << s << " sec before next pass";
+ string msg = ss.str();
+ if ( ! cmdLine.quiet )
+ log() << msg << endl;
+ ReplInfo r(msg.c_str());
+ sleepsecs(s);
+ }
+ }
+ }
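+
+ /* sleep-code summary for the loop above: _replMain returns seconds to sleep; the
+ sentinel value 1 is remapped to a 2 second sleep when nothing was applied, to a
+ 75ms pause (s = 0) when more than 100 ops were applied, and otherwise left as a
+ 1 second sleep.
+ */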
+
+ static void replMasterThread() {
+ sleepsecs(4);
+ Client::initThread("replmaster");
+ int toSleep = 10;
+ while( 1 ) {
+
+ sleepsecs( toSleep );
+ /* write a keep-alive like entry to the log. this will make things like
+ printReplicationStatus() and printSlaveReplicationStatus() stay up-to-date
+ even when things are idle.
+ */
+ {
+ writelocktry lk("",1);
+ if ( lk.got() ) {
+ toSleep = 10;
+
+ replLocalAuth();
+
+ try {
+ logKeepalive();
+ }
+ catch(...) {
+ log() << "caught exception in replMasterThread()" << endl;
+ }
+ }
+ else {
+ log(5) << "couldn't logKeepalive" << endl;
+ toSleep = 1;
+ }
+ }
+ }
+ }
+
+ void replSlaveThread() {
+ sleepsecs(1);
+ Client::initThread("replslave");
+ cc().iAmSyncThread();
+
+ {
+ dblock lk;
+ replLocalAuth();
+ }
+
+ while ( 1 ) {
+ try {
+ replMain();
+ sleepsecs(5);
+ }
+ catch ( AssertionException& ) {
+ ReplInfo r("Assertion in replSlaveThread(): sleeping 5 minutes before retry");
+ problem() << "Assertion in replSlaveThread(): sleeping 5 minutes before retry" << endl;
+ sleepsecs(300);
+ }
+ catch ( DBException& e ) {
+ problem() << "exception in replSlaveThread(): " << e.what()
+ << ", sleeping 5 minutes before retry" << endl;
+ sleepsecs(300);
+ }
+ catch ( ... ) {
+ problem() << "error in replSlaveThread(): sleeping 5 minutes before retry" << endl;
+ sleepsecs(300);
+ }
+ }
+ }
+
+ void tempThread() {
+ while ( 1 ) {
+ out() << d.dbMutex.info().isLocked() << endl;
+ sleepmillis(100);
+ }
+ }
+
+ void newRepl();
+ void oldRepl();
+ void startReplSets(ReplSetCmdline*);
+ void startReplication() {
+ /* if we are going to be a replica set, we aren't doing other forms of replication. */
+ if( !cmdLine._replSet.empty() ) {
+ if( replSettings.slave || replSettings.master ) {
+ log() << "***" << endl;
+ log() << "ERROR: can't use --slave or --master replication options with --replSet" << endl;
+ log() << "***" << endl;
+ }
+ newRepl();
+
+ replSet = true;
+ ReplSetCmdline *replSetCmdline = new ReplSetCmdline(cmdLine._replSet);
+ boost::thread t( boost::bind( &startReplSets, replSetCmdline) );
+
+ return;
+ }
+
+ oldRepl();
+
+ /* this was just to see if anything locks for longer than it should -- we need to be careful
+ not to be locked when trying to connect() or query() the other side.
+ */
+ //boost::thread tempt(tempThread);
+
+ if( !replSettings.slave && !replSettings.master )
+ return;
+
+ {
+ dblock lk;
+ replLocalAuth();
+ }
+
+ if ( replSettings.slave ) {
+ assert( replSettings.slave == SimpleSlave );
+ log(1) << "slave=true" << endl;
+ boost::thread repl_thread(replSlaveThread);
+ }
+
+ if ( replSettings.master ) {
+ log(1) << "master=true" << endl;
+ replSettings.master = true;
+ createOplog();
+ boost::thread t(replMasterThread);
+ }
+
+ while( replSettings.fastsync ) // don't allow writes until we've set up from log
+ sleepmillis( 50 );
+ }
+
+ void testPretouch() {
+ int nthr = min(8, 8);
+ nthr = max(nthr, 1);
+ int m = 8 / nthr;
+ ThreadPool tp(nthr);
+ vector<BSONObj> v;
+
+ BSONObj x = BSON( "ns" << "test.foo" << "o" << BSON( "_id" << 1 ) << "op" << "i" );
+
+ v.push_back(x);
+ v.push_back(x);
+ v.push_back(x);
+
+ unsigned a = 0;
+ while( 1 ) {
+ if( a >= v.size() ) break;
+ unsigned b = a + m - 1; // v[a..b]
+ if( b >= v.size() ) b = v.size() - 1;
+ tp.schedule(pretouchN, v, a, b);
+ DEV cout << "pretouch task: " << a << ".." << b << endl;
+ a += m;
+ }
+ tp.join();
+ }
+
+ class ReplApplyBatchSizeValidator : public ParameterValidator {
+ public:
+ ReplApplyBatchSizeValidator() : ParameterValidator( "replApplyBatchSize" ) {}
+
+ virtual bool isValid( BSONElement e , string& errmsg ) const {
+ int b = e.numberInt();
+ if( b < 1 || b > 1024 ) {
+ errmsg = "replApplyBatchSize has to be >= 1 and < 1024";
+ return false;
+ }
+
+ if ( replSettings.slavedelay != 0 && b > 1 ) {
+ errmsg = "can't use a batch size > 1 with slavedelay";
+ return false;
+ }
+ if ( ! replSettings.slave ) {
+ errmsg = "can't set replApplyBatchSize on a non-slave machine";
+ return false;
+ }
+
+ return true;
+ }
+ } replApplyBatchSizeValidator;
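+
+ /* usage sketch (mongo shell, hypothetical value) for the validator above:
+ db.adminCommand({ setParameter: 1, replApplyBatchSize: 8 })
+ accepted only on a slave and only for 1 <= value <= 1024; values above 1 are
+ rejected while slavedelay is set.
+ */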
+
+} // namespace mongo
diff --git a/src/mongo/db/repl.h b/src/mongo/db/repl.h
new file mode 100644
index 00000000000..83242d0a4ce
--- /dev/null
+++ b/src/mongo/db/repl.h
@@ -0,0 +1,199 @@
+// repl.h - replication
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* replication data overview
+
+ at the slave:
+ local.sources { host: ..., source: ..., only: ..., syncedTo: ..., localLogTs: ..., dbsNextPass: { ... }, incompleteCloneDbs: { ... } }
+
+ at the master:
+ local.oplog.$<source>
+*/
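+
+/* example local.sources document (hypothetical values):
+ { host: "db1.example.net:27017", source: "main", syncedTo: Timestamp(1311337000, 1) }
+ only, dbsNextPass and incompleteCloneDbs appear when applicable.
+*/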
+
+#pragma once
+
+#include "pdfile.h"
+#include "db.h"
+#include "dbhelpers.h"
+#include "../client/dbclient.h"
+#include "../util/optime.h"
+#include "oplog.h"
+#include "../util/concurrency/thread_pool.h"
+#include "oplogreader.h"
+#include "cloner.h"
+
+namespace mongo {
+
+ /* is this instance a replication slave?
+ --slave cmd line setting -> SimpleSlave
+ */
+ typedef enum { NotSlave=0, SimpleSlave } SlaveTypes;
+
+ class ReplSettings {
+ public:
+ SlaveTypes slave;
+
+ /** true means we are master and doing replication. if we are not writing to oplog, this won't be true. */
+ bool master;
+
+ bool fastsync;
+
+ bool autoresync;
+
+ int slavedelay;
+
+ set<string> discoveredSeeds;
+ mutex discoveredSeeds_mx;
+
+ BSONObj reconfig;
+
+ ReplSettings()
+ : slave(NotSlave),
+ master(false),
+ fastsync(),
+ autoresync(false),
+ slavedelay(),
+ discoveredSeeds(),
+ discoveredSeeds_mx("ReplSettings::discoveredSeeds") {
+ }
+
+ };
+
+ extern ReplSettings replSettings;
+
+ /* A replication exception */
+ class SyncException : public DBException {
+ public:
+ SyncException() : DBException( "sync exception" , 10001 ) {}
+ };
+
+ /* A Source is a source from which we can pull (replicate) data.
+ stored in collection local.sources.
+
+ Can be a group of things to replicate for several databases.
+
+ { host: ..., source: ..., only: ..., syncedTo: ..., dbsNextPass: { ... }, incompleteCloneDbs: { ... } }
+
+ 'source' defaults to 'main'; support for multiple source names is
+ not done (always use main for now).
+ */
+ class ReplSource {
+ shared_ptr<ThreadPool> tp;
+
+ void resync(string db);
+
+ /** @param alreadyLocked caller already put us in write lock if true */
+ void sync_pullOpLog_applyOperation(BSONObj& op, bool alreadyLocked);
+
+ /* pull some operations from the master's oplog, and apply them.
+ calls sync_pullOpLog_applyOperation
+ */
+ int sync_pullOpLog(int& nApplied);
+
+ /* we only clone one database per pass, even if a lot need to be done. This helps us
+ avoid overflowing the master's transaction log by doing too much work before going
+ back to read more transactions. (Imagine a scenario of slave startup where we try to
+ clone 100 databases in one pass.)
+ */
+ set<string> addDbNextPass;
+
+ set<string> incompleteCloneDbs;
+
+ ReplSource();
+
+ // returns the dummy ns used to do the drop
+ string resyncDrop( const char *db, const char *requester );
+ // call without the db mutex
+ void syncToTailOfRemoteLog();
+ string ns() const { return string( "local.oplog.$" ) + sourceName(); }
+ unsigned _sleepAdviceTime;
+
+ /**
+ * If 'db' is a new database and its name would conflict with that of
+ * an existing database, synchronize these database names with the
+ * master.
+ * @return true iff an op with the specified ns may be applied.
+ */
+ bool handleDuplicateDbName( const BSONObj &op, const char *ns, const char *db );
+
+ public:
+ OplogReader oplogReader;
+
+ void applyOperation(const BSONObj& op);
+ string hostName; // ip addr or hostname plus optionally, ":<port>"
+ string _sourceName; // a logical source name.
+ string sourceName() const { return _sourceName.empty() ? "main" : _sourceName; }
+ string only; // only a certain db. note that in the sources collection, this may not be changed once you start replicating.
+
+ /* the last time point we have already synced up to (in the remote/master's oplog). */
+ OpTime syncedTo;
+
+ int nClonedThisPass;
+
+ typedef vector< shared_ptr< ReplSource > > SourceVector;
+ static void loadAll(SourceVector&);
+ explicit ReplSource(BSONObj);
+
+ /* -1 = error */
+ int sync(int& nApplied);
+
+ void save(); // write ourself to local.sources
+
+ // make a jsobj from our member fields of the form
+ // { host: ..., source: ..., syncedTo: ... }
+ BSONObj jsobj();
+
+ bool operator==(const ReplSource&r) const {
+ return hostName == r.hostName && sourceName() == r.sourceName();
+ }
+ string toString() const { return sourceName() + "@" + hostName; }
+
+ bool haveMoreDbsToSync() const { return !addDbNextPass.empty(); }
+ int sleepAdvice() const {
+ if ( !_sleepAdviceTime )
+ return 0;
+ int wait = _sleepAdviceTime - unsigned( time( 0 ) );
+ return wait > 0 ? wait : 0;
+ }
+
+ static bool throttledForceResyncDead( const char *requester );
+ static void forceResyncDead( const char *requester );
+ void forceResync( const char *requester );
+ };
+
+ bool anyReplEnabled();
+ void appendReplicationInfo( BSONObjBuilder& result , bool authed , int level = 0 );
+
+ /**
+ * Helper class used to set and query an ignore state for a named database.
+ * The ignore state will expire after a specified OpTime.
+ */
+ class DatabaseIgnorer {
+ public:
+ /** Indicate that operations for 'db' should be ignored until after 'futureOplogTime' */
+ void doIgnoreUntilAfter( const string &db, const OpTime &futureOplogTime );
+ /**
+ * Query ignore state of 'db'; if 'currentOplogTime' is after the ignore
+ * limit, the ignore state will be cleared.
+ */
+ bool ignoreAt( const string &db, const OpTime &currentOplogTime );
+ private:
+ map< string, OpTime > _ignores;
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/repl/connections.h b/src/mongo/db/repl/connections.h
new file mode 100644
index 00000000000..3e08f80b047
--- /dev/null
+++ b/src/mongo/db/repl/connections.h
@@ -0,0 +1,128 @@
+// @file
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <map>
+#include "../../client/dbclient.h"
+#include "../security_common.h"
+
+namespace mongo {
+
+ /** here we keep a single connection (with reconnect) for a set of hosts,
+ one each, and allow one user at a time per host. if in use already for that
+ host, we block. so this is an easy way to keep a 1-deep pool of connections
+ that many threads can share.
+
+ thread-safe.
+
+ Example:
+ {
+ ScopedConn c("foo.acme.com:9999");
+ c->runCommand(...);
+ }
+
+ throws exception on connect error (but fine to try again later with a new
+ scopedconn object for same host).
+ */
+ class ScopedConn {
+ public:
+ /** throws assertions if connect failure etc. */
+ ScopedConn(string hostport);
+ ~ScopedConn() {
+ // conLock releases...
+ }
+ void reconnect() {
+ conn()->port().shutdown();
+ connect();
+ }
+
+ /* If we were to run a query and not exhaust the cursor, future use of the connection would be problematic.
+ So what we do here is wrap known safe methods and not allow cursor-style queries at all. This makes
+ ScopedConn limited in functionality but very safe. More non-cursor wrappers can be added here if needed.
+ */
+ bool runCommand(const string &dbname, const BSONObj& cmd, BSONObj &info, int options=0) {
+ return conn()->runCommand(dbname, cmd, info, options);
+ }
+ unsigned long long count(const string &ns) {
+ return conn()->count(ns);
+ }
+ BSONObj findOne(const string &ns, const Query& q, const BSONObj *fieldsToReturn = 0, int queryOptions = 0) {
+ return conn()->findOne(ns, q, fieldsToReturn, queryOptions);
+ }
+
+ private:
+ auto_ptr<scoped_lock> connLock;
+ static mongo::mutex mapMutex;
+ struct X {
+ mongo::mutex z;
+ DBClientConnection cc;
+ bool connected;
+ X() : z("X"), cc(/*reconnect*/ true, 0, /*timeout*/ 10.0), connected(false) {
+ cc._logLevel = 2;
+ }
+ } *x;
+ typedef map<string,ScopedConn::X*> M;
+ static M& _map;
+ DBClientConnection* conn() { return &x->cc; }
+ const string _hostport;
+
+ // we should already be locked...
+ bool connect() {
+ string err;
+ if (!x->cc.connect(_hostport, err)) {
+ log() << "couldn't connect to " << _hostport << ": " << err << rsLog;
+ return false;
+ }
+ x->connected = true;
+
+ // if we cannot authenticate against a member, then either its key file
+ // or our key file has to change. if our key file has to change, we'll
+ // be rebooting. if their file has to change, they'll be rebooted so the
+ // connection created above will go dead, reconnect, and reauth.
+ if (!noauth && !x->cc.auth("local", internalSecurity.user, internalSecurity.pwd, err, false)) {
+ log() << "could not authenticate against " << _hostport << ", " << err << rsLog;
+ return false;
+ }
+
+ return true;
+ }
+ };
+
+ inline ScopedConn::ScopedConn(string hostport) : _hostport(hostport) {
+ bool first = false;
+ {
+ scoped_lock lk(mapMutex);
+ x = _map[_hostport];
+ if( x == 0 ) {
+ x = _map[_hostport] = new X();
+ first = true;
+ connLock.reset( new scoped_lock(x->z) );
+ }
+ }
+
+ // if the connection is already established, just take the per-host lock; otherwise (re)connect
+ if( !first && x->connected ) {
+ connLock.reset( new scoped_lock(x->z) );
+ return;
+ }
+
+ connect();
+ }
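+
+ /* design note: the X entries above are intentionally never freed -- one per distinct
+ hostport for the life of the process -- so the per-host mutex z and connection cc
+ remain valid for any thread still blocked on them.
+ */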
+
+}
diff --git a/src/mongo/db/repl/consensus.cpp b/src/mongo/db/repl/consensus.cpp
new file mode 100644
index 00000000000..3995373f5ef
--- /dev/null
+++ b/src/mongo/db/repl/consensus.cpp
@@ -0,0 +1,449 @@
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../commands.h"
+#include "rs.h"
+#include "multicmd.h"
+
+namespace mongo {
+
+ class CmdReplSetFresh : public ReplSetCommand {
+ public:
+ CmdReplSetFresh() : ReplSetCommand("replSetFresh") { }
+ private:
+
+ bool shouldVeto(const BSONObj& cmdObj, string& errmsg) {
+ unsigned id = cmdObj["id"].Int();
+ const Member* primary = theReplSet->box.getPrimary();
+ const Member* hopeful = theReplSet->findById(id);
+ const Member *highestPriority = theReplSet->getMostElectable();
+
+ if( !hopeful ) {
+ errmsg = str::stream() << "replSet couldn't find member with id " << id;
+ return true;
+ }
+ else if( theReplSet->isPrimary() && theReplSet->lastOpTimeWritten >= hopeful->hbinfo().opTime ) {
+ // hbinfo is not updated, so we have to check the primary's last optime separately
+ errmsg = str::stream() << "I am already primary, " << hopeful->fullName() <<
+ " can try again once I've stepped down";
+ return true;
+ }
+ else if( primary && primary->hbinfo().opTime >= hopeful->hbinfo().opTime ) {
+ // other members might be aware of more up-to-date nodes
+ errmsg = str::stream() << hopeful->fullName() << " is trying to elect itself but " <<
+ primary->fullName() << " is already primary and more up-to-date";
+ return true;
+ }
+ else if( highestPriority && highestPriority->config().priority > hopeful->config().priority) {
+ errmsg = str::stream() << hopeful->fullName() << " has lower priority than " << highestPriority->fullName();
+ return true;
+ }
+
+ // don't veto older versions
+ if (cmdObj["id"].eoo()) {
+ // they won't be looking for the veto field
+ return false;
+ }
+
+ if ( !theReplSet->isElectable(id) ||
+ (highestPriority && highestPriority->config().priority > hopeful->config().priority)) {
+ return true;
+ }
+
+ return false;
+ }
+
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( !check(errmsg, result) )
+ return false;
+
+ if( cmdObj["set"].String() != theReplSet->name() ) {
+ errmsg = "wrong repl set name";
+ return false;
+ }
+ string who = cmdObj["who"].String();
+ int cfgver = cmdObj["cfgver"].Int();
+ OpTime opTime(cmdObj["opTime"].Date());
+
+ bool weAreFresher = false;
+ if( theReplSet->config().version > cfgver ) {
+ log() << "replSet member " << who << " is not yet aware its cfg version " << cfgver << " is stale" << rsLog;
+ result.append("info", "config version stale");
+ weAreFresher = true;
+ }
+ // check not only our own optime, but any other member we can reach
+ else if( opTime < theReplSet->lastOpTimeWritten ||
+ opTime < theReplSet->lastOtherOpTime()) {
+ weAreFresher = true;
+ }
+ result.appendDate("opTime", theReplSet->lastOpTimeWritten.asDate());
+ result.append("fresher", weAreFresher);
+ result.append("veto", shouldVeto(cmdObj, errmsg));
+
+ return true;
+ }
+ } cmdReplSetFresh;
+
+ class CmdReplSetElect : public ReplSetCommand {
+ public:
+ CmdReplSetElect() : ReplSetCommand("replSetElect") { }
+ private:
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( !check(errmsg, result) )
+ return false;
+ theReplSet->elect.electCmdReceived(cmdObj, &result);
+ return true;
+ }
+ } cmdReplSetElect;
+
+ int Consensus::totalVotes() const {
+ static int complain = 0;
+ int vTot = rs._self->config().votes;
+ for( Member *m = rs.head(); m; m=m->next() )
+ vTot += m->config().votes;
+ if( vTot % 2 == 0 && vTot && complain++ == 0 )
+ log() << "replSet " /*buildbot! warning */ "total number of votes is even - add arbiter or give one member an extra vote" << rsLog;
+ return vTot;
+ }
+
+ bool Consensus::aMajoritySeemsToBeUp() const {
+ int vUp = rs._self->config().votes;
+ for( Member *m = rs.head(); m; m=m->next() )
+ vUp += m->hbinfo().up() ? m->config().votes : 0;
+ return vUp * 2 > totalVotes();
+ }
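+
+ // worked example: with 5 total votes, vUp * 2 > 5 requires vUp >= 3 (a strict
+ // majority); with an even 4 votes, vUp >= 3 is still required, which is why
+ // totalVotes() warns when the vote count is even.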
+
+ bool Consensus::shouldRelinquish() const {
+ int vUp = rs._self->config().votes;
+ const long long T = rs.config().ho.heartbeatTimeoutMillis * rs.config().ho.heartbeatConnRetries;
+ for( Member *m = rs.head(); m; m=m->next() ) {
+ long long dt = m->hbinfo().timeDown();
+ if( dt < T )
+ vUp += m->config().votes;
+ }
+
+ // the manager will handle calling stepdown if another node should be
+ // primary due to priority
+
+ return !( vUp * 2 > totalVotes() );
+ }
+
+ static const int VETO = -10000;
+
+ const time_t LeaseTime = 30;
+
+ SimpleMutex Consensus::lyMutex("ly");
+
+ unsigned Consensus::yea(unsigned memberId) { /* throws VoteException */
+ SimpleMutex::scoped_lock lk(lyMutex);
+ LastYea &L = this->ly.ref(lk);
+ time_t now = time(0);
+ if( L.when + LeaseTime >= now && L.who != memberId ) {
+ LOG(1) << "replSet not voting yea for " << memberId <<
+ " voted for " << L.who << ' ' << now-L.when << " secs ago" << rsLog;
+ throw VoteException();
+ }
+ L.when = now;
+ L.who = memberId;
+ return rs._self->config().votes;
+ }
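+
+ /* lease illustration: if member 2 received our yea at t=100 and member 3 asks at
+ t=115, then L.when + LeaseTime (130) >= 115 and L.who != 3, so we throw
+ VoteException; member 2 asking again at t=115 simply renews its lease.
+ */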
+
+ /* we vote for ourself at start of election. once it fails, we can cancel the lease we had in
+ place instead of leaving it for a long time.
+ */
+ void Consensus::electionFailed(unsigned meid) {
+ SimpleMutex::scoped_lock lk(lyMutex);
+ LastYea &L = ly.ref(lk);
+ DEV assert( L.who == meid ); // this may not always hold, so be aware, but adding for now as a quick sanity test
+ if( L.who == meid )
+ L.when = 0;
+ }
+
+ /* todo: threading **************** !!!!!!!!!!!!!!!! */
+ void Consensus::electCmdReceived(BSONObj cmd, BSONObjBuilder* _b) {
+ BSONObjBuilder& b = *_b;
+ DEV log() << "replSet received elect msg " << cmd.toString() << rsLog;
+ else LOG(2) << "replSet received elect msg " << cmd.toString() << rsLog;
+ string set = cmd["set"].String();
+ unsigned whoid = cmd["whoid"].Int();
+ int cfgver = cmd["cfgver"].Int();
+ OID round = cmd["round"].OID();
+ int myver = rs.config().version;
+
+ const Member* primary = rs.box.getPrimary();
+ const Member* hopeful = rs.findById(whoid);
+ const Member* highestPriority = rs.getMostElectable();
+
+ int vote = 0;
+ if( set != rs.name() ) {
+ log() << "replSet error received an elect request for '" << set << "' but our set name is '" << rs.name() << "'" << rsLog;
+ }
+ else if( myver < cfgver ) {
+ // we are stale. don't vote
+ }
+ else if( myver > cfgver ) {
+ // they are stale!
+ log() << "replSet electCmdReceived info got stale version # during election" << rsLog;
+ vote = -10000;
+ }
+ else if( !hopeful ) {
+ log() << "replSet electCmdReceived couldn't find member with id " << whoid << rsLog;
+ vote = -10000;
+ }
+ else if( primary && primary == rs._self && rs.lastOpTimeWritten >= hopeful->hbinfo().opTime ) {
+ // hbinfo is not updated, so we have to check the primary's last optime separately
+ log() << "I am already primary, " << hopeful->fullName()
+ << " can try again once I've stepped down" << rsLog;
+ vote = -10000;
+ }
+ else if( primary && primary->hbinfo().opTime >= hopeful->hbinfo().opTime ) {
+ // other members might be aware of more up-to-date nodes
+ log() << hopeful->fullName() << " is trying to elect itself but " <<
+ primary->fullName() << " is already primary and more up-to-date" << rsLog;
+ vote = -10000;
+ }
+ else if( highestPriority && highestPriority->config().priority > hopeful->config().priority) {
+ log() << hopeful->fullName() << " has lower priority than " << highestPriority->fullName();
+ vote = -10000;
+ }
+ else {
+ try {
+ vote = yea(whoid);
+ dassert( hopeful->id() == whoid );
+ rs.relinquish();
+ log() << "replSet info voting yea for " << hopeful->fullName() << " (" << whoid << ')' << rsLog;
+ }
+ catch(VoteException&) {
+ log() << "replSet voting no for " << hopeful->fullName() << " already voted for another" << rsLog;
+ }
+ }
+
+ b.append("vote", vote);
+ b.append("round", round);
+ }
+
+ void ReplSetImpl::_getTargets(list<Target>& L, int& configVersion) {
+ configVersion = config().version;
+ for( Member *m = head(); m; m=m->next() )
+ if( m->hbinfo().maybeUp() )
+ L.push_back( Target(m->fullName()) );
+ }
+
+ /* config version is returned as it is ok to use this unlocked. BUT, if unlocked, you would need
+ to check later that the config didn't change. */
+ void ReplSetImpl::getTargets(list<Target>& L, int& configVersion) {
+ if( lockedByMe() ) {
+ _getTargets(L, configVersion);
+ return;
+ }
+ lock lk(this);
+ _getTargets(L, configVersion);
+ }
+
+ /* Do we have the newest data of them all?
+ @param allUp - set to true if all members are up. Only set if true returned.
+ @return true if we are freshest. Note we may tie.
+ */
+ bool Consensus::weAreFreshest(bool& allUp, int& nTies) {
+ const OpTime ord = theReplSet->lastOpTimeWritten;
+ nTies = 0;
+ assert( !ord.isNull() );
+ BSONObj cmd = BSON(
+ "replSetFresh" << 1 <<
+ "set" << rs.name() <<
+ "opTime" << Date_t(ord.asDate()) <<
+ "who" << rs._self->fullName() <<
+ "cfgver" << rs._cfg->version <<
+ "id" << rs._self->id());
+ list<Target> L;
+ int ver;
+ /* the following queries arbiters, even though they are never fresh. wonder if that makes sense.
+ it doesn't, but it could, if arbiters one day "knew" what freshness is. so consider removing
+ arbiters from getTargets() here. although getTargets is used elsewhere for elections, where
+ arbiters certainly are targets - so an "includeArbs" bool would be necessary if we want to
+ avoid fetching them here.
+ */
+ rs.getTargets(L, ver);
+ multiCommand(cmd, L);
+ int nok = 0;
+ allUp = true;
+ for( list<Target>::iterator i = L.begin(); i != L.end(); i++ ) {
+ if( i->ok ) {
+ nok++;
+ if( i->result["fresher"].trueValue() ) {
+ log() << "not electing self, we are not freshest" << rsLog;
+ return false;
+ }
+ OpTime remoteOrd( i->result["opTime"].Date() );
+ if( remoteOrd == ord )
+ nTies++;
+ assert( remoteOrd <= ord );
+
+ if( i->result["veto"].trueValue() ) {
+ BSONElement msg = i->result["errmsg"];
+ if (!msg.eoo()) {
+ log() << "not electing self, " << i->toHost << " would veto with '" <<
+ msg.String() << "'" << rsLog;
+ }
+ else {
+ log() << "not electing self, " << i->toHost << " would veto" << rsLog;
+ }
+ return false;
+ }
+ }
+ else {
+ DEV log() << "replSet freshest returns " << i->result.toString() << rsLog;
+ allUp = false;
+ }
+ }
+ LOG(1) << "replSet dev we are freshest of up nodes, nok:" << nok << " nTies:" << nTies << rsLog;
+ assert( ord <= theReplSet->lastOpTimeWritten ); // <= as this may change while we are working...
+ return true;
+ }
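+
+ /* a typical replSetFresh reply examined above looks like (illustrative):
+ { opTime: Date(...), fresher: false, veto: false, ok: 1 }
+ any reachable member replying fresher:true or veto:true aborts the election attempt.
+ */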
+
+ extern time_t started;
+
+ void Consensus::multiCommand(BSONObj cmd, list<Target>& L) {
+ assert( !rs.lockedByMe() );
+ mongo::multiCommand(cmd, L);
+ }
+
+ void Consensus::_electSelf() {
+ if( time(0) < steppedDown )
+ return;
+
+ {
+ const OpTime ord = theReplSet->lastOpTimeWritten;
+ if( ord == 0 ) {
+ log() << "replSet info not trying to elect self, do not yet have a complete set of data from any point in time" << rsLog;
+ return;
+ }
+ }
+
+ bool allUp;
+ int nTies;
+ if( !weAreFreshest(allUp, nTies) ) {
+ return;
+ }
+
+ rs.sethbmsg("",9);
+
+ if( !allUp && time(0) - started < 60 * 5 ) {
+ /* the idea here is that if a bunch of nodes bounce all at once, we don't want to drop data
+ if we don't have to -- we'd rather be offline and wait a little longer instead
+ todo: make this configurable.
+ */
+ rs.sethbmsg("not electing self, not all members up and we have been up less than 5 minutes");
+ return;
+ }
+
+ Member& me = *rs._self;
+
+ if( nTies ) {
+ /* tie? we then randomly sleep to try to not collide on our voting. */
+ /* todo: smarter. */
+ if( me.id() == 0 || sleptLast ) {
+ // would be fine for one node not to sleep
+ // todo: biggest / highest priority nodes should be the ones that get to not sleep
+ }
+ else {
+ assert( !rs.lockedByMe() ); // bad to go to sleep locked
+ unsigned ms = ((unsigned) rand()) % 1000 + 50;
+ DEV log() << "replSet tie " << nTies << " sleeping a little " << ms << "ms" << rsLog;
+ sleptLast = true;
+ sleepmillis(ms);
+ throw RetryAfterSleepException();
+ }
+ }
+ sleptLast = false;
+
+ time_t start = time(0);
+ unsigned meid = me.id();
+ int tally = yea( meid );
+ bool success = false;
+ try {
+ log() << "replSet info electSelf " << meid << rsLog;
+
+ BSONObj electCmd = BSON(
+ "replSetElect" << 1 <<
+ "set" << rs.name() <<
+ "who" << me.fullName() <<
+ "whoid" << me.hbinfo().id() <<
+ "cfgver" << rs._cfg->version <<
+ "round" << OID::gen() /* this is just for diagnostics */
+ );
+
+ int configVersion;
+ list<Target> L;
+ rs.getTargets(L, configVersion);
+ multiCommand(electCmd, L);
+
+ {
+ for( list<Target>::iterator i = L.begin(); i != L.end(); i++ ) {
+ DEV log() << "replSet elect res: " << i->result.toString() << rsLog;
+ if( i->ok ) {
+ int v = i->result["vote"].Int();
+ tally += v;
+ }
+ }
+ if( tally*2 <= totalVotes() ) {
+ log() << "replSet couldn't elect self, only received " << tally << " votes" << rsLog;
+ }
+ else if( time(0) - start > 30 ) {
+ // defensive; should never happen as we have timeouts on connection and operation for our conn
+ log() << "replSet too much time passed during our election, ignoring result" << rsLog;
+ }
+ else if( configVersion != rs.config().version ) {
+ log() << "replSet config version changed during our election, ignoring result" << rsLog;
+ }
+ else {
+ /* succeeded. */
+ log(1) << "replSet election succeeded, assuming primary role" << rsLog;
+ success = true;
+ rs.assumePrimary();
+ }
+ }
+ }
+ catch( std::exception& ) {
+ if( !success ) electionFailed(meid);
+ throw;
+ }
+ if( !success ) electionFailed(meid);
+ }
+
+ void Consensus::electSelf() {
+ assert( !rs.lockedByMe() );
+ assert( !rs.myConfig().arbiterOnly );
+ assert( rs.myConfig().slaveDelay == 0 );
+ try {
+ _electSelf();
+ }
+ catch(RetryAfterSleepException&) {
+ throw;
+ }
+ catch(VoteException& ) {
+ log() << "replSet not trying to elect self as responded yea to someone else recently" << rsLog;
+ }
+ catch(DBException& e) {
+ log() << "replSet warning caught unexpected exception in electSelf() " << e.toString() << rsLog;
+ }
+ catch(...) {
+ log() << "replSet warning caught unexpected exception in electSelf()" << rsLog;
+ }
+ }
+
+}
diff --git a/src/mongo/db/repl/health.cpp b/src/mongo/db/repl/health.cpp
new file mode 100644
index 00000000000..0b7ed87eac3
--- /dev/null
+++ b/src/mongo/db/repl/health.cpp
@@ -0,0 +1,449 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "rs.h"
+#include "health.h"
+#include "../../util/background.h"
+#include "../../client/dbclient.h"
+#include "../../client/connpool.h"
+#include "../commands.h"
+#include "../../util/concurrency/value.h"
+#include "../../util/concurrency/task.h"
+#include "../../util/mongoutils/html.h"
+#include "../../util/goodies.h"
+#include "../../util/ramlog.h"
+#include "../helpers/dblogger.h"
+#include "connections.h"
+#include "../../util/unittest.h"
+#include "../dbhelpers.h"
+
+namespace mongo {
+ /* decls for connections.h */
+ ScopedConn::M& ScopedConn::_map = *(new ScopedConn::M());
+ mutex ScopedConn::mapMutex("ScopedConn::mapMutex");
+}
+
+namespace mongo {
+
+ using namespace mongoutils::html;
+ using namespace bson;
+
+ static RamLog * _rsLog = new RamLog( "rs" );
+ Tee *rsLog = _rsLog;
+ extern bool replSetBlind; // for testing
+
+ string ago(time_t t) {
+ if( t == 0 ) return "";
+
+ time_t x = time(0) - t;
+ stringstream s;
+ if( x < 180 ) {
+ s << x << " sec";
+ if( x != 1 ) s << 's';
+ }
+ else if( x < 3600 ) {
+ s.precision(2);
+ s << x / 60.0 << " mins";
+ }
+ else {
+ s.precision(2);
+ s << x / 3600.0 << " hrs";
+ }
+ return s.str();
+ }
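+
+ // sample outputs: ago(now - 42) -> "42 secs", ago(now - 600) -> "10 mins",
+ // ago(now - 7200) -> "2 hrs"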
+
+ void Member::summarizeMember(stringstream& s) const {
+ s << tr();
+ {
+ stringstream u;
+ u << "http://" << h().host() << ':' << (h().port() + 1000) << "/_replSet";
+ s << td( a(u.str(), "", fullName()) );
+ }
+ s << td( id() );
+ double h = hbinfo().health;
+ bool ok = h > 0;
+ s << td(red(str::stream() << h,h == 0));
+ s << td(ago(hbinfo().upSince));
+ bool never = false;
+ {
+ string h;
+ time_t hb = hbinfo().lastHeartbeat;
+ if( hb == 0 ) {
+ h = "never";
+ never = true;
+ }
+ else h = ago(hb) + " ago";
+ s << td(h);
+ }
+ s << td(config().votes);
+ s << td(config().priority);
+ {
+ string stateText = state().toString();
+ if( _config.hidden )
+ stateText += " (hidden)";
+ if( ok || stateText.empty() )
+ s << td(stateText); // text blank if we've never connected
+ else
+ s << td( grey(str::stream() << "(was " << state().toString() << ')', true) );
+ }
+ s << td( grey(hbinfo().lastHeartbeatMsg,!ok) );
+ stringstream q;
+ q << "/_replSetOplog?_id=" << id();
+ s << td( a(q.str(), "", never ? "?" : hbinfo().opTime.toString()) );
+ if( hbinfo().skew > INT_MIN ) {
+ s << td( grey(str::stream() << hbinfo().skew,!ok) );
+ }
+ else
+ s << td("");
+ s << _tr();
+ }
+
+ string ReplSetImpl::stateAsHtml(MemberState s) {
+ if( s.s == MemberState::RS_STARTUP ) return a("", "server still starting up, or still trying to initiate the set", "STARTUP");
+ if( s.s == MemberState::RS_PRIMARY ) return a("", "this server thinks it is primary", "PRIMARY");
+ if( s.s == MemberState::RS_SECONDARY ) return a("", "this server thinks it is a secondary (slave mode)", "SECONDARY");
+ if( s.s == MemberState::RS_RECOVERING ) return a("", "recovering/resyncing; after recovery usually auto-transitions to secondary", "RECOVERING");
+ if( s.s == MemberState::RS_FATAL ) return a("", "something bad has occurred and server is not completely offline with regard to the replica set. fatal error.", "FATAL");
+ if( s.s == MemberState::RS_STARTUP2 ) return a("", "loaded config, still determining who is primary", "STARTUP2");
+ if( s.s == MemberState::RS_ARBITER ) return a("", "this server is an arbiter only", "ARBITER");
+ if( s.s == MemberState::RS_DOWN ) return a("", "member is down, slow, or unreachable", "DOWN");
+ if( s.s == MemberState::RS_ROLLBACK ) return a("", "rolling back operations to get in sync", "ROLLBACK");
+ return "";
+ }
+
+ extern time_t started;
+
+ // oplogdiags in web ui
+ static void say(stringstream&ss, const bo& op) {
+ ss << "<tr>";
+
+ set<string> skip;
+ be e = op["ts"];
+ if( e.type() == Date || e.type() == Timestamp ) {
+ OpTime ot = e._opTime();
+ ss << td( time_t_to_String_short( ot.getSecs() ) );
+ ss << td( ot.toString() );
+ skip.insert("ts");
+ }
+ else ss << td("?") << td("?");
+
+ e = op["h"];
+ if( e.type() == NumberLong ) {
+ ss << "<td>" << hex << e.Long() << "</td>\n";
+ skip.insert("h");
+ }
+ else
+ ss << td("?");
+
+ ss << td(op["op"].valuestrsafe());
+ ss << td(op["ns"].valuestrsafe());
+ skip.insert("op");
+ skip.insert("ns");
+
+ ss << "<td>";
+ for( bo::iterator i(op); i.more(); ) {
+ be e = i.next();
+ if( skip.count(e.fieldName()) ) continue;
+ ss << e.toString() << ' ';
+ }
+ ss << "</td></tr>\n";
+ }
+
+ void ReplSetImpl::_getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const {
+ const Member *m = findById(server_id);
+ if( m == 0 ) {
+ ss << "Error : can't find a member with id: " << server_id << '\n';
+ return;
+ }
+
+ ss << p("Server : " + m->fullName() + "<br>ns : " + rsoplog );
+
+ //const bo fields = BSON( "o" << false << "o2" << false );
+ const bo fields;
+
+ /** todo fix: we might want a socket timeout here */
+ DBClientConnection conn(false, 0, /*timeout*/ 20);
+ {
+ string errmsg;
+ if( !conn.connect(m->fullName(), errmsg) ) {
+ ss << "couldn't connect to " << m->fullName() << ' ' << errmsg;
+ return;
+ }
+ }
+
+ auto_ptr<DBClientCursor> c = conn.query(rsoplog, Query().sort("$natural",1), 20, 0, &fields);
+ if( c.get() == 0 ) {
+ ss << "couldn't query " << rsoplog;
+ return;
+ }
+ static const char *h[] = {"ts","optime", "h","op","ns","rest",0};
+
+ ss << "<style type=\"text/css\" media=\"screen\">"
+ "table { font-size:75% }\n"
+ // "th { background-color:#bbb; color:#000 }\n"
+ // "td,th { padding:.25em }\n"
+ "</style>\n";
+
+ ss << table(h, true);
+ //ss << "<pre>\n";
+ int n = 0;
+ OpTime otFirst;
+ OpTime otLast;
+ OpTime otEnd;
+ while( c->more() ) {
+ bo o = c->next();
+ otLast = o["ts"]._opTime();
+ if( otFirst.isNull() )
+ otFirst = otLast;
+ say(ss, o);
+ n++;
+ }
+ if( n == 0 ) {
+ ss << rsoplog << " is empty\n";
+ }
+ else {
+ auto_ptr<DBClientCursor> c = conn.query(rsoplog, Query().sort("$natural",-1), 20, 0, &fields);
+ if( c.get() == 0 ) {
+ ss << "couldn't query [2] " << rsoplog;
+ return;
+ }
+ string x;
+ bo o = c->next();
+ otEnd = o["ts"]._opTime();
+ while( 1 ) {
+ stringstream z;
+ if( o["ts"]._opTime() == otLast )
+ break;
+ say(z, o);
+ x = z.str() + x;
+ if( !c->more() )
+ break;
+ o = c->next();
+ }
+ if( !x.empty() ) {
+ ss << "<tr><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td></tr>\n" << x;
+ //ss << "\n...\n\n" << x;
+ }
+ }
+ ss << _table();
+ ss << p(time_t_to_String_short(time(0)) + " current time");
+
+ if( !otEnd.isNull() ) {
+ ss << "<p>Log length in time: ";
+ unsigned d = otEnd.getSecs() - otFirst.getSecs();
+ double h = d / 3600.0;
+ ss.precision(3);
+ if( h < 72 )
+ ss << h << " hours";
+ else
+ ss << h / 24.0 << " days";
+ ss << "</p>\n";
+ }
+ }
+
+ void ReplSetImpl::_summarizeAsHtml(stringstream& s) const {
+ s << table(0, false);
+ s << tr("Set name:", _name);
+ s << tr("Majority up:", elect.aMajoritySeemsToBeUp()?"yes":"no" );
+ s << _table();
+
+ const char *h[] = {"Member",
+ "<a title=\"member id in the replset config\">id</a>",
+ "Up",
+ "<a title=\"length of time we have been continuously connected to the other member with no reconnects (for self, shows uptime)\">cctime</a>",
+ "<a title=\"when this server last received a heartbeat response - includes error code responses\">Last heartbeat</a>",
+ "Votes", "Priority", "State", "Messages",
+ "<a title=\"how up to date this server is. this value polled every few seconds so actually lag is typically much lower than value shown here.\">optime</a>",
+ "<a title=\"Clock skew in seconds relative to this server. Informational; server clock variances will make the diagnostics hard to read, but otherwise are benign..\">skew</a>",
+ 0
+ };
+ s << table(h);
+
+ /* this is to sort the member rows by their ordinal _id, so they show up in the same
+ order on all the different web ui's; that is less confusing for the operator. */
+ map<int,string> mp;
+
+ string myMinValid;
+ try {
+ readlocktry lk("local.replset.minvalid", 300);
+ if( lk.got() ) {
+ BSONObj mv;
+ if( Helpers::getSingleton("local.replset.minvalid", mv) ) {
+ myMinValid = "minvalid:" + mv["ts"]._opTime().toString();
+ }
+ }
+ else myMinValid = ".";
+ }
+ catch(...) {
+ myMinValid = "exception fetching minvalid";
+ }
+
+ const Member *_self = this->_self;
+ assert(_self);
+ {
+ stringstream s;
+ /* self row */
+ s << tr() << td(_self->fullName() + " (me)") <<
+ td(_self->id()) <<
+ td("1") << //up
+ td(ago(started)) <<
+ td("") << // last heartbeat
+ td(ToString(_self->config().votes)) <<
+ td(ToString(_self->config().priority)) <<
+ td( stateAsHtml(box.getState()) + (_self->config().hidden?" (hidden)":"") );
+ s << td( _hbmsg );
+ stringstream q;
+ q << "/_replSetOplog?_id=" << _self->id();
+ s << td( a(q.str(), myMinValid, theReplSet->lastOpTimeWritten.toString()) );
+ s << td(""); // skew
+ s << _tr();
+ mp[_self->hbinfo().id()] = s.str();
+ }
+ Member *m = head();
+ while( m ) {
+ stringstream s;
+ m->summarizeMember(s);
+ mp[m->hbinfo().id()] = s.str();
+ m = m->next();
+ }
+
+ for( map<int,string>::const_iterator i = mp.begin(); i != mp.end(); i++ )
+ s << i->second;
+ s << _table();
+ }
+
+
+ void fillRsLog(stringstream& s) {
+ _rsLog->toHTML( s );
+ }
+
+ const Member* ReplSetImpl::findById(unsigned id) const {
+ if( _self && id == _self->id() ) return _self;
+
+ for( Member *m = head(); m; m = m->next() )
+ if( m->id() == id )
+ return m;
+ return 0;
+ }
+
+ const OpTime ReplSetImpl::lastOtherOpTime() const {
+ OpTime closest(0,0);
+
+ for( Member *m = _members.head(); m; m=m->next() ) {
+ if (!m->hbinfo().up()) {
+ continue;
+ }
+
+ if (m->hbinfo().opTime > closest) {
+ closest = m->hbinfo().opTime;
+ }
+ }
+
+ return closest;
+ }
+
+ void ReplSetImpl::_summarizeStatus(BSONObjBuilder& b) const {
+ vector<BSONObj> v;
+
+ const Member *_self = this->_self;
+ assert( _self );
+
+ MemberState myState = box.getState();
+
+ // add self
+ {
+ BSONObjBuilder bb;
+ bb.append("_id", (int) _self->id());
+ bb.append("name", _self->fullName());
+ bb.append("health", 1.0);
+ bb.append("state", (int)myState.s);
+ bb.append("stateStr", myState.toString());
+ bb.append("uptime", (unsigned)(time(0) - cmdLine.started));
+ if (!_self->config().arbiterOnly) {
+ bb.appendTimestamp("optime", lastOpTimeWritten.asDate());
+ bb.appendDate("optimeDate", lastOpTimeWritten.getSecs() * 1000LL);
+ }
+
+ int maintenance = _maintenanceMode;
+ if (maintenance) {
+ bb.append("maintenanceMode", maintenance);
+ }
+
+ if (theReplSet) {
+ string s = theReplSet->hbmsg();
+ if( !s.empty() )
+ bb.append("errmsg", s);
+ }
+ bb.append("self", true);
+ v.push_back(bb.obj());
+ }
+
+ Member *m =_members.head();
+ while( m ) {
+ BSONObjBuilder bb;
+ bb.append("_id", (int) m->id());
+ bb.append("name", m->fullName());
+ double h = m->hbinfo().health;
+ bb.append("health", h);
+ bb.append("state", (int) m->state().s);
+ if( h == 0 ) {
+ // if we can't connect the state info is from the past and could be confusing to show
+ bb.append("stateStr", "(not reachable/healthy)");
+ }
+ else {
+ bb.append("stateStr", m->state().toString());
+ }
+ bb.append("uptime", (unsigned) (m->hbinfo().upSince ? (time(0)-m->hbinfo().upSince) : 0));
+ if (!m->config().arbiterOnly) {
+ bb.appendTimestamp("optime", m->hbinfo().opTime.asDate());
+ bb.appendDate("optimeDate", m->hbinfo().opTime.getSecs() * 1000LL);
+ }
+ bb.appendTimeT("lastHeartbeat", m->hbinfo().lastHeartbeat);
+ bb.append("pingMs", m->hbinfo().ping);
+ string s = m->lhb();
+ if( !s.empty() )
+ bb.append("errmsg", s);
+
+ if (m->hbinfo().authIssue) {
+ bb.append("authenticated", false);
+ }
+
+ v.push_back(bb.obj());
+ m = m->next();
+ }
+ sort(v.begin(), v.end());
+ b.append("set", name());
+ b.appendTimeT("date", time(0));
+ b.append("myState", myState.s);
+ const Member *syncTarget = _currentSyncTarget;
+ if (syncTarget && myState != MemberState::RS_PRIMARY) {
+ b.append("syncingTo", syncTarget->fullName());
+ }
+ b.append("members", v);
+ if( replSetBlind )
+ b.append("blind",true); // to avoid confusion if set...normally never set except for testing.
+ }
+
+ static struct Test : public UnitTest {
+ void run() {
+ HealthOptions a,b;
+ assert( a == b );
+ assert( a.isDefault() );
+ }
+ } test;
+
+}
diff --git a/src/mongo/db/repl/health.h b/src/mongo/db/repl/health.h
new file mode 100644
index 00000000000..55cca93a27e
--- /dev/null
+++ b/src/mongo/db/repl/health.h
@@ -0,0 +1,50 @@
+// replset.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ /* throws */
+ bool requestHeartbeat(string setname, string fromHost, string memberFullName, BSONObj& result, int myConfigVersion, int& theirConfigVersion, bool checkEmpty = false);
+
+ struct HealthOptions {
+ HealthOptions() :
+ heartbeatSleepMillis(2000),
+ heartbeatTimeoutMillis( 10000 ),
+ heartbeatConnRetries(2)
+ { }
+
+ bool isDefault() const { return *this == HealthOptions(); }
+
+ // see http://www.mongodb.org/display/DOCS/Replica+Set+Internals
+ unsigned heartbeatSleepMillis;
+ unsigned heartbeatTimeoutMillis;
+ unsigned heartbeatConnRetries;
+
+ void check() {
+ uassert(13112, "bad replset heartbeat option", heartbeatSleepMillis >= 10);
+ uassert(13113, "bad replset heartbeat option", heartbeatTimeoutMillis >= 10);
+ }
+
+ bool operator==(const HealthOptions& r) const {
+ return heartbeatSleepMillis==r.heartbeatSleepMillis && heartbeatTimeoutMillis==r.heartbeatTimeoutMillis && heartbeatConnRetries==r.heartbeatConnRetries;
+ }
+ };
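+
+ /* minimal usage sketch (hypothetical caller):
+ HealthOptions opts;
+ opts.heartbeatSleepMillis = 3000;
+ opts.check(); // uasserts if a value is out of range
+ bool tweaked = !opts.isDefault(); // true here, since we changed a field
+ */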
+
+}
diff --git a/src/mongo/db/repl/heartbeat.cpp b/src/mongo/db/repl/heartbeat.cpp
new file mode 100644
index 00000000000..331812af85a
--- /dev/null
+++ b/src/mongo/db/repl/heartbeat.cpp
@@ -0,0 +1,382 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "rs.h"
+#include "health.h"
+#include "../../util/background.h"
+#include "../../client/dbclient.h"
+#include "../commands.h"
+#include "../../util/concurrency/value.h"
+#include "../../util/concurrency/task.h"
+#include "../../util/concurrency/msg.h"
+#include "../../util/mongoutils/html.h"
+#include "../../util/goodies.h"
+#include "../../util/ramlog.h"
+#include "../helpers/dblogger.h"
+#include "connections.h"
+#include "../../util/unittest.h"
+#include "../instance.h"
+#include "../repl.h"
+
+namespace mongo {
+
+ using namespace bson;
+
+ extern bool replSetBlind;
+ extern ReplSettings replSettings;
+
+ unsigned int HeartbeatInfo::numPings;
+
+ long long HeartbeatInfo::timeDown() const {
+ if( up() ) return 0;
+ if( downSince == 0 )
+ return 0; // still waiting on first heartbeat
+ return jsTime() - downSince;
+ }
+
+ /* { replSetHeartbeat : <setname> } */
+ class CmdReplSetHeartbeat : public ReplSetCommand {
+ public:
+ CmdReplSetHeartbeat() : ReplSetCommand("replSetHeartbeat") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( replSetBlind ) {
+ if (theReplSet) {
+ errmsg = str::stream() << theReplSet->selfFullName() << " is blind";
+ }
+ return false;
+ }
+
+ /* we don't call ReplSetCommand::check() here because heartbeat
+ checks many things that are pre-initialization. */
+ if( !replSet ) {
+ errmsg = "not running with --replSet";
+ return false;
+ }
+
+ if (!checkAuth(errmsg, result)) {
+ return false;
+ }
+
+ /* we want to keep heartbeat connections open when relinquishing primary. tag them here. */
+ {
+ AbstractMessagingPort *mp = cc().port();
+ if( mp )
+ mp->tag |= 1;
+ }
+
+ if( cmdObj["pv"].Int() != 1 ) {
+ errmsg = "incompatible replset protocol version";
+ return false;
+ }
+ {
+ string s = string(cmdObj.getStringField("replSetHeartbeat"));
+ if( cmdLine.ourSetName() != s ) {
+ errmsg = "repl set names do not match";
+ log() << "replSet set names do not match, our cmdline: " << cmdLine._replSet << rsLog;
+ log() << "replSet s: " << s << rsLog;
+ result.append("mismatch", true);
+ return false;
+ }
+ }
+
+ result.append("rs", true);
+ if( cmdObj["checkEmpty"].trueValue() ) {
+ result.append("hasData", replHasDatabases());
+ }
+ if( theReplSet == 0 ) {
+ string from( cmdObj.getStringField("from") );
+ if( !from.empty() ) {
+ scoped_lock lck( replSettings.discoveredSeeds_mx );
+ replSettings.discoveredSeeds.insert(from);
+ }
+ result.append("hbmsg", "still initializing");
+ return true;
+ }
+
+ if( theReplSet->name() != cmdObj.getStringField("replSetHeartbeat") ) {
+ errmsg = "repl set names do not match (2)";
+ result.append("mismatch", true);
+ return false;
+ }
+ result.append("set", theReplSet->name());
+ result.append("state", theReplSet->state().s);
+ result.append("e", theReplSet->iAmElectable());
+ result.append("hbmsg", theReplSet->hbmsg());
+ result.append("time", (long long) time(0));
+ result.appendDate("opTime", theReplSet->lastOpTimeWritten.asDate());
+ int v = theReplSet->config().version;
+ result.append("v", v);
+ if( v > cmdObj["v"].Int() )
+ result << "config" << theReplSet->config().asBson();
+
+ return true;
+ }
+ } cmdReplSetHeartbeat;
+
+ bool requestHeartbeat(string setName, string from, string memberFullName, BSONObj& result,
+ int myCfgVersion, int& theirCfgVersion, bool checkEmpty) {
+ if( replSetBlind ) {
+ return false;
+ }
+
+ BSONObj cmd = BSON( "replSetHeartbeat" << setName <<
+ "v" << myCfgVersion <<
+ "pv" << 1 <<
+ "checkEmpty" << checkEmpty <<
+ "from" << from );
+
+ // generally it's not a great idea to make outbound, waiting calls while
+ // holding a write lock. heartbeats can be slow (multiple seconds to
+ // respond), so we don't want to be locked, at least not without
+ // thinking carefully about it first.
+ uassert(15900, "can't heartbeat: too much lock",
+ !d.dbMutex.isWriteLocked() || theReplSet == 0 || !theReplSet->lockedByMe() );
+
+ ScopedConn conn(memberFullName);
+ return conn.runCommand("admin", cmd, result, 0);
+ }
+
+ /**
+ * Poll every other set member to check its status.
+ *
+ * A detail about local machines and authentication: suppose we have 2
+ * members, A and B, on the same machine using different keyFiles. A is
+ * primary. If we're just starting the set, there are no admin users, so A
+ * and B can access each other because it's local access.
+ *
+ * Then we add a user to A. B cannot sync this user from A, because as soon
+ * as we add an admin user, A requires auth. However, A can still
+ * heartbeat B, because B *doesn't* have an admin user. So A can reach B
+ * but B cannot reach A.
+ *
+ * Once B is restarted with the correct keyFile, everything should work as
+ * expected.
+ */
+ class ReplSetHealthPollTask : public task::Task {
+ private:
+ HostAndPort h;
+ HeartbeatInfo m;
+ int tries;
+ const int threshold;
+ public:
+ ReplSetHealthPollTask(const HostAndPort& hh, const HeartbeatInfo& mm)
+ : h(hh), m(mm), tries(0), threshold(15) { }
+
+ string name() const { return "rsHealthPoll"; }
+ void doWork() {
+ if ( !theReplSet ) {
+ LOG(2) << "replSet not initialized yet, skipping health poll this round" << rsLog;
+ return;
+ }
+
+ HeartbeatInfo mem = m;
+ HeartbeatInfo old = mem;
+ try {
+ BSONObj info;
+ int theirConfigVersion = -10000;
+
+ bool ok = _requestHeartbeat(mem, info, theirConfigVersion);
+
+ // weight new ping with old pings
+ // on the first ping, just use the ping value
+ if (old.ping != 0) {
+ mem.ping = (unsigned int)((old.ping * .8) + (mem.ping * .2));
+ }
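+ // e.g. a smoothed ping of 100ms and a new sample of 200ms yield
+ // 100*.8 + 200*.2 = 120ms: an exponentially weighted moving average
+ // that damps transient network spikes.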
+
+ if( ok ) {
+ up(info, mem);
+ }
+ else if (!info["errmsg"].eoo() &&
+ info["errmsg"].str() == "need to login") {
+ authIssue(mem);
+ }
+ else {
+ down(mem, info.getStringField("errmsg"));
+ }
+ }
+ catch(DBException& e) {
+ down(mem, e.what());
+ }
+ catch(...) {
+ down(mem, "replSet unexpected exception in ReplSetHealthPollTask");
+ }
+ m = mem;
+
+ theReplSet->mgr->send( boost::bind(&ReplSet::msgUpdateHBInfo, theReplSet, mem) );
+
+ static time_t last = 0;
+ time_t now = time(0);
+ bool changed = mem.changed(old);
+ if( changed ) {
+ if( old.hbstate != mem.hbstate )
+ log() << "replSet member " << h.toString() << " is now in state " << mem.hbstate.toString() << rsLog;
+ }
+ if( changed || now-last>4 ) {
+ last = now;
+ theReplSet->mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
+ }
+ }
+
+ private:
+ bool _requestHeartbeat(HeartbeatInfo& mem, BSONObj& info, int& theirConfigVersion) {
+ if (tries++ % threshold == (threshold - 1)) {
+ ScopedConn conn(h.toString());
+ conn.reconnect();
+ }
+
+ Timer timer;
+ time_t before = curTimeMicros64() / 1000000;
+
+ bool ok = requestHeartbeat(theReplSet->name(), theReplSet->selfFullName(),
+ h.toString(), info, theReplSet->config().version, theirConfigVersion);
+
+ mem.ping = (unsigned int)timer.millis();
+
+ // we set this on any response - if we couldn't connect, an
+ // exception was thrown and we never get this far
+ time_t after = mem.lastHeartbeat = before + (mem.ping / 1000);
+
+ if ( info["time"].isNumber() ) {
+ long long t = info["time"].numberLong();
+ if( t > after )
+ mem.skew = (int) (t - after);
+ else if( t < before )
+ mem.skew = (int) (t - before); // negative
+ }
+ else {
+ // it won't be there if remote hasn't initialized yet
+ if( info.hasElement("time") )
+ warning() << "heatbeat.time isn't a number: " << info << endl;
+ mem.skew = INT_MIN;
+ }
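+
+ // worked example: if we sent at before=100s and computed after=101s, a
+ // remote report of time=104s records skew=+3s; a remote clock behind us
+ // yields a negative skew, and INT_MIN means the skew is unknown.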
+
+ {
+ be state = info["state"];
+ if( state.ok() )
+ mem.hbstate = MemberState(state.Int());
+ }
+
+ return ok;
+ }
+
+ void authIssue(HeartbeatInfo& mem) {
+ mem.authIssue = true;
+ mem.hbstate = MemberState::RS_UNKNOWN;
+
+ // set health to 0 so that this doesn't count towards majority
+ mem.health = 0.0;
+ theReplSet->rmFromElectable(mem.id());
+ }
+
+ void down(HeartbeatInfo& mem, string msg) {
+ mem.authIssue = false;
+ mem.health = 0.0;
+ mem.ping = 0;
+ if( mem.upSince || mem.downSince == 0 ) {
+ mem.upSince = 0;
+ mem.downSince = jsTime();
+ mem.hbstate = MemberState::RS_DOWN;
+ log() << "replSet info " << h.toString() << " is down (or slow to respond): " << msg << rsLog;
+ }
+ mem.lastHeartbeatMsg = msg;
+ theReplSet->rmFromElectable(mem.id());
+ }
+
+ void up(const BSONObj& info, HeartbeatInfo& mem) {
+ HeartbeatInfo::numPings++;
+ mem.authIssue = false;
+
+ if( mem.upSince == 0 ) {
+ log() << "replSet member " << h.toString() << " is up" << rsLog;
+ mem.upSince = mem.lastHeartbeat;
+ }
+ mem.health = 1.0;
+ mem.lastHeartbeatMsg = info["hbmsg"].String();
+ if( info.hasElement("opTime") )
+ mem.opTime = info["opTime"].Date();
+
+ // see if this member is in the electable set
+ if( info["e"].eoo() ) {
+ // for backwards compatibility
+ const Member *member = theReplSet->findById(mem.id());
+ if (member && member->config().potentiallyHot()) {
+ theReplSet->addToElectable(mem.id());
+ }
+ else {
+ theReplSet->rmFromElectable(mem.id());
+ }
+ }
+ // add this server to the electable set if it is within 10
+ // seconds of the latest optime we know of
+ else if( info["e"].trueValue() &&
+ mem.opTime >= theReplSet->lastOpTimeWritten.getSecs() - 10) {
+ unsigned lastOp = theReplSet->lastOtherOpTime().getSecs();
+ if (lastOp > 0 && mem.opTime >= lastOp - 10) {
+ theReplSet->addToElectable(mem.id());
+ }
+ }
+ else {
+ theReplSet->rmFromElectable(mem.id());
+ }
+
+ be cfg = info["config"];
+ if( cfg.ok() ) {
+ // received a new config
+ boost::function<void()> f =
+ boost::bind(&Manager::msgReceivedNewConfig, theReplSet->mgr, cfg.Obj().copy());
+ theReplSet->mgr->send(f);
+ }
+ }
+ };
+
+ void ReplSetImpl::endOldHealthTasks() {
+ unsigned sz = healthTasks.size();
+ for( set<ReplSetHealthPollTask*>::iterator i = healthTasks.begin(); i != healthTasks.end(); i++ )
+ (*i)->halt();
+ healthTasks.clear();
+ if( sz )
+ DEV log() << "replSet debug: cleared old tasks " << sz << endl;
+ }
+
+ void ReplSetImpl::startHealthTaskFor(Member *m) {
+ ReplSetHealthPollTask *task = new ReplSetHealthPollTask(m->h(), m->hbinfo());
+ healthTasks.insert(task);
+ task::repeat(task, 2000);
+ }
+
+ void startSyncThread();
+
+ /** called during repl set startup. caller expects it to return fairly quickly.
+ note ReplSet object is only created once we get a config - so this won't run
+ until the initiation.
+ */
+ void ReplSetImpl::startThreads() {
+ task::fork(mgr);
+ mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
+
+ boost::thread t(startSyncThread);
+
+ task::fork(ghost);
+
+ // member heartbeats are started in ReplSetImpl::initFromConfig
+ }
+
+}
+
+/* todo:
+ stop bg job and delete on removefromset
+*/
diff --git a/src/mongo/db/repl/manager.cpp b/src/mongo/db/repl/manager.cpp
new file mode 100644
index 00000000000..91648a1b506
--- /dev/null
+++ b/src/mongo/db/repl/manager.cpp
@@ -0,0 +1,274 @@
+/* @file manager.cpp
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "rs.h"
+#include "connections.h"
+#include "../client.h"
+
+namespace mongo {
+
+ enum {
+ NOPRIMARY = -2,
+ SELFPRIMARY = -1
+ };
+
+ /* check members OTHER THAN US to see if they think they are primary */
+ const Member * Manager::findOtherPrimary(bool& two) {
+ two = false;
+ Member *m = rs->head();
+ Member *p = 0;
+ while( m ) {
+ DEV assert( m != rs->_self );
+ if( m->state().primary() && m->hbinfo().up() ) {
+ if( p ) {
+ two = true;
+ return 0;
+ }
+ p = m;
+ }
+ m = m->next();
+ }
+ if( p )
+ noteARemoteIsPrimary(p);
+ return p;
+ }
+
+ Manager::Manager(ReplSetImpl *_rs) :
+ task::Server("rsMgr"), rs(_rs), busyWithElectSelf(false), _primary(NOPRIMARY) {
+ }
+
+ Manager::~Manager() {
+ /* we don't destroy the replset object we sit in; however, the destructor could have thrown on init.
+ the log message below is just a reminder to come back one day and review this code more, and to
+ make it cleaner.
+ */
+ log() << "info: ~Manager called" << rsLog;
+ rs->mgr = 0;
+ }
+
+ void Manager::starting() {
+ Client::initThread("rsMgr");
+ replLocalAuth();
+ }
+
+ void Manager::noteARemoteIsPrimary(const Member *m) {
+ if( rs->box.getPrimary() == m )
+ return;
+ rs->_self->lhb() = "";
+ if( rs->iAmArbiterOnly() ) {
+ rs->box.set(MemberState::RS_ARBITER, m);
+ }
+ else {
+ rs->box.noteRemoteIsPrimary(m);
+ }
+ }
+
+ void Manager::checkElectableSet() {
+ unsigned otherOp = rs->lastOtherOpTime().getSecs();
+
+ // make sure the electable set is up-to-date
+ if (rs->elect.aMajoritySeemsToBeUp() &&
+ rs->iAmPotentiallyHot() &&
+ (otherOp == 0 || rs->lastOpTimeWritten.getSecs() >= otherOp - 10)) {
+ theReplSet->addToElectable(rs->selfId());
+ }
+ else {
+ theReplSet->rmFromElectable(rs->selfId());
+ }
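+
+ // e.g. if the freshest optime among the other members is 500s, we stay
+ // in the electable set while our own optime is >= 490s (and a majority
+ // is up and we are potentially hot); otherwise we are removed.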
+
+ // check if we should ask the primary (possibly ourselves) to step down
+ const Member *highestPriority = theReplSet->getMostElectable();
+ const Member *primary = rs->box.getPrimary();
+
+ if (primary && highestPriority &&
+ highestPriority->config().priority > primary->config().priority) {
+ log() << "stepping down " << primary->fullName() << endl;
+
+ if (primary->h().isSelf()) {
+ // replSetStepDown tries to acquire the same lock
+ // msgCheckNewState takes, so we can't call replSetStepDown on
+ // ourselves.
+ rs->relinquish();
+ }
+ else {
+ BSONObj cmd = BSON( "replSetStepDown" << 1 );
+ ScopedConn conn(primary->fullName());
+ BSONObj result;
+ if (!conn.runCommand("admin", cmd, result, 0)) {
+ log() << "stepping down " << primary->fullName()
+ << " failed: " << result << endl;
+ }
+ }
+ }
+ }
+
+ void Manager::checkAuth() {
+ int down = 0, authIssue = 0, total = 0;
+
+ for( Member *m = rs->head(); m; m=m->next() ) {
+ total++;
+
+ // all authIssue servers will also be not up
+ if (!m->hbinfo().up()) {
+ down++;
+ if (m->hbinfo().authIssue) {
+ authIssue++;
+ }
+ }
+ }
+
+ // if all nodes are down or failed auth AND at least one failed
+ // auth, go into recovering. If all nodes are down, stay a
+ // secondary.
+ if (authIssue > 0 && down == total) {
+ log() << "replset error could not reach/authenticate against any members" << endl;
+
+ if (rs->box.getPrimary() == rs->_self) {
+ log() << "auth problems, relinquishing primary" << rsLog;
+ rs->relinquish();
+ }
+
+ rs->blockSync(true);
+ }
+ else {
+ rs->blockSync(false);
+ }
+ }
+
+ /** called as the health threads get new results */
+ void Manager::msgCheckNewState() {
+ {
+ theReplSet->assertValid();
+ rs->assertValid();
+
+ RSBase::lock lk(rs);
+
+ if( busyWithElectSelf ) return;
+
+ checkElectableSet();
+ checkAuth();
+
+ const Member *p = rs->box.getPrimary();
+ if( p && p != rs->_self ) {
+ if( !p->hbinfo().up() ||
+ !p->hbinfo().hbstate.primary() ) {
+ p = 0;
+ rs->box.setOtherPrimary(0);
+ }
+ }
+
+ const Member *p2;
+ {
+ bool two;
+ p2 = findOtherPrimary(two);
+ if( two ) {
+ /* two other nodes think they are primary (asynchronously polled) -- wait for things to settle down. */
+ log() << "replSet info two primaries (transiently)" << rsLog;
+ return;
+ }
+ }
+
+ if( p2 ) {
+ /* someone else thinks they are primary. */
+ if( p == p2 ) {
+ // we thought the same; all set.
+ return;
+ }
+ if( p == 0 ) {
+ noteARemoteIsPrimary(p2);
+ return;
+ }
+ // todo xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+ if( p != rs->_self ) {
+ // switch primary from oldremotep->newremotep2
+ noteARemoteIsPrimary(p2);
+ return;
+ }
+ /* we thought we were primary, yet now someone else thinks they are. */
+ if( !rs->elect.aMajoritySeemsToBeUp() ) {
+ /* we can't see a majority. so the other node is probably the right choice. */
+ noteARemoteIsPrimary(p2);
+ return;
+ }
+ /* ignore for now and keep thinking we are master.
+ this could just be timing (we poll every couple seconds), or it could
+ indicate a real problem; if it happens consistently for some duration
+ we should alert the sysadmin.
+ */
+ return;
+ }
+
+ /* didn't find anyone who wants to be primary */
+
+ if( p ) {
+ /* we are already primary */
+
+ if( p != rs->_self ) {
+ rs->sethbmsg("error p != rs->self in checkNewState");
+ log() << "replSet " << p->fullName() << rsLog;
+ log() << "replSet " << rs->_self->fullName() << rsLog;
+ return;
+ }
+
+ if( rs->elect.shouldRelinquish() ) {
+ log() << "can't see a majority of the set, relinquishing primary" << rsLog;
+ rs->relinquish();
+ }
+
+ return;
+ }
+
+ if( !rs->iAmPotentiallyHot() ) { // if not we never try to be primary
+ OCCASIONALLY log() << "replSet I don't see a primary and I can't elect myself" << endl;
+ return;
+ }
+
+ /* no one seems to be primary. shall we try to elect ourself? */
+ if( !rs->elect.aMajoritySeemsToBeUp() ) {
+ static time_t last;
+ static int n;
+ int ll = 0;
+ if( ++n > 5 ) ll++;
+ if( last + 60 > time(0) ) ll++;
+ log(ll) << "replSet can't see a majority, will not try to elect self" << rsLog;
+ last = time(0);
+ return;
+ }
+
+ if( !rs->iAmElectable() ) {
+ return;
+ }
+
+ busyWithElectSelf = true; // don't try to do further elections & such while we are already working on one.
+ }
+ try {
+ rs->elect.electSelf();
+ }
+ catch(RetryAfterSleepException&) {
+ /* we want to process new inbounds before trying this again. so we just put a checkNewState in the queue for eval later. */
+ requeue();
+ }
+ catch(...) {
+ log() << "replSet error unexpected assertion in rs manager" << rsLog;
+ }
+ busyWithElectSelf = false;
+ }
+
+}
diff --git a/src/mongo/db/repl/multicmd.h b/src/mongo/db/repl/multicmd.h
new file mode 100644
index 00000000000..2d70c551f64
--- /dev/null
+++ b/src/mongo/db/repl/multicmd.h
@@ -0,0 +1,75 @@
+// @file multicmd.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../../util/background.h"
+#include "connections.h"
+
+namespace mongo {
+
+ struct Target {
+ Target(string hostport) : toHost(hostport), ok(false) { }
+ //Target() : ok(false) { }
+ const string toHost;
+ bool ok;
+ BSONObj result;
+ };
+
+ /** send a command to several servers in parallel. waits for all to complete before
+ returning.
+
+ in: Target::toHost
+ out: Target::result and Target::ok
+ */
+ void multiCommand(BSONObj cmd, list<Target>& L);
+
+ class _MultiCommandJob : public BackgroundJob {
+ public:
+ BSONObj& cmd;
+ Target& d;
+ _MultiCommandJob(BSONObj& _cmd, Target& _d) : cmd(_cmd), d(_d) { }
+
+ private:
+ string name() const { return "MultiCommandJob"; }
+ void run() {
+ try {
+ ScopedConn c(d.toHost);
+ d.ok = c.runCommand("admin", cmd, d.result);
+ }
+ catch(DBException&) {
+ DEV log() << "dev caught dbexception on multiCommand " << d.toHost << rsLog;
+ }
+ }
+ };
+
+ inline void multiCommand(BSONObj cmd, list<Target>& L) {
+ list< shared_ptr<BackgroundJob> > jobs;
+
+ for( list<Target>::iterator i = L.begin(); i != L.end(); i++ ) {
+ Target& d = *i;
+ _MultiCommandJob *j = new _MultiCommandJob(cmd, d);
+ jobs.push_back( shared_ptr<BackgroundJob>(j) );
+ j->go();
+ }
+
+ for( list< shared_ptr<BackgroundJob> >::iterator i = jobs.begin(); i != jobs.end(); i++ ) {
+ (*i)->wait();
+ }
+ }
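+
+ /* usage sketch (hypothetical hosts and command):
+ list<Target> targets;
+ targets.push_back(Target("h1.example.com:27017"));
+ targets.push_back(Target("h2.example.com:27017"));
+ multiCommand(BSON("replSetFresh" << 1), targets);
+ // on return, each Target carries ok and result for its host
+ */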
+}
diff --git a/src/mongo/db/repl/replset_commands.cpp b/src/mongo/db/repl/replset_commands.cpp
new file mode 100644
index 00000000000..84f16e53466
--- /dev/null
+++ b/src/mongo/db/repl/replset_commands.cpp
@@ -0,0 +1,404 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../cmdline.h"
+#include "../commands.h"
+#include "../repl.h"
+#include "health.h"
+#include "rs.h"
+#include "rs_config.h"
+#include "../dbwebserver.h"
+#include "../../util/mongoutils/html.h"
+#include "../../client/dbclient.h"
+#include "../repl_block.h"
+
+using namespace bson;
+
+namespace mongo {
+
+ void checkMembersUpForConfigChange(const ReplSetConfig& cfg, BSONObjBuilder& result, bool initial);
+
+ /* commands in other files:
+ replSetHeartbeat - health.cpp
+ replSetInitiate - rs_mod.cpp
+ */
+
+ bool replSetBlind = false;
+ unsigned replSetForceInitialSyncFailure = 0;
+
+ class CmdReplSetTest : public ReplSetCommand {
+ public:
+ virtual void help( stringstream &help ) const {
+ help << "Just for regression tests.\n";
+ }
+ CmdReplSetTest() : ReplSetCommand("replSetTest") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ log() << "replSet replSetTest command received: " << cmdObj.toString() << rsLog;
+
+ if (!checkAuth(errmsg, result)) {
+ return false;
+ }
+
+ if( cmdObj.hasElement("forceInitialSyncFailure") ) {
+ replSetForceInitialSyncFailure = (unsigned) cmdObj["forceInitialSyncFailure"].Number();
+ return true;
+ }
+
+ if( !check(errmsg, result) )
+ return false;
+
+ if( cmdObj.hasElement("blind") ) {
+ replSetBlind = cmdObj.getBoolField("blind");
+ return true;
+ }
+
+ if (cmdObj.hasElement("sethbmsg")) {
+ replset::sethbmsg(cmdObj["sethbmsg"].String());
+ return true;
+ }
+
+ return false;
+ }
+ } cmdReplSetTest;
+
+ /** get rollback id. used to check if a rollback happened during some interval of time.
+ the rollback id has no particular ordering; it simply changes on each rollback.
+ @see incRBID()
+ */
+ class CmdReplSetGetRBID : public ReplSetCommand {
+ public:
+ /* todo: ideally this should only change on rollbacks NOT on mongod restarts also. fix... */
+ int rbid;
+ virtual void help( stringstream &help ) const {
+ help << "internal";
+ }
+ CmdReplSetGetRBID() : ReplSetCommand("replSetGetRBID") {
+ // this is ok but micros or combo with some rand() and/or 64 bits might be better --
+ // imagine a restart and a clock correction simultaneously (very unlikely but possible...)
+ rbid = (int) curTimeMillis64();
+ }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( !check(errmsg, result) )
+ return false;
+ result.append("rbid",rbid);
+ return true;
+ }
+ } cmdReplSetRBID;
+
+ /** we increment the rollback id on every rollback event. */
+ void incRBID() {
+ cmdReplSetRBID.rbid++;
+ }
+
+ /** helper to get rollback id from another server. */
+ int getRBID(DBClientConnection *c) {
+ bo info;
+ c->simpleCommand("admin", &info, "replSetGetRBID");
+ return info["rbid"].numberInt();
+ }
+
+ class CmdReplSetGetStatus : public ReplSetCommand {
+ public:
+ virtual void help( stringstream &help ) const {
+ help << "Report status of a replica set from the POV of this server\n";
+ help << "{ replSetGetStatus : 1 }";
+ help << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands";
+ }
+ CmdReplSetGetStatus() : ReplSetCommand("replSetGetStatus", true) { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if ( cmdObj["forShell"].trueValue() )
+ lastError.disableForCommand();
+
+ if( !check(errmsg, result) )
+ return false;
+ theReplSet->summarizeStatus(result);
+ return true;
+ }
+ } cmdReplSetGetStatus;
+
+ class CmdReplSetReconfig : public ReplSetCommand {
+ RWLock mutex; /* we don't need rw but we wanted try capability. :-( */
+ public:
+ virtual void help( stringstream &help ) const {
+ help << "Adjust configuration of a replica set\n";
+ help << "{ replSetReconfig : config_object }";
+ help << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands";
+ }
+ CmdReplSetReconfig() : ReplSetCommand("replSetReconfig"), mutex("rsreconfig") { }
+ virtual bool run(const string& a, BSONObj& b, int e, string& errmsg, BSONObjBuilder& c, bool d) {
+ try {
+ rwlock_try_write lk(mutex);
+ return _run(a,b,e,errmsg,c,d);
+ }
+ catch(rwlock_try_write::exception&) { }
+ errmsg = "a replSetReconfig is already in progress";
+ return false;
+ }
+ private:
+ bool _run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if ( !checkAuth(errmsg, result) ) {
+ return false;
+ }
+
+ if( cmdObj["replSetReconfig"].type() != Object ) {
+ errmsg = "no configuration specified";
+ return false;
+ }
+
+ bool force = cmdObj.hasField("force") && cmdObj["force"].trueValue();
+ if( force && !theReplSet ) {
+ replSettings.reconfig = cmdObj["replSetReconfig"].Obj().getOwned();
+ result.append("msg", "will try this config momentarily, try running rs.conf() again in a few seconds");
+ return true;
+ }
+
+ if ( !check(errmsg, result) ) {
+ return false;
+ }
+
+ if( !force && !theReplSet->box.getState().primary() ) {
+ errmsg = "replSetReconfig command must be sent to the current replica set primary.";
+ return false;
+ }
+
+ {
+ // just make sure we can get a write lock before doing anything else. we'll reacquire one
+ // later. of course it could be stuck then, but this check lowers the risk if weird things
+ // are up - we probably don't want a change to apply 30 minutes after the initial attempt.
+ time_t t = time(0);
+ writelock lk("");
+ if( time(0)-t > 20 ) {
+ errmsg = "took a long time to get write lock, so not initiating. Initiate when server less busy?";
+ return false;
+ }
+ }
+
+ try {
+ ReplSetConfig newConfig(cmdObj["replSetReconfig"].Obj(), force);
+
+ log() << "replSet replSetReconfig config object parses ok, " << newConfig.members.size() << " members specified" << rsLog;
+
+ if( !ReplSetConfig::legalChange(theReplSet->getConfig(), newConfig, errmsg) ) {
+ return false;
+ }
+
+ checkMembersUpForConfigChange(newConfig, result, false);
+
+ log() << "replSet replSetReconfig [2]" << rsLog;
+
+ theReplSet->haveNewConfig(newConfig, true);
+ ReplSet::startupStatusMsg.set("replSetReconfig'd");
+ }
+ catch( DBException& e ) {
+ log() << "replSet replSetReconfig exception: " << e.what() << rsLog;
+ throw;
+ }
+ catch( string& se ) {
+ log() << "replSet reconfig exception: " << se << rsLog;
+ errmsg = se;
+ return false;
+ }
+
+ resetSlaveCache();
+ return true;
+ }
+ } cmdReplSetReconfig;
+
+ class CmdReplSetFreeze : public ReplSetCommand {
+ public:
+ virtual void help( stringstream &help ) const {
+ help << "{ replSetFreeze : <seconds> }";
+ help << "'freeze' state of member to the extent we can do that. What this really means is that\n";
+ help << "this node will not attempt to become primary until the time period specified expires.\n";
+ help << "You can call again with {replSetFreeze:0} to unfreeze sooner.\n";
+ help << "A process restart unfreezes the member also.\n";
+ help << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands";
+ }
+
+ CmdReplSetFreeze() : ReplSetCommand("replSetFreeze") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( !check(errmsg, result) )
+ return false;
+ int secs = (int) cmdObj.firstElement().numberInt();
+ if( theReplSet->freeze(secs) ) {
+ if( secs == 0 )
+ result.append("info","unfreezing");
+ }
+ if( secs == 1 )
+ result.append("warning", "you really want to freeze for only 1 second?");
+ return true;
+ }
+ } cmdReplSetFreeze;
+
+ class CmdReplSetStepDown: public ReplSetCommand {
+ public:
+ virtual void help( stringstream &help ) const {
+ help << "{ replSetStepDown : <seconds> }\n";
+ help << "Step down as primary. Will not try to reelect self for the specified time period (1 minute if no numeric secs value specified).\n";
+ help << "(If another member with same priority takes over in the meantime, it will stay primary.)\n";
+ help << "http://www.mongodb.org/display/DOCS/Replica+Set+Commands";
+ }
+
+ CmdReplSetStepDown() : ReplSetCommand("replSetStepDown") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( !check(errmsg, result) )
+ return false;
+ if( !theReplSet->box.getState().primary() ) {
+ errmsg = "not primary so can't step down";
+ return false;
+ }
+
+ bool force = cmdObj.hasField("force") && cmdObj["force"].trueValue();
+
+ // only step down if there is another node synced to within 10
+ // seconds of this node
+ if (!force) {
+ long long int lastOp = (long long int)theReplSet->lastOpTimeWritten.getSecs();
+ long long int closest = (long long int)theReplSet->lastOtherOpTime().getSecs();
+
+ long long int diff = lastOp - closest;
+ result.append("closest", closest);
+ result.append("difference", diff);
+
+ if (diff < 0) {
+ // not our problem, but we'll wait until things settle down
+ errmsg = "someone is ahead of the primary?";
+ return false;
+ }
+
+ if (diff > 10) {
+ errmsg = "no secondaries within 10 seconds of my optime";
+ return false;
+ }
+ }
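+
+ // worked example: a primary optime of 1000s with the best secondary at
+ // 996s gives diff=4, so the step-down proceeds; a 15s lag is rejected
+ // by the check above.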
+
+ int secs = (int) cmdObj.firstElement().numberInt();
+ if( secs == 0 )
+ secs = 60;
+ return theReplSet->stepDown(secs);
+ }
+ } cmdReplSetStepDown;
+
+ class CmdReplSetMaintenance: public ReplSetCommand {
+ public:
+ virtual void help( stringstream &help ) const {
+ help << "{ replSetMaintenance : bool }\n";
+ help << "Enable or disable maintenance mode.";
+ }
+
+ CmdReplSetMaintenance() : ReplSetCommand("replSetMaintenance") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( !check(errmsg, result) )
+ return false;
+ if( theReplSet->box.getState().primary() ) {
+ errmsg = "primaries can't modify maintenance mode";
+ return false;
+ }
+
+ theReplSet->setMaintenanceMode(cmdObj["replSetMaintenance"].trueValue());
+ return true;
+ }
+ } cmdReplSetMaintenance;
+
+ using namespace bson;
+ using namespace mongoutils::html;
+ extern void fillRsLog(stringstream&);
+
+ class ReplSetHandler : public DbWebHandler {
+ public:
+ ReplSetHandler() : DbWebHandler( "_replSet" , 1 , true ) {}
+
+ virtual bool handles( const string& url ) const {
+ return startsWith( url , "/_replSet" );
+ }
+
+ virtual void handle( const char *rq, string url, BSONObj params,
+ string& responseMsg, int& responseCode,
+ vector<string>& headers, const SockAddr &from ) {
+
+ if( url == "/_replSetOplog" ) {
+ responseMsg = _replSetOplog(params);
+ }
+ else
+ responseMsg = _replSet();
+ responseCode = 200;
+ }
+
+ string _replSetOplog(bo parms) {
+ int _id = (int) str::toUnsigned( parms["_id"].String() );
+
+ stringstream s;
+ string t = "Replication oplog";
+ s << start(t);
+ s << p(t);
+
+ if( theReplSet == 0 ) {
+ if( cmdLine._replSet.empty() )
+ s << p("Not using --replSet");
+ else {
+ s << p("Still starting up, or else set is not yet " + a("http://www.mongodb.org/display/DOCS/Replica+Set+Configuration#InitialSetup", "", "initiated")
+ + ".<br>" + ReplSet::startupStatusMsg.get());
+ }
+ }
+ else {
+ try {
+ theReplSet->getOplogDiagsAsHtml(_id, s);
+ }
+ catch(std::exception& e) {
+ s << "error querying oplog: " << e.what() << '\n';
+ }
+ }
+
+ s << _end();
+ return s.str();
+ }
+
+ /* /_replSet show replica set status in html format */
+ string _replSet() {
+ stringstream s;
+ s << start("Replica Set Status " + prettyHostName());
+ s << p( a("/", "back", "Home") + " | " +
+ a("/local/system.replset/?html=1", "", "View Replset Config") + " | " +
+ a("/replSetGetStatus?text=1", "", "replSetGetStatus") + " | " +
+ a("http://www.mongodb.org/display/DOCS/Replica+Sets", "", "Docs")
+ );
+
+ if( theReplSet == 0 ) {
+ if( cmdLine._replSet.empty() )
+ s << p("Not using --replSet");
+ else {
+ s << p("Still starting up, or else set is not yet " + a("http://www.mongodb.org/display/DOCS/Replica+Set+Configuration#InitialSetup", "", "initiated")
+ + ".<br>" + ReplSet::startupStatusMsg.get());
+ }
+ }
+ else {
+ try {
+ theReplSet->summarizeAsHtml(s);
+ }
+ catch(...) { s << "error summarizing replset status\n"; }
+ }
+ s << p("Recent replset log activity:");
+ fillRsLog(s);
+ s << _end();
+ return s.str();
+ }
+
+
+
+ } replSetHandler;
+
+}
diff --git a/src/mongo/db/repl/rs.cpp b/src/mongo/db/repl/rs.cpp
new file mode 100644
index 00000000000..fff5d72bcc0
--- /dev/null
+++ b/src/mongo/db/repl/rs.cpp
@@ -0,0 +1,778 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../cmdline.h"
+#include "../../util/net/sock.h"
+#include "../client.h"
+#include "../../client/dbclient.h"
+#include "../dbhelpers.h"
+#include "../../s/d_logic.h"
+#include "rs.h"
+#include "connections.h"
+#include "../repl.h"
+#include "../instance.h"
+
+using namespace std;
+
+namespace mongo {
+
+ using namespace bson;
+
+ bool replSet = false;
+ ReplSet *theReplSet = 0;
+
+ bool isCurrentlyAReplSetPrimary() {
+ return theReplSet && theReplSet->isPrimary();
+ }
+
+ void replset::sethbmsg(const string& s, const int level) {
+ if (theReplSet) {
+ theReplSet->sethbmsg(s, level); // use the caller's level, not the global logLevel
+ }
+ }
+
+ void ReplSetImpl::sethbmsg(string s, int logLevel) {
+ static time_t lastLogged;
+ _hbmsgTime = time(0);
+
+ if( s == _hbmsg ) {
+ // unchanged
+ if( _hbmsgTime - lastLogged < 60 )
+ return;
+ }
+
+ unsigned sz = s.size();
+ if( sz >= 256 )
+ memcpy(_hbmsg, s.c_str(), 255); // truncate; the final byte of _hbmsg stays NUL (zeroed in the constructor, never overwritten)
+ else {
+ _hbmsg[sz] = 0;
+ memcpy(_hbmsg, s.c_str(), sz);
+ }
+ if( !s.empty() ) {
+ lastLogged = _hbmsgTime;
+ log(logLevel) << "replSet " << s << rsLog;
+ }
+ }
+
+ void ReplSetImpl::assumePrimary() {
+ LOG(2) << "replSet assuming primary" << endl;
+ assert( iAmPotentiallyHot() );
+ writelock lk("admin."); // so we are synchronized with _logOp()
+
+ // Make sure that new OpTimes are higher than existing ones even with clock skew
+ DBDirectClient c;
+ BSONObj lastOp = c.findOne( "local.oplog.rs", Query().sort(reverseNaturalObj), NULL, QueryOption_SlaveOk );
+ if ( !lastOp.isEmpty() ) {
+ OpTime::setLast( lastOp[ "ts" ].date() );
+ }
+
+ changeState(MemberState::RS_PRIMARY);
+ }
+
+ void ReplSetImpl::changeState(MemberState s) { box.change(s, _self); }
+
+ void ReplSetImpl::setMaintenanceMode(const bool inc) {
+ lock lk(this);
+
+ if (inc) {
+ log() << "replSet going into maintenance mode (" << _maintenanceMode << " other tasks)" << rsLog;
+
+ _maintenanceMode++;
+ changeState(MemberState::RS_RECOVERING);
+ }
+ else {
+ _maintenanceMode--;
+ // no need to change state, syncTail will try to go live as a secondary soon
+
+ log() << "leaving maintenance mode (" << _maintenanceMode << " other tasks)" << rsLog;
+ }
+ }
+
+ Member* ReplSetImpl::getMostElectable() {
+ lock lk(this);
+
+ Member *max = 0;
+
+ for (set<unsigned>::iterator it = _electableSet.begin(); it != _electableSet.end(); it++) {
+ const Member *temp = findById(*it);
+ if (!temp) {
+ log() << "couldn't find member: " << *it << endl;
+ _electableSet.erase(*it);
+ continue;
+ }
+ if (!max || max->config().priority < temp->config().priority) {
+ max = (Member*)temp;
+ }
+ }
+
+ return max;
+ }
+
+ const bool closeOnRelinquish = true;
+
+ void ReplSetImpl::relinquish() {
+ LOG(2) << "replSet attempting to relinquish" << endl;
+ if( box.getState().primary() ) {
+ {
+ writelock lk("admin."); // so we are synchronized with _logOp()
+
+ log() << "replSet relinquishing primary state" << rsLog;
+ changeState(MemberState::RS_SECONDARY);
+ }
+
+ if( closeOnRelinquish ) {
+ /* close sockets that were talking to us so they don't blithely send many writes that will
+ fail with "not master" (of course the client could check the result code, but many do not)
+ */
+ log() << "replSet closing client sockets after relinquishing primary" << rsLog;
+ MessagingPort::closeAllSockets(1);
+ }
+
+ // now that all connections are closed, strip this mongod of all sharding details.
+ // if and when it gets promoted to primary again, only then should it reload the
+ // sharding state; that way it won't bring stale state back when it regains primaryhood.
+ shardingState.resetShardingState();
+
+ }
+ else if( box.getState().startup2() ) {
+ // ? add comment
+ changeState(MemberState::RS_RECOVERING);
+ }
+ }
+
+ /* look freshly for who is primary - includes relinquishing ourself. */
+ void ReplSetImpl::forgetPrimary() {
+ if( box.getState().primary() )
+ relinquish();
+ else {
+ box.setOtherPrimary(0);
+ }
+ }
+
+ // for the replSetStepDown command
+ bool ReplSetImpl::_stepDown(int secs) {
+ lock lk(this);
+ if( box.getState().primary() ) {
+ elect.steppedDown = time(0) + secs;
+ log() << "replSet info stepping down as primary secs=" << secs << rsLog;
+ relinquish();
+ return true;
+ }
+ return false;
+ }
+
+ bool ReplSetImpl::_freeze(int secs) {
+ lock lk(this);
+ /* note if we are primary we remain primary but won't try to elect ourself again until
+ this time period expires.
+ */
+ if( secs == 0 ) {
+ elect.steppedDown = 0;
+ log() << "replSet info 'unfreezing'" << rsLog;
+ }
+ else {
+ if( !box.getState().primary() ) {
+ elect.steppedDown = time(0) + secs;
+ log() << "replSet info 'freezing' for " << secs << " seconds" << rsLog;
+ }
+ else {
+ log() << "replSet info received freeze command but we are primary" << rsLog;
+ }
+ }
+ return true;
+ }
+
+ void ReplSetImpl::msgUpdateHBInfo(HeartbeatInfo h) {
+ for( Member *m = _members.head(); m; m=m->next() ) {
+ if( m->id() == h.id() ) {
+ m->_hbinfo = h;
+ return;
+ }
+ }
+ }
+
+ list<HostAndPort> ReplSetImpl::memberHostnames() const {
+ list<HostAndPort> L;
+ L.push_back(_self->h());
+ for( Member *m = _members.head(); m; m = m->next() )
+ L.push_back(m->h());
+ return L;
+ }
+
+ void ReplSetImpl::_fillIsMasterHost(const Member *m, vector<string>& hosts, vector<string>& passives, vector<string>& arbiters) {
+ assert( m );
+ if( m->config().hidden )
+ return;
+
+ if( m->potentiallyHot() ) {
+ hosts.push_back(m->h().toString());
+ }
+ else if( !m->config().arbiterOnly ) {
+ if( m->config().slaveDelay ) {
+ /* hmmm - we don't list these as they are stale. */
+ }
+ else {
+ passives.push_back(m->h().toString());
+ }
+ }
+ else {
+ arbiters.push_back(m->h().toString());
+ }
+ }
+
+ void ReplSetImpl::_fillIsMaster(BSONObjBuilder& b) {
+ lock lk(this);
+
+ const StateBox::SP sp = box.get();
+ bool isp = sp.state.primary();
+ b.append("setName", name());
+ b.append("ismaster", isp);
+ b.append("secondary", sp.state.secondary());
+ {
+ vector<string> hosts, passives, arbiters;
+ _fillIsMasterHost(_self, hosts, passives, arbiters);
+
+ for( Member *m = _members.head(); m; m = m->next() ) {
+ assert( m );
+ _fillIsMasterHost(m, hosts, passives, arbiters);
+ }
+
+ if( hosts.size() > 0 ) {
+ b.append("hosts", hosts);
+ }
+ if( passives.size() > 0 ) {
+ b.append("passives", passives);
+ }
+ if( arbiters.size() > 0 ) {
+ b.append("arbiters", arbiters);
+ }
+ }
+
+ if( !isp ) {
+ const Member *m = sp.primary;
+ if( m )
+ b.append("primary", m->h().toString());
+ }
+ else {
+ b.append("primary", _self->fullName());
+ }
+
+ if( myConfig().arbiterOnly )
+ b.append("arbiterOnly", true);
+ if( myConfig().priority == 0 && !myConfig().arbiterOnly)
+ b.append("passive", true);
+ if( myConfig().slaveDelay )
+ b.append("slaveDelay", myConfig().slaveDelay);
+ if( myConfig().hidden )
+ b.append("hidden", true);
+ if( !myConfig().buildIndexes )
+ b.append("buildIndexes", false);
+ if( !myConfig().tags.empty() ) {
+ BSONObjBuilder a;
+ for( map<string,string>::const_iterator i = myConfig().tags.begin(); i != myConfig().tags.end(); i++ )
+ a.append((*i).first, (*i).second);
+ b.append("tags", a.done());
+ }
+ b.append("me", myConfig().h.toString());
+ }
+
+ /** @param cfgString <setname>/<seedhost1>,<seedhost2> */
+
+ void parseReplsetCmdLine(string cfgString, string& setname, vector<HostAndPort>& seeds, set<HostAndPort>& seedSet ) {
+ const char *p = cfgString.c_str();
+ const char *slash = strchr(p, '/');
+ if( slash )
+ setname = string(p, slash-p);
+ else
+ setname = p;
+ uassert(13093, "bad --replSet config string format is: <setname>[/<seedhost1>,<seedhost2>,...]", !setname.empty());
+
+ if( slash == 0 )
+ return;
+
+ p = slash + 1;
+ while( 1 ) {
+ const char *comma = strchr(p, ',');
+ if( comma == 0 ) comma = strchr(p,0);
+ if( p == comma )
+ break;
+ {
+ HostAndPort m;
+ try {
+ m = HostAndPort( string(p, comma-p) );
+ }
+ catch(...) {
+ uassert(13114, "bad --replSet seed hostname", false);
+ }
+ uassert(13096, "bad --replSet command line config string - dups?", seedSet.count(m) == 0 );
+ seedSet.insert(m);
+ //uassert(13101, "can't use localhost in replset host list", !m.isLocalHost());
+ if( m.isSelf() ) {
+ log(1) << "replSet ignoring seed " << m.toString() << " (=self)" << rsLog;
+ }
+ else
+ seeds.push_back(m);
+ if( *comma == 0 )
+ break;
+ p = comma + 1;
+ }
+ }
+ }
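+
+ // e.g. "rs0/h1.example.com:27017,h2.example.com:27017" (hypothetical hosts)
+ // parses to setname="rs0" with two seeds; a bare "rs0" yields no seeds, and
+ // any seed that resolves to ourselves is logged and skipped.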
+
+ ReplSetImpl::ReplSetImpl(ReplSetCmdline& replSetCmdline) : elect(this),
+ _currentSyncTarget(0),
+ _blockSync(false),
+ _hbmsgTime(0),
+ _self(0),
+ _maintenanceMode(0),
+ mgr( new Manager(this) ),
+ ghost( new GhostSync(this) ) {
+
+ _cfg = 0;
+ memset(_hbmsg, 0, sizeof(_hbmsg));
+ strcpy( _hbmsg , "initial startup" );
+ lastH = 0;
+ changeState(MemberState::RS_STARTUP);
+
+ _seeds = &replSetCmdline.seeds;
+
+ LOG(1) << "replSet beginning startup..." << rsLog;
+
+ loadConfig();
+
+ unsigned sss = replSetCmdline.seedSet.size();
+ for( Member *m = head(); m; m = m->next() ) {
+ replSetCmdline.seedSet.erase(m->h());
+ }
+ for( set<HostAndPort>::iterator i = replSetCmdline.seedSet.begin(); i != replSetCmdline.seedSet.end(); i++ ) {
+ if( i->isSelf() ) {
+ if( sss == 1 ) {
+ LOG(1) << "replSet warning self is listed in the seed list and there are no other seeds listed did you intend that?" << rsLog;
+ }
+ }
+ else {
+ log() << "replSet warning command line seed " << i->toString() << " is not present in the current repl set config" << rsLog;
+ }
+ }
+ }
+
+ void newReplUp();
+
+ void ReplSetImpl::loadLastOpTimeWritten(bool quiet) {
+ readlock lk(rsoplog);
+ BSONObj o;
+ if( Helpers::getLast(rsoplog, o) ) {
+ lastH = o["h"].numberLong();
+ lastOpTimeWritten = o["ts"]._opTime();
+ uassert(13290, "bad replSet oplog entry?", quiet || !lastOpTimeWritten.isNull());
+ }
+ }
+
+ /* call after constructing to start - returns fairly quickly after launching its threads */
+ void ReplSetImpl::_go() {
+ try {
+ loadLastOpTimeWritten();
+ }
+ catch(std::exception& e) {
+ log() << "replSet error fatal couldn't query the local " << rsoplog << " collection. Terminating mongod after 30 seconds." << rsLog;
+ log() << e.what() << rsLog;
+ sleepsecs(30);
+ dbexit( EXIT_REPLICATION_ERROR );
+ return;
+ }
+
+ changeState(MemberState::RS_STARTUP2);
+ startThreads();
+ newReplUp(); // oplog.cpp
+ }
+
+ ReplSetImpl::StartupStatus ReplSetImpl::startupStatus = PRESTART;
+ DiagStr ReplSetImpl::startupStatusMsg;
+
+ extern BSONObj *getLastErrorDefault;
+
+ void ReplSetImpl::setSelfTo(Member *m) {
+ // already locked in initFromConfig
+ _self = m;
+ if( m ) {
+ _id = m->id();
+ _config = m->config();
+ _buildIndexes = m->config().buildIndexes;
+ }
+ else {
+ _buildIndexes = true;
+ }
+ }
+
+ /** @param reconf true if this is a reconfiguration and not an initial load of the configuration.
+ @return true if ok; throws if config really bad; false if config doesn't include self
+ */
+ bool ReplSetImpl::initFromConfig(ReplSetConfig& c, bool reconf) {
+ /* NOTE: haveNewConfig() writes the new config to disk before we get here. So
+ we cannot error out at this point, except fatally. Check errors earlier.
+ */
+ lock lk(this);
+
+ if( getLastErrorDefault || !c.getLastErrorDefaults.isEmpty() ) {
+ // see comment in dbcommands.cpp for getlasterrordefault
+ getLastErrorDefault = new BSONObj( c.getLastErrorDefaults );
+ }
+
+ list<ReplSetConfig::MemberCfg*> newOnes;
+ // additive short-cuts the new config setup. If we are just adding a
+ // node/nodes and nothing else is changing, this is additive. If it's
+ // not a reconfig, we're not adding anything
+ bool additive = reconf;
+ {
+ unsigned nfound = 0;
+ int me = 0;
+ for( vector<ReplSetConfig::MemberCfg>::iterator i = c.members.begin(); i != c.members.end(); i++ ) {
+
+ ReplSetConfig::MemberCfg& m = *i;
+ if( m.h.isSelf() ) {
+ me++;
+ }
+
+ if( reconf ) {
+ if (m.h.isSelf() && (!_self || (int)_self->id() != m._id)) {
+ log() << "self doesn't match: " << m._id << rsLog;
+ assert(false);
+ }
+
+ const Member *old = findById(m._id);
+ if( old ) {
+ nfound++;
+ assert( (int) old->id() == m._id );
+ if( old->config() != m ) {
+ additive = false;
+ }
+ }
+ else {
+ newOnes.push_back(&m);
+ }
+ }
+ }
+ if( me == 0 ) {
+ _members.orphanAll();
+
+ // sending hbs must continue to pick up new config, so we leave
+ // hb threads alone
+
+ // close sockets to force clients to re-evaluate this member
+ MessagingPort::closeAllSockets(0);
+
+ // stop sync thread
+ box.set(MemberState::RS_STARTUP, 0);
+
+ // go into holding pattern
+ log() << "replSet error self not present in the repl set configuration:" << rsLog;
+ log() << c.toString() << rsLog;
+ return false;
+ }
+ uassert( 13302, "replSet error self appears twice in the repl set configuration", me<=1 );
+
+ // if we found different members than the original config, reload everything
+ if( reconf && config().members.size() != nfound )
+ additive = false;
+ }
+
+ _cfg = new ReplSetConfig(c);
+ assert( _cfg->ok() );
+ assert( _name.empty() || _name == _cfg->_id );
+ _name = _cfg->_id;
+ assert( !_name.empty() );
+
+ // this is a shortcut for simple changes
+ if( additive ) {
+ log() << "replSet info : additive change to configuration" << rsLog;
+ for( list<ReplSetConfig::MemberCfg*>::const_iterator i = newOnes.begin(); i != newOnes.end(); i++ ) {
+ ReplSetConfig::MemberCfg *m = *i;
+ Member *mi = new Member(m->h, m->_id, m, false);
+
+ /** we will indicate that new members are up() initially so that we don't relinquish our
+ primary state because we can't (transiently) see a majority. they should be up as we
+ check that new members are up before getting here on reconfig anyway.
+ */
+ mi->get_hbinfo().health = 0.1;
+
+ _members.push(mi);
+ startHealthTaskFor(mi);
+ }
+
+ // if we aren't creating new members, we may have to update the
+ // groups for the current ones
+ _cfg->updateMembers(_members);
+
+ return true;
+ }
+
+ // start with no members. if this is a reconfig, drop the old ones.
+ _members.orphanAll();
+
+ endOldHealthTasks();
+
+ int oldPrimaryId = -1;
+ {
+ const Member *p = box.getPrimary();
+ if( p )
+ oldPrimaryId = p->id();
+ }
+ forgetPrimary();
+
+ // not setting _self to 0 as other threads use _self w/o locking
+ int me = 0;
+
+ // For logging
+ string members = "";
+
+ for( vector<ReplSetConfig::MemberCfg>::iterator i = _cfg->members.begin(); i != _cfg->members.end(); i++ ) {
+ ReplSetConfig::MemberCfg& m = *i;
+ Member *mi;
+ members += ( members == "" ? "" : ", " ) + m.h.toString();
+ if( m.h.isSelf() ) {
+ assert( me++ == 0 );
+ mi = new Member(m.h, m._id, &m, true);
+ if (!reconf) {
+ log() << "replSet I am " << m.h.toString() << rsLog;
+ }
+ setSelfTo(mi);
+
+ if( (int)mi->id() == oldPrimaryId )
+ box.setSelfPrimary(mi);
+ }
+ else {
+ mi = new Member(m.h, m._id, &m, false);
+ _members.push(mi);
+ startHealthTaskFor(mi);
+ if( (int)mi->id() == oldPrimaryId )
+ box.setOtherPrimary(mi);
+ }
+ }
+
+ if( me == 0 ){
+ log() << "replSet warning did not detect own host in full reconfig, members " << members << " config: " << c << rsLog;
+ }
+
+ return true;
+ }
+
+ // Our own config must be the first one.
+ bool ReplSetImpl::_loadConfigFinish(vector<ReplSetConfig>& cfgs) {
+ int v = -1;
+ ReplSetConfig *highest = 0;
+ int myVersion = -2000;
+ int n = 0;
+ for( vector<ReplSetConfig>::iterator i = cfgs.begin(); i != cfgs.end(); i++ ) {
+ ReplSetConfig& cfg = *i;
+ if( ++n == 1 ) myVersion = cfg.version;
+ if( cfg.ok() && cfg.version > v ) {
+ highest = &cfg;
+ v = cfg.version;
+ }
+ }
+ assert( highest );
+
+ if( !initFromConfig(*highest) )
+ return false;
+
+ if( highest->version > myVersion && highest->version >= 0 ) {
+ log() << "replSet got config version " << highest->version << " from a remote, saving locally" << rsLog;
+ highest->saveConfigLocally(BSONObj());
+ }
+ return true;
+ }
+
+ void ReplSetImpl::loadConfig() {
+ while( 1 ) {
+ startupStatus = LOADINGCONFIG;
+ startupStatusMsg.set("loading " + rsConfigNs + " config (LOADINGCONFIG)");
+ LOG(1) << "loadConfig() " << rsConfigNs << endl;
+ try {
+ vector<ReplSetConfig> configs;
+ try {
+ configs.push_back( ReplSetConfig(HostAndPort::me()) );
+ }
+ catch(DBException& e) {
+ log() << "replSet exception loading our local replset configuration object : " << e.toString() << rsLog;
+ }
+ for( vector<HostAndPort>::const_iterator i = _seeds->begin(); i != _seeds->end(); i++ ) {
+ try {
+ configs.push_back( ReplSetConfig(*i) );
+ }
+ catch( DBException& e ) {
+ log() << "replSet exception trying to load config from " << *i << " : " << e.toString() << rsLog;
+ }
+ }
+ {
+ scoped_lock lck( replSettings.discoveredSeeds_mx );
+ if( replSettings.discoveredSeeds.size() > 0 ) {
+ for (set<string>::iterator i = replSettings.discoveredSeeds.begin();
+ i != replSettings.discoveredSeeds.end();
+ i++) {
+ try {
+ configs.push_back( ReplSetConfig(HostAndPort(*i)) );
+ }
+ catch( DBException& ) {
+ log(1) << "replSet exception trying to load config from discovered seed " << *i << rsLog;
+ replSettings.discoveredSeeds.erase(*i);
+ }
+ }
+ }
+ }
+
+ if (!replSettings.reconfig.isEmpty()) {
+ try {
+ configs.push_back(ReplSetConfig(replSettings.reconfig, true));
+ }
+ catch( DBException& re) {
+ log() << "replSet couldn't load reconfig: " << re.what() << rsLog;
+ replSettings.reconfig = BSONObj();
+ }
+ }
+
+ int nok = 0;
+ int nempty = 0;
+ for( vector<ReplSetConfig>::iterator i = configs.begin(); i != configs.end(); i++ ) {
+ if( i->ok() )
+ nok++;
+ if( i->empty() )
+ nempty++;
+ }
+ if( nok == 0 ) {
+
+ if( nempty == (int) configs.size() ) {
+ startupStatus = EMPTYCONFIG;
+ startupStatusMsg.set("can't get " + rsConfigNs + " config from self or any seed (EMPTYCONFIG)");
+ log() << "replSet can't get " << rsConfigNs << " config from self or any seed (EMPTYCONFIG)" << rsLog;
+ static unsigned once;
+ if( ++once == 1 ) {
+ log() << "replSet info you may need to run replSetInitiate -- rs.initiate() in the shell -- if that is not already done" << rsLog;
+ }
+ if( _seeds->size() == 0 ) {
+ LOG(1) << "replSet info no seed hosts were specified on the --replSet command line" << rsLog;
+ }
+ }
+ else {
+ startupStatus = EMPTYUNREACHABLE;
+ startupStatusMsg.set("can't currently get " + rsConfigNs + " config from self or any seed (EMPTYUNREACHABLE)");
+ log() << "replSet can't get " << rsConfigNs << " config from self or any seed (yet)" << rsLog;
+ }
+
+ sleepsecs(10);
+ continue;
+ }
+
+ if( !_loadConfigFinish(configs) ) {
+ log() << "replSet info Couldn't load config yet. Sleeping 20sec and will try again." << rsLog;
+ sleepsecs(20);
+ continue;
+ }
+ }
+ catch(DBException& e) {
+ startupStatus = BADCONFIG;
+ startupStatusMsg.set("replSet error loading set config (BADCONFIG)");
+ log() << "replSet error loading configurations " << e.toString() << rsLog;
+ log() << "replSet error replication will not start" << rsLog;
+ sethbmsg("error loading set config");
+ _fatal();
+ throw;
+ }
+ break;
+ }
+ startupStatusMsg.set("? started");
+ startupStatus = STARTED;
+ }
+
+ void ReplSetImpl::_fatal() {
+ box.set(MemberState::RS_FATAL, 0);
+ log() << "replSet error fatal, stopping replication" << rsLog;
+ }
+
+ void ReplSet::haveNewConfig(ReplSetConfig& newConfig, bool addComment) {
+ bo comment;
+ if( addComment )
+ comment = BSON( "msg" << "Reconfig set" << "version" << newConfig.version );
+
+ newConfig.saveConfigLocally(comment);
+
+ try {
+ if (initFromConfig(newConfig, true)) {
+ log() << "replSet replSetReconfig new config saved locally" << rsLog;
+ }
+ }
+ catch(DBException& e) {
+ if( e.getCode() == 13497 /* removed from set */ ) {
+ cc().shutdown();
+ dbexit( EXIT_CLEAN , "removed from replica set" ); // never returns
+ assert(0);
+ }
+ log() << "replSet error unexpected exception in haveNewConfig() : " << e.toString() << rsLog;
+ _fatal();
+ }
+ catch(...) {
+ log() << "replSet error unexpected exception in haveNewConfig()" << rsLog;
+ _fatal();
+ }
+ }
+
+ void Manager::msgReceivedNewConfig(BSONObj o) {
+ log() << "replset msgReceivedNewConfig version: " << o["version"].toString() << rsLog;
+ ReplSetConfig c(o);
+ if( c.version > rs->config().version )
+ theReplSet->haveNewConfig(c, false);
+ else {
+ log() << "replSet info msgReceivedNewConfig but version isn't higher " <<
+ c.version << ' ' << rs->config().version << rsLog;
+ }
+ }
+
+ /* forked as a thread during startup
+ it can run quite a while looking for config. but once found,
+ a separate thread takes over as ReplSetImpl::Manager, and this thread
+ terminates.
+ */
+ void startReplSets(ReplSetCmdline *replSetCmdline) {
+ Client::initThread("rsStart");
+ try {
+ assert( theReplSet == 0 );
+ if( replSetCmdline == 0 ) {
+ assert(!replSet);
+ return;
+ }
+ replLocalAuth();
+ (theReplSet = new ReplSet(*replSetCmdline))->go();
+ }
+ catch(std::exception& e) {
+ log() << "replSet caught exception in startReplSets thread: " << e.what() << rsLog;
+ if( theReplSet )
+ theReplSet->fatal();
+ }
+ cc().shutdown();
+ }
+
+ void replLocalAuth() {
+ if ( noauth )
+ return;
+ cc().getAuthenticationInfo()->authorize("local","_repl");
+ }
+
+
+}
+
+namespace boost {
+
+ void assertion_failed(char const * expr, char const * function, char const * file, long line) {
+ mongo::log() << "boost assertion failure " << expr << ' ' << function << ' ' << file << ' ' << line << endl;
+ }
+
+}
diff --git a/src/mongo/db/repl/rs.h b/src/mongo/db/repl/rs.h
new file mode 100644
index 00000000000..8e43204be3b
--- /dev/null
+++ b/src/mongo/db/repl/rs.h
@@ -0,0 +1,667 @@
+// @file rs.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../../util/concurrency/list.h"
+#include "../../util/concurrency/value.h"
+#include "../../util/concurrency/msg.h"
+#include "../../util/net/hostandport.h"
+#include "../commands.h"
+#include "../oplog.h"
+#include "../oplogreader.h"
+#include "rs_exception.h"
+#include "rs_optime.h"
+#include "rs_member.h"
+#include "rs_config.h"
+
+/**
+ * Order of Events
+ *
+ * On startup, if the --replSet option is present, startReplSets is called.
+ * startReplSets forks off a new thread for replica set activities. It creates
+ * the global theReplSet variable and calls go() on it.
+ *
+ * theReplSet's constructor changes the replica set's state to RS_STARTUP,
+ * starts the replica set manager, and loads the config (if the replica set
+ * has been initialized).
+ */
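+
+/*
+ * A sketch of the startup call chain described above (illustrative only;
+ * the names are the ones used in this file):
+ *
+ *   startReplSets(replSetCmdline);                 // forked "rsStart" thread
+ *       theReplSet = new ReplSet(*replSetCmdline); // state -> RS_STARTUP, config load
+ *       theReplSet->go();                          // launches the set's threads;
+ *                                                  // the Manager takes over from here
+ */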
+
+namespace mongo {
+
+ struct HowToFixUp;
+ struct Target;
+ class DBClientConnection;
+ class ReplSetImpl;
+ class OplogReader;
+ extern bool replSet; // true if using repl sets
+ extern class ReplSet *theReplSet; // null until initialized
+ extern Tee *rsLog;
+
+ /* member of a replica set */
+ class Member : public List1<Member>::Base {
+ private:
+ ~Member(); // intentionally unimplemented as should never be called -- see List1<>::Base.
+ Member(const Member&);
+ public:
+ Member(HostAndPort h, unsigned ord, ReplSetConfig::MemberCfg *c, bool self);
+
+ string fullName() const { return h().toString(); }
+ const ReplSetConfig::MemberCfg& config() const { return _config; }
+ ReplSetConfig::MemberCfg& configw() { return _config; }
+ const HeartbeatInfo& hbinfo() const { return _hbinfo; }
+ HeartbeatInfo& get_hbinfo() { return _hbinfo; }
+ string lhb() const { return _hbinfo.lastHeartbeatMsg; }
+ MemberState state() const { return _hbinfo.hbstate; }
+ const HostAndPort& h() const { return _h; }
+ unsigned id() const { return _hbinfo.id(); }
+
+ bool potentiallyHot() const { return _config.potentiallyHot(); } // not arbiter, not priority 0
+ void summarizeMember(stringstream& s) const;
+
+ private:
+ friend class ReplSetImpl;
+ ReplSetConfig::MemberCfg _config;
+ const HostAndPort _h;
+ HeartbeatInfo _hbinfo;
+ };
+
+ namespace replset {
+ /**
+ * "Normal" replica set syncing
+ */
+ class SyncTail : public Sync {
+ public:
+ virtual ~SyncTail() {}
+ SyncTail(const string& host) : Sync(host) {}
+ virtual bool syncApply(const BSONObj &o);
+ };
+
+ /**
+ * Initial clone and sync
+ */
+ class InitialSync : public SyncTail {
+ public:
+ InitialSync(const string& host) : SyncTail(host) {}
+ virtual ~InitialSync() {}
+ bool oplogApplication(OplogReader& r, const Member* source, const OpTime& applyGTE, const OpTime& minValid);
+ virtual void applyOp(const BSONObj& o, const OpTime& minvalid);
+ };
+
+ // TODO: move hbmsg into an error-keeping class (SERVER-4444)
+ void sethbmsg(const string& s, const int logLevel=0);
+
+ } // namespace replset
+
+ class Manager : public task::Server {
+ ReplSetImpl *rs;
+ bool busyWithElectSelf;
+ int _primary;
+
+ /** @param two - set to true if two primaries were seen; this can happen
+ transiently, especially as our polling is only occasional. In that
+ case null is returned, and the caller should not assume the primary
+ role itself.
+ */
+ const Member* findOtherPrimary(bool& two);
+
+ void noteARemoteIsPrimary(const Member *);
+ void checkElectableSet();
+ void checkAuth();
+ virtual void starting();
+ public:
+ Manager(ReplSetImpl *rs);
+ virtual ~Manager();
+ void msgReceivedNewConfig(BSONObj);
+ void msgCheckNewState();
+ };
+
+ class GhostSync : public task::Server {
+ struct GhostSlave : boost::noncopyable {
+ GhostSlave() : last(0), slave(0), init(false) { }
+ OplogReader reader;
+ OpTime last;
+ Member* slave;
+ bool init;
+ };
+ /**
+ * This is a cache of ghost slaves
+ */
+ typedef map< mongo::OID,shared_ptr<GhostSlave> > MAP;
+ MAP _ghostCache;
+ RWLock _lock; // protects _ghostCache
+ ReplSetImpl *rs;
+ virtual void starting();
+ public:
+ GhostSync(ReplSetImpl *_rs) : task::Server("rsGhostSync"), _lock("GhostSync"), rs(_rs) {}
+ ~GhostSync() {
+ log() << "~GhostSync() called" << rsLog;
+ }
+
+ /**
+ * Replica sets can sync in a hierarchical fashion, which throws off w
+ * calculation on the master. percolate() faux-syncs from an upstream
+ * node so that the primary will know what the slaves are up to.
+ *
+ * We can't just directly sync to the primary because it could be
+ * unreachable, e.g., S1--->S2--->S3--->P. S2 should ghost sync from S3
+ * and S3 can ghost sync from the primary.
+ *
+ * Say we have an S1--->S2--->P situation and this node is S2. rid
+ * would refer to S1. S2 would create a ghost slave of S1 and connect
+ * it to P (_currentSyncTarget). Then it would use this connection to
+ * pretend to be S1, replicating off of P.
+ */
+ void percolate(const BSONObj& rid, const OpTime& last);
+ void associateSlave(const BSONObj& rid, const int memberId);
+ void updateSlave(const mongo::OID& id, const OpTime& last);
+ };
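+
+ /* Illustrative use of the above (a sketch, not an actual call site): in the
+ * S1--->S2--->P topology from the percolate() comment, when S2 learns S1's
+ * latest optime it would call something like
+ *
+ * theReplSet->ghost->percolate(ridOfS1, lastOptimeOfS1); // names hypothetical
+ *
+ * which advances the cached ghost slave's position against the primary so
+ * that w-based write concern accounting on P reflects S1's progress.
+ */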
+
+ struct Target;
+
+ class Consensus {
+ ReplSetImpl &rs;
+ struct LastYea {
+ LastYea() : when(0), who(0xffffffff) { }
+ time_t when;
+ unsigned who;
+ };
+ static SimpleMutex lyMutex;
+ Guarded<LastYea,lyMutex> ly;
+ unsigned yea(unsigned memberId); // throws VoteException
+ void electionFailed(unsigned meid);
+ void _electSelf();
+ bool weAreFreshest(bool& allUp, int& nTies);
+ bool sleptLast; // slept last elect() pass
+ public:
+ Consensus(ReplSetImpl *t) : rs(*t) {
+ sleptLast = false;
+ steppedDown = 0;
+ }
+
+ /* if we've stepped down, this is when we are allowed to try to elect ourself again.
+ todo: handle possible weirdnesses at clock skews etc.
+ */
+ time_t steppedDown;
+
+ int totalVotes() const;
+ bool aMajoritySeemsToBeUp() const;
+ bool shouldRelinquish() const;
+ void electSelf();
+ void electCmdReceived(BSONObj, BSONObjBuilder*);
+ void multiCommand(BSONObj cmd, list<Target>& L);
+ };
+
+ /**
+ * Most operations on a ReplSet object should be done while locked; that
+ * locking logic is implemented here.
+ *
+ * Order of locking: lock the replica set first, then take a rwlock.
+ */
+ class RSBase : boost::noncopyable {
+ public:
+ const unsigned magic;
+ void assertValid() { assert( magic == 0x12345677 ); }
+ private:
+ mongo::mutex m;
+ int _locked;
+ ThreadLocalValue<bool> _lockedByMe;
+ protected:
+ RSBase() : magic(0x12345677), m("RSBase"), _locked(0) { }
+ ~RSBase() {
+ /* this can happen if we throw in the constructor; otherwise never happens. thus we log it as it is quite unusual. */
+ log() << "replSet ~RSBase called" << rsLog;
+ }
+
+ public:
+ class lock {
+ RSBase& rsbase;
+ auto_ptr<scoped_lock> sl;
+ public:
+ lock(RSBase* b) : rsbase(*b) {
+ if( rsbase._lockedByMe.get() )
+ return; // recursive is ok...
+
+ sl.reset( new scoped_lock(rsbase.m) );
+ DEV assert(rsbase._locked == 0);
+ rsbase._locked++;
+ rsbase._lockedByMe.set(true);
+ }
+ ~lock() {
+ if( sl.get() ) {
+ assert( rsbase._lockedByMe.get() );
+ DEV assert(rsbase._locked == 1);
+ rsbase._lockedByMe.set(false);
+ rsbase._locked--;
+ }
+ }
+ };
+
+ /* for asserts */
+ bool locked() const { return _locked != 0; }
+
+ /* If true, the object is locked and was locked by this thread; if false, it could be
+ locked by another thread, or not locked at all. This exists just for asserts & such,
+ so we can make the contracts clear on who locks what when.
+ We don't use these locks that frequently, so the little bit of overhead is fine.
+ */
+ bool lockedByMe() { return _lockedByMe.get(); }
+ };
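+
+ /* Typical use of RSBase::lock (a sketch; exampleMutation is hypothetical):
+ *
+ * void ReplSetImpl::exampleMutation() {
+ * lock lk(this); // reentrant per thread via _lockedByMe
+ * // ... mutate set state; asserts may check lockedByMe() ...
+ * } // unlocks here if this thread took the mutex
+ */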
+
+ class ReplSetHealthPollTask;
+
+ /* safe container for our state; keeps the member pointer and state variables consistent with each other */
+ class StateBox : boost::noncopyable {
+ public:
+ struct SP { // SP is like pair<MemberState,const Member *> but nicer
+ SP() : state(MemberState::RS_STARTUP), primary(0) { }
+ MemberState state;
+ const Member *primary;
+ };
+ const SP get() {
+ rwlock lk(m, false);
+ return sp;
+ }
+ MemberState getState() const {
+ rwlock lk(m, false);
+ return sp.state;
+ }
+ const Member* getPrimary() const {
+ rwlock lk(m, false);
+ return sp.primary;
+ }
+ void change(MemberState s, const Member *self) {
+ rwlock lk(m, true);
+ if( sp.state != s ) {
+ log() << "replSet " << s.toString() << rsLog;
+ }
+ sp.state = s;
+ if( s.primary() ) {
+ sp.primary = self;
+ }
+ else {
+ if( self == sp.primary )
+ sp.primary = 0;
+ }
+ }
+ void set(MemberState s, const Member *p) {
+ rwlock lk(m, true);
+ sp.state = s;
+ sp.primary = p;
+ }
+ void setSelfPrimary(const Member *self) { change(MemberState::RS_PRIMARY, self); }
+ void setOtherPrimary(const Member *mem) {
+ rwlock lk(m, true);
+ assert( !sp.state.primary() );
+ sp.primary = mem;
+ }
+ void noteRemoteIsPrimary(const Member *remote) {
+ rwlock lk(m, true);
+ if( !sp.state.secondary() && !sp.state.fatal() )
+ sp.state = MemberState::RS_RECOVERING;
+ sp.primary = remote;
+ }
+ StateBox() : m("StateBox") { }
+ private:
+ RWLock m;
+ SP sp;
+ };
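+
+ /* Semantics of StateBox::change, as implemented above: a transition is
+ * logged only when the state actually changes; moving to a primary state
+ * records the given member as primary, while moving to a non-primary state
+ * clears sp.primary only if the given member was the recorded primary.
+ */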
+
+ void parseReplsetCmdLine(string cfgString, string& setname, vector<HostAndPort>& seeds, set<HostAndPort>& seedSet );
+
+ /** Parameter given to the --replSet command line option (parsed).
+ Syntax is "<setname>/<seedhost1>,<seedhost2>"
+ where setname is a name and seedhost is "<host>[:<port>]" */
+ class ReplSetCmdline {
+ public:
+ ReplSetCmdline(string cfgString) { parseReplsetCmdLine(cfgString, setname, seeds, seedSet); }
+ string setname;
+ vector<HostAndPort> seeds;
+ set<HostAndPort> seedSet;
+ };
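+
+ /* Example (illustrative host names): --replSet "rs0/alice:27017,bob:27018"
+ * yields setname == "rs0" and seeds containing alice:27017 and bob:27018.
+ */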
+
+ /* information about the entire repl set, such as the various servers in the set, and their state */
+ /* note: We currently do not free mem when the set goes away - it is assumed the replset is a
+ singleton and long lived.
+ */
+ class ReplSetImpl : protected RSBase {
+ public:
+ /** info on our state if the replset isn't yet "up". for example, if we are pre-initiation. */
+ enum StartupStatus {
+ PRESTART=0, LOADINGCONFIG=1, BADCONFIG=2, EMPTYCONFIG=3,
+ EMPTYUNREACHABLE=4, STARTED=5, SOON=6
+ };
+ static StartupStatus startupStatus;
+ static DiagStr startupStatusMsg;
+ static string stateAsHtml(MemberState state);
+
+ /* todo thread */
+ void msgUpdateHBInfo(HeartbeatInfo);
+
+ StateBox box;
+
+ OpTime lastOpTimeWritten;
+ long long lastH; // hash we use to make sure we are reading the right flow of ops and aren't on an out-of-date "fork"
+ private:
+ set<ReplSetHealthPollTask*> healthTasks;
+ void endOldHealthTasks();
+ void startHealthTaskFor(Member *m);
+
+ Consensus elect;
+ void relinquish();
+ void forgetPrimary();
+ protected:
+ bool _stepDown(int secs);
+ bool _freeze(int secs);
+ private:
+ void assumePrimary();
+ void loadLastOpTimeWritten(bool quiet=false);
+ void changeState(MemberState s);
+
+ /**
+ * Find the closest member (using ping time) with a higher latest optime.
+ */
+ Member* getMemberToSyncTo();
+ void veto(const string& host, unsigned secs=10);
+ Member* _currentSyncTarget;
+
+ bool _blockSync;
+ void blockSync(bool block);
+
+ // set of electable members' _ids
+ set<unsigned> _electableSet;
+ protected:
+ // "heartbeat message"
+ // sent in requestHeartbeat respond in field "hbm"
+ char _hbmsg[256]; // we change this unlocked, thus not a std::string
+ time_t _hbmsgTime; // when it was logged
+ public:
+ void sethbmsg(string s, int logLevel = 0);
+
+ /**
+ * Election with Priorities
+ *
+ * Each node (n) keeps a set of nodes that could be elected primary.
+ * Each node in this set:
+ *
+ * 1. can connect to a majority of the set
+ * 2. has a priority greater than 0
+ * 3. has an optime within 10 seconds of the most up-to-date node
+ * that n can reach
+ *
+ * If a node fails to meet one or more of these criteria, it is removed
+ * from the list. This list is updated whenever the node receives a
+ * heartbeat.
+ *
+ * When a node sends an "am I freshest?" query, the node receiving the
+ * query checks its electable list to make sure that no one else is
+ * electable AND higher priority. If this check passes, the node will
+ * return an "ok" response; if not, it will veto.
+ *
+ * If a node is primary and there is another node with higher priority
+ * on the electable list (i.e., it must be synced to within 10 seconds
+ * of the current primary), the node (or nodes) with connections to both
+ * the primary and the secondary with higher priority will issue
+ * replSetStepDown requests to the primary to allow the higher-priority
+ * node to take over.
+ */
+ void addToElectable(const unsigned m) { lock lk(this); _electableSet.insert(m); }
+ void rmFromElectable(const unsigned m) { lock lk(this); _electableSet.erase(m); }
+ bool iAmElectable() { lock lk(this); return _electableSet.find(_self->id()) != _electableSet.end(); }
+ bool isElectable(const unsigned id) { lock lk(this); return _electableSet.find(id) != _electableSet.end(); }
+ Member* getMostElectable();
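+
+ /* Worked example of the rules above (hypothetical set): A (priority 2)
+ * and B (priority 1) are both in the electable set while each can reach
+ * a majority and is within 10 seconds of the freshest reachable node.
+ * If B is currently primary, members that can see both A and B will ask
+ * B to step down (replSetStepDown) so that A can be elected.
+ */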
+ protected:
+ /**
+ * Load a new config as the replica set's main config.
+ *
+ * If there is a "simple" change (just adding a node), this shortcuts
+ * the config. Returns true if the config was changed. Returns false
+ * if the config doesn't include a this node. Throws an exception if
+ * something goes very wrong.
+ *
+ * Behavior to note:
+ * - locks this
+ * - intentionally leaks the old _cfg and any old _members (if the
+ * change isn't strictly additive)
+ */
+ bool initFromConfig(ReplSetConfig& c, bool reconf=false);
+ void _fillIsMaster(BSONObjBuilder&);
+ void _fillIsMasterHost(const Member*, vector<string>&, vector<string>&, vector<string>&);
+ const ReplSetConfig& config() { return *_cfg; }
+ string name() const { return _name; } /* @return replica set's logical name */
+ MemberState state() const { return box.getState(); }
+ void _fatal();
+ void _getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const;
+ void _summarizeAsHtml(stringstream&) const;
+ void _summarizeStatus(BSONObjBuilder&) const; // for replSetGetStatus command
+
+ /* throws exception if a problem initializing. */
+ ReplSetImpl(ReplSetCmdline&);
+
+ /* call after constructing to start - returns fairly quickly after launching its threads */
+ void _go();
+
+ private:
+ string _name;
+ const vector<HostAndPort> *_seeds;
+ ReplSetConfig *_cfg;
+
+ /**
+ * Finds the configuration with the highest version number and attempts
+ * to load it.
+ */
+ bool _loadConfigFinish(vector<ReplSetConfig>& v);
+ /**
+ * Gather all possible configs (from command line seeds, our own config
+ * doc, and any hosts listed therein) and try to initiate from the most
+ * recent config we find.
+ */
+ void loadConfig();
+
+ list<HostAndPort> memberHostnames() const;
+ const ReplSetConfig::MemberCfg& myConfig() const { return _config; }
+ bool iAmArbiterOnly() const { return myConfig().arbiterOnly; }
+ bool iAmPotentiallyHot() const {
+ return myConfig().potentiallyHot() && // not an arbiter
+ elect.steppedDown <= time(0) && // not stepped down/frozen
+ state() == MemberState::RS_SECONDARY; // not stale
+ }
+ protected:
+ Member *_self;
+ bool _buildIndexes; // = _self->config().buildIndexes
+ void setSelfTo(Member *); // use this as it sets buildIndexes var
+ private:
+ List1<Member> _members; // all members of the set EXCEPT _self.
+ ReplSetConfig::MemberCfg _config; // config of _self
+ unsigned _id; // _id of _self
+
+ int _maintenanceMode; // if we should stay in recovering state
+ public:
+ // this is called from within a writelock in logOpRS
+ unsigned selfId() const { return _id; }
+ Manager *mgr;
+ GhostSync *ghost;
+ /**
+ * This forces a secondary to go into recovering state and stay there
+ * until this is called again, passing in "false". Multiple threads can
+ * call this and it will leave maintenance mode once all of the callers
+ * have called it again, passing in false.
+ */
+ void setMaintenanceMode(const bool inc);
+ private:
+ Member* head() const { return _members.head(); }
+ public:
+ const Member* findById(unsigned id) const;
+ private:
+ void _getTargets(list<Target>&, int &configVersion);
+ void getTargets(list<Target>&, int &configVersion);
+ void startThreads();
+ friend class FeedbackThread;
+ friend class CmdReplSetElect;
+ friend class Member;
+ friend class Manager;
+ friend class GhostSync;
+ friend class Consensus;
+
+ private:
+ bool initialSyncOplogApplication(const OpTime& applyGTE, const OpTime& minValid);
+ void _syncDoInitialSync();
+ void syncDoInitialSync();
+ void _syncThread();
+ bool tryToGoLiveAsASecondary(OpTime&); // readlocks
+ void syncTail();
+ unsigned _syncRollback(OplogReader& r);
+ void syncRollback(OplogReader& r);
+ void syncFixUp(HowToFixUp& h, OplogReader& r);
+
+ // get an oplog reader for a server with an oplog entry timestamp greater
+ // than or equal to minTS, if set.
+ Member* _getOplogReader(OplogReader& r, const OpTime& minTS);
+
+ // check lastOpTimeWritten against the remote's earliest op, filling in
+ // remoteOldestOp.
+ bool _isStale(OplogReader& r, const OpTime& minTS, BSONObj& remoteOldestOp);
+
+ // keep a list of hosts that we've tried recently that didn't work
+ map<string,time_t> _veto;
+ public:
+ void syncThread();
+ const OpTime lastOtherOpTime() const;
+ };
+
+ class ReplSet : public ReplSetImpl {
+ public:
+ ReplSet(ReplSetCmdline& replSetCmdline) : ReplSetImpl(replSetCmdline) { }
+
+ // for the replSetStepDown command
+ bool stepDown(int secs) { return _stepDown(secs); }
+
+ // for the replSetFreeze command
+ bool freeze(int secs) { return _freeze(secs); }
+
+ string selfFullName() {
+ assert( _self );
+ return _self->fullName();
+ }
+
+ bool buildIndexes() const { return _buildIndexes; }
+
+ /* call after constructing to start - returns fairly quickly after launching its threads */
+ void go() { _go(); }
+
+ void fatal() { _fatal(); }
+ bool isPrimary() { return box.getState().primary(); }
+ bool isSecondary() { return box.getState().secondary(); }
+ MemberState state() const { return ReplSetImpl::state(); }
+ string name() const { return ReplSetImpl::name(); }
+ const ReplSetConfig& config() { return ReplSetImpl::config(); }
+ void getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const { _getOplogDiagsAsHtml(server_id,ss); }
+ void summarizeAsHtml(stringstream& ss) const { _summarizeAsHtml(ss); }
+ void summarizeStatus(BSONObjBuilder& b) const { _summarizeStatus(b); }
+ void fillIsMaster(BSONObjBuilder& b) { _fillIsMaster(b); }
+
+ /**
+ * We have a new config (reconfig) - apply it.
+ * @param comment if true, write a no-op comment about the reconfig to
+ * the oplog; only makes sense if we are primary and initiating
+ * the reconfig.
+ *
+ * The slaves are updated when they get a heartbeat indicating the new
+ * config.
+ */
+ void haveNewConfig(ReplSetConfig& c, bool comment);
+
+ /**
+ * Pointer assignment isn't necessarily atomic, so this needs to ensure
+ * locking, even though we don't delete old configs.
+ */
+ const ReplSetConfig& getConfig() { return config(); }
+
+ bool lockedByMe() { return RSBase::lockedByMe(); }
+
+ // heartbeat msg to send to others; descriptive diagnostic info
+ string hbmsg() const {
+ if( time(0)-_hbmsgTime > 120 ) return "";
+ return _hbmsg;
+ }
+ };
+
+ /**
+ * Base class for repl set commands. Checks basic things such if we're in
+ * rs mode before the command does its real work.
+ */
+ class ReplSetCommand : public Command {
+ protected:
+ ReplSetCommand(const char * s, bool show=false) : Command(s, show) { }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool logTheOp() { return false; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const { help << "internal"; }
+
+ /**
+ * Some replica set commands call this and then call check(). This is
+ * intentional, as they might do things before theReplSet is initialized
+ * that still need to be checked for auth.
+ */
+ bool checkAuth(string& errmsg, BSONObjBuilder& result) {
+ if( !noauth ) {
+ AuthenticationInfo *ai = cc().getAuthenticationInfo();
+ if (!ai->isAuthorizedForLock("admin", locktype())) {
+ errmsg = "replSet command unauthorized";
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool check(string& errmsg, BSONObjBuilder& result) {
+ if( !replSet ) {
+ errmsg = "not running with --replSet";
+ if( cmdLine.configsvr ) {
+ result.append("info", "configsvr"); // for shell prompt
+ }
+ return false;
+ }
+
+ if( theReplSet == 0 ) {
+ result.append("startupStatus", ReplSet::startupStatus);
+ string s;
+ errmsg = ReplSet::startupStatusMsg.empty() ? "replset unknown error 2" : ReplSet::startupStatusMsg.get();
+ if( ReplSet::startupStatus == ReplSet::EMPTYCONFIG )
+ result.append("info", "run rs.initiate(...) if not yet done for the set");
+ return false;
+ }
+
+ return checkAuth(errmsg, result);
+ }
+ };
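+
+ /* A minimal sketch of the checkAuth()-then-check() pattern described above
+ * (CmdReplSetExample is hypothetical, and run()'s signature is assumed to
+ * match Command's in this tree):
+ *
+ * class CmdReplSetExample : public ReplSetCommand {
+ * public:
+ * CmdReplSetExample() : ReplSetCommand("replSetExample") { }
+ * virtual bool run(const string&, BSONObj&, int, string& errmsg,
+ * BSONObjBuilder& result, bool) {
+ * if( !check(errmsg, result) ) // calls checkAuth() internally
+ * return false;
+ * result.append("set", theReplSet->name());
+ * return true;
+ * }
+ * };
+ */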
+
+ /**
+ * does local authentication
+ * directly authorizes against AuthenticationInfo
+ */
+ void replLocalAuth();
+
+ /** inlines ----------------- */
+
+ inline Member::Member(HostAndPort h, unsigned ord, ReplSetConfig::MemberCfg *c, bool self) :
+ _config(*c), _h(h), _hbinfo(ord) {
+ assert(c);
+ if( self )
+ _hbinfo.health = 1.0;
+ }
+
+}
diff --git a/src/mongo/db/repl/rs_config.cpp b/src/mongo/db/repl/rs_config.cpp
new file mode 100644
index 00000000000..22137773aec
--- /dev/null
+++ b/src/mongo/db/repl/rs_config.cpp
@@ -0,0 +1,662 @@
+// rs_config.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "rs.h"
+#include "../../client/dbclient.h"
+#include "../../client/syncclusterconnection.h"
+#include "../../util/net/hostandport.h"
+#include "../dbhelpers.h"
+#include "connections.h"
+#include "../oplog.h"
+#include "../instance.h"
+#include "../../util/text.h"
+#include <boost/algorithm/string.hpp>
+
+using namespace bson;
+
+namespace mongo {
+
+ void logOpInitiate(const bo&);
+
+ void assertOnlyHas(BSONObj o, const set<string>& fields) {
+ BSONObj::iterator i(o);
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if( !fields.count( e.fieldName() ) ) {
+ uasserted(13434, str::stream() << "unexpected field '" << e.fieldName() << "' in object");
+ }
+ }
+ }
+
+ list<HostAndPort> ReplSetConfig::otherMemberHostnames() const {
+ list<HostAndPort> L;
+ for( vector<MemberCfg>::const_iterator i = members.begin(); i != members.end(); i++ ) {
+ if( !i->h.isSelf() )
+ L.push_back(i->h);
+ }
+ return L;
+ }
+
+ /* comment MUST only be set when initiating the set by the initiator */
+ void ReplSetConfig::saveConfigLocally(bo comment) {
+ checkRsConfig();
+ log() << "replSet info saving a newer config version to local.system.replset" << rsLog;
+ {
+ writelock lk("");
+ Client::Context cx( rsConfigNs );
+ cx.db()->flushFiles(true);
+
+ //theReplSet->lastOpTimeWritten = ??;
+ //rather than above, do a logOp()? probably
+ BSONObj o = asBson();
+ Helpers::putSingletonGod(rsConfigNs.c_str(), o, false/*logOp=false; local db so would work regardless...*/);
+ if( !comment.isEmpty() && (!theReplSet || theReplSet->isPrimary()) )
+ logOpInitiate(comment);
+
+ cx.db()->flushFiles(true);
+ }
+ log() << "replSet saveConfigLocally done" << rsLog;
+ }
+
+ bo ReplSetConfig::MemberCfg::asBson() const {
+ bob b;
+ b << "_id" << _id;
+ b.append("host", h.dynString());
+ if( votes != 1 ) b << "votes" << votes;
+ if( priority != 1.0 ) b << "priority" << priority;
+ if( arbiterOnly ) b << "arbiterOnly" << true;
+ if( slaveDelay ) b << "slaveDelay" << slaveDelay;
+ if( hidden ) b << "hidden" << hidden;
+ if( !buildIndexes ) b << "buildIndexes" << buildIndexes;
+ if( !tags.empty() ) {
+ BSONObjBuilder a;
+ for( map<string,string>::const_iterator i = tags.begin(); i != tags.end(); i++ )
+ a.append((*i).first, (*i).second);
+ b.append("tags", a.done());
+ }
+ return b.obj();
+ }
+
+ void ReplSetConfig::updateMembers(List1<Member> &dest) {
+ for (vector<MemberCfg>::iterator source = members.begin(); source < members.end(); source++) {
+ for( Member *d = dest.head(); d; d = d->next() ) {
+ if (d->fullName() == (*source).h.toString()) {
+ d->configw().groupsw() = (*source).groups();
+ }
+ }
+ }
+ }
+
+ bo ReplSetConfig::asBson() const {
+ bob b;
+ b.append("_id", _id).append("version", version);
+
+ BSONArrayBuilder a;
+ for( unsigned i = 0; i < members.size(); i++ )
+ a.append( members[i].asBson() );
+ b.append("members", a.arr());
+
+ if( !ho.isDefault() || !getLastErrorDefaults.isEmpty() || !rules.empty()) {
+ bob settings;
+ if( !rules.empty() ) {
+ bob modes;
+ for (map<string,TagRule*>::const_iterator it = rules.begin(); it != rules.end(); it++) {
+ bob clauses;
+ vector<TagClause*> r = (*it).second->clauses;
+ for (vector<TagClause*>::iterator it2 = r.begin(); it2 < r.end(); it2++) {
+ clauses << (*it2)->name << (*it2)->target;
+ }
+ modes << (*it).first << clauses.obj();
+ }
+ settings << "getLastErrorModes" << modes.obj();
+ }
+ if( !getLastErrorDefaults.isEmpty() )
+ settings << "getLastErrorDefaults" << getLastErrorDefaults;
+ b << "settings" << settings.obj();
+ }
+
+ return b.obj();
+ }
+
+ static inline void mchk(bool expr) {
+ uassert(13126, "bad Member config", expr);
+ }
+
+ void ReplSetConfig::MemberCfg::check() const {
+ mchk(_id >= 0 && _id <= 255);
+ mchk(priority >= 0 && priority <= 1000);
+ mchk(votes <= 100); // votes >= 0 because it is unsigned
+ uassert(13419, "priorities must be between 0.0 and 100.0", priority >= 0.0 && priority <= 100.0);
+ uassert(13437, "slaveDelay requires priority be zero", slaveDelay == 0 || priority == 0);
+ uassert(13438, "bad slaveDelay value", slaveDelay >= 0 && slaveDelay <= 3600 * 24 * 366);
+ uassert(13439, "priority must be 0 when hidden=true", priority == 0 || !hidden);
+ uassert(13477, "priority must be 0 when buildIndexes=false", buildIndexes || priority == 0);
+ }
+/*
+ string ReplSetConfig::TagSubgroup::toString() const {
+ bool first = true;
+ string result = "\""+name+"\": [";
+ for (set<const MemberCfg*>::const_iterator i = m.begin(); i != m.end(); i++) {
+ if (!first) {
+ result += ", ";
+ }
+ first = false;
+ result += (*i)->h.toString();
+ }
+ return result+"]";
+ }
+ */
+ string ReplSetConfig::TagClause::toString() const {
+ string result = name+": {";
+ for (map<string,TagSubgroup*>::const_iterator i = subgroups.begin(); i != subgroups.end(); i++) {
+//TEMP? result += (*i).second->toString()+", ";
+ }
+ result += "TagClause toString TEMPORARILY DISABLED";
+ return result + "}";
+ }
+
+ string ReplSetConfig::TagRule::toString() const {
+ string result = "{";
+ for (vector<TagClause*>::const_iterator it = clauses.begin(); it < clauses.end(); it++) {
+ result += ((TagClause*)(*it))->toString()+",";
+ }
+ return result+"}";
+ }
+
+ void ReplSetConfig::TagSubgroup::updateLast(const OpTime& op) {
+ RACECHECK
+ if (last < op) {
+ last = op;
+
+ for (vector<TagClause*>::iterator it = clauses.begin(); it < clauses.end(); it++) {
+ (*it)->updateLast(op);
+ }
+ }
+ }
+
+ void ReplSetConfig::TagClause::updateLast(const OpTime& op) {
+ RACECHECK
+ if (last >= op) {
+ return;
+ }
+
+ // check at least n subgroups greater than clause.last
+ int count = 0;
+ map<string,TagSubgroup*>::iterator it;
+ for (it = subgroups.begin(); it != subgroups.end(); it++) {
+ if ((*it).second->last >= op) {
+ count++;
+ }
+ }
+
+ if (count >= actualTarget) {
+ last = op;
+ rule->updateLast(op);
+ }
+ }
+
+ void ReplSetConfig::TagRule::updateLast(const OpTime& op) {
+ OpTime *earliest = (OpTime*)&op;
+ vector<TagClause*>::iterator it;
+
+ for (it = clauses.begin(); it < clauses.end(); it++) {
+ if ((*it)->last < *earliest) {
+ earliest = &(*it)->last;
+ }
+ }
+
+ // rules are simply AND-ed clauses, so the rule is only as far along
+ // as its most-behind clause
+ last = *earliest;
+ }
+
+ /** @param o old config
+ @param n new config
+ */
+ /*static*/
+ bool ReplSetConfig::legalChange(const ReplSetConfig& o, const ReplSetConfig& n, string& errmsg) {
+ assert( theReplSet );
+
+ if( o._id != n._id ) {
+ errmsg = "set name may not change";
+ return false;
+ }
+ /* TODO : wonder if we need to allow o.version < n.version only, which is more lenient.
+ if someone had some intermediate config this node doesnt have, that could be
+ necessary. but then how did we become primary? so perhaps we are fine as-is.
+ */
+ if( o.version >= n.version ) {
+ errmsg = str::stream() << "version number must increase, old: "
+ << o.version << " new: " << n.version;
+ return false;
+ }
+
+ map<HostAndPort,const ReplSetConfig::MemberCfg*> old;
+ bool isLocalHost = false;
+ for( vector<ReplSetConfig::MemberCfg>::const_iterator i = o.members.begin(); i != o.members.end(); i++ ) {
+ if (i->h.isLocalHost()) {
+ isLocalHost = true;
+ }
+ old[i->h] = &(*i);
+ }
+ int me = 0;
+ for( vector<ReplSetConfig::MemberCfg>::const_iterator i = n.members.begin(); i != n.members.end(); i++ ) {
+ const ReplSetConfig::MemberCfg& m = *i;
+ if ( (isLocalHost && !m.h.isLocalHost()) || (!isLocalHost && m.h.isLocalHost())) {
+ log() << "reconfig error, cannot switch between localhost and hostnames: "
+ << m.h.toString() << rsLog;
+ uasserted(13645, "hosts cannot switch between localhost and hostname");
+ }
+ if( old.count(m.h) ) {
+ const ReplSetConfig::MemberCfg& oldCfg = *old[m.h];
+ if( oldCfg._id != m._id ) {
+ log() << "replSet reconfig error with member: " << m.h.toString() << rsLog;
+ uasserted(13432, "_id may not change for members");
+ }
+ if( oldCfg.buildIndexes != m.buildIndexes ) {
+ log() << "replSet reconfig error with member: " << m.h.toString() << rsLog;
+ uasserted(13476, "buildIndexes may not change for members");
+ }
+ /* are transitions to and from arbiterOnly guaranteed safe? if not, we should disallow here.
+ there is a test at replsets/replsetarb3.js */
+ if( oldCfg.arbiterOnly != m.arbiterOnly ) {
+ log() << "replSet reconfig error with member: " << m.h.toString() << " arbiterOnly cannot change. remove and readd the member instead " << rsLog;
+ uasserted(13510, "arbiterOnly may not change for members");
+ }
+ }
+ if( m.h.isSelf() )
+ me++;
+ }
+
+ uassert(13433, "can't find self in new replset config", me == 1);
+
+ return true;
+ }
+
+ void ReplSetConfig::clear() {
+ version = -5;
+ _ok = false;
+ }
+
+ void ReplSetConfig::setMajority() {
+ int total = members.size();
+ int nonArbiters = total;
+ int strictMajority = total/2+1;
+
+ for (vector<MemberCfg>::iterator it = members.begin(); it < members.end(); it++) {
+ if ((*it).arbiterOnly) {
+ nonArbiters--;
+ }
+ }
+
+ // majority should be all "normal" members if we have something like 4
+ // arbiters & 3 normal members
+ _majority = (strictMajority > nonArbiters) ? nonArbiters : strictMajority;
+ }
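+
+ /* Worked example of the cap above: with 7 members of which 4 are arbiters,
+ * strictMajority = 7/2+1 = 4 but only 3 members can acknowledge writes, so
+ * _majority is capped at nonArbiters = 3. */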
+
+ int ReplSetConfig::getMajority() const {
+ return _majority;
+ }
+
+ void ReplSetConfig::checkRsConfig() const {
+ uassert(13132,
+ str::stream() << "nonmatching repl set name in _id field: " << _id << " vs. " << cmdLine.ourSetName(),
+ _id == cmdLine.ourSetName());
+ uassert(13308, "replSet bad config version #", version > 0);
+ uassert(13133, "replSet bad config no members", members.size() >= 1);
+ uassert(13309, "replSet bad config maximum number of members is 12", members.size() <= 12);
+ {
+ unsigned voters = 0;
+ for( vector<MemberCfg>::const_iterator i = members.begin(); i != members.end(); ++i ) {
+ if( i->votes )
+ voters++;
+ }
+ uassert(13612, "replSet bad config maximum number of voting members is 7", voters <= 7);
+ uassert(13613, "replSet bad config no voting members", voters > 0);
+ }
+ }
+
+ void ReplSetConfig::_populateTagMap(map<string,TagClause> &tagMap) {
+ // create subgroups for each server corresponding to each of
+ // its tags. E.g.:
+ //
+ // A is tagged with {"server" : "A", "dc" : "ny"}
+ // B is tagged with {"server" : "B", "dc" : "ny"}
+ //
+ // At the end of this step, tagMap will contain:
+ //
+ // "server" => {"A" : [A], "B" : [B]}
+ // "dc" => {"ny" : [A,B]}
+
+ for (unsigned i=0; i<members.size(); i++) {
+ MemberCfg member = members[i];
+
+ for (map<string,string>::iterator tag = member.tags.begin(); tag != member.tags.end(); tag++) {
+ string label = (*tag).first;
+ string value = (*tag).second;
+
+ TagClause& clause = tagMap[label];
+ clause.name = label;
+
+ TagSubgroup* subgroup;
+ // search for "ny" in "dc"'s clause
+ if (clause.subgroups.find(value) == clause.subgroups.end()) {
+ clause.subgroups[value] = subgroup = new TagSubgroup(value);
+ }
+ else {
+ subgroup = clause.subgroups[value];
+ }
+
+ subgroup->m.insert(&members[i]);
+ }
+ }
+ }
+
+ void ReplSetConfig::parseRules(const BSONObj& modes) {
+ map<string,TagClause> tagMap;
+ _populateTagMap(tagMap);
+
+ for (BSONObj::iterator i = modes.begin(); i.more(); ) {
+ unsigned int primaryOnly = 0;
+
+ // ruleName : {dc : 2, m : 3}
+ BSONElement rule = i.next();
+ uassert(14046, "getLastErrorMode rules must be objects", rule.type() == mongo::Object);
+
+ TagRule* r = new TagRule();
+
+ BSONObj clauseObj = rule.Obj();
+ for (BSONObj::iterator c = clauseObj.begin(); c.more(); ) {
+ BSONElement clauseElem = c.next();
+ uassert(14829, "getLastErrorMode criteria must be numeric", clauseElem.isNumber());
+
+ // get the clause, e.g., "x.y" : 3
+ const char *criteria = clauseElem.fieldName();
+ int value = clauseElem.numberInt();
+ uassert(14828, str::stream() << "getLastErrorMode criteria must be greater than 0: " << clauseElem, value > 0);
+
+ TagClause* node = new TagClause(tagMap[criteria]);
+
+ int numGroups = node->subgroups.size();
+ uassert(14831, str::stream() << "mode " << clauseObj << " requires "
+ << value << " tagged with " << criteria << ", but only "
+ << numGroups << " with this tag were found", numGroups >= value);
+
+ node->name = criteria;
+ node->target = value;
+ // if any subgroups contain "me", we can decrease the target
+ node->actualTarget = node->target;
+
+ // then we want to add pointers between clause & subgroup
+ for (map<string,TagSubgroup*>::iterator sgs = node->subgroups.begin();
+ sgs != node->subgroups.end(); sgs++) {
+ bool foundMe = false;
+ (*sgs).second->clauses.push_back(node);
+
+ // if this subgroup contains the primary, it's automatically always up-to-date
+ for( set<MemberCfg*>::const_iterator cfg = (*sgs).second->m.begin();
+ cfg != (*sgs).second->m.end();
+ cfg++)
+ {
+ if ((*cfg)->h.isSelf()) {
+ node->actualTarget--;
+ foundMe = true;
+ }
+ }
+
+ for (set<MemberCfg *>::iterator cfg = (*sgs).second->m.begin();
+ !foundMe && cfg != (*sgs).second->m.end(); cfg++) {
+ (*cfg)->groupsw().insert((*sgs).second);
+ }
+ }
+
+ // if all of the members of this clause involve the primary, it's always up-to-date
+ if (node->actualTarget == 0) {
+ node->last = OpTime(INT_MAX, INT_MAX);
+ primaryOnly++;
+ }
+
+ // this is a valid clause, so we want to add it to its rule
+ node->rule = r;
+ r->clauses.push_back(node);
+ }
+
+ // if all of the clauses are satisfied by the primary, this rule is trivially true
+ if (primaryOnly == r->clauses.size()) {
+ r->last = OpTime(INT_MAX, INT_MAX);
+ }
+
+ // if we got here, this is a valid rule
+ LOG(1) << "replSet new rule " << rule.fieldName() << ": " << r->toString() << rsLog;
+ rules[rule.fieldName()] = r;
+ }
+ }
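+
+ /* Example (illustrative) of a getLastErrorModes document that parseRules
+ * accepts, given members tagged {"dc":"ny"} and {"dc":"sf"}:
+ *
+ * { MultiDC : { dc : 2 } }
+ *
+ * This builds one TagRule ("MultiDC") containing one TagClause (name "dc",
+ * target 2): w : "MultiDC" is satisfied once the write has reached servers
+ * in two distinct "dc" subgroups.
+ */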
+
+ void ReplSetConfig::from(BSONObj o) {
+ static const string legal[] = {"_id","version", "members","settings"};
+ static const set<string> legals(legal, legal + 4);
+ assertOnlyHas(o, legals);
+
+ md5 = o.md5();
+ _id = o["_id"].String();
+ if( o["version"].ok() ) {
+ version = o["version"].numberInt();
+ uassert(13115, "bad " + rsConfigNs + " config: version", version > 0);
+ }
+
+ set<string> hosts;
+ set<int> ords;
+ vector<BSONElement> members;
+ try {
+ members = o["members"].Array();
+ }
+ catch(...) {
+ uasserted(13131, "replSet error parsing (or missing) 'members' field in config object");
+ }
+
+ unsigned localhosts = 0;
+ for( unsigned i = 0; i < members.size(); i++ ) {
+ BSONObj mobj = members[i].Obj();
+ MemberCfg m;
+ try {
+ static const string legal[] = {
+ "_id","votes","priority","host", "hidden","slaveDelay",
+ "arbiterOnly","buildIndexes","tags","initialSync" // deprecated
+ };
+ static const set<string> legals(legal, legal + 10);
+ assertOnlyHas(mobj, legals);
+
+ try {
+ m._id = (int) mobj["_id"].Number();
+ }
+ catch(...) {
+ /* TODO: use of string exceptions may be problematic for reconfig case! */
+ throw "_id must be numeric";
+ }
+ try {
+ string s = mobj["host"].String();
+ boost::trim(s);
+ m.h = HostAndPort(s);
+ if ( !m.h.hasPort() ) {
+ // make port explicit even if default
+ m.h.setPort(m.h.port());
+ }
+ }
+ catch(...) {
+ throw string("bad or missing host field? ") + mobj.toString();
+ }
+ if( m.h.isLocalHost() )
+ localhosts++;
+ m.arbiterOnly = mobj["arbiterOnly"].trueValue();
+ m.slaveDelay = mobj["slaveDelay"].numberInt();
+ if( mobj.hasElement("hidden") )
+ m.hidden = mobj["hidden"].trueValue();
+ if( mobj.hasElement("buildIndexes") )
+ m.buildIndexes = mobj["buildIndexes"].trueValue();
+ if( mobj.hasElement("priority") )
+ m.priority = mobj["priority"].Number();
+ if( mobj.hasElement("votes") )
+ m.votes = (unsigned) mobj["votes"].Number();
+ if( mobj.hasElement("tags") ) {
+ const BSONObj &t = mobj["tags"].Obj();
+ for (BSONObj::iterator c = t.begin(); c.more(); c.next()) {
+ m.tags[(*c).fieldName()] = (*c).String();
+ }
+ uassert(14827, "arbiters cannot have tags", !m.arbiterOnly || m.tags.empty() );
+ }
+ m.check();
+ }
+ catch( const char * p ) {
+ log() << "replSet cfg parsing exception for members[" << i << "] " << p << rsLog;
+ stringstream ss;
+ ss << "replSet members[" << i << "] " << p;
+ uassert(13107, ss.str(), false);
+ }
+ catch(DBException& e) {
+ log() << "replSet cfg parsing exception for members[" << i << "] " << e.what() << rsLog;
+ stringstream ss;
+ ss << "bad config for member[" << i << "] " << e.what();
+ uassert(13135, ss.str(), false);
+ }
+ if( !(ords.count(m._id) == 0 && hosts.count(m.h.toString()) == 0) ) {
+ log() << "replSet " << o.toString() << rsLog;
+ uassert(13108, "bad replset config -- duplicate hosts in the config object?", false);
+ }
+ hosts.insert(m.h.dynString());
+ ords.insert(m._id);
+ this->members.push_back(m);
+ }
+ uassert(13393, "can't use localhost in repl set member names except when using it for all members", localhosts == 0 || localhosts == members.size());
+ uassert(13117, "bad " + rsConfigNs + " config", !_id.empty());
+
+ if( o["settings"].ok() ) {
+ BSONObj settings = o["settings"].Obj();
+ if( settings["getLastErrorModes"].ok() ) {
+ parseRules(settings["getLastErrorModes"].Obj());
+ }
+ ho.check();
+ try { getLastErrorDefaults = settings["getLastErrorDefaults"].Obj().copy(); }
+ catch(...) { }
+ }
+
+ // figure out the majority for this config
+ setMajority();
+ }
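+
+ /* Example (illustrative) of a config object that from() accepts:
+ *
+ * { _id : "rs0", version : 1,
+ * members : [ { _id : 0, host : "alice:27017" },
+ * { _id : 1, host : "bob:27017", priority : 0, hidden : true },
+ * { _id : 2, host : "carol:27017", arbiterOnly : true } ] }
+ */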
+
+ static inline void configAssert(bool expr) {
+ uassert(13122, "bad repl set config?", expr);
+ }
+
+ ReplSetConfig::ReplSetConfig(BSONObj cfg, bool force) {
+ _constructed = false;
+ clear();
+ from(cfg);
+ if( force ) {
+ version += rand() % 100000 + 10000;
+ }
+ configAssert( version < 0 /*unspecified*/ || (version >= 1) );
+ if( version < 1 )
+ version = 1;
+ _ok = true;
+ _constructed = true;
+ }
+
+ ReplSetConfig::ReplSetConfig(const HostAndPort& h) {
+ LOG(2) << "ReplSetConfig load " << h.toStringLong() << rsLog;
+
+ _constructed = false;
+ clear();
+ int level = 2;
+ DEV level = 0;
+
+ BSONObj cfg;
+ int v = -5;
+ try {
+ if( h.isSelf() ) {
+ ;
+ }
+ else {
+ /* first, make sure other node is configured to be a replset. just to be safe. */
+ string setname = cmdLine.ourSetName();
+ BSONObj cmd = BSON( "replSetHeartbeat" << setname );
+ int theirVersion;
+ BSONObj info;
+ log() << "trying to contact " << h.toString() << rsLog;
+ bool ok = requestHeartbeat(setname, "", h.toString(), info, -2, theirVersion);
+ if( info["rs"].trueValue() ) {
+ // yes, it is a replicate set, although perhaps not yet initialized
+ }
+ else {
+ if( !ok ) {
+ log() << "replSet TEMP !ok heartbeating " << h.toString() << " on cfg load" << rsLog;
+ if( !info.isEmpty() )
+ log() << "replSet info " << h.toString() << " : " << info.toString() << rsLog;
+ return;
+ }
+ {
+ stringstream ss;
+ ss << "replSet error: member " << h.toString() << " is not in --replSet mode";
+ msgassertedNoTrace(13260, ss.str().c_str()); // not caught as not a user exception - we want it not caught
+ //for python err# checker: uassert(13260, "", false);
+ }
+ }
+ }
+
+ v = -4;
+ unsigned long long count = 0;
+ try {
+ ScopedConn conn(h.toString());
+ v = -3;
+ cfg = conn.findOne(rsConfigNs, Query()).getOwned();
+ count = conn.count(rsConfigNs);
+ }
+ catch ( DBException& ) {
+ if ( !h.isSelf() ) {
+ throw;
+ }
+
+ // on startup, socket is not listening yet
+ DBDirectClient cli;
+ cfg = cli.findOne( rsConfigNs, Query() ).getOwned();
+ count = cli.count(rsConfigNs);
+ }
+
+ if( count > 1 )
+ uasserted(13109, str::stream() << "multiple rows in " << rsConfigNs << " not supported host: " << h.toString());
+
+ if( cfg.isEmpty() ) {
+ version = EMPTYCONFIG;
+ return;
+ }
+ version = -1;
+ }
+ catch( DBException& e) {
+ version = v;
+ log(level) << "replSet load config couldn't get from " << h.toString() << ' ' << e.what() << rsLog;
+ return;
+ }
+
+ from(cfg);
+ checkRsConfig();
+ _ok = true;
+ log(level) << "replSet load config ok from " << (h.isSelf() ? "self" : h.toString()) << rsLog;
+ _constructed = true;
+ }
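+
+ /* Note on the sentinel values `version` passes through above (as read from
+ * the code): clear() leaves -5; -4 marks "about to connect" and -3
+ * "connected, querying"; EMPTYCONFIG (-2) means the host answered but has
+ * no config document; -1 means a config was fetched and is about to be
+ * parsed, at which point from() installs the real version. */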
+
+}
diff --git a/src/mongo/db/repl/rs_config.h b/src/mongo/db/repl/rs_config.h
new file mode 100644
index 00000000000..cfe2e86a568
--- /dev/null
+++ b/src/mongo/db/repl/rs_config.h
@@ -0,0 +1,251 @@
+// rs_config.h
+// repl set configuration
+//
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../../util/net/hostandport.h"
+#include "../../util/concurrency/race.h"
+#include "health.h"
+
+namespace mongo {
+ class Member;
+ const string rsConfigNs = "local.system.replset";
+
+ class ReplSetConfig {
+ enum { EMPTYCONFIG = -2 };
+ struct TagSubgroup;
+ public:
+ /**
+ * This contacts the given host and tries to get a config from them.
+ *
+ * This sends a test heartbeat to the host and, if all goes well and the
+ * host has a more recent config, fetches the config and loads it (see
+ * from()).
+ *
+ * If it's contacting itself, it skips the heartbeat (for obvious
+ * reasons.) If something is misconfigured, throws an exception. If the
+ * host couldn't be queried or is just blank, ok() will be false.
+ */
+ ReplSetConfig(const HostAndPort& h);
+
+ ReplSetConfig(BSONObj cfg, bool force=false);
+
+ bool ok() const { return _ok; }
+
+ struct TagRule;
+
+ struct MemberCfg {
+ MemberCfg() : _id(-1), votes(1), priority(1.0), arbiterOnly(false), slaveDelay(0), hidden(false), buildIndexes(true) { }
+ int _id; /* ordinal */
+ unsigned votes; /* how many votes this node gets. default 1. */
+ HostAndPort h;
+ double priority; /* 0 means can never be primary */
+ bool arbiterOnly;
+ int slaveDelay; /* seconds. int rather than unsigned for convenient to/from bson conversion. */
+ bool hidden; /* if set, don't advertise to drivers in isMaster. for non-primaries (priority 0) */
+ bool buildIndexes; /* if false, do not create any non-_id indexes */
+ map<string,string> tags; /* tagging for data center, rack, etc. */
+ private:
+ set<TagSubgroup*> _groups; // the subgroups this member belongs to
+ public:
+ const set<TagSubgroup*>& groups() const {
+ return _groups;
+ }
+ set<TagSubgroup*>& groupsw() {
+ return _groups;
+ }
+ void check() const; /* check validity, assert if not. */
+ BSONObj asBson() const;
+ bool potentiallyHot() const { return !arbiterOnly && priority > 0; }
+ void updateGroups(const OpTime& last) {
+ RACECHECK
+ for (set<TagSubgroup*>::const_iterator it = groups().begin(); it != groups().end(); it++) {
+ ((TagSubgroup*)(*it))->updateLast(last);
+ }
+ }
+ bool operator==(const MemberCfg& r) const {
+ if (!tags.empty() || !r.tags.empty()) {
+ if (tags.size() != r.tags.size()) {
+ return false;
+ }
+
+ // if they are the same size and not equal, at least one
+ // element in A must be different in B
+ for (map<string,string>::const_iterator lit = tags.begin(); lit != tags.end(); lit++) {
+ map<string,string>::const_iterator rit = r.tags.find((*lit).first);
+
+ if (rit == r.tags.end() || (*lit).second != (*rit).second) {
+ return false;
+ }
+ }
+ }
+
+ return _id==r._id && votes == r.votes && h == r.h && priority == r.priority &&
+ arbiterOnly == r.arbiterOnly && slaveDelay == r.slaveDelay && hidden == r.hidden &&
+ buildIndexes == r.buildIndexes;
+ }
+ bool operator!=(const MemberCfg& r) const { return !(*this == r); }
+ };
+
+ vector<MemberCfg> members;
+ string _id;
+ int version;
+ HealthOptions ho;
+ string md5;
+ BSONObj getLastErrorDefaults;
+ map<string,TagRule*> rules;
+
+ list<HostAndPort> otherMemberHostnames() const; // except self
+
+ /** @return true if we could connect, and there is no cfg object there at all */
+ bool empty() const { return version == EMPTYCONFIG; }
+
+ string toString() const { return asBson().toString(); }
+
+ /** validate the settings. does not call check() on each member, you have to do that separately. */
+ void checkRsConfig() const;
+
+ /** check if modification makes sense */
+ static bool legalChange(const ReplSetConfig& old, const ReplSetConfig& n, string& errmsg);
+
+ //static void receivedNewConfig(BSONObj);
+ void saveConfigLocally(BSONObj comment); // to local db
+ string saveConfigEverywhere(); // returns textual info on what happened
+
+ /**
+ * Update members' groups when the config changes but members stay the same.
+ */
+ void updateMembers(List1<Member> &dest);
+
+ BSONObj asBson() const;
+
+ /**
+ * Getter and setter for _majority. This is almost always
+ * members.size()/2+1, but can be the number of non-arbiter members if
+ * there are more arbiters than non-arbiters (writing to 3 out of 7
+ * servers is safe if 4 of the servers are arbiters).
+ */
+ void setMajority();
+ int getMajority() const;
+
+ bool _constructed;
+ private:
+ bool _ok;
+ int _majority;
+
+ void from(BSONObj);
+ void clear();
+
+ struct TagClause;
+
+ /**
+ * This is a logical grouping of servers. It is pointed to by a set of
+ * servers with a certain tag.
+ *
+ * For example, suppose servers A, B, and C have the tag "dc" : "nyc". If we
+ * have a rule {"dc" : 2}, then we want A _or_ B _or_ C to have the
+ * write for one of the "dc" criteria to be fulfilled, so all three will
+ * point to this subgroup. When one of their oplog-tailing cursors is
+ * updated, this subgroup is updated.
+ */
+ struct TagSubgroup : boost::noncopyable {
+ ~TagSubgroup(); // never called; not defined
+ TagSubgroup(string nm) : name(nm) { }
+ const string name;
+ OpTime last;
+ vector<TagClause*> clauses;
+
+ // this probably won't actually point to valid members after the
+ // subgroup is created, as initFromConfig() makes a copy of the
+ // config
+ set<MemberCfg*> m;
+
+ void updateLast(const OpTime& op);
+
+ //string toString() const;
+
+ /**
+ * If two tags have the same name, they should compare as equal so
+ * that members don't have to update two identical groups on writes.
+ */
+ bool operator() (TagSubgroup& lhs, TagSubgroup& rhs) const {
+ return lhs.name < rhs.name;
+ }
+ };
+
+ /**
+ * An argument in a rule. For example, if we had the rule {dc : 2,
+ * machines : 3}, "dc" : 2 and "machines" : 3 would be two TagClauses.
+ *
+ * Each tag clause has a set of associated subgroups. For example, if
+ * we had "dc" : 2, our subgroups might be "nyc", "sf", and "hk".
+ */
+ struct TagClause {
+ OpTime last;
+ map<string,TagSubgroup*> subgroups;
+ TagRule *rule;
+ string name;
+ /**
+ * If we get a clause like {machines : 3} and this server is
+ * tagged with "machines", then it's really {machines : 2}, as we
+ * will always be up-to-date. So, target would be 3 and
+ * actualTarget would be 2, in that example.
+ */
+ int target;
+ int actualTarget;
+
+ void updateLast(const OpTime& op);
+ string toString() const;
+ };
+
+ /**
+ * Parses getLastErrorModes.
+ */
+ void parseRules(const BSONObj& modes);
+
+ /**
+ * Create a hash containing every possible clause that could be used in a
+ * rule and the servers related to that clause.
+ *
+ * For example, suppose we have the following servers:
+ * A {"dc" : "ny", "ny" : "rk1"}
+ * B {"dc" : "ny", "ny" : "rk1"}
+ * C {"dc" : "ny", "ny" : "rk2"}
+ * D {"dc" : "sf", "sf" : "rk1"}
+ * E {"dc" : "sf", "sf" : "rk2"}
+ *
+ * This would give us the possible criteria:
+ * "dc" -> {A, B, C},{D, E}
+ * "ny" -> {A, B},{C}
+ * "sf" -> {D},{E}
+ */
+ void _populateTagMap(map<string,TagClause> &tagMap);
+
+ public:
+ struct TagRule {
+ vector<TagClause*> clauses;
+ OpTime last;
+
+ void updateLast(const OpTime& op);
+ string toString() const;
+ };
+ };
+
+}
diff --git a/src/mongo/db/repl/rs_exception.h b/src/mongo/db/repl/rs_exception.h
new file mode 100644
index 00000000000..fc372fc241c
--- /dev/null
+++ b/src/mongo/db/repl/rs_exception.h
@@ -0,0 +1,17 @@
+// @file rs_exception.h
+
+#pragma once
+
+namespace mongo {
+
+ class VoteException : public std::exception {
+ public:
+ const char * what() const throw () { return "VoteException"; }
+ };
+
+ class RetryAfterSleepException : public std::exception {
+ public:
+ const char * what() const throw () { return "RetryAfterSleepException"; }
+ };
+
+}
diff --git a/src/mongo/db/repl/rs_initialsync.cpp b/src/mongo/db/repl/rs_initialsync.cpp
new file mode 100644
index 00000000000..b67c0d71b83
--- /dev/null
+++ b/src/mongo/db/repl/rs_initialsync.cpp
@@ -0,0 +1,271 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../repl.h"
+#include "../client.h"
+#include "../../client/dbclient.h"
+#include "rs.h"
+#include "../oplogreader.h"
+#include "../../util/mongoutils/str.h"
+#include "../dbhelpers.h"
+#include "rs_optime.h"
+#include "../oplog.h"
+
+namespace mongo {
+
+ using namespace mongoutils;
+ using namespace bson;
+
+ void dropAllDatabasesExceptLocal();
+
+ // add try/catch with sleep
+
+ void isyncassert(const string& msg, bool expr) {
+ if( !expr ) {
+ string m = str::stream() << "initial sync " << msg;
+ theReplSet->sethbmsg(m, 0);
+ uasserted(13404, m);
+ }
+ }
+
+ void ReplSetImpl::syncDoInitialSync() {
+ createOplog();
+
+ while( 1 ) {
+ try {
+ _syncDoInitialSync();
+ break;
+ }
+ catch(DBException& e) {
+ sethbmsg("initial sync exception " + e.toString(), 0);
+ sleepsecs(30);
+ }
+ }
+ }
+
+ /* todo : progress metering to sethbmsg. */
+ static bool clone(const char *master, string db) {
+ string err;
+ return cloneFrom(master, err, db, false,
+ /* slave_ok */ true, true, false, /*mayYield*/true, /*mayBeInterrupted*/false);
+ }
+
+ void _logOpObjRS(const BSONObj& op);
+
+ static void emptyOplog() {
+ writelock lk(rsoplog);
+ Client::Context ctx(rsoplog);
+ NamespaceDetails *d = nsdetails(rsoplog);
+
+ // temp
+ if( d && d->stats.nrecords == 0 )
+ return; // already empty, ok.
+
+ LOG(1) << "replSet empty oplog" << rsLog;
+ d->emptyCappedCollection(rsoplog);
+ }
+
+ Member* ReplSetImpl::getMemberToSyncTo() {
+ Member *closest = 0;
+ time_t now = 0;
+ bool buildIndexes = true;
+
+ // wait for 2N pings before choosing a sync target
+ if (_cfg) {
+ int needMorePings = config().members.size()*2 - HeartbeatInfo::numPings;
+
+ if (needMorePings > 0) {
+ OCCASIONALLY log() << "waiting for " << needMorePings << " pings from other members before syncing" << endl;
+ return NULL;
+ }
+
+ buildIndexes = myConfig().buildIndexes;
+ }
+
+ // find the member with the lowest ping time that has more data than me
+ for (Member *m = _members.head(); m; m = m->next()) {
+ if (m->hbinfo().up() &&
+ // make sure members with buildIndexes sync from other members w/indexes
+ (!buildIndexes || (buildIndexes && m->config().buildIndexes)) &&
+ (m->state() == MemberState::RS_PRIMARY ||
+ (m->state() == MemberState::RS_SECONDARY && m->hbinfo().opTime > lastOpTimeWritten)) &&
+ (!closest || m->hbinfo().ping < closest->hbinfo().ping)) {
+
+ map<string,time_t>::iterator vetoed = _veto.find(m->fullName());
+ if (vetoed == _veto.end()) {
+ closest = m;
+ break;
+ }
+
+ if (now == 0) {
+ now = time(0);
+ }
+
+ // this member was vetoed; check whether the veto has since expired
+ if ((*vetoed).second < now) {
+ _veto.erase(vetoed);
+ closest = m;
+ break;
+ }
+
+ // if it was recently vetoed, skip
+ log() << "replSet not trying to sync from " << (*vetoed).first
+ << ", it is vetoed for " << ((*vetoed).second - now) << " more seconds" << rsLog;
+ }
+ }
+
+ {
+ lock lk(this);
+
+ if (!closest) {
+ _currentSyncTarget = NULL;
+ return NULL;
+ }
+
+ _currentSyncTarget = closest;
+ }
+
+ sethbmsg( str::stream() << "syncing to: " << closest->fullName(), 0);
+
+ return closest;
+ }
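+
+ /* Net effect of the selection above (a summary, not new behavior): the
+ * first up member that is primary, or a secondary ahead of our
+ * lastOpTimeWritten, that is not currently vetoed, and that satisfies the
+ * buildIndexes constraint (index-building members only sync from members
+ * that build indexes) becomes _currentSyncTarget; members whose veto has
+ * expired are removed from _veto and become eligible again. */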
+
+ void ReplSetImpl::veto(const string& host, const unsigned secs) {
+ _veto[host] = time(0)+secs;
+ }
+
+ /**
+ * Do the initial sync for this member.
+ */
+ void ReplSetImpl::_syncDoInitialSync() {
+ sethbmsg("initial sync pending",0);
+
+ // if this is the first node, it may have already become primary
+ if ( box.getState().primary() ) {
+ sethbmsg("I'm already primary, no need for initial sync",0);
+ return;
+ }
+
+ const Member *source = getMemberToSyncTo();
+ if (!source) {
+ sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
+ sleepsecs(15);
+ return;
+ }
+
+ string sourceHostname = source->h().toString();
+ OplogReader r;
+ if( !r.connect(sourceHostname) ) {
+ sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
+ sleepsecs(15);
+ return;
+ }
+
+ BSONObj lastOp = r.getLastOp(rsoplog);
+ if( lastOp.isEmpty() ) {
+ sethbmsg("initial sync couldn't read remote oplog", 0);
+ sleepsecs(15);
+ return;
+ }
+ OpTime startingTS = lastOp["ts"]._opTime();
+
+ if (replSettings.fastsync) {
+ log() << "fastsync: skipping database clone" << rsLog;
+ }
+ else {
+ sethbmsg("initial sync drop all databases", 0);
+ dropAllDatabasesExceptLocal();
+
+ sethbmsg("initial sync clone all databases", 0);
+
+ list<string> dbs = r.conn()->getDatabaseNames();
+ for( list<string>::iterator i = dbs.begin(); i != dbs.end(); i++ ) {
+ string db = *i;
+ if( db != "local" ) {
+ sethbmsg( str::stream() << "initial sync cloning db: " << db , 0);
+ bool ok;
+ {
+ writelock lk(db);
+ Client::Context ctx(db);
+ ok = clone(sourceHostname.c_str(), db);
+ }
+ if( !ok ) {
+ sethbmsg( str::stream() << "initial sync error clone of " << db << " failed sleeping 5 minutes" ,0);
+ veto(source->fullName(), 600);
+ sleepsecs(300);
+ return;
+ }
+ }
+ }
+ }
+
+ sethbmsg("initial sync query minValid",0);
+
+ /* our cloned copy will be inconsistent until we apply the oplog events that
+ occurred while cloning; we note that time point here. */
+ BSONObj minValid = r.getLastOp(rsoplog);
+ isyncassert( "getLastOp is empty ", !minValid.isEmpty() );
+ OpTime mvoptime = minValid["ts"]._opTime();
+ assert( !mvoptime.isNull() );
+ assert( mvoptime >= startingTS );
+
+ // apply startingTS..mvoptime portion of the oplog
+ {
+ // note we assume here that this call does not throw
+ if( ! initialSyncOplogApplication(startingTS, mvoptime) ) {
+ log() << "replSet initial sync failed during oplog application phase" << rsLog;
+
+ emptyOplog(); // otherwise we'll be up!
+
+ lastOpTimeWritten = OpTime();
+ lastH = 0;
+
+ log() << "replSet cleaning up [1]" << rsLog;
+ {
+ writelock lk("local.");
+ Client::Context cx( "local." );
+ cx.db()->flushFiles(true);
+ }
+ log() << "replSet cleaning up [2]" << rsLog;
+
+ log() << "replSet initial sync failed will try again" << endl;
+
+ sleepsecs(5);
+ return;
+ }
+ }
+
+ sethbmsg("initial sync finishing up",0);
+
+ assert( !box.getState().primary() ); // wouldn't make sense if we were.
+
+ {
+ writelock lk("local.");
+ Client::Context cx( "local." );
+ cx.db()->flushFiles(true);
+ try {
+ log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
+ }
+ catch(...) { }
+ Helpers::putSingleton("local.replset.minvalid", minValid);
+ cx.db()->flushFiles(true);
+ }
+
+ sethbmsg("initial sync done",0);
+ }
+
+}
diff --git a/src/mongo/db/repl/rs_initiate.cpp b/src/mongo/db/repl/rs_initiate.cpp
new file mode 100644
index 00000000000..77bc6c03938
--- /dev/null
+++ b/src/mongo/db/repl/rs_initiate.cpp
@@ -0,0 +1,269 @@
+/* @file rs_initiate.cpp
+ */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../cmdline.h"
+#include "../commands.h"
+#include "../../util/mmap.h"
+#include "../../util/mongoutils/str.h"
+#include "health.h"
+#include "rs.h"
+#include "rs_config.h"
+#include "../dbhelpers.h"
+#include "../oplog.h"
+
+using namespace bson;
+using namespace mongoutils;
+
+namespace mongo {
+
+ /* called on a reconfig AND on initiate
+ throws
+ @param initial true when initiating
+ */
+ void checkMembersUpForConfigChange(const ReplSetConfig& cfg, BSONObjBuilder& result, bool initial) {
+ int failures = 0, allVotes = 0, allowableFailures = 0;
+ int me = 0;
+ stringstream selfs;
+ for( vector<ReplSetConfig::MemberCfg>::const_iterator i = cfg.members.begin(); i != cfg.members.end(); i++ ) {
+ if( i->h.isSelf() ) {
+ me++;
+ if( me > 1 )
+ selfs << ',';
+ selfs << i->h.toString();
+ if( !i->potentiallyHot() ) {
+ uasserted(13420, "initiation and reconfiguration of a replica set must be sent to a node that can become primary");
+ }
+ }
+ allVotes += i->votes;
+ }
+ allowableFailures = allVotes - (allVotes/2 + 1);
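+        // e.g. five members with one vote each: allVotes=5, a majority is 3,
+        // so allowableFailures=2 -- a reconfig can proceed with up to two
+        // votes' worth of members down (an initiate requires all members up).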
+
+ uassert(13278, "bad config: isSelf is true for multiple hosts: " + selfs.str(), me <= 1); // dups?
+ if( me != 1 ) {
+ stringstream ss;
+ ss << "can't find self in the replset config";
+ if( !cmdLine.isDefaultPort() ) ss << " my port: " << cmdLine.port;
+ if( me != 0 ) ss << " found: " << me;
+ uasserted(13279, ss.str());
+ }
+
+ vector<string> down;
+ for( vector<ReplSetConfig::MemberCfg>::const_iterator i = cfg.members.begin(); i != cfg.members.end(); i++ ) {
+ // we know we're up
+ if (i->h.isSelf()) {
+ continue;
+ }
+
+ BSONObj res;
+ {
+ bool ok = false;
+ try {
+ int theirVersion = -1000;
+ ok = requestHeartbeat(cfg._id, "", i->h.toString(), res, -1, theirVersion, initial/*check if empty*/);
+ if( theirVersion >= cfg.version ) {
+ stringstream ss;
+ ss << "replSet member " << i->h.toString() << " has too new a config version (" << theirVersion << ") to reconfigure";
+ uasserted(13259, ss.str());
+ }
+ }
+ catch(DBException& e) {
+ log() << "replSet cmufcc requestHeartbeat " << i->h.toString() << " : " << e.toString() << rsLog;
+ }
+ catch(...) {
+ log() << "replSet cmufcc error exception in requestHeartbeat?" << rsLog;
+ }
+ if( res.getBoolField("mismatch") )
+ uasserted(13145, "set name does not match the set name host " + i->h.toString() + " expects");
+ if( *res.getStringField("set") ) {
+ if( cfg.version <= 1 ) {
+                    // this is supposed to be an initiation; no one should be initiated already.
+ uasserted(13256, "member " + i->h.toString() + " is already initiated");
+ }
+ else {
+ // Assure no one has a newer config.
+ if( res["v"].Int() >= cfg.version ) {
+                        uasserted(13341, "member " + i->h.toString() + " has a config version >= the new cfg version; cannot change config");
+ }
+ }
+ }
+ if( !ok && !res["rs"].trueValue() ) {
+ down.push_back(i->h.toString());
+
+ if( !res.isEmpty() ) {
+ /* strange. got a response, but not "ok". log it. */
+ log() << "replSet warning " << i->h.toString() << " replied: " << res.toString() << rsLog;
+ }
+
+ bool allowFailure = false;
+ failures += i->votes;
+ if( !initial && failures <= allowableFailures ) {
+ const Member* m = theReplSet->findById( i->_id );
+ if( m ) {
+ assert( m->h().toString() == i->h.toString() );
+ }
+ // it's okay if the down member isn't part of the config,
+ // we might be adding a new member that isn't up yet
+ allowFailure = true;
+ }
+
+ if( !allowFailure ) {
+ string msg = string("need all members up to initiate, not ok : ") + i->h.toStringLong();
+ if( !initial )
+ msg = string("need most members up to reconfigure, not ok : ") + i->h.toString();
+ uasserted(13144, msg);
+ }
+ }
+ }
+ if( initial ) {
+ bool hasData = res["hasData"].Bool();
+ uassert(13311, "member " + i->h.toString() + " has data already, cannot initiate set. All members except initiator must be empty.",
+ !hasData || i->h.isSelf());
+ }
+ }
+ if (down.size() > 0) {
+ result.append("down", down);
+ }
+ }
+
+ class CmdReplSetInitiate : public ReplSetCommand {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ CmdReplSetInitiate() : ReplSetCommand("replSetInitiate") { }
+ virtual void help(stringstream& h) const {
+ h << "Initiate/christen a replica set.";
+ h << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands";
+ }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ log() << "replSet replSetInitiate admin command received from client" << rsLog;
+
+ if( !replSet ) {
+ errmsg = "server is not running with --replSet";
+ return false;
+ }
+ if( theReplSet ) {
+ errmsg = "already initialized";
+ result.append("info", "try querying " + rsConfigNs + " to see current configuration");
+ return false;
+ }
+
+ {
+ // just make sure we can get a write lock before doing anything else. we'll reacquire one
+ // later. of course it could be stuck then, but this check lowers the risk if weird things
+ // are up.
+ time_t t = time(0);
+ writelock lk("");
+ if( time(0)-t > 10 ) {
+ errmsg = "took a long time to get write lock, so not initiating. Initiate when server less busy?";
+ return false;
+ }
+
+ /* check that we don't already have an oplog. that could cause issues.
+ it is ok if the initiating member has *other* data than that.
+ */
+ BSONObj o;
+ if( Helpers::getFirst(rsoplog, o) ) {
+ errmsg = rsoplog + string(" is not empty on the initiating member. cannot initiate.");
+ return false;
+ }
+ }
+
+ if( ReplSet::startupStatus == ReplSet::BADCONFIG ) {
+ errmsg = "server already in BADCONFIG state (check logs); not initiating";
+ result.append("info", ReplSet::startupStatusMsg.get());
+ return false;
+ }
+ if( ReplSet::startupStatus != ReplSet::EMPTYCONFIG ) {
+ result.append("startupStatus", ReplSet::startupStatus);
+ errmsg = "all members and seeds must be reachable to initiate set";
+ result.append("info", cmdLine._replSet);
+ return false;
+ }
+
+ BSONObj configObj;
+
+ if( cmdObj["replSetInitiate"].type() != Object ) {
+ result.append("info2", "no configuration explicitly specified -- making one");
+ log() << "replSet info initiate : no configuration specified. Using a default configuration for the set" << rsLog;
+
+ string name;
+ vector<HostAndPort> seeds;
+ set<HostAndPort> seedSet;
+ parseReplsetCmdLine(cmdLine._replSet, name, seeds, seedSet); // may throw...
+
+ bob b;
+ b.append("_id", name);
+ bob members;
+ members.append("0", BSON( "_id" << 0 << "host" << HostAndPort::Me().dynString() ));
+ result.append("me", HostAndPort::Me().toString());
+ for( unsigned i = 0; i < seeds.size(); i++ )
+ members.append(bob::numStr(i+1), BSON( "_id" << i+1 << "host" << seeds[i].toString()));
+ b.appendArray("members", members.obj());
+ configObj = b.obj();
+ log() << "replSet created this configuration for initiation : " << configObj.toString() << rsLog;
+ }
+ else {
+ configObj = cmdObj["replSetInitiate"].Obj();
+ }
+
+ bool parsed = false;
+ try {
+ ReplSetConfig newConfig(configObj);
+ parsed = true;
+
+ if( newConfig.version > 1 ) {
+ errmsg = "can't initiate with a version number greater than 1";
+ return false;
+ }
+
+ log() << "replSet replSetInitiate config object parses ok, " << newConfig.members.size() << " members specified" << rsLog;
+
+ checkMembersUpForConfigChange(newConfig, result, true);
+
+ log() << "replSet replSetInitiate all members seem up" << rsLog;
+
+ createOplog();
+
+ writelock lk("");
+ bo comment = BSON( "msg" << "initiating set");
+ newConfig.saveConfigLocally(comment);
+ log() << "replSet replSetInitiate config now saved locally. Should come online in about a minute." << rsLog;
+ result.append("info", "Config now saved locally. Should come online in about a minute.");
+ ReplSet::startupStatus = ReplSet::SOON;
+ ReplSet::startupStatusMsg.set("Received replSetInitiate - should come online shortly.");
+ }
+ catch( DBException& e ) {
+ log() << "replSet replSetInitiate exception: " << e.what() << rsLog;
+ if( !parsed )
+ errmsg = string("couldn't parse cfg object ") + e.what();
+ else
+ errmsg = string("couldn't initiate : ") + e.what();
+ return false;
+ }
+ catch( string& e2 ) {
+ log() << e2 << rsLog;
+ errmsg = e2;
+ return false;
+ }
+
+ return true;
+ }
+ } cmdReplSetInitiate;
+
+}
diff --git a/src/mongo/db/repl/rs_member.h b/src/mongo/db/repl/rs_member.h
new file mode 100644
index 00000000000..24e593392b6
--- /dev/null
+++ b/src/mongo/db/repl/rs_member.h
@@ -0,0 +1,131 @@
+// @file rsmember.h
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** replica set member */
+
+#pragma once
+
+#include "../../util/concurrency/value.h"
+
+namespace mongo {
+
+
+ /*
+       RS_STARTUP     server still starting up, or still trying to initiate the set
+ RS_PRIMARY this server thinks it is primary
+ RS_SECONDARY this server thinks it is a secondary (slave mode)
+ RS_RECOVERING recovering/resyncing; after recovery usually auto-transitions to secondary
+       RS_FATAL       something bad has occurred; the server is in a fatal error state with regard to the replica set (though not completely offline)
+ RS_STARTUP2 loaded config, still determining who is primary
+ */
+ struct MemberState {
+ enum MS {
+ RS_STARTUP = 0,
+ RS_PRIMARY = 1,
+ RS_SECONDARY = 2,
+ RS_RECOVERING = 3,
+ RS_FATAL = 4,
+ RS_STARTUP2 = 5,
+ RS_UNKNOWN = 6, /* remote node not yet reached */
+ RS_ARBITER = 7,
+ RS_DOWN = 8, /* node not reachable for a report */
+ RS_ROLLBACK = 9
+ } s;
+
+ MemberState(MS ms = RS_UNKNOWN) : s(ms) { }
+ explicit MemberState(int ms) : s((MS) ms) { }
+
+ bool startup() const { return s == RS_STARTUP; }
+ bool primary() const { return s == RS_PRIMARY; }
+ bool secondary() const { return s == RS_SECONDARY; }
+ bool recovering() const { return s == RS_RECOVERING; }
+ bool startup2() const { return s == RS_STARTUP2; }
+ bool fatal() const { return s == RS_FATAL; }
+ bool rollback() const { return s == RS_ROLLBACK; }
+ bool readable() const { return s == RS_PRIMARY || s == RS_SECONDARY; }
+
+ string toString() const;
+
+ bool operator==(const MemberState& r) const { return s == r.s; }
+ bool operator!=(const MemberState& r) const { return s != r.s; }
+ };
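+    /* usage sketch:
+         MemberState s(MemberState::RS_SECONDARY);
+         s.secondary();   // true
+         s.readable();    // true -- only PRIMARY and SECONDARY serve reads
+         s.toString();    // "SECONDARY"
+    */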
+
+    /* this is supposed to be just basic information on a member,
+       and copy-constructible. */
+ class HeartbeatInfo {
+ unsigned _id;
+ public:
+ HeartbeatInfo() : _id(0xffffffff), hbstate(MemberState::RS_UNKNOWN), health(-1.0),
+ downSince(0), skew(INT_MIN), authIssue(false), ping(0) { }
+ HeartbeatInfo(unsigned id);
+ unsigned id() const { return _id; }
+ MemberState hbstate;
+ double health;
+ time_t upSince;
+ long long downSince;
+ time_t lastHeartbeat;
+ DiagStr lastHeartbeatMsg;
+ OpTime opTime;
+ int skew;
+ bool authIssue;
+ unsigned int ping; // milliseconds
+ static unsigned int numPings;
+
+ bool up() const { return health > 0; }
+
+ /** health is set to -1 on startup. that means we haven't even checked yet. 0 means we checked and it failed. */
+ bool maybeUp() const { return health != 0; }
+
+ long long timeDown() const; // ms
+
+ /* true if changed in a way of interest to the repl set manager. */
+ bool changed(const HeartbeatInfo& old) const;
+ };
+
+ inline HeartbeatInfo::HeartbeatInfo(unsigned id) :
+ _id(id),
+ authIssue(false),
+ ping(0) {
+ hbstate = MemberState::RS_UNKNOWN;
+ health = -1.0;
+ downSince = 0;
+ lastHeartbeat = upSince = 0;
+ skew = INT_MIN;
+ }
+
+ inline bool HeartbeatInfo::changed(const HeartbeatInfo& old) const {
+ return health != old.health ||
+ hbstate != old.hbstate;
+ }
+
+ inline string MemberState::toString() const {
+ switch ( s ) {
+ case RS_STARTUP: return "STARTUP";
+ case RS_PRIMARY: return "PRIMARY";
+ case RS_SECONDARY: return "SECONDARY";
+ case RS_RECOVERING: return "RECOVERING";
+ case RS_FATAL: return "FATAL";
+ case RS_STARTUP2: return "STARTUP2";
+ case RS_ARBITER: return "ARBITER";
+ case RS_DOWN: return "DOWN";
+ case RS_ROLLBACK: return "ROLLBACK";
+ case RS_UNKNOWN: return "UNKNOWN";
+ }
+ return "";
+ }
+
+}
diff --git a/src/mongo/db/repl/rs_optime.h b/src/mongo/db/repl/rs_optime.h
new file mode 100644
index 00000000000..f0ca56927ad
--- /dev/null
+++ b/src/mongo/db/repl/rs_optime.h
@@ -0,0 +1,58 @@
+// @file rs_optime.h
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../../util/optime.h"
+
+namespace mongo {
+
+ const char rsoplog[] = "local.oplog.rs";
+
+ /*
+ class RSOpTime : public OpTime {
+ public:
+ bool initiated() const { return getSecs() != 0; }
+ };*/
+
+ /*struct RSOpTime {
+ unsigned long long ord;
+
+ RSOpTime() : ord(0) { }
+
+ bool initiated() const { return ord > 0; }
+
+ void initiate() {
+ assert( !initiated() );
+ ord = 1000000;
+ }
+
+ ReplTime inc() {
+ DEV assertInWriteLock();
+ return ++ord;
+ }
+
+ string toString() const { return str::stream() << ord; }
+
+ // query the oplog and set the highest value herein. acquires a db read lock. throws.
+ void load();
+ };
+
+ extern RSOpTime rsOpTime;*/
+
+}
diff --git a/src/mongo/db/repl/rs_rollback.cpp b/src/mongo/db/repl/rs_rollback.cpp
new file mode 100644
index 00000000000..10727c59669
--- /dev/null
+++ b/src/mongo/db/repl/rs_rollback.cpp
@@ -0,0 +1,667 @@
+/* @file rs_rollback.cpp
+*
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../client.h"
+#include "../../client/dbclient.h"
+#include "rs.h"
+#include "../repl.h"
+#include "../ops/query.h"
+#include "../cloner.h"
+#include "../ops/update.h"
+#include "../ops/delete.h"
+
+/* Scenarios
+
+ We went offline with ops not replicated out.
+
+ F = node that failed and coming back.
+ P = node that took over, new primary
+
+ #1:
+ F : a b c d e f g
+ P : a b c d q
+
+ The design is "keep P". One could argue here that "keep F" has some merits, however, in most cases P
+ will have significantly more data. Also note that P may have a proper subset of F's stream if there were
+ no subsequent writes.
+
+ For now the model is simply : get F back in sync with P. If P was really behind or something, we should have
+ just chosen not to fail over anyway.
+
+ #2:
+ F : a b c d e f g -> a b c d
+ P : a b c d
+
+ #3:
+ F : a b c d e f g -> a b c d q r s t u v w x z
+ P : a b c d q r s t u v w x z
+
+ Steps
+ find an event in common. 'd'.
+ undo our events beyond that by:
+ (1) taking copy from other server of those objects
+ (2) do not consider the copy valid until we reach an optime after the point when we fetched the new version of the object
+ -- i.e., reset minvalid.
+ (3) we could skip operations on objects that are previous in time to our capture of the object as an optimization.
+
+*/
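+/* Worked example of the common point search (syncRollbackFindCommonPoint below),
+   using scenario #1 and assuming q was written after g. Both oplogs are walked
+   backwards, comparing (ts, h) pairs:
+
+     q vs g : their optime newer -> step their cursor back (to d)
+     d vs g : our optime newer   -> note g for refetch, step our cursor back
+     d vs f : our optime newer   -> note f, step back
+     d vs e : our optime newer   -> note e, step back
+     d vs d : ts and hash match  -> common point found
+
+   The documents touched by ops e, f, g are then re-pulled from P in syncFixUp.
+*/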
+
+namespace mongo {
+
+ using namespace bson;
+
+ void incRBID();
+
+ class rsfatal : public std::exception {
+ public:
+ virtual const char* what() const throw() { return "replica set fatal exception"; }
+ };
+
+ struct DocID {
+ const char *ns;
+ be _id;
+ bool operator<(const DocID& d) const {
+ int c = strcmp(ns, d.ns);
+ if( c < 0 ) return true;
+ if( c > 0 ) return false;
+ return _id < d._id;
+ }
+ };
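+    // e.g. { ns:"foo.bar", _id:1 } < { ns:"foo.baz", _id:0 } -- the namespace
+    // compares first; _id only breaks ties within a single collection.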
+
+ struct HowToFixUp {
+ /* note this is a set -- if there are many $inc's on a single document we need to rollback, we only
+ need to refetch it once. */
+ set<DocID> toRefetch;
+
+ /* collections to drop */
+ set<string> toDrop;
+
+ set<string> collectionsToResync;
+
+ OpTime commonPoint;
+ DiskLoc commonPointOurDiskloc;
+
+ int rbid; // remote server's current rollback sequence #
+ };
+
+ static void refetch(HowToFixUp& h, const BSONObj& ourObj) {
+ const char *op = ourObj.getStringField("op");
+ if( *op == 'n' )
+ return;
+
+ unsigned long long totSize = 0;
+ totSize += ourObj.objsize();
+ if( totSize > 512 * 1024 * 1024 )
+ throw "rollback too large";
+
+ DocID d;
+ // NOTE The assigned ns value may become invalid if we yield.
+ d.ns = ourObj.getStringField("ns");
+ if( *d.ns == 0 ) {
+ log() << "replSet WARNING ignoring op on rollback no ns TODO : " << ourObj.toString() << rsLog;
+ return;
+ }
+
+ bo o = ourObj.getObjectField(*op=='u' ? "o2" : "o");
+ if( o.isEmpty() ) {
+ log() << "replSet warning ignoring op on rollback : " << ourObj.toString() << rsLog;
+ return;
+ }
+
+ if( *op == 'c' ) {
+ be first = o.firstElement();
+ NamespaceString s(d.ns); // foo.$cmd
+ string cmdname = first.fieldName();
+ Command *cmd = Command::findCommand(cmdname.c_str());
+ if( cmd == 0 ) {
+ log() << "replSet warning rollback no suchcommand " << first.fieldName() << " - different mongod versions perhaps?" << rsLog;
+ return;
+ }
+ else {
+                /* findandmodify - translated?
+ godinsert?,
+ renamecollection a->b. just resync a & b
+ */
+ if( cmdname == "create" ) {
+ /* Create collection operation
+ { ts: ..., h: ..., op: "c", ns: "foo.$cmd", o: { create: "abc", ... } }
+ */
+ string ns = s.db + '.' + o["create"].String(); // -> foo.abc
+ h.toDrop.insert(ns);
+ return;
+ }
+ else if( cmdname == "drop" ) {
+ string ns = s.db + '.' + first.valuestr();
+ h.collectionsToResync.insert(ns);
+ return;
+ }
+ else if( cmdname == "dropIndexes" || cmdname == "deleteIndexes" ) {
+                    /* TODO: this is bad. we simply do a full resync of the collection here, which could be very slow. */
+ log() << "replSet info rollback of dropIndexes is slow in this version of mongod" << rsLog;
+ string ns = s.db + '.' + first.valuestr();
+ h.collectionsToResync.insert(ns);
+ return;
+ }
+ else if( cmdname == "renameCollection" ) {
+ /* TODO: slow. */
+ log() << "replSet info rollback of renameCollection is slow in this version of mongod" << rsLog;
+ string from = first.valuestr();
+ string to = o["to"].String();
+ h.collectionsToResync.insert(from);
+ h.collectionsToResync.insert(to);
+ return;
+ }
+ else if( cmdname == "reIndex" ) {
+ return;
+ }
+ else if( cmdname == "dropDatabase" ) {
+ log() << "replSet error rollback : can't rollback drop database full resync will be required" << rsLog;
+ log() << "replSet " << o.toString() << rsLog;
+ throw rsfatal();
+ }
+ else {
+ log() << "replSet error can't rollback this command yet: " << o.toString() << rsLog;
+ log() << "replSet cmdname=" << cmdname << rsLog;
+ throw rsfatal();
+ }
+ }
+ }
+
+ d._id = o["_id"];
+ if( d._id.eoo() ) {
+ log() << "replSet WARNING ignoring op on rollback no _id TODO : " << d.ns << ' '<< ourObj.toString() << rsLog;
+ return;
+ }
+
+ h.toRefetch.insert(d);
+ }
+
+ int getRBID(DBClientConnection*);
+
+ static void syncRollbackFindCommonPoint(DBClientConnection *them, HowToFixUp& h) {
+ static time_t last;
+ if( time(0)-last < 60 ) {
+ throw "findcommonpoint waiting a while before trying again";
+ }
+ last = time(0);
+
+ assert( d.dbMutex.atLeastReadLocked() );
+ Client::Context c(rsoplog);
+ NamespaceDetails *nsd = nsdetails(rsoplog);
+ assert(nsd);
+ ReverseCappedCursor u(nsd);
+ if( !u.ok() )
+ throw "our oplog empty or unreadable";
+
+ const Query q = Query().sort(reverseNaturalObj);
+ const bo fields = BSON( "ts" << 1 << "h" << 1 );
+
+ //auto_ptr<DBClientCursor> u = us->query(rsoplog, q, 0, 0, &fields, 0, 0);
+
+ h.rbid = getRBID(them);
+ auto_ptr<DBClientCursor> t = them->query(rsoplog, q, 0, 0, &fields, 0, 0);
+
+ if( t.get() == 0 || !t->more() ) throw "remote oplog empty or unreadable";
+
+ BSONObj ourObj = u.current();
+ OpTime ourTime = ourObj["ts"]._opTime();
+ BSONObj theirObj = t->nextSafe();
+ OpTime theirTime = theirObj["ts"]._opTime();
+
+ {
+ long long diff = (long long) ourTime.getSecs() - ((long long) theirTime.getSecs());
+ /* diff could be positive, negative, or zero */
+ log() << "replSet info rollback our last optime: " << ourTime.toStringPretty() << rsLog;
+ log() << "replSet info rollback their last optime: " << theirTime.toStringPretty() << rsLog;
+ log() << "replSet info rollback diff in end of log times: " << diff << " seconds" << rsLog;
+ if( diff > 1800 ) {
+ log() << "replSet rollback too long a time period for a rollback." << rsLog;
+ throw "error not willing to roll back more than 30 minutes of data";
+ }
+ }
+
+ unsigned long long scanned = 0;
+ while( 1 ) {
+ scanned++;
+ /* todo add code to assure no excessive scanning for too long */
+ if( ourTime == theirTime ) {
+ if( ourObj["h"].Long() == theirObj["h"].Long() ) {
+ // found the point back in time where we match.
+ // todo : check a few more just to be careful about hash collisions.
+ log() << "replSet rollback found matching events at " << ourTime.toStringPretty() << rsLog;
+ log() << "replSet rollback findcommonpoint scanned : " << scanned << rsLog;
+ h.commonPoint = ourTime;
+ h.commonPointOurDiskloc = u.currLoc();
+ return;
+ }
+
+ refetch(h, ourObj);
+
+ if( !t->more() ) {
+ log() << "replSet rollback error RS100 reached beginning of remote oplog" << rsLog;
+ log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog;
+ log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog;
+ log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog;
+ throw "RS100 reached beginning of remote oplog [2]";
+ }
+ theirObj = t->nextSafe();
+ theirTime = theirObj["ts"]._opTime();
+
+ u.advance();
+ if( !u.ok() ) {
+ log() << "replSet rollback error RS101 reached beginning of local oplog" << rsLog;
+ log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog;
+ log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog;
+ log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog;
+ throw "RS101 reached beginning of local oplog [1]";
+ }
+ ourObj = u.current();
+ ourTime = ourObj["ts"]._opTime();
+ }
+ else if( theirTime > ourTime ) {
+ if( !t->more() ) {
+ log() << "replSet rollback error RS100 reached beginning of remote oplog" << rsLog;
+ log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog;
+ log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog;
+ log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog;
+ throw "RS100 reached beginning of remote oplog [1]";
+ }
+ theirObj = t->nextSafe();
+ theirTime = theirObj["ts"]._opTime();
+ }
+ else {
+ // theirTime < ourTime
+ refetch(h, ourObj);
+ u.advance();
+ if( !u.ok() ) {
+ log() << "replSet rollback error RS101 reached beginning of local oplog" << rsLog;
+ log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog;
+ log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog;
+ log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog;
+ throw "RS101 reached beginning of local oplog [2]";
+ }
+ ourObj = u.current();
+ ourTime = ourObj["ts"]._opTime();
+ }
+ }
+ }
+
+ struct X {
+ const bson::bo *op;
+ bson::bo goodVersionOfObject;
+ };
+
+ static void setMinValid(bo newMinValid) {
+ try {
+ log() << "replSet minvalid=" << newMinValid["ts"]._opTime().toStringLong() << rsLog;
+ }
+ catch(...) { }
+ {
+ Helpers::putSingleton("local.replset.minvalid", newMinValid);
+ Client::Context cx( "local." );
+ cx.db()->flushFiles(true);
+ }
+ }
+
+ void ReplSetImpl::syncFixUp(HowToFixUp& h, OplogReader& r) {
+ DBClientConnection *them = r.conn();
+
+ // fetch all first so we needn't handle interruption in a fancy way
+
+ unsigned long long totSize = 0;
+
+ list< pair<DocID,bo> > goodVersions;
+
+ bo newMinValid;
+
+ /* fetch all the goodVersions of each document from current primary */
+ DocID d;
+ unsigned long long n = 0;
+ try {
+ for( set<DocID>::iterator i = h.toRefetch.begin(); i != h.toRefetch.end(); i++ ) {
+ d = *i;
+
+ assert( !d._id.eoo() );
+
+ {
+ /* TODO : slow. lots of round trips. */
+ n++;
+ bo good= them->findOne(d.ns, d._id.wrap(), NULL, QueryOption_SlaveOk).getOwned();
+ totSize += good.objsize();
+ uassert( 13410, "replSet too much data to roll back", totSize < 300 * 1024 * 1024 );
+
+ // note good might be eoo, indicating we should delete it
+ goodVersions.push_back(pair<DocID,bo>(d,good));
+ }
+ }
+ newMinValid = r.getLastOp(rsoplog);
+ if( newMinValid.isEmpty() ) {
+ sethbmsg("rollback error newMinValid empty?");
+ return;
+ }
+ }
+ catch(DBException& e) {
+ sethbmsg(str::stream() << "rollback re-get objects: " << e.toString(),0);
+ log() << "rollback couldn't re-get ns:" << d.ns << " _id:" << d._id << ' ' << n << '/' << h.toRefetch.size() << rsLog;
+ throw e;
+ }
+
+ MemoryMappedFile::flushAll(true);
+
+ sethbmsg("rollback 3.5");
+ if( h.rbid != getRBID(r.conn()) ) {
+ // our source rolled back itself. so the data we received isn't necessarily consistent.
+ sethbmsg("rollback rbid on source changed during rollback, cancelling this attempt");
+ return;
+ }
+
+ // update them
+ sethbmsg(str::stream() << "rollback 4 n:" << goodVersions.size());
+
+ bool warn = false;
+
+ assert( !h.commonPointOurDiskloc.isNull() );
+
+ mongo::d.dbMutex.assertWriteLocked();
+
+ /* we have items we are writing that aren't from a point-in-time. thus best not to come online
+ until we get to that point in freshness. */
+ setMinValid(newMinValid);
+
+ /** any full collection resyncs required? */
+ if( !h.collectionsToResync.empty() ) {
+ for( set<string>::iterator i = h.collectionsToResync.begin(); i != h.collectionsToResync.end(); i++ ) {
+ string ns = *i;
+ sethbmsg(str::stream() << "rollback 4.1 coll resync " << ns);
+
+ Client::Context c(ns);
+ {
+ bob res;
+ string errmsg;
+ dropCollection(ns, errmsg, res);
+ {
+ dbtemprelease r;
+ bool ok = copyCollectionFromRemote(them->getServerAddress(), ns, errmsg);
+ uassert(15909, str::stream() << "replSet rollback error resyncing collection " << ns << ' ' << errmsg, ok);
+ }
+ }
+ }
+
+ /* we did more reading from primary, so check it again for a rollback (which would mess us up), and
+ make minValid newer.
+ */
+ sethbmsg("rollback 4.2");
+ {
+ string err;
+ try {
+ newMinValid = r.getLastOp(rsoplog);
+ if( newMinValid.isEmpty() ) {
+ err = "can't get minvalid from primary";
+ }
+ else {
+ setMinValid(newMinValid);
+ }
+ }
+ catch (DBException&) {
+ err = "can't get/set minvalid";
+ }
+ if( h.rbid != getRBID(r.conn()) ) {
+ // our source rolled back itself. so the data we received isn't necessarily consistent.
+ // however, we've now done writes. thus we have a problem.
+ err += "rbid at primary changed during resync/rollback";
+ }
+ if( !err.empty() ) {
+ log() << "replSet error rolling back : " << err << ". A full resync will be necessary." << rsLog;
+ /* todo: reset minvalid so that we are permanently in fatal state */
+ /* todo: don't be fatal, but rather, get all the data first. */
+ sethbmsg("rollback error");
+ throw rsfatal();
+ }
+ }
+ sethbmsg("rollback 4.3");
+ }
+
+ sethbmsg("rollback 4.6");
+        /** drop the collections marked for dropping before doing individual fixups - that can make the fixups below faster if there were subsequent inserts to roll back */
+ for( set<string>::iterator i = h.toDrop.begin(); i != h.toDrop.end(); i++ ) {
+ Client::Context c(*i);
+ try {
+ bob res;
+ string errmsg;
+ log(1) << "replSet rollback drop: " << *i << rsLog;
+ dropCollection(*i, errmsg, res);
+ }
+ catch(...) {
+ log() << "replset rollback error dropping collection " << *i << rsLog;
+ }
+ }
+
+ sethbmsg("rollback 4.7");
+ Client::Context c(rsoplog);
+ NamespaceDetails *oplogDetails = nsdetails(rsoplog);
+ uassert(13423, str::stream() << "replSet error in rollback can't find " << rsoplog, oplogDetails);
+
+ map<string,shared_ptr<RemoveSaver> > removeSavers;
+
+ unsigned deletes = 0, updates = 0;
+ for( list<pair<DocID,bo> >::iterator i = goodVersions.begin(); i != goodVersions.end(); i++ ) {
+ const DocID& d = i->first;
+ bo pattern = d._id.wrap(); // { _id : ... }
+ try {
+ assert( d.ns && *d.ns );
+ if( h.collectionsToResync.count(d.ns) ) {
+ /* we just synced this entire collection */
+ continue;
+ }
+
+ getDur().commitIfNeeded();
+
+ /* keep an archive of items rolled back */
+ shared_ptr<RemoveSaver>& rs = removeSavers[d.ns];
+ if ( ! rs )
+ rs.reset( new RemoveSaver( "rollback" , "" , d.ns ) );
+
+ // todo: lots of overhead in context, this can be faster
+ Client::Context c(d.ns);
+ if( i->second.isEmpty() ) {
+ // wasn't on the primary; delete.
+ /* TODO1.6 : can't delete from a capped collection. need to handle that here. */
+ deletes++;
+
+ NamespaceDetails *nsd = nsdetails(d.ns);
+ if( nsd ) {
+ if( nsd->capped ) {
+ /* can't delete from a capped collection - so we truncate instead. if this item must go,
+ so must all successors!!! */
+ try {
+                                /** todo: IIRC cappedTruncateAfter does not handle a completely empty collection. todo. */
+                                // this will be crazy slow if there is no _id index.
+ long long start = Listener::getElapsedTimeMillis();
+ DiskLoc loc = Helpers::findOne(d.ns, pattern, false);
+ if( Listener::getElapsedTimeMillis() - start > 200 )
+ log() << "replSet warning roll back slow no _id index for " << d.ns << " perhaps?" << rsLog;
+ //would be faster but requires index: DiskLoc loc = Helpers::findById(nsd, pattern);
+ if( !loc.isNull() ) {
+ try {
+ nsd->cappedTruncateAfter(d.ns, loc, true);
+ }
+ catch(DBException& e) {
+ if( e.getCode() == 13415 ) {
+ // hack: need to just make cappedTruncate do this...
+ nsd->emptyCappedCollection(d.ns);
+ }
+ else {
+ throw;
+ }
+ }
+ }
+ }
+ catch(DBException& e) {
+ log() << "replSet error rolling back capped collection rec " << d.ns << ' ' << e.toString() << rsLog;
+ }
+ }
+ else {
+ try {
+ deletes++;
+ deleteObjects(d.ns, pattern, /*justone*/true, /*logop*/false, /*god*/true, rs.get() );
+ }
+ catch(...) {
+ log() << "replSet error rollback delete failed ns:" << d.ns << rsLog;
+ }
+ }
+ // did we just empty the collection? if so let's check if it even exists on the source.
+ if( nsd->stats.nrecords == 0 ) {
+ try {
+ string sys = cc().database()->name + ".system.namespaces";
+ bo o = them->findOne(sys, QUERY("name"<<d.ns));
+ if( o.isEmpty() ) {
+ // we should drop
+ try {
+ bob res;
+ string errmsg;
+ dropCollection(d.ns, errmsg, res);
+ }
+ catch(...) {
+ log() << "replset error rolling back collection " << d.ns << rsLog;
+ }
+ }
+ }
+ catch(DBException& ) {
+ /* this isn't *that* big a deal, but is bad. */
+ log() << "replSet warning rollback error querying for existence of " << d.ns << " at the primary, ignoring" << rsLog;
+ }
+ }
+ }
+ }
+ else {
+ // todo faster...
+ OpDebug debug;
+ updates++;
+ _updateObjects(/*god*/true, d.ns, i->second, pattern, /*upsert=*/true, /*multi=*/false , /*logtheop=*/false , debug, rs.get() );
+ }
+ }
+ catch(DBException& e) {
+ log() << "replSet exception in rollback ns:" << d.ns << ' ' << pattern.toString() << ' ' << e.toString() << " ndeletes:" << deletes << rsLog;
+ warn = true;
+ }
+ }
+
+ removeSavers.clear(); // this effectively closes all of them
+
+ sethbmsg(str::stream() << "rollback 5 d:" << deletes << " u:" << updates);
+ MemoryMappedFile::flushAll(true);
+ sethbmsg("rollback 6");
+
+ // clean up oplog
+ LOG(2) << "replSet rollback truncate oplog after " << h.commonPoint.toStringPretty() << rsLog;
+ // todo: fatal error if this throws?
+ oplogDetails->cappedTruncateAfter(rsoplog, h.commonPointOurDiskloc, false);
+
+ /* reset cached lastoptimewritten and h value */
+ loadLastOpTimeWritten();
+
+ sethbmsg("rollback 7");
+ MemoryMappedFile::flushAll(true);
+
+ // done
+ if( warn )
+ sethbmsg("issues during syncRollback, see log");
+ else
+ sethbmsg("rollback done");
+ }
+
+ void ReplSetImpl::syncRollback(OplogReader&r) {
+ unsigned s = _syncRollback(r);
+ if( s )
+ sleepsecs(s);
+ }
+
+ unsigned ReplSetImpl::_syncRollback(OplogReader&r) {
+ assert( !lockedByMe() );
+ assert( !d.dbMutex.atLeastReadLocked() );
+
+ sethbmsg("rollback 0");
+
+ writelocktry lk(rsoplog, 20000);
+ if( !lk.got() ) {
+ sethbmsg("rollback couldn't get write lock in a reasonable time");
+ return 2;
+ }
+
+ if( state().secondary() ) {
+            /* by doing this, we will not service reads (we return an error as we aren't in secondary state).
+               that is perhaps moot because of the write lock above, but that write lock probably gets deferred
+ or removed or yielded later anyway.
+
+ also, this is better for status reporting - we know what is happening.
+ */
+ changeState(MemberState::RS_ROLLBACK);
+ }
+
+ HowToFixUp how;
+ sethbmsg("rollback 1");
+ {
+ r.resetCursor();
+
+ sethbmsg("rollback 2 FindCommonPoint");
+ try {
+ syncRollbackFindCommonPoint(r.conn(), how);
+ }
+ catch( const char *p ) {
+ sethbmsg(string("rollback 2 error ") + p);
+ return 10;
+ }
+ catch( rsfatal& ) {
+ _fatal();
+ return 2;
+ }
+ catch( DBException& e ) {
+ sethbmsg(string("rollback 2 exception ") + e.toString() + "; sleeping 1 min");
+ dbtemprelease r;
+ sleepsecs(60);
+ throw;
+ }
+ }
+
+ sethbmsg("replSet rollback 3 fixup");
+
+ {
+ incRBID();
+ try {
+ syncFixUp(how, r);
+ }
+ catch( rsfatal& ) {
+ sethbmsg("rollback fixup error");
+ _fatal();
+ return 2;
+ }
+ catch(...) {
+ incRBID(); throw;
+ }
+ incRBID();
+
+ /* success - leave "ROLLBACK" state
+ can go to SECONDARY once minvalid is achieved
+ */
+ changeState(MemberState::RS_RECOVERING);
+ }
+
+ return 0;
+ }
+
+}
diff --git a/src/mongo/db/repl/rs_sync.cpp b/src/mongo/db/repl/rs_sync.cpp
new file mode 100644
index 00000000000..8bac981d951
--- /dev/null
+++ b/src/mongo/db/repl/rs_sync.cpp
@@ -0,0 +1,701 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../client.h"
+#include "../../client/dbclient.h"
+#include "rs.h"
+#include "../repl.h"
+#include "connections.h"
+
+namespace mongo {
+
+ using namespace bson;
+ extern unsigned replSetForceInitialSyncFailure;
+
+ void NOINLINE_DECL blank(const BSONObj& o) {
+ if( *o.getStringField("op") != 'n' ) {
+ log() << "replSet skipping bad op in oplog: " << o.toString() << rsLog;
+ }
+ }
+
+ /* apply the log op that is in param o
+ @return bool success (true) or failure (false)
+ */
+ bool replset::SyncTail::syncApply(const BSONObj &o) {
+ const char *ns = o.getStringField("ns");
+ if ( *ns == '.' || *ns == 0 ) {
+ blank(o);
+ return true;
+ }
+
+ Client::Context ctx(ns);
+ ctx.getClient()->curop()->reset();
+ return !applyOperation_inlock(o);
+ }
+
+ /* initial oplog application, during initial sync, after cloning.
+ @return false on failure.
+ this method returns an error and doesn't throw exceptions (i think).
+ */
+ bool ReplSetImpl::initialSyncOplogApplication(const OpTime& applyGTE, const OpTime& minValid) {
+ Member *source = 0;
+ OplogReader r;
+
+ // keep trying to initial sync from oplog until we run out of targets
+ while ((source = _getOplogReader(r, applyGTE)) != 0) {
+ replset::InitialSync init(source->fullName());
+ if (init.oplogApplication(r, source, applyGTE, minValid)) {
+ return true;
+ }
+
+ r.resetConnection();
+ veto(source->fullName(), 60);
+ log() << "replSet applying oplog from " << source->fullName() << " failed, trying again" << endl;
+ }
+
+ log() << "replSet initial sync error: couldn't find oplog to sync from" << rsLog;
+ return false;
+ }
+
+ bool replset::InitialSync::oplogApplication(OplogReader& r, const Member* source,
+ const OpTime& applyGTE, const OpTime& minValid) {
+
+ const string hn = source->fullName();
+ try {
+ r.tailingQueryGTE( rsoplog, applyGTE );
+ if ( !r.haveCursor() ) {
+ log() << "replSet initial sync oplog query error" << rsLog;
+ return false;
+ }
+
+ {
+ if( !r.more() ) {
+ sethbmsg("replSet initial sync error reading remote oplog");
+ log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
+ return false;
+ }
+ bo op = r.next();
+ OpTime t = op["ts"]._opTime();
+ r.putBack(op);
+
+ if( op.firstElementFieldName() == string("$err") ) {
+ log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
+ return false;
+ }
+
+ uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
+ if( t > applyGTE ) {
+ sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
+ log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
+ log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
+ return false;
+ }
+
+ sethbmsg(str::stream() << "initial oplog application from " << hn << " starting at "
+ << t.toStringPretty() << " to " << minValid.toStringPretty());
+ }
+ }
+ catch(DBException& e) {
+ log() << "replSet initial sync failing: " << e.toString() << rsLog;
+ return false;
+ }
+
+ /* we lock outside the loop to avoid the overhead of locking on every operation. */
+ writelock lk("");
+
+ // todo : use exhaust
+ OpTime ts;
+ time_t start = time(0);
+ unsigned long long n = 0;
+ int fails = 0;
+ while( ts < minValid ) {
+ try {
+ // There are some special cases with initial sync (see the catch block), so we
+ // don't want to break out of this while until we've reached minvalid. Thus, we'll
+ // keep trying to requery.
+ if( !r.more() ) {
+ OCCASIONALLY log() << "replSet initial sync oplog: no more records" << endl;
+ sleepsecs(1);
+
+ r.resetCursor();
+ r.tailingQueryGTE(rsoplog, theReplSet->lastOpTimeWritten);
+ if ( !r.haveCursor() ) {
+ if (fails++ > 30) {
+ log() << "replSet initial sync tried to query oplog 30 times, giving up" << endl;
+ return false;
+ }
+ }
+
+ continue;
+ }
+
+ BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
+ ts = o["ts"]._opTime();
+
+ {
+ if( (source->state() != MemberState::RS_PRIMARY &&
+ source->state() != MemberState::RS_SECONDARY) ||
+ replSetForceInitialSyncFailure ) {
+
+ int f = replSetForceInitialSyncFailure;
+ if( f > 0 ) {
+ replSetForceInitialSyncFailure = f-1;
+ log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
+ throw DBException("forced error",0);
+ }
+ log() << "replSet we are now primary" << rsLog;
+ throw DBException("primary changed",0);
+ }
+
+ applyOp(o, applyGTE);
+ }
+
+ if ( ++n % 1000 == 0 ) {
+ time_t now = time(0);
+ if (now - start > 10) {
+ // simple progress metering
+ log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to "
+ << ts.toStringPretty() << rsLog;
+ start = now;
+ }
+ }
+
+ getDur().commitIfNeeded();
+ }
+ catch (DBException& e) {
+ // Skip duplicate key exceptions.
+ // These are relatively common on initial sync: if a document is inserted
+ // early in the clone step, the insert will be replayed but the document
+ // will probably already have been cloned over.
+ if( e.getCode() == 11000 || e.getCode() == 11001 || e.getCode() == 12582) {
+ continue;
+ }
+
+ // handle cursor not found (just requery)
+ if( e.getCode() == 13127 ) {
+ log() << "replSet requerying oplog after cursor not found condition, ts: " << ts.toStringPretty() << endl;
+ r.resetCursor();
+ r.tailingQueryGTE(rsoplog, ts);
+ if( r.haveCursor() ) {
+ continue;
+ }
+ }
+
+ // TODO: handle server restart
+
+ if( ts <= minValid ) {
+ // didn't make it far enough
+ log() << "replSet initial sync failing, error applying oplog : " << e.toString() << rsLog;
+ return false;
+ }
+
+ // otherwise, whatever, we'll break out of the loop and catch
+ // anything that's really wrong in syncTail
+ }
+ }
+ return true;
+ }
+
+ void replset::InitialSync::applyOp(const BSONObj& o, const OpTime& applyGTE) {
+ OpTime ts = o["ts"]._opTime();
+
+ // optimes before we started copying need not be applied.
+ if( ts >= applyGTE ) {
+ if (!syncApply(o)) {
+ if (shouldRetry(o)) {
+ uassert(15915, "replSet update still fails after adding missing object", syncApply(o));
+ }
+ }
+ }
+
+ // with repl sets we write the ops to our oplog, too
+ _logOpObjRS(o);
+ }
+
+ /* should be in RECOVERING state on arrival here.
+ readlocks
+ @return true if transitioned to SECONDARY
+ */
+ bool ReplSetImpl::tryToGoLiveAsASecondary(OpTime& /*out*/ minvalid) {
+ bool golive = false;
+
+ {
+ lock lk( this );
+
+ if (_maintenanceMode > 0) {
+ // we're not actually going live
+ return true;
+ }
+ }
+
+ {
+ readlock lk("local.replset.minvalid");
+ BSONObj mv;
+ if( Helpers::getSingleton("local.replset.minvalid", mv) ) {
+ minvalid = mv["ts"]._opTime();
+ if( minvalid <= lastOpTimeWritten ) {
+ golive=true;
+ }
+ }
+ else
+ golive = true; /* must have been the original member */
+ }
+ if( golive ) {
+ sethbmsg("");
+ changeState(MemberState::RS_SECONDARY);
+ }
+ return golive;
+ }
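+    // e.g. minvalid ts=200 (noted when the resync data was fetched) and
+    // lastOpTimeWritten ts=210: we have replayed past the fetch point, so we
+    // may go SECONDARY; at lastOpTimeWritten ts=180 we would stay RECOVERING.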
+
+ bool ReplSetImpl::_isStale(OplogReader& r, const OpTime& startTs, BSONObj& remoteOldestOp) {
+ remoteOldestOp = r.findOne(rsoplog, Query());
+ OpTime remoteTs = remoteOldestOp["ts"]._opTime();
+ DEV log() << "replSet remoteOldestOp: " << remoteTs.toStringLong() << rsLog;
+ else LOG(3) << "replSet remoteOldestOp: " << remoteTs.toStringLong() << rsLog;
+ DEV {
+ log() << "replSet lastOpTimeWritten: " << lastOpTimeWritten.toStringLong() << rsLog;
+ log() << "replSet our state: " << state().toString() << rsLog;
+ }
+ if( startTs >= remoteTs ) {
+ return false;
+ }
+
+ return true;
+ }
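+    // e.g. if our last applied optime is 100 but the target's oldest oplog
+    // entry is 250, ops 100..250 have rolled off its capped oplog and we can
+    // never catch up from it -- the RS102 case handled in _getOplogReader.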
+
+ Member* ReplSetImpl::_getOplogReader(OplogReader& r, const OpTime& minTS) {
+ Member *target = 0, *stale = 0;
+ BSONObj oldest;
+
+ assert(r.conn() == 0);
+
+ while ((target = getMemberToSyncTo()) != 0) {
+ string current = target->fullName();
+
+ if( !r.connect(current) ) {
+ log(2) << "replSet can't connect to " << current << " to read operations" << rsLog;
+ r.resetConnection();
+ veto(current);
+ continue;
+ }
+
+ if( !minTS.isNull() && _isStale(r, minTS, oldest) ) {
+ r.resetConnection();
+ veto(current, 600);
+ stale = target;
+ continue;
+ }
+
+ // if we made it here, the target is up and not stale
+ return target;
+ }
+
+ // the only viable sync target was stale
+ if (stale) {
+ log() << "replSet error RS102 too stale to catch up, at least from " << stale->fullName() << rsLog;
+ log() << "replSet our last optime : " << lastOpTimeWritten.toStringLong() << rsLog;
+ log() << "replSet oldest at " << stale->fullName() << " : " << oldest["ts"]._opTime().toStringLong() << rsLog;
+ log() << "replSet See http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << rsLog;
+
+ // reset minvalid so that we can't become primary prematurely
+ {
+ writelock lk("local.replset.minvalid");
+ Helpers::putSingleton("local.replset.minvalid", oldest);
+ }
+
+ sethbmsg("error RS102 too stale to catch up");
+ changeState(MemberState::RS_RECOVERING);
+ sleepsecs(120);
+ }
+
+ return 0;
+ }
+
+ /* tail an oplog. ok to return, will be re-called. */
+ void ReplSetImpl::syncTail() {
+ // todo : locking vis a vis the mgr...
+ OplogReader r;
+ string hn;
+
+ // find a target to sync from the last op time written
+ Member* target = _getOplogReader(r, lastOpTimeWritten);
+
+ // no server found
+ if (target == 0) {
+ // if there is no one to sync from
+ OpTime minvalid;
+ tryToGoLiveAsASecondary(minvalid);
+ return;
+ }
+
+ r.tailingQueryGTE(rsoplog, lastOpTimeWritten);
+ // if target cut connections between connecting and querying (for
+ // example, because it stepped down) we might not have a cursor
+ if ( !r.haveCursor() ) {
+ return;
+ }
+
+ uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );
+
+ {
+ if( !r.more() ) {
+ /* maybe we are ahead and need to roll back? */
+ try {
+ bo theirLastOp = r.getLastOp(rsoplog);
+ if( theirLastOp.isEmpty() ) {
+ log() << "replSet error empty query result from " << hn << " oplog" << rsLog;
+ sleepsecs(2);
+ return;
+ }
+ OpTime theirTS = theirLastOp["ts"]._opTime();
+ if( theirTS < lastOpTimeWritten ) {
+ log() << "replSet we are ahead of the primary, will try to roll back" << rsLog;
+ syncRollback(r);
+ return;
+ }
+ /* we're not ahead? maybe our new query got fresher data. best to come back and try again */
+ log() << "replSet syncTail condition 1" << rsLog;
+ sleepsecs(1);
+ }
+ catch(DBException& e) {
+ log() << "replSet error querying " << hn << ' ' << e.toString() << rsLog;
+ veto(target->fullName());
+ sleepsecs(2);
+ }
+ return;
+ }
+
+ BSONObj o = r.nextSafe();
+ OpTime ts = o["ts"]._opTime();
+ long long h = o["h"].numberLong();
+ if( ts != lastOpTimeWritten || h != lastH ) {
+ log() << "replSet our last op time written: " << lastOpTimeWritten.toStringPretty() << rsLog;
+ log() << "replset source's GTE: " << ts.toStringPretty() << rsLog;
+ syncRollback(r);
+ return;
+ }
+ }
+
+ /* we have now checked if we need to rollback and we either don't have to or did it. */
+ {
+ OpTime minvalid;
+ tryToGoLiveAsASecondary(minvalid);
+ }
+
+ while( 1 ) {
+ {
+ Timer timeInWriteLock;
+ writelock lk("");
+ while( 1 ) {
+ if( !r.moreInCurrentBatch() ) {
+ dbtemprelease tempRelease;
+ {
+ // we need to occasionally check some things. between
+ // batches is probably a good time.
+ if( state().recovering() ) { // perhaps we should check this earlier? but not before the rollback checks.
+ /* can we go to RS_SECONDARY state? we can if not too old and if minvalid achieved */
+ OpTime minvalid;
+ bool golive = ReplSetImpl::tryToGoLiveAsASecondary(minvalid);
+ if( golive ) {
+ ;
+ }
+ else {
+ sethbmsg(str::stream() << "still syncing, not yet to minValid optime" << minvalid.toString());
+ }
+ // todo: too stale capability
+ }
+ if( !target->hbinfo().hbstate.readable() ) {
+ return;
+ }
+ }
+ r.more(); // to make the requestmore outside the db lock, which obviously is quite important
+ }
+ if( timeInWriteLock.micros() > 1000 ) {
+ dbtemprelease tempRelease;
+ timeInWriteLock.reset();
+ }
+ if( !r.more() )
+ break;
+ {
+ BSONObj o = r.nextSafe(); // note we might get "not master" at some point
+
+ int sd = myConfig().slaveDelay;
+ // ignore slaveDelay if the box is still initializing. once
+ // it becomes secondary we can worry about it.
+ if( sd && box.getState().secondary() ) {
+ const OpTime ts = o["ts"]._opTime();
+ long long a = ts.getSecs();
+ long long b = time(0);
+ long long lag = b - a;
+ long long sleeptime = sd - lag;
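+                        // e.g. slaveDelay=3600 and an op written 3500s ago:
+                        // lag=3500, sleeptime=100, so we hold this op another
+                        // 100 seconds before applying it.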
+ if( sleeptime > 0 ) {
+ dbtemprelease tempRelease;
+ uassert(12000, "rs slaveDelay differential too big check clocks and systems", sleeptime < 0x40000000);
+ if( sleeptime < 60 ) {
+ sleepsecs((int) sleeptime);
+ }
+ else {
+ log() << "replSet slavedelay sleep long time: " << sleeptime << rsLog;
+ // sleep(hours) would prevent reconfigs from taking effect & such!
+ long long waitUntil = b + sleeptime;
+ while( 1 ) {
+ sleepsecs(6);
+ if( time(0) >= waitUntil )
+ break;
+
+ if( !target->hbinfo().hbstate.readable() ) {
+ break;
+ }
+
+ if( myConfig().slaveDelay != sd ) // reconf
+ break;
+ }
+ }
+ }
+ } // endif slaveDelay
+
+ d.dbMutex.assertWriteLocked();
+ try {
+                    /* if we have become primary, we don't want to apply things from elsewhere
+ anymore. assumePrimary is in the db lock so we are safe as long as
+ we check after we locked above. */
+ if( box.getState().primary() ) {
+ log(0) << "replSet stopping syncTail we are now primary" << rsLog;
+ return;
+ }
+
+ // TODO: make this whole method a member of SyncTail (SERVER-4444)
+ replset::SyncTail tail("");
+ tail.syncApply(o);
+ _logOpObjRS(o); // with repl sets we write the ops to our oplog too
+ }
+ catch (DBException& e) {
+ sethbmsg(str::stream() << "syncTail: " << e.toString() << ", syncing: " << o);
+ veto(target->fullName(), 300);
+ sleepsecs(30);
+ return;
+ }
+ }
+ } // end while
+ } // end writelock scope
+
+ r.tailCheck();
+ if( !r.haveCursor() ) {
+ LOG(1) << "replSet end syncTail pass with " << hn << rsLog;
+ // TODO : reuse our connection to the primary.
+ return;
+ }
+
+ if( !target->hbinfo().hbstate.readable() ) {
+ return;
+ }
+ // looping back is ok because this is a tailable cursor
+ }
+ }
+
+ void ReplSetImpl::_syncThread() {
+ StateBox::SP sp = box.get();
+ if( sp.state.primary() ) {
+ sleepsecs(1);
+ return;
+ }
+ if( _blockSync || sp.state.fatal() || sp.state.startup() ) {
+ sleepsecs(5);
+ return;
+ }
+
+ /* do we have anything at all? */
+ if( lastOpTimeWritten.isNull() ) {
+ syncDoInitialSync();
+ return; // _syncThread will be recalled, starts from top again in case sync failed.
+ }
+
+ /* we have some data. continue tailing. */
+ syncTail();
+ }
+
+ void ReplSetImpl::syncThread() {
+ while( 1 ) {
+ // After a reconfig, we may not be in the replica set anymore, so
+ // check that we are in the set (and not an arbiter) before
+ // trying to sync with other replicas.
+ if( ! _self ) {
+ log() << "replSet warning did not detect own host and port, not syncing, config: " << theReplSet->config() << rsLog;
+ return;
+ }
+ if( myConfig().arbiterOnly ) {
+ return;
+ }
+
+ try {
+ _syncThread();
+ }
+ catch(DBException& e) {
+ sethbmsg(str::stream() << "syncThread: " << e.toString());
+ sleepsecs(10);
+ }
+ catch(...) {
+ sethbmsg("unexpected exception in syncThread()");
+ // TODO : SET NOT SECONDARY here?
+ sleepsecs(60);
+ }
+ sleepsecs(1);
+
+ /* normally msgCheckNewState gets called periodically, but in a single node repl set there
+ are no heartbeat threads, so we do it here to be sure. this is relevant if the singleton
+ member has done a stepDown() and needs to come back up.
+ */
+ OCCASIONALLY {
+ mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
+ }
+ }
+ }
+
+ void startSyncThread() {
+ static int n;
+ if( n != 0 ) {
+ log() << "replSet ERROR : more than one sync thread?" << rsLog;
+ assert( n == 0 );
+ }
+ n++;
+
+ Client::initThread("rsSync");
+        cc().iAmSyncThread(); // for isSyncThread() (not used much; used in the secondary create-index code)
+ replLocalAuth();
+ theReplSet->syncThread();
+ cc().shutdown();
+ }
+
+ void GhostSync::starting() {
+ Client::initThread("rsGhostSync");
+ replLocalAuth();
+ }
+
+ void ReplSetImpl::blockSync(bool block) {
+ _blockSync = block;
+ if (_blockSync) {
+ // syncing is how we get into SECONDARY state, so we'll be stuck in
+ // RECOVERING until we unblock
+ changeState(MemberState::RS_RECOVERING);
+ }
+ }
+
+ void GhostSync::associateSlave(const BSONObj& id, const int memberId) {
+ const OID rid = id["_id"].OID();
+ rwlock lk( _lock , true );
+ shared_ptr<GhostSlave> &g = _ghostCache[rid];
+ if( g.get() == 0 ) {
+ g.reset( new GhostSlave() );
+ wassert( _ghostCache.size() < 10000 );
+ }
+ GhostSlave &slave = *g;
+ if (slave.init) {
+ LOG(1) << "tracking " << slave.slave->h().toString() << " as " << rid << rsLog;
+ return;
+ }
+
+ slave.slave = (Member*)rs->findById(memberId);
+ if (slave.slave != 0) {
+ slave.init = true;
+ }
+ else {
+ log() << "replset couldn't find a slave with id " << memberId
+ << ", not tracking " << rid << rsLog;
+ }
+ }
+
+ void GhostSync::updateSlave(const mongo::OID& rid, const OpTime& last) {
+ rwlock lk( _lock , false );
+ MAP::iterator i = _ghostCache.find( rid );
+ if ( i == _ghostCache.end() ) {
+ OCCASIONALLY warning() << "couldn't update slave " << rid << " no entry" << rsLog;
+ return;
+ }
+
+ GhostSlave& slave = *(i->second);
+ if (!slave.init) {
+ OCCASIONALLY log() << "couldn't update slave " << rid << " not init" << rsLog;
+ return;
+ }
+
+ ((ReplSetConfig::MemberCfg)slave.slave->config()).updateGroups(last);
+ }
+
+ void GhostSync::percolate(const BSONObj& id, const OpTime& last) {
+ const OID rid = id["_id"].OID();
+ GhostSlave* slave;
+ {
+ rwlock lk( _lock , false );
+
+ MAP::iterator i = _ghostCache.find( rid );
+ if ( i == _ghostCache.end() ) {
+ OCCASIONALLY log() << "couldn't percolate slave " << rid << " no entry" << rsLog;
+ return;
+ }
+
+ slave = i->second.get();
+ if (!slave->init) {
+ OCCASIONALLY log() << "couldn't percolate slave " << rid << " not init" << rsLog;
+ return;
+ }
+ }
+
+ assert(slave->slave);
+
+ const Member *target = rs->_currentSyncTarget;
+ if (!target || rs->box.getState().primary()
+ // we are currently syncing from someone who's syncing from us
+ // the target might end up with a new Member, but s.slave never
+ // changes so we'll compare the names
+ || target == slave->slave || target->fullName() == slave->slave->fullName()) {
+ LOG(1) << "replica set ghost target no good" << endl;
+ return;
+ }
+
+ try {
+ if (!slave->reader.haveCursor()) {
+ if (!slave->reader.connect(id, slave->slave->id(), target->fullName())) {
+ // error message logged in OplogReader::connect
+ return;
+ }
+ slave->reader.ghostQueryGTE(rsoplog, last);
+ }
+
+ LOG(1) << "replSet last: " << slave->last.toString() << " to " << last.toString() << rsLog;
+ if (slave->last > last) {
+ return;
+ }
+
+ while (slave->last <= last) {
+ if (!slave->reader.more()) {
+ // we'll be back
+ return;
+ }
+
+ BSONObj o = slave->reader.nextSafe();
+ slave->last = o["ts"]._opTime();
+ }
+ LOG(2) << "now last is " << slave->last.toString() << rsLog;
+ }
+ catch (DBException& e) {
+ // we'll be back
+ LOG(2) << "replSet ghost sync error: " << e.what() << " for "
+ << slave->slave->fullName() << rsLog;
+ slave->reader.resetConnection();
+ }
+ }
+}
diff --git a/src/mongo/db/repl/test.html b/src/mongo/db/repl/test.html
new file mode 100644
index 00000000000..295ad2ef0e0
--- /dev/null
+++ b/src/mongo/db/repl/test.html
@@ -0,0 +1,11 @@
+<HTML>
+<BODY>
+<!-- see also jstests/rs/ -->
+<iframe src="http://127.0.0.1:28000/_replSet" width="100%" height="50%" frameborder=1>
+</iframe>
+
+<iframe src="http://127.0.0.1:28001/_replSet" width="100%" height="50%" frameborder=1>
+</iframe>
+
+</BODY>
+</HTML>
diff --git a/src/mongo/db/repl/testing.js b/src/mongo/db/repl/testing.js
new file mode 100644
index 00000000000..d741cf3a644
--- /dev/null
+++ b/src/mongo/db/repl/testing.js
@@ -0,0 +1,42 @@
+// helpers for testing repl sets
+// run
+// mongo --shell <host:port> testing.js
+
+cfg = {
+ _id: 'asdf',
+ members: [
+ { _id : 0, host : "dm_hp" },
+ { _id : 2, host : "dm_hp:27002" }
+ ]
+};
+c2 = {
+ _id: 'asdf',
+ members: [
+ { _id: 0, host: "dmthink" },
+ { _id: 2, host: "dmthink:27002" }
+ ]
+};
+
+db = db.getSisterDB("admin");
+local = db.getSisterDB("local");
+
+print("\n\ndb = admin db on localhost:27017");
+print("b = admin on localhost:27002");
+print("rc(x) = db.runCommand(x)");
+print("cfg = samp replset config");
+print("i() = replSetInitiate(cfg)");
+print("ism() = rc('ismaster')");
+print("\n\n");
+
+function rc(c) { return db.runCommand(c); }
+function i() { return rc({ replSetInitiate: cfg }); }
+function ism() { return rc("isMaster"); }
+
+b = 0;
+try {
+ b = new Mongo("localhost:27002").getDB("admin");
+}
+catch (e) {
+ print("\nCouldn't connect to b mongod instance\n");
+}
+
diff --git a/src/mongo/db/repl_block.cpp b/src/mongo/db/repl_block.cpp
new file mode 100644
index 00000000000..1776225505c
--- /dev/null
+++ b/src/mongo/db/repl_block.cpp
@@ -0,0 +1,256 @@
+// repl_block.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "repl.h"
+#include "repl_block.h"
+#include "instance.h"
+#include "dbhelpers.h"
+#include "../util/background.h"
+#include "../util/mongoutils/str.h"
+#include "../client/dbclient.h"
+#include "replutil.h"
+
+//#define REPLDEBUG(x) log() << "replBlock: " << x << endl;
+#define REPLDEBUG(x)
+
+namespace mongo {
+
+ using namespace mongoutils;
+
+ class SlaveTracking : public BackgroundJob {
+ public:
+ string name() const { return "SlaveTracking"; }
+
+ static const char * NS;
+
+ struct Ident {
+
+ Ident(const BSONObj& r, const string& h, const string& n) {
+ BSONObjBuilder b;
+ b.appendElements( r );
+ b.append( "host" , h );
+ b.append( "ns" , n );
+ obj = b.obj();
+ }
+
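+ // ordering (and hence map identity) uses only the rid's "_id" OID;
+ // host and ns are carried along for the local.slaves document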
+ bool operator<( const Ident& other ) const {
+ return obj["_id"].OID() < other.obj["_id"].OID();
+ }
+
+ BSONObj obj;
+ };
+
+ struct Info {
+ Info() : loc(0) {}
+ ~Info() {
+ if ( loc && owned ) {
+ delete loc;
+ }
+ }
+ bool owned; // true if loc is a pointer of our creation (and not a pointer into an MMF)
+ OpTime * loc;
+ };
+
+ SlaveTracking() : _mutex("SlaveTracking") {
+ _dirty = false;
+ _started = false;
+ }
+
+ void run() {
+ Client::initThread( "slaveTracking" );
+ DBDirectClient db;
+ while ( ! inShutdown() ) {
+ sleepsecs( 1 );
+
+ if ( ! _dirty )
+ continue;
+
+ writelock lk(NS);
+
+ list< pair<BSONObj,BSONObj> > todo;
+
+ {
+ scoped_lock mylk(_mutex);
+
+ for ( map<Ident,Info>::iterator i=_slaves.begin(); i!=_slaves.end(); i++ ) {
+ BSONObjBuilder temp;
+ temp.appendTimestamp( "syncedTo" , i->second.loc[0].asDate() );
+ todo.push_back( pair<BSONObj,BSONObj>( i->first.obj.getOwned() ,
+ BSON( "$set" << temp.obj() ).getOwned() ) );
+ }
+ }
+
+ for ( list< pair<BSONObj,BSONObj> >::iterator i=todo.begin(); i!=todo.end(); i++ ) {
+ db.update( NS , i->first , i->second , true );
+ }
+
+ _dirty = false;
+ }
+ }
+
+ void reset() {
+ scoped_lock mylk(_mutex);
+ _slaves.clear();
+ }
+
+ void update( const BSONObj& rid , const string& host , const string& ns , OpTime last ) {
+ REPLDEBUG( host << " " << rid << " " << ns << " " << last );
+
+ scoped_lock mylk(_mutex);
+
+#ifdef _DEBUG
+ MongoFileAllowWrites allowWrites;
+#endif
+
+ Ident ident(rid,host,ns);
+ Info& i = _slaves[ ident ];
+
+ if (theReplSet && theReplSet->isPrimary()) {
+ theReplSet->ghost->updateSlave(ident.obj["_id"].OID(), last);
+ }
+
+ if ( i.loc ) {
+ if( i.owned )
+ i.loc[0] = last;
+ else
+ getDur().setNoJournal(i.loc, &last, sizeof(last));
+ return;
+ }
+
+ d.dbMutex.assertAtLeastReadLocked();
+
+ BSONObj res;
+ if ( Helpers::findOne( NS , ident.obj , res ) ) {
+ assert( res["syncedTo"].type() );
+ i.owned = false;
+ i.loc = (OpTime*)res["syncedTo"].value();
+ getDur().setNoJournal(i.loc, &last, sizeof(last));
+ return;
+ }
+
+ i.owned = true;
+ i.loc = new OpTime(last);
+ _dirty = true;
+
+ if ( ! _started ) {
+ // start background thread here since we definitely need it
+ _started = true;
+ go();
+ }
+
+ }
+
+ bool opReplicatedEnough( OpTime op , BSONElement w ) {
+ RARELY {
+ REPLDEBUG( "looking for : " << op << " w=" << w );
+ }
+
+ if (w.isNumber()) {
+ return replicatedToNum(op, w.numberInt());
+ }
+
+ if (!theReplSet) {
+ return false;
+ }
+
+ string wStr = w.String();
+ if (wStr == "majority") {
+ // use the entire set, including arbiters, to prevent writing
+ // to a majority of the set but not a majority of voters
+ return replicatedToNum(op, theReplSet->config().getMajority());
+ }
+
+ map<string,ReplSetConfig::TagRule*>::const_iterator it = theReplSet->config().rules.find(wStr);
+ uassert(14830, str::stream() << "unrecognized getLastError mode: " << wStr,
+ it != theReplSet->config().rules.end());
+
+ return op <= (*it).second->last;
+ }
+
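+ // true once this node plus (w-1) tracked slaves have reached 'op'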
+ bool replicatedToNum(OpTime& op, int w) {
+ if ( w <= 1 || ! _isMaster() )
+ return true;
+
+ w--; // now this is the # of slaves i need
+ scoped_lock mylk(_mutex);
+ for ( map<Ident,Info>::iterator i=_slaves.begin(); i!=_slaves.end(); i++) {
+ OpTime s = *(i->second.loc);
+ if ( s < op ) {
+ continue;
+ }
+ if ( --w == 0 )
+ return true;
+ }
+ return w <= 0;
+ }
+
+ unsigned getSlaveCount() const {
+ scoped_lock mylk(_mutex);
+
+ return _slaves.size();
+ }
+
+ // need to be careful not to deadlock with this
+ mutable mongo::mutex _mutex;
+ map<Ident,Info> _slaves;
+ bool _dirty;
+ bool _started;
+
+ } slaveTracking;
+
+ const char * SlaveTracking::NS = "local.slaves";
+
+ void updateSlaveLocation( CurOp& curop, const char * ns , OpTime lastOp ) {
+ if ( lastOp.isNull() )
+ return;
+
+ assert( str::startsWith(ns, "local.oplog.") );
+
+ Client * c = curop.getClient();
+ assert(c);
+ BSONObj rid = c->getRemoteID();
+ if ( rid.isEmpty() )
+ return;
+
+ slaveTracking.update( rid , curop.getRemoteString( false ) , ns , lastOp );
+
+ if (theReplSet && !theReplSet->isPrimary()) {
+ // we don't know the slave's port, so we make the replica set keep
+ // a map of rids to slaves
+ log(2) << "percolating " << lastOp.toString() << " from " << rid << endl;
+ theReplSet->ghost->send( boost::bind(&GhostSync::percolate, theReplSet->ghost, rid, lastOp) );
+ }
+ }
+
+ bool opReplicatedEnough( OpTime op , BSONElement w ) {
+ return slaveTracking.opReplicatedEnough( op , w );
+ }
+
+ bool opReplicatedEnough( OpTime op , int w ) {
+ return slaveTracking.replicatedToNum( op , w );
+ }
+
+ void resetSlaveCache() {
+ slaveTracking.reset();
+ }
+
+ unsigned getSlaveCount() {
+ return slaveTracking.getSlaveCount();
+ }
+}
diff --git a/src/mongo/db/repl_block.h b/src/mongo/db/repl_block.h
new file mode 100644
index 00000000000..bb74deea10f
--- /dev/null
+++ b/src/mongo/db/repl_block.h
@@ -0,0 +1,39 @@
+// repl_block.h - blocking on writes for replication
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "client.h"
+#include "curop.h"
+
+/**
+ local.slaves - current location for all slaves
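+ each document is keyed by the slave's rid plus host and ns, roughly (a sketch):
+ { _id : <rid>, host : "<host:port>", ns : "local.oplog.$main", syncedTo : <ts> }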
+
+ */
+namespace mongo {
+
+ void updateSlaveLocation( CurOp& curop, const char * oplog_ns , OpTime lastOp );
+
+ /** @return true if op has made it to w servers */
+ bool opReplicatedEnough( OpTime op , int w );
+ bool opReplicatedEnough( OpTime op , BSONElement w );
+
+ void resetSlaveCache();
+ unsigned getSlaveCount();
+}
diff --git a/src/mongo/db/replutil.h b/src/mongo/db/replutil.h
new file mode 100644
index 00000000000..6f4dbb875d2
--- /dev/null
+++ b/src/mongo/db/replutil.h
@@ -0,0 +1,102 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "db.h"
+#include "dbhelpers.h"
+#include "json.h"
+#include "../client/dbclient.h"
+#include "repl.h"
+#include "cmdline.h"
+#include "repl/rs.h"
+#include "ops/query.h"
+
+namespace mongo {
+
+ extern const char *replAllDead;
+
+ /* note we always return true for the "local" namespace.
+
+ we should not allow most operations when not the master;
+ we also report not master if we are "dead".
+
+ See also CmdIsMaster.
+
+ If 'dbname' is not specified, the database of the current client is used.
+ */
+ inline bool _isMaster() {
+ if( replSet ) {
+ if( theReplSet )
+ return theReplSet->isPrimary();
+ return false;
+ }
+
+ if( ! replSettings.slave )
+ return true;
+
+ if ( replAllDead )
+ return false;
+
+ if( replSettings.master ) {
+ // if running with --master --slave, allow.
+ return true;
+ }
+
+ if ( cc().isGod() )
+ return true;
+
+ return false;
+ }
+ inline bool isMaster(const char * dbname = 0) {
+ if( _isMaster() )
+ return true;
+ if ( ! dbname ) {
+ Database *database = cc().database();
+ assert( database );
+ dbname = database->name.c_str();
+ }
+ return strcmp( dbname , "local" ) == 0;
+ }
+ inline bool isMasterNs( const char *ns ) {
+ if ( _isMaster() )
+ return true;
+ assert( ns );
+ if ( ! str::startsWith( ns , "local" ) )
+ return false;
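+ // otherwise allow "local" itself or any "local.*" namespace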
+ return ns[5] == 0 || ns[5] == '.';
+ }
+
+ inline void notMasterUnless(bool expr) {
+ uassert( 10107 , "not master" , expr );
+ }
+
+ /** we allow queries to SimpleSlave's */
+ inline void replVerifyReadsOk(ParsedQuery& pq) {
+ if( replSet ) {
+ /* todo: speed up the secondary case. as written here there are 2 mutex entries; it could be 1. */
+ if( isMaster() ) return;
+ uassert(13435, "not master and slaveOk=false", pq.hasOption(QueryOption_SlaveOk));
+ uassert(13436, "not master or secondary; cannot currently read from this replSet member", theReplSet && theReplSet->isSecondary() );
+ }
+ else {
+ notMasterUnless(isMaster() || pq.hasOption(QueryOption_SlaveOk) || replSettings.slave == SimpleSlave );
+ }
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/resource.h b/src/mongo/db/resource.h
new file mode 100644
index 00000000000..9ba1ed26a0c
--- /dev/null
+++ b/src/mongo/db/resource.h
@@ -0,0 +1,16 @@
+//{{NO_DEPENDENCIES}}
+// Microsoft Visual C++ generated include file.
+// Used by db.rc
+//
+#define IDI_ICON2 102
+
+// Next default values for new objects
+//
+#ifdef APSTUDIO_INVOKED
+#ifndef APSTUDIO_READONLY_SYMBOLS
+#define _APS_NEXT_RESOURCE_VALUE 104
+#define _APS_NEXT_COMMAND_VALUE 40001
+#define _APS_NEXT_CONTROL_VALUE 1001
+#define _APS_NEXT_SYMED_VALUE 101
+#endif
+#endif
diff --git a/src/mongo/db/restapi.cpp b/src/mongo/db/restapi.cpp
new file mode 100644
index 00000000000..370051354a2
--- /dev/null
+++ b/src/mongo/db/restapi.cpp
@@ -0,0 +1,294 @@
+/** @file restapi.cpp
+ web rest api
+*/
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../util/net/miniwebserver.h"
+#include "../util/mongoutils/html.h"
+#include "../util/md5.hpp"
+#include "instance.h"
+#include "dbwebserver.h"
+#include "dbhelpers.h"
+#include "repl.h"
+#include "replutil.h"
+#include "clientcursor.h"
+#include "background.h"
+
+#include "restapi.h"
+
+namespace mongo {
+
+ extern const char *replInfo;
+ bool getInitialSyncCompleted();
+
+ using namespace bson;
+ using namespace mongoutils::html;
+
+ class RESTHandler : public DbWebHandler {
+ public:
+ RESTHandler() : DbWebHandler( "DUMMY REST" , 1000 , true ) {}
+
+ virtual bool handles( const string& url ) const {
+ return
+ url[0] == '/' &&
+ url.find_last_of( '/' ) > 0;
+ }
+
+ virtual void handle( const char *rq, string url, BSONObj params,
+ string& responseMsg, int& responseCode,
+ vector<string>& headers, const SockAddr &from ) {
+
+ string::size_type first = url.find( "/" , 1 );
+ if ( first == string::npos ) {
+ responseCode = 400;
+ return;
+ }
+
+ string method = MiniWebServer::parseMethod( rq );
+ string dbname = url.substr( 1 , first - 1 );
+ string coll = url.substr( first + 1 );
+ string action = "";
+
+ string::size_type last = coll.find_last_of( "/" );
+ if ( last == string::npos ) {
+ action = coll;
+ coll = "_defaultCollection";
+ }
+ else {
+ action = coll.substr( last + 1 );
+ coll = coll.substr( 0 , last );
+ }
+
+ for ( string::size_type i=0; i<coll.size(); i++ )
+ if ( coll[i] == '/' )
+ coll[i] = '.';
+
+ string fullns = MiniWebServer::urlDecode(dbname + "." + coll);
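+ // e.g. (illustrative URL) GET /test/foo/bar/_find -> dbname "test",
+ // coll "foo.bar", action "_find", fullns "test.foo.bar"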
+
+ headers.push_back( (string)"x-action: " + action );
+ headers.push_back( (string)"x-ns: " + fullns );
+
+ bool html = false;
+
+ stringstream ss;
+
+ if ( method == "GET" ) {
+ responseCode = 200;
+ html = handleRESTQuery( fullns , action , params , responseCode , ss );
+ }
+ else if ( method == "POST" ) {
+ responseCode = 201;
+ handlePost( fullns , MiniWebServer::body( rq ) , params , responseCode , ss );
+ }
+ else {
+ responseCode = 400;
+ headers.push_back( "X_err: bad request" );
+ ss << "don't know how to handle a [" << method << "]";
+ out() << "don't know how to handle a [" << method << "]" << endl;
+ }
+
+ if( html )
+ headers.push_back("Content-Type: text/html;charset=utf-8");
+ else
+ headers.push_back("Content-Type: text/plain;charset=utf-8");
+
+ responseMsg = ss.str();
+ }
+
+ bool handleRESTQuery( string ns , string action , BSONObj & params , int & responseCode , stringstream & out ) {
+ Timer t;
+
+ int html = _getOption( params["html"] , 0 );
+ int skip = _getOption( params["skip"] , 0 );
+ int num = _getOption( params["limit"] , _getOption( params["count" ] , 1000 ) ); // count is old, limit is new
+
+ int one = 0;
+ if ( params["one"].type() == String && tolower( params["one"].valuestr()[0] ) == 't' ) {
+ num = 1;
+ one = 1;
+ }
+
+ BSONObjBuilder queryBuilder;
+
+ BSONObjIterator i(params);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ string name = e.fieldName();
+ if ( name.find( "filter_" ) != 0 )
+ continue;
+
+ string field = name.substr(7);
+ const char * val = e.valuestr();
+
+ char * temp;
+
+ // TODO: this is how I guess whether something is a number. pretty lame right now
+ double number = strtod( val , &temp );
+ if ( temp != val )
+ queryBuilder.append( field , number );
+ else
+ queryBuilder.append( field , val );
+ }
+
+ BSONObj query = queryBuilder.obj();
+ auto_ptr<DBClientCursor> cursor = db.query( ns.c_str() , query, num , skip );
+ uassert( 13085 , "query failed for dbwebserver" , cursor.get() );
+
+ if ( one ) {
+ if ( cursor->more() ) {
+ BSONObj obj = cursor->next();
+ out << obj.jsonString(Strict,html?1:0) << '\n';
+ }
+ else {
+ responseCode = 404;
+ }
+ return html != 0;
+ }
+
+ if( html ) {
+ string title = string("query ") + ns;
+ out << start(title)
+ << p(title)
+ << "<pre>";
+ }
+ else {
+ out << "{\n";
+ out << " \"offset\" : " << skip << ",\n";
+ out << " \"rows\": [\n";
+ }
+
+ int howMany = 0;
+ while ( cursor->more() ) {
+ if ( howMany++ && html == 0 )
+ out << " ,\n";
+ BSONObj obj = cursor->next();
+ if( html ) {
+ if( out.tellp() > 4 * 1024 * 1024 ) {
+ out << "Stopping output: more than 4MB returned and in html mode\n";
+ break;
+ }
+ out << obj.jsonString(Strict, html?1:0) << "\n\n";
+ }
+ else {
+ if( out.tellp() > 50 * 1024 * 1024 ) // 50MB limit - we are using ram
+ break;
+ out << " " << obj.jsonString();
+ }
+ }
+
+ if( html ) {
+ out << "</pre>\n";
+ if( howMany == 0 ) out << p("Collection is empty");
+ out << _end();
+ }
+ else {
+ out << "\n ],\n\n";
+ out << " \"total_rows\" : " << howMany << " ,\n";
+ out << " \"query\" : " << query.jsonString() << " ,\n";
+ out << " \"millis\" : " << t.millis() << '\n';
+ out << "}\n";
+ }
+
+ return html != 0;
+ }
+
+ // TODO Generate id and revision per couch POST spec
+ void handlePost( string ns, const char *body, BSONObj& params, int & responseCode, stringstream & out ) {
+ try {
+ BSONObj obj = fromjson( body );
+ db.insert( ns.c_str(), obj );
+ }
+ catch ( ... ) {
+ responseCode = 400; // Bad Request. Seems reasonable for now.
+ out << "{ \"ok\" : false }";
+ return;
+ }
+
+ responseCode = 201;
+ out << "{ \"ok\" : true }";
+ }
+
+ int _getOption( BSONElement e , int def ) {
+ if ( e.isNumber() )
+ return e.numberInt();
+ if ( e.type() == String )
+ return atoi( e.valuestr() );
+ return def;
+ }
+
+ DBDirectClient db;
+
+ } restHandler;
+
+ bool RestAdminAccess::haveAdminUsers() const {
+ readlocktryassert rl("admin.system.users", 10000);
+ Client::Context cx( "admin.system.users", dbpath, false );
+ return ! Helpers::isEmpty("admin.system.users", false);
+ }
+
+ BSONObj RestAdminAccess::getAdminUser( const string& username ) const {
+ Client::GodScope gs;
+ readlocktryassert rl("admin.system.users", 10000);
+ Client::Context cx( "admin.system.users" );
+ BSONObj user;
+ if ( Helpers::findOne( "admin.system.users" , BSON( "user" << username ) , user ) )
+ return user.copy();
+ return BSONObj();
+ }
+
+ class LowLevelMongodStatus : public WebStatusPlugin {
+ public:
+ LowLevelMongodStatus() : WebStatusPlugin( "overview" , 5 , "(only reported if can acquire read lock quickly)" ) {}
+
+ virtual void init() {}
+
+ void _gotLock( int millis , stringstream& ss ) {
+ ss << "<pre>\n";
+ ss << "time to get readlock: " << millis << "ms\n";
+ ss << "# databases: " << dbHolder().sizeInfo() << '\n';
+ ss << "# Cursors: " << ClientCursor::numCursors() << '\n';
+ ss << "replication: ";
+ if( *replInfo )
+ ss << "\nreplInfo: " << replInfo << "\n\n";
+ if( replSet ) {
+ ss << a("", "see replSetGetStatus link top of page") << "--replSet </a>" << cmdLine._replSet;
+ }
+ if ( replAllDead )
+ ss << "\n<b>replication replAllDead=" << replAllDead << "</b>\n";
+ else {
+ ss << "\nmaster: " << replSettings.master << '\n';
+ ss << "slave: " << replSettings.slave << '\n';
+ ss << '\n';
+ }
+
+ BackgroundOperation::dump(ss);
+ ss << "</pre>\n";
+ }
+
+ virtual void run( stringstream& ss ) {
+ Timer t;
+ readlocktry lk( "" , 300 );
+ if ( lk.got() ) {
+ _gotLock( t.millis() , ss );
+ }
+ else {
+ ss << "\n<b>timed out getting lock</b>\n";
+ }
+ }
+ } lowLevelMongodStatus;
+}
diff --git a/src/mongo/db/restapi.h b/src/mongo/db/restapi.h
new file mode 100644
index 00000000000..e5ac52083fe
--- /dev/null
+++ b/src/mongo/db/restapi.h
@@ -0,0 +1,34 @@
+/** @file restapi.h
+ */
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../util/admin_access.h"
+
+namespace mongo {
+
+ class RestAdminAccess : public AdminAccess {
+ public:
+ virtual ~RestAdminAccess() { }
+
+ virtual bool haveAdminUsers() const;
+ virtual BSONObj getAdminUser( const string& username ) const;
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/scanandorder.cpp b/src/mongo/db/scanandorder.cpp
new file mode 100644
index 00000000000..b5e282a5866
--- /dev/null
+++ b/src/mongo/db/scanandorder.cpp
@@ -0,0 +1,105 @@
+/* scanandorder.cpp
+ Order results (that aren't already indexed and in order.)
+*/
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "scanandorder.h"
+
+namespace mongo {
+
+ const unsigned ScanAndOrder::MaxScanAndOrderBytes = 32 * 1024 * 1024;
+
+ void ScanAndOrder::_add(BSONObj& k, BSONObj o, DiskLoc* loc) {
+ if (!loc) {
+ _best.insert(make_pair(k.getOwned(),o.getOwned()));
+ }
+ else {
+ BSONObjBuilder b;
+ b.appendElements(o);
+ b.append("$diskLoc", loc->toBSONObj());
+ _best.insert(make_pair(k.getOwned(), b.obj().getOwned()));
+ }
+ }
+
+ void ScanAndOrder::_addIfBetter(BSONObj& k, BSONObj o, BestMap::iterator i, DiskLoc* loc) {
+ /* todo : we don't correct _approxSize here. */
+ const BSONObj& worstBestKey = i->first;
+ int c = worstBestKey.woCompare(k, _order._spec.keyPattern);
+ if ( c > 0 ) {
+ // k is better, 'upgrade'
+ _best.erase(i);
+ _add(k, o, loc);
+ }
+ }
+
+
+ void ScanAndOrder::add(BSONObj o, DiskLoc* loc) {
+ assert( o.isValid() );
+ BSONObj k;
+ try {
+ k = _order.getKeyFromObject(o);
+ }
+ catch (UserException &e) {
+ if ( e.getCode() == ParallelArraysCode ) { // cannot get keys for parallel arrays
+ // fix lasterror text to be more accurate.
+ uasserted( 15925, "cannot sort with keys that are parallel arrays" );
+ }
+ else
+ throw;
+ }
+
+ if ( k.isEmpty() ) {
+ return;
+ }
+ if ( (int) _best.size() < _limit ) {
+ _approxSize += k.objsize();
+ _approxSize += o.objsize();
+
+ /* note : adjust when bson return limit adjusts. note this limit should be a bit higher. */
+ uassert( 10128 , "too much data for sort() with no index. add an index or specify a smaller limit", _approxSize < MaxScanAndOrderBytes );
+
+ _add(k, o, loc);
+ return;
+ }
+ BestMap::iterator i;
+ assert( _best.end() != _best.begin() );
+ i = _best.end();
+ i--;
+ _addIfBetter(k, o, i, loc);
+ }
+
+
+ void ScanAndOrder::fill(BufBuilder& b, Projection *filter, int& nout ) const {
+ int n = 0;
+ int nFilled = 0;
+ for ( BestMap::const_iterator i = _best.begin(); i != _best.end(); i++ ) {
+ n++;
+ if ( n <= _startFrom )
+ continue;
+ const BSONObj& o = i->second;
+ fillQueryResultFromObj(b, filter, o);
+ nFilled++;
+ if ( nFilled >= _limit )
+ break;
+ uassert( 10129 , "too much data for sort() with no index", b.len() < (int)MaxScanAndOrderBytes ); // appserver limit
+ }
+ nout = nFilled;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/scanandorder.h b/src/mongo/db/scanandorder.h
new file mode 100644
index 00000000000..33e76f61f67
--- /dev/null
+++ b/src/mongo/db/scanandorder.h
@@ -0,0 +1,111 @@
+/* scanandorder.h
+ Order results (that aren't already indexed and in order.)
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "indexkey.h"
+#include "queryutil.h"
+#include "projection.h"
+
+namespace mongo {
+
+ /* todo:
+ _ limit amount of data
+ */
+
+ class KeyType : boost::noncopyable {
+ public:
+ IndexSpec _spec;
+ FieldRangeVector _keyCutter;
+ public:
+ KeyType(BSONObj pattern, const FieldRangeSet &frs):
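+ // (comma operator: assert the pattern is non-empty, then pass it to _spec)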
+ _spec((assert(!pattern.isEmpty()),pattern)),
+ _keyCutter(frs, _spec, 1) {
+ }
+
+ /**
+ * @return first key of the object that would be encountered while
+ * scanning index with keySpec 'pattern' using constraints 'frs', or
+ * BSONObj() if no such key.
+ */
+ BSONObj getKeyFromObject(BSONObj o) {
+ return _keyCutter.firstMatch(o);
+ }
+ };
+
+ /* todo:
+ _ respect limit
+ _ check for excess mem usage
+ _ response size limit from runquery; push it up a bit.
+ */
+
+ inline void fillQueryResultFromObj(BufBuilder& bb, Projection *filter, const BSONObj& js, DiskLoc* loc=NULL) {
+ if ( filter ) {
+ BSONObjBuilder b( bb );
+ filter->transform( js , b );
+ if (loc)
+ b.append("$diskLoc", loc->toBSONObj());
+ b.done();
+ }
+ else if (loc) {
+ BSONObjBuilder b( bb );
+ b.appendElements(js);
+ b.append("$diskLoc", loc->toBSONObj());
+ b.done();
+ }
+ else {
+ bb.appendBuf((void*) js.objdata(), js.objsize());
+ }
+ }
+
+ typedef multimap<BSONObj,BSONObj,BSONObjCmp> BestMap;
+ class ScanAndOrder {
+ public:
+ static const unsigned MaxScanAndOrderBytes;
+
+ ScanAndOrder(int startFrom, int limit, BSONObj order, const FieldRangeSet &frs) :
+ _best( BSONObjCmp( order ) ),
+ _startFrom(startFrom), _order(order, frs) {
+ _limit = limit > 0 ? limit + _startFrom : 0x7fffffff;
+ _approxSize = 0;
+ }
+
+ int size() const { return _best.size(); }
+
+ void add(BSONObj o, DiskLoc* loc);
+
+ /* scanning complete. stick the query result in b for n objects. */
+ void fill(BufBuilder& b, Projection *filter, int& nout ) const;
+
+ private:
+
+ void _add(BSONObj& k, BSONObj o, DiskLoc* loc);
+
+ void _addIfBetter(BSONObj& k, BSONObj o, BestMap::iterator i, DiskLoc* loc);
+
+ BestMap _best; // key -> full object
+ int _startFrom;
+ int _limit; // max to send back.
+ KeyType _order;
+ unsigned _approxSize;
+
+ };
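+
+ // Usage sketch (runQuery drives this in practice; names here are illustrative):
+ // ScanAndOrder so( skip, limit, orderBy, frs );
+ // while ( <scanning documents> ) so.add( obj, NULL );
+ // int n = 0; so.fill( buf, filter, n ); // top 'limit' docs, in 'orderBy' order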
+
+} // namespace mongo
diff --git a/src/mongo/db/security.cpp b/src/mongo/db/security.cpp
new file mode 100644
index 00000000000..c9b9bb40326
--- /dev/null
+++ b/src/mongo/db/security.cpp
@@ -0,0 +1,106 @@
+// security.cpp
+
+/**
+ * Copyright (C) 2009 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "security.h"
+#include "security_common.h"
+#include "instance.h"
+#include "client.h"
+#include "curop-inl.h"
+#include "db.h"
+#include "dbhelpers.h"
+
+// this is the _mongod only_ implementation of security.h
+
+namespace mongo {
+
+ bool AuthenticationInfo::_warned = false;
+ /*
+ void AuthenticationInfo::print() const {
+ cout << "AuthenticationInfo: " << this << '\n';
+ for ( MA::const_iterator i=_dbs.begin(); i!=_dbs.end(); i++ ) {
+ cout << "\t" << i->first << "\t" << i->second.level << '\n';
+ }
+ cout << "END" << endl;
+ }
+ */
+
+ string AuthenticationInfo::getUser( const string& dbname ) const {
+ scoped_spinlock lk(_lock);
+
+ MA::const_iterator i = _dbs.find(dbname);
+ if ( i == _dbs.end() )
+ return "";
+
+ return i->second.user;
+ }
+
+
+ bool AuthenticationInfo::_isAuthorizedSpecialChecks( const string& dbname ) const {
+ if ( cc().isGod() )
+ return true;
+
+ if ( isLocalHost ) {
+ Client::GodScope gs;
+ Client::ReadContext ctx("admin.system.users");
+ BSONObj result;
+ if( ! Helpers::getSingleton("admin.system.users", result) ) {
+ if( ! _warned ) {
+ // you could get a few of these in a race, but that's ok
+ _warned = true;
+ log() << "note: no users configured in admin.system.users, allowing localhost access" << endl;
+ }
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ bool CmdAuthenticate::getUserObj(const string& dbname, const string& user, BSONObj& userObj, string& pwd) {
+ if (user == internalSecurity.user) {
+ uassert(15889, "key file must be used to log in with internal user", cmdLine.keyFile);
+ pwd = internalSecurity.pwd;
+ }
+ else {
+ // static BSONObj userPattern = fromjson("{\"user\":1}");
+ string systemUsers = dbname + ".system.users";
+ // OCCASIONALLY Helpers::ensureIndex(systemUsers.c_str(), userPattern, false, "user_1");
+ {
+ BSONObjBuilder b;
+ b << "user" << user;
+ BSONObj query = b.done();
+ if( !Helpers::findOne(systemUsers.c_str(), query, userObj) ) {
+ log() << "auth: couldn't find user " << user << ", " << systemUsers << endl;
+ return false;
+ }
+ }
+
+ pwd = userObj.getStringField("pwd");
+ }
+ return true;
+ }
+
+ bool CmdLogout::run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ AuthenticationInfo *ai = cc().getAuthenticationInfo();
+ ai->logout(dbname);
+ return true;
+ }
+
+} // namespace mongo
+
diff --git a/src/mongo/db/security.h b/src/mongo/db/security.h
new file mode 100755
index 00000000000..f193f305def
--- /dev/null
+++ b/src/mongo/db/security.h
@@ -0,0 +1,113 @@
+// security.h
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "nonce.h"
+#include "concurrency.h"
+#include "security_common.h"
+#include "../util/concurrency/spin_lock.h"
+
+// this is used by both mongos and mongod
+
+namespace mongo {
+
+ /*
+ * for a particular db
+ * levels
+ * 0 : none
+ * 1 : read
+ * 2 : write
+ */
+ struct Auth {
+
+ enum Level { NONE = 0 , READ = 1 , WRITE = 2 };
+
+ Auth() { level = NONE; }
+ Level level;
+ string user;
+ };
+
+ class AuthenticationInfo : boost::noncopyable {
+ public:
+ bool isLocalHost;
+
+ AuthenticationInfo(){ isLocalHost = false; }
+ ~AuthenticationInfo() {}
+
+ // -- modifiers ----
+
+ void logout(const string& dbname ) {
+ scoped_spinlock lk(_lock);
+ _dbs.erase(dbname);
+ }
+ void authorize(const string& dbname , const string& user ) {
+ scoped_spinlock lk(_lock);
+ _dbs[dbname].level = Auth::WRITE;
+ _dbs[dbname].user = user;
+ }
+ void authorizeReadOnly(const string& dbname , const string& user ) {
+ scoped_spinlock lk(_lock);
+ _dbs[dbname].level = Auth::READ;
+ _dbs[dbname].user = user;
+ }
+
+ // -- accessors ---
+
+ bool isAuthorized(const string& dbname) const {
+ return _isAuthorized( dbname, Auth::WRITE );
+ }
+
+ bool isAuthorizedReads(const string& dbname) const {
+ return _isAuthorized( dbname, Auth::READ );
+ }
+
+ /**
+ * @param lockType - this is from dbmutex 1 is write, 0 is read
+ */
+ bool isAuthorizedForLock(const string& dbname, int lockType ) const {
+ return _isAuthorized( dbname , lockType > 0 ? Auth::WRITE : Auth::READ );
+ }
+
+ bool isAuthorizedForLevel( const string& dbname , Auth::Level level ) const {
+ return _isAuthorized( dbname , level );
+ }
+
+ string getUser( const string& dbname ) const;
+
+ void print() const;
+
+ protected:
+ /** takes a lock */
+ bool _isAuthorized(const string& dbname, Auth::Level level) const;
+
+ bool _isAuthorizedSingle_inlock(const string& dbname, Auth::Level level) const;
+
+ /** cannot call this locked */
+ bool _isAuthorizedSpecialChecks( const string& dbname ) const ;
+
+ private:
+ mutable SpinLock _lock;
+
+ typedef map<string,Auth> MA;
+ MA _dbs; // dbname -> auth
+
+ static bool _warned;
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/security_commands.cpp b/src/mongo/db/security_commands.cpp
new file mode 100644
index 00000000000..33dbd597c83
--- /dev/null
+++ b/src/mongo/db/security_commands.cpp
@@ -0,0 +1,150 @@
+// security_commands.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// security.cpp links with both dbgrid and db; this file is db only -- at least for now.
+
+#include "pch.h"
+#include "security.h"
+#include "../util/md5.hpp"
+#include "json.h"
+#include "pdfile.h"
+#include "db.h"
+#include "dbhelpers.h"
+#include "commands.h"
+#include "jsobj.h"
+#include "client.h"
+
+namespace mongo {
+
+ /* authentication
+
+ system.users contains
+ { user : <username>, pwd : <pwd_digest>, ... }
+
+ getnonce sends nonce to client
+
+ client then sends { authenticate:1, nonce:<nonce_str>, user:<username>, key:<key> }
+
+ where <key> is md5(<nonce_str><username><pwd_digest_str>) as a string
+ */
+
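+ // Illustrative sketch (nothing below calls it): how a driver derives <key>
+ // from the nonce, assuming digestToString() from ../util/md5.hpp and that
+ // pwdDigest is the stored <pwd_digest_str>. Mirrors CmdAuthenticate::run below.
+ static string computeAuthKey( const string& nonceStr , const string& user , const string& pwdDigest ) {
+ md5digest d;
+ md5_state_t st;
+ md5_init(&st);
+ const string s = nonceStr + user + pwdDigest;
+ md5_append(&st, (const md5_byte_t *) s.c_str(), s.size());
+ md5_finish(&st, d);
+ return digestToString(d); // hex digest, compared against "key"
+ }
+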
+ boost::thread_specific_ptr<nonce64> lastNonce;
+
+ class CmdGetNonce : public Command {
+ public:
+ virtual bool requiresAuth() { return false; }
+ virtual bool logTheOp() { return false; }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ void help(stringstream& h) const { h << "internal"; }
+ virtual LockType locktype() const { return NONE; }
+ CmdGetNonce() : Command("getnonce") {}
+ bool run(const string&, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ nonce64 *n = new nonce64(Security::getNonce());
+ stringstream ss;
+ ss << hex << *n;
+ result.append("nonce", ss.str() );
+ lastNonce.reset(n);
+ return true;
+ }
+ } cmdGetNonce;
+
+ CmdLogout cmdLogout;
+
+ bool CmdAuthenticate::run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ log() << " authenticate: " << cmdObj << endl;
+
+ string user = cmdObj.getStringField("user");
+ string key = cmdObj.getStringField("key");
+ string received_nonce = cmdObj.getStringField("nonce");
+
+ if( user.empty() || key.empty() || received_nonce.empty() ) {
+ log() << "field missing/wrong type in received authenticate command "
+ << dbname
+ << endl;
+ errmsg = "auth fails";
+ sleepmillis(10);
+ return false;
+ }
+
+ stringstream digestBuilder;
+
+ {
+ bool reject = false;
+ nonce64 *ln = lastNonce.release();
+ if ( ln == 0 ) {
+ reject = true;
+ log(1) << "auth: no lastNonce" << endl;
+ }
+ else {
+ digestBuilder << hex << *ln;
+ reject = digestBuilder.str() != received_nonce;
+ if ( reject ) log(1) << "auth: different lastNonce" << endl;
+ }
+
+ if ( reject ) {
+ log() << "auth: bad nonce received or getnonce not called. could be a driver bug or a security attack. db:" << dbname << endl;
+ errmsg = "auth fails";
+ sleepmillis(30);
+ return false;
+ }
+ }
+
+ BSONObj userObj;
+ string pwd;
+ if (!getUserObj(dbname, user, userObj, pwd)) {
+ errmsg = "auth fails";
+ return false;
+ }
+
+ md5digest d;
+ {
+ digestBuilder << user << pwd;
+ string done = digestBuilder.str();
+
+ md5_state_t st;
+ md5_init(&st);
+ md5_append(&st, (const md5_byte_t *) done.c_str(), done.size());
+ md5_finish(&st, d);
+ }
+
+ string computed = digestToString( d );
+
+ if ( key != computed ) {
+ log() << "auth: key mismatch " << user << ", ns:" << dbname << endl;
+ errmsg = "auth fails";
+ return false;
+ }
+
+ bool readOnly = userObj["readOnly"].trueValue();
+ authenticate(dbname, user, readOnly );
+
+ result.append( "dbname" , dbname );
+ result.append( "user" , user );
+ result.appendBool( "readOnly" , readOnly );
+
+ return true;
+ }
+
+ CmdAuthenticate cmdAuthenticate;
+
+} // namespace mongo
diff --git a/src/mongo/db/security_common.cpp b/src/mongo/db/security_common.cpp
new file mode 100644
index 00000000000..a480919c27e
--- /dev/null
+++ b/src/mongo/db/security_common.cpp
@@ -0,0 +1,148 @@
+// security_common.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * This file contains inter-mongo instance security helpers. Due to the
+ * requirement that it be possible to compile this into mongos and mongod, it
+ * should not depend on much external stuff.
+ */
+
+#include "pch.h"
+#include "security.h"
+#include "security_common.h"
+#include "../client/dbclient.h"
+#include "commands.h"
+#include "nonce.h"
+#include "../util/md5.hpp"
+#include "client_common.h"
+#include <sys/stat.h>
+
+namespace mongo {
+
+ bool noauth = true;
+ AuthInfo internalSecurity;
+
+ bool setUpSecurityKey(const string& filename) {
+ struct stat stats;
+
+ // check obvious file errors
+ if (stat(filename.c_str(), &stats) == -1) {
+ log() << "error getting file " << filename << ": " << strerror(errno) << endl;
+ return false;
+ }
+
+#if !defined(_WIN32)
+ // check permissions: must be X00, where X is >= 4
+ if ((stats.st_mode & (S_IRWXG|S_IRWXO)) != 0) {
+ log() << "permissions on " << filename << " are too open" << endl;
+ return false;
+ }
+#endif
+
+ const unsigned long long fileLength = stats.st_size;
+ if (fileLength < 6 || fileLength > 1024) {
+ log() << " key file " << filename << " has length " << stats.st_size
+ << ", must be between 6 and 1024 chars" << endl;
+ return false;
+ }
+
+ FILE* file = fopen( filename.c_str(), "rb" );
+ if (!file) {
+ log() << "error opening file: " << filename << ": " << strerror(errno) << endl;
+ return false;
+ }
+
+ string str = "";
+
+ // strip key file
+ unsigned long long read = 0;
+ while (read < fileLength) {
+ char buf;
+ int readLength = fread(&buf, 1, 1, file);
+ if (readLength < 1) {
+ log() << "error reading file " << filename << endl;
+ return false;
+ }
+ read++;
+
+ // check for whitespace
+ if ((buf >= '\x09' && buf <= '\x0D') || buf == ' ') {
+ continue;
+ }
+
+ // check valid base64
+ if ((buf < 'A' || buf > 'Z') && (buf < 'a' || buf > 'z') && (buf < '0' || buf > '9') && buf != '+' && buf != '/') {
+ log() << "invalid char in key file " << filename << ": " << buf << endl;
+ return false;
+ }
+
+ str += buf;
+ }
+
+ if (str.size() < 6) {
+ log() << "security key must be at least 6 characters" << endl;
+ return false;
+ }
+
+ log(1) << "security key: " << str << endl;
+
+ // createPWDigest should really not be a member func
+ DBClientConnection conn;
+ internalSecurity.pwd = conn.createPasswordDigest(internalSecurity.user, str);
+
+ return true;
+ }
+
+ void CmdAuthenticate::authenticate(const string& dbname, const string& user, const bool readOnly) {
+ ClientBasic* c = ClientBasic::getCurrent();
+ assert(c);
+ AuthenticationInfo *ai = c->getAuthenticationInfo();
+
+ if ( readOnly ) {
+ ai->authorizeReadOnly( dbname , user );
+ }
+ else {
+ ai->authorize( dbname , user );
+ }
+ }
+
+
+ bool AuthenticationInfo::_isAuthorized(const string& dbname, Auth::Level level) const {
+ {
+ scoped_spinlock lk(_lock);
+
+ if ( _isAuthorizedSingle_inlock( dbname , level ) )
+ return true;
+
+ if ( noauth )
+ return true;
+
+ if ( _isAuthorizedSingle_inlock( "admin" , level ) )
+ return true;
+
+ if ( _isAuthorizedSingle_inlock( "local" , level ) )
+ return true;
+ }
+ return _isAuthorizedSpecialChecks( dbname );
+ }
+
+ bool AuthenticationInfo::_isAuthorizedSingle_inlock(const string& dbname, Auth::Level level) const {
+ MA::const_iterator i = _dbs.find(dbname);
+ return i != _dbs.end() && i->second.level >= level;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/security_common.h b/src/mongo/db/security_common.h
new file mode 100644
index 00000000000..6615c6e573e
--- /dev/null
+++ b/src/mongo/db/security_common.h
@@ -0,0 +1,85 @@
+// security_common.h
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "commands.h"
+#include "concurrency.h"
+#include "../util/concurrency/spin_lock.h"
+
+namespace mongo {
+
+ /**
+ * Internal secret key info.
+ */
+ struct AuthInfo {
+ AuthInfo() {
+ user = "__system";
+ }
+ string user;
+ string pwd;
+ };
+
+ // --noauth cmd line option
+ extern bool noauth;
+ extern AuthInfo internalSecurity;
+
+ /**
+ * This method checks the validity of filename as a security key, hashes its
+ * contents, and stores it in the internalSecurity variable. Prints an
+ * error message to the logs if there's an error.
+ * @param filename the file containing the key
+ * @return if the key was successfully stored
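+ * A usable key can be generated with e.g. "openssl rand -base64 90 > keyfile";
+ * on non-Windows systems the file must also not be group- or world-accessible
+ * (e.g. chmod 600 keyfile).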
+ */
+ bool setUpSecurityKey(const string& filename);
+
+ class CmdAuthenticate : public Command {
+ public:
+ virtual bool requiresAuth() { return false; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual LockType locktype() const { return READ; }
+ virtual void help(stringstream& ss) const { ss << "internal"; }
+ CmdAuthenticate() : Command("authenticate") {}
+ bool run(const string& dbname , BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl);
+ void authenticate(const string& dbname, const string& user, const bool readOnly);
+ private:
+ bool getUserObj(const string& dbname, const string& user, BSONObj& userObj, string& pwd);
+ };
+
+ extern CmdAuthenticate cmdAuthenticate;
+
+ class CmdLogout : public Command {
+ public:
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ void help(stringstream& h) const { h << "de-authenticate"; }
+ virtual LockType locktype() const { return NONE; }
+ CmdLogout() : Command("logout") {}
+ bool run(const string& dbname , BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl);
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/stats/counters.cpp b/src/mongo/db/stats/counters.cpp
new file mode 100644
index 00000000000..889e8a86c4c
--- /dev/null
+++ b/src/mongo/db/stats/counters.cpp
@@ -0,0 +1,207 @@
+// counters.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "pch.h"
+#include "../jsobj.h"
+#include "counters.h"
+
+namespace mongo {
+
+ OpCounters::OpCounters() {
+ int zero = 0;
+
+ BSONObjBuilder b;
+ b.append( "insert" , zero );
+ b.append( "query" , zero );
+ b.append( "update" , zero );
+ b.append( "delete" , zero );
+ b.append( "getmore" , zero );
+ b.append( "command" , zero );
+ _obj = b.obj();
+
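+ // the counters live inside _obj's buffer: value() points at each int field's
+ // 4 bytes, which are then mutated in place as AtomicUInts so getObj() always
+ // reflects the live counts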
+ _insert = (AtomicUInt*)_obj["insert"].value();
+ _query = (AtomicUInt*)_obj["query"].value();
+ _update = (AtomicUInt*)_obj["update"].value();
+ _delete = (AtomicUInt*)_obj["delete"].value();
+ _getmore = (AtomicUInt*)_obj["getmore"].value();
+ _command = (AtomicUInt*)_obj["command"].value();
+ }
+
+ void OpCounters::gotOp( int op , bool isCommand ) {
+ switch ( op ) {
+ case dbInsert: /*gotInsert();*/ break; // need to handle multi-insert
+ case dbQuery:
+ if ( isCommand )
+ gotCommand();
+ else
+ gotQuery();
+ break;
+
+ case dbUpdate: gotUpdate(); break;
+ case dbDelete: gotDelete(); break;
+ case dbGetMore: gotGetMore(); break;
+ case dbKillCursors:
+ case opReply:
+ case dbMsg:
+ break;
+ default: log() << "OpCounters::gotOp unknown op: " << op << endl;
+ }
+ }
+
+ BSONObj& OpCounters::getObj() {
+ const unsigned MAX = 1 << 30;
+ RARELY {
+ bool wrap =
+ _insert->get() > MAX ||
+ _query->get() > MAX ||
+ _update->get() > MAX ||
+ _delete->get() > MAX ||
+ _getmore->get() > MAX ||
+ _command->get() > MAX;
+
+ if ( wrap ) {
+ _insert->zero();
+ _query->zero();
+ _update->zero();
+ _delete->zero();
+ _getmore->zero();
+ _command->zero();
+ }
+
+ }
+ return _obj;
+ }
+
+ IndexCounters::IndexCounters() {
+ _memSupported = _pi.blockCheckSupported();
+
+ _btreeMemHits = 0;
+ _btreeMemMisses = 0;
+ _btreeAccesses = 0;
+
+
+ _maxAllowed = ( numeric_limits< long long >::max() ) / 2;
+ _resets = 0;
+
+ _sampling = 0;
+ _samplingrate = 100;
+ }
+
+ void IndexCounters::append( BSONObjBuilder& b ) {
+ if ( ! _memSupported ) {
+ b.append( "note" , "not supported on this platform" );
+ return;
+ }
+
+ BSONObjBuilder bb( b.subobjStart( "btree" ) );
+ bb.appendNumber( "accesses" , _btreeAccesses );
+ bb.appendNumber( "hits" , _btreeMemHits );
+ bb.appendNumber( "misses" , _btreeMemMisses );
+
+ bb.append( "resets" , _resets );
+
+ bb.append( "missRatio" , (_btreeAccesses ? (_btreeMemMisses / (double)_btreeAccesses) : 0) );
+
+ bb.done();
+
+ if ( _btreeAccesses > _maxAllowed ) {
+ _btreeAccesses = 0;
+ _btreeMemMisses = 0;
+ _btreeMemHits = 0;
+ _resets++;
+ }
+ }
+
+ FlushCounters::FlushCounters()
+ : _total_time(0)
+ , _flushes(0)
+ , _last()
+ {}
+
+ void FlushCounters::flushed(int ms) {
+ _flushes++;
+ _total_time += ms;
+ _last_time = ms;
+ _last = jsTime();
+ }
+
+ void FlushCounters::append( BSONObjBuilder& b ) {
+ b.appendNumber( "flushes" , _flushes );
+ b.appendNumber( "total_ms" , _total_time );
+ b.appendNumber( "average_ms" , (_flushes ? (_total_time / double(_flushes)) : 0.0) );
+ b.appendNumber( "last_ms" , _last_time );
+ b.append("last_finished", _last);
+ }
+
+
+ void GenericCounter::hit( const string& name , int count ) {
+ scoped_lock lk( _mutex );
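+ // note: 'count' is currently unused; each hit increments by one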
+ _counts[name]++;
+ }
+
+ BSONObj GenericCounter::getObj() {
+ BSONObjBuilder b(128);
+ {
+ mongo::mutex::scoped_lock lk( _mutex );
+ for ( map<string,long long>::iterator i=_counts.begin(); i!=_counts.end(); i++ ) {
+ b.appendNumber( i->first , i->second );
+ }
+ }
+ return b.obj();
+ }
+
+
+ void NetworkCounter::hit( long long bytesIn , long long bytesOut ) {
+ const long long MAX = 1ULL << 60;
+
+ // don't care about the race as it's just a counter
+ bool overflow = _bytesIn > MAX || _bytesOut > MAX;
+
+ if ( overflow ) {
+ _lock.lock();
+ _overflows++;
+ _bytesIn = bytesIn;
+ _bytesOut = bytesOut;
+ _requests = 1;
+ _lock.unlock();
+ }
+ else {
+ _lock.lock();
+ _bytesIn += bytesIn;
+ _bytesOut += bytesOut;
+ _requests++;
+ _lock.unlock();
+ }
+ }
+
+ void NetworkCounter::append( BSONObjBuilder& b ) {
+ _lock.lock();
+ b.appendNumber( "bytesIn" , _bytesIn );
+ b.appendNumber( "bytesOut" , _bytesOut );
+ b.appendNumber( "numRequests" , _requests );
+ _lock.unlock();
+ }
+
+
+ OpCounters globalOpCounters;
+ OpCounters replOpCounters;
+ IndexCounters globalIndexCounters;
+ FlushCounters globalFlushCounters;
+ NetworkCounter networkCounter;
+
+}
diff --git a/src/mongo/db/stats/counters.h b/src/mongo/db/stats/counters.h
new file mode 100644
index 00000000000..0cb29aa49aa
--- /dev/null
+++ b/src/mongo/db/stats/counters.h
@@ -0,0 +1,159 @@
+// counters.h
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../../pch.h"
+#include "../jsobj.h"
+#include "../../util/net/message.h"
+#include "../../util/processinfo.h"
+#include "../../util/concurrency/spin_lock.h"
+
+namespace mongo {
+
+ /**
+ * for storing operation counters
+ * note: not thread safe. ok with that for speed
+ */
+ class OpCounters {
+ public:
+
+ OpCounters();
+
+ AtomicUInt * getInsert() { return _insert; }
+ AtomicUInt * getQuery() { return _query; }
+ AtomicUInt * getUpdate() { return _update; }
+ AtomicUInt * getDelete() { return _delete; }
+ AtomicUInt * getGetMore() { return _getmore; }
+ AtomicUInt * getCommand() { return _command; }
+
+ void incInsertInWriteLock(int n) { _insert->x += n; }
+ void gotInsert() { _insert[0]++; }
+ void gotQuery() { _query[0]++; }
+ void gotUpdate() { _update[0]++; }
+ void gotDelete() { _delete[0]++; }
+ void gotGetMore() { _getmore[0]++; }
+ void gotCommand() { _command[0]++; }
+
+ void gotOp( int op , bool isCommand );
+
+ BSONObj& getObj();
+
+ private:
+ BSONObj _obj;
+
+ // todo: there will be a lot of cache line contention on these. need to do something
+ // else eventually.
+ AtomicUInt * _insert;
+ AtomicUInt * _query;
+ AtomicUInt * _update;
+ AtomicUInt * _delete;
+ AtomicUInt * _getmore;
+ AtomicUInt * _command;
+ };
+
+ extern OpCounters globalOpCounters;
+ extern OpCounters replOpCounters;
+
+
+ class IndexCounters {
+ public:
+ IndexCounters();
+
+ // used without a mutex intentionally (can race)
+ void btree( char * node ) {
+ if ( ! _memSupported )
+ return;
+ if ( _sampling++ % _samplingrate )
+ return;
+ btree( _pi.blockInMemory( node ) );
+ }
+
+ void btree( bool memHit ) {
+ if ( memHit )
+ _btreeMemHits++;
+ else
+ _btreeMemMisses++;
+ _btreeAccesses++;
+ }
+ void btreeHit() { _btreeMemHits++; _btreeAccesses++; }
+ void btreeMiss() { _btreeMemMisses++; _btreeAccesses++; }
+
+ void append( BSONObjBuilder& b );
+
+ private:
+ ProcessInfo _pi;
+ bool _memSupported;
+
+ int _sampling;
+ int _samplingrate;
+
+ int _resets;
+ long long _maxAllowed;
+
+ long long _btreeMemMisses;
+ long long _btreeMemHits;
+ long long _btreeAccesses;
+ };
+
+ extern IndexCounters globalIndexCounters;
+
+ class FlushCounters {
+ public:
+ FlushCounters();
+
+ void flushed(int ms);
+
+ void append( BSONObjBuilder& b );
+
+ private:
+ long long _total_time;
+ long long _flushes;
+ int _last_time;
+ Date_t _last;
+ };
+
+ extern FlushCounters globalFlushCounters;
+
+
+ class GenericCounter {
+ public:
+ GenericCounter() : _mutex("GenericCounter") { }
+ void hit( const string& name , int count=0 );
+ BSONObj getObj();
+ private:
+ map<string,long long> _counts; // TODO: replace with thread safe map
+ mongo::mutex _mutex;
+ };
+
+ class NetworkCounter {
+ public:
+ NetworkCounter() : _bytesIn(0), _bytesOut(0), _requests(0), _overflows(0) {}
+ void hit( long long bytesIn , long long bytesOut );
+ void append( BSONObjBuilder& b );
+ private:
+ long long _bytesIn;
+ long long _bytesOut;
+ long long _requests;
+
+ long long _overflows;
+
+ SpinLock _lock;
+ };
+
+ extern NetworkCounter networkCounter;
+}
diff --git a/src/mongo/db/stats/fine_clock.h b/src/mongo/db/stats/fine_clock.h
new file mode 100644
index 00000000000..02600e718c4
--- /dev/null
+++ b/src/mongo/db/stats/fine_clock.h
@@ -0,0 +1,67 @@
+// fine_clock.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef DB_STATS_FINE_CLOCK_HEADER
+#define DB_STATS_FINE_CLOCK_HEADER
+
+#include <time.h> // struct timespec
+
+namespace mongo {
+
+ /**
+ * This is a nano-second precision clock. We're skipping the
+ * harware TSC in favor of clock_gettime() which in some systems
+ * does not involve a trip to the OS (VDSO).
+ *
+ * We're exporting a type WallTime that is and should remain
+ * opaque. The business of getting accurate time is still ongoing
+ * and we may change the internal representation of this class.
+ * (http://lwn.net/Articles/388188/)
+ *
+ * Really, you shouldn't be using this class in hot code paths on
+ * platforms where you're not sure the overhead is low.
+ */
+ class FineClock {
+ public:
+
+ typedef timespec WallTime;
+
+ static WallTime now() {
+ struct timespec ts;
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return ts;
+ }
+
+ static uint64_t diffInNanos( WallTime end, WallTime start ) {
+ uint64_t diff;
+ if ( end.tv_nsec < start.tv_nsec ) {
+ diff = 1000000000 * ( end.tv_sec - start.tv_sec - 1);
+ diff += 1000000000 + end.tv_nsec - start.tv_nsec;
+ }
+ else {
+ diff = 1000000000 * ( end.tv_sec - start.tv_sec );
+ diff += end.tv_nsec - start.tv_nsec;
+ }
+ return diff;
+ }
+
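+ // Usage sketch:
+ // FineClock::WallTime t0 = FineClock::now();
+ // ... work ...
+ // uint64_t nanos = FineClock::diffInNanos( FineClock::now(), t0 );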
+ };
+}
+
+#endif // DB_STATS_FINE_CLOCK_HEADER
+
diff --git a/src/mongo/db/stats/service_stats.cpp b/src/mongo/db/stats/service_stats.cpp
new file mode 100644
index 00000000000..d69147fe969
--- /dev/null
+++ b/src/mongo/db/stats/service_stats.cpp
@@ -0,0 +1,68 @@
+// service_stats.cpp
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <sstream>
+
+#include "../../util/histogram.h"
+#include "service_stats.h"
+
+namespace mongo {
+
+ using std::ostringstream;
+
+ ServiceStats::ServiceStats() {
+ // Time histogram covers up to 128msec in exponential intervals
+ // starting at 125usec.
+ Histogram::Options timeOpts;
+ timeOpts.numBuckets = 12;
+ timeOpts.bucketSize = 125;
+ timeOpts.exponential = true;
+ _timeHistogram = new Histogram( timeOpts );
+
+ // Space histogram covers up to 1MB in exponential intervals starting
+ // at 1K.
+ Histogram::Options spaceOpts;
+ spaceOpts.numBuckets = 12;
+ spaceOpts.bucketSize = 1024;
+ spaceOpts.exponential = true;
+ _spaceHistogram = new Histogram( spaceOpts );
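+
+ // Sanity check on the ranges claimed above, assuming 11 exponential
+ // boundaries for the 12 buckets with the last bucket open-ended:
+ // 125us * 2^10 = 128ms, and 1024B * 2^10 = 1MB.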
+ }
+
+ ServiceStats::~ServiceStats() {
+ delete _timeHistogram;
+ delete _spaceHistogram;
+ }
+
+ void ServiceStats::logResponse( uint64_t duration, uint64_t bytes ) {
+ _spinLock.lock();
+ _timeHistogram->insert( duration / 1000 /* in usecs */ );
+ _spaceHistogram->insert( bytes );
+ _spinLock.unlock();
+ }
+
+ string ServiceStats::toHTML() const {
+ ostringstream res;
+ res << "Cumulative wire stats\n"
+ << "Response times\n" << _timeHistogram->toHTML()
+ << "Response sizes\n" << _spaceHistogram->toHTML()
+ << '\n';
+
+ return res.str();
+ }
+
+} // mongo
diff --git a/src/mongo/db/stats/service_stats.h b/src/mongo/db/stats/service_stats.h
new file mode 100644
index 00000000000..5b0e75fdcb9
--- /dev/null
+++ b/src/mongo/db/stats/service_stats.h
@@ -0,0 +1,66 @@
+// service_stats.h
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef DB_STATS_SERVICE_STATS_HEADER
+#define DB_STATS_SERVICE_STATS_HEADER
+
+#include <string>
+
+#include "../../util/concurrency/spin_lock.h"
+
+namespace mongo {
+
+ using std::string;
+
+ class Histogram;
+
+ /**
+ * ServiceStats keeps track of the time a request/response message
+ * took inside a service as well as the size of the response
+ * generated.
+ */
+ class ServiceStats {
+ public:
+ ServiceStats();
+ ~ServiceStats();
+
+ /**
+ * Record the 'duration' in nanoseconds a request/response
+ * message took and the size in bytes of the generated
+ * response. (The implementation stores times in usecs.)
+ */
+ void logResponse( uint64_t duration, uint64_t bytes );
+
+ /**
+ * Render the histogram as string that can be used inside an
+ * HTML doc.
+ */
+ string toHTML() const;
+
+ private:
+ SpinLock _spinLock; // protects state below
+ Histogram* _timeHistogram;
+ Histogram* _spaceHistogram;
+
+ ServiceStats( const ServiceStats& );
+ ServiceStats& operator=( const ServiceStats& );
+ };
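+
+ /* a hedged usage sketch, pairing logResponse() with FineClock
+    (stats and responseBytes are illustrative names):
+        FineClock::WallTime t0 = FineClock::now();
+        // ... service the request ...
+        stats.logResponse( FineClock::diffInNanos( FineClock::now(), t0 ),
+                           responseBytes );
+ */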
+
+} // namespace mongo
+
+#endif // DB_STATS_SERVICE_STATS_HEADER
diff --git a/src/mongo/db/stats/snapshots.cpp b/src/mongo/db/stats/snapshots.cpp
new file mode 100644
index 00000000000..900cc4ff1ad
--- /dev/null
+++ b/src/mongo/db/stats/snapshots.cpp
@@ -0,0 +1,227 @@
+// snapshots.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "snapshots.h"
+#include "../client.h"
+#include "../clientcursor.h"
+#include "../dbwebserver.h"
+#include "../../util/mongoutils/html.h"
+
+/**
+ handles snapshotting performance metrics and other such things
+ */
+namespace mongo {
+
+ void SnapshotData::takeSnapshot() {
+ _created = curTimeMicros64();
+ _globalUsage = Top::global.getGlobalData();
+ _totalWriteLockedTime = 0; // lock-time tracking is disabled for now:
+// _totalWriteLockedTime = d.dbMutex.info().getTimeLocked();
+ Top::global.cloneMap(_usage);
+ }
+
+ SnapshotDelta::SnapshotDelta( const SnapshotData& older , const SnapshotData& newer )
+ : _older( older ) , _newer( newer ) {
+ assert( _newer._created > _older._created );
+ _elapsed = _newer._created - _older._created;
+ }
+
+ Top::CollectionData SnapshotDelta::globalUsageDiff() {
+ return Top::CollectionData( _older._globalUsage , _newer._globalUsage );
+ }
+ Top::UsageMap SnapshotDelta::collectionUsageDiff() {
+ assert( _newer._created > _older._created );
+ Top::UsageMap u;
+
+ for ( Top::UsageMap::const_iterator i=_newer._usage.begin(); i != _newer._usage.end(); i++ ) {
+ Top::UsageMap::const_iterator j = _older._usage.find(i->first);
+ if (j != _older._usage.end())
+ u[i->first] = Top::CollectionData( j->second , i->second );
+ else
+ u[i->first] = i->second;
+ }
+ return u;
+ }
+
+ Snapshots::Snapshots(int n)
+ : _lock("Snapshots"), _n(n)
+ , _snapshots(new SnapshotData[n])
+ , _loc(0)
+ , _stored(0)
+ {}
+
+ const SnapshotData* Snapshots::takeSnapshot() {
+ scoped_lock lk(_lock);
+ _loc = ( _loc + 1 ) % _n;
+ _snapshots[_loc].takeSnapshot();
+ if ( _stored < _n )
+ _stored++;
+ return &_snapshots[_loc];
+ }
+
+ auto_ptr<SnapshotDelta> Snapshots::computeDelta( int numBack ) {
+ scoped_lock lk(_lock);
+ auto_ptr<SnapshotDelta> p;
+ if ( numBack < numDeltas() )
+ p.reset( new SnapshotDelta( getPrev(numBack+1) , getPrev(numBack) ) );
+ return p;
+ }
+
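+ // numBack==0 is the most recent snapshot; indexing walks backwards
+ // through the circular buffer and wraps at the front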
+ const SnapshotData& Snapshots::getPrev( int numBack ) {
+ int x = _loc - numBack;
+ if ( x < 0 )
+ x += _n;
+ return _snapshots[x];
+ }
+
+ void Snapshots::outputLockInfoHTML( stringstream& ss ) {
+ scoped_lock lk(_lock);
+ ss << "\n<div>";
+ for ( int i=0; i<numDeltas(); i++ ) {
+ SnapshotDelta d( getPrev(i+1) , getPrev(i) );
+ unsigned e = (unsigned) d.elapsed() / 1000;
+ ss << (unsigned)(100*d.percentWriteLocked());
+ if( e < 3900 || e > 4100 )
+ ss << '(' << e / 1000.0 << "s)";
+ ss << ' ';
+ }
+ ss << "</div>\n";
+ }
+
+ void SnapshotThread::run() {
+ Client::initThread("snapshotthread");
+ Client& client = cc();
+
+ long long numLoops = 0;
+
+ const SnapshotData* prev = 0;
+
+ while ( ! inShutdown() ) {
+ try {
+ const SnapshotData* s = statsSnapshots.takeSnapshot();
+
+ if ( prev && cmdLine.cpu ) {
+ unsigned long long elapsed = s->_created - prev->_created;
+ SnapshotDelta d( *prev , *s );
+ log() << "cpu: elapsed:" << (elapsed/1000) <<" writelock: " << (int)(100*d.percentWriteLocked()) << "%" << endl;
+ }
+
+ prev = s;
+ }
+ catch ( std::exception& e ) {
+ log() << "ERROR in SnapshotThread: " << e.what() << endl;
+ }
+
+ numLoops++;
+ sleepsecs(4);
+ }
+
+ client.shutdown();
+ }
+
+ using namespace mongoutils::html;
+
+ class WriteLockStatus : public WebStatusPlugin {
+ public:
+ WriteLockStatus() : WebStatusPlugin( "write lock" , 51 , "% time in write lock, by 4 sec periods" ) {}
+ virtual void init() {}
+
+ virtual void run( stringstream& ss ) {
+ statsSnapshots.outputLockInfoHTML( ss );
+
+ ss << "<a "
+ "href=\"http://www.mongodb.org/pages/viewpage.action?pageId=7209296\" "
+ "title=\"snapshot: was the db in the write lock when this page was generated?\">";
+ ss << "write locked now:</a> " << (d.dbMutex.info().isLocked() ? "true" : "false") << "\n";
+ }
+
+ } writeLockStatus;
+
+ class DBTopStatus : public WebStatusPlugin {
+ public:
+ DBTopStatus() : WebStatusPlugin( "dbtop" , 50 , "(occurrences|percent of elapsed)" ) {}
+
+ void display( stringstream& ss , double elapsed , const Top::UsageData& usage ) {
+ ss << "<td>";
+ ss << usage.count;
+ ss << "</td><td>";
+ double per = 100 * ((double)usage.time)/elapsed;
+ if( per == (int) per )
+ ss << (int) per;
+ else
+ ss << setprecision(1) << fixed << per;
+ ss << '%';
+ ss << "</td>";
+ }
+
+ void display( stringstream& ss , double elapsed , const string& ns , const Top::CollectionData& data ) {
+ if ( ns != "TOTAL" && data.total.count == 0 )
+ return;
+ ss << "<tr><th>" << ns << "</th>";
+
+ display( ss , elapsed , data.total );
+
+ display( ss , elapsed , data.readLock );
+ display( ss , elapsed , data.writeLock );
+
+ display( ss , elapsed , data.queries );
+ display( ss , elapsed , data.getmore );
+ display( ss , elapsed , data.insert );
+ display( ss , elapsed , data.update );
+ display( ss , elapsed , data.remove );
+
+ ss << "</tr>\n";
+ }
+
+ void run( stringstream& ss ) {
+ auto_ptr<SnapshotDelta> delta = statsSnapshots.computeDelta();
+ if ( ! delta.get() )
+ return;
+
+ ss << "<table border=1 cellpadding=2 cellspacing=0>";
+ ss << "<tr align='left'><th>";
+ ss << a("http://www.mongodb.org/display/DOCS/Developer+FAQ#DeveloperFAQ-What%27sa%22namespace%22%3F", "namespace") <<
+ "NS</a></th>"
+ "<th colspan=2>total</th>"
+ "<th colspan=2>Reads</th>"
+ "<th colspan=2>Writes</th>"
+ "<th colspan=2>Queries</th>"
+ "<th colspan=2>GetMores</th>"
+ "<th colspan=2>Inserts</th>"
+ "<th colspan=2>Updates</th>"
+ "<th colspan=2>Removes</th>";
+ ss << "</tr>\n";
+
+ display( ss , (double) delta->elapsed() , "TOTAL" , delta->globalUsageDiff() );
+
+ Top::UsageMap usage = delta->collectionUsageDiff();
+ for ( Top::UsageMap::iterator i=usage.begin(); i != usage.end(); i++ ) {
+ display( ss , (double) delta->elapsed() , i->first , i->second );
+ }
+
+ ss << "</table>";
+
+ }
+
+ virtual void init() {}
+ } dbtopStatus;
+
+ Snapshots statsSnapshots;
+ SnapshotThread snapshotThread;
+
+}
diff --git a/src/mongo/db/stats/snapshots.h b/src/mongo/db/stats/snapshots.h
new file mode 100644
index 00000000000..d9b8e5eb901
--- /dev/null
+++ b/src/mongo/db/stats/snapshots.h
@@ -0,0 +1,114 @@
+// snapshots.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+#include "../../pch.h"
+#include "../jsobj.h"
+#include "top.h"
+#include "../../util/background.h"
+
+/**
+ handles snapshotting performance metrics and other such things
+ */
+namespace mongo {
+
+ class SnapshotThread;
+
+ /**
+ * stores a point in time snapshot
+ * i.e. all counters at a given time
+ */
+ class SnapshotData {
+ void takeSnapshot();
+
+ unsigned long long _created;
+ Top::CollectionData _globalUsage;
+ unsigned long long _totalWriteLockedTime; // micros of total time locked
+ Top::UsageMap _usage;
+
+ friend class SnapshotThread;
+ friend class SnapshotDelta;
+ friend class Snapshots;
+ };
+
+ /**
+ * contains performance information for a time period
+ */
+ class SnapshotDelta {
+ public:
+ SnapshotDelta( const SnapshotData& older , const SnapshotData& newer );
+
+ unsigned long long start() const {
+ return _older._created;
+ }
+
+ unsigned long long elapsed() const {
+ return _elapsed;
+ }
+
+ unsigned long long timeInWriteLock() const {
+ return _newer._totalWriteLockedTime - _older._totalWriteLockedTime;
+ }
+ double percentWriteLocked() const {
+ double e = (double) elapsed();
+ double w = (double) timeInWriteLock();
+ return w/e;
+ }
+
+ Top::CollectionData globalUsageDiff();
+ Top::UsageMap collectionUsageDiff();
+
+ private:
+ const SnapshotData& _older;
+ const SnapshotData& _newer;
+
+ unsigned long long _elapsed;
+ };
+
+ class Snapshots {
+ public:
+ Snapshots(int n=100);
+
+ const SnapshotData* takeSnapshot();
+
+ int numDeltas() const { return _stored-1; }
+
+ const SnapshotData& getPrev( int numBack = 0 );
+ auto_ptr<SnapshotDelta> computeDelta( int numBack = 0 );
+
+
+ void outputLockInfoHTML( stringstream& ss );
+ private:
+ mongo::mutex _lock;
+ int _n;
+ boost::scoped_array<SnapshotData> _snapshots;
+ int _loc;
+ int _stored;
+ };
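+
+ /* a hedged usage sketch (statsSnapshots is the global declared below):
+      statsSnapshots.takeSnapshot();
+      // ... roughly four seconds later, as SnapshotThread does ...
+      statsSnapshots.takeSnapshot();
+      auto_ptr<SnapshotDelta> d = statsSnapshots.computeDelta();
+      if ( d.get() )
+          cout << d->elapsed() << " micros between snapshots" << endl;
+ */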
+
+ class SnapshotThread : public BackgroundJob {
+ public:
+ virtual string name() const { return "snapshot"; }
+ void run();
+ };
+
+ extern Snapshots statsSnapshots;
+ extern SnapshotThread snapshotThread;
+
+
+}
diff --git a/src/mongo/db/stats/top.cpp b/src/mongo/db/stats/top.cpp
new file mode 100644
index 00000000000..f5b6ee42f1c
--- /dev/null
+++ b/src/mongo/db/stats/top.cpp
@@ -0,0 +1,183 @@
+// top.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "pch.h"
+#include "top.h"
+#include "../../util/net/message.h"
+#include "../commands.h"
+
+namespace mongo {
+
+ Top::UsageData::UsageData( const UsageData& older , const UsageData& newer ) {
+ // this won't be 100% accurate on rollovers and drop(), but at least it won't be negative
+ time = (newer.time >= older.time) ? (newer.time - older.time) : newer.time;
+ count = (newer.count >= older.count) ? (newer.count - older.count) : newer.count;
+ }
+
+ Top::CollectionData::CollectionData( const CollectionData& older , const CollectionData& newer )
+ : total( older.total , newer.total ) ,
+ readLock( older.readLock , newer.readLock ) ,
+ writeLock( older.writeLock , newer.writeLock ) ,
+ queries( older.queries , newer.queries ) ,
+ getmore( older.getmore , newer.getmore ) ,
+ insert( older.insert , newer.insert ) ,
+ update( older.update , newer.update ) ,
+ remove( older.remove , newer.remove ),
+ commands( older.commands , newer.commands ) {
+
+ }
+
+ void Top::record( const string& ns , int op , int lockType , long long micros , bool command ) {
+ if ( ns[0] == '?' )
+ return;
+
+ //cout << "record: " << ns << "\t" << op << "\t" << command << endl;
+ scoped_lock lk(_lock);
+
+ if ( ( command || op == dbQuery ) && ns == _lastDropped ) {
+ _lastDropped = "";
+ return;
+ }
+
+ CollectionData& coll = _usage[ns];
+ _record( coll , op , lockType , micros , command );
+ _record( _global , op , lockType , micros , command );
+ }
+
+ void Top::_record( CollectionData& c , int op , int lockType , long long micros , bool command ) {
+ c.total.inc( micros );
+
+ if ( lockType > 0 )
+ c.writeLock.inc( micros );
+ else if ( lockType < 0 )
+ c.readLock.inc( micros );
+
+ switch ( op ) {
+ case 0:
+ // use 0 for unknown, non-specific
+ break;
+ case dbUpdate:
+ c.update.inc( micros );
+ break;
+ case dbInsert:
+ c.insert.inc( micros );
+ break;
+ case dbQuery:
+ if ( command )
+ c.commands.inc( micros );
+ else
+ c.queries.inc( micros );
+ break;
+ case dbGetMore:
+ c.getmore.inc( micros );
+ break;
+ case dbDelete:
+ c.remove.inc( micros );
+ break;
+ case dbKillCursors:
+ break;
+ case opReply:
+ case dbMsg:
+ log() << "unexpected op in Top::record: " << op << endl;
+ break;
+ default:
+ log() << "unknown op in Top::record: " << op << endl;
+ }
+
+ }
+
+ void Top::collectionDropped( const string& ns ) {
+ //cout << "collectionDropped: " << ns << endl;
+ scoped_lock lk(_lock);
+ _usage.erase(ns);
+ _lastDropped = ns;
+ }
+
+ void Top::cloneMap(Top::UsageMap& out) const {
+ scoped_lock lk(_lock);
+ out = _usage;
+ }
+
+ void Top::append( BSONObjBuilder& b ) {
+ scoped_lock lk( _lock );
+ _appendToUsageMap( b , _usage );
+ }
+
+ void Top::_appendToUsageMap( BSONObjBuilder& b , const UsageMap& map ) const {
+ for ( UsageMap::const_iterator i=map.begin(); i!=map.end(); i++ ) {
+ BSONObjBuilder bb( b.subobjStart( i->first ) );
+
+ const CollectionData& coll = i->second;
+
+ _appendStatsEntry( b , "total" , coll.total );
+
+ _appendStatsEntry( b , "readLock" , coll.readLock );
+ _appendStatsEntry( b , "writeLock" , coll.writeLock );
+
+ _appendStatsEntry( b , "queries" , coll.queries );
+ _appendStatsEntry( b , "getmore" , coll.getmore );
+ _appendStatsEntry( b , "insert" , coll.insert );
+ _appendStatsEntry( b , "update" , coll.update );
+ _appendStatsEntry( b , "remove" , coll.remove );
+ _appendStatsEntry( b , "commands" , coll.commands );
+
+ bb.done();
+ }
+ }
+
+ void Top::_appendStatsEntry( BSONObjBuilder& b , const char * statsName , const UsageData& map ) const {
+ BSONObjBuilder bb( b.subobjStart( statsName ) );
+ bb.appendNumber( "time" , map.time );
+ bb.appendNumber( "count" , map.count );
+ bb.done();
+ }
+
+ class TopCmd : public Command {
+ public:
+ TopCmd() : Command( "top", true ) {}
+
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual LockType locktype() const { return READ; }
+ virtual void help( stringstream& help ) const { help << "usage by collection, in micros "; }
+
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ {
+ BSONObjBuilder b( result.subobjStart( "totals" ) );
+ b.append( "note" , "all times in microseconds" );
+ Top::global.append( b );
+ b.done();
+ }
+ return true;
+ }
+
+ } topCmd;
+
+ Top Top::global;
+
+ TopOld::T TopOld::_snapshotStart = TopOld::currentTime();
+ TopOld::D TopOld::_snapshotDuration;
+ TopOld::UsageMap TopOld::_totalUsage;
+ TopOld::UsageMap TopOld::_snapshotA;
+ TopOld::UsageMap TopOld::_snapshotB;
+ TopOld::UsageMap &TopOld::_snapshot = TopOld::_snapshotA;
+ TopOld::UsageMap &TopOld::_nextSnapshot = TopOld::_snapshotB;
+ mongo::mutex TopOld::topMutex("topMutex");
+
+
+}
diff --git a/src/mongo/db/stats/top.h b/src/mongo/db/stats/top.h
new file mode 100644
index 00000000000..9645ed1a3a6
--- /dev/null
+++ b/src/mongo/db/stats/top.h
@@ -0,0 +1,247 @@
+// top.h : DB usage monitor.
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <boost/date_time/posix_time/posix_time.hpp>
+#undef assert
+#define assert MONGO_assert
+
+namespace mongo {
+
+ /**
+ * tracks usage by collection
+ */
+ class Top {
+
+ public:
+ Top() : _lock("Top") { }
+
+ struct UsageData {
+ UsageData() : time(0) , count(0) {}
+ UsageData( const UsageData& older , const UsageData& newer );
+ long long time;
+ long long count;
+
+ void inc( long long micros ) {
+ count++;
+ time += micros;
+ }
+ };
+
+ struct CollectionData {
+ /**
+ * constructs a diff
+ */
+ CollectionData() {}
+ CollectionData( const CollectionData& older , const CollectionData& newer );
+
+ UsageData total;
+
+ UsageData readLock;
+ UsageData writeLock;
+
+ UsageData queries;
+ UsageData getmore;
+ UsageData insert;
+ UsageData update;
+ UsageData remove;
+ UsageData commands;
+ };
+
+ typedef map<string,CollectionData> UsageMap;
+
+ public:
+ void record( const string& ns , int op , int lockType , long long micros , bool command );
+ void append( BSONObjBuilder& b );
+ void cloneMap(UsageMap& out) const;
+ CollectionData getGlobalData() const { return _global; }
+ void collectionDropped( const string& ns );
+
+ public: // static stuff
+ static Top global;
+
+ private:
+ void _appendToUsageMap( BSONObjBuilder& b , const UsageMap& map ) const;
+ void _appendStatsEntry( BSONObjBuilder& b , const char * statsName , const UsageData& map ) const;
+ void _record( CollectionData& c , int op , int lockType , long long micros , bool command );
+
+ mutable mongo::mutex _lock;
+ CollectionData _global;
+ UsageMap _usage;
+ string _lastDropped;
+ };
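+
+ /* a hedged usage sketch: record a 150-micro non-command query executed
+    under a read lock (lockType < 0 means read, > 0 means write):
+        Top::global.record( "test.foo", dbQuery, -1, 150, false );
+ */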
+
+ /* Records per namespace utilization of the mongod process.
+ No two functions of this class may be called concurrently.
+ */
+ class TopOld {
+ typedef boost::posix_time::ptime T;
+ typedef boost::posix_time::time_duration D;
+ typedef boost::tuple< D, int, int, int > UsageData;
+ public:
+ TopOld() : _read(false), _write(false) { }
+
+ /* these are used to record activity: */
+
+ void clientStart( const char *client ) {
+ clientStop();
+ _currentStart = currentTime();
+ _current = client;
+ }
+
+ /* indicate current request is a read operation. */
+ void setRead() { _read = true; }
+
+ void setWrite() { _write = true; }
+
+ void clientStop() {
+ if ( _currentStart == T() )
+ return;
+ D d = currentTime() - _currentStart;
+
+ {
+ scoped_lock L(topMutex);
+ recordUsage( _current, d );
+ }
+
+ _currentStart = T();
+ _read = false;
+ _write = false;
+ }
+
+ /* these are used to fetch the stats: */
+
+ struct Usage {
+ string ns;
+ D time;
+ double pct;
+ int reads, writes, calls;
+ };
+
+ static void usage( vector< Usage > &res ) {
+ scoped_lock L(topMutex);
+
+ // Populate parent namespaces
+ UsageMap snapshot;
+ UsageMap totalUsage;
+ fillParentNamespaces( snapshot, _snapshot );
+ fillParentNamespaces( totalUsage, _totalUsage );
+
+ multimap< D, string, more > sorted;
+ for( UsageMap::iterator i = snapshot.begin(); i != snapshot.end(); ++i )
+ sorted.insert( make_pair( i->second.get<0>(), i->first ) );
+ for( multimap< D, string, more >::iterator i = sorted.begin(); i != sorted.end(); ++i ) {
+ if ( trivialNs( i->second.c_str() ) )
+ continue;
+ Usage u;
+ u.ns = i->second;
+ u.time = totalUsage[ u.ns ].get<0>();
+ u.pct = _snapshotDuration != D() ? 100.0 * i->first.ticks() / _snapshotDuration.ticks() : 0;
+ u.reads = snapshot[ u.ns ].get<1>();
+ u.writes = snapshot[ u.ns ].get<2>();
+ u.calls = snapshot[ u.ns ].get<3>();
+ res.push_back( u );
+ }
+ for( UsageMap::iterator i = totalUsage.begin(); i != totalUsage.end(); ++i ) {
+ if ( snapshot.count( i->first ) != 0 || trivialNs( i->first.c_str() ) )
+ continue;
+ Usage u;
+ u.ns = i->first;
+ u.time = i->second.get<0>();
+ u.pct = 0;
+ u.reads = 0;
+ u.writes = 0;
+ u.calls = 0;
+ res.push_back( u );
+ }
+ }
+
+ static void completeSnapshot() {
+ scoped_lock L(topMutex);
+
+ if ( &_snapshot == &_snapshotA ) {
+ _snapshot = _snapshotB;
+ _nextSnapshot = _snapshotA;
+ }
+ else {
+ _snapshot = _snapshotA;
+ _nextSnapshot = _snapshotB;
+ }
+ _snapshotDuration = currentTime() - _snapshotStart;
+ _snapshotStart = currentTime();
+ _nextSnapshot.clear();
+ }
+
+ private:
+ static mongo::mutex topMutex;
+ static bool trivialNs( const char *ns ) {
+ const char *ret = strrchr( ns, '.' );
+ return ret && ret[ 1 ] == '\0';
+ }
+ typedef map<string,UsageData> UsageMap; // duration, # reads, # writes, # total calls
+ static T currentTime() {
+ return boost::posix_time::microsec_clock::universal_time();
+ }
+ void recordUsage( const string &client, D duration ) {
+ recordUsageForMap( _totalUsage, client, duration );
+ recordUsageForMap( _nextSnapshot, client, duration );
+ }
+ void recordUsageForMap( UsageMap &map, const string &client, D duration ) {
+ UsageData& g = map[client];
+ g.get< 0 >() += duration;
+ if ( _read && !_write )
+ g.get< 1 >()++;
+ else if ( !_read && _write )
+ g.get< 2 >()++;
+ g.get< 3 >()++;
+ }
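+ // rolls each namespace's usage up into its parents as well, e.g.
+ // "test.foo" also increments "test"; a name with a trailing dot
+ // contributes only to its parents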
+ static void fillParentNamespaces( UsageMap &to, const UsageMap &from ) {
+ for( UsageMap::const_iterator i = from.begin(); i != from.end(); ++i ) {
+ string current = i->first;
+ size_t dot = current.rfind( "." );
+ if ( dot == string::npos || dot != current.length() - 1 ) {
+ inc( to[ current ], i->second );
+ }
+ while( dot != string::npos ) {
+ current = current.substr( 0, dot );
+ inc( to[ current ], i->second );
+ dot = current.rfind( "." );
+ }
+ }
+ }
+ static void inc( UsageData &to, const UsageData &from ) {
+ to.get<0>() += from.get<0>();
+ to.get<1>() += from.get<1>();
+ to.get<2>() += from.get<2>();
+ to.get<3>() += from.get<3>();
+ }
+ struct more { bool operator()( const D &a, const D &b ) { return a > b; } };
+ string _current;
+ T _currentStart;
+ static T _snapshotStart;
+ static D _snapshotDuration;
+ static UsageMap _totalUsage;
+ static UsageMap _snapshotA;
+ static UsageMap _snapshotB;
+ static UsageMap &_snapshot;
+ static UsageMap &_nextSnapshot;
+ bool _read;
+ bool _write;
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/taskqueue.h b/src/mongo/db/taskqueue.h
new file mode 100644
index 00000000000..005bd986f11
--- /dev/null
+++ b/src/mongo/db/taskqueue.h
@@ -0,0 +1,106 @@
+// @file taskqueue.h
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "mongomutex.h"
+
+namespace mongo {
+
+ /** defer work items by queueing them for invocation by another thread. the presumption is
+ that the consumer thread spends more time outside of locks than the source thread does. an
+ additional presumption is that several objects or micro-tasks will be queued, and that having
+ a single thread process them in batch is helpful as they (in the first use case) share a
+ common data structure that can then stay in the local cpu cache.
+
+ this class is in db/ as it is dbMutex (mongomutex) specific (so far).
+
+ using a functor instead of go() might be more elegant too; once again, we would like to test
+ any performance differential. there is also a worry that operator() hides things.
+
+ MT - copyable "micro task" object we can queue
+ must have a static method void MT::go(const MT&)
+
+ see DefInvoke in dbtests/ for an example.
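+
+ a hedged sketch of a conforming micro-task type (Inc and counter are
+ illustrative, not symbols from the tree):
+
+     struct Inc {
+         int *p;
+         static void go( const Inc& task ) { ++*task.p; }
+     };
+
+     TaskQueue<Inc> q;
+     Inc task = { &counter };
+     q.defer( task ); // while write locked, per the assertion in defer()
+     q.invoke();      // later, typically from the consumer thread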
+ */
+ template< class MT >
+ class TaskQueue {
+ public:
+ TaskQueue() : _which(0), _invokeMutex("deferredinvoker") { }
+
+ void defer(MT mt) {
+ // only one writer allowed. however the invoke processing below can occur concurrently with
+ // writes (for the most part)
+ DEV d.dbMutex.assertWriteLocked();
+
+ _queues[_which].push_back(mt);
+ }
+
+ /** call to process deferrals.
+
+ concurrency: handled herein. multiple threads could call invoke(), but their efforts will be
+ serialized. the common case is that there is a single processor calling invoke().
+
+ normally, you call this outside of any lock. but if you want to fully drain the queue,
+ call from within a read lock. for example:
+ {
+ // drain with minimal time in lock
+ d.invoke();
+ readlock lk;
+ d.invoke();
+ ...
+ }
+ you can also call invoke periodically to do some work and then pick up later on more.
+ */
+ void invoke() {
+ mutex::scoped_lock lk2(_invokeMutex);
+ int toDrain = 0;
+ {
+ // flip queueing to the other queue (we are double buffered)
+ readlocktry lk("", 5);
+ if( !lk.got() )
+ return;
+ toDrain = _which;
+ _which = _which ^ 1;
+ wassert( _queues[_which].empty() ); // we are in dbMutex, so it should be/stay empty til we exit dbMutex
+ }
+
+ _drain( _queues[toDrain] );
+ assert( _queues[toDrain].empty() );
+ }
+
+ private:
+ int _which; // 0 or 1
+ typedef vector< MT > Queue;
+ Queue _queues[2];
+
+ // lock order when multiple locks: dbMutex, _invokeMutex
+ mongo::mutex _invokeMutex;
+
+ void _drain(Queue& queue) {
+ unsigned oldCap = queue.capacity();
+ for( typename Queue::iterator i = queue.begin(); i != queue.end(); i++ ) {
+ const MT& v = *i;
+ MT::go(v);
+ }
+ queue.clear();
+ DEV assert( queue.capacity() == oldCap ); // just checking that clear() doesn't deallocate, we don't want that
+ }
+ };
+
+}
diff --git a/src/mongo/db/tests.cpp b/src/mongo/db/tests.cpp
new file mode 100644
index 00000000000..00f299e1bb6
--- /dev/null
+++ b/src/mongo/db/tests.cpp
@@ -0,0 +1,68 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* tests.cpp
+
+ unit test & such
+*/
+
+#include "pch.h"
+#include "../util/mmap.h"
+
+namespace mongo {
+
+ int test2_old9() {
+ out() << "test2" << endl;
+ printStackTrace();
+ if ( 1 )
+ return 1;
+
+ MemoryMappedFile f;
+
+ unsigned long long len = 64*1024*1024;
+ char *p = (char *) f.map("/tmp/test.dat", len);
+ char *start = p;
+ char *end = p + 64*1024*1024-2;
+ end[1] = 'z';
+ int i = 0;
+ while ( p < end ) {
+ *p++ = ' ';
+ if ( ++i%64 == 0 ) {
+ *p++ = '\n';
+ *p++ = 'x';
+ }
+ }
+ *p = 'a';
+
+ f.flush(true);
+ out() << "done" << endl;
+
+ char *x = start + 32 * 1024 * 1024;
+ char *y = start + 48 * 1024 * 1024;
+ char *z = start + 62 * 1024 * 1024;
+
+ strcpy(z, "zfoo");
+ out() << "y" << endl;
+ strcpy(y, "yfoo");
+ strcpy(x, "xfoo");
+ strcpy(start, "xfoo");
+
+ dbexit( EXIT_TEST );
+
+ return 1;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/dbtests/background_job_test.cpp b/src/mongo/dbtests/background_job_test.cpp
new file mode 100644
index 00000000000..f2bf7d86244
--- /dev/null
+++ b/src/mongo/dbtests/background_job_test.cpp
@@ -0,0 +1,109 @@
+// @file background_job_test.cpp
+
+/**
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "../pch.h"
+#include <boost/thread/thread.hpp>
+
+#include "dbtests.h"
+#include "../util/time_support.h"
+#include "../util/background.h"
+
+namespace BackgroundJobTests {
+
+ // a global variable that can be accessed independent of the IncTester object below
+ // IncTester keeps it up-to-date
+ int GLOBAL_val;
+
+ class IncTester : public mongo::BackgroundJob {
+ public:
+ explicit IncTester( long long millis , bool selfDelete = false )
+ : BackgroundJob(selfDelete), _val(0), _millis(millis) { GLOBAL_val = 0; }
+
+ void waitAndInc( long long millis ) {
+ if ( millis )
+ mongo::sleepmillis( millis );
+ ++_val;
+ ++GLOBAL_val;
+ }
+
+ int getVal() { return _val; }
+
+ /* --- BackgroundJob virtuals --- */
+
+ string name() const { return "IncTester"; }
+
+ void run() { waitAndInc( _millis ); }
+
+ private:
+ int _val;
+ long long _millis;
+ };
+
+
+ class NormalCase {
+ public:
+ void run() {
+ IncTester tester( 0 /* inc without wait */ );
+ tester.go();
+ ASSERT( tester.wait() );
+ ASSERT_EQUALS( tester.getVal() , 1 );
+ }
+ };
+
+ class TimeOutCase {
+ public:
+ void run() {
+ IncTester tester( 1000 /* wait 1sec before inc-ing */ );
+ tester.go();
+ ASSERT( ! tester.wait( 100 /* ms */ ) ); // should time out
+ ASSERT_EQUALS( tester.getVal() , 0 );
+
+ // if we wait longer than the IncTester, we should see the increment
+ ASSERT( tester.wait( 1500 /* ms */ ) ); // should not time out
+ ASSERT_EQUALS( tester.getVal() , 1 );
+ }
+ };
+
+ class SelfDeletingCase {
+ public:
+ void run() {
+ BackgroundJob* j = new IncTester( 0 /* inc without wait */ , true /* self delete */ );
+ j->go();
+
+
+ // the background thread should have continued running and this test should pass the
+ // heap-checker as well
+ mongo::sleepmillis( 1000 );
+ ASSERT_EQUALS( GLOBAL_val, 1 );
+ }
+ };
+
+
+ class BackgroundJobSuite : public Suite {
+ public:
+ BackgroundJobSuite() : Suite( "background_job" ) {}
+
+ void setupTests() {
+ add< NormalCase >();
+ add< TimeOutCase >();
+ add< SelfDeletingCase >();
+ }
+
+ } backgroundJobSuite;
+
+} // namespace BackgroundJobTests
diff --git a/src/mongo/dbtests/balancer_policy_tests.cpp b/src/mongo/dbtests/balancer_policy_tests.cpp
new file mode 100644
index 00000000000..6f7c4a5dcd3
--- /dev/null
+++ b/src/mongo/dbtests/balancer_policy_tests.cpp
@@ -0,0 +1,203 @@
+// @file balancer_policy_test.cpp
+
+/**
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "dbtests.h"
+
+// TODO SERVER-1822
+//#include "../s/config.h" // for ShardFields
+//#include "../s/balancer_policy.h"
+
+namespace BalancerPolicyTests {
+
+//
+// TODO SERVER-1822
+//
+#if 0
+
+ typedef mongo::ShardFields sf; // fields from 'shards' collection
+ typedef mongo::LimitsFields lf; // fields from the balancer's limits map
+
+ class SizeMaxedShardTest {
+ public:
+ void run() {
+ BSONObj shard0 = BSON( sf::maxSize(0LL) << lf::currSize(0LL) );
+ ASSERT( ! BalancerPolicy::isSizeMaxed( shard0 ) );
+
+ BSONObj shard1 = BSON( sf::maxSize(100LL) << lf::currSize(80LL) );
+ ASSERT( ! BalancerPolicy::isSizeMaxed( shard1 ) );
+
+ BSONObj shard2 = BSON( sf::maxSize(100LL) << lf::currSize(110LL) );
+ ASSERT( BalancerPolicy::isSizeMaxed( shard2 ) );
+
+ BSONObj empty;
+ ASSERT( ! BalancerPolicy::isSizeMaxed( empty ) );
+ }
+ };
+
+ class DrainingShardTest {
+ public:
+ void run() {
+ BSONObj shard0 = BSON( sf::draining(true) );
+ ASSERT( BalancerPolicy::isDraining( shard0 ) );
+
+ BSONObj shard1 = BSON( sf::draining(false) );
+ ASSERT( ! BalancerPolicy::isDraining( shard1 ) );
+
+ BSONObj empty;
+ ASSERT( ! BalancerPolicy::isDraining( empty ) );
+ }
+ };
+
+ class BalanceNormalTest {
+ public:
+ void run() {
+ // 2 chunks and 0 chunk shards
+ BalancerPolicy::ShardToChunksMap chunkMap;
+ vector<BSONObj> chunks;
+ chunks.push_back(BSON( "min" << BSON( "x" << BSON( "$minKey"<<1) ) <<
+ "max" << BSON( "x" << 49 )));
+ chunks.push_back(BSON( "min" << BSON( "x" << 49 ) <<
+ "max" << BSON( "x" << BSON( "$maxkey"<<1 ))));
+ chunkMap["shard0"] = chunks;
+ chunks.clear();
+ chunkMap["shard1"] = chunks;
+
+ // no limits
+ BalancerPolicy::ShardToLimitsMap limitsMap;
+ BSONObj limits0 = BSON( sf::maxSize(0LL) << lf::currSize(2LL) << sf::draining(false) << lf::hasOpsQueued(false) );
+ BSONObj limits1 = BSON( sf::maxSize(0LL) << lf::currSize(0LL) << sf::draining(false) << lf::hasOpsQueued(false) );
+ limitsMap["shard0"] = limits0;
+ limitsMap["shard1"] = limits1;
+
+ BalancerPolicy::ChunkInfo* c = NULL;
+ c = BalancerPolicy::balance( "ns", limitsMap, chunkMap, 1 );
+ ASSERT( c );
+ }
+ };
+
+ class BalanceDrainingTest {
+ public:
+ void run() {
+ // one normal, one draining
+ // 2 chunks and 0 chunk shards
+ BalancerPolicy::ShardToChunksMap chunkMap;
+ vector<BSONObj> chunks;
+ chunks.push_back(BSON( "min" << BSON( "x" << BSON( "$minKey"<<1) ) <<
+ "max" << BSON( "x" << 49 )));
+ chunkMap["shard0"] = chunks;
+ chunks.clear();
+ chunks.push_back(BSON( "min" << BSON( "x" << 49 ) <<
+ "max" << BSON( "x" << BSON( "$maxkey"<<1 ))));
+ chunkMap["shard1"] = chunks;
+
+ // shard0 is draining
+ BalancerPolicy::ShardToLimitsMap limitsMap;
+ BSONObj limits0 = BSON( sf::maxSize(0LL) << lf::currSize(2LL) << sf::draining(true) );
+ BSONObj limits1 = BSON( sf::maxSize(0LL) << lf::currSize(0LL) << sf::draining(false) );
+ limitsMap["shard0"] = limits0;
+ limitsMap["shard1"] = limits1;
+
+ BalancerPolicy::ChunkInfo* c = NULL;
+ c = BalancerPolicy::balance( "ns", limitsMap, chunkMap, 0 );
+ ASSERT( c );
+ ASSERT_EQUALS( c->to , "shard1" );
+ ASSERT_EQUALS( c->from , "shard0" );
+ ASSERT( ! c->chunk.isEmpty() );
+ }
+ };
+
+ class BalanceEndedDrainingTest {
+ public:
+ void run() {
+ // 2 chunks and 0 chunk (drain completed) shards
+ BalancerPolicy::ShardToChunksMap chunkMap;
+ vector<BSONObj> chunks;
+ chunks.push_back(BSON( "min" << BSON( "x" << BSON( "$minKey"<<1) ) <<
+ "max" << BSON( "x" << 49 )));
+ chunks.push_back(BSON( "min" << BSON( "x" << 49 ) <<
+ "max" << BSON( "x" << BSON( "$maxkey"<<1 ))));
+ chunkMap["shard0"] = chunks;
+ chunks.clear();
+ chunkMap["shard1"] = chunks;
+
+ // no limits
+ BalancerPolicy::ShardToLimitsMap limitsMap;
+ BSONObj limits0 = BSON( sf::maxSize(0LL) << lf::currSize(2LL) << sf::draining(false) );
+ BSONObj limits1 = BSON( sf::maxSize(0LL) << lf::currSize(0LL) << sf::draining(true) );
+ limitsMap["shard0"] = limits0;
+ limitsMap["shard1"] = limits1;
+
+ BalancerPolicy::ChunkInfo* c = NULL;
+ c = BalancerPolicy::balance( "ns", limitsMap, chunkMap, 0 );
+ ASSERT( ! c );
+ }
+ };
+
+ class BalanceImpasseTest {
+ public:
+ void run() {
+ // one maxed out, one draining
+ // 2 chunks and 0 chunk shards
+ BalancerPolicy::ShardToChunksMap chunkMap;
+ vector<BSONObj> chunks;
+ chunks.push_back(BSON( "min" << BSON( "x" << BSON( "$minKey"<<1) ) <<
+ "max" << BSON( "x" << 49 )));
+ chunkMap["shard0"] = chunks;
+ chunks.clear();
+ chunks.push_back(BSON( "min" << BSON( "x" << 49 ) <<
+ "max" << BSON( "x" << BSON( "$maxkey"<<1 ))));
+ chunkMap["shard1"] = chunks;
+
+ // shard0 is draining, shard1 is maxed out, shard2 has writebacks pending
+ BalancerPolicy::ShardToLimitsMap limitsMap;
+ BSONObj limits0 = BSON( sf::maxSize(0LL) << lf::currSize(2LL) << sf::draining(true) );
+ BSONObj limits1 = BSON( sf::maxSize(1LL) << lf::currSize(1LL) << sf::draining(false) );
+ BSONObj limits2 = BSON( sf::maxSize(0LL) << lf::currSize(1LL) << lf::hasOpsQueued(true) );
+ limitsMap["shard0"] = limits0;
+ limitsMap["shard1"] = limits1;
+ limitsMap["shard2"] = limits2;
+
+ BalancerPolicy::ChunkInfo* c = NULL;
+ c = BalancerPolicy::balance( "ns", limitsMap, chunkMap, 0 );
+ ASSERT( ! c );
+ }
+ };
+
+//
+// TODO SERVER-1822
+//
+#endif // #if 0
+
+ class All : public Suite {
+ public:
+ All() : Suite( "balancer_policy" ) {
+ }
+
+ void setupTests() {
+ // TODO SERVER-1822
+ // add< SizeMaxedShardTest >();
+ // add< DrainingShardTest >();
+ // add< BalanceNormalTest >();
+ // add< BalanceDrainingTest >();
+ // add< BalanceEndedDrainingTest >();
+ // add< BalanceImpasseTest >();
+ }
+ } allTests;
+
+} // namespace BalancerPolicyTests
diff --git a/src/mongo/dbtests/basictests.cpp b/src/mongo/dbtests/basictests.cpp
new file mode 100644
index 00000000000..46a7dbc22bd
--- /dev/null
+++ b/src/mongo/dbtests/basictests.cpp
@@ -0,0 +1,695 @@
+// basictests.cpp : basic unit tests
+//
+
+/**
+ * Copyright (C) 2009 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "dbtests.h"
+#include "../util/base64.h"
+#include "../util/array.h"
+#include "../util/text.h"
+#include "../util/queue.h"
+#include "../util/paths.h"
+#include "../util/stringutils.h"
+#include "../util/compress.h"
+#include "../db/db.h"
+
+namespace BasicTests {
+
+ class Rarely {
+ public:
+ void run() {
+ int first = 0;
+ int second = 0;
+ int third = 0;
+ for( int i = 0; i < 128; ++i ) {
+ incRarely( first );
+ incRarely2( second );
+ ONCE ++third;
+ }
+ ASSERT_EQUALS( 1, first );
+ ASSERT_EQUALS( 1, second );
+ ASSERT_EQUALS( 1, third );
+ }
+ private:
+ void incRarely( int &c ) {
+ RARELY ++c;
+ }
+ void incRarely2( int &c ) {
+ RARELY ++c;
+ }
+ };
+
+ class Base64Tests {
+ public:
+
+ void roundTrip( string s ) {
+ ASSERT_EQUALS( s , base64::decode( base64::encode( s ) ) );
+ }
+
+ void roundTrip( const unsigned char * _data , int len ) {
+ const char *data = (const char *) _data;
+ string s = base64::encode( data , len );
+ string out = base64::decode( s );
+ ASSERT_EQUALS( out.size() , static_cast<size_t>(len) );
+ bool broke = false;
+ for ( int i=0; i<len; i++ ) {
+ if ( data[i] != out[i] )
+ broke = true;
+ }
+ if ( ! broke )
+ return;
+
+ cout << s << endl;
+ for ( int i=0; i<len; i++ )
+ cout << hex << ( data[i] & 0xFF ) << dec << " ";
+ cout << endl;
+ for ( int i=0; i<len; i++ )
+ cout << hex << ( out[i] & 0xFF ) << dec << " ";
+ cout << endl;
+
+ ASSERT(0);
+ }
+
+ void run() {
+
+ ASSERT_EQUALS( "ZWxp" , base64::encode( "eli" , 3 ) );
+ ASSERT_EQUALS( "ZWxpb3Rz" , base64::encode( "eliots" , 6 ) );
+ ASSERT_EQUALS( "ZWxpb3Rz" , base64::encode( "eliots" ) );
+
+ ASSERT_EQUALS( "ZQ==" , base64::encode( "e" , 1 ) );
+ ASSERT_EQUALS( "ZWw=" , base64::encode( "el" , 2 ) );
+
+ roundTrip( "e" );
+ roundTrip( "el" );
+ roundTrip( "eli" );
+ roundTrip( "elio" );
+ roundTrip( "eliot" );
+ roundTrip( "eliots" );
+ roundTrip( "eliotsz" );
+
+ unsigned char z[] = { 0x1 , 0x2 , 0x3 , 0x4 };
+ roundTrip( z , 4 );
+
+ unsigned char y[] = {
+ 0x01, 0x10, 0x83, 0x10, 0x51, 0x87, 0x20, 0x92, 0x8B, 0x30,
+ 0xD3, 0x8F, 0x41, 0x14, 0x93, 0x51, 0x55, 0x97, 0x61, 0x96,
+ 0x9B, 0x71, 0xD7, 0x9F, 0x82, 0x18, 0xA3, 0x92, 0x59, 0xA7,
+ 0xA2, 0x9A, 0xAB, 0xB2, 0xDB, 0xAF, 0xC3, 0x1C, 0xB3, 0xD3,
+ 0x5D, 0xB7, 0xE3, 0x9E, 0xBB, 0xF3, 0xDF, 0xBF
+ };
+ roundTrip( y , 4 );
+ roundTrip( y , 40 );
+ }
+ };
+
+ namespace stringbuildertests {
+#define SBTGB(x) ss << (x); sb << (x);
+
+ class Base {
+ virtual void pop() = 0;
+
+ public:
+ Base() {}
+ virtual ~Base() {}
+
+ void run() {
+ pop();
+ ASSERT_EQUALS( ss.str() , sb.str() );
+ }
+
+ stringstream ss;
+ StringBuilder sb;
+ };
+
+ class simple1 : public Base {
+ void pop() {
+ SBTGB(1);
+ SBTGB("yo");
+ SBTGB(2);
+ }
+ };
+
+ class simple2 : public Base {
+ void pop() {
+ SBTGB(1);
+ SBTGB("yo");
+ SBTGB(2);
+ SBTGB( 12123123123LL );
+ SBTGB( "xxx" );
+ SBTGB( 5.4 );
+ SBTGB( 5.4312 );
+ SBTGB( "yyy" );
+ SBTGB( (short)5 );
+ SBTGB( (short)(1231231231231LL) );
+ }
+ };
+
+ class reset1 {
+ public:
+ void run() {
+ StringBuilder sb;
+ sb << "1" << "abc" << "5.17";
+ ASSERT_EQUALS( "1abc5.17" , sb.str() );
+ ASSERT_EQUALS( "1abc5.17" , sb.str() );
+ sb.reset();
+ ASSERT_EQUALS( "" , sb.str() );
+ sb << "999";
+ ASSERT_EQUALS( "999" , sb.str() );
+ }
+ };
+
+ class reset2 {
+ public:
+ void run() {
+ StringBuilder sb;
+ sb << "1" << "abc" << "5.17";
+ ASSERT_EQUALS( "1abc5.17" , sb.str() );
+ ASSERT_EQUALS( "1abc5.17" , sb.str() );
+ sb.reset(1);
+ ASSERT_EQUALS( "" , sb.str() );
+ sb << "999";
+ ASSERT_EQUALS( "999" , sb.str() );
+ }
+ };
+
+ }
+
+ class sleeptest {
+ public:
+
+ void run() {
+ Timer t;
+ int matches = 0;
+ for( int p = 0; p < 3; p++ ) {
+ sleepsecs( 1 );
+ int sec = (t.millis() + 2)/1000;
+ if( sec == 1 )
+ matches++;
+ else
+ log() << "temp millis: " << t.millis() << endl;
+ ASSERT( sec >= 0 && sec <= 2 );
+ t.reset();
+ }
+ if ( matches < 2 )
+ log() << "matches:" << matches << endl;
+ ASSERT( matches >= 2 );
+
+ sleepmicros( 1527123 );
+ ASSERT( t.micros() > 1000000 );
+ ASSERT( t.micros() < 2000000 );
+
+ t.reset();
+ sleepmillis( 1727 );
+ ASSERT( t.millis() >= 1000 );
+ ASSERT( t.millis() <= 2500 );
+
+ {
+ int total = 1200;
+ int ms = 2;
+ t.reset();
+ for ( int i=0; i<(total/ms); i++ ) {
+ sleepmillis( ms );
+ }
+ {
+ int x = t.millis();
+ if ( x < 1000 || x > 2500 ) {
+ cout << "sleeptest finds sleep accuracy to be not great. x: " << x << endl;
+ ASSERT( x >= 1000 );
+ ASSERT( x <= 20000 );
+ }
+ }
+ }
+
+#ifdef __linux__
+ {
+ int total = 1200;
+ int micros = 100;
+ t.reset();
+ int numSleeps = 1000*(total/micros);
+ for ( int i=0; i<numSleeps; i++ ) {
+ sleepmicros( micros );
+ }
+ {
+ int y = t.millis();
+ if ( y < 1000 || y > 2500 ) {
+ cout << "sleeptest y: " << y << endl;
+ ASSERT( y >= 1000 );
+ /* ASSERT( y <= 100000 ); */
+ }
+ }
+ }
+#endif
+
+ }
+
+ };
+
+ class AssertTests {
+ public:
+
+ int x;
+
+ AssertTests() {
+ x = 0;
+ }
+
+ string foo() {
+ x++;
+ return "";
+ }
+ void run() {
+ uassert( -1 , foo() , 1 );
+ if( x != 0 ) {
+ ASSERT_EQUALS( 0 , x );
+ }
+ try {
+ uassert( -1 , foo() , 0 );
+ }
+ catch ( ... ) {}
+ ASSERT_EQUALS( 1 , x );
+ }
+ };
+
+ namespace ArrayTests {
+ class basic1 {
+ public:
+ void run() {
+ FastArray<int> a(100);
+ a.push_back( 5 );
+ a.push_back( 6 );
+
+ ASSERT_EQUALS( 2 , a.size() );
+
+ FastArray<int>::iterator i = a.begin();
+ ASSERT( i != a.end() );
+ ASSERT_EQUALS( 5 , *i );
+ ++i;
+ ASSERT( i != a.end() );
+ ASSERT_EQUALS( 6 , *i );
+ ++i;
+ ASSERT( i == a.end() );
+ }
+ };
+ };
+
+ class ThreadSafeStringTest {
+ public:
+ void run() {
+ ThreadSafeString s;
+ s = "eliot";
+ ASSERT_EQUALS( s , "eliot" );
+ ASSERT( s != "eliot2" );
+
+ ThreadSafeString s2 = s;
+ ASSERT_EQUALS( s2 , "eliot" );
+
+
+ {
+ string foo;
+ {
+ ThreadSafeString bar;
+ bar = "eliot2";
+ foo = bar.toString();
+ }
+ ASSERT_EQUALS( "eliot2" , foo );
+ }
+ }
+ };
+
+ class LexNumCmp {
+ public:
+ void run() {
+
+ ASSERT( ! isNumber( (char)255 ) );
+
+ ASSERT_EQUALS( 0, lexNumCmp( "a", "a" ) );
+ ASSERT_EQUALS( -1, lexNumCmp( "a", "aa" ) );
+ ASSERT_EQUALS( 1, lexNumCmp( "aa", "a" ) );
+ ASSERT_EQUALS( -1, lexNumCmp( "a", "b" ) );
+ ASSERT_EQUALS( 1, lexNumCmp( "100", "50" ) );
+ ASSERT_EQUALS( -1, lexNumCmp( "50", "100" ) );
+ ASSERT_EQUALS( 1, lexNumCmp( "b", "a" ) );
+ ASSERT_EQUALS( 0, lexNumCmp( "aa", "aa" ) );
+ ASSERT_EQUALS( -1, lexNumCmp( "aa", "ab" ) );
+ ASSERT_EQUALS( 1, lexNumCmp( "ab", "aa" ) );
+ ASSERT_EQUALS( 1, lexNumCmp( "0", "a" ) );
+ ASSERT_EQUALS( 1, lexNumCmp( "a0", "aa" ) );
+ ASSERT_EQUALS( -1, lexNumCmp( "a", "0" ) );
+ ASSERT_EQUALS( -1, lexNumCmp( "aa", "a0" ) );
+ ASSERT_EQUALS( 0, lexNumCmp( "0", "0" ) );
+ ASSERT_EQUALS( 0, lexNumCmp( "10", "10" ) );
+ ASSERT_EQUALS( -1, lexNumCmp( "1", "10" ) );
+ ASSERT_EQUALS( 1, lexNumCmp( "10", "1" ) );
+ ASSERT_EQUALS( 1, lexNumCmp( "11", "10" ) );
+ ASSERT_EQUALS( -1, lexNumCmp( "10", "11" ) );
+ ASSERT_EQUALS( 1, lexNumCmp( "f11f", "f10f" ) );
+ ASSERT_EQUALS( -1, lexNumCmp( "f10f", "f11f" ) );
+ ASSERT_EQUALS( -1, lexNumCmp( "f11f", "f111" ) );
+ ASSERT_EQUALS( 1, lexNumCmp( "f111", "f11f" ) );
+ ASSERT_EQUALS( -1, lexNumCmp( "f12f", "f12g" ) );
+ ASSERT_EQUALS( 1, lexNumCmp( "f12g", "f12f" ) );
+ ASSERT_EQUALS( 1, lexNumCmp( "aa{", "aab" ) );
+ ASSERT_EQUALS( -1, lexNumCmp( "aa{", "aa1" ) );
+ ASSERT_EQUALS( -1, lexNumCmp( "a1{", "a11" ) );
+ ASSERT_EQUALS( 1, lexNumCmp( "a1{a", "a1{" ) );
+ ASSERT_EQUALS( -1, lexNumCmp( "a1{", "a1{a" ) );
+ ASSERT_EQUALS( 1, lexNumCmp("21", "11") );
+ ASSERT_EQUALS( -1, lexNumCmp("11", "21") );
+
+ ASSERT_EQUALS( -1 , lexNumCmp( "a.0" , "a.1" ) );
+ ASSERT_EQUALS( -1 , lexNumCmp( "a.0.b" , "a.1" ) );
+
+ ASSERT_EQUALS( -1 , lexNumCmp( "b." , "b.|" ) );
+ ASSERT_EQUALS( -1 , lexNumCmp( "b.0e" , (string("b.") + (char)255).c_str() ) );
+ ASSERT_EQUALS( -1 , lexNumCmp( "b." , "b.0e" ) );
+
+ ASSERT_EQUALS( 0, lexNumCmp( "238947219478347782934718234", "238947219478347782934718234"));
+ ASSERT_EQUALS( 0, lexNumCmp( "000238947219478347782934718234", "238947219478347782934718234"));
+ ASSERT_EQUALS( 1, lexNumCmp( "000238947219478347782934718235", "238947219478347782934718234"));
+ ASSERT_EQUALS( -1, lexNumCmp( "238947219478347782934718234", "238947219478347782934718234.1"));
+ ASSERT_EQUALS( 0, lexNumCmp( "238", "000238"));
+ ASSERT_EQUALS( 0, lexNumCmp( "002384", "0002384"));
+ ASSERT_EQUALS( 0, lexNumCmp( "00002384", "0002384"));
+ ASSERT_EQUALS( 0, lexNumCmp( "0", "0"));
+ ASSERT_EQUALS( 0, lexNumCmp( "0000", "0"));
+ ASSERT_EQUALS( 0, lexNumCmp( "0", "000"));
+ ASSERT_EQUALS( -1, lexNumCmp( "0000", "0.0"));
+ ASSERT_EQUALS( 1, lexNumCmp( "2380", "238"));
+ ASSERT_EQUALS( 1, lexNumCmp( "2385", "2384"));
+ ASSERT_EQUALS( 1, lexNumCmp( "2385", "02384"));
+ ASSERT_EQUALS( 1, lexNumCmp( "2385", "002384"));
+ ASSERT_EQUALS( -1, lexNumCmp( "123.234.4567", "00238"));
+ ASSERT_EQUALS( 0, lexNumCmp( "123.234", "00123.234"));
+ ASSERT_EQUALS( 0, lexNumCmp( "a.123.b", "a.00123.b"));
+ ASSERT_EQUALS( 1, lexNumCmp( "a.123.b", "a.b.00123.b"));
+ ASSERT_EQUALS( -1, lexNumCmp( "a.00.0", "a.0.1"));
+ ASSERT_EQUALS( 0, lexNumCmp( "01.003.02", "1.3.2"));
+ ASSERT_EQUALS( -1, lexNumCmp( "1.3.2", "10.300.20"));
+ ASSERT_EQUALS( 0, lexNumCmp( "10.300.20", "000000000000010.0000300.000000020"));
+ ASSERT_EQUALS( 0, lexNumCmp( "0000a", "0a"));
+ ASSERT_EQUALS( -1, lexNumCmp( "a", "0a"));
+ ASSERT_EQUALS( -1, lexNumCmp( "000a", "001a"));
+ ASSERT_EQUALS( 0, lexNumCmp( "010a", "0010a"));
+
+ ASSERT_EQUALS( -1 , lexNumCmp( "a0" , "a00" ) );
+ ASSERT_EQUALS( 0 , lexNumCmp( "a.0" , "a.00" ) );
+ ASSERT_EQUALS( -1 , lexNumCmp( "a.b.c.d0" , "a.b.c.d00" ) );
+ ASSERT_EQUALS( 1 , lexNumCmp( "a.b.c.0.y" , "a.b.c.00.x" ) );
+
+ ASSERT_EQUALS( -1, lexNumCmp( "a", "a-" ) );
+ ASSERT_EQUALS( 1, lexNumCmp( "a-", "a" ) );
+ ASSERT_EQUALS( 0, lexNumCmp( "a-", "a-" ) );
+
+ ASSERT_EQUALS( -1, lexNumCmp( "a", "a-c" ) );
+ ASSERT_EQUALS( 1, lexNumCmp( "a-c", "a" ) );
+ ASSERT_EQUALS( 0, lexNumCmp( "a-c", "a-c" ) );
+
+ ASSERT_EQUALS( 1, lexNumCmp( "a-c.t", "a.t" ) );
+ ASSERT_EQUALS( -1, lexNumCmp( "a.t", "a-c.t" ) );
+ ASSERT_EQUALS( 0, lexNumCmp( "a-c.t", "a-c.t" ) );
+
+ ASSERT_EQUALS( 1, lexNumCmp( "ac.t", "a.t" ) );
+ ASSERT_EQUALS( -1, lexNumCmp( "a.t", "ac.t" ) );
+ ASSERT_EQUALS( 0, lexNumCmp( "ac.t", "ac.t" ) );
+ }
+ };
+
+ class DatabaseValidNames {
+ public:
+ void run() {
+ ASSERT( NamespaceString::validDBName( "foo" ) );
+ ASSERT( ! NamespaceString::validDBName( "foo/bar" ) );
+ ASSERT( ! NamespaceString::validDBName( "foo bar" ) );
+ ASSERT( ! NamespaceString::validDBName( "foo.bar" ) );
+
+ ASSERT( NamespaceString::normal( "asdads" ) );
+ ASSERT( ! NamespaceString::normal( "asda$ds" ) );
+ ASSERT( NamespaceString::normal( "local.oplog.$main" ) );
+ }
+ };
+
+ class DatabaseOwnsNS {
+ public:
+ void run() {
+ dblock lk;
+ bool isNew = false;
+ // this leaks as ~Database is private
+ // if that changes, should put this on the stack
+ {
+ Database * db = new Database( "dbtests_basictests_ownsns" , isNew );
+ assert( isNew );
+
+ ASSERT( db->ownsNS( "dbtests_basictests_ownsns.x" ) );
+ ASSERT( db->ownsNS( "dbtests_basictests_ownsns.x.y" ) );
+ ASSERT( ! db->ownsNS( "dbtests_basictests_ownsn.x.y" ) );
+ ASSERT( ! db->ownsNS( "dbtests_basictests_ownsnsa.x.y" ) );
+ }
+ }
+ };
+
+ class NSValidNames {
+ public:
+ void run() {
+ ASSERT( isValidNS( "test.foo" ) );
+ ASSERT( ! isValidNS( "test." ) );
+ ASSERT( ! isValidNS( "test" ) );
+ }
+ };
+
+ class PtrTests {
+ public:
+ void run() {
+ scoped_ptr<int> p1 (new int(1));
+ boost::shared_ptr<int> p2 (new int(2));
+ scoped_ptr<const int> p3 (new int(3));
+ boost::shared_ptr<const int> p4 (new int(4));
+
+ //non-const
+ ASSERT_EQUALS( p1.get() , ptr<int>(p1) );
+ ASSERT_EQUALS( p2.get() , ptr<int>(p2) );
+ ASSERT_EQUALS( p2.get() , ptr<int>(p2.get()) ); // T* constructor
+ ASSERT_EQUALS( p2.get() , ptr<int>(ptr<int>(p2)) ); // copy constructor
+ ASSERT_EQUALS( *p2 , *ptr<int>(p2));
+ ASSERT_EQUALS( p2.get() , ptr<boost::shared_ptr<int> >(&p2)->get() ); // operator->
+
+ //const
+ ASSERT_EQUALS( p1.get() , ptr<const int>(p1) );
+ ASSERT_EQUALS( p2.get() , ptr<const int>(p2) );
+ ASSERT_EQUALS( p2.get() , ptr<const int>(p2.get()) );
+ ASSERT_EQUALS( p3.get() , ptr<const int>(p3) );
+ ASSERT_EQUALS( p4.get() , ptr<const int>(p4) );
+ ASSERT_EQUALS( p4.get() , ptr<const int>(p4.get()) );
+ ASSERT_EQUALS( p2.get() , ptr<const int>(ptr<const int>(p2)) );
+ ASSERT_EQUALS( p2.get() , ptr<const int>(ptr<int>(p2)) ); // constizing copy constructor
+ ASSERT_EQUALS( *p2 , *ptr<int>(p2));
+ ASSERT_EQUALS( p2.get() , ptr<const boost::shared_ptr<int> >(&p2)->get() );
+
+ //bool context
+ ASSERT( ptr<int>(p1) );
+ ASSERT( !ptr<int>(NULL) );
+ ASSERT( !ptr<int>() );
+
+#if 0
+ // These shouldn't compile
+ ASSERT_EQUALS( p3.get() , ptr<int>(p3) );
+ ASSERT_EQUALS( p4.get() , ptr<int>(p4) );
+ ASSERT_EQUALS( p2.get() , ptr<int>(ptr<const int>(p2)) );
+#endif
+ }
+ };
+
+ struct StringSplitterTest {
+
+ void test( string s ) {
+ vector<string> v = StringSplitter::split( s , "," );
+ ASSERT_EQUALS( s , StringSplitter::join( v , "," ) );
+ }
+
+ void run() {
+ test( "a" );
+ test( "a,b" );
+ test( "a,b,c" );
+ }
+ };
+
+ struct IsValidUTF8Test {
+// macros used to get valid line numbers
+#define good(s) ASSERT(isValidUTF8(s));
+#define bad(s) ASSERT(!isValidUTF8(s));
+
+ void run() {
+ good("A");
+ good("\xC2\xA2"); // cent: ¢
+ good("\xE2\x82\xAC"); // euro: €
+ good("\xF0\x9D\x90\x80"); // Blackboard A: 𝐀
+
+ //abrupt end
+ bad("\xC2");
+ bad("\xE2\x82");
+ bad("\xF0\x9D\x90");
+ bad("\xC2 ");
+ bad("\xE2\x82 ");
+ bad("\xF0\x9D\x90 ");
+
+ //too long
+ bad("\xF8\x80\x80\x80\x80");
+ bad("\xFC\x80\x80\x80\x80\x80");
+ bad("\xFE\x80\x80\x80\x80\x80\x80");
+ bad("\xFF\x80\x80\x80\x80\x80\x80\x80");
+
+ bad("\xF5\x80\x80\x80"); // U+140000 > U+10FFFF
+ bad("\x80"); //cant start with continuation byte
+ bad("\xC0\x80"); // 2-byte version of ASCII NUL
+#undef good
+#undef bad
+ }
+ };
+
+
+ class QueueTest {
+ public:
+ void run() {
+ BlockingQueue<int> q;
+ Timer t;
+ int x;
+ ASSERT( ! q.blockingPop( x , 5 ) );
+ ASSERT( t.seconds() > 3 && t.seconds() < 9 );
+
+ }
+ };
+
+ class StrTests {
+ public:
+
+ void run() {
+ ASSERT_EQUALS( 1u , str::count( "abc" , 'b' ) );
+ ASSERT_EQUALS( 3u , str::count( "babab" , 'b' ) );
+ }
+
+ };
+
+ class HostAndPortTests {
+ public:
+ void run() {
+ HostAndPort a( "x1" , 1000 );
+ HostAndPort b( "x1" , 1000 );
+ HostAndPort c( "x1" , 1001 );
+ HostAndPort d( "x2" , 1000 );
+
+ ASSERT( a == b );
+ ASSERT( a != c );
+ ASSERT( a != d );
+
+ }
+ };
+
+ class RelativePathTest {
+ public:
+ void run() {
+ RelativePath a = RelativePath::fromRelativePath( "a" );
+ RelativePath b = RelativePath::fromRelativePath( "a" );
+ RelativePath c = RelativePath::fromRelativePath( "b" );
+ RelativePath d = RelativePath::fromRelativePath( "a/b" );
+
+
+ ASSERT( a == b );
+ ASSERT( a != c );
+ ASSERT( a != d );
+ ASSERT( c != d );
+ }
+ };
+
+ class CmdLineParseConfigTest {
+ public:
+ void run() {
+ stringstream ss1;
+ istringstream iss1("");
+ CmdLine::parseConfigFile( iss1, ss1 );
+ stringstream ss2;
+ istringstream iss2("password=\'foo bar baz\'");
+ CmdLine::parseConfigFile( iss2, ss2 );
+ stringstream ss3;
+ istringstream iss3("\t this = false \n#that = true\n #another = whocares\n\n other = monkeys ");
+ CmdLine::parseConfigFile( iss3, ss3 );
+
+ ASSERT( ss1.str().compare("\n") == 0 );
+ ASSERT( ss2.str().compare("password=\'foo bar baz\'\n\n") == 0 );
+ ASSERT( ss3.str().compare("\n other = monkeys \n\n") == 0 );
+ }
+ };
+
+ struct CompressionTest1 {
+ void run() {
+ const char * c = "this is a test";
+ std::string s;
+ size_t len = compress(c, strlen(c)+1, &s);
+ assert( len > 0 );
+
+ std::string out;
+ bool ok = uncompress(s.c_str(), s.size(), &out);
+ assert(ok);
+ assert( strcmp(out.c_str(), c) == 0 );
+ }
+ } ctest1;
+
+
+ class All : public Suite {
+ public:
+ All() : Suite( "basic" ) {
+ }
+
+ void setupTests() {
+ add< Rarely >();
+ add< Base64Tests >();
+
+ add< stringbuildertests::simple1 >();
+ add< stringbuildertests::simple2 >();
+ add< stringbuildertests::reset1 >();
+ add< stringbuildertests::reset2 >();
+
+ add< sleeptest >();
+ add< AssertTests >();
+
+ add< ArrayTests::basic1 >();
+ add< LexNumCmp >();
+
+ add< DatabaseValidNames >();
+ add< DatabaseOwnsNS >();
+
+ add< NSValidNames >();
+
+ add< PtrTests >();
+
+ add< StringSplitterTest >();
+ add< IsValidUTF8Test >();
+
+ add< QueueTest >();
+
+ add< StrTests >();
+
+ add< HostAndPortTests >();
+ add< RelativePathTest >();
+ add< CmdLineParseConfigTest >();
+
+ add< CompressionTest1 >();
+ }
+ } myall;
+
+} // namespace BasicTests
+
diff --git a/src/mongo/dbtests/btreetests.cpp b/src/mongo/dbtests/btreetests.cpp
new file mode 100644
index 00000000000..efa42b1d5c1
--- /dev/null
+++ b/src/mongo/dbtests/btreetests.cpp
@@ -0,0 +1,59 @@
+// btreetests.cpp : Btree unit tests
+//
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "../db/db.h"
+#include "../db/btree.h"
+
+#include "dbtests.h"
+
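+// The btree test suite itself lives in btreetests.inl and is written against
+// the unqualified names BtreeBucket, btree, btreemod and Continuation. By
+// #defining those names to concrete template instantiations and textually
+// #including the .inl inside a fresh namespace, the same tests run once per
+// btree version (V0, V1) and once more for the two-step insert path
+// (TESTTWOSTEP).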
+#define BtreeBucket BtreeBucket<V0>
+#define btree btree<V0>
+#define btreemod btreemod<V0>
+#define Continuation Continuation<V0>
+#define testName "btree"
+#define BTVERSION 0
+namespace BtreeTests0 {
+ #include "btreetests.inl"
+}
+
+#undef BtreeBucket
+#undef btree
+#undef btreemod
+#undef Continuation
+#define BtreeBucket BtreeBucket<V1>
+#define btree btree<V1>
+#define btreemod btreemod<V1>
+#define Continuation Continuation<V1>
+#undef testName
+#define testName "btree1"
+#undef BTVERSION
+#define BTVERSION 1
+namespace BtreeTests1 {
+ #include "btreetests.inl"
+}
+
+#undef testName
+#define testName "btree1_twostep"
+#define TESTTWOSTEP 1
+
+namespace BtreeTests2 {
+ #include "btreetests.inl"
+}
diff --git a/src/mongo/dbtests/btreetests.inl b/src/mongo/dbtests/btreetests.inl
new file mode 100644
index 00000000000..824313e6a54
--- /dev/null
+++ b/src/mongo/dbtests/btreetests.inl
@@ -0,0 +1,1713 @@
+ typedef BtreeBucket::_KeyNode _KeyNode;
+
+ const char* ns() {
+ return "unittests.btreetests";
+ }
+
+ // dummy, valid record loc
+ const DiskLoc recordLoc() {
+ return DiskLoc( 0, 2 );
+ }
+
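+ // Creates the test index using the btree version under test (BTVERSION) and
+ // drops the collection on teardown so each test starts with an empty tree.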
+ class Ensure {
+ public:
+ Ensure() {
+ _c.ensureIndex( ns(), BSON( "a" << 1 ), false, "testIndex",
+ false, // given two versions, not sure if caching (true) would mess us up...
+ false, BTVERSION);
+ }
+ ~Ensure() {
+ _c.dropCollection( ns() );
+ //_c.dropIndexes( ns() );
+ }
+ private:
+ DBDirectClient _c;
+ };
+
+ class Base : public Ensure {
+ public:
+ Base() :
+ _context( ns() ) {
+ {
+ bool f = false;
+ assert( f = true );
+ massert( 10402 , "assert is misdefined", f);
+ }
+ }
+ virtual ~Base() {}
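+ // Builds a len-character key value by cycling the 16-hex-digit representation
+ // of n, e.g. bigNumString( 0x30 ) repeats "0000000000000030" out to 800 chars.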
+ static string bigNumString( long long n, int len = 800 ) {
+ char sub[17];
+ sprintf( sub, "%.16llx", n );
+ string val( len, ' ' );
+ for( int i = 0; i < len; ++i ) {
+ val[ i ] = sub[ i % 16 ];
+ }
+ return val;
+ }
+ protected:
+ const BtreeBucket* bt() {
+ return id().head.btree();
+ }
+ DiskLoc dl() {
+ return id().head;
+ }
+ IndexDetails& id() {
+ NamespaceDetails *nsd = nsdetails( ns() );
+ assert( nsd );
+ return nsd->idx( 1 );
+ }
+ void checkValid( int nKeys ) {
+ ASSERT( bt() );
+ ASSERT( bt()->isHead() );
+ bt()->assertValid( order(), true );
+ ASSERT_EQUALS( nKeys, bt()->fullValidate( dl(), order(), 0, true ) );
+ }
+ void dump() {
+ bt()->dumpTree( dl(), order() );
+ }
+ void insert( BSONObj &key ) {
+ const BtreeBucket *b = bt();
+
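+ // The btree1_twostep suite defines TESTTWOSTEP and so exercises the two-phase
+ // insert path (twoStepInsert() followed by stepTwo()); the other suites use
+ // the ordinary single-call bt_insert().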
+#if defined(TESTTWOSTEP)
+ {
+ Continuation c(dl(), recordLoc(), key, Ordering::make(order()), id());
+ b->twoStepInsert(dl(), c, true);
+ c.stepTwo();
+ }
+#else
+ {
+ b->bt_insert( dl(), recordLoc(), key, Ordering::make(order()), true, id(), true );
+ }
+#endif
+ getDur().commitIfNeeded();
+ }
+ bool unindex( BSONObj &key ) {
+ getDur().commitIfNeeded();
+ return bt()->unindex( dl(), id(), key, recordLoc() );
+ }
+ static BSONObj simpleKey( char c, int n = 1 ) {
+ BSONObjBuilder builder;
+ string val( n, c );
+ builder.append( "a", val );
+ return builder.obj();
+ }
+ void locate( BSONObj &key, int expectedPos,
+ bool expectedFound, const DiskLoc &expectedLocation,
+ int direction = 1 ) {
+ int pos;
+ bool found;
+ DiskLoc location =
+ bt()->locate( id(), dl(), key, Ordering::make(order()), pos, found, recordLoc(), direction );
+ ASSERT_EQUALS( expectedFound, found );
+ ASSERT( location == expectedLocation );
+ ASSERT_EQUALS( expectedPos, pos );
+ }
+ bool present( BSONObj &key, int direction ) {
+ int pos;
+ bool found;
+ bt()->locate( id(), dl(), key, Ordering::make(order()), pos, found, recordLoc(), direction );
+ return found;
+ }
+ BSONObj order() {
+ return id().keyPattern();
+ }
+ const BtreeBucket *child( const BtreeBucket *b, int i ) {
+ assert( i <= b->nKeys() );
+ DiskLoc d;
+ if ( i == b->nKeys() ) {
+ d = b->getNextChild();
+ }
+ else {
+ d = b->keyNode( i ).prevChildBucket;
+ }
+ assert( !d.isNull() );
+ return d.btree();
+ }
+ void checkKey( char i ) {
+ stringstream ss;
+ ss << i;
+ checkKey( ss.str() );
+ }
+ void checkKey( const string &k ) {
+ BSONObj key = BSON( "" << k );
+// log() << "key: " << key << endl;
+ ASSERT( present( key, 1 ) );
+ ASSERT( present( key, -1 ) );
+ }
+ private:
+ dblock lk_;
+ Client::Context _context;
+ };
+
+ class Create : public Base {
+ public:
+ void run() {
+ checkValid( 0 );
+ }
+ };
+
+ class SimpleInsertDelete : public Base {
+ public:
+ void run() {
+ BSONObj key = simpleKey( 'z' );
+ insert( key );
+
+ checkValid( 1 );
+ locate( key, 0, true, dl() );
+
+ unindex( key );
+
+ checkValid( 0 );
+ locate( key, 0, false, DiskLoc() );
+ }
+ };
+
+ class SplitUnevenBucketBase : public Base {
+ public:
+ virtual ~SplitUnevenBucketBase() {}
+ void run() {
+ for ( int i = 0; i < 10; ++i ) {
+ BSONObj shortKey = simpleKey( shortToken( i ), 1 );
+ insert( shortKey );
+ BSONObj longKey = simpleKey( longToken( i ), 800 );
+ insert( longKey );
+ }
+ checkValid( 20 );
+ ASSERT_EQUALS( 1, bt()->nKeys() );
+ checkSplit();
+ }
+ protected:
+ virtual char shortToken( int i ) const = 0;
+ virtual char longToken( int i ) const = 0;
+ static char leftToken( int i ) {
+ return 'a' + i;
+ }
+ static char rightToken( int i ) {
+ return 'z' - i;
+ }
+ virtual void checkSplit() = 0;
+ };
+
+ class SplitRightHeavyBucket : public SplitUnevenBucketBase {
+ private:
+ virtual char shortToken( int i ) const {
+ return leftToken( i );
+ }
+ virtual char longToken( int i ) const {
+ return rightToken( i );
+ }
+ virtual void checkSplit() {
+ ASSERT_EQUALS( 15, child( bt(), 0 )->nKeys() );
+ ASSERT_EQUALS( 4, child( bt(), 1 )->nKeys() );
+ }
+ };
+
+ class SplitLeftHeavyBucket : public SplitUnevenBucketBase {
+ private:
+ virtual char shortToken( int i ) const {
+ return rightToken( i );
+ }
+ virtual char longToken( int i ) const {
+ return leftToken( i );
+ }
+ virtual void checkSplit() {
+ ASSERT_EQUALS( 4, child( bt(), 0 )->nKeys() );
+ ASSERT_EQUALS( 15, child( bt(), 1 )->nKeys() );
+ }
+ };
+
+ class MissingLocate : public Base {
+ public:
+ void run() {
+ for ( int i = 0; i < 3; ++i ) {
+ BSONObj k = simpleKey( 'b' + 2 * i );
+ insert( k );
+ }
+
+ locate( 1, 'a', 'b', dl() );
+ locate( 1, 'c', 'd', dl() );
+ locate( 1, 'e', 'f', dl() );
+ locate( 1, 'g', 'g' + 1, DiskLoc() ); // of course, 'h' isn't in the index.
+
+ // old behavior
+ // locate( -1, 'a', 'b', dl() );
+ // locate( -1, 'c', 'd', dl() );
+ // locate( -1, 'e', 'f', dl() );
+ // locate( -1, 'g', 'f', dl() );
+
+ locate( -1, 'a', 'a' - 1, DiskLoc() ); // of course, 'a' - 1 isn't in the index
+ locate( -1, 'c', 'b', dl() );
+ locate( -1, 'e', 'd', dl() );
+ locate( -1, 'g', 'f', dl() );
+ }
+ private:
+ void locate( int direction, char token, char expectedMatch,
+ DiskLoc expectedLocation ) {
+ BSONObj k = simpleKey( token );
+ int expectedPos = ( expectedMatch - 'b' ) / 2;
+ Base::locate( k, expectedPos, false, expectedLocation, direction );
+ }
+ };
+
+ class MissingLocateMultiBucket : public Base {
+ public:
+ void run() {
+ for ( int i = 0; i < 8; ++i ) {
+ insert( i );
+ }
+ insert( 9 );
+ insert( 8 );
+// dump();
+ BSONObj straddle = key( 'i' );
+ locate( straddle, 0, false, dl(), 1 );
+ straddle = key( 'k' );
+ locate( straddle, 0, false, dl(), -1 );
+ }
+ private:
+ BSONObj key( char c ) {
+ return simpleKey( c, 800 );
+ }
+ void insert( int i ) {
+ BSONObj k = key( 'b' + 2 * i );
+ Base::insert( k );
+ }
+ };
+
+ class SERVER983 : public Base {
+ public:
+ void run() {
+ for ( int i = 0; i < 10; ++i ) {
+ insert( i );
+ }
+// dump();
+ BSONObj straddle = key( 'o' );
+ locate( straddle, 0, false, dl(), 1 );
+ straddle = key( 'q' );
+ locate( straddle, 0, false, dl(), -1 );
+ }
+ private:
+ BSONObj key( char c ) {
+ return simpleKey( c, 800 );
+ }
+ void insert( int i ) {
+ BSONObj k = key( 'b' + 2 * i );
+ Base::insert( k );
+ }
+ };
+
+ class DontReuseUnused : public Base {
+ public:
+ void run() {
+ for ( int i = 0; i < 10; ++i ) {
+ insert( i );
+ }
+// dump();
+ BSONObj root = key( 'p' );
+ unindex( root );
+ Base::insert( root );
+ locate( root, 0, true, bt()->getNextChild(), 1 );
+ }
+ private:
+ BSONObj key( char c ) {
+ return simpleKey( c, 800 );
+ }
+ void insert( int i ) {
+ BSONObj k = key( 'b' + 2 * i );
+ Base::insert( k );
+ }
+ };
+
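+ // Unindexing a key that has children marks it unused rather than physically
+ // removing it. After emptying the index this way, reinsert some keys and
+ // check that the number of unused keys does not grow, i.e. that packing
+ // reclaims the unused slots.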
+ class PackUnused : public Base {
+ public:
+ void run() {
+ for ( long long i = 0; i < 1000000; i += 1000 ) {
+ insert( i );
+ }
+ string orig, after;
+ {
+ stringstream ss;
+ bt()->shape( ss );
+ orig = ss.str();
+ }
+ vector< string > toDel;
+ vector< string > other;
+ BSONObjBuilder start;
+ start.appendMinKey( "a" );
+ BSONObjBuilder end;
+ end.appendMaxKey( "a" );
+ auto_ptr< BtreeCursor > c( BtreeCursor::make( nsdetails( ns() ), 1, id(), start.done(), end.done(), false, 1 ) );
+ while( c->ok() ) {
+ if ( c->curKeyHasChild() ) {
+ toDel.push_back( c->currKey().firstElement().valuestr() );
+ }
+ else {
+ other.push_back( c->currKey().firstElement().valuestr() );
+ }
+ c->advance();
+ }
+ ASSERT( toDel.size() > 0 );
+ for( vector< string >::const_iterator i = toDel.begin(); i != toDel.end(); ++i ) {
+ BSONObj o = BSON( "a" << *i );
+ unindex( o );
+ }
+ ASSERT( other.size() > 0 );
+ for( vector< string >::const_iterator i = other.begin(); i != other.end(); ++i ) {
+ BSONObj o = BSON( "a" << *i );
+ unindex( o );
+ }
+
+ long long unused = 0;
+ ASSERT_EQUALS( 0, bt()->fullValidate( dl(), order(), &unused, true ) );
+
+ for ( long long i = 50000; i < 50100; ++i ) {
+ insert( i );
+ }
+
+ long long unused2 = 0;
+ ASSERT_EQUALS( 100, bt()->fullValidate( dl(), order(), &unused2, true ) );
+
+// log() << "old unused: " << unused << ", new unused: " << unused2 << endl;
+//
+ ASSERT( unused2 <= unused );
+ }
+ protected:
+ void insert( long long n ) {
+ string val = bigNumString( n );
+ BSONObj k = BSON( "a" << val );
+ Base::insert( k );
+ }
+ };
+
+ class DontDropReferenceKey : public PackUnused {
+ public:
+ void run() {
+ // with 80 keys the root node is full
+ for ( long long i = 0; i < 80; i += 1 ) {
+ insert( i );
+ }
+
+ BSONObjBuilder start;
+ start.appendMinKey( "a" );
+ BSONObjBuilder end;
+ end.appendMaxKey( "a" );
+ BSONObj l = bt()->keyNode( 0 ).key.toBson();
+ string toInsert;
+ auto_ptr< BtreeCursor > c( BtreeCursor::make( nsdetails( ns() ), 1, id(), start.done(), end.done(), false, 1 ) );
+ while( c->ok() ) {
+ if ( c->currKey().woCompare( l ) > 0 ) {
+ toInsert = c->currKey().firstElement().valuestr();
+ break;
+ }
+ c->advance();
+ }
+ // It would be too much work to make this state arise through inserts and
+ // deletes, so we intentionally manipulate the btree bucket directly here.
+ BtreeBucket::Loc* L = const_cast< BtreeBucket::Loc* >( &bt()->keyNode( 1 ).prevChildBucket );
+ getDur().writing(L)->Null();
+ getDur().writingInt( const_cast< BtreeBucket::Loc& >( bt()->keyNode( 1 ).recordLoc ).GETOFS() ) |= 1; // make unused
+ BSONObj k = BSON( "a" << toInsert );
+ Base::insert( k );
+ }
+ };
+
+ class MergeBuckets : public Base {
+ public:
+ virtual ~MergeBuckets() {}
+ void run() {
+ for ( int i = 0; i < 10; ++i ) {
+ insert( i );
+ }
+// dump();
+ string ns = id().indexNamespace();
+ ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords );
+ int expectedCount = 10 - unindexKeys();
+// dump();
+ ASSERT_EQUALS( 1, nsdetails( ns.c_str() )->stats.nrecords );
+ long long unused = 0;
+ ASSERT_EQUALS( expectedCount, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 0, unused );
+ }
+ protected:
+ BSONObj key( char c ) {
+ return simpleKey( c, 800 );
+ }
+ void insert( int i ) {
+ BSONObj k = key( 'b' + 2 * i );
+ Base::insert( k );
+ }
+ virtual int unindexKeys() = 0;
+ };
+
+ class MergeBucketsLeft : public MergeBuckets {
+ virtual int unindexKeys() {
+ BSONObj k = key( 'b' );
+ unindex( k );
+ k = key( 'b' + 2 );
+ unindex( k );
+ k = key( 'b' + 4 );
+ unindex( k );
+ k = key( 'b' + 6 );
+ unindex( k );
+ return 4;
+ }
+ };
+
+ class MergeBucketsRight : public MergeBuckets {
+ virtual int unindexKeys() {
+ BSONObj k = key( 'b' + 2 * 9 );
+ unindex( k );
+ return 1;
+ }
+ };
+
+ // deleting from head won't coalesce yet
+// class MergeBucketsHead : public MergeBuckets {
+// virtual BSONObj unindexKey() { return key( 'p' ); }
+// };
+
+ class MergeBucketsDontReplaceHead : public Base {
+ public:
+ void run() {
+ for ( int i = 0; i < 18; ++i ) {
+ insert( i );
+ }
+ // dump();
+ string ns = id().indexNamespace();
+ ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = key( 'a' + 17 );
+ unindex( k );
+ ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords );
+ long long unused = 0;
+ ASSERT_EQUALS( 17, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 0, unused );
+ }
+ private:
+ BSONObj key( char c ) {
+ return simpleKey( c, 800 );
+ }
+ void insert( int i ) {
+ BSONObj k = key( 'a' + i );
+ Base::insert( k );
+ }
+ };
+
+ // Tool to construct custom trees for tests.
+ class ArtificialTree : public BtreeBucket {
+ public:
+ void push( const BSONObj &key, const DiskLoc &child ) {
+ KeyOwned k(key);
+ pushBack( dummyDiskLoc(), k, Ordering::make( BSON( "a" << 1 ) ), child );
+ }
+ void setNext( const DiskLoc &child ) {
+ nextChild = child;
+ }
+ static DiskLoc make( IndexDetails &id ) {
+ DiskLoc ret = addBucket( id );
+ is( ret )->init();
+ getDur().commitIfNeeded();
+ return ret;
+ }
+ static ArtificialTree *is( const DiskLoc &l ) {
+ return static_cast< ArtificialTree * >( l.btreemod() );
+ }
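+ // Tree specs are JSON-like: each field name is a key ('$hex' and '$hex$len'
+ // forms are expanded by expectedKey() below), an Object value is that key's
+ // left child bucket, null means no child, and the special field '_' names the
+ // bucket's rightmost child.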
+ static DiskLoc makeTree( const string &spec, IndexDetails &id ) {
+ return makeTree( fromjson( spec ), id );
+ }
+ static DiskLoc makeTree( const BSONObj &spec, IndexDetails &id ) {
+ DiskLoc node = make( id );
+ ArtificialTree *n = ArtificialTree::is( node );
+ BSONObjIterator i( spec );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ DiskLoc child;
+ if ( e.type() == Object ) {
+ child = makeTree( e.embeddedObject(), id );
+ }
+ if ( e.fieldName() == string( "_" ) ) {
+ n->setNext( child );
+ }
+ else {
+ n->push( BSON( "" << expectedKey( e.fieldName() ) ), child );
+ }
+ }
+ n->fixParentPtrs( node );
+ return node;
+ }
+ static void setTree( const string &spec, IndexDetails &id ) {
+ set( makeTree( spec, id ), id );
+ }
+ static void set( const DiskLoc &l, IndexDetails &id ) {
+ ArtificialTree::is( id.head )->deallocBucket( id.head, id );
+ getDur().writingDiskLoc(id.head) = l;
+ }
+ static string expectedKey( const char *spec ) {
+ if ( spec[ 0 ] != '$' ) {
+ return spec;
+ }
+ char *endPtr;
+ // parsing a long long is a pain, so just allow shorter keys for now
+ unsigned long long num = strtol( spec + 1, &endPtr, 16 );
+ int len = 800;
+ if( *endPtr == '$' ) {
+ len = strtol( endPtr + 1, 0, 16 );
+ }
+ return Base::bigNumString( num, len );
+ }
+ static void checkStructure( const BSONObj &spec, const IndexDetails &id, const DiskLoc node ) {
+ ArtificialTree *n = ArtificialTree::is( node );
+ BSONObjIterator j( spec );
+ for( int i = 0; i < n->n; ++i ) {
+ ASSERT( j.more() );
+ BSONElement e = j.next();
+ KeyNode kn = n->keyNode( i );
+ string expected = expectedKey( e.fieldName() );
+ ASSERT( present( id, BSON( "" << expected ), 1 ) );
+ ASSERT( present( id, BSON( "" << expected ), -1 ) );
+ ASSERT_EQUALS( expected, kn.key.toBson().firstElement().valuestr() );
+ if ( kn.prevChildBucket.isNull() ) {
+ ASSERT( e.type() == jstNULL );
+ }
+ else {
+ ASSERT( e.type() == Object );
+ checkStructure( e.embeddedObject(), id, kn.prevChildBucket );
+ }
+ }
+ if ( n->nextChild.isNull() ) {
+ // maybe should allow '_' field with null value?
+ ASSERT( !j.more() );
+ }
+ else {
+ BSONElement e = j.next();
+ ASSERT_EQUALS( string( "_" ), e.fieldName() );
+ ASSERT( e.type() == Object );
+ checkStructure( e.embeddedObject(), id, n->nextChild );
+ }
+ ASSERT( !j.more() );
+ }
+ static void checkStructure( const string &spec, const IndexDetails &id ) {
+ checkStructure( fromjson( spec ), id, id.head );
+ }
+ static bool present( const IndexDetails &id, const BSONObj &key, int direction ) {
+ int pos;
+ bool found;
+ id.head.btree()->locate( id, id.head, key, Ordering::make(id.keyPattern()), pos, found, recordLoc(), direction );
+ return found;
+ }
+ int headerSize() const { return BtreeBucket::headerSize(); }
+ int packedDataSize( int pos ) const { return BtreeBucket::packedDataSize( pos ); }
+ void fixParentPtrs( const DiskLoc &thisLoc ) { BtreeBucket::fixParentPtrs( thisLoc ); }
+ void forcePack() {
+ topSize += emptySize;
+ emptySize = 0;
+ setNotPacked();
+ }
+ private:
+ DiskLoc dummyDiskLoc() const { return DiskLoc( 0, 2 ); }
+ };
+
+ /**
+ * We could probably refactor the following tests, but it's easier to debug
+ * them in the present state.
+ */
+
+ class MergeBucketsDelInternal : public Base {
+ public:
+ void run() {
+ ArtificialTree::setTree( "{d:{b:{a:null},bb:null,_:{c:null}},_:{f:{e:null},_:{g:null}}}", id() );
+// dump();
+ string ns = id().indexNamespace();
+ ASSERT_EQUALS( 8, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 7, nsdetails( ns.c_str() )->stats.nrecords );
+
+ BSONObj k = BSON( "" << "bb" );
+ assert( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 7, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 5, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{b:{a:null},d:{c:null},f:{e:null},_:{g:null}}", id() );
+ }
+ };
+
+ class MergeBucketsRightNull : public Base {
+ public:
+ void run() {
+ ArtificialTree::setTree( "{d:{b:{a:null},bb:null,cc:{c:null}},_:{f:{e:null},h:{g:null}}}", id() );
+// dump();
+ string ns = id().indexNamespace();
+ ASSERT_EQUALS( 10, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 7, nsdetails( ns.c_str() )->stats.nrecords );
+
+ BSONObj k = BSON( "" << "bb" );
+ assert( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 9, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 5, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}", id() );
+ }
+ };
+
+ // not yet handling this case
+ class DontMergeSingleBucket : public Base {
+ public:
+ void run() {
+ ArtificialTree::setTree( "{d:{b:{a:null},c:null}}", id() );
+// dump();
+ string ns = id().indexNamespace();
+ ASSERT_EQUALS( 4, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << "c" );
+ assert( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 3, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{d:{b:{a:null}}}", id() );
+ }
+ };
+
+ class ParentMergeNonRightToLeft : public Base {
+ public:
+ void run() {
+ ArtificialTree::setTree( "{d:{b:{a:null},bb:null,cc:{c:null}},i:{f:{e:null},h:{g:null}}}", id() );
+// dump();
+ string ns = id().indexNamespace();
+ ASSERT_EQUALS( 11, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 7, nsdetails( ns.c_str() )->stats.nrecords );
+
+ BSONObj k = BSON( "" << "bb" );
+ assert( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 10, bt()->fullValidate( dl(), order(), 0, true ) );
+ // child does not currently replace parent in this case
+ ASSERT_EQUALS( 6, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{i:{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}}", id() );
+ }
+ };
+
+ class ParentMergeNonRightToRight : public Base {
+ public:
+ void run() {
+ ArtificialTree::setTree( "{d:{b:{a:null},cc:{c:null}},i:{f:{e:null},ff:null,h:{g:null}}}", id() );
+// dump();
+ string ns = id().indexNamespace();
+ ASSERT_EQUALS( 11, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 7, nsdetails( ns.c_str() )->stats.nrecords );
+
+ BSONObj k = BSON( "" << "ff" );
+ assert( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 10, bt()->fullValidate( dl(), order(), 0, true ) );
+ // child does not currently replace parent in this case
+ ASSERT_EQUALS( 6, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{i:{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}}", id() );
+ }
+ };
+
+ class CantMergeRightNoMerge : public Base {
+ public:
+ void run() {
+ ArtificialTree::setTree( "{d:{b:{a:null},bb:null,cc:{c:null}},dd:null,_:{f:{e:null},h:{g:null}}}", id() );
+// dump();
+ string ns = id().indexNamespace();
+ ASSERT_EQUALS( 11, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 7, nsdetails( ns.c_str() )->stats.nrecords );
+
+ BSONObj k = BSON( "" << "bb" );
+ assert( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 10, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 7, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{d:{b:{a:null},cc:{c:null}},dd:null,_:{f:{e:null},h:{g:null}}}", id() );
+ }
+ };
+
+ class CantMergeLeftNoMerge : public Base {
+ public:
+ void run() {
+ ArtificialTree::setTree( "{c:{b:{a:null}},d:null,_:{f:{e:null},g:null}}", id() );
+// dump();
+ string ns = id().indexNamespace();
+ ASSERT_EQUALS( 7, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 5, nsdetails( ns.c_str() )->stats.nrecords );
+
+ BSONObj k = BSON( "" << "g" );
+ assert( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 6, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 5, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{c:{b:{a:null}},d:null,_:{f:{e:null}}}", id() );
+ }
+ };
+
+ class MergeOption : public Base {
+ public:
+ void run() {
+ ArtificialTree::setTree( "{c:{b:{a:null}},f:{e:{d:null},ee:null},_:{h:{g:null}}}", id() );
+// dump();
+ string ns = id().indexNamespace();
+ ASSERT_EQUALS( 9, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 7, nsdetails( ns.c_str() )->stats.nrecords );
+
+ BSONObj k = BSON( "" << "ee" );
+ assert( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 8, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 6, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{c:{b:{a:null}},_:{e:{d:null},f:null,h:{g:null}}}", id() );
+ }
+ };
+
+ class ForceMergeLeft : public Base {
+ public:
+ void run() {
+ ArtificialTree::setTree( "{c:{b:{a:null}},f:{e:{d:null},ee:null},ff:null,_:{h:{g:null}}}", id() );
+// dump();
+ string ns = id().indexNamespace();
+ ASSERT_EQUALS( 10, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 7, nsdetails( ns.c_str() )->stats.nrecords );
+
+ BSONObj k = BSON( "" << "ee" );
+ assert( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 9, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 6, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{f:{b:{a:null},c:null,e:{d:null}},ff:null,_:{h:{g:null}}}", id() );
+ }
+ };
+
+ class ForceMergeRight : public Base {
+ public:
+ void run() {
+ ArtificialTree::setTree( "{c:{b:{a:null}},cc:null,f:{e:{d:null},ee:null},_:{h:{g:null}}}", id() );
+// dump();
+ string ns = id().indexNamespace();
+ ASSERT_EQUALS( 10, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 7, nsdetails( ns.c_str() )->stats.nrecords );
+
+ BSONObj k = BSON( "" << "ee" );
+ assert( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 9, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 6, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{c:{b:{a:null}},cc:null,_:{e:{d:null},f:null,h:{g:null}}}", id() );
+ }
+ };
+
+ class RecursiveMerge : public Base {
+ public:
+ void run() {
+ ArtificialTree::setTree( "{h:{e:{b:{a:null},c:null,d:null},g:{f:null}},j:{i:null}}", id() );
+// dump();
+ string ns = id().indexNamespace();
+ ASSERT_EQUALS( 10, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 6, nsdetails( ns.c_str() )->stats.nrecords );
+
+ BSONObj k = BSON( "" << "c" );
+ assert( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 9, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords );
+ // height is not currently reduced in this case
+ ArtificialTree::checkStructure( "{j:{g:{b:{a:null},d:null,e:null,f:null},h:null,i:null}}", id() );
+ }
+ };
+
+ class RecursiveMergeRightBucket : public Base {
+ public:
+ void run() {
+ ArtificialTree::setTree( "{h:{e:{b:{a:null},c:null,d:null},g:{f:null}},_:{i:null}}", id() );
+// dump();
+ string ns = id().indexNamespace();
+ ASSERT_EQUALS( 9, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 6, nsdetails( ns.c_str() )->stats.nrecords );
+
+ BSONObj k = BSON( "" << "c" );
+ assert( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 8, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{g:{b:{a:null},d:null,e:null,f:null},h:null,i:null}", id() );
+ }
+ };
+
+ class RecursiveMergeDoubleRightBucket : public Base {
+ public:
+ void run() {
+ ArtificialTree::setTree( "{h:{e:{b:{a:null},c:null,d:null},_:{f:null}},_:{i:null}}", id() );
+ string ns = id().indexNamespace();
+ ASSERT_EQUALS( 8, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 6, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << "c" );
+ assert( unindex( k ) );
+ long long keyCount = bt()->fullValidate( dl(), order(), 0, true );
+ ASSERT_EQUALS( 7, keyCount );
+ ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords );
+ // no recursion currently in this case
+ ArtificialTree::checkStructure( "{h:{b:{a:null},d:null,e:null,f:null},_:{i:null}}", id() );
+ }
+ };
+
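+ // MergeSizeBase builds a root holding a single separator key and fills the two
+ // child buckets to the exact byte sizes supplied by subclasses, then deletes
+ // the filler keys and checks whether the children merged back into one bucket
+ // (merge() states whether a merge is expected at those sizes).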
+ class MergeSizeBase : public Base {
+ public:
+ MergeSizeBase() : _count() {}
+ virtual ~MergeSizeBase() {}
+ void run() {
+ typedef ArtificialTree A;
+ A::set( A::make( id() ), id() );
+ A* root = A::is( dl() );
+ DiskLoc left = A::make( id() );
+ root->push( biggestKey( 'm' ), left );
+ _count = 1;
+ A* l = A::is( left );
+ DiskLoc right = A::make( id() );
+ root->setNext( right );
+ A* r = A::is( right );
+ root->fixParentPtrs( dl() );
+
+ //ASSERT_EQUALS( bigSize(), bigSize() / 2 * 2 );
+ fillToExactSize( l, leftSize(), 'a' );
+ fillToExactSize( r, rightSize(), 'n' );
+ ASSERT( leftAdditional() <= 2 );
+ if ( leftAdditional() >= 2 ) {
+ l->push( bigKey( 'k' ), DiskLoc() );
+ }
+ if ( leftAdditional() >= 1 ) {
+ l->push( bigKey( 'l' ), DiskLoc() );
+ }
+ ASSERT( rightAdditional() <= 2 );
+ if ( rightAdditional() >= 2 ) {
+ r->push( bigKey( 'y' ), DiskLoc() );
+ }
+ if ( rightAdditional() >= 1 ) {
+ r->push( bigKey( 'z' ), DiskLoc() );
+ }
+ _count += leftAdditional() + rightAdditional();
+
+// dump();
+
+ initCheck();
+ string ns = id().indexNamespace();
+ const char *keys = delKeys();
+ for( const char *i = keys; *i; ++i ) {
+ long long unused = 0;
+ ASSERT_EQUALS( _count, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 0, unused );
+ ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = bigKey( *i );
+ unindex( k );
+// dump();
+ --_count;
+ }
+
+// dump();
+
+ long long unused = 0;
+ ASSERT_EQUALS( _count, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 0, unused );
+ validate();
+ if ( !merge() ) {
+ ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords );
+ }
+ else {
+ ASSERT_EQUALS( 1, nsdetails( ns.c_str() )->stats.nrecords );
+ }
+ }
+ protected:
+ virtual int leftAdditional() const { return 2; }
+ virtual int rightAdditional() const { return 2; }
+ virtual void initCheck() {}
+ virtual void validate() {}
+ virtual int leftSize() const = 0;
+ virtual int rightSize() const = 0;
+ virtual const char * delKeys() const { return "klyz"; }
+ virtual bool merge() const { return true; }
+ void fillToExactSize( ArtificialTree *t, int targetSize, char startKey ) {
+ int size = 0;
+ while( size < targetSize ) {
+ int space = targetSize - size;
+ int nextSize = space - sizeof( _KeyNode );
+ assert( nextSize > 0 );
+ BSONObj newKey = key( startKey++, nextSize );
+ t->push( newKey, DiskLoc() );
+ size += BtreeBucket::KeyOwned(newKey).dataSize() + sizeof( _KeyNode );
+ _count += 1;
+ }
+ if( t->packedDataSize( 0 ) != targetSize ) {
+ ASSERT_EQUALS( t->packedDataSize( 0 ), targetSize );
+ }
+ }
+ static BSONObj key( char a, int size ) {
+ if ( size >= bigSize() ) {
+ return bigKey( a );
+ }
+ return simpleKey( a, size - ( bigSize() - 801 ) );
+ }
+ static BSONObj bigKey( char a ) {
+ return simpleKey( a, 801 );
+ }
+ static BSONObj biggestKey( char a ) {
+ int size = BtreeBucket::getKeyMax() - bigSize() + 801;
+ return simpleKey( a, size );
+ }
+ static int bigSize() {
+ return BtreeBucket::KeyOwned(bigKey( 'a' )).dataSize();
+ }
+ static int biggestSize() {
+ return BtreeBucket::KeyOwned(biggestKey( 'a' )).dataSize();
+ }
+ int _count;
+ };
+
+ class MergeSizeJustRightRight : public MergeSizeBase {
+ protected:
+ virtual int rightSize() const { return BtreeBucket::lowWaterMark() - 1; }
+ virtual int leftSize() const { return BtreeBucket::bodySize() - biggestSize() - sizeof( _KeyNode ) - ( BtreeBucket::lowWaterMark() - 1 ); }
+ };
+
+ class MergeSizeJustRightLeft : public MergeSizeBase {
+ protected:
+ virtual int leftSize() const { return BtreeBucket::lowWaterMark() - 1; }
+ virtual int rightSize() const { return BtreeBucket::bodySize() - biggestSize() - sizeof( _KeyNode ) - ( BtreeBucket::lowWaterMark() - 1 ); }
+ virtual const char * delKeys() const { return "yzkl"; }
+ };
+
+ class MergeSizeRight : public MergeSizeJustRightRight {
+ virtual int rightSize() const { return MergeSizeJustRightRight::rightSize() - 1; }
+ virtual int leftSize() const { return MergeSizeJustRightRight::leftSize() + 1; }
+ };
+
+ class MergeSizeLeft : public MergeSizeJustRightLeft {
+ virtual int rightSize() const { return MergeSizeJustRightLeft::rightSize() + 1; }
+ virtual int leftSize() const { return MergeSizeJustRightLeft::leftSize() - 1; }
+ };
+
+ class NoMergeBelowMarkRight : public MergeSizeJustRightRight {
+ virtual int rightSize() const { return MergeSizeJustRightRight::rightSize() + 1; }
+ virtual int leftSize() const { return MergeSizeJustRightRight::leftSize() - 1; }
+ virtual bool merge() const { return false; }
+ };
+
+ class NoMergeBelowMarkLeft : public MergeSizeJustRightLeft {
+ virtual int rightSize() const { return MergeSizeJustRightLeft::rightSize() - 1; }
+ virtual int leftSize() const { return MergeSizeJustRightLeft::leftSize() + 1; }
+ virtual bool merge() const { return false; }
+ };
+
+ class MergeSizeRightTooBig : public MergeSizeJustRightLeft {
+ virtual int rightSize() const { return MergeSizeJustRightLeft::rightSize() + 1; }
+ virtual bool merge() const { return false; }
+ };
+
+ class MergeSizeLeftTooBig : public MergeSizeJustRightRight {
+ virtual int leftSize() const { return MergeSizeJustRightRight::leftSize() + 1; }
+ virtual bool merge() const { return false; }
+ };
+
+ class BalanceOneLeftToRight : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},b:{$20:null,$30:null,$40:null,$50:null,a:null},_:{c:null}}", id() );
+ ASSERT_EQUALS( 14, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << bigNumString( 0x40 ) );
+// dump();
+ ASSERT( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 13, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{$6:{$1:null,$2:null,$3:null,$4:null,$5:null},b:{$10:null,$20:null,$30:null,$50:null,a:null},_:{c:null}}", id() );
+ }
+ };
+
+ class BalanceOneRightToLeft : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{$10:{$1:null,$2:null,$3:null,$4:null},b:{$20:null,$30:null,$40:null,$50:null,$60:null,$70:null},_:{c:null}}", id() );
+ ASSERT_EQUALS( 13, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << bigNumString( 0x3 ) );
+// dump();
+ ASSERT( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 12, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{$20:{$1:null,$2:null,$4:null,$10:null},b:{$30:null,$40:null,$50:null,$60:null,$70:null},_:{c:null}}", id() );
+ }
+ };
+
+ class BalanceThreeLeftToRight : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{$20:{$1:{$0:null},$3:{$2:null},$5:{$4:null},$7:{$6:null},$9:{$8:null},$11:{$10:null},$13:{$12:null},_:{$14:null}},b:{$30:null,$40:{$35:null},$50:{$45:null}},_:{c:null}}", id() );
+ ASSERT_EQUALS( 23, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 14, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << bigNumString( 0x30 ) );
+ // dump();
+ ASSERT( unindex( k ) );
+ // dump();
+ ASSERT_EQUALS( 22, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 14, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{$9:{$1:{$0:null},$3:{$2:null},$5:{$4:null},$7:{$6:null},_:{$8:null}},b:{$11:{$10:null},$13:{$12:null},$20:{$14:null},$40:{$35:null},$50:{$45:null}},_:{c:null}}", id() );
+ }
+ };
+
+ class BalanceThreeRightToLeft : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{$20:{$1:{$0:null},$3:{$2:null},$5:null,_:{$14:null}},b:{$30:{$25:null},$40:{$35:null},$50:{$45:null},$60:{$55:null},$70:{$65:null},$80:{$75:null},$90:{$85:null},$100:{$95:null}},_:{c:null}}", id() );
+ ASSERT_EQUALS( 25, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 15, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << bigNumString( 0x5 ) );
+// dump();
+ ASSERT( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 24, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 15, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{$50:{$1:{$0:null},$3:{$2:null},$20:{$14:null},$30:{$25:null},$40:{$35:null},_:{$45:null}},b:{$60:{$55:null},$70:{$65:null},$80:{$75:null},$90:{$85:null},$100:{$95:null}},_:{c:null}}", id() );
+ }
+ };
+
+ class BalanceSingleParentKey : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},_:{$20:null,$30:null,$40:null,$50:null,a:null}}", id() );
+ ASSERT_EQUALS( 12, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << bigNumString( 0x40 ) );
+// dump();
+ ASSERT( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 11, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{$6:{$1:null,$2:null,$3:null,$4:null,$5:null},_:{$10:null,$20:null,$30:null,$50:null,a:null}}", id() );
+ }
+ };
+
+ class PackEmpty : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{a:null}", id() );
+ BSONObj k = BSON( "" << "a" );
+ ASSERT( unindex( k ) );
+ ArtificialTree *t = ArtificialTree::is( dl() );
+ t->forcePack();
+ Tester::checkEmpty( t, id() );
+ }
+ class Tester : public ArtificialTree {
+ public:
+ static void checkEmpty( ArtificialTree *a, const IndexDetails &id ) {
+ Tester *t = static_cast< Tester * >( a );
+ ASSERT_EQUALS( 0, t->n );
+ ASSERT( !( t->flags & Packed ) );
+ Ordering o = Ordering::make( id.keyPattern() );
+ int zero = 0;
+ t->_packReadyForMod( o, zero );
+ ASSERT_EQUALS( 0, t->n );
+ ASSERT_EQUALS( 0, t->topSize );
+ ASSERT_EQUALS( BtreeBucket::bodySize(), t->emptySize );
+ ASSERT( t->flags & Packed );
+ }
+ };
+ };
+
+ class PackedDataSizeEmpty : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{a:null}", id() );
+ BSONObj k = BSON( "" << "a" );
+ ASSERT( unindex( k ) );
+ ArtificialTree *t = ArtificialTree::is( dl() );
+ t->forcePack();
+ Tester::checkEmpty( t, id() );
+ }
+ class Tester : public ArtificialTree {
+ public:
+ static void checkEmpty( ArtificialTree *a, const IndexDetails &id ) {
+ Tester *t = static_cast< Tester * >( a );
+ ASSERT_EQUALS( 0, t->n );
+ ASSERT( !( t->flags & Packed ) );
+ int zero = 0;
+ ASSERT_EQUALS( 0, t->packedDataSize( zero ) );
+ ASSERT( !( t->flags & Packed ) );
+ }
+ };
+ };
+
+ class BalanceSingleParentKeyPackParent : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},_:{$20:null,$30:null,$40:null,$50:null,a:null}}", id() );
+ ASSERT_EQUALS( 12, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords );
+ // force parent pack
+ ArtificialTree::is( dl() )->forcePack();
+ BSONObj k = BSON( "" << bigNumString( 0x40 ) );
+// dump();
+ ASSERT( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 11, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{$6:{$1:null,$2:null,$3:null,$4:null,$5:null},_:{$10:null,$20:null,$30:null,$50:null,a:null}}", id() );
+ }
+ };
+
+ class BalanceSplitParent : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{$10$10:{$1:null,$2:null,$3:null,$4:null},$100:{$20:null,$30:null,$40:null,$50:null,$60:null,$70:null,$80:null},$200:null,$300:null,$400:null,$500:null,$600:null,$700:null,$800:null,$900:null,_:{c:null}}", id() );
+ ASSERT_EQUALS( 22, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << bigNumString( 0x3 ) );
+// dump();
+ ASSERT( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 21, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 6, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{$500:{$30:{$1:null,$2:null,$4:null,$10$10:null,$20:null},$100:{$40:null,$50:null,$60:null,$70:null,$80:null},$200:null,$300:null,$400:null},_:{$600:null,$700:null,$800:null,$900:null,_:{c:null}}}", id() );
+ }
+ };
+
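+ // These cases check rebalancedSeparatorPos(), i.e. which key index would be
+ // promoted to the parent as the new separator when two sibling buckets are
+ // rebalanced.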
+ class RebalancedSeparatorBase : public Base {
+ public:
+ void run() {
+ ArtificialTree::setTree( treeSpec(), id() );
+ modTree();
+ Tester::checkSeparator( id(), expectedSeparator() );
+ }
+ virtual string treeSpec() const = 0;
+ virtual int expectedSeparator() const = 0;
+ virtual void modTree() {}
+ struct Tester : public ArtificialTree {
+ static void checkSeparator( const IndexDetails& id, int expected ) {
+ ASSERT_EQUALS( expected, static_cast< Tester * >( id.head.btreemod() )->rebalancedSeparatorPos( id.head, 0 ) );
+ }
+ };
+ };
+
+ class EvenRebalanceLeft : public RebalancedSeparatorBase {
+ virtual string treeSpec() const { return "{$7:{$1:null,$2$31f:null,$3:null,$4$31f:null,$5:null,$6:null},_:{$8:null,$9:null,$10$31e:null}}"; }
+ virtual int expectedSeparator() const { return 4; }
+ };
+
+ class EvenRebalanceLeftCusp : public RebalancedSeparatorBase {
+ virtual string treeSpec() const { return "{$6:{$1:null,$2$31f:null,$3:null,$4$31f:null,$5:null},_:{$7:null,$8:null,$9$31e:null,$10:null}}"; }
+ virtual int expectedSeparator() const { return 4; }
+ };
+
+ class EvenRebalanceRight : public RebalancedSeparatorBase {
+ virtual string treeSpec() const { return "{$3:{$1:null,$2$31f:null},_:{$4$31f:null,$5:null,$6:null,$7:null,$8$31e:null,$9:null,$10:null}}"; }
+ virtual int expectedSeparator() const { return 4; }
+ };
+
+ class EvenRebalanceRightCusp : public RebalancedSeparatorBase {
+ virtual string treeSpec() const { return "{$4$31f:{$1:null,$2$31f:null,$3:null},_:{$5:null,$6:null,$7$31e:null,$8:null,$9:null,$10:null}}"; }
+ virtual int expectedSeparator() const { return 4; }
+ };
+
+ class EvenRebalanceCenter : public RebalancedSeparatorBase {
+ virtual string treeSpec() const { return "{$5:{$1:null,$2$31f:null,$3:null,$4$31f:null},_:{$6:null,$7$31e:null,$8:null,$9:null,$10:null}}"; }
+ virtual int expectedSeparator() const { return 4; }
+ };
+
+ class OddRebalanceLeft : public RebalancedSeparatorBase {
+ virtual string treeSpec() const { return "{$6$31f:{$1:null,$2:null,$3:null,$4:null,$5:null},_:{$7:null,$8:null,$9:null,$10:null}}"; }
+ virtual int expectedSeparator() const { return 4; }
+ };
+
+ class OddRebalanceRight : public RebalancedSeparatorBase {
+ virtual string treeSpec() const { return "{$4:{$1:null,$2:null,$3:null},_:{$5:null,$6:null,$7:null,$8$31f:null,$9:null,$10:null}}"; }
+ virtual int expectedSeparator() const { return 4; }
+ };
+
+ class OddRebalanceCenter : public RebalancedSeparatorBase {
+ virtual string treeSpec() const { return "{$5:{$1:null,$2:null,$3:null,$4:null},_:{$6:null,$7:null,$8:null,$9:null,$10$31f:null}}"; }
+ virtual int expectedSeparator() const { return 4; }
+ };
+
+ class RebalanceEmptyRight : public RebalancedSeparatorBase {
+ virtual string treeSpec() const { return "{$a:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null,$7:null,$8:null,$9:null},_:{$b:null}}"; }
+ virtual void modTree() {
+ BSONObj k = BSON( "" << bigNumString( 0xb ) );
+ ASSERT( unindex( k ) );
+ }
+ virtual int expectedSeparator() const { return 4; }
+ };
+
+ class RebalanceEmptyLeft : public RebalancedSeparatorBase {
+ virtual string treeSpec() const { return "{$a:{$1:null},_:{$11:null,$12:null,$13:null,$14:null,$15:null,$16:null,$17:null,$18:null,$19:null}}"; }
+ virtual void modTree() {
+ BSONObj k = BSON( "" << bigNumString( 0x1 ) );
+ ASSERT( unindex( k ) );
+ }
+ virtual int expectedSeparator() const { return 4; }
+ };
+
+ class NoMoveAtLowWaterMarkRight : public MergeSizeJustRightRight {
+ virtual int rightSize() const { return MergeSizeJustRightRight::rightSize() + 1; }
+ virtual void initCheck() { _oldTop = bt()->keyNode( 0 ).key.toBson(); }
+ virtual void validate() { ASSERT_EQUALS( _oldTop, bt()->keyNode( 0 ).key.toBson() ); }
+ virtual bool merge() const { return false; }
+ protected:
+ BSONObj _oldTop;
+ };
+
+ class MoveBelowLowWaterMarkRight : public NoMoveAtLowWaterMarkRight {
+ virtual int rightSize() const { return MergeSizeJustRightRight::rightSize(); }
+ virtual int leftSize() const { return MergeSizeJustRightRight::leftSize() + 1; }
+ // different top means we rebalanced
+ virtual void validate() { ASSERT( !( _oldTop == bt()->keyNode( 0 ).key.toBson() ) ); }
+ };
+
+ class NoMoveAtLowWaterMarkLeft : public MergeSizeJustRightLeft {
+ virtual int leftSize() const { return MergeSizeJustRightLeft::leftSize() + 1; }
+ virtual void initCheck() { _oldTop = bt()->keyNode( 0 ).key.toBson(); }
+ virtual void validate() { ASSERT_EQUALS( _oldTop, bt()->keyNode( 0 ).key.toBson() ); }
+ virtual bool merge() const { return false; }
+ protected:
+ BSONObj _oldTop;
+ };
+
+ class MoveBelowLowWaterMarkLeft : public NoMoveAtLowWaterMarkLeft {
+ virtual int leftSize() const { return MergeSizeJustRightLeft::leftSize(); }
+ virtual int rightSize() const { return MergeSizeJustRightLeft::rightSize() + 1; }
+ // different top means we rebalanced
+ virtual void validate() { ASSERT( !( _oldTop == bt()->keyNode( 0 ).key.toBson() ) ); }
+ };
+
+ class PreferBalanceLeft : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},$20:{$11:null,$12:null,$13:null,$14:null},_:{$30:null}}", id() );
+ ASSERT_EQUALS( 13, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << bigNumString( 0x12 ) );
+// dump();
+ ASSERT( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 12, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{$5:{$1:null,$2:null,$3:null,$4:null},$20:{$6:null,$10:null,$11:null,$13:null,$14:null},_:{$30:null}}", id() );
+ }
+ };
+
+ class PreferBalanceRight : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{$10:{$1:null},$20:{$11:null,$12:null,$13:null,$14:null},_:{$31:null,$32:null,$33:null,$34:null,$35:null,$36:null}}", id() );
+ ASSERT_EQUALS( 13, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << bigNumString( 0x12 ) );
+ // dump();
+ ASSERT( unindex( k ) );
+ // dump();
+ ASSERT_EQUALS( 12, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{$10:{$1:null},$31:{$11:null,$13:null,$14:null,$20:null},_:{$32:null,$33:null,$34:null,$35:null,$36:null}}", id() );
+ }
+ };
+
+ class RecursiveMergeThenBalance : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{$10:{$5:{$1:null,$2:null},$8:{$6:null,$7:null}},_:{$20:null,$30:null,$40:null,$50:null,$60:null,$70:null,$80:null,$90:null}}", id() );
+ ASSERT_EQUALS( 15, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 5, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << bigNumString( 0x7 ) );
+ // dump();
+ ASSERT( unindex( k ) );
+ // dump();
+ ASSERT_EQUALS( 14, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{$40:{$8:{$1:null,$2:null,$5:null,$6:null},$10:null,$20:null,$30:null},_:{$50:null,$60:null,$70:null,$80:null,$90:null}}", id() );
+ }
+ };
+
+ class MergeRightEmpty : public MergeSizeBase {
+ protected:
+ virtual int rightAdditional() const { return 1; }
+ virtual int leftAdditional() const { return 1; }
+ virtual const char * delKeys() const { return "lz"; }
+ virtual int rightSize() const { return 0; }
+ virtual int leftSize() const { return BtreeBucket::bodySize() - biggestSize() - sizeof( _KeyNode ); }
+ };
+
+ class MergeMinRightEmpty : public MergeSizeBase {
+ protected:
+ virtual int rightAdditional() const { return 1; }
+ virtual int leftAdditional() const { return 0; }
+ virtual const char * delKeys() const { return "z"; }
+ virtual int rightSize() const { return 0; }
+ virtual int leftSize() const { return bigSize() + sizeof( _KeyNode ); }
+ };
+
+ class MergeLeftEmpty : public MergeSizeBase {
+ protected:
+ virtual int rightAdditional() const { return 1; }
+ virtual int leftAdditional() const { return 1; }
+ virtual const char * delKeys() const { return "zl"; }
+ virtual int leftSize() const { return 0; }
+ virtual int rightSize() const { return BtreeBucket::bodySize() - biggestSize() - sizeof( _KeyNode ); }
+ };
+
+ class MergeMinLeftEmpty : public MergeSizeBase {
+ protected:
+ virtual int leftAdditional() const { return 1; }
+ virtual int rightAdditional() const { return 0; }
+ virtual const char * delKeys() const { return "l"; }
+ virtual int leftSize() const { return 0; }
+ virtual int rightSize() const { return bigSize() + sizeof( _KeyNode ); }
+ };
+
+ class BalanceRightEmpty : public MergeRightEmpty {
+ protected:
+ virtual int leftSize() const { return BtreeBucket::bodySize() - biggestSize() - sizeof( _KeyNode ) + 1; }
+ virtual bool merge() const { return false; }
+ virtual void initCheck() { _oldTop = bt()->keyNode( 0 ).key.toBson(); }
+ virtual void validate() { ASSERT( !( _oldTop == bt()->keyNode( 0 ).key.toBson() ) ); }
+ private:
+ BSONObj _oldTop;
+ };
+
+ class BalanceLeftEmpty : public MergeLeftEmpty {
+ protected:
+ virtual int rightSize() const { return BtreeBucket::bodySize() - biggestSize() - sizeof( _KeyNode ) + 1; }
+ virtual bool merge() const { return false; }
+ virtual void initCheck() { _oldTop = bt()->keyNode( 0 ).key.toBson(); }
+ virtual void validate() { ASSERT( !( _oldTop == bt()->keyNode( 0 ).key.toBson() ) ); }
+ private:
+ BSONObj _oldTop;
+ };
+
+ class DelEmptyNoNeighbors : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{b:{a:null}}", id() );
+ ASSERT_EQUALS( 2, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 2, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << "a" );
+ // dump();
+ ASSERT( unindex( k ) );
+ // dump();
+ ASSERT_EQUALS( 1, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 1, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{b:null}", id() );
+ }
+ };
+
+ class DelEmptyEmptyNeighbors : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{a:null,c:{b:null},d:null}", id() );
+ ASSERT_EQUALS( 4, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 2, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << "b" );
+ // dump();
+ ASSERT( unindex( k ) );
+ // dump();
+ ASSERT_EQUALS( 3, bt()->fullValidate( dl(), order(), 0, true ) );
+ ASSERT_EQUALS( 1, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{a:null,c:null,d:null}", id() );
+ }
+ };
+
+ class DelInternal : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{a:null,c:{b:null},d:null}", id() );
+ long long unused = 0;
+ ASSERT_EQUALS( 4, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 0, unused );
+ ASSERT_EQUALS( 2, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << "c" );
+// dump();
+ ASSERT( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 3, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 0, unused );
+ ASSERT_EQUALS( 1, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{a:null,b:null,d:null}", id() );
+ }
+ };
+
+ class DelInternalReplaceWithUnused : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{a:null,c:{b:null},d:null}", id() );
+ getDur().writingInt( const_cast< BtreeBucket::Loc& >( bt()->keyNode( 1 ).prevChildBucket.btree()->keyNode( 0 ).recordLoc ).GETOFS() ) |= 1; // make unused
+ long long unused = 0;
+ ASSERT_EQUALS( 3, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 1, unused );
+ ASSERT_EQUALS( 2, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << "c" );
+// dump();
+ ASSERT( unindex( k ) );
+// dump();
+ unused = 0;
+ ASSERT_EQUALS( 2, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 1, unused );
+ ASSERT_EQUALS( 1, nsdetails( ns.c_str() )->stats.nrecords );
+ // checkStructure() does not discriminate between used and unused keys
+ ArtificialTree::checkStructure( "{a:null,b:null,d:null}", id() );
+ }
+ };
+
+ class DelInternalReplaceRight : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{a:null,_:{b:null}}", id() );
+ long long unused = 0;
+ ASSERT_EQUALS( 2, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 0, unused );
+ ASSERT_EQUALS( 2, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << "a" );
+// dump();
+ ASSERT( unindex( k ) );
+// dump();
+ unused = 0;
+ ASSERT_EQUALS( 1, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 0, unused );
+ ASSERT_EQUALS( 1, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{b:null}", id() );
+ }
+ };
+
+ class DelInternalPromoteKey : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{a:null,y:{d:{c:{b:null}},_:{e:null}},z:null}", id() );
+ long long unused = 0;
+ ASSERT_EQUALS( 7, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 0, unused );
+ ASSERT_EQUALS( 5, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << "y" );
+// dump();
+ ASSERT( unindex( k ) );
+// dump();
+ unused = 0;
+ ASSERT_EQUALS( 6, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 0, unused );
+ ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{a:null,e:{c:{b:null},d:null},z:null}", id() );
+ }
+ };
+
+ class DelInternalPromoteRightKey : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{a:null,_:{e:{c:null},_:{f:null}}}", id() );
+ long long unused = 0;
+ ASSERT_EQUALS( 4, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 0, unused );
+ ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << "a" );
+// dump();
+ ASSERT( unindex( k ) );
+// dump();
+ unused = 0;
+ ASSERT_EQUALS( 3, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 0, unused );
+ ASSERT_EQUALS( 2, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{c:null,_:{e:null,f:null}}", id() );
+ }
+ };
+
+ class DelInternalReplacementPrevNonNull : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{a:null,d:{c:{b:null}},e:null}", id() );
+ long long unused = 0;
+ ASSERT_EQUALS( 5, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 0, unused );
+ ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << "d" );
+ // dump();
+ ASSERT( unindex( k ) );
+ // dump();
+ ASSERT_EQUALS( 4, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 1, unused );
+ ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{a:null,d:{c:{b:null}},e:null}", id() );
+ ASSERT( bt()->keyNode( 1 ).recordLoc.getOfs() & 1 ); // check 'unused' key
+ }
+ };
+
+ class DelInternalReplacementNextNonNull : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{a:null,_:{c:null,_:{d:null}}}", id() );
+ long long unused = 0;
+ ASSERT_EQUALS( 3, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 0, unused );
+ ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << "a" );
+ // dump();
+ ASSERT( unindex( k ) );
+ // dump();
+ ASSERT_EQUALS( 2, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 1, unused );
+ ASSERT_EQUALS( 3, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{a:null,_:{c:null,_:{d:null}}}", id() );
+ ASSERT( bt()->keyNode( 0 ).recordLoc.getOfs() & 1 ); // check 'unused' key
+ }
+ };
+
+ class DelInternalSplitPromoteLeft : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{$10:null,$20:null,$30$10:{$25:{$23:null},_:{$27:null}},$40:null,$50:null,$60:null,$70:null,$80:null,$90:null,$100:null}", id() );
+ long long unused = 0;
+ ASSERT_EQUALS( 13, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 0, unused );
+ ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << bigNumString( 0x30, 0x10 ) );
+// dump();
+ ASSERT( unindex( k ) );
+// dump();
+ ASSERT_EQUALS( 12, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 0, unused );
+ ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{$60:{$10:null,$20:null,$27:{$23:null,$25:null},$40:null,$50:null},_:{$70:null,$80:null,$90:null,$100:null}}", id() );
+ }
+ };
+
+ class DelInternalSplitPromoteRight : public Base {
+ public:
+ void run() {
+ string ns = id().indexNamespace();
+ ArtificialTree::setTree( "{$10:null,$20:null,$30:null,$40:null,$50:null,$60:null,$70:null,$80:null,$90:null,$100$10:{$95:{$93:null},_:{$97:null}}}", id() );
+ long long unused = 0;
+ ASSERT_EQUALS( 13, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 0, unused );
+ ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords );
+ BSONObj k = BSON( "" << bigNumString( 0x100, 0x10 ) );
+                // dump();
+                ASSERT( unindex( k ) );
+                // dump();
+ ASSERT_EQUALS( 12, bt()->fullValidate( dl(), order(), &unused, true ) );
+ ASSERT_EQUALS( 0, unused );
+ ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords );
+ ArtificialTree::checkStructure( "{$80:{$10:null,$20:null,$30:null,$40:null,$50:null,$60:null,$70:null},_:{$90:null,$97:{$93:null,$95:null}}}", id() );
+ }
+ };
+
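+    // BSON key comparison treats 0.0 and -0.0 as equal, so updating a value
+    // under a unique index from 1.0 to -0.0 must not produce an index key
+    // distinct from the existing 0.0; the count for {b:0.0} must remain 1.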
+ class SignedZeroDuplication : public Base {
+ public:
+ void run() {
+ ASSERT_EQUALS( 0.0, -0.0 );
+ DBDirectClient c;
+ c.ensureIndex( ns(), BSON( "b" << 1 ), true );
+ c.insert( ns(), BSON( "b" << 0.0 ) );
+ c.insert( ns(), BSON( "b" << 1.0 ) );
+ c.update( ns(), BSON( "b" << 1.0 ), BSON( "b" << -0.0 ) );
+ ASSERT_EQUALS( 1U, c.count( ns(), BSON( "b" << 0.0 ) ) );
+ }
+ };
+
+ class All : public Suite {
+ public:
+ All() : Suite( testName ) {
+ }
+
+ void setupTests() {
+ add< Create >();
+ add< SimpleInsertDelete >();
+ add< SplitRightHeavyBucket >();
+ add< SplitLeftHeavyBucket >();
+ add< MissingLocate >();
+ add< MissingLocateMultiBucket >();
+ add< SERVER983 >();
+ add< DontReuseUnused >();
+ add< PackUnused >();
+ add< DontDropReferenceKey >();
+ add< MergeBucketsLeft >();
+ add< MergeBucketsRight >();
+            // add< MergeBucketsHead >();
+ add< MergeBucketsDontReplaceHead >();
+ add< MergeBucketsDelInternal >();
+ add< MergeBucketsRightNull >();
+ add< DontMergeSingleBucket >();
+ add< ParentMergeNonRightToLeft >();
+ add< ParentMergeNonRightToRight >();
+ add< CantMergeRightNoMerge >();
+ add< CantMergeLeftNoMerge >();
+ add< MergeOption >();
+ add< ForceMergeLeft >();
+ add< ForceMergeRight >();
+ add< RecursiveMerge >();
+ add< RecursiveMergeRightBucket >();
+ add< RecursiveMergeDoubleRightBucket >();
+ add< MergeSizeJustRightRight >();
+ add< MergeSizeJustRightLeft >();
+ add< MergeSizeRight >();
+ add< MergeSizeLeft >();
+ add< NoMergeBelowMarkRight >();
+ add< NoMergeBelowMarkLeft >();
+ add< MergeSizeRightTooBig >();
+ add< MergeSizeLeftTooBig >();
+ add< BalanceOneLeftToRight >();
+ add< BalanceOneRightToLeft >();
+ add< BalanceThreeLeftToRight >();
+ add< BalanceThreeRightToLeft >();
+ add< BalanceSingleParentKey >();
+ add< PackEmpty >();
+ add< PackedDataSizeEmpty >();
+ add< BalanceSingleParentKeyPackParent >();
+ add< BalanceSplitParent >();
+ add< EvenRebalanceLeft >();
+ add< EvenRebalanceLeftCusp >();
+ add< EvenRebalanceRight >();
+ add< EvenRebalanceRightCusp >();
+ add< EvenRebalanceCenter >();
+ add< OddRebalanceLeft >();
+ add< OddRebalanceRight >();
+ add< OddRebalanceCenter >();
+ add< RebalanceEmptyRight >();
+ add< RebalanceEmptyLeft >();
+ add< NoMoveAtLowWaterMarkRight >();
+ add< MoveBelowLowWaterMarkRight >();
+ add< NoMoveAtLowWaterMarkLeft >();
+ add< MoveBelowLowWaterMarkLeft >();
+ add< PreferBalanceLeft >();
+ add< PreferBalanceRight >();
+ add< RecursiveMergeThenBalance >();
+ add< MergeRightEmpty >();
+ add< MergeMinRightEmpty >();
+ add< MergeLeftEmpty >();
+ add< MergeMinLeftEmpty >();
+ add< BalanceRightEmpty >();
+ add< BalanceLeftEmpty >();
+ add< DelEmptyNoNeighbors >();
+ add< DelEmptyEmptyNeighbors >();
+ add< DelInternal >();
+ add< DelInternalReplaceWithUnused >();
+ add< DelInternalReplaceRight >();
+ add< DelInternalPromoteKey >();
+ add< DelInternalPromoteRightKey >();
+ add< DelInternalReplacementPrevNonNull >();
+ add< DelInternalReplacementNextNonNull >();
+ add< DelInternalSplitPromoteLeft >();
+ add< DelInternalSplitPromoteRight >();
+ add< SignedZeroDuplication >();
+ }
+ } myall;
diff --git a/src/mongo/dbtests/clienttests.cpp b/src/mongo/dbtests/clienttests.cpp
new file mode 100644
index 00000000000..a64894b43c1
--- /dev/null
+++ b/src/mongo/dbtests/clienttests.cpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// clienttests.cpp : client interface unit tests
+
+#include "pch.h"
+#include "../client/dbclient.h"
+#include "dbtests.h"
+#include "../db/concurrency.h"
+
+namespace ClientTests {
+
+ class Base {
+ public:
+
+ Base( string coll ) {
+ _ns = (string)"test." + coll;
+ }
+
+ virtual ~Base() {
+ db.dropCollection( _ns );
+ }
+
+ const char * ns() { return _ns.c_str(); }
+
+ string _ns;
+ DBDirectClient db;
+ };
+
+
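+    // Note: the index counts asserted below include the automatically
+    // created _id index, so a collection with no secondary indexes
+    // reports a count of 1.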
+ class DropIndex : public Base {
+ public:
+ DropIndex() : Base( "dropindex" ) {}
+ void run() {
+ db.insert( ns() , BSON( "x" << 2 ) );
+ ASSERT_EQUALS( 1 , db.getIndexes( ns() )->itcount() );
+
+ db.ensureIndex( ns() , BSON( "x" << 1 ) );
+ ASSERT_EQUALS( 2 , db.getIndexes( ns() )->itcount() );
+
+ db.dropIndex( ns() , BSON( "x" << 1 ) );
+ ASSERT_EQUALS( 1 , db.getIndexes( ns() )->itcount() );
+
+ db.ensureIndex( ns() , BSON( "x" << 1 ) );
+ ASSERT_EQUALS( 2 , db.getIndexes( ns() )->itcount() );
+
+ db.dropIndexes( ns() );
+ ASSERT_EQUALS( 1 , db.getIndexes( ns() )->itcount() );
+ }
+ };
+
+ class ReIndex : public Base {
+ public:
+ ReIndex() : Base( "reindex" ) {}
+ void run() {
+
+ db.insert( ns() , BSON( "x" << 2 ) );
+ ASSERT_EQUALS( 1 , db.getIndexes( ns() )->itcount() );
+
+ db.ensureIndex( ns() , BSON( "x" << 1 ) );
+ ASSERT_EQUALS( 2 , db.getIndexes( ns() )->itcount() );
+
+ db.reIndex( ns() );
+ ASSERT_EQUALS( 2 , db.getIndexes( ns() )->itcount() );
+ }
+
+ };
+
+ class ReIndex2 : public Base {
+ public:
+ ReIndex2() : Base( "reindex2" ) {}
+ void run() {
+
+ db.insert( ns() , BSON( "x" << 2 ) );
+ ASSERT_EQUALS( 1 , db.getIndexes( ns() )->itcount() );
+
+ db.ensureIndex( ns() , BSON( "x" << 1 ) );
+ ASSERT_EQUALS( 2 , db.getIndexes( ns() )->itcount() );
+
+ BSONObj out;
+ ASSERT( db.runCommand( "test" , BSON( "reIndex" << "reindex2" ) , out ) );
+ ASSERT_EQUALS( 2 , out["nIndexes"].number() );
+ ASSERT_EQUALS( 2 , db.getIndexes( ns() )->itcount() );
+ }
+
+ };
+
+ class CS_10 : public Base {
+ public:
+ CS_10() : Base( "CS_10" ) {}
+ void run() {
+ string longs( 770, 'c' );
+ for( int i = 0; i < 1111; ++i )
+ db.insert( ns(), BSON( "a" << i << "b" << longs ) );
+ db.ensureIndex( ns(), BSON( "a" << 1 << "b" << 1 ) );
+
+ auto_ptr< DBClientCursor > c = db.query( ns(), Query().sort( BSON( "a" << 1 << "b" << 1 ) ) );
+ ASSERT_EQUALS( 1111, c->itcount() );
+ }
+ };
+
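+    // DBClientCursor::putBack() returns a document to the front of the
+    // current batch, so it is yielded again by the next call to next()
+    // and counted by objsLeftInBatch().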
+ class PushBack : public Base {
+ public:
+ PushBack() : Base( "PushBack" ) {}
+ void run() {
+ for( int i = 0; i < 10; ++i )
+ db.insert( ns(), BSON( "i" << i ) );
+ auto_ptr< DBClientCursor > c = db.query( ns(), Query().sort( BSON( "i" << 1 ) ) );
+
+ BSONObj o = c->next();
+ ASSERT( c->more() );
+ ASSERT_EQUALS( 9 , c->objsLeftInBatch() );
+ ASSERT( c->moreInCurrentBatch() );
+
+ c->putBack( o );
+ ASSERT( c->more() );
+ ASSERT_EQUALS( 10, c->objsLeftInBatch() );
+ ASSERT( c->moreInCurrentBatch() );
+
+ o = c->next();
+ BSONObj o2 = c->next();
+ BSONObj o3 = c->next();
+ c->putBack( o3 );
+ c->putBack( o2 );
+ c->putBack( o );
+ for( int i = 0; i < 10; ++i ) {
+ o = c->next();
+ ASSERT_EQUALS( i, o[ "i" ].number() );
+ }
+ ASSERT( !c->more() );
+ ASSERT_EQUALS( 0, c->objsLeftInBatch() );
+ ASSERT( !c->moreInCurrentBatch() );
+
+ c->putBack( o );
+ ASSERT( c->more() );
+ ASSERT_EQUALS( 1, c->objsLeftInBatch() );
+ ASSERT( c->moreInCurrentBatch() );
+ ASSERT_EQUALS( 1, c->itcount() );
+ }
+ };
+
+ class Create : public Base {
+ public:
+ Create() : Base( "Create" ) {}
+ void run() {
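+            // create a 4096-byte capped collection, then confirm it is
+            // visible to the collstats command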
+ db.createCollection( "unittests.clienttests.create", 4096, true );
+ BSONObj info;
+ ASSERT( db.runCommand( "unittests", BSON( "collstats" << "clienttests.create" ), info ) );
+ }
+ };
+
+ class ConnectionStringTests {
+ public:
+ void run() {
+ {
+ ConnectionString s( "a/b,c,d" , ConnectionString::SET );
+ ASSERT_EQUALS( ConnectionString::SET , s.type() );
+ ASSERT_EQUALS( "a" , s.getSetName() );
+ vector<HostAndPort> v = s.getServers();
+ ASSERT_EQUALS( 3U , v.size() );
+ ASSERT_EQUALS( "b" , v[0].host() );
+ ASSERT_EQUALS( "c" , v[1].host() );
+ ASSERT_EQUALS( "d" , v[2].host() );
+ }
+ }
+ };
+
+ class All : public Suite {
+ public:
+ All() : Suite( "client" ) {
+ }
+
+ void setupTests() {
+ add<DropIndex>();
+ add<ReIndex>();
+ add<ReIndex2>();
+ add<CS_10>();
+ add<PushBack>();
+ add<Create>();
+ add<ConnectionStringTests>();
+ }
+
+ } all;
+}
diff --git a/src/mongo/dbtests/commandtests.cpp b/src/mongo/dbtests/commandtests.cpp
new file mode 100644
index 00000000000..fa6204d25fd
--- /dev/null
+++ b/src/mongo/dbtests/commandtests.cpp
@@ -0,0 +1,98 @@
+/**
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../client/dbclient.h"
+#include "dbtests.h"
+#include "../db/concurrency.h"
+
+using namespace mongo;
+
+namespace CommandTests {
+ // one namespace per command
+ namespace FileMD5 {
+ struct Base {
+ Base() {
+ db.dropCollection(ns());
+ db.ensureIndex(ns(), BSON( "files_id" << 1 << "n" << 1 ));
+ }
+
+ const char* ns() { return "test.fs.chunks"; }
+
+ DBDirectClient db;
+ };
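+        // Both tests below store the chunks "hello " and "world" with
+        // different BinData encodings -- subtype 0 (BinDataGeneral) and the
+        // deprecated subtype 2 byte array -- and expect the filemd5 command
+        // to produce the md5 of "hello world" either way.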
+ struct Type0 : Base {
+ void run() {
+ {
+ BSONObjBuilder b;
+ b.genOID();
+ b.append("files_id", 0);
+ b.append("n", 0);
+ b.appendBinData("data", 6, BinDataGeneral, "hello ");
+ db.insert(ns(), b.obj());
+ }
+ {
+ BSONObjBuilder b;
+ b.genOID();
+ b.append("files_id", 0);
+ b.append("n", 1);
+ b.appendBinData("data", 5, BinDataGeneral, "world");
+ db.insert(ns(), b.obj());
+ }
+
+ BSONObj result;
+ ASSERT( db.runCommand("test", BSON("filemd5" << 0), result) );
+ ASSERT_EQUALS( string("5eb63bbbe01eeed093cb22bb8f5acdc3") , result["md5"].valuestr() );
+ }
+ };
+ struct Type2 : Base {
+ void run() {
+ {
+ BSONObjBuilder b;
+ b.genOID();
+ b.append("files_id", 0);
+ b.append("n", 0);
+ b.appendBinDataArrayDeprecated("data", "hello ", 6);
+ db.insert(ns(), b.obj());
+ }
+ {
+ BSONObjBuilder b;
+ b.genOID();
+ b.append("files_id", 0);
+ b.append("n", 1);
+ b.appendBinDataArrayDeprecated("data", "world", 5);
+ db.insert(ns(), b.obj());
+ }
+
+ BSONObj result;
+ ASSERT( db.runCommand("test", BSON("filemd5" << 0), result) );
+ ASSERT_EQUALS( string("5eb63bbbe01eeed093cb22bb8f5acdc3") , result["md5"].valuestr() );
+ }
+ };
+ }
+
+ class All : public Suite {
+ public:
+ All() : Suite( "commands" ) {
+ }
+
+ void setupTests() {
+ add< FileMD5::Type0 >();
+ add< FileMD5::Type2 >();
+ }
+
+ } all;
+}
diff --git a/src/mongo/dbtests/counttests.cpp b/src/mongo/dbtests/counttests.cpp
new file mode 100644
index 00000000000..0d2575f14e3
--- /dev/null
+++ b/src/mongo/dbtests/counttests.cpp
@@ -0,0 +1,142 @@
+// counttests.cpp : count.{h,cpp} unit tests.
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "../db/ops/count.h"
+
+#include "../db/cursor.h"
+#include "../db/pdfile.h"
+
+#include "dbtests.h"
+
+namespace CountTests {
+
+ class Base {
+ dblock lk;
+ Client::Context _context;
+ public:
+ Base() : _context( ns() ) {
+ addIndex( fromjson( "{\"a\":1}" ) );
+ }
+ ~Base() {
+ try {
+ boost::shared_ptr<Cursor> c = theDataFileMgr.findAll( ns() );
+ vector< DiskLoc > toDelete;
+ for(; c->ok(); c->advance() )
+ toDelete.push_back( c->currLoc() );
+ for( vector< DiskLoc >::iterator i = toDelete.begin(); i != toDelete.end(); ++i )
+ theDataFileMgr.deleteRecord( ns(), i->rec(), *i, false );
+ DBDirectClient cl;
+ cl.dropIndexes( ns() );
+ }
+ catch ( ... ) {
+ FAIL( "Exception while cleaning up collection" );
+ }
+ }
+ protected:
+ static const char *ns() {
+ return "unittests.counttests";
+ }
+ static void addIndex( const BSONObj &key ) {
+ BSONObjBuilder b;
+ b.append( "name", key.firstElementFieldName() );
+ b.append( "ns", ns() );
+ b.append( "key", key );
+ BSONObj o = b.done();
+ stringstream indexNs;
+ indexNs << "unittests.system.indexes";
+ theDataFileMgr.insert( indexNs.str().c_str(), o.objdata(), o.objsize() );
+ }
+ static void insert( const char *s ) {
+ insert( fromjson( s ) );
+ }
+ static void insert( const BSONObj &o ) {
+ theDataFileMgr.insert( ns(), o.objdata(), o.objsize() );
+ }
+ };
+
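+    // runCount() executes the count described by the given command object:
+    // "query" filters the documents and "fields" is a projection, which
+    // must not change the result.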
+ class CountBasic : public Base {
+ public:
+ void run() {
+ insert( "{\"a\":\"b\"}" );
+ BSONObj cmd = fromjson( "{\"query\":{}}" );
+ string err;
+ ASSERT_EQUALS( 1, runCount( ns(), cmd, err ) );
+ }
+ };
+
+ class CountQuery : public Base {
+ public:
+ void run() {
+ insert( "{\"a\":\"b\"}" );
+ insert( "{\"a\":\"b\",\"x\":\"y\"}" );
+ insert( "{\"a\":\"c\"}" );
+ BSONObj cmd = fromjson( "{\"query\":{\"a\":\"b\"}}" );
+ string err;
+ ASSERT_EQUALS( 2, runCount( ns(), cmd, err ) );
+ }
+ };
+
+ class CountFields : public Base {
+ public:
+ void run() {
+ insert( "{\"a\":\"b\"}" );
+ insert( "{\"c\":\"d\"}" );
+ BSONObj cmd = fromjson( "{\"query\":{},\"fields\":{\"a\":1}}" );
+ string err;
+ ASSERT_EQUALS( 2, runCount( ns(), cmd, err ) );
+ }
+ };
+
+ class CountQueryFields : public Base {
+ public:
+ void run() {
+ insert( "{\"a\":\"b\"}" );
+ insert( "{\"a\":\"c\"}" );
+ insert( "{\"d\":\"e\"}" );
+ BSONObj cmd = fromjson( "{\"query\":{\"a\":\"b\"},\"fields\":{\"a\":1}}" );
+ string err;
+ ASSERT_EQUALS( 1, runCount( ns(), cmd, err ) );
+ }
+ };
+
+ class CountIndexedRegex : public Base {
+ public:
+ void run() {
+ insert( "{\"a\":\"b\"}" );
+ insert( "{\"a\":\"c\"}" );
+ BSONObj cmd = fromjson( "{\"query\":{\"a\":/^b/}}" );
+ string err;
+ ASSERT_EQUALS( 1, runCount( ns(), cmd, err ) );
+ }
+ };
+
+ class All : public Suite {
+ public:
+ All() : Suite( "count" ) {
+ }
+
+ void setupTests() {
+ add< CountBasic >();
+ add< CountQuery >();
+ add< CountFields >();
+ add< CountQueryFields >();
+ add< CountIndexedRegex >();
+ }
+ } myall;
+
+} // namespace CountTests
diff --git a/src/mongo/dbtests/cursortests.cpp b/src/mongo/dbtests/cursortests.cpp
new file mode 100644
index 00000000000..a7b52aada12
--- /dev/null
+++ b/src/mongo/dbtests/cursortests.cpp
@@ -0,0 +1,305 @@
+// cursortests.cpp : cursor related unit tests
+//
+
+/**
+ * Copyright (C) 2009 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../db/clientcursor.h"
+#include "../db/instance.h"
+#include "../db/btree.h"
+#include "../db/queryutil.h"
+#include "dbtests.h"
+
+namespace CursorTests {
+
+ namespace BtreeCursorTests {
+
+        // The ranges expressed in these tests are impossible to produce with
+        // our query syntax, so we construct the FieldRangeVectors by hand.
+
+ class Base {
+ protected:
+ static const char *ns() { return "unittests.cursortests.Base"; }
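+            // Build a FieldRangeVector over the index {a:1} from len/2
+            // inclusive [min,max] pairs, OR-ing the ranges together.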
+ FieldRangeVector *vec( int *vals, int len, int direction = 1 ) {
+ FieldRangeSet s( "", BSON( "a" << 1 ), true );
+ for( int i = 0; i < len; i += 2 ) {
+ _objs.push_back( BSON( "a" << BSON( "$gte" << vals[ i ] << "$lte" << vals[ i + 1 ] ) ) );
+ FieldRangeSet s2( "", _objs.back(), true );
+ if ( i == 0 ) {
+ s.range( "a" ) = s2.range( "a" );
+ }
+ else {
+ s.range( "a" ) |= s2.range( "a" );
+ }
+ }
+ // orphan idxSpec for this test
+ IndexSpec *idxSpec = new IndexSpec( BSON( "a" << 1 ) );
+ return new FieldRangeVector( s, *idxSpec, direction );
+ }
+ DBDirectClient _c;
+ private:
+ vector< BSONObj > _objs;
+ };
+
+ class MultiRange : public Base {
+ public:
+ void run() {
+ dblock lk;
+ const char *ns = "unittests.cursortests.BtreeCursorTests.MultiRange";
+ {
+ DBDirectClient c;
+ for( int i = 0; i < 10; ++i )
+ c.insert( ns, BSON( "a" << i ) );
+ ASSERT( c.ensureIndex( ns, BSON( "a" << 1 ) ) );
+ }
+ int v[] = { 1, 2, 4, 6 };
+ boost::shared_ptr< FieldRangeVector > frv( vec( v, 4 ) );
+ Client::Context ctx( ns );
+ scoped_ptr<BtreeCursor> _c( BtreeCursor::make( nsdetails( ns ), 1, nsdetails( ns )->idx(1), frv, 1 ) );
+ BtreeCursor &c = *_c.get();
+ ASSERT_EQUALS( "BtreeCursor a_1 multi", c.toString() );
+ double expected[] = { 1, 2, 4, 5, 6 };
+ for( int i = 0; i < 5; ++i ) {
+ ASSERT( c.ok() );
+ ASSERT_EQUALS( expected[ i ], c.currKey().firstElement().number() );
+ c.advance();
+ }
+ ASSERT( !c.ok() );
+ }
+ };
+
+ class MultiRangeGap : public Base {
+ public:
+ void run() {
+ dblock lk;
+ const char *ns = "unittests.cursortests.BtreeCursorTests.MultiRangeGap";
+ {
+ DBDirectClient c;
+ for( int i = 0; i < 10; ++i )
+ c.insert( ns, BSON( "a" << i ) );
+ for( int i = 100; i < 110; ++i )
+ c.insert( ns, BSON( "a" << i ) );
+ ASSERT( c.ensureIndex( ns, BSON( "a" << 1 ) ) );
+ }
+ int v[] = { -50, 2, 40, 60, 109, 200 };
+ boost::shared_ptr< FieldRangeVector > frv( vec( v, 6 ) );
+ Client::Context ctx( ns );
+ scoped_ptr<BtreeCursor> _c( BtreeCursor::make(nsdetails( ns ), 1, nsdetails( ns )->idx(1), frv, 1 ) );
+ BtreeCursor &c = *_c.get();
+ ASSERT_EQUALS( "BtreeCursor a_1 multi", c.toString() );
+ double expected[] = { 0, 1, 2, 109 };
+ for( int i = 0; i < 4; ++i ) {
+ ASSERT( c.ok() );
+ ASSERT_EQUALS( expected[ i ], c.currKey().firstElement().number() );
+ c.advance();
+ }
+ ASSERT( !c.ok() );
+ }
+ };
+
+ class MultiRangeReverse : public Base {
+ public:
+ void run() {
+ dblock lk;
+ const char *ns = "unittests.cursortests.BtreeCursorTests.MultiRangeReverse";
+ {
+ DBDirectClient c;
+ for( int i = 0; i < 10; ++i )
+ c.insert( ns, BSON( "a" << i ) );
+ ASSERT( c.ensureIndex( ns, BSON( "a" << 1 ) ) );
+ }
+ int v[] = { 1, 2, 4, 6 };
+ boost::shared_ptr< FieldRangeVector > frv( vec( v, 4, -1 ) );
+ Client::Context ctx( ns );
+ scoped_ptr<BtreeCursor> _c( BtreeCursor::make( nsdetails( ns ), 1, nsdetails( ns )->idx(1), frv, -1 ) );
+ BtreeCursor& c = *_c.get();
+ ASSERT_EQUALS( "BtreeCursor a_1 reverse multi", c.toString() );
+ double expected[] = { 6, 5, 4, 2, 1 };
+ for( int i = 0; i < 5; ++i ) {
+ ASSERT( c.ok() );
+ ASSERT_EQUALS( expected[ i ], c.currKey().firstElement().number() );
+ c.advance();
+ }
+ ASSERT( !c.ok() );
+ }
+ };
+
+ class Base2 {
+ public:
+ virtual ~Base2() { _c.dropCollection( ns() ); }
+ protected:
+ static const char *ns() { return "unittests.cursortests.Base2"; }
+ DBDirectClient _c;
+ virtual BSONObj idx() const = 0;
+ virtual int direction() const { return 1; }
+ void insert( const BSONObj &o ) {
+ _objs.push_back( o );
+ _c.insert( ns(), o );
+ }
+ void check( const BSONObj &spec ) {
+ {
+ BSONObj keypat = idx();
+ //cout << keypat.toString() << endl;
+ _c.ensureIndex( ns(), idx() );
+ }
+
+ Client::Context ctx( ns() );
+ FieldRangeSet frs( ns(), spec, true );
+ // orphan spec for this test.
+ IndexSpec *idxSpec = new IndexSpec( idx() );
+ boost::shared_ptr< FieldRangeVector > frv( new FieldRangeVector( frs, *idxSpec, direction() ) );
+ scoped_ptr<BtreeCursor> c( BtreeCursor::make( nsdetails( ns() ), 1, nsdetails( ns() )->idx( 1 ), frv, direction() ) );
+ Matcher m( spec );
+ int count = 0;
+ while( c->ok() ) {
+ ASSERT( m.matches( c->current() ) );
+ c->advance();
+ ++count;
+ }
+ int expectedCount = 0;
+ for( vector< BSONObj >::const_iterator i = _objs.begin(); i != _objs.end(); ++i ) {
+ if ( m.matches( *i ) ) {
+ ++expectedCount;
+ }
+ }
+ ASSERT_EQUALS( expectedCount, count );
+ }
+ private:
+ dblock _lk;
+ vector< BSONObj > _objs;
+ };
+
+ class EqEq : public Base2 {
+ public:
+ void run() {
+ insert( BSON( "a" << 4 << "b" << 5 ) );
+ insert( BSON( "a" << 4 << "b" << 5 ) );
+ insert( BSON( "a" << 4 << "b" << 4 ) );
+ insert( BSON( "a" << 5 << "b" << 4 ) );
+ check( BSON( "a" << 4 << "b" << 5 ) );
+ }
+ virtual BSONObj idx() const { return BSON( "a" << 1 << "b" << 1 ); }
+ };
+
+ class EqRange : public Base2 {
+ public:
+ void run() {
+ insert( BSON( "a" << 3 << "b" << 5 ) );
+ insert( BSON( "a" << 4 << "b" << 0 ) );
+ insert( BSON( "a" << 4 << "b" << 5 ) );
+ insert( BSON( "a" << 4 << "b" << 6 ) );
+ insert( BSON( "a" << 4 << "b" << 6 ) );
+ insert( BSON( "a" << 4 << "b" << 10 ) );
+ insert( BSON( "a" << 4 << "b" << 11 ) );
+ insert( BSON( "a" << 5 << "b" << 5 ) );
+ check( BSON( "a" << 4 << "b" << BSON( "$gte" << 1 << "$lte" << 10 ) ) );
+ }
+ virtual BSONObj idx() const { return BSON( "a" << 1 << "b" << 1 ); }
+ };
+
+ class EqIn : public Base2 {
+ public:
+ void run() {
+ insert( BSON( "a" << 3 << "b" << 5 ) );
+ insert( BSON( "a" << 4 << "b" << 0 ) );
+ insert( BSON( "a" << 4 << "b" << 5 ) );
+ insert( BSON( "a" << 4 << "b" << 6 ) );
+ insert( BSON( "a" << 4 << "b" << 6 ) );
+ insert( BSON( "a" << 4 << "b" << 10 ) );
+ insert( BSON( "a" << 4 << "b" << 11 ) );
+ insert( BSON( "a" << 5 << "b" << 5 ) );
+ check( BSON( "a" << 4 << "b" << BSON( "$in" << BSON_ARRAY( 5 << 6 << 11 ) ) ) );
+ }
+ virtual BSONObj idx() const { return BSON( "a" << 1 << "b" << 1 ); }
+ };
+
+ class RangeEq : public Base2 {
+ public:
+ void run() {
+ insert( BSON( "a" << 0 << "b" << 4 ) );
+ insert( BSON( "a" << 1 << "b" << 4 ) );
+ insert( BSON( "a" << 4 << "b" << 3 ) );
+ insert( BSON( "a" << 5 << "b" << 4 ) );
+ insert( BSON( "a" << 7 << "b" << 4 ) );
+ insert( BSON( "a" << 4 << "b" << 4 ) );
+ insert( BSON( "a" << 9 << "b" << 6 ) );
+ insert( BSON( "a" << 11 << "b" << 1 ) );
+ insert( BSON( "a" << 11 << "b" << 4 ) );
+ check( BSON( "a" << BSON( "$gte" << 1 << "$lte" << 10 ) << "b" << 4 ) );
+ }
+ virtual BSONObj idx() const { return BSON( "a" << 1 << "b" << 1 ); }
+ };
+
+ class RangeIn : public Base2 {
+ public:
+ void run() {
+ insert( BSON( "a" << 0 << "b" << 4 ) );
+ insert( BSON( "a" << 1 << "b" << 5 ) );
+ insert( BSON( "a" << 4 << "b" << 3 ) );
+ insert( BSON( "a" << 5 << "b" << 4 ) );
+ insert( BSON( "a" << 7 << "b" << 5 ) );
+ insert( BSON( "a" << 4 << "b" << 4 ) );
+ insert( BSON( "a" << 9 << "b" << 6 ) );
+ insert( BSON( "a" << 11 << "b" << 1 ) );
+ insert( BSON( "a" << 11 << "b" << 4 ) );
+ check( BSON( "a" << BSON( "$gte" << 1 << "$lte" << 10 ) << "b" << BSON( "$in" << BSON_ARRAY( 4 << 6 ) ) ) );
+ }
+ virtual BSONObj idx() const { return BSON( "a" << 1 << "b" << 1 ); }
+ };
+
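+        // With a constraint only on the minor key 'b', the cursor must
+        // abort the implicit scan over 'a' and skip between index keys;
+        // nscanned staying well below the 300 inserted documents verifies
+        // the skipping behavior.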
+ class AbortImplicitScan : public Base {
+ public:
+ void run() {
+ dblock lk;
+ IndexSpec idx( BSON( "a" << 1 << "b" << 1 ) );
+ _c.ensureIndex( ns(), idx.keyPattern );
+ for( int i = 0; i < 300; ++i ) {
+ _c.insert( ns(), BSON( "a" << i << "b" << 5 ) );
+ }
+ FieldRangeSet frs( ns(), BSON( "b" << 3 ), true );
+ boost::shared_ptr<FieldRangeVector> frv( new FieldRangeVector( frs, idx, 1 ) );
+ Client::Context ctx( ns() );
+ scoped_ptr<BtreeCursor> c( BtreeCursor::make( nsdetails( ns() ), 1, nsdetails( ns() )->idx(1), frv, 1 ) );
+ long long initialNscanned = c->nscanned();
+ ASSERT( initialNscanned < 200 );
+ ASSERT( c->ok() );
+ c->advance();
+ ASSERT( c->nscanned() > initialNscanned );
+ ASSERT( c->nscanned() < 200 );
+ ASSERT( c->ok() );
+ }
+ };
+
+ } // namespace BtreeCursorTests
+
+ class All : public Suite {
+ public:
+ All() : Suite( "cursor" ) {}
+
+ void setupTests() {
+ add< BtreeCursorTests::MultiRange >();
+ add< BtreeCursorTests::MultiRangeGap >();
+ add< BtreeCursorTests::MultiRangeReverse >();
+ add< BtreeCursorTests::EqEq >();
+ add< BtreeCursorTests::EqRange >();
+ add< BtreeCursorTests::EqIn >();
+ add< BtreeCursorTests::RangeEq >();
+ add< BtreeCursorTests::RangeIn >();
+ add< BtreeCursorTests::AbortImplicitScan >();
+ }
+ } myall;
+} // namespace CursorTests
diff --git a/src/mongo/dbtests/d_chunk_manager_tests.cpp b/src/mongo/dbtests/d_chunk_manager_tests.cpp
new file mode 100644
index 00000000000..2bcc90faf7a
--- /dev/null
+++ b/src/mongo/dbtests/d_chunk_manager_tests.cpp
@@ -0,0 +1,467 @@
+// @file d_chunk_manager_tests.cpp : s/d_chunk_manager.{h,cpp} tests
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "dbtests.h"
+
+#include "../s/d_chunk_manager.h"
+
+namespace {
+
+ class BasicTests {
+ public:
+ void run() {
+ BSONObj collection = BSON( "_id" << "test.foo" <<
+ "dropped" << false <<
+ "key" << BSON( "a" << 1 ) <<
+ "unique" << false );
+
+ // single-chunk collection
+ BSONArray chunks = BSON_ARRAY( BSON( "_id" << "test.foo-a_MinKey" <<
+ "ns" << "test.foo" <<
+ "min" << BSON( "a" << MINKEY ) <<
+ "max" << BSON( "a" << MAXKEY ) ) );
+
+ ShardChunkManager s ( collection , chunks );
+
+ BSONObj k1 = BSON( "a" << MINKEY );
+ ASSERT( s.belongsToMe( k1 ) );
+ BSONObj k2 = BSON( "a" << MAXKEY );
+ ASSERT( ! s.belongsToMe( k2 ) );
+ BSONObj k3 = BSON( "a" << 1 << "b" << 2 );
+ ASSERT( s.belongsToMe( k3 ) );
+ }
+ };
+
+ class BasicCompoundTests {
+ public:
+ void run() {
+ BSONObj collection = BSON( "_id" << "test.foo" <<
+ "dropped" << false <<
+ "key" << BSON( "a" << 1 << "b" << 1) <<
+ "unique" << false );
+
+ // single-chunk collection
+ BSONArray chunks = BSON_ARRAY( BSON( "_id" << "test.foo-a_MinKeyb_MinKey" <<
+ "ns" << "test.foo" <<
+ "min" << BSON( "a" << MINKEY << "b" << MINKEY ) <<
+ "max" << BSON( "a" << MAXKEY << "b" << MAXKEY ) ) );
+
+ ShardChunkManager s ( collection , chunks );
+
+ BSONObj k1 = BSON( "a" << MINKEY << "b" << MINKEY );
+ ASSERT( s.belongsToMe( k1 ) );
+ BSONObj k2 = BSON( "a" << MAXKEY << "b" << MAXKEY );
+ ASSERT( ! s.belongsToMe( k2 ) );
+ BSONObj k3 = BSON( "a" << MINKEY << "b" << 10 );
+ ASSERT( s.belongsToMe( k3 ) );
+ BSONObj k4 = BSON( "a" << 10 << "b" << 20 );
+ ASSERT( s.belongsToMe( k4 ) );
+ }
+ };
+
+ class RangeTests {
+ public:
+ void run() {
+ BSONObj collection = BSON( "_id" << "x.y" <<
+ "dropped" << false <<
+ "key" << BSON( "a" << 1 ) <<
+ "unique" << false );
+
+ // 3-chunk collection, 2 of them being contiguous
+ // [min->10) , [10->20) , <gap> , [30->max)
+ BSONArray chunks = BSON_ARRAY( BSON( "_id" << "x.y-a_MinKey" <<
+ "ns" << "x.y" <<
+ "min" << BSON( "a" << MINKEY ) <<
+ "max" << BSON( "a" << 10 ) ) <<
+ BSON( "_id" << "x.y-a_10" <<
+ "ns" << "x.y" <<
+ "min" << BSON( "a" << 10 ) <<
+ "max" << BSON( "a" << 20 ) ) <<
+ BSON( "_id" << "x.y-a_30" <<
+ "ns" << "x.y" <<
+ "min" << BSON( "a" << 30 ) <<
+ "max" << BSON( "a" << MAXKEY ) ) );
+
+ ShardChunkManager s ( collection , chunks );
+
+ BSONObj k1 = BSON( "a" << 5 );
+ ASSERT( s.belongsToMe( k1 ) );
+ BSONObj k2 = BSON( "a" << 10 );
+ ASSERT( s.belongsToMe( k2 ) );
+ BSONObj k3 = BSON( "a" << 25 );
+ ASSERT( ! s.belongsToMe( k3 ) );
+ BSONObj k4 = BSON( "a" << 30 );
+ ASSERT( s.belongsToMe( k4 ) );
+ BSONObj k5 = BSON( "a" << 40 );
+ ASSERT( s.belongsToMe( k5 ) );
+ }
+ };
+
+ class GetNextTests {
+ public:
+ void run() {
+
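+            // getNextChunk() returns true when no chunk follows the given
+            // key (iteration is done); when it returns false, foundMin and
+            // foundMax hold the bounds of the next chunk.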
+ BSONObj collection = BSON( "_id" << "x.y" <<
+ "dropped" << false <<
+ "key" << BSON( "a" << 1 ) <<
+ "unique" << false );
+ // empty collection
+ BSONArray chunks1 = BSONArray();
+ ShardChunkManager s1( collection , chunks1 );
+
+ BSONObj empty;
+ BSONObj arbitraryKey = BSON( "a" << 10 );
+ BSONObj foundMin, foundMax;
+
+ ASSERT( s1.getNextChunk( empty , &foundMin , &foundMax ) );
+ ASSERT( foundMin.isEmpty() );
+ ASSERT( foundMax.isEmpty() );
+ ASSERT( s1.getNextChunk( arbitraryKey , &foundMin , &foundMax ) );
+ ASSERT( foundMin.isEmpty() );
+ ASSERT( foundMax.isEmpty() );
+
+ // single-chunk collection
+ // [10->20]
+ BSONObj key_a10 = BSON( "a" << 10 );
+ BSONObj key_a20 = BSON( "a" << 20 );
+ BSONArray chunks2 = BSON_ARRAY( BSON( "_id" << "x.y-a_10" <<
+ "ns" << "x.y" <<
+ "min" << key_a10 <<
+ "max" << key_a20 ) );
+ ShardChunkManager s2( collection , chunks2 );
+ ASSERT( s2.getNextChunk( empty , &foundMin , &foundMax ) );
+ ASSERT( foundMin.woCompare( key_a10 ) == 0 );
+ ASSERT( foundMax.woCompare( key_a20 ) == 0 );
+
+ // 3-chunk collection, 2 of them being contiguous
+ // [min->10) , [10->20) , <gap> , [30->max)
+ BSONObj key_a30 = BSON( "a" << 30 );
+ BSONObj key_min = BSON( "a" << MINKEY );
+ BSONObj key_max = BSON( "a" << MAXKEY );
+ BSONArray chunks3 = BSON_ARRAY( BSON( "_id" << "x.y-a_MinKey" <<
+ "ns" << "x.y" <<
+ "min" << key_min <<
+ "max" << key_a10 ) <<
+ BSON( "_id" << "x.y-a_10" <<
+ "ns" << "x.y" <<
+ "min" << key_a10 <<
+ "max" << key_a20 ) <<
+ BSON( "_id" << "x.y-a_30" <<
+ "ns" << "x.y" <<
+ "min" << key_a30 <<
+ "max" << key_max ) );
+ ShardChunkManager s3( collection , chunks3 );
+ ASSERT( ! s3.getNextChunk( empty , &foundMin , &foundMax ) ); // not eof
+ ASSERT( foundMin.woCompare( key_min ) == 0 );
+ ASSERT( foundMax.woCompare( key_a10 ) == 0 );
+ ASSERT( ! s3.getNextChunk( key_a10 , &foundMin , &foundMax ) );
+ ASSERT( foundMin.woCompare( key_a30 ) == 0 );
+ ASSERT( foundMax.woCompare( key_max ) == 0 );
+ ASSERT( s3.getNextChunk( key_a30 , &foundMin , &foundMax ) );
+ }
+ };
+
+ class DeletedTests {
+ public:
+ void run() {
+ BSONObj collection = BSON( "_id" << "test.foo" <<
+ "dropped" << "true" );
+
+ BSONArray chunks = BSONArray();
+
+ ASSERT_THROWS( ShardChunkManager s ( collection , chunks ) , UserException );
+ }
+ };
+
+ class ClonePlusTests {
+ public:
+ void run() {
+ BSONObj collection = BSON( "_id" << "test.foo" <<
+ "dropped" << false <<
+ "key" << BSON( "a" << 1 << "b" << 1 ) <<
+ "unique" << false );
+ // 1-chunk collection
+ // [10,0-20,0)
+ BSONArray chunks = BSON_ARRAY( BSON( "_id" << "test.foo-a_MinKey" <<
+ "ns" << "test.foo" <<
+ "min" << BSON( "a" << 10 << "b" << 0 ) <<
+ "max" << BSON( "a" << 20 << "b" << 0 ) ) );
+
+ ShardChunkManager s ( collection , chunks );
+
+ // new chunk [20,0-30,0)
+ BSONObj min = BSON( "a" << 20 << "b" << 0 );
+ BSONObj max = BSON( "a" << 30 << "b" << 0 );
+ ShardChunkManagerPtr cloned( s.clonePlus( min , max , 1 /* TODO test version */ ) );
+
+ BSONObj k1 = BSON( "a" << 5 << "b" << 0 );
+ ASSERT( ! cloned->belongsToMe( k1 ) );
+ BSONObj k2 = BSON( "a" << 20 << "b" << 0 );
+ ASSERT( cloned->belongsToMe( k2 ) );
+ BSONObj k3 = BSON( "a" << 25 << "b" << 0 );
+ ASSERT( cloned->belongsToMe( k3 ) );
+ BSONObj k4 = BSON( "a" << 30 << "b" << 0 );
+ ASSERT( ! cloned->belongsToMe( k4 ) );
+ }
+ };
+
+ class ClonePlusExceptionTests {
+ public:
+ void run() {
+ BSONObj collection = BSON( "_id" << "test.foo" <<
+ "dropped" << false <<
+ "key" << BSON( "a" << 1 << "b" << 1 ) <<
+ "unique" << false );
+ // 1-chunk collection
+ // [10,0-20,0)
+ BSONArray chunks = BSON_ARRAY( BSON( "_id" << "test.foo-a_MinKey" <<
+ "ns" << "test.foo" <<
+ "min" << BSON( "a" << 10 << "b" << 0 ) <<
+ "max" << BSON( "a" << 20 << "b" << 0 ) ) );
+
+ ShardChunkManager s ( collection , chunks );
+
+ // [15,0-25,0) overlaps [10,0-20,0)
+ BSONObj min = BSON( "a" << 15 << "b" << 0 );
+ BSONObj max = BSON( "a" << 25 << "b" << 0 );
+ ASSERT_THROWS( s.clonePlus ( min , max , 1 /* TODO test version */ ) , UserException );
+ }
+ };
+
+ class CloneMinusTests {
+ public:
+ void run() {
+ BSONObj collection = BSON( "_id" << "x.y" <<
+ "dropped" << false <<
+ "key" << BSON( "a" << 1 << "b" << 1 ) <<
+ "unique" << false );
+
+ // 2-chunk collection
+ // [10,0->20,0) , <gap> , [30,0->40,0)
+ BSONArray chunks = BSON_ARRAY( BSON( "_id" << "x.y-a_10b_0" <<
+ "ns" << "x.y" <<
+ "min" << BSON( "a" << 10 << "b" << 0 ) <<
+ "max" << BSON( "a" << 20 << "b" << 0 ) ) <<
+ BSON( "_id" << "x.y-a_30b_0" <<
+ "ns" << "x.y" <<
+ "min" << BSON( "a" << 30 << "b" << 0 ) <<
+ "max" << BSON( "a" << 40 << "b" << 0 ) ) );
+
+ ShardChunkManager s ( collection , chunks );
+
+ // deleting chunk [10,0-20,0)
+ BSONObj min = BSON( "a" << 10 << "b" << 0 );
+ BSONObj max = BSON( "a" << 20 << "b" << 0 );
+ ShardChunkManagerPtr cloned( s.cloneMinus( min , max , 1 /* TODO test version */ ) );
+
+ BSONObj k1 = BSON( "a" << 5 << "b" << 0 );
+ ASSERT( ! cloned->belongsToMe( k1 ) );
+ BSONObj k2 = BSON( "a" << 15 << "b" << 0 );
+ ASSERT( ! cloned->belongsToMe( k2 ) );
+ BSONObj k3 = BSON( "a" << 30 << "b" << 0 );
+ ASSERT( cloned->belongsToMe( k3 ) );
+ BSONObj k4 = BSON( "a" << 35 << "b" << 0 );
+ ASSERT( cloned->belongsToMe( k4 ) );
+ BSONObj k5 = BSON( "a" << 40 << "b" << 0 );
+ ASSERT( ! cloned->belongsToMe( k5 ) );
+ }
+ };
+
+ class CloneMinusExceptionTests {
+ public:
+ void run() {
+ BSONObj collection = BSON( "_id" << "x.y" <<
+ "dropped" << false <<
+ "key" << BSON( "a" << 1 << "b" << 1 ) <<
+ "unique" << false );
+
+ // 2-chunk collection
+ // [10,0->20,0) , <gap> , [30,0->40,0)
+ BSONArray chunks = BSON_ARRAY( BSON( "_id" << "x.y-a_10b_0" <<
+ "ns" << "x.y" <<
+ "min" << BSON( "a" << 10 << "b" << 0 ) <<
+ "max" << BSON( "a" << 20 << "b" << 0 ) ) <<
+ BSON( "_id" << "x.y-a_30b_0" <<
+ "ns" << "x.y" <<
+ "min" << BSON( "a" << 30 << "b" << 0 ) <<
+ "max" << BSON( "a" << 40 << "b" << 0 ) ) );
+
+ ShardChunkManager s ( collection , chunks );
+
+ // deleting non-existing chunk [25,0-28,0)
+ BSONObj min1 = BSON( "a" << 25 << "b" << 0 );
+ BSONObj max1 = BSON( "a" << 28 << "b" << 0 );
+ ASSERT_THROWS( s.cloneMinus( min1 , max1 , 1 /* TODO test version */ ) , UserException );
+
+
+            // deleting an overlapping range (not exactly a chunk) [15,0-25,0)
+ BSONObj min2 = BSON( "a" << 15 << "b" << 0 );
+ BSONObj max2 = BSON( "a" << 25 << "b" << 0 );
+ ASSERT_THROWS( s.cloneMinus( min2 , max2 , 1 /* TODO test version */ ) , UserException );
+ }
+ };
+
+ class CloneSplitTests {
+ public:
+ void run() {
+ BSONObj collection = BSON( "_id" << "test.foo" <<
+ "dropped" << false <<
+ "key" << BSON( "a" << 1 << "b" << 1 ) <<
+ "unique" << false );
+ // 1-chunk collection
+ // [10,0-20,0)
+ BSONObj min = BSON( "a" << 10 << "b" << 0 );
+ BSONObj max = BSON( "a" << 20 << "b" << 0 );
+ BSONArray chunks = BSON_ARRAY( BSON( "_id" << "test.foo-a_MinKey"
+ << "ns" << "test.foo"
+ << "min" << min
+ << "max" << max ) );
+
+ ShardChunkManager s ( collection , chunks );
+
+ BSONObj split1 = BSON( "a" << 15 << "b" << 0 );
+ BSONObj split2 = BSON( "a" << 18 << "b" << 0 );
+ vector<BSONObj> splitKeys;
+ splitKeys.push_back( split1 );
+ splitKeys.push_back( split2 );
+ ShardChunkVersion version( 1 , 99 ); // first chunk 1|99 , second 1|100
+ ShardChunkManagerPtr cloned( s.cloneSplit( min , max , splitKeys , version ) );
+
+ version.incMinor(); /* second chunk 1|100, first split point */
+ version.incMinor(); /* third chunk 1|101, second split point */
+ ASSERT_EQUALS( cloned->getVersion() , version /* 1|101 */ );
+ ASSERT_EQUALS( s.getNumChunks() , 1u );
+ ASSERT_EQUALS( cloned->getNumChunks() , 3u );
+ ASSERT( cloned->belongsToMe( min ) );
+ ASSERT( cloned->belongsToMe( split1 ) );
+ ASSERT( cloned->belongsToMe( split2 ) );
+ ASSERT( ! cloned->belongsToMe( max ) );
+ }
+ };
+
+ class CloneSplitExceptionTests {
+ public:
+ void run() {
+ BSONObj collection = BSON( "_id" << "test.foo" <<
+ "dropped" << false <<
+ "key" << BSON( "a" << 1 << "b" << 1 ) <<
+ "unique" << false );
+ // 1-chunk collection
+ // [10,0-20,0)
+ BSONObj min = BSON( "a" << 10 << "b" << 0 );
+ BSONObj max = BSON( "a" << 20 << "b" << 0 );
+ BSONArray chunks = BSON_ARRAY( BSON( "_id" << "test.foo-a_MinKey"
+ << "ns" << "test.foo"
+ << "min" << min
+ << "max" << max ) );
+
+ ShardChunkManager s ( collection , chunks );
+
+ BSONObj badSplit = BSON( "a" << 5 << "b" << 0 );
+ vector<BSONObj> splitKeys;
+ splitKeys.push_back( badSplit );
+ ASSERT_THROWS( s.cloneSplit( min , max , splitKeys , ShardChunkVersion( 1 ) ) , UserException );
+
+ BSONObj badMax = BSON( "a" << 25 << "b" << 0 );
+ BSONObj split = BSON( "a" << 15 << "b" << 0 );
+ splitKeys.clear();
+ splitKeys.push_back( split );
+ ASSERT_THROWS( s.cloneSplit( min , badMax, splitKeys , ShardChunkVersion( 1 ) ) , UserException );
+ }
+ };
+
+ class EmptyShardTests {
+ public:
+ void run() {
+ BSONObj collection = BSON( "_id" << "test.foo" <<
+ "dropped" << false <<
+ "key" << BSON( "a" << 1 ) <<
+ "unique" << false );
+
+ // no chunks on this shard
+ BSONArray chunks;
+
+ // shard can have zero chunks for an existing collection
+ // version should be 0, though
+ ShardChunkManager s( collection , chunks );
+ ASSERT_EQUALS( s.getVersion() , ShardChunkVersion( 0 ) );
+ ASSERT_EQUALS( s.getNumChunks() , 0u );
+ }
+ };
+
+ class LastChunkTests {
+ public:
+ void run() {
+ BSONObj collection = BSON( "_id" << "test.foo" <<
+ "dropped" << false <<
+ "key" << BSON( "a" << 1 ) <<
+ "unique" << false );
+
+ // 1-chunk collection
+ // [10->20)
+ BSONArray chunks = BSON_ARRAY( BSON( "_id" << "test.foo-a_10" <<
+ "ns" << "test.foo" <<
+ "min" << BSON( "a" << 10 ) <<
+ "max" << BSON( "a" << 20 ) ) );
+
+ ShardChunkManager s( collection , chunks );
+ BSONObj min = BSON( "a" << 10 );
+ BSONObj max = BSON( "a" << 20 );
+
+ // if we remove the only chunk, the only version accepted is 0
+ ShardChunkVersion nonZero = 99;
+ ASSERT_THROWS( s.cloneMinus( min , max , nonZero ) , UserException );
+ ShardChunkManagerPtr empty( s.cloneMinus( min , max , 0 ) );
+ ASSERT_EQUALS( empty->getVersion() , ShardChunkVersion( 0 ) );
+ ASSERT_EQUALS( empty->getNumChunks() , 0u );
+ BSONObj k = BSON( "a" << 15 << "b" << 0 );
+ ASSERT( ! empty->belongsToMe( k ) );
+
+ // we can add a chunk to an empty manager
+ // version should be provided
+ ASSERT_THROWS( empty->clonePlus( min , max , 0 ) , UserException );
+ ShardChunkManagerPtr cloned( empty->clonePlus( min , max , nonZero ) );
+ ASSERT_EQUALS( cloned->getVersion(), nonZero );
+ ASSERT_EQUALS( cloned->getNumChunks() , 1u );
+ ASSERT( cloned->belongsToMe( k ) );
+ }
+ };
+
+ class ShardChunkManagerSuite : public Suite {
+ public:
+ ShardChunkManagerSuite() : Suite ( "shard_chunk_manager" ) {}
+
+ void setupTests() {
+ add< BasicTests >();
+ add< BasicCompoundTests >();
+ add< RangeTests >();
+ add< GetNextTests >();
+ add< DeletedTests >();
+ add< ClonePlusTests >();
+ add< ClonePlusExceptionTests >();
+ add< CloneMinusTests >();
+ add< CloneMinusExceptionTests >();
+ add< CloneSplitTests >();
+ add< CloneSplitExceptionTests >();
+ add< EmptyShardTests >();
+ add< LastChunkTests >();
+ }
+ } shardChunkManagerSuite;
+
+} // anonymous namespace
diff --git a/src/mongo/dbtests/dbtests.cpp b/src/mongo/dbtests/dbtests.cpp
new file mode 100644
index 00000000000..fde0f669c98
--- /dev/null
+++ b/src/mongo/dbtests/dbtests.cpp
@@ -0,0 +1,29 @@
+// @file dbtests.cpp : Runs db unit tests.
+//
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "dbtests.h"
+#include "../util/unittest.h"
+
+int main( int argc, char** argv ) {
+ static StaticObserver StaticObserver;
+ doPreServerStartupInits();
+ UnitTest::runTests();
+ return Suite::run(argc, argv, "/tmp/unittest");
+}
diff --git a/src/mongo/dbtests/dbtests.h b/src/mongo/dbtests/dbtests.h
new file mode 100644
index 00000000000..dbaeea1d180
--- /dev/null
+++ b/src/mongo/dbtests/dbtests.h
@@ -0,0 +1,25 @@
+// dbtests.h : Test suite generator headers.
+//
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "framework.h"
+
+using namespace mongo;
+using namespace mongo::regression;
+using boost::shared_ptr;
+
diff --git a/src/mongo/dbtests/directclienttests.cpp b/src/mongo/dbtests/directclienttests.cpp
new file mode 100644
index 00000000000..860eb7e7e5c
--- /dev/null
+++ b/src/mongo/dbtests/directclienttests.cpp
@@ -0,0 +1,103 @@
+/** @file directclienttests.cpp
+*/
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../db/ops/query.h"
+#include "../db/db.h"
+#include "../db/instance.h"
+#include "../db/json.h"
+#include "../db/lasterror.h"
+#include "../db/ops/update.h"
+#include "../util/timer.h"
+#include "dbtests.h"
+
+namespace DirectClientTests {
+
+ class ClientBase {
+ public:
+        // NOTE: not bothering to back up the old error record.
+ ClientBase() { mongo::lastError.reset( new LastError() ); }
+ virtual ~ClientBase() { }
+ protected:
+ static bool error() {
+ return !_client.getPrevError().getField( "err" ).isNull();
+ }
+ DBDirectClient &client() const { return _client; }
+ private:
+ static DBDirectClient _client;
+ };
+ DBDirectClient ClientBase::_client;
+
+ const char *ns = "a.b";
+
+ class Capped : public ClientBase {
+ public:
+ virtual void run() {
+ for( int pass=0; pass < 3; pass++ ) {
+ client().createCollection(ns, 1024 * 1024, true, 999);
+ for( int j =0; j < pass*3; j++ )
+ client().insert(ns, BSON("x" << j));
+
+ // test truncation of a capped collection
+ if( pass ) {
+ BSONObj info;
+ BSONObj cmd = BSON( "captrunc" << "b" << "n" << 1 << "inc" << true );
+ //cout << cmd.toString() << endl;
+ bool ok = client().runCommand("a", cmd, info);
+ //cout << info.toString() << endl;
+ assert(ok);
+ }
+
+ assert( client().dropCollection(ns) );
+ }
+ }
+ };
+
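+    // A plain batch insert stops at the first duplicate-key error (code
+    // 11000), so only one document lands; with InsertOption_ContinueOnError
+    // the rest of the batch is still attempted and the non-duplicate _id
+    // is inserted as well.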
+ class InsertMany : ClientBase {
+ public:
+ virtual void run(){
+ vector<BSONObj> objs;
+ objs.push_back(BSON("_id" << 1));
+ objs.push_back(BSON("_id" << 1));
+ objs.push_back(BSON("_id" << 2));
+
+
+ client().dropCollection(ns);
+ client().insert(ns, objs);
+ ASSERT_EQUALS(client().getLastErrorDetailed()["code"].numberInt(), 11000);
+ ASSERT_EQUALS((int)client().count(ns), 1);
+
+ client().dropCollection(ns);
+ client().insert(ns, objs, InsertOption_ContinueOnError);
+ ASSERT_EQUALS(client().getLastErrorDetailed()["code"].numberInt(), 11000);
+ ASSERT_EQUALS((int)client().count(ns), 2);
+ }
+
+ };
+
+ class All : public Suite {
+ public:
+ All() : Suite( "directclient" ) {
+ }
+ void setupTests() {
+ add< Capped >();
+ add< InsertMany >();
+ }
+ } myall;
+}
diff --git a/src/mongo/dbtests/framework.cpp b/src/mongo/dbtests/framework.cpp
new file mode 100644
index 00000000000..95ed8b33668
--- /dev/null
+++ b/src/mongo/dbtests/framework.cpp
@@ -0,0 +1,446 @@
+// framework.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../util/version.h"
+#include <boost/program_options.hpp>
+
+#undef assert
+#define assert MONGO_assert
+
+#include "framework.h"
+#include "../util/file_allocator.h"
+#include "../db/dur.h"
+#include "../util/background.h"
+
+#ifndef _WIN32
+#include <cxxabi.h>
+#include <sys/file.h>
+#endif
+
+namespace po = boost::program_options;
+
+namespace mongo {
+
+ CmdLine cmdLine;
+
+ namespace regression {
+
+ map<string,Suite*> * mongo::regression::Suite::_suites = 0;
+
+ class Result {
+ public:
+ Result( string name ) : _name( name ) , _rc(0) , _tests(0) , _fails(0) , _asserts(0) {
+ }
+
+ string toString() {
+ stringstream ss;
+
+ char result[128];
+ sprintf(result, "%-20s | tests: %4d | fails: %4d | assert calls: %6d\n", _name.c_str(), _tests, _fails, _asserts);
+ ss << result;
+
+ for ( list<string>::iterator i=_messages.begin(); i!=_messages.end(); i++ ) {
+ ss << "\t" << *i << '\n';
+ }
+
+ return ss.str();
+ }
+
+ int rc() {
+ return _rc;
+ }
+
+ string _name;
+
+ int _rc;
+ int _tests;
+ int _fails;
+ int _asserts;
+ list<string> _messages;
+
+ static Result * cur;
+ };
+
+ Result * Result::cur = 0;
+
+ int minutesRunning = 0; // reset to 0 each time a new test starts
+ mutex minutesRunningMutex("minutesRunningMutex");
+ string currentTestName;
+
+ Result * Suite::run( const string& filter ) {
+ // set tlogLevel to -1 to suppress tlog() output in a test program
+ tlogLevel = -1;
+
+ log(1) << "\t about to setupTests" << endl;
+ setupTests();
+ log(1) << "\t done setupTests" << endl;
+
+ Result * r = new Result( _name );
+ Result::cur = r;
+
+ /* see note in SavedContext */
+ //writelock lk("");
+
+ for ( list<TestCase*>::iterator i=_tests.begin(); i!=_tests.end(); i++ ) {
+ TestCase * tc = *i;
+ if ( filter.size() && tc->getName().find( filter ) == string::npos ) {
+ log(1) << "\t skipping test: " << tc->getName() << " because doesn't match filter" << endl;
+ continue;
+ }
+
+ r->_tests++;
+
+ bool passes = false;
+
+ log(1) << "\t going to run test: " << tc->getName() << endl;
+
+ stringstream err;
+ err << tc->getName() << "\t";
+
+ {
+ scoped_lock lk(minutesRunningMutex);
+ minutesRunning = 0;
+ currentTestName = tc->getName();
+ }
+
+ try {
+ tc->run();
+ passes = true;
+ }
+ catch ( MyAssertionException * ae ) {
+ err << ae->ss.str();
+ delete( ae );
+ }
+ catch ( std::exception& e ) {
+ err << " exception: " << e.what();
+ }
+ catch ( int x ) {
+ err << " caught int : " << x << endl;
+ }
+ catch ( ... ) {
+ cerr << "unknown exception in test: " << tc->getName() << endl;
+ }
+
+ if ( ! passes ) {
+ string s = err.str();
+ log() << "FAIL: " << s << endl;
+ r->_fails++;
+ r->_messages.push_back( s );
+ }
+ }
+
+ if ( r->_fails )
+ r->_rc = 17;
+
+ log(1) << "\t DONE running tests" << endl;
+
+ return r;
+ }
+
+ void show_help_text(const char* name, po::options_description options) {
+ cout << "usage: " << name << " [options] [suite]..." << endl
+ << options << "suite: run the specified test suite(s) only" << endl;
+ }
+
+ class TestWatchDog : public BackgroundJob {
+ public:
+ virtual string name() const { return "TestWatchDog"; }
+ virtual void run(){
+
+ while (true) {
+ sleepsecs(60);
+
+ scoped_lock lk(minutesRunningMutex);
+                minutesRunning++; // reset to 0 (in Suite::run) when a new test starts
+
+ if (minutesRunning > 30){
+ log() << currentTestName << " has been running for more than 30 minutes. aborting." << endl;
+ ::abort();
+ }
+ else if (minutesRunning > 1){
+ warning() << currentTestName << " has been running for more than " << minutesRunning-1 << " minutes." << endl;
+ }
+ }
+ }
+ };
+
+ unsigned perfHist = 1;
+
+ int Suite::run( int argc , char** argv , string default_dbpath ) {
+ unsigned long long seed = time( 0 );
+ string dbpathSpec;
+
+ po::options_description shell_options("options");
+ po::options_description hidden_options("Hidden options");
+ po::options_description cmdline_options("Command line options");
+ po::positional_options_description positional_options;
+
+ shell_options.add_options()
+ ("help,h", "show this usage information")
+ ("dbpath", po::value<string>(&dbpathSpec)->default_value(default_dbpath),
+ "db data path for this test run. NOTE: the contents of this "
+ "directory will be overwritten if it already exists")
+ ("debug", "run tests with verbose output")
+ ("list,l", "list available test suites")
+ ("bigfiles", "use big datafiles instead of smallfiles which is the default")
+ ("filter,f" , po::value<string>() , "string substring filter on test name" )
+ ("verbose,v", "verbose")
+ ("dur", "enable journaling")
+ ("nodur", "disable journaling (currently the default)")
+ ("seed", po::value<unsigned long long>(&seed), "random number seed")
+ ("perfHist", po::value<unsigned>(&perfHist), "number of back runs of perf stats to display")
+ ;
+
+ hidden_options.add_options()
+ ("suites", po::value< vector<string> >(), "test suites to run")
+ ("nopreallocj", "disable journal prealloc")
+ ;
+
+ positional_options.add("suites", -1);
+
+ cmdline_options.add(shell_options).add(hidden_options);
+
+ po::variables_map params;
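+        // plain unix option style: disallow abbreviated (guessed) option
+        // names and sticky short options, but accept long names with a
+        // single leading dash (allow_long_disguise)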
+ int command_line_style = (((po::command_line_style::unix_style ^
+ po::command_line_style::allow_guessing) |
+ po::command_line_style::allow_long_disguise) ^
+ po::command_line_style::allow_sticky);
+
+ try {
+ po::store(po::command_line_parser(argc, argv).options(cmdline_options).
+ positional(positional_options).
+ style(command_line_style).run(), params);
+ po::notify(params);
+ }
+ catch (po::error &e) {
+ cout << "ERROR: " << e.what() << endl << endl;
+ show_help_text(argv[0], shell_options);
+ return EXIT_BADOPTIONS;
+ }
+
+ if (params.count("help")) {
+ show_help_text(argv[0], shell_options);
+ return EXIT_CLEAN;
+ }
+
+ bool nodur = false;
+ if( params.count("nodur") ) {
+ nodur = true;
+ cmdLine.dur = false;
+ }
+ if( params.count("dur") || cmdLine.dur ) {
+ cmdLine.dur = true;
+ }
+
+ if( params.count("nopreallocj") ) {
+ cmdLine.preallocj = false;
+ }
+
+ if (params.count("debug") || params.count("verbose") ) {
+ logLevel = 1;
+ }
+
+ if (params.count("list")) {
+ for ( map<string,Suite*>::iterator i = _suites->begin() ; i != _suites->end(); i++ )
+ cout << i->first << endl;
+ return 0;
+ }
+
+ boost::filesystem::path p(dbpathSpec);
+
+ /* remove the contents of the test directory if it exists. */
+ if (boost::filesystem::exists(p)) {
+ if (!boost::filesystem::is_directory(p)) {
+ cout << "ERROR: path \"" << p.string() << "\" is not a directory" << endl << endl;
+ show_help_text(argv[0], shell_options);
+ return EXIT_BADOPTIONS;
+ }
+ boost::filesystem::directory_iterator end_iter;
+ for (boost::filesystem::directory_iterator dir_iter(p);
+ dir_iter != end_iter; ++dir_iter) {
+ boost::filesystem::remove_all(*dir_iter);
+ }
+ }
+ else {
+ boost::filesystem::create_directory(p);
+ }
+
+ string dbpathString = p.native_directory_string();
+ dbpath = dbpathString.c_str();
+
+ cmdLine.prealloc = false;
+
+ // dbtest defaults to smallfiles
+ cmdLine.smallfiles = true;
+ if( params.count("bigfiles") ) {
+                cmdLine.smallfiles = false;
+ }
+
+ cmdLine.oplogSize = 10 * 1024 * 1024;
+ Client::initThread("testsuite");
+ acquirePathLock();
+
+ srand( (unsigned) seed );
+ printGitVersion();
+ printSysInfo();
+ DEV log() << "_DEBUG build" << endl;
+ if( sizeof(void*)==4 )
+ log() << "32bit" << endl;
+ log() << "random seed: " << seed << endl;
+
+ if( time(0) % 3 == 0 && !nodur ) {
+ cmdLine.dur = true;
+ log() << "****************" << endl;
+ log() << "running with journaling enabled to test that. dbtests will do this occasionally even if --dur is not specified." << endl;
+ log() << "****************" << endl;
+ }
+
+ FileAllocator::get()->start();
+
+ vector<string> suites;
+ if (params.count("suites")) {
+ suites = params["suites"].as< vector<string> >();
+ }
+
+ string filter = "";
+ if ( params.count( "filter" ) ) {
+ filter = params["filter"].as<string>();
+ }
+
+ dur::startup();
+
+ if( debug && cmdLine.dur ) {
+ log() << "_DEBUG: automatically enabling cmdLine.durOptions=8 (DurParanoid)" << endl;
+            // this was previously commented out, possibly because it was too slow:
+ cmdLine.durOptions |= 8;
+ }
+
+ TestWatchDog twd;
+ twd.go();
+
+ int ret = run(suites,filter);
+
+#if !defined(_WIN32) && !defined(__sunos__)
+ flock( lockFile, LOCK_UN );
+#endif
+
+ cc().shutdown();
+ dbexit( (ExitCode)ret ); // so everything shuts down cleanly
+ return ret;
+ }
+
+ int Suite::run( vector<string> suites , const string& filter ) {
+ for ( unsigned int i = 0; i < suites.size(); i++ ) {
+ if ( _suites->find( suites[i] ) == _suites->end() ) {
+ cout << "invalid test [" << suites[i] << "], use --list to see valid names" << endl;
+ return -1;
+ }
+ }
+
+ list<string> torun(suites.begin(), suites.end());
+
+ if ( torun.size() == 0 )
+ for ( map<string,Suite*>::iterator i=_suites->begin() ; i!=_suites->end(); i++ )
+ torun.push_back( i->first );
+
+ list<Result*> results;
+
+ for ( list<string>::iterator i=torun.begin(); i!=torun.end(); i++ ) {
+ string name = *i;
+ Suite * s = (*_suites)[name];
+ assert( s );
+
+ log() << "going to run suite: " << name << endl;
+ results.push_back( s->run( filter ) );
+ }
+
+ Logstream::get().flush();
+
+ cout << "**************************************************" << endl;
+
+ int rc = 0;
+
+ int tests = 0;
+ int fails = 0;
+ int asserts = 0;
+
+ for ( list<Result*>::iterator i=results.begin(); i!=results.end(); i++ ) {
+ Result * r = *i;
+ cout << r->toString();
+ if ( abs( r->rc() ) > abs( rc ) )
+ rc = r->rc();
+
+ tests += r->_tests;
+ fails += r->_fails;
+ asserts += r->_asserts;
+ }
+
+ Result totals ("TOTALS");
+ totals._tests = tests;
+ totals._fails = fails;
+ totals._asserts = asserts;
+
+ cout << totals.toString(); // includes endl
+
+ return rc;
+ }
+
+ void Suite::registerSuite( string name , Suite * s ) {
+ if ( ! _suites )
+ _suites = new map<string,Suite*>();
+ Suite*& m = (*_suites)[name];
+ uassert( 10162 , "already have suite with that name" , ! m );
+ m = s;
+ }
+
+ void assert_pass() {
+ Result::cur->_asserts++;
+ }
+
+ void assert_fail( const char * exp , const char * file , unsigned line ) {
+ Result::cur->_asserts++;
+
+ MyAssertionException * e = new MyAssertionException();
+ e->ss << "ASSERT FAILED! " << file << ":" << line << endl;
+ throw e;
+ }
+
+ void fail( const char * exp , const char * file , unsigned line ) {
+ assert(0);
+ }
+
+ MyAssertionException * MyAsserts::getBase() {
+ MyAssertionException * e = new MyAssertionException();
+ e->ss << _file << ":" << _line << " " << _aexp << " != " << _bexp << " ";
+ return e;
+ }
+
+ void MyAsserts::printLocation() {
+ log() << _file << ":" << _line << " " << _aexp << " != " << _bexp << " ";
+ }
+
+ void MyAsserts::_gotAssert() {
+ Result::cur->_asserts++;
+ }
+
+ }
+
+ void setupSignals( bool inFork ) {}
+
+}
diff --git a/src/mongo/dbtests/framework.h b/src/mongo/dbtests/framework.h
new file mode 100644
index 00000000000..adf610a05eb
--- /dev/null
+++ b/src/mongo/dbtests/framework.h
@@ -0,0 +1,199 @@
+// framework.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+
+ simple portable regression system
+ */
+
+#include "../pch.h"
+
+#define ASSERT_THROWS(a,b) \
+ try { \
+ a; \
+ mongo::regression::assert_fail( #a , __FILE__ , __LINE__ ); \
+ } catch ( b& ){ \
+ mongo::regression::assert_pass(); \
+ }
+
+
+
+#define ASSERT_EQUALS(a,b) (mongo::regression::MyAsserts( #a , #b , __FILE__ , __LINE__ ) ).ae( (a) , (b) )
+#define ASSERT_NOT_EQUALS(a,b) (mongo::regression::MyAsserts( #a , #b , __FILE__ , __LINE__ ) ).nae( (a) , (b) )
+
+#define ASSERT(x) (void)( (!(!(x))) ? mongo::regression::assert_pass() : mongo::regression::assert_fail( #x , __FILE__ , __LINE__ ) )
+#define FAIL(x) mongo::regression::fail( #x , __FILE__ , __LINE__ )
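+
+// usage sketch for the macros above (illustrative only; isReady, computeAnswer
+// and parseConfig are hypothetical names):
+//   ASSERT( isReady() );
+//   ASSERT_EQUALS( computeAnswer(), 42 );
+//   ASSERT_THROWS( parseConfig( "bad input" ), UserException );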
+
+#include "../db/instance.h"
+
+namespace mongo {
+
+ namespace regression {
+
+ class Result;
+
+ class TestCase {
+ public:
+ virtual ~TestCase() {}
+ virtual void run() = 0;
+ virtual string getName() = 0;
+ };
+
+ template< class T >
+ class TestHolderBase : public TestCase {
+ public:
+ TestHolderBase() {}
+ virtual ~TestHolderBase() {}
+ virtual void run() {
+ auto_ptr<T> t;
+ t.reset( create() );
+ t->run();
+ }
+ virtual T * create() = 0;
+ virtual string getName() {
+ return demangleName( typeid(T) );
+ }
+ };
+
+ template< class T >
+ class TestHolder0 : public TestHolderBase<T> {
+ public:
+ virtual T * create() {
+ return new T();
+ }
+ };
+
+ template< class T , typename A >
+ class TestHolder1 : public TestHolderBase<T> {
+ public:
+ TestHolder1( const A& a ) : _a(a) {}
+ virtual T * create() {
+ return new T( _a );
+ }
+ const A& _a;
+ };
+
+ class Suite {
+ public:
+ Suite( string name ) : _name( name ) {
+ registerSuite( name , this );
+ _ran = 0;
+ }
+
+ virtual ~Suite() {
+ if ( _ran ) {
+ DBDirectClient c;
+ c.dropDatabase( "unittests" );
+ }
+ }
+
+ template<class T>
+ void add() {
+ _tests.push_back( new TestHolder0<T>() );
+ }
+
+ template<class T , typename A >
+ void add( const A& a ) {
+ _tests.push_back( new TestHolder1<T,A>(a) );
+ }
+
+ Result * run( const string& filter );
+
+ static int run( vector<string> suites , const string& filter );
+ static int run( int argc , char ** argv , string default_dbpath );
+
+
+ protected:
+ virtual void setupTests() = 0;
+
+ private:
+ string _name;
+ list<TestCase*> _tests;
+ bool _ran;
+
+ static map<string,Suite*> * _suites;
+
+ void registerSuite( string name , Suite * s );
+ };
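+
+ // a concrete suite subclasses Suite, registers its cases in setupTests(), and
+ // declares a static instance so the constructor self-registers. a minimal
+ // sketch (MySuite and MyTestCase are illustrative names only):
+ //
+ //   class MySuite : public Suite {
+ //   public:
+ //       MySuite() : Suite( "mysuite" ) {}
+ //       void setupTests() { add< MyTestCase >(); }
+ //   } mySuiteInstance;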
+
+ void assert_pass();
+ void assert_fail( const char * exp , const char * file , unsigned line );
+ void fail( const char * exp , const char * file , unsigned line );
+
+ class MyAssertionException : boost::noncopyable {
+ public:
+ MyAssertionException() {
+ ss << "assertion: ";
+ }
+ stringstream ss;
+ };
+
+
+
+ class MyAsserts {
+ public:
+ MyAsserts( const char * aexp , const char * bexp , const char * file , unsigned line )
+ : _aexp( aexp ) , _bexp( bexp ) , _file( file ) , _line( line ) {
+
+ }
+
+ template<typename A,typename B>
+ void ae( A a , B b ) {
+ _gotAssert();
+ if ( a == b )
+ return;
+
+ printLocation();
+
+ MyAssertionException * e = getBase();
+ e->ss << a << " != " << b << endl;
+ log() << e->ss.str() << endl;
+ throw e;
+ }
+
+ template<typename A,typename B>
+ void nae( A a , B b ) {
+ _gotAssert();
+ if ( a != b )
+ return;
+
+ printLocation();
+
+ MyAssertionException * e = getBase();
+ e->ss << a << " == " << b << endl;
+ log() << e->ss.str() << endl;
+ throw e;
+ }
+
+
+ void printLocation();
+
+ private:
+
+ void _gotAssert();
+
+ MyAssertionException * getBase();
+
+ string _aexp;
+ string _bexp;
+ string _file;
+ unsigned _line;
+ };
+
+ }
+}
diff --git a/src/mongo/dbtests/histogram_test.cpp b/src/mongo/dbtests/histogram_test.cpp
new file mode 100644
index 00000000000..e9cbb5bdf25
--- /dev/null
+++ b/src/mongo/dbtests/histogram_test.cpp
@@ -0,0 +1,94 @@
+// histogramtests.cpp : histogram.{h,cpp} unit tests
+
+/**
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "../pch.h"
+
+#include "dbtests.h"
+#include "../util/histogram.h"
+
+namespace mongo {
+
+ using mongo::Histogram;
+
+ class BoundariesInit {
+ public:
+ void run() {
+ Histogram::Options opts;
+ opts.numBuckets = 3;
+ opts.bucketSize = 10;
+ Histogram h( opts );
+
+ ASSERT_EQUALS( h.getBucketsNum(), 3u );
+
+ ASSERT_EQUALS( h.getCount( 0 ), 0u );
+ ASSERT_EQUALS( h.getCount( 1 ), 0u );
+ ASSERT_EQUALS( h.getCount( 2 ), 0u );
+
+ ASSERT_EQUALS( h.getBoundary( 0 ), 10u );
+ ASSERT_EQUALS( h.getBoundary( 1 ), 20u );
+ ASSERT_EQUALS( h.getBoundary( 2 ), numeric_limits<uint32_t>::max() );
+ }
+ };
+
+ class BoundariesExponential {
+ public:
+ void run() {
+ Histogram::Options opts;
+ opts.numBuckets = 4;
+ opts.bucketSize = 125;
+ opts.exponential = true;
+ Histogram h( opts );
+
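+ // in exponential mode each successive bucket boundary doubles: 125, 250, 500, then max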
+ ASSERT_EQUALS( h.getBoundary( 0 ), 125u );
+ ASSERT_EQUALS( h.getBoundary( 1 ), 250u );
+ ASSERT_EQUALS( h.getBoundary( 2 ), 500u );
+ ASSERT_EQUALS( h.getBoundary( 3 ), numeric_limits<uint32_t>::max() );
+ }
+ };
+
+ class BoundariesFind {
+ public:
+ void run() {
+ Histogram::Options opts;
+ opts.numBuckets = 3;
+ opts.bucketSize = 10;
+ Histogram h( opts );
+
+ h.insert( 10 ); // end of first bucket
+ h.insert( 15 ); // second bucket
+ h.insert( 18 ); // second bucket
+
+ ASSERT_EQUALS( h.getCount( 0 ), 1u );
+ ASSERT_EQUALS( h.getCount( 1 ), 2u );
+ ASSERT_EQUALS( h.getCount( 2 ), 0u );
+ }
+ };
+
+ class HistogramSuite : public Suite {
+ public:
+ HistogramSuite() : Suite( "histogram" ) {}
+
+ void setupTests() {
+ add< BoundariesInit >();
+ add< BoundariesExponential >();
+ add< BoundariesFind >();
+ // TODO: complete the test suite
+ }
+ } histogramSuite;
+
+} // namespace mongo
diff --git a/src/mongo/dbtests/jsobjtests.cpp b/src/mongo/dbtests/jsobjtests.cpp
new file mode 100644
index 00000000000..709c013f6d8
--- /dev/null
+++ b/src/mongo/dbtests/jsobjtests.cpp
@@ -0,0 +1,2208 @@
+// jsobjtests.cpp - Tests for jsobj.{h,cpp} code
+//
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../bson/util/builder.h"
+#include "../db/jsobj.h"
+#include "../db/jsobjmanipulator.h"
+#include "../db/json.h"
+#include "../db/repl.h"
+#include "../db/extsort.h"
+#include "dbtests.h"
+#include "../util/mongoutils/checksum.h"
+#include "../db/key.h"
+#include "../db/btree.h"
+
+namespace JsobjTests {
+
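+ // pick one of the two index interface versions pseudo-randomly per process run,
+ // so both implementations get exercised over repeated test invocations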
+ IndexInterface& indexInterfaceForTheseTests = (time(0)%2) ? *IndexDetails::iis[0] : *IndexDetails::iis[1];
+
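+ // round-trips o through KeyV1 and checks that key comparison and equality agree
+ // with plain BSONObj comparison against the previously tested object (held in
+ // the function-local statics below)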
+ void keyTest(const BSONObj& o, bool mustBeCompact = false) {
+ static KeyV1Owned *kLast;
+ static BSONObj last;
+
+ KeyV1Owned *key = new KeyV1Owned(o);
+ KeyV1Owned& k = *key;
+
+ ASSERT( !mustBeCompact || k.isCompactFormat() );
+
+ BSONObj x = k.toBson();
+ int res = o.woCompare(x, BSONObj(), /*considerfieldname*/false);
+ if( res ) {
+ cout << o.toString() << endl;
+ k.toBson();
+ cout << x.toString() << endl;
+ o.woCompare(x, BSONObj(), /*considerfieldname*/false);
+ ASSERT( res == 0 );
+ }
+ ASSERT( k.woEqual(k) );
+ ASSERT( !k.isCompactFormat() || k.dataSize() < o.objsize() );
+
+ {
+ // check BSONObj::equal. this part is not a KeyV1 test.
+ int res = o.woCompare(last);
+ ASSERT( (res==0) == o.equal(last) );
+ }
+
+ if( kLast ) {
+ int r1 = o.woCompare(last, BSONObj(), false);
+ int r2 = k.woCompare(*kLast, Ordering::make(BSONObj()));
+ bool ok = (r1<0 && r2<0) || (r1>0&&r2>0) || r1==r2;
+ if( !ok ) {
+ cout << "r1r2 " << r1 << ' ' << r2 << endl;
+ cout << "o:" << o.toString() << endl;
+ cout << "last:" << last.toString() << endl;
+ cout << "k:" << k.toString() << endl;
+ cout << "kLast:" << kLast->toString() << endl;
+ int r3 = k.woCompare(*kLast, Ordering::make(BSONObj()));
+ cout << r3 << endl;
+ }
+ ASSERT(ok);
+ if( k.isCompactFormat() && kLast->isCompactFormat() ) { // only check when both keys are compact, since plain bson woEqual was broken (as of May 2011)
+ if( k.woEqual(*kLast) != (r2 == 0) ) { // check woEqual matches
+ cout << r2 << endl;
+ cout << k.toString() << endl;
+ cout << kLast->toString() << endl;
+ k.woEqual(*kLast);
+ ASSERT(false);
+ }
+ }
+ }
+
+ delete kLast;
+ kLast = key;
+ last = o.getOwned();
+ }
+
+ class BufBuilderBasic {
+ public:
+ void run() {
+ {
+ BufBuilder b( 0 );
+ b.appendStr( "foo" );
+ ASSERT_EQUALS( 4, b.len() );
+ ASSERT( strcmp( "foo", b.buf() ) == 0 );
+ }
+ {
+ mongo::StackBufBuilder b;
+ b.appendStr( "foo" );
+ ASSERT_EQUALS( 4, b.len() );
+ ASSERT( strcmp( "foo", b.buf() ) == 0 );
+ }
+ }
+ };
+
+ class BSONElementBasic {
+ public:
+ void run() {
+ ASSERT_EQUALS( 1, BSONElement().size() );
+
+ BSONObj x;
+ ASSERT_EQUALS( 1, x.firstElement().size() );
+ }
+ };
+
+ namespace BSONObjTests {
+ class Create {
+ public:
+ void run() {
+ BSONObj b;
+ ASSERT_EQUALS( 0, b.nFields() );
+ }
+ };
+
+ class Base {
+ protected:
+ static BSONObj basic( const char *name, int val ) {
+ BSONObjBuilder b;
+ b.append( name, val );
+ return b.obj();
+ }
+ static BSONObj basic( const char *name, vector< int > val ) {
+ BSONObjBuilder b;
+ b.append( name, val );
+ return b.obj();
+ }
+ template< class T >
+ static BSONObj basic( const char *name, T val ) {
+ BSONObjBuilder b;
+ b.append( name, val );
+ return b.obj();
+ }
+ };
+
+ class WoCompareBasic : public Base {
+ public:
+ void run() {
+ ASSERT( basic( "a", 1 ).woCompare( basic( "a", 1 ) ) == 0 );
+ ASSERT( basic( "a", 2 ).woCompare( basic( "a", 1 ) ) > 0 );
+ ASSERT( basic( "a", 1 ).woCompare( basic( "a", 2 ) ) < 0 );
+ // field name comparison
+ ASSERT( basic( "a", 1 ).woCompare( basic( "b", 1 ) ) < 0 );
+ }
+ };
+
+ class NumericCompareBasic : public Base {
+ public:
+ void run() {
+ ASSERT( basic( "a", 1 ).woCompare( basic( "a", 1.0 ) ) == 0 );
+ }
+ };
+
+ class WoCompareEmbeddedObject : public Base {
+ public:
+ void run() {
+ ASSERT( basic( "a", basic( "b", 1 ) ).woCompare
+ ( basic( "a", basic( "b", 1.0 ) ) ) == 0 );
+ ASSERT( basic( "a", basic( "b", 1 ) ).woCompare
+ ( basic( "a", basic( "b", 2 ) ) ) < 0 );
+ }
+ };
+
+ class WoCompareEmbeddedArray : public Base {
+ public:
+ void run() {
+ vector< int > i;
+ i.push_back( 1 );
+ i.push_back( 2 );
+ vector< double > d;
+ d.push_back( 1 );
+ d.push_back( 2 );
+ ASSERT( basic( "a", i ).woCompare( basic( "a", d ) ) == 0 );
+
+ vector< int > j;
+ j.push_back( 1 );
+ j.push_back( 3 );
+ ASSERT( basic( "a", i ).woCompare( basic( "a", j ) ) < 0 );
+ }
+ };
+
+ class WoCompareOrdered : public Base {
+ public:
+ void run() {
+ ASSERT( basic( "a", 1 ).woCompare( basic( "a", 1 ), basic( "a", 1 ) ) == 0 );
+ ASSERT( basic( "a", 2 ).woCompare( basic( "a", 1 ), basic( "a", 1 ) ) > 0 );
+ ASSERT( basic( "a", 1 ).woCompare( basic( "a", 2 ), basic( "a", 1 ) ) < 0 );
+ ASSERT( basic( "a", 1 ).woCompare( basic( "a", 1 ), basic( "a", -1 ) ) == 0 );
+ ASSERT( basic( "a", 2 ).woCompare( basic( "a", 1 ), basic( "a", -1 ) ) < 0 );
+ ASSERT( basic( "a", 1 ).woCompare( basic( "a", 2 ), basic( "a", -1 ) ) > 0 );
+ }
+ };
+
+ class WoCompareDifferentLength : public Base {
+ public:
+ void run() {
+ ASSERT( BSON( "a" << 1 ).woCompare( BSON( "a" << 1 << "b" << 1 ) ) < 0 );
+ ASSERT( BSON( "a" << 1 << "b" << 1 ).woCompare( BSON( "a" << 1 ) ) > 0 );
+ }
+ };
+
+ class WoSortOrder : public Base {
+ public:
+ void run() {
+ ASSERT( BSON( "a" << 1 ).woSortOrder( BSON( "a" << 2 ), BSON( "b" << 1 << "a" << 1 ) ) < 0 );
+ ASSERT( fromjson( "{a:null}" ).woSortOrder( BSON( "b" << 1 ), BSON( "a" << 1 ) ) == 0 );
+ }
+ };
+
+ class MultiKeySortOrder : public Base {
+ public:
+ void run() {
+ ASSERT( BSON( "x" << "a" ).woCompare( BSON( "x" << "b" ) ) < 0 );
+ ASSERT( BSON( "x" << "b" ).woCompare( BSON( "x" << "a" ) ) > 0 );
+
+ ASSERT( BSON( "x" << "a" << "y" << "a" ).woCompare( BSON( "x" << "a" << "y" << "b" ) ) < 0 );
+ ASSERT( BSON( "x" << "a" << "y" << "a" ).woCompare( BSON( "x" << "b" << "y" << "a" ) ) < 0 );
+ ASSERT( BSON( "x" << "a" << "y" << "a" ).woCompare( BSON( "x" << "b" ) ) < 0 );
+
+ ASSERT( BSON( "x" << "c" ).woCompare( BSON( "x" << "b" << "y" << "h" ) ) > 0 );
+ ASSERT( BSON( "x" << "b" << "y" << "b" ).woCompare( BSON( "x" << "c" ) ) < 0 );
+
+ BSONObj key = BSON( "x" << 1 << "y" << 1 );
+
+ ASSERT( BSON( "x" << "c" ).woSortOrder( BSON( "x" << "b" << "y" << "h" ) , key ) > 0 );
+ ASSERT( BSON( "x" << "b" << "y" << "b" ).woCompare( BSON( "x" << "c" ) , key ) < 0 );
+
+ key = BSON( "" << 1 << "" << 1 );
+
+ ASSERT( BSON( "" << "c" ).woSortOrder( BSON( "" << "b" << "" << "h" ) , key ) > 0 );
+ ASSERT( BSON( "" << "b" << "" << "b" ).woCompare( BSON( "" << "c" ) , key ) < 0 );
+
+ {
+ // test a big key
+ string x(2000, 'z');
+ BSONObj o = BSON( "q" << x );
+ keyTest(o, false);
+ }
+ {
+ string y(200, 'w');
+ BSONObjBuilder b;
+ for( int i = 0; i < 10; i++ ) {
+ b.append("x", y);
+ }
+ keyTest(b.obj(), true);
+ }
+ {
+ double nan = numeric_limits<double>::quiet_NaN();
+ BSONObj o = BSON( "y" << nan );
+ keyTest(o);
+ }
+
+ {
+ BSONObjBuilder b;
+ b.append( "" , "c" );
+ b.appendNull( "" );
+ BSONObj o = b.obj();
+ keyTest(o);
+ ASSERT( o.woSortOrder( BSON( "" << "b" << "" << "h" ) , key ) > 0 );
+ ASSERT( BSON( "" << "b" << "" << "h" ).woSortOrder( o , key ) < 0 );
+
+ }
+
+ ASSERT( BSON( "" << "a" ).woCompare( BSON( "" << "a" << "" << "c" ) ) < 0 );
+ {
+ BSONObjBuilder b;
+ b.append( "" , "a" );
+ b.appendNull( "" );
+ ASSERT( b.obj().woCompare( BSON( "" << "a" << "" << "c" ) ) < 0 ); // SERVER-282
+ }
+
+ }
+ };
+
+ class TimestampTest : public Base {
+ public:
+ void run() {
+ Client *c = currentClient.get();
+ if( c == 0 ) {
+ Client::initThread("pretouchN");
+ c = &cc();
+ }
+ writelock lk(""); // for initTimestamp
+
+ BSONObjBuilder b;
+ b.appendTimestamp( "a" );
+ BSONObj o = b.done();
+ o.toString();
+ ASSERT( o.valid() );
+ ASSERT_EQUALS( Timestamp, o.getField( "a" ).type() );
+ BSONObjIterator i( o );
+ ASSERT( i.moreWithEOO() );
+ ASSERT( i.more() );
+
+ BSONElement e = i.next();
+ ASSERT_EQUALS( Timestamp, e.type() );
+ ASSERT( i.moreWithEOO() );
+ ASSERT( ! i.more() );
+
+ e = i.next();
+ ASSERT( e.eoo() );
+
+ OpTime before = OpTime::now();
+ BSONElementManipulator( o.firstElement() ).initTimestamp();
+ OpTime after = OpTime::now();
+
+ OpTime test = OpTime( o.firstElement().date() );
+ ASSERT( before < test && test < after );
+
+ BSONElementManipulator( o.firstElement() ).initTimestamp();
+ test = OpTime( o.firstElement().date() );
+ ASSERT( before < test && test < after );
+
+ OpTime x(123,456);
+ ASSERT_EQUALS( 528280977864LL , x.asLL() );
+ }
+ };
+
+ class Nan : public Base {
+ public:
+ void run() {
+ double inf = numeric_limits< double >::infinity();
+ double nan = numeric_limits< double >::quiet_NaN();
+ double nan2 = numeric_limits< double >::signaling_NaN();
+ ASSERT( isNaN(nan) );
+ ASSERT( isNaN(nan2) );
+ ASSERT( !isNaN(inf) );
+
+ ASSERT( BSON( "a" << inf ).woCompare( BSON( "a" << inf ) ) == 0 );
+ ASSERT( BSON( "a" << inf ).woCompare( BSON( "a" << 1 ) ) > 0 );
+ ASSERT( BSON( "a" << 1 ).woCompare( BSON( "a" << inf ) ) < 0 );
+
+ ASSERT( BSON( "a" << nan ).woCompare( BSON( "a" << nan ) ) == 0 );
+ ASSERT( BSON( "a" << nan ).woCompare( BSON( "a" << 1 ) ) < 0 );
+
+ ASSERT( BSON( "a" << nan ).woCompare( BSON( "a" << 5000000000LL ) ) < 0 );
+
+ {
+ KeyV1Owned a( BSON( "a" << nan ) );
+ KeyV1Owned b( BSON( "a" << 1 ) );
+ Ordering o = Ordering::make(BSON("a"<<1));
+ ASSERT( a.woCompare(b, o) < 0 );
+ }
+
+ ASSERT( BSON( "a" << 1 ).woCompare( BSON( "a" << nan ) ) > 0 );
+
+ ASSERT( BSON( "a" << nan2 ).woCompare( BSON( "a" << nan2 ) ) == 0 );
+ ASSERT( BSON( "a" << nan2 ).woCompare( BSON( "a" << 1 ) ) < 0 );
+ ASSERT( BSON( "a" << 1 ).woCompare( BSON( "a" << nan2 ) ) > 0 );
+
+ ASSERT( BSON( "a" << inf ).woCompare( BSON( "a" << nan ) ) > 0 );
+ ASSERT( BSON( "a" << inf ).woCompare( BSON( "a" << nan2 ) ) > 0 );
+ ASSERT( BSON( "a" << nan ).woCompare( BSON( "a" << nan2 ) ) == 0 );
+ }
+ };
+
+ class AsTempObj {
+ public:
+ void run() {
+ {
+ BSONObjBuilder bb;
+ bb << "a" << 1;
+ BSONObj tmp = bb.asTempObj();
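+ // layout: 4-byte object length + (1 type byte + "a\0" + 4-byte int) + 1 EOO byte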
+ ASSERT(tmp.objsize() == 4+(1+2+4)+1);
+ ASSERT(tmp.valid());
+ ASSERT(tmp.hasField("a"));
+ ASSERT(!tmp.hasField("b"));
+ ASSERT(tmp == BSON("a" << 1));
+
+ bb << "b" << 2;
+ BSONObj obj = bb.obj();
+ ASSERT_EQUALS(obj.objsize() , 4+(1+2+4)+(1+2+4)+1);
+ ASSERT(obj.valid());
+ ASSERT(obj.hasField("a"));
+ ASSERT(obj.hasField("b"));
+ ASSERT(obj == BSON("a" << 1 << "b" << 2));
+ }
+ {
+ BSONObjBuilder bb;
+ bb << "a" << GT << 1;
+ BSONObj tmp = bb.asTempObj();
+ ASSERT(tmp.objsize() == 4+(1+2+(4+1+4+4+1))+1);
+ ASSERT(tmp.valid());
+ ASSERT(tmp.hasField("a"));
+ ASSERT(!tmp.hasField("b"));
+ ASSERT(tmp == BSON("a" << BSON("$gt" << 1)));
+
+ bb << "b" << LT << 2;
+ BSONObj obj = bb.obj();
+ ASSERT(obj.objsize() == 4+(1+2+(4+1+4+4+1))+(1+2+(4+1+4+4+1))+1);
+ ASSERT(obj.valid());
+ ASSERT(obj.hasField("a"));
+ ASSERT(obj.hasField("b"));
+ ASSERT(obj == BSON("a" << BSON("$gt" << 1)
+ << "b" << BSON("$lt" << 2)));
+ }
+ {
+ BSONObjBuilder bb(32);
+ bb << "a" << 1;
+ BSONObj tmp = bb.asTempObj();
+ ASSERT(tmp.objsize() == 4+(1+2+4)+1);
+ ASSERT(tmp.valid());
+ ASSERT(tmp.hasField("a"));
+ ASSERT(!tmp.hasField("b"));
+ ASSERT(tmp == BSON("a" << 1));
+
+ //force a realloc
+ BSONArrayBuilder arr;
+ for (int i=0; i < 10000; i++) {
+ arr << i;
+ }
+ bb << "b" << arr.arr();
+ BSONObj obj = bb.obj();
+ ASSERT(obj.valid());
+ ASSERT(obj.hasField("a"));
+ ASSERT(obj.hasField("b"));
+ }
+ }
+ };
+
+ struct AppendIntOrLL {
+ void run() {
+ const long long billion = 1000*1000*1000;
+
+ long long n = 0x3333111122224444LL;
+ {
+ double d = (double) n;
+ BSONObj a = BSON( "x" << n );
+ BSONObj b = BSON( "x" << d );
+
+ long long back = (long long) d;
+ // disabled pending SERVER-3717:
+ // int res = a.woCompare(b);
+
+ ASSERT( n > back );
+ //ASSERT( res > 0 ); // SERVER-3717
+
+ keyTest(a, false);
+
+ KeyV1Owned A(a);
+ KeyV1Owned B(b);
+ // disabled pending SERVER-3717:
+ // int res2 = A.woCompare(B, Ordering::make(BSONObj()));
+ // ASSERT( res2 > 0 ); // SERVER-3717
+
+ // fixing requires an index v# change.
+
+ cout << "todo fix SERVER-3717 and uncomment test in AppendIntOrLL" << endl;
+
+ n++;
+ }
+
+ {
+ BSONObjBuilder b;
+ b.appendIntOrLL("L4", -4*billion);
+ keyTest(b.obj());
+ keyTest( BSON("" << billion) );
+ }
+
+ BSONObjBuilder b;
+ b.appendIntOrLL("i1", 1);
+ b.appendIntOrLL("i2", -1);
+ b.appendIntOrLL("i3", 1*billion);
+ b.appendIntOrLL("i4", -1*billion);
+
+ b.appendIntOrLL("L1", 2*billion);
+ b.appendIntOrLL("L2", -2*billion);
+ b.appendIntOrLL("L3", 4*billion);
+ b.appendIntOrLL("L4", -4*billion);
+ b.appendIntOrLL("L5", 16*billion);
+ b.appendIntOrLL("L6", -16*billion);
+
+ BSONObj o = b.obj();
+ keyTest(o);
+
+ ASSERT(o["i1"].type() == NumberInt);
+ ASSERT(o["i1"].number() == 1);
+ ASSERT(o["i2"].type() == NumberInt);
+ ASSERT(o["i2"].number() == -1);
+ ASSERT(o["i3"].type() == NumberInt);
+ ASSERT(o["i3"].number() == 1*billion);
+ ASSERT(o["i4"].type() == NumberInt);
+ ASSERT(o["i4"].number() == -1*billion);
+
+ ASSERT(o["L1"].type() == NumberLong);
+ ASSERT(o["L1"].number() == 2*billion);
+ ASSERT(o["L2"].type() == NumberLong);
+ ASSERT(o["L2"].number() == -2*billion);
+ ASSERT(o["L3"].type() == NumberLong);
+ ASSERT(o["L3"].number() == 4*billion);
+ ASSERT(o["L4"].type() == NumberLong);
+ ASSERT(o["L4"].number() == -4*billion);
+ ASSERT(o["L5"].type() == NumberLong);
+ ASSERT(o["L5"].number() == 16*billion);
+ ASSERT(o["L6"].type() == NumberLong);
+ ASSERT(o["L6"].number() == -16*billion);
+ }
+ };
+
+ struct AppendNumber {
+ void run() {
+ BSONObjBuilder b;
+ b.appendNumber( "a" , 5 );
+ b.appendNumber( "b" , 5.5 );
+ b.appendNumber( "c" , (1024LL*1024*1024)-1 );
+ b.appendNumber( "d" , (1024LL*1024*1024*1024)-1 );
+ b.appendNumber( "e" , 1024LL*1024*1024*1024*1024*1024 );
+
+ BSONObj o = b.obj();
+ keyTest(o);
+
+ ASSERT( o["a"].type() == NumberInt );
+ ASSERT( o["b"].type() == NumberDouble );
+ ASSERT( o["c"].type() == NumberInt );
+ ASSERT( o["d"].type() == NumberDouble );
+ ASSERT( o["e"].type() == NumberLong );
+
+ }
+ };
+
+ class ToStringArray {
+ public:
+ void run() {
+ string spec = "{ a: [ \"a\", \"b\" ] }";
+ ASSERT_EQUALS( spec, fromjson( spec ).toString() );
+
+ BSONObj x = BSON( "a" << "astring" << "b" << "str" );
+ keyTest(x);
+ keyTest(x);
+ BSONObj y = BSON( "a" << "astring" << "b" << "stra" );
+ keyTest(y);
+ y = BSON( "a" << "" );
+ keyTest(y);
+
+ keyTest( BSON("abc" << true ) );
+ keyTest( BSON("abc" << false ) );
+ keyTest( BSON("abc" << false << "b" << true ) );
+
+ Date_t now = jsTime();
+ keyTest( BSON("" << now << "" << 3 << "" << jstNULL << "" << true) );
+ keyTest( BSON("" << now << "" << 3 << "" << BSONObj() << "" << true) );
+
+ {
+ {
+ // check signed dates with new key format
+ KeyV1Owned a( BSONObjBuilder().appendDate("", -50).obj() );
+ KeyV1Owned b( BSONObjBuilder().appendDate("", 50).obj() );
+ ASSERT( a.woCompare(b, Ordering::make(BSONObj())) < 0 );
+ }
+ {
+ // backward compatibility
+ KeyBson a( BSONObjBuilder().appendDate("", -50).obj() );
+ KeyBson b( BSONObjBuilder().appendDate("", 50).obj() );
+ ASSERT( a.woCompare(b, Ordering::make(BSONObj())) > 0 );
+ }
+ {
+ // this key cannot be stored in compact format:
+ BSONObj uc1 = BSONObjBuilder().appendDate("", -50).appendCode("", "abc").obj();
+ BSONObj uc2 = BSONObjBuilder().appendDate("", 55).appendCode("", "abc").obj();
+ ASSERT( uc1.woCompare(uc2, Ordering::make(BSONObj())) < 0 );
+ {
+ KeyV1Owned a(uc1);
+ KeyV1Owned b(uc2);
+ ASSERT( !a.isCompactFormat() );
+ ASSERT( a.woCompare(b, Ordering::make(BSONObj())) < 0 );
+ }
+ {
+ KeyBson a(uc1);
+ KeyBson b(uc2);
+ ASSERT( !a.isCompactFormat() );
+ ASSERT( a.woCompare(b, Ordering::make(BSONObj())) > 0 );
+ }
+ }
+ }
+
+ {
+ BSONObjBuilder b;
+ b.appendBinData("f", 8, (BinDataType) 1, "aaaabbbb");
+ b.appendBinData("e", 3, (BinDataType) 1, "aaa");
+ b.appendBinData("b", 1, (BinDataType) 1, "x");
+ BSONObj o = b.obj();
+ keyTest( o, true );
+ }
+
+ {
+ // check (non)equality
+ BSONObj a = BSONObjBuilder().appendBinData("", 8, (BinDataType) 1, "abcdefgh").obj();
+ BSONObj b = BSONObjBuilder().appendBinData("", 8, (BinDataType) 1, "abcdefgj").obj();
+ ASSERT( !a.equal(b) );
+ int res_ab = a.woCompare(b);
+ ASSERT( res_ab != 0 );
+ keyTest( a, true );
+ keyTest( b, true );
+
+ // check that differing bindata subtypes do not compare equal
+ BSONObj c = BSONObjBuilder().appendBinData("", 8, (BinDataType) 4, "abcdefgh").obj();
+ BSONObj d = BSONObjBuilder().appendBinData("", 8, (BinDataType) 0x81, "abcdefgh").obj();
+ ASSERT( !a.equal(c) );
+ int res_ac = a.woCompare(c);
+ ASSERT( res_ac != 0 );
+ keyTest( c, true );
+ ASSERT( !a.equal(d) );
+ int res_ad = a.woCompare(d);
+ ASSERT( res_ad != 0 );
+ keyTest( d, true );
+
+ KeyV1Owned A(a);
+ KeyV1Owned B(b);
+ KeyV1Owned C(c);
+ KeyV1Owned D(d);
+ ASSERT( !A.woEqual(B) );
+ ASSERT( A.woCompare(B, Ordering::make(BSONObj())) < 0 && res_ab < 0 );
+ ASSERT( !A.woEqual(C) );
+ ASSERT( A.woCompare(C, Ordering::make(BSONObj())) < 0 && res_ac < 0 );
+ ASSERT( !A.woEqual(D) );
+ ASSERT( A.woCompare(D, Ordering::make(BSONObj())) < 0 && res_ad < 0 );
+ }
+
+ {
+ BSONObjBuilder b;
+ b.appendBinData("f", 33, (BinDataType) 1, "123456789012345678901234567890123");
+ BSONObj o = b.obj();
+ keyTest( o, false );
+ }
+
+ {
+ for( int i = 1; i <= 3; i++ ) {
+ for( int j = 1; j <= 3; j++ ) {
+ BSONObjBuilder b;
+ b.appendBinData("f", i, (BinDataType) j, "abc");
+ BSONObj o = b.obj();
+ keyTest( o, j != ByteArrayDeprecated );
+ }
+ }
+ }
+
+ {
+ BSONObjBuilder b;
+ b.appendBinData("f", 1, (BinDataType) 133, "a");
+ BSONObj o = b.obj();
+ keyTest( o, true );
+ }
+
+ {
+ BSONObjBuilder b;
+ b.append("AA", 3);
+ b.appendBinData("f", 0, (BinDataType) 0, "");
+ b.appendBinData("e", 3, (BinDataType) 7, "aaa");
+ b.appendBinData("b", 1, (BinDataType) 128, "x");
+ b.append("z", 3);
+ b.appendBinData("bb", 0, (BinDataType) 129, "x");
+ BSONObj o = b.obj();
+ keyTest( o, true );
+ }
+
+ {
+ // a bindata length of 9 is not supported in compact format, so test a non-compact case here
+ BSONObjBuilder b;
+ b.appendBinData("f", 9, (BinDataType) 0, "aaaabbbbc");
+ BSONObj o = b.obj();
+ keyTest( o );
+ }
+ }
+ };
+
+ class ToStringNumber {
+ public:
+
+ void run() {
+ BSONObjBuilder b;
+ b.append( "a" , (int)4 );
+ b.append( "b" , (double)5 );
+ b.append( "c" , (long long)6 );
+
+ b.append( "d" , 123.456789123456789123456789123456789 );
+ b.append( "e" , 123456789.123456789123456789123456789 );
+ b.append( "f" , 1234567891234567891234.56789123456789 );
+
+ b.append( "g" , -123.456 );
+
+ BSONObj x = b.obj();
+ keyTest(x);
+
+ ASSERT_EQUALS( "4", x["a"].toString( false , true ) );
+ ASSERT_EQUALS( "5.0", x["b"].toString( false , true ) );
+ ASSERT_EQUALS( "6", x["c"].toString( false , true ) );
+
+ ASSERT_EQUALS( "123.4567891234568" , x["d"].toString( false , true ) );
+ ASSERT_EQUALS( "123456789.1234568" , x["e"].toString( false , true ) );
+ // ASSERT_EQUALS( "1.234567891234568e+21" , x["f"].toString( false , true ) ); // Windows and *nix format this differently - TODO: work around in the test, or don't bother?
+
+ ASSERT_EQUALS( "-123.456" , x["g"].toString( false , true ) );
+
+ }
+ };
+
+ class NullString {
+ public:
+ void run() {
+ {
+ BSONObjBuilder b;
+ const char x[] = {'a', 0, 'b', 0};
+ b.append("field", x, 4);
+ b.append("z", true);
+ BSONObj B = b.obj();
+ //cout << B.toString() << endl;
+
+ BSONObjBuilder a;
+ const char xx[] = {'a', 0, 'c', 0};
+ a.append("field", xx, 4);
+ a.append("z", true);
+ BSONObj A = a.obj();
+
+ BSONObjBuilder c;
+ const char xxx[] = {'a', 0, 'c', 0, 0};
+ c.append("field", xxx, 5);
+ c.append("z", true);
+ BSONObj C = c.obj();
+
+ // test that nulls are ok within bson strings
+ ASSERT( !(A == B) );
+ ASSERT( A > B );
+
+ ASSERT( !(B == C) );
+ ASSERT( C > B );
+
+ // check iteration is ok
+ ASSERT( B["z"].Bool() && A["z"].Bool() && C["z"].Bool() );
+ }
+
+ BSONObjBuilder b;
+ b.append("a", "a\0b", 4);
+ string z("a\0b", 3);
+ b.append("b", z);
+ b.appendAs(b.asTempObj()["a"], "c");
+ BSONObj o = b.obj();
+ keyTest(o);
+
+ stringstream ss;
+ ss << 'a' << '\0' << 'b';
+
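+ // valuestrsize() includes the terminating NUL, hence 3+1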
+ ASSERT_EQUALS(o["a"].valuestrsize(), 3+1);
+ ASSERT_EQUALS(o["a"].str(), ss.str());
+
+ ASSERT_EQUALS(o["b"].valuestrsize(), 3+1);
+ ASSERT_EQUALS(o["b"].str(), ss.str());
+
+ ASSERT_EQUALS(o["c"].valuestrsize(), 3+1);
+ ASSERT_EQUALS(o["c"].str(), ss.str());
+
+ }
+
+ };
+
+ class AppendAs {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ {
+ BSONObj foo = BSON( "foo" << 1 );
+ b.appendAs( foo.firstElement(), "bar" );
+ }
+ ASSERT_EQUALS( BSON( "bar" << 1 ), b.done() );
+ }
+ };
+
+ class ArrayAppendAs {
+ public:
+ void run() {
+ BSONArrayBuilder b;
+ {
+ BSONObj foo = BSON( "foo" << 1 );
+ b.appendAs( foo.firstElement(), "3" );
+ }
+ BSONArray a = b.arr();
+ BSONObj expected = BSON( "3" << 1 );
+ ASSERT_EQUALS( expected.firstElement(), a[ 3 ] );
+ ASSERT_EQUALS( 4, a.nFields() );
+ }
+ };
+
+ class GetField {
+ public:
+ void run(){
+ BSONObj o = BSON( "a" << 1 <<
+ "b" << BSON( "a" << 2 ) <<
+ "c" << BSON_ARRAY( BSON( "a" << 3 ) << BSON( "a" << 4 ) ) );
+ ASSERT_EQUALS( 1 , o.getFieldDotted( "a" ).numberInt() );
+ ASSERT_EQUALS( 2 , o.getFieldDotted( "b.a" ).numberInt() );
+ ASSERT_EQUALS( 3 , o.getFieldDotted( "c.0.a" ).numberInt() );
+ ASSERT_EQUALS( 4 , o.getFieldDotted( "c.1.a" ).numberInt() );
+ keyTest(o);
+ }
+ };
+
+ namespace Validation {
+
+ class Base {
+ public:
+ virtual ~Base() {}
+ void run() {
+ ASSERT( valid().valid() );
+ ASSERT( !invalid().valid() );
+ }
+ protected:
+ virtual BSONObj valid() const { return BSONObj(); }
+ virtual BSONObj invalid() const { return BSONObj(); }
+ static char get( const BSONObj &o, int i ) {
+ return o.objdata()[ i ];
+ }
+ static void set( BSONObj &o, int i, char c ) {
+ const_cast< char * >( o.objdata() )[ i ] = c;
+ }
+ };
+
+ class BadType : public Base {
+ BSONObj valid() const {
+ return fromjson( "{\"a\":1}" );
+ }
+ BSONObj invalid() const {
+ BSONObj ret = valid();
+ set( ret, 4, 50 );
+ return ret;
+ }
+ };
+
+ class EooBeforeEnd : public Base {
+ BSONObj valid() const {
+ return fromjson( "{\"a\":1}" );
+ }
+ BSONObj invalid() const {
+ BSONObj ret = valid();
+ // (first byte of size)++
+ set( ret, 0, get( ret, 0 ) + 1 );
+ // re-read size for BSONObj::details
+ return ret.copy();
+ }
+ };
+
+ class Undefined : public Base {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.appendNull( "a" );
+ BSONObj o = b.done();
+ set( o, 4, mongo::Undefined );
+ ASSERT( o.valid() );
+ }
+ };
+
+ class TotalSizeTooSmall : public Base {
+ BSONObj valid() const {
+ return fromjson( "{\"a\":1}" );
+ }
+ BSONObj invalid() const {
+ BSONObj ret = valid();
+ // (first byte of size)--
+ set( ret, 0, get( ret, 0 ) - 1 );
+ // re-read size for BSONObj::details
+ return ret.copy();
+ }
+ };
+
+ class EooMissing : public Base {
+ BSONObj valid() const {
+ return fromjson( "{\"a\":1}" );
+ }
+ BSONObj invalid() const {
+ BSONObj ret = valid();
+ set( ret, ret.objsize() - 1, (char) 0xff );
+ // (first byte of size)--
+ set( ret, 0, get( ret, 0 ) - 1 );
+ // re-read size for BSONObj::details
+ return ret.copy();
+ }
+ };
+
+ class WrongStringSize : public Base {
+ BSONObj valid() const {
+ return fromjson( "{\"a\":\"b\"}" );
+ }
+ BSONObj invalid() const {
+ BSONObj ret = valid();
+ ASSERT_EQUALS( ret.firstElement().valuestr()[0] , 'b' );
+ ASSERT_EQUALS( ret.firstElement().valuestr()[1] , 0 );
+ ((char*)ret.firstElement().valuestr())[1] = 1;
+ return ret.copy();
+ }
+ };
+
+ class ZeroStringSize : public Base {
+ BSONObj valid() const {
+ return fromjson( "{\"a\":\"b\"}" );
+ }
+ BSONObj invalid() const {
+ BSONObj ret = valid();
+ set( ret, 7, 0 );
+ return ret;
+ }
+ };
+
+ class NegativeStringSize : public Base {
+ BSONObj valid() const {
+ return fromjson( "{\"a\":\"b\"}" );
+ }
+ BSONObj invalid() const {
+ BSONObj ret = valid();
+ set( ret, 10, -100 );
+ return ret;
+ }
+ };
+
+ class WrongSubobjectSize : public Base {
+ BSONObj valid() const {
+ return fromjson( "{\"a\":{\"b\":1}}" );
+ }
+ BSONObj invalid() const {
+ BSONObj ret = valid();
+ set( ret, 0, get( ret, 0 ) + 1 );
+ set( ret, 7, get( ret, 7 ) + 1 );
+ return ret.copy();
+ }
+ };
+
+ class WrongDbrefNsSize : public Base {
+ BSONObj valid() const {
+ return fromjson( "{ \"a\": Dbref( \"b\", \"ffffffffffffffffffffffff\" ) }" );
+ }
+ BSONObj invalid() const {
+ BSONObj ret = valid();
+ set( ret, 0, get( ret, 0 ) + 1 );
+ set( ret, 7, get( ret, 7 ) + 1 );
+ return ret.copy();
+ };
+ };
+
+ class NoFieldNameEnd : public Base {
+ BSONObj valid() const {
+ return fromjson( "{\"a\":1}" );
+ }
+ BSONObj invalid() const {
+ BSONObj ret = valid();
+ memset( const_cast< char * >( ret.objdata() ) + 5, 0xff, ret.objsize() - 5 );
+ return ret;
+ }
+ };
+
+ class BadRegex : public Base {
+ BSONObj valid() const {
+ return fromjson( "{\"a\":/c/i}" );
+ }
+ BSONObj invalid() const {
+ BSONObj ret = valid();
+ memset( const_cast< char * >( ret.objdata() ) + 7, 0xff, ret.objsize() - 7 );
+ return ret;
+ }
+ };
+
+ class BadRegexOptions : public Base {
+ BSONObj valid() const {
+ return fromjson( "{\"a\":/c/i}" );
+ }
+ BSONObj invalid() const {
+ BSONObj ret = valid();
+ memset( const_cast< char * >( ret.objdata() ) + 9, 0xff, ret.objsize() - 9 );
+ return ret;
+ }
+ };
+
+ class CodeWScopeBase : public Base {
+ BSONObj valid() const {
+ BSONObjBuilder b;
+ BSONObjBuilder scope;
+ scope.append( "a", "b" );
+ b.appendCodeWScope( "c", "d", scope.done() );
+ return b.obj();
+ }
+ BSONObj invalid() const {
+ BSONObj ret = valid();
+ modify( ret );
+ return ret;
+ }
+ protected:
+ virtual void modify( BSONObj &o ) const = 0;
+ };
+
+ class CodeWScopeSmallSize : public CodeWScopeBase {
+ void modify( BSONObj &o ) const {
+ set( o, 7, 7 );
+ }
+ };
+
+ class CodeWScopeZeroStrSize : public CodeWScopeBase {
+ void modify( BSONObj &o ) const {
+ set( o, 11, 0 );
+ }
+ };
+
+ class CodeWScopeSmallStrSize : public CodeWScopeBase {
+ void modify( BSONObj &o ) const {
+ set( o, 11, 1 );
+ }
+ };
+
+ class CodeWScopeNoSizeForObj : public CodeWScopeBase {
+ void modify( BSONObj &o ) const {
+ set( o, 7, 13 );
+ }
+ };
+
+ class CodeWScopeSmallObjSize : public CodeWScopeBase {
+ void modify( BSONObj &o ) const {
+ set( o, 17, 1 );
+ }
+ };
+
+ class CodeWScopeBadObject : public CodeWScopeBase {
+ void modify( BSONObj &o ) const {
+ set( o, 21, JSTypeMax + 1 );
+ }
+ };
+
+ class NoSize {
+ public:
+ NoSize( BSONType type ) : type_( type ) {}
+ void run() {
+ const char data[] = { 0x07, 0x00, 0x00, 0x00, char( type_ ), 'a', 0x00 };
+ BSONObj o( data );
+ ASSERT( !o.valid() );
+ }
+ private:
+ BSONType type_;
+ };
+
+ // Randomized BSON parsing test. See if we seg fault.
+ class Fuzz {
+ public:
+ Fuzz( double frequency ) : frequency_( frequency ) {}
+ void run() {
+ BSONObj a = fromjson( "{\"a\": 1, \"b\": \"c\"}" );
+ fuzz( a );
+ a.valid();
+
+ BSONObj b = fromjson( "{\"one\":2, \"two\":5, \"three\": {},"
+ "\"four\": { \"five\": { \"six\" : 11 } },"
+ "\"seven\": [ \"a\", \"bb\", \"ccc\", 5 ],"
+ "\"eight\": Dbref( \"rrr\", \"01234567890123456789aaaa\" ),"
+ "\"_id\": ObjectId( \"deadbeefdeadbeefdeadbeef\" ),"
+ "\"nine\": { \"$binary\": \"abc=\", \"$type\": \"00\" },"
+ "\"ten\": Date( 44 ), \"eleven\": /foooooo/i }" );
+ fuzz( b );
+ b.valid();
+ }
+ private:
+ void fuzz( BSONObj &o ) const {
+ for( int i = 4; i < o.objsize(); ++i )
+ for( unsigned char j = 1; j; j <<= 1 )
+ if ( rand() < int( RAND_MAX * frequency_ ) ) {
+ char *c = const_cast< char * >( o.objdata() ) + i;
+ if ( *c & j )
+ *c &= ~j;
+ else
+ *c |= j;
+ }
+ }
+ double frequency_;
+ };
+
+ } // namespace Validation
+
+ } // namespace BSONObjTests
+
+ namespace OIDTests {
+
+ class init1 {
+ public:
+ void run() {
+ OID a;
+ OID b;
+
+ a.init();
+ b.init();
+
+ ASSERT( a != b );
+ }
+ };
+
+ class initParse1 {
+ public:
+ void run() {
+
+ OID a;
+ OID b;
+
+ a.init();
+ b.init( a.str() );
+
+ ASSERT( a == b );
+ }
+ };
+
+ class append {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.appendOID( "a" , 0 );
+ b.appendOID( "b" , 0 , false );
+ b.appendOID( "c" , 0 , true );
+ BSONObj o = b.obj();
+ keyTest(o);
+
+ ASSERT( o["a"].__oid().str() == "000000000000000000000000" );
+ ASSERT( o["b"].__oid().str() == "000000000000000000000000" );
+ ASSERT( o["c"].__oid().str() != "000000000000000000000000" );
+
+ }
+ };
+
+ class increasing {
+ public:
+ BSONObj g() {
+ BSONObjBuilder b;
+ b.appendOID( "_id" , 0 , true );
+ return b.obj();
+ }
+ void run() {
+ BSONObj a = g();
+ BSONObj b = g();
+
+ ASSERT( a.woCompare( b ) < 0 );
+
+ // yes, there is a 1/1000 chance this won't increase time(0)
+ // and will therefore inaccurately report the function as behaving,
+ // but if it's broken it will fail 999/1000 times, so that's good enough
+ sleepsecs( 1 );
+ BSONObj c = g();
+ ASSERT( a.woCompare( c ) < 0 );
+ }
+ };
+
+ class ToDate {
+ public:
+ void run() {
+ OID oid;
+
+ {
+ time_t before = ::time(0);
+ oid.init();
+ time_t after = ::time(0);
+ ASSERT( oid.asTimeT() >= before );
+ ASSERT( oid.asTimeT() <= after );
+ }
+
+ {
+ Date_t before = jsTime();
+ sleepsecs(1);
+ oid.init();
+ Date_t after = jsTime();
+ ASSERT( oid.asDateT() >= before );
+ ASSERT( oid.asDateT() <= after );
+ }
+ }
+ };
+
+ class FromDate {
+ public:
+ void run() {
+ OID min, oid, max;
+ Date_t now = jsTime();
+ oid.init(); // slight chance this has a different time; if it's a problem, this can change
+ min.init(now);
+ max.init(now, true);
+
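+ // Date_t is in milliseconds while asTimeT() returns seconds, hence the /1000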
+ ASSERT_EQUALS( (unsigned)oid.asTimeT() , now/1000 );
+ ASSERT_EQUALS( (unsigned)min.asTimeT() , now/1000 );
+ ASSERT_EQUALS( (unsigned)max.asTimeT() , now/1000 );
+ ASSERT( BSON("" << min).woCompare( BSON("" << oid) ) < 0 );
+ ASSERT( BSON("" << max).woCompare( BSON("" << oid) )> 0 );
+ }
+ };
+ } // namespace OIDTests
+
+
+ namespace ValueStreamTests {
+
+ class LabelBase {
+ public:
+ virtual ~LabelBase() {}
+ void run() {
+ ASSERT( !expected().woCompare( actual() ) );
+ }
+ protected:
+ virtual BSONObj expected() = 0;
+ virtual BSONObj actual() = 0;
+ };
+
+ class LabelBasic : public LabelBase {
+ BSONObj expected() {
+ return BSON( "a" << ( BSON( "$gt" << 1 ) ) );
+ }
+ BSONObj actual() {
+ return BSON( "a" << GT << 1 );
+ }
+ };
+
+ class LabelShares : public LabelBase {
+ BSONObj expected() {
+ return BSON( "z" << "q" << "a" << ( BSON( "$gt" << 1 ) ) << "x" << "p" );
+ }
+ BSONObj actual() {
+ return BSON( "z" << "q" << "a" << GT << 1 << "x" << "p" );
+ }
+ };
+
+ class LabelDouble : public LabelBase {
+ BSONObj expected() {
+ return BSON( "a" << ( BSON( "$gt" << 1 << "$lte" << "x" ) ) );
+ }
+ BSONObj actual() {
+ return BSON( "a" << GT << 1 << LTE << "x" );
+ }
+ };
+
+ class LabelDoubleShares : public LabelBase {
+ BSONObj expected() {
+ return BSON( "z" << "q" << "a" << ( BSON( "$gt" << 1 << "$lte" << "x" ) ) << "x" << "p" );
+ }
+ BSONObj actual() {
+ return BSON( "z" << "q" << "a" << GT << 1 << LTE << "x" << "x" << "p" );
+ }
+ };
+
+ class LabelSize : public LabelBase {
+ BSONObj expected() {
+ return BSON( "a" << BSON( "$size" << 4 ) );
+ }
+ BSONObj actual() {
+ return BSON( "a" << mongo::SIZE << 4 );
+ }
+ };
+
+ class LabelMulti : public LabelBase {
+ BSONObj expected() {
+ return BSON( "z" << "q"
+ << "a" << BSON( "$gt" << 1 << "$lte" << "x" )
+ << "b" << BSON( "$ne" << 1 << "$ne" << "f" << "$ne" << 22.3 )
+ << "x" << "p" );
+ }
+ BSONObj actual() {
+ return BSON( "z" << "q"
+ << "a" << GT << 1 << LTE << "x"
+ << "b" << NE << 1 << NE << "f" << NE << 22.3
+ << "x" << "p" );
+ }
+ };
+ class LabelishOr : public LabelBase {
+ BSONObj expected() {
+ return BSON( "$or" << BSON_ARRAY(
+ BSON("a" << BSON( "$gt" << 1 << "$lte" << "x" ))
+ << BSON("b" << BSON( "$ne" << 1 << "$ne" << "f" << "$ne" << 22.3 ))
+ << BSON("x" << "p" )));
+ }
+ BSONObj actual() {
+ return OR( BSON( "a" << GT << 1 << LTE << "x"),
+ BSON( "b" << NE << 1 << NE << "f" << NE << 22.3),
+ BSON( "x" << "p" ) );
+ }
+ };
+
+ class Unallowed {
+ public:
+ void run() {
+ ASSERT_THROWS( BSON( GT << 4 ), MsgAssertionException );
+ ASSERT_THROWS( BSON( "a" << 1 << GT << 4 ), MsgAssertionException );
+ }
+ };
+
+ class ElementAppend {
+ public:
+ void run() {
+ BSONObj a = BSON( "a" << 17 );
+ BSONObj b = BSON( "b" << a["a"] );
+ ASSERT_EQUALS( NumberInt , a["a"].type() );
+ ASSERT_EQUALS( NumberInt , b["b"].type() );
+ ASSERT_EQUALS( 17 , b["b"].number() );
+ }
+ };
+
+ } // namespace ValueStreamTests
+
+ class SubObjectBuilder {
+ public:
+ void run() {
+ BSONObjBuilder b1;
+ b1.append( "a", "bcd" );
+ BSONObjBuilder b2( b1.subobjStart( "foo" ) );
+ b2.append( "ggg", 44.0 );
+ b2.done();
+ b1.append( "f", 10.0 );
+ BSONObj ret = b1.done();
+ ASSERT( ret.valid() );
+ ASSERT( ret.woCompare( fromjson( "{a:'bcd',foo:{ggg:44},f:10}" ) ) == 0 );
+ }
+ };
+
+ class DateBuilder {
+ public:
+ void run() {
+ BSONObj o = BSON("" << Date_t(1234567890));
+ ASSERT( o.firstElement().type() == Date );
+ ASSERT( o.firstElement().date() == Date_t(1234567890) );
+ }
+ };
+
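+ // asLL() packs the seconds into the high 32 bits and the increment into the
+ // low 32 bits: (123LL << 32) + 456 == 528280977864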
+ class DateNowBuilder {
+ public:
+ void run() {
+ Date_t before = jsTime();
+ BSONObj o = BSON("now" << DATENOW);
+ Date_t after = jsTime();
+
+ ASSERT( o.valid() );
+
+ BSONElement e = o["now"];
+ ASSERT( e.type() == Date );
+ ASSERT( e.date() >= before );
+ ASSERT( e.date() <= after );
+ }
+ };
+
+ class TimeTBuilder {
+ public:
+ void run() {
+ Date_t before = jsTime();
+ sleepmillis(1);
+ time_t now = time(NULL);
+ sleepmillis(1);
+ Date_t after = jsTime();
+
+ BSONObjBuilder b;
+ b.appendTimeT("now", now);
+ BSONObj o = b.obj();
+
+ ASSERT( o.valid() );
+
+ BSONElement e = o["now"];
+ ASSERT( e.type() == Date );
+ ASSERT( e.date()/1000 >= before/1000 );
+ ASSERT( e.date()/1000 <= after/1000 );
+ }
+ };
+
+ class MinMaxKeyBuilder {
+ public:
+ void run() {
+ BSONObj min = BSON( "a" << MINKEY );
+ BSONObj max = BSON( "b" << MAXKEY );
+
+ ASSERT( min.valid() );
+ ASSERT( max.valid() );
+
+ BSONElement minElement = min["a"];
+ BSONElement maxElement = max["b"];
+ ASSERT( minElement.type() == MinKey );
+ ASSERT( maxElement.type() == MaxKey );
+ }
+ };
+
+ class MinMaxElementTest {
+ public:
+
+ BSONObj min( int t ) {
+ BSONObjBuilder b;
+ b.appendMinForType( "a" , t );
+ return b.obj();
+ }
+
+ BSONObj max( int t ) {
+ BSONObjBuilder b;
+ b.appendMaxForType( "a" , t );
+ return b.obj();
+ }
+
+ void run() {
+ for ( int t=1; t<JSTypeMax; t++ ) {
+ stringstream ss;
+ ss << "type: " << t;
+ string s = ss.str();
+ ASSERT( min( t ).woCompare( max( t ) ) <= 0 );
+ ASSERT( max( t ).woCompare( min( t ) ) >= 0 );
+ ASSERT( min( t ).woCompare( min( t ) ) == 0 );
+ ASSERT( max( t ).woCompare( max( t ) ) == 0 );
+ }
+ }
+ };
+
+ class ExtractFieldsTest {
+ public:
+ void run() {
+ BSONObj x = BSON( "a" << 10 << "b" << 11 );
+ assert( BSON( "a" << 10 ).woCompare( x.extractFields( BSON( "a" << 1 ) ) ) == 0 );
+ assert( BSON( "b" << 11 ).woCompare( x.extractFields( BSON( "b" << 1 ) ) ) == 0 );
+ assert( x.woCompare( x.extractFields( BSON( "a" << 1 << "b" << 1 ) ) ) == 0 );
+
+ assert( (string)"a" == x.extractFields( BSON( "a" << 1 << "c" << 1 ) ).firstElementFieldName() );
+ }
+ };
+
+ class ComparatorTest {
+ public:
+ BSONObj one( string s ) {
+ return BSON( "x" << s );
+ }
+ BSONObj two( string x , string y ) {
+ BSONObjBuilder b;
+ b.append( "x" , x );
+ if ( y.size() )
+ b.append( "y" , y );
+ else
+ b.appendNull( "y" );
+ return b.obj();
+ }
+
+ void test( BSONObj order , BSONObj l , BSONObj r , bool wanted ) {
+ BSONObjCmp c( order );
+ bool got = c(l,r);
+ if ( got == wanted )
+ return;
+ cout << " order: " << order << " l: " << l << "r: " << r << " wanted: " << wanted << " got: " << got << endl;
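+ // note: a mismatch is only logged here, not asserted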
+ }
+
+ void lt( BSONObj order , BSONObj l , BSONObj r ) {
+ test( order , l , r , 1 );
+ }
+
+ void run() {
+ BSONObj s = BSON( "x" << 1 );
+ BSONObj c = BSON( "x" << 1 << "y" << 1 );
+ test( s , one( "A" ) , one( "B" ) , 1 );
+ test( s , one( "B" ) , one( "A" ) , 0 );
+
+ test( c , two( "A" , "A" ) , two( "A" , "B" ) , 1 );
+ test( c , two( "A" , "A" ) , two( "B" , "A" ) , 1 );
+ test( c , two( "B" , "A" ) , two( "A" , "B" ) , 0 );
+
+ lt( c , one("A") , two( "A" , "A" ) );
+ lt( c , one("A") , one( "B" ) );
+ lt( c , two("A","") , two( "B" , "A" ) );
+
+ lt( c , two("B","A") , two( "C" , "A" ) );
+ lt( c , two("B","A") , one( "C" ) );
+ lt( c , two("B","A") , two( "C" , "" ) );
+
+ }
+ };
+
+ namespace external_sort {
+ class Basic1 {
+ public:
+ void run() {
+ BSONObjExternalSorter sorter(indexInterfaceForTheseTests);
+
+ sorter.add( BSON( "x" << 10 ) , 5 , 1);
+ sorter.add( BSON( "x" << 2 ) , 3 , 1 );
+ sorter.add( BSON( "x" << 5 ) , 6 , 1 );
+ sorter.add( BSON( "x" << 5 ) , 7 , 1 );
+
+ sorter.sort();
+
+ auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
+ int num=0;
+ while ( i->more() ) {
+ pair<BSONObj,DiskLoc> p = i->next();
+ if ( num == 0 )
+ assert( p.first["x"].number() == 2 );
+ else if ( num <= 2 ) {
+ assert( p.first["x"].number() == 5 );
+ }
+ else if ( num == 3 )
+ assert( p.first["x"].number() == 10 );
+ else
+ ASSERT( 0 );
+ num++;
+ }
+
+
+ ASSERT_EQUALS( 0 , sorter.numFiles() );
+ }
+ };
+
+ class Basic2 {
+ public:
+ void run() {
+ BSONObjExternalSorter sorter( indexInterfaceForTheseTests, BSONObj() , 10 );
+ sorter.add( BSON( "x" << 10 ) , 5 , 11 );
+ sorter.add( BSON( "x" << 2 ) , 3 , 1 );
+ sorter.add( BSON( "x" << 5 ) , 6 , 1 );
+ sorter.add( BSON( "x" << 5 ) , 7 , 1 );
+
+ sorter.sort();
+
+ auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
+ int num=0;
+ while ( i->more() ) {
+ pair<BSONObj,DiskLoc> p = i->next();
+ if ( num == 0 ) {
+ assert( p.first["x"].number() == 2 );
+ ASSERT_EQUALS( p.second.toString() , "3:1" );
+ }
+ else if ( num <= 2 )
+ assert( p.first["x"].number() == 5 );
+ else if ( num == 3 ) {
+ assert( p.first["x"].number() == 10 );
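+ // DiskLoc::toString() renders file:offset with the offset in hex, so ofs 11 prints as "b"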
+ ASSERT_EQUALS( p.second.toString() , "5:b" );
+ }
+ else
+ ASSERT( 0 );
+ num++;
+ }
+
+ }
+ };
+
+ class Basic3 {
+ public:
+ void run() {
+ BSONObjExternalSorter sorter( indexInterfaceForTheseTests, BSONObj() , 10 );
+ sorter.sort();
+
+ auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
+ assert( ! i->more() );
+
+ }
+ };
+
+
+ class ByDiskLock {
+ public:
+ void run() {
+ BSONObjExternalSorter sorter(indexInterfaceForTheseTests);
+ sorter.add( BSON( "x" << 10 ) , 5 , 4);
+ sorter.add( BSON( "x" << 2 ) , 3 , 0 );
+ sorter.add( BSON( "x" << 5 ) , 6 , 2 );
+ sorter.add( BSON( "x" << 5 ) , 7 , 3 );
+ sorter.add( BSON( "x" << 5 ) , 2 , 1 );
+
+ sorter.sort();
+
+ auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
+ int num=0;
+ while ( i->more() ) {
+ pair<BSONObj,DiskLoc> p = i->next();
+ if ( num == 0 )
+ assert( p.first["x"].number() == 2 );
+ else if ( num <= 3 ) {
+ assert( p.first["x"].number() == 5 );
+ }
+ else if ( num == 4 )
+ assert( p.first["x"].number() == 10 );
+ else
+ ASSERT( 0 );
+ ASSERT_EQUALS( num , p.second.getOfs() );
+ num++;
+ }
+
+
+ }
+ };
+
+
+ class Big1 {
+ public:
+ void run() {
+ BSONObjExternalSorter sorter( indexInterfaceForTheseTests, BSONObj() , 2000 );
+ for ( int i=0; i<10000; i++ ) {
+ sorter.add( BSON( "x" << rand() % 10000 ) , 5 , i );
+ }
+
+ sorter.sort();
+
+ auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
+ int num=0;
+ double prev = 0;
+ while ( i->more() ) {
+ pair<BSONObj,DiskLoc> p = i->next();
+ num++;
+ double cur = p.first["x"].number();
+ assert( cur >= prev );
+ prev = cur;
+ }
+ assert( num == 10000 );
+ }
+ };
+
+ class Big2 {
+ public:
+ void run() {
+ const int total = 100000;
+ BSONObjExternalSorter sorter( indexInterfaceForTheseTests, BSONObj() , total * 2 );
+ for ( int i=0; i<total; i++ ) {
+ sorter.add( BSON( "a" << "b" ) , 5 , i );
+ }
+
+ sorter.sort();
+
+ auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
+ int num=0;
+ double prev = 0;
+ while ( i->more() ) {
+ pair<BSONObj,DiskLoc> p = i->next();
+ num++;
+ double cur = p.first["x"].number();
+ assert( cur >= prev );
+ prev = cur;
+ }
+ assert( num == total );
+ ASSERT( sorter.numFiles() > 2 );
+ }
+ };
+
+ class D1 {
+ public:
+ void run() {
+
+ BSONObjBuilder b;
+ b.appendNull("");
+ BSONObj x = b.obj();
+
+ BSONObjExternalSorter sorter(indexInterfaceForTheseTests);
+ sorter.add(x, DiskLoc(3,7));
+ sorter.add(x, DiskLoc(4,7));
+ sorter.add(x, DiskLoc(2,7));
+ sorter.add(x, DiskLoc(1,7));
+ sorter.add(x, DiskLoc(3,77));
+
+ sorter.sort();
+
+ auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
+ while( i->more() ) {
+ BSONObjExternalSorter::Data d = i->next();
+ /*cout << d.second.toString() << endl;
+ cout << d.first.objsize() << endl;
+ cout<<"SORTER next:" << d.first.toString() << endl;*/
+ }
+ }
+ };
+ }
+
+ class CompatBSON {
+ public:
+
+#define JSONBSONTEST(j,s,m) ASSERT_EQUALS( fromjson( j ).objsize() , s ); ASSERT_EQUALS( fromjson( j ).md5() , m );
+#define RAWBSONTEST(j,s,m) ASSERT_EQUALS( j.objsize() , s ); ASSERT_EQUALS( j.md5() , m );
+
+ void run() {
+
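+ // each check compares both the serialized size and the md5 of the raw object
+ // bytes against golden values, pinning down the BSON wire format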
+ JSONBSONTEST( "{ 'x' : true }" , 9 , "6fe24623e4efc5cf07f027f9c66b5456" );
+ JSONBSONTEST( "{ 'x' : null }" , 8 , "12d43430ff6729af501faf0638e68888" );
+ JSONBSONTEST( "{ 'x' : 5.2 }" , 16 , "aaeeac4a58e9c30eec6b0b0319d0dff2" );
+ JSONBSONTEST( "{ 'x' : 'eliot' }" , 18 , "331a3b8b7cbbe0706c80acdb45d4ebbe" );
+ JSONBSONTEST( "{ 'x' : 5.2 , 'y' : 'truth' , 'z' : 1.1 }" , 40 , "7c77b3a6e63e2f988ede92624409da58" );
+ JSONBSONTEST( "{ 'a' : { 'b' : 1.1 } }" , 24 , "31887a4b9d55cd9f17752d6a8a45d51f" );
+ JSONBSONTEST( "{ 'x' : 5.2 , 'y' : { 'a' : 'eliot' , b : true } , 'z' : null }" , 44 , "b3de8a0739ab329e7aea138d87235205" );
+ JSONBSONTEST( "{ 'x' : 5.2 , 'y' : [ 'a' , 'eliot' , 'b' , true ] , 'z' : null }" , 62 , "cb7bad5697714ba0cbf51d113b6a0ee8" );
+
+ RAWBSONTEST( BSON( "x" << 4 ) , 12 , "d1ed8dbf79b78fa215e2ded74548d89d" );
+
+ }
+ };
+
+ class CompareDottedFieldNamesTest {
+ public:
+ void t( FieldCompareResult res , const string& l , const string& r ) {
+ ASSERT_EQUALS( res , compareDottedFieldNames( l , r ) );
+ ASSERT_EQUALS( -1 * res , compareDottedFieldNames( r , l ) );
+ }
+
+ void run() {
+ t( SAME , "x" , "x" );
+ t( SAME , "x.a" , "x.a" );
+ t( LEFT_BEFORE , "a" , "b" );
+ t( RIGHT_BEFORE , "b" , "a" );
+
+ t( LEFT_SUBFIELD , "a.x" , "a" );
+ }
+ };
+
+ struct NestedDottedConversions {
+ void t(const BSONObj& nest, const BSONObj& dot) {
+ ASSERT_EQUALS( nested2dotted(nest), dot);
+ ASSERT_EQUALS( nest, dotted2nested(dot));
+ }
+
+ void run() {
+ t( BSON("a" << BSON("b" << 1)), BSON("a.b" << 1) );
+ t( BSON("a" << BSON("b" << 1 << "c" << 1)), BSON("a.b" << 1 << "a.c" << 1) );
+ t( BSON("a" << BSON("b" << 1 << "c" << 1) << "d" << 1), BSON("a.b" << 1 << "a.c" << 1 << "d" << 1) );
+ t( BSON("a" << BSON("b" << 1 << "c" << 1 << "e" << BSON("f" << 1)) << "d" << 1), BSON("a.b" << 1 << "a.c" << 1 << "a.e.f" << 1 << "d" << 1) );
+ }
+ };
+
+ struct BSONArrayBuilderTest {
+ void run() {
+ int i = 0;
+ BSONObjBuilder objb;
+ BSONArrayBuilder arrb;
+
+ objb << objb.numStr(i++) << 100;
+ arrb << 100;
+
+ objb << objb.numStr(i++) << 1.0;
+ arrb << 1.0;
+
+ objb << objb.numStr(i++) << "Hello";
+ arrb << "Hello";
+
+ objb << objb.numStr(i++) << string("World");
+ arrb << string("World");
+
+ objb << objb.numStr(i++) << BSON( "a" << 1 << "b" << "foo" );
+ arrb << BSON( "a" << 1 << "b" << "foo" );
+
+ objb << objb.numStr(i++) << BSON( "a" << 1)["a"];
+ arrb << BSON( "a" << 1)["a"];
+
+ OID oid;
+ oid.init();
+ objb << objb.numStr(i++) << oid;
+ arrb << oid;
+
+ BSONObj obj = objb.obj();
+ BSONArray arr = arrb.arr();
+
+ ASSERT_EQUALS(obj, arr);
+
+ BSONObj o = BSON( "obj" << obj << "arr" << arr << "arr2" << BSONArray(obj) );
+ keyTest(o);
+
+ ASSERT_EQUALS(o["obj"].type(), Object);
+ ASSERT_EQUALS(o["arr"].type(), Array);
+ ASSERT_EQUALS(o["arr2"].type(), Array);
+ }
+ };
+
+ struct ArrayMacroTest {
+ void run() {
+ BSONArray arr = BSON_ARRAY( "hello" << 1 << BSON( "foo" << BSON_ARRAY( "bar" << "baz" << "qux" ) ) );
+ BSONObj obj = BSON( "0" << "hello"
+ << "1" << 1
+ << "2" << BSON( "foo" << BSON_ARRAY( "bar" << "baz" << "qux" ) ) );
+
+ ASSERT_EQUALS(arr, obj);
+ ASSERT_EQUALS(arr["2"].type(), Object);
+ ASSERT_EQUALS(arr["2"].embeddedObject()["foo"].type(), Array);
+ }
+ };
+
+ class NumberParsing {
+ public:
+ void run() {
+ BSONObjBuilder a;
+ BSONObjBuilder b;
+
+ a.append( "a" , (int)1 );
+ ASSERT( b.appendAsNumber( "a" , "1" ) );
+
+ a.append( "b" , 1.1 );
+ ASSERT( b.appendAsNumber( "b" , "1.1" ) );
+
+ a.append( "c" , (int)-1 );
+ ASSERT( b.appendAsNumber( "c" , "-1" ) );
+
+ a.append( "d" , -1.1 );
+ ASSERT( b.appendAsNumber( "d" , "-1.1" ) );
+
+ a.append( "e" , (long long)32131231231232313LL );
+ ASSERT( b.appendAsNumber( "e" , "32131231231232313" ) );
+
+ ASSERT( ! b.appendAsNumber( "f" , "zz" ) );
+ ASSERT( ! b.appendAsNumber( "f" , "5zz" ) );
+ ASSERT( ! b.appendAsNumber( "f" , "zz5" ) );
+
+ ASSERT_EQUALS( a.obj() , b.obj() );
+ }
+ };
+
+ class bson2settest {
+ public:
+ void run() {
+ BSONObj o = BSON( "z" << 1 << "a" << 2 << "m" << 3 << "c" << 4 );
+ BSONObjIteratorSorted i( o );
+ stringstream ss;
+ while ( i.more() )
+ ss << i.next().fieldName();
+ ASSERT_EQUALS( "acmz" , ss.str() );
+
+ {
+ Timer t;
+ for ( int i=0; i<10000; i++ ) {
+ BSONObjIteratorSorted j( o );
+ int l = 0;
+ while ( j.more() )
+ l += strlen( j.next().fieldName() );
+ }
+ //unsigned long long tm = t.micros();
+ //cout << "time: " << tm << endl;
+ }
+ }
+
+ };
+
+ class checkForStorageTests {
+ public:
+
+ void good( string s ) {
+ BSONObj o = fromjson( s );
+ if ( o.okForStorage() )
+ return;
+ throw UserException( 12528 , (string)"should be ok for storage:" + s );
+ }
+
+ void bad( string s ) {
+ BSONObj o = fromjson( s );
+ if ( ! o.okForStorage() )
+ return;
+ throw UserException( 12529 , (string)"should NOT be ok for storage:" + s );
+ }
+
+ void run() {
+ good( "{x:1}" );
+ bad( "{'x.y':1}" );
+
+ good( "{x:{a:2}}" );
+ bad( "{x:{'$a':2}}" );
+ }
+ };
+
+ class InvalidIDFind {
+ public:
+ void run() {
+ BSONObj x = BSON( "_id" << 5 << "t" << 2 );
+ {
+ char * crap = (char*)malloc( x.objsize() );
+ memcpy( crap , x.objdata() , x.objsize() );
+ BSONObj y( crap );
+ ASSERT_EQUALS( x , y );
+ free( crap );
+ }
+
+ {
+ char * crap = (char*)malloc( x.objsize() );
+ memcpy( crap , x.objdata() , x.objsize() );
+ int * foo = (int*)crap;
+ foo[0] = 123123123;
+ int state = 0;
+ try {
+ BSONObj y( crap );
+ state = 1;
+ }
+ catch ( std::exception& e ) {
+ state = 2;
+ ASSERT( strstr( e.what() , "_id: 5" ) > 0 );
+ }
+ free( crap );
+ ASSERT_EQUALS( 2 , state );
+ }
+
+
+ }
+ };
+
+ class ElementSetTest {
+ public:
+ void run() {
+ BSONObj x = BSON( "a" << 1 << "b" << 1 << "c" << 2 );
+ BSONElement a = x["a"];
+ BSONElement b = x["b"];
+ BSONElement c = x["c"];
+ //cout << "c: " << c << endl;
+ ASSERT( a.woCompare( b ) != 0 );
+ ASSERT( a.woCompare( b , false ) == 0 );
+
+ BSONElementSet s;
+ s.insert( a );
+ ASSERT_EQUALS( 1U , s.size() );
+ s.insert( b );
+ ASSERT_EQUALS( 1U , s.size() );
+ ASSERT( ! s.count( c ) );
+
+ ASSERT( s.find( a ) != s.end() );
+ ASSERT( s.find( b ) != s.end() );
+ ASSERT( s.find( c ) == s.end() );
+
+
+ s.insert( c );
+ ASSERT_EQUALS( 2U , s.size() );
+
+
+ ASSERT( s.find( a ) != s.end() );
+ ASSERT( s.find( b ) != s.end() );
+ ASSERT( s.find( c ) != s.end() );
+
+ ASSERT( s.count( a ) );
+ ASSERT( s.count( b ) );
+ ASSERT( s.count( c ) );
+
+ {
+ BSONElementSet x;
+ BSONObj o = fromjson( "{ 'a' : [ 1 , 2 , 1 ] }" );
+ BSONObjIterator i( o["a"].embeddedObjectUserCheck() );
+ while ( i.more() ) {
+ x.insert( i.next() );
+ }
+ ASSERT_EQUALS( 2U , x.size() );
+ }
+ }
+ };
+
+ class EmbeddedNumbers {
+ public:
+ void run() {
+ BSONObj x = BSON( "a" << BSON( "b" << 1 ) );
+ BSONObj y = BSON( "a" << BSON( "b" << 1.0 ) );
+ keyTest(x); keyTest(y);
+ ASSERT_EQUALS( x , y );
+ ASSERT_EQUALS( 0 , x.woCompare( y ) );
+ }
+ };
+
+ class BuilderPartialItearte {
+ public:
+ void run() {
+ {
+ BSONObjBuilder b;
+ b.append( "x" , 1 );
+ b.append( "y" , 2 );
+
+ BSONObjIterator i = b.iterator();
+ ASSERT( i.more() );
+ ASSERT_EQUALS( 1 , i.next().numberInt() );
+ ASSERT( i.more() );
+ ASSERT_EQUALS( 2 , i.next().numberInt() );
+ ASSERT( ! i.more() );
+
+ b.append( "z" , 3 );
+
+ i = b.iterator();
+ ASSERT( i.more() );
+ ASSERT_EQUALS( 1 , i.next().numberInt() );
+ ASSERT( i.more() );
+ ASSERT_EQUALS( 2 , i.next().numberInt() );
+ ASSERT( i.more() );
+ ASSERT_EQUALS( 3 , i.next().numberInt() );
+ ASSERT( ! i.more() );
+
+ ASSERT_EQUALS( BSON( "x" << 1 << "y" << 2 << "z" << 3 ) , b.obj() );
+ }
+
+ }
+ };
+
+ class BSONFieldTests {
+ public:
+ void run() {
+ {
+ BSONField<int> x("x");
+ BSONObj o = BSON( x << 5 );
+ ASSERT_EQUALS( BSON( "x" << 5 ) , o );
+ }
+
+ {
+ BSONField<int> x("x");
+ BSONObj o = BSON( x.make(5) );
+ ASSERT_EQUALS( BSON( "x" << 5 ) , o );
+ }
+
+ {
+ BSONField<int> x("x");
+ BSONObj o = BSON( x(5) );
+ ASSERT_EQUALS( BSON( "x" << 5 ) , o );
+
+ o = BSON( x.gt(5) );
+ ASSERT_EQUALS( BSON( "x" << BSON( "$gt" << 5 ) ) , o );
+ }
+
+ }
+ };
+
+ class BSONForEachTest {
+ public:
+ void run() {
+ BSONObj obj = BSON("a" << 1 << "a" << 2 << "a" << 3);
+
+ int count = 0;
+ BSONForEach(e, obj) {
+ ASSERT_EQUALS( e.fieldName() , string("a") );
+ count += e.Int();
+ }
+
+ ASSERT_EQUALS( count , 1+2+3 );
+ }
+ };
+
+ class StringDataTest {
+ public:
+ void run() {
+ StringData a( string( "aaa" ) );
+ ASSERT_EQUALS( 3u , a.size() );
+
+ StringData b( string( "bbb" ).c_str() );
+ ASSERT_EQUALS( 3u , b.size() );
+
+ StringData c( "ccc", StringData::LiteralTag() );
+ ASSERT_EQUALS( 3u , c.size() );
+
+ // TODO: update test when the second param takes StringData too
+ BSONObjBuilder builder;
+ builder.append( c, "value");
+ ASSERT_EQUALS( builder.obj() , BSON( c.data() << "value" ) );
+
+ }
+ };
+
+ class CompareOps {
+ public:
+ void run() {
+
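+            // BSONObj's relational operators are thin wrappers around woCompare().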
+ BSONObj a = BSON("a"<<1);
+ BSONObj b = BSON("a"<<1);
+ BSONObj c = BSON("a"<<2);
+ BSONObj d = BSON("a"<<3);
+ BSONObj e = BSON("a"<<4);
+ BSONObj f = BSON("a"<<4);
+
+ ASSERT( ! ( a < b ) );
+ ASSERT( a <= b );
+ ASSERT( a < c );
+
+ ASSERT( f > d );
+ ASSERT( f >= e );
+ ASSERT( ! ( f > e ) );
+ }
+ };
+
+ class HashingTest {
+ public:
+ void run() {
+ int N = 100000;
+ BSONObj x = BSON( "name" << "eliot was here"
+ << "x" << 5
+ << "asdasdasdas" << "asldkasldjasldjasldjlasjdlasjdlasdasdasdasdasdasdasd" );
+
+ {
+ //Timer t;
+ for ( int i=0; i<N; i++ )
+ x.md5();
+ //int millis = t.millis();
+ //cout << "md5 : " << millis << endl;
+ }
+
+ {
+ //Timer t;
+ for ( int i=0; i<N; i++ )
+ x.toString();
+ //int millis = t.millis();
+ //cout << "toString : " << millis << endl;
+ }
+
+ {
+ //Timer t;
+ for ( int i=0; i<N; i++ )
+ checksum( x.objdata() , x.objsize() );
+ //int millis = t.millis();
+ //cout << "checksum : " << millis << endl;
+ }
+
+ }
+ };
+
+ class All : public Suite {
+ public:
+ All() : Suite( "jsobj" ) {
+ }
+
+ void setupTests() {
+ add< BufBuilderBasic >();
+ add< BSONElementBasic >();
+ add< BSONObjTests::NullString >();
+ add< BSONObjTests::Create >();
+ add< BSONObjTests::WoCompareBasic >();
+ add< BSONObjTests::NumericCompareBasic >();
+ add< BSONObjTests::WoCompareEmbeddedObject >();
+ add< BSONObjTests::WoCompareEmbeddedArray >();
+ add< BSONObjTests::WoCompareOrdered >();
+ add< BSONObjTests::WoCompareDifferentLength >();
+ add< BSONObjTests::WoSortOrder >();
+ add< BSONObjTests::MultiKeySortOrder > ();
+ add< BSONObjTests::TimestampTest >();
+ add< BSONObjTests::Nan >();
+ add< BSONObjTests::AsTempObj >();
+ add< BSONObjTests::AppendIntOrLL >();
+ add< BSONObjTests::AppendNumber >();
+ add< BSONObjTests::ToStringArray >();
+ add< BSONObjTests::ToStringNumber >();
+ add< BSONObjTests::AppendAs >();
+ add< BSONObjTests::ArrayAppendAs >();
+ add< BSONObjTests::GetField >();
+
+ add< BSONObjTests::Validation::BadType >();
+ add< BSONObjTests::Validation::EooBeforeEnd >();
+ add< BSONObjTests::Validation::Undefined >();
+ add< BSONObjTests::Validation::TotalSizeTooSmall >();
+ add< BSONObjTests::Validation::EooMissing >();
+ add< BSONObjTests::Validation::WrongStringSize >();
+ add< BSONObjTests::Validation::ZeroStringSize >();
+ add< BSONObjTests::Validation::NegativeStringSize >();
+ add< BSONObjTests::Validation::WrongSubobjectSize >();
+ add< BSONObjTests::Validation::WrongDbrefNsSize >();
+ add< BSONObjTests::Validation::NoFieldNameEnd >();
+ add< BSONObjTests::Validation::BadRegex >();
+ add< BSONObjTests::Validation::BadRegexOptions >();
+ add< BSONObjTests::Validation::CodeWScopeSmallSize >();
+ add< BSONObjTests::Validation::CodeWScopeZeroStrSize >();
+ add< BSONObjTests::Validation::CodeWScopeSmallStrSize >();
+ add< BSONObjTests::Validation::CodeWScopeNoSizeForObj >();
+ add< BSONObjTests::Validation::CodeWScopeSmallObjSize >();
+ add< BSONObjTests::Validation::CodeWScopeBadObject >();
+ add< BSONObjTests::Validation::NoSize >( Symbol );
+ add< BSONObjTests::Validation::NoSize >( Code );
+ add< BSONObjTests::Validation::NoSize >( String );
+ add< BSONObjTests::Validation::NoSize >( CodeWScope );
+ add< BSONObjTests::Validation::NoSize >( DBRef );
+ add< BSONObjTests::Validation::NoSize >( Object );
+ add< BSONObjTests::Validation::NoSize >( Array );
+ add< BSONObjTests::Validation::NoSize >( BinData );
+ add< BSONObjTests::Validation::Fuzz >( .5 );
+ add< BSONObjTests::Validation::Fuzz >( .1 );
+ add< BSONObjTests::Validation::Fuzz >( .05 );
+ add< BSONObjTests::Validation::Fuzz >( .01 );
+ add< BSONObjTests::Validation::Fuzz >( .001 );
+ add< OIDTests::init1 >();
+ add< OIDTests::initParse1 >();
+ add< OIDTests::append >();
+ add< OIDTests::increasing >();
+ add< OIDTests::ToDate >();
+ add< OIDTests::FromDate >();
+ add< ValueStreamTests::LabelBasic >();
+ add< ValueStreamTests::LabelShares >();
+ add< ValueStreamTests::LabelDouble >();
+ add< ValueStreamTests::LabelDoubleShares >();
+ add< ValueStreamTests::LabelSize >();
+ add< ValueStreamTests::LabelMulti >();
+ add< ValueStreamTests::LabelishOr >();
+ add< ValueStreamTests::Unallowed >();
+ add< ValueStreamTests::ElementAppend >();
+ add< SubObjectBuilder >();
+ add< DateBuilder >();
+ add< DateNowBuilder >();
+ add< TimeTBuilder >();
+ add< MinMaxKeyBuilder >();
+ add< MinMaxElementTest >();
+ add< ComparatorTest >();
+ add< ExtractFieldsTest >();
+ add< external_sort::Basic1 >();
+ add< external_sort::Basic2 >();
+ add< external_sort::Basic3 >();
+ add< external_sort::ByDiskLock >();
+ add< external_sort::Big1 >();
+ add< external_sort::Big2 >();
+ add< external_sort::D1 >();
+ add< CompatBSON >();
+ add< CompareDottedFieldNamesTest >();
+ add< NestedDottedConversions >();
+ add< BSONArrayBuilderTest >();
+ add< ArrayMacroTest >();
+ add< NumberParsing >();
+ add< bson2settest >();
+ add< checkForStorageTests >();
+ add< InvalidIDFind >();
+ add< ElementSetTest >();
+ add< EmbeddedNumbers >();
+            add< BuilderPartialIterate >();
+ add< BSONFieldTests >();
+ add< BSONForEachTest >();
+ add< StringDataTest >();
+ add< CompareOps >();
+ add< HashingTest >();
+ }
+ } myall;
+
+} // namespace JsobjTests
+
diff --git a/src/mongo/dbtests/jsontests.cpp b/src/mongo/dbtests/jsontests.cpp
new file mode 100644
index 00000000000..36c204a1011
--- /dev/null
+++ b/src/mongo/dbtests/jsontests.cpp
@@ -0,0 +1,1185 @@
+// jsontests.cpp - Tests for json.{h,cpp} code and BSONObj::jsonString()
+//
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../db/jsobj.h"
+#include "../db/json.h"
+
+#include "dbtests.h"
+
+#include <limits>
+
+namespace JsonTests {
+ namespace JsonStringTests {
+
+ class Empty {
+ public:
+ void run() {
+ ASSERT_EQUALS( "{}", BSONObj().jsonString( Strict ) );
+ }
+ };
+
+ class SingleStringMember {
+ public:
+ void run() {
+ ASSERT_EQUALS( "{ \"a\" : \"b\" }", BSON( "a" << "b" ).jsonString( Strict ) );
+ }
+ };
+
+ class EscapedCharacters {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.append( "a", "\" \\ / \b \f \n \r \t" );
+ ASSERT_EQUALS( "{ \"a\" : \"\\\" \\\\ / \\b \\f \\n \\r \\t\" }", b.done().jsonString( Strict ) );
+ }
+ };
+
+ // per http://www.ietf.org/rfc/rfc4627.txt, control characters are
+ // (U+0000 through U+001F). U+007F is not mentioned as a control character.
+ class AdditionalControlCharacters {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.append( "a", "\x1 \x1f" );
+ ASSERT_EQUALS( "{ \"a\" : \"\\u0001 \\u001f\" }", b.done().jsonString( Strict ) );
+ }
+ };
+
+ class ExtendedAscii {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.append( "a", "\x80" );
+ ASSERT_EQUALS( "{ \"a\" : \"\x80\" }", b.done().jsonString( Strict ) );
+ }
+ };
+
+ class EscapeFieldName {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.append( "\t", "b" );
+ ASSERT_EQUALS( "{ \"\\t\" : \"b\" }", b.done().jsonString( Strict ) );
+ }
+ };
+
+ class SingleIntMember {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.append( "a", 1 );
+ ASSERT_EQUALS( "{ \"a\" : 1 }", b.done().jsonString( Strict ) );
+ }
+ };
+
+ class SingleNumberMember {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.append( "a", 1.5 );
+ ASSERT_EQUALS( "{ \"a\" : 1.5 }", b.done().jsonString( Strict ) );
+ }
+ };
+
+ class InvalidNumbers {
+ public:
+ void run() {
+ BSONObjBuilder c;
+ c.append( "a", numeric_limits< double >::quiet_NaN() );
+ string s = c.done().jsonString( Strict );
+                // Note: the JSON RFC has no representation for NaN, so emitting "NaN" is the pragmatic fallback.
+ ASSERT( str::contains(s, "NaN") );
+
+ // commented out assertion as it doesn't throw anymore:
+ //ASSERT_THROWS( c.done().jsonString( Strict ), AssertionException );
+
+ BSONObjBuilder d;
+ d.append( "a", numeric_limits< double >::signaling_NaN() );
+ //ASSERT_THROWS( d.done().jsonString( Strict ), AssertionException );
+ s = d.done().jsonString( Strict );
+ ASSERT( str::contains(s, "NaN") );
+ }
+ };
+
+ class NumberPrecision {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.append( "a", 123456789 );
+ ASSERT_EQUALS( "{ \"a\" : 123456789 }", b.done().jsonString( Strict ) );
+ }
+ };
+
+ class NegativeNumber {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.append( "a", -1 );
+ ASSERT_EQUALS( "{ \"a\" : -1 }", b.done().jsonString( Strict ) );
+ }
+ };
+
+ class SingleBoolMember {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.appendBool( "a", true );
+ ASSERT_EQUALS( "{ \"a\" : true }", b.done().jsonString( Strict ) );
+
+ BSONObjBuilder c;
+ c.appendBool( "a", false );
+ ASSERT_EQUALS( "{ \"a\" : false }", c.done().jsonString( Strict ) );
+ }
+ };
+
+ class SingleNullMember {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.appendNull( "a" );
+ ASSERT_EQUALS( "{ \"a\" : null }", b.done().jsonString( Strict ) );
+ }
+ };
+
+ class SingleObjectMember {
+ public:
+ void run() {
+ BSONObjBuilder b, c;
+ b.append( "a", c.done() );
+ ASSERT_EQUALS( "{ \"a\" : {} }", b.done().jsonString( Strict ) );
+ }
+ };
+
+ class TwoMembers {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.append( "a", 1 );
+ b.append( "b", 2 );
+ ASSERT_EQUALS( "{ \"a\" : 1, \"b\" : 2 }", b.done().jsonString( Strict ) );
+ }
+ };
+
+ class EmptyArray {
+ public:
+ void run() {
+ vector< int > arr;
+ BSONObjBuilder b;
+ b.append( "a", arr );
+ ASSERT_EQUALS( "{ \"a\" : [] }", b.done().jsonString( Strict ) );
+ }
+ };
+
+ class Array {
+ public:
+ void run() {
+ vector< int > arr;
+ arr.push_back( 1 );
+ arr.push_back( 2 );
+ BSONObjBuilder b;
+ b.append( "a", arr );
+ ASSERT_EQUALS( "{ \"a\" : [ 1, 2 ] }", b.done().jsonString( Strict ) );
+ }
+ };
+
+ class DBRef {
+ public:
+ void run() {
+ OID oid;
+ memset( &oid, 0xff, 12 );
+ BSONObjBuilder b;
+ b.appendDBRef( "a", "namespace", oid );
+ BSONObj built = b.done();
+ ASSERT_EQUALS( "{ \"a\" : { \"$ref\" : \"namespace\", \"$id\" : \"ffffffffffffffffffffffff\" } }",
+ built.jsonString( Strict ) );
+ ASSERT_EQUALS( "{ \"a\" : { \"$ref\" : \"namespace\", \"$id\" : \"ffffffffffffffffffffffff\" } }",
+ built.jsonString( JS ) );
+ ASSERT_EQUALS( "{ \"a\" : Dbref( \"namespace\", \"ffffffffffffffffffffffff\" ) }",
+ built.jsonString( TenGen ) );
+ }
+ };
+
+ class DBRefZero {
+ public:
+ void run() {
+ OID oid;
+ memset( &oid, 0, 12 );
+ BSONObjBuilder b;
+ b.appendDBRef( "a", "namespace", oid );
+ ASSERT_EQUALS( "{ \"a\" : { \"$ref\" : \"namespace\", \"$id\" : \"000000000000000000000000\" } }",
+ b.done().jsonString( Strict ) );
+ }
+ };
+
+ class ObjectId {
+ public:
+ void run() {
+ OID oid;
+ memset( &oid, 0xff, 12 );
+ BSONObjBuilder b;
+ b.appendOID( "a", &oid );
+ BSONObj built = b.done();
+ ASSERT_EQUALS( "{ \"a\" : { \"$oid\" : \"ffffffffffffffffffffffff\" } }",
+ built.jsonString( Strict ) );
+ ASSERT_EQUALS( "{ \"a\" : ObjectId( \"ffffffffffffffffffffffff\" ) }",
+ built.jsonString( TenGen ) );
+ }
+ };
+
+ class BinData {
+ public:
+ void run() {
+ char z[ 3 ];
+ z[ 0 ] = 'a';
+ z[ 1 ] = 'b';
+ z[ 2 ] = 'c';
+ BSONObjBuilder b;
+ b.appendBinData( "a", 3, BinDataGeneral, z );
+
+ string o = b.done().jsonString( Strict );
+
+ ASSERT_EQUALS( "{ \"a\" : { \"$binary\" : \"YWJj\", \"$type\" : \"00\" } }",
+ o );
+
+ BSONObjBuilder c;
+ c.appendBinData( "a", 2, BinDataGeneral, z );
+ ASSERT_EQUALS( "{ \"a\" : { \"$binary\" : \"YWI=\", \"$type\" : \"00\" } }",
+ c.done().jsonString( Strict ) );
+
+ BSONObjBuilder d;
+ d.appendBinData( "a", 1, BinDataGeneral, z );
+ ASSERT_EQUALS( "{ \"a\" : { \"$binary\" : \"YQ==\", \"$type\" : \"00\" } }",
+ d.done().jsonString( Strict ) );
+ }
+ };
+
+ class Symbol {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.appendSymbol( "a", "b" );
+ ASSERT_EQUALS( "{ \"a\" : \"b\" }", b.done().jsonString( Strict ) );
+ }
+ };
+
+ class Date {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.appendDate( "a", 0 );
+ BSONObj built = b.done();
+ ASSERT_EQUALS( "{ \"a\" : { \"$date\" : 0 } }", built.jsonString( Strict ) );
+ ASSERT_EQUALS( "{ \"a\" : Date( 0 ) }", built.jsonString( TenGen ) );
+ ASSERT_EQUALS( "{ \"a\" : Date( 0 ) }", built.jsonString( JS ) );
+ }
+ };
+
+ class Regex {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.appendRegex( "a", "abc", "i" );
+ BSONObj built = b.done();
+ ASSERT_EQUALS( "{ \"a\" : { \"$regex\" : \"abc\", \"$options\" : \"i\" } }",
+ built.jsonString( Strict ) );
+ ASSERT_EQUALS( "{ \"a\" : /abc/i }", built.jsonString( TenGen ) );
+ ASSERT_EQUALS( "{ \"a\" : /abc/i }", built.jsonString( JS ) );
+ }
+ };
+
+ class RegexEscape {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.appendRegex( "a", "/\"", "i" );
+ BSONObj built = b.done();
+ ASSERT_EQUALS( "{ \"a\" : { \"$regex\" : \"/\\\"\", \"$options\" : \"i\" } }",
+ built.jsonString( Strict ) );
+ ASSERT_EQUALS( "{ \"a\" : /\\/\\\"/i }", built.jsonString( TenGen ) );
+ ASSERT_EQUALS( "{ \"a\" : /\\/\\\"/i }", built.jsonString( JS ) );
+ }
+ };
+
+ class RegexManyOptions {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.appendRegex( "a", "z", "abcgimx" );
+ BSONObj built = b.done();
+ ASSERT_EQUALS( "{ \"a\" : { \"$regex\" : \"z\", \"$options\" : \"abcgimx\" } }",
+ built.jsonString( Strict ) );
+ ASSERT_EQUALS( "{ \"a\" : /z/gim }", built.jsonString( TenGen ) );
+ ASSERT_EQUALS( "{ \"a\" : /z/gim }", built.jsonString( JS ) );
+ }
+ };
+
+ class CodeTests {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.appendCode( "x" , "function(){ return 1; }" );
+ BSONObj o = b.obj();
+ ASSERT_EQUALS( "{ \"x\" : function(){ return 1; } }" , o.jsonString() );
+ }
+ };
+
+ class TimestampTests {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.appendTimestamp( "x" , 4000 , 10 );
+ BSONObj o = b.obj();
+ ASSERT_EQUALS( "{ \"x\" : { \"t\" : 4000 , \"i\" : 10 } }" , o.jsonString() );
+ }
+ };
+
+ class NullString {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.append( "x" , "a\0b" , 4 );
+ BSONObj o = b.obj();
+ ASSERT_EQUALS( "{ \"x\" : \"a\\u0000b\" }" , o.jsonString() );
+ }
+ };
+
+ class AllTypes {
+ public:
+ void run() {
+ OID oid;
+ oid.init();
+
+ BSONObjBuilder b;
+ b.appendMinKey( "a" );
+ b.append( "b" , 5.5 );
+ b.append( "c" , "abc" );
+ b.append( "e" , BSON( "x" << 1 ) );
+ b.append( "f" , BSON_ARRAY( 1 << 2 << 3 ) );
+ b.appendBinData( "g" , 5 , bdtCustom , (const char*)this );
+ b.appendUndefined( "h" );
+ b.append( "i" , oid );
+ b.appendBool( "j" , 1 );
+ b.appendDate( "k" , 123 );
+ b.appendNull( "l" );
+ b.appendRegex( "m" , "a" );
+ b.appendDBRef( "n" , "foo" , oid );
+ b.appendCode( "o" , "function(){}" );
+ b.appendSymbol( "p" , "foo" );
+ b.appendCodeWScope( "q" , "function(){}" , BSON("x" << 1 ) );
+ b.append( "r" , (int)5 );
+ b.appendTimestamp( "s" , 123123123123123LL );
+ b.append( "t" , 12321312312LL );
+ b.appendMaxKey( "u" );
+
+ BSONObj o = b.obj();
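+                // Smoke test: just make sure serializing every type doesn't throw.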
+ o.jsonString();
+ //cout << o.jsonString() << endl;
+ }
+ };
+
+ } // namespace JsonStringTests
+
+ namespace FromJsonTests {
+
+ class Base {
+ public:
+ virtual ~Base() {}
+ void run() {
+ ASSERT( fromjson( json() ).valid() );
+ assertEquals( bson(), fromjson( json() ) );
+ assertEquals( bson(), fromjson( bson().jsonString( Strict ) ) );
+ assertEquals( bson(), fromjson( bson().jsonString( TenGen ) ) );
+ assertEquals( bson(), fromjson( bson().jsonString( JS ) ) );
+ }
+ protected:
+ virtual BSONObj bson() const = 0;
+ virtual string json() const = 0;
+ private:
+ static void assertEquals( const BSONObj &expected, const BSONObj &actual ) {
+ if ( expected.woCompare( actual ) ) {
+ out() << "want:" << expected.jsonString() << " size: " << expected.objsize() << endl;
+ out() << "got :" << actual.jsonString() << " size: " << actual.objsize() << endl;
+ out() << expected.hexDump() << endl;
+ out() << actual.hexDump() << endl;
+ }
+ ASSERT( !expected.woCompare( actual ) );
+ }
+ };
+
+ class Bad {
+ public:
+ virtual ~Bad() {}
+ void run() {
+ ASSERT_THROWS( fromjson( json() ), MsgAssertionException );
+ }
+ protected:
+ virtual string json() const = 0;
+ };
+
+ class Empty : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{}";
+ }
+ };
+
+ class EmptyWithSpace : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ }";
+ }
+ };
+
+ class SingleString : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.append( "a", "b" );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : \"b\" }";
+ }
+ };
+
+ class EmptyStrings : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.append( "", "" );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"\" : \"\" }";
+ }
+ };
+
+ class ReservedFieldName : public Bad {
+ virtual string json() const {
+ return "{ \"$oid\" : \"b\" }";
+ }
+ };
+
+ class OkDollarFieldName : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.append( "$where", 1 );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"$where\" : 1 }";
+ }
+ };
+
+ class SingleNumber : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.append( "a", 1 );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : 1 }";
+ }
+ };
+
+ class RealNumber : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.append( "a", strtod( "0.7", 0 ) );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : 0.7 }";
+ }
+ };
+
+ class FancyNumber : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.append( "a", strtod( "-4.4433e-2", 0 ) );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : -4.4433e-2 }";
+ }
+ };
+
+ class TwoElements : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.append( "a", 1 );
+ b.append( "b", "foo" );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : 1, \"b\" : \"foo\" }";
+ }
+ };
+
+ class Subobject : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.append( "a", 1 );
+ BSONObjBuilder c;
+ c.append( "z", b.done() );
+ return c.obj();
+ }
+ virtual string json() const {
+ return "{ \"z\" : { \"a\" : 1 } }";
+ }
+ };
+
+ class ArrayEmpty : public Base {
+ virtual BSONObj bson() const {
+ vector< int > arr;
+ BSONObjBuilder b;
+ b.append( "a", arr );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : [] }";
+ }
+ };
+
+ class Array : public Base {
+ virtual BSONObj bson() const {
+ vector< int > arr;
+ arr.push_back( 1 );
+ arr.push_back( 2 );
+ arr.push_back( 3 );
+ BSONObjBuilder b;
+ b.append( "a", arr );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : [ 1, 2, 3 ] }";
+ }
+ };
+
+ class True : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.appendBool( "a", true );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : true }";
+ }
+ };
+
+ class False : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.appendBool( "a", false );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : false }";
+ }
+ };
+
+ class Null : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.appendNull( "a" );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : null }";
+ }
+ };
+
+ class EscapedCharacters : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.append( "a", "\" \\ / \b \f \n \r \t \v" );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : \"\\\" \\\\ \\/ \\b \\f \\n \\r \\t \\v\" }";
+ }
+ };
+
+ class NonEscapedCharacters : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.append( "a", "% { a z $ # ' " );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : \"\\% \\{ \\a \\z \\$ \\# \\' \\ \" }";
+ }
+ };
+
+ class AllowedControlCharacter : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.append( "a", "\x7f" );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : \"\x7f\" }";
+ }
+ };
+
+ class EscapeFieldName : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.append( "\n", "b" );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"\\n\" : \"b\" }";
+ }
+ };
+
+ class EscapedUnicodeToUtf8 : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ unsigned char u[ 7 ];
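+                // Build the three-byte UTF-8 encoding of U+A000 (0xEA 0x80 0x80) twice:
+                // 0xE0 | 0x0A yields the lead byte, followed by two 0x80 continuation bytes.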
+ u[ 0 ] = 0xe0 | 0x0a;
+ u[ 1 ] = 0x80;
+ u[ 2 ] = 0x80;
+ u[ 3 ] = 0xe0 | 0x0a;
+ u[ 4 ] = 0x80;
+ u[ 5 ] = 0x80;
+ u[ 6 ] = 0;
+ b.append( "a", (char *) u );
+ BSONObj built = b.obj();
+ ASSERT_EQUALS( string( (char *) u ), built.firstElement().valuestr() );
+ return built;
+ }
+ virtual string json() const {
+ return "{ \"a\" : \"\\ua000\\uA000\" }";
+ }
+ };
+
+ class Utf8AllOnes : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ unsigned char u[ 8 ];
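+                // Maximal code points per UTF-8 width: U+007F (one byte), U+07FF
+                // (0xDF 0xBF), U+FFFF (0xEF 0xBF 0xBF); plus the minimal U+0001.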
+ u[ 0 ] = 0x01;
+
+ u[ 1 ] = 0x7f;
+
+ u[ 2 ] = 0xdf;
+ u[ 3 ] = 0xbf;
+
+ u[ 4 ] = 0xef;
+ u[ 5 ] = 0xbf;
+ u[ 6 ] = 0xbf;
+
+ u[ 7 ] = 0;
+
+ b.append( "a", (char *) u );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : \"\\u0001\\u007f\\u07ff\\uffff\" }";
+ }
+ };
+
+ class Utf8FirstByteOnes : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ unsigned char u[ 6 ];
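+                // UTF-8 sequences decoded here: 0xDC 0x80 is U+0700; 0xEF 0xBC 0x80 is U+FF00.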
+ u[ 0 ] = 0xdc;
+ u[ 1 ] = 0x80;
+
+ u[ 2 ] = 0xef;
+ u[ 3 ] = 0xbc;
+ u[ 4 ] = 0x80;
+
+ u[ 5 ] = 0;
+
+ b.append( "a", (char *) u );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : \"\\u0700\\uff00\" }";
+ }
+ };
+
+ class DBRef : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ OID o;
+ memset( &o, 0, 12 );
+ b.appendDBRef( "a", "foo", o );
+ return b.obj();
+ }
+            // NOTE Testing other formats handled by the Base class.
+ virtual string json() const {
+ return "{ \"a\" : { \"$ref\" : \"foo\", \"$id\" : \"000000000000000000000000\" } }";
+ }
+ };
+
+ class NewDBRef : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ OID o;
+ memset( &o, 0, 12 );
+ b.append( "$ref", "items" );
+ b.appendOID( "$id", &o );
+ BSONObjBuilder c;
+ c.append( "refval", b.done() );
+ return c.obj();
+ }
+ virtual string json() const {
+ return "{ \"refval\" : { \"$ref\" : \"items\", \"$id\" : ObjectId( \"000000000000000000000000\" ) } }";
+ }
+ };
+
+ class Oid : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.appendOID( "_id" );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"_id\" : { \"$oid\" : \"000000000000000000000000\" } }";
+ }
+ };
+
+ class Oid2 : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ OID o;
+ memset( &o, 0x0f, 12 );
+ b.appendOID( "_id", &o );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"_id\" : ObjectId( \"0f0f0f0f0f0f0f0f0f0f0f0f\" ) }";
+ }
+ };
+
+ class StringId : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.append("_id", "000000000000000000000000");
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"_id\" : \"000000000000000000000000\" }";
+ }
+ };
+
+ class BinData : public Base {
+ virtual BSONObj bson() const {
+ char z[ 3 ];
+ z[ 0 ] = 'a';
+ z[ 1 ] = 'b';
+ z[ 2 ] = 'c';
+ BSONObjBuilder b;
+ b.appendBinData( "a", 3, BinDataGeneral, z );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : { \"$binary\" : \"YWJj\", \"$type\" : \"00\" } }";
+ }
+ };
+
+ class BinDataPaddedSingle : public Base {
+ virtual BSONObj bson() const {
+ char z[ 2 ];
+ z[ 0 ] = 'a';
+ z[ 1 ] = 'b';
+ BSONObjBuilder b;
+ b.appendBinData( "a", 2, BinDataGeneral, z );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : { \"$binary\" : \"YWI=\", \"$type\" : \"00\" } }";
+ }
+ };
+
+ class BinDataPaddedDouble : public Base {
+ virtual BSONObj bson() const {
+ char z[ 1 ];
+ z[ 0 ] = 'a';
+ BSONObjBuilder b;
+ b.appendBinData( "a", 1, BinDataGeneral, z );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : { \"$binary\" : \"YQ==\", \"$type\" : \"00\" } }";
+ }
+ };
+
+ class BinDataAllChars : public Base {
+ virtual BSONObj bson() const {
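+                // These 48 bytes base64-encode to the complete 64-character base64 alphabet.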
+ unsigned char z[] = {
+ 0x00, 0x10, 0x83, 0x10, 0x51, 0x87, 0x20, 0x92, 0x8B, 0x30,
+ 0xD3, 0x8F, 0x41, 0x14, 0x93, 0x51, 0x55, 0x97, 0x61, 0x96,
+ 0x9B, 0x71, 0xD7, 0x9F, 0x82, 0x18, 0xA3, 0x92, 0x59, 0xA7,
+ 0xA2, 0x9A, 0xAB, 0xB2, 0xDB, 0xAF, 0xC3, 0x1C, 0xB3, 0xD3,
+ 0x5D, 0xB7, 0xE3, 0x9E, 0xBB, 0xF3, 0xDF, 0xBF
+ };
+ BSONObjBuilder b;
+ b.appendBinData( "a", 48, BinDataGeneral, z );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : { \"$binary\" : \"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/\", \"$type\" : \"00\" } }";
+ }
+ };
+
+ class Date : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.appendDate( "a", 0 );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : { \"$date\" : 0 } }";
+ }
+ };
+
+ class DateNonzero : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.appendDate( "a", 100 );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : { \"$date\" : 100 } }";
+ }
+ };
+
+ class DateTooLong : public Bad {
+ virtual string json() const {
+ stringstream ss;
+                // One digit past the largest long long guarantees an out-of-range $date value.
+                ss << "{ \"a\" : { \"$date\" : " << numeric_limits<long long>::max() << "0" << " } }";
+ return ss.str();
+ }
+ };
+
+ class Regex : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.appendRegex( "a", "b", "i" );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : { \"$regex\" : \"b\", \"$options\" : \"i\" } }";
+ }
+ };
+
+ class RegexEscape : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.appendRegex( "a", "\t", "i" );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : { \"$regex\" : \"\\t\", \"$options\" : \"i\" } }";
+ }
+ };
+
+ class RegexWithQuotes : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.appendRegex( "a", "\"", "" );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"a\" : /\"/ }";
+ }
+ };
+
+ class RegexInvalidOption : public Bad {
+ virtual string json() const {
+ return "{ \"a\" : { \"$regex\" : \"b\", \"$options\" : \"1\" } }";
+ }
+ };
+
+ class RegexInvalidOption2 : public Bad {
+ virtual string json() const {
+ return "{ \"a\" : /b/c }";
+ }
+ };
+
+ class Malformed : public Bad {
+ string json() const {
+ return "{";
+ }
+ };
+
+ class UnquotedFieldName : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.append( "a_b", 1 );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ a_b : 1 }";
+ }
+ };
+
+ class UnquotedFieldNameDollar : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.append( "$a_b", 1 );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ $a_b : 1 }";
+ }
+ };
+
+ class SingleQuotes : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.append( "ab'c\"", "bb\b '\"" );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ 'ab\\'c\"' : 'bb\\b \\'\"' }";
+ }
+ };
+
+ class ObjectId : public Base {
+ virtual BSONObj bson() const {
+ OID id;
+ id.init( "deadbeeff00ddeadbeeff00d" );
+ BSONObjBuilder b;
+ b.appendOID( "_id", &id );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"_id\": ObjectId( \"deadbeeff00ddeadbeeff00d\" ) }";
+ }
+ };
+
+ class ObjectId2 : public Base {
+ virtual BSONObj bson() const {
+ OID id;
+ id.init( "deadbeeff00ddeadbeeff00d" );
+ BSONObjBuilder b;
+ b.appendOID( "foo", &id );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"foo\": ObjectId( \"deadbeeff00ddeadbeeff00d\" ) }";
+ }
+ };
+
+ class NumericTypes : public Base {
+ public:
+ void run() {
+ Base::run();
+
+ BSONObj o = fromjson(json());
+
+ ASSERT(o["int"].type() == NumberInt);
+ ASSERT(o["long"].type() == NumberLong);
+ ASSERT(o["double"].type() == NumberDouble);
+
+ ASSERT(o["long"].numberLong() == 9223372036854775807ll);
+ }
+
+ virtual BSONObj bson() const {
+ return BSON( "int" << 123
+ << "long" << 9223372036854775807ll // 2**63 - 1
+ << "double" << 3.14
+ );
+ }
+ virtual string json() const {
+ return "{ \"int\": 123, \"long\": 9223372036854775807, \"double\": 3.14 }";
+ }
+ };
+
+ class NegativeNumericTypes : public Base {
+ public:
+ void run() {
+ Base::run();
+
+ BSONObj o = fromjson(json());
+
+ ASSERT(o["int"].type() == NumberInt);
+ ASSERT(o["long"].type() == NumberLong);
+ ASSERT(o["double"].type() == NumberDouble);
+
+ ASSERT(o["long"].numberLong() == -9223372036854775807ll);
+ }
+
+ virtual BSONObj bson() const {
+ return BSON( "int" << -123
+ << "long" << -9223372036854775807ll // -1 * (2**63 - 1)
+ << "double" << -3.14
+ );
+ }
+ virtual string json() const {
+ return "{ \"int\": -123, \"long\": -9223372036854775807, \"double\": -3.14 }";
+ }
+ };
+
+ class EmbeddedDatesBase : public Base {
+ public:
+
+ virtual void run() {
+ BSONObj o = fromjson( json() );
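+                // BSON type codes: 3 = Object, 9 = Date.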
+ ASSERT_EQUALS( 3 , (o["time.valid"].type()) );
+ BSONObj e = o["time.valid"].embeddedObjectUserCheck();
+ ASSERT_EQUALS( 9 , e["$gt"].type() );
+ ASSERT_EQUALS( 9 , e["$lt"].type() );
+ Base::run();
+ }
+
+ BSONObj bson() const {
+ BSONObjBuilder e;
+ e.appendDate( "$gt" , 1257829200000LL );
+ e.appendDate( "$lt" , 1257829200100LL );
+
+ BSONObjBuilder b;
+ b.append( "time.valid" , e.obj() );
+ return b.obj();
+ }
+ virtual string json() const = 0;
+ };
+
+ struct EmbeddedDatesFormat1 : EmbeddedDatesBase {
+ string json() const {
+ return "{ \"time.valid\" : { $gt : { \"$date\" : 1257829200000 } , $lt : { \"$date\" : 1257829200100 } } }";
+ }
+ };
+ struct EmbeddedDatesFormat2 : EmbeddedDatesBase {
+ string json() const {
+ return "{ \"time.valid\" : { $gt : Date(1257829200000) , $lt : Date( 1257829200100 ) } }";
+ }
+ };
+ struct EmbeddedDatesFormat3 : EmbeddedDatesBase {
+ string json() const {
+ return "{ \"time.valid\" : { $gt : new Date(1257829200000) , $lt : new Date( 1257829200100 ) } }";
+ }
+ };
+
+ class NullString : public Base {
+ virtual BSONObj bson() const {
+ BSONObjBuilder b;
+ b.append( "x" , "a\0b" , 4 );
+ return b.obj();
+ }
+ virtual string json() const {
+ return "{ \"x\" : \"a\\u0000b\" }";
+ }
+ };
+
+ } // namespace FromJsonTests
+
+ class All : public Suite {
+ public:
+ All() : Suite( "json" ) {
+ }
+
+ void setupTests() {
+ add< JsonStringTests::Empty >();
+ add< JsonStringTests::SingleStringMember >();
+ add< JsonStringTests::EscapedCharacters >();
+ add< JsonStringTests::AdditionalControlCharacters >();
+ add< JsonStringTests::ExtendedAscii >();
+ add< JsonStringTests::EscapeFieldName >();
+ add< JsonStringTests::SingleIntMember >();
+ add< JsonStringTests::SingleNumberMember >();
+ add< JsonStringTests::InvalidNumbers >();
+ add< JsonStringTests::NumberPrecision >();
+ add< JsonStringTests::NegativeNumber >();
+ add< JsonStringTests::SingleBoolMember >();
+ add< JsonStringTests::SingleNullMember >();
+ add< JsonStringTests::SingleObjectMember >();
+ add< JsonStringTests::TwoMembers >();
+ add< JsonStringTests::EmptyArray >();
+ add< JsonStringTests::Array >();
+ add< JsonStringTests::DBRef >();
+ add< JsonStringTests::DBRefZero >();
+ add< JsonStringTests::ObjectId >();
+ add< JsonStringTests::BinData >();
+ add< JsonStringTests::Symbol >();
+ add< JsonStringTests::Date >();
+ add< JsonStringTests::Regex >();
+ add< JsonStringTests::RegexEscape >();
+ add< JsonStringTests::RegexManyOptions >();
+ add< JsonStringTests::CodeTests >();
+ add< JsonStringTests::TimestampTests >();
+ add< JsonStringTests::NullString >();
+ add< JsonStringTests::AllTypes >();
+
+ add< FromJsonTests::Empty >();
+ add< FromJsonTests::EmptyWithSpace >();
+ add< FromJsonTests::SingleString >();
+ add< FromJsonTests::EmptyStrings >();
+ add< FromJsonTests::ReservedFieldName >();
+ add< FromJsonTests::OkDollarFieldName >();
+ add< FromJsonTests::SingleNumber >();
+ add< FromJsonTests::RealNumber >();
+ add< FromJsonTests::FancyNumber >();
+ add< FromJsonTests::TwoElements >();
+ add< FromJsonTests::Subobject >();
+ add< FromJsonTests::ArrayEmpty >();
+ add< FromJsonTests::Array >();
+ add< FromJsonTests::True >();
+ add< FromJsonTests::False >();
+ add< FromJsonTests::Null >();
+ add< FromJsonTests::EscapedCharacters >();
+ add< FromJsonTests::NonEscapedCharacters >();
+ add< FromJsonTests::AllowedControlCharacter >();
+ add< FromJsonTests::EscapeFieldName >();
+ add< FromJsonTests::EscapedUnicodeToUtf8 >();
+ add< FromJsonTests::Utf8AllOnes >();
+ add< FromJsonTests::Utf8FirstByteOnes >();
+ add< FromJsonTests::DBRef >();
+ add< FromJsonTests::NewDBRef >();
+ add< FromJsonTests::Oid >();
+ add< FromJsonTests::Oid2 >();
+ add< FromJsonTests::StringId >();
+ add< FromJsonTests::BinData >();
+ add< FromJsonTests::BinDataPaddedSingle >();
+ add< FromJsonTests::BinDataPaddedDouble >();
+ add< FromJsonTests::BinDataAllChars >();
+ add< FromJsonTests::Date >();
+ add< FromJsonTests::DateNonzero >();
+ add< FromJsonTests::DateTooLong >();
+ add< FromJsonTests::Regex >();
+ add< FromJsonTests::RegexEscape >();
+ add< FromJsonTests::RegexWithQuotes >();
+ add< FromJsonTests::RegexInvalidOption >();
+ add< FromJsonTests::RegexInvalidOption2 >();
+ add< FromJsonTests::Malformed >();
+ add< FromJsonTests::UnquotedFieldName >();
+ add< FromJsonTests::UnquotedFieldNameDollar >();
+ add< FromJsonTests::SingleQuotes >();
+ add< FromJsonTests::ObjectId >();
+ add< FromJsonTests::ObjectId2 >();
+ add< FromJsonTests::NumericTypes >();
+ add< FromJsonTests::NegativeNumericTypes >();
+ add< FromJsonTests::EmbeddedDatesFormat1 >();
+ add< FromJsonTests::EmbeddedDatesFormat2 >();
+ add< FromJsonTests::EmbeddedDatesFormat3 >();
+ add< FromJsonTests::NullString >();
+ }
+ } myall;
+
+} // namespace JsonTests
+
diff --git a/src/mongo/dbtests/jstests.cpp b/src/mongo/dbtests/jstests.cpp
new file mode 100644
index 00000000000..9782eedaacb
--- /dev/null
+++ b/src/mongo/dbtests/jstests.cpp
@@ -0,0 +1,1052 @@
+// jstests.cpp - tests for the server-side JavaScript engine
+//
+
+/**
+ * Copyright (C) 2009 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../db/instance.h"
+
+#include "../pch.h"
+#include "../scripting/engine.h"
+#include "../util/timer.h"
+
+#include "dbtests.h"
+
+namespace mongo {
+ bool dbEval(const string& dbName , BSONObj& cmd, BSONObjBuilder& result, string& errmsg);
+} // namespace mongo
+
+namespace JSTests {
+
+ class Fundamental {
+ public:
+ void run() {
+            // By initializing the script engine inside run(), we ensure the unit test
+            // framework's signal handlers are already installed, so the engine can
+            // catch signals generated within its VM and forward others as appropriate.
+ ScriptEngine::setup();
+ globalScriptEngine->runTest();
+ }
+ };
+
+ class BasicScope {
+ public:
+ void run() {
+ auto_ptr<Scope> s;
+ s.reset( globalScriptEngine->newScope() );
+
+ s->setNumber( "x" , 5 );
+ ASSERT( 5 == s->getNumber( "x" ) );
+
+ s->setNumber( "x" , 1.67 );
+ ASSERT( 1.67 == s->getNumber( "x" ) );
+
+ s->setString( "s" , "eliot was here" );
+ ASSERT( "eliot was here" == s->getString( "s" ) );
+
+ s->setBoolean( "b" , true );
+ ASSERT( s->getBoolean( "b" ) );
+
+ if ( 0 ) {
+ s->setBoolean( "b" , false );
+ ASSERT( ! s->getBoolean( "b" ) );
+ }
+ }
+ };
+
+ class ResetScope {
+ public:
+ void run() {
+ // Not worrying about this for now SERVER-446.
+ /*
+ auto_ptr<Scope> s;
+ s.reset( globalScriptEngine->newScope() );
+
+ s->setBoolean( "x" , true );
+ ASSERT( s->getBoolean( "x" ) );
+
+ s->reset();
+ ASSERT( !s->getBoolean( "x" ) );
+ */
+ }
+ };
+
+ class FalseTests {
+ public:
+ void run() {
+ Scope * s = globalScriptEngine->newScope();
+
+ ASSERT( ! s->getBoolean( "x" ) );
+
+ s->setString( "z" , "" );
+ ASSERT( ! s->getBoolean( "z" ) );
+
+
+ delete s ;
+ }
+ };
+
+ class SimpleFunctions {
+ public:
+ void run() {
+ Scope * s = globalScriptEngine->newScope();
+
+ s->invoke( "x=5;" , 0, 0 );
+ ASSERT( 5 == s->getNumber( "x" ) );
+
+ s->invoke( "return 17;" , 0, 0 );
+ ASSERT( 17 == s->getNumber( "return" ) );
+
+ s->invoke( "function(){ return 17; }" , 0, 0 );
+ ASSERT( 17 == s->getNumber( "return" ) );
+
+ s->setNumber( "x" , 1.76 );
+ s->invoke( "return x == 1.76; " , 0, 0 );
+ ASSERT( s->getBoolean( "return" ) );
+
+ s->setNumber( "x" , 1.76 );
+ s->invoke( "return x == 1.79; " , 0, 0 );
+ ASSERT( ! s->getBoolean( "return" ) );
+
+ BSONObj obj = BSON( "" << 11.0 );
+ s->invoke( "function( z ){ return 5 + z; }" , &obj, 0 );
+ ASSERT_EQUALS( 16 , s->getNumber( "return" ) );
+
+ delete s;
+ }
+ };
+
+ class ObjectMapping {
+ public:
+ void run() {
+ Scope * s = globalScriptEngine->newScope();
+
+ BSONObj o = BSON( "x" << 17.0 << "y" << "eliot" << "z" << "sara" );
+ s->setObject( "blah" , o );
+
+ s->invoke( "return blah.x;" , 0, 0 );
+ ASSERT_EQUALS( 17 , s->getNumber( "return" ) );
+ s->invoke( "return blah.y;" , 0, 0 );
+ ASSERT_EQUALS( "eliot" , s->getString( "return" ) );
+
+ s->invoke( "return this.z;" , 0, &o );
+ ASSERT_EQUALS( "sara" , s->getString( "return" ) );
+
+ s->invoke( "return this.z == 'sara';" , 0, &o );
+ ASSERT_EQUALS( true , s->getBoolean( "return" ) );
+
+ s->invoke( "this.z == 'sara';" , 0, &o );
+ ASSERT_EQUALS( true , s->getBoolean( "return" ) );
+
+ s->invoke( "this.z == 'asara';" , 0, &o );
+ ASSERT_EQUALS( false , s->getBoolean( "return" ) );
+
+ s->invoke( "return this.x == 17;" , 0, &o );
+ ASSERT_EQUALS( true , s->getBoolean( "return" ) );
+
+ s->invoke( "return this.x == 18;" , 0, &o );
+ ASSERT_EQUALS( false , s->getBoolean( "return" ) );
+
+ s->invoke( "function(){ return this.x == 17; }" , 0, &o );
+ ASSERT_EQUALS( true , s->getBoolean( "return" ) );
+
+ s->invoke( "function(){ return this.x == 18; }" , 0, &o );
+ ASSERT_EQUALS( false , s->getBoolean( "return" ) );
+
+ s->invoke( "function (){ return this.x == 17; }" , 0, &o );
+ ASSERT_EQUALS( true , s->getBoolean( "return" ) );
+
+ s->invoke( "function z(){ return this.x == 18; }" , 0, &o );
+ ASSERT_EQUALS( false , s->getBoolean( "return" ) );
+
+ s->invoke( "function (){ this.x == 17; }" , 0, &o );
+ ASSERT_EQUALS( false , s->getBoolean( "return" ) );
+
+ s->invoke( "function z(){ this.x == 18; }" , 0, &o );
+ ASSERT_EQUALS( false , s->getBoolean( "return" ) );
+
+ s->invoke( "x = 5; for( ; x <10; x++){ a = 1; }" , 0, &o );
+ ASSERT_EQUALS( 10 , s->getNumber( "x" ) );
+
+ delete s;
+ }
+ };
+
+ class ObjectDecoding {
+ public:
+ void run() {
+ Scope * s = globalScriptEngine->newScope();
+
+ s->invoke( "z = { num : 1 };" , 0, 0 );
+ BSONObj out = s->getObject( "z" );
+ ASSERT_EQUALS( 1 , out["num"].number() );
+ ASSERT_EQUALS( 1 , out.nFields() );
+
+ s->invoke( "z = { x : 'eliot' };" , 0, 0 );
+ out = s->getObject( "z" );
+ ASSERT_EQUALS( (string)"eliot" , out["x"].valuestr() );
+ ASSERT_EQUALS( 1 , out.nFields() );
+
+ BSONObj o = BSON( "x" << 17 );
+ s->setObject( "blah" , o );
+ out = s->getObject( "blah" );
+ ASSERT_EQUALS( 17 , out["x"].number() );
+
+ delete s;
+ }
+ };
+
+ class JSOIDTests {
+ public:
+ void run() {
+#ifdef MOZJS
+ Scope * s = globalScriptEngine->newScope();
+
+ s->localConnect( "blah" );
+
+ s->invoke( "z = { _id : new ObjectId() , a : 123 };" , 0, 0 );
+ BSONObj out = s->getObject( "z" );
+ ASSERT_EQUALS( 123 , out["a"].number() );
+ ASSERT_EQUALS( jstOID , out["_id"].type() );
+
+ OID save = out["_id"].__oid();
+
+ s->setObject( "a" , out );
+
+ s->invoke( "y = { _id : a._id , a : 124 };" , 0, 0 );
+ out = s->getObject( "y" );
+ ASSERT_EQUALS( 124 , out["a"].number() );
+ ASSERT_EQUALS( jstOID , out["_id"].type() );
+ ASSERT_EQUALS( out["_id"].__oid().str() , save.str() );
+
+ s->invoke( "y = { _id : new ObjectId( a._id ) , a : 125 };" , 0, 0 );
+ out = s->getObject( "y" );
+ ASSERT_EQUALS( 125 , out["a"].number() );
+ ASSERT_EQUALS( jstOID , out["_id"].type() );
+ ASSERT_EQUALS( out["_id"].__oid().str() , save.str() );
+
+ delete s;
+#endif
+ }
+ };
+
+ class SetImplicit {
+ public:
+ void run() {
+ Scope *s = globalScriptEngine->newScope();
+
+ BSONObj o = BSON( "foo" << "bar" );
+ s->setObject( "a.b", o );
+ ASSERT( s->getObject( "a" ).isEmpty() );
+
+ BSONObj o2 = BSONObj();
+ s->setObject( "a", o2 );
+ s->setObject( "a.b", o );
+ ASSERT( s->getObject( "a" ).isEmpty() );
+
+ o2 = fromjson( "{b:{}}" );
+ s->setObject( "a", o2 );
+ s->setObject( "a.b", o );
+ ASSERT( !s->getObject( "a" ).isEmpty() );
+ }
+ };
+
+ class ObjectModReadonlyTests {
+ public:
+ void run() {
+ Scope * s = globalScriptEngine->newScope();
+
+ BSONObj o = BSON( "x" << 17 << "y" << "eliot" << "z" << "sara" << "zz" << BSONObj() );
+ s->setObject( "blah" , o , true );
+
+ s->invoke( "blah.y = 'e'", 0, 0 );
+ BSONObj out = s->getObject( "blah" );
+ ASSERT( strlen( out["y"].valuestr() ) > 1 );
+
+ s->invoke( "blah.a = 19;" , 0, 0 );
+ out = s->getObject( "blah" );
+ ASSERT( out["a"].eoo() );
+
+ s->invoke( "blah.zz.a = 19;" , 0, 0 );
+ out = s->getObject( "blah" );
+ ASSERT( out["zz"].embeddedObject()["a"].eoo() );
+
+ s->setObject( "blah.zz", BSON( "a" << 19 ) );
+ out = s->getObject( "blah" );
+ ASSERT( out["zz"].embeddedObject()["a"].eoo() );
+
+ s->invoke( "delete blah['x']" , 0, 0 );
+ out = s->getObject( "blah" );
+ ASSERT( !out["x"].eoo() );
+
+ // read-only object itself can be overwritten
+ s->invoke( "blah = {}", 0, 0 );
+ out = s->getObject( "blah" );
+ ASSERT( out.isEmpty() );
+
+ // test array - can't implement this in v8
+// o = fromjson( "{a:[1,2,3]}" );
+// s->setObject( "blah", o, true );
+// out = s->getObject( "blah" );
+// s->invoke( "blah.a[ 0 ] = 4;", BSONObj() );
+// s->invoke( "delete blah['a'][ 2 ];", BSONObj() );
+// out = s->getObject( "blah" );
+// ASSERT_EQUALS( 1.0, out[ "a" ].embeddedObject()[ 0 ].number() );
+// ASSERT_EQUALS( 3.0, out[ "a" ].embeddedObject()[ 2 ].number() );
+
+ delete s;
+ }
+ };
+
+ class OtherJSTypes {
+ public:
+ void run() {
+ Scope * s = globalScriptEngine->newScope();
+
+ {
+ // date
+ BSONObj o;
+ {
+ BSONObjBuilder b;
+ b.appendDate( "d" , 123456789 );
+ o = b.obj();
+ }
+ s->setObject( "x" , o );
+
+ s->invoke( "return x.d.getTime() != 12;" , 0, 0 );
+ ASSERT_EQUALS( true, s->getBoolean( "return" ) );
+
+ s->invoke( "z = x.d.getTime();" , 0, 0 );
+ ASSERT_EQUALS( 123456789 , s->getNumber( "z" ) );
+
+ s->invoke( "z = { z : x.d }" , 0, 0 );
+ BSONObj out = s->getObject( "z" );
+ ASSERT( out["z"].type() == Date );
+ }
+
+ {
+ // regex
+ BSONObj o;
+ {
+ BSONObjBuilder b;
+ b.appendRegex( "r" , "^a" , "i" );
+ o = b.obj();
+ }
+ s->setObject( "x" , o );
+
+ s->invoke( "z = x.r.test( 'b' );" , 0, 0 );
+ ASSERT_EQUALS( false , s->getBoolean( "z" ) );
+
+ s->invoke( "z = x.r.test( 'a' );" , 0, 0 );
+ ASSERT_EQUALS( true , s->getBoolean( "z" ) );
+
+ s->invoke( "z = x.r.test( 'ba' );" , 0, 0 );
+ ASSERT_EQUALS( false , s->getBoolean( "z" ) );
+
+ s->invoke( "z = { a : x.r };" , 0, 0 );
+
+ BSONObj out = s->getObject("z");
+ ASSERT_EQUALS( (string)"^a" , out["a"].regex() );
+ ASSERT_EQUALS( (string)"i" , out["a"].regexFlags() );
+
+ }
+
+ // array
+ {
+ BSONObj o = fromjson( "{r:[1,2,3]}" );
+ s->setObject( "x", o, false );
+ BSONObj out = s->getObject( "x" );
+ ASSERT_EQUALS( Array, out.firstElement().type() );
+
+ s->setObject( "x", o, true );
+ out = s->getObject( "x" );
+ ASSERT_EQUALS( Array, out.firstElement().type() );
+ }
+
+ delete s;
+ }
+ };
+
+ class SpecialDBTypes {
+ public:
+ void run() {
+ Scope * s = globalScriptEngine->newScope();
+
+ BSONObjBuilder b;
+ b.appendTimestamp( "a" , 123456789 );
+ b.appendMinKey( "b" );
+ b.appendMaxKey( "c" );
+ b.appendTimestamp( "d" , 1234000 , 9876 );
+
+
+ {
+ BSONObj t = b.done();
+ ASSERT_EQUALS( 1234000U , t["d"].timestampTime() );
+ ASSERT_EQUALS( 9876U , t["d"].timestampInc() );
+ }
+
+ s->setObject( "z" , b.obj() );
+
+ ASSERT( s->invoke( "y = { a : z.a , b : z.b , c : z.c , d: z.d }" , 0, 0 ) == 0 );
+
+ BSONObj out = s->getObject( "y" );
+ ASSERT_EQUALS( Timestamp , out["a"].type() );
+ ASSERT_EQUALS( MinKey , out["b"].type() );
+ ASSERT_EQUALS( MaxKey , out["c"].type() );
+ ASSERT_EQUALS( Timestamp , out["d"].type() );
+
+ ASSERT_EQUALS( 9876U , out["d"].timestampInc() );
+ ASSERT_EQUALS( 1234000U , out["d"].timestampTime() );
+ ASSERT_EQUALS( 123456789U , out["a"].date() );
+
+ delete s;
+ }
+ };
+
+ class TypeConservation {
+ public:
+ void run() {
+ Scope * s = globalScriptEngine->newScope();
+
+ // -- A --
+
+ BSONObj o;
+ {
+ BSONObjBuilder b ;
+ b.append( "a" , (int)5 );
+ b.append( "b" , 5.6 );
+ o = b.obj();
+ }
+ ASSERT_EQUALS( NumberInt , o["a"].type() );
+ ASSERT_EQUALS( NumberDouble , o["b"].type() );
+
+ s->setObject( "z" , o );
+ s->invoke( "return z" , 0, 0 );
+ BSONObj out = s->getObject( "return" );
+ ASSERT_EQUALS( 5 , out["a"].number() );
+ ASSERT_EQUALS( 5.6 , out["b"].number() );
+
+ ASSERT_EQUALS( NumberDouble , out["b"].type() );
+ ASSERT_EQUALS( NumberInt , out["a"].type() );
+
+ // -- B --
+
+ {
+ BSONObjBuilder b ;
+ b.append( "a" , (int)5 );
+ b.append( "b" , 5.6 );
+ o = b.obj();
+ }
+
+ s->setObject( "z" , o , false );
+ s->invoke( "return z" , 0, 0 );
+ out = s->getObject( "return" );
+ ASSERT_EQUALS( 5 , out["a"].number() );
+ ASSERT_EQUALS( 5.6 , out["b"].number() );
+
+ ASSERT_EQUALS( NumberDouble , out["b"].type() );
+ ASSERT_EQUALS( NumberInt , out["a"].type() );
+
+
+ // -- C --
+
+ {
+ BSONObjBuilder b ;
+
+ {
+ BSONObjBuilder c;
+ c.append( "0" , 5.5 );
+ c.append( "1" , 6 );
+ b.appendArray( "a" , c.obj() );
+ }
+
+ o = b.obj();
+ }
+
+ ASSERT_EQUALS( NumberDouble , o["a"].embeddedObjectUserCheck()["0"].type() );
+ ASSERT_EQUALS( NumberInt , o["a"].embeddedObjectUserCheck()["1"].type() );
+
+ s->setObject( "z" , o , false );
+ out = s->getObject( "z" );
+
+ ASSERT_EQUALS( NumberDouble , out["a"].embeddedObjectUserCheck()["0"].type() );
+ ASSERT_EQUALS( NumberInt , out["a"].embeddedObjectUserCheck()["1"].type() );
+
+ s->invokeSafe( "z.z = 5;" , 0, 0 );
+ out = s->getObject( "z" );
+ ASSERT_EQUALS( 5 , out["z"].number() );
+ ASSERT_EQUALS( NumberDouble , out["a"].embeddedObjectUserCheck()["0"].type() );
+ // Commenting so that v8 tests will work
+// ASSERT_EQUALS( NumberDouble , out["a"].embeddedObjectUserCheck()["1"].type() ); // TODO: this is technically bad, but here to make sure that i understand the behavior
+
+
+ // Eliot says I don't have to worry about this case
+
+// // -- D --
+//
+// o = fromjson( "{a:3.0,b:4.5}" );
+// ASSERT_EQUALS( NumberDouble , o["a"].type() );
+// ASSERT_EQUALS( NumberDouble , o["b"].type() );
+//
+// s->setObject( "z" , o , false );
+// s->invoke( "return z" , BSONObj() );
+// out = s->getObject( "return" );
+// ASSERT_EQUALS( 3 , out["a"].number() );
+// ASSERT_EQUALS( 4.5 , out["b"].number() );
+//
+// ASSERT_EQUALS( NumberDouble , out["b"].type() );
+// ASSERT_EQUALS( NumberDouble , out["a"].type() );
+//
+
+ delete s;
+ }
+
+ };
+
+ class NumberLong {
+ public:
+ void run() {
+ auto_ptr<Scope> s( globalScriptEngine->newScope() );
+ s->localConnect( "blah" );
+ BSONObjBuilder b;
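+            // An arbitrary 64-bit pattern too precise to round-trip through a double.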
+ long long val = (long long)( 0xbabadeadbeefbaddULL );
+ b.append( "a", val );
+ BSONObj in = b.obj();
+ s->setObject( "a", in );
+ BSONObj out = s->getObject( "a" );
+ ASSERT_EQUALS( mongo::NumberLong, out.firstElement().type() );
+
+ ASSERT( s->exec( "b = {b:a.a}", "foo", false, true, false ) );
+ out = s->getObject( "b" );
+ ASSERT_EQUALS( mongo::NumberLong, out.firstElement().type() );
+ if( val != out.firstElement().numberLong() ) {
+ cout << val << endl;
+ cout << out.firstElement().numberLong() << endl;
+ cout << out.toString() << endl;
+ ASSERT_EQUALS( val, out.firstElement().numberLong() );
+ }
+
+ ASSERT( s->exec( "c = {c:a.a.toString()}", "foo", false, true, false ) );
+ out = s->getObject( "c" );
+ stringstream ss;
+ ss << "NumberLong(\"" << val << "\")";
+ ASSERT_EQUALS( ss.str(), out.firstElement().valuestr() );
+
+ ASSERT( s->exec( "d = {d:a.a.toNumber()}", "foo", false, true, false ) );
+ out = s->getObject( "d" );
+ ASSERT_EQUALS( NumberDouble, out.firstElement().type() );
+ ASSERT_EQUALS( double( val ), out.firstElement().number() );
+
+ ASSERT( s->exec( "e = {e:a.a.floatApprox}", "foo", false, true, false ) );
+ out = s->getObject( "e" );
+ ASSERT_EQUALS( NumberDouble, out.firstElement().type() );
+ ASSERT_EQUALS( double( val ), out.firstElement().number() );
+
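+            // floatApprox and top are internals of the shell's NumberLong representation.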
+ ASSERT( s->exec( "f = {f:a.a.top}", "foo", false, true, false ) );
+ out = s->getObject( "f" );
+ ASSERT( NumberDouble == out.firstElement().type() || NumberInt == out.firstElement().type() );
+
+ s->setObject( "z", BSON( "z" << (long long)( 4 ) ) );
+ ASSERT( s->exec( "y = {y:z.z.top}", "foo", false, true, false ) );
+ out = s->getObject( "y" );
+ ASSERT_EQUALS( Undefined, out.firstElement().type() );
+
+ ASSERT( s->exec( "x = {x:z.z.floatApprox}", "foo", false, true, false ) );
+ out = s->getObject( "x" );
+ ASSERT( NumberDouble == out.firstElement().type() || NumberInt == out.firstElement().type() );
+ ASSERT_EQUALS( double( 4 ), out.firstElement().number() );
+
+ ASSERT( s->exec( "w = {w:z.z}", "foo", false, true, false ) );
+ out = s->getObject( "w" );
+ ASSERT_EQUALS( mongo::NumberLong, out.firstElement().type() );
+ ASSERT_EQUALS( 4, out.firstElement().numberLong() );
+
+ }
+ };
+
+ class NumberLong2 {
+ public:
+ void run() {
+ auto_ptr<Scope> s( globalScriptEngine->newScope() );
+ s->localConnect( "blah" );
+
+ BSONObj in;
+ {
+ BSONObjBuilder b;
+ b.append( "a" , 5 );
+ b.append( "b" , (long long)5 );
+ b.append( "c" , (long long)pow( 2.0, 29 ) );
+ b.append( "d" , (long long)pow( 2.0, 30 ) );
+ b.append( "e" , (long long)pow( 2.0, 31 ) );
+ b.append( "f" , (long long)pow( 2.0, 45 ) );
+ in = b.obj();
+ }
+ s->setObject( "a" , in );
+
+ ASSERT( s->exec( "x = tojson( a ); " ,"foo" , false , true , false ) );
+ string outString = s->getString( "x" );
+
+ ASSERT( s->exec( (string)"y = " + outString , "foo2" , false , true , false ) );
+ BSONObj out = s->getObject( "y" );
+ ASSERT_EQUALS( in , out );
+ }
+ };
+
+ class NumberLongUnderLimit {
+ public:
+ void run() {
+ auto_ptr<Scope> s( globalScriptEngine->newScope() );
+ s->localConnect( "blah" );
+ BSONObjBuilder b;
+            // 2^53 - 1 is the largest integer a double (and hence a JS Number) represents exactly.
+ long long val = (long long)( 9007199254740991ULL );
+ b.append( "a", val );
+ BSONObj in = b.obj();
+ s->setObject( "a", in );
+ BSONObj out = s->getObject( "a" );
+ ASSERT_EQUALS( mongo::NumberLong, out.firstElement().type() );
+
+ ASSERT( s->exec( "b = {b:a.a}", "foo", false, true, false ) );
+ out = s->getObject( "b" );
+ ASSERT_EQUALS( mongo::NumberLong, out.firstElement().type() );
+ if( val != out.firstElement().numberLong() ) {
+ cout << val << endl;
+ cout << out.firstElement().numberLong() << endl;
+ cout << out.toString() << endl;
+ ASSERT_EQUALS( val, out.firstElement().numberLong() );
+ }
+
+ ASSERT( s->exec( "c = {c:a.a.toString()}", "foo", false, true, false ) );
+ out = s->getObject( "c" );
+ stringstream ss;
+ ss << "NumberLong(\"" << val << "\")";
+ ASSERT_EQUALS( ss.str(), out.firstElement().valuestr() );
+
+ ASSERT( s->exec( "d = {d:a.a.toNumber()}", "foo", false, true, false ) );
+ out = s->getObject( "d" );
+ ASSERT_EQUALS( NumberDouble, out.firstElement().type() );
+ ASSERT_EQUALS( double( val ), out.firstElement().number() );
+
+ ASSERT( s->exec( "e = {e:a.a.floatApprox}", "foo", false, true, false ) );
+ out = s->getObject( "e" );
+ ASSERT_EQUALS( NumberDouble, out.firstElement().type() );
+ ASSERT_EQUALS( double( val ), out.firstElement().number() );
+
+ ASSERT( s->exec( "f = {f:a.a.top}", "foo", false, true, false ) );
+ out = s->getObject( "f" );
+ ASSERT( Undefined == out.firstElement().type() );
+ }
+ };
+
+ class WeirdObjects {
+ public:
+
+ BSONObj build( int depth ) {
+ BSONObjBuilder b;
+ b.append( "0" , depth );
+ if ( depth > 0 )
+ b.appendArray( "1" , build( depth - 1 ) );
+ return b.obj();
+ }
+
+ void run() {
+ Scope * s = globalScriptEngine->newScope();
+
+ s->localConnect( "blah" );
+
+ for ( int i=5; i<100 ; i += 10 ) {
+ s->setObject( "a" , build(i) , false );
+ s->invokeSafe( "tojson( a )" , 0, 0 );
+
+ s->setObject( "a" , build(5) , true );
+ s->invokeSafe( "tojson( a )" , 0, 0 );
+ }
+
+ delete s;
+ }
+ };
+
+
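+    // Never executed; referencing dbEval() forces dbeval.cpp to be linked into the test binary.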
+ void dummy_function_to_force_dbeval_cpp_linking() {
+ BSONObj cmd;
+ BSONObjBuilder result;
+ string errmsg;
+ dbEval( "test", cmd, result, errmsg);
+ assert(0);
+ }
+
+ DBDirectClient client;
+
+ class Utf8Check {
+ public:
+ Utf8Check() { reset(); }
+ ~Utf8Check() { reset(); }
+ void run() {
+ if( !globalScriptEngine->utf8Ok() ) {
+ log() << "warning: utf8 not supported" << endl;
+ return;
+ }
+ string utf8ObjSpec = "{'_id':'\\u0001\\u007f\\u07ff\\uffff'}";
+ BSONObj utf8Obj = fromjson( utf8ObjSpec );
+ client.insert( ns(), utf8Obj );
+ client.eval( "unittest", "v = db.jstests.utf8check.findOne(); db.jstests.utf8check.remove( {} ); db.jstests.utf8check.insert( v );" );
+ check( utf8Obj, client.findOne( ns(), BSONObj() ) );
+ }
+ private:
+ void check( const BSONObj &one, const BSONObj &two ) {
+ if ( one.woCompare( two ) != 0 ) {
+ static string fail = string( "Assertion failure expected " ) + one.toString() + ", got " + two.toString();
+ FAIL( fail.c_str() );
+ }
+ }
+ void reset() {
+ client.dropCollection( ns() );
+ }
+ static const char *ns() { return "unittest.jstests.utf8check"; }
+ };
+
+ class LongUtf8String {
+ public:
+ LongUtf8String() { reset(); }
+ ~LongUtf8String() { reset(); }
+ void run() {
+ if( !globalScriptEngine->utf8Ok() )
+ return;
+ client.eval( "unittest", "db.jstests.longutf8string.save( {_id:'\\uffff\\uffff\\uffff\\uffff'} )" );
+ }
+ private:
+ void reset() {
+ client.dropCollection( ns() );
+ }
+ static const char *ns() { return "unittest.jstests.longutf8string"; }
+ };
+
+ class InvalidUTF8Check {
+ public:
+ void run() {
+ if( !globalScriptEngine->utf8Ok() )
+ return;
+
+ auto_ptr<Scope> s;
+ s.reset( globalScriptEngine->newScope() );
+
+ BSONObj b;
+ {
+ char crap[5];
+
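+                // 0x80 is a bare UTF-8 continuation byte, so this string is deliberately invalid UTF-8.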
+ crap[0] = (char) 128;
+ crap[1] = 17;
+ crap[2] = (char) 128;
+ crap[3] = 17;
+ crap[4] = 0;
+
+ BSONObjBuilder bb;
+ bb.append( "x" , crap );
+ b = bb.obj();
+ }
+
+ //cout << "ELIOT: " << b.jsonString() << endl;
+            // It's OK if JS handles this; it just must not surface as a C++ exception.
+ s->invoke( "x=this.x.length;" , 0, &b );
+ }
+ };
+
+ class CodeTests {
+ public:
+ void run() {
+ Scope * s = globalScriptEngine->newScope();
+
+ {
+ BSONObjBuilder b;
+ b.append( "a" , 1 );
+ b.appendCode( "b" , "function(){ out.b = 11; }" );
+ b.appendCodeWScope( "c" , "function(){ out.c = 12; }" , BSONObj() );
+ b.appendCodeWScope( "d" , "function(){ out.d = 13 + bleh; }" , BSON( "bleh" << 5 ) );
+ s->setObject( "foo" , b.obj() );
+ }
+
+ s->invokeSafe( "out = {}; out.a = foo.a; foo.b(); foo.c();" , 0, 0 );
+ BSONObj out = s->getObject( "out" );
+
+ ASSERT_EQUALS( 1 , out["a"].number() );
+ ASSERT_EQUALS( 11 , out["b"].number() );
+ ASSERT_EQUALS( 12 , out["c"].number() );
+
+ // Guess we don't care about this
+ //s->invokeSafe( "foo.d() " , BSONObj() );
+ //out = s->getObject( "out" );
+ //ASSERT_EQUALS( 18 , out["d"].number() );
+
+
+ delete s;
+ }
+ };
+
+ class DBRefTest {
+ public:
+ DBRefTest() {
+ _a = "unittest.dbref.a";
+ _b = "unittest.dbref.b";
+ reset();
+ }
+ ~DBRefTest() {
+ //reset();
+ }
+
+ void run() {
+
+ client.insert( _a , BSON( "a" << "17" ) );
+
+ {
+ BSONObj fromA = client.findOne( _a , BSONObj() );
+ assert( fromA.valid() );
+ //cout << "Froma : " << fromA << endl;
+ BSONObjBuilder b;
+ b.append( "b" , 18 );
+ b.appendDBRef( "c" , "dbref.a" , fromA["_id"].__oid() );
+ client.insert( _b , b.obj() );
+ }
+
+ ASSERT( client.eval( "unittest" , "x = db.dbref.b.findOne(); assert.eq( 17 , x.c.fetch().a , 'ref working' );" ) );
+
+ // BSON DBRef <=> JS DBPointer
+ ASSERT( client.eval( "unittest", "x = db.dbref.b.findOne(); db.dbref.b.drop(); x.c = new DBPointer( x.c.ns, x.c.id ); db.dbref.b.insert( x );" ) );
+ ASSERT_EQUALS( DBRef, client.findOne( "unittest.dbref.b", "" )[ "c" ].type() );
+
+ // BSON Object <=> JS DBRef
+ ASSERT( client.eval( "unittest", "x = db.dbref.b.findOne(); db.dbref.b.drop(); x.c = new DBRef( x.c.ns, x.c.id ); db.dbref.b.insert( x );" ) );
+ ASSERT_EQUALS( Object, client.findOne( "unittest.dbref.b", "" )[ "c" ].type() );
+ ASSERT_EQUALS( string( "dbref.a" ), client.findOne( "unittest.dbref.b", "" )[ "c" ].embeddedObject().getStringField( "$ref" ) );
+ }
+
+ void reset() {
+ client.dropCollection( _a );
+ client.dropCollection( _b );
+ }
+
+ const char * _a;
+ const char * _b;
+ };
+
+ class InformalDBRef {
+ public:
+ void run() {
+ client.insert( ns(), BSON( "i" << 1 ) );
+ BSONObj obj = client.findOne( ns(), BSONObj() );
+ client.remove( ns(), BSONObj() );
+ client.insert( ns(), BSON( "r" << BSON( "$ref" << "jstests.informaldbref" << "$id" << obj["_id"].__oid() << "foo" << "bar" ) ) );
+ obj = client.findOne( ns(), BSONObj() );
+ ASSERT_EQUALS( "bar", obj[ "r" ].embeddedObject()[ "foo" ].str() );
+
+ ASSERT( client.eval( "unittest", "x = db.jstests.informaldbref.findOne(); y = { r:x.r }; db.jstests.informaldbref.drop(); y.r[ \"a\" ] = \"b\"; db.jstests.informaldbref.save( y );" ) );
+ obj = client.findOne( ns(), BSONObj() );
+ ASSERT_EQUALS( "bar", obj[ "r" ].embeddedObject()[ "foo" ].str() );
+ ASSERT_EQUALS( "b", obj[ "r" ].embeddedObject()[ "a" ].str() );
+ }
+ private:
+ static const char *ns() { return "unittest.jstests.informaldbref"; }
+ };
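+
+ // Note (a sketch of the convention, not additional test code): an "informal"
+ // DBRef is just an ordinary subdocument following the
+ // { "$ref" : <collection>, "$id" : <id>, ... } naming convention; extra
+ // fields such as "foo" above are carried along verbatim, as verified here.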
+
+ class BinDataType {
+ public:
+
+ void pp( const char * s , BSONElement e ) {
+ int len;
+ const char * data = e.binData( len );
+ cout << s << ":" << e.binDataType() << "\t" << len << endl;
+ cout << "\t";
+ for ( int i=0; i<len; i++ )
+ cout << (int)(data[i]) << " ";
+ cout << endl;
+ }
+
+ void run() {
+ Scope * s = globalScriptEngine->newScope();
+ s->localConnect( "asd" );
+ const char * foo = "asdas\0asdasd";
+ const char * base64 = "YXNkYXMAYXNkYXNk";
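+ // "asdas\0asdasd" is 12 bytes including the embedded NUL; the string
+ // above is its base64 encoding.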
+
+ BSONObj in;
+ {
+ BSONObjBuilder b;
+ b.append( "a" , 7 );
+ b.appendBinData( "b" , 12 , BinDataGeneral , foo );
+ in = b.obj();
+ s->setObject( "x" , in );
+ }
+
+ s->invokeSafe( "myb = x.b; print( myb ); printjson( myb );" , 0, 0 );
+ s->invokeSafe( "y = { c : myb };" , 0, 0 );
+
+ BSONObj out = s->getObject( "y" );
+ ASSERT_EQUALS( BinData , out["c"].type() );
+// pp( "in " , in["b"] );
+// pp( "out" , out["c"] );
+ ASSERT_EQUALS( 0 , in["b"].woCompare( out["c"] , false ) );
+
+ // check that BinData js class is utilized
+ s->invokeSafe( "q = x.b.toString();", 0, 0 );
+ stringstream expected;
+ expected << "BinData(" << BinDataGeneral << ",\"" << base64 << "\")";
+ ASSERT_EQUALS( expected.str(), s->getString( "q" ) );
+
+ stringstream scriptBuilder;
+ scriptBuilder << "z = { c : new BinData( " << BinDataGeneral << ", \"" << base64 << "\" ) };";
+ string script = scriptBuilder.str();
+ s->invokeSafe( script.c_str(), 0, 0 );
+ out = s->getObject( "z" );
+// pp( "out" , out["c"] );
+ ASSERT_EQUALS( 0 , in["b"].woCompare( out["c"] , false ) );
+
+ s->invokeSafe( "a = { f: new BinData( 128, \"\" ) };", 0, 0 );
+ out = s->getObject( "a" );
+ int len = -1;
+ out[ "f" ].binData( len );
+ ASSERT_EQUALS( 0, len );
+ ASSERT_EQUALS( 128, out[ "f" ].binDataType() );
+
+ delete s;
+ }
+ };
+
+ class VarTests {
+ public:
+ void run() {
+ Scope * s = globalScriptEngine->newScope();
+
+ ASSERT( s->exec( "a = 5;" , "a" , false , true , false ) );
+ ASSERT_EQUALS( 5 , s->getNumber("a" ) );
+
+ ASSERT( s->exec( "var b = 6;" , "b" , false , true , false ) );
+ ASSERT_EQUALS( 6 , s->getNumber("b" ) );
+ delete s;
+ }
+ };
+
+ class Speed1 {
+ public:
+ void run() {
+ BSONObj start = BSON( "x" << 5.0 );
+ BSONObj empty;
+
+ auto_ptr<Scope> s;
+ s.reset( globalScriptEngine->newScope() );
+
+ ScriptingFunction f = s->createFunction( "return this.x + 6;" );
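+ // after each invoke(), the scope exposes the function's return value
+ // under the special variable name "return" (read back below).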
+
+ Timer t;
+ double n = 0;
+ for ( ; n < 100000; n++ ) {
+ s->invoke( f , &empty, &start );
+ ASSERT_EQUALS( 11 , s->getNumber( "return" ) );
+ }
+ //cout << "speed1: " << ( n / t.millis() ) << " ops/ms" << endl;
+ }
+ };
+
+ class ScopeOut {
+ public:
+ void run() {
+ auto_ptr<Scope> s;
+ s.reset( globalScriptEngine->newScope() );
+
+ s->invokeSafe( "x = 5;" , 0, 0 );
+ {
+ BSONObjBuilder b;
+ s->append( b , "z" , "x" );
+ ASSERT_EQUALS( BSON( "z" << 5 ) , b.obj() );
+ }
+
+ s->invokeSafe( "x = function(){ return 17; }" , 0, 0 );
+ BSONObj temp;
+ {
+ BSONObjBuilder b;
+ s->append( b , "z" , "x" );
+ temp = b.obj();
+ }
+
+ s->invokeSafe( "foo = this.z();" , 0, &temp );
+ ASSERT_EQUALS( 17 , s->getNumber( "foo" ) );
+ }
+ };
+
+ class RenameTest {
+ public:
+ void run() {
+ auto_ptr<Scope> s;
+ s.reset( globalScriptEngine->newScope() );
+
+ s->setNumber( "x" , 5 );
+ ASSERT_EQUALS( 5 , s->getNumber( "x" ) );
+ ASSERT_EQUALS( Undefined , s->type( "y" ) );
+
+ s->rename( "x" , "y" );
+ ASSERT_EQUALS( 5 , s->getNumber( "y" ) );
+ ASSERT_EQUALS( Undefined , s->type( "x" ) );
+
+ s->rename( "y" , "x" );
+ ASSERT_EQUALS( 5 , s->getNumber( "x" ) );
+ ASSERT_EQUALS( Undefined , s->type( "y" ) );
+ }
+ };
+
+
+ class All : public Suite {
+ public:
+ All() : Suite( "js" ) {
+ }
+
+ void setupTests() {
+ add< Fundamental >();
+ add< BasicScope >();
+ add< ResetScope >();
+ add< FalseTests >();
+ add< SimpleFunctions >();
+
+ add< ObjectMapping >();
+ add< ObjectDecoding >();
+ add< JSOIDTests >();
+ add< SetImplicit >();
+ add< ObjectModReadonlyTests >();
+ add< OtherJSTypes >();
+ add< SpecialDBTypes >();
+ add< TypeConservation >();
+ add< NumberLong >();
+ add< NumberLong2 >();
+ add< RenameTest >();
+
+ add< WeirdObjects >();
+ add< CodeTests >();
+ add< DBRefTest >();
+ add< InformalDBRef >();
+ add< BinDataType >();
+
+ add< VarTests >();
+
+ add< Speed1 >();
+
+ add< InvalidUTF8Check >();
+ add< Utf8Check >();
+ add< LongUtf8String >();
+
+ add< ScopeOut >();
+ }
+ } myall;
+
+} // namespace JavaJSTests
+
diff --git a/src/mongo/dbtests/macrotests.cpp b/src/mongo/dbtests/macrotests.cpp
new file mode 100644
index 00000000000..f547c851677
--- /dev/null
+++ b/src/mongo/dbtests/macrotests.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#undef MONGO_EXPOSE_MACROS
+
+#include "../client/dbclient.h"
+
+#ifdef malloc
+# error malloc defined 0
+#endif
+
+#ifdef assert
+# error assert defined 1
+#endif
+
+#include "../client/parallel.h" //uses assert
+
+#ifdef assert
+# error assert defined 2
+#endif
+
+#include "../client/redef_macros.h"
+
+#ifndef assert
+# error assert not defined 3
+#endif
+
+#include "../client/undef_macros.h"
+
+#ifdef assert
+# error assert defined 4
+#endif
+
+
diff --git a/src/mongo/dbtests/matchertests.cpp b/src/mongo/dbtests/matchertests.cpp
new file mode 100644
index 00000000000..380b8b802d4
--- /dev/null
+++ b/src/mongo/dbtests/matchertests.cpp
@@ -0,0 +1,163 @@
+// matchertests.cpp : matcher unit tests
+//
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../util/timer.h"
+
+#include "../db/matcher.h"
+#include "../db/json.h"
+
+#include "dbtests.h"
+
+
+
+namespace MatcherTests {
+
+ class Basic {
+ public:
+ void run() {
+ BSONObj query = fromjson( "{\"a\":\"b\"}" );
+ Matcher m( query );
+ ASSERT( m.matches( fromjson( "{\"a\":\"b\"}" ) ) );
+ }
+ };
+
+ class DoubleEqual {
+ public:
+ void run() {
+ BSONObj query = fromjson( "{\"a\":5}" );
+ Matcher m( query );
+ ASSERT( m.matches( fromjson( "{\"a\":5}" ) ) );
+ }
+ };
+
+ class MixedNumericEqual {
+ public:
+ void run() {
+ BSONObjBuilder query;
+ query.append( "a", 5 );
+ Matcher m( query.done() );
+ ASSERT( m.matches( fromjson( "{\"a\":5}" ) ) );
+ }
+ };
+
+ class MixedNumericGt {
+ public:
+ void run() {
+ BSONObj query = fromjson( "{\"a\":{\"$gt\":4}}" );
+ Matcher m( query );
+ BSONObjBuilder b;
+ b.append( "a", 5 );
+ ASSERT( m.matches( b.done() ) );
+ }
+ };
+
+ class MixedNumericIN {
+ public:
+ void run() {
+ BSONObj query = fromjson( "{ a : { $in : [4,6] } }" );
+ ASSERT_EQUALS( 4 , query["a"].embeddedObject()["$in"].embeddedObject()["0"].number() );
+ ASSERT_EQUALS( NumberInt , query["a"].embeddedObject()["$in"].embeddedObject()["0"].type() );
+
+ Matcher m( query );
+
+ {
+ BSONObjBuilder b;
+ b.append( "a" , 4.0 );
+ ASSERT( m.matches( b.done() ) );
+ }
+
+ {
+ BSONObjBuilder b;
+ b.append( "a" , 5 );
+ ASSERT( ! m.matches( b.done() ) );
+ }
+
+
+ {
+ BSONObjBuilder b;
+ b.append( "a" , 4 );
+ ASSERT( m.matches( b.done() ) );
+ }
+
+ }
+ };
+
+ class MixedNumericEmbedded {
+ public:
+ void run() {
+ Matcher m( BSON( "a" << BSON( "x" << 1 ) ) );
+ ASSERT( m.matches( BSON( "a" << BSON( "x" << 1 ) ) ) );
+ ASSERT( m.matches( BSON( "a" << BSON( "x" << 1.0 ) ) ) );
+ }
+ };
+
+ class Size {
+ public:
+ void run() {
+ Matcher m( fromjson( "{a:{$size:4}}" ) );
+ ASSERT( m.matches( fromjson( "{a:[1,2,3,4]}" ) ) );
+ ASSERT( !m.matches( fromjson( "{a:[1,2,3]}" ) ) );
+ ASSERT( !m.matches( fromjson( "{a:[1,2,3,'a','b']}" ) ) );
+ ASSERT( !m.matches( fromjson( "{a:[[1,2,3,4]]}" ) ) );
+ }
+ };
+
+
+ class TimingBase {
+ public:
+ long time( const BSONObj& patt , const BSONObj& obj ) {
+ Matcher m( patt );
+ Timer t;
+ for ( int i=0; i<10000; i++ ) {
+ ASSERT( m.matches( obj ) );
+ }
+ return t.millis();
+ }
+ };
+
+ class AllTiming : public TimingBase {
+ public:
+ void run() {
+ long normal = time( BSON( "x" << 5 ) , BSON( "x" << 5 ) );
+ long all = time( BSON( "x" << BSON( "$all" << BSON_ARRAY( 5 ) ) ) , BSON( "x" << 5 ) );
+
+ cout << "normal: " << normal << " all: " << all << endl;
+ }
+ };
+
+ class All : public Suite {
+ public:
+ All() : Suite( "matcher" ) {
+ }
+
+ void setupTests() {
+ add< Basic >();
+ add< DoubleEqual >();
+ add< MixedNumericEqual >();
+ add< MixedNumericGt >();
+ add< MixedNumericIN >();
+ add< Size >();
+ add< MixedNumericEmbedded >();
+ add< AllTiming >();
+ }
+ } dball;
+
+} // namespace MatcherTests
+
diff --git a/src/mongo/dbtests/mmaptests.cpp b/src/mongo/dbtests/mmaptests.cpp
new file mode 100644
index 00000000000..7fb6eee98fc
--- /dev/null
+++ b/src/mongo/dbtests/mmaptests.cpp
@@ -0,0 +1,219 @@
+// @file mmaptests.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../db/mongommf.h"
+#include "../util/timer.h"
+#include "dbtests.h"
+
+namespace MMapTests {
+
+ class LeakTest {
+ const string fn;
+ const int optOld;
+ public:
+ LeakTest() :
+ fn( (path(dbpath) / "testfile.map").string() ), optOld(cmdLine.durOptions)
+ {
+ cmdLine.durOptions = 0; // DurParanoid doesn't make sense with this test
+ }
+ ~LeakTest() {
+ cmdLine.durOptions = optOld;
+ try { boost::filesystem::remove(fn); }
+ catch(...) { }
+ }
+ void run() {
+
+ try { boost::filesystem::remove(fn); }
+ catch(...) { }
+
+ writelock lk;
+
+ {
+ MongoMMF f;
+ unsigned long long len = 256 * 1024 * 1024;
+ assert( f.create(fn, len, /*sequential*/false) );
+ {
+ char *p = (char *) f.getView();
+ assert(p);
+ // write something to the private view as a test
+ if( cmdLine.dur )
+ MemoryMappedFile::makeWritable(p, 6);
+ strcpy(p, "hello");
+ }
+ if( cmdLine.dur ) {
+ char *w = (char *) f.view_write();
+ strcpy(w + 6, "world");
+ }
+ MongoFileFinder ff;
+ ASSERT( ff.findByPath(fn) );
+ ASSERT( ff.findByPath("asdf") == 0 );
+ }
+ {
+ MongoFileFinder ff;
+ ASSERT( ff.findByPath(fn) == 0 );
+ }
+
+ int N = 10000;
+#if !defined(_WIN32) && !defined(__linux__)
+ // this test seems to be slow on OS X.
+ N = 100;
+#endif
+
+ // we create a lot of mappings here -- if we were leaking, doing this many would presumably fail.
+ Timer t;
+ for( int i = 0; i < N; i++ ) {
+ MongoMMF f;
+ assert( f.open(fn, i%4==1) );
+ {
+ char *p = (char *) f.getView();
+ assert(p);
+ if( cmdLine.dur )
+ MemoryMappedFile::makeWritable(p, 4);
+ strcpy(p, "zzz");
+ }
+ if( cmdLine.dur ) {
+ char *w = (char *) f.view_write();
+ if( i % 2 == 0 )
+ ++(*w);
+ assert( w[6] == 'w' );
+ }
+ }
+ if( t.millis() > 10000 ) {
+ log() << "warning: MMap LeakTest is unusually slow N:" << N << ' ' << t.millis() << "ms" << endl;
+ }
+
+ }
+ };
+
+ class All : public Suite {
+ public:
+ All() : Suite( "mmap" ) {}
+ void setupTests() {
+ add< LeakTest >();
+ }
+ } myall;
+
+#if 0
+
+ class CopyOnWriteSpeedTest {
+ public:
+ void run() {
+
+ string fn = "/tmp/testfile.map";
+ boost::filesystem::remove(fn);
+
+ MemoryMappedFile f;
+ char *p = (char *) f.create(fn, 1024 * 1024 * 1024, true);
+ assert(p);
+ strcpy(p, "hello");
+
+ {
+ void *x = f.testGetCopyOnWriteView();
+ Timer tt;
+ for( int i = 11; i < 1000000000; i++ )
+ p[i] = 'z';
+ cout << "fill 1GB time: " << tt.millis() << "ms" << endl;
+ f.testCloseCopyOnWriteView(x);
+ }
+
+ /* test a lot of view/unviews */
+ {
+ Timer t;
+
+ char *q;
+ for( int i = 0; i < 1000; i++ ) {
+ q = (char *) f.testGetCopyOnWriteView();
+ assert( q );
+ if( i == 999 ) {
+ strcpy(q+2, "there");
+ }
+ f.testCloseCopyOnWriteView(q);
+ }
+
+ cout << "view unview: " << t.millis() << "ms" << endl;
+ }
+
+ f.flush(true);
+
+ /* plain old mmaped writes */
+ {
+ Timer t;
+ for( int i = 0; i < 10; i++ ) {
+ memset(p+100, 'c', 200 * 1024 * 1024);
+ }
+ cout << "traditional writes: " << t.millis() << "ms" << endl;
+ }
+
+ f.flush(true);
+
+ /* test doing some writes */
+ {
+ Timer t;
+ char *q = (char *) f.testGetCopyOnWriteView();
+ for( int i = 0; i < 10; i++ ) {
+ assert( q );
+ memset(q+100, 'c', 200 * 1024 * 1024);
+ }
+ f.testCloseCopyOnWriteView(q);
+
+ cout << "inc style some writes: " << t.millis() << "ms" << endl;
+ }
+
+ /* test doing some writes */
+ {
+ Timer t;
+ for( int i = 0; i < 10; i++ ) {
+ char *q = (char *) f.testGetCopyOnWriteView();
+ assert( q );
+ memset(q+100, 'c', 200 * 1024 * 1024);
+ f.testCloseCopyOnWriteView(q);
+ }
+
+ cout << "some writes: " << t.millis() << "ms" << endl;
+ }
+
+ /* more granular */
+ {
+ Timer t;
+ for( int i = 0; i < 100; i++ ) {
+ char *q = (char *) f.testGetCopyOnWriteView();
+ assert( q );
+ memset(q+100, 'c', 20 * 1024 * 1024);
+ f.testCloseCopyOnWriteView(q);
+ }
+
+ cout << "more granular some writes: " << t.millis() << "ms" << endl;
+ }
+
+ p[10] = 0;
+ cout << p << endl;
+ }
+ };
+
+ class All : public Suite {
+ public:
+ All() : Suite( "mmap" ) {}
+ void setupTests() {
+ add< CopyOnWriteSpeedTest >();
+ }
+ } myall;
+
+#endif
+
+}
diff --git a/src/mongo/dbtests/namespacetests.cpp b/src/mongo/dbtests/namespacetests.cpp
new file mode 100644
index 00000000000..792baf2ccfa
--- /dev/null
+++ b/src/mongo/dbtests/namespacetests.cpp
@@ -0,0 +1,1244 @@
+// namespacetests.cpp : namespace.{h,cpp} unit tests.
+//
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// Where IndexDetails is defined.
+#include "pch.h"
+#include "../db/namespace.h"
+
+#include "../db/db.h"
+#include "../db/json.h"
+
+#include "dbtests.h"
+
+namespace NamespaceTests {
+
+ const int MinExtentSize = 4096;
+
+ namespace IndexDetailsTests {
+ class Base {
+ dblock lk;
+ Client::Context _context;
+ public:
+ Base() : _context(ns()) {
+ }
+ virtual ~Base() {
+ if ( id_.info.isNull() )
+ return;
+ theDataFileMgr.deleteRecord( ns(), id_.info.rec(), id_.info );
+ ASSERT( theDataFileMgr.findAll( ns() )->eof() );
+ }
+ protected:
+ void create( bool sparse = false ) {
+ NamespaceDetailsTransient::get( ns() ).deletedIndex();
+ BSONObjBuilder builder;
+ builder.append( "ns", ns() );
+ builder.append( "name", "testIndex" );
+ builder.append( "key", key() );
+ builder.append( "sparse", sparse );
+ BSONObj bobj = builder.done();
+ id_.info = theDataFileMgr.insert( ns(), bobj.objdata(), bobj.objsize() );
+ // head not needed for current tests
+ // idx_.head = BtreeBucket::addHead( id_ );
+ }
+ static const char* ns() {
+ return "unittests.indexdetailstests";
+ }
+ IndexDetails& id() {
+ return id_;
+ }
+ virtual BSONObj key() const {
+ BSONObjBuilder k;
+ k.append( "a", 1 );
+ return k.obj();
+ }
+ BSONObj aDotB() const {
+ BSONObjBuilder k;
+ k.append( "a.b", 1 );
+ return k.obj();
+ }
+ BSONObj aAndB() const {
+ BSONObjBuilder k;
+ k.append( "a", 1 );
+ k.append( "b", 1 );
+ return k.obj();
+ }
+ static vector< int > shortArray() {
+ vector< int > a;
+ a.push_back( 1 );
+ a.push_back( 2 );
+ a.push_back( 3 );
+ return a;
+ }
+ static BSONObj simpleBC( int i ) {
+ BSONObjBuilder b;
+ b.append( "b", i );
+ b.append( "c", 4 );
+ return b.obj();
+ }
+ static void checkSize( int expected, const BSONObjSet &objs ) {
+ ASSERT_EQUALS( BSONObjSet::size_type( expected ), objs.size() );
+ }
+ static void assertEquals( const BSONObj &a, const BSONObj &b ) {
+ if ( a.woCompare( b ) != 0 ) {
+ out() << "expected: " << a.toString()
+ << ", got: " << b.toString() << endl;
+ }
+ ASSERT( a.woCompare( b ) == 0 );
+ }
+ BSONObj nullObj() const {
+ BSONObjBuilder b;
+ b.appendNull( "" );
+ return b.obj();
+ }
+ private:
+ dblock lk_;
+ IndexDetails id_;
+ };
+
+ class Create : public Base {
+ public:
+ void run() {
+ create();
+ ASSERT_EQUALS( "testIndex", id().indexName() );
+ ASSERT_EQUALS( ns(), id().parentNS() );
+ assertEquals( key(), id().keyPattern() );
+ }
+ };
+
+ class GetKeysFromObjectSimple : public Base {
+ public:
+ void run() {
+ create();
+ BSONObjBuilder b, e;
+ b.append( "b", 4 );
+ b.append( "a", 5 );
+ e.append( "", 5 );
+ BSONObjSet keys;
+ id().getKeysFromObject( b.done(), keys );
+ checkSize( 1, keys );
+ assertEquals( e.obj(), *keys.begin() );
+ }
+ };
+
+ class GetKeysFromObjectDotted : public Base {
+ public:
+ void run() {
+ create();
+ BSONObjBuilder a, e, b;
+ b.append( "b", 4 );
+ a.append( "a", b.done() );
+ a.append( "c", "foo" );
+ e.append( "", 4 );
+ BSONObjSet keys;
+ id().getKeysFromObject( a.done(), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( e.obj(), *keys.begin() );
+ }
+ private:
+ virtual BSONObj key() const {
+ return aDotB();
+ }
+ };
+
+ class GetKeysFromArraySimple : public Base {
+ public:
+ void run() {
+ create();
+ BSONObjBuilder b;
+ b.append( "a", shortArray()) ;
+
+ BSONObjSet keys;
+ id().getKeysFromObject( b.done(), keys );
+ checkSize( 3, keys );
+ int j = 1;
+ for ( BSONObjSet::iterator i = keys.begin(); i != keys.end(); ++i, ++j ) {
+ BSONObjBuilder b;
+ b.append( "", j );
+ assertEquals( b.obj(), *i );
+ }
+ }
+ };
+
+ class GetKeysFromArrayFirstElement : public Base {
+ public:
+ void run() {
+ create();
+ BSONObjBuilder b;
+ b.append( "a", shortArray() );
+ b.append( "b", 2 );
+
+ BSONObjSet keys;
+ id().getKeysFromObject( b.done(), keys );
+ checkSize( 3, keys );
+ int j = 1;
+ for ( BSONObjSet::iterator i = keys.begin(); i != keys.end(); ++i, ++j ) {
+ BSONObjBuilder b;
+ b.append( "", j );
+ b.append( "", 2 );
+ assertEquals( b.obj(), *i );
+ }
+ }
+ private:
+ virtual BSONObj key() const {
+ return aAndB();
+ }
+ };
+
+ class GetKeysFromArraySecondElement : public Base {
+ public:
+ void run() {
+ create();
+ BSONObjBuilder b;
+ b.append( "first", 5 );
+ b.append( "a", shortArray()) ;
+
+ BSONObjSet keys;
+ id().getKeysFromObject( b.done(), keys );
+ checkSize( 3, keys );
+ int j = 1;
+ for ( BSONObjSet::iterator i = keys.begin(); i != keys.end(); ++i, ++j ) {
+ BSONObjBuilder b;
+ b.append( "", 5 );
+ b.append( "", j );
+ assertEquals( b.obj(), *i );
+ }
+ }
+ private:
+ virtual BSONObj key() const {
+ BSONObjBuilder k;
+ k.append( "first", 1 );
+ k.append( "a", 1 );
+ return k.obj();
+ }
+ };
+
+ class GetKeysFromSecondLevelArray : public Base {
+ public:
+ void run() {
+ create();
+ BSONObjBuilder b;
+ b.append( "b", shortArray() );
+ BSONObjBuilder a;
+ a.append( "a", b.done() );
+
+ BSONObjSet keys;
+ id().getKeysFromObject( a.done(), keys );
+ checkSize( 3, keys );
+ int j = 1;
+ for ( BSONObjSet::iterator i = keys.begin(); i != keys.end(); ++i, ++j ) {
+ BSONObjBuilder b;
+ b.append( "", j );
+ assertEquals( b.obj(), *i );
+ }
+ }
+ private:
+ virtual BSONObj key() const {
+ return aDotB();
+ }
+ };
+
+ class ParallelArraysBasic : public Base {
+ public:
+ void run() {
+ create();
+ BSONObjBuilder b;
+ b.append( "a", shortArray() );
+ b.append( "b", shortArray() );
+
+ BSONObjSet keys;
+ ASSERT_THROWS( id().getKeysFromObject( b.done(), keys ),
+ UserException );
+ }
+ private:
+ virtual BSONObj key() const {
+ return aAndB();
+ }
+ };
+
+ class ArraySubobjectBasic : public Base {
+ public:
+ void run() {
+ create();
+ vector< BSONObj > elts;
+ for ( int i = 1; i < 4; ++i )
+ elts.push_back( simpleBC( i ) );
+ BSONObjBuilder b;
+ b.append( "a", elts );
+
+ BSONObjSet keys;
+ id().getKeysFromObject( b.done(), keys );
+ checkSize( 3, keys );
+ int j = 1;
+ for ( BSONObjSet::iterator i = keys.begin(); i != keys.end(); ++i, ++j ) {
+ BSONObjBuilder b;
+ b.append( "", j );
+ assertEquals( b.obj(), *i );
+ }
+ }
+ private:
+ virtual BSONObj key() const {
+ return aDotB();
+ }
+ };
+
+ class ArraySubobjectMultiFieldIndex : public Base {
+ public:
+ void run() {
+ create();
+ vector< BSONObj > elts;
+ for ( int i = 1; i < 4; ++i )
+ elts.push_back( simpleBC( i ) );
+ BSONObjBuilder b;
+ b.append( "a", elts );
+ b.append( "d", 99 );
+
+ BSONObjSet keys;
+ id().getKeysFromObject( b.done(), keys );
+ checkSize( 3, keys );
+ int j = 1;
+ for ( BSONObjSet::iterator i = keys.begin(); i != keys.end(); ++i, ++j ) {
+ BSONObjBuilder c;
+ c.append( "", j );
+ c.append( "", 99 );
+ assertEquals( c.obj(), *i );
+ }
+ }
+ private:
+ virtual BSONObj key() const {
+ BSONObjBuilder k;
+ k.append( "a.b", 1 );
+ k.append( "d", 1 );
+ return k.obj();
+ }
+ };
+
+ class ArraySubobjectSingleMissing : public Base {
+ public:
+ void run() {
+ create();
+ vector< BSONObj > elts;
+ BSONObjBuilder s;
+ s.append( "foo", 41 );
+ elts.push_back( s.obj() );
+ for ( int i = 1; i < 4; ++i )
+ elts.push_back( simpleBC( i ) );
+ BSONObjBuilder b;
+ b.append( "a", elts );
+ BSONObj obj = b.obj();
+
+ BSONObjSet keys;
+ id().getKeysFromObject( obj, keys );
+ checkSize( 4, keys );
+ BSONObjSet::iterator i = keys.begin();
+ assertEquals( nullObj(), *i++ ); // see SERVER-3377
+ for ( int j = 1; j < 4; ++i, ++j ) {
+ BSONObjBuilder b;
+ b.append( "", j );
+ assertEquals( b.obj(), *i );
+ }
+ }
+ private:
+ virtual BSONObj key() const {
+ return aDotB();
+ }
+ };
+
+ class ArraySubobjectMissing : public Base {
+ public:
+ void run() {
+ create();
+ vector< BSONObj > elts;
+ BSONObjBuilder s;
+ s.append( "foo", 41 );
+ for ( int i = 1; i < 4; ++i )
+ elts.push_back( s.done() );
+ BSONObjBuilder b;
+ b.append( "a", elts );
+
+ BSONObjSet keys;
+ id().getKeysFromObject( b.done(), keys );
+ checkSize( 1, keys );
+ assertEquals( nullObj(), *keys.begin() );
+ }
+ private:
+ virtual BSONObj key() const {
+ return aDotB();
+ }
+ };
+
+ class MissingField : public Base {
+ public:
+ void run() {
+ create();
+ BSONObjSet keys;
+ id().getKeysFromObject( BSON( "b" << 1 ), keys );
+ checkSize( 1, keys );
+ assertEquals( nullObj(), *keys.begin() );
+ }
+ private:
+ virtual BSONObj key() const {
+ return BSON( "a" << 1 );
+ }
+ };
+
+ class SubobjectMissing : public Base {
+ public:
+ void run() {
+ create();
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:[1,2]}" ), keys );
+ checkSize( 1, keys );
+ assertEquals( nullObj(), *keys.begin() );
+ }
+ private:
+ virtual BSONObj key() const {
+ return aDotB();
+ }
+ };
+
+ class CompoundMissing : public Base {
+ public:
+ void run() {
+ create();
+
+ {
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{x:'a',y:'b'}" ) , keys );
+ checkSize( 1 , keys );
+ assertEquals( BSON( "" << "a" << "" << "b" ) , *keys.begin() );
+ }
+
+ {
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{x:'a'}" ) , keys );
+ checkSize( 1 , keys );
+ BSONObjBuilder b;
+ b.append( "" , "a" );
+ b.appendNull( "" );
+ assertEquals( b.obj() , *keys.begin() );
+ }
+
+ }
+
+ private:
+ virtual BSONObj key() const {
+ return BSON( "x" << 1 << "y" << 1 );
+ }
+
+ };
+
+ class ArraySubelementComplex : public Base {
+ public:
+ void run() {
+ create();
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:[{b:[2]}]}" ), keys );
+ checkSize( 1, keys );
+ assertEquals( BSON( "" << 2 ), *keys.begin() );
+ }
+ private:
+ virtual BSONObj key() const {
+ return aDotB();
+ }
+ };
+
+ class ParallelArraysComplex : public Base {
+ public:
+ void run() {
+ create();
+ BSONObjSet keys;
+ ASSERT_THROWS( id().getKeysFromObject( fromjson( "{a:[{b:[1],c:[2]}]}" ), keys ),
+ UserException );
+ }
+ private:
+ virtual BSONObj key() const {
+ return fromjson( "{'a.b':1,'a.c':1}" );
+ }
+ };
+
+ class AlternateMissing : public Base {
+ public:
+ void run() {
+ create();
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:[{b:1},{c:2}]}" ), keys );
+ checkSize( 2, keys );
+ BSONObjSet::iterator i = keys.begin();
+ {
+ BSONObjBuilder e;
+ e.appendNull( "" );
+ e.append( "", 2 );
+ assertEquals( e.obj(), *i++ );
+ }
+
+ {
+ BSONObjBuilder e;
+ e.append( "", 1 );
+ e.appendNull( "" );
+ assertEquals( e.obj(), *i++ );
+ }
+ }
+ private:
+ virtual BSONObj key() const {
+ return fromjson( "{'a.b':1,'a.c':1}" );
+ }
+ };
+
+ class MultiComplex : public Base {
+ public:
+ void run() {
+ create();
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:[{b:1},{b:[1,2,3]}]}" ), keys );
+ checkSize( 3, keys );
+ }
+ private:
+ virtual BSONObj key() const {
+ return aDotB();
+ }
+ };
+
+ class EmptyArray : Base {
+ public:
+ void run() {
+ create();
+
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:[1,2]}" ), keys );
+ checkSize(2, keys );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[1]}" ), keys );
+ checkSize(1, keys );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:null}" ), keys );
+ checkSize(1, keys );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+ checkSize(1, keys );
+ ASSERT_EQUALS( Undefined, keys.begin()->firstElement().type() );
+ keys.clear();
+ }
+ };
+
+ class DoubleArray : Base {
+ public:
+ void run() {
+ create();
+
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:[1,2]}" ), keys );
+ checkSize(2, keys );
+ BSONObjSet::const_iterator i = keys.begin();
+ ASSERT_EQUALS( BSON( "" << 1 << "" << 1 ), *i );
+ ++i;
+ ASSERT_EQUALS( BSON( "" << 2 << "" << 2 ), *i );
+ keys.clear();
+ }
+
+ protected:
+ BSONObj key() const {
+ return BSON( "a" << 1 << "a" << 1 );
+ }
+ };
+
+ class DoubleEmptyArray : Base {
+ public:
+ void run() {
+ create();
+
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+ checkSize(1, keys );
+ ASSERT_EQUALS( fromjson( "{'':undefined,'':undefined}" ), *keys.begin() );
+ keys.clear();
+ }
+
+ protected:
+ BSONObj key() const {
+ return BSON( "a" << 1 << "a" << 1 );
+ }
+ };
+
+ class MultiEmptyArray : Base {
+ public:
+ void run() {
+ create();
+
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:1,b:[1,2]}" ), keys );
+ checkSize(2, keys );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:1,b:[1]}" ), keys );
+ checkSize(1, keys );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:1,b:null}" ), keys );
+ //cout << "YO : " << *(keys.begin()) << endl;
+ checkSize(1, keys );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:1,b:[]}" ), keys );
+ checkSize(1, keys );
+ //cout << "YO : " << *(keys.begin()) << endl;
+ BSONObjIterator i( *keys.begin() );
+ ASSERT_EQUALS( NumberInt , i.next().type() );
+ ASSERT_EQUALS( Undefined , i.next().type() );
+ keys.clear();
+ }
+
+ protected:
+ BSONObj key() const {
+ return aAndB();
+ }
+ };
+
+ class NestedEmptyArray : Base {
+ public:
+ void run() {
+ create();
+
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':null}" ), *keys.begin() );
+ keys.clear();
+ }
+ protected:
+ BSONObj key() const { return BSON( "a.b" << 1 ); }
+ };
+
+ class MultiNestedEmptyArray : Base {
+ public:
+ void run() {
+ create();
+
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':null,'':null}" ), *keys.begin() );
+ keys.clear();
+ }
+ protected:
+ BSONObj key() const { return BSON( "a.b" << 1 << "a.c" << 1 ); }
+ };
+
+ class UnevenNestedEmptyArray : public Base {
+ public:
+ void run() {
+ create();
+
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':undefined,'':null}" ), *keys.begin() );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[{b:1}]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':{b:1},'':1}" ), *keys.begin() );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[{b:[]}]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':{b:[]},'':undefined}" ), *keys.begin() );
+ keys.clear();
+ }
+ protected:
+ BSONObj key() const { return BSON( "a" << 1 << "a.b" << 1 ); }
+ };
+
+ class ReverseUnevenNestedEmptyArray : public Base {
+ public:
+ void run() {
+ create();
+
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':null,'':undefined}" ), *keys.begin() );
+ keys.clear();
+ }
+ protected:
+ BSONObj key() const { return BSON( "a.b" << 1 << "a" << 1 ); }
+ };
+
+ class SparseReverseUnevenNestedEmptyArray : public Base {
+ public:
+ void run() {
+ create( true );
+
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':null,'':undefined}" ), *keys.begin() );
+ keys.clear();
+ }
+ protected:
+ BSONObj key() const { return BSON( "a.b" << 1 << "a" << 1 ); }
+ };
+
+ class SparseEmptyArray : public Base {
+ public:
+ void run() {
+ create( true );
+
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:1}" ), keys );
+ checkSize( 0, keys );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+ checkSize( 0, keys );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[{c:1}]}" ), keys );
+ checkSize( 0, keys );
+ keys.clear();
+ }
+ protected:
+ BSONObj key() const { return BSON( "a.b" << 1 ); }
+ };
+
+ class SparseEmptyArraySecond : public Base {
+ public:
+ void run() {
+ create( true );
+
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:1}" ), keys );
+ checkSize( 0, keys );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+ checkSize( 0, keys );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[{c:1}]}" ), keys );
+ checkSize( 0, keys );
+ keys.clear();
+ }
+ protected:
+ BSONObj key() const { return BSON( "z" << 1 << "a.b" << 1 ); }
+ };
+
+ class NonObjectMissingNestedField : public Base {
+ public:
+ void run() {
+ create();
+
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':null}" ), *keys.begin() );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[1]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':null}" ), *keys.begin() );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[1,{b:1}]}" ), keys );
+ checkSize( 2, keys );
+ BSONObjSet::const_iterator c = keys.begin();
+ ASSERT_EQUALS( fromjson( "{'':null}" ), *c );
+ ++c;
+ ASSERT_EQUALS( fromjson( "{'':1}" ), *c );
+ keys.clear();
+ }
+ protected:
+ BSONObj key() const { return BSON( "a.b" << 1 ); }
+ };
+
+ class SparseNonObjectMissingNestedField : public Base {
+ public:
+ void run() {
+ create( true );
+
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+ checkSize( 0, keys );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[1]}" ), keys );
+ checkSize( 0, keys );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[1,{b:1}]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':1}" ), *keys.begin() );
+ keys.clear();
+ }
+ protected:
+ BSONObj key() const { return BSON( "a.b" << 1 ); }
+ };
+
+ class IndexedArrayIndex : public Base {
+ public:
+ void run() {
+ create();
+
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:[1]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( BSON( "" << 1 ), *keys.begin() );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[[1]]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':[1]}" ), *keys.begin() );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[[]]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':undefined}" ), *keys.begin() );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:{'0':1}}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( BSON( "" << 1 ), *keys.begin() );
+ keys.clear();
+
+ ASSERT_THROWS( id().getKeysFromObject( fromjson( "{a:[{'0':1}]}" ), keys ), UserException );
+
+ ASSERT_THROWS( id().getKeysFromObject( fromjson( "{a:[1,{'0':2}]}" ), keys ), UserException );
+ }
+ protected:
+ BSONObj key() const { return BSON( "a.0" << 1 ); }
+ };
+
+ class DoubleIndexedArrayIndex : public Base {
+ public:
+ void run() {
+ create();
+
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:[[1]]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':1}" ), *keys.begin() );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[[]]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':null}" ), *keys.begin() );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':null}" ), *keys.begin() );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[[[]]]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':undefined}" ), *keys.begin() );
+ keys.clear();
+ }
+ protected:
+ BSONObj key() const { return BSON( "a.0.0" << 1 ); }
+ };
+
+ class ObjectWithinArray : public Base {
+ public:
+ void run() {
+ create();
+
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:[{b:1}]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':1}" ), *keys.begin() );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[{b:[1]}]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':1}" ), *keys.begin() );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[{b:[[1]]}]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':[1]}" ), *keys.begin() );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[[{b:1}]]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':1}" ), *keys.begin() );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[[{b:[1]}]]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':1}" ), *keys.begin() );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[[{b:[[1]]}]]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':[1]}" ), *keys.begin() );
+ keys.clear();
+
+ id().getKeysFromObject( fromjson( "{a:[[{b:[]}]]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':undefined}" ), *keys.begin() );
+ keys.clear();
+ }
+ protected:
+ BSONObj key() const { return BSON( "a.0.b" << 1 ); }
+ };
+
+ class ArrayWithinObjectWithinArray : public Base {
+ public:
+ void run() {
+ create();
+
+ BSONObjSet keys;
+ id().getKeysFromObject( fromjson( "{a:[{b:[1]}]}" ), keys );
+ checkSize( 1, keys );
+ ASSERT_EQUALS( fromjson( "{'':1}" ), *keys.begin() );
+ keys.clear();
+ }
+ protected:
+ BSONObj key() const { return BSON( "a.0.b.0" << 1 ); }
+ };
+
+ // TODO: also test numeric string field names
+
+ } // namespace IndexDetailsTests
+
+ namespace NamespaceDetailsTests {
+
+ class Base {
+ const char *ns_;
+ dblock lk;
+ Client::Context _context;
+ public:
+ Base( const char *ns = "unittests.NamespaceDetailsTests" ) : ns_( ns ) , _context( ns ) {}
+ virtual ~Base() {
+ if ( !nsd() )
+ return;
+ string s( ns() );
+ string errmsg;
+ BSONObjBuilder result;
+ dropCollection( s, errmsg, result );
+ }
+ protected:
+ void create() {
+ dblock lk;
+ string err;
+ ASSERT( userCreateNS( ns(), fromjson( spec() ), err, false ) );
+ }
+ virtual string spec() const {
+ return "{\"capped\":true,\"size\":512,\"$nExtents\":1}";
+ }
+ int nRecords() const {
+ int count = 0;
+ for ( DiskLoc i = nsd()->firstExtent; !i.isNull(); i = i.ext()->xnext ) {
+ int fileNo = i.ext()->firstRecord.a();
+ if ( fileNo == -1 )
+ continue;
+ for ( int j = i.ext()->firstRecord.getOfs(); j != DiskLoc::NullOfs;
+ j = DiskLoc( fileNo, j ).rec()->nextOfs ) {
+ ++count;
+ }
+ }
+ ASSERT_EQUALS( count, nsd()->stats.nrecords );
+ return count;
+ }
+ int nExtents() const {
+ int count = 0;
+ for ( DiskLoc i = nsd()->firstExtent; !i.isNull(); i = i.ext()->xnext )
+ ++count;
+ return count;
+ }
+ static int min( int a, int b ) {
+ return a < b ? a : b;
+ }
+ const char *ns() const {
+ return ns_;
+ }
+ NamespaceDetails *nsd() const {
+ return nsdetails( ns() )->writingWithExtra();
+ }
+ static BSONObj bigObj(bool bGenID=false) {
+ BSONObjBuilder b;
+ if (bGenID)
+ b.appendOID("_id", 0, true);
+ string as( 187, 'a' );
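+ // the 187-char string fixes the object's size; together with the
+ // 512-byte capped extents from spec(), this determines how many
+ // records fit per extent (see the comment in Realloc below).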
+ b.append( "a", as );
+ return b.obj();
+ }
+ };
+
+ class Create : public Base {
+ public:
+ void run() {
+ create();
+ ASSERT( nsd() );
+ ASSERT_EQUALS( 0, nRecords() );
+ ASSERT( nsd()->firstExtent == nsd()->capExtent );
+ DiskLoc initial = DiskLoc();
+ initial.setInvalid();
+ ASSERT( initial == nsd()->capFirstNewRecord );
+ }
+ };
+
+ class SingleAlloc : public Base {
+ public:
+ void run() {
+ create();
+ BSONObj b = bigObj();
+ ASSERT( !theDataFileMgr.insert( ns(), b.objdata(), b.objsize() ).isNull() );
+ ASSERT_EQUALS( 1, nRecords() );
+ }
+ };
+
+ class Realloc : public Base {
+ public:
+ void run() {
+ create();
+
+ const int N = 20;
+ const int Q = 16; // these constants depend on the size of the bson object and on the extent size allocated by the system
+ DiskLoc l[ N ];
+ for ( int i = 0; i < N; ++i ) {
+ BSONObj b = bigObj(true);
+ l[ i ] = theDataFileMgr.insert( ns(), b.objdata(), b.objsize() );
+ ASSERT( !l[ i ].isNull() );
+ ASSERT( nRecords() <= Q );
+ //ASSERT_EQUALS( 1 + i % 2, nRecords() );
+ if ( i >= 16 )
+ ASSERT( l[ i ] == l[ i - Q] );
+ }
+ }
+ };
+
+ class TwoExtent : public Base {
+ public:
+ void run() {
+ create();
+ ASSERT_EQUALS( 2, nExtents() );
+
+ BSONObj b = bigObj();
+
+ DiskLoc l[ 8 ];
+ for ( int i = 0; i < 8; ++i ) {
+ l[ i ] = theDataFileMgr.insert( ns(), b.objdata(), b.objsize() );
+ ASSERT( !l[ i ].isNull() );
+ //ASSERT_EQUALS( i < 2 ? i + 1 : 3 + i % 2, nRecords() );
+ //if ( i > 3 )
+ // ASSERT( l[ i ] == l[ i - 4 ] );
+ }
+ ASSERT( nRecords() == 8 );
+
+ // Too big
+ BSONObjBuilder bob;
+ bob.append( "a", string( MinExtentSize + 500, 'a' ) ); // min extent size is now 4096
+ BSONObj bigger = bob.done();
+ ASSERT( theDataFileMgr.insert( ns(), bigger.objdata(), bigger.objsize() ).isNull() );
+ ASSERT_EQUALS( 0, nRecords() );
+ }
+ private:
+ virtual string spec() const {
+ return "{\"capped\":true,\"size\":512,\"$nExtents\":2}";
+ }
+ };
+
+ /* test NamespaceDetails::cappedTruncateAfter(const char *ns, DiskLoc loc)
+ */
+ class TruncateCapped : public Base {
+ virtual string spec() const {
+ return "{\"capped\":true,\"size\":512,\"$nExtents\":2}";
+ }
+ void pass(int p) {
+ create();
+ ASSERT_EQUALS( 2, nExtents() );
+
+ BSONObj b = bigObj(true);
+
+ int N = MinExtentSize / b.objsize() * nExtents() + 5;
+ int T = N - 4;
+
+ DiskLoc truncAt;
+ //DiskLoc l[ 8 ];
+ for ( int i = 0; i < N; ++i ) {
+ BSONObj bb = bigObj(true);
+ DiskLoc a = theDataFileMgr.insert( ns(), bb.objdata(), bb.objsize() );
+ if( T == i )
+ truncAt = a;
+ ASSERT( !a.isNull() );
+ /*ASSERT_EQUALS( i < 2 ? i + 1 : 3 + i % 2, nRecords() );
+ if ( i > 3 )
+ ASSERT( l[ i ] == l[ i - 4 ] );*/
+ }
+ ASSERT( nRecords() < N );
+
+ NamespaceDetails *nsd = nsdetails(ns());
+
+ DiskLoc last, first;
+ {
+ ReverseCappedCursor c(nsd);
+ last = c.currLoc();
+ ASSERT( !last.isNull() );
+ }
+ {
+ ForwardCappedCursor c(nsd);
+ first = c.currLoc();
+ ASSERT( !first.isNull() );
+ ASSERT( first != last ) ;
+ }
+
+ nsd->cappedTruncateAfter(ns(), truncAt, false);
+ ASSERT_EQUALS( nsd->stats.nrecords , 28 );
+
+ {
+ ForwardCappedCursor c(nsd);
+ ASSERT( first == c.currLoc() );
+ }
+ {
+ ReverseCappedCursor c(nsd);
+ ASSERT( last != c.currLoc() ); // old last should be deleted
+ ASSERT( !last.isNull() );
+ }
+
+ // Too big
+ BSONObjBuilder bob;
+ bob.appendOID("_id", 0, true);
+ bob.append( "a", string( MinExtentSize + 300, 'a' ) );
+ BSONObj bigger = bob.done();
+ ASSERT( theDataFileMgr.insert( ns(), bigger.objdata(), bigger.objsize() ).isNull() );
+ ASSERT_EQUALS( 0, nRecords() );
+ }
+ public:
+ void run() {
+// log() << "******** NOT RUNNING TruncateCapped test yet ************" << endl;
+ pass(0);
+ }
+ };
+
+ class Migrate : public Base {
+ public:
+ void run() {
+ create();
+ nsd()->deletedList[ 2 ] = nsd()->cappedListOfAllDeletedRecords().drec()->nextDeleted.drec()->nextDeleted;
+ nsd()->cappedListOfAllDeletedRecords().drec()->nextDeleted.drec()->nextDeleted.writing() = DiskLoc();
+ nsd()->cappedLastDelRecLastExtent().Null();
+ NamespaceDetails *d = nsd();
+ zero( &d->capExtent );
+ zero( &d->capFirstNewRecord );
+
+ nsd();
+
+ ASSERT( nsd()->firstExtent == nsd()->capExtent );
+ ASSERT( nsd()->capExtent.getOfs() != 0 );
+ ASSERT( !nsd()->capFirstNewRecord.isValid() );
+ int nDeleted = 0;
+ for ( DiskLoc i = nsd()->cappedListOfAllDeletedRecords(); !i.isNull(); i = i.drec()->nextDeleted, ++nDeleted );
+ ASSERT_EQUALS( 10, nDeleted );
+ ASSERT( nsd()->cappedLastDelRecLastExtent().isNull() );
+ }
+ private:
+ static void zero( DiskLoc *d ) {
+ memset( d, 0, sizeof( DiskLoc ) );
+ }
+ virtual string spec() const {
+ return "{\"capped\":true,\"size\":512,\"$nExtents\":10}";
+ }
+ };
+
+ // This isn't a particularly useful test, and because it doesn't clean up
+ // after itself, /tmp/unittest needs to be cleared after running.
+ // class BigCollection : public Base {
+ // public:
+ // BigCollection() : Base( "NamespaceDetailsTests_BigCollection" ) {}
+ // void run() {
+ // create();
+ // ASSERT_EQUALS( 2, nExtents() );
+ // }
+ // private:
+ // virtual string spec() const {
+ // // NOTE 256 added to size in _userCreateNS()
+ // long long big = MongoDataFile::maxSize() - DataFileHeader::HeaderSize;
+ // stringstream ss;
+ // ss << "{\"capped\":true,\"size\":" << big << "}";
+ // return ss.str();
+ // }
+ // };
+
+ class Size {
+ public:
+ void run() {
+ ASSERT_EQUALS( 496U, sizeof( NamespaceDetails ) );
+ }
+ };
+
+ } // namespace NamespaceDetailsTests
+
+ class All : public Suite {
+ public:
+ All() : Suite( "namespace" ) {
+ }
+
+ void setupTests() {
+ add< IndexDetailsTests::Create >();
+ add< IndexDetailsTests::GetKeysFromObjectSimple >();
+ add< IndexDetailsTests::GetKeysFromObjectDotted >();
+ add< IndexDetailsTests::GetKeysFromArraySimple >();
+ add< IndexDetailsTests::GetKeysFromArrayFirstElement >();
+ add< IndexDetailsTests::GetKeysFromArraySecondElement >();
+ add< IndexDetailsTests::GetKeysFromSecondLevelArray >();
+ add< IndexDetailsTests::ParallelArraysBasic >();
+ add< IndexDetailsTests::ArraySubobjectBasic >();
+ add< IndexDetailsTests::ArraySubobjectMultiFieldIndex >();
+ add< IndexDetailsTests::ArraySubobjectSingleMissing >();
+ add< IndexDetailsTests::ArraySubobjectMissing >();
+ add< IndexDetailsTests::ArraySubelementComplex >();
+ add< IndexDetailsTests::ParallelArraysComplex >();
+ add< IndexDetailsTests::AlternateMissing >();
+ add< IndexDetailsTests::MultiComplex >();
+ add< IndexDetailsTests::EmptyArray >();
+ add< IndexDetailsTests::DoubleArray >();
+ add< IndexDetailsTests::DoubleEmptyArray >();
+ add< IndexDetailsTests::MultiEmptyArray >();
+ add< IndexDetailsTests::NestedEmptyArray >();
+ add< IndexDetailsTests::MultiNestedEmptyArray >();
+ add< IndexDetailsTests::UnevenNestedEmptyArray >();
+ add< IndexDetailsTests::ReverseUnevenNestedEmptyArray >();
+ add< IndexDetailsTests::SparseReverseUnevenNestedEmptyArray >();
+ add< IndexDetailsTests::SparseEmptyArray >();
+ add< IndexDetailsTests::SparseEmptyArraySecond >();
+ add< IndexDetailsTests::NonObjectMissingNestedField >();
+ add< IndexDetailsTests::SparseNonObjectMissingNestedField >();
+ add< IndexDetailsTests::IndexedArrayIndex >();
+ add< IndexDetailsTests::DoubleIndexedArrayIndex >();
+ add< IndexDetailsTests::ObjectWithinArray >();
+ add< IndexDetailsTests::ArrayWithinObjectWithinArray >();
+ add< IndexDetailsTests::MissingField >();
+ add< IndexDetailsTests::SubobjectMissing >();
+ add< IndexDetailsTests::CompoundMissing >();
+ add< NamespaceDetailsTests::Create >();
+ add< NamespaceDetailsTests::SingleAlloc >();
+ add< NamespaceDetailsTests::Realloc >();
+ add< NamespaceDetailsTests::TwoExtent >();
+ add< NamespaceDetailsTests::TruncateCapped >();
+ add< NamespaceDetailsTests::Migrate >();
+ // add< NamespaceDetailsTests::BigCollection >();
+ add< NamespaceDetailsTests::Size >();
+ }
+ } myall;
+} // namespace NamespaceTests
+
diff --git a/src/mongo/dbtests/pdfiletests.cpp b/src/mongo/dbtests/pdfiletests.cpp
new file mode 100644
index 00000000000..e07ccb42aa6
--- /dev/null
+++ b/src/mongo/dbtests/pdfiletests.cpp
@@ -0,0 +1,407 @@
+// pdfiletests.cpp : pdfile unit tests.
+//
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../db/pdfile.h"
+
+#include "../db/db.h"
+#include "../db/json.h"
+
+#include "dbtests.h"
+
+namespace PdfileTests {
+
+ namespace ScanCapped {
+
+ class Base {
+ public:
+ Base() : _context( ns() ) {
+ }
+ virtual ~Base() {
+ if ( !nsd() )
+ return;
+ string n( ns() );
+ dropNS( n );
+ }
+ void run() {
+ stringstream spec;
+ spec << "{\"capped\":true,\"size\":2000,\"$nExtents\":" << nExtents() << "}";
+ string err;
+ ASSERT( userCreateNS( ns(), fromjson( spec.str() ), err, false ) );
+ prepare();
+ int j = 0;
+ for ( boost::shared_ptr<Cursor> i = theDataFileMgr.findAll( ns() );
+ i->ok(); i->advance(), ++j )
+ ASSERT_EQUALS( j, i->current().firstElement().number() );
+ ASSERT_EQUALS( count(), j );
+
+ j = count() - 1;
+ for ( boost::shared_ptr<Cursor> i =
+ findTableScan( ns(), fromjson( "{\"$natural\":-1}" ) );
+ i->ok(); i->advance(), --j )
+ ASSERT_EQUALS( j, i->current().firstElement().number() );
+ ASSERT_EQUALS( -1, j );
+ }
+ protected:
+ virtual void prepare() = 0;
+ virtual int count() const = 0;
+ virtual int nExtents() const {
+ return 0;
+ }
+ // bypass standard alloc/insert routines to use the extent we want.
+ static DiskLoc insert( DiskLoc ext, int i ) {
+ BSONObjBuilder b;
+ b.append( "a", i );
+ BSONObj o = b.done();
+ int len = o.objsize();
+ Extent *e = ext.ext();
+ e = getDur().writing(e);
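+ // new record goes right after the extent header if the extent is
+ // empty, otherwise right after the current last record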
+ int ofs;
+ if ( e->lastRecord.isNull() )
+ ofs = ext.getOfs() + ( e->_extentData - (char *)e );
+ else
+ ofs = e->lastRecord.getOfs() + e->lastRecord.rec()->lengthWithHeaders;
+ DiskLoc dl( ext.a(), ofs );
+ Record *r = dl.rec();
+ r = (Record*) getDur().writingPtr(r, Record::HeaderSize + len);
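+ // fill in the record header and payload, then splice the record
+ // onto the tail of the extent's record list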
+ r->lengthWithHeaders = Record::HeaderSize + len;
+ r->extentOfs = e->myLoc.getOfs();
+ r->nextOfs = DiskLoc::NullOfs;
+ r->prevOfs = e->lastRecord.isNull() ? DiskLoc::NullOfs : e->lastRecord.getOfs();
+ memcpy( r->data, o.objdata(), len );
+ if ( e->firstRecord.isNull() )
+ e->firstRecord = dl;
+ else
+ getDur().writingInt(e->lastRecord.rec()->nextOfs) = ofs;
+ e->lastRecord = dl;
+ return dl;
+ }
+ static const char *ns() {
+ return "unittests.ScanCapped";
+ }
+ static NamespaceDetails *nsd() {
+ return nsdetails( ns() );
+ }
+ private:
+ dblock lk_;
+ Client::Context _context;
+ };
+
+ class Empty : public Base {
+ virtual void prepare() {}
+ virtual int count() const {
+ return 0;
+ }
+ };
+
+ class EmptyLooped : public Base {
+ virtual void prepare() {
+ nsd()->writingWithExtra()->capFirstNewRecord = DiskLoc();
+ }
+ virtual int count() const {
+ return 0;
+ }
+ };
+
+ class EmptyMultiExtentLooped : public Base {
+ virtual void prepare() {
+ nsd()->writingWithExtra()->capFirstNewRecord = DiskLoc();
+ }
+ virtual int count() const {
+ return 0;
+ }
+ virtual int nExtents() const {
+ return 3;
+ }
+ };
+
+ class Single : public Base {
+ virtual void prepare() {
+ nsd()->writingWithExtra()->capFirstNewRecord = insert( nsd()->capExtent, 0 );
+ }
+ virtual int count() const {
+ return 1;
+ }
+ };
+
+ class NewCapFirst : public Base {
+ virtual void prepare() {
+ DiskLoc x = insert( nsd()->capExtent, 0 );
+ nsd()->writingWithExtra()->capFirstNewRecord = x;
+ insert( nsd()->capExtent, 1 );
+ }
+ virtual int count() const {
+ return 2;
+ }
+ };
+
+ class NewCapLast : public Base {
+ virtual void prepare() {
+ insert( nsd()->capExtent, 0 );
+ nsd()->capFirstNewRecord.writing() = insert( nsd()->capExtent, 1 );
+ }
+ virtual int count() const {
+ return 2;
+ }
+ };
+
+ class NewCapMiddle : public Base {
+ virtual void prepare() {
+ insert( nsd()->capExtent, 0 );
+ nsd()->capFirstNewRecord.writing() = insert( nsd()->capExtent, 1 );
+ insert( nsd()->capExtent, 2 );
+ }
+ virtual int count() const {
+ return 3;
+ }
+ };
+
+ class FirstExtent : public Base {
+ virtual void prepare() {
+ insert( nsd()->capExtent, 0 );
+ insert( nsd()->lastExtent, 1 );
+ nsd()->capFirstNewRecord.writing() = insert( nsd()->capExtent, 2 );
+ insert( nsd()->capExtent, 3 );
+ }
+ virtual int count() const {
+ return 4;
+ }
+ virtual int nExtents() const {
+ return 2;
+ }
+ };
+
+ class LastExtent : public Base {
+ virtual void prepare() {
+ nsd()->capExtent.writing() = nsd()->lastExtent;
+ insert( nsd()->capExtent, 0 );
+ insert( nsd()->firstExtent, 1 );
+ nsd()->capFirstNewRecord.writing() = insert( nsd()->capExtent, 2 );
+ insert( nsd()->capExtent, 3 );
+ }
+ virtual int count() const {
+ return 4;
+ }
+ virtual int nExtents() const {
+ return 2;
+ }
+ };
+
+ class MidExtent : public Base {
+ virtual void prepare() {
+ nsd()->capExtent.writing() = nsd()->firstExtent.ext()->xnext;
+ insert( nsd()->capExtent, 0 );
+ insert( nsd()->lastExtent, 1 );
+ insert( nsd()->firstExtent, 2 );
+ nsd()->capFirstNewRecord.writing() = insert( nsd()->capExtent, 3 );
+ insert( nsd()->capExtent, 4 );
+ }
+ virtual int count() const {
+ return 5;
+ }
+ virtual int nExtents() const {
+ return 3;
+ }
+ };
+
+ class AloneInExtent : public Base {
+ virtual void prepare() {
+ nsd()->capExtent.writing() = nsd()->firstExtent.ext()->xnext;
+ insert( nsd()->lastExtent, 0 );
+ insert( nsd()->firstExtent, 1 );
+ nsd()->capFirstNewRecord.writing() = insert( nsd()->capExtent, 2 );
+ }
+ virtual int count() const {
+ return 3;
+ }
+ virtual int nExtents() const {
+ return 3;
+ }
+ };
+
+ class FirstInExtent : public Base {
+ virtual void prepare() {
+ nsd()->capExtent.writing() = nsd()->firstExtent.ext()->xnext;
+ insert( nsd()->lastExtent, 0 );
+ insert( nsd()->firstExtent, 1 );
+ nsd()->capFirstNewRecord.writing() = insert( nsd()->capExtent, 2 );
+ insert( nsd()->capExtent, 3 );
+ }
+ virtual int count() const {
+ return 4;
+ }
+ virtual int nExtents() const {
+ return 3;
+ }
+ };
+
+ class LastInExtent : public Base {
+ virtual void prepare() {
+ nsd()->capExtent.writing() = nsd()->firstExtent.ext()->xnext;
+ insert( nsd()->capExtent, 0 );
+ insert( nsd()->lastExtent, 1 );
+ insert( nsd()->firstExtent, 2 );
+ nsd()->capFirstNewRecord.writing() = insert( nsd()->capExtent, 3 );
+ }
+ virtual int count() const {
+ return 4;
+ }
+ virtual int nExtents() const {
+ return 3;
+ }
+ };
+
+ } // namespace ScanCapped
+
+ namespace Insert {
+ class Base {
+ public:
+ Base() : _context( ns() ) {
+ }
+ virtual ~Base() {
+ if ( !nsd() )
+ return;
+ string n( ns() );
+ dropNS( n );
+ }
+ protected:
+ static const char *ns() {
+ return "unittests.pdfiletests.Insert";
+ }
+ static NamespaceDetails *nsd() {
+ return nsdetails( ns() );
+ }
+ private:
+ dblock lk_;
+ Client::Context _context;
+ };
+
+ class UpdateDate : public Base {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.appendTimestamp( "a" );
+ BSONObj o = b.done();
+ ASSERT( 0 == o.getField( "a" ).date() );
+ theDataFileMgr.insertWithObjMod( ns(), o );
+ ASSERT( 0 != o.getField( "a" ).date() );
+ }
+ };
+ } // namespace Insert
+
+ class ExtentSizing {
+ public:
+ struct SmallFilesControl {
+ SmallFilesControl() {
+ old = cmdLine.smallfiles;
+ cmdLine.smallfiles = false;
+ }
+ ~SmallFilesControl() {
+ cmdLine.smallfiles = old;
+ }
+ bool old;
+ };
+ void run() {
+ SmallFilesControl c;
+ // test that no matter what we start with, we always get to max extent size
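+ // (i.e. followupSize() is expected to reach the Extent::maxSize()
+ // fixed point within 100 iterations from any starting size)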
+ for ( int obj=16; obj<BSONObjMaxUserSize; obj += 111 ) {
+ int sz = Extent::initialSize( obj );
+ for ( int i=0; i<100; i++ ) {
+ sz = Extent::followupSize( obj , sz );
+ }
+ ASSERT_EQUALS( Extent::maxSize() , sz );
+ }
+ }
+ };
+
+ class ExtentAllocOrder {
+ public:
+ void run() {
+ string dbname = "unittest_ex";
+
+ string c1 = dbname + ".x1";
+ string c2 = dbname + ".x2";
+
+ {
+ DBDirectClient db;
+ db.dropDatabase( dbname );
+ }
+
+ dblock mylock;
+ Client::Context cx( dbname );
+
+ bool isnew;
+ Database * d = dbHolderW().getOrCreate( dbname , dbpath , isnew );
+ assert( d );
+
+ int big = 10 * 1024;
+ //int small = 1024;
+
+ unsigned long long l = 0;
+ int n = 0;
+ while ( 1 ) {
+ n++;
+ if( n == 5 && sizeof(void*)==4 )
+ break;
+ MongoDataFile * f = d->addAFile( big , false );
+ //cout << f->length() << ' ' << n << endl;
+ if ( f->length() == l )
+ break;
+ l = f->length();
+ }
+
+ int start = d->numFiles();
+ for ( int i=0; i<start; i++ )
+ d->allocExtent( c1.c_str() , d->getFile( i )->getHeader()->unusedLength , false, false );
+ ASSERT_EQUALS( start , d->numFiles() );
+
+ {
+ DBDirectClient db;
+ db.dropDatabase( dbname );
+ }
+ }
+ };
+
+
+ class All : public Suite {
+ public:
+ All() : Suite( "pdfile" ) {}
+
+ void setupTests() {
+ add< ScanCapped::Empty >();
+ add< ScanCapped::EmptyLooped >();
+ add< ScanCapped::EmptyMultiExtentLooped >();
+ add< ScanCapped::Single >();
+ add< ScanCapped::NewCapFirst >();
+ add< ScanCapped::NewCapLast >();
+ add< ScanCapped::NewCapMiddle >();
+ add< ScanCapped::FirstExtent >();
+ add< ScanCapped::LastExtent >();
+ add< ScanCapped::MidExtent >();
+ add< ScanCapped::AloneInExtent >();
+ add< ScanCapped::FirstInExtent >();
+ add< ScanCapped::LastInExtent >();
+ add< Insert::UpdateDate >();
+ add< ExtentSizing >();
+ add< ExtentAllocOrder >();
+ }
+ } myall;
+
+} // namespace PdfileTests
+
diff --git a/src/mongo/dbtests/perf/btreeperf.cpp b/src/mongo/dbtests/perf/btreeperf.cpp
new file mode 100644
index 00000000000..7d68d8f5cc7
--- /dev/null
+++ b/src/mongo/dbtests/perf/btreeperf.cpp
@@ -0,0 +1,442 @@
+// btreeperf.cpp
+
+/* Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Performance timing and space utilization testing for btree indexes.
+ */
+
+#include <iostream>
+
+#include <boost/random/bernoulli_distribution.hpp>
+#include <boost/random/geometric_distribution.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/variate_generator.hpp>
+#include <boost/random/uniform_int.hpp>
+
+#include "client/dbclient.h"
+#include "../../util/timer.h"
+
+using namespace std;
+using namespace mongo;
+using namespace boost;
+
+const char *ns = "test.btreeperf";
+const char *db = "test";
+const char *index_collection = "btreeperf.$_id_";
+
+// This random number generator has a much larger period than the default
+// generator and is half as fast as the default. Given that we intend to
+// generate large numbers of documents and will utilize more than one random
+// sample per document, choosing this generator seems like a worthwhile tradeoff.
+mt19937 randomNumberGenerator;
+
+/**
+ * An interface for generating documents to be inserted and document specs for
+ * remove requests.
+ */
+class InsertAndRemoveStrategy {
+public:
+ virtual ~InsertAndRemoveStrategy() {}
+ virtual BSONObj insertObj() = 0;
+ virtual BSONObj removeObj() = 0;
+protected:
+ /**
+ * Helper functions for converting a sample value to a sample object with
+ * specified _id, to be inserted or removed.
+ */
+
+ template< class T >
+ BSONObj insertObjWithVal( const T &val ) {
+ BSONObjBuilder b;
+ b.append( "_id", val );
+ return b.obj();
+ }
+ template< class T >
+ BSONObj removeObjWithVal( const T &val ) {
+ BSONObjBuilder b;
+ b.append( "_id", val );
+ return b.obj();
+ }
+};
+
+/**
+ * Manages a set of elements of type T. Supports inserting unique elements and
+ * sampling a random element without replacement.
+ *
+ * TODO In the contexts where this class is currently used, duplicate keys are
+ * either impossible or highly unlikely, and an occasional duplicate value will
+ * not much affect the procedure by which a random element is chosen. We could
+ * stop checking for duplicates in push(), eliminate _set from the implementation,
+ * and potentially improve performance and memory requirements somewhat.
+ */
+template< class T >
+class SetSampler {
+public:
+ /** @param val Insert this value in the set if not already present. */
+ void push( const T& val ) {
+ if ( _set.insert( val ).second ) {
+ _vector.push_back( val );
+ }
+ }
+ /** @return a random element removed from the set */
+ T pull() {
+ if ( _vector.size() == 0 ) {
+ return T();
+ }
+ uniform_int< size_t > sizeRange( 0, _vector.size() - 1 );
+ variate_generator< mt19937&, uniform_int< size_t > > sizeGenerator( randomNumberGenerator, sizeRange );
+ size_t toRemove = sizeGenerator();
+ T val = _vector[ toRemove ];
+ // Replace the random element with the last element, then remove the
+ // last element.
+ _vector[ toRemove ] = _vector.back();
+ _vector.pop_back();
+ _set.erase( val );
+ return val;
+ }
+private:
+ vector< T > _vector;
+ set< T > _set;
+};
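+
+// A minimal usage sketch for SetSampler (hypothetical values, not used by the
+// benchmark itself):
+//
+//   SetSampler< int > s;
+//   s.push( 3 );
+//   s.push( 7 );
+//   s.push( 3 );        // duplicate, ignored
+//   int v = s.pull();   // 3 or 7 with equal probability, removed from the set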
+
+/**
+ * Tracks values that have been specified for insertion by the derived class's
+ * implementation of insertVal() and selects the next value to remove uniformly
+ * from among the values that have been inserted but not yet removed.
+ *
+ * The implementation is probabilistically sound, but may be resource intensive
+ * and slow due to the use of a SetSampler.
+ */
+template< class T >
+class InsertAndUniformRemoveStrategy : public InsertAndRemoveStrategy {
+public:
+ virtual BSONObj insertObj() {
+ T val = insertVal();
+ _sampler.push( val );
+ return insertObjWithVal( val );
+ }
+ virtual BSONObj removeObj() { return removeObjWithVal( _sampler.pull() ); }
+protected:
+ /** @return value to insert. This is the only function a derived class need implement. */
+ virtual T insertVal() = 0;
+private:
+ SetSampler< T > _sampler;
+};
+
+/**
+ * The derived class supplies keys to be inserted and removed. The key removal
+ * strategy is similar to the strategy for selecting a random element described
+ * in the MongoDB cookbook: the first key in the collection greater than or
+ * equal to the supplied removal key is removed. This allows selecting an
+ * existing key for removal without the overhead required by a SetSampler.
+ *
+ * While this ranged selection strategy can work well for selecting a random
+ * element, there are some theoretical and empirically observed shortcomings
+ * when the strategy is applied to removing nodes for btree performance measurement:
+ * 1 The likelihood that a given key is removed is proportional to the difference
+ * in value between it and the previous key. Because key deletion increases
+ * the difference in value between adjacent keys, neighboring keys will be
+ * more likely to be deleted than they would be in a true uniform distribution.
+ * 2 MongoDB 1.6 uses 'unused' nodes in the btree implementation. With a ranged
+ * removal strategy, those nodes must be traversed to find a node available
+ * for removal.
+ * 3 Ranged removal was observed to be biased against the balancing policy of
+ * MongoDB 1.7 in some cases, in terms of storage size. This may be a
+ * consequence of point 1 above.
+ * 4 Ranged removal was observed to be significantly biased against the btree
+ * implementation in MongoDB 1.6 in terms of performance. This is likely a
+ * consequence of point 2 above.
+ * 5 In some cases the biases described above were not evident in tests lasting
+ * several minutes, but were evident in tests lasting several hours.
+ */
+template< class T >
+class InsertAndRangedRemoveStrategy : public InsertAndRemoveStrategy {
+public:
+ virtual BSONObj insertObj() { return insertObjWithVal( insertVal() ); }
+ virtual BSONObj removeObj() { return rangedRemoveObjWithVal( removeVal() ); }
+protected:
+ /** Small likelihood that this removal spec will not match any document */
+ template< class U >
+ BSONObj rangedRemoveObjWithVal( const U &val ) {
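+ // Builds a removal spec of the form { "_id" : { "$gte" : val } }; the server
+ // then removes the first existing key greater than or equal to val.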
+ BSONObjBuilder b1;
+ BSONObjBuilder b2( b1.subobjStart( "_id" ) );
+ b2.append( "$gte", val );
+ b2.done();
+ return b1.obj();
+ }
+ virtual T insertVal() = 0;
+ virtual T removeVal() = 0;
+};
+
+/**
+ * Integer Keys
+ * Uniform Inserts
+ * Uniform Removes
+ */
+class UniformInsertRangedUniformRemoveInteger : public InsertAndRangedRemoveStrategy< long long > {
+public:
+ UniformInsertRangedUniformRemoveInteger() :
+ _uniform_int( 0ULL, ~0ULL ),
+ _nextLongLong( randomNumberGenerator, _uniform_int ) {
+ }
+ /** Small likelihood of duplicates */
+ virtual long long insertVal() { return _nextLongLong(); }
+ virtual long long removeVal() { return _nextLongLong(); }
+private:
+ uniform_int< unsigned long long > _uniform_int;
+ variate_generator< mt19937&, uniform_int< unsigned long long > > _nextLongLong;
+};
+
+class UniformInsertUniformRemoveInteger : public InsertAndUniformRemoveStrategy< long long > {
+public:
+ virtual long long insertVal() { return _gen.insertVal(); }
+private:
+ UniformInsertRangedUniformRemoveInteger _gen;
+};
+
+/**
+ * String Keys
+ * Uniform Inserts
+ * Uniform Removes
+ */
+class UniformInsertRangedUniformRemoveString : public InsertAndRangedRemoveStrategy< string > {
+public:
+ UniformInsertRangedUniformRemoveString() :
+ _geometric_distribution( 0.9 ),
+ _nextLength( randomNumberGenerator, _geometric_distribution ),
+ _uniform_char( 'a', 'z' ),
+ _nextChar( randomNumberGenerator, _uniform_char ) {
+ }
+ /** Small likelihood of duplicates */
+ virtual string insertVal() { return nextString(); }
+ virtual string removeVal() { return nextString(); }
+private:
+ string nextString() {
+ // The longer the minimum string length, the lower the likelihood of duplicates
+ int len = _nextLength() + 5;
+ len = len > 100 ? 100 : len;
+ string ret( len, 'x' );
+ for( int i = 0; i < len; ++i ) {
+ ret[ i ] = _nextChar();
+ }
+ return ret;
+ }
+ geometric_distribution<> _geometric_distribution;
+ variate_generator< mt19937&, geometric_distribution<> > _nextLength;
+ uniform_int< char > _uniform_char;
+ variate_generator< mt19937&, uniform_int< char > > _nextChar;
+};
+
+class UniformInsertUniformRemoveString : public InsertAndUniformRemoveStrategy< string > {
+public:
+ virtual string insertVal() { return _gen.insertVal(); }
+private:
+ UniformInsertRangedUniformRemoveString _gen;
+};
+
+/**
+ * OID Keys
+ * Increasing Inserts
+ * Uniform Removes
+ */
+class IncreasingInsertRangedUniformRemoveOID : public InsertAndRangedRemoveStrategy< OID > {
+public:
+ IncreasingInsertRangedUniformRemoveOID() :
+ _max( -1 ) {
+ }
+ virtual OID insertVal() { return oidFromULL( ++_max ); }
+ virtual OID removeVal() {
+ uniform_int< unsigned long long > distribution( 0, _max > 0 ? _max : 0 );
+ variate_generator< mt19937&, uniform_int< unsigned long long > > generator( randomNumberGenerator, distribution );
+ return oidFromULL( generator() );
+ }
+private:
+ static OID oidFromULL( unsigned long long val ) {
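+ // The counter is byte swapped (to big-endian, assuming a little-endian host)
+ // so that numerically increasing values yield byte-wise increasing OIDs.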
+ val = __builtin_bswap64( val );
+ OID oid;
+ oid.clear();
+ memcpy( (char*)&oid + 4, &val, 8 );
+ return oid;
+ }
+ long long _max;
+};
+
+class IncreasingInsertUniformRemoveOID : public InsertAndUniformRemoveStrategy< OID > {
+public:
+ virtual OID insertVal() { return _gen.insertVal(); }
+private:
+ IncreasingInsertRangedUniformRemoveOID _gen;
+};
+
+/**
+ * Integer Keys
+ * Increasing Inserts
+ * Increasing Removes (on remove, the lowest key is always removed)
+ */
+class IncreasingInsertIncreasingRemoveInteger : public InsertAndRemoveStrategy {
+public:
+ IncreasingInsertIncreasingRemoveInteger() :
+ // Start with a large value so data type will be preserved if we round
+ // trip through json.
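+ // ( 1LL << 32 == 4294967296 does not fit in a 32 bit int, so the json
+ // round trip keeps the value a long long. )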
+ _min( 1LL << 32 ),
+ _max( 1LL << 32 ) {
+ }
+ virtual BSONObj insertObj() { return insertObjWithVal( ++_max ); }
+ virtual BSONObj removeObj() { return removeObjWithVal( _min < _max ? ++_min : _min ); }
+private:
+ long long _min;
+ long long _max;
+};
+
+/** Generate a random boolean value. */
+class BernoulliGenerator {
+public:
+ /**
+ * @param excessFalsePercent This specifies the desired rate of false values
+ * vs true values. If we want false to be 5% more likely than true, we
+ * specify 5 for this argument.
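+ * For example, excessFalsePercent == 5 gives P(true) = 1 / 2.05 ~= 0.488 and
+ * P(false) ~= 0.512, making false about 5% more likely than true.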
+ */
+ BernoulliGenerator( int excessFalsePercent ) :
+ _bernoulli_distribution( 1.0 / ( 2.0 + excessFalsePercent / 100.0 ) ),
+ _generator( randomNumberGenerator, _bernoulli_distribution ) {
+ }
+ bool operator()() { return _generator(); }
+private:
+ bernoulli_distribution<> _bernoulli_distribution;
+ variate_generator< mt19937&, bernoulli_distribution<> > _generator;
+};
+
+/** Runs a strategy on a connection, with specified mix of inserts and removes. */
+class InsertAndRemoveRunner {
+public:
+ InsertAndRemoveRunner( DBClientConnection &conn, InsertAndRemoveStrategy &strategy, int excessInsertPercent ) :
+ _conn( conn ),
+ _strategy( strategy ),
+ _nextOpTypeRemove( excessInsertPercent ) {
+ }
+ void writeOne() {
+ if ( _nextOpTypeRemove() ) {
+ _conn.remove( ns, _strategy.removeObj(), true );
+ }
+ else {
+ _conn.insert( ns, _strategy.insertObj() );
+ }
+ }
+private:
+ DBClientConnection &_conn;
+ InsertAndRemoveStrategy &_strategy;
+ BernoulliGenerator _nextOpTypeRemove;
+};
+
+/**
+ * Writes a test script to cout based on a strategy and specified mix of inserts
+ * and removes. The script can be subsequently executed by InsertAndRemoveRunner.
+ * Script generation is intended for strategies that are memory or cpu intensive
+ * and might either divert resources from a mongod instance being analyzed on the
+ * same machine or fail to generate requests as quickly as the mongod might
+ * accept them.
+ * The script contains one line per operation. Each line begins
+ * with a letter indicating the operation type, followed by a space. Next
+ * follows the json representation of a document for the specified operation
+ * type.
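+ * For example (hypothetical values):
+ *   i { "_id" : 5017463248 }
+ *   r { "_id" : { "$gte" : 2996418117 } }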
+ */
+class InsertAndRemoveScriptGenerator {
+public:
+ InsertAndRemoveScriptGenerator( InsertAndRemoveStrategy &strategy, int excessInsertPercent ) :
+ _strategy( strategy ),
+ _nextOpTypeRemove( excessInsertPercent ) {
+ }
+ void writeOne() {
+ if ( _nextOpTypeRemove() ) {
+ cout << "r " << _strategy.removeObj().jsonString() << endl;
+ }
+ else {
+ cout << "i " << _strategy.insertObj().jsonString() << endl;
+ }
+ }
+private:
+ InsertAndRemoveStrategy &_strategy;
+ BernoulliGenerator _nextOpTypeRemove;
+};
+
+/**
+ * Run a test script from cin that was generated by
+ * InsertAndRemoveScriptGenerator. Running the script is intended to be
+ * lightweight in terms of memory and cpu usage, and fast.
+ */
+class InsertAndRemoveScriptRunner {
+public:
+ InsertAndRemoveScriptRunner( DBClientConnection &conn ) :
+ _conn( conn ) {
+ }
+ void writeOne() {
+ cin.getline( _buf, 1024 );
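+ // Each line is "<op> <json>"; the op letter and the following space occupy
+ // the first two bytes, so the json starts at _buf + 2.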
+ BSONObj val = fromjson( _buf + 2 );
+ if ( _buf[ 0 ] == 'r' ) {
+ _conn.remove( ns, val, true );
+ }
+ else {
+ _conn.insert( ns, val );
+ }
+ }
+private:
+ DBClientConnection &_conn;
+ char _buf[ 1024 ];
+};
+
+int main( int argc, const char **argv ) {
+
+ DBClientConnection conn;
+ conn.connect( "127.0.0.1:27017" );
+ conn.dropCollection( ns );
+
+// UniformInsertRangedUniformRemoveInteger strategy;
+// UniformInsertUniformRemoveInteger strategy;
+// UniformInsertRangedUniformRemoveString strategy;
+// UniformInsertUniformRemoveString strategy;
+// IncreasingInsertRangedUniformRemoveOID strategy;
+// IncreasingInsertUniformRemoveOID strategy;
+// IncreasingInsertIncreasingRemoveInteger strategy;
+// InsertAndRemoveScriptGenerator runner( strategy, 5 );
+ InsertAndRemoveScriptRunner runner( conn );
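+
+ // To generate a script instead of running one, uncomment a strategy and the
+ // InsertAndRemoveScriptGenerator above, comment out the script runner, and
+ // comment out the stats output below as its comments indicate.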
+
+ Timer t;
+ BSONObj statsCmd = BSON( "collstats" << index_collection );
+
+ // Print header, unless we are generating a script (in that case, comment this out).
+ cout << "ops,milliseconds,docs,totalBucketSize" << endl;
+
+ long long i = 0;
+ long long n = 10000000000;
+ while( i < n ) {
+ runner.writeOne();
+ // Print statistics, unless we are generating a script (in that case, comment this out).
+ // The stats collection requests below provide regular read operations,
+ // ensuring we are caught up with the progress being made by the mongod
+ // under analysis.
+ if ( ++i % 50000 == 0 ) {
+ // The total number of documents present.
+ long long docs = conn.count( ns );
+ BSONObj result;
+ conn.runCommand( db, statsCmd, result );
+ // The total number of bytes used for all allocated 8K buckets of the
+ // btree.
+ long long totalBucketSize = result.getField( "count" ).numberLong() * 8192;
+ cout << i << ',' << t.millis() << ',' << docs << ',' << totalBucketSize << endl;
+ }
+ }
+}
diff --git a/src/mongo/dbtests/perf/perftest.cpp b/src/mongo/dbtests/perf/perftest.cpp
new file mode 100644
index 00000000000..b6219f7f5d9
--- /dev/null
+++ b/src/mongo/dbtests/perf/perftest.cpp
@@ -0,0 +1,761 @@
+// perftest.cpp : Run db performance tests.
+//
+
+/**
+ * Copyright (C) 2009 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "../../client/dbclient.h"
+#include "../../db/instance.h"
+#include "../../db/ops/query.h"
+#include "../../db/queryoptimizer.h"
+#include "../../util/file_allocator.h"
+
+#include "../framework.h"
+#include <boost/date_time/posix_time/posix_time.hpp>
+
+namespace mongo {
+ extern string dbpath;
+} // namespace mongo
+
+
+using namespace mongo;
+using namespace mongo::regression;
+
+DBClientBase *client_;
+
+// Each test runs with a separate db, so no test does any of the startup
+// (ie allocation) work for another test.
+template< class T >
+string testDb( T *t = 0 ) {
+ string name = mongo::demangleName( typeid( T ) );
+ // Make filesystem safe.
+ for( string::iterator i = name.begin(); i != name.end(); ++i )
+ if ( *i == ':' )
+ *i = '_';
+ return name;
+}
+
+template< class T >
+string testNs( T *t ) {
+ stringstream ss;
+ ss << testDb( t ) << ".perftest";
+ return ss.str();
+}
+
+template <class T>
+class Runner {
+public:
+ void run() {
+ T test;
+ string name = testDb( &test );
+ boost::posix_time::ptime start = boost::posix_time::microsec_clock::universal_time();
+ test.run();
+ boost::posix_time::ptime end = boost::posix_time::microsec_clock::universal_time();
+ long long micro = ( end - start ).total_microseconds();
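+ // Prints one line per test, e.g. {'Insert__IdIndex': 1.234567} -- seconds and
+ // microseconds; ':' in the demangled name has been replaced with '_'.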
+ cout << "{'" << name << "': "
+ << micro / 1000000
+ << "."
+ << setw( 6 ) << setfill( '0' ) << micro % 1000000
+ << "}" << endl;
+ }
+ ~Runner() {
+ FileAllocator::get()->waitUntilFinished();
+ client_->dropDatabase( testDb< T >().c_str() );
+ }
+};
+
+class RunnerSuite : public Suite {
+public:
+ RunnerSuite( string name ) : Suite( name ) {}
+protected:
+ template< class T >
+ void add() {
+ Suite::add< Runner< T > >();
+ }
+};
+
+namespace Insert {
+ class IdIndex {
+ public:
+ void run() {
+ string ns = testNs( this );
+ for( int i = 0; i < 100000; ++i ) {
+ client_->insert( ns.c_str(), BSON( "_id" << i ) );
+ }
+ }
+ };
+
+ class TwoIndex {
+ public:
+ TwoIndex() : ns_( testNs( this ) ) {
+ client_->ensureIndex( ns_, BSON( "_id" << 1 ), "my_id" );
+ }
+ void run() {
+ for( int i = 0; i < 100000; ++i )
+ client_->insert( ns_.c_str(), BSON( "_id" << i ) );
+ }
+ string ns_;
+ };
+
+ class TenIndex {
+ public:
+ TenIndex() : ns_( testNs( this ) ) {
+ const char *names = "aaaaaaaaa";
+ for( int i = 0; i < 9; ++i ) {
+ client_->resetIndexCache();
+ client_->ensureIndex( ns_.c_str(), BSON( "_id" << 1 ), false, names + i );
+ }
+ }
+ void run() {
+ for( int i = 0; i < 100000; ++i )
+ client_->insert( ns_.c_str(), BSON( "_id" << i ) );
+ }
+ string ns_;
+ };
+
+ class Capped {
+ public:
+ Capped() : ns_( testNs( this ) ) {
+ client_->createCollection( ns_.c_str(), 100000, true );
+ }
+ void run() {
+ for( int i = 0; i < 100000; ++i )
+ client_->insert( ns_.c_str(), BSON( "_id" << i ) );
+ }
+ string ns_;
+ };
+
+ class OneIndexReverse {
+ public:
+ OneIndexReverse() : ns_( testNs( this ) ) {
+ client_->ensureIndex( ns_, BSON( "_id" << 1 ) );
+ }
+ void run() {
+ for( int i = 0; i < 100000; ++i )
+ client_->insert( ns_.c_str(), BSON( "_id" << ( 100000 - 1 - i ) ) );
+ }
+ string ns_;
+ };
+
+ class OneIndexHighLow {
+ public:
+ OneIndexHighLow() : ns_( testNs( this ) ) {
+ client_->ensureIndex( ns_, BSON( "_id" << 1 ) );
+ }
+ void run() {
+ for( int i = 0; i < 100000; ++i ) {
+ int j = 50000 + ( ( i % 2 == 0 ) ? 1 : -1 ) * ( i / 2 + 1 );
+ client_->insert( ns_.c_str(), BSON( "_id" << j ) );
+ }
+ }
+ string ns_;
+ };
+
+ class All : public RunnerSuite {
+ public:
+ All() : RunnerSuite( "insert" ) {}
+
+ void setupTests() {
+ add< IdIndex >();
+ add< TwoIndex >();
+ add< TenIndex >();
+ add< Capped >();
+ add< OneIndexReverse >();
+ add< OneIndexHighLow >();
+ }
+ } all;
+} // namespace Insert
+
+namespace Update {
+ class Smaller {
+ public:
+ Smaller() : ns_( testNs( this ) ) {
+ for( int i = 0; i < 100000; ++i )
+ client_->insert( ns_.c_str(), BSON( "_id" << i << "b" << 2 ) );
+ }
+ void run() {
+ for( int i = 0; i < 100000; ++i )
+ client_->update( ns_.c_str(), QUERY( "_id" << i ), BSON( "_id" << i ) );
+ }
+ string ns_;
+ };
+
+ class Bigger {
+ public:
+ Bigger() : ns_( testNs( this ) ) {
+ for( int i = 0; i < 100000; ++i )
+ client_->insert( ns_.c_str(), BSON( "_id" << i ) );
+ }
+ void run() {
+ for( int i = 0; i < 100000; ++i )
+ client_->update( ns_.c_str(), QUERY( "_id" << i ), BSON( "_id" << i << "b" << 2 ) );
+ }
+ string ns_;
+ };
+
+ class Inc {
+ public:
+ Inc() : ns_( testNs( this ) ) {
+ for( int i = 0; i < 10000; ++i )
+ client_->insert( ns_.c_str(), BSON( "_id" << i << "i" << 0 ) );
+ }
+ void run() {
+ for( int j = 0; j < 10; ++j )
+ for( int i = 0; i < 10000; ++i )
+ client_->update( ns_.c_str(), QUERY( "_id" << i ), BSON( "$inc" << BSON( "i" << 1 ) ) );
+ }
+ string ns_;
+ };
+
+ class Set {
+ public:
+ Set() : ns_( testNs( this ) ) {
+ for( int i = 0; i < 10000; ++i )
+ client_->insert( ns_.c_str(), BSON( "_id" << i << "i" << 0 ) );
+ }
+ void run() {
+ for( int j = 1; j < 11; ++j )
+ for( int i = 0; i < 10000; ++i )
+ client_->update( ns_.c_str(), QUERY( "_id" << i ), BSON( "$set" << BSON( "i" << j ) ) );
+ }
+ string ns_;
+ };
+
+ class SetGrow {
+ public:
+ SetGrow() : ns_( testNs( this ) ) {
+ for( int i = 0; i < 10000; ++i )
+ client_->insert( ns_.c_str(), BSON( "_id" << i << "i" << "" ) );
+ }
+ void run() {
+ for( int j = 9; j > -1; --j )
+ for( int i = 0; i < 10000; ++i )
+ client_->update( ns_.c_str(), QUERY( "_id" << i ), BSON( "$set" << BSON( "i" << "aaaaaaaaaa"[j] ) ) );
+ }
+ string ns_;
+ };
+
+ class All : public RunnerSuite {
+ public:
+ All() : RunnerSuite( "update" ) {}
+ void setupTests() {
+ add< Smaller >();
+ add< Bigger >();
+ add< Inc >();
+ add< Set >();
+ add< SetGrow >();
+ }
+ } all;
+} // namespace Update
+
+namespace BSON {
+
+ const char *sample =
+ "{\"one\":2, \"two\":5, \"three\": {},"
+ "\"four\": { \"five\": { \"six\" : 11 } },"
+ "\"seven\": [ \"a\", \"bb\", \"ccc\", 5 ],"
+ "\"eight\": Dbref( \"rrr\", \"01234567890123456789aaaa\" ),"
+ "\"_id\": ObjectId( \"deadbeefdeadbeefdeadbeef\" ),"
+ "\"nine\": { \"$binary\": \"abc=\", \"$type\": \"02\" },"
+ "\"ten\": Date( 44 ), \"eleven\": /foooooo/i }";
+
+ const char *shopwikiSample =
+ "{ '_id' : '289780-80f85380b5c1d4a0ad75d1217673a4a2' , 'site_id' : 289780 , 'title'"
+ ": 'Jubilee - Margaret Walker' , 'image_url' : 'http://www.heartlanddigsandfinds.c"
+ "om/store/graphics/Product_Graphics/Product_8679.jpg' , 'url' : 'http://www.heartla"
+ "nddigsandfinds.com/store/store_product_detail.cfm?Product_ID=8679&Category_ID=2&Su"
+ "b_Category_ID=910' , 'url_hash' : 3450626119933116345 , 'last_update' : null , '"
+ "features' : { '$imagePrefetchDate' : '2008Aug30 22:39' , '$image.color.rgb' : '5a7"
+ "574' , 'Price' : '$10.99' , 'Description' : 'Author--s 1st Novel. A Houghton Miffl"
+ "in Literary Fellowship Award novel by the esteemed poet and novelist who has demon"
+ "strated a lifelong commitment to the heritage of black culture. An acclaimed story"
+ "of Vyry, a negro slave during the 19th Century, facing the biggest challenge of h"
+ "er lifetime - that of gaining her freedom, fighting for all the things she had nev"
+ "er known before. The author, great-granddaughter of Vyry, reveals what the Civil W"
+ "ar in America meant to the Negroes. Slavery W' , '$priceHistory-1' : '2008Dec03 $1"
+ "0.99' , 'Brand' : 'Walker' , '$brands_in_title' : 'Walker' , '--path' : '//HTML[1]"
+ "/BODY[1]/TABLE[1]/TR[1]/TD[1]/P[1]/TABLE[1]/TR[1]/TD[1]/TABLE[1]/TR[2]/TD[2]/TABLE"
+ "[1]/TR[1]/TD[1]/P[1]/TABLE[1]/TR[1]' , '~location' : 'en_US' , '$crawled' : '2009J"
+ "an11 03:22' , '$priceHistory-2' : '2008Nov15 $10.99' , '$priceHistory-0' : '2008De"
+ "c24 $10.99'}}";
+
+ class Parse {
+ public:
+ void run() {
+ for( int i = 0; i < 10000; ++i )
+ fromjson( sample );
+ }
+ };
+
+ class ShopwikiParse {
+ public:
+ void run() {
+ for( int i = 0; i < 10000; ++i )
+ fromjson( shopwikiSample );
+ }
+ };
+
+ class Json {
+ public:
+ Json() : o_( fromjson( sample ) ) {}
+ void run() {
+ for( int i = 0; i < 10000; ++i )
+ o_.jsonString();
+ }
+ BSONObj o_;
+ };
+
+ class ShopwikiJson {
+ public:
+ ShopwikiJson() : o_( fromjson( shopwikiSample ) ) {}
+ void run() {
+ for( int i = 0; i < 10000; ++i )
+ o_.jsonString();
+ }
+ BSONObj o_;
+ };
+
+ template <int LEN>
+ class Copy {
+ public:
+ Copy(){
+ // putting it in a subobject to force copy on getOwned
+ BSONObjBuilder outer;
+ BSONObjBuilder b (outer.subobjStart("inner"));
+ while (b.len() < LEN)
+ b.append(BSONObjBuilder::numStr(b.len()), b.len());
+ b.done();
+ _base = outer.obj();
+ }
+
+ void run() {
+ int iterations = 1000*1000;
+ while (iterations--){
+ BSONObj temp = copy(_base.firstElement().embeddedObject().getOwned());
+ }
+ }
+
+ private:
+ // noinline should force copying even when optimized
+ NOINLINE_DECL BSONObj copy(BSONObj x){
+ return x;
+ }
+
+ BSONObj _base;
+ };
+
+
+
+ class All : public RunnerSuite {
+ public:
+ All() : RunnerSuite( "bson" ) {}
+ void setupTests() {
+ add< Parse >();
+ add< ShopwikiParse >();
+ add< Json >();
+ add< ShopwikiJson >();
+ add< Copy<10> >();
+ add< Copy<100> >();
+ add< Copy<1000> >();
+ add< Copy<10*1000> >();
+ }
+ } all;
+
+} // namespace BSON
+
+namespace Index {
+
+ class Int {
+ public:
+ Int() : ns_( testNs( this ) ) {
+ for( int i = 0; i < 100000; ++i )
+ client_->insert( ns_.c_str(), BSON( "a" << i ) );
+ }
+ void run() {
+ client_->ensureIndex( ns_, BSON( "a" << 1 ) );
+ }
+ string ns_;
+ };
+
+ class ObjectId {
+ public:
+ ObjectId() : ns_( testNs( this ) ) {
+ OID id;
+ for( int i = 0; i < 100000; ++i ) {
+ id.init();
+ client_->insert( ns_.c_str(), BSON( "a" << id ) );
+ }
+ }
+ void run() {
+ client_->ensureIndex( ns_, BSON( "a" << 1 ) );
+ }
+ string ns_;
+ };
+
+ class String {
+ public:
+ String() : ns_( testNs( this ) ) {
+ for( int i = 0; i < 100000; ++i ) {
+ stringstream ss;
+ ss << i;
+ client_->insert( ns_.c_str(), BSON( "a" << ss.str() ) );
+ }
+ }
+ void run() {
+ client_->ensureIndex( ns_, BSON( "a" << 1 ) );
+ }
+ string ns_;
+ };
+
+ class Object {
+ public:
+ Object() : ns_( testNs( this ) ) {
+ for( int i = 0; i < 100000; ++i ) {
+ client_->insert( ns_.c_str(), BSON( "a" << BSON( "a" << i ) ) );
+ }
+ }
+ void run() {
+ client_->ensureIndex( ns_, BSON( "a" << 1 ) );
+ }
+ string ns_;
+ };
+
+ class All : public RunnerSuite {
+ public:
+ All() : RunnerSuite( "index" ) {}
+ void setupTests() {
+ add< Int >();
+ add< ObjectId >();
+ add< String >();
+ add< Object >();
+ }
+ } all;
+
+} // namespace Index
+
+namespace QueryTests {
+
+ class NoMatch {
+ public:
+ NoMatch() : ns_( testNs( this ) ) {
+ for( int i = 0; i < 100000; ++i )
+ client_->insert( ns_.c_str(), BSON( "_id" << i ) );
+ }
+ void run() {
+ client_->findOne( ns_.c_str(), QUERY( "_id" << 100000 ) );
+ }
+ string ns_;
+ };
+
+ class NoMatchIndex {
+ public:
+ NoMatchIndex() : ns_( testNs( this ) ) {
+ for( int i = 0; i < 100000; ++i )
+ client_->insert( ns_.c_str(), BSON( "_id" << i ) );
+ }
+ void run() {
+ client_->findOne( ns_.c_str(),
+ QUERY( "a" << "b" ).hint( BSON( "_id" << 1 ) ) );
+ }
+ string ns_;
+ };
+
+ class NoMatchLong {
+ public:
+ NoMatchLong() : ns_( testNs( this ) ) {
+ const char *names = "aaaaaaaaaa";
+ for( int i = 0; i < 100000; ++i ) {
+ BSONObjBuilder b;
+ for( int j = 0; j < 10; ++j )
+ b << ( names + j ) << i;
+ client_->insert( ns_.c_str(), b.obj() );
+ }
+ }
+ void run() {
+ client_->findOne( ns_.c_str(), QUERY( "a" << 100000 ) );
+ }
+ string ns_;
+ };
+
+ class SortOrdered {
+ public:
+ SortOrdered() : ns_( testNs( this ) ) {
+ for( int i = 0; i < 50000; ++i )
+ client_->insert( ns_.c_str(), BSON( "_id" << i ) );
+ }
+ void run() {
+ auto_ptr< DBClientCursor > c =
+ client_->query( ns_.c_str(), Query( BSONObj() ).sort( BSON( "_id" << 1 ) ) );
+ int i = 0;
+ for( ; c->more(); c->nextSafe(), ++i );
+ ASSERT_EQUALS( 50000, i );
+ }
+ string ns_;
+ };
+
+ class SortReverse {
+ public:
+ SortReverse() : ns_( testNs( this ) ) {
+ for( int i = 0; i < 50000; ++i )
+ client_->insert( ns_.c_str(), BSON( "_id" << ( 50000 - 1 - i ) ) );
+ }
+ void run() {
+ auto_ptr< DBClientCursor > c =
+ client_->query( ns_.c_str(), Query( BSONObj() ).sort( BSON( "_id" << 1 ) ) );
+ int i = 0;
+ for( ; c->more(); c->nextSafe(), ++i );
+ ASSERT_EQUALS( 50000, i );
+ }
+ string ns_;
+ };
+
+ class GetMore {
+ public:
+ GetMore() : ns_( testNs( this ) ) {
+ for( int i = 0; i < 100000; ++i )
+ client_->insert( ns_.c_str(), BSON( "a" << i ) );
+ c_ = client_->query( ns_.c_str(), Query() );
+ }
+ void run() {
+ int i = 0;
+ for( ; c_->more(); c_->nextSafe(), ++i );
+ ASSERT_EQUALS( 100000, i );
+ }
+ string ns_;
+ auto_ptr< DBClientCursor > c_;
+ };
+
+ class GetMoreIndex {
+ public:
+ GetMoreIndex() : ns_( testNs( this ) ) {
+ for( int i = 0; i < 100000; ++i )
+ client_->insert( ns_.c_str(), BSON( "a" << i ) );
+ client_->ensureIndex( ns_, BSON( "a" << 1 ) );
+ c_ = client_->query( ns_.c_str(), QUERY( "a" << GT << -1 ).hint( BSON( "a" << 1 ) ) );
+ }
+ void run() {
+ int i = 0;
+ for( ; c_->more(); c_->nextSafe(), ++i );
+ ASSERT_EQUALS( 100000, i );
+ }
+ string ns_;
+ auto_ptr< DBClientCursor > c_;
+ };
+
+ class GetMoreKeyMatchHelps {
+ public:
+ GetMoreKeyMatchHelps() : ns_( testNs( this ) ) {
+ for( int i = 0; i < 1000000; ++i )
+ client_->insert( ns_.c_str(), BSON( "a" << i << "b" << i % 10 << "c" << "d" ) );
+ client_->ensureIndex( ns_, BSON( "a" << 1 << "b" << 1 ) );
+ c_ = client_->query( ns_.c_str(), QUERY( "a" << GT << -1 << "b" << 0 ).hint( BSON( "a" << 1 << "b" << 1 ) ) );
+ }
+ void run() {
+ int i = 0;
+ for( ; c_->more(); c_->nextSafe(), ++i );
+ ASSERT_EQUALS( 100000, i );
+ }
+ string ns_;
+ auto_ptr< DBClientCursor > c_;
+ };
+
+ class All : public RunnerSuite {
+ public:
+ All() : RunnerSuite( "query" ) {}
+ void setupTests() {
+ add< NoMatch >();
+ add< NoMatchIndex >();
+ add< NoMatchLong >();
+ add< SortOrdered >();
+ add< SortReverse >();
+ add< GetMore >();
+ add< GetMoreIndex >();
+ add< GetMoreKeyMatchHelps >();
+ }
+ } all;
+
+} // namespace QueryTests
+
+namespace Count {
+
+ class Count {
+ public:
+ Count() : ns_( testNs( this ) ) {
+ BSONObj obj = BSON( "a" << 1 );
+ for( int i = 0; i < 100000; ++i )
+ client_->insert( ns_, obj );
+ }
+ void run() {
+ ASSERT_EQUALS( 100000U, client_->count( ns_, BSON( "a" << 1 ) ) );
+ }
+ string ns_;
+ };
+
+ class CountIndex {
+ public:
+ CountIndex() : ns_( testNs( this ) ) {
+ BSONObj obj = BSON( "a" << 1 );
+ for( int i = 0; i < 100000; ++i )
+ client_->insert( ns_, obj );
+ client_->ensureIndex( ns_, obj );
+ }
+ void run() {
+ // 'simple' match does not work for numbers
+ ASSERT_EQUALS( 100000U, client_->count( ns_, BSON( "a" << 1 ) ) );
+ }
+ string ns_;
+ };
+
+ class CountSimpleIndex {
+ public:
+ CountSimpleIndex() : ns_( testNs( this ) ) {
+ BSONObj obj = BSON( "a" << "b" );
+ for( int i = 0; i < 100000; ++i )
+ client_->insert( ns_, obj );
+ client_->ensureIndex( ns_, obj );
+ }
+ void run() {
+ ASSERT_EQUALS( 100000U, client_->count( ns_, BSON( "a" << "b" ) ) );
+ }
+ string ns_;
+ };
+
+ class All : public RunnerSuite {
+ public:
+ All() : RunnerSuite( "count" ) {}
+ void setupTests() {
+ add< Count >();
+ add< CountIndex >();
+ add< CountSimpleIndex >();
+ }
+ } all;
+
+} // namespace Count
+
+namespace Plan {
+
+ class Hint {
+ public:
+ Hint() : ns_( testNs( this ) ) {
+ const char *names = "aaaaaaaaa";
+ for( int i = 0; i < 9; ++i ) {
+ client_->resetIndexCache();
+ client_->ensureIndex( ns_.c_str(), BSON( ( names + i ) << 1 ), false, names + i );
+ }
+ lk_.reset( new dblock );
+ Client::Context ctx( ns_ );
+ hint_ = BSON( "hint" << BSON( "a" << 1 ) );
+ hintElt_ = hint_.firstElement();
+ }
+ void run() {
+ for( int i = 0; i < 10000; ++i )
+ MultiPlanScanner s( ns_.c_str(), BSONObj(), BSONObj(), &hintElt_ );
+ }
+ string ns_;
+ auto_ptr< dblock > lk_;
+ BSONObj hint_;
+ BSONElement hintElt_;
+ };
+
+ class Sort {
+ public:
+ Sort() : ns_( testNs( this ) ) {
+ const char *names = "aaaaaaaaaa";
+ for( int i = 0; i < 10; ++i ) {
+ client_->resetIndexCache();
+ client_->ensureIndex( ns_.c_str(), BSON( ( names + i ) << 1 ), false, names + i );
+ }
+ lk_.reset( new dblock );
+ }
+ void run() {
+ Client::Context ctx( ns_ );
+ for( int i = 0; i < 10000; ++i )
+ MultiPlanScanner s( ns_.c_str(), BSONObj(), BSON( "a" << 1 ) );
+ }
+ string ns_;
+ auto_ptr< dblock > lk_;
+ };
+
+ class Query {
+ public:
+ Query() : ns_( testNs( this ) ) {
+ const char *names = "aaaaaaaaaa";
+ for( int i = 0; i < 10; ++i ) {
+ client_->resetIndexCache();
+ client_->ensureIndex( ns_.c_str(), BSON( ( names + i ) << 1 ), false, names + i );
+ }
+ lk_.reset( new dblock );
+ }
+ void run() {
+ Client::Context ctx( ns_.c_str() );
+ for( int i = 0; i < 10000; ++i )
+ MultiPlanScanner s( ns_.c_str(), BSON( "a" << 1 ), BSONObj() );
+ }
+ string ns_;
+ auto_ptr< dblock > lk_;
+ };
+
+ class All : public RunnerSuite {
+ public:
+ All() : RunnerSuite("plan" ) {}
+ void setupTests() {
+ add< Hint >();
+ add< Sort >();
+ add< Query >();
+ }
+ } all;
+} // namespace Plan
+
+namespace Misc {
+ class TimeMicros64 {
+ public:
+ void run() {
+ int iterations = 1000*1000;
+ while(iterations--){
+ curTimeMicros64();
+ }
+ }
+ };
+
+ class JSTime {
+ public:
+ void run() {
+ int iterations = 1000*1000;
+ while(iterations--){
+ jsTime();
+ }
+ }
+ };
+
+ class All : public RunnerSuite {
+ public:
+ All() : RunnerSuite("misc") {}
+ void setupTests() {
+ add< TimeMicros64 >();
+ add< JSTime >();
+ }
+ } all;
+}
+
+int main( int argc, char **argv ) {
+ logLevel = -1;
+ client_ = new DBDirectClient();
+
+ return Suite::run(argc, argv, "/data/db/perftest");
+}
+
diff --git a/src/mongo/dbtests/perftests.cpp b/src/mongo/dbtests/perftests.cpp
new file mode 100644
index 00000000000..284e3991f15
--- /dev/null
+++ b/src/mongo/dbtests/perftests.cpp
@@ -0,0 +1,1029 @@
+/** @file perftests.cpp : unit tests relating to performance
+
+ The idea herein is tests that run fast and can be part of the normal CI suite, so no test herein takes
+ a long time to run. Obviously we need long-running tests too, but they will be separate.
+
+ These tests use DBDirectClient; they are a bit white-boxish.
+*/
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include <fstream>
+#include "../db/ops/query.h"
+#include "../db/db.h"
+#include "../db/instance.h"
+#include "../db/json.h"
+#include "../db/lasterror.h"
+#include "../db/ops/update.h"
+#include "../db/taskqueue.h"
+#include "../util/timer.h"
+#include "dbtests.h"
+#include "../db/dur_stats.h"
+#include "../util/checksum.h"
+#include "../util/version.h"
+#include "../db/key.h"
+#include "../util/compress.h"
+
+using namespace bson;
+
+namespace mongo {
+ namespace regression {
+ extern unsigned perfHist;
+ }
+}
+
+namespace PerfTests {
+
+ const bool profiling = false;
+
+ typedef DBDirectClient DBClientType;
+ //typedef DBClientConnection DBClientType;
+
+ class ClientBase {
+ public:
+ // NOTE: Not bothering to backup the old error record.
+ ClientBase() {
+ //_client.connect("localhost");
+ mongo::lastError.reset( new LastError() );
+ }
+ virtual ~ClientBase() {
+ //mongo::lastError.release();
+ }
+ protected:
+ static void insert( const char *ns, BSONObj o ) {
+ _client.insert( ns, o );
+ }
+ static void update( const char *ns, BSONObj q, BSONObj o, bool upsert = 0 ) {
+ _client.update( ns, Query( q ), o, upsert );
+ }
+ static bool error() {
+ return !_client.getPrevError().getField( "err" ).isNull();
+ }
+ DBClientBase &client() const { return _client; }
+ private:
+ static DBClientType _client;
+ };
+ DBClientType ClientBase::_client;
+
+ // todo: use a couple threads. not a very good test yet.
+ class TaskQueueTest {
+ static int tot;
+ struct V {
+ int val;
+ static void go(const V &v) { tot += v.val; }
+ };
+ public:
+ void run() {
+ tot = 0;
+ TaskQueue<V> d;
+ int x = 0;
+ for( int i = 0; i < 100; i++ ) {
+ if( i % 30 == 0 )
+ d.invoke();
+
+ x += i;
+ writelock lk;
+ V v;
+ v.val = i;
+ d.defer(v);
+ }
+ d.invoke();
+ assert( x == tot );
+ }
+ };
+ int TaskQueueTest::tot;
+
+ class B : public ClientBase {
+ string _ns;
+ protected:
+ const char *ns() { return _ns.c_str(); }
+
+ // anything you want to do before being timed
+ virtual void prep() { }
+
+ virtual void timed() = 0;
+
+ // optional 2nd test phase to be timed separately
+ // return name of it
+ virtual string timed2(DBClientBase&) { return ""; }
+
+ virtual void post() { }
+
+ virtual string name() = 0;
+
+ // how long to run test. 0 is a sentinel which means just run the timed() method once and time it.
+ virtual int howLongMillis() { return profiling ? 60000 : 5000; }
+
+ /* override if your test output doesn't need that */
+ virtual bool showDurStats() { return true; }
+
+ static boost::shared_ptr<DBClientConnection> conn;
+ static string _perfhostname;
+ static unsigned once;
+
+ public:
+ /* if you want recording of the timings, place the password for the perf database
+ in ./../settings.py:
+ pstatspassword="<pwd>"
+ */
+ void connect() {
+ if( once )
+ return;
+ ++once;
+
+ // no writing to perf db if _DEBUG
+ DEV return;
+
+ const char *fn = "../../settings.py";
+ if( !exists(fn) ) {
+ if( exists("settings.py") )
+ fn = "settings.py";
+ else {
+ cout << "no ../../settings.py or ./settings.py file found. will not write perf stats to pstats db." << endl;
+ cout << "it is recommended this be enabled even on dev boxes" << endl;
+ return;
+ }
+ }
+
+ try {
+ if( conn == 0 ) {
+ MemoryMappedFile f;
+ const char *p = (const char *) f.mapWithOptions(fn, MongoFile::READONLY);
+ string pwd;
+
+ {
+ const char *q = str::after(p, "pstatspassword=\"");
+ if( *q == 0 ) {
+ cout << "info perftests.cpp: no pstatspassword= in settings.py" << endl;
+ return;
+ }
+ else {
+ pwd = str::before(q, '\"');
+ }
+ }
+
+ boost::shared_ptr<DBClientConnection> c(new DBClientConnection(false, 0, 60));
+ string err;
+ if( c->connect("perfdb.10gen.cc", err) ) {
+ if( !c->auth("perf", "perf", pwd, err) ) {
+ cout << "info: authentication with stats db failed: " << err << endl;
+ assert(false);
+ }
+ conn = c;
+
+ // override the hostname with the buildbot hostname, if present
+ ifstream hostf( "../../info/host" );
+ if ( hostf.good() ) {
+ char buf[1024];
+ hostf.getline(buf, sizeof(buf));
+ _perfhostname = buf;
+ }
+ else {
+ _perfhostname = getHostName();
+ }
+ }
+ else {
+ cout << err << " (to log perfstats)" << endl;
+ }
+ }
+ }
+ catch(...) { }
+ }
+
+ virtual unsigned batchSize() { return 50; }
+
+ void say(unsigned long long n, int ms, string s) {
+ unsigned long long rps = n*1000/ms;
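+ // requests per second; assumes ms > 0, i.e. the timed run lasted at least 1ms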
+ cout << "stats " << setw(33) << left << s << ' ' << right << setw(9) << rps << ' ' << right << setw(5) << ms << "ms ";
+ if( showDurStats() )
+ cout << dur::stats.curr->_asCSV();
+ cout << endl;
+
+ connect();
+
+ if( conn && !conn->isFailed() ) {
+ const char *ns = "perf.pstats";
+ if( perfHist ) {
+ static bool needver = true;
+ try {
+ // try to report rps from last time
+ Query q;
+ {
+ BSONObjBuilder b;
+ b.append("host",_perfhostname).append("test",s).append("dur",cmdLine.dur);
+ DEV { b.append("info.DEBUG",true); }
+ else b.appendNull("info.DEBUG");
+ if( sizeof(int*) == 4 )
+ b.append("info.bits", 32);
+ else
+ b.appendNull("info.bits");
+ q = Query(b.obj()).sort("when",-1);
+ }
+ BSONObj fields = BSON( "rps" << 1 << "info" << 1 );
+ vector<BSONObj> v;
+ conn->findN(v, ns, q, perfHist, 0, &fields);
+ for( vector<BSONObj>::iterator i = v.begin(); i != v.end(); i++ ) {
+ BSONObj o = *i;
+ double lastrps = o["rps"].Number();
+ if( lastrps ) {
+ cout << "stats " << setw(33) << right << "new/old:" << ' ' << setw(9);
+ cout << fixed << setprecision(2) << rps / lastrps;
+ if( needver ) {
+ cout << " " << o.getFieldDotted("info.git").toString();
+ }
+ cout << '\n';
+ }
+ }
+ } catch(...) { }
+ cout.flush();
+ needver = false;
+ }
+ {
+ bob b;
+ b.append("host", _perfhostname);
+ b.appendTimeT("when", time(0));
+ b.append("test", s);
+ b.append("rps", (int) rps);
+ b.append("millis", ms);
+ b.appendBool("dur", cmdLine.dur);
+ if( showDurStats() && cmdLine.dur )
+ b.append("durStats", dur::stats.curr->_asObj());
+ {
+ bob inf;
+ inf.append("version", versionString);
+ if( sizeof(int*) == 4 ) inf.append("bits", 32);
+ DEV inf.append("DEBUG", true);
+#if defined(_WIN32)
+ inf.append("os", "win");
+#endif
+ inf.append("git", gitVersion());
+ inf.append("boost", BOOST_VERSION);
+ b.append("info", inf.obj());
+ }
+ BSONObj o = b.obj();
+ //cout << "inserting " << o.toString() << endl;
+ try {
+ conn->insert(ns, o);
+ }
+ catch ( std::exception& e ) {
+ warning() << "couldn't save perf results: " << e.what() << endl;
+ }
+ }
+ }
+ }
+
+ virtual bool testThreaded() { return false; }
+
+ unsigned long long n;
+
+ void run() {
+ _ns = string("perftest.") + name();
+ client().dropCollection(ns());
+
+ prep();
+
+ int hlm = howLongMillis();
+ DEV {
+ // don't run very long with _DEBUG - not very meaningful anyway on that build
+ hlm = min(hlm, 500);
+ }
+
+ dur::stats._intervalMicros = 0; // no auto rotate
+ dur::stats.curr->reset();
+ mongo::Timer t;
+ n = 0;
+ const unsigned Batch = batchSize();
+
+ if( hlm == 0 ) {
+ // means just do once
+ timed();
+ }
+ else {
+ do {
+ unsigned i;
+ for( i = 0; i < Batch; i++ )
+ timed();
+ n += i;
+ } while( t.micros() < (unsigned) hlm * 1000 );
+ }
+
+ client().getLastError(); // block until all ops are finished
+ int ms = t.millis();
+
+ say(n, ms, name());
+
+ post();
+
+ string test2name = timed2(client());
+ {
+ if( test2name.size() != 0 ) {
+ dur::stats.curr->reset();
+ mongo::Timer t;
+ unsigned long long n = 0;
+ while( 1 ) {
+ unsigned i;
+ for( i = 0; i < Batch; i++ )
+ timed2(client());
+ n += i;
+ if( t.millis() > hlm )
+ break;
+ }
+ int ms = t.millis();
+ say(n, ms, test2name);
+ }
+ }
+
+ if( testThreaded() ) {
+ cout << "testThreaded" << endl;
+ mongo::Timer t;
+ launchThreads(8);
+ //cout << "threaded done " << t.millis() << "ms" << endl;
+ //cout << n * 1000 / t.millis() << " per second" << endl;
+ say(n, t.millis(), test2name+"-threaded");
+
+ }
+ }
+
+ void thread() {
+ DBClientType c;
+ Client::initThreadIfNotAlready("perftestthr");
+ for( unsigned long long i = 0; i < n/8; i++ ) {
+ timed2(c);
+ }
+ cc().shutdown();
+ }
+
+ void launchThreads(int remaining) {
+ if (!remaining)
+ return;
+ boost::thread athread(boost::bind(&B::thread, this));
+ launchThreads(remaining - 1);
+ athread.join();
+ }
+ };
+
+ boost::shared_ptr<DBClientConnection> B::conn;
+ string B::_perfhostname;
+ unsigned B::once;
+
+ unsigned dontOptimizeOutHopefully;
+
+ class NonDurTest : public B {
+ public:
+ virtual int howLongMillis() { return 3000; }
+ virtual bool showDurStats() { return false; }
+ };
+
+ class BSONIter : public NonDurTest {
+ public:
+ int n;
+ bo b, sub;
+ string name() { return "BSONIter"; }
+ BSONIter() {
+ n = 0;
+ bo sub = bob().appendTimeT("t", time(0)).appendBool("abool", true).appendBinData("somebin", 3, BinDataGeneral, "abc").appendNull("anullone").obj();
+ b = BSON( "_id" << OID() << "x" << 3 << "yaaaaaa" << 3.00009 << "zz" << 1 << "q" << false << "obj" << sub << "zzzzzzz" << "a string a string" );
+ }
+ void timed() {
+ for( bo::iterator i = b.begin(); i.more(); )
+ if( i.next().fieldName() )
+ n++;
+ for( bo::iterator i = sub.begin(); i.more(); )
+ if( i.next().fieldName() )
+ n++;
+ }
+ };
+
+ class BSONGetFields1 : public NonDurTest {
+ public:
+ int n;
+ bo b, sub;
+ string name() { return "BSONGetFields1By1"; }
+ BSONGetFields1() {
+ n = 0;
+ bo sub = bob().appendTimeT("t", time(0)).appendBool("abool", true).appendBinData("somebin", 3, BinDataGeneral, "abc").appendNull("anullone").obj();
+ b = BSON( "_id" << OID() << "x" << 3 << "yaaaaaa" << 3.00009 << "zz" << 1 << "q" << false << "obj" << sub << "zzzzzzz" << "a string a string" );
+ }
+ void timed() {
+ if( b["x"].eoo() )
+ n++;
+ if( b["q"].eoo() )
+ n++;
+ if( b["zzz"].eoo() )
+ n++;
+ }
+ };
+
+ class BSONGetFields2 : public BSONGetFields1 {
+ public:
+ string name() { return "BSONGetFields"; }
+ void timed() {
+ static const char *names[] = { "x", "q", "zzz" };
+ BSONElement elements[3];
+ b.getFields(3, names, elements);
+ if( elements[0].eoo() )
+ n++;
+ if( elements[1].eoo() )
+ n++;
+ if( elements[2].eoo() )
+ n++;
+ }
+ };
+
+ class KeyTest : public B {
+ public:
+ KeyV1Owned a,b,c;
+ string name() { return "Key-woequal"; }
+ virtual int howLongMillis() { return 3000; }
+ KeyTest() :
+ a(BSON("a"<<1<<"b"<<3.0<<"c"<<"qqq")),
+ b(BSON("a"<<1<<"b"<<3.0<<"c"<<"qqq")),
+ c(BSON("a"<<1<<"b"<<3.0<<"c"<<"qqqb"))
+ {}
+ virtual bool showDurStats() { return false; }
+ void timed() {
+ assert( a.woEqual(b) );
+ assert( !a.woEqual(c) );
+ }
+ };
+
+ unsigned long long aaa;
+
+ class Timer : public B {
+ public:
+ string name() { return "Timer"; }
+ virtual int howLongMillis() { return 1000; }
+ virtual bool showDurStats() { return false; }
+ void timed() {
+ mongo::Timer t;
+ aaa += t.millis();
+ }
+ };
+
+ class Sleep0Ms : public B {
+ public:
+ string name() { return "Sleep0Ms"; }
+ virtual int howLongMillis() { return 400; }
+ virtual bool showDurStats() { return false; }
+ void timed() {
+ sleepmillis(0);
+ mongo::Timer t;
+ aaa++;
+ }
+ };
+
+ RWLock lk("testrw");
+ SimpleMutex m("simptst");
+ mongo::mutex mtest("mtest");
+ SpinLock s;
+
+ class mutexspeed : public B {
+ public:
+ string name() { return "mutex"; }
+ virtual int howLongMillis() { return 500; }
+ virtual bool showDurStats() { return false; }
+ void timed() {
+ mongo::mutex::scoped_lock lk(mtest);
+ }
+ };
+ class simplemutexspeed : public B {
+ public:
+ string name() { return "simplemutex"; }
+ virtual int howLongMillis() { return 500; }
+ virtual bool showDurStats() { return false; }
+ void timed() {
+ SimpleMutex::scoped_lock lk(m);
+ }
+ };
+ class spinlockspeed : public B {
+ public:
+ string name() { return "spinlock"; }
+ virtual int howLongMillis() { return 500; }
+ virtual bool showDurStats() { return false; }
+ void timed() {
+ mongo::scoped_spinlock lk(s);
+ }
+ };
+ int cas;
+ class casspeed : public B {
+ public:
+ string name() { return "compareandswap"; }
+ virtual int howLongMillis() { return 500; }
+ virtual bool showDurStats() { return false; }
+ void timed() {
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
+#define RUNCOMPARESWAP 1
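+ // RUNCOMPARESWAP, defined only when the gcc builtin is available, also gates
+ // the add< casspeed >() registration in setupTests() below.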
+ __sync_bool_compare_and_swap(&cas, 0, 0);
+#endif
+ }
+ };
+ class rlock : public B {
+ public:
+ string name() { return "rlock"; }
+ virtual int howLongMillis() { return 500; }
+ virtual bool showDurStats() { return false; }
+ void timed() {
+ lk.lock_shared();
+ lk.unlock_shared();
+ }
+ };
+ class wlock : public B {
+ public:
+ string name() { return "wlock"; }
+ virtual int howLongMillis() { return 500; }
+ virtual bool showDurStats() { return false; }
+ void timed() {
+ lk.lock();
+ lk.unlock();
+ }
+ };
+
+#if 0
+ class ulock : public B {
+ public:
+ string name() { return "ulock"; }
+ virtual int howLongMillis() { return 500; }
+ virtual bool showDurStats() { return false; }
+ void timed() {
+ lk.lockAsUpgradable();
+ lk.unlockFromUpgradable();
+ }
+ };
+#endif
+
+ class CTM : public B {
+ public:
+ CTM() : last(0), delts(0), n(0) { }
+ string name() { return "curTimeMillis64"; }
+ virtual int howLongMillis() { return 500; }
+ virtual bool showDurStats() { return false; }
+ unsigned long long last;
+ unsigned long long delts;
+ unsigned n;
+ void timed() {
+ unsigned long long x = curTimeMillis64();
+ aaa += x;
+ if( last ) {
+ unsigned long long delt = x-last;
+ if( delt ) {
+ delts += delt;
+ n++;
+ }
+ }
+ last = x;
+ }
+ void post() {
+ // we need to know if the timer granularity is coarse - that could be relevant in some places
+ if( n )
+ cout << " avg timer granularity: " << ((double)delts)/n << "ms " << endl;
+ }
+ };
+
+ class Bldr : public B {
+ public:
+ int n;
+ string name() { return "BufBuilder"; }
+ Bldr() {
+ }
+ virtual int howLongMillis() { return 3000; }
+ virtual bool showDurStats() { return false; }
+ void timed() {
+ BufBuilder b;
+ b.appendNum(3);
+ b.appendUChar(' ');
+ b.appendStr("abcd");
+ n += b.len();
+ }
+ };
+
+ class StkBldr : public B {
+ public:
+ virtual int howLongMillis() { return 3000; }
+ int n;
+ string name() { return "StackBufBuilder"; }
+ virtual bool showDurStats() { return false; }
+ void timed() {
+ StackBufBuilder b;
+ b.appendNum(3);
+ b.appendUChar(' ');
+ b.appendStr("abcd");
+ n += b.len();
+ }
+ };
+
+ // if a test is this fast, it was optimized out
+ class Dummy : public B {
+ public:
+ Dummy() { }
+ virtual int howLongMillis() { return 3000; }
+ string name() { return "dummy"; }
+ void timed() {
+ dontOptimizeOutHopefully++;
+ }
+ virtual bool showDurStats() { return false; }
+ };
+
+ // test thread local speed
+#if defined(_WIN32)
+ __declspec( thread ) int x;
+ class TLS2 : public B {
+ public:
+ virtual int howLongMillis() { return 3000; }
+ string name() { return "thread-local-storage2"; }
+ void timed() {
+ if( x )
+ dontOptimizeOutHopefully++;
+ }
+ virtual bool showDurStats() { return false; }
+ };
+#endif
+
+ // test thread local speed
+ class TLS : public B {
+ public:
+ virtual int howLongMillis() { return 3000; }
+ string name() { return "thread-local-storage"; }
+ void timed() {
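+ // cc() performs a thread-local lookup; testing its address uses the result
+ // without copying and keeps the call from being optimized away.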
+ if( &cc() )
+ dontOptimizeOutHopefully++;
+ }
+ virtual bool showDurStats() { return false; }
+ };
+
+ bool dummy1 = false;
+
+ class TestException : public DBException {
+ public:
+ TestException() : DBException("testexception",3) { }
+ };
+
+ void foo_throws() {
+ if( dontOptimizeOutHopefully ) {
+ throw TestException();
+ }
+ log() << "hmmm" << endl;
+ }
+
+ class Throw : public B {
+ public:
+ virtual int howLongMillis() { return 2000; }
+ string name() { return "throw"; }
+ void timed() {
+ try {
+ foo_throws();
+ dontOptimizeOutHopefully += 2;
+ }
+ catch(DBException& e) {
+ e.getCode();
+ dontOptimizeOutHopefully++;
+ }
+ }
+ virtual bool showDurStats() { return false; }
+ };
+
+ class New128 : public B {
+ public:
+ virtual int howLongMillis() { return 2000; }
+ string name() { return "new128"; }
+ void timed() {
+ char *p = new char[128];
+ if( dontOptimizeOutHopefully++ > 0 )
+ delete p;
+ }
+ virtual bool showDurStats() { return false; }
+ };
+
+ class New8 : public B {
+ public:
+ virtual int howLongMillis() { return 2000; }
+ string name() { return "new8"; }
+ void timed() {
+ char *p = new char[8];
+ if( dontOptimizeOutHopefully++ > 0 )
+ delete p;
+ }
+ virtual bool showDurStats() { return false; }
+ };
+
+ class Compress : public B {
+ public:
+ const unsigned sz;
+ void *p;
+ Compress() : sz(1024*1024*100+3) { }
+ virtual unsigned batchSize() { return 1; }
+ string name() { return "compress"; }
+ virtual bool showDurStats() { return false; }
+ virtual int howLongMillis() { return 4000; }
+ void prep() {
+ p = malloc(sz);
+ // this isn't a fair test as the data is mostly random, but we just want a rough perf check
+ static int last;
+ for (unsigned i = 0; i<sz; i++) {
+ int r = rand();
+ if( (r & 0x300) == 0x300 )
+ r = last;
+ ((char*)p)[i] = r;
+ last = r;
+ }
+ }
+ size_t last;
+ string res;
+ void timed() {
+ mongo::Timer t;
+ string out;
+ size_t len = compress((const char *) p, sz, &out);
+ bool ok = uncompress(out.c_str(), out.size(), &res);
+ ASSERT(ok);
+ static unsigned once;
+ if( once++ == 0 )
+ cout << "compress round trip " << sz/(1024.0*1024) / (t.millis()/1000.0) << "MB/sec\n";
+ //cout << len / (1024.0/1024) << " compressed" << endl;
+ (void)len; //fix unused error while above line is commented out
+ }
+ void post() {
+ ASSERT( memcmp(res.c_str(), p, sz) == 0 );
+ free(p);
+ }
+ };
+
+ // test speed of checksum method
+ class ChecksumTest : public B {
+ public:
+ const unsigned sz;
+ ChecksumTest() : sz(1024*1024*100+3) { }
+ string name() { return "checksum"; }
+ virtual int howLongMillis() { return 2000; }
+ virtual bool showDurStats() { return false; }
+ virtual unsigned batchSize() { return 1; }
+
+ void *p;
+
+ void prep() {
+ {
+ // the checksum code assumes 'standard' rollover on addition overflows. let's check that:
+ unsigned long long x = 0xffffffffffffffffULL;
+ ASSERT( x+2 == 1 );
+ }
+
+ p = malloc(sz);
+ for (unsigned i = 0; i<sz; i++)
+ ((char*)p)[i] = rand();
+ }
+
+ Checksum last;
+
+ void timed() {
+ static int i;
+ Checksum c;
+ c.gen(p, sz);
+ if( i == 0 )
+ last = c;
+ else if( i == 1 ) {
+ ASSERT( c == last );
+ }
+ }
+ void post() {
+ {
+ mongo::Checksum c;
+ c.gen(p, sz-1);
+ ASSERT( c != last );
+ ((char *&)p)[0]++; // check same data, different order, doesn't give same checksum
+ ((char *&)p)[1]--;
+ c.gen(p, sz);
+ ASSERT( c != last );
+ ((char *&)p)[1]++; // check same data, different order, doesn't give same checksum (different longwords case)
+ ((char *&)p)[8]--;
+ c.gen(p, sz);
+ ASSERT( c != last );
+ }
+ free(p);
+ }
+ };
+
+ class InsertDup : public B {
+ const BSONObj o;
+ public:
+ InsertDup() : o( BSON("_id" << 1) ) { } // dup keys
+ string name() {
+ return "insert-duplicate-_ids";
+ }
+ void prep() {
+ client().insert( ns(), o );
+ }
+ void timed() {
+ client().insert( ns(), o );
+ }
+ void post() {
+ assert( client().count(ns()) == 1 );
+ }
+ };
+
+ class Insert1 : public B {
+ const BSONObj x;
+ OID oid;
+ BSONObj query;
+ public:
+ virtual int howLongMillis() { return 30000; }
+ Insert1() : x( BSON("x" << 99) ) {
+ oid.init();
+ query = BSON("_id" << oid);
+ i = 0;
+ }
+ string name() { return "insert-simple"; }
+ unsigned i;
+ void timed() {
+ BSONObj o = BSON( "_id" << i++ << "x" << 99 );
+ client().insert( ns(), o );
+ //client().insert( ns(), x );
+ }
+ virtual bool testThreaded() { return true; }
+ string timed2(DBClientBase& c) {
+ Query q = QUERY( "_id" << (unsigned) Security::getNonce() % i );
+ c.findOne(ns(), q);
+ //client().findOne(ns(), query);
+ return "findOne_by_id";
+ }
+ void post() {
+#if !defined(_DEBUG)
+ assert( client().count(ns()) > 50 );
+#endif
+ }
+ };
+
+ class InsertBig : public B {
+ BSONObj x;
+ virtual int howLongMillis() {
+ if( sizeof(void*) == 4 )
+ return 1000; // could exceed mmapping if run too long, as this function adds a lot of data fast
+ return 5000;
+ }
+ public:
+ InsertBig() {
+ char buf[200000];
+ BSONObjBuilder b;
+ b.append("x", 99);
+ b.appendBinData("bin", 200000, (BinDataType) 129, buf);
+ x = b.obj();
+ }
+ string name() { return "insert-big"; }
+ void timed() {
+ client().insert( ns(), x );
+ }
+ };
+
+ class InsertRandom : public B {
+ public:
+ virtual int howLongMillis() { return profiling ? 30000 : 5000; }
+ string name() { return "random-inserts"; }
+ void prep() {
+ client().insert( ns(), BSONObj() );
+ client().ensureIndex(ns(), BSON("x"<<1));
+ }
+ void timed() {
+ int x = rand();
+ BSONObj y = BSON("x" << x << "y" << rand() << "z" << 33);
+ client().insert(ns(), y);
+ }
+ };
+
+ /** upserts about 32k records and then keeps updating them
+ 2 indexes
+ */
+ class Update1 : public B {
+ public:
+ static int rand() {
+ return std::rand() & 0x7fff;
+ }
+ virtual string name() { return "random-upserts"; }
+ void prep() {
+ client().insert( ns(), BSONObj() );
+ client().ensureIndex(ns(), BSON("x"<<1));
+ }
+ void timed() {
+ int x = rand();
+ BSONObj q = BSON("x" << x);
+ BSONObj y = BSON("x" << x << "y" << rand() << "z" << 33);
+ client().update(ns(), q, y, /*upsert*/true);
+ }
+
+ virtual string timed2(DBClientBase& c) {
+ static BSONObj I = BSON( "$inc" << BSON( "y" << 1 ) );
+
+ // test some $inc's
+
+ int x = rand();
+ BSONObj q = BSON("x" << x);
+ c.update(ns(), q, I);
+
+ return name()+"-inc";
+ }
+ };
+
+ template <typename T>
+ class MoreIndexes : public T {
+ public:
+ string name() { return T::name() + "-more-indexes"; }
+ void prep() {
+ T::prep();
+ this->client().ensureIndex(this->ns(), BSON("y"<<1));
+ this->client().ensureIndex(this->ns(), BSON("z"<<1));
+ }
+ };
+
+ void t() {
+ for( int i = 0; i < 20; i++ ) {
+ sleepmillis(21);
+ string fn = "/tmp/t1";
+ MongoMMF f;
+ unsigned long long len = 1 * 1024 * 1024;
+ assert( f.create(fn, len, /*sequential*/rand()%2==0) );
+ {
+ char *p = (char *) f.getView();
+ assert(p);
+ // write something to the private view as a test
+ strcpy(p, "hello");
+ }
+ if( cmdLine.dur ) {
+ char *w = (char *) f.view_write();
+ strcpy(w + 6, "world");
+ }
+ MongoFileFinder ff;
+ ASSERT( ff.findByPath(fn) );
+ }
+ }
+
+ class All : public Suite {
+ public:
+ All() : Suite( "perf" ) { }
+
+ Result * run( const string& filter ) {
+ boost::thread a(t);
+ Result * res = Suite::run(filter);
+ a.join();
+ return res;
+ }
+
+ void setupTests() {
+ cout
+ << "stats test rps------ time-- "
+ << dur::stats.curr->_CSVHeader() << endl;
+ if( profiling ) {
+ add< New8 >();
+ add< New128 >();
+ }
+ else {
+ add< Dummy >();
+ add< ChecksumTest >();
+ add< Compress >();
+ add< TLS >();
+#if defined(_WIN32)
+ add< TLS2 >();
+#endif
+ add< New8 >();
+ add< New128 >();
+ add< Throw >();
+ add< Timer >();
+ add< Sleep0Ms >();
+ add< rlock >();
+ add< wlock >();
+ //add< ulock >();
+ add< mutexspeed >();
+ add< simplemutexspeed >();
+ add< spinlockspeed >();
+#ifdef RUNCOMPARESWAP
+ add< casspeed >();
+#endif
+ add< CTM >();
+ add< KeyTest >();
+ add< Bldr >();
+ add< StkBldr >();
+ add< BSONIter >();
+ add< BSONGetFields1 >();
+ add< BSONGetFields2 >();
+ add< TaskQueueTest >();
+ add< InsertDup >();
+ add< Insert1 >();
+ add< InsertRandom >();
+ add< MoreIndexes<InsertRandom> >();
+ add< Update1 >();
+ add< MoreIndexes<Update1> >();
+ add< InsertBig >();
+ }
+ }
+ } myall;
+}
diff --git a/src/mongo/dbtests/queryoptimizercursortests.cpp b/src/mongo/dbtests/queryoptimizercursortests.cpp
new file mode 100644
index 00000000000..2d5590db3b7
--- /dev/null
+++ b/src/mongo/dbtests/queryoptimizercursortests.cpp
@@ -0,0 +1,2521 @@
+// queryoptimizercursortests.cpp : query optimizer cursor unit tests
+//
+
+/**
+ * Copyright (C) 2009 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../db/queryoptimizer.h"
+#include "../db/queryoptimizercursor.h"
+#include "../db/instance.h"
+#include "../db/ops/delete.h"
+#include "dbtests.h"
+
+namespace mongo {
+ void __forceLinkGeoPlugin();
+ shared_ptr<Cursor> newQueryOptimizerCursor( const char *ns, const BSONObj &query, const BSONObj &order = BSONObj(), bool requireIndex = false );
+} // namespace mongo
+
+namespace QueryOptimizerCursorTests {
+
+ void dropCollection( const char *ns ) {
+ string errmsg;
+ BSONObjBuilder result;
+ dropCollection( ns, errmsg, result );
+ }
+
+ using boost::shared_ptr;
+
+ class CachedMatchCounterCount {
+ public:
+ void run() {
+ long long aggregateNscanned = 0;
+ CachedMatchCounter c( aggregateNscanned, 0 );
+ ASSERT_EQUALS( 0, c.count() );
+ ASSERT_EQUALS( 0, c.cumulativeCount() );
+
+ c.resetMatch();
+ ASSERT( !c.knowMatch() );
+
+ c.setMatch( false );
+ ASSERT( c.knowMatch() );
+
+ c.countMatch( DiskLoc() );
+ ASSERT_EQUALS( 0, c.count() );
+ ASSERT_EQUALS( 0, c.cumulativeCount() );
+
+ c.resetMatch();
+ ASSERT( !c.knowMatch() );
+
+ c.setMatch( true );
+ ASSERT( c.knowMatch() );
+
+ c.countMatch( DiskLoc() );
+ ASSERT_EQUALS( 1, c.count() );
+ ASSERT_EQUALS( 1, c.cumulativeCount() );
+
+ // Don't count the same match twice, without checking the document location.
+ c.countMatch( DiskLoc( 1, 1 ) );
+ ASSERT_EQUALS( 1, c.count() );
+ ASSERT_EQUALS( 1, c.cumulativeCount() );
+
+ // Reset and count another match.
+ c.resetMatch();
+ c.setMatch( true );
+ c.countMatch( DiskLoc( 1, 1 ) );
+ ASSERT_EQUALS( 2, c.count() );
+ ASSERT_EQUALS( 2, c.cumulativeCount() );
+ }
+ };
+
+ class CachedMatchCounterAccumulate {
+ public:
+ void run() {
+ long long aggregateNscanned = 0;
+ CachedMatchCounter c( aggregateNscanned, 10 );
+ ASSERT_EQUALS( 0, c.count() );
+ ASSERT_EQUALS( 10, c.cumulativeCount() );
+
+ c.setMatch( true );
+ c.countMatch( DiskLoc() );
+ ASSERT_EQUALS( 1, c.count() );
+ ASSERT_EQUALS( 11, c.cumulativeCount() );
+ }
+ };
+
+ class CachedMatchCounterDedup {
+ public:
+ void run() {
+ long long aggregateNscanned = 0;
+ CachedMatchCounter c( aggregateNscanned, 0 );
+
+ c.setCheckDups( true );
+ c.setMatch( true );
+ c.countMatch( DiskLoc() );
+ ASSERT_EQUALS( 1, c.count() );
+
+ c.resetMatch();
+ c.setMatch( true );
+ c.countMatch( DiskLoc() );
+ ASSERT_EQUALS( 1, c.count() );
+ }
+ };
+
+ class CachedMatchCounterNscanned {
+ public:
+ void run() {
+ long long aggregateNscanned = 5;
+ CachedMatchCounter c( aggregateNscanned, 0 );
+ ASSERT_EQUALS( 0, c.nscanned() );
+ ASSERT_EQUALS( 5, c.aggregateNscanned() );
+
+ c.updateNscanned( 4 );
+ ASSERT_EQUALS( 4, c.nscanned() );
+ ASSERT_EQUALS( 9, c.aggregateNscanned() );
+ }
+ };
+
+ class SmallDupSetUpgrade {
+ public:
+ void run() {
+ SmallDupSet d;
+ for( int i = 0; i < 100; ++i ) {
+ ASSERT( !d.getsetdup( DiskLoc( 0, i ) ) );
+ for( int j = 0; j <= i; ++j ) {
+ ASSERT( d.getdup( DiskLoc( 0, j ) ) );
+ }
+ }
+ }
+ };
+
+ class SmallDupSetUpgradeRead {
+ public:
+ void run() {
+ SmallDupSet d;
+ d.getsetdup( DiskLoc( 0, 0 ) );
+ for( int i = 0; i < 550; ++i ) {
+ ASSERT( d.getdup( DiskLoc( 0, 0 ) ) );
+ }
+ ASSERT( d.getsetdup( DiskLoc( 0, 0 ) ) );
+ }
+ };
+
+ class SmallDupSetUpgradeWrite {
+ public:
+ void run() {
+ SmallDupSet d;
+ for( int i = 0; i < 550; ++i ) {
+ ASSERT( !d.getsetdup( DiskLoc( 0, i ) ) );
+ }
+ for( int i = 0; i < 550; ++i ) {
+ ASSERT( d.getsetdup( DiskLoc( 0, i ) ) );
+ }
+ }
+ };
+
+ class Base {
+ public:
+ Base() {
+ dblock lk;
+ Client::Context ctx( ns() );
+ string err;
+ userCreateNS( ns(), BSONObj(), err, false );
+ dropCollection( ns() );
+ }
+ ~Base() {
+ cc().curop()->reset();
+ }
+ protected:
+ DBDirectClient _cli;
+ static const char *ns() { return "unittests.QueryOptimizerTests"; }
+ void setQueryOptimizerCursor( const BSONObj &query, const BSONObj &order = BSONObj() ) {
+ setQueryOptimizerCursorWithoutAdvancing( query, order );
+ if ( ok() && !mayReturnCurrent() ) {
+ advance();
+ }
+ }
+ void setQueryOptimizerCursorWithoutAdvancing( const BSONObj &query, const BSONObj &order = BSONObj() ) {
+ _c = newQueryOptimizerCursor( ns(), query, order, false );
+ }
+ bool ok() const { return _c->ok(); }
+ /** Handles matching and deduping. */
+ bool advance() {
+ while( _c->advance() && !mayReturnCurrent() );
+ return ok();
+ }
+ int itcount() {
+ int ret = 0;
+ while( ok() ) {
+ ++ret;
+ advance();
+ }
+ return ret;
+ }
+ BSONObj current() const { return _c->current(); }
+ DiskLoc currLoc() const { return _c->currLoc(); }
+ void prepareToTouchEarlierIterate() { _c->prepareToTouchEarlierIterate(); }
+ void recoverFromTouchingEarlierIterate() { _c->recoverFromTouchingEarlierIterate(); }
+ bool mayReturnCurrent() {
+// return _c->currentMatches() && !_c->getsetdup( _c->currLoc() );
+ return ( !_c->matcher() || _c->matcher()->matchesCurrent( _c.get() ) ) && !_c->getsetdup( _c->currLoc() );
+ }
+ bool prepareToYield() const { return _c->prepareToYield(); }
+ void recoverFromYield() {
+ _c->recoverFromYield();
+ if ( ok() && !mayReturnCurrent() ) {
+ advance();
+ }
+ }
+ shared_ptr<Cursor> c() { return _c; }
+ long long nscanned() const { return _c->nscanned(); }
+ private:
+ shared_ptr<Cursor> _c;
+ };
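+
+ // The helpers above encapsulate the manual matching/deduping protocol that a
+ // query optimizer cursor client must follow. Spelled out, a raw iteration
+ // loop looks roughly like this (a sketch, not a registered test):
+ //
+ // while( c->ok() ) {
+ //     if ( ( !c->matcher() || c->matcher()->matchesCurrent( c.get() ) )
+ //          && !c->getsetdup( c->currLoc() ) ) {
+ //         // use c->current()
+ //     }
+ //     c->advance();
+ // }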
+
+ /** No results for empty collection. */
+ class Empty : public Base {
+ public:
+ void run() {
+ dblock lk;
+ Client::Context ctx( ns() );
+ shared_ptr<Cursor> c = newQueryOptimizerCursor( ns(), BSONObj() );
+ ASSERT( !c->ok() );
+ ASSERT_THROWS( c->_current(), AssertionException );
+ ASSERT_THROWS( c->current(), AssertionException );
+ ASSERT( c->currLoc().isNull() );
+ ASSERT( !c->advance() );
+ ASSERT_THROWS( c->currKey(), AssertionException );
+ ASSERT_THROWS( c->getsetdup( DiskLoc() ), AssertionException );
+ ASSERT_THROWS( c->isMultiKey(), AssertionException );
+ ASSERT_THROWS( c->matcher(), AssertionException );
+ }
+ };
+
+ /** Simple table scan. */
+ class Unindexed : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 ) );
+ _cli.insert( ns(), BSON( "_id" << 2 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSONObj() );
+ ASSERT_EQUALS( 2, itcount() );
+ }
+ };
+
+ /** Basic test with two indexes and deduping requirement. */
+ class Basic : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 << "a" << 2 ) );
+ _cli.insert( ns(), BSON( "_id" << 2 << "a" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << 0 << "a" << GT << 0 ) );
+ ASSERT( ok() );
+ ASSERT_EQUALS( BSON( "_id" << 1 << "a" << 2 ), current() );
+ ASSERT( advance() );
+ ASSERT_EQUALS( BSON( "_id" << 2 << "a" << 1 ), current() );
+ ASSERT( !advance() );
+ ASSERT( !ok() );
+ }
+ };
+
+ class NoMatch : public Base {
+ public:
+ void run() {
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << 5 << LT << 4 << "a" << GT << 0 ) );
+ ASSERT( !ok() );
+ }
+ };
+
+ /** Order of results indicates that interleaving is occurring. */
+ class Interleaved : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 << "a" << 2 ) );
+ _cli.insert( ns(), BSON( "_id" << 3 << "a" << 1 ) );
+ _cli.insert( ns(), BSON( "_id" << 2 << "a" << 2 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << 0 << "a" << GT << 0 ) );
+ ASSERT( ok() );
+ ASSERT_EQUALS( BSON( "_id" << 1 << "a" << 2 ), current() );
+ ASSERT( advance() );
+ ASSERT_EQUALS( BSON( "_id" << 3 << "a" << 1 ), current() );
+ ASSERT( advance() );
+ ASSERT_EQUALS( BSON( "_id" << 2 << "a" << 2 ), current() );
+ ASSERT( !advance() );
+ ASSERT( !ok() );
+ }
+ };
+
+ /** Some values on each index do not match. */
+ class NotMatch : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 0 << "a" << 10 ) );
+ _cli.insert( ns(), BSON( "_id" << 10 << "a" << 0 ) );
+ _cli.insert( ns(), BSON( "_id" << 11 << "a" << 12 ) );
+ _cli.insert( ns(), BSON( "_id" << 12 << "a" << 11 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << 5 << "a" << GT << 5 ) );
+ ASSERT( ok() );
+ ASSERT_EQUALS( BSON( "_id" << 11 << "a" << 12 ), current() );
+ ASSERT( advance() );
+ ASSERT_EQUALS( BSON( "_id" << 12 << "a" << 11 ), current() );
+ ASSERT( !advance() );
+ ASSERT( !ok() );
+ }
+ };
+
+ /** After the first 101 matches for a plan, we stop interleaving the plans. */
+ class StopInterleaving : public Base {
+ public:
+ void run() {
+ for( int i = 0; i < 101; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "a" << i ) );
+ }
+ for( int i = 101; i < 200; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "a" << (301-i) ) );
+ }
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << -1 << "a" << GT << -1 ) );
+ for( int i = 0; i < 200; ++i ) {
+ ASSERT( ok() );
+ ASSERT_EQUALS( i, current().getIntField( "_id" ) );
+ advance();
+ }
+ ASSERT( !advance() );
+ ASSERT( !ok() );
+ }
+ };
+
+ /** Test correct deduping with the takeover cursor. */
+ class TakeoverWithDup : public Base {
+ public:
+ void run() {
+ for( int i = 0; i < 101; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "a" << i ) );
+ }
+ _cli.insert( ns(), BSON( "_id" << 500 << "a" << BSON_ARRAY( 0 << 300 ) ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << -1 << "a" << GT << -1 ) );
+ ASSERT_EQUALS( 102, itcount() );
+ }
+ };
+
+ /** Test usage of matcher with takeover cursor. */
+ class TakeoverWithNonMatches : public Base {
+ public:
+ void run() {
+ for( int i = 0; i < 101; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "a" << i ) );
+ }
+ _cli.insert( ns(), BSON( "_id" << 101 << "a" << 600 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << -1 << "a" << LT << 500 ) );
+ ASSERT_EQUALS( 101, itcount() );
+ }
+ };
+
+ /** Check deduping of dups within just the takeover cursor. */
+ class TakeoverWithTakeoverDup : public Base {
+ public:
+ void run() {
+ for( int i = 0; i < 101; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i*2 << "a" << 0 ) );
+ _cli.insert( ns(), BSON( "_id" << i*2+1 << "a" << 1 ) );
+ }
+ _cli.insert( ns(), BSON( "_id" << 202 << "a" << BSON_ARRAY( 2 << 3 ) ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << -1 << "a" << GT << 0) );
+ ASSERT_EQUALS( 102, itcount() );
+ }
+ };
+
+ /** Basic test with $or query. */
+ class BasicOr : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 0 << "a" << 0 ) );
+ _cli.insert( ns(), BSON( "_id" << 1 << "a" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "$or" << BSON_ARRAY( BSON( "_id" << 0 ) << BSON( "a" << 1 ) ) ) );
+ ASSERT_EQUALS( BSON( "_id" << 0 << "a" << 0 ), current() );
+ ASSERT( advance() );
+ ASSERT_EQUALS( BSON( "_id" << 1 << "a" << 1 ), current() );
+ ASSERT( !advance() );
+ }
+ };
+
+ /** $or first clause empty. */
+ class OrFirstClauseEmpty : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 0 << "a" << 1 ) );
+ _cli.insert( ns(), BSON( "_id" << 1 << "a" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "$or" << BSON_ARRAY( BSON( "_id" << -1 ) << BSON( "a" << 1 ) ) ) );
+ ASSERT_EQUALS( BSON( "_id" << 0 << "a" << 1 ), current() );
+ ASSERT( advance() );
+ ASSERT_EQUALS( BSON( "_id" << 1 << "a" << 1 ), current() );
+ ASSERT( !advance() );
+ }
+ };
+
+ /** $or second clause empty. */
+ class OrSecondClauseEmpty : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 0 << "a" << 1 ) );
+ _cli.insert( ns(), BSON( "_id" << 1 << "a" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "$or" << BSON_ARRAY( BSON( "_id" << 0 ) << BSON( "_id" << -1 ) << BSON( "a" << 1 ) ) ) );
+ ASSERT_EQUALS( BSON( "_id" << 0 << "a" << 1 ), current() );
+ ASSERT( advance() );
+ ASSERT_EQUALS( BSON( "_id" << 1 << "a" << 1 ), current() );
+ ASSERT( !advance() );
+ }
+ };
+
+ /** $or with multiple empty clauses. */
+ class OrMultipleClausesEmpty : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 0 << "a" << 1 ) );
+ _cli.insert( ns(), BSON( "_id" << 1 << "a" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "$or" << BSON_ARRAY( BSON( "_id" << 2 ) << BSON( "_id" << 4 ) << BSON( "_id" << 0 ) << BSON( "_id" << -1 ) << BSON( "_id" << 6 ) << BSON( "a" << 1 ) << BSON( "_id" << 9 ) ) ) );
+ ASSERT_EQUALS( BSON( "_id" << 0 << "a" << 1 ), current() );
+ ASSERT( advance() );
+ ASSERT_EQUALS( BSON( "_id" << 1 << "a" << 1 ), current() );
+ ASSERT( !advance() );
+ }
+ };
+
+ /** Check that takeover occurs at the proper match count with $or clauses. */
+ class TakeoverCountOr : public Base {
+ public:
+ void run() {
+ for( int i = 0; i < 60; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "a" << 0 ) );
+ }
+ for( int i = 60; i < 120; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "a" << 1 ) );
+ }
+ for( int i = 120; i < 150; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "a" << (200-i) ) );
+ }
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "$or" << BSON_ARRAY( BSON( "a" << 0 ) << BSON( "a" << 1 ) << BSON( "_id" << GTE << 120 << "a" << GT << 1 ) ) ) );
+ for( int i = 0; i < 120; ++i ) {
+ ASSERT( ok() );
+ advance();
+ }
+ // Expect to be scanning on _id index only.
+ for( int i = 120; i < 150; ++i ) {
+ ASSERT_EQUALS( i, current().getIntField( "_id" ) );
+ advance();
+ }
+ ASSERT( !ok() );
+ }
+ };
+
+ /** Takeover just at end of clause. */
+ class TakeoverEndOfOrClause : public Base {
+ public:
+ void run() {
+ for( int i = 0; i < 102; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i ) );
+ }
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "$or" << BSON_ARRAY( BSON( "_id" << LT << 101 ) << BSON( "_id" << 101 ) ) ) );
+ for( int i = 0; i < 102; ++i ) {
+ ASSERT_EQUALS( i, current().getIntField( "_id" ) );
+ advance();
+ }
+ ASSERT( !ok() );
+ }
+ };
+
+ class TakeoverBeforeEndOfOrClause : public Base {
+ public:
+ void run() {
+ for( int i = 0; i < 101; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i ) );
+ }
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "$or" << BSON_ARRAY( BSON( "_id" << LT << 100 ) << BSON( "_id" << 100 ) ) ) );
+ for( int i = 0; i < 101; ++i ) {
+ ASSERT_EQUALS( i, current().getIntField( "_id" ) );
+ advance();
+ }
+ ASSERT( !ok() );
+ }
+ };
+
+ class TakeoverAfterEndOfOrClause : public Base {
+ public:
+ void run() {
+ for( int i = 0; i < 103; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i ) );
+ }
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "$or" << BSON_ARRAY( BSON( "_id" << LT << 102 ) << BSON( "_id" << 102 ) ) ) );
+ for( int i = 0; i < 103; ++i ) {
+ ASSERT_EQUALS( i, current().getIntField( "_id" ) );
+ advance();
+ }
+ ASSERT( !ok() );
+ }
+ };
+
+ /** Test matching and deduping done manually by cursor client. */
+ class ManualMatchingDeduping : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 0 << "a" << 10 ) );
+ _cli.insert( ns(), BSON( "_id" << 10 << "a" << 0 ) );
+ _cli.insert( ns(), BSON( "_id" << 11 << "a" << 12 ) );
+ _cli.insert( ns(), BSON( "_id" << 12 << "a" << 11 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ shared_ptr< Cursor > c = newQueryOptimizerCursor( ns(), BSON( "_id" << GT << 5 << "a" << GT << 5 ) );
+ ASSERT( c->ok() );
+
+ // _id 10 {_id:1}
+ ASSERT_EQUALS( 10, c->current().getIntField( "_id" ) );
+ ASSERT( !c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( c->advance() );
+
+ // _id 0 {a:1}
+ ASSERT_EQUALS( 0, c->current().getIntField( "_id" ) );
+ ASSERT( !c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( c->advance() );
+
+ // _id 0 {$natural:1}
+ ASSERT_EQUALS( 0, c->current().getIntField( "_id" ) );
+ ASSERT( !c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( c->advance() );
+
+ // _id 11 {_id:1}
+ ASSERT_EQUALS( BSON( "_id" << 11 << "a" << 12 ), c->current() );
+ ASSERT( c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( !c->getsetdup( c->currLoc() ) );
+ ASSERT( c->advance() );
+
+ // _id 12 {a:1}
+ ASSERT_EQUALS( BSON( "_id" << 12 << "a" << 11 ), c->current() );
+ ASSERT( c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( !c->getsetdup( c->currLoc() ) );
+ ASSERT( c->advance() );
+
+ // _id 10 {$natural:1}
+ ASSERT_EQUALS( 10, c->current().getIntField( "_id" ) );
+ ASSERT( !c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( c->advance() );
+
+ // _id 12 {_id:1}
+ ASSERT_EQUALS( BSON( "_id" << 12 << "a" << 11 ), c->current() );
+ ASSERT( c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( c->getsetdup( c->currLoc() ) );
+ ASSERT( c->advance() );
+
+ // _id 11 {a:1}
+ ASSERT_EQUALS( BSON( "_id" << 11 << "a" << 12 ), c->current() );
+ ASSERT( c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( c->getsetdup( c->currLoc() ) );
+ ASSERT( c->advance() );
+
+ // _id 11 {$natural:1}
+ ASSERT_EQUALS( 11, c->current().getIntField( "_id" ) );
+ ASSERT( c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( c->getsetdup( c->currLoc() ) );
+
+ // {_id:1} scan is complete.
+ ASSERT( !c->advance() );
+ ASSERT( !c->ok() );
+
+ // Scan the results again - this time the winning plan has been
+ // recorded.
+ c = newQueryOptimizerCursor( ns(), BSON( "_id" << GT << 5 << "a" << GT << 5 ) );
+ ASSERT( c->ok() );
+
+ // _id 10 {_id:1}
+ ASSERT_EQUALS( 10, c->current().getIntField( "_id" ) );
+ ASSERT( !c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( c->advance() );
+
+ // _id 11 {_id:1}
+ ASSERT_EQUALS( BSON( "_id" << 11 << "a" << 12 ), c->current() );
+ ASSERT( c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( !c->getsetdup( c->currLoc() ) );
+ ASSERT( c->advance() );
+
+ // _id 12 {_id:1}
+ ASSERT_EQUALS( BSON( "_id" << 12 << "a" << 11 ), c->current() );
+ ASSERT( c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( !c->getsetdup( c->currLoc() ) );
+
+ // {_id:1} scan complete
+ ASSERT( !c->advance() );
+ ASSERT( !c->ok() );
+ }
+ };
+
+ /** The current key must correspond to currLoc for matching to be correct. */
+ class ManualMatchingUsingCurrKey : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << "a" ) );
+ _cli.insert( ns(), BSON( "_id" << "b" ) );
+ _cli.insert( ns(), BSON( "_id" << "ba" ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ shared_ptr< Cursor > c = newQueryOptimizerCursor( ns(), fromjson( "{_id:/a/}" ) );
+ ASSERT( c->ok() );
+ // "a"
+ ASSERT( c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( !c->getsetdup( c->currLoc() ) );
+ ASSERT( c->advance() );
+ ASSERT( c->ok() );
+
+ // "b"
+ ASSERT( !c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( c->advance() );
+ ASSERT( c->ok() );
+
+ // "ba"
+ ASSERT( c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( !c->getsetdup( c->currLoc() ) );
+ ASSERT( !c->advance() );
+ }
+ };
+
+ /** Test matching and deduping done manually by cursor client. */
+ class ManualMatchingDedupingTakeover : public Base {
+ public:
+ void run() {
+ for( int i = 0; i < 150; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "a" << 0 ) );
+ }
+ _cli.insert( ns(), BSON( "_id" << 300 << "a" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ shared_ptr< Cursor > c = newQueryOptimizerCursor( ns(), BSON( "$or" << BSON_ARRAY( BSON( "_id" << LT << 300 ) << BSON( "a" << 1 ) ) ) );
+ for( int i = 0; i < 151; ++i ) {
+ ASSERT( c->ok() );
+ ASSERT( c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( !c->getsetdup( c->currLoc() ) );
+ c->advance();
+ }
+ ASSERT( !c->ok() );
+ }
+ };
+
+ /** Test single key matching bounds. */
+ class Singlekey : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "a" << "10" ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ shared_ptr< Cursor > c = newQueryOptimizerCursor( ns(), BSON( "a" << GT << 1 << LT << 5 ) );
+ // Two sided bounds work.
+ ASSERT( !c->ok() );
+ }
+ };
+
+ /** Test multi key matching bounds. */
+ class Multikey : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "a" << BSON_ARRAY( 1 << 10 ) ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "a" << GT << 5 << LT << 3 ) );
+ // Multi key bounds work.
+ ASSERT( ok() );
+ }
+ };
+
+ /** Add other plans when the recorded one is doing more poorly than expected. */
+ class AddOtherPlans : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 0 << "a" << 0 << "b" << 0 ) );
+ _cli.insert( ns(), BSON( "_id" << 1 << "a" << 1 << "b" << 0 ) );
+ for( int i = 100; i < 150; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "a" << 100 << "b" << i ) );
+ }
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "b" << 1 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ shared_ptr<Cursor> c = newQueryOptimizerCursor( ns(), BSON( "a" << 0 << "b" << 0 ) );
+
+ ASSERT_EQUALS( BSON( "_id" << 0 << "a" << 0 << "b" << 0 ), c->current() );
+ ASSERT_EQUALS( BSON( "a" << 1 ), c->indexKeyPattern() );
+
+ ASSERT( c->advance() );
+ ASSERT_EQUALS( BSON( "_id" << 0 << "a" << 0 << "b" << 0 ), c->current() );
+ ASSERT_EQUALS( BSON( "b" << 1 ), c->indexKeyPattern() );
+
+ ASSERT( c->advance() );
+ ASSERT_EQUALS( BSON( "_id" << 0 << "a" << 0 << "b" << 0 ), c->current() );
+ // Unindexed plan
+ ASSERT_EQUALS( BSONObj(), c->indexKeyPattern() );
+ ASSERT( !c->advance() );
+
+ c = newQueryOptimizerCursor( ns(), BSON( "a" << 100 << "b" << 149 ) );
+ // Try {a:1}, which was successful previously.
+ for( int i = 0; i < 12; ++i ) {
+ ASSERT( 149 != c->current().getIntField( "b" ) );
+ ASSERT( c->advance() );
+ }
+ bool sawB1Index = false;
+ do {
+ if ( c->indexKeyPattern() == BSON( "b" << 1 ) ) {
+ ASSERT_EQUALS( 149, c->current().getIntField( "b" ) );
+ // We should try the {b:1} index and only see one result from it.
+ ASSERT( !sawB1Index );
+ sawB1Index = true;
+ }
+ } while ( c->advance() );
+ ASSERT( sawB1Index );
+ }
+ };
+
+ /** Add other plans when the recorded one is doing more poorly than expected, with deletion. */
+ class AddOtherPlansDelete : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 0 << "a" << 0 << "b" << 0 ) );
+ _cli.insert( ns(), BSON( "_id" << 1 << "a" << 1 << "b" << 0 ) );
+ for( int i = 100; i < 120; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "a" << 100 << "b" << i ) );
+ }
+ for( int i = 199; i >= 150; --i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "a" << 100 << "b" << 150 ) );
+ }
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "b" << 1 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ shared_ptr<Cursor> c = newQueryOptimizerCursor( ns(), BSON( "a" << 0 << "b" << 0 ) );
+
+ ASSERT_EQUALS( BSON( "_id" << 0 << "a" << 0 << "b" << 0 ), c->current() );
+ ASSERT_EQUALS( BSON( "a" << 1 ), c->indexKeyPattern() );
+
+ ASSERT( c->advance() );
+ ASSERT_EQUALS( BSON( "_id" << 0 << "a" << 0 << "b" << 0 ), c->current() );
+ ASSERT_EQUALS( BSON( "b" << 1 ), c->indexKeyPattern() );
+
+ ASSERT( c->advance() );
+ ASSERT_EQUALS( BSON( "_id" << 0 << "a" << 0 << "b" << 0 ), c->current() );
+ // Unindexed plan
+ ASSERT_EQUALS( BSONObj(), c->indexKeyPattern() );
+ ASSERT( !c->advance() );
+
+ c = newQueryOptimizerCursor( ns(), BSON( "a" << 100 << "b" << 150 ) );
+ // Try {a:1}, which was successful previously.
+ for( int i = 0; i < 12; ++i ) {
+ ASSERT( 150 != c->current().getIntField( "b" ) );
+ ASSERT_EQUALS( BSON( "a" << 1 ), c->indexKeyPattern() );
+ ASSERT( c->advance() );
+ }
+ // Now try {b:1} plan.
+ ASSERT_EQUALS( BSON( "b" << 1 ), c->indexKeyPattern() );
+ ASSERT_EQUALS( 150, c->current().getIntField( "b" ) );
+ ASSERT( c->currentMatches() );
+ int id = c->current().getIntField( "_id" );
+ c->advance();
+ c->prepareToTouchEarlierIterate();
+ _cli.remove( ns(), BSON( "_id" << id ) );
+ c->recoverFromTouchingEarlierIterate();
+ int count = 1;
+ while( c->ok() ) {
+ if ( c->currentMatches() ) {
+ ++count;
+ int id = c->current().getIntField( "_id" );
+ c->advance();
+ c->prepareToTouchEarlierIterate();
+ _cli.remove( ns(), BSON( "_id" << id ) );
+ c->recoverFromTouchingEarlierIterate();
+ }
+ else {
+ c->advance();
+ }
+ }
+ ASSERT_EQUALS( 50, count );
+ }
+ };
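+
+ // The delete-while-iterating pattern used above (and in the
+ // TouchEarlierIterate* tests below): read the current document, advance past
+ // it, then bracket the write with the touch-earlier-iterate calls so the
+ // cursor can repair its position. A minimal sketch:
+ //
+ // int id = c->current().getIntField( "_id" );
+ // c->advance();
+ // c->prepareToTouchEarlierIterate();
+ // _cli.remove( ns(), BSON( "_id" << id ) );
+ // c->recoverFromTouchingEarlierIterate();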
+
+ /**
+ * Add other plans when the recorded one is doing more poorly than expected, with deletion before
+ * and after adding the additional plans.
+ */
+ class AddOtherPlansContinuousDelete : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 0 << "a" << 0 << "b" << 0 ) );
+ _cli.insert( ns(), BSON( "_id" << 1 << "a" << 1 << "b" << 0 ) );
+ for( int i = 100; i < 400; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "a" << i << "b" << ( 499 - i ) ) );
+ }
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "b" << 1 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ shared_ptr<Cursor> c = newQueryOptimizerCursor( ns(), BSON( "a" << GTE << -1 << LTE << 0 << "b" << GTE << -1 << LTE << 0 ) );
+ while( c->advance() );
+ // {a:1} plan should be recorded now.
+
+ c = newQueryOptimizerCursor( ns(), BSON( "a" << GTE << 100 << LTE << 400 << "b" << GTE << 100 << LTE << 400 ) );
+ int count = 0;
+ while( c->ok() ) {
+ if ( c->currentMatches() ) {
+ ASSERT( !c->getsetdup( c->currLoc() ) );
+ ++count;
+ int id = c->current().getIntField( "_id" );
+ c->advance();
+ c->prepareToTouchEarlierIterate();
+ _cli.remove( ns(), BSON( "_id" << id ) );
+ c->recoverFromTouchingEarlierIterate();
+ } else {
+ c->advance();
+ }
+ }
+ ASSERT_EQUALS( 300, count );
+ ASSERT_EQUALS( 2U, _cli.count( ns(), BSONObj() ) );
+ }
+ };
+
+ /** Check $or clause range elimination. */
+ class OrRangeElimination : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ shared_ptr<Cursor> c = newQueryOptimizerCursor( ns(), BSON( "$or" << BSON_ARRAY( BSON( "_id" << GT << 0 ) << BSON( "_id" << 1 ) ) ) );
+ ASSERT( c->ok() );
+ ASSERT( !c->advance() );
+ }
+ };
+
+ /** Check $or match deduping - in takeover cursor. */
+ class OrDedup : public Base {
+ public:
+ void run() {
+ for( int i = 0; i < 150; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "a" << i ) );
+ }
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ shared_ptr<Cursor> c = newQueryOptimizerCursor( ns(), BSON( "$or" << BSON_ARRAY( BSON( "_id" << LT << 140 ) << BSON( "_id" << 145 ) << BSON( "a" << 145 ) ) ) );
+
+ while( c->current().getIntField( "_id" ) < 140 ) {
+ ASSERT( c->advance() );
+ }
+ // Match from second $or clause.
+ ASSERT_EQUALS( 145, c->current().getIntField( "_id" ) );
+ ASSERT( c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( c->advance() );
+ // Match from third $or clause.
+ ASSERT_EQUALS( 145, c->current().getIntField( "_id" ) );
+ // $or deduping is handled by the matcher.
+ ASSERT( !c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( !c->advance() );
+ }
+ };
+
+ /** Standard dups with a multikey cursor. */
+ class EarlyDups : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "a" << BSON_ARRAY( 0 << 1 << 200 ) ) );
+ for( int i = 2; i < 150; ++i ) {
+ _cli.insert( ns(), BSON( "a" << i ) );
+ }
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "a" << GT << -1 ) );
+ ASSERT_EQUALS( 149, itcount() );
+ }
+ };
+
+ /** Pop an $or clause in the takeover cursor. */
+ class OrPopInTakeover : public Base {
+ public:
+ void run() {
+ for( int i = 0; i < 150; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i ) );
+ }
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ shared_ptr<Cursor> c = newQueryOptimizerCursor( ns(), BSON( "$or" << BSON_ARRAY( BSON( "_id" << LTE << 147 ) << BSON( "_id" << 148 ) << BSON( "_id" << 149 ) ) ) );
+ for( int i = 0; i < 150; ++i ) {
+ ASSERT( c->ok() );
+ ASSERT_EQUALS( i, c->current().getIntField( "_id" ) );
+ c->advance();
+ }
+ ASSERT( !c->ok() );
+ }
+ };
+
+ /** Or clause iteration abandoned once full collection scan is performed. */
+ class OrCollectionScanAbort : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 0 << "a" << BSON_ARRAY( 1 << 2 << 3 << 4 << 5 ) << "b" << 4 ) );
+ _cli.insert( ns(), BSON( "_id" << 1 << "a" << BSON_ARRAY( 6 << 7 << 8 << 9 << 10 ) << "b" << 4 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ shared_ptr<Cursor> c = newQueryOptimizerCursor( ns(), BSON( "$or" << BSON_ARRAY( BSON( "a" << LT << 6 << "b" << 4 ) << BSON( "a" << GTE << 6 << "b" << 4 ) ) ) );
+
+ ASSERT( c->ok() );
+
+ // _id 0 on {a:1}
+ ASSERT_EQUALS( 0, c->current().getIntField( "_id" ) );
+ ASSERT( c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( !c->getsetdup( c->currLoc() ) );
+ c->advance();
+
+ // _id 0 on {$natural:1}
+ ASSERT_EQUALS( 0, c->current().getIntField( "_id" ) );
+ ASSERT( c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( c->getsetdup( c->currLoc() ) );
+ c->advance();
+
+ // _id 0 on {a:1}
+ ASSERT_EQUALS( 0, c->current().getIntField( "_id" ) );
+ ASSERT( c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( c->getsetdup( c->currLoc() ) );
+ c->advance();
+
+ // _id 1 on {$natural:1}
+ ASSERT_EQUALS( 1, c->current().getIntField( "_id" ) );
+ ASSERT( c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( !c->getsetdup( c->currLoc() ) );
+ c->advance();
+
+ // _id 0 on {a:1}
+ ASSERT_EQUALS( 0, c->current().getIntField( "_id" ) );
+ ASSERT( c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( c->getsetdup( c->currLoc() ) );
+ c->advance();
+
+ // {$natural:1} finished
+ ASSERT( !c->ok() );
+ }
+ };
+
+ /** Yield cursor with no intervening write, then continue iteration. */
+ class YieldNoOp : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 ) );
+ _cli.insert( ns(), BSON( "_id" << 2 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << 0 ) );
+ ASSERT_EQUALS( 1, current().getIntField( "_id" ) );
+ ASSERT( prepareToYield() );
+ }
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ recoverFromYield();
+ ASSERT( ok() );
+ ASSERT_EQUALS( 2, current().getIntField( "_id" ) );
+ ASSERT( !advance() );
+ ASSERT( !ok() );
+ ASSERT( prepareToYield() );
+ recoverFromYield();
+ }
+ }
+ };
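+
+ // The Yield* tests below all follow the same shape: prepareToYield() is
+ // called and the lock released, a concurrent client writes, then the lock is
+ // retaken and recoverFromYield() repositions the cursor, or leaves it !ok()
+ // (or throws) if it could not survive the intervening write. A sketch:
+ //
+ // { dblock lk; Client::Context ctx( ns() ); /* position cursor */ prepareToYield(); }
+ // _cli.remove( ns(), ... ); // concurrent write while yielded
+ // { dblock lk; Client::Context ctx( ns() ); recoverFromYield(); /* check ok() */ }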
+
+ /** Yield cursor and delete current entry. */
+ class YieldDelete : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << 1 ) );
+ ASSERT_EQUALS( 1, current().getIntField( "_id" ) );
+ ASSERT( prepareToYield() );
+ }
+
+ _cli.remove( ns(), BSON( "_id" << 1 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ recoverFromYield();
+ ASSERT( !ok() );
+ ASSERT( !advance() );
+ }
+ }
+ };
+
+ /** Yield cursor and delete current entry, then continue iteration. */
+ class YieldDeleteContinue : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 ) );
+ _cli.insert( ns(), BSON( "_id" << 2 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << 0 ) );
+ ASSERT_EQUALS( 1, current().getIntField( "_id" ) );
+ ASSERT( prepareToYield() );
+ }
+
+ _cli.remove( ns(), BSON( "_id" << 1 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ recoverFromYield();
+ ASSERT( ok() );
+ ASSERT_EQUALS( 2, current().getIntField( "_id" ) );
+ ASSERT( !advance() );
+ ASSERT( !ok() );
+ }
+ }
+ };
+
+ /** Yield cursor and delete current entry, then continue iteration through the remaining documents. */
+ class YieldDeleteContinueFurther : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 ) );
+ _cli.insert( ns(), BSON( "_id" << 2 ) );
+ _cli.insert( ns(), BSON( "_id" << 3 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << 0 ) );
+ ASSERT_EQUALS( 1, current().getIntField( "_id" ) );
+ ASSERT( prepareToYield() );
+ }
+
+ _cli.remove( ns(), BSON( "_id" << 1 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ recoverFromYield();
+ ASSERT( ok() );
+ ASSERT_EQUALS( 2, current().getIntField( "_id" ) );
+ ASSERT( advance() );
+ ASSERT_EQUALS( 3, current().getIntField( "_id" ) );
+ ASSERT( !advance() );
+ ASSERT( !ok() );
+ }
+ }
+ };
+
+ /** Yield and update current. */
+ class YieldUpdate : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "a" << 1 ) );
+ _cli.insert( ns(), BSON( "a" << 2 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "a" << GT << 0 ) );
+ ASSERT_EQUALS( 1, current().getIntField( "a" ) );
+ ASSERT( prepareToYield() );
+ }
+
+ _cli.update( ns(), BSON( "a" << 1 ), BSON( "$set" << BSON( "a" << 3 ) ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ recoverFromYield();
+ ASSERT( ok() );
+ ASSERT_EQUALS( 2, current().getIntField( "a" ) );
+ ASSERT( !advance() );
+ ASSERT( !ok() );
+ }
+ }
+ };
+
+ /** Yield and drop collection. */
+ class YieldDrop : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 ) );
+ _cli.insert( ns(), BSON( "_id" << 2 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << 0 ) );
+ ASSERT_EQUALS( 1, current().getIntField( "_id" ) );
+ ASSERT( prepareToYield() );
+ }
+
+ _cli.dropCollection( ns() );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ recoverFromYield();
+ ASSERT( !ok() );
+ }
+ }
+ };
+
+ /** Yield and drop collection with $or query. */
+ class YieldDropOr : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 ) );
+ _cli.insert( ns(), BSON( "_id" << 2 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "$or" << BSON_ARRAY( BSON( "_id" << 1 ) << BSON( "_id" << 2 ) ) ) );
+ ASSERT_EQUALS( 1, current().getIntField( "_id" ) );
+ ASSERT( prepareToYield() );
+ }
+
+ _cli.dropCollection( ns() );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ ASSERT_THROWS( recoverFromYield(), MsgAssertionException );
+ ASSERT( !ok() );
+ }
+ }
+ };
+
+ /** Yield and remove document with $or query. */
+ class YieldRemoveOr : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 ) );
+ _cli.insert( ns(), BSON( "_id" << 2 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "$or" << BSON_ARRAY( BSON( "_id" << 1 ) << BSON( "_id" << 2 ) ) ) );
+ ASSERT_EQUALS( 1, current().getIntField( "_id" ) );
+ ASSERT( prepareToYield() );
+ }
+
+ _cli.remove( ns(), BSON( "_id" << 1 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ recoverFromYield();
+ ASSERT( ok() );
+ ASSERT_EQUALS( 2, current().getIntField( "_id" ) );
+ }
+ }
+ };
+
+ /** Yield and overwrite current in capped collection. */
+ class YieldCappedOverwrite : public Base {
+ public:
+ void run() {
+ _cli.createCollection( ns(), 1000, true );
+ _cli.insert( ns(), BSON( "x" << 1 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "x" << GT << 0 ) );
+ ASSERT_EQUALS( 1, current().getIntField( "x" ) );
+ ASSERT( prepareToYield() );
+ }
+
+ int x = 2;
+ while( _cli.count( ns(), BSON( "x" << 1 ) ) > 0 ) {
+ _cli.insert( ns(), BSON( "x" << x++ ) );
+ }
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ ASSERT_THROWS( recoverFromYield(), MsgAssertionException );
+ ASSERT( !ok() );
+ }
+ }
+ };
+
+ /** Yield and drop unrelated index - see SERVER-2454. */
+ class YieldDropIndex : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << 1 ) );
+ ASSERT_EQUALS( 1, current().getIntField( "_id" ) );
+ ASSERT( prepareToYield() );
+ }
+
+ _cli.dropIndex( ns(), BSON( "a" << 1 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ recoverFromYield();
+ ASSERT( !ok() );
+ }
+ }
+ };
+
+ /** Yielding with multiple plans active. */
+ class YieldMultiplePlansNoOp : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 << "a" << 2 ) );
+ _cli.insert( ns(), BSON( "_id" << 2 << "a" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << 0 << "a" << GT << 0 ) );
+ ASSERT_EQUALS( 1, current().getIntField( "_id" ) );
+ ASSERT( prepareToYield() );
+ }
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ recoverFromYield();
+ ASSERT( ok() );
+ ASSERT_EQUALS( 2, current().getIntField( "_id" ) );
+ ASSERT( !advance() );
+ ASSERT( !ok() );
+ }
+ }
+ };
+
+ /** Yielding with advance and multiple plans active. */
+ class YieldMultiplePlansAdvanceNoOp : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 << "a" << 2 ) );
+ _cli.insert( ns(), BSON( "_id" << 2 << "a" << 1 ) );
+ _cli.insert( ns(), BSON( "_id" << 3 << "a" << 3 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << 0 << "a" << GT << 0 ) );
+ ASSERT_EQUALS( 1, current().getIntField( "_id" ) );
+ advance();
+ ASSERT_EQUALS( 2, current().getIntField( "_id" ) );
+ ASSERT( prepareToYield() );
+ }
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ recoverFromYield();
+ ASSERT( ok() );
+ ASSERT_EQUALS( 3, current().getIntField( "_id" ) );
+ ASSERT( !advance() );
+ ASSERT( !ok() );
+ }
+ }
+ };
+
+ /** Yielding with delete and multiple plans active. */
+ class YieldMultiplePlansDelete : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 << "a" << 2 ) );
+ _cli.insert( ns(), BSON( "_id" << 2 << "a" << 1 ) );
+ _cli.insert( ns(), BSON( "_id" << 3 << "a" << 4 ) );
+ _cli.insert( ns(), BSON( "_id" << 4 << "a" << 3 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << 0 << "a" << GT << 0 ) );
+ ASSERT_EQUALS( 1, current().getIntField( "_id" ) );
+ advance();
+ ASSERT_EQUALS( 2, current().getIntField( "_id" ) );
+ ASSERT( prepareToYield() );
+ }
+
+ _cli.remove( ns(), BSON( "_id" << 2 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ c()->recoverFromYield();
+ ASSERT( ok() );
+ // index {a:1} active during yield
+ ASSERT_EQUALS( 1, current().getIntField( "_id" ) );
+ ASSERT( advance() );
+ ASSERT_EQUALS( 3, current().getIntField( "_id" ) );
+ ASSERT( advance() );
+ ASSERT_EQUALS( 4, current().getIntField( "_id" ) );
+ ASSERT( !advance() );
+ ASSERT( !ok() );
+ }
+ }
+ };
+
+ /** Yielding with delete, multiple plans active, and $or clause. */
+ class YieldMultiplePlansDeleteOr : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 << "a" << 2 ) );
+ _cli.insert( ns(), BSON( "_id" << 2 << "a" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "$or" << BSON_ARRAY( BSON( "_id" << 1 << "a" << 2 ) << BSON( "_id" << 2 << "a" << 1 ) ) ) );
+ ASSERT_EQUALS( 1, current().getIntField( "_id" ) );
+ ASSERT( prepareToYield() );
+ }
+
+ _cli.remove( ns(), BSON( "_id" << 1 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ c()->recoverFromYield();
+ ASSERT( ok() );
+ ASSERT_EQUALS( 2, current().getIntField( "_id" ) );
+ ASSERT( !advance() );
+ ASSERT( !ok() );
+ }
+ }
+ };
+
+ /** Yielding with delete, multiple plans active with advancement to the second, and $or clause. */
+ class YieldMultiplePlansDeleteOrAdvance : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 << "a" << 2 ) );
+ _cli.insert( ns(), BSON( "_id" << 2 << "a" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "$or" << BSON_ARRAY( BSON( "_id" << 1 << "a" << 2 ) << BSON( "_id" << 2 << "a" << 1 ) ) ) );
+ ASSERT_EQUALS( 1, current().getIntField( "_id" ) );
+ ASSERT( prepareToYield() );
+ c()->advance();
+ ASSERT_EQUALS( 1, current().getIntField( "_id" ) );
+ }
+
+ _cli.remove( ns(), BSON( "_id" << 1 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ c()->recoverFromYield();
+ ASSERT( ok() );
+ ASSERT_EQUALS( 2, current().getIntField( "_id" ) );
+ ASSERT( !advance() );
+ ASSERT( !ok() );
+ }
+ }
+ };
+
+ /** Yielding with multiple plans and capped overwrite. */
+ class YieldMultiplePlansCappedOverwrite : public Base {
+ public:
+ void run() {
+ _cli.createCollection( ns(), 1000, true );
+ _cli.insert( ns(), BSON( "_id" << 1 << "a" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "_id" << 1 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << 0 << "a" << GT << 0 ) );
+ ASSERT_EQUALS( 1, current().getIntField( "_id" ) );
+ ASSERT( prepareToYield() );
+ }
+
+ int i = 1;
+ while( _cli.count( ns(), BSON( "_id" << 1 ) ) > 0 ) {
+ ++i;
+ _cli.insert( ns(), BSON( "_id" << i << "a" << i ) );
+ }
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ recoverFromYield();
+ ASSERT( ok() );
+ // {$natural:1} plan does not recover, {_id:1} plan does.
+ ASSERT( 1 < current().getIntField( "_id" ) );
+ }
+ }
+ };
+
+ /**
+ * Yielding with multiple plans and capped overwrite with unrecoverable cursor
+ * active at time of yield.
+ */
+ class YieldMultiplePlansCappedOverwriteManual : public Base {
+ public:
+ void run() {
+ _cli.createCollection( ns(), 1000, true );
+ _cli.insert( ns(), BSON( "a" << 1 << "b" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ shared_ptr<Cursor> c;
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ c = newQueryOptimizerCursor( ns(), BSON( "a" << GT << 0 << "b" << GT << 0 ) );
+ ASSERT_EQUALS( 1, c->current().getIntField( "a" ) );
+ ASSERT( !c->getsetdup( c->currLoc() ) );
+ c->advance();
+ ASSERT_EQUALS( 1, c->current().getIntField( "a" ) );
+ ASSERT( c->getsetdup( c->currLoc() ) );
+ ASSERT( c->prepareToYield() );
+ }
+
+ int i = 1;
+ while( _cli.count( ns(), BSON( "a" << 1 ) ) > 0 ) {
+ ++i;
+ _cli.insert( ns(), BSON( "a" << i << "b" << i ) );
+ }
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ c->recoverFromYield();
+ ASSERT( c->ok() );
+ // {$natural:1} plan does not recover, {a:1} plan does.
+ ASSERT( 1 < c->current().getIntField( "a" ) );
+ }
+ }
+ };
+
+ /**
+ * Yielding with multiple plans and capped overwrite with unrecoverable cursor
+ * inactive at time of yield.
+ */
+ class YieldMultiplePlansCappedOverwriteManual2 : public Base {
+ public:
+ void run() {
+ _cli.createCollection( ns(), 1000, true );
+ _cli.insert( ns(), BSON( "_id" << 1 << "a" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "_id" << 1 ) );
+
+ shared_ptr<Cursor> c;
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ c = newQueryOptimizerCursor( ns(), BSON( "_id" << GT << 0 << "a" << GT << 0 ) );
+ ASSERT_EQUALS( 1, c->current().getIntField( "_id" ) );
+ ASSERT( !c->getsetdup( c->currLoc() ) );
+ ASSERT( c->prepareToYield() );
+ }
+
+ int n = 1;
+ while( _cli.count( ns(), BSON( "_id" << 1 ) ) > 0 ) {
+ ++n;
+ _cli.insert( ns(), BSON( "_id" << n << "a" << n ) );
+ }
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ c->recoverFromYield();
+ ASSERT( c->ok() );
+ // {$natural:1} plan does not recover, {_id:1} plan does.
+ ASSERT( 1 < c->current().getIntField( "_id" ) );
+ ASSERT( !c->getsetdup( c->currLoc() ) );
+ int i = c->current().getIntField( "_id" );
+ ASSERT( c->advance() );
+ ASSERT( c->getsetdup( c->currLoc() ) );
+ while( i < n ) {
+ ASSERT( c->advance() );
+ ++i;
+ ASSERT_EQUALS( i, c->current().getIntField( "_id" ) );
+ }
+ }
+ }
+ };
+
+ /** Yield with takeover cursor. */
+ class YieldTakeover : public Base {
+ public:
+ void run() {
+ for( int i = 0; i < 150; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "a" << i ) );
+ }
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GTE << 0 << "a" << GTE << 0 ) );
+ for( int i = 0; i < 120; ++i ) {
+ ASSERT( advance() );
+ }
+ ASSERT( ok() );
+ ASSERT_EQUALS( 120, current().getIntField( "_id" ) );
+ ASSERT( prepareToYield() );
+ }
+
+ _cli.remove( ns(), BSON( "_id" << 120 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ recoverFromYield();
+ ASSERT( ok() );
+ ASSERT_EQUALS( 121, current().getIntField( "_id" ) );
+ ASSERT( advance() );
+ ASSERT_EQUALS( 122, current().getIntField( "_id" ) );
+ }
+ }
+ };
+
+ /** Yield with BasicCursor takeover cursor. */
+ class YieldTakeoverBasic : public Base {
+ public:
+ void run() {
+ for( int i = 0; i < 150; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "a" << BSON_ARRAY( i << i+1 ) ) );
+ }
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ auto_ptr<ClientCursor> cc;
+ auto_ptr<ClientCursor::YieldData> data( new ClientCursor::YieldData() );
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "b" << NE << 0 << "a" << GTE << 0 ) );
+ cc.reset( new ClientCursor( QueryOption_NoCursorTimeout, c(), ns() ) );
+ for( int i = 0; i < 120; ++i ) {
+ ASSERT( advance() );
+ }
+ ASSERT( ok() );
+ ASSERT_EQUALS( 120, current().getIntField( "_id" ) );
+ cc->prepareToYield( *data );
+ }
+ _cli.remove( ns(), BSON( "_id" << 120 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ ASSERT( ClientCursor::recoverFromYield( *data ) );
+ ASSERT( ok() );
+ ASSERT_EQUALS( 121, current().getIntField( "_id" ) );
+ ASSERT( advance() );
+ ASSERT_EQUALS( 122, current().getIntField( "_id" ) );
+ }
+ }
+ };
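+
+ // When the cursor is owned by a ClientCursor (as above), yielding goes
+ // through the ClientCursor API rather than the raw cursor:
+ // prepareToYield( data ) saves state into a YieldData, and the static
+ // ClientCursor::recoverFromYield( data ) returns false if the cursor could
+ // not survive the yield. A sketch:
+ //
+ // ClientCursor::YieldData data;
+ // cc->prepareToYield( data );
+ // // ... concurrent writes ...
+ // if ( !ClientCursor::recoverFromYield( data ) ) { /* cursor was destroyed */ }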
+
+ /** Yield with advance of inactive cursor. */
+ class YieldInactiveCursorAdvance : public Base {
+ public:
+ void run() {
+ for( int i = 0; i < 10; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "a" << 10 - i ) );
+ }
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << 0 << "a" << GT << 0 ) );
+ ASSERT( ok() );
+ ASSERT_EQUALS( 1, current().getIntField( "_id" ) );
+ ASSERT( advance() );
+ ASSERT_EQUALS( 9, current().getIntField( "_id" ) );
+ ASSERT( advance() );
+ ASSERT_EQUALS( 2, current().getIntField( "_id" ) );
+ ASSERT( prepareToYield() );
+ }
+
+ _cli.remove( ns(), BSON( "_id" << 9 ) );
+
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ recoverFromYield();
+ ASSERT( ok() );
+ ASSERT_EQUALS( 8, current().getIntField( "_id" ) );
+ ASSERT( advance() );
+ ASSERT_EQUALS( 3, current().getIntField( "_id" ) );
+ ASSERT( advance() );
+ ASSERT_EQUALS( 7, current().getIntField( "_id" ) );
+ }
+ }
+ };
+
+ class OrderId : public Base {
+ public:
+ void run() {
+ for( int i = 0; i < 10; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i ) );
+ }
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSONObj(), BSON( "_id" << 1 ) );
+
+ for( int i = 0; i < 10; ++i, advance() ) {
+ ASSERT( ok() );
+ ASSERT_EQUALS( i, current().getIntField( "_id" ) );
+ }
+ }
+ };
+
+ class OrderMultiIndex : public Base {
+ public:
+ void run() {
+ for( int i = 0; i < 10; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "a" << 1 ) );
+ }
+ _cli.ensureIndex( ns(), BSON( "_id" << 1 << "a" << 1 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GTE << 0 << "a" << GTE << 0 ), BSON( "_id" << 1 ) );
+
+ for( int i = 0; i < 10; ++i, advance() ) {
+ ASSERT( ok() );
+ ASSERT_EQUALS( i, current().getIntField( "_id" ) );
+ }
+ }
+ };
+
+ class OrderReject : public Base {
+ public:
+ void run() {
+ for( int i = 0; i < 10; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "a" << i % 5 ) );
+ }
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "a" << GTE << 3 ), BSON( "_id" << 1 ) );
+
+ ASSERT( ok() );
+ ASSERT_EQUALS( 3, current().getIntField( "_id" ) );
+ ASSERT( advance() );
+ ASSERT_EQUALS( 4, current().getIntField( "_id" ) );
+ ASSERT( advance() );
+ ASSERT_EQUALS( 8, current().getIntField( "_id" ) );
+ ASSERT( advance() );
+ ASSERT_EQUALS( 9, current().getIntField( "_id" ) );
+ ASSERT( !advance() );
+ }
+ };
+
+ class OrderNatural : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 5 ) );
+ _cli.insert( ns(), BSON( "_id" << 4 ) );
+ _cli.insert( ns(), BSON( "_id" << 6 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << 0 ), BSON( "$natural" << 1 ) );
+
+ ASSERT( ok() );
+ ASSERT_EQUALS( 5, current().getIntField( "_id" ) );
+ ASSERT( advance() );
+ ASSERT_EQUALS( 4, current().getIntField( "_id" ) );
+ ASSERT( advance() );
+ ASSERT_EQUALS( 6, current().getIntField( "_id" ) );
+ ASSERT( !advance() );
+ }
+ };
+
+ class OrderUnindexed : public Base {
+ public:
+ void run() {
+ dblock lk;
+ Client::Context ctx( ns() );
+ ASSERT( !newQueryOptimizerCursor( ns(), BSONObj(), BSON( "a" << 1 ) ).get() );
+ }
+ };
+
+ class RecordedOrderInvalid : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "a" << 1 << "b" << 1 ) );
+ _cli.insert( ns(), BSON( "a" << 2 << "b" << 2 ) );
+ _cli.insert( ns(), BSON( "a" << 3 << "b" << 3 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "b" << 1 ) );
+ ASSERT( _cli.query( ns(), QUERY( "a" << 2 ).sort( "b" ) )->more() );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ shared_ptr<Cursor> c = newQueryOptimizerCursor( ns(), BSON( "a" << 2 ), BSON( "b" << 1 ) );
+ // Check that we are scanning {b:1} not {a:1}.
+ for( int i = 0; i < 3; ++i ) {
+ ASSERT( c->ok() );
+ c->advance();
+ }
+ ASSERT( !c->ok() );
+ }
+ };
+
+ class KillOp : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 << "b" << 1 ) );
+ _cli.insert( ns(), BSON( "_id" << 2 << "b" << 2 ) );
+ _cli.ensureIndex( ns(), BSON( "b" << 1 ) );
+
+ Client::ReadContext ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << 0 << "b" << GT << 0 ) );
+ ASSERT( ok() );
+ cc().curop()->kill();
+ // First advance() call throws, subsequent calls just fail.
+ ASSERT_THROWS( advance(), MsgAssertionException );
+ ASSERT( !advance() );
+ }
+ };
+
+ class KillOpFirstClause : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 << "b" << 1 ) );
+ _cli.insert( ns(), BSON( "_id" << 2 << "b" << 2 ) );
+ _cli.ensureIndex( ns(), BSON( "b" << 1 ) );
+
+ Client::ReadContext ctx( ns() );
+ shared_ptr<Cursor> c = newQueryOptimizerCursor( ns(), BSON( "$or" << BSON_ARRAY( BSON( "_id" << GT << 0 ) << BSON( "b" << GT << 0 ) ) ) );
+ ASSERT( c->ok() );
+ cc().curop()->kill();
+ // First advance() call throws, subsequent calls just fail.
+ ASSERT_THROWS( c->advance(), MsgAssertionException );
+ ASSERT( !c->advance() );
+ }
+ };
+
+ class Nscanned : public Base {
+ public:
+ void run() {
+ for( int i = 0; i < 120; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "a" << i ) );
+ }
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ shared_ptr<Cursor> c = newQueryOptimizerCursor( ns(), BSON( "_id" << GTE << 0 << "a" << GTE << 0 ) );
+ ASSERT( c->ok() );
+ ASSERT_EQUALS( 2, c->nscanned() );
+ c->advance();
+ ASSERT( c->ok() );
+ ASSERT_EQUALS( 2, c->nscanned() );
+ c->advance();
+ for( int i = 3; i < 222; ++i ) {
+ ASSERT( c->ok() );
+ c->advance();
+ }
+ ASSERT( !c->ok() );
+ }
+ };
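+
+ // nscanned() starts at 2 above presumably because the two candidate plans
+ // ({_id:1} and {a:1}) each scan one entry before the first result is
+ // returned; the aggregate then grows by about one per plan per advance until
+ // takeover.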
+
+ /* Test 'touching earlier iterate' without doc modifications. */
+ class TouchEarlierIterate : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 << "b" << 1 ) );
+ _cli.insert( ns(), BSON( "_id" << 2 << "b" << 2 ) );
+ _cli.ensureIndex( ns(), BSON( "b" << 1 ) );
+
+ Client::ReadContext ctx( ns() );
+ shared_ptr<Cursor> c = newQueryOptimizerCursor( ns(), BSON( "_id" << GT << 0 << "b" << GT << 0 ) );
+
+ ASSERT( c->ok() );
+ while( c->ok() ) {
+ DiskLoc loc = c->currLoc();
+ BSONObj obj = c->current();
+ c->prepareToTouchEarlierIterate();
+ c->recoverFromTouchingEarlierIterate();
+ ASSERT( loc == c->currLoc() );
+ ASSERT_EQUALS( obj, c->current() );
+ c->advance();
+ }
+ }
+ };
+
+ /* Test 'touching earlier iterate' with doc modifications. */
+ class TouchEarlierIterateDelete : public Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 << "b" << 1 ) );
+ _cli.insert( ns(), BSON( "_id" << 2 << "b" << 2 ) );
+ _cli.ensureIndex( ns(), BSON( "b" << 1 ) );
+
+ DiskLoc firstLoc;
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << 0 << "b" << GT << 0 ) );
+ ASSERT( ok() );
+ firstLoc = currLoc();
+ ASSERT( c()->advance() );
+ prepareToTouchEarlierIterate();
+
+ _cli.remove( ns(), BSON( "_id" << 1 ), true );
+
+ recoverFromTouchingEarlierIterate();
+ ASSERT( ok() );
+ while( ok() ) {
+ ASSERT( firstLoc != currLoc() );
+ c()->advance();
+ }
+ }
+ };
+
+ /* Test 'touch earlier iterate' with several doc modifications. */
+ class TouchEarlierIterateDeleteMultiple : public Base {
+ public:
+ void run() {
+ for( int i = 1; i < 10; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "b" << i ) );
+ }
+ _cli.ensureIndex( ns(), BSON( "b" << 1 ) );
+
+ set<DiskLoc> deleted;
+ int id = 0;
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << 0 << "b" << GT << 0 ) );
+ while( 1 ) {
+ if ( !ok() ) {
+ break;
+ }
+ ASSERT( deleted.count( currLoc() ) == 0 );
+ id = current()["_id"].Int();
+ deleted.insert( currLoc() );
+ c()->advance();
+ prepareToTouchEarlierIterate();
+
+ _cli.remove( ns(), BSON( "_id" << id ), true );
+
+ recoverFromTouchingEarlierIterate();
+ }
+ ASSERT_EQUALS( 9U, deleted.size() );
+ }
+ };
+
+ /* Test 'touch earlier iterate' with takeover. */
+ class TouchEarlierIterateTakeover : public Base {
+ public:
+ void run() {
+ for( int i = 1; i < 600; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "b" << i ) );
+ }
+ _cli.ensureIndex( ns(), BSON( "b" << 1 ) );
+
+ Client::ReadContext ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << 0 << "b" << GT << 0 ) );
+
+ ASSERT( ok() );
+ int count = 1;
+ while( ok() ) {
+ DiskLoc loc = currLoc();
+ BSONObj obj = current();
+ prepareToTouchEarlierIterate();
+ recoverFromTouchingEarlierIterate();
+ ASSERT( loc == currLoc() );
+ ASSERT_EQUALS( obj, current() );
+ count += mayReturnCurrent();
+ c()->advance();
+ }
+ ASSERT_EQUALS( 599, count );
+ }
+ };
+
+ /* Test 'touch earlier iterate' with takeover and deletes. */
+ class TouchEarlierIterateTakeoverDeleteMultiple : public Base {
+ public:
+ void run() {
+ for( int i = 1; i < 600; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "b" << i ) );
+ }
+ _cli.ensureIndex( ns(), BSON( "b" << 1 ) );
+
+ set<DiskLoc> deleted;
+ int id = 0;
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursorWithoutAdvancing( BSON( "_id" << GT << 0 << "b" << GT << 0 ) );
+ while( 1 ) {
+ if ( !ok() ) {
+ break;
+ }
+ ASSERT( deleted.count( currLoc() ) == 0 );
+ id = current()["_id"].Int();
+ ASSERT( c()->currentMatches() );
+ ASSERT( !c()->getsetdup( currLoc() ) );
+ deleted.insert( currLoc() );
+ c()->advance();
+ prepareToTouchEarlierIterate();
+
+ _cli.remove( ns(), BSON( "_id" << id ), true );
+
+ recoverFromTouchingEarlierIterate();
+ }
+ ASSERT_EQUALS( 599U, deleted.size() );
+ }
+ };
+
+        /* Test 'touch earlier iterate' with unindexed cursor takeover and deletes. */
+ class TouchEarlierIterateUnindexedTakeoverDeleteMultiple : public Base {
+ public:
+ void run() {
+ for( int i = 1; i < 600; ++i ) {
+ _cli.insert( ns(), BSON( "a" << BSON_ARRAY( i << i+1 ) << "b" << BSON_ARRAY( i << i+1 ) << "_id" << i ) );
+ }
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "b" << 1 ) );
+
+ set<DiskLoc> deleted;
+ int id = 0;
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursorWithoutAdvancing( BSON( "a" << GT << 0 << "b" << GT << 0 ) );
+ while( 1 ) {
+ if ( !ok() ) {
+ break;
+ }
+ ASSERT( deleted.count( currLoc() ) == 0 );
+ id = current()["_id"].Int();
+ ASSERT( c()->currentMatches() );
+ ASSERT( !c()->getsetdup( currLoc() ) );
+ deleted.insert( currLoc() );
+ c()->advance();
+ prepareToTouchEarlierIterate();
+
+ _cli.remove( ns(), BSON( "_id" << id ), true );
+
+ recoverFromTouchingEarlierIterate();
+ }
+ ASSERT_EQUALS( 599U, deleted.size() );
+ }
+ };
+
+ /* Test 'touch earlier iterate' with takeover and deletes, with multiple advances in a row. */
+ class TouchEarlierIterateTakeoverDeleteMultipleMultiAdvance : public Base {
+ public:
+ void run() {
+ for( int i = 1; i < 600; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "b" << i ) );
+ }
+ _cli.ensureIndex( ns(), BSON( "b" << 1 ) );
+
+ set<DiskLoc> deleted;
+ int id = 0;
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "_id" << GT << 0 << "b" << GT << 0 ) );
+ while( 1 ) {
+ if ( !ok() ) {
+ break;
+ }
+ ASSERT( deleted.count( currLoc() ) == 0 );
+ id = current()["_id"].Int();
+ ASSERT( c()->currentMatches() );
+ deleted.insert( currLoc() );
+ advance();
+ prepareToTouchEarlierIterate();
+
+ _cli.remove( ns(), BSON( "_id" << id ), true );
+
+ recoverFromTouchingEarlierIterate();
+ }
+ ASSERT_EQUALS( 599U, deleted.size() );
+ }
+ };
+
+ /* Test yield recovery failure of component capped cursor. */
+ class InitialCappedWrapYieldRecoveryFailure : public Base {
+ public:
+ void run() {
+ _cli.createCollection( ns(), 1000, true );
+ _cli.insert( ns(), BSON( "_id" << 1 << "x" << 1 ) );
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "x" << GT << 0 ) );
+ ASSERT_EQUALS( 1, current().getIntField( "x" ) );
+
+ ClientCursor::CleanupPointer p;
+ p.reset( new ClientCursor( QueryOption_NoCursorTimeout, c(), ns() ) );
+ ClientCursor::YieldData yieldData;
+ p->prepareToYield( yieldData );
+
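+                // Insert until the capped collection wraps and overwrites the document the
+                // cursor is positioned at; yield recovery should then fail.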
+ int x = 2;
+ while( _cli.count( ns(), BSON( "x" << 1 ) ) > 0 ) {
+ _cli.insert( ns(), BSON( "_id" << x << "x" << x ) );
+ ++x;
+ }
+
+ // TODO - Might be preferable to return false rather than assert here.
+ ASSERT_THROWS( ClientCursor::recoverFromYield( yieldData ), AssertionException );
+ }
+ };
+
+ /* Test yield recovery failure of takeover capped cursor. */
+ class TakeoverCappedWrapYieldRecoveryFailure : public Base {
+ public:
+ void run() {
+ _cli.createCollection( ns(), 10000, true );
+ for( int i = 0; i < 300; ++i ) {
+ _cli.insert( ns(), BSON( "_id" << i << "x" << i ) );
+ }
+
+ ClientCursor::CleanupPointer p;
+ ClientCursor::YieldData yieldData;
+ {
+ dblock lk;
+ Client::Context ctx( ns() );
+ setQueryOptimizerCursor( BSON( "x" << GTE << 0 ) );
+ for( int i = 0; i < 299; ++i ) {
+ advance();
+ }
+ ASSERT_EQUALS( 299, current().getIntField( "x" ) );
+
+ p.reset( new ClientCursor( QueryOption_NoCursorTimeout, c(), ns() ) );
+ p->prepareToYield( yieldData );
+ }
+
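+            // Wrap the capped collection so the yielded cursor's current document is overwritten.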
+ int i = 300;
+ while( _cli.count( ns(), BSON( "x" << 299 ) ) > 0 ) {
+ _cli.insert( ns(), BSON( "_id" << i << "x" << i ) );
+ ++i;
+ }
+
+ dblock lk;
+ Client::Context ctx( ns() );
+ ASSERT( !ClientCursor::recoverFromYield( yieldData ) );
+ }
+ };
+
+ namespace GetCursor {
+
+ class Base : public QueryOptimizerCursorTests::Base {
+ public:
+ Base() {
+ // create collection
+ _cli.insert( ns(), BSON( "_id" << 5 ) );
+ }
+ virtual ~Base() {}
+ void run() {
+ dblock lk;
+ Client::Context ctx( ns() );
+ bool simpleEqualityMatch;
+ if ( expectException() ) {
+ ASSERT_THROWS( NamespaceDetailsTransient::getCursor( ns(), query(), order(), requireIndex(), &simpleEqualityMatch ), MsgAssertionException );
+ return;
+ }
+ shared_ptr<Cursor> c = NamespaceDetailsTransient::getCursor( ns(), query(), order(), requireIndex(), &simpleEqualityMatch );
+ ASSERT_EQUALS( expectSimpleEquality(), simpleEqualityMatch );
+ string type = c->toString().substr( 0, expectedType().length() );
+ ASSERT_EQUALS( expectedType(), type );
+ check( c );
+ }
+ protected:
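+                // Hooks overridden by subclasses to define the query, the expected cursor type,
+                // and the result checks.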
+ virtual string expectedType() const { return "TESTDUMMY"; }
+ virtual bool expectException() const { return false; }
+ virtual bool expectSimpleEquality() const { return false; }
+ virtual BSONObj query() const { return BSONObj(); }
+ virtual BSONObj order() const { return BSONObj(); }
+ virtual bool requireIndex() const { return false; }
+ virtual void check( const shared_ptr<Cursor> &c ) {
+ ASSERT( c->ok() );
+ ASSERT( !c->matcher() );
+ ASSERT_EQUALS( 5, c->current().getIntField( "_id" ) );
+ ASSERT( !c->advance() );
+ }
+ };
+
+ class NoConstraints : public Base {
+ string expectedType() const { return "BasicCursor"; }
+ };
+
+ class SimpleId : public Base {
+ public:
+ SimpleId() {
+ _cli.insert( ns(), BSON( "_id" << 0 ) );
+ _cli.insert( ns(), BSON( "_id" << 10 ) );
+ }
+ string expectedType() const { return "BtreeCursor _id_"; }
+ BSONObj query() const { return BSON( "_id" << 5 ); }
+ };
+
+ class OptimalIndex : public Base {
+ public:
+ OptimalIndex() {
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+ _cli.insert( ns(), BSON( "a" << 5 ) );
+ _cli.insert( ns(), BSON( "a" << 6 ) );
+ }
+ string expectedType() const { return "BtreeCursor a_1"; }
+ BSONObj query() const { return BSON( "a" << GTE << 5 ); }
+ void check( const shared_ptr<Cursor> &c ) {
+ ASSERT( c->ok() );
+ ASSERT( c->matcher() );
+ ASSERT_EQUALS( 5, c->current().getIntField( "a" ) );
+ ASSERT( c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( c->advance() );
+ ASSERT_EQUALS( 6, c->current().getIntField( "a" ) );
+ ASSERT( c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT( !c->advance() );
+ }
+ };
+
+ class SimpleKeyMatch : public Base {
+ public:
+ SimpleKeyMatch() {
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+ _cli.update( ns(), BSONObj(), BSON( "$set" << BSON( "a" << true ) ) );
+ }
+ string expectedType() const { return "BtreeCursor a_1"; }
+ bool expectSimpleEquality() const { return true; }
+ BSONObj query() const { return BSON( "a" << true ); }
+ virtual void check( const shared_ptr<Cursor> &c ) {
+ ASSERT( c->ok() );
+ ASSERT_EQUALS( 5, c->current().getIntField( "_id" ) );
+ ASSERT( !c->advance() );
+ }
+ };
+
+ class Geo : public Base {
+ public:
+ Geo() {
+ _cli.insert( ns(), BSON( "_id" << 44 << "loc" << BSON_ARRAY( 44 << 45 ) ) );
+ _cli.ensureIndex( ns(), BSON( "loc" << "2d" ) );
+ }
+ string expectedType() const { return "GeoSearchCursor"; }
+ BSONObj query() const { return fromjson( "{ loc : { $near : [50,50] } }" ); }
+ void check( const shared_ptr<Cursor> &c ) {
+ ASSERT( c->ok() );
+ ASSERT( c->matcher() );
+ ASSERT( c->matcher()->matchesCurrent( c.get() ) );
+ ASSERT_EQUALS( 44, c->current().getIntField( "_id" ) );
+ ASSERT( !c->advance() );
+ }
+ };
+
+ class OutOfOrder : public QueryOptimizerCursorTests::Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 5 ) );
+ dblock lk;
+ Client::Context ctx( ns() );
+ shared_ptr<Cursor> c = NamespaceDetailsTransient::getCursor( ns(), BSONObj(), BSON( "b" << 1 ) );
+ ASSERT( !c );
+ }
+ };
+
+ class BestSavedOutOfOrder : public QueryOptimizerCursorTests::Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 5 << "b" << BSON_ARRAY( 1 << 2 << 3 << 4 << 5 ) ) );
+ _cli.insert( ns(), BSON( "_id" << 1 << "b" << 6 ) );
+ _cli.ensureIndex( ns(), BSON( "b" << 1 ) );
+                // Record the {_id:1} index for this query.
+ ASSERT( _cli.query( ns(), QUERY( "_id" << GT << 0 << "b" << GT << 0 ).sort( "b" ) )->more() );
+ dblock lk;
+ Client::Context ctx( ns() );
+ shared_ptr<Cursor> c = NamespaceDetailsTransient::getCursor( ns(), BSON( "_id" << GT << 0 << "b" << GT << 0 ), BSON( "b" << 1 ) );
+ // {_id:1} requires scan and order, so {b:1} must be chosen.
+ ASSERT( c );
+ ASSERT_EQUALS( 5, c->current().getIntField( "_id" ) );
+ }
+ };
+
+        /**
+         * If an optimal plan is a candidate, return a cursor for it rather than a QueryOptimizerCursor.
+         * Optimal plans are not cached: a simple cursor will not save a plan anyway (so in the
+         * most common case an optimal plan would never be cached), and not caching them simplifies
+         * the implementation for selecting a simple cursor.
+         */
+ class BestSavedOptimal : public QueryOptimizerCursorTests::Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "_id" << 1 << "q" << 1 ) );
+ // {_id:1} index not recorded for these queries since it is an optimal index.
+ ASSERT( _cli.query( ns(), QUERY( "_id" << GT << 0 ) )->more() );
+ ASSERT( _cli.query( ns(), QUERY( "$or" << BSON_ARRAY( BSON( "_id" << GT << 0 ) ) ) )->more() );
+ dblock lk;
+ Client::Context ctx( ns() );
+ // Check that no plan was recorded for this query.
+ ASSERT( BSONObj().woCompare( NamespaceDetailsTransient::get_inlock( ns() ).indexForPattern( FieldRangeSet( ns(), BSON( "_id" << GT << 0 ), true ).pattern() ) ) == 0 );
+ shared_ptr<Cursor> c = NamespaceDetailsTransient::getCursor( ns(), BSON( "_id" << GT << 0 ) );
+ // No need for query optimizer cursor since the plan is optimal.
+ ASSERT_EQUALS( "BtreeCursor _id_", c->toString() );
+ }
+ };
+
+        /** If no optimal plan is a candidate, a QueryOptimizerCursor should be returned, even if a plan has been recorded. */
+ class BestSavedNotOptimal : public QueryOptimizerCursorTests::Base {
+ public:
+ void run() {
+ _cli.insert( ns(), BSON( "_id" << 1 << "q" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "q" << 1 ) );
+ // Record {_id:1} index for this query
+ ASSERT( _cli.query( ns(), QUERY( "q" << 1 << "_id" << 1 ) )->more() );
+ dblock lk;
+ Client::Context ctx( ns() );
+ ASSERT( BSON( "_id" << 1 ).woCompare( NamespaceDetailsTransient::get_inlock( ns() ).indexForPattern( FieldRangeSet( ns(), BSON( "q" << 1 << "_id" << 1 ), true ).pattern() ) ) == 0 );
+ shared_ptr<Cursor> c = NamespaceDetailsTransient::getCursor( ns(), BSON( "q" << 1 << "_id" << 1 ) );
+ // Need query optimizer cursor since the cached plan is not optimal.
+ ASSERT_EQUALS( "QueryOptimizerCursor", c->toString() );
+ }
+ };
+
+ class MultiIndex : public Base {
+ public:
+ MultiIndex() {
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+ }
+ string expectedType() const { return "QueryOptimizerCursor"; }
+ BSONObj query() const { return BSON( "_id" << GT << 0 << "a" << GT << 0 ); }
+ void check( const shared_ptr<Cursor> &c ) {}
+ };
+
+ class RequireIndexNoConstraints : public Base {
+ bool requireIndex() const { return true; }
+ bool expectException() const { return true; }
+ };
+
+ class RequireIndexSimpleId : public Base {
+ bool requireIndex() const { return true; }
+ string expectedType() const { return "BtreeCursor _id_"; }
+ BSONObj query() const { return BSON( "_id" << 5 ); }
+ };
+
+ class RequireIndexUnindexedQuery : public Base {
+ bool requireIndex() const { return true; }
+ bool expectException() const { return true; }
+ BSONObj query() const { return BSON( "a" << GTE << 5 ); }
+ };
+
+ class RequireIndexIndexedQuery : public Base {
+ public:
+ RequireIndexIndexedQuery() {
+ _cli.insert( ns(), BSON( "_id" << 6 << "a" << 6 << "c" << 4 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 << "b" << 1 << "c" << 1 ) );
+ }
+ string expectedType() const { return "QueryOptimizerCursor"; }
+ bool requireIndex() const { return true; }
+ BSONObj query() const { return BSON( "a" << GTE << 5 << "c" << 4 ); }
+ void check( const shared_ptr<Cursor> &c ) {
+ ASSERT( c->ok() );
+ ASSERT( c->matcher() );
+ ASSERT_EQUALS( 6, c->current().getIntField( "_id" ) );
+ ASSERT( !c->advance() );
+ }
+ };
+
+ class RequireIndexSecondOrClauseIndexed : public Base {
+ public:
+ RequireIndexSecondOrClauseIndexed() {
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "b" << 1 ) );
+ _cli.insert( ns(), BSON( "a" << 1 ) );
+ _cli.insert( ns(), BSON( "b" << 1 ) );
+ }
+ bool requireIndex() const { return true; }
+ string expectedType() const { return "QueryOptimizerCursor"; }
+ BSONObj query() const { return fromjson( "{$or:[{a:1},{b:1}]}" ); }
+ void check( const shared_ptr<Cursor> &c ) {
+ ASSERT( c->ok() );
+ ASSERT( c->matcher() );
+ ASSERT( c->advance() );
+ ASSERT( !c->advance() ); // 2 matches exactly
+ }
+ };
+
+ class RequireIndexSecondOrClauseUnindexed : public Base {
+ public:
+ RequireIndexSecondOrClauseUnindexed() {
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+ _cli.insert( ns(), BSON( "a" << 1 ) );
+ }
+ bool requireIndex() const { return true; }
+ bool expectException() const { return true; }
+ BSONObj query() const { return fromjson( "{$or:[{a:1},{b:1}]}" ); }
+ };
+
+ class RequireIndexSecondOrClauseUnindexedUndetected : public Base {
+ public:
+ RequireIndexSecondOrClauseUnindexedUndetected() {
+ _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
+ _cli.ensureIndex( ns(), BSON( "a" << 1 << "b" << 1 ) );
+ _cli.insert( ns(), BSON( "a" << 1 ) );
+ _cli.insert( ns(), BSON( "b" << 1 ) );
+ }
+ bool requireIndex() const { return true; }
+ string expectedType() const { return "QueryOptimizerCursor"; }
+ BSONObj query() const { return fromjson( "{$or:[{a:1},{b:1}]}" ); }
+ void check( const shared_ptr<Cursor> &c ) {
+ ASSERT( c->ok() );
+ ASSERT( c->matcher() );
+ // An unindexed cursor is required for the second clause, but is not allowed.
+ ASSERT_THROWS( c->advance(), MsgAssertionException );
+ }
+ };
+
+ } // namespace GetCursor
+
+ class All : public Suite {
+ public:
+ All() : Suite( "queryoptimizercursor" ) {}
+
+ void setupTests() {
+ __forceLinkGeoPlugin();
+ add<QueryOptimizerCursorTests::CachedMatchCounterCount>();
+ add<QueryOptimizerCursorTests::CachedMatchCounterAccumulate>();
+ add<QueryOptimizerCursorTests::CachedMatchCounterDedup>();
+ add<QueryOptimizerCursorTests::CachedMatchCounterNscanned>();
+ add<QueryOptimizerCursorTests::SmallDupSetUpgrade>();
+ add<QueryOptimizerCursorTests::SmallDupSetUpgradeRead>();
+ add<QueryOptimizerCursorTests::SmallDupSetUpgradeWrite>();
+ add<QueryOptimizerCursorTests::Empty>();
+ add<QueryOptimizerCursorTests::Unindexed>();
+ add<QueryOptimizerCursorTests::Basic>();
+ add<QueryOptimizerCursorTests::NoMatch>();
+ add<QueryOptimizerCursorTests::Interleaved>();
+ add<QueryOptimizerCursorTests::NotMatch>();
+ add<QueryOptimizerCursorTests::StopInterleaving>();
+ add<QueryOptimizerCursorTests::TakeoverWithDup>();
+ add<QueryOptimizerCursorTests::TakeoverWithNonMatches>();
+ add<QueryOptimizerCursorTests::TakeoverWithTakeoverDup>();
+ add<QueryOptimizerCursorTests::BasicOr>();
+ add<QueryOptimizerCursorTests::OrFirstClauseEmpty>();
+ add<QueryOptimizerCursorTests::OrSecondClauseEmpty>();
+ add<QueryOptimizerCursorTests::OrMultipleClausesEmpty>();
+ add<QueryOptimizerCursorTests::TakeoverCountOr>();
+ add<QueryOptimizerCursorTests::TakeoverEndOfOrClause>();
+ add<QueryOptimizerCursorTests::TakeoverBeforeEndOfOrClause>();
+ add<QueryOptimizerCursorTests::TakeoverAfterEndOfOrClause>();
+ add<QueryOptimizerCursorTests::ManualMatchingDeduping>();
+ add<QueryOptimizerCursorTests::ManualMatchingUsingCurrKey>();
+ add<QueryOptimizerCursorTests::ManualMatchingDedupingTakeover>();
+ add<QueryOptimizerCursorTests::Singlekey>();
+ add<QueryOptimizerCursorTests::Multikey>();
+ add<QueryOptimizerCursorTests::AddOtherPlans>();
+ add<QueryOptimizerCursorTests::AddOtherPlansDelete>();
+ add<QueryOptimizerCursorTests::AddOtherPlansContinuousDelete>();
+ add<QueryOptimizerCursorTests::OrRangeElimination>();
+ add<QueryOptimizerCursorTests::OrDedup>();
+ add<QueryOptimizerCursorTests::EarlyDups>();
+ add<QueryOptimizerCursorTests::OrPopInTakeover>();
+ add<QueryOptimizerCursorTests::OrCollectionScanAbort>();
+ add<QueryOptimizerCursorTests::YieldNoOp>();
+ add<QueryOptimizerCursorTests::YieldDelete>();
+ add<QueryOptimizerCursorTests::YieldDeleteContinue>();
+ add<QueryOptimizerCursorTests::YieldDeleteContinueFurther>();
+ add<QueryOptimizerCursorTests::YieldUpdate>();
+ add<QueryOptimizerCursorTests::YieldDrop>();
+ add<QueryOptimizerCursorTests::YieldDropOr>();
+ add<QueryOptimizerCursorTests::YieldRemoveOr>();
+ add<QueryOptimizerCursorTests::YieldCappedOverwrite>();
+ add<QueryOptimizerCursorTests::YieldDropIndex>();
+ add<QueryOptimizerCursorTests::YieldMultiplePlansNoOp>();
+ add<QueryOptimizerCursorTests::YieldMultiplePlansAdvanceNoOp>();
+ add<QueryOptimizerCursorTests::YieldMultiplePlansDelete>();
+ add<QueryOptimizerCursorTests::YieldMultiplePlansDeleteOr>();
+ add<QueryOptimizerCursorTests::YieldMultiplePlansDeleteOrAdvance>();
+ add<QueryOptimizerCursorTests::YieldMultiplePlansCappedOverwrite>();
+ add<QueryOptimizerCursorTests::YieldMultiplePlansCappedOverwriteManual>();
+ add<QueryOptimizerCursorTests::YieldMultiplePlansCappedOverwriteManual2>();
+ add<QueryOptimizerCursorTests::YieldTakeover>();
+ add<QueryOptimizerCursorTests::YieldTakeoverBasic>();
+ add<QueryOptimizerCursorTests::YieldInactiveCursorAdvance>();
+ add<QueryOptimizerCursorTests::OrderId>();
+ add<QueryOptimizerCursorTests::OrderMultiIndex>();
+ add<QueryOptimizerCursorTests::OrderReject>();
+ add<QueryOptimizerCursorTests::OrderNatural>();
+ add<QueryOptimizerCursorTests::OrderUnindexed>();
+ add<QueryOptimizerCursorTests::RecordedOrderInvalid>();
+ add<QueryOptimizerCursorTests::KillOp>();
+ add<QueryOptimizerCursorTests::KillOpFirstClause>();
+ add<QueryOptimizerCursorTests::Nscanned>();
+ add<QueryOptimizerCursorTests::TouchEarlierIterate>();
+ add<QueryOptimizerCursorTests::TouchEarlierIterateDelete>();
+ add<QueryOptimizerCursorTests::TouchEarlierIterateDeleteMultiple>();
+ add<QueryOptimizerCursorTests::TouchEarlierIterateTakeover>();
+ add<QueryOptimizerCursorTests::TouchEarlierIterateTakeoverDeleteMultiple>();
+ add<QueryOptimizerCursorTests::TouchEarlierIterateUnindexedTakeoverDeleteMultiple>();
+ add<QueryOptimizerCursorTests::TouchEarlierIterateTakeoverDeleteMultipleMultiAdvance>();
+ add<QueryOptimizerCursorTests::InitialCappedWrapYieldRecoveryFailure>();
+ add<QueryOptimizerCursorTests::TakeoverCappedWrapYieldRecoveryFailure>();
+ add<QueryOptimizerCursorTests::GetCursor::NoConstraints>();
+ add<QueryOptimizerCursorTests::GetCursor::SimpleId>();
+ add<QueryOptimizerCursorTests::GetCursor::OptimalIndex>();
+ add<QueryOptimizerCursorTests::GetCursor::SimpleKeyMatch>();
+ add<QueryOptimizerCursorTests::GetCursor::Geo>();
+ add<QueryOptimizerCursorTests::GetCursor::OutOfOrder>();
+ add<QueryOptimizerCursorTests::GetCursor::BestSavedOutOfOrder>();
+ add<QueryOptimizerCursorTests::GetCursor::BestSavedOptimal>();
+ add<QueryOptimizerCursorTests::GetCursor::BestSavedNotOptimal>();
+ add<QueryOptimizerCursorTests::GetCursor::MultiIndex>();
+ add<QueryOptimizerCursorTests::GetCursor::RequireIndexNoConstraints>();
+ add<QueryOptimizerCursorTests::GetCursor::RequireIndexSimpleId>();
+ add<QueryOptimizerCursorTests::GetCursor::RequireIndexUnindexedQuery>();
+ add<QueryOptimizerCursorTests::GetCursor::RequireIndexIndexedQuery>();
+ add<QueryOptimizerCursorTests::GetCursor::RequireIndexSecondOrClauseIndexed>();
+ add<QueryOptimizerCursorTests::GetCursor::RequireIndexSecondOrClauseUnindexed>();
+ add<QueryOptimizerCursorTests::GetCursor::RequireIndexSecondOrClauseUnindexedUndetected>();
+ }
+ } myall;
+
+} // namespace QueryOptimizerCursorTests
+
diff --git a/src/mongo/dbtests/queryoptimizertests.cpp b/src/mongo/dbtests/queryoptimizertests.cpp
new file mode 100644
index 00000000000..8da13578b45
--- /dev/null
+++ b/src/mongo/dbtests/queryoptimizertests.cpp
@@ -0,0 +1,1063 @@
+// queryoptimizertests.cpp : query optimizer unit tests
+//
+
+/**
+ * Copyright (C) 2009 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../db/queryoptimizer.h"
+#include "../db/instance.h"
+#include "../db/ops/count.h"
+#include "../db/ops/query.h"
+#include "../db/ops/delete.h"
+#include "dbtests.h"
+
+
+namespace mongo {
+ extern BSONObj id_obj;
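+    // Test helpers: run a query with a locally constructed CurOp, optionally discarding the response.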
+ void runQuery(Message& m, QueryMessage& q, Message &response ) {
+ CurOp op( &(cc()) );
+ op.ensureStarted();
+ runQuery( m , q , op, response );
+ }
+ void runQuery(Message& m, QueryMessage& q ) {
+ Message response;
+ runQuery( m, q, response );
+ }
+ void __forceLinkGeoPlugin();
+} // namespace mongo
+
+namespace QueryOptimizerTests {
+
+ void dropCollection( const char *ns ) {
+ string errmsg;
+ BSONObjBuilder result;
+ dropCollection( ns, errmsg, result );
+ }
+
+ namespace QueryPlanTests {
+
+ using boost::shared_ptr;
+
+ class Base {
+ public:
+ Base() : _ctx( ns() ) , indexNum_( 0 ) {
+ string err;
+ userCreateNS( ns(), BSONObj(), err, false );
+ }
+ ~Base() {
+ if ( !nsd() )
+ return;
+ dropCollection( ns() );
+ }
+ protected:
+ static const char *ns() { return "unittests.QueryPlanTests"; }
+ static NamespaceDetails *nsd() { return nsdetails( ns() ); }
+ IndexDetails *index( const BSONObj &key ) {
+ stringstream ss;
+ ss << indexNum_++;
+ string name = ss.str();
+ client_.resetIndexCache();
+ client_.ensureIndex( ns(), key, false, name.c_str() );
+ NamespaceDetails *d = nsd();
+ for( int i = 0; i < d->nIndexes; ++i ) {
+ if ( d->idx(i).keyPattern() == key /*indexName() == name*/ || ( d->idx(i).isIdIndex() && IndexDetails::isIdIndexPattern( key ) ) )
+ return &d->idx(i);
+ }
+ assert( false );
+ return 0;
+ }
+ int indexno( const BSONObj &key ) {
+ return nsd()->idxNo( *index(key) );
+ }
+ BSONObj startKey( const QueryPlan &p ) const {
+ return p.frv()->startKey();
+ }
+ BSONObj endKey( const QueryPlan &p ) const {
+ return p.frv()->endKey();
+ }
+ private:
+ dblock lk_;
+ Client::Context _ctx;
+ int indexNum_;
+ static DBDirectClient client_;
+ };
+ DBDirectClient Base::client_;
+
+ // There's a limit of 10 indexes total, make sure not to exceed this in a given test.
+#define INDEXNO(x) nsd()->idxNo( *this->index( BSON(x) ) )
+#define INDEX(x) this->index( BSON(x) )
+ auto_ptr< FieldRangeSetPair > FieldRangeSetPair_GLOBAL;
+#define FRSP(x) ( FieldRangeSetPair_GLOBAL.reset( new FieldRangeSetPair( ns(), x ) ), *FieldRangeSetPair_GLOBAL )
+ auto_ptr< FieldRangeSetPair > FieldRangeSetPair_GLOBAL2;
+#define FRSP2(x) ( FieldRangeSetPair_GLOBAL2.reset( new FieldRangeSetPair( ns(), x ) ), FieldRangeSetPair_GLOBAL2.get() )
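+        // FRSP and FRSP2 build a FieldRangeSetPair inline via the comma operator, keeping the
+        // object alive in a global auto_ptr so that a reference (FRSP) or raw pointer (FRSP2)
+        // can be passed straight to a QueryPlan constructor without declaring a local.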
+
+ class NoIndex : public Base {
+ public:
+ void run() {
+ QueryPlan p( nsd(), -1, FRSP( BSONObj() ), FRSP2( BSONObj() ), BSONObj(), BSONObj() );
+ ASSERT( !p.optimal() );
+ ASSERT( !p.scanAndOrderRequired() );
+ ASSERT( !p.exactKeyMatch() );
+ }
+ };
+
+ class SimpleOrder : public Base {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.appendMinKey( "" );
+ BSONObj start = b.obj();
+ BSONObjBuilder b2;
+ b2.appendMaxKey( "" );
+ BSONObj end = b2.obj();
+
+ QueryPlan p( nsd(), INDEXNO( "a" << 1 ), FRSP( BSONObj() ), FRSP2( BSONObj() ), BSONObj(), BSON( "a" << 1 ) );
+ ASSERT( !p.scanAndOrderRequired() );
+ ASSERT( !startKey( p ).woCompare( start ) );
+ ASSERT( !endKey( p ).woCompare( end ) );
+ QueryPlan p2( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FRSP( BSONObj() ), FRSP2( BSONObj() ), BSONObj(), BSON( "a" << 1 << "b" << 1 ) );
+ ASSERT( !p2.scanAndOrderRequired() );
+ QueryPlan p3( nsd(), INDEXNO( "a" << 1 ), FRSP( BSONObj() ), FRSP2( BSONObj() ), BSONObj(), BSON( "b" << 1 ) );
+ ASSERT( p3.scanAndOrderRequired() );
+ ASSERT( !startKey( p3 ).woCompare( start ) );
+ ASSERT( !endKey( p3 ).woCompare( end ) );
+ }
+ };
+
+ class MoreIndexThanNeeded : public Base {
+ public:
+ void run() {
+ QueryPlan p( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FRSP( BSONObj() ), FRSP2( BSONObj() ), BSONObj(), BSON( "a" << 1 ) );
+ ASSERT( !p.scanAndOrderRequired() );
+ }
+ };
+
+ class IndexSigns : public Base {
+ public:
+ void run() {
+ QueryPlan p( nsd(), INDEXNO( "a" << 1 << "b" << -1 ) , FRSP( BSONObj() ), FRSP2( BSONObj() ), BSONObj(), BSON( "a" << 1 << "b" << -1 ) );
+ ASSERT( !p.scanAndOrderRequired() );
+ ASSERT_EQUALS( 1, p.direction() );
+ QueryPlan p2( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FRSP( BSONObj() ), FRSP2( BSONObj() ), BSONObj(), BSON( "a" << 1 << "b" << -1 ) );
+ ASSERT( p2.scanAndOrderRequired() );
+ ASSERT_EQUALS( 0, p2.direction() );
+ QueryPlan p3( nsd(), indexno( id_obj ), FRSP( BSONObj() ), FRSP2( BSONObj() ), BSONObj(), BSON( "_id" << 1 ) );
+ ASSERT( !p3.scanAndOrderRequired() );
+ ASSERT_EQUALS( 1, p3.direction() );
+ }
+ };
+
+ class IndexReverse : public Base {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.appendMinKey( "" );
+ b.appendMaxKey( "" );
+ BSONObj start = b.obj();
+ BSONObjBuilder b2;
+ b2.appendMaxKey( "" );
+ b2.appendMinKey( "" );
+ BSONObj end = b2.obj();
+ QueryPlan p( nsd(), INDEXNO( "a" << -1 << "b" << 1 ),FRSP( BSONObj() ), FRSP2( BSONObj() ), BSONObj(), BSON( "a" << 1 << "b" << -1 ) );
+ ASSERT( !p.scanAndOrderRequired() );
+ ASSERT_EQUALS( -1, p.direction() );
+ ASSERT( !startKey( p ).woCompare( start ) );
+ ASSERT( !endKey( p ).woCompare( end ) );
+ QueryPlan p2( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FRSP( BSONObj() ), FRSP2( BSONObj() ), BSONObj(), BSON( "a" << -1 << "b" << -1 ) );
+ ASSERT( !p2.scanAndOrderRequired() );
+ ASSERT_EQUALS( -1, p2.direction() );
+ QueryPlan p3( nsd(), INDEXNO( "a" << 1 << "b" << -1 ), FRSP( BSONObj() ), FRSP2( BSONObj() ), BSONObj(), BSON( "a" << -1 << "b" << -1 ) );
+ ASSERT( p3.scanAndOrderRequired() );
+ ASSERT_EQUALS( 0, p3.direction() );
+ }
+ };
+
+ class NoOrder : public Base {
+ public:
+ void run() {
+ BSONObjBuilder b;
+ b.append( "", 3 );
+ b.appendMinKey( "" );
+ BSONObj start = b.obj();
+ BSONObjBuilder b2;
+ b2.append( "", 3 );
+ b2.appendMaxKey( "" );
+ BSONObj end = b2.obj();
+ QueryPlan p( nsd(), INDEXNO( "a" << -1 << "b" << 1 ), FRSP( BSON( "a" << 3 ) ), FRSP2( BSON( "a" << 3 ) ), BSON( "a" << 3 ), BSONObj() );
+ ASSERT( !p.scanAndOrderRequired() );
+ ASSERT( !startKey( p ).woCompare( start ) );
+ ASSERT( !endKey( p ).woCompare( end ) );
+ QueryPlan p2( nsd(), INDEXNO( "a" << -1 << "b" << 1 ), FRSP( BSON( "a" << 3 ) ), FRSP2( BSON( "a" << 3 ) ), BSON( "a" << 3 ), BSONObj() );
+ ASSERT( !p2.scanAndOrderRequired() );
+ ASSERT( !startKey( p ).woCompare( start ) );
+ ASSERT( !endKey( p ).woCompare( end ) );
+ }
+ };
+
+ class EqualWithOrder : public Base {
+ public:
+ void run() {
+ QueryPlan p( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FRSP( BSON( "a" << 4 ) ), FRSP2( BSON( "a" << 4 ) ), BSON( "a" << 4 ), BSON( "b" << 1 ) );
+ ASSERT( !p.scanAndOrderRequired() );
+ QueryPlan p2( nsd(), INDEXNO( "a" << 1 << "b" << 1 << "c" << 1 ), FRSP( BSON( "b" << 4 ) ), FRSP2( BSON( "b" << 4 ) ), BSON( "b" << 4 ), BSON( "a" << 1 << "c" << 1 ) );
+ ASSERT( !p2.scanAndOrderRequired() );
+ QueryPlan p3( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FRSP( BSON( "b" << 4 ) ), FRSP2( BSON( "b" << 4 ) ), BSON( "b" << 4 ), BSON( "a" << 1 << "c" << 1 ) );
+ ASSERT( p3.scanAndOrderRequired() );
+ }
+ };
+
+ class Optimal : public Base {
+ public:
+ void run() {
+ QueryPlan p( nsd(), INDEXNO( "a" << 1 ), FRSP( BSONObj() ), FRSP2( BSONObj() ), BSONObj(), BSON( "a" << 1 ) );
+ ASSERT( p.optimal() );
+ QueryPlan p2( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FRSP( BSONObj() ), FRSP2( BSONObj() ), BSONObj(), BSON( "a" << 1 ) );
+ ASSERT( p2.optimal() );
+ QueryPlan p3( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FRSP( BSON( "a" << 1 ) ), FRSP2( BSON( "a" << 1 ) ), BSON( "a" << 1 ), BSON( "a" << 1 ) );
+ ASSERT( p3.optimal() );
+ QueryPlan p4( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FRSP( BSON( "b" << 1 ) ), FRSP2( BSON( "b" << 1 ) ), BSON( "b" << 1 ), BSON( "a" << 1 ) );
+ ASSERT( !p4.optimal() );
+ QueryPlan p5( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FRSP( BSON( "a" << 1 ) ), FRSP2( BSON( "a" << 1 ) ), BSON( "a" << 1 ), BSON( "b" << 1 ) );
+ ASSERT( p5.optimal() );
+ QueryPlan p6( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FRSP( BSON( "b" << 1 ) ), FRSP2( BSON( "b" << 1 ) ), BSON( "b" << 1 ), BSON( "b" << 1 ) );
+ ASSERT( !p6.optimal() );
+ QueryPlan p7( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FRSP( BSON( "a" << 1 << "b" << 1 ) ), FRSP2( BSON( "a" << 1 << "b" << 1 ) ), BSON( "a" << 1 << "b" << 1 ), BSON( "a" << 1 ) );
+ ASSERT( p7.optimal() );
+ QueryPlan p8( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FRSP( BSON( "a" << 1 << "b" << LT << 1 ) ), FRSP2( BSON( "a" << 1 << "b" << LT << 1 ) ), BSON( "a" << 1 << "b" << LT << 1 ), BSON( "a" << 1 ) );
+ ASSERT( p8.optimal() );
+ QueryPlan p9( nsd(), INDEXNO( "a" << 1 << "b" << 1 << "c" << 1 ), FRSP( BSON( "a" << 1 << "b" << LT << 1 ) ), FRSP2( BSON( "a" << 1 << "b" << LT << 1 ) ), BSON( "a" << 1 << "b" << LT << 1 ), BSON( "a" << 1 ) );
+ ASSERT( p9.optimal() );
+ }
+ };
+
+ class MoreOptimal : public Base {
+ public:
+ void run() {
+ QueryPlan p10( nsd(), INDEXNO( "a" << 1 << "b" << 1 << "c" << 1 ), FRSP( BSON( "a" << 1 ) ), FRSP2( BSON( "a" << 1 ) ), BSON( "a" << 1 ), BSONObj() );
+ ASSERT( p10.optimal() );
+ QueryPlan p11( nsd(), INDEXNO( "a" << 1 << "b" << 1 << "c" << 1 ), FRSP( BSON( "a" << 1 << "b" << LT << 1 ) ), FRSP2( BSON( "a" << 1 << "b" << LT << 1 ) ), BSON( "a" << 1 << "b" << LT << 1 ), BSONObj() );
+ ASSERT( p11.optimal() );
+ QueryPlan p12( nsd(), INDEXNO( "a" << 1 << "b" << 1 << "c" << 1 ), FRSP( BSON( "a" << LT << 1 ) ), FRSP2( BSON( "a" << LT << 1 ) ), BSON( "a" << LT << 1 ), BSONObj() );
+ ASSERT( p12.optimal() );
+ QueryPlan p13( nsd(), INDEXNO( "a" << 1 << "b" << 1 << "c" << 1 ), FRSP( BSON( "a" << LT << 1 ) ), FRSP2( BSON( "a" << LT << 1 ) ), BSON( "a" << LT << 1 ), BSON( "a" << 1 ) );
+ ASSERT( p13.optimal() );
+ }
+ };
+
+ class KeyMatch : public Base {
+ public:
+ void run() {
+ QueryPlan p( nsd(), INDEXNO( "a" << 1 ), FRSP( BSONObj() ), FRSP2( BSONObj() ), BSONObj(), BSON( "a" << 1 ) );
+ ASSERT( !p.exactKeyMatch() );
+ QueryPlan p2( nsd(), INDEXNO( "b" << 1 << "a" << 1 ), FRSP( BSONObj() ), FRSP2( BSONObj() ), BSONObj(), BSON( "a" << 1 ) );
+ ASSERT( !p2.exactKeyMatch() );
+ QueryPlan p3( nsd(), INDEXNO( "b" << 1 << "a" << 1 ), FRSP( BSON( "b" << "z" ) ), FRSP2( BSON( "b" << "z" ) ), BSON( "b" << "z" ), BSON( "a" << 1 ) );
+ ASSERT( !p3.exactKeyMatch() );
+ QueryPlan p4( nsd(), INDEXNO( "b" << 1 << "a" << 1 << "c" << 1 ), FRSP( BSON( "c" << "y" << "b" << "z" ) ), FRSP2( BSON( "c" << "y" << "b" << "z" ) ), BSON( "c" << "y" << "b" << "z" ), BSON( "a" << 1 ) );
+ ASSERT( !p4.exactKeyMatch() );
+ QueryPlan p5( nsd(), INDEXNO( "b" << 1 << "a" << 1 << "c" << 1 ), FRSP( BSON( "c" << "y" << "b" << "z" ) ), FRSP2( BSON( "c" << "y" << "b" << "z" ) ), BSON( "c" << "y" << "b" << "z" ), BSONObj() );
+ ASSERT( !p5.exactKeyMatch() );
+ QueryPlan p6( nsd(), INDEXNO( "b" << 1 << "a" << 1 << "c" << 1 ), FRSP( BSON( "c" << LT << "y" << "b" << GT << "z" ) ), FRSP2( BSON( "c" << LT << "y" << "b" << GT << "z" ) ), BSON( "c" << LT << "y" << "b" << GT << "z" ), BSONObj() );
+ ASSERT( !p6.exactKeyMatch() );
+ QueryPlan p7( nsd(), INDEXNO( "b" << 1 ), FRSP( BSONObj() ), FRSP2( BSONObj() ), BSONObj(), BSON( "a" << 1 ) );
+ ASSERT( !p7.exactKeyMatch() );
+ QueryPlan p8( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FRSP( BSON( "b" << "y" << "a" << "z" ) ), FRSP2( BSON( "b" << "y" << "a" << "z" ) ), BSON( "b" << "y" << "a" << "z" ), BSONObj() );
+ ASSERT( p8.exactKeyMatch() );
+ QueryPlan p9( nsd(), INDEXNO( "a" << 1 ), FRSP( BSON( "a" << "z" ) ), FRSP2( BSON( "a" << "z" ) ), BSON( "a" << "z" ), BSON( "a" << 1 ) );
+ ASSERT( p9.exactKeyMatch() );
+ }
+ };
+
+ class MoreKeyMatch : public Base {
+ public:
+ void run() {
+ QueryPlan p( nsd(), INDEXNO( "a" << 1 ), FRSP( BSON( "a" << "r" << "b" << NE << "q" ) ), FRSP2( BSON( "a" << "r" << "b" << NE << "q" ) ), BSON( "a" << "r" << "b" << NE << "q" ), BSON( "a" << 1 ) );
+ ASSERT( !p.exactKeyMatch() );
+ }
+ };
+
+ class ExactKeyQueryTypes : public Base {
+ public:
+ void run() {
+ QueryPlan p( nsd(), INDEXNO( "a" << 1 ), FRSP( BSON( "a" << "b" ) ), FRSP2( BSON( "a" << "b" ) ), BSON( "a" << "b" ), BSONObj() );
+ ASSERT( p.exactKeyMatch() );
+ QueryPlan p2( nsd(), INDEXNO( "a" << 1 ), FRSP( BSON( "a" << 4 ) ), FRSP2( BSON( "a" << 4 ) ), BSON( "a" << 4 ), BSONObj() );
+ ASSERT( !p2.exactKeyMatch() );
+ QueryPlan p3( nsd(), INDEXNO( "a" << 1 ), FRSP( BSON( "a" << BSON( "c" << "d" ) ) ), FRSP2( BSON( "a" << BSON( "c" << "d" ) ) ), BSON( "a" << BSON( "c" << "d" ) ), BSONObj() );
+ ASSERT( !p3.exactKeyMatch() );
+ BSONObjBuilder b;
+ b.appendRegex( "a", "^ddd" );
+ BSONObj q = b.obj();
+ QueryPlan p4( nsd(), INDEXNO( "a" << 1 ), FRSP( q ), FRSP2( q ), q, BSONObj() );
+ ASSERT( !p4.exactKeyMatch() );
+ QueryPlan p5( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FRSP( BSON( "a" << "z" << "b" << 4 ) ), FRSP2( BSON( "a" << "z" << "b" << 4 ) ), BSON( "a" << "z" << "b" << 4 ), BSONObj() );
+ ASSERT( !p5.exactKeyMatch() );
+ }
+ };
+
+ class Unhelpful : public Base {
+ public:
+ void run() {
+ QueryPlan p( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FRSP( BSON( "b" << 1 ) ), FRSP2( BSON( "b" << 1 ) ), BSON( "b" << 1 ), BSONObj() );
+ ASSERT( !p.range( "a" ).nontrivial() );
+ ASSERT( p.unhelpful() );
+ QueryPlan p2( nsd(), INDEXNO( "a" << 1 << "b" << 1 ), FRSP( BSON( "b" << 1 << "c" << 1 ) ), FRSP2( BSON( "b" << 1 << "c" << 1 ) ), BSON( "b" << 1 << "c" << 1 ), BSON( "a" << 1 ) );
+ ASSERT( !p2.scanAndOrderRequired() );
+ ASSERT( !p2.range( "a" ).nontrivial() );
+ ASSERT( !p2.unhelpful() );
+ QueryPlan p3( nsd(), INDEXNO( "b" << 1 ), FRSP( BSON( "b" << 1 << "c" << 1 ) ), FRSP2( BSON( "b" << 1 << "c" << 1 ) ), BSON( "b" << 1 << "c" << 1 ), BSONObj() );
+ ASSERT( p3.range( "b" ).nontrivial() );
+ ASSERT( !p3.unhelpful() );
+ QueryPlan p4( nsd(), INDEXNO( "b" << 1 << "c" << 1 ), FRSP( BSON( "c" << 1 << "d" << 1 ) ), FRSP2( BSON( "c" << 1 << "d" << 1 ) ), BSON( "c" << 1 << "d" << 1 ), BSONObj() );
+ ASSERT( !p4.range( "b" ).nontrivial() );
+ ASSERT( p4.unhelpful() );
+ }
+ };
+
+ } // namespace QueryPlanTests
+
+ namespace QueryPlanSetTests {
+ class Base {
+ public:
+ Base() : _context( ns() ) {
+ string err;
+ userCreateNS( ns(), BSONObj(), err, false );
+ }
+ virtual ~Base() {
+ if ( !nsd() )
+ return;
+ NamespaceDetailsTransient::get_inlock( ns() ).clearQueryCache();
+ dropCollection( ns() );
+ }
+ static void assembleRequest( const string &ns, BSONObj query, int nToReturn, int nToSkip, BSONObj *fieldsToReturn, int queryOptions, Message &toSend ) {
+ // see query.h for the protocol we are using here.
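+                // OP_QUERY wire format: int32 flags, cstring full collection name, int32 nToSkip,
+                // int32 nToReturn, the query object, then an optional field selector object.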
+ BufBuilder b;
+ int opts = queryOptions;
+ b.appendNum(opts);
+ b.appendStr(ns);
+ b.appendNum(nToSkip);
+ b.appendNum(nToReturn);
+ query.appendSelfToBufBuilder(b);
+ if ( fieldsToReturn )
+ fieldsToReturn->appendSelfToBufBuilder(b);
+ toSend.setData(dbQuery, b.buf(), b.len());
+ }
+ protected:
+ static const char *ns() { return "unittests.QueryPlanSetTests"; }
+ static NamespaceDetails *nsd() { return nsdetails( ns() ); }
+ private:
+ dblock lk_;
+ Client::Context _context;
+ };
+
+ class NoIndexes : public Base {
+ public:
+ void run() {
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), BSON( "a" << 4 ) ) );
+ auto_ptr< FieldRangeSetPair > frspOrig( new FieldRangeSetPair( *frsp ) );
+ QueryPlanSet s( ns(), frsp, frspOrig, BSON( "a" << 4 ), BSON( "b" << 1 ) );
+ ASSERT_EQUALS( 1, s.nPlans() );
+ }
+ };
+
+ class Optimal : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" );
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "b_2" );
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), BSON( "a" << 4 ) ) );
+ auto_ptr< FieldRangeSetPair > frspOrig( new FieldRangeSetPair( *frsp ) );
+ QueryPlanSet s( ns(), frsp, frspOrig, BSON( "a" << 4 ), BSONObj() );
+ ASSERT_EQUALS( 1, s.nPlans() );
+ }
+ };
+
+ class NoOptimal : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" );
+ Helpers::ensureIndex( ns(), BSON( "b" << 1 ), false, "b_1" );
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), BSON( "a" << 4 ) ) );
+ auto_ptr< FieldRangeSetPair > frspOrig( new FieldRangeSetPair( *frsp ) );
+ QueryPlanSet s( ns(), frsp, frspOrig, BSON( "a" << 4 ), BSON( "b" << 1 ) );
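+                // Three candidate plans are expected: the a_1 index for the {a:4} range, the
+                // b_1 index for the requested order, and a natural order scan.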
+ ASSERT_EQUALS( 3, s.nPlans() );
+ }
+ };
+
+ class NoSpec : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" );
+ Helpers::ensureIndex( ns(), BSON( "b" << 1 ), false, "b_1" );
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), BSONObj() ) );
+ auto_ptr< FieldRangeSetPair > frspOrig( new FieldRangeSetPair( *frsp ) );
+ QueryPlanSet s( ns(), frsp, frspOrig, BSONObj(), BSONObj() );
+ ASSERT_EQUALS( 1, s.nPlans() );
+ }
+ };
+
+ class HintSpec : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" );
+ Helpers::ensureIndex( ns(), BSON( "b" << 1 ), false, "b_1" );
+ BSONObj b = BSON( "hint" << BSON( "a" << 1 ) );
+ BSONElement e = b.firstElement();
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), BSON( "a" << 1 ) ) );
+ auto_ptr< FieldRangeSetPair > frspOrig( new FieldRangeSetPair( *frsp ) );
+ QueryPlanSet s( ns(), frsp, frspOrig, BSON( "a" << 1 ), BSON( "b" << 1 ), true, &e );
+ ASSERT_EQUALS( 1, s.nPlans() );
+ }
+ };
+
+ class HintName : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" );
+ Helpers::ensureIndex( ns(), BSON( "b" << 1 ), false, "b_1" );
+ BSONObj b = BSON( "hint" << "a_1" );
+ BSONElement e = b.firstElement();
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), BSON( "a" << 1 ) ) );
+ auto_ptr< FieldRangeSetPair > frspOrig( new FieldRangeSetPair( *frsp ) );
+ QueryPlanSet s( ns(), frsp, frspOrig, BSON( "a" << 1 ), BSON( "b" << 1 ), true, &e );
+ ASSERT_EQUALS( 1, s.nPlans() );
+ }
+ };
+
+ class NaturalHint : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" );
+ Helpers::ensureIndex( ns(), BSON( "b" << 1 ), false, "b_1" );
+ BSONObj b = BSON( "hint" << BSON( "$natural" << 1 ) );
+ BSONElement e = b.firstElement();
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), BSON( "a" << 1 ) ) );
+ auto_ptr< FieldRangeSetPair > frspOrig( new FieldRangeSetPair( *frsp ) );
+ QueryPlanSet s( ns(), frsp, frspOrig, BSON( "a" << 1 ), BSON( "b" << 1 ), true, &e );
+ ASSERT_EQUALS( 1, s.nPlans() );
+ }
+ };
+
+ class NaturalSort : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" );
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "b_2" );
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), BSON( "a" << 1 ) ) );
+ auto_ptr< FieldRangeSetPair > frspOrig( new FieldRangeSetPair( *frsp ) );
+ QueryPlanSet s( ns(), frsp, frspOrig, BSON( "a" << 1 ), BSON( "$natural" << 1 ) );
+ ASSERT_EQUALS( 1, s.nPlans() );
+ }
+ };
+
+ class BadHint : public Base {
+ public:
+ void run() {
+ BSONObj b = BSON( "hint" << "a_1" );
+ BSONElement e = b.firstElement();
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), BSON( "a" << 1 ) ) );
+ auto_ptr< FieldRangeSetPair > frspOrig( new FieldRangeSetPair( *frsp ) );
+ ASSERT_THROWS( QueryPlanSet s( ns(), frsp, frspOrig, BSON( "a" << 1 ), BSON( "b" << 1 ), true, &e ),
+ AssertionException );
+ }
+ };
+
+ class Count : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" );
+ Helpers::ensureIndex( ns(), BSON( "b" << 1 ), false, "b_1" );
+ string err;
+ ASSERT_EQUALS( 0, runCount( ns(), BSON( "query" << BSON( "a" << 4 ) ), err ) );
+ BSONObj one = BSON( "a" << 1 );
+ BSONObj fourA = BSON( "a" << 4 );
+ BSONObj fourB = BSON( "a" << 4 );
+ theDataFileMgr.insertWithObjMod( ns(), one );
+ ASSERT_EQUALS( 0, runCount( ns(), BSON( "query" << BSON( "a" << 4 ) ), err ) );
+ theDataFileMgr.insertWithObjMod( ns(), fourA );
+ ASSERT_EQUALS( 1, runCount( ns(), BSON( "query" << BSON( "a" << 4 ) ), err ) );
+ theDataFileMgr.insertWithObjMod( ns(), fourB );
+ ASSERT_EQUALS( 2, runCount( ns(), BSON( "query" << BSON( "a" << 4 ) ), err ) );
+ ASSERT_EQUALS( 3, runCount( ns(), BSON( "query" << BSONObj() ), err ) );
+ ASSERT_EQUALS( 3, runCount( ns(), BSON( "query" << BSON( "a" << GT << 0 ) ), err ) );
+ // missing ns
+ ASSERT_EQUALS( -1, runCount( "unittests.missingNS", BSONObj(), err ) );
+ // impossible match
+ ASSERT_EQUALS( 0, runCount( ns(), BSON( "query" << BSON( "a" << GT << 0 << LT << -1 ) ), err ) );
+ }
+ };
+
+ class QueryMissingNs : public Base {
+ public:
+ QueryMissingNs() { log() << "querymissingns starts" << endl; }
+ ~QueryMissingNs() {
+ log() << "end QueryMissingNs" << endl;
+ }
+ void run() {
+ Message m;
+ assembleRequest( "unittests.missingNS", BSONObj(), 0, 0, 0, 0, m );
+ DbMessage d(m);
+ QueryMessage q(d);
+ Message ret;
+ runQuery( m, q, ret );
+ ASSERT_EQUALS( 0, ((QueryResult*)ret.header())->nReturned );
+ }
+
+ };
+
+ class UnhelpfulIndex : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" );
+ Helpers::ensureIndex( ns(), BSON( "b" << 1 ), false, "b_1" );
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), BSON( "a" << 1 << "c" << 2 ) ) );
+ auto_ptr< FieldRangeSetPair > frspOrig( new FieldRangeSetPair( *frsp ) );
+ QueryPlanSet s( ns(), frsp, frspOrig, BSON( "a" << 1 << "c" << 2 ), BSONObj() );
+ ASSERT_EQUALS( 2, s.nPlans() );
+ }
+ };
+
+ class SingleException : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" );
+ Helpers::ensureIndex( ns(), BSON( "b" << 1 ), false, "b_1" );
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), BSON( "a" << 4 ) ) );
+ auto_ptr< FieldRangeSetPair > frspOrig( new FieldRangeSetPair( *frsp ) );
+ QueryPlanSet s( ns(), frsp, frspOrig, BSON( "a" << 4 ), BSON( "b" << 1 ) );
+ ASSERT_EQUALS( 3, s.nPlans() );
+ bool threw = false;
+ auto_ptr< TestOp > t( new TestOp( true, threw ) );
+ boost::shared_ptr< TestOp > done = s.runOp( *t );
+ ASSERT( threw );
+ ASSERT( done->complete() );
+ ASSERT( done->exception().empty() );
+ ASSERT( !done->error() );
+ }
+ private:
+ class TestOp : public QueryOp {
+ public:
+ TestOp( bool iThrow, bool &threw ) : iThrow_( iThrow ), threw_( threw ), i_(), youThrow_( false ) {}
+ virtual void _init() {}
+ virtual void next() {
+ if ( iThrow_ )
+ threw_ = true;
+ massert( 10408 , "throw", !iThrow_ );
+ if ( ++i_ > 10 )
+ setComplete();
+ }
+ virtual QueryOp *_createChild() const {
+ QueryOp *op = new TestOp( youThrow_, threw_ );
+ youThrow_ = !youThrow_;
+ return op;
+ }
+ virtual bool mayRecordPlan() const { return true; }
+ virtual long long nscanned() { return 0; }
+ private:
+ bool iThrow_;
+ bool &threw_;
+ int i_;
+ mutable bool youThrow_;
+ };
+ };
+
+ class AllException : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" );
+ Helpers::ensureIndex( ns(), BSON( "b" << 1 ), false, "b_1" );
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), BSON( "a" << 4 ) ) );
+ auto_ptr< FieldRangeSetPair > frspOrig( new FieldRangeSetPair( *frsp ) );
+ QueryPlanSet s( ns(), frsp, frspOrig, BSON( "a" << 4 ), BSON( "b" << 1 ) );
+ ASSERT_EQUALS( 3, s.nPlans() );
+ auto_ptr< TestOp > t( new TestOp() );
+ boost::shared_ptr< TestOp > done = s.runOp( *t );
+ ASSERT( !done->complete() );
+ ASSERT_EQUALS( "throw", done->exception().msg );
+ ASSERT( done->error() );
+ }
+ private:
+ class TestOp : public QueryOp {
+ public:
+ virtual void _init() {}
+ virtual void next() {
+ massert( 10409 , "throw", false );
+ }
+ virtual QueryOp *_createChild() const {
+ return new TestOp();
+ }
+ virtual bool mayRecordPlan() const { return true; }
+ virtual long long nscanned() { return 0; }
+ };
+ };
+
+ class SaveGoodIndex : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" );
+ Helpers::ensureIndex( ns(), BSON( "b" << 1 ), false, "b_1" );
+ // No best plan - all must be tried.
+ nPlans( 3 );
+ runQuery();
+ // Best plan selected by query.
+ nPlans( 1 );
+ nPlans( 1 );
+ Helpers::ensureIndex( ns(), BSON( "c" << 1 ), false, "c_1" );
+ // Best plan cleared when new index added.
+ nPlans( 3 );
+ runQuery();
+ // Best plan selected by query.
+ nPlans( 1 );
+
+ {
+ DBDirectClient client;
+ for( int i = 0; i < 334; ++i ) {
+ client.insert( ns(), BSON( "i" << i ) );
+ client.update( ns(), QUERY( "i" << i ), BSON( "i" << i + 1 ) );
+ client.remove( ns(), BSON( "i" << i + 1 ) );
+ }
+ }
+                // Best plan cleared after roughly 1000 writes (334 iterations of insert + update + remove).
+ nPlans( 3 );
+
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), BSON( "a" << 4 ) ) );
+ auto_ptr< FieldRangeSetPair > frspOrig( new FieldRangeSetPair( *frsp ) );
+ QueryPlanSet s( ns(), frsp, frspOrig, BSON( "a" << 4 ), BSON( "b" << 1 ) );
+ NoRecordTestOp original;
+ s.runOp( original );
+ // NoRecordTestOp doesn't record a best plan (test cases where mayRecordPlan() is false).
+ nPlans( 3 );
+
+ BSONObj hint = fromjson( "{hint:{$natural:1}}" );
+ BSONElement hintElt = hint.firstElement();
+ auto_ptr< FieldRangeSetPair > frsp2( new FieldRangeSetPair( ns(), BSON( "a" << 4 ) ) );
+ auto_ptr< FieldRangeSetPair > frspOrig2( new FieldRangeSetPair( *frsp2 ) );
+ QueryPlanSet s2( ns(), frsp2, frspOrig2, BSON( "a" << 4 ), BSON( "b" << 1 ), true, &hintElt );
+ TestOp newOriginal;
+ s2.runOp( newOriginal );
+ // No plan recorded when a hint is used.
+ nPlans( 3 );
+
+ auto_ptr< FieldRangeSetPair > frsp3( new FieldRangeSetPair( ns(), BSON( "a" << 4 ), true ) );
+ auto_ptr< FieldRangeSetPair > frspOrig3( new FieldRangeSetPair( *frsp3 ) );
+ QueryPlanSet s3( ns(), frsp3, frspOrig3, BSON( "a" << 4 ), BSON( "b" << 1 << "c" << 1 ) );
+ TestOp newerOriginal;
+ s3.runOp( newerOriginal );
+ // Plan recorded was for a different query pattern (different sort spec).
+ nPlans( 3 );
+
+ // Best plan still selected by query after all these other tests.
+ runQuery();
+ nPlans( 1 );
+ }
+ private:
+ void nPlans( int n ) {
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), BSON( "a" << 4 ) ) );
+ auto_ptr< FieldRangeSetPair > frspOrig( new FieldRangeSetPair( *frsp ) );
+ QueryPlanSet s( ns(), frsp, frspOrig, BSON( "a" << 4 ), BSON( "b" << 1 ) );
+ ASSERT_EQUALS( n, s.nPlans() );
+ }
+ void runQuery() {
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), BSON( "a" << 4 ) ) );
+ auto_ptr< FieldRangeSetPair > frspOrig( new FieldRangeSetPair( *frsp ) );
+ QueryPlanSet s( ns(), frsp, frspOrig, BSON( "a" << 4 ), BSON( "b" << 1 ) );
+ TestOp original;
+ s.runOp( original );
+ }
+ class TestOp : public QueryOp {
+ public:
+ virtual void _init() {}
+ virtual void next() {
+ setComplete();
+ }
+ virtual QueryOp *_createChild() const {
+ return new TestOp();
+ }
+ virtual bool mayRecordPlan() const { return true; }
+ virtual long long nscanned() { return 0; }
+ };
+ class NoRecordTestOp : public TestOp {
+ virtual bool mayRecordPlan() const { return false; }
+ virtual QueryOp *_createChild() const { return new NoRecordTestOp(); }
+ };
+ };
+
+ class TryAllPlansOnErr : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" );
+
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), BSON( "a" << 4 ) ) );
+ auto_ptr< FieldRangeSetPair > frspOrig( new FieldRangeSetPair( *frsp ) );
+ QueryPlanSet s( ns(), frsp, frspOrig, BSON( "a" << 4 ), BSON( "b" << 1 ) );
+ ScanOnlyTestOp op;
+ s.runOp( op );
+ pair< BSONObj, long long > best = QueryUtilIndexed::bestIndexForPatterns( s.frsp(), BSON( "b" << 1 ) );
+ ASSERT( fromjson( "{$natural:1}" ).woCompare( best.first ) == 0 );
+ ASSERT_EQUALS( 1, best.second );
+
+ auto_ptr< FieldRangeSetPair > frsp2( new FieldRangeSetPair( ns(), BSON( "a" << 4 ) ) );
+ auto_ptr< FieldRangeSetPair > frspOrig2( new FieldRangeSetPair( *frsp2 ) );
+ QueryPlanSet s2( ns(), frsp2, frspOrig2, BSON( "a" << 4 ), BSON( "b" << 1 ) );
+ TestOp op2;
+ ASSERT( s2.runOp( op2 )->complete() );
+ }
+ private:
+ class TestOp : public QueryOp {
+ public:
+ TestOp() {}
+ virtual void _init() {}
+ virtual void next() {
+ if ( qp().indexKey().firstElementFieldName() == string( "$natural" ) )
+ massert( 10410 , "throw", false );
+ setComplete();
+ }
+ virtual QueryOp *_createChild() const {
+ return new TestOp();
+ }
+ virtual bool mayRecordPlan() const { return true; }
+ virtual long long nscanned() { return 1; }
+ };
+ class ScanOnlyTestOp : public TestOp {
+ virtual void next() {
+ if ( qp().indexKey().firstElement().fieldName() == string( "$natural" ) )
+ setComplete();
+ massert( 10411 , "throw", false );
+ }
+ virtual QueryOp *_createChild() const {
+ return new ScanOnlyTestOp();
+ }
+ };
+ };
+
+ class FindOne : public Base {
+ public:
+ void run() {
+ BSONObj one = BSON( "a" << 1 );
+ theDataFileMgr.insertWithObjMod( ns(), one );
+ BSONObj result;
+ ASSERT( Helpers::findOne( ns(), BSON( "a" << 1 ), result ) );
+ ASSERT_THROWS( Helpers::findOne( ns(), BSON( "a" << 1 ), result, true ), AssertionException );
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" );
+ ASSERT( Helpers::findOne( ns(), BSON( "a" << 1 ), result, true ) );
+ }
+ };
+
+ class Delete : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" );
+ for( int i = 0; i < 200; ++i ) {
+ BSONObj two = BSON( "a" << 2 );
+ theDataFileMgr.insertWithObjMod( ns(), two );
+ }
+ BSONObj one = BSON( "a" << 1 );
+ theDataFileMgr.insertWithObjMod( ns(), one );
+ BSONObj delSpec = BSON( "a" << 1 << "_id" << NE << 0 );
+ deleteObjects( ns(), delSpec, false );
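+                // The delete should record the a_1 index for this query pattern, having scanned
+                // a single document.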
+ ASSERT( BSON( "a" << 1 ).woCompare( NamespaceDetailsTransient::get_inlock( ns() ).indexForPattern( FieldRangeSet( ns(), delSpec, true ).pattern() ) ) == 0 );
+ ASSERT_EQUALS( 1, NamespaceDetailsTransient::get_inlock( ns() ).nScannedForPattern( FieldRangeSet( ns(), delSpec, true ).pattern() ) );
+ }
+ };
+
+ class DeleteOneScan : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "_id" << 1 ), false, "_id_1" );
+ BSONObj one = BSON( "_id" << 3 << "a" << 1 );
+ BSONObj two = BSON( "_id" << 2 << "a" << 1 );
+ BSONObj three = BSON( "_id" << 1 << "a" << -1 );
+ theDataFileMgr.insertWithObjMod( ns(), one );
+ theDataFileMgr.insertWithObjMod( ns(), two );
+ theDataFileMgr.insertWithObjMod( ns(), three );
+ deleteObjects( ns(), BSON( "_id" << GT << 0 << "a" << GT << 0 ), true );
+ for( boost::shared_ptr<Cursor> c = theDataFileMgr.findAll( ns() ); c->ok(); c->advance() )
+ ASSERT( 3 != c->current().getIntField( "_id" ) );
+ }
+ };
+
+ class DeleteOneIndex : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a" );
+ BSONObj one = BSON( "a" << 2 << "_id" << 0 );
+ BSONObj two = BSON( "a" << 1 << "_id" << 1 );
+ BSONObj three = BSON( "a" << 0 << "_id" << 2 );
+ theDataFileMgr.insertWithObjMod( ns(), one );
+ theDataFileMgr.insertWithObjMod( ns(), two );
+ theDataFileMgr.insertWithObjMod( ns(), three );
+ deleteObjects( ns(), BSON( "a" << GTE << 0 ), true );
+ for( boost::shared_ptr<Cursor> c = theDataFileMgr.findAll( ns() ); c->ok(); c->advance() )
+ ASSERT( 2 != c->current().getIntField( "_id" ) );
+ }
+ };
+
+ class TryOtherPlansBeforeFinish : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" );
+ for( int i = 0; i < 100; ++i ) {
+ for( int j = 0; j < 2; ++j ) {
+ BSONObj temp = BSON( "a" << 100 - i - 1 << "b" << i );
+ theDataFileMgr.insertWithObjMod( ns(), temp );
+ }
+ }
+ Message m;
+                // Need to return at least 2 records to cause the plan to be recorded.
+ assembleRequest( ns(), QUERY( "b" << 0 << "a" << GTE << 0 ).obj, 2, 0, 0, 0, m );
+ stringstream ss;
+ {
+ DbMessage d(m);
+ QueryMessage q(d);
+ runQuery( m, q);
+ }
+ ASSERT( BSON( "$natural" << 1 ).woCompare( NamespaceDetailsTransient::get_inlock( ns() ).indexForPattern( FieldRangeSet( ns(), BSON( "b" << 0 << "a" << GTE << 0 ), true ).pattern() ) ) == 0 );
+
+ Message m2;
+ assembleRequest( ns(), QUERY( "b" << 99 << "a" << GTE << 0 ).obj, 2, 0, 0, 0, m2 );
+ {
+ DbMessage d(m2);
+ QueryMessage q(d);
+ runQuery( m2, q);
+ }
+ ASSERT( BSON( "a" << 1 ).woCompare( NamespaceDetailsTransient::get_inlock( ns() ).indexForPattern( FieldRangeSet( ns(), BSON( "b" << 0 << "a" << GTE << 0 ), true ).pattern() ) ) == 0 );
+ ASSERT_EQUALS( 3, NamespaceDetailsTransient::get_inlock( ns() ).nScannedForPattern( FieldRangeSet( ns(), BSON( "b" << 0 << "a" << GTE << 0 ), true ).pattern() ) );
+ }
+ };
+
+ class InQueryIntervals : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" );
+ for( int i = 0; i < 10; ++i ) {
+ BSONObj temp = BSON( "a" << i );
+ theDataFileMgr.insertWithObjMod( ns(), temp );
+ }
+ BSONObj hint = fromjson( "{$hint:{a:1}}" );
+ BSONElement hintElt = hint.firstElement();
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), fromjson( "{a:{$in:[2,3,6,9,11]}}" ) ) );
+ auto_ptr< FieldRangeSetPair > frspOrig( new FieldRangeSetPair( *frsp ) );
+ QueryPlanSet s( ns(), frsp, frspOrig, fromjson( "{a:{$in:[2,3,6,9,11]}}" ), BSONObj(), true, &hintElt );
+ QueryPlan qp( nsd(), 1, s.frsp(), s.originalFrsp(), fromjson( "{a:{$in:[2,3,6,9,11]}}" ), BSONObj() );
+ boost::shared_ptr<Cursor> c = qp.newCursor();
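+                // Only the $in values with matching documents (a ranges over 0..9) are returned; 11 has none.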
+ double expected[] = { 2, 3, 6, 9 };
+ for( int i = 0; i < 4; ++i, c->advance() ) {
+ ASSERT_EQUALS( expected[ i ], c->current().getField( "a" ).number() );
+ }
+ ASSERT( !c->ok() );
+
+ // now check reverse
+ {
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), fromjson( "{a:{$in:[2,3,6,9,11]}}" ) ) );
+ auto_ptr< FieldRangeSetPair > frspOrig( new FieldRangeSetPair( *frsp ) );
+ QueryPlanSet s( ns(), frsp, frspOrig, fromjson( "{a:{$in:[2,3,6,9,11]}}" ), BSON( "a" << -1 ), true, &hintElt );
+ QueryPlan qp( nsd(), 1, s.frsp(), s.originalFrsp(), fromjson( "{a:{$in:[2,3,6,9,11]}}" ), BSON( "a" << -1 ) );
+ boost::shared_ptr<Cursor> c = qp.newCursor();
+ double expected[] = { 9, 6, 3, 2 };
+ for( int i = 0; i < 4; ++i, c->advance() ) {
+ ASSERT_EQUALS( expected[ i ], c->current().getField( "a" ).number() );
+ }
+ ASSERT( !c->ok() );
+ }
+ }
+ };
+
+ class EqualityThenIn : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 << "b" << 1 ), false, "a_1_b_1" );
+ for( int i = 0; i < 10; ++i ) {
+ BSONObj temp = BSON( "a" << 5 << "b" << i );
+ theDataFileMgr.insertWithObjMod( ns(), temp );
+ }
+ BSONObj hint = fromjson( "{$hint:{a:1,b:1}}" );
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), fromjson( "{a:5,b:{$in:[2,3,6,9,11]}}" ) ) );
+ QueryPlan qp( nsd(), 1, *frsp, frsp.get(), fromjson( "{a:5,b:{$in:[2,3,6,9,11]}}" ), BSONObj() );
+ boost::shared_ptr<Cursor> c = qp.newCursor();
+ double expected[] = { 2, 3, 6, 9 };
+ ASSERT( c->ok() );
+ for( int i = 0; i < 4; ++i, c->advance() ) {
+ ASSERT( c->ok() );
+ ASSERT_EQUALS( expected[ i ], c->current().getField( "b" ).number() );
+ }
+ ASSERT( !c->ok() );
+ }
+ };
+
+ class NotEqualityThenIn : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 << "b" << 1 ), false, "a_1_b_1" );
+ for( int i = 0; i < 10; ++i ) {
+ BSONObj temp = BSON( "a" << 5 << "b" << i );
+ theDataFileMgr.insertWithObjMod( ns(), temp );
+ }
+ BSONObj hint = fromjson( "{$hint:{a:1,b:1}}" );
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), fromjson( "{a:{$gte:5},b:{$in:[2,3,6,9,11]}}" ) ) );
+ QueryPlan qp( nsd(), 1, *frsp, frsp.get(), fromjson( "{a:{$gte:5},b:{$in:[2,3,6,9,11]}}" ), BSONObj() );
+ boost::shared_ptr<Cursor> c = qp.newCursor();
+ int matches[] = { 2, 3, 6, 9 };
+ for( int i = 0; i < 4; ++i, c->advance() ) {
+ ASSERT_EQUALS( matches[ i ], c->current().getField( "b" ).number() );
+ }
+ ASSERT( !c->ok() );
+ }
+ };
+
+ /** Exclude special plan candidate if there are btree plan candidates. SERVER-4531 */
+ class ExcludeSpecialPlanWhenBtreePlan : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << "2d" ), false, "a_2d" );
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" );
+ BSONObj query = BSON( "a" << BSON_ARRAY( 0 << 0 ) << "b" << 1 );
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), query ) );
+ auto_ptr< FieldRangeSetPair > frspOrig( new FieldRangeSetPair( *frsp ) );
+ QueryPlanSet s( ns(), frsp, frspOrig, query, BSONObj() );
+ // Two query plans, btree and collection scan.
+ ASSERT_EQUALS( 2, s.nPlans() );
+ // Not the geo plan.
+ ASSERT( s.firstPlan()->special().empty() );
+ }
+ };
+
+ /** Exclude unindexed plan candidate if there is a special plan candidate. SERVER-4531 */
+ class ExcludeUnindexedPlanWhenSpecialPlan : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << "2d" ), false, "a_2d" );
+ BSONObj query = BSON( "a" << BSON_ARRAY( 0 << 0 ) << "b" << 1 );
+ auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), query ) );
+ auto_ptr< FieldRangeSetPair > frspOrig( new FieldRangeSetPair( *frsp ) );
+ QueryPlanSet s( ns(), frsp, frspOrig, query, BSONObj() );
+ // Single query plan.
+ ASSERT_EQUALS( 1, s.nPlans() );
+ // It's the geo plan.
+ ASSERT( !s.firstPlan()->special().empty() );
+ }
+ };
+
+ } // namespace QueryPlanSetTests
+
+ class Base {
+ public:
+ Base() : _ctx( ns() ) {
+ string err;
+ userCreateNS( ns(), BSONObj(), err, false );
+ }
+ ~Base() {
+ if ( !nsd() )
+ return;
+ string s( ns() );
+ dropCollection( ns() );
+ }
+ protected:
+ static const char *ns() { return "unittests.QueryOptimizerTests"; }
+ static NamespaceDetails *nsd() { return nsdetails( ns() ); }
+ private:
+ dblock lk_;
+ Client::Context _ctx;
+ };
+
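+    /** bestGuessCursor() should pick an index matching the requested sort, even when a different cached plan is registered for the query pattern. */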
+ class BestGuess : public Base {
+ public:
+ void run() {
+ Helpers::ensureIndex( ns(), BSON( "a" << 1 ), false, "a_1" );
+ Helpers::ensureIndex( ns(), BSON( "b" << 1 ), false, "b_1" );
+ BSONObj temp = BSON( "a" << 1 );
+ theDataFileMgr.insertWithObjMod( ns(), temp );
+ temp = BSON( "b" << 1 );
+ theDataFileMgr.insertWithObjMod( ns(), temp );
+
+ boost::shared_ptr< Cursor > c = bestGuessCursor( ns(), BSON( "b" << 1 ), BSON( "a" << 1 ) );
+ ASSERT_EQUALS( string( "a" ), c->indexKeyPattern().firstElement().fieldName() );
+ c = bestGuessCursor( ns(), BSON( "a" << 1 ), BSON( "b" << 1 ) );
+ ASSERT_EQUALS( string( "b" ), c->indexKeyPattern().firstElementFieldName() );
+ boost::shared_ptr< MultiCursor > m = dynamic_pointer_cast< MultiCursor >( bestGuessCursor( ns(), fromjson( "{b:1,$or:[{z:1}]}" ), BSON( "a" << 1 ) ) );
+ ASSERT_EQUALS( string( "a" ), m->sub_c()->indexKeyPattern().firstElement().fieldName() );
+ m = dynamic_pointer_cast< MultiCursor >( bestGuessCursor( ns(), fromjson( "{a:1,$or:[{y:1}]}" ), BSON( "b" << 1 ) ) );
+ ASSERT_EQUALS( string( "b" ), m->sub_c()->indexKeyPattern().firstElementFieldName() );
+
+ FieldRangeSet frs( "ns", BSON( "a" << 1 ), true );
+ {
+ SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
+ NamespaceDetailsTransient::get_inlock( ns() ).registerIndexForPattern( frs.pattern( BSON( "b" << 1 ) ), BSON( "a" << 1 ), 0 );
+ }
+ m = dynamic_pointer_cast< MultiCursor >( bestGuessCursor( ns(), fromjson( "{a:1,$or:[{y:1}]}" ), BSON( "b" << 1 ) ) );
+ ASSERT_EQUALS( string( "b" ), m->sub_c()->indexKeyPattern().firstElement().fieldName() );
+ }
+ };
+
+ class BestGuessOrSortAssertion : public Base {
+ public:
+ void run() {
+ ASSERT_THROWS( bestGuessCursor( ns(), BSON( "$or" << BSON_ARRAY( BSON( "b" << 1 ) ) ), BSON( "a" << 1 ) ), MsgAssertionException );
+ }
+ };
+
+ class All : public Suite {
+ public:
+ All() : Suite( "queryoptimizer" ) {}
+
+ void setupTests() {
+ __forceLinkGeoPlugin();
+ add<QueryPlanTests::NoIndex>();
+ add<QueryPlanTests::SimpleOrder>();
+ add<QueryPlanTests::MoreIndexThanNeeded>();
+ add<QueryPlanTests::IndexSigns>();
+ add<QueryPlanTests::IndexReverse>();
+ add<QueryPlanTests::NoOrder>();
+ add<QueryPlanTests::EqualWithOrder>();
+ add<QueryPlanTests::Optimal>();
+ add<QueryPlanTests::MoreOptimal>();
+ add<QueryPlanTests::KeyMatch>();
+ add<QueryPlanTests::MoreKeyMatch>();
+ add<QueryPlanTests::ExactKeyQueryTypes>();
+ add<QueryPlanTests::Unhelpful>();
+ add<QueryPlanSetTests::NoIndexes>();
+ add<QueryPlanSetTests::Optimal>();
+ add<QueryPlanSetTests::NoOptimal>();
+ add<QueryPlanSetTests::NoSpec>();
+ add<QueryPlanSetTests::HintSpec>();
+ add<QueryPlanSetTests::HintName>();
+ add<QueryPlanSetTests::NaturalHint>();
+ add<QueryPlanSetTests::NaturalSort>();
+ add<QueryPlanSetTests::BadHint>();
+ add<QueryPlanSetTests::Count>();
+ add<QueryPlanSetTests::QueryMissingNs>();
+ add<QueryPlanSetTests::UnhelpfulIndex>();
+ add<QueryPlanSetTests::SingleException>();
+ add<QueryPlanSetTests::AllException>();
+ add<QueryPlanSetTests::SaveGoodIndex>();
+ add<QueryPlanSetTests::TryAllPlansOnErr>();
+ add<QueryPlanSetTests::FindOne>();
+ add<QueryPlanSetTests::Delete>();
+ add<QueryPlanSetTests::DeleteOneScan>();
+ add<QueryPlanSetTests::DeleteOneIndex>();
+ add<QueryPlanSetTests::TryOtherPlansBeforeFinish>();
+ add<QueryPlanSetTests::InQueryIntervals>();
+ add<QueryPlanSetTests::EqualityThenIn>();
+ add<QueryPlanSetTests::NotEqualityThenIn>();
+ add<QueryPlanSetTests::ExcludeSpecialPlanWhenBtreePlan>();
+ add<QueryPlanSetTests::ExcludeUnindexedPlanWhenSpecialPlan>();
+ add<BestGuess>();
+ add<BestGuessOrSortAssertion>();
+ }
+ } myall;
+
+} // namespace QueryOptimizerTests
+
diff --git a/src/mongo/dbtests/querytests.cpp b/src/mongo/dbtests/querytests.cpp
new file mode 100644
index 00000000000..9416ae20723
--- /dev/null
+++ b/src/mongo/dbtests/querytests.cpp
@@ -0,0 +1,1408 @@
+// querytests.cpp : query.{h,cpp} unit tests.
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../db/ops/query.h"
+#include "../db/dbhelpers.h"
+#include "../db/clientcursor.h"
+
+#include "../db/instance.h"
+#include "../db/json.h"
+#include "../db/lasterror.h"
+
+#include "../util/timer.h"
+
+#include "dbtests.h"
+
+namespace mongo {
+ extern int __findingStartInitialTimeout;
+}
+
+namespace QueryTests {
+
+ class Base {
+ dblock lk;
+ Client::Context _context;
+ public:
+ Base() : _context( ns() ) {
+ addIndex( fromjson( "{\"a\":1}" ) );
+ }
+ ~Base() {
+ try {
+ boost::shared_ptr<Cursor> c = theDataFileMgr.findAll( ns() );
+ vector< DiskLoc > toDelete;
+ for(; c->ok(); c->advance() )
+ toDelete.push_back( c->currLoc() );
+ for( vector< DiskLoc >::iterator i = toDelete.begin(); i != toDelete.end(); ++i )
+ theDataFileMgr.deleteRecord( ns(), i->rec(), *i, false );
+ DBDirectClient cl;
+ cl.dropIndexes( ns() );
+ }
+ catch ( ... ) {
+ FAIL( "Exception while cleaning up collection" );
+ }
+ }
+ protected:
+ static const char *ns() {
+ return "unittests.querytests";
+ }
+ static void addIndex( const BSONObj &key ) {
+ BSONObjBuilder b;
+ b.append( "name", key.firstElementFieldName() );
+ b.append( "ns", ns() );
+ b.append( "key", key );
+ BSONObj o = b.done();
+ stringstream indexNs;
+ indexNs << "unittests.system.indexes";
+ theDataFileMgr.insert( indexNs.str().c_str(), o.objdata(), o.objsize() );
+ }
+ static void insert( const char *s ) {
+ insert( fromjson( s ) );
+ }
+ static void insert( const BSONObj &o ) {
+ theDataFileMgr.insert( ns(), o.objdata(), o.objsize() );
+ }
+ };
+
+ class FindOne : public Base {
+ public:
+ void run() {
+ addIndex( BSON( "b" << 1 ) );
+ addIndex( BSON( "c" << 1 ) );
+ insert( BSON( "b" << 2 << "_id" << 0 ) );
+ insert( BSON( "c" << 3 << "_id" << 1 ) );
+ BSONObj query = fromjson( "{$or:[{b:2},{c:3}]}" );
+ BSONObj ret;
+ // Check findOne() returning object.
+ ASSERT( Helpers::findOne( ns(), query, ret, true ) );
+ ASSERT_EQUALS( string( "b" ), ret.firstElement().fieldName() );
+ // Cross check with findOne() returning location.
+ ASSERT_EQUALS( ret, Helpers::findOne( ns(), query, true ).obj() );
+ }
+ };
+
+ class FindOneRequireIndex : public Base {
+ public:
+ void run() {
+ insert( BSON( "b" << 2 << "_id" << 0 ) );
+ BSONObj query = fromjson( "{b:2}" );
+ BSONObj ret;
+
+ // Check findOne() returning object, allowing unindexed scan.
+ ASSERT( Helpers::findOne( ns(), query, ret, false ) );
+ // Check findOne() returning location, allowing unindexed scan.
+ ASSERT_EQUALS( ret, Helpers::findOne( ns(), query, false ).obj() );
+
+ // Check findOne() returning object, requiring indexed scan without index.
+ ASSERT_THROWS( Helpers::findOne( ns(), query, ret, true ), MsgAssertionException );
+ // Check findOne() returning location, requiring indexed scan without index.
+ ASSERT_THROWS( Helpers::findOne( ns(), query, true ), MsgAssertionException );
+
+ addIndex( BSON( "b" << 1 ) );
+            // Check findOne() returning object, requiring indexed scan with index.
+            ASSERT( Helpers::findOne( ns(), query, ret, true ) );
+            // Check findOne() returning location, requiring indexed scan with index.
+            ASSERT_EQUALS( ret, Helpers::findOne( ns(), query, true ).obj() );
+ }
+ };
+
+ class FindOneEmptyObj : public Base {
+ public:
+ void run() {
+ // We don't normally allow empty objects in the database, but test that we can find
+ // an empty object (one might be allowed inside a reserved namespace at some point).
+ dblock lk;
+ Client::Context ctx( "unittests.querytests" );
+ // Set up security so godinsert command can run.
+ cc().getAuthenticationInfo()->isLocalHost = true;
+ DBDirectClient cl;
+ BSONObj info;
+ ASSERT( cl.runCommand( "unittests", BSON( "godinsert" << "querytests" << "obj" << BSONObj() ), info ) );
+ insert( BSONObj() );
+ BSONObj query;
+ BSONObj ret;
+ ASSERT( Helpers::findOne( ns(), query, ret, false ) );
+ ASSERT( ret.isEmpty() );
+ ASSERT_EQUALS( ret, Helpers::findOne( ns(), query, false ).obj() );
+ }
+ };
+
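+    /** Fixture that operates through a static DBDirectClient and checks for errors via getPrevError(). */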
+ class ClientBase {
+ public:
+ ClientBase() {
+ mongo::lastError.reset( new LastError() );
+ }
+ ~ClientBase() {
+ //mongo::lastError.release();
+ }
+ protected:
+ static void insert( const char *ns, BSONObj o ) {
+ client_.insert( ns, o );
+ }
+        static void update( const char *ns, BSONObj q, BSONObj o, bool upsert = false ) {
+ client_.update( ns, Query( q ), o, upsert );
+ }
+ static bool error() {
+ return !client_.getPrevError().getField( "err" ).isNull();
+ }
+ DBDirectClient &client() const { return client_; }
+
+ static DBDirectClient client_;
+ };
+ DBDirectClient ClientBase::client_;
+
+ class BoundedKey : public ClientBase {
+ public:
+ ~BoundedKey() {
+ client().dropCollection( "unittests.querytests.BoundedKey" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.BoundedKey";
+ insert( ns, BSON( "a" << 1 ) );
+ BSONObjBuilder a;
+ a.appendMaxKey( "$lt" );
+ BSONObj limit = a.done();
+ ASSERT( !client().findOne( ns, QUERY( "a" << limit ) ).isEmpty() );
+ client().ensureIndex( ns, BSON( "a" << 1 ) );
+ ASSERT( !client().findOne( ns, QUERY( "a" << limit ).hint( BSON( "a" << 1 ) ) ).isEmpty() );
+ }
+ };
+
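+    /** getMore continuation: decouple() keeps the server-side cursor alive after the DBClientCursor is destroyed, so it can be resumed by cursor id. */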
+ class GetMore : public ClientBase {
+ public:
+ ~GetMore() {
+ client().dropCollection( "unittests.querytests.GetMore" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.GetMore";
+ insert( ns, BSON( "a" << 1 ) );
+ insert( ns, BSON( "a" << 2 ) );
+ insert( ns, BSON( "a" << 3 ) );
+ auto_ptr< DBClientCursor > cursor = client().query( ns, BSONObj(), 2 );
+ long long cursorId = cursor->getCursorId();
+ cursor->decouple();
+ cursor.reset();
+ cursor = client().getMore( ns, cursorId );
+ ASSERT( cursor->more() );
+ ASSERT_EQUALS( 3, cursor->next().getIntField( "a" ) );
+ }
+ };
+
+ class PositiveLimit : public ClientBase {
+ public:
+ const char* ns;
+ PositiveLimit() : ns("unittests.querytests.PositiveLimit") {}
+ ~PositiveLimit() {
+ client().dropCollection( ns );
+ }
+
+ void testLimit(int limit) {
+ ASSERT_EQUALS(client().query( ns, BSONObj(), limit )->itcount(), limit);
+ }
+ void run() {
+ for(int i=0; i<1000; i++)
+ insert( ns, BSON( GENOID << "i" << i ) );
+
+ ASSERT_EQUALS( client().query(ns, BSONObj(), 1 )->itcount(), 1);
+ ASSERT_EQUALS( client().query(ns, BSONObj(), 10 )->itcount(), 10);
+ ASSERT_EQUALS( client().query(ns, BSONObj(), 101 )->itcount(), 101);
+ ASSERT_EQUALS( client().query(ns, BSONObj(), 999 )->itcount(), 999);
+ ASSERT_EQUALS( client().query(ns, BSONObj(), 1000 )->itcount(), 1000);
+ ASSERT_EQUALS( client().query(ns, BSONObj(), 1001 )->itcount(), 1000);
+ ASSERT_EQUALS( client().query(ns, BSONObj(), 0 )->itcount(), 1000);
+ }
+ };
+
+ class ReturnOneOfManyAndTail : public ClientBase {
+ public:
+ ~ReturnOneOfManyAndTail() {
+ client().dropCollection( "unittests.querytests.ReturnOneOfManyAndTail" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.ReturnOneOfManyAndTail";
+ client().createCollection( ns, 1024, true );
+ insert( ns, BSON( "a" << 0 ) );
+ insert( ns, BSON( "a" << 1 ) );
+ insert( ns, BSON( "a" << 2 ) );
+ auto_ptr< DBClientCursor > c = client().query( ns, QUERY( "a" << GT << 0 ).hint( BSON( "$natural" << 1 ) ), 1, 0, 0, QueryOption_CursorTailable );
+            // If only one result is requested, a server-side cursor is not saved (cursor id 0).
+ ASSERT_EQUALS( 0, c->getCursorId() );
+ ASSERT( c->more() );
+ ASSERT_EQUALS( 1, c->next().getIntField( "a" ) );
+ }
+ };
+
+ class TailNotAtEnd : public ClientBase {
+ public:
+ ~TailNotAtEnd() {
+ client().dropCollection( "unittests.querytests.TailNotAtEnd" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.TailNotAtEnd";
+ client().createCollection( ns, 2047, true );
+ insert( ns, BSON( "a" << 0 ) );
+ insert( ns, BSON( "a" << 1 ) );
+ insert( ns, BSON( "a" << 2 ) );
+ auto_ptr< DBClientCursor > c = client().query( ns, Query().hint( BSON( "$natural" << 1 ) ), 2, 0, 0, QueryOption_CursorTailable );
+ ASSERT( 0 != c->getCursorId() );
+ while( c->more() )
+ c->next();
+ ASSERT( 0 != c->getCursorId() );
+ insert( ns, BSON( "a" << 3 ) );
+ insert( ns, BSON( "a" << 4 ) );
+ insert( ns, BSON( "a" << 5 ) );
+ insert( ns, BSON( "a" << 6 ) );
+ ASSERT( c->more() );
+ ASSERT_EQUALS( 3, c->next().getIntField( "a" ) );
+ }
+ };
+
+ class EmptyTail : public ClientBase {
+ public:
+ ~EmptyTail() {
+ client().dropCollection( "unittests.querytests.EmptyTail" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.EmptyTail";
+ client().createCollection( ns, 1900, true );
+ auto_ptr< DBClientCursor > c = client().query( ns, Query().hint( BSON( "$natural" << 1 ) ), 2, 0, 0, QueryOption_CursorTailable );
+ ASSERT_EQUALS( 0, c->getCursorId() );
+ ASSERT( c->isDead() );
+ insert( ns, BSON( "a" << 0 ) );
+ c = client().query( ns, QUERY( "a" << 1 ).hint( BSON( "$natural" << 1 ) ), 2, 0, 0, QueryOption_CursorTailable );
+ ASSERT( 0 != c->getCursorId() );
+ ASSERT( !c->isDead() );
+ }
+ };
+
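+    /** When cap rollover overwrites the document a tailable cursor is positioned at, the cursor is invalidated (cursor id 0, no more results). */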
+ class TailableDelete : public ClientBase {
+ public:
+ ~TailableDelete() {
+ client().dropCollection( "unittests.querytests.TailableDelete" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.TailableDelete";
+ client().createCollection( ns, 8192, true, 2 );
+ insert( ns, BSON( "a" << 0 ) );
+ insert( ns, BSON( "a" << 1 ) );
+ auto_ptr< DBClientCursor > c = client().query( ns, Query().hint( BSON( "$natural" << 1 ) ), 2, 0, 0, QueryOption_CursorTailable );
+ c->next();
+ c->next();
+ ASSERT( !c->more() );
+ insert( ns, BSON( "a" << 2 ) );
+ insert( ns, BSON( "a" << 3 ) );
+ ASSERT( !c->more() );
+ ASSERT_EQUALS( 0, c->getCursorId() );
+ }
+ };
+
+ class TailableInsertDelete : public ClientBase {
+ public:
+ ~TailableInsertDelete() {
+ client().dropCollection( "unittests.querytests.TailableInsertDelete" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.TailableInsertDelete";
+ client().createCollection( ns, 1330, true );
+ insert( ns, BSON( "a" << 0 ) );
+ insert( ns, BSON( "a" << 1 ) );
+ auto_ptr< DBClientCursor > c = client().query( ns, Query().hint( BSON( "$natural" << 1 ) ), 2, 0, 0, QueryOption_CursorTailable );
+ c->next();
+ c->next();
+ ASSERT( !c->more() );
+ insert( ns, BSON( "a" << 2 ) );
+ client().remove( ns, QUERY( "a" << 1 ) );
+ ASSERT( c->more() );
+ ASSERT_EQUALS( 2, c->next().getIntField( "a" ) );
+ ASSERT( !c->more() );
+ }
+ };
+
+ class TailCappedOnly : public ClientBase {
+ public:
+ ~TailCappedOnly() {
+            client().dropCollection( "unittests.querytests.TailCappedOnly" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.TailCappedOnly";
+ client().insert( ns, BSONObj() );
+ auto_ptr< DBClientCursor > c = client().query( ns, BSONObj(), 0, 0, 0, QueryOption_CursorTailable );
+ ASSERT( c->isDead() );
+ ASSERT( !client().getLastError().empty() );
+ }
+ };
+
+ class TailableQueryOnId : public ClientBase {
+ public:
+ ~TailableQueryOnId() {
+ client().dropCollection( "unittests.querytests.TailableQueryOnId" );
+ }
+
+ void insertA(const char* ns, int a) {
+ BSONObjBuilder b;
+ b.appendOID("_id", 0, true);
+ b.appendOID("value", 0, true);
+ b.append("a", a);
+ insert(ns, b.obj());
+ }
+
+ void run() {
+ const char *ns = "unittests.querytests.TailableQueryOnId";
+ BSONObj info;
+ client().runCommand( "unittests", BSON( "create" << "querytests.TailableQueryOnId" << "capped" << true << "size" << 8192 << "autoIndexId" << true ), info );
+ insertA( ns, 0 );
+ insertA( ns, 1 );
+ auto_ptr< DBClientCursor > c1 = client().query( ns, QUERY( "a" << GT << -1 ), 0, 0, 0, QueryOption_CursorTailable );
+ OID id;
+ id.init("000000000000000000000000");
+ auto_ptr< DBClientCursor > c2 = client().query( ns, QUERY( "value" << GT << id ), 0, 0, 0, QueryOption_CursorTailable );
+ c1->next();
+ c1->next();
+ ASSERT( !c1->more() );
+ c2->next();
+ c2->next();
+ ASSERT( !c2->more() );
+ insertA( ns, 2 );
+ ASSERT( c1->more() );
+ ASSERT_EQUALS( 2, c1->next().getIntField( "a" ) );
+ ASSERT( !c1->more() );
+ ASSERT( c2->more() );
+ ASSERT_EQUALS( 2, c2->next().getIntField( "a" ) ); // SERVER-645
+ ASSERT( !c2->more() );
+ ASSERT( !c2->isDead() );
+ }
+ };
+
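+    /** QueryOption_OplogReplay should return entries starting at the first ts value past the lower bound, and pick up new entries on subsequent queries. */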
+ class OplogReplayMode : public ClientBase {
+ public:
+ ~OplogReplayMode() {
+ client().dropCollection( "unittests.querytests.OplogReplayMode" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.OplogReplayMode";
+ insert( ns, BSON( "ts" << 0 ) );
+ insert( ns, BSON( "ts" << 1 ) );
+ insert( ns, BSON( "ts" << 2 ) );
+ auto_ptr< DBClientCursor > c = client().query( ns, QUERY( "ts" << GT << 1 ).hint( BSON( "$natural" << 1 ) ), 0, 0, 0, QueryOption_OplogReplay );
+ ASSERT( c->more() );
+ ASSERT_EQUALS( 2, c->next().getIntField( "ts" ) );
+ ASSERT( !c->more() );
+
+ insert( ns, BSON( "ts" << 3 ) );
+ c = client().query( ns, QUERY( "ts" << GT << 1 ).hint( BSON( "$natural" << 1 ) ), 0, 0, 0, QueryOption_OplogReplay );
+ ASSERT( c->more() );
+ ASSERT_EQUALS( 2, c->next().getIntField( "ts" ) );
+ ASSERT( c->more() );
+ }
+ };
+
+ class BasicCount : public ClientBase {
+ public:
+ ~BasicCount() {
+ client().dropCollection( "unittests.querytests.BasicCount" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.BasicCount";
+ client().ensureIndex( ns, BSON( "a" << 1 ) );
+ count( 0 );
+ insert( ns, BSON( "a" << 3 ) );
+ count( 0 );
+ insert( ns, BSON( "a" << 4 ) );
+ count( 1 );
+ insert( ns, BSON( "a" << 5 ) );
+ count( 1 );
+ insert( ns, BSON( "a" << 4 ) );
+ count( 2 );
+ }
+ private:
+ void count( unsigned long long c ) const {
+ ASSERT_EQUALS( c, client().count( "unittests.querytests.BasicCount", BSON( "a" << 4 ) ) );
+ }
+ };
+
+ class ArrayId : public ClientBase {
+ public:
+ ~ArrayId() {
+ client().dropCollection( "unittests.querytests.ArrayId" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.ArrayId";
+ client().ensureIndex( ns, BSON( "_id" << 1 ) );
+ ASSERT( !error() );
+ client().insert( ns, fromjson( "{'_id':[1,2]}" ) );
+ ASSERT( error() );
+ }
+ };
+
+ class UnderscoreNs : public ClientBase {
+ public:
+ ~UnderscoreNs() {
+ client().dropCollection( "unittests.querytests._UnderscoreNs" );
+ }
+ void run() {
+ ASSERT( !error() );
+ const char *ns = "unittests.querytests._UnderscoreNs";
+ ASSERT( client().findOne( ns, "{}" ).isEmpty() );
+ client().insert( ns, BSON( "a" << 1 ) );
+ ASSERT_EQUALS( 1, client().findOne( ns, "{}" ).getIntField( "a" ) );
+ ASSERT( !error() );
+ }
+ };
+
+ class EmptyFieldSpec : public ClientBase {
+ public:
+ ~EmptyFieldSpec() {
+ client().dropCollection( "unittests.querytests.EmptyFieldSpec" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.EmptyFieldSpec";
+ client().insert( ns, BSON( "a" << 1 ) );
+ ASSERT( !client().findOne( ns, "" ).isEmpty() );
+ BSONObj empty;
+ ASSERT( !client().findOne( ns, "", &empty ).isEmpty() );
+ }
+ };
+
+ class MultiNe : public ClientBase {
+ public:
+ ~MultiNe() {
+ client().dropCollection( "unittests.querytests.Ne" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.Ne";
+ client().insert( ns, fromjson( "{a:[1,2]}" ) );
+ ASSERT( client().findOne( ns, fromjson( "{a:{$ne:1}}" ) ).isEmpty() );
+ BSONObj spec = fromjson( "{a:{$ne:1,$ne:2}}" );
+ ASSERT( client().findOne( ns, spec ).isEmpty() );
+ }
+ };
+
+ class EmbeddedNe : public ClientBase {
+ public:
+ ~EmbeddedNe() {
+ client().dropCollection( "unittests.querytests.NestedNe" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.NestedNe";
+ client().insert( ns, fromjson( "{a:[{b:1},{b:2}]}" ) );
+ ASSERT( client().findOne( ns, fromjson( "{'a.b':{$ne:1}}" ) ).isEmpty() );
+ }
+ };
+
+ class EmbeddedNumericTypes : public ClientBase {
+ public:
+ ~EmbeddedNumericTypes() {
+ client().dropCollection( "unittests.querytests.NumericEmbedded" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.NumericEmbedded";
+ client().insert( ns, BSON( "a" << BSON ( "b" << 1 ) ) );
+ ASSERT( ! client().findOne( ns, BSON( "a" << BSON ( "b" << 1.0 ) ) ).isEmpty() );
+ client().ensureIndex( ns , BSON( "a" << 1 ) );
+ ASSERT( ! client().findOne( ns, BSON( "a" << BSON ( "b" << 1.0 ) ) ).isEmpty() );
+ }
+ };
+
+ class AutoResetIndexCache : public ClientBase {
+ public:
+ ~AutoResetIndexCache() {
+ client().dropCollection( "unittests.querytests.AutoResetIndexCache" );
+ }
+ static const char *ns() { return "unittests.querytests.AutoResetIndexCache"; }
+ static const char *idxNs() { return "unittests.system.indexes"; }
+ void index() const { ASSERT( !client().findOne( idxNs(), BSON( "name" << NE << "_id_" ) ).isEmpty() ); }
+ void noIndex() const {
+ BSONObj o = client().findOne( idxNs(), BSON( "name" << NE << "_id_" ) );
+ if( !o.isEmpty() ) {
+ cout << o.toString() << endl;
+ ASSERT( false );
+ }
+ }
+ void checkIndex() {
+ client().ensureIndex( ns(), BSON( "a" << 1 ) );
+ index();
+ }
+ void run() {
+ client().dropDatabase( "unittests" );
+ noIndex();
+ checkIndex();
+ client().dropCollection( ns() );
+ noIndex();
+ checkIndex();
+ client().dropDatabase( "unittests" );
+ noIndex();
+ checkIndex();
+ }
+ };
+
+ class UniqueIndex : public ClientBase {
+ public:
+ ~UniqueIndex() {
+ client().dropCollection( "unittests.querytests.UniqueIndex" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.UniqueIndex";
+ client().ensureIndex( ns, BSON( "a" << 1 ), true );
+ client().insert( ns, BSON( "a" << 4 << "b" << 2 ) );
+ client().insert( ns, BSON( "a" << 4 << "b" << 3 ) );
+ ASSERT_EQUALS( 1U, client().count( ns, BSONObj() ) );
+ client().dropCollection( ns );
+ client().ensureIndex( ns, BSON( "b" << 1 ), true );
+ client().insert( ns, BSON( "a" << 4 << "b" << 2 ) );
+ client().insert( ns, BSON( "a" << 4 << "b" << 3 ) );
+ ASSERT_EQUALS( 2U, client().count( ns, BSONObj() ) );
+ }
+ };
+
+ class UniqueIndexPreexistingData : public ClientBase {
+ public:
+ ~UniqueIndexPreexistingData() {
+ client().dropCollection( "unittests.querytests.UniqueIndexPreexistingData" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.UniqueIndexPreexistingData";
+ client().insert( ns, BSON( "a" << 4 << "b" << 2 ) );
+ client().insert( ns, BSON( "a" << 4 << "b" << 3 ) );
+ client().ensureIndex( ns, BSON( "a" << 1 ), true );
+ ASSERT_EQUALS( 0U, client().count( "unittests.system.indexes", BSON( "ns" << ns << "name" << NE << "_id_" ) ) );
+ }
+ };
+
+ class SubobjectInArray : public ClientBase {
+ public:
+ ~SubobjectInArray() {
+ client().dropCollection( "unittests.querytests.SubobjectInArray" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.SubobjectInArray";
+ client().insert( ns, fromjson( "{a:[{b:{c:1}}]}" ) );
+ ASSERT( !client().findOne( ns, BSON( "a.b.c" << 1 ) ).isEmpty() );
+ ASSERT( !client().findOne( ns, fromjson( "{'a.c':null}" ) ).isEmpty() );
+ }
+ };
+
+ class Size : public ClientBase {
+ public:
+ ~Size() {
+ client().dropCollection( "unittests.querytests.Size" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.Size";
+ client().insert( ns, fromjson( "{a:[1,2,3]}" ) );
+ client().ensureIndex( ns, BSON( "a" << 1 ) );
+ ASSERT( client().query( ns, QUERY( "a" << mongo::SIZE << 3 ).hint( BSON( "a" << 1 ) ) )->more() );
+ }
+ };
+
+ class FullArray : public ClientBase {
+ public:
+ ~FullArray() {
+ client().dropCollection( "unittests.querytests.IndexedArray" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.IndexedArray";
+ client().insert( ns, fromjson( "{a:[1,2,3]}" ) );
+ ASSERT( client().query( ns, Query( "{a:[1,2,3]}" ) )->more() );
+ client().ensureIndex( ns, BSON( "a" << 1 ) );
+ ASSERT( client().query( ns, Query( "{a:{$in:[1,[1,2,3]]}}" ).hint( BSON( "a" << 1 ) ) )->more() );
+ ASSERT( client().query( ns, Query( "{a:[1,2,3]}" ).hint( BSON( "a" << 1 ) ) )->more() ); // SERVER-146
+ }
+ };
+
+ class InsideArray : public ClientBase {
+ public:
+ ~InsideArray() {
+ client().dropCollection( "unittests.querytests.InsideArray" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.InsideArray";
+ client().insert( ns, fromjson( "{a:[[1],2]}" ) );
+ check( "$natural" );
+ client().ensureIndex( ns, BSON( "a" << 1 ) );
+ check( "a" ); // SERVER-146
+ }
+ private:
+ void check( const string &hintField ) {
+ const char *ns = "unittests.querytests.InsideArray";
+ ASSERT( client().query( ns, Query( "{a:[[1],2]}" ).hint( BSON( hintField << 1 ) ) )->more() );
+ ASSERT( client().query( ns, Query( "{a:[1]}" ).hint( BSON( hintField << 1 ) ) )->more() );
+ ASSERT( client().query( ns, Query( "{a:2}" ).hint( BSON( hintField << 1 ) ) )->more() );
+ ASSERT( !client().query( ns, Query( "{a:1}" ).hint( BSON( hintField << 1 ) ) )->more() );
+ }
+ };
+
+ class IndexInsideArrayCorrect : public ClientBase {
+ public:
+ ~IndexInsideArrayCorrect() {
+ client().dropCollection( "unittests.querytests.IndexInsideArrayCorrect" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.IndexInsideArrayCorrect";
+ client().insert( ns, fromjson( "{'_id':1,a:[1]}" ) );
+ client().insert( ns, fromjson( "{'_id':2,a:[[1]]}" ) );
+ client().ensureIndex( ns, BSON( "a" << 1 ) );
+ ASSERT_EQUALS( 1, client().query( ns, Query( "{a:[1]}" ).hint( BSON( "a" << 1 ) ) )->next().getIntField( "_id" ) );
+ }
+ };
+
+ class SubobjArr : public ClientBase {
+ public:
+ ~SubobjArr() {
+ client().dropCollection( "unittests.querytests.SubobjArr" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.SubobjArr";
+ client().insert( ns, fromjson( "{a:[{b:[1]}]}" ) );
+ check( "$natural" );
+ client().ensureIndex( ns, BSON( "a" << 1 ) );
+ check( "a" );
+ }
+ private:
+ void check( const string &hintField ) {
+ const char *ns = "unittests.querytests.SubobjArr";
+ ASSERT( client().query( ns, Query( "{'a.b':1}" ).hint( BSON( hintField << 1 ) ) )->more() );
+ ASSERT( client().query( ns, Query( "{'a.b':[1]}" ).hint( BSON( hintField << 1 ) ) )->more() );
+ }
+ };
+
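+    /** min()/max() key bounds on a compound index: the min key is inclusive and the max key is exclusive. */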
+ class MinMax : public ClientBase {
+ public:
+ MinMax() : ns( "unittests.querytests.MinMax" ) {}
+ ~MinMax() {
+ client().dropCollection( "unittests.querytests.MinMax" );
+ }
+ void run() {
+ client().ensureIndex( ns, BSON( "a" << 1 << "b" << 1 ) );
+ client().insert( ns, BSON( "a" << 1 << "b" << 1 ) );
+ client().insert( ns, BSON( "a" << 1 << "b" << 2 ) );
+ client().insert( ns, BSON( "a" << 2 << "b" << 1 ) );
+ client().insert( ns, BSON( "a" << 2 << "b" << 2 ) );
+
+ ASSERT_EQUALS( 4, count( client().query( ns, BSONObj() ) ) );
+ BSONObj hints[] = { BSONObj(), BSON( "a" << 1 << "b" << 1 ) };
+ for( int i = 0; i < 2; ++i ) {
+ check( 0, 0, 3, 3, 4, hints[ i ] );
+ check( 1, 1, 2, 2, 3, hints[ i ] );
+ check( 1, 2, 2, 2, 2, hints[ i ] );
+ check( 1, 2, 2, 1, 1, hints[ i ] );
+
+ auto_ptr< DBClientCursor > c = query( 1, 2, 2, 2, hints[ i ] );
+ BSONObj obj = c->next();
+ ASSERT_EQUALS( 1, obj.getIntField( "a" ) );
+ ASSERT_EQUALS( 2, obj.getIntField( "b" ) );
+ obj = c->next();
+ ASSERT_EQUALS( 2, obj.getIntField( "a" ) );
+ ASSERT_EQUALS( 1, obj.getIntField( "b" ) );
+ ASSERT( !c->more() );
+ }
+ }
+ private:
+ auto_ptr< DBClientCursor > query( int minA, int minB, int maxA, int maxB, const BSONObj &hint ) {
+ Query q;
+ q = q.minKey( BSON( "a" << minA << "b" << minB ) ).maxKey( BSON( "a" << maxA << "b" << maxB ) );
+ if ( !hint.isEmpty() )
+ q.hint( hint );
+ return client().query( ns, q );
+ }
+ void check( int minA, int minB, int maxA, int maxB, int expectedCount, const BSONObj &hint = empty_ ) {
+ ASSERT_EQUALS( expectedCount, count( query( minA, minB, maxA, maxB, hint ) ) );
+ }
+ int count( auto_ptr< DBClientCursor > c ) {
+ int ret = 0;
+ while( c->more() ) {
+ ++ret;
+ c->next();
+ }
+ return ret;
+ }
+ const char *ns;
+ static BSONObj empty_;
+ };
+ BSONObj MinMax::empty_;
+
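+    /** Code and CodeWScope values should each match only their own $type; explain() output verifies the index bounds are tight. */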
+ class MatchCodeCodeWScope : public ClientBase {
+ public:
+ MatchCodeCodeWScope() : _ns( "unittests.querytests.MatchCodeCodeWScope" ) {}
+ ~MatchCodeCodeWScope() {
+ client().dropCollection( "unittests.querytests.MatchCodeCodeWScope" );
+ }
+ void run() {
+ checkMatch();
+ client().ensureIndex( _ns, BSON( "a" << 1 ) );
+ checkMatch();
+ // Use explain queries to check index bounds.
+ {
+ BSONObj explain = client().findOne( _ns, QUERY( "a" << BSON( "$type" << (int)Code ) ).explain() );
+ BSONObjBuilder lower;
+ lower.appendCode( "", "" );
+ BSONObjBuilder upper;
+ upper.appendCodeWScope( "", "", BSONObj() );
+ ASSERT( lower.done().firstElement().valuesEqual( explain[ "indexBounds" ].Obj()[ "a" ].Array()[ 0 ].Array()[ 0 ] ) );
+ ASSERT( upper.done().firstElement().valuesEqual( explain[ "indexBounds" ].Obj()[ "a" ].Array()[ 0 ].Array()[ 1 ] ) );
+ }
+ {
+ BSONObj explain = client().findOne( _ns, QUERY( "a" << BSON( "$type" << (int)CodeWScope ) ).explain() );
+ BSONObjBuilder lower;
+ lower.appendCodeWScope( "", "", BSONObj() );
+ // This upper bound may change if a new bson type is added.
+ BSONObjBuilder upper;
+ upper << "" << BSON( "$maxElement" << 1 );
+ ASSERT( lower.done().firstElement().valuesEqual( explain[ "indexBounds" ].Obj()[ "a" ].Array()[ 0 ].Array()[ 0 ] ) );
+ ASSERT( upper.done().firstElement().valuesEqual( explain[ "indexBounds" ].Obj()[ "a" ].Array()[ 0 ].Array()[ 1 ] ) );
+ }
+ }
+ private:
+ void checkMatch() {
+ client().remove( _ns, BSONObj() );
+
+ client().insert( _ns, code() );
+ client().insert( _ns, codeWScope() );
+
+ ASSERT_EQUALS( 1U, client().count( _ns, code() ) );
+ ASSERT_EQUALS( 1U, client().count( _ns, codeWScope() ) );
+
+ ASSERT_EQUALS( 1U, client().count( _ns, BSON( "a" << BSON( "$type" << (int)Code ) ) ) );
+ ASSERT_EQUALS( 1U, client().count( _ns, BSON( "a" << BSON( "$type" << (int)CodeWScope ) ) ) );
+ }
+ BSONObj code() const {
+ BSONObjBuilder codeBuilder;
+ codeBuilder.appendCode( "a", "return 1;" );
+ return codeBuilder.obj();
+ }
+ BSONObj codeWScope() const {
+ BSONObjBuilder codeWScopeBuilder;
+ codeWScopeBuilder.appendCodeWScope( "a", "return 1;", BSONObj() );
+ return codeWScopeBuilder.obj();
+ }
+ const char *_ns;
+ };
+
+ class MatchDBRefType : public ClientBase {
+ public:
+ MatchDBRefType() : _ns( "unittests.querytests.MatchDBRefType" ) {}
+ ~MatchDBRefType() {
+ client().dropCollection( "unittests.querytests.MatchDBRefType" );
+ }
+ void run() {
+ checkMatch();
+ client().ensureIndex( _ns, BSON( "a" << 1 ) );
+ checkMatch();
+ }
+ private:
+ void checkMatch() {
+ client().remove( _ns, BSONObj() );
+ client().insert( _ns, dbref() );
+ ASSERT_EQUALS( 1U, client().count( _ns, dbref() ) );
+ ASSERT_EQUALS( 1U, client().count( _ns, BSON( "a" << BSON( "$type" << (int)DBRef ) ) ) );
+ }
+ BSONObj dbref() const {
+ BSONObjBuilder b;
+ OID oid;
+ b.appendDBRef( "a", "ns", oid );
+ return b.obj();
+ }
+ const char *_ns;
+ };
+
+ class DirectLocking : public ClientBase {
+ public:
+ void run() {
+ dblock lk;
+ Client::Context ctx( "unittests.DirectLocking" );
+ client().remove( "a.b", BSONObj() );
+ ASSERT_EQUALS( "unittests", cc().database()->name );
+ }
+ const char *ns;
+ };
+
+ class FastCountIn : public ClientBase {
+ public:
+ ~FastCountIn() {
+ client().dropCollection( "unittests.querytests.FastCountIn" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.FastCountIn";
+ client().insert( ns, BSON( "i" << "a" ) );
+ client().ensureIndex( ns, BSON( "i" << 1 ) );
+ ASSERT_EQUALS( 1U, client().count( ns, fromjson( "{i:{$in:['a']}}" ) ) );
+ }
+ };
+
+ class EmbeddedArray : public ClientBase {
+ public:
+ ~EmbeddedArray() {
+ client().dropCollection( "unittests.querytests.EmbeddedArray" );
+ }
+ void run() {
+ const char *ns = "unittests.querytests.EmbeddedArray";
+ client().insert( ns, fromjson( "{foo:{bar:['spam']}}" ) );
+ client().insert( ns, fromjson( "{foo:{bar:['spam','eggs']}}" ) );
+ client().insert( ns, fromjson( "{bar:['spam']}" ) );
+ client().insert( ns, fromjson( "{bar:['spam','eggs']}" ) );
+ ASSERT_EQUALS( 2U, client().count( ns, BSON( "bar" << "spam" ) ) );
+ ASSERT_EQUALS( 2U, client().count( ns, BSON( "foo.bar" << "spam" ) ) );
+ }
+ };
+
+ class DifferentNumbers : public ClientBase {
+ public:
+ ~DifferentNumbers() {
+ client().dropCollection( "unittests.querytests.DifferentNumbers" );
+ }
+ void t( const char * ns ) {
+ auto_ptr< DBClientCursor > cursor = client().query( ns, Query().sort( "7" ) );
+ while ( cursor->more() ) {
+ BSONObj o = cursor->next();
+ assert( o.valid() );
+ //cout << " foo " << o << endl;
+ }
+
+ }
+ void run() {
+ const char *ns = "unittests.querytests.DifferentNumbers";
+ { BSONObjBuilder b; b.append( "7" , (int)4 ); client().insert( ns , b.obj() ); }
+ { BSONObjBuilder b; b.append( "7" , (long long)2 ); client().insert( ns , b.obj() ); }
+ { BSONObjBuilder b; b.appendNull( "7" ); client().insert( ns , b.obj() ); }
+ { BSONObjBuilder b; b.append( "7" , "b" ); client().insert( ns , b.obj() ); }
+ { BSONObjBuilder b; b.appendNull( "8" ); client().insert( ns , b.obj() ); }
+ { BSONObjBuilder b; b.append( "7" , (double)3.7 ); client().insert( ns , b.obj() ); }
+
+ t(ns);
+ client().ensureIndex( ns , BSON( "7" << 1 ) );
+ t(ns);
+ }
+ };
+
+ class CollectionBase : public ClientBase {
+ public:
+
+ CollectionBase( string leaf ) {
+ _ns = "unittests.querytests.";
+ _ns += leaf;
+ client().dropCollection( ns() );
+ }
+
+ virtual ~CollectionBase() {
+ client().dropCollection( ns() );
+ }
+
+ int count() {
+ return (int) client().count( ns() );
+ }
+
+ const char * ns() {
+ return _ns.c_str();
+ }
+
+ private:
+ string _ns;
+ };
+
+ class SymbolStringSame : public CollectionBase {
+ public:
+ SymbolStringSame() : CollectionBase( "symbolstringsame" ) {}
+
+ void run() {
+ { BSONObjBuilder b; b.appendSymbol( "x" , "eliot" ); b.append( "z" , 17 ); client().insert( ns() , b.obj() ); }
+ ASSERT_EQUALS( 17 , client().findOne( ns() , BSONObj() )["z"].number() );
+ {
+ BSONObjBuilder b;
+ b.appendSymbol( "x" , "eliot" );
+ ASSERT_EQUALS( 17 , client().findOne( ns() , b.obj() )["z"].number() );
+ }
+ ASSERT_EQUALS( 17 , client().findOne( ns() , BSON( "x" << "eliot" ) )["z"].number() );
+ client().ensureIndex( ns() , BSON( "x" << 1 ) );
+ ASSERT_EQUALS( 17 , client().findOne( ns() , BSON( "x" << "eliot" ) )["z"].number() );
+ }
+ };
+
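+    /** Tail a capped collection while inserts wrap it; the cursor keeps up while its position is valid and dies once it is overwritten. */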
+ class TailableCappedRaceCondition : public CollectionBase {
+ public:
+
+ TailableCappedRaceCondition() : CollectionBase( "tailablecappedrace" ) {
+ client().dropCollection( ns() );
+ _n = 0;
+ }
+ void run() {
+ string err;
+
+ writelock lk("");
+ Client::Context ctx( "unittests" );
+
+            // Note that extents are always at least 4KB now, so this size will get rounded up a bit.
+ ASSERT( userCreateNS( ns() , fromjson( "{ capped : true , size : 2000 }" ) , err , false ) );
+ for ( int i=0; i<200; i++ ) {
+ insertNext();
+// cout << count() << endl;
+ ASSERT( count() < 90 );
+ }
+
+ int a = count();
+
+ auto_ptr< DBClientCursor > c = client().query( ns() , QUERY( "i" << GT << 0 ).hint( BSON( "$natural" << 1 ) ), 0, 0, 0, QueryOption_CursorTailable );
+ int n=0;
+ while ( c->more() ) {
+ BSONObj z = c->next();
+ n++;
+ }
+
+ ASSERT_EQUALS( a , n );
+
+ insertNext();
+ ASSERT( c->more() );
+
+ for ( int i=0; i<90; i++ ) {
+ insertNext();
+ }
+
+ while ( c->more() ) { c->next(); }
+ ASSERT( c->isDead() );
+ }
+
+ void insertNext() {
+ BSONObjBuilder b;
+ b.appendOID("_id", 0, true);
+ b.append("i", _n++);
+ insert( ns() , b.obj() );
+ }
+
+ int _n;
+ };
+
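+    /** Sanity checks for Helpers::findOne() and Helpers::findById(), plus a rough timing comparison (printed, not asserted). */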
+ class HelperTest : public CollectionBase {
+ public:
+
+ HelperTest() : CollectionBase( "helpertest" ) {
+ }
+
+ void run() {
+ writelock lk("");
+ Client::Context ctx( "unittests" );
+
+ for ( int i=0; i<50; i++ ) {
+ insert( ns() , BSON( "_id" << i << "x" << i * 2 ) );
+ }
+
+ ASSERT_EQUALS( 50 , count() );
+
+ BSONObj res;
+ ASSERT( Helpers::findOne( ns() , BSON( "_id" << 20 ) , res , true ) );
+ ASSERT_EQUALS( 40 , res["x"].numberInt() );
+
+ ASSERT( Helpers::findById( cc(), ns() , BSON( "_id" << 20 ) , res ) );
+ ASSERT_EQUALS( 40 , res["x"].numberInt() );
+
+ ASSERT( ! Helpers::findById( cc(), ns() , BSON( "_id" << 200 ) , res ) );
+
+ unsigned long long slow , fast;
+
+ int n = 10000;
+ DEV n = 1000;
+ {
+ Timer t;
+ for ( int i=0; i<n; i++ ) {
+ ASSERT( Helpers::findOne( ns() , BSON( "_id" << 20 ) , res , true ) );
+ }
+ slow = t.micros();
+ }
+ {
+ Timer t;
+ for ( int i=0; i<n; i++ ) {
+ ASSERT( Helpers::findById( cc(), ns() , BSON( "_id" << 20 ) , res ) );
+ }
+ fast = t.micros();
+ }
+
+ cout << "HelperTest slow:" << slow << " fast:" << fast << endl;
+
+ }
+ };
+
+ class HelperByIdTest : public CollectionBase {
+ public:
+
+ HelperByIdTest() : CollectionBase( "helpertestbyid" ) {
+ }
+
+ void run() {
+ writelock lk("");
+ Client::Context ctx( "unittests" );
+
+ for ( int i=0; i<1000; i++ ) {
+ insert( ns() , BSON( "_id" << i << "x" << i * 2 ) );
+ }
+ for ( int i=0; i<1000; i+=2 ) {
+ client_.remove( ns() , BSON( "_id" << i ) );
+ }
+
+ BSONObj res;
+ for ( int i=0; i<1000; i++ ) {
+ bool found = Helpers::findById( cc(), ns() , BSON( "_id" << i ) , res );
+ ASSERT_EQUALS( i % 2 , int(found) );
+ }
+
+ }
+ };
+
+ class ClientCursorTest : public CollectionBase {
+ ClientCursorTest() : CollectionBase( "clientcursortest" ) {
+ }
+
+ void run() {
+ writelock lk("");
+ Client::Context ctx( "unittests" );
+
+ for ( int i=0; i<1000; i++ ) {
+ insert( ns() , BSON( "_id" << i << "x" << i * 2 ) );
+ }
+
+
+ }
+ };
+
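+    /** Query a wrapped capped collection with QueryOption_OplogReplay for every starting ts bound, checking the first result; __findingStartInitialTimeout is zeroed for the test and restored afterward. */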
+ class FindingStart : public CollectionBase {
+ public:
+ FindingStart() : CollectionBase( "findingstart" ), _old( __findingStartInitialTimeout ) {
+ __findingStartInitialTimeout = 0;
+ }
+ ~FindingStart() {
+ __findingStartInitialTimeout = _old;
+ }
+
+ void run() {
+ BSONObj info;
+ ASSERT( client().runCommand( "unittests", BSON( "create" << "querytests.findingstart" << "capped" << true << "$nExtents" << 5 << "autoIndexId" << false ), info ) );
+
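+            // Insert until the capped collection wraps, i.e. until an insert no longer increases count().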
+ int i = 0;
+ for( int oldCount = -1;
+ count() != oldCount;
+ oldCount = count(), client().insert( ns(), BSON( "ts" << i++ ) ) );
+
+ for( int k = 0; k < 5; ++k ) {
+ client().insert( ns(), BSON( "ts" << i++ ) );
+ int min = client().query( ns(), Query().sort( BSON( "$natural" << 1 ) ) )->next()[ "ts" ].numberInt();
+ for( int j = -1; j < i; ++j ) {
+ auto_ptr< DBClientCursor > c = client().query( ns(), QUERY( "ts" << GTE << j ), 0, 0, 0, QueryOption_OplogReplay );
+ ASSERT( c->more() );
+ BSONObj next = c->next();
+ ASSERT( !next[ "ts" ].eoo() );
+ ASSERT_EQUALS( ( j > min ? j : min ), next[ "ts" ].numberInt() );
+ }
+ //cout << k << endl;
+ }
+ }
+
+ private:
+ int _old;
+ };
+
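+    /** Like FindingStart, but tailing begins while the capped collection is only partially filled. */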
+ class FindingStartPartiallyFull : public CollectionBase {
+ public:
+ FindingStartPartiallyFull() : CollectionBase( "findingstart" ), _old( __findingStartInitialTimeout ) {
+ __findingStartInitialTimeout = 0;
+ }
+ ~FindingStartPartiallyFull() {
+ __findingStartInitialTimeout = _old;
+ }
+
+ void run() {
+ unsigned startNumCursors = ClientCursor::numCursors();
+
+ BSONObj info;
+ ASSERT( client().runCommand( "unittests", BSON( "create" << "querytests.findingstart" << "capped" << true << "$nExtents" << 5 << "autoIndexId" << false ), info ) );
+
+ int i = 0;
+ for( ; i < 150; client().insert( ns(), BSON( "ts" << i++ ) ) );
+
+ for( int k = 0; k < 5; ++k ) {
+ client().insert( ns(), BSON( "ts" << i++ ) );
+ int min = client().query( ns(), Query().sort( BSON( "$natural" << 1 ) ) )->next()[ "ts" ].numberInt();
+ for( int j = -1; j < i; ++j ) {
+ auto_ptr< DBClientCursor > c = client().query( ns(), QUERY( "ts" << GTE << j ), 0, 0, 0, QueryOption_OplogReplay );
+ ASSERT( c->more() );
+ BSONObj next = c->next();
+ ASSERT( !next[ "ts" ].eoo() );
+ ASSERT_EQUALS( ( j > min ? j : min ), next[ "ts" ].numberInt() );
+ }
+ }
+
+ ASSERT_EQUALS( startNumCursors, ClientCursor::numCursors() );
+ }
+
+ private:
+ int _old;
+ };
+
+ /**
+ * Check OplogReplay mode where query timestamp is earlier than the earliest
+ * entry in the collection.
+ */
+ class FindingStartStale : public CollectionBase {
+ public:
+ FindingStartStale() : CollectionBase( "findingstart" ) {}
+
+ void run() {
+ unsigned startNumCursors = ClientCursor::numCursors();
+
+ BSONObj info;
+ ASSERT( client().runCommand( "unittests", BSON( "create" << "querytests.findingstart" << "capped" << true << "$nExtents" << 5 << "autoIndexId" << false ), info ) );
+
+ // Check OplogReplay mode with empty collection.
+ auto_ptr< DBClientCursor > c = client().query( ns(), QUERY( "ts" << GTE << 50 ), 0, 0, 0, QueryOption_OplogReplay );
+ ASSERT( !c->more() );
+
+ // Check with some docs in the collection.
+ for( int i = 100; i < 150; client().insert( ns(), BSON( "ts" << i++ ) ) );
+ c = client().query( ns(), QUERY( "ts" << GTE << 50 ), 0, 0, 0, QueryOption_OplogReplay );
+ ASSERT( c->more() );
+ ASSERT_EQUALS( 100, c->next()[ "ts" ].numberInt() );
+
+ // Check that no persistent cursors outlast our queries above.
+ ASSERT_EQUALS( startNumCursors, ClientCursor::numCursors() );
+ }
+ };
+
+ class WhatsMyUri : public CollectionBase {
+ public:
+ WhatsMyUri() : CollectionBase( "whatsmyuri" ) {}
+ void run() {
+ BSONObj result;
+ client().runCommand( "admin", BSON( "whatsmyuri" << 1 ), result );
+ ASSERT_EQUALS( unknownAddress.toString(), result[ "you" ].str() );
+ }
+ };
+
+ namespace parsedtests {
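+        /** ParsedQuery should locate the filter whether it is bare or wrapped under "query"/"$query"; a negative ntoreturn means a hard limit (wantMore() false). */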
+ class basic1 {
+ public:
+ void _test( const BSONObj& in ) {
+ ParsedQuery q( "a.b" , 5 , 6 , 9 , in , BSONObj() );
+ ASSERT_EQUALS( BSON( "x" << 5 ) , q.getFilter() );
+ }
+ void run() {
+ _test( BSON( "x" << 5 ) );
+ _test( BSON( "query" << BSON( "x" << 5 ) ) );
+ _test( BSON( "$query" << BSON( "x" << 5 ) ) );
+
+ {
+ ParsedQuery q( "a.b" , 5 , 6 , 9 , BSON( "x" << 5 ) , BSONObj() );
+ ASSERT_EQUALS( 6 , q.getNumToReturn() );
+ ASSERT( q.wantMore() );
+ }
+ {
+ ParsedQuery q( "a.b" , 5 , -6 , 9 , BSON( "x" << 5 ) , BSONObj() );
+ ASSERT_EQUALS( 6 , q.getNumToReturn() );
+ ASSERT( ! q.wantMore() );
+ }
+ }
+ };
+ };
+
+ namespace queryobjecttests {
+ class names1 {
+ public:
+ void run() {
+ ASSERT_EQUALS( BSON( "x" << 1 ) , QUERY( "query" << BSON( "x" << 1 ) ).getFilter() );
+ ASSERT_EQUALS( BSON( "x" << 1 ) , QUERY( "$query" << BSON( "x" << 1 ) ).getFilter() );
+ }
+
+ };
+ }
+
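+    /** Ordering::make() packs each key field's direction into a bitmask; descending( 1 << i ) reports whether the i-th field is descending. */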
+ class OrderingTest {
+ public:
+ void run() {
+ {
+ Ordering o = Ordering::make( BSON( "a" << 1 << "b" << -1 << "c" << 1 ) );
+ ASSERT_EQUALS( 1 , o.get(0) );
+ ASSERT_EQUALS( -1 , o.get(1) );
+ ASSERT_EQUALS( 1 , o.get(2) );
+
+ ASSERT( ! o.descending( 1 ) );
+ ASSERT( o.descending( 1 << 1 ) );
+ ASSERT( ! o.descending( 1 << 2 ) );
+ }
+
+ {
+ Ordering o = Ordering::make( BSON( "a.d" << 1 << "a" << 1 << "e" << -1 ) );
+ ASSERT_EQUALS( 1 , o.get(0) );
+ ASSERT_EQUALS( 1 , o.get(1) );
+ ASSERT_EQUALS( -1 , o.get(2) );
+
+ ASSERT( ! o.descending( 1 ) );
+ ASSERT( ! o.descending( 1 << 1 ) );
+ ASSERT( o.descending( 1 << 2 ) );
+ }
+
+ }
+ };
+
+ namespace proj { // Projection tests
+
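+        /** transform() projects a full document; checkKey() returns a KeyOnly projector only when the projection can be computed from the given index key, and hydrate() rebuilds a document from raw key values. */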
+ class T1 {
+ public:
+ void run() {
+
+ Projection m;
+ m.init( BSON( "a" << 1 ) );
+ ASSERT_EQUALS( BSON( "a" << 5 ) , m.transform( BSON( "x" << 1 << "a" << 5 ) ) );
+ }
+ };
+
+ class K1 {
+ public:
+ void run() {
+
+ Projection m;
+ m.init( BSON( "a" << 1 ) );
+
+ scoped_ptr<Projection::KeyOnly> x( m.checkKey( BSON( "a" << 1 ) ) );
+ ASSERT( ! x );
+
+ x.reset( m.checkKey( BSON( "a" << 1 << "_id" << 1 ) ) );
+ ASSERT( x );
+
+ ASSERT_EQUALS( BSON( "a" << 5 << "_id" << 17 ) ,
+ x->hydrate( BSON( "" << 5 << "" << 17 ) ) );
+
+ x.reset( m.checkKey( BSON( "a" << 1 << "x" << 1 << "_id" << 1 ) ) );
+ ASSERT( x );
+
+ ASSERT_EQUALS( BSON( "a" << 5 << "_id" << 17 ) ,
+ x->hydrate( BSON( "" << 5 << "" << 123 << "" << 17 ) ) );
+
+ }
+ };
+
+ class K2 {
+ public:
+ void run() {
+
+ Projection m;
+ m.init( BSON( "a" << 1 << "_id" << 0 ) );
+
+ scoped_ptr<Projection::KeyOnly> x( m.checkKey( BSON( "a" << 1 ) ) );
+ ASSERT( x );
+
+ ASSERT_EQUALS( BSON( "a" << 17 ) ,
+ x->hydrate( BSON( "" << 17 ) ) );
+
+ x.reset( m.checkKey( BSON( "x" << 1 << "a" << 1 << "_id" << 1 ) ) );
+ ASSERT( x );
+
+ ASSERT_EQUALS( BSON( "a" << 123 ) ,
+ x->hydrate( BSON( "" << 5 << "" << 123 << "" << 17 ) ) );
+
+ }
+ };
+
+
+ class K3 {
+ public:
+ void run() {
+
+ {
+ Projection m;
+ m.init( BSON( "a" << 1 << "_id" << 0 ) );
+
+ scoped_ptr<Projection::KeyOnly> x( m.checkKey( BSON( "a" << 1 << "x.a" << 1 ) ) );
+ ASSERT( x );
+ }
+
+
+ {
+                // TODO: this is temporary; see SERVER-2104.
+ Projection m;
+ m.init( BSON( "x.a" << 1 << "_id" << 0 ) );
+
+ scoped_ptr<Projection::KeyOnly> x( m.checkKey( BSON( "a" << 1 << "x.a" << 1 ) ) );
+ ASSERT( ! x );
+ }
+
+ }
+ };
+
+
+ }
+
+ class All : public Suite {
+ public:
+ All() : Suite( "query" ) {
+ }
+
+ void setupTests() {
+ add< FindingStart >();
+ add< FindOne >();
+ add< FindOneRequireIndex >();
+ add< FindOneEmptyObj >();
+ add< BoundedKey >();
+ add< GetMore >();
+ add< PositiveLimit >();
+ add< ReturnOneOfManyAndTail >();
+ add< TailNotAtEnd >();
+ add< EmptyTail >();
+ add< TailableDelete >();
+ add< TailableInsertDelete >();
+ add< TailCappedOnly >();
+ add< TailableQueryOnId >();
+ add< OplogReplayMode >();
+ add< ArrayId >();
+ add< UnderscoreNs >();
+ add< EmptyFieldSpec >();
+ add< MultiNe >();
+ add< EmbeddedNe >();
+ add< EmbeddedNumericTypes >();
+ add< AutoResetIndexCache >();
+ add< UniqueIndex >();
+ add< UniqueIndexPreexistingData >();
+ add< SubobjectInArray >();
+ add< Size >();
+ add< FullArray >();
+ add< InsideArray >();
+ add< IndexInsideArrayCorrect >();
+ add< SubobjArr >();
+ add< MinMax >();
+ add< MatchCodeCodeWScope >();
+ add< MatchDBRefType >();
+ add< DirectLocking >();
+ add< FastCountIn >();
+ add< EmbeddedArray >();
+ add< DifferentNumbers >();
+ add< SymbolStringSame >();
+ add< TailableCappedRaceCondition >();
+ add< HelperTest >();
+ add< HelperByIdTest >();
+ add< FindingStartPartiallyFull >();
+ add< FindingStartStale >();
+ add< WhatsMyUri >();
+
+ add< parsedtests::basic1 >();
+
+ add< queryobjecttests::names1 >();
+
+ add< OrderingTest >();
+
+ add< proj::T1 >();
+ add< proj::K1 >();
+ add< proj::K2 >();
+ add< proj::K3 >();
+ }
+ } myall;
+
+} // namespace QueryTests
+
diff --git a/src/mongo/dbtests/queryutiltests.cpp b/src/mongo/dbtests/queryutiltests.cpp
new file mode 100644
index 00000000000..e825b4f8a9b
--- /dev/null
+++ b/src/mongo/dbtests/queryutiltests.cpp
@@ -0,0 +1,989 @@
+// queryutiltests.cpp : query utility unit tests
+//
+
+/**
+ * Copyright (C) 2009 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../db/queryutil.h"
+#include "../db/querypattern.h"
+#include "../db/instance.h"
+#include "../db/pdfile.h"
+#include "dbtests.h"
+
+namespace QueryUtilTests {
+
+ namespace FieldRangeTests {
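+        /** Harness: each subclass supplies query() and the expected lower/upper bounds and inclusivity for the computed range on field "a". */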
+ class Base {
+ public:
+ virtual ~Base() {}
+ void run() {
+ const FieldRangeSet s( "ns", query(), true );
+ checkElt( lower(), s.range( "a" ).min() );
+ checkElt( upper(), s.range( "a" ).max() );
+ ASSERT_EQUALS( lowerInclusive(), s.range( "a" ).minInclusive() );
+ ASSERT_EQUALS( upperInclusive(), s.range( "a" ).maxInclusive() );
+ }
+ protected:
+ virtual BSONObj query() = 0;
+ virtual BSONElement lower() { return minKey.firstElement(); }
+ virtual bool lowerInclusive() { return true; }
+ virtual BSONElement upper() { return maxKey.firstElement(); }
+ virtual bool upperInclusive() { return true; }
+ static void checkElt( BSONElement expected, BSONElement actual ) {
+ if ( expected.woCompare( actual, false ) ) {
+ log() << "expected: " << expected << ", got: " << actual;
+ ASSERT( false );
+ }
+ }
+ };
+
+
+ class NumericBase : public Base {
+ public:
+ NumericBase() {
+ o = BSON( "min" << -numeric_limits<double>::max() << "max" << numeric_limits<double>::max() );
+ }
+
+ virtual BSONElement lower() { return o["min"]; }
+ virtual BSONElement upper() { return o["max"]; }
+ private:
+ BSONObj o;
+ };
+
+ class Empty : public Base {
+ virtual BSONObj query() { return BSONObj(); }
+ };
+
+ class Eq : public Base {
+ public:
+ Eq() : o_( BSON( "a" << 1 ) ) {}
+ virtual BSONObj query() { return o_; }
+ virtual BSONElement lower() { return o_.firstElement(); }
+ virtual BSONElement upper() { return o_.firstElement(); }
+ BSONObj o_;
+ };
+
+ class DupEq : public Eq {
+ public:
+ virtual BSONObj query() { return BSON( "a" << 1 << "b" << 2 << "a" << 1 ); }
+ };
+
+ class Lt : public NumericBase {
+ public:
+ Lt() : o_( BSON( "-" << 1 ) ) {}
+ virtual BSONObj query() { return BSON( "a" << LT << 1 ); }
+ virtual BSONElement upper() { return o_.firstElement(); }
+ virtual bool upperInclusive() { return false; }
+ BSONObj o_;
+ };
+
+ class Lte : public Lt {
+ virtual BSONObj query() { return BSON( "a" << LTE << 1 ); }
+ virtual bool upperInclusive() { return true; }
+ };
+
+ class Gt : public NumericBase {
+ public:
+ Gt() : o_( BSON( "-" << 1 ) ) {}
+ virtual BSONObj query() { return BSON( "a" << GT << 1 ); }
+ virtual BSONElement lower() { return o_.firstElement(); }
+ virtual bool lowerInclusive() { return false; }
+ BSONObj o_;
+ };
+
+ class Gte : public Gt {
+ virtual BSONObj query() { return BSON( "a" << GTE << 1 ); }
+ virtual bool lowerInclusive() { return true; }
+ };
+
+ class TwoLt : public Lt {
+ virtual BSONObj query() { return BSON( "a" << LT << 1 << LT << 5 ); }
+ };
+
+ class TwoGt : public Gt {
+ virtual BSONObj query() { return BSON( "a" << GT << 0 << GT << 1 ); }
+ };
+
+ class EqGte : public Eq {
+ virtual BSONObj query() { return BSON( "a" << 1 << "a" << GTE << 1 ); }
+ };
+
+ class EqGteInvalid {
+ public:
+ void run() {
+ FieldRangeSet frs( "ns", BSON( "a" << 1 << "a" << GTE << 2 ), true );
+ ASSERT( !frs.matchPossible() );
+ }
+ };
+
+ struct RegexBase : Base {
+            void run() { // need to look only at the first interval
+ FieldRangeSet s( "ns", query(), true );
+ checkElt( lower(), s.range( "a" ).intervals()[0]._lower._bound );
+ checkElt( upper(), s.range( "a" ).intervals()[0]._upper._bound );
+ ASSERT_EQUALS( lowerInclusive(), s.range( "a" ).intervals()[0]._lower._inclusive );
+ ASSERT_EQUALS( upperInclusive(), s.range( "a" ).intervals()[0]._upper._inclusive );
+ }
+ };
+
+ class Regex : public RegexBase {
+ public:
+ Regex() : o1_( BSON( "" << "abc" ) ), o2_( BSON( "" << "abd" ) ) {}
+ virtual BSONObj query() {
+ BSONObjBuilder b;
+ b.appendRegex( "a", "^abc" );
+ return b.obj();
+ }
+ virtual BSONElement lower() { return o1_.firstElement(); }
+ virtual BSONElement upper() { return o2_.firstElement(); }
+ virtual bool upperInclusive() { return false; }
+ BSONObj o1_, o2_;
+ };
+
+ class RegexObj : public RegexBase {
+ public:
+ RegexObj() : o1_( BSON( "" << "abc" ) ), o2_( BSON( "" << "abd" ) ) {}
+ virtual BSONObj query() { return BSON("a" << BSON("$regex" << "^abc")); }
+ virtual BSONElement lower() { return o1_.firstElement(); }
+ virtual BSONElement upper() { return o2_.firstElement(); }
+ virtual bool upperInclusive() { return false; }
+ BSONObj o1_, o2_;
+ };
+
+ class UnhelpfulRegex : public RegexBase {
+ public:
+ UnhelpfulRegex() {
+ BSONObjBuilder b;
+ b.appendMinForType("lower", String);
+ b.appendMaxForType("upper", String);
+ limits = b.obj();
+ }
+
+ virtual BSONObj query() {
+ BSONObjBuilder b;
+ b.appendRegex( "a", "abc" );
+ return b.obj();
+ }
+ virtual BSONElement lower() { return limits["lower"]; }
+ virtual BSONElement upper() { return limits["upper"]; }
+ virtual bool upperInclusive() { return false; }
+ BSONObj limits;
+ };
+
+ class In : public Base {
+ public:
+ In() : o1_( BSON( "-" << -3 ) ), o2_( BSON( "-" << 44 ) ) {}
+ virtual BSONObj query() {
+ vector< int > vals;
+ vals.push_back( 4 );
+ vals.push_back( 8 );
+ vals.push_back( 44 );
+ vals.push_back( -1 );
+ vals.push_back( -3 );
+ vals.push_back( 0 );
+ BSONObjBuilder bb;
+ bb.append( "$in", vals );
+ BSONObjBuilder b;
+ b.append( "a", bb.done() );
+ return b.obj();
+ }
+ virtual BSONElement lower() { return o1_.firstElement(); }
+ virtual BSONElement upper() { return o2_.firstElement(); }
+ BSONObj o1_, o2_;
+ };
+
+ class Equality {
+ public:
+ void run() {
+ FieldRangeSet s( "ns", BSON( "a" << 1 ), true );
+ ASSERT( s.range( "a" ).equality() );
+ FieldRangeSet s2( "ns", BSON( "a" << GTE << 1 << LTE << 1 ), true );
+ ASSERT( s2.range( "a" ).equality() );
+ FieldRangeSet s3( "ns", BSON( "a" << GT << 1 << LTE << 1 ), true );
+ ASSERT( !s3.range( "a" ).equality() );
+ FieldRangeSet s4( "ns", BSON( "a" << GTE << 1 << LT << 1 ), true );
+ ASSERT( !s4.range( "a" ).equality() );
+ FieldRangeSet s5( "ns", BSON( "a" << GTE << 1 << LTE << 1 << GT << 1 ), true );
+ ASSERT( !s5.range( "a" ).equality() );
+ FieldRangeSet s6( "ns", BSON( "a" << GTE << 1 << LTE << 1 << LT << 1 ), true );
+ ASSERT( !s6.range( "a" ).equality() );
+ }
+ };
+
+ class SimplifiedQuery {
+ public:
+ void run() {
+ FieldRangeSet frs( "ns", BSON( "a" << GT << 1 << GT << 5 << LT << 10 << "b" << 4 << "c" << LT << 4 << LT << 6 << "d" << GTE << 0 << GT << 0 << "e" << GTE << 0 << LTE << 10 ), true );
+ BSONObj simple = frs.simplifiedQuery();
+ cout << "simple: " << simple << endl;
+ ASSERT( !simple.getObjectField( "a" ).woCompare( fromjson( "{$gt:5,$lt:10}" ) ) );
+ ASSERT_EQUALS( 4, simple.getIntField( "b" ) );
+ ASSERT( !simple.getObjectField( "c" ).woCompare( BSON("$gte" << -numeric_limits<double>::max() << "$lt" << 4 ) ) );
+ ASSERT( !simple.getObjectField( "d" ).woCompare( BSON("$gt" << 0 << "$lte" << numeric_limits<double>::max() ) ) );
+ ASSERT( !simple.getObjectField( "e" ).woCompare( fromjson( "{$gte:0,$lte:10}" ) ) );
+ }
+ };
+
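+        /** QueryPattern reflects the shape of a query (which fields are constrained, equality vs. range) and its sort spec, not the particular values. */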
+ class QueryPatternTest {
+ public:
+ void run() {
+ ASSERT( p( BSON( "a" << 1 ) ) == p( BSON( "a" << 1 ) ) );
+ ASSERT( p( BSON( "a" << 1 ) ) == p( BSON( "a" << 5 ) ) );
+ ASSERT( p( BSON( "a" << 1 ) ) != p( BSON( "b" << 1 ) ) );
+ ASSERT( p( BSON( "a" << 1 ) ) != p( BSON( "a" << LTE << 1 ) ) );
+ ASSERT( p( BSON( "a" << 1 ) ) != p( BSON( "a" << 1 << "b" << 2 ) ) );
+ ASSERT( p( BSON( "a" << 1 << "b" << 3 ) ) != p( BSON( "a" << 1 ) ) );
+ ASSERT( p( BSON( "a" << LT << 1 ) ) == p( BSON( "a" << LTE << 5 ) ) );
+ ASSERT( p( BSON( "a" << LT << 1 << GTE << 0 ) ) == p( BSON( "a" << LTE << 5 << GTE << 0 ) ) );
+ ASSERT( p( BSON( "a" << 1 ) ) < p( BSON( "a" << 1 << "b" << 1 ) ) );
+ ASSERT( !( p( BSON( "a" << 1 << "b" << 1 ) ) < p( BSON( "a" << 1 ) ) ) );
+ ASSERT( p( BSON( "a" << 1 ), BSON( "b" << 1 ) ) == p( BSON( "a" << 4 ), BSON( "b" << "a" ) ) );
+ ASSERT( p( BSON( "a" << 1 ), BSON( "b" << 1 ) ) == p( BSON( "a" << 4 ), BSON( "b" << -1 ) ) );
+ ASSERT( p( BSON( "a" << 1 ), BSON( "b" << 1 ) ) != p( BSON( "a" << 4 ), BSON( "c" << 1 ) ) );
+ ASSERT( p( BSON( "a" << 1 ), BSON( "b" << 1 << "c" << -1 ) ) == p( BSON( "a" << 4 ), BSON( "b" << -1 << "c" << 1 ) ) );
+ ASSERT( p( BSON( "a" << 1 ), BSON( "b" << 1 << "c" << 1 ) ) != p( BSON( "a" << 4 ), BSON( "b" << 1 ) ) );
+ ASSERT( p( BSON( "a" << 1 ), BSON( "b" << 1 ) ) != p( BSON( "a" << 4 ), BSON( "b" << 1 << "c" << 1 ) ) );
+ }
+ private:
+ static QueryPattern p( const BSONObj &query, const BSONObj &sort = BSONObj() ) {
+ return FieldRangeSet( "", query, true ).pattern( sort );
+ }
+ };
+
+ class NoWhere {
+ public:
+ void run() {
+ ASSERT_EQUALS( 0, FieldRangeSet( "ns", BSON( "$where" << 1 ), true ).nNontrivialRanges() );
+ }
+ };
+
+ class Numeric {
+ public:
+ void run() {
+ FieldRangeSet f( "", BSON( "a" << 1 ), true );
+ ASSERT( f.range( "a" ).min().woCompare( BSON( "a" << 2.0 ).firstElement() ) < 0 );
+ ASSERT( f.range( "a" ).min().woCompare( BSON( "a" << 0.0 ).firstElement() ) > 0 );
+ }
+ };
+
+ class InLowerBound {
+ public:
+ void run() {
+ FieldRangeSet f( "", fromjson( "{a:{$gt:4,$in:[1,2,3,4,5,6]}}" ), true );
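+ // $gt:4 trims 1 through 4 from the $in list, leaving the points {5,6}.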
+ ASSERT( f.range( "a" ).min().woCompare( BSON( "a" << 5.0 ).firstElement(), false ) == 0 );
+ ASSERT( f.range( "a" ).max().woCompare( BSON( "a" << 6.0 ).firstElement(), false ) == 0 );
+ }
+ };
+
+ class InUpperBound {
+ public:
+ void run() {
+ FieldRangeSet f( "", fromjson( "{a:{$lt:4,$in:[1,2,3,4,5,6]}}" ), true );
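+ // $lt:4 trims 4 through 6 from the $in list, leaving the points {1,2,3}.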
+ ASSERT( f.range( "a" ).min().woCompare( BSON( "a" << 1.0 ).firstElement(), false ) == 0 );
+ ASSERT( f.range( "a" ).max().woCompare( BSON( "a" << 3.0 ).firstElement(), false ) == 0 );
+ }
+ };
+
+ class UnionBound {
+ public:
+ void run() {
+ FieldRangeSet frs( "", fromjson( "{a:{$gt:1,$lt:9},b:{$gt:9,$lt:12}}" ), true );
+ FieldRange ret = frs.range( "a" );
+ ret |= frs.range( "b" );
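+ // (1,9) and (9,12) are both open at 9, so the union keeps two
+ // disjoint intervals rather than merging into one.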
+ ASSERT_EQUALS( 2U, ret.intervals().size() );
+ }
+ };
+
+ class MultiBound {
+ public:
+ void run() {
+ FieldRangeSet frs1( "", fromjson( "{a:{$in:[1,3,5,7,9]}}" ), true );
+ FieldRangeSet frs2( "", fromjson( "{a:{$in:[2,3,5,8,9]}}" ), true );
+ FieldRange fr1 = frs1.range( "a" );
+ FieldRange fr2 = frs2.range( "a" );
+ fr1 &= fr2;
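+ // The intersection of {1,3,5,7,9} and {2,3,5,8,9} is the point set
+ // {3,5,9}, verified below as three single-point inclusive intervals.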
+ ASSERT( fr1.min().woCompare( BSON( "a" << 3.0 ).firstElement(), false ) == 0 );
+ ASSERT( fr1.max().woCompare( BSON( "a" << 9.0 ).firstElement(), false ) == 0 );
+ vector< FieldInterval > intervals = fr1.intervals();
+ vector< FieldInterval >::const_iterator j = intervals.begin();
+ double expected[] = { 3, 5, 9 };
+ for( int i = 0; i < 3; ++i, ++j ) {
+ ASSERT_EQUALS( expected[ i ], j->_lower._bound.number() );
+ ASSERT( j->_lower._inclusive );
+ ASSERT( j->_lower == j->_upper );
+ }
+ ASSERT( j == intervals.end() );
+ }
+ };
+
+ class DiffBase {
+ public:
+ virtual ~DiffBase() {}
+ void run() {
+ FieldRangeSet frs( "", fromjson( obj().toString() ), true );
+ FieldRange ret = frs.range( "a" );
+ ret -= frs.range( "b" );
+ check( ret );
+ }
+ protected:
+ void check( const FieldRange &fr ) {
+ vector< FieldInterval > fi = fr.intervals();
+ ASSERT_EQUALS( len(), fi.size() );
+ int i = 0;
+ for( vector< FieldInterval >::const_iterator j = fi.begin(); j != fi.end(); ++j ) {
+ ASSERT_EQUALS( nums()[ i ], j->_lower._bound.numberInt() );
+ ASSERT_EQUALS( incs()[ i ], j->_lower._inclusive );
+ ++i;
+ ASSERT_EQUALS( nums()[ i ], j->_upper._bound.numberInt() );
+ ASSERT_EQUALS( incs()[ i ], j->_upper._inclusive );
+ ++i;
+ }
+ }
+ virtual unsigned len() const = 0;
+ virtual const int *nums() const = 0;
+ virtual const bool *incs() const = 0;
+ virtual BSONObj obj() const = 0;
+ };
+
+ class TwoRangeBase : public DiffBase {
+ public:
+ TwoRangeBase( string obj, int low, int high, bool lowI, bool highI )
+ : _obj( obj ) {
+ _n[ 0 ] = low;
+ _n[ 1 ] = high;
+ _b[ 0 ] = lowI;
+ _b[ 1 ] = highI;
+ }
+ private:
+ virtual unsigned len() const { return 1; }
+ virtual const int *nums() const { return _n; }
+ virtual const bool *incs() const { return _b; }
+ virtual BSONObj obj() const { return fromjson( _obj ); }
+ string _obj;
+ int _n[ 2 ];
+ bool _b[ 2 ];
+ };
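+ // Each DiffN case below subtracts the "b" range from the "a" range via
+ // DiffBase::run(). For example, Diff1 computes (1,2) - (3,4) = (1,2):
+ // the ranges are disjoint, so "a" is returned unchanged.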
+
+ struct Diff1 : public TwoRangeBase {
+ Diff1() : TwoRangeBase( "{a:{$gt:1,$lt:2},b:{$gt:3,$lt:4}}", 1, 2, false, false ) {}
+ };
+
+ struct Diff2 : public TwoRangeBase {
+ Diff2() : TwoRangeBase( "{a:{$gt:1,$lt:2},b:{$gt:2,$lt:4}}", 1, 2, false, false ) {}
+ };
+
+ struct Diff3 : public TwoRangeBase {
+ Diff3() : TwoRangeBase( "{a:{$gt:1,$lte:2},b:{$gt:2,$lt:4}}", 1, 2, false, true ) {}
+ };
+
+ struct Diff4 : public TwoRangeBase {
+ Diff4() : TwoRangeBase( "{a:{$gt:1,$lt:2},b:{$gte:2,$lt:4}}", 1, 2, false, false) {}
+ };
+
+ struct Diff5 : public TwoRangeBase {
+ Diff5() : TwoRangeBase( "{a:{$gt:1,$lte:2},b:{$gte:2,$lt:4}}", 1, 2, false, false) {}
+ };
+
+ struct Diff6 : public TwoRangeBase {
+ Diff6() : TwoRangeBase( "{a:{$gt:1,$lte:3},b:{$gte:2,$lt:4}}", 1, 2, false, false) {}
+ };
+
+ struct Diff7 : public TwoRangeBase {
+ Diff7() : TwoRangeBase( "{a:{$gt:1,$lte:3},b:{$gt:2,$lt:4}}", 1, 2, false, true) {}
+ };
+
+ struct Diff8 : public TwoRangeBase {
+ Diff8() : TwoRangeBase( "{a:{$gt:1,$lt:4},b:{$gt:2,$lt:4}}", 1, 2, false, true) {}
+ };
+
+ struct Diff9 : public TwoRangeBase {
+ Diff9() : TwoRangeBase( "{a:{$gt:1,$lt:4},b:{$gt:2,$lte:4}}", 1, 2, false, true) {}
+ };
+
+ struct Diff10 : public TwoRangeBase {
+ Diff10() : TwoRangeBase( "{a:{$gt:1,$lte:4},b:{$gt:2,$lte:4}}", 1, 2, false, true) {}
+ };
+
+ class SplitRangeBase : public DiffBase {
+ public:
+ SplitRangeBase( string obj, int low1, bool low1I, int high1, bool high1I, int low2, bool low2I, int high2, bool high2I )
+ : _obj( obj ) {
+ _n[ 0 ] = low1;
+ _n[ 1 ] = high1;
+ _n[ 2 ] = low2;
+ _n[ 3 ] = high2;
+ _b[ 0 ] = low1I;
+ _b[ 1 ] = high1I;
+ _b[ 2 ] = low2I;
+ _b[ 3 ] = high2I;
+ }
+ private:
+ virtual unsigned len() const { return 2; }
+ virtual const int *nums() const { return _n; }
+ virtual const bool *incs() const { return _b; }
+ virtual BSONObj obj() const { return fromjson( _obj ); }
+ string _obj;
+ int _n[ 4 ];
+ bool _b[ 4 ];
+ };
+
+ struct Diff11 : public SplitRangeBase {
+ Diff11() : SplitRangeBase( "{a:{$gt:1,$lte:4},b:{$gt:2,$lt:4}}", 1, false, 2, true, 4, true, 4, true) {}
+ };
+
+ struct Diff12 : public SplitRangeBase {
+ Diff12() : SplitRangeBase( "{a:{$gt:1,$lt:5},b:{$gt:2,$lt:4}}", 1, false, 2, true, 4, true, 5, false) {}
+ };
+
+ struct Diff13 : public TwoRangeBase {
+ Diff13() : TwoRangeBase( "{a:{$gt:1,$lt:5},b:{$gt:1,$lt:4}}", 4, 5, true, false) {}
+ };
+
+ struct Diff14 : public SplitRangeBase {
+ Diff14() : SplitRangeBase( "{a:{$gte:1,$lt:5},b:{$gt:1,$lt:4}}", 1, true, 1, true, 4, true, 5, false) {}
+ };
+
+ struct Diff15 : public TwoRangeBase {
+ Diff15() : TwoRangeBase( "{a:{$gt:1,$lt:5},b:{$gte:1,$lt:4}}", 4, 5, true, false) {}
+ };
+
+ struct Diff16 : public TwoRangeBase {
+ Diff16() : TwoRangeBase( "{a:{$gte:1,$lt:5},b:{$gte:1,$lt:4}}", 4, 5, true, false) {}
+ };
+
+ struct Diff17 : public TwoRangeBase {
+ Diff17() : TwoRangeBase( "{a:{$gt:1,$lt:5},b:{$gt:0,$lt:4}}", 4, 5, true, false) {}
+ };
+
+ struct Diff18 : public TwoRangeBase {
+ Diff18() : TwoRangeBase( "{a:{$gt:1,$lt:5},b:{$gt:0,$lte:4}}", 4, 5, false, false) {}
+ };
+
+ struct Diff19 : public TwoRangeBase {
+ Diff19() : TwoRangeBase( "{a:{$gte:1,$lte:5},b:{$gte:0,$lte:1}}", 1, 5, false, true) {}
+ };
+
+ struct Diff20 : public TwoRangeBase {
+ Diff20() : TwoRangeBase( "{a:{$gt:1,$lte:5},b:{$gte:0,$lte:1}}", 1, 5, false, true) {}
+ };
+
+ struct Diff21 : public TwoRangeBase {
+ Diff21() : TwoRangeBase( "{a:{$gte:1,$lte:5},b:{$gte:0,$lt:1}}", 1, 5, true, true) {}
+ };
+
+ struct Diff22 : public TwoRangeBase {
+ Diff22() : TwoRangeBase( "{a:{$gt:1,$lte:5},b:{$gte:0,$lt:1}}", 1, 5, false, true) {}
+ };
+
+ struct Diff23 : public TwoRangeBase {
+ Diff23() : TwoRangeBase( "{a:{$gt:1,$lte:5},b:{$gte:0,$lt:0.5}}", 1, 5, false, true) {}
+ };
+
+ struct Diff24 : public TwoRangeBase {
+ Diff24() : TwoRangeBase( "{a:{$gt:1,$lte:5},b:0}", 1, 5, false, true) {}
+ };
+
+ struct Diff25 : public TwoRangeBase {
+ Diff25() : TwoRangeBase( "{a:{$gte:1,$lte:5},b:0}", 1, 5, true, true) {}
+ };
+
+ struct Diff26 : public TwoRangeBase {
+ Diff26() : TwoRangeBase( "{a:{$gt:1,$lte:5},b:1}", 1, 5, false, true) {}
+ };
+
+ struct Diff27 : public TwoRangeBase {
+ Diff27() : TwoRangeBase( "{a:{$gte:1,$lte:5},b:1}", 1, 5, false, true) {}
+ };
+
+ struct Diff28 : public SplitRangeBase {
+ Diff28() : SplitRangeBase( "{a:{$gte:1,$lte:5},b:3}", 1, true, 3, false, 3, false, 5, true) {}
+ };
+
+ struct Diff29 : public TwoRangeBase {
+ Diff29() : TwoRangeBase( "{a:{$gte:1,$lte:5},b:5}", 1, 5, true, false) {}
+ };
+
+ struct Diff30 : public TwoRangeBase {
+ Diff30() : TwoRangeBase( "{a:{$gte:1,$lt:5},b:5}", 1, 5, true, false) {}
+ };
+
+ struct Diff31 : public TwoRangeBase {
+ Diff31() : TwoRangeBase( "{a:{$gte:1,$lt:5},b:6}", 1, 5, true, false) {}
+ };
+
+ struct Diff32 : public TwoRangeBase {
+ Diff32() : TwoRangeBase( "{a:{$gte:1,$lte:5},b:6}", 1, 5, true, true) {}
+ };
+
+ class EmptyBase : public DiffBase {
+ public:
+ EmptyBase( string obj )
+ : _obj( obj ) {}
+ private:
+ virtual unsigned len() const { return 0; }
+ virtual const int *nums() const { return 0; }
+ virtual const bool *incs() const { return 0; }
+ virtual BSONObj obj() const { return fromjson( _obj ); }
+ string _obj;
+ };
+
+ struct Diff33 : public EmptyBase {
+ Diff33() : EmptyBase( "{a:{$gte:1,$lte:5},b:{$gt:0,$lt:6}}" ) {}
+ };
+
+ struct Diff34 : public EmptyBase {
+ Diff34() : EmptyBase( "{a:{$gte:1,$lte:5},b:{$gte:1,$lt:6}}" ) {}
+ };
+
+ struct Diff35 : public EmptyBase {
+ Diff35() : EmptyBase( "{a:{$gt:1,$lte:5},b:{$gte:1,$lt:6}}" ) {}
+ };
+
+ struct Diff36 : public EmptyBase {
+ Diff36() : EmptyBase( "{a:{$gt:1,$lte:5},b:{$gt:1,$lt:6}}" ) {}
+ };
+
+ struct Diff37 : public TwoRangeBase {
+ Diff37() : TwoRangeBase( "{a:{$gte:1,$lte:5},b:{$gt:1,$lt:6}}", 1, 1, true, true ) {}
+ };
+
+ struct Diff38 : public EmptyBase {
+ Diff38() : EmptyBase( "{a:{$gt:1,$lt:5},b:{$gt:0,$lt:5}}" ) {}
+ };
+
+ struct Diff39 : public EmptyBase {
+ Diff39() : EmptyBase( "{a:{$gt:1,$lt:5},b:{$gt:0,$lte:5}}" ) {}
+ };
+
+ struct Diff40 : public EmptyBase {
+ Diff40() : EmptyBase( "{a:{$gt:1,$lte:5},b:{$gt:0,$lte:5}}" ) {}
+ };
+
+ struct Diff41 : public TwoRangeBase {
+ Diff41() : TwoRangeBase( "{a:{$gte:1,$lte:5},b:{$gt:0,$lt:5}}", 5, 5, true, true ) {}
+ };
+
+ struct Diff42 : public EmptyBase {
+ Diff42() : EmptyBase( "{a:{$gt:1,$lt:5},b:{$gt:1,$lt:5}}" ) {}
+ };
+
+ struct Diff43 : public EmptyBase {
+ Diff43() : EmptyBase( "{a:{$gt:1,$lt:5},b:{$gt:1,$lte:5}}" ) {}
+ };
+
+ struct Diff44 : public EmptyBase {
+ Diff44() : EmptyBase( "{a:{$gt:1,$lt:5},b:{$gte:1,$lt:5}}" ) {}
+ };
+
+ struct Diff45 : public EmptyBase {
+ Diff45() : EmptyBase( "{a:{$gt:1,$lt:5},b:{$gte:1,$lte:5}}" ) {}
+ };
+
+ struct Diff46 : public TwoRangeBase {
+ Diff46() : TwoRangeBase( "{a:{$gt:1,$lte:5},b:{$gt:1,$lt:5}}", 5, 5, true, true ) {}
+ };
+
+ struct Diff47 : public EmptyBase {
+ Diff47() : EmptyBase( "{a:{$gt:1,$lte:5},b:{$gt:1,$lte:5}}" ) {}
+ };
+
+ struct Diff48 : public TwoRangeBase {
+ Diff48() : TwoRangeBase( "{a:{$gt:1,$lte:5},b:{$gte:1,$lt:5}}", 5, 5, true, true ) {}
+ };
+
+ struct Diff49 : public EmptyBase {
+ Diff49() : EmptyBase( "{a:{$gt:1,$lte:5},b:{$gte:1,$lte:5}}" ) {}
+ };
+
+ struct Diff50 : public TwoRangeBase {
+ Diff50() : TwoRangeBase( "{a:{$gte:1,$lt:5},b:{$gt:1,$lt:5}}", 1, 1, true, true ) {}
+ };
+
+ struct Diff51 : public TwoRangeBase {
+ Diff51() : TwoRangeBase( "{a:{$gte:1,$lt:5},b:{$gt:1,$lte:5}}", 1, 1, true, true ) {}
+ };
+
+ struct Diff52 : public EmptyBase {
+ Diff52() : EmptyBase( "{a:{$gte:1,$lt:5},b:{$gte:1,$lt:5}}" ) {}
+ };
+
+ struct Diff53 : public EmptyBase {
+ Diff53() : EmptyBase( "{a:{$gte:1,$lt:5},b:{$gte:1,$lte:5}}" ) {}
+ };
+
+ struct Diff54 : public SplitRangeBase {
+ Diff54() : SplitRangeBase( "{a:{$gte:1,$lte:5},b:{$gt:1,$lt:5}}", 1, true, 1, true, 5, true, 5, true ) {}
+ };
+
+ struct Diff55 : public TwoRangeBase {
+ Diff55() : TwoRangeBase( "{a:{$gte:1,$lte:5},b:{$gt:1,$lte:5}}", 1, 1, true, true ) {}
+ };
+
+ struct Diff56 : public TwoRangeBase {
+ Diff56() : TwoRangeBase( "{a:{$gte:1,$lte:5},b:{$gte:1,$lt:5}}", 5, 5, true, true ) {}
+ };
+
+ struct Diff57 : public EmptyBase {
+ Diff57() : EmptyBase( "{a:{$gte:1,$lte:5},b:{$gte:1,$lte:5}}" ) {}
+ };
+
+ struct Diff58 : public TwoRangeBase {
+ Diff58() : TwoRangeBase( "{a:1,b:{$gt:1,$lt:5}}", 1, 1, true, true ) {}
+ };
+
+ struct Diff59 : public EmptyBase {
+ Diff59() : EmptyBase( "{a:1,b:{$gte:1,$lt:5}}" ) {}
+ };
+
+ struct Diff60 : public EmptyBase {
+ Diff60() : EmptyBase( "{a:2,b:{$gte:1,$lt:5}}" ) {}
+ };
+
+ struct Diff61 : public EmptyBase {
+ Diff61() : EmptyBase( "{a:5,b:{$gte:1,$lte:5}}" ) {}
+ };
+
+ struct Diff62 : public TwoRangeBase {
+ Diff62() : TwoRangeBase( "{a:5,b:{$gt:1,$lt:5}}", 5, 5, true, true ) {}
+ };
+
+ struct Diff63 : public EmptyBase {
+ Diff63() : EmptyBase( "{a:5,b:5}" ) {}
+ };
+
+ struct Diff64 : public TwoRangeBase {
+ Diff64() : TwoRangeBase( "{a:{$gte:1,$lte:2},b:{$gt:0,$lte:1}}", 1, 2, false, true ) {}
+ };
+
+ class DiffMulti1 : public DiffBase {
+ public:
+ void run() {
+ FieldRangeSet frs( "", fromjson( "{a:{$gt:1,$lt:9},b:{$gt:0,$lt:2},c:3,d:{$gt:4,$lt:5},e:{$gt:7,$lt:10}}" ), true );
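+ // Subtracting the union (0,2) U {3} U (4,5) U (7,10) from a's range
+ // (1,9) should leave [2,3), (3,4] and [5,7], matching the bounds and
+ // inclusivities returned by nums() and incs().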
+ FieldRange ret = frs.range( "a" );
+ FieldRange other = frs.range( "b" );
+ other |= frs.range( "c" );
+ other |= frs.range( "d" );
+ other |= frs.range( "e" );
+ ret -= other;
+ check( ret );
+ }
+ protected:
+ virtual unsigned len() const { return 3; }
+ virtual const int *nums() const { static int n[] = { 2, 3, 3, 4, 5, 7 }; return n; }
+ virtual const bool *incs() const { static bool b[] = { true, false, false, true, true, true }; return b; }
+ virtual BSONObj obj() const { return BSONObj(); }
+ };
+
+ class DiffMulti2 : public DiffBase {
+ public:
+ void run() {
+ FieldRangeSet frs( "", fromjson( "{a:{$gt:1,$lt:9},b:{$gt:0,$lt:2},c:3,d:{$gt:4,$lt:5},e:{$gt:7,$lt:10}}" ), true );
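+ // Here the subtraction runs the other way: removing a's range (1,9)
+ // from the union (0,2) U {3} U (4,5) U (7,10) should leave (0,1] and
+ // [9,10), matching nums() and incs().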
+ FieldRange mask = frs.range( "a" );
+ FieldRange ret = frs.range( "b" );
+ ret |= frs.range( "c" );
+ ret |= frs.range( "d" );
+ ret |= frs.range( "e" );
+ ret -= mask;
+ check( ret );
+ }
+ protected:
+ virtual unsigned len() const { return 2; }
+ virtual const int *nums() const { static int n[] = { 0, 1, 9, 10 }; return n; }
+ virtual const bool *incs() const { static bool b[] = { false, true, true, false }; return b; }
+ virtual BSONObj obj() const { return BSONObj(); }
+ };
+
+ } // namespace FieldRangeTests
+
+ namespace FieldRangeSetTests {
+
+ class Intersect {
+ public:
+ void run() {
+ FieldRangeSet frs1( "", fromjson( "{b:{$in:[5,6]},c:7,d:{$in:[8,9]}}" ), true );
+ FieldRangeSet frs2( "", fromjson( "{a:1,b:5,c:{$in:[7,8]},d:{$in:[8,9]},e:10}" ), true );
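+ // Fieldwise: b {5,6} intersect {5} = {5}, c {7} intersect {7,8} = {7},
+ // and d {8,9} intersect {8,9} = {8,9}; the expected simplified query
+ // below expresses d's surviving point set as the closed range [8,9].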
+ frs1 &= frs2;
+ ASSERT_EQUALS( fromjson( "{a:1,b:5,c:7,d:{$gte:8,$lte:9},e:10}" ), frs1.simplifiedQuery( BSONObj() ) );
+ }
+ };
+
+ class MultiKeyIntersect {
+ public:
+ void run() {
+ FieldRangeSet frs1( "", BSONObj(), false );
+ FieldRangeSet frs2( "", BSON( "a" << GT << 4 ), false );
+ FieldRangeSet frs3( "", BSON( "a" << LT << 6 ), false );
+ // An intersection with a trivial range is allowed.
+ frs1 &= frs2;
+ ASSERT_EQUALS( frs2.simplifiedQuery( BSONObj() ), frs1.simplifiedQuery( BSONObj() ) );
+ // An intersection with a nontrivial range is not allowed, as it might prevent a valid
+ // multikey match.
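+ // For example, a document { a: [ 1, 10 ] } matches both { a: { $gt: 4 } }
+ // and { a: { $lt: 6 } } through different array elements, yet no single
+ // element lies in the intersected range ( 4, 6 ).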
+ frs1 &= frs3;
+ ASSERT_EQUALS( frs2.simplifiedQuery( BSONObj() ), frs1.simplifiedQuery( BSONObj() ) );
+ // Now intersect with a fully contained range.
+ FieldRangeSet frs4( "", BSON( "a" << GT << 6 ), false );
+ frs1 &= frs4;
+ ASSERT_EQUALS( frs4.simplifiedQuery( BSONObj() ), frs1.simplifiedQuery( BSONObj() ) );
+ }
+ };
+
+ class MultiKeyDiff {
+ public:
+ void run() {
+ FieldRangeSet frs1( "", BSON( "a" << GT << 4 ), false );
+ FieldRangeSet frs2( "", BSON( "a" << GT << 6 ), false );
+ // Range subtraction is no different for multikey ranges.
+ frs1 -= frs2;
+ ASSERT_EQUALS( BSON( "a" << GT << 4 << LTE << 6 ), frs1.simplifiedQuery( BSONObj() ) );
+ }
+ };
+
+ class MatchPossible {
+ public:
+ void run() {
+ FieldRangeSet frs1( "", BSON( "a" << GT << 4 ), true );
+ ASSERT( frs1.matchPossible() );
+ // Conflicting constraints make a match impossible for a single key set.
+ FieldRangeSet frs2( "", BSON( "a" << GT << 4 << LT << 2 ), true );
+ ASSERT( !frs2.matchPossible() );
+ // Conflicting constraints do not make a match impossible for a multikey set.
+ FieldRangeSet frs3( "", BSON( "a" << GT << 4 << LT << 2 ), false );
+ ASSERT( frs3.matchPossible() );
+ }
+ };
+
+ class MatchPossibleForIndex {
+ public:
+ void run() {
+ // Conflicting constraints do not make a match impossible for a multikey set.
+ FieldRangeSet frs1( "", BSON( "a" << GT << 4 << LT << 2 ), false );
+ ASSERT( frs1.matchPossibleForIndex( BSON( "a" << 1 ) ) );
+ // Conflicting constraints make a match impossible for a single key set.
+ FieldRangeSet frs2( "", BSON( "a" << GT << 4 << LT << 2 ), true );
+ ASSERT( !frs2.matchPossibleForIndex( BSON( "a" << 1 ) ) );
+ // If the index does not include the constrained field, the set is not single key invalid for that index.
+ ASSERT( frs2.matchPossibleForIndex( BSON( "b" << 1 ) ) );
+ // If the key pattern is not a real index ( $natural or empty ), the set is not single key invalid.
+ ASSERT( frs2.matchPossibleForIndex( BSON( "$natural" << 1 ) ) );
+ ASSERT( frs2.matchPossibleForIndex( BSONObj() ) );
+ }
+ };
+
+ } // namespace FieldRangeSetTests
+
+ namespace FieldRangeSetPairTests {
+
+ class NoNontrivialRanges {
+ public:
+ void run() {
+ FieldRangeSetPair frsp1( "", BSONObj() );
+ ASSERT( frsp1.noNontrivialRanges() );
+ FieldRangeSetPair frsp2( "", BSON( "a" << 1 ) );
+ ASSERT( !frsp2.noNontrivialRanges() );
+ FieldRangeSetPair frsp3( "", BSON( "a" << GT << 1 ) );
+ ASSERT( !frsp3.noNontrivialRanges() );
+ // A single key invalid constraint is still nontrivial.
+ FieldRangeSetPair frsp4( "", BSON( "a" << GT << 1 << LT << 0 ) );
+ ASSERT( !frsp4.noNontrivialRanges() );
+ // Still nontrivial if multikey invalid.
+ frsp4 -= frsp4.frsForIndex( 0, -1 );
+ ASSERT( !frsp4.noNontrivialRanges() );
+ }
+ };
+
+ class MatchPossible {
+ public:
+ void run() {
+ // Match possible for simple query.
+ FieldRangeSetPair frsp1( "", BSON( "a" << 1 ) );
+ ASSERT( frsp1.matchPossible() );
+ // Match possible for single key invalid query.
+ FieldRangeSetPair frsp2( "", BSON( "a" << GT << 1 << LT << 0 ) );
+ ASSERT( frsp2.matchPossible() );
+ // Match not possible for multi key invalid query.
+ frsp1 -= frsp1.frsForIndex( 0, -1 );
+ ASSERT( !frsp1.matchPossible() );
+ }
+ };
+
+ class IndexBase {
+ public:
+ IndexBase() : _ctx( ns() ) , indexNum_( 0 ) {
+ string err;
+ userCreateNS( ns(), BSONObj(), err, false );
+ }
+ ~IndexBase() {
+ if ( !nsd() )
+ return;
+ string s( ns() );
+ dropNS( s );
+ }
+ protected:
+ static const char *ns() { return "unittests.FieldRangeSetPairTests"; }
+ static NamespaceDetails *nsd() { return nsdetails( ns() ); }
+ IndexDetails *index( const BSONObj &key ) {
+ stringstream ss;
+ ss << indexNum_++;
+ string name = ss.str();
+ client_.resetIndexCache();
+ client_.ensureIndex( ns(), key, false, name.c_str() );
+ NamespaceDetails *d = nsd();
+ for( int i = 0; i < d->nIndexes; ++i ) {
+ if ( d->idx(i).keyPattern() == key /*indexName() == name*/ || ( d->idx(i).isIdIndex() && IndexDetails::isIdIndexPattern( key ) ) )
+ return &d->idx(i);
+ }
+ assert( false );
+ return 0;
+ }
+ int indexno( const BSONObj &key ) {
+ return nsd()->idxNo( *index(key) );
+ }
+ static DBDirectClient client_;
+ private:
+ dblock lk_;
+ Client::Context _ctx;
+ int indexNum_;
+ };
+ DBDirectClient IndexBase::client_;
+
+ class MatchPossibleForIndex : public IndexBase {
+ public:
+ void run() {
+ int a = indexno( BSON( "a" << 1 ) );
+ int b = indexno( BSON( "b" << 1 ) );
+ IndexBase::client_.insert( ns(), BSON( "a" << BSON_ARRAY( 1 << 2 ) << "b" << 1 ) );
+ // Valid ranges match possible for both indexes.
+ FieldRangeSetPair frsp1( ns(), BSON( "a" << GT << 1 << LT << 4 << "b" << GT << 1 << LT << 4 ) );
+ ASSERT( frsp1.matchPossibleForIndex( nsd(), a, BSON( "a" << 1 ) ) );
+ ASSERT( frsp1.matchPossibleForIndex( nsd(), b, BSON( "b" << 1 ) ) );
+ // Single key invalid range means match impossible for single key index.
+ FieldRangeSetPair frsp2( ns(), BSON( "a" << GT << 4 << LT << 1 << "b" << GT << 4 << LT << 1 ) );
+ ASSERT( frsp2.matchPossibleForIndex( nsd(), a, BSON( "a" << 1 ) ) );
+ ASSERT( !frsp2.matchPossibleForIndex( nsd(), b, BSON( "b" << 1 ) ) );
+ }
+ };
+
+ } // namespace FieldRangeSetPairTests
+
+ class All : public Suite {
+ public:
+ All() : Suite( "queryutil" ) {}
+
+ void setupTests() {
+ add< FieldRangeTests::Empty >();
+ add< FieldRangeTests::Eq >();
+ add< FieldRangeTests::DupEq >();
+ add< FieldRangeTests::Lt >();
+ add< FieldRangeTests::Lte >();
+ add< FieldRangeTests::Gt >();
+ add< FieldRangeTests::Gte >();
+ add< FieldRangeTests::TwoLt >();
+ add< FieldRangeTests::TwoGt >();
+ add< FieldRangeTests::EqGte >();
+ add< FieldRangeTests::EqGteInvalid >();
+ add< FieldRangeTests::Regex >();
+ add< FieldRangeTests::RegexObj >();
+ add< FieldRangeTests::UnhelpfulRegex >();
+ add< FieldRangeTests::In >();
+ add< FieldRangeTests::Equality >();
+ add< FieldRangeTests::SimplifiedQuery >();
+ add< FieldRangeTests::QueryPatternTest >();
+ add< FieldRangeTests::NoWhere >();
+ add< FieldRangeTests::Numeric >();
+ add< FieldRangeTests::InLowerBound >();
+ add< FieldRangeTests::InUpperBound >();
+ add< FieldRangeTests::UnionBound >();
+ add< FieldRangeTests::MultiBound >();
+ add< FieldRangeTests::Diff1 >();
+ add< FieldRangeTests::Diff2 >();
+ add< FieldRangeTests::Diff3 >();
+ add< FieldRangeTests::Diff4 >();
+ add< FieldRangeTests::Diff5 >();
+ add< FieldRangeTests::Diff6 >();
+ add< FieldRangeTests::Diff7 >();
+ add< FieldRangeTests::Diff8 >();
+ add< FieldRangeTests::Diff9 >();
+ add< FieldRangeTests::Diff10 >();
+ add< FieldRangeTests::Diff11 >();
+ add< FieldRangeTests::Diff12 >();
+ add< FieldRangeTests::Diff13 >();
+ add< FieldRangeTests::Diff14 >();
+ add< FieldRangeTests::Diff15 >();
+ add< FieldRangeTests::Diff16 >();
+ add< FieldRangeTests::Diff17 >();
+ add< FieldRangeTests::Diff18 >();
+ add< FieldRangeTests::Diff19 >();
+ add< FieldRangeTests::Diff20 >();
+ add< FieldRangeTests::Diff21 >();
+ add< FieldRangeTests::Diff22 >();
+ add< FieldRangeTests::Diff23 >();
+ add< FieldRangeTests::Diff24 >();
+ add< FieldRangeTests::Diff25 >();
+ add< FieldRangeTests::Diff26 >();
+ add< FieldRangeTests::Diff27 >();
+ add< FieldRangeTests::Diff28 >();
+ add< FieldRangeTests::Diff29 >();
+ add< FieldRangeTests::Diff30 >();
+ add< FieldRangeTests::Diff31 >();
+ add< FieldRangeTests::Diff32 >();
+ add< FieldRangeTests::Diff33 >();
+ add< FieldRangeTests::Diff34 >();
+ add< FieldRangeTests::Diff35 >();
+ add< FieldRangeTests::Diff36 >();
+ add< FieldRangeTests::Diff37 >();
+ add< FieldRangeTests::Diff38 >();
+ add< FieldRangeTests::Diff39 >();
+ add< FieldRangeTests::Diff40 >();
+ add< FieldRangeTests::Diff41 >();
+ add< FieldRangeTests::Diff42 >();
+ add< FieldRangeTests::Diff43 >();
+ add< FieldRangeTests::Diff44 >();
+ add< FieldRangeTests::Diff45 >();
+ add< FieldRangeTests::Diff46 >();
+ add< FieldRangeTests::Diff47 >();
+ add< FieldRangeTests::Diff48 >();
+ add< FieldRangeTests::Diff49 >();
+ add< FieldRangeTests::Diff50 >();
+ add< FieldRangeTests::Diff51 >();
+ add< FieldRangeTests::Diff52 >();
+ add< FieldRangeTests::Diff53 >();
+ add< FieldRangeTests::Diff54 >();
+ add< FieldRangeTests::Diff55 >();
+ add< FieldRangeTests::Diff56 >();
+ add< FieldRangeTests::Diff57 >();
+ add< FieldRangeTests::Diff58 >();
+ add< FieldRangeTests::Diff59 >();
+ add< FieldRangeTests::Diff60 >();
+ add< FieldRangeTests::Diff61 >();
+ add< FieldRangeTests::Diff62 >();
+ add< FieldRangeTests::Diff63 >();
+ add< FieldRangeTests::Diff64 >();
+ add< FieldRangeTests::DiffMulti1 >();
+ add< FieldRangeTests::DiffMulti2 >();
+ add< FieldRangeSetTests::Intersect >();
+ add< FieldRangeSetTests::MultiKeyIntersect >();
+ add< FieldRangeSetTests::MultiKeyDiff >();
+ add< FieldRangeSetTests::MatchPossible >();
+ add< FieldRangeSetTests::MatchPossibleForIndex >();
+ add< FieldRangeSetPairTests::NoNontrivialRanges >();
+ add< FieldRangeSetPairTests::MatchPossible >();
+ add< FieldRangeSetPairTests::MatchPossibleForIndex >();
+ }
+ } myall;
+
+} // namespace QueryUtilTests
+
diff --git a/src/mongo/dbtests/replsettests.cpp b/src/mongo/dbtests/replsettests.cpp
new file mode 100644
index 00000000000..c1fca3b1ad6
--- /dev/null
+++ b/src/mongo/dbtests/replsettests.cpp
@@ -0,0 +1,227 @@
+// replsettests.cpp : Unit tests for replica sets
+//
+
+/**
+ * Copyright (C) 2009 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../db/repl.h"
+
+#include "../db/db.h"
+#include "../db/instance.h"
+#include "../db/json.h"
+
+#include "dbtests.h"
+#include "../db/oplog.h"
+#include "../db/queryoptimizer.h"
+
+#include "../db/repl/rs.h"
+
+namespace mongo {
+ void createOplog();
+}
+
+namespace ReplSetTests {
+
+ class Base {
+ static DBDirectClient client_;
+ public:
+ Base() {
+ cmdLine._replSet = "foo";
+ cmdLine.oplogSize = 5;
+ createOplog();
+ }
+
+ static const char *ns() {
+ return "unittests.repltests";
+ }
+
+ DBDirectClient *client() const { return &client_; }
+
+ static void insert( const BSONObj &o, bool god = false ) {
+ dblock lk;
+ Client::Context ctx( ns() );
+ theDataFileMgr.insert( ns(), o.objdata(), o.objsize(), god );
+ }
+ BSONObj findOne( const BSONObj &query = BSONObj() ) const {
+ return client()->findOne( ns(), query );
+ }
+ };
+ DBDirectClient Base::client_;
+
+
+ class MockInitialSync : public replset::InitialSync {
+ int step;
+ public:
+ MockInitialSync() : replset::InitialSync(""), step(0), failOnStep(SUCCEED), retry(true) {}
+
+ enum FailOn {SUCCEED, FAIL_FIRST_APPLY, FAIL_BOTH_APPLY};
+
+ FailOn failOnStep;
+ bool retry;
+
+ // instead of actually applying operations, we return success or failure
+ virtual bool syncApply(const BSONObj& o) {
+ step++;
+
+ if ((failOnStep == FAIL_FIRST_APPLY && step == 1) ||
+ (failOnStep == FAIL_BOTH_APPLY)) {
+ return false;
+ }
+
+ return true;
+ }
+
+ virtual bool shouldRetry(const BSONObj& o) {
+ return retry;
+ }
+ };
+
+ class TestInitApplyOp : public Base {
+ public:
+ void run() {
+ writelock lk("");
+
+ OpTime o1 = OpTime::now();
+ OpTime o2 = OpTime::now();
+
+ BSONObjBuilder b;
+ b.appendTimestamp("ts", o2.asLL());
+ BSONObj obj = b.obj();
+
+ MockInitialSync mock;
+
+ // all three should succeed
+ mock.applyOp(obj, o1);
+
+ mock.failOnStep = MockInitialSync::FAIL_FIRST_APPLY;
+ mock.applyOp(obj, o1);
+
+ mock.retry = false;
+ mock.applyOp(obj, o1);
+
+ // force failure
+ MockInitialSync mock2;
+ mock2.failOnStep = MockInitialSync::FAIL_BOTH_APPLY;
+
+ ASSERT_THROWS(mock2.applyOp(obj, o2), UserException);
+ }
+ };
+
+ class SyncTest2 : public replset::InitialSync {
+ public:
+ bool insertOnRetry;
+ SyncTest2() : replset::InitialSync(""), insertOnRetry(false) {}
+ virtual ~SyncTest2() {}
+ virtual bool shouldRetry(const BSONObj& o) {
+ if (!insertOnRetry) {
+ return true;
+ }
+
+ Base::insert(BSON("_id" << 123));
+ return true;
+ }
+ };
+
+ class TestInitApplyOp2 : public Base {
+ public:
+ void run() {
+ writelock lk("");
+
+ OpTime o1 = OpTime::now();
+ OpTime o2 = OpTime::now();
+
+ BSONObjBuilder b;
+ b.appendTimestamp("ts", o2.asLL());
+ b.append("op", "u");
+ b.append("o", BSON("$set" << BSON("x" << 456)));
+ b.append("o2", BSON("_id" << 123));
+ b.append("ns", ns());
+ BSONObj obj = b.obj();
+
+ SyncTest2 sync;
+ ASSERT_THROWS(sync.applyOp(obj, o1), UserException);
+
+ sync.insertOnRetry = true;
+ // succeeds
+ sync.applyOp(obj, o1);
+
+ BSONObj fin = findOne();
+ assert(fin["x"].Number() == 456);
+ }
+ };
+
+ class CappedInitialSync : public Base {
+ string _ns;
+ dblock lk;
+ Client::Context _context;
+
+ string spec() const {
+ return "{\"capped\":true,\"size\":512}";
+ }
+
+ void create() {
+ dblock lk;
+ string err;
+ ASSERT(userCreateNS( _ns.c_str(), fromjson( spec() ), err, false ));
+ }
+
+ void drop() {
+ string s( _ns );
+ string errmsg;
+ BSONObjBuilder result;
+ dropCollection( s, errmsg, result );
+ }
+ public:
+ CappedInitialSync() : _ns("unittests.foo.bar"), _context(_ns) {
+ if (nsdetails(_ns.c_str()) != NULL) {
+ drop();
+ }
+ }
+ ~CappedInitialSync() {
+ if ( nsdetails(_ns.c_str()) == NULL )
+ return;
+ drop();
+ }
+
+ void run() {
+ create();
+
+ BSONObjBuilder b;
+ b.appendTimestamp("ts", OpTime::now().asLL());
+ b.append("op", "u");
+ b.append("o", BSON("$set" << BSON("x" << 456)));
+ b.append("o2", BSON("_id" << 123 << "x" << 123));
+ b.append("ns", _ns);
+
+ // In an annoying twist of the API, applyOperation_inlock() returns true on failure.
+ assert(applyOperation_inlock(b.obj(), true));
+ }
+ };
+
+
+ class All : public Suite {
+ public:
+ All() : Suite( "replset" ) {
+ }
+
+ void setupTests() {
+ add< TestInitApplyOp >();
+ add< TestInitApplyOp2 >();
+ add< CappedInitialSync >();
+ }
+ } myall;
+}
diff --git a/src/mongo/dbtests/repltests.cpp b/src/mongo/dbtests/repltests.cpp
new file mode 100644
index 00000000000..86288ad9426
--- /dev/null
+++ b/src/mongo/dbtests/repltests.cpp
@@ -0,0 +1,1228 @@
+// repltests.cpp : Unit tests for replication
+//
+
+/**
+ * Copyright (C) 2009 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../db/repl.h"
+
+#include "../db/db.h"
+#include "../db/instance.h"
+#include "../db/json.h"
+
+#include "dbtests.h"
+#include "../db/oplog.h"
+#include "../db/queryoptimizer.h"
+
+#include "../db/repl/rs.h"
+
+namespace mongo {
+ void createOplog();
+}
+
+namespace ReplTests {
+
+ BSONObj f( const char *s ) {
+ return fromjson( s );
+ }
+
+ class Base {
+ dblock lk;
+ Client::Context _context;
+ public:
+ Base() : _context( ns() ) {
+ replSettings.master = true;
+ createOplog();
+ ensureHaveIdIndex( ns() );
+ }
+ ~Base() {
+ try {
+ replSettings.master = false;
+ deleteAll( ns() );
+ deleteAll( cllNS() );
+ }
+ catch ( ... ) {
+ FAIL( "Exception while cleaning up test" );
+ }
+ }
+ protected:
+ static const char *ns() {
+ return "unittests.repltests";
+ }
+ static const char *cllNS() {
+ return "local.oplog.$main";
+ }
+ DBDirectClient *client() const { return &client_; }
+ BSONObj one( const BSONObj &query = BSONObj() ) const {
+ return client()->findOne( ns(), query );
+ }
+ void checkOne( const BSONObj &o ) const {
+ check( o, one( o ) );
+ }
+ void checkAll( const BSONObj &o ) const {
+ auto_ptr< DBClientCursor > c = client()->query( ns(), o );
+ assert( c->more() );
+ while( c->more() ) {
+ check( o, c->next() );
+ }
+ }
+ void check( const BSONObj &expected, const BSONObj &got ) const {
+ if ( expected.woCompare( got ) ) {
+ out() << "expected: " << expected.toString()
+ << ", got: " << got.toString() << endl;
+ }
+ ASSERT_EQUALS( expected , got );
+ }
+ BSONObj oneOp() const {
+ return client()->findOne( cllNS(), BSONObj() );
+ }
+ int count() const {
+ int count = 0;
+ dblock lk;
+ Client::Context ctx( ns() );
+ boost::shared_ptr<Cursor> c = theDataFileMgr.findAll( ns() );
+ for(; c->ok(); c->advance(), ++count ) {
+// cout << "obj: " << c->current().toString() << endl;
+ }
+ return count;
+ }
+ static int opCount() {
+ dblock lk;
+ Client::Context ctx( cllNS() );
+ int count = 0;
+ for( boost::shared_ptr<Cursor> c = theDataFileMgr.findAll( cllNS() ); c->ok(); c->advance() )
+ ++count;
+ return count;
+ }
+ static void applyAllOperations() {
+ dblock lk;
+ vector< BSONObj > ops;
+ {
+ Client::Context ctx( cllNS() );
+ for( boost::shared_ptr<Cursor> c = theDataFileMgr.findAll( cllNS() ); c->ok(); c->advance() )
+ ops.push_back( c->current() );
+ }
+ {
+ Client::Context ctx( ns() );
+ BSONObjBuilder b;
+ b.append("host", "localhost");
+ b.appendTimestamp("syncedTo", 0);
+ ReplSource a(b.obj());
+ for( vector< BSONObj >::iterator i = ops.begin(); i != ops.end(); ++i ) {
+ a.applyOperation( *i );
+ }
+ }
+ }
+ static void printAll( const char *ns ) {
+ dblock lk;
+ Client::Context ctx( ns );
+ boost::shared_ptr<Cursor> c = theDataFileMgr.findAll( ns );
+ out() << "all for " << ns << endl;
+ for(; c->ok(); c->advance() ) {
+ out() << c->current().toString() << endl;
+ }
+ }
+ // These deletes don't get logged.
+ static void deleteAll( const char *ns ) {
+ dblock lk;
+ Client::Context ctx( ns );
+ boost::shared_ptr<Cursor> c = theDataFileMgr.findAll( ns );
+ vector< DiskLoc > toDelete;
+ for(; c->ok(); c->advance() ) {
+ toDelete.push_back( c->currLoc() );
+ }
+ for( vector< DiskLoc >::iterator i = toDelete.begin(); i != toDelete.end(); ++i ) {
+ theDataFileMgr.deleteRecord( ns, i->rec(), *i, true );
+ }
+ }
+ static void insert( const BSONObj &o, bool god = false ) {
+ dblock lk;
+ Client::Context ctx( ns() );
+ theDataFileMgr.insert( ns(), o.objdata(), o.objsize(), god );
+ }
+ static BSONObj wid( const char *json ) {
+ BSONObjBuilder b;
+ OID id;
+ id.init();
+ b.appendOID( "_id", &id );
+ b.appendElements( fromjson( json ) );
+ return b.obj();
+ }
+ private:
+ static DBDirectClient client_;
+ };
+ DBDirectClient Base::client_;
+
+ class LogBasic : public Base {
+ public:
+ void run() {
+ ASSERT_EQUALS( 1, opCount() );
+ client()->insert( ns(), fromjson( "{\"a\":\"b\"}" ) );
+ ASSERT_EQUALS( 2, opCount() );
+ }
+ };
+
+ namespace Idempotence {
+
+ class Base : public ReplTests::Base {
+ public:
+ virtual ~Base() {}
+ void run() {
+ reset();
+ doIt();
+ int nOps = opCount();
+ check();
+ applyAllOperations();
+ check();
+ ASSERT_EQUALS( nOps, opCount() );
+
+ reset();
+ applyAllOperations();
+ check();
+ ASSERT_EQUALS( nOps, opCount() );
+ applyAllOperations();
+ check();
+ ASSERT_EQUALS( nOps, opCount() );
+ }
+ protected:
+ virtual void doIt() const = 0;
+ virtual void check() const = 0;
+ virtual void reset() const = 0;
+ };
+
+ class InsertTimestamp : public Base {
+ public:
+ void doIt() const {
+ BSONObjBuilder b;
+ b.append( "a", 1 );
+ b.appendTimestamp( "t" );
+ client()->insert( ns(), b.done() );
+ date_ = client()->findOne( ns(), QUERY( "a" << 1 ) ).getField( "t" ).date();
+ }
+ void check() const {
+ BSONObj o = client()->findOne( ns(), QUERY( "a" << 1 ) );
+ ASSERT( 0 != o.getField( "t" ).date() );
+ ASSERT_EQUALS( date_, o.getField( "t" ).date() );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ }
+ private:
+ mutable Date_t date_;
+ };
+
+ class InsertAutoId : public Base {
+ public:
+ InsertAutoId() : o_( fromjson( "{\"a\":\"b\"}" ) ) {}
+ void doIt() const {
+ client()->insert( ns(), o_ );
+ }
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ }
+ protected:
+ BSONObj o_;
+ };
+
+ class InsertWithId : public InsertAutoId {
+ public:
+ InsertWithId() {
+ o_ = fromjson( "{\"_id\":ObjectId(\"0f0f0f0f0f0f0f0f0f0f0f0f\"),\"a\":\"b\"}" );
+ }
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ checkOne( o_ );
+ }
+ };
+
+ class InsertTwo : public Base {
+ public:
+ InsertTwo() :
+ o_( fromjson( "{'_id':1,a:'b'}" ) ),
+ t_( fromjson( "{'_id':2,c:'d'}" ) ) {}
+ void doIt() const {
+ vector< BSONObj > v;
+ v.push_back( o_ );
+ v.push_back( t_ );
+ client()->insert( ns(), v );
+ }
+ void check() const {
+ ASSERT_EQUALS( 2, count() );
+ checkOne( o_ );
+ checkOne( t_ );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ }
+ private:
+ BSONObj o_;
+ BSONObj t_;
+ };
+
+ class InsertTwoIdentical : public Base {
+ public:
+ InsertTwoIdentical() : o_( fromjson( "{\"a\":\"b\"}" ) ) {}
+ void doIt() const {
+ client()->insert( ns(), o_ );
+ client()->insert( ns(), o_ );
+ }
+ void check() const {
+ ASSERT_EQUALS( 2, count() );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ }
+ private:
+ BSONObj o_;
+ };
+
+ class UpdateTimestamp : public Base {
+ public:
+ void doIt() const {
+ BSONObjBuilder b;
+ b.append( "_id", 1 );
+ b.appendTimestamp( "t" );
+ client()->update( ns(), BSON( "_id" << 1 ), b.done() );
+ date_ = client()->findOne( ns(), QUERY( "_id" << 1 ) ).getField( "t" ).date();
+ }
+ void check() const {
+ BSONObj o = client()->findOne( ns(), QUERY( "_id" << 1 ) );
+ ASSERT( 0 != o.getField( "t" ).date() );
+ ASSERT_EQUALS( date_, o.getField( "t" ).date() );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( BSON( "_id" << 1 ) );
+ }
+ private:
+ mutable Date_t date_;
+ };
+
+ class UpdateSameField : public Base {
+ public:
+ UpdateSameField() :
+ q_( fromjson( "{a:'b'}" ) ),
+ o1_( wid( "{a:'b'}" ) ),
+ o2_( wid( "{a:'b'}" ) ),
+ u_( fromjson( "{a:'c'}" ) ) {}
+ void doIt() const {
+ client()->update( ns(), q_, u_ );
+ }
+ void check() const {
+ ASSERT_EQUALS( 2, count() );
+ ASSERT( !client()->findOne( ns(), q_ ).isEmpty() );
+ ASSERT( !client()->findOne( ns(), u_ ).isEmpty() );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( o1_ );
+ insert( o2_ );
+ }
+ private:
+ BSONObj q_, o1_, o2_, u_;
+ };
+
+ class UpdateSameFieldWithId : public Base {
+ public:
+ UpdateSameFieldWithId() :
+ o_( fromjson( "{'_id':1,a:'b'}" ) ),
+ q_( fromjson( "{a:'b'}" ) ),
+ u_( fromjson( "{'_id':1,a:'c'}" ) ) {}
+ void doIt() const {
+ client()->update( ns(), q_, u_ );
+ }
+ void check() const {
+ ASSERT_EQUALS( 2, count() );
+ ASSERT( !client()->findOne( ns(), q_ ).isEmpty() );
+ ASSERT( !client()->findOne( ns(), u_ ).isEmpty() );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( o_ );
+ insert( fromjson( "{'_id':2,a:'b'}" ) );
+ }
+ private:
+ BSONObj o_, q_, u_;
+ };
+
+ class UpdateSameFieldExplicitId : public Base {
+ public:
+ UpdateSameFieldExplicitId() :
+ o_( fromjson( "{'_id':1,a:'b'}" ) ),
+ u_( fromjson( "{'_id':1,a:'c'}" ) ) {}
+ void doIt() const {
+ client()->update( ns(), o_, u_ );
+ }
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ checkOne( u_ );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( o_ );
+ }
+ protected:
+ BSONObj o_, u_;
+ };
+
+ class UpdateDifferentFieldExplicitId : public Base {
+ public:
+ UpdateDifferentFieldExplicitId() :
+ o_( fromjson( "{'_id':1,a:'b'}" ) ),
+ q_( fromjson( "{'_id':1}" ) ),
+ u_( fromjson( "{'_id':1,a:'c'}" ) ) {}
+ void doIt() const {
+ client()->update( ns(), q_, u_ );
+ }
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ checkOne( u_ );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( o_ );
+ }
+ protected:
+ BSONObj o_, q_, u_;
+ };
+
+ class UpsertUpdateNoMods : public UpdateDifferentFieldExplicitId {
+ void doIt() const {
+ client()->update( ns(), q_, u_, true );
+ }
+ };
+
+ class UpsertInsertNoMods : public InsertAutoId {
+ void doIt() const {
+ client()->update( ns(), fromjson( "{a:'c'}" ), o_, true );
+ }
+ };
+
+ class UpdateSet : public Base {
+ public:
+ UpdateSet() :
+ o_( fromjson( "{'_id':1,a:5}" ) ),
+ q_( fromjson( "{a:5}" ) ),
+ u_( fromjson( "{$set:{a:7}}" ) ),
+ ou_( fromjson( "{'_id':1,a:7}" ) ) {}
+ void doIt() const {
+ client()->update( ns(), q_, u_ );
+ }
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ checkOne( ou_ );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( o_ );
+ }
+ protected:
+ BSONObj o_, q_, u_, ou_;
+ };
+
+ class UpdateInc : public Base {
+ public:
+ UpdateInc() :
+ o_( fromjson( "{'_id':1,a:5}" ) ),
+ q_( fromjson( "{a:5}" ) ),
+ u_( fromjson( "{$inc:{a:3}}" ) ),
+ ou_( fromjson( "{'_id':1,a:8}" ) ) {}
+ void doIt() const {
+ client()->update( ns(), q_, u_ );
+ }
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ checkOne( ou_ );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( o_ );
+ }
+ protected:
+ BSONObj o_, q_, u_, ou_;
+ };
+
+ class UpdateInc2 : public Base {
+ public:
+ UpdateInc2() :
+ o_( fromjson( "{'_id':1,a:5}" ) ),
+ q_( fromjson( "{a:5}" ) ),
+ u_( fromjson( "{$inc:{a:3},$set:{x:5}}" ) ),
+ ou_( fromjson( "{'_id':1,a:8,x:5}" ) ) {}
+ void doIt() const {
+ client()->update( ns(), q_, u_ );
+ }
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ checkOne( ou_ );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( o_ );
+ }
+ protected:
+ BSONObj o_, q_, u_, ou_;
+ };
+
+ class IncEmbedded : public Base {
+ public:
+ IncEmbedded() :
+ o_( fromjson( "{'_id':1,a:{b:3},b:{b:1}}" ) ),
+ q_( fromjson( "{'_id':1}" ) ),
+ u_( fromjson( "{$inc:{'a.b':1,'b.b':1}}" ) ),
+ ou_( fromjson( "{'_id':1,a:{b:4},b:{b:2}}" ) )
+ {}
+ void doIt() const {
+ client()->update( ns(), q_, u_ );
+ }
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ checkOne( ou_ );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( o_ );
+ }
+ protected:
+ BSONObj o_, q_, u_, ou_;
+ };
+
+ class IncCreates : public Base {
+ public:
+ IncCreates() :
+ o_( fromjson( "{'_id':1}" ) ),
+ q_( fromjson( "{'_id':1}" ) ),
+ u_( fromjson( "{$inc:{'a':1}}" ) ),
+ ou_( fromjson( "{'_id':1,a:1}") )
+ {}
+ void doIt() const {
+ client()->update( ns(), q_, u_ );
+ }
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ checkOne( ou_ );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( o_ );
+ }
+ protected:
+ BSONObj o_, q_, u_, ou_;
+ };
+
+
+ class UpsertInsertIdMod : public Base {
+ public:
+ UpsertInsertIdMod() :
+ q_( fromjson( "{'_id':5,a:4}" ) ),
+ u_( fromjson( "{$inc:{a:3}}" ) ),
+ ou_( fromjson( "{'_id':5,a:7}" ) ) {}
+ void doIt() const {
+ client()->update( ns(), q_, u_, true );
+ }
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ checkOne( ou_ );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ }
+ protected:
+ BSONObj q_, u_, ou_;
+ };
+
+ class UpsertInsertSet : public Base {
+ public:
+ UpsertInsertSet() :
+ q_( fromjson( "{a:5}" ) ),
+ u_( fromjson( "{$set:{a:7}}" ) ),
+ ou_( fromjson( "{a:7}" ) ) {}
+ void doIt() const {
+ client()->update( ns(), q_, u_, true );
+ }
+ void check() const {
+ ASSERT_EQUALS( 2, count() );
+ ASSERT( !client()->findOne( ns(), ou_ ).isEmpty() );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( fromjson( "{'_id':7,a:7}" ) );
+ }
+ protected:
+ BSONObj o_, q_, u_, ou_;
+ };
+
+ class UpsertInsertInc : public Base {
+ public:
+ UpsertInsertInc() :
+ q_( fromjson( "{a:5}" ) ),
+ u_( fromjson( "{$inc:{a:3}}" ) ),
+ ou_( fromjson( "{a:8}" ) ) {}
+ void doIt() const {
+ client()->update( ns(), q_, u_, true );
+ }
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ ASSERT( !client()->findOne( ns(), ou_ ).isEmpty() );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ }
+ protected:
+ BSONObj o_, q_, u_, ou_;
+ };
+
+ class MultiInc : public Base {
+ public:
+
+ string s() const {
+ stringstream ss;
+ auto_ptr<DBClientCursor> cc = client()->query( ns() , Query().sort( BSON( "_id" << 1 ) ) );
+ bool first = true;
+ while ( cc->more() ) {
+ if ( first ) first = false;
+ else ss << ",";
+
+ BSONObj o = cc->next();
+ ss << o["x"].numberInt();
+ }
+ return ss.str();
+ }
+
+ void doIt() const {
+ client()->insert( ns(), BSON( "_id" << 1 << "x" << 1 ) );
+ client()->insert( ns(), BSON( "_id" << 2 << "x" << 5 ) );
+
+ ASSERT_EQUALS( "1,5" , s() );
+
+ client()->update( ns() , BSON( "_id" << 1 ) , BSON( "$inc" << BSON( "x" << 1 ) ) );
+ ASSERT_EQUALS( "2,5" , s() );
+
+ client()->update( ns() , BSONObj() , BSON( "$inc" << BSON( "x" << 1 ) ) );
+ ASSERT_EQUALS( "3,5" , s() );
+
+ client()->update( ns() , BSONObj() , BSON( "$inc" << BSON( "x" << 1 ) ) , false , true );
+ check();
+ }
+
+ void check() const {
+ ASSERT_EQUALS( "4,6" , s() );
+ }
+
+ void reset() const {
+ deleteAll( ns() );
+ }
+ };
+
+ class UpdateWithoutPreexistingId : public Base {
+ public:
+ UpdateWithoutPreexistingId() :
+ o_( fromjson( "{a:5}" ) ),
+ u_( fromjson( "{a:5}" ) ),
+ ot_( fromjson( "{b:4}" ) ) {}
+ void doIt() const {
+ client()->update( ns(), o_, u_ );
+ }
+ void check() const {
+ ASSERT_EQUALS( 2, count() );
+ checkOne( u_ );
+ checkOne( ot_ );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( ot_, true );
+ insert( o_, true );
+ }
+ protected:
+ BSONObj o_, u_, ot_;
+ };
+
+ class Remove : public Base {
+ public:
+ Remove() :
+ o1_( f( "{\"_id\":\"010101010101010101010101\",\"a\":\"b\"}" ) ),
+ o2_( f( "{\"_id\":\"010101010101010101010102\",\"a\":\"b\"}" ) ),
+ q_( f( "{\"a\":\"b\"}" ) ) {}
+ void doIt() const {
+ client()->remove( ns(), q_ );
+ }
+ void check() const {
+ ASSERT_EQUALS( 0, count() );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( o1_ );
+ insert( o2_ );
+ }
+ protected:
+ BSONObj o1_, o2_, q_;
+ };
+
+ class RemoveOne : public Remove {
+ void doIt() const {
+ client()->remove( ns(), q_, true );
+ }
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ }
+ };
+
+ class FailingUpdate : public Base {
+ public:
+ FailingUpdate() :
+ o_( fromjson( "{'_id':1,a:'b'}" ) ),
+ u_( fromjson( "{'_id':1,c:'d'}" ) ) {}
+ void doIt() const {
+ client()->update( ns(), o_, u_ );
+ client()->insert( ns(), o_ );
+ }
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ checkOne( o_ );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ }
+ protected:
+ BSONObj o_, u_;
+ };
+
+ class SetNumToStr : public Base {
+ public:
+ void doIt() const {
+ client()->update( ns(), BSON( "_id" << 0 ), BSON( "$set" << BSON( "a" << "bcd" ) ) );
+ }
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ checkOne( BSON( "_id" << 0 << "a" << "bcd" ) );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( BSON( "_id" << 0 << "a" << 4.0 ) );
+ }
+ };
+
+ class Push : public Base {
+ public:
+ void doIt() const {
+ client()->update( ns(), BSON( "_id" << 0 ), BSON( "$push" << BSON( "a" << 5.0 ) ) );
+ }
+ using ReplTests::Base::check;
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ check( fromjson( "{'_id':0,a:[4,5]}" ), one( fromjson( "{'_id':0}" ) ) );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( fromjson( "{'_id':0,a:[4]}" ) );
+ }
+ };
+
+ class PushUpsert : public Base {
+ public:
+ void doIt() const {
+ client()->update( ns(), BSON( "_id" << 0 ), BSON( "$push" << BSON( "a" << 5.0 ) ), true );
+ }
+ using ReplTests::Base::check;
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ check( fromjson( "{'_id':0,a:[4,5]}" ), one( fromjson( "{'_id':0}" ) ) );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( fromjson( "{'_id':0,a:[4]}" ) );
+ }
+ };
+
+ class MultiPush : public Base {
+ public:
+ void doIt() const {
+ client()->update( ns(), BSON( "_id" << 0 ), BSON( "$push" << BSON( "a" << 5.0 ) << "$push" << BSON( "b.c" << 6.0 ) ) );
+ }
+ using ReplTests::Base::check;
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ check( fromjson( "{'_id':0,a:[4,5],b:{c:[6]}}" ), one( fromjson( "{'_id':0}" ) ) );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( fromjson( "{'_id':0,a:[4]}" ) );
+ }
+ };
+
+ class EmptyPush : public Base {
+ public:
+ void doIt() const {
+ client()->update( ns(), BSON( "_id" << 0 ), BSON( "$push" << BSON( "a" << 5.0 ) ) );
+ }
+ using ReplTests::Base::check;
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ check( fromjson( "{'_id':0,a:[5]}" ), one( fromjson( "{'_id':0}" ) ) );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( fromjson( "{'_id':0}" ) );
+ }
+ };
+
+ class PushAll : public Base {
+ public:
+ void doIt() const {
+ client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$pushAll:{a:[5.0,6.0]}}" ) );
+ }
+ using ReplTests::Base::check;
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ check( fromjson( "{'_id':0,a:[4,5,6]}" ), one( fromjson( "{'_id':0}" ) ) );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( fromjson( "{'_id':0,a:[4]}" ) );
+ }
+ };
+
+ class PushAllUpsert : public Base {
+ public:
+ void doIt() const {
+ client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$pushAll:{a:[5.0,6.0]}}" ), true );
+ }
+ using ReplTests::Base::check;
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ check( fromjson( "{'_id':0,a:[4,5,6]}" ), one( fromjson( "{'_id':0}" ) ) );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( fromjson( "{'_id':0,a:[4]}" ) );
+ }
+ };
+
+ class EmptyPushAll : public Base {
+ public:
+ void doIt() const {
+ client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$pushAll:{a:[5.0,6.0]}}" ) );
+ }
+ using ReplTests::Base::check;
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ check( fromjson( "{'_id':0,a:[5,6]}" ), one( fromjson( "{'_id':0}" ) ) );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( fromjson( "{'_id':0}" ) );
+ }
+ };
+
+ class Pull : public Base {
+ public:
+ void doIt() const {
+ client()->update( ns(), BSON( "_id" << 0 ), BSON( "$pull" << BSON( "a" << 4.0 ) ) );
+ }
+ using ReplTests::Base::check;
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ check( fromjson( "{'_id':0,a:[5]}" ), one( fromjson( "{'_id':0}" ) ) );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( fromjson( "{'_id':0,a:[4,5]}" ) );
+ }
+ };
+
+ class PullNothing : public Base {
+ public:
+ void doIt() const {
+ client()->update( ns(), BSON( "_id" << 0 ), BSON( "$pull" << BSON( "a" << 6.0 ) ) );
+ }
+ using ReplTests::Base::check;
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ check( fromjson( "{'_id':0,a:[4,5]}" ), one( fromjson( "{'_id':0}" ) ) );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( fromjson( "{'_id':0,a:[4,5]}" ) );
+ }
+ };
+
+ class PullAll : public Base {
+ public:
+ void doIt() const {
+ client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$pullAll:{a:[4,5]}}" ) );
+ }
+ using ReplTests::Base::check;
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ check( fromjson( "{'_id':0,a:[6]}" ), one( fromjson( "{'_id':0}" ) ) );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( fromjson( "{'_id':0,a:[4,5,6]}" ) );
+ }
+ };
+
+ class Pop : public Base {
+ public:
+ void doIt() const {
+ client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$pop:{a:1}}" ) );
+ }
+ using ReplTests::Base::check;
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ check( fromjson( "{'_id':0,a:[4,5]}" ), one( fromjson( "{'_id':0}" ) ) );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( fromjson( "{'_id':0,a:[4,5,6]}" ) );
+ }
+ };
+
+ class PopReverse : public Base {
+ public:
+ void doIt() const {
+ client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$pop:{a:-1}}" ) );
+ }
+ using ReplTests::Base::check;
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ check( fromjson( "{'_id':0,a:[5,6]}" ), one( fromjson( "{'_id':0}" ) ) );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( fromjson( "{'_id':0,a:[4,5,6]}" ) );
+ }
+ };
+
+ class BitOp : public Base {
+ public:
+ void doIt() const {
+ client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$bit:{a:{and:2,or:8}}}" ) );
+ }
+ using ReplTests::Base::check;
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ check( BSON( "_id" << 0 << "a" << ( ( 3 & 2 ) | 8 ) ) , one( fromjson( "{'_id':0}" ) ) );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( fromjson( "{'_id':0,a:3}" ) );
+ }
+ };
+
+ class Rename : public Base {
+ public:
+ void doIt() const {
+ client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$rename:{a:'b'}}" ) );
+ client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$set:{a:50}}" ) );
+ }
+ using ReplTests::Base::check;
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ check( BSON( "_id" << 0 << "a" << 50 << "b" << 3 ) , one( fromjson( "{'_id':0}" ) ) );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( fromjson( "{'_id':0,a:3}" ) );
+ }
+ };
+
+ class RenameReplace : public Base {
+ public:
+ void doIt() const {
+ client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$rename:{a:'b'}}" ) );
+ client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$set:{a:50}}" ) );
+ }
+ using ReplTests::Base::check;
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ check( BSON( "_id" << 0 << "a" << 50 << "b" << 3 ) , one( fromjson( "{'_id':0}" ) ) );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( fromjson( "{'_id':0,a:3,b:100}" ) );
+ }
+ };
+
+ class RenameOverwrite : public Base {
+ public:
+ void doIt() const {
+ client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$rename:{a:'b'}}" ) );
+ }
+ using ReplTests::Base::check;
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ check( BSON( "_id" << 0 << "b" << 3 << "z" << 1 ) , one( fromjson( "{'_id':0}" ) ) );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( fromjson( "{'_id':0,z:1,a:3}" ) );
+ }
+ };
+
+ class NoRename : public Base {
+ public:
+ void doIt() const {
+ client()->update( ns(), BSON( "_id" << 0 ), fromjson( "{$rename:{c:'b'},$set:{z:1}}" ) );
+ }
+ using ReplTests::Base::check;
+ void check() const {
+ ASSERT_EQUALS( 1, count() );
+ check( BSON( "_id" << 0 << "a" << 3 << "z" << 1 ) , one( fromjson( "{'_id':0}" ) ) );
+ }
+ void reset() const {
+ deleteAll( ns() );
+ insert( fromjson( "{'_id':0,a:3}" ) );
+ }
+ };
+
+
+ } // namespace Idempotence
+
+ class DeleteOpIsIdBased : public Base {
+ public:
+ void run() {
+ insert( BSON( "_id" << 0 << "a" << 10 ) );
+ insert( BSON( "_id" << 1 << "a" << 11 ) );
+ insert( BSON( "_id" << 3 << "a" << 10 ) );
+ client()->remove( ns(), BSON( "a" << 10 ) );
+ ASSERT_EQUALS( 1U, client()->count( ns(), BSONObj() ) );
+ insert( BSON( "_id" << 0 << "a" << 11 ) );
+ insert( BSON( "_id" << 2 << "a" << 10 ) );
+ insert( BSON( "_id" << 3 << "a" << 10 ) );
+
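+ // Replaying the oplog re-applies the deletes by _id ( 0 and 3 ) rather
+ // than by the original predicate { a: 10 }, so the re-inserted _id:2
+ // document survives even though its a value is 10.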
+ applyAllOperations();
+ ASSERT_EQUALS( 2U, client()->count( ns(), BSONObj() ) );
+ ASSERT( !one( BSON( "_id" << 1 ) ).isEmpty() );
+ ASSERT( !one( BSON( "_id" << 2 ) ).isEmpty() );
+ }
+ };
+
+ class DatabaseIgnorerBasic {
+ public:
+ void run() {
+ DatabaseIgnorer d;
+ ASSERT( !d.ignoreAt( "a", OpTime( 4, 0 ) ) );
+ d.doIgnoreUntilAfter( "a", OpTime( 5, 0 ) );
+ ASSERT( d.ignoreAt( "a", OpTime( 4, 0 ) ) );
+ ASSERT( !d.ignoreAt( "b", OpTime( 4, 0 ) ) );
+ ASSERT( d.ignoreAt( "a", OpTime( 4, 10 ) ) );
+ ASSERT( d.ignoreAt( "a", OpTime( 5, 0 ) ) );
+ ASSERT( !d.ignoreAt( "a", OpTime( 5, 1 ) ) );
+ // Ignore state is expired.
+ ASSERT( !d.ignoreAt( "a", OpTime( 4, 0 ) ) );
+ }
+ };
+
+ class DatabaseIgnorerUpdate {
+ public:
+ void run() {
+ DatabaseIgnorer d;
+ d.doIgnoreUntilAfter( "a", OpTime( 5, 0 ) );
+ d.doIgnoreUntilAfter( "a", OpTime( 6, 0 ) );
+ ASSERT( d.ignoreAt( "a", OpTime( 5, 5 ) ) );
+ ASSERT( d.ignoreAt( "a", OpTime( 6, 0 ) ) );
+ ASSERT( !d.ignoreAt( "a", OpTime( 6, 1 ) ) );
+
+ d.doIgnoreUntilAfter( "a", OpTime( 5, 0 ) );
+ d.doIgnoreUntilAfter( "a", OpTime( 6, 0 ) );
+ d.doIgnoreUntilAfter( "a", OpTime( 6, 0 ) );
+ d.doIgnoreUntilAfter( "a", OpTime( 5, 0 ) );
+ ASSERT( d.ignoreAt( "a", OpTime( 5, 5 ) ) );
+ ASSERT( d.ignoreAt( "a", OpTime( 6, 0 ) ) );
+ ASSERT( !d.ignoreAt( "a", OpTime( 6, 1 ) ) );
+ }
+ };
+
+ /**
+ * Check against oldest document in the oplog before scanning backward
+ * from the newest document.
+ */
+ class FindingStartCursorStale : public Base {
+ public:
+ void run() {
+ for( int i = 0; i < 10; ++i ) {
+ client()->insert( ns(), BSON( "_id" << i ) );
+ }
+ dblock lk;
+ Client::Context ctx( cllNS() );
+ NamespaceDetails *nsd = nsdetails( cllNS() );
+ BSONObjBuilder b;
+ b.appendTimestamp( "$gte" );
+ BSONObj query = BSON( "ts" << b.obj() );
+ FieldRangeSetPair frsp( cllNS(), query );
+ BSONObj order = BSON( "$natural" << 1 );
+ QueryPlan qp( nsd, -1, frsp, &frsp, query, order );
+ FindingStartCursor fsc( qp );
+ ASSERT( fsc.done() );
+ ASSERT_EQUALS( 0, fsc.cursor()->current()[ "o" ].Obj()[ "_id" ].Int() );
+ }
+ };
+
+ /** Check unsuccessful yield recovery with FindingStartCursor */
+ class FindingStartCursorYield : public Base {
+ public:
+ void run() {
+ for( int i = 0; i < 10; ++i ) {
+ client()->insert( ns(), BSON( "_id" << i ) );
+ }
+ Date_t ts = client()->query( "local.oplog.$main", Query().sort( BSON( "$natural" << 1 ) ), 1, 4 )->next()[ "ts" ].date();
+ Client::Context ctx( cllNS() );
+ NamespaceDetails *nsd = nsdetails( cllNS() );
+ BSONObjBuilder b;
+ b.appendDate( "$gte", ts );
+ BSONObj query = BSON( "ts" << b.obj() );
+ FieldRangeSetPair frsp( cllNS(), query );
+ BSONObj order = BSON( "$natural" << 1 );
+ QueryPlan qp( nsd, -1, frsp, &frsp, query, order );
+ FindingStartCursor fsc( qp );
+ ASSERT( !fsc.done() );
+ fsc.next();
+ ASSERT( !fsc.done() );
+ ASSERT( fsc.prepareToYield() );
+ ClientCursor::invalidate( "local.oplog.$main" );
+ ASSERT_THROWS( fsc.recoverFromYield(), MsgAssertionException );
+ }
+ };
+
+ /** Check ReplSetConfig::MemberCfg equality */
+ class ReplSetMemberCfgEquality : public Base {
+ public:
+ void run() {
+ ReplSetConfig::MemberCfg m1, m2;
+ assert(m1 == m2);
+ m1.tags["x"] = "foo";
+ assert(m1 != m2);
+ m2.tags["y"] = "bar";
+ assert(m1 != m2);
+ m1.tags["y"] = "bar";
+ assert(m1 != m2);
+ m2.tags["x"] = "foo";
+ assert(m1 == m2);
+ m1.tags.clear();
+ assert(m1 != m2);
+ }
+ };
+
+ class SyncTest : public Sync {
+ public:
+ bool returnEmpty;
+ SyncTest() : Sync(""), returnEmpty(false) {}
+ virtual ~SyncTest() {}
+ virtual BSONObj getMissingDoc(const BSONObj& o) {
+ if (returnEmpty) {
+ return BSONObj(); // simulate the document being missing on the remote
+ }
+ return BSON("_id" << "on remote" << "foo" << "baz");
+ }
+ };
+
+ class ShouldRetry : public Base {
+ public:
+ void run() {
+ bool threw = false;
+ BSONObj o = BSON("ns" << ns() << "o" << BSON("foo" << "bar") << "o2" << BSON("_id" << "in oplog" << "foo" << "bar"));
+
+ // this should fail because we can't connect
+ try {
+ Sync badSource("localhost:123");
+ badSource.getMissingDoc(o);
+ }
+ catch (DBException&) {
+ threw = true;
+ }
+ assert(threw);
+
+ // now this should succeed
+ SyncTest t;
+ assert(t.shouldRetry(o));
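+ // shouldRetry fetched the missing doc via getMissingDoc and inserted it
+ // locally, so it is now visible to findOne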
+ assert(!client()->findOne(ns(), BSON("_id" << "on remote")).isEmpty());
+
+ // force it not to find an obj
+ t.returnEmpty = true;
+ assert(!t.shouldRetry(o));
+ }
+ };
+
+ class All : public Suite {
+ public:
+ All() : Suite( "repl" ) {
+ }
+
+ void setupTests() {
+ add< LogBasic >();
+ add< Idempotence::InsertTimestamp >();
+ add< Idempotence::InsertAutoId >();
+ add< Idempotence::InsertWithId >();
+ add< Idempotence::InsertTwo >();
+ add< Idempotence::InsertTwoIdentical >();
+ add< Idempotence::UpdateTimestamp >();
+ add< Idempotence::UpdateSameField >();
+ add< Idempotence::UpdateSameFieldWithId >();
+ add< Idempotence::UpdateSameFieldExplicitId >();
+ add< Idempotence::UpdateDifferentFieldExplicitId >();
+ add< Idempotence::UpsertUpdateNoMods >();
+ add< Idempotence::UpsertInsertNoMods >();
+ add< Idempotence::UpdateSet >();
+ add< Idempotence::UpdateInc >();
+ add< Idempotence::UpdateInc2 >();
+ add< Idempotence::IncEmbedded >(); // SERVER-716
+ add< Idempotence::IncCreates >(); // SERVER-717
+ add< Idempotence::UpsertInsertIdMod >();
+ add< Idempotence::UpsertInsertSet >();
+ add< Idempotence::UpsertInsertInc >();
+ add< Idempotence::MultiInc >();
+ // Don't worry about this until someone wants this functionality.
+// add< Idempotence::UpdateWithoutPreexistingId >();
+ add< Idempotence::Remove >();
+ add< Idempotence::RemoveOne >();
+ add< Idempotence::FailingUpdate >();
+ add< Idempotence::SetNumToStr >();
+ add< Idempotence::Push >();
+ add< Idempotence::PushUpsert >();
+ add< Idempotence::MultiPush >();
+ add< Idempotence::EmptyPush >();
+ add< Idempotence::PushAll >();
+ add< Idempotence::PushAllUpsert >();
+ add< Idempotence::EmptyPushAll >();
+ add< Idempotence::Pull >();
+ add< Idempotence::PullNothing >();
+ add< Idempotence::PullAll >();
+ add< Idempotence::Pop >();
+ add< Idempotence::PopReverse >();
+ add< Idempotence::BitOp >();
+ add< Idempotence::Rename >();
+ add< Idempotence::RenameReplace >();
+ add< Idempotence::RenameOverwrite >();
+ add< Idempotence::NoRename >();
+ add< DeleteOpIsIdBased >();
+ add< DatabaseIgnorerBasic >();
+ add< DatabaseIgnorerUpdate >();
+ add< FindingStartCursorStale >();
+ add< FindingStartCursorYield >();
+ add< ReplSetMemberCfgEquality >();
+ add< ShouldRetry >();
+ }
+ } myall;
+
+} // namespace ReplTests
+
diff --git a/src/mongo/dbtests/sharding.cpp b/src/mongo/dbtests/sharding.cpp
new file mode 100644
index 00000000000..19edd5537ab
--- /dev/null
+++ b/src/mongo/dbtests/sharding.cpp
@@ -0,0 +1,56 @@
+// sharding.cpp : some unit tests for sharding internals
+
+/**
+ * Copyright (C) 2009 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "dbtests.h"
+
+#include "../client/parallel.h"
+
+namespace ShardingTests {
+
+ namespace serverandquerytests {
+ class test1 {
+ public:
+ void run() {
+ ServerAndQuery a( "foo:1" , BSON( "a" << GT << 0 << LTE << 100 ) );
+ ServerAndQuery b( "foo:1" , BSON( "a" << GT << 200 << LTE << 1000 ) );
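+ // ordering presumably compares the query bounds for the same server,
+ // so distinct ranges remain distinct members of a set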
+
+ ASSERT( a < b );
+ ASSERT( ! ( b < a ) );
+
+ set<ServerAndQuery> s;
+ s.insert( a );
+ s.insert( b );
+
+ ASSERT_EQUALS( (unsigned int)2 , s.size() );
+ }
+ };
+ }
+
+ class All : public Suite {
+ public:
+ All() : Suite( "sharding" ) {
+ }
+
+ void setupTests() {
+ add< serverandquerytests::test1 >();
+ }
+ } myall;
+
+}
diff --git a/src/mongo/dbtests/socktests.cpp b/src/mongo/dbtests/socktests.cpp
new file mode 100644
index 00000000000..176db8c8e95
--- /dev/null
+++ b/src/mongo/dbtests/socktests.cpp
@@ -0,0 +1,48 @@
+// socktests.cpp : sock.{h,cpp} unit tests.
+//
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../util/net/sock.h"
+#include "dbtests.h"
+
+namespace SockTests {
+
+ class HostByName {
+ public:
+ void run() {
+ ASSERT_EQUALS( "127.0.0.1", hostbyname( "localhost" ) );
+ ASSERT_EQUALS( "127.0.0.1", hostbyname( "127.0.0.1" ) );
+ // ASSERT_EQUALS( "::1", hostbyname( "::1" ) ); // IPv6 disabled at runtime by default.
+
+ HostAndPort h("asdfasdfasdf_no_such_host");
+ // this fails; uncomment when fixed.
+ ASSERT( !h.isSelf() );
+ }
+ };
+
+ class All : public Suite {
+ public:
+ All() : Suite( "sock" ) {}
+ void setupTests() {
+ add< HostByName >();
+ }
+ } myall;
+
+} // namespace SockTests
+
diff --git a/src/mongo/dbtests/spin_lock_test.cpp b/src/mongo/dbtests/spin_lock_test.cpp
new file mode 100644
index 00000000000..ed1f1ae1ca5
--- /dev/null
+++ b/src/mongo/dbtests/spin_lock_test.cpp
@@ -0,0 +1,114 @@
+// spin_lock_test.cpp : spin_lock.{h,cpp} unit tests
+
+/**
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include <boost/thread/thread.hpp>
+#include "dbtests.h"
+#include "../util/concurrency/spin_lock.h"
+#include "../util/timer.h"
+
+namespace {
+
+ using mongo::SpinLock;
+
+ class LockTester {
+ public:
+ LockTester( SpinLock* spin, int* counter )
+ : _spin(spin), _counter(counter), _requests(0), _t(NULL) {}
+
+ ~LockTester() {
+ delete _t;
+ }
+
+ void start( int increments ) {
+ _t = new boost::thread( boost::bind(&LockTester::test, this, increments) );
+ }
+
+ void join() {
+ if ( _t ) _t->join();
+ }
+
+ int requests() const {
+ return _requests;
+ }
+
+ private:
+ SpinLock* _spin; // not owned here
+ int* _counter; // not owned here
+ int _requests;
+ boost::thread* _t;
+
+ void test( int increments ) {
+ while ( increments-- > 0 ) {
+ _spin->lock();
+ ++(*_counter);
+ ++_requests;
+ _spin->unlock();
+ }
+ }
+
+ LockTester( LockTester& );
+ LockTester& operator=( LockTester& );
+ };
+
+ class ConcurrentIncs {
+ public:
+ void run() {
+
+ SpinLock spin;
+ int counter = 0;
+
+ const int threads = 64;
+ const int incs = 50000;
+ LockTester* testers[threads];
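+ // 64 threads each perform 50k locked increments; if the spin lock
+ // provides mutual exclusion, no increment is lost and counter ends
+ // at exactly threads*incs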
+
+ Timer timer;
+
+ for ( int i = 0; i < threads; i++ ) {
+ testers[i] = new LockTester( &spin, &counter );
+ }
+ for ( int i = 0; i < threads; i++ ) {
+ testers[i]->start( incs );
+ }
+ for ( int i = 0; i < threads; i++ ) {
+ testers[i]->join();
+ ASSERT_EQUALS( testers[i]->requests(), incs );
+ delete testers[i];
+ }
+
+ int ms = timer.millis();
+ log() << "spinlock ConcurrentIncs time: " << ms << endl;
+
+ ASSERT_EQUALS( counter, threads*incs );
+#if defined(__linux__)
+ ASSERT( SpinLock::isfast() );
+#endif
+
+ }
+ };
+
+ class SpinLockSuite : public Suite {
+ public:
+ SpinLockSuite() : Suite( "spinlock" ) {}
+
+ void setupTests() {
+ add< ConcurrentIncs >();
+ }
+ } spinLockSuite;
+
+} // anonymous namespace
diff --git a/src/mongo/dbtests/test.sln b/src/mongo/dbtests/test.sln
new file mode 100755
index 00000000000..3a1b741c716
--- /dev/null
+++ b/src/mongo/dbtests/test.sln
@@ -0,0 +1,26 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "test", "test.vcxproj", "{215B2D68-0A70-4D10-8E75-B33010C62A91}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Win32 = Debug|Win32
+ Debug|x64 = Debug|x64
+ Release|Win32 = Release|Win32
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Win32.ActiveCfg = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Win32.Build.0 = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|x64.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|x64.Build.0 = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Win32.ActiveCfg = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Win32.Build.0 = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|x64.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
diff --git a/src/mongo/dbtests/test.vcxproj b/src/mongo/dbtests/test.vcxproj
new file mode 100644
index 00000000000..c5d1aad61e9
--- /dev/null
+++ b/src/mongo/dbtests/test.vcxproj
@@ -0,0 +1,776 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{215B2D68-0A70-4D10-8E75-B33010C62A91}</ProjectGuid>
+ <RootNamespace>dbtests</RootNamespace>
+ <Keyword>Win32Proj</Keyword>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseOfMfc>false</UseOfMfc>
+ <UseOfAtl>false</UseOfAtl>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseOfMfc>false</UseOfMfc>
+ <UseOfAtl>false</UseOfAtl>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup>
+ <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</IntDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">false</LinkIncremental>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</LinkIncremental>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</LinkIncremental>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..;$(IncludePath)</IncludePath>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;C:\boost;\boost;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>_UNICODE;UNICODE;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <MinimalRebuild>No</MinimalRebuild>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;Psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_32;\boost\lib\vs2010_32;\boost\lib</AdditionalLibraryDirectories>
+ <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+ <IgnoreSpecificDefaultLibraries>%(IgnoreSpecificDefaultLibraries)</IgnoreSpecificDefaultLibraries>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <TargetMachine>MachineX86</TargetMachine>
+ <Profile>true</Profile>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;C:\boost;\boost;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>_DURABLE;_UNICODE;UNICODE;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;4267;4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MinimalRebuild>No</MinimalRebuild>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;Psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_64;\boost\lib\vs2010_64;\boost\lib</AdditionalLibraryDirectories>
+ <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+ <IgnoreSpecificDefaultLibraries>%(IgnoreSpecificDefaultLibraries)</IgnoreSpecificDefaultLibraries>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;C:\boost;\boost;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>_UNICODE;UNICODE;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MinimalRebuild>No</MinimalRebuild>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_32;\boost\lib\vs2010_32;\boost\lib</AdditionalLibraryDirectories>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <TargetMachine>MachineX86</TargetMachine>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;C:\boost;\boost;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>_UNICODE;UNICODE;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;4267;4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MinimalRebuild>No</MinimalRebuild>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_64;\boost\lib\vs2010_64;\boost\lib</AdditionalLibraryDirectories>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClInclude Include="..\..\boostw\boost_1_34_1\boost\config\auto_link.hpp" />
+ <ClInclude Include="..\bson\bson-inl.h" />
+ <ClInclude Include="..\bson\bson.h" />
+ <ClInclude Include="..\bson\bsonelement.h" />
+ <ClInclude Include="..\bson\bsonmisc.h" />
+ <ClInclude Include="..\bson\bsonobj.h" />
+ <ClInclude Include="..\bson\bsonobjbuilder.h" />
+ <ClInclude Include="..\bson\bsonobjiterator.h" />
+ <ClInclude Include="..\bson\bsontypes.h" />
+ <ClInclude Include="..\bson\bson_db.h" />
+ <ClInclude Include="..\bson\inline_decls.h" />
+ <ClInclude Include="..\bson\oid.h" />
+ <ClInclude Include="..\bson\ordering.h" />
+ <ClInclude Include="..\bson\stringdata.h" />
+ <ClInclude Include="..\client\dbclientmockcursor.h" />
+ <ClInclude Include="..\db\collection.h" />
+ <ClInclude Include="..\db\databaseholder.h" />
+ <ClInclude Include="..\db\dur.h" />
+ <ClInclude Include="..\db\durop.h" />
+ <ClInclude Include="..\db\dur_journal.h" />
+ <ClInclude Include="..\db\jsobjmanipulator.h" />
+ <ClInclude Include="..\db\mongommf.h" />
+ <ClInclude Include="..\db\mongomutex.h" />
+ <ClInclude Include="..\db\ops\count.h" />
+ <ClInclude Include="..\db\ops\delete.h" />
+ <ClInclude Include="..\db\ops\query.h" />
+ <ClInclude Include="..\db\ops\update.h" />
+ <ClInclude Include="..\db\pagefault.h" />
+ <ClInclude Include="..\third_party\pcre-7.4\pcrecpp.h" />
+ <ClInclude Include="..\server.h" />
+ <ClInclude Include="..\targetver.h" />
+ <ClInclude Include="..\..\boostw\boost_1_34_1\boost\version.hpp" />
+ <ClInclude Include="..\third_party\pcre-7.4\config.h" />
+ <ClInclude Include="..\third_party\pcre-7.4\pcre.h" />
+ <ClInclude Include="..\client\connpool.h" />
+ <ClInclude Include="..\client\dbclient.h" />
+ <ClInclude Include="..\client\model.h" />
+ <ClInclude Include="..\db\btree.h" />
+ <ClInclude Include="..\db\clientcursor.h" />
+ <ClInclude Include="..\db\cmdline.h" />
+ <ClInclude Include="..\db\commands.h" />
+ <ClInclude Include="..\db\concurrency.h" />
+ <ClInclude Include="..\db\curop.h" />
+ <ClInclude Include="..\db\cursor.h" />
+ <ClInclude Include="..\db\database.h" />
+ <ClInclude Include="..\db\db.h" />
+ <ClInclude Include="..\db\dbhelpers.h" />
+ <ClInclude Include="..\db\dbinfo.h" />
+ <ClInclude Include="..\db\dbmessage.h" />
+ <ClInclude Include="..\db\diskloc.h" />
+ <ClInclude Include="..\db\extsort.h" />
+ <ClInclude Include="..\db\introspect.h" />
+ <ClInclude Include="..\db\jsobj.h" />
+ <ClInclude Include="..\db\json.h" />
+ <ClInclude Include="..\db\matcher.h" />
+ <ClInclude Include="..\grid\message.h" />
+ <ClInclude Include="..\db\minilex.h" />
+ <ClInclude Include="..\db\namespace.h" />
+ <ClInclude Include="..\pch.h" />
+ <ClInclude Include="..\db\pdfile.h" />
+ <ClInclude Include="..\grid\protocol.h" />
+ <ClInclude Include="..\db\query.h" />
+ <ClInclude Include="..\db\queryoptimizer.h" />
+ <ClInclude Include="..\db\repl.h" />
+ <ClInclude Include="..\db\replset.h" />
+ <ClInclude Include="..\db\resource.h" />
+ <ClInclude Include="..\db\scanandorder.h" />
+ <ClInclude Include="..\db\security.h" />
+ <ClInclude Include="..\third_party\snappy\config.h" />
+ <ClInclude Include="..\third_party\snappy\snappy-c.h" />
+ <ClInclude Include="..\third_party\snappy\snappy-internal.h" />
+ <ClInclude Include="..\third_party\snappy\snappy-sinksource.h" />
+ <ClInclude Include="..\third_party\snappy\snappy-stubs-internal.h" />
+ <ClInclude Include="..\third_party\snappy\snappy-stubs-public.h" />
+ <ClInclude Include="..\third_party\snappy\snappy.h" />
+ <ClInclude Include="..\util\builder.h" />
+ <ClInclude Include="..\util\checksum.h" />
+ <ClInclude Include="..\util\compress.h" />
+ <ClInclude Include="..\util\concurrency\list.h" />
+ <ClInclude Include="..\util\concurrency\task.h" />
+ <ClInclude Include="..\util\concurrency\value.h" />
+ <ClInclude Include="..\util\file.h" />
+ <ClInclude Include="..\util\goodies.h" />
+ <ClInclude Include="..\util\hashtab.h" />
+ <ClInclude Include="..\db\lasterror.h" />
+ <ClInclude Include="..\util\log.h" />
+ <ClInclude Include="..\util\logfile.h" />
+ <ClInclude Include="..\util\lruishmap.h" />
+ <ClInclude Include="..\util\md5.h" />
+ <ClInclude Include="..\util\md5.hpp" />
+ <ClInclude Include="..\util\miniwebserver.h" />
+ <ClInclude Include="..\util\mmap.h" />
+ <ClInclude Include="..\util\mongoutils\hash.h" />
+ <ClInclude Include="..\util\sock.h" />
+ <ClInclude Include="..\util\unittest.h" />
+ <ClInclude Include="framework.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="..\bson\oid.cpp" />
+ <ClCompile Include="..\client\dbclientcursor.cpp" />
+ <ClCompile Include="..\client\dbclient_rs.cpp" />
+ <ClCompile Include="..\client\distlock.cpp" />
+ <ClCompile Include="..\client\gridfs.cpp" />
+ <ClCompile Include="..\client\model.cpp" />
+ <ClCompile Include="..\client\parallel.cpp" />
+ <ClCompile Include="..\db\btreebuilder.cpp" />
+ <ClCompile Include="..\db\cap.cpp" />
+ <ClCompile Include="..\db\commands\isself.cpp" />
+ <ClCompile Include="..\db\compact.cpp" />
+ <ClCompile Include="..\db\curop.cpp" />
+ <ClCompile Include="..\db\dbcommands_admin.cpp" />
+ <ClCompile Include="..\db\dbcommands_generic.cpp" />
+ <ClCompile Include="..\db\dur.cpp" />
+ <ClCompile Include="..\db\durop.cpp" />
+ <ClCompile Include="..\db\dur_commitjob.cpp" />
+ <ClCompile Include="..\db\dur_journal.cpp" />
+ <ClCompile Include="..\db\dur_preplogbuffer.cpp" />
+ <ClCompile Include="..\db\dur_recover.cpp" />
+ <ClCompile Include="..\db\dur_writetodatafiles.cpp" />
+ <ClCompile Include="..\db\d_concurrency.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Use</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Use</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Use</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Use</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\db\d_globals.cpp" />
+ <ClCompile Include="..\db\geo\2d.cpp" />
+ <ClCompile Include="..\db\geo\haystack.cpp" />
+ <ClCompile Include="..\db\key.cpp" />
+ <ClCompile Include="..\db\mongommf.cpp" />
+ <ClCompile Include="..\db\ops\count.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\db\ops\delete.cpp" />
+ <ClCompile Include="..\db\ops\query.cpp" />
+ <ClCompile Include="..\db\ops\update.cpp" />
+ <ClCompile Include="..\db\pagefault.cpp" />
+ <ClCompile Include="..\db\projection.cpp" />
+ <ClCompile Include="..\db\queryoptimizercursor.cpp" />
+ <ClCompile Include="..\db\querypattern.cpp">
+ <PrecompiledHeader />
+ </ClCompile>
+ <ClCompile Include="..\db\record.cpp" />
+ <ClCompile Include="..\db\repl\consensus.cpp" />
+ <ClCompile Include="..\db\repl\heartbeat.cpp" />
+ <ClCompile Include="..\db\repl\manager.cpp" />
+ <ClCompile Include="..\db\repl\rs.cpp" />
+ <ClCompile Include="..\db\repl\rs_initialsync.cpp" />
+ <ClCompile Include="..\db\repl\rs_initiate.cpp" />
+ <ClCompile Include="..\db\repl\rs_rollback.cpp" />
+ <ClCompile Include="..\db\repl\rs_sync.cpp" />
+ <ClCompile Include="..\db\restapi.cpp" />
+ <ClCompile Include="..\db\scanandorder.cpp" />
+ <ClCompile Include="..\db\security_common.cpp" />
+ <ClCompile Include="..\s\default_version.cpp" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcrecpp.cc">
+ <PrecompiledHeader />
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_chartables.c">
+ <PrecompiledHeader />
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_compile.c">
+ <PrecompiledHeader />
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_config.c">
+ <PrecompiledHeader />
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_dfa_exec.c">
+ <PrecompiledHeader />
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_exec.c">
+ <PrecompiledHeader />
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_fullinfo.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_get.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_globals.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_info.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_maketables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_newline.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ord2utf8.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_refcount.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_scanner.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_stringpiece.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_study.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_tables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_try_flipped.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ucp_searchfuncs.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_valid_utf8.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_version.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_xclass.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcreposix.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\client\connpool.cpp" />
+ <ClCompile Include="..\client\dbclient.cpp" />
+ <ClCompile Include="..\client\syncclusterconnection.cpp" />
+ <ClCompile Include="..\db\btree.cpp" />
+ <ClCompile Include="..\db\btreecursor.cpp" />
+ <ClCompile Include="..\pch.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\db\client.cpp" />
+ <ClCompile Include="..\db\clientcursor.cpp" />
+ <ClCompile Include="..\db\cloner.cpp" />
+ <ClCompile Include="..\db\commands\cloud.cpp">
+ <PrecompiledHeader />
+ </ClCompile>
+ <ClCompile Include="..\db\commands.cpp" />
+ <ClCompile Include="..\db\common.cpp">
+ <PrecompiledHeader>NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\db\cursor.cpp" />
+ <ClCompile Include="..\db\database.cpp" />
+ <ClCompile Include="..\db\dbcommands.cpp" />
+ <ClCompile Include="..\db\dbeval.cpp" />
+ <ClCompile Include="..\db\dbhelpers.cpp" />
+ <ClCompile Include="..\db\dbwebserver.cpp" />
+ <ClCompile Include="..\db\extsort.cpp" />
+ <ClCompile Include="..\db\index.cpp" />
+ <ClCompile Include="..\db\indexkey.cpp" />
+ <ClCompile Include="..\db\instance.cpp" />
+ <ClCompile Include="..\db\introspect.cpp" />
+ <ClCompile Include="..\db\jsobj.cpp" />
+ <ClCompile Include="..\db\json.cpp" />
+ <ClCompile Include="..\db\lasterror.cpp" />
+ <ClCompile Include="..\db\matcher.cpp" />
+ <ClCompile Include="..\scripting\bench.cpp" />
+ <ClCompile Include="..\s\chunk.cpp" />
+ <ClCompile Include="..\s\config.cpp" />
+ <ClCompile Include="..\s\d_chunk_manager.cpp" />
+ <ClCompile Include="..\s\d_migrate.cpp" />
+ <ClCompile Include="..\s\d_split.cpp" />
+ <ClCompile Include="..\s\d_state.cpp" />
+ <ClCompile Include="..\s\d_writeback.cpp" />
+ <ClCompile Include="..\s\grid.cpp" />
+ <ClCompile Include="..\s\shard.cpp" />
+ <ClCompile Include="..\s\shardconnection.cpp" />
+ <ClCompile Include="..\s\shardkey.cpp" />
+ <ClCompile Include="..\third_party\snappy\snappy-sinksource.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\alignedbuilder.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\compress.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\concurrency\spin_lock.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\concurrency\synchronization.cpp" />
+ <ClCompile Include="..\util\concurrency\task.cpp" />
+ <ClCompile Include="..\util\concurrency\thread_pool.cpp" />
+ <ClCompile Include="..\util\concurrency\vars.cpp" />
+ <ClCompile Include="..\util\file_allocator.cpp" />
+ <ClCompile Include="..\util\log.cpp" />
+ <ClCompile Include="..\util\logfile.cpp" />
+ <ClCompile Include="..\util\mmap_win.cpp" />
+ <ClCompile Include="..\db\namespace.cpp" />
+ <ClCompile Include="..\db\nonce.cpp" />
+ <ClCompile Include="..\db\pdfile.cpp" />
+ <ClCompile Include="..\db\queryoptimizer.cpp" />
+ <ClCompile Include="..\util\processinfo.cpp" />
+ <ClCompile Include="..\db\repl.cpp" />
+ <ClCompile Include="..\db\security.cpp" />
+ <ClCompile Include="..\db\security_commands.cpp" />
+ <ClCompile Include="..\db\tests.cpp" />
+ <ClCompile Include="..\db\cmdline.cpp" />
+ <ClCompile Include="..\db\dbmessage.cpp" />
+ <ClCompile Include="..\db\matcher_covered.cpp" />
+ <ClCompile Include="..\db\oplog.cpp" />
+ <ClCompile Include="..\db\queryutil.cpp" />
+ <ClCompile Include="..\db\repl_block.cpp" />
+ <ClCompile Include="..\util\assert_util.cpp" />
+ <ClCompile Include="..\util\background.cpp" />
+ <ClCompile Include="..\util\base64.cpp" />
+ <ClCompile Include="..\util\md5.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeaderFile Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeaderFile>
+ <PrecompiledHeaderFile Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeaderFile>
+ </ClCompile>
+ <ClCompile Include="..\util\md5main.cpp" />
+ <ClCompile Include="..\util\net\message.cpp" />
+ <ClCompile Include="..\util\net\listen.cpp" />
+ <ClCompile Include="..\util\net\message_server_port.cpp" />
+ <ClCompile Include="..\util\net\message_port.cpp" />
+ <ClCompile Include="..\util\net\miniwebserver.cpp" />
+ <ClCompile Include="..\util\mmap.cpp" />
+ <ClCompile Include="..\util\processinfo_win32.cpp" />
+ <ClCompile Include="..\util\ramlog.cpp" />
+ <ClCompile Include="..\util\net\sock.cpp" />
+ <ClCompile Include="..\util\stringutils.cpp" />
+ <ClCompile Include="..\util\text.cpp" />
+ <ClCompile Include="..\util\util.cpp" />
+ <ClCompile Include="..\s\d_logic.cpp" />
+ <ClCompile Include="..\scripting\engine.cpp" />
+ <ClCompile Include="..\scripting\engine_spidermonkey.cpp" />
+ <ClCompile Include="..\shell\mongo.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\scripting\utils.cpp" />
+ <ClCompile Include="..\util\version.cpp" />
+ <ClCompile Include="basictests.cpp" />
+ <ClCompile Include="btreetests.cpp" />
+ <ClCompile Include="clienttests.cpp" />
+ <ClCompile Include="cursortests.cpp" />
+ <ClCompile Include="dbtests.cpp" />
+ <ClCompile Include="directclienttests.cpp" />
+ <ClCompile Include="d_chunk_manager_tests.cpp" />
+ <ClCompile Include="framework.cpp" />
+ <ClCompile Include="jsobjtests.cpp" />
+ <ClCompile Include="jsontests.cpp" />
+ <ClCompile Include="jstests.cpp" />
+ <ClCompile Include="matchertests.cpp" />
+ <ClCompile Include="mmaptests.cpp" />
+ <ClCompile Include="namespacetests.cpp" />
+ <ClCompile Include="pdfiletests.cpp" />
+ <ClCompile Include="perftests.cpp" />
+ <ClCompile Include="queryoptimizercursortests.cpp" />
+ <ClCompile Include="queryoptimizertests.cpp" />
+ <ClCompile Include="querytests.cpp" />
+ <ClCompile Include="repltests.cpp" />
+ <ClCompile Include="socktests.cpp" />
+ <ClCompile Include="spin_lock_test.cpp" />
+ <ClCompile Include="threadedtests.cpp">
+ <DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">4180;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">4180;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ </ClCompile>
+ <ClCompile Include="updatetests.cpp" />
+ <ClCompile Include="..\db\stats\counters.cpp" />
+ <ClCompile Include="..\db\stats\snapshots.cpp" />
+ <ClCompile Include="..\db\stats\top.cpp" />
+ <ClCompile Include="..\db\repl\health.cpp" />
+ <ClCompile Include="..\db\repl\replset_commands.cpp" />
+ <ClCompile Include="..\db\repl\rs_config.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <None Include="..\SConstruct" />
+ <None Include="btreetests.inl" />
+ </ItemGroup>
+ <ItemGroup>
+ <Library Include="..\..\js\js32d.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js32r.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js64d.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js64r.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project> \ No newline at end of file
diff --git a/src/mongo/dbtests/test.vcxproj.filters b/src/mongo/dbtests/test.vcxproj.filters
new file mode 100755
index 00000000000..a692d0ca692
--- /dev/null
+++ b/src/mongo/dbtests/test.vcxproj.filters
@@ -0,0 +1,939 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <Filter Include="misc and third party">
+ <UniqueIdentifier>{17c97725-06a4-41a6-bc1c-f0e05eada682}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="misc and third party">
+ <UniqueIdentifier>{0a50fb63-4ac3-4e30-a9d4-b0841878ee73}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="client">
+ <UniqueIdentifier>{45dab36c-864e-45de-bb8e-cf1d87a2c4f6}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="db">
+ <UniqueIdentifier>{69e233b0-5354-4612-8474-d4e4faaee607}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="db\cpp">
+ <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+ <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+ </Filter>
+ <Filter Include="db\h">
+ <UniqueIdentifier>{f86d2fc9-fb76-40cf-943d-330feb945ff3}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="util">
+ <UniqueIdentifier>{0ec2e082-aace-46da-9898-a1a7b24d60b7}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="util\cpp">
+ <UniqueIdentifier>{12efa241-3593-4177-a0cb-1eb672491f49}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="shard">
+ <UniqueIdentifier>{3865c5a5-bdb1-4420-a3ae-5a6615d563d4}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="scripting">
+ <UniqueIdentifier>{28893dc5-8a18-429a-b5c9-2cf701d324da}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="dbtests">
+ <UniqueIdentifier>{bc08b47a-daa3-4894-b9af-ae88755838db}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="stats">
+ <UniqueIdentifier>{2b914dc3-a760-4397-a12b-73a0381fa71d}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="replsets">
+ <UniqueIdentifier>{9320a670-3b28-471a-bf92-6c8d881a37a4}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="util\concurrency">
+ <UniqueIdentifier>{d499fdba-b256-4b12-af20-cdd1ae1addff}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="util\h">
+ <UniqueIdentifier>{353b6f01-1cab-4156-a576-bc75ab204776}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="btree">
+ <UniqueIdentifier>{4fff2dbf-30c4-4295-8db8-d513c1e36220}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="dur">
+ <UniqueIdentifier>{c296d097-0d46-46ee-9097-f2df659d9596}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="bson">
+ <UniqueIdentifier>{e6652333-c77f-420c-af8e-72d55bc095fe}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="misc and third party\snappy">
+ <UniqueIdentifier>{fbc4416f-ca67-4e63-a1ea-49027de7e080}</UniqueIdentifier>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\..\boostw\boost_1_34_1\boost\config\auto_link.hpp">
+ <Filter>misc and third party</Filter>
+ </ClInclude>
+ <ClInclude Include="..\targetver.h">
+ <Filter>misc and third party</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\boostw\boost_1_34_1\boost\version.hpp">
+ <Filter>misc and third party</Filter>
+ </ClInclude>
+ <ClInclude Include="..\client\connpool.h">
+ <Filter>client</Filter>
+ </ClInclude>
+ <ClInclude Include="..\client\dbclient.h">
+ <Filter>client</Filter>
+ </ClInclude>
+ <ClInclude Include="..\client\model.h">
+ <Filter>client</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\clientcursor.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\cmdline.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\commands.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\concurrency.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\curop.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\cursor.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\database.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\db.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\dbhelpers.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\dbinfo.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\dbmessage.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\diskloc.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\extsort.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\introspect.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\jsobj.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\json.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\matcher.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\grid\message.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\minilex.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\namespace.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\pch.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\pdfile.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\grid\protocol.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\query.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\queryoptimizer.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\repl.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\replset.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\resource.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\scanandorder.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\security.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\btree.h">
+ <Filter>btree</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\concurrency\list.h">
+ <Filter>util\concurrency</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\concurrency\value.h">
+ <Filter>util\concurrency</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\concurrency\task.h">
+ <Filter>util\concurrency</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\builder.h">
+ <Filter>util\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\unittest.h">
+ <Filter>util\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\file.h">
+ <Filter>util\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\goodies.h">
+ <Filter>util\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\hashtab.h">
+ <Filter>util\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\lasterror.h">
+ <Filter>util\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\log.h">
+ <Filter>util\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\lruishmap.h">
+ <Filter>util\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\md5.h">
+ <Filter>util\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\md5.hpp">
+ <Filter>util\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\miniwebserver.h">
+ <Filter>util\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\mmap.h">
+ <Filter>util\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\sock.h">
+ <Filter>util\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\dur.h">
+ <Filter>dur</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\dur_journal.h">
+ <Filter>dur</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\logfile.h">
+ <Filter>dur</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\mongommf.h">
+ <Filter>dur</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\durop.h">
+ <Filter>dur</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\jsobjmanipulator.h">
+ <Filter>db</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\mongomutex.h">
+ <Filter>db</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\mongoutils\hash.h">
+ <Filter>util\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\checksum.h">
+ <Filter>util</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bson\bson.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bson\bson_db.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bson\bsonelement.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bson\bson-inl.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bson\bsonmisc.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bson\bsonobj.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bson\bsonobjbuilder.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bson\bsonobjiterator.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bson\bsontypes.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bson\inline_decls.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bson\oid.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bson\ordering.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\bson\stringdata.h">
+ <Filter>bson</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\ops\delete.h">
+ <Filter>db\cpp</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\ops\update.h">
+ <Filter>db\cpp</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\ops\query.h">
+ <Filter>db\cpp</Filter>
+ </ClInclude>
+ <ClInclude Include="..\server.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\snappy\config.h">
+ <Filter>misc and third party\snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\snappy\snappy.h">
+ <Filter>misc and third party\snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\snappy\snappy-c.h">
+ <Filter>misc and third party\snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\snappy\snappy-internal.h">
+ <Filter>misc and third party\snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\snappy\snappy-sinksource.h">
+ <Filter>misc and third party\snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\snappy\snappy-stubs-internal.h">
+ <Filter>misc and third party\snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\snappy\snappy-stubs-public.h">
+ <Filter>misc and third party\snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\compress.h">
+ <Filter>misc and third party</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\pcre-7.4\pcrecpp.h" />
+ <ClInclude Include="..\third_party\pcre-7.4\config.h" />
+ <ClInclude Include="..\third_party\pcre-7.4\pcre.h" />
+ <ClInclude Include="..\db\collection.h" />
+ <ClInclude Include="..\db\databaseholder.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="framework.h" />
+ <ClInclude Include="..\db\ops\count.h">
+ <Filter>db\h</Filter>
+ </ClInclude>
+ <ClInclude Include="..\client\dbclientmockcursor.h" />
+ <ClInclude Include="..\db\pagefault.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <Library Include="..\..\js\js64r.lib">
+ <Filter>misc and third party</Filter>
+ </Library>
+ <Library Include="..\..\js\js32d.lib">
+ <Filter>misc and third party</Filter>
+ </Library>
+ <Library Include="..\..\js\js32r.lib">
+ <Filter>misc and third party</Filter>
+ </Library>
+ <Library Include="..\..\js\js64d.lib">
+ <Filter>misc and third party</Filter>
+ </Library>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="..\client\connpool.cpp">
+ <Filter>client</Filter>
+ </ClCompile>
+ <ClCompile Include="..\client\dbclient.cpp">
+ <Filter>client</Filter>
+ </ClCompile>
+ <ClCompile Include="..\client\dbclientcursor.cpp">
+ <Filter>client</Filter>
+ </ClCompile>
+ <ClCompile Include="..\client\syncclusterconnection.cpp">
+ <Filter>client</Filter>
+ </ClCompile>
+ <ClCompile Include="..\pch.cpp">
+ <Filter>db</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\client.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\clientcursor.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\cloner.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\commands.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\common.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\cursor.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\database.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\dbcommands.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\dbeval.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\dbhelpers.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\dbwebserver.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\extsort.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\index.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\indexkey.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\instance.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\introspect.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\jsobj.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\json.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\lasterror.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\matcher.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\mmap_win.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\namespace.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\nonce.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pdfile.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\queryoptimizer.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\repl.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\security.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\security_commands.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\tests.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\cmdline.cpp">
+ <Filter>db\h</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\matcher_covered.cpp">
+ <Filter>db\h</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\oplog.cpp">
+ <Filter>db\h</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\queryutil.cpp">
+ <Filter>db\h</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\repl_block.cpp">
+ <Filter>db\h</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\assert_util.cpp">
+ <Filter>util\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\background.cpp">
+ <Filter>util\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\base64.cpp">
+ <Filter>util\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\md5.c">
+ <Filter>util\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\md5main.cpp">
+ <Filter>util\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\mmap.cpp">
+ <Filter>util\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\processinfo_win32.cpp">
+ <Filter>util\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\util.cpp">
+ <Filter>util\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\s\d_logic.cpp">
+ <Filter>shard</Filter>
+ </ClCompile>
+ <ClCompile Include="..\scripting\engine.cpp">
+ <Filter>scripting</Filter>
+ </ClCompile>
+ <ClCompile Include="..\scripting\engine_spidermonkey.cpp">
+ <Filter>scripting</Filter>
+ </ClCompile>
+ <ClCompile Include="..\shell\mongo.cpp">
+ <Filter>scripting</Filter>
+ </ClCompile>
+ <ClCompile Include="..\scripting\utils.cpp">
+ <Filter>scripting</Filter>
+ </ClCompile>
+ <ClCompile Include="basictests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="btreetests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="clienttests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="cursortests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="dbtests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="framework.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="jsobjtests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="jsontests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="jstests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="matchertests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="namespacetests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="pdfiletests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="queryoptimizertests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="querytests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="repltests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="socktests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="threadedtests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="updatetests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\stats\counters.cpp">
+ <Filter>stats</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\stats\snapshots.cpp">
+ <Filter>stats</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\stats\top.cpp">
+ <Filter>stats</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\repl\consensus.cpp">
+ <Filter>replsets</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\repl\health.cpp">
+ <Filter>replsets</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\repl\replset_commands.cpp">
+ <Filter>replsets</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\repl\rs_config.cpp">
+ <Filter>replsets</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\btree.cpp">
+ <Filter>btree</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\btreecursor.cpp">
+ <Filter>btree</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\repl\manager.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\repl\rs_initiate.cpp">
+ <Filter>replsets</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\concurrency\vars.cpp">
+ <Filter>util\concurrency</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\concurrency\task.cpp">
+ <Filter>util\concurrency</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\repl\heartbeat.cpp">
+ <Filter>replsets</Filter>
+ </ClCompile>
+ <ClCompile Include="..\s\shardconnection.cpp">
+ <Filter>shard</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\concurrency\thread_pool.cpp">
+ <Filter>util\concurrency</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\version.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\repl\rs.cpp">
+ <Filter>replsets</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\text.cpp">
+ <Filter>util\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\client\gridfs.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\s\d_writeback.cpp">
+ <Filter>shard</Filter>
+ </ClCompile>
+ <ClCompile Include="..\s\d_state.cpp">
+ <Filter>shard</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\geo\2d.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\s\chunk.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\s\config.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\s\shardkey.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\s\shard.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\client\model.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\client\parallel.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\stringutils.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\client\distlock.cpp">
+ <Filter>client</Filter>
+ </ClCompile>
+ <ClCompile Include="..\s\d_migrate.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\s\d_split.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\repl\rs_rollback.cpp">
+ <Filter>replsets</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\repl\rs_sync.cpp">
+ <Filter>replsets</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\repl\rs_initialsync.cpp">
+ <Filter>replsets</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\geo\haystack.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\cap.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\log.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\processinfo.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\s\grid.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\restapi.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="mmaptests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="..\scripting\bench.cpp">
+ <Filter>scripting</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\compact.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\commands\isself.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\dur.cpp">
+ <Filter>dur</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\dur_journal.cpp">
+ <Filter>dur</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\logfile.cpp">
+ <Filter>dur</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\mongommf.cpp">
+ <Filter>dur</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\projection.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\s\d_chunk_manager.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\dur_recover.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\durop.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\dbcommands_generic.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\alignedbuilder.cpp">
+ <Filter>util</Filter>
+ </ClCompile>
+ <ClCompile Include="..\bson\oid.cpp">
+ <Filter>db</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\concurrency\synchronization.cpp">
+ <Filter>util</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\dur_commitjob.cpp">
+ <Filter>dur</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\dur_writetodatafiles.cpp">
+ <Filter>dur</Filter>
+ </ClCompile>
+ <ClCompile Include="..\client\dbclient_rs.cpp">
+ <Filter>client</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\dur_preplogbuffer.cpp">
+ <Filter>dur</Filter>
+ </ClCompile>
+ <ClCompile Include="perftests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="directclienttests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\file_allocator.cpp">
+ <Filter>util\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\dbcommands_admin.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\querypattern.cpp">
+ <Filter>db</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\ramlog.cpp">
+ <Filter>util</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\key.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\btreebuilder.cpp">
+ <Filter>btree</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\queryoptimizercursor.cpp">
+ <Filter>db</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\record.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\ops\delete.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\ops\update.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\security_common.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\ops\query.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\dbmessage.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\net\message.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\net\listen.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\net\message_server_port.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\net\message_port.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\net\miniwebserver.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\net\sock.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="spin_lock_test.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\concurrency\spin_lock.cpp">
+ <Filter>util\concurrency</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy.cc">
+ <Filter>misc and third party\snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\compress.cpp">
+ <Filter>misc and third party</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy-sinksource.cc">
+ <Filter>misc and third party\snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\scanandorder.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcrecpp.cc">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_chartables.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_compile.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_config.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_dfa_exec.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_exec.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_fullinfo.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_get.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_globals.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_info.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_maketables.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_newline.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ord2utf8.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_refcount.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_scanner.cc">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_stringpiece.cc">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_study.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_tables.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_try_flipped.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ucp_searchfuncs.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_valid_utf8.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_version.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_xclass.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcreposix.c">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\commands\cloud.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\d_concurrency.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="d_chunk_manager_tests.cpp">
+ <Filter>shard</Filter>
+ </ClCompile>
+ <ClCompile Include="queryoptimizercursortests.cpp">
+ <Filter>dbtests</Filter>
+ </ClCompile>
+ <ClCompile Include="..\s\default_version.cpp">
+ <Filter>shard</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\ops\count.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pagefault.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\d_globals.cpp">
+ <Filter>db</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\curop.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
+ </ItemGroup>
+ <ItemGroup>
+ <None Include="..\SConstruct">
+ <Filter>misc and third party</Filter>
+ </None>
+ <None Include="btreetests.inl">
+ <Filter>dbtests</Filter>
+ </None>
+ </ItemGroup>
+</Project> \ No newline at end of file
diff --git a/src/mongo/dbtests/threadedtests.cpp b/src/mongo/dbtests/threadedtests.cpp
new file mode 100644
index 00000000000..1304a276b7d
--- /dev/null
+++ b/src/mongo/dbtests/threadedtests.cpp
@@ -0,0 +1,649 @@
+// threadedtests.cpp - Tests for threaded code
+//
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../bson/util/atomic_int.h"
+#include "../util/concurrency/mvar.h"
+#include "../util/concurrency/thread_pool.h"
+#include "../util/concurrency/list.h"
+#include "../util/timer.h"
+#include <boost/thread.hpp>
+#include <boost/bind.hpp>
+#include "../db/d_concurrency.h"
+
+#include "dbtests.h"
+
+namespace ThreadedTests {
+
+ template <int nthreads_param=10>
+ class ThreadedTest {
+ public:
+ virtual void setup() {} // optional
+ virtual void subthread(int remaining) = 0; // each thread does whatever test work you want done
+ virtual void validate() = 0; // called after all the work is done
+
+ static const int nthreads = nthreads_param;
+
+ void run() {
+ setup();
+ launch_subthreads(nthreads);
+ validate();
+ }
+
+ virtual ~ThreadedTest() {} // not necessary, but makes compilers happy
+
+ private:
+ void launch_subthreads(int remaining) {
+ if (!remaining)
+ return;
+
+ boost::thread athread(boost::bind(&ThreadedTest::subthread, this, remaining));
+ launch_subthreads(remaining - 1);
+ athread.join();
+ }
+ };
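+
+ // Launch/join order note: launch_subthreads() recurses to create all
+ // 'nthreads' boost threads before any join() runs, so the subthreads
+ // execute concurrently and are joined in reverse creation order as the
+ // recursion unwinds. A minimal (hypothetical) subclass would look like:
+ //
+ //     class CounterTest : public ThreadedTest<4> {
+ //         AtomicUInt n;
+ //         void subthread(int) { n++; }
+ //         void validate()     { ASSERT_EQUALS(4u, n.x); }
+ //     };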
+
+ class MongoMutexTest : public ThreadedTest<135> {
+#if defined(_DEBUG)
+ enum { N = 5000 };
+#else
+ enum { N = 40000 };
+#endif
+ MongoMutex *mm;
+ ProgressMeter pm;
+ public:
+ MongoMutexTest() : pm(N * nthreads) {}
+ void run() {
+ DEV {
+ // in _DEBUG builds on linux we mprotect each time a writelock
+ // is taken. That can greatly slow down this test if there are
+ // many open files
+ DBDirectClient db;
+ db.simpleCommand("admin", NULL, "closeAllDatabases");
+ }
+
+ Timer t;
+ cout << "MongoMutexTest N:" << N << endl;
+ ThreadedTest<135>::run();
+ cout << "MongoMutexTest " << t.millis() << "ms" << endl;
+ }
+ private:
+ virtual void setup() {
+ mm = &d.dbMutex;
+ }
+ virtual void subthread(int) {
+ Client::initThread("mongomutextest");
+ sleepmillis(0);
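+ // cycle through several lock usage patterns keyed on i % 7: recursive
+ // shared, plain shared, exclusive, shared-within-exclusive,
+ // releaseEarly(), and the timed try-lock variants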
+ for( int i = 0; i < N; i++ ) {
+ if( i % 7 == 0 ) {
+ mm->lock_shared();
+ mm->lock_shared();
+ mm->unlock_shared();
+ mm->unlock_shared();
+ }
+ else if( i % 7 == 1 ) {
+ mm->lock_shared();
+ ASSERT( mm->atLeastReadLocked() );
+ mm->unlock_shared();
+ }
+ else if( i % 7 == 2 ) {
+ mm->lock();
+ ASSERT( mm->isWriteLocked() );
+ mm->unlock();
+ }
+ else if( i % 7 == 3 ) {
+ mm->lock();
+ mm->lock_shared();
+ ASSERT( mm->isWriteLocked() );
+ mm->unlock_shared();
+ mm->unlock();
+ }
+ else if( i % 7 == 4 ) {
+ mm->lock();
+ mm->releaseEarly();
+ mm->unlock();
+ }
+ else if( i % 7 == 5 ) {
+ if( mm->lock_try(1) ) {
+ mm->unlock();
+ }
+ }
+ else if( i % 7 == 6 ) {
+ if( mm->lock_shared_try(0) ) {
+ mm->unlock_shared();
+ }
+ }
+ else {
+ mm->lock_shared();
+ mm->unlock_shared();
+ }
+ pm.hit();
+ }
+ cc().shutdown();
+ }
+ virtual void validate() {
+ ASSERT( !mm->atLeastReadLocked() );
+ mm->lock();
+ mm->unlock();
+ mm->lock_shared();
+ mm->unlock_shared();
+ }
+ };
+
+ // Tested with up to 30k threads
+ class IsAtomicUIntAtomic : public ThreadedTest<> {
+ static const int iterations = 1000000;
+ AtomicUInt target;
+
+ void subthread(int) {
+ for(int i=0; i < iterations; i++) {
+ //target.x++; // verified to fail with this version
+ target++;
+ }
+ }
+ void validate() {
+ ASSERT_EQUALS(target.x , unsigned(nthreads * iterations));
+
+ AtomicUInt u;
+ ASSERT_EQUALS(0u, u);
+ ASSERT_EQUALS(0u, u++);
+ ASSERT_EQUALS(2u, ++u);
+ ASSERT_EQUALS(2u, u--);
+ ASSERT_EQUALS(0u, --u);
+ ASSERT_EQUALS(0u, u);
+
+ u++;
+ ASSERT( u > 0 );
+
+ u--;
+ ASSERT( ! ( u > 0 ) );
+ }
+ };
+
+ class MVarTest : public ThreadedTest<> {
+ static const int iterations = 10000;
+ MVar<int> target;
+
+ public:
+ MVarTest() : target(0) {}
+ void subthread(int) {
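+ // take() empties the MVar (blocking other takers until put() refills
+ // it), so each increment below is effectively serialized; validate()
+ // checks that no increments were lost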
+ for(int i=0; i < iterations; i++) {
+ int val = target.take();
+#if BOOST_VERSION >= 103500
+ //increase chances of catching failure
+ boost::this_thread::yield();
+#endif
+ target.put(val+1);
+ }
+ }
+ void validate() {
+ ASSERT_EQUALS(target.take() , nthreads * iterations);
+ }
+ };
+
+ class ThreadPoolTest {
+ static const int iterations = 10000;
+ static const int nThreads = 8;
+
+ AtomicUInt counter;
+ void increment(int n) {
+ for (int i=0; i<n; i++) {
+ counter++;
+ }
+ }
+
+ public:
+ void run() {
+ ThreadPool tp(nThreads);
+
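+ // queue 'iterations' tasks, each bumping the shared counter twice;
+ // tp.join() waits for the work to drain, so the final count must be
+ // exactly iterations * 2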
+ for (int i=0; i < iterations; i++) {
+ tp.schedule(&ThreadPoolTest::increment, this, 2);
+ }
+
+ tp.join();
+
+ ASSERT(counter == (unsigned)(iterations * 2));
+ }
+ };
+
+ class LockTest {
+ public:
+ void run() {
+ // quick AtomicUInt wraparound test
+ // MSGID generation likely assumes this wrap-to-zero semantics
+ AtomicUInt counter = 0xffffffff;
+ counter++;
+ ASSERT( counter == 0 );
+
+ writelocktry lk( "" , 0 );
+ ASSERT( lk.got() );
+ ASSERT( d.dbMutex.isWriteLocked() );
+ }
+ };
+
+ class RWLockTest1 {
+ public:
+ void run() {
+ RWLock lk( "eliot" );
+ {
+ rwlock r( lk , true , 1000 );
+ }
+ }
+ };
+
+ class RWLockTest2 {
+ public:
+
+ static void worker1( RWLockRecursiveNongreedy * lk , AtomicUInt * x ) {
+ (*x)++; // 1
+ //cout << "lock b try" << endl;
+ RWLockRecursiveNongreedy::Exclusive b(*lk);
+ //cout << "lock b got" << endl;
+ (*x)++; // 2
+ }
+
+ static void worker2( RWLockRecursiveNongreedy * lk , AtomicUInt * x ) {
+ //cout << "lock c try" << endl;
+ RWLockRecursiveNongreedy::Shared c(*lk);
+ (*x)++;
+ //cout << "lock c got" << endl;
+ }
+
+ void run() {
+ /**
+ * note: this test will deadlock if the code breaks
+ */
+
+ RWLockRecursiveNongreedy lk( "eliot2" , 120 * 1000 );
+ cout << "RWLock impl: " << lk.implType() << endl;
+
+ auto_ptr<RWLockRecursiveNongreedy::Shared> a( new RWLockRecursiveNongreedy::Shared(lk) );
+
+ AtomicUInt x1 = 0;
+ cout << "A : " << &x1 << endl;
+ boost::thread t1( boost::bind( worker1 , &lk , &x1 ) );
+ while ( ! x1 );
+ assert( x1 == 1 );
+ sleepmillis( 500 );
+ assert( x1 == 1 );
+
+ AtomicUInt x2 = 0;
+
+ boost::thread t2( boost::bind( worker2, &lk , &x2 ) );
+ t2.join();
+ assert( x2 == 1 );
+
+ a.reset();
+
+ for ( int i=0; i<2000; i++ ) {
+ if ( x1 == 2 )
+ break;
+ sleepmillis(1);
+ }
+
+ assert( x1 == 2 );
+ t1.join();
+
+ }
+ };
+
+
+
+ /** test of shared lock */
+ class RWLockTest3 {
+ public:
+
+ static void worker2( RWLockRecursiveNongreedy * lk , AtomicUInt * x ) {
+ assert( ! lk->__lock_try(0) );
+ //cout << "lock c try" << endl;
+ RWLockRecursiveNongreedy::Shared c( *lk );
+ (*x)++;
+ //cout << "lock c got" << endl;
+ }
+
+ void run() {
+ /**
+ * note: this test will deadlock if the code breaks
+ */
+
+ RWLockRecursiveNongreedy lk( "eliot2" , 120 * 1000 );
+
+ auto_ptr<RWLockRecursiveNongreedy::Shared> a( new RWLockRecursiveNongreedy::Shared( lk ) );
+
+ AtomicUInt x2 = 0;
+
+ boost::thread t2( boost::bind( worker2, &lk , &x2 ) );
+ t2.join();
+ assert( x2 == 1 );
+
+ a.reset();
+
+ }
+ };
+
+ class RWLockTest4 {
+ public:
+
+#if defined(__linux__) || defined(__APPLE__)
+ static void worker1( pthread_rwlock_t * lk , AtomicUInt * x ) {
+ (*x)++; // 1
+ cout << "lock b try" << endl;
+ while ( 1 ) {
+ if ( pthread_rwlock_trywrlock( lk ) == 0 )
+ break;
+ sleepmillis(10);
+ }
+ cout << "lock b got" << endl;
+ (*x)++; // 2
+ pthread_rwlock_unlock( lk );
+ }
+
+ static void worker2( pthread_rwlock_t * lk , AtomicUInt * x ) {
+ cout << "lock c try" << endl;
+ pthread_rwlock_rdlock( lk );
+ (*x)++;
+ cout << "lock c got" << endl;
+ pthread_rwlock_unlock( lk );
+ }
+#endif
+ void run() {
+ /**
+ * note: this test will deadlock if the code breaks
+ */
+
+#if defined(__linux__) || defined(__APPLE__)
+
+ // create
+ pthread_rwlock_t lk;
+ assert( pthread_rwlock_init( &lk , 0 ) == 0 );
+
+ // read lock
+ assert( pthread_rwlock_rdlock( &lk ) == 0 );
+
+ AtomicUInt x1 = 0;
+ boost::thread t1( boost::bind( worker1 , &lk , &x1 ) );
+ while ( ! x1 );
+ assert( x1 == 1 );
+ sleepmillis( 500 );
+ assert( x1 == 1 );
+
+ AtomicUInt x2 = 0;
+
+ boost::thread t2( boost::bind( worker2, &lk , &x2 ) );
+ t2.join();
+ assert( x2 == 1 );
+
+ pthread_rwlock_unlock( &lk );
+
+ for ( int i=0; i<2000; i++ ) {
+ if ( x1 == 2 )
+ break;
+ sleepmillis(1);
+ }
+
+ assert( x1 == 2 );
+ t1.join();
+#endif
+ }
+ };
+
+ class List1Test2 : public ThreadedTest<> {
+ static const int iterations = 1000; // note: a lot of iterations will use a lot of memory as List1 leaks on purpose
+ class M : public List1<M>::Base {
+ public:
+ M(int x) : _x(x) { }
+ const int _x;
+ };
+ List1<M> l;
+ public:
+ void validate() { }
+ void subthread(int) {
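+ // randomized stress: r == 0 orphans the whole list, r in [1,3] pushes
+ // a node carrying that value, and anything else walks the list and
+ // occasionally orphans a node it finds; concurrent races are the point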
+ for(int i=0; i < iterations; i++) {
+ int r = std::rand() % 256;
+ if( r == 0 ) {
+ l.orphanAll();
+ }
+ else if( r < 4 ) {
+ l.push(new M(r));
+ }
+ else {
+ M *orph = 0;
+ for( M *m = l.head(); m; m=m->next() ) {
+ ASSERT( m->_x > 0 && m->_x < 4 );
+ if( r > 192 && std::rand() % 8 == 0 )
+ orph = m;
+ }
+ if( orph ) {
+ try {
+ l.orphan(orph);
+ }
+ catch(...) { }
+ }
+ }
+ }
+ }
+ };
+
+ class List1Test {
+ public:
+ class M : public List1<M>::Base {
+ ~M();
+ public:
+ M( int x ) {
+ num = x;
+ }
+ int num;
+ };
+
+ void run(){
+ List1<M> l;
+
+ vector<M*> ms;
+ for ( int i=0; i<5; i++ ) {
+ M * m = new M(i);
+ ms.push_back( m );
+ l.push( m );
+ }
+
+ // must assert as the item is missing
+ ASSERT_THROWS( l.orphan( new M( -3 ) ) , UserException );
+ }
+ };
+
+ class Hierarchical1 {
+ public:
+ void run() {
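+ // a lone read lock, recursive read locks on one collection, and a
+ // read lock on "foo" plus its "foo.$bar" sub-namespace should all
+ // succeed; the write and mixed cases below compile only when CLC is
+ // defined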
+ {
+ LockCollectionForReading x("bar");
+ }
+ {
+ LockCollectionForReading x("foo");
+ LockCollectionForReading y("foo"); // recursion is ok
+ }
+ {
+ LockCollectionForReading x("foo");
+ LockCollectionForReading y("foo.$bar");
+ }
+#if defined(CLC)
+ {
+ LockCollectionForWriting x("foo");
+ LockCollectionForWriting y("foo");
+ }
+ {
+ LockCollectionForReading x("foo");
+ ASSERT_THROWS( LockCollectionForWriting y("foo"), DBException )
+ }
+ {
+ LockCollectionForReading x("foo");
+ ASSERT_THROWS( LockCollectionForReading y("bar"), DBException )
+ }
+#endif
+ cout << "temp ok" << endl;
+ }
+ };
+
+#if 1
+ class UpgradableTest : public ThreadedTest<7> {
+ RWLock m;
+ public:
+ UpgradableTest() : m("utest") {}
+ private:
+ virtual void validate() { }
+ virtual void subthread(int x) {
+ Client::initThread("utest");
+
+ /* r = get a read lock
+ R = get a read lock and we expect it to be fast
+ u = get upgradable
+ U = get upgradable and we expect it to be fast
+ w = get a write lock
+ */
+ // /-- verify upgrade can be done instantly while in a read lock already
+ // | /-- verify upgrade acquisition isn't greedy
+ // | | /-- verify writes aren't greedy while in upgradable (or are they?)
+ // v v v
+ const char *what = " RURuRwR";
+
+ sleepmillis(100*x);
+
+ log() << x << ' ' << what[x] << " request" << endl;
+ char ch = what[x];
+ switch( ch ) {
+ case 'w':
+ {
+ m.lock();
+ log() << x << " w got" << endl;
+ sleepmillis(100);
+ log() << x << " w unlock" << endl;
+ m.unlock();
+ }
+ break;
+ case 'u':
+ case 'U':
+ {
+ Timer t;
+ RWLock::Upgradable u(m);
+ log() << x << ' ' << ch << " got" << endl;
+ if( ch == 'U' ) {
+#ifdef MONGO_USE_SRW_ON_WINDOWS
+ if( t.millis() > 200 ) {
+#else
+ if( t.millis() > 20 ) {
+#endif
+ DEV {
+ // a _DEBUG buildbot might be slow, try to avoid false positives
+ log() << "warning lock upgrade was slow " << t.millis() << endl;
+ }
+ else {
+ log() << "assertion failure: lock upgrade was too slow: " << t.millis() << endl;
+ ASSERT( false );
+ }
+ }
+ }
+ sleepsecs(1);
+ log() << x << ' ' << ch << " unlock" << endl;
+ }
+ break;
+ case 'r':
+ case 'R':
+ {
+ Timer t;
+ m.lock_shared();
+ log() << x << ' ' << ch << " got " << endl;
+ if( what[x] == 'R' ) {
+ if( t.millis() > 15 ) {
+ log() << x << " warning: when in upgradable, write locks are still greedy on this platform" << endl;
+ }
+ }
+ sleepmillis(200);
+ log() << x << ' ' << ch << " unlock" << endl;
+ m.unlock_shared();
+ }
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ cc().shutdown();
+ }
+ };
+#endif
+
+ class WriteLocksAreGreedy : public ThreadedTest<3> {
+ public:
+ WriteLocksAreGreedy() : m("gtest") {}
+ private:
+ RWLock m;
+ virtual void validate() { }
+ virtual void subthread(int x) {
+ Client::initThread("utest");
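+ // schedule: thread 1 holds a shared lock for ~300ms; thread 2 requests
+ // the write lock at ~100ms and queues; thread 3 requests a shared lock
+ // at ~200ms and, if write locks are greedy, must wait behind the queued
+ // writer, so its timer should read well over 50ms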
+ if( x == 1 ) {
+ cout << mongo::curTimeMillis64() % 10000 << " 1" << endl;
+ rwlock_shared lk(m);
+ sleepmillis(300);
+ cout << mongo::curTimeMillis64() % 10000 << " 1x" << endl;
+ }
+ if( x == 2 ) {
+ sleepmillis(100);
+ cout << mongo::curTimeMillis64() % 10000 << " 2" << endl;
+ rwlock lk(m, true);
+ //m._lock();
+ cout << mongo::curTimeMillis64() % 10000 << " 2x" << endl;
+ //m.unlock();
+ }
+ if( x == 3 ) {
+ sleepmillis(200);
+ Timer t;
+ cout << mongo::curTimeMillis64() % 10000 << " 3" << endl;
+ rwlock_shared lk(m);
+ cout << mongo::curTimeMillis64() % 10000 << " 3x" << endl;
+ cout << t.millis() << endl;
+ ASSERT( t.millis() > 50 );
+ }
+ cc().shutdown();
+ }
+ };
+
+ class All : public Suite {
+ public:
+ All() : Suite( "threading" ) { }
+
+ void setupTests() {
+ add< Hierarchical1 >();
+
+ add< WriteLocksAreGreedy >();
+ add< UpgradableTest >();
+ add< List1Test >();
+ add< List1Test2 >();
+
+ add< IsAtomicUIntAtomic >();
+ add< MVarTest >();
+ add< ThreadPoolTest >();
+ add< LockTest >();
+
+
+ add< RWLockTest1 >();
+ //add< RWLockTest2 >(); // SERVER-2996
+ add< RWLockTest3 >();
+ add< RWLockTest4 >();
+
+ add< MongoMutexTest >();
+ }
+ } myall;
+}
diff --git a/src/mongo/dbtests/updatetests.cpp b/src/mongo/dbtests/updatetests.cpp
new file mode 100644
index 00000000000..c912bf437d0
--- /dev/null
+++ b/src/mongo/dbtests/updatetests.cpp
@@ -0,0 +1,877 @@
+// updatetests.cpp : unit tests relating to update requests
+//
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../db/ops/query.h"
+
+#include "../db/db.h"
+#include "../db/instance.h"
+#include "../db/json.h"
+#include "../db/lasterror.h"
+#include "../db/ops/update.h"
+
+#include "dbtests.h"
+
+namespace UpdateTests {
+
+ class ClientBase {
+ public:
+ // NOTE: Not bothering to back up the old error record.
+ ClientBase() {
+ mongo::lastError.reset( new LastError() );
+ }
+ ~ClientBase() {
+ mongo::lastError.release();
+ }
+ protected:
+ static void insert( const char *ns, BSONObj o ) {
+ client_.insert( ns, o );
+ }
+ static void update( const char *ns, BSONObj q, BSONObj o, bool upsert = false ) {
+ client_.update( ns, Query( q ), o, upsert );
+ }
+ static bool error() {
+ return !client_.getPrevError().getField( "err" ).isNull();
+ }
+ DBDirectClient &client() const { return client_; }
+ private:
+ static DBDirectClient client_;
+ };
+ DBDirectClient ClientBase::client_;
+
+ class Fail : public ClientBase {
+ public:
+ virtual ~Fail() {}
+ void run() {
+ prep();
+ ASSERT( !error() );
+ doIt();
+ ASSERT( error() );
+ }
+ protected:
+ const char *ns() { return "unittests.UpdateTests_Fail"; }
+ virtual void prep() {
+ insert( ns(), fromjson( "{a:1}" ) );
+ }
+ virtual void doIt() = 0;
+ };
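+
+ // Fail subclasses use a template-method pattern: prep() seeds a
+ // document, doIt() issues an update that should be rejected, and
+ // run() verifies that the last-error record was set.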
+
+ class ModId : public Fail {
+ void doIt() {
+ update( ns(), BSONObj(), fromjson( "{$set:{'_id':4}}" ) );
+ }
+ };
+
+ class ModNonmodMix : public Fail {
+ void doIt() {
+ update( ns(), BSONObj(), fromjson( "{$set:{a:4},z:3}" ) );
+ }
+ };
+
+ class InvalidMod : public Fail {
+ void doIt() {
+ update( ns(), BSONObj(), fromjson( "{$awk:{a:4}}" ) );
+ }
+ };
+
+ class ModNotFirst : public Fail {
+ void doIt() {
+ update( ns(), BSONObj(), fromjson( "{z:3,$set:{a:4}}" ) );
+ }
+ };
+
+ class ModDuplicateFieldSpec : public Fail {
+ void doIt() {
+ update( ns(), BSONObj(), fromjson( "{$set:{a:4},$inc:{a:1}}" ) );
+ }
+ };
+
+ class IncNonNumber : public Fail {
+ void doIt() {
+ update( ns(), BSONObj(), fromjson( "{$inc:{a:'d'}}" ) );
+ }
+ };
+
+ class PushAllNonArray : public Fail {
+ void doIt() {
+ insert( ns(), fromjson( "{a:[1]}" ) );
+ update( ns(), BSONObj(), fromjson( "{$pushAll:{a:'d'}}" ) );
+ }
+ };
+
+ class PullAllNonArray : public Fail {
+ void doIt() {
+ insert( ns(), fromjson( "{a:[1]}" ) );
+ update( ns(), BSONObj(), fromjson( "{$pullAll:{a:'d'}}" ) );
+ }
+ };
+
+ class IncTargetNonNumber : public Fail {
+ void doIt() {
+ insert( ns(), BSON( "a" << "a" ) );
+ update( ns(), BSON( "a" << "a" ), fromjson( "{$inc:{a:1}}" ) );
+ }
+ };
+
+ class SetBase : public ClientBase {
+ public:
+ ~SetBase() {
+ client().dropCollection( ns() );
+ }
+ protected:
+ const char *ns() { return "unittests.updatetests.SetBase"; }
+ };
+
+ class SetNum : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), BSON( "a" << 1 ) );
+ client().update( ns(), BSON( "a" << 1 ), BSON( "$set" << BSON( "a" << 4 ) ) );
+ ASSERT( !client().findOne( ns(), BSON( "a" << 4 ) ).isEmpty() );
+ }
+ };
+
+ class SetString : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), BSON( "a" << "b" ) );
+ client().update( ns(), BSON( "a" << "b" ), BSON( "$set" << BSON( "a" << "c" ) ) );
+ ASSERT( !client().findOne( ns(), BSON( "a" << "c" ) ).isEmpty() );
+ }
+ };
+
+ class SetStringDifferentLength : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), BSON( "a" << "b" ) );
+ client().update( ns(), BSON( "a" << "b" ), BSON( "$set" << BSON( "a" << "cd" ) ) );
+ ASSERT( !client().findOne( ns(), BSON( "a" << "cd" ) ).isEmpty() );
+ }
+ };
+
+ class SetStringToNum : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), BSON( "a" << "b" ) );
+ client().update( ns(), Query(), BSON( "$set" << BSON( "a" << 5 ) ) );
+ ASSERT( !client().findOne( ns(), BSON( "a" << 5 ) ).isEmpty() );
+ }
+ };
+
+ class SetStringToNumInPlace : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), BSON( "a" << "bcd" ) );
+ client().update( ns(), Query(), BSON( "$set" << BSON( "a" << 5.0 ) ) );
+ ASSERT( !client().findOne( ns(), BSON( "a" << 5.0 ) ).isEmpty() );
+ }
+ };
+
+ class ModDotted : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{a:{b:4}}" ) );
+ client().update( ns(), Query(), BSON( "$inc" << BSON( "a.b" << 10 ) ) );
+ ASSERT( !client().findOne( ns(), BSON( "a.b" << 14 ) ).isEmpty() );
+ client().update( ns(), Query(), BSON( "$set" << BSON( "a.b" << 55 ) ) );
+ ASSERT( !client().findOne( ns(), BSON( "a.b" << 55 ) ).isEmpty() );
+ }
+ };
+
+ class SetInPlaceDotted : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{a:{b:'cdef'}}" ) );
+ client().update( ns(), Query(), BSON( "$set" << BSON( "a.b" << "llll" ) ) );
+ ASSERT( !client().findOne( ns(), BSON( "a.b" << "llll" ) ).isEmpty() );
+ }
+ };
+
+ class SetRecreateDotted : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0,a:{b:'cdef'}}" ) );
+ client().update( ns(), Query(), BSON( "$set" << BSON( "a.b" << "lllll" ) ) );
+ ASSERT( client().findOne( ns(), BSON( "a.b" << "lllll" ) ).woCompare( fromjson( "{'_id':0,a:{b:'lllll'}}" ) ) == 0 );
+ }
+ };
+
+ class SetMissingDotted : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0}" ) );
+ client().update( ns(), BSONObj(), BSON( "$set" << BSON( "a.b" << "lllll" ) ) );
+ ASSERT( client().findOne( ns(), BSON( "a.b" << "lllll" ) ).woCompare( fromjson( "{'_id':0,a:{b:'lllll'}}" ) ) == 0 );
+ }
+ };
+
+ class SetAdjacentDotted : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0,a:{c:4}}" ) );
+ client().update( ns(), Query(), BSON( "$set" << BSON( "a.b" << "lllll" ) ) );
+ ASSERT_EQUALS( client().findOne( ns(), BSON( "a.b" << "lllll" ) ) , fromjson( "{'_id':0,a:{b:'lllll',c:4}}" ) );
+ }
+ };
+
+ class IncMissing : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0}" ) );
+ client().update( ns(), Query(), BSON( "$inc" << BSON( "f" << 3.0 ) ) );
+ ASSERT( client().findOne( ns(), Query() ).woCompare( fromjson( "{'_id':0,f:3}" ) ) == 0 );
+ }
+ };
+
+ class MultiInc : public SetBase {
+ public:
+
+ string s() {
+ stringstream ss;
+ auto_ptr<DBClientCursor> cc = client().query( ns() , Query().sort( BSON( "_id" << 1 ) ) );
+ bool first = true;
+ while ( cc->more() ) {
+ if ( first ) first = false;
+ else ss << ",";
+
+ BSONObj o = cc->next();
+ ss << o["x"].numberInt();
+ }
+ return ss.str();
+ }
+
+ void run() {
+ client().insert( ns(), BSON( "_id" << 1 << "x" << 1 ) );
+ client().insert( ns(), BSON( "_id" << 2 << "x" << 5 ) );
+
+ ASSERT_EQUALS( "1,5" , s() );
+
+ client().update( ns() , BSON( "_id" << 1 ) , BSON( "$inc" << BSON( "x" << 1 ) ) );
+ ASSERT_EQUALS( "2,5" , s() );
+
+ client().update( ns() , BSONObj() , BSON( "$inc" << BSON( "x" << 1 ) ) );
+ ASSERT_EQUALS( "3,5" , s() );
+
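+ // the final two arguments are upsert=false, multi=true, so this
+ // update increments every matching document, not just the first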
+ client().update( ns() , BSONObj() , BSON( "$inc" << BSON( "x" << 1 ) ) , false , true );
+ ASSERT_EQUALS( "4,6" , s() );
+
+ }
+ };
+
+ class UnorderedNewSet : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0}" ) );
+ client().update( ns(), Query(), BSON( "$set" << BSON( "f.g.h" << 3.0 << "f.g.a" << 2.0 ) ) );
+ ASSERT( client().findOne( ns(), Query() ).woCompare( fromjson( "{'_id':0,f:{g:{a:2,h:3}}}" ) ) == 0 );
+ }
+ };
+
+ class UnorderedNewSetAdjacent : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0}" ) );
+ client().update( ns(), BSONObj(), BSON( "$set" << BSON( "f.g.h.b" << 3.0 << "f.g.a.b" << 2.0 ) ) );
+ ASSERT( client().findOne( ns(), Query() ).woCompare( fromjson( "{'_id':0,f:{g:{a:{b:2},h:{b:3}}}}" ) ) == 0 );
+ }
+ };
+
+ class ArrayEmbeddedSet : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0,z:[4,'b']}" ) );
+ client().update( ns(), Query(), BSON( "$set" << BSON( "z.0" << "a" ) ) );
+ ASSERT_EQUALS( client().findOne( ns(), Query() ) , fromjson( "{'_id':0,z:['a','b']}" ) );
+ }
+ };
+
+ class AttemptEmbedInExistingNum : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0,a:1}" ) );
+ client().update( ns(), Query(), BSON( "$set" << BSON( "a.b" << 1 ) ) );
+ ASSERT( client().findOne( ns(), Query() ).woCompare( fromjson( "{'_id':0,a:1}" ) ) == 0 );
+ }
+ };
+
+ class AttemptEmbedConflictsWithOtherSet : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0}" ) );
+ client().update( ns(), Query(), BSON( "$set" << BSON( "a" << 2 << "a.b" << 1 ) ) );
+ ASSERT_EQUALS( client().findOne( ns(), Query() ) , fromjson( "{'_id':0}" ) );
+ }
+ };
+
+ class ModMasksEmbeddedConflict : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0,a:{b:2}}" ) );
+ client().update( ns(), Query(), BSON( "$set" << BSON( "a" << 2 << "a.b" << 1 ) ) );
+ ASSERT( client().findOne( ns(), Query() ).woCompare( fromjson( "{'_id':0,a:{b:2}}" ) ) == 0 );
+ }
+ };
+
+ class ModOverwritesExistingObject : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0,a:{b:2}}" ) );
+ client().update( ns(), Query(), BSON( "$set" << BSON( "a" << BSON( "c" << 2 ) ) ) );
+ ASSERT( client().findOne( ns(), Query() ).woCompare( fromjson( "{'_id':0,a:{c:2}}" ) ) == 0 );
+ }
+ };
+
+ class InvalidEmbeddedSet : public Fail {
+ public:
+ virtual void doIt() {
+ client().update( ns(), Query(), BSON( "$set" << BSON( "a." << 1 ) ) );
+ }
+ };
+
+ class UpsertMissingEmbedded : public SetBase {
+ public:
+ void run() {
+ client().update( ns(), Query(), BSON( "$set" << BSON( "a.b" << 1 ) ), true );
+ ASSERT( !client().findOne( ns(), QUERY( "a.b" << 1 ) ).isEmpty() );
+ }
+ };
+
+ class Push : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0,a:[1]}" ) );
+ client().update( ns(), Query(), BSON( "$push" << BSON( "a" << 5 ) ) );
+ ASSERT_EQUALS( client().findOne( ns(), Query() ) , fromjson( "{'_id':0,a:[1,5]}" ) );
+ }
+ };
+
+ class PushInvalidEltType : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0,a:1}" ) );
+ client().update( ns(), Query(), BSON( "$push" << BSON( "a" << 5 ) ) );
+ ASSERT( client().findOne( ns(), Query() ).woCompare( fromjson( "{'_id':0,a:1}" ) ) == 0 );
+ }
+ };
+
+ class PushConflictsWithOtherMod : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0,a:[1]}" ) );
+ client().update( ns(), Query(), BSON( "$set" << BSON( "a" << 1 ) <<"$push" << BSON( "a" << 5 ) ) );
+ ASSERT( client().findOne( ns(), Query() ).woCompare( fromjson( "{'_id':0,a:[1]}" ) ) == 0 );
+ }
+ };
+
+ class PushFromNothing : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0}" ) );
+ client().update( ns(), Query(), BSON( "$push" << BSON( "a" << 5 ) ) );
+ ASSERT_EQUALS( client().findOne( ns(), Query() ) , fromjson( "{'_id':0,a:[5]}" ) );
+ }
+ };
+
+ class PushFromEmpty : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0,a:[]}" ) );
+ client().update( ns(), Query(), BSON( "$push" << BSON( "a" << 5 ) ) );
+ ASSERT( client().findOne( ns(), Query() ).woCompare( fromjson( "{'_id':0,a:[5]}" ) ) == 0 );
+ }
+ };
+
+ class PushInsideNothing : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0}" ) );
+ client().update( ns(), Query(), BSON( "$push" << BSON( "a.b" << 5 ) ) );
+ ASSERT( client().findOne( ns(), Query() ).woCompare( fromjson( "{'_id':0,a:{b:[5]}}" ) ) == 0 );
+ }
+ };
+
+ class CantPushInsideOtherMod : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0}" ) );
+ client().update( ns(), Query(), BSON( "$set" << BSON( "a" << BSONObj() ) << "$push" << BSON( "a.b" << 5 ) ) );
+ ASSERT( client().findOne( ns(), Query() ).woCompare( fromjson( "{'_id':0}" ) ) == 0 );
+ }
+ };
+
+ class CantPushTwice : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0,a:[]}" ) );
+ client().update( ns(), Query(), BSON( "$push" << BSON( "a" << 4 ) << "$push" << BSON( "a" << 5 ) ) );
+ ASSERT( client().findOne( ns(), Query() ).woCompare( fromjson( "{'_id':0,a:[]}" ) ) == 0 );
+ }
+ };
+
+ class SetEncapsulationConflictsWithExistingType : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0,a:{b:4}}" ) );
+ client().update( ns(), Query(), BSON( "$set" << BSON( "a.b.c" << 4.0 ) ) );
+ ASSERT( client().findOne( ns(), Query() ).woCompare( fromjson( "{'_id':0,a:{b:4}}" ) ) == 0 );
+ }
+ };
+
+ class CantPushToParent : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0,a:{b:4}}" ) );
+ client().update( ns(), Query(), BSON( "$push" << BSON( "a" << 4.0 ) ) );
+ ASSERT( client().findOne( ns(), Query() ).woCompare( fromjson( "{'_id':0,a:{b:4}}" ) ) == 0 );
+ }
+ };
+
+ class CantIncParent : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0,a:{b:4}}" ) );
+ client().update( ns(), Query(), BSON( "$inc" << BSON( "a" << 4.0 ) ) );
+ ASSERT( client().findOne( ns(), Query() ).woCompare( fromjson( "{'_id':0,a:{b:4}}" ) ) == 0 );
+ }
+ };
+
+ class DontDropEmpty : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0,a:{b:{}}}" ) );
+ client().update( ns(), Query(), BSON( "$set" << BSON( "a.c" << 4.0 ) ) );
+ ASSERT( client().findOne( ns(), Query() ).woCompare( fromjson( "{'_id':0,a:{b:{},c:4}}" ) ) == 0 );
+ }
+ };
+
+ class InsertInEmpty : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), fromjson( "{'_id':0,a:{b:{}}}" ) );
+ client().update( ns(), Query(), BSON( "$set" << BSON( "a.b.f" << 4.0 ) ) );
+ ASSERT( client().findOne( ns(), Query() ).woCompare( fromjson( "{'_id':0,a:{b:{f:4}}}" ) ) == 0 );
+ }
+ };
+
+ class IndexParentOfMod : public SetBase {
+ public:
+ void run() {
+ client().ensureIndex( ns(), BSON( "a" << 1 ) );
+ client().insert( ns(), fromjson( "{'_id':0}" ) );
+ client().update( ns(), Query(), fromjson( "{$set:{'a.b':4}}" ) );
+ ASSERT_EQUALS( fromjson( "{'_id':0,a:{b:4}}" ) , client().findOne( ns(), Query() ) );
+ ASSERT_EQUALS( fromjson( "{'_id':0,a:{b:4}}" ) , client().findOne( ns(), fromjson( "{'a.b':4}" ) ) ); // make sure the index works
+ }
+ };
+
+ class IndexModSet : public SetBase {
+ public:
+ void run() {
+ client().ensureIndex( ns(), BSON( "a.b" << 1 ) );
+ client().insert( ns(), fromjson( "{'_id':0,a:{b:3}}" ) );
+ client().update( ns(), Query(), fromjson( "{$set:{'a.b':4}}" ) );
+ ASSERT_EQUALS( fromjson( "{'_id':0,a:{b:4}}" ) , client().findOne( ns(), Query() ) );
+ ASSERT_EQUALS( fromjson( "{'_id':0,a:{b:4}}" ) , client().findOne( ns(), fromjson( "{'a.b':4}" ) ) ); // make sure the index works
+ }
+ };
+
+
+ class PreserveIdWithIndex : public SetBase { // Not using $set, but base class is still useful
+ public:
+ void run() {
+ client().insert( ns(), BSON( "_id" << 55 << "i" << 5 ) );
+ client().update( ns(), BSON( "i" << 5 ), BSON( "i" << 6 ) );
+ ASSERT( !client().findOne( ns(), Query( BSON( "_id" << 55 ) ).hint
+ ( "{\"_id\":ObjectId(\"000000000000000000000000\")}" ) ).isEmpty() );
+ }
+ };
+
+ class CheckNoMods : public SetBase {
+ public:
+ void run() {
+ client().update( ns(), BSONObj(), BSON( "i" << 5 << "$set" << BSON( "q" << 3 ) ), true );
+ ASSERT( error() );
+ }
+ };
+
+ class UpdateMissingToNull : public SetBase {
+ public:
+ void run() {
+ client().insert( ns(), BSON( "a" << 5 ) );
+ client().update( ns(), BSON( "a" << 5 ), fromjson( "{$set:{b:null}}" ) );
+ ASSERT_EQUALS( jstNULL, client().findOne( ns(), QUERY( "a" << 5 ) ).getField( "b" ).type() );
+ }
+ };
+
+ namespace ModSetTests {
+
+ class internal1 {
+ public:
+ void run() {
+ BSONObj b = BSON( "$inc" << BSON( "x" << 1 << "a.b" << 1 ) );
+ ModSet m(b);
+
+ ASSERT( m.haveModForField( "x" ) );
+ ASSERT( m.haveModForField( "a.b" ) );
+ ASSERT( ! m.haveModForField( "y" ) );
+ ASSERT( ! m.haveModForField( "a.c" ) );
+ ASSERT( ! m.haveModForField( "a" ) );
+
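+ // a mod on "a.b" conflicts with itself and with its parent "a", but
+ // not with siblings ("a.c", "a.a") or mere string prefixes ("a.bc")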
+ ASSERT( m.haveConflictingMod( "x" ) );
+ ASSERT( m.haveConflictingMod( "a" ) );
+ ASSERT( m.haveConflictingMod( "a.b" ) );
+ ASSERT( ! m.haveConflictingMod( "a.bc" ) );
+ ASSERT( ! m.haveConflictingMod( "a.c" ) );
+ ASSERT( ! m.haveConflictingMod( "a.a" ) );
+ }
+ };
+
+ class Base {
+ public:
+
+ virtual ~Base() {}
+
+
+ void test( BSONObj morig , BSONObj in , BSONObj wanted ) {
+ BSONObj m = morig.copy();
+ ModSet set(m);
+
+ BSONObj out = set.prepare(in)->createNewFromMods();
+ ASSERT_EQUALS( wanted , out );
+ }
+ };
+
+ class inc1 : public Base {
+ public:
+ void run() {
+ BSONObj m = BSON( "$inc" << BSON( "x" << 1 ) );
+ test( m , BSON( "x" << 5 ) , BSON( "x" << 6 ) );
+ test( m , BSON( "a" << 5 ) , BSON( "a" << 5 << "x" << 1 ) );
+ test( m , BSON( "z" << 5 ) , BSON( "x" << 1 << "z" << 5 ) );
+ }
+ };
+
+ class inc2 : public Base {
+ public:
+ void run() {
+ BSONObj m = BSON( "$inc" << BSON( "a.b" << 1 ) );
+ test( m , BSONObj() , BSON( "a" << BSON( "b" << 1 ) ) );
+ test( m , BSON( "a" << BSON( "b" << 2 ) ) , BSON( "a" << BSON( "b" << 3 ) ) );
+
+ m = BSON( "$inc" << BSON( "a.b" << 1 << "a.c" << 1 ) );
+ test( m , BSONObj() , BSON( "a" << BSON( "b" << 1 << "c" << 1 ) ) );
+
+
+ }
+ };
+
+ class set1 : public Base {
+ public:
+ void run() {
+ test( BSON( "$set" << BSON( "x" << 17 ) ) , BSONObj() , BSON( "x" << 17 ) );
+ test( BSON( "$set" << BSON( "x" << 17 ) ) , BSON( "x" << 5 ) , BSON( "x" << 17 ) );
+
+ test( BSON( "$set" << BSON( "x.a" << 17 ) ) , BSON( "z" << 5 ) , BSON( "x" << BSON( "a" << 17 )<< "z" << 5 ) );
+ }
+ };
+
+ class push1 : public Base {
+ public:
+ void run() {
+ test( BSON( "$push" << BSON( "a" << 5 ) ) , fromjson( "{a:[1]}" ) , fromjson( "{a:[1,5]}" ) );
+ }
+ };
+
+ };
+
+ namespace basic {
+ class Base : public ClientBase {
+ protected:
+
+ virtual const char * ns() = 0;
+ virtual void dotest() = 0;
+
+ void insert( const BSONObj& o ) {
+ client().insert( ns() , o );
+ }
+
+ void update( const BSONObj& m ) {
+ client().update( ns() , BSONObj() , m );
+ }
+
+ BSONObj findOne() {
+ return client().findOne( ns() , BSONObj() );
+ }
+
+ void test( const char* initial , const char* mod , const char* after ) {
+ test( fromjson( initial ) , fromjson( mod ) , fromjson( after ) );
+ }
+
+
+ void test( const BSONObj& initial , const BSONObj& mod , const BSONObj& after ) {
+ client().dropCollection( ns() );
+ insert( initial );
+ update( mod );
+ ASSERT_EQUALS( after , findOne() );
+ client().dropCollection( ns() );
+ }
+
+ public:
+
+ Base() {}
+ virtual ~Base() {
+ }
+
+ void run() {
+ client().dropCollection( ns() );
+
+ dotest();
+
+ client().dropCollection( ns() );
+ }
+ };
+
+ class SingleTest : public Base {
+ virtual BSONObj initial() = 0;
+ virtual BSONObj mod() = 0;
+ virtual BSONObj after() = 0;
+
+ void dotest() {
+ test( initial() , mod() , after() );
+ }
+
+ };
+
+ class inc1 : public SingleTest {
+ virtual BSONObj initial() {
+ return BSON( "_id" << 1 << "x" << 1 );
+ }
+ virtual BSONObj mod() {
+ return BSON( "$inc" << BSON( "x" << 2 ) );
+ }
+ virtual BSONObj after() {
+ return BSON( "_id" << 1 << "x" << 3 );
+ }
+ virtual const char * ns() {
+ return "unittests.inc1";
+ }
+
+ };
+
+ class inc2 : public SingleTest {
+ virtual BSONObj initial() {
+ return BSON( "_id" << 1 << "x" << 1 );
+ }
+ virtual BSONObj mod() {
+ return BSON( "$inc" << BSON( "x" << 2.5 ) );
+ }
+ virtual BSONObj after() {
+ return BSON( "_id" << 1 << "x" << 3.5 );
+ }
+ virtual const char * ns() {
+ return "unittests.inc2";
+ }
+
+ };
+
+ class inc3 : public SingleTest {
+ virtual BSONObj initial() {
+ return BSON( "_id" << 1 << "x" << 537142123123LL );
+ }
+ virtual BSONObj mod() {
+ return BSON( "$inc" << BSON( "x" << 2 ) );
+ }
+ virtual BSONObj after() {
+ return BSON( "_id" << 1 << "x" << 537142123125LL );
+ }
+ virtual const char * ns() {
+ return "unittests.inc3";
+ }
+
+ };
+
+ class inc4 : public SingleTest {
+ virtual BSONObj initial() {
+ return BSON( "_id" << 1 << "x" << 537142123123LL );
+ }
+ virtual BSONObj mod() {
+ return BSON( "$inc" << BSON( "x" << 2LL ) );
+ }
+ virtual BSONObj after() {
+ return BSON( "_id" << 1 << "x" << 537142123125LL );
+ }
+ virtual const char * ns() {
+ return "unittests.inc4";
+ }
+
+ };
+
+ class inc5 : public SingleTest {
+ virtual BSONObj initial() {
+ return BSON( "_id" << 1 << "x" << 537142123123LL );
+ }
+ virtual BSONObj mod() {
+ return BSON( "$inc" << BSON( "x" << 2.0 ) );
+ }
+ virtual BSONObj after() {
+ return BSON( "_id" << 1 << "x" << 537142123125LL );
+ }
+ virtual const char * ns() {
+ return "unittests.inc5";
+ }
+
+ };
+
+ class inc6 : public Base {
+
+ virtual const char * ns() {
+ return "unittests.inc6";
+ }
+
+
+ virtual BSONObj initial() { return BSONObj(); }
+ virtual BSONObj mod() { return BSONObj(); }
+ virtual BSONObj after() { return BSONObj(); }
+
+ void dotest() {
+ long long start = numeric_limits<int>::max() - 5;
+ long long max = numeric_limits<int>::max() + 5ll;
+
+ client().insert( ns() , BSON( "x" << (int)start ) );
+ ASSERT( findOne()["x"].type() == NumberInt );
+
+ while ( start < max ) {
+ update( BSON( "$inc" << BSON( "x" << 1 ) ) );
+ start += 1;
+ ASSERT_EQUALS( start , findOne()["x"].numberLong() ); // SERVER-2005
+ }
+
+ ASSERT( findOne()["x"].type() == NumberLong );
+ }
+ };
+
+ class bit1 : public Base {
+ const char * ns() {
+ return "unittests.bit1";
+ }
+ void dotest() {
+ test( BSON( "_id" << 1 << "x" << 3 ) , BSON( "$bit" << BSON( "x" << BSON( "and" << 2 ) ) ) , BSON( "_id" << 1 << "x" << ( 3 & 2 ) ) );
+ test( BSON( "_id" << 1 << "x" << 1 ) , BSON( "$bit" << BSON( "x" << BSON( "or" << 4 ) ) ) , BSON( "_id" << 1 << "x" << ( 1 | 4 ) ) );
+ test( BSON( "_id" << 1 << "x" << 3 ) , BSON( "$bit" << BSON( "x" << BSON( "and" << 2 << "or" << 8 ) ) ) , BSON( "_id" << 1 << "x" << ( ( 3 & 2 ) | 8 ) ) );
+ test( BSON( "_id" << 1 << "x" << 3 ) , BSON( "$bit" << BSON( "x" << BSON( "or" << 2 << "and" << 8 ) ) ) , BSON( "_id" << 1 << "x" << ( ( 3 | 2 ) & 8 ) ) );
+
+ }
+ };
+
+ class unset : public Base {
+ const char * ns() {
+ return "unittests.unset";
+ }
+ void dotest() {
+ test( "{_id:1,x:1}" , "{$unset:{x:1}}" , "{_id:1}" );
+ }
+ };
+
+ class setswitchint : public Base {
+ const char * ns() {
+ return "unittests.int1";
+ }
+ void dotest() {
+ test( BSON( "_id" << 1 << "x" << 1 ) , BSON( "$set" << BSON( "x" << 5.6 ) ) , BSON( "_id" << 1 << "x" << 5.6 ) );
+ test( BSON( "_id" << 1 << "x" << 5.6 ) , BSON( "$set" << BSON( "x" << 1 ) ) , BSON( "_id" << 1 << "x" << 1 ) );
+ }
+ };
+
+
+ };
+
+ class All : public Suite {
+ public:
+ All() : Suite( "update" ) {
+ }
+ void setupTests() {
+ add< ModId >();
+ add< ModNonmodMix >();
+ add< InvalidMod >();
+ add< ModNotFirst >();
+ add< ModDuplicateFieldSpec >();
+ add< IncNonNumber >();
+ add< PushAllNonArray >();
+ add< PullAllNonArray >();
+ add< IncTargetNonNumber >();
+ add< SetNum >();
+ add< SetString >();
+ add< SetStringDifferentLength >();
+ add< SetStringToNum >();
+ add< SetStringToNumInPlace >();
+ add< ModDotted >();
+ add< SetInPlaceDotted >();
+ add< SetRecreateDotted >();
+ add< SetMissingDotted >();
+ add< SetAdjacentDotted >();
+ add< IncMissing >();
+ add< MultiInc >();
+ add< UnorderedNewSet >();
+ add< UnorderedNewSetAdjacent >();
+ add< ArrayEmbeddedSet >();
+ add< AttemptEmbedInExistingNum >();
+ add< AttemptEmbedConflictsWithOtherSet >();
+ add< ModMasksEmbeddedConflict >();
+ add< ModOverwritesExistingObject >();
+ add< InvalidEmbeddedSet >();
+ add< UpsertMissingEmbedded >();
+ add< Push >();
+ add< PushInvalidEltType >();
+ add< PushConflictsWithOtherMod >();
+ add< PushFromNothing >();
+ add< PushFromEmpty >();
+ add< PushInsideNothing >();
+ add< CantPushInsideOtherMod >();
+ add< CantPushTwice >();
+ add< SetEncapsulationConflictsWithExistingType >();
+ add< CantPushToParent >();
+ add< CantIncParent >();
+ add< DontDropEmpty >();
+ add< InsertInEmpty >();
+ add< IndexParentOfMod >();
+ add< IndexModSet >();
+ add< PreserveIdWithIndex >();
+ add< CheckNoMods >();
+ add< UpdateMissingToNull >();
+
+ add< ModSetTests::internal1 >();
+ add< ModSetTests::inc1 >();
+ add< ModSetTests::inc2 >();
+ add< ModSetTests::set1 >();
+ add< ModSetTests::push1 >();
+
+ add< basic::inc1 >();
+ add< basic::inc2 >();
+ add< basic::inc3 >();
+ add< basic::inc4 >();
+ add< basic::inc5 >();
+ add< basic::inc6 >();
+ add< basic::bit1 >();
+ add< basic::unset >();
+ add< basic::setswitchint >();
+ }
+ } myall;
+
+} // namespace UpdateTests
+
diff --git a/src/mongo/pch.cpp b/src/mongo/pch.cpp
new file mode 100644
index 00000000000..afa19a5be6b
--- /dev/null
+++ b/src/mongo/pch.cpp
@@ -0,0 +1,41 @@
+// pch.cpp : helper for using precompiled headers
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+
+#ifndef JSTIME_VIRTUAL_SKEW
+#define JSTIME_VIRTUAL_SKEW
+
+namespace mongo {
+ // jsTime_virtual_skew is just for testing. a test command manipulates it.
+ long long jsTime_virtual_skew = 0;
+ boost::thread_specific_ptr<long long> jsTime_virtual_thread_skew;
+}
+
+#endif
+
+#if defined( __MSVC__ )
+// should probably check VS version here
+#elif defined( __GNUC__ )
+
+#if __GNUC__ < 4
+#error gcc < 4 not supported
+#endif
+
+#else
+// unknown compiler
+#endif
diff --git a/src/mongo/pch.h b/src/mongo/pch.h
new file mode 100644
index 00000000000..162ff48cc69
--- /dev/null
+++ b/src/mongo/pch.h
@@ -0,0 +1,184 @@
+/** @file pch.h : include file for standard system include files,
+ * or project specific include files that are used frequently, but
+ * are changed infrequently
+ */
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MONGO_PCH_H
+#define MONGO_PCH_H
+
+#if defined(MONGO_EXPOSE_MACROS)
+# define JS_C_STRINGS_ARE_UTF8
+# undef SUPPORT_UCP
+# define SUPPORT_UCP
+# undef SUPPORT_UTF8
+# define SUPPORT_UTF8
+# undef _CRT_SECURE_NO_WARNINGS
+# define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#if defined(_WIN32)
+// for rand_s() usage:
+# define _CRT_RAND_S
+# ifndef NOMINMAX
+# define NOMINMAX
+# endif
+#define WIN32_LEAN_AND_MEAN
+# include <winsock2.h> //this must be included before the first windows.h include
+# include <ws2tcpip.h>
+# include <wspiapi.h>
+# include <windows.h>
+#endif
+
+#if defined(__linux__) && defined(MONGO_EXPOSE_MACROS)
+// glibc's optimized versions are better than g++ builtins
+# define __builtin_strcmp strcmp
+# define __builtin_strlen strlen
+# define __builtin_memchr memchr
+# define __builtin_memcmp memcmp
+# define __builtin_memcpy memcpy
+# define __builtin_memset memset
+# define __builtin_memmove memmove
+#endif
+
+
+#include <ctime>
+#include <cstring>
+#include <sstream>
+#include <string>
+#include <memory>
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <vector>
+#include <set>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include "targetver.h"
+#include "time.h"
+#include "string.h"
+#include "limits.h"
+
+//#include <boost/any.hpp>
+#include "boost/thread/once.hpp"
+//#include <boost/archive/iterators/transform_width.hpp>
+#define BOOST_FILESYSTEM_VERSION 2
+#include <boost/filesystem/convenience.hpp>
+#include <boost/filesystem/exception.hpp>
+#include <boost/filesystem/operations.hpp>
+#include <boost/program_options.hpp>
+#include <boost/shared_ptr.hpp>
+#include <boost/smart_ptr.hpp>
+#include <boost/function.hpp>
+#include "boost/bind.hpp"
+#include "boost/function.hpp"
+#include <boost/thread/tss.hpp>
+#include "boost/detail/endian.hpp"
+#define BOOST_SPIRIT_THREADSAFE
+#include <boost/version.hpp>
+#include <boost/tuple/tuple.hpp>
+#include <boost/thread/thread.hpp>
+#include <boost/thread/condition.hpp>
+#include <boost/thread/recursive_mutex.hpp>
+#include <boost/thread/xtime.hpp>
+#undef assert
+#define assert MONGO_assert
+
+namespace mongo {
+
+ using namespace std;
+ using boost::shared_ptr;
+
+#if defined(_DEBUG)
+ const bool debug=true;
+#else
+ const bool debug=false;
+#endif
+
+ // pdfile versions
+ const int PDFILE_VERSION = 4;
+ const int PDFILE_VERSION_MINOR = 5;
+
+ enum ExitCode {
+ EXIT_CLEAN = 0 ,
+ EXIT_BADOPTIONS = 2 ,
+ EXIT_REPLICATION_ERROR = 3 ,
+ EXIT_NEED_UPGRADE = 4 ,
+ EXIT_SHARDING_ERROR = 5 ,
+ EXIT_KILL = 12 ,
+ EXIT_ABRUPT = 14 ,
+ EXIT_NTSERVICE_ERROR = 20 ,
+ EXIT_JAVA = 21 ,
+ EXIT_OOM_MALLOC = 42 ,
+ EXIT_OOM_REALLOC = 43 ,
+ EXIT_FS = 45 ,
+ EXIT_CLOCK_SKEW = 47 ,
+ EXIT_NET_ERROR = 48 ,
+ EXIT_WINDOWS_SERVICE_STOP = 49 ,
+        EXIT_POSSIBLE_CORRUPTION = 60 , // this means we detected a possible corruption situation, like a buffer overflow
+ EXIT_UNCAUGHT = 100 , // top level exception that wasn't caught
+ EXIT_TEST = 101 ,
+
+ };
+
+ void dbexit( ExitCode returnCode, const char *whyMsg = "", bool tryToGetLock = false);
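+    // For example (illustrative call sites, not part of this changeset):
+    //   dbexit( EXIT_CLEAN );                        // normal shutdown
+    //   dbexit( EXIT_OOM_MALLOC, "out of memory" );  // shutdown with a reason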
+
+ /**
+       This is here so you can't just type exit() to quit the program.
+       Use dbexit() to shut down cleanly, or ::exit() to tell the system to quit;
+       if you use this one, you'll get a link error since mongo::exit isn't defined.
+ */
+ void exit( ExitCode returnCode );
+ bool inShutdown();
+
+ using namespace boost::filesystem;
+ void asserted(const char *msg, const char *file, unsigned line);
+}
+
+
+
+// TODO: Rework the headers so we don't need this craziness
+#include "bson/inline_decls.h"
+#define MONGO_assert(_Expression) (void)( MONGO_likely(!!(_Expression)) || (mongo::asserted(#_Expression, __FILE__, __LINE__), 0) )
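+// Illustrative usage: MONGO_assert( p != NULL ); when the condition is false,
+// the macro calls mongo::asserted() with the stringified expression, file, and line.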
+
+#include "util/debug_util.h"
+#include "util/goodies.h"
+#include "util/log.h"
+#include "util/allocator.h"
+#include "util/assert_util.h"
+
+namespace mongo {
+
+ void sayDbContext(const char *msg = 0);
+ void rawOut( const string &s );
+
+ typedef char _TCHAR;
+
+ using boost::uint32_t;
+ using boost::uint64_t;
+
+ /** called by mongos, mongod, test. do not call from clients and such.
+        invoked before just about everything except global var construction.
+ */
+ void doPreServerStartupInits();
+
+} // namespace mongo
+
+#endif // MONGO_PCH_H
diff --git a/src/mongo/s/balance.cpp b/src/mongo/s/balance.cpp
new file mode 100644
index 00000000000..e1c4b65ca0b
--- /dev/null
+++ b/src/mongo/s/balance.cpp
@@ -0,0 +1,348 @@
+//@file balance.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "../db/jsobj.h"
+#include "../db/cmdline.h"
+
+#include "../client/distlock.h"
+
+#include "balance.h"
+#include "server.h"
+#include "shard.h"
+#include "config.h"
+#include "chunk.h"
+#include "grid.h"
+
+namespace mongo {
+
+ Balancer balancer;
+
+ Balancer::Balancer() : _balancedLastTime(0), _policy( new BalancerPolicy() ) {}
+
+ Balancer::~Balancer() {
+ }
+
+ int Balancer::_moveChunks( const vector<CandidateChunkPtr>* candidateChunks ) {
+ int movedCount = 0;
+
+ for ( vector<CandidateChunkPtr>::const_iterator it = candidateChunks->begin(); it != candidateChunks->end(); ++it ) {
+ const CandidateChunk& chunkInfo = *it->get();
+
+ DBConfigPtr cfg = grid.getDBConfig( chunkInfo.ns );
+ assert( cfg );
+
+ ChunkManagerPtr cm = cfg->getChunkManager( chunkInfo.ns );
+ assert( cm );
+
+ const BSONObj& chunkToMove = chunkInfo.chunk;
+ ChunkPtr c = cm->findChunk( chunkToMove["min"].Obj() );
+ if ( c->getMin().woCompare( chunkToMove["min"].Obj() ) || c->getMax().woCompare( chunkToMove["max"].Obj() ) ) {
+ // likely a split happened somewhere
+ cm = cfg->getChunkManager( chunkInfo.ns , true /* reload */);
+ assert( cm );
+
+ c = cm->findChunk( chunkToMove["min"].Obj() );
+ if ( c->getMin().woCompare( chunkToMove["min"].Obj() ) || c->getMax().woCompare( chunkToMove["max"].Obj() ) ) {
+                    log() << "chunk mismatch after reload, ignoring; will retry. cm: "
+ << c->getMin() << " min: " << chunkToMove["min"].Obj() << endl;
+ continue;
+ }
+ }
+
+ BSONObj res;
+ if ( c->moveAndCommit( Shard::make( chunkInfo.to ) , Chunk::MaxChunkSize , res ) ) {
+ movedCount++;
+ continue;
+ }
+
+ // the move requires acquiring the collection metadata's lock, which can fail
+ log() << "balancer move failed: " << res << " from: " << chunkInfo.from << " to: " << chunkInfo.to
+ << " chunk: " << chunkToMove << endl;
+
+ if ( res["chunkTooBig"].trueValue() ) {
+ // reload just to be safe
+ cm = cfg->getChunkManager( chunkInfo.ns );
+ assert( cm );
+ c = cm->findChunk( chunkToMove["min"].Obj() );
+
+ log() << "forcing a split because migrate failed for size reasons" << endl;
+
+ res = BSONObj();
+ c->singleSplit( true , res );
+ log() << "forced split results: " << res << endl;
+
+ if ( ! res["ok"].trueValue() ) {
+ log() << "marking chunk as jumbo: " << c->toString() << endl;
+ c->markAsJumbo();
+                    // we increment movedCount so we do another round right away
+ movedCount++;
+ }
+
+ }
+ }
+
+ return movedCount;
+ }
+
+ void Balancer::_ping( DBClientBase& conn ) {
+ WriteConcern w = conn.getWriteConcern();
+ conn.setWriteConcern( W_NONE );
+
+ conn.update( ShardNS::mongos ,
+ BSON( "_id" << _myid ) ,
+ BSON( "$set" << BSON( "ping" << DATENOW << "up" << (int)(time(0)-_started) ) ) ,
+ true );
+
+        conn.setWriteConcern( w );
+ }
+
+ bool Balancer::_checkOIDs() {
+ vector<Shard> all;
+ Shard::getAllShards( all );
+
+ map<int,Shard> oids;
+
+ for ( vector<Shard>::iterator i=all.begin(); i!=all.end(); ++i ) {
+ Shard s = *i;
+ BSONObj f = s.runCommand( "admin" , "features" );
+ if ( f["oidMachine"].isNumber() ) {
+ int x = f["oidMachine"].numberInt();
+ if ( oids.count(x) == 0 ) {
+ oids[x] = s;
+ }
+ else {
+ log() << "error: 2 machines have " << x << " as oid machine piece " << s.toString() << " and " << oids[x].toString() << endl;
+ s.runCommand( "admin" , BSON( "features" << 1 << "oidReset" << 1 ) );
+ oids[x].runCommand( "admin" , BSON( "features" << 1 << "oidReset" << 1 ) );
+ return false;
+ }
+ }
+ else {
+ log() << "warning: oidMachine not set on: " << s.toString() << endl;
+ }
+ }
+ return true;
+ }
+
+ void Balancer::_doBalanceRound( DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks ) {
+ assert( candidateChunks );
+
+ //
+ // 1. Check whether there is any sharded collection to be balanced by querying
+        // the ShardNS::collection collection
+ //
+
+ auto_ptr<DBClientCursor> cursor = conn.query( ShardNS::collection , BSONObj() );
+ vector< string > collections;
+ while ( cursor->more() ) {
+ BSONObj col = cursor->nextSafe();
+
+ // sharded collections will have a shard "key".
+ if ( ! col["key"].eoo() )
+ collections.push_back( col["_id"].String() );
+ }
+ cursor.reset();
+
+ if ( collections.empty() ) {
+ LOG(1) << "no collections to balance" << endl;
+ return;
+ }
+
+ //
+ // 2. Get a list of all the shards that are participating in this balance round
+ // along with any maximum allowed quotas and current utilization. We get the
+ // latter by issuing db.serverStatus() (mem.mapped) to all shards.
+ //
+ // TODO: skip unresponsive shards and mark information as stale.
+ //
+
+ vector<Shard> allShards;
+ Shard::getAllShards( allShards );
+ if ( allShards.size() < 2) {
+ LOG(1) << "can't balance without more active shards" << endl;
+ return;
+ }
+
+ map< string, BSONObj > shardLimitsMap;
+ for ( vector<Shard>::const_iterator it = allShards.begin(); it != allShards.end(); ++it ) {
+ const Shard& s = *it;
+ ShardStatus status = s.getStatus();
+
+ BSONObj limitsObj = BSON( ShardFields::maxSize( s.getMaxSize() ) <<
+ LimitsFields::currSize( status.mapped() ) <<
+ ShardFields::draining( s.isDraining() ) <<
+ LimitsFields::hasOpsQueued( status.hasOpsQueued() )
+ );
+
+ shardLimitsMap[ s.getName() ] = limitsObj;
+ }
+
+ //
+ // 3. For each collection, check if the balancing policy recommends moving anything around.
+ //
+
+ for (vector<string>::const_iterator it = collections.begin(); it != collections.end(); ++it ) {
+ const string& ns = *it;
+
+ map< string,vector<BSONObj> > shardToChunksMap;
+ cursor = conn.query( ShardNS::chunk , QUERY( "ns" << ns ).sort( "min" ) );
+ while ( cursor->more() ) {
+ BSONObj chunk = cursor->nextSafe();
+ if ( chunk["jumbo"].trueValue() )
+ continue;
+ vector<BSONObj>& chunks = shardToChunksMap[chunk["shard"].String()];
+ chunks.push_back( chunk.getOwned() );
+ }
+ cursor.reset();
+
+ if (shardToChunksMap.empty()) {
+ LOG(1) << "skipping empty collection (" << ns << ")";
+ continue;
+ }
+
+ for ( vector<Shard>::iterator i=allShards.begin(); i!=allShards.end(); ++i ) {
+ // this just makes sure there is an entry in shardToChunksMap for every shard
+ Shard s = *i;
+ shardToChunksMap[s.getName()].size();
+ }
+
+ CandidateChunk* p = _policy->balance( ns , shardLimitsMap , shardToChunksMap , _balancedLastTime );
+ if ( p ) candidateChunks->push_back( CandidateChunkPtr( p ) );
+ }
+ }
+
+ bool Balancer::_init() {
+ try {
+
+ log() << "about to contact config servers and shards" << endl;
+
+ // contact the config server and refresh shard information
+ // checks that each shard is indeed a different process (no hostname mixup)
+ // these checks are redundant in that they're redone at every new round but we want to do them initially here
+            // so as to catch any problems early
+ Shard::reloadShardInfo();
+ _checkOIDs();
+
+ log() << "config servers and shards contacted successfully" << endl;
+
+ StringBuilder buf;
+ buf << getHostNameCached() << ":" << cmdLine.port;
+ _myid = buf.str();
+ _started = time(0);
+
+ log() << "balancer id: " << _myid << " started at " << time_t_to_String_short(_started) << endl;
+
+ return true;
+
+ }
+ catch ( std::exception& e ) {
+ warning() << "could not initialize balancer, please check that all shards and config servers are up: " << e.what() << endl;
+ return false;
+
+ }
+ }
+
+ void Balancer::run() {
+
+ // this is the body of a BackgroundJob so if we throw here we're basically ending the balancer thread prematurely
+ while ( ! inShutdown() ) {
+
+ if ( ! _init() ) {
+ log() << "will retry to initialize balancer in one minute" << endl;
+ sleepsecs( 60 );
+ continue;
+ }
+
+ break;
+ }
+
+        // getConnectionString and the dist lock constructor do not throw, which is what we expect
+        // while on the balancer thread
+ ConnectionString config = configServer.getConnectionString();
+ DistributedLock balanceLock( config , "balancer" );
+
+ while ( ! inShutdown() ) {
+
+ try {
+
+ ScopedDbConnection conn( config );
+
+ // ping has to be first so we keep things in the config server in sync
+ _ping( conn.conn() );
+
+ // now make sure we should even be running
+ if ( ! grid.shouldBalance() ) {
+ LOG(1) << "skipping balancing round because balancing is disabled" << endl;
+ conn.done();
+
+ sleepsecs( 30 );
+ continue;
+ }
+
+ uassert( 13258 , "oids broken after resetting!" , _checkOIDs() );
+
+ // use fresh shard state
+ Shard::reloadShardInfo();
+
+ // refresh chunk size (even though another balancer might be active)
+ Chunk::refreshChunkSize();
+
+ {
+ dist_lock_try lk( &balanceLock , "doing balance round" );
+ if ( ! lk.got() ) {
+ LOG(1) << "skipping balancing round because another balancer is active" << endl;
+ conn.done();
+
+ sleepsecs( 30 ); // no need to wake up soon
+ continue;
+ }
+
+ LOG(1) << "*** start balancing round" << endl;
+
+ vector<CandidateChunkPtr> candidateChunks;
+ _doBalanceRound( conn.conn() , &candidateChunks );
+ if ( candidateChunks.size() == 0 ) {
+ LOG(1) << "no need to move any chunk" << endl;
+ }
+ else {
+ _balancedLastTime = _moveChunks( &candidateChunks );
+ }
+
+ LOG(1) << "*** end of balancing round" << endl;
+ }
+
+ conn.done();
+
+ sleepsecs( _balancedLastTime ? 5 : 10 );
+ }
+ catch ( std::exception& e ) {
+ log() << "caught exception while doing balance: " << e.what() << endl;
+
+ // Just to match the opening statement if in log level 1
+ LOG(1) << "*** End of balancing round" << endl;
+
+ sleepsecs( 30 ); // sleep a fair amount b/c of error
+ continue;
+ }
+ }
+
+ }
+
+} // namespace mongo
diff --git a/src/mongo/s/balance.h b/src/mongo/s/balance.h
new file mode 100644
index 00000000000..687599610db
--- /dev/null
+++ b/src/mongo/s/balance.h
@@ -0,0 +1,105 @@
+//@file balance.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "../util/background.h"
+#include "../client/dbclient.h"
+#include "balancer_policy.h"
+
+namespace mongo {
+
+ /**
+     * The balancer is a background task that tries to keep the number of chunks across all servers of the cluster even. Although
+     * every mongos will have one balancer running, only one of them will be active at any given point in time. The balancer
+     * uses a 'DistributedLock' for that coordination.
+     *
+     * The balancer acts continuously, but in "rounds". In a given round, it decides whether there is an imbalance by
+     * checking the difference in chunk counts between the most and least loaded shards. If it finds one, it issues
+     * at most one chunk migration request per round.
+ */
+ class Balancer : public BackgroundJob {
+ public:
+ Balancer();
+ virtual ~Balancer();
+
+ // BackgroundJob methods
+
+ virtual void run();
+
+ virtual string name() const { return "Balancer"; }
+
+ private:
+ typedef BalancerPolicy::ChunkInfo CandidateChunk;
+ typedef shared_ptr<CandidateChunk> CandidateChunkPtr;
+
+ // hostname:port of my mongos
+ string _myid;
+
+ // time the Balancer started running
+ time_t _started;
+
+ // number of moved chunks in last round
+ int _balancedLastTime;
+
+ // decide which chunks to move; owned here.
+ scoped_ptr<BalancerPolicy> _policy;
+
+ /**
+ * Checks that the balancer can connect to all servers it needs to do its job.
+ *
+ * @return true if balancing can be started
+ *
+         * This method catches network exceptions internally and returns false on failure.
+ */
+ bool _init();
+
+ /**
+ * Gathers all the necessary information about shards and chunks, and decides whether there are candidate chunks to
+ * be moved.
+ *
+ * @param conn is the connection with the config server(s)
+ * @param candidateChunks (IN/OUT) filled with candidate chunks, one per collection, that could possibly be moved
+ */
+ void _doBalanceRound( DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks );
+
+ /**
+ * Issues chunk migration request, one at a time.
+ *
+ * @param candidateChunks possible chunks to move
+ * @return number of chunks effectively moved
+ */
+ int _moveChunks( const vector<CandidateChunkPtr>* candidateChunks );
+
+ /**
+ * Marks this balancer as being live on the config server(s).
+ *
+ * @param conn is the connection with the config server(s)
+ */
+ void _ping( DBClientBase& conn );
+
+ /**
+ * @return true if all the servers listed in configdb as being shards are reachable and are distinct processes
+ */
+ bool _checkOIDs();
+
+ };
+
+ extern Balancer balancer;
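+    // Sketch of how this is wired up (assuming BackgroundJob's go() method):
+    // mongos starts the single global instance via balancer.go(), and only the
+    // mongos currently holding the "balancer" distributed lock performs
+    // migrations in any given round.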
+}
diff --git a/src/mongo/s/balancer_policy.cpp b/src/mongo/s/balancer_policy.cpp
new file mode 100644
index 00000000000..03defa5678a
--- /dev/null
+++ b/src/mongo/s/balancer_policy.cpp
@@ -0,0 +1,192 @@
+// balancer_policy.cpp
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "config.h"
+
+#include "../client/dbclient.h"
+#include "../util/stringutils.h"
+#include "../util/unittest.h"
+
+#include "balancer_policy.h"
+
+namespace mongo {
+
+ // limits map fields
+ BSONField<long long> LimitsFields::currSize( "currSize" );
+ BSONField<bool> LimitsFields::hasOpsQueued( "hasOpsQueued" );
+
+ BalancerPolicy::ChunkInfo* BalancerPolicy::balance( const string& ns,
+ const ShardToLimitsMap& shardToLimitsMap,
+ const ShardToChunksMap& shardToChunksMap,
+ int balancedLastTime ) {
+ pair<string,unsigned> min("",numeric_limits<unsigned>::max());
+ pair<string,unsigned> max("",0);
+ vector<string> drainingShards;
+
+ bool maxOpsQueued = false;
+
+ for (ShardToChunksIter i = shardToChunksMap.begin(); i!=shardToChunksMap.end(); ++i ) {
+
+ // Find whether this shard's capacity or availability are exhausted
+ const string& shard = i->first;
+ BSONObj shardLimits;
+ ShardToLimitsIter it = shardToLimitsMap.find( shard );
+ if ( it != shardToLimitsMap.end() ) shardLimits = it->second;
+ const bool maxedOut = isSizeMaxed( shardLimits );
+ const bool draining = isDraining( shardLimits );
+ const bool opsQueued = hasOpsQueued( shardLimits );
+
+
+            // Is this shard a better chunk receiver than the current one?
+ // Shards that would be bad receiver candidates:
+ // + maxed out shards
+ // + draining shards
+ // + shards with operations queued for writeback
+ const unsigned size = i->second.size();
+ if ( ! maxedOut && ! draining && ! opsQueued ) {
+ if ( size < min.second ) {
+ min = make_pair( shard , size );
+ }
+ }
+ else if ( opsQueued ) {
+ LOG(1) << "won't send a chunk to: " << shard << " because it has ops queued" << endl;
+ }
+ else if ( maxedOut ) {
+ LOG(1) << "won't send a chunk to: " << shard << " because it is maxedOut" << endl;
+ }
+
+
+            // Check whether this shard is a better chunk donor than the current one.
+ // Draining shards take a lower priority than overloaded shards.
+ if ( size > max.second ) {
+ max = make_pair( shard , size );
+ maxOpsQueued = opsQueued;
+ }
+ if ( draining && (size > 0)) {
+ drainingShards.push_back( shard );
+ }
+ }
+
+ // If there is no candidate chunk receiver -- they may have all been maxed out,
+ // draining, ... -- there's not much that the policy can do.
+ if ( min.second == numeric_limits<unsigned>::max() ) {
+ log() << "no available shards to take chunks" << endl;
+ return NULL;
+ }
+
+ if ( maxOpsQueued ) {
+ log() << "biggest shard " << max.first << " has unprocessed writebacks, waiting for completion of migrate" << endl;
+ return NULL;
+ }
+
+ LOG(1) << "collection : " << ns << endl;
+ LOG(1) << "donor : " << max.second << " chunks on " << max.first << endl;
+ LOG(1) << "receiver : " << min.second << " chunks on " << min.first << endl;
+ if ( ! drainingShards.empty() ) {
+ string drainingStr;
+ joinStringDelim( drainingShards, &drainingStr, ',' );
+ LOG(1) << "draining : " << ! drainingShards.empty() << "(" << drainingShards.size() << ")" << endl;
+ }
+
+ // Solving imbalances takes a higher priority than draining shards. Many shards can
+ // be draining at once but we choose only one of them to cater to per round.
+ // Important to start balanced, so when there are few chunks any imbalance must be fixed.
+ const int imbalance = max.second - min.second;
+ int threshold = 8;
+ if (balancedLastTime || max.second < 20) threshold = 2;
+ else if (max.second < 80) threshold = 4;
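+        // Worked example (hypothetical numbers): with 25 chunks on the most loaded
+        // shard and 15 on the least, and nothing moved last round, the threshold is
+        // 4 (since 20 <= 25 < 80), so the imbalance of 10 is enough to suggest a move.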
+ string from, to;
+ if ( imbalance >= threshold ) {
+ from = max.first;
+ to = min.first;
+
+ }
+ else if ( ! drainingShards.empty() ) {
+ from = drainingShards[ rand() % drainingShards.size() ];
+ to = min.first;
+
+ }
+ else {
+ // Everything is balanced here!
+ return NULL;
+ }
+
+ const vector<BSONObj>& chunksFrom = shardToChunksMap.find( from )->second;
+ const vector<BSONObj>& chunksTo = shardToChunksMap.find( to )->second;
+ BSONObj chunkToMove = pickChunk( chunksFrom , chunksTo );
+ log() << "chose [" << from << "] to [" << to << "] " << chunkToMove << endl;
+
+ return new ChunkInfo( ns, to, from, chunkToMove );
+ }
+
+ BSONObj BalancerPolicy::pickChunk( const vector<BSONObj>& from, const vector<BSONObj>& to ) {
+        // It is possible for a donor ('from') shard to have fewer chunks than a receiver one ('to')
+ // if the donor is in draining mode.
+
+ if ( to.size() == 0 )
+ return from[0];
+
+ if ( from[0]["min"].Obj().woCompare( to[to.size()-1]["max"].Obj() , BSONObj() , false ) == 0 )
+ return from[0];
+
+ if ( from[from.size()-1]["max"].Obj().woCompare( to[0]["min"].Obj() , BSONObj() , false ) == 0 )
+ return from[from.size()-1];
+
+ return from[0];
+ }
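+    // In other words: prefer a chunk at the donor's edge that abuts the receiver's
+    // range, so the receiver's range stays contiguous after the move; otherwise
+    // fall back to the donor's lowest chunk.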
+
+ bool BalancerPolicy::isSizeMaxed( BSONObj limits ) {
+ // If there's no limit information for the shard, assume it can be a chunk receiver
+        // (i.e., there's no bound on space utilization)
+ if ( limits.isEmpty() ) {
+ return false;
+ }
+
+ long long maxUsage = limits[ ShardFields::maxSize.name() ].Long();
+ if ( maxUsage == 0 ) {
+ return false;
+ }
+
+ long long currUsage = limits[ LimitsFields::currSize.name() ].Long();
+ if ( currUsage < maxUsage ) {
+ return false;
+ }
+
+ return true;
+ }
+
+ bool BalancerPolicy::isDraining( BSONObj limits ) {
+ BSONElement draining = limits[ ShardFields::draining.name() ];
+ if ( draining.eoo() || ! draining.trueValue() ) {
+ return false;
+ }
+
+ return true;
+ }
+
+ bool BalancerPolicy::hasOpsQueued( BSONObj limits ) {
+ BSONElement opsQueued = limits[ LimitsFields::hasOpsQueued.name() ];
+ if ( opsQueued.eoo() || ! opsQueued.trueValue() ) {
+ return false;
+ }
+ return true;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/s/balancer_policy.h b/src/mongo/s/balancer_policy.h
new file mode 100644
index 00000000000..cef5aa64afc
--- /dev/null
+++ b/src/mongo/s/balancer_policy.h
@@ -0,0 +1,98 @@
+// @file balancer_policy.h
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef S_BALANCER_POLICY_HEADER
+#define S_BALANCER_POLICY_HEADER
+
+#include "../pch.h"
+
+namespace mongo {
+
+ class BalancerPolicy {
+ public:
+ struct ChunkInfo;
+
+ /**
+         * Returns a suggested chunk to move within a collection's shards, given information about
+         * space usage and number of chunks for that collection. If the policy doesn't recommend
+         * moving, it returns NULL.
+         *
+         * @param ns is the collection's namespace.
+         * @param shardToLimitsMap is a map from shardId to an object that describes (for now) space
+         * cap and usage. E.g.: { "maxSize" : <size_in_MB> , "currSize" : <size_in_MB> }.
+         * @param shardToChunksMap is a map from shardId to chunks that live there. A chunk's format
+         * is { }.
+         * @param balancedLastTime is the number of chunks effectively moved in the last round.
+         * @returns NULL or ChunkInfo of the best move to make towards balancing the collection.
+ */
+ typedef map< string,BSONObj > ShardToLimitsMap;
+ typedef map< string,vector<BSONObj> > ShardToChunksMap;
+ static ChunkInfo* balance( const string& ns, const ShardToLimitsMap& shardToLimitsMap,
+ const ShardToChunksMap& shardToChunksMap, int balancedLastTime );
+
+ // below exposed for testing purposes only -- treat it as private --
+
+ static BSONObj pickChunk( const vector<BSONObj>& from, const vector<BSONObj>& to );
+
+ /**
+         * Returns true if a shard cannot receive any new chunks because it reached 'shardLimits'.
+         * Expects the optional fields "maxSize", the cap in MB, and "currSize", the currently used
+         * size in MB, on 'shardLimits'.
+ */
+ static bool isSizeMaxed( BSONObj shardLimits );
+
+ /**
+         * Returns true if 'shardLimits' contains a true "draining" field. Expects the optional
+         * field "draining" on 'shardLimits'.
+ */
+ static bool isDraining( BSONObj shardLimits );
+
+ /**
+ * Returns true if a shard currently has operations in any of its writeback queues
+ */
+ static bool hasOpsQueued( BSONObj shardLimits );
+
+ private:
+ // Convenience types
+ typedef ShardToChunksMap::const_iterator ShardToChunksIter;
+ typedef ShardToLimitsMap::const_iterator ShardToLimitsIter;
+
+ };
+
+ struct BalancerPolicy::ChunkInfo {
+ const string ns;
+ const string to;
+ const string from;
+ const BSONObj chunk;
+
+ ChunkInfo( const string& a_ns , const string& a_to , const string& a_from , const BSONObj& a_chunk )
+ : ns( a_ns ) , to( a_to ) , from( a_from ), chunk( a_chunk ) {}
+ };
+
+ /**
+ * Field names used in the 'limits' map.
+ */
+ struct LimitsFields {
+ // we use 'draining' and 'maxSize' from the 'shards' collection plus the following
+ static BSONField<long long> currSize; // currently used disk space in bytes
+ static BSONField<bool> hasOpsQueued; // writeback queue is not empty?
+ };
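+    // A fully populated limits entry, as assembled by the balancer, might look
+    // like this (field values illustrative only):
+    //   { maxSize: 1024, draining: false, currSize: 512, hasOpsQueued: false }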
+
+} // namespace mongo
+
+#endif // S_BALANCER_POLICY_HEADER
diff --git a/src/mongo/s/chunk.cpp b/src/mongo/s/chunk.cpp
new file mode 100644
index 00000000000..e0e7edee9bd
--- /dev/null
+++ b/src/mongo/s/chunk.cpp
@@ -0,0 +1,1104 @@
+// @file chunk.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "../client/connpool.h"
+#include "../db/querypattern.h"
+#include "../db/queryutil.h"
+#include "../util/unittest.h"
+#include "../util/timer.h"
+
+#include "chunk.h"
+#include "config.h"
+#include "cursors.h"
+#include "grid.h"
+#include "strategy.h"
+#include "client.h"
+
+namespace mongo {
+
+ inline bool allOfType(BSONType type, const BSONObj& o) {
+ BSONObjIterator it(o);
+ while(it.more()) {
+ if (it.next().type() != type)
+ return false;
+ }
+ return true;
+ }
+
+ // ------- Shard --------
+
+ string Chunk::chunkMetadataNS = "config.chunks";
+
+ int Chunk::MaxChunkSize = 1024 * 1024 * 64;
+ int Chunk::MaxObjectPerChunk = 250000;
+
+
+ Chunk::Chunk(const ChunkManager * manager, BSONObj from)
+ : _manager(manager), _lastmod(0), _dataWritten(mkDataWritten())
+ {
+ string ns = from.getStringField( "ns" );
+ _shard.reset( from.getStringField( "shard" ) );
+
+ _lastmod = from["lastmod"];
+ assert( _lastmod > 0 );
+
+ _min = from.getObjectField( "min" ).getOwned();
+ _max = from.getObjectField( "max" ).getOwned();
+
+ _jumbo = from["jumbo"].trueValue();
+
+ uassert( 10170 , "Chunk needs a ns" , ! ns.empty() );
+ uassert( 13327 , "Chunk ns must match server ns" , ns == _manager->getns() );
+
+ uassert( 10171 , "Chunk needs a server" , _shard.ok() );
+
+ uassert( 10172 , "Chunk needs a min" , ! _min.isEmpty() );
+ uassert( 10173 , "Chunk needs a max" , ! _max.isEmpty() );
+ }
+
+
+ Chunk::Chunk(const ChunkManager * info , const BSONObj& min, const BSONObj& max, const Shard& shard)
+ : _manager(info), _min(min), _max(max), _shard(shard), _lastmod(0), _jumbo(false), _dataWritten(mkDataWritten())
+ {}
+
+ long Chunk::mkDataWritten() {
+ return rand() % ( MaxChunkSize / 5 );
+ }
+
+ string Chunk::getns() const {
+ assert( _manager );
+ return _manager->getns();
+ }
+
+ bool Chunk::contains( const BSONObj& obj ) const {
+ return
+ _manager->getShardKey().compare( getMin() , obj ) <= 0 &&
+ _manager->getShardKey().compare( obj , getMax() ) < 0;
+ }
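+    // i.e. a chunk covers the half-open key range [min, max): for a chunk
+    // { x: 0 } -> { x: 10 }, contains() is true for { x: 0 } but false for { x: 10 }.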
+
+ bool ChunkRange::contains(const BSONObj& obj) const {
+ // same as Chunk method
+ return
+ _manager->getShardKey().compare( getMin() , obj ) <= 0 &&
+ _manager->getShardKey().compare( obj , getMax() ) < 0;
+ }
+
+ bool Chunk::minIsInf() const {
+ return _manager->getShardKey().globalMin().woCompare( getMin() ) == 0;
+ }
+
+ bool Chunk::maxIsInf() const {
+ return _manager->getShardKey().globalMax().woCompare( getMax() ) == 0;
+ }
+
+ BSONObj Chunk::_getExtremeKey( int sort ) const {
+ ShardConnection conn( getShard().getConnString() , _manager->getns() );
+ Query q;
+ if ( sort == 1 ) {
+ q.sort( _manager->getShardKey().key() );
+ }
+ else {
+ // need to invert shard key pattern to sort backwards
+ // TODO: make a helper in ShardKeyPattern?
+
+ BSONObj k = _manager->getShardKey().key();
+ BSONObjBuilder r;
+
+ BSONObjIterator i(k);
+ while( i.more() ) {
+ BSONElement e = i.next();
+ uassert( 10163 , "can only handle numbers here - which i think is correct" , e.isNumber() );
+ r.append( e.fieldName() , -1 * e.number() );
+ }
+
+ q.sort( r.obj() );
+ }
+
+ // find the extreme key
+ BSONObj end = conn->findOne( _manager->getns() , q );
+ conn.done();
+
+ if ( end.isEmpty() )
+ return BSONObj();
+
+ return _manager->getShardKey().extractKey( end );
+ }
+
+ void Chunk::pickMedianKey( BSONObj& medianKey ) const {
+ // Ask the mongod holding this chunk to figure out the split points.
+ ScopedDbConnection conn( getShard().getConnString() );
+ BSONObj result;
+ BSONObjBuilder cmd;
+ cmd.append( "splitVector" , _manager->getns() );
+ cmd.append( "keyPattern" , _manager->getShardKey().key() );
+ cmd.append( "min" , getMin() );
+ cmd.append( "max" , getMax() );
+ cmd.appendBool( "force" , true );
+ BSONObj cmdObj = cmd.obj();
+
+ if ( ! conn->runCommand( "admin" , cmdObj , result )) {
+ conn.done();
+ ostringstream os;
+ os << "splitVector command (median key) failed: " << result;
+ uassert( 13503 , os.str() , 0 );
+ }
+
+ BSONObjIterator it( result.getObjectField( "splitKeys" ) );
+ if ( it.more() ) {
+ medianKey = it.next().Obj().getOwned();
+ }
+
+ conn.done();
+ }
+
+ void Chunk::pickSplitVector( vector<BSONObj>& splitPoints , int chunkSize /* bytes */, int maxPoints, int maxObjs ) const {
+ // Ask the mongod holding this chunk to figure out the split points.
+ ScopedDbConnection conn( getShard().getConnString() );
+ BSONObj result;
+ BSONObjBuilder cmd;
+ cmd.append( "splitVector" , _manager->getns() );
+ cmd.append( "keyPattern" , _manager->getShardKey().key() );
+ cmd.append( "min" , getMin() );
+ cmd.append( "max" , getMax() );
+ cmd.append( "maxChunkSizeBytes" , chunkSize );
+ cmd.append( "maxSplitPoints" , maxPoints );
+ cmd.append( "maxChunkObjects" , maxObjs );
+ BSONObj cmdObj = cmd.obj();
+
+ if ( ! conn->runCommand( "admin" , cmdObj , result )) {
+ conn.done();
+ ostringstream os;
+ os << "splitVector command failed: " << result;
+ uassert( 13345 , os.str() , 0 );
+ }
+
+ BSONObjIterator it( result.getObjectField( "splitKeys" ) );
+ while ( it.more() ) {
+ splitPoints.push_back( it.next().Obj().getOwned() );
+ }
+ conn.done();
+ }
+
+ BSONObj Chunk::singleSplit( bool force , BSONObj& res ) const {
+ vector<BSONObj> splitPoint;
+
+        // if splitting is not obligatory we may return early if there is not enough data
+ // we cap the number of objects that would fall in the first half (before the split point)
+ // the rationale is we'll find a split point without traversing all the data
+ if ( ! force ) {
+ vector<BSONObj> candidates;
+ const int maxPoints = 2;
+ pickSplitVector( candidates , getManager()->getCurrentDesiredChunkSize() , maxPoints , MaxObjectPerChunk );
+ if ( candidates.size() <= 1 ) {
+ // no split points means there isn't enough data to split on
+ // 1 split point means we have between half the chunk size to full chunk size
+ // so we shouldn't split
+ LOG(1) << "chunk not full enough to trigger auto-split " << ( candidates.size() == 0 ? "no split entry" : candidates[0].toString() ) << endl;
+ return BSONObj();
+ }
+
+ splitPoint.push_back( candidates.front() );
+
+ }
+ else {
+ // if forcing a split, use the chunk's median key
+ BSONObj medianKey;
+ pickMedianKey( medianKey );
+ if ( ! medianKey.isEmpty() )
+ splitPoint.push_back( medianKey );
+ }
+
+ // We assume that if the chunk being split is the first (or last) one on the collection, this chunk is
+ // likely to see more insertions. Instead of splitting mid-chunk, we use the very first (or last) key
+ // as a split point.
+ if ( minIsInf() ) {
+ splitPoint.clear();
+ BSONObj key = _getExtremeKey( 1 );
+ if ( ! key.isEmpty() ) {
+ splitPoint.push_back( key );
+ }
+
+ }
+ else if ( maxIsInf() ) {
+ splitPoint.clear();
+ BSONObj key = _getExtremeKey( -1 );
+ if ( ! key.isEmpty() ) {
+ splitPoint.push_back( key );
+ }
+ }
+
+ // Normally, we'd have a sound split point here if the chunk is not empty. It's also a good place to
+ // sanity check.
+ if ( splitPoint.empty() || _min == splitPoint.front() || _max == splitPoint.front() ) {
+ log() << "want to split chunk, but can't find split point chunk " << toString()
+ << " got: " << ( splitPoint.empty() ? "<empty>" : splitPoint.front().toString() ) << endl;
+ return BSONObj();
+ }
+
+ if (multiSplit( splitPoint , res ))
+ return splitPoint.front();
+ else
+ return BSONObj();
+ }
+
+ bool Chunk::multiSplit( const vector<BSONObj>& m , BSONObj& res ) const {
+ const size_t maxSplitPoints = 8192;
+
+ uassert( 10165 , "can't split as shard doesn't have a manager" , _manager );
+ uassert( 13332 , "need a split key to split chunk" , !m.empty() );
+ uassert( 13333 , "can't split a chunk in that many parts", m.size() < maxSplitPoints );
+ uassert( 13003 , "can't split a chunk with only one distinct value" , _min.woCompare(_max) );
+
+ ScopedDbConnection conn( getShard().getConnString() );
+
+ BSONObjBuilder cmd;
+ cmd.append( "splitChunk" , _manager->getns() );
+ cmd.append( "keyPattern" , _manager->getShardKey().key() );
+ cmd.append( "min" , getMin() );
+ cmd.append( "max" , getMax() );
+ cmd.append( "from" , getShard().getConnString() );
+ cmd.append( "splitKeys" , m );
+ cmd.append( "shardId" , genID() );
+ cmd.append( "configdb" , configServer.modelServer() );
+ BSONObj cmdObj = cmd.obj();
+
+ if ( ! conn->runCommand( "admin" , cmdObj , res )) {
+ warning() << "splitChunk failed - cmd: " << cmdObj << " result: " << res << endl;
+ conn.done();
+
+            // reloading won't strictly solve all problems, e.g. the collection's metadata lock may be held,
+            // but we issue the reload here so that mongos can refresh without first needing a write/read against it
+ _manager->reload();
+
+ return false;
+ }
+
+ conn.done();
+
+ // force reload of config
+ _manager->reload();
+
+ return true;
+ }
+
+ bool Chunk::moveAndCommit( const Shard& to , long long chunkSize /* bytes */, BSONObj& res ) const {
+ uassert( 10167 , "can't move shard to its current location!" , getShard() != to );
+
+ log() << "moving chunk ns: " << _manager->getns() << " moving ( " << toString() << ") " << _shard.toString() << " -> " << to.toString() << endl;
+
+ Shard from = _shard;
+
+ ScopedDbConnection fromconn( from);
+
+ bool worked = fromconn->runCommand( "admin" ,
+ BSON( "moveChunk" << _manager->getns() <<
+ "from" << from.getConnString() <<
+ "to" << to.getConnString() <<
+ "min" << _min <<
+ "max" << _max <<
+ "maxChunkSizeBytes" << chunkSize <<
+ "shardId" << genID() <<
+ "configdb" << configServer.modelServer()
+ ) ,
+ res
+ );
+
+ fromconn.done();
+
+ log( worked ) << "moveChunk result: " << res << endl;
+
+ // if succeeded, needs to reload to pick up the new location
+ // if failed, mongos may be stale
+        // reload is excessive here as the failure could simply be because the collection metadata lock is held
+ _manager->reload();
+
+ return worked;
+ }
+
+ bool Chunk::splitIfShould( long dataWritten ) const {
+ LastError::Disabled d( lastError.get() );
+
+ try {
+ _dataWritten += dataWritten;
+ int splitThreshold = getManager()->getCurrentDesiredChunkSize();
+ if ( minIsInf() || maxIsInf() ) {
+ splitThreshold = (int) ((double)splitThreshold * .9);
+ }
+
+ if ( _dataWritten < splitThreshold / 5 )
+ return false;
+
+ if ( ! getManager()->_splitTickets.tryAcquire() ) {
+ LOG(1) << "won't auto split becaue not enough tickets: " << getManager()->getns() << endl;
+ return false;
+ }
+ TicketHolderReleaser releaser( &getManager()->_splitTickets );
+
+ // this is a bit ugly
+ // we need it so that mongos blocks for the writes to actually be committed
+ // this does mean mongos has more back pressure than mongod alone
+            // since it's not 100% tcp queue bound
+ // this was implicit before since we did a splitVector on the same socket
+ ShardConnection::sync();
+
+ LOG(1) << "about to initiate autosplit: " << *this << " dataWritten: " << _dataWritten << " splitThreshold: " << splitThreshold << endl;
+
+ BSONObj res;
+ BSONObj splitPoint = singleSplit( false /* does not force a split if not enough data */ , res );
+ if ( splitPoint.isEmpty() ) {
+ // singleSplit would have issued a message if we got here
+                _dataWritten = 0; // there wasn't enough data to split, so don't try again until considerably more has been written
+ return false;
+ }
+
+ if ( maxIsInf() || minIsInf() ) {
+ // we don't want to reset _dataWritten since we kind of want to check the other side right away
+ }
+ else {
+ _dataWritten = 0; // we're splitting, so should wait a bit
+ }
+
+
+
+ log() << "autosplitted " << _manager->getns() << " shard: " << toString()
+ << " on: " << splitPoint << " (splitThreshold " << splitThreshold << ")"
+#ifdef _DEBUG
+ << " size: " << getPhysicalSize() // slow - but can be useful when debugging
+#endif
+ << ( res["shouldMigrate"].eoo() ? "" : " (migrate suggested)" ) << endl;
+
+ BSONElement shouldMigrate = res["shouldMigrate"]; // not in mongod < 1.9.1 but that is ok
+ if (!shouldMigrate.eoo() && grid.shouldBalance()){
+ BSONObj range = shouldMigrate.embeddedObject();
+ BSONObj min = range["min"].embeddedObject();
+ BSONObj max = range["max"].embeddedObject();
+
+ Shard newLocation = Shard::pick( getShard() );
+ if ( getShard() == newLocation ) {
+ // if this is the best shard, then we shouldn't do anything (Shard::pick already logged our shard).
+ LOG(1) << "recently split chunk: " << range << " already in the best shard: " << getShard() << endl;
+ return true; // we did split even if we didn't migrate
+ }
+
+                ChunkManagerPtr cm = _manager->reload(false/*just reloaded in multisplit*/);
+ ChunkPtr toMove = cm->findChunk(min);
+
+ if ( ! (toMove->getMin() == min && toMove->getMax() == max) ){
+ LOG(1) << "recently split chunk: " << range << " modified before we could migrate " << toMove << endl;
+ return true;
+ }
+
+ log() << "moving chunk (auto): " << toMove << " to: " << newLocation.toString() << endl;
+
+ BSONObj res;
+ massert( 10412 ,
+ str::stream() << "moveAndCommit failed: " << res ,
+ toMove->moveAndCommit( newLocation , MaxChunkSize , res ) );
+
+ // update our config
+ _manager->reload();
+ }
+
+ return true;
+
+ }
+ catch ( std::exception& e ) {
+ // if the collection lock is taken (e.g. we're migrating), it is fine for the split to fail.
+ warning() << "could have autosplit on collection: " << _manager->getns() << " but: " << e.what() << endl;
+ return false;
+ }
+ }
+
+ long Chunk::getPhysicalSize() const {
+ ScopedDbConnection conn( getShard().getConnString() );
+
+ BSONObj result;
+ uassert( 10169 , "datasize failed!" , conn->runCommand( "admin" ,
+ BSON( "datasize" << _manager->getns()
+ << "keyPattern" << _manager->getShardKey().key()
+ << "min" << getMin()
+ << "max" << getMax()
+ << "maxSize" << ( MaxChunkSize + 1 )
+ << "estimate" << true
+ ) , result ) );
+
+ conn.done();
+ return (long)result["size"].number();
+ }
+
+ void Chunk::appendShortVersion( const char * name , BSONObjBuilder& b ) const {
+ BSONObjBuilder bb( b.subobjStart( name ) );
+ bb.append( "min" , _min );
+ bb.append( "max" , _max );
+ bb.done();
+ }
+
+ bool Chunk::operator==( const Chunk& s ) const {
+ return
+ _manager->getShardKey().compare( _min , s._min ) == 0 &&
+ _manager->getShardKey().compare( _max , s._max ) == 0
+ ;
+ }
+
+ void Chunk::serialize(BSONObjBuilder& to,ShardChunkVersion myLastMod) {
+
+ to.append( "_id" , genID( _manager->getns() , _min ) );
+
+ if ( myLastMod.isSet() ) {
+ to.appendTimestamp( "lastmod" , myLastMod );
+ }
+ else if ( _lastmod.isSet() ) {
+ assert( _lastmod > 0 && _lastmod < 1000 );
+ to.appendTimestamp( "lastmod" , _lastmod );
+ }
+ else {
+ assert(0);
+ }
+
+ to << "ns" << _manager->getns();
+ to << "min" << _min;
+ to << "max" << _max;
+ to << "shard" << _shard.getName();
+ }
+
+ string Chunk::genID( const string& ns , const BSONObj& o ) {
+ StringBuilder buf( ns.size() + o.objsize() + 16 );
+ buf << ns << "-";
+
+ BSONObjIterator i(o);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ buf << e.fieldName() << "_" << e.toString(false, true);
+ }
+
+ return buf.str();
+ }
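+    // For example (illustrative): genID( "test.foo" , BSON( "x" << 5 ) ) yields
+    // "test.foo-x_5".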
+
+ string Chunk::toString() const {
+ stringstream ss;
+ ss << "ns:" << _manager->getns() << " at: " << _shard.toString() << " lastmod: " << _lastmod.toString() << " min: " << _min << " max: " << _max;
+ return ss.str();
+ }
+
+ ShardKeyPattern Chunk::skey() const {
+ return _manager->getShardKey();
+ }
+
+ void Chunk::markAsJumbo() const {
+ // set this first
+ // even if we can't set it in the db
+ // at least this mongos won't try and keep moving
+ _jumbo = true;
+
+ try {
+ ScopedDbConnection conn( configServer.modelServer() );
+ conn->update( chunkMetadataNS , BSON( "_id" << genID() ) , BSON( "$set" << BSON( "jumbo" << true ) ) );
+ conn.done();
+ }
+ catch ( std::exception& ) {
+ warning() << "couldn't set jumbo for chunk: " << genID() << endl;
+ }
+ }
+
+ void Chunk::refreshChunkSize() {
+ BSONObj o = grid.getConfigSetting("chunksize");
+
+ if ( o.isEmpty() ) {
+ return;
+ }
+
+ int csize = o["value"].numberInt();
+
+ // validate chunksize before proceeding
+ if ( csize == 0 ) {
+            // invalid value; warn and keep the current setting
+ log() << "warning: invalid chunksize (" << csize << ") ignored" << endl;
+ return;
+ }
+
+ LOG(1) << "Refreshing MaxChunkSize: " << csize << endl;
+ Chunk::MaxChunkSize = csize * 1024 * 1024;
+ }
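+    // Illustrative: saving { _id: "chunksize", value: 128 } in the config server's
+    // settings collection raises MaxChunkSize to 128MB at the next refresh.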
+
+ // ------- ChunkManager --------
+
+ AtomicUInt ChunkManager::NextSequenceNumber = 1;
+
+ ChunkManager::ChunkManager( string ns , ShardKeyPattern pattern , bool unique ) :
+ _ns( ns ) , _key( pattern ) , _unique( unique ) , _chunkRanges(), _mutex("ChunkManager"),
+ _nsLock( ConnectionString( configServer.modelServer() , ConnectionString::SYNC ) , ns ),
+
+        // The shard versioning mechanism hinges on keeping track of the number of times we reloaded ChunkManagers.
+        // Increasing this number here will prompt checkShardVersion() to refresh the connection-level versions to
+        // the most up-to-date value.
+ _sequenceNumber(++NextSequenceNumber),
+
+ _splitTickets( 5 )
+
+ {
+ int tries = 3;
+ while (tries--) {
+ ChunkMap chunkMap;
+ set<Shard> shards;
+ ShardVersionMap shardVersions;
+ Timer t;
+ _load(chunkMap, shards, shardVersions);
+ {
+ int ms = t.millis();
+ log() << "ChunkManager: time to load chunks for " << ns << ": " << ms << "ms"
+ << " sequenceNumber: " << _sequenceNumber
+ << " version: " << _version.toString()
+ << endl;
+ }
+
+ if (_isValid(chunkMap)) {
+ // These variables are const for thread-safety. Since the
+ // constructor can only be called from one thread, we don't have
+ // to worry about that here.
+ const_cast<ChunkMap&>(_chunkMap).swap(chunkMap);
+ const_cast<set<Shard>&>(_shards).swap(shards);
+ const_cast<ShardVersionMap&>(_shardVersions).swap(shardVersions);
+ const_cast<ChunkRangeManager&>(_chunkRanges).reloadAll(_chunkMap);
+ return;
+ }
+
+ if (_chunkMap.size() < 10) {
+ _printChunks();
+ }
+
+ warning() << "ChunkManager loaded an invalid config, trying again" << endl;
+
+ sleepmillis(10 * (3-tries));
+ }
+
+ // this will abort construction so we should never have a reference to an invalid config
+ msgasserted(13282, "Couldn't load a valid config for " + _ns + " after 3 attempts. Please try again.");
+ }
+
+ ChunkManagerPtr ChunkManager::reload(bool force) const {
+ return grid.getDBConfig(getns())->getChunkManager(getns(), force);
+ }
+
+ void ChunkManager::_load(ChunkMap& chunkMap, set<Shard>& shards, ShardVersionMap& shardVersions) {
+ ScopedDbConnection conn( configServer.modelServer() );
+
+ // TODO really need the sort?
+ auto_ptr<DBClientCursor> cursor = conn->query( Chunk::chunkMetadataNS, QUERY("ns" << _ns).sort("lastmod",-1), 0, 0, 0, 0,
+ (DEBUG_BUILD ? 2 : 1000000)); // batch size. Try to induce potential race conditions in debug builds
+ assert( cursor.get() );
+ while ( cursor->more() ) {
+ BSONObj d = cursor->next();
+ if ( d["isMaxMarker"].trueValue() ) {
+ continue;
+ }
+
+ ChunkPtr c( new Chunk( this, d ) );
+
+ chunkMap[c->getMax()] = c;
+ shards.insert(c->getShard());
+
+
+ // set global max
+ if ( c->getLastmod() > _version )
+ _version = c->getLastmod();
+
+ // set shard max
+ ShardChunkVersion& shardMax = shardVersions[c->getShard()];
+ if ( c->getLastmod() > shardMax )
+ shardMax = c->getLastmod();
+ }
+ conn.done();
+ }
+
+ bool ChunkManager::_isValid(const ChunkMap& chunkMap) {
+#define ENSURE(x) do { if(!(x)) { log() << "ChunkManager::_isValid failed: " #x << endl; return false; } } while(0)
+
+ if (chunkMap.empty())
+ return true;
+
+ // Check endpoints
+ ENSURE(allOfType(MinKey, chunkMap.begin()->second->getMin()));
+ ENSURE(allOfType(MaxKey, boost::prior(chunkMap.end())->second->getMax()));
+
+ // Make sure there are no gaps or overlaps
+ for (ChunkMap::const_iterator it=boost::next(chunkMap.begin()), end=chunkMap.end(); it != end; ++it) {
+ ChunkMap::const_iterator last = boost::prior(it);
+
+ if (!(it->second->getMin() == last->second->getMax())) {
+ PRINT(it->second->toString());
+ PRINT(it->second->getMin());
+ PRINT(last->second->getMax());
+ }
+ ENSURE(it->second->getMin() == last->second->getMax());
+ }
+
+ return true;
+
+#undef ENSURE
+ }
+
+ void ChunkManager::_printChunks() const {
+ for (ChunkMap::const_iterator it=_chunkMap.begin(), end=_chunkMap.end(); it != end; ++it) {
+ log() << *it->second << endl;
+ }
+ }
+
+ bool ChunkManager::hasShardKey( const BSONObj& obj ) const {
+ return _key.hasShardKey( obj );
+ }
+
+ void ChunkManager::createFirstChunks( const Shard& primary , vector<BSONObj>* initPoints , vector<Shard>* initShards ) const {
+ // TODO distlock?
+ assert( _chunkMap.size() == 0 );
+
+ vector<BSONObj> splitPoints;
+ vector<Shard> shards;
+ unsigned long long numObjects = 0;
+ Chunk c(this, _key.globalMin(), _key.globalMax(), primary);
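+        // a temporary chunk spanning the whole key range; used to pick split points and,
+        // later, to locate the shard where the collection lives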
+
+ if ( !initPoints || !initPoints->size() ) {
+ // discover split points
+ {
+ // get stats to see if there is any data
+ ScopedDbConnection shardConn( primary.getConnString() );
+ numObjects = shardConn->count( getns() );
+ shardConn.done();
+ }
+
+ if ( numObjects > 0 )
+ c.pickSplitVector( splitPoints , Chunk::MaxChunkSize );
+
+            // since docs already exist, we must use the primary shard
+ shards.push_back( primary );
+ } else {
+ // make sure points are unique and ordered
+ set<BSONObj> orderedPts;
+ for ( unsigned i = 0; i < initPoints->size(); ++i ) {
+ BSONObj pt = (*initPoints)[i];
+ orderedPts.insert( pt );
+ }
+ for ( set<BSONObj>::iterator it = orderedPts.begin(); it != orderedPts.end(); ++it ) {
+ splitPoints.push_back( *it );
+ }
+
+ if ( !initShards || !initShards->size() ) {
+ // use all shards, starting with primary
+ shards.push_back( primary );
+ vector<Shard> tmp;
+ primary.getAllShards( tmp );
+ for ( unsigned i = 0; i < tmp.size(); ++i ) {
+ if ( tmp[i] != primary )
+ shards.push_back( tmp[i] );
+ }
+ }
+ }
+
+ // this is the first chunk; start the versioning from scratch
+ ShardChunkVersion version;
+ version.incMajor();
+
+ log() << "going to create " << splitPoints.size() + 1 << " chunk(s) for: " << _ns << endl;
+
+ ScopedDbConnection conn( configServer.modelServer() );
+
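+        // create one chunk document per [min, max) interval, assigning shards round-robin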
+ for ( unsigned i=0; i<=splitPoints.size(); i++ ) {
+ BSONObj min = i == 0 ? _key.globalMin() : splitPoints[i-1];
+ BSONObj max = i < splitPoints.size() ? splitPoints[i] : _key.globalMax();
+
+ Chunk temp( this , min , max , shards[ i % shards.size() ] );
+
+ BSONObjBuilder chunkBuilder;
+ temp.serialize( chunkBuilder , version );
+ BSONObj chunkObj = chunkBuilder.obj();
+
+ conn->update( Chunk::chunkMetadataNS, QUERY( "_id" << temp.genID() ), chunkObj, true, false );
+
+ version.incMinor();
+ }
+
+ string errmsg = conn->getLastError();
+ if ( errmsg.size() ) {
+ string ss = str::stream() << "creating first chunks failed. result: " << errmsg;
+ error() << ss << endl;
+ msgasserted( 15903 , ss );
+ }
+
+ conn.done();
+
+ if ( numObjects == 0 ) {
+            // the ensureIndex call will have the (desired) indirect effect of creating the collection on the
+            // assigned shard, as it sets up the index over the sharding keys.
+ ScopedDbConnection shardConn( c.getShard().getConnString() );
+ shardConn->ensureIndex( getns() , getShardKey().key() , _unique , "" , false ); // do not cache ensureIndex SERVER-1691
+ shardConn.done();
+ }
+
+ }
+
+ ChunkPtr ChunkManager::findChunk( const BSONObj & obj ) const {
+ BSONObj key = _key.extractKey(obj);
+
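+        // _chunkMap is keyed by chunk max, so upper_bound yields the first chunk whose
+        // max is strictly greater than the key, i.e. the chunk that should contain it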
+ {
+ BSONObj foo;
+ ChunkPtr c;
+ {
+ ChunkMap::const_iterator it = _chunkMap.upper_bound(key);
+ if (it != _chunkMap.end()) {
+ foo = it->first;
+ c = it->second;
+ }
+ }
+
+ if ( c ) {
+ if ( c->contains( key ) ){
+ dassert(c->contains(key)); // doesn't use fast-path in extractKey
+ return c;
+ }
+
+ PRINT(foo);
+ PRINT(*c);
+ PRINT(key);
+
+ reload();
+ massert(13141, "Chunk map pointed to incorrect chunk", false);
+ }
+ }
+
+        throw UserException( 8070 , str::stream() << "couldn't find a chunk, which should be impossible: " << key );
+ }
+
+ ChunkPtr ChunkManager::findChunkOnServer( const Shard& shard ) const {
+ for ( ChunkMap::const_iterator i=_chunkMap.begin(); i!=_chunkMap.end(); ++i ) {
+ ChunkPtr c = i->second;
+ if ( c->getShard() == shard )
+ return c;
+ }
+
+ return ChunkPtr();
+ }
+
+ void ChunkManager::getShardsForQuery( set<Shard>& shards , const BSONObj& query ) const {
+ //TODO look into FieldRangeSetOr
+ OrRangeGenerator org(_ns.c_str(), query, false);
+
+ const string special = org.getSpecial();
+ if (special == "2d") {
+ BSONForEach(field, query) {
+ if (getGtLtOp(field) == BSONObj::opNEAR) {
+ uassert(13501, "use geoNear command rather than $near query", false);
+ // TODO: convert to geoNear rather than erroring out
+ }
+ // $within queries are fine
+ }
+ }
+ else if (!special.empty()) {
+ uassert(13502, "unrecognized special query type: " + special, false);
+ }
+
+ do {
+ boost::scoped_ptr<FieldRangeSetPair> frsp (org.topFrsp());
+ {
+ // special case if most-significant field isn't in query
+ FieldRange range = frsp->singleKeyRange(_key.key().firstElementFieldName());
+ if ( !range.nontrivial() ) {
+ DEV PRINT(range.nontrivial());
+ getShardsForRange( shards, _key.globalMin(), _key.globalMax() );
+ return;
+ }
+ }
+
+ BoundList ranges = frsp->singleKeyIndexBounds(_key.key(), 1);
+ for (BoundList::const_iterator it=ranges.begin(), end=ranges.end(); it != end; ++it) {
+
+ BSONObj minObj = it->first.replaceFieldNames(_key.key());
+ BSONObj maxObj = it->second.replaceFieldNames(_key.key());
+
+ getShardsForRange( shards, minObj, maxObj, false );
+
+                // once we know we need to visit all shards, there is no need to keep looping
+ if( shards.size() == _shards.size() ) return;
+ }
+
+ if (org.moreOrClauses())
+ org.popOrClauseSingleKey();
+
+ }
+ while (org.moreOrClauses());
+ }
+
+ void ChunkManager::getShardsForRange(set<Shard>& shards, const BSONObj& min, const BSONObj& max, bool fullKeyReq ) const {
+
+ if( fullKeyReq ){
+ uassert(13405, str::stream() << "min value " << min << " does not have shard key", hasShardKey(min));
+ uassert(13406, str::stream() << "max value " << max << " does not have shard key", hasShardKey(max));
+ }
+
+ ChunkRangeMap::const_iterator it = _chunkRanges.upper_bound(min);
+ ChunkRangeMap::const_iterator end = _chunkRanges.upper_bound(max);
+
+ massert( 13507 , str::stream() << "no chunks found between bounds " << min << " and " << max , it != _chunkRanges.ranges().end() );
+
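+        // advance 'end' one past so the half-open [it, end) loop below includes the range
+        // containing max (conservatively including one extra range when max sits exactly on a boundary)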
+ if( end != _chunkRanges.ranges().end() ) ++end;
+
+ for( ; it != end; ++it ){
+ shards.insert(it->second->getShard());
+
+            // once we know we need to visit all shards, there is no need to keep looping
+ if (shards.size() == _shards.size()) break;
+ }
+ }
+
+ void ChunkManager::getAllShards( set<Shard>& all ) const {
+ all.insert(_shards.begin(), _shards.end());
+ }
+
+ bool ChunkManager::compatibleWith( const ChunkManager& other, const Shard& shard ) const {
+ // TODO: Make this much smarter - currently returns true only if we're the same chunk manager
+ return getns() == other.getns() && getSequenceNumber() == other.getSequenceNumber();
+ }
+
+ void ChunkManager::drop( ChunkManagerPtr me ) const {
+ scoped_lock lk( _mutex );
+
+ configServer.logChange( "dropCollection.start" , _ns , BSONObj() );
+
+ dist_lock_try dlk;
+ try{
+ dlk = dist_lock_try( &_nsLock , "drop" );
+ }
+ catch( LockException& e ){
+ uassert( 14022, str::stream() << "Error locking distributed lock for chunk drop." << causedBy( e ), false);
+ }
+
+ uassert( 13331 , "collection's metadata is undergoing changes. Please try again." , dlk.got() );
+
+ uassert( 10174 , "config servers not all up" , configServer.allUp() );
+
+ set<Shard> seen;
+
+ LOG(1) << "ChunkManager::drop : " << _ns << endl;
+
+        // collect the set of shards that own chunks; the distributed lock taken above
+        // prevents anyone from doing a split/migrate in the meantime
+ for ( ChunkMap::const_iterator i=_chunkMap.begin(); i!=_chunkMap.end(); ++i ) {
+ ChunkPtr c = i->second;
+ seen.insert( c->getShard() );
+ }
+
+ LOG(1) << "ChunkManager::drop : " << _ns << "\t all locked" << endl;
+
+ // delete data from mongod
+ for ( set<Shard>::iterator i=seen.begin(); i!=seen.end(); i++ ) {
+ ScopedDbConnection conn( *i );
+ conn->dropCollection( _ns );
+ conn.done();
+ }
+
+ LOG(1) << "ChunkManager::drop : " << _ns << "\t removed shard data" << endl;
+
+ // remove chunk data
+ ScopedDbConnection conn( configServer.modelServer() );
+ conn->remove( Chunk::chunkMetadataNS , BSON( "ns" << _ns ) );
+ conn.done();
+ LOG(1) << "ChunkManager::drop : " << _ns << "\t removed chunk data" << endl;
+
+ for ( set<Shard>::iterator i=seen.begin(); i!=seen.end(); i++ ) {
+ ScopedDbConnection conn( *i );
+ BSONObj res;
+
+            // this is horrible
+            // we need a special command for dropping on the mongod side
+            // this hack works for the moment
+
+ if ( ! setShardVersion( conn.conn() , _ns , 0 , true , res ) )
+ throw UserException( 8071 , str::stream() << "cleaning up after drop failed: " << res );
+ conn->simpleCommand( "admin", 0, "unsetSharding" );
+ conn.done();
+ }
+
+ LOG(1) << "ChunkManager::drop : " << _ns << "\t DONE" << endl;
+ configServer.logChange( "dropCollection" , _ns , BSONObj() );
+ }
+
+ ShardChunkVersion ChunkManager::getVersion( const Shard& shard ) const {
+ ShardVersionMap::const_iterator i = _shardVersions.find( shard );
+ if ( i == _shardVersions.end() )
+ return 0;
+ return i->second;
+ }
+
+ ShardChunkVersion ChunkManager::getVersion() const {
+ return _version;
+ }
+
+ string ChunkManager::toString() const {
+ stringstream ss;
+ ss << "ChunkManager: " << _ns << " key:" << _key.toString() << '\n';
+ for ( ChunkMap::const_iterator i=_chunkMap.begin(); i!=_chunkMap.end(); ++i ) {
+ const ChunkPtr c = i->second;
+ ss << "\t" << c->toString() << '\n';
+ }
+ return ss.str();
+ }
+
+ void ChunkRangeManager::assertValid() const {
+ if (_ranges.empty())
+ return;
+
+ try {
+ // No Nulls
+ for (ChunkRangeMap::const_iterator it=_ranges.begin(), end=_ranges.end(); it != end; ++it) {
+ assert(it->second);
+ }
+
+ // Check endpoints
+ assert(allOfType(MinKey, _ranges.begin()->second->getMin()));
+ assert(allOfType(MaxKey, boost::prior(_ranges.end())->second->getMax()));
+
+ // Make sure there are no gaps or overlaps
+ for (ChunkRangeMap::const_iterator it=boost::next(_ranges.begin()), end=_ranges.end(); it != end; ++it) {
+ ChunkRangeMap::const_iterator last = boost::prior(it);
+ assert(it->second->getMin() == last->second->getMax());
+ }
+
+ // Check Map keys
+ for (ChunkRangeMap::const_iterator it=_ranges.begin(), end=_ranges.end(); it != end; ++it) {
+ assert(it->first == it->second->getMax());
+ }
+
+ // Make sure we match the original chunks
+ const ChunkMap chunks = _ranges.begin()->second->getManager()->_chunkMap;
+ for ( ChunkMap::const_iterator i=chunks.begin(); i!=chunks.end(); ++i ) {
+ const ChunkPtr chunk = i->second;
+
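+                // a chunk must fall entirely within a single range: upper_bound(min) and
+                // lower_bound(max) must therefore land on the same range entry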
+ ChunkRangeMap::const_iterator min = _ranges.upper_bound(chunk->getMin());
+ ChunkRangeMap::const_iterator max = _ranges.lower_bound(chunk->getMax());
+
+ assert(min != _ranges.end());
+ assert(max != _ranges.end());
+ assert(min == max);
+ assert(min->second->getShard() == chunk->getShard());
+ assert(min->second->contains( chunk->getMin() ));
+ assert(min->second->contains( chunk->getMax() ) || (min->second->getMax() == chunk->getMax()));
+ }
+
+ }
+ catch (...) {
+ log( LL_ERROR ) << "\t invalid ChunkRangeMap! printing ranges:" << endl;
+
+ for (ChunkRangeMap::const_iterator it=_ranges.begin(), end=_ranges.end(); it != end; ++it)
+ cout << it->first << ": " << *it->second << endl;
+
+ throw;
+ }
+ }
+
+ void ChunkRangeManager::reloadAll(const ChunkMap& chunks) {
+ _ranges.clear();
+ _insertRange(chunks.begin(), chunks.end());
+
+ DEV assertValid();
+ }
+
+ void ChunkRangeManager::_insertRange(ChunkMap::const_iterator begin, const ChunkMap::const_iterator end) {
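+        // coalesce maximal runs of adjacent chunks that live on the same shard into single ChunkRanges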
+ while (begin != end) {
+ ChunkMap::const_iterator first = begin;
+ Shard shard = first->second->getShard();
+ while (begin != end && (begin->second->getShard() == shard))
+ ++begin;
+
+ shared_ptr<ChunkRange> cr (new ChunkRange(first, begin));
+ _ranges[cr->getMax()] = cr;
+ }
+ }
+
+ int ChunkManager::getCurrentDesiredChunkSize() const {
+        // splitting faster in early chunks helps spread out an initial load better
+        const int minChunkSize = 1 << 20; // 1 MB
+
+ int splitThreshold = Chunk::MaxChunkSize;
+
+ int nc = numChunks();
+
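+        // very low thresholds while the collection has only a few chunks force early splits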
+ if ( nc <= 1 ) {
+ return 1024;
+ }
+ else if ( nc < 3 ) {
+ return minChunkSize / 2;
+ }
+ else if ( nc < 10 ) {
+ splitThreshold = max( splitThreshold / 4 , minChunkSize );
+ }
+ else if ( nc < 20 ) {
+ splitThreshold = max( splitThreshold / 2 , minChunkSize );
+ }
+
+ return splitThreshold;
+ }
+
+ class ChunkObjUnitTest : public UnitTest {
+ public:
+ void runShardChunkVersion() {
+ vector<ShardChunkVersion> all;
+ all.push_back( ShardChunkVersion(1,1) );
+ all.push_back( ShardChunkVersion(1,2) );
+ all.push_back( ShardChunkVersion(2,1) );
+ all.push_back( ShardChunkVersion(2,2) );
+
+ for ( unsigned i=0; i<all.size(); i++ ) {
+ for ( unsigned j=i+1; j<all.size(); j++ ) {
+ assert( all[i] < all[j] );
+ }
+ }
+
+ }
+
+ void run() {
+ runShardChunkVersion();
+ LOG(1) << "shardObjTest passed" << endl;
+ }
+ } shardObjTest;
+
+
+ // ----- to be removed ---
+ extern OID serverID;
+
+    // NOTE (careful when deprecating)
+    // currently, when sharding is enabled because of a write or read (as opposed to a split or migrate),
+    // the shard learns its name through the 'setShardVersion' command call
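+    // the command document built below looks roughly like:
+    //   { setShardVersion : "<ns>" , configdb : "<config host>" , version : <Timestamp> ,
+    //     serverID : <OID> , [ authoritative : true , ] shard : "<shard name>" , shardHost : "<host:port>" }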
+ bool setShardVersion( DBClientBase & conn , const string& ns , ShardChunkVersion version , bool authoritative , BSONObj& result ) {
+ BSONObjBuilder cmdBuilder;
+ cmdBuilder.append( "setShardVersion" , ns.c_str() );
+ cmdBuilder.append( "configdb" , configServer.modelServer() );
+ cmdBuilder.appendTimestamp( "version" , version.toLong() );
+ cmdBuilder.appendOID( "serverID" , &serverID );
+ if ( authoritative )
+ cmdBuilder.appendBool( "authoritative" , 1 );
+
+ Shard s = Shard::make( conn.getServerAddress() );
+ cmdBuilder.append( "shard" , s.getName() );
+ cmdBuilder.append( "shardHost" , s.getConnString() );
+ BSONObj cmd = cmdBuilder.obj();
+
+ LOG(1) << " setShardVersion " << s.getName() << " " << conn.getServerAddress() << " " << ns << " " << cmd << " " << &conn << endl;
+
+ return conn.runCommand( "admin" , cmd , result );
+ }
+
+} // namespace mongo
diff --git a/src/mongo/s/chunk.h b/src/mongo/s/chunk.h
new file mode 100644
index 00000000000..0f323514a76
--- /dev/null
+++ b/src/mongo/s/chunk.h
@@ -0,0 +1,420 @@
+// @file chunk.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+
+#include "../bson/util/atomic_int.h"
+#include "../client/dbclient.h"
+#include "../client/distlock.h"
+
+#include "shardkey.h"
+#include "shard.h"
+#include "util.h"
+
+namespace mongo {
+
+ class DBConfig;
+ class Chunk;
+ class ChunkRange;
+ class ChunkManager;
+ class ChunkObjUnitTest;
+
+ typedef shared_ptr<const Chunk> ChunkPtr;
+
+ // key is max for each Chunk or ChunkRange
+ typedef map<BSONObj,ChunkPtr,BSONObjCmp> ChunkMap;
+ typedef map<BSONObj,shared_ptr<ChunkRange>,BSONObjCmp> ChunkRangeMap;
+
+ typedef shared_ptr<const ChunkManager> ChunkManagerPtr;
+
+ /**
+ config.chunks
+ { ns : "alleyinsider.fs.chunks" , min : {} , max : {} , server : "localhost:30001" }
+
+ x is in a shard iff
+ min <= x < max
+ */
+ class Chunk : boost::noncopyable {
+ public:
+ Chunk( const ChunkManager * info , BSONObj from);
+ Chunk( const ChunkManager * info , const BSONObj& min, const BSONObj& max, const Shard& shard);
+
+ //
+ // serialization support
+ //
+
+ void serialize(BSONObjBuilder& to, ShardChunkVersion myLastMod=0);
+
+ //
+ // chunk boundary support
+ //
+
+ const BSONObj& getMin() const { return _min; }
+ const BSONObj& getMax() const { return _max; }
+
+ // if min/max key is pos/neg infinity
+ bool minIsInf() const;
+ bool maxIsInf() const;
+
+ bool contains( const BSONObj& obj ) const;
+
+ string genID() const;
+ static string genID( const string& ns , const BSONObj& min );
+
+ //
+ // chunk version support
+ //
+
+ void appendShortVersion( const char * name , BSONObjBuilder& b ) const;
+
+ ShardChunkVersion getLastmod() const { return _lastmod; }
+ void setLastmod( ShardChunkVersion v ) { _lastmod = v; }
+
+ //
+ // split support
+ //
+
+ /**
+ * if the amount of data written nears the max size of a shard
+         * then we check the real size, and if it's too big, we split
+ * @return if something was split
+ */
+ bool splitIfShould( long dataWritten ) const;
+
+ /**
+         * Splits this chunk at a non-specified split key to be chosen by the mongod holding this chunk.
+ *
+         * @param force if set to true, will split the chunk regardless of whether the split is really necessary size-wise
+         *              if set to false, will only split if the chunk has reached the currently desired maximum size
+ * @param res the object containing details about the split execution
+ * @return splitPoint if found a key and split successfully, else empty BSONObj
+ */
+ BSONObj singleSplit( bool force , BSONObj& res ) const;
+
+ /**
+ * Splits this chunk at the given key (or keys)
+ *
+ * @param splitPoints the vector of keys that should be used to divide this chunk
+ * @param res the object containing details about the split execution
+ * @return if the split was successful
+ */
+ bool multiSplit( const vector<BSONObj>& splitPoints , BSONObj& res ) const;
+
+ /**
+ * Asks the mongod holding this chunk to find a key that approximately divides this chunk in two
+ *
+ * @param medianKey the key that divides this chunk, if there is one, or empty
+ */
+ void pickMedianKey( BSONObj& medianKey ) const;
+
+ /**
+ * @param splitPoints vector to be filled in
+ * @param chunkSize chunk size to target in bytes
+         * @param maxPoints limits the number of split points that are needed; zero means no limit (optional)
+         * @param maxObjs limits the number of objects in each chunk; zero means no limit (optional)
+ */
+ void pickSplitVector( vector<BSONObj>& splitPoints , int chunkSize , int maxPoints = 0, int maxObjs = 0) const;
+
+ //
+ // migration support
+ //
+
+ /**
+ * Issues a migrate request for this chunk
+ *
+ * @param to shard to move this chunk to
+         * @param chunkSize maximum number of bytes beyond which the migrate should not go through
+ * @param res the object containing details about the migrate execution
+ * @return true if move was successful
+ */
+ bool moveAndCommit( const Shard& to , long long chunkSize , BSONObj& res ) const;
+
+ /**
+ * @return size of shard in bytes
+ * talks to mongod to do this
+ */
+ long getPhysicalSize() const;
+
+ /**
+         * marks this chunk as a jumbo chunk
+         * that means the chunk will be ineligible for migrates
+ */
+ void markAsJumbo() const;
+
+ bool isJumbo() const { return _jumbo; }
+
+ /**
+ * Attempt to refresh maximum chunk size from config.
+ */
+ static void refreshChunkSize();
+
+ //
+ // public constants
+ //
+
+ static string chunkMetadataNS;
+ static int MaxChunkSize;
+ static int MaxObjectPerChunk;
+ //
+ // accessors and helpers
+ //
+
+ string toString() const;
+
+ friend ostream& operator << (ostream& out, const Chunk& c) { return (out << c.toString()); }
+ bool operator==(const Chunk& s) const;
+ bool operator!=(const Chunk& s) const { return ! ( *this == s ); }
+
+ string getns() const;
+ const char * getNS() { return "config.chunks"; }
+ Shard getShard() const { return _shard; }
+ const ChunkManager* getManager() const { return _manager; }
+
+
+ private:
+
+ // main shard info
+
+ const ChunkManager * _manager;
+
+ BSONObj _min;
+ BSONObj _max;
+ Shard _shard;
+ ShardChunkVersion _lastmod;
+ mutable bool _jumbo;
+
+ // transient stuff
+
+ mutable long _dataWritten;
+
+ // methods, etc..
+
+ /**
+ * if sort 1, return lowest key
+ * if sort -1, return highest key
+ * will return empty object if have none
+ */
+ BSONObj _getExtremeKey( int sort ) const;
+
+ /** initializes _dataWritten with a random value so that a mongos restart wouldn't cause delay in splitting */
+ static long mkDataWritten();
+
+ ShardKeyPattern skey() const;
+ };
+
+ class ChunkRange {
+ public:
+ const ChunkManager* getManager() const { return _manager; }
+ Shard getShard() const { return _shard; }
+
+ const BSONObj& getMin() const { return _min; }
+ const BSONObj& getMax() const { return _max; }
+
+ // clones of Chunk methods
+ bool contains(const BSONObj& obj) const;
+
+ ChunkRange(ChunkMap::const_iterator begin, const ChunkMap::const_iterator end)
+ : _manager(begin->second->getManager())
+ , _shard(begin->second->getShard())
+ , _min(begin->second->getMin())
+ , _max(boost::prior(end)->second->getMax()) {
+ assert( begin != end );
+
+ DEV while (begin != end) {
+ assert(begin->second->getManager() == _manager);
+ assert(begin->second->getShard() == _shard);
+ ++begin;
+ }
+ }
+
+ // Merge min and max (must be adjacent ranges)
+ ChunkRange(const ChunkRange& min, const ChunkRange& max)
+ : _manager(min.getManager())
+ , _shard(min.getShard())
+ , _min(min.getMin())
+ , _max(max.getMax()) {
+ assert(min.getShard() == max.getShard());
+ assert(min.getManager() == max.getManager());
+ assert(min.getMax() == max.getMin());
+ }
+
+ friend ostream& operator<<(ostream& out, const ChunkRange& cr) {
+ return (out << "ChunkRange(min=" << cr._min << ", max=" << cr._max << ", shard=" << cr._shard <<")");
+ }
+
+ private:
+ const ChunkManager* _manager;
+ const Shard _shard;
+ const BSONObj _min;
+ const BSONObj _max;
+ };
+
+
+ class ChunkRangeManager {
+ public:
+ const ChunkRangeMap& ranges() const { return _ranges; }
+
+ void clear() { _ranges.clear(); }
+
+ void reloadAll(const ChunkMap& chunks);
+
+ // Slow operation -- wrap with DEV
+ void assertValid() const;
+
+ ChunkRangeMap::const_iterator upper_bound(const BSONObj& o) const { return _ranges.upper_bound(o); }
+ ChunkRangeMap::const_iterator lower_bound(const BSONObj& o) const { return _ranges.lower_bound(o); }
+
+ private:
+ // assumes nothing in this range exists in _ranges
+ void _insertRange(ChunkMap::const_iterator begin, const ChunkMap::const_iterator end);
+
+ ChunkRangeMap _ranges;
+ };
+
+ /* config.sharding
+ { ns: 'alleyinsider.fs.chunks' ,
+ key: { ts : 1 } ,
+ shards: [ { min: 1, max: 100, server: a } , { min: 101, max: 200 , server : b } ]
+ }
+ */
+ class ChunkManager {
+ public:
+ typedef map<Shard,ShardChunkVersion> ShardVersionMap;
+
+ ChunkManager( string ns , ShardKeyPattern pattern , bool unique );
+
+ string getns() const { return _ns; }
+
+ int numChunks() const { return _chunkMap.size(); }
+ bool hasShardKey( const BSONObj& obj ) const;
+
+ void createFirstChunks( const Shard& primary , vector<BSONObj>* initPoints , vector<Shard>* initShards ) const; // only call from DBConfig::shardCollection
+ ChunkPtr findChunk( const BSONObj& obj ) const;
+ ChunkPtr findChunkOnServer( const Shard& shard ) const;
+
+ const ShardKeyPattern& getShardKey() const { return _key; }
+ bool isUnique() const { return _unique; }
+
+ void getShardsForQuery( set<Shard>& shards , const BSONObj& query ) const;
+ void getAllShards( set<Shard>& all ) const;
+ void getShardsForRange(set<Shard>& shards, const BSONObj& min, const BSONObj& max, bool fullKeyReq = true) const; // [min, max)
+
+ ChunkMap getChunkMap() const { return _chunkMap; }
+
+ /**
+ * Returns true if, for this shard, the chunks are identical in both chunk managers
+ */
+ bool compatibleWith( const ChunkManager& other, const Shard& shard ) const;
+ bool compatibleWith( ChunkManagerPtr other, const Shard& shard ) const { if( ! other ) return false; return compatibleWith( *other, shard ); }
+
+ string toString() const;
+
+ ShardChunkVersion getVersion( const Shard& shard ) const;
+ ShardChunkVersion getVersion() const;
+
+ /**
+         * a sequence number that increases every time a new ChunkManager is created, so we can tell when something has been updated
+ */
+ unsigned long long getSequenceNumber() const { return _sequenceNumber; }
+
+ void getInfo( BSONObjBuilder& b ) const {
+ b.append( "key" , _key.key() );
+ b.appendBool( "unique" , _unique );
+ }
+
+ /**
+         * @param me - so this ChunkManager doesn't get deleted before the drop is done
+ */
+ void drop( ChunkManagerPtr me ) const;
+
+ void _printChunks() const;
+
+ int getCurrentDesiredChunkSize() const;
+
+ private:
+ ChunkManagerPtr reload(bool force=true) const; // doesn't modify self!
+
+ // helpers for constructor
+ void _load(ChunkMap& chunks, set<Shard>& shards, ShardVersionMap& shardVersions);
+ static bool _isValid(const ChunkMap& chunks);
+
+ // All members should be const for thread-safety
+ const string _ns;
+ const ShardKeyPattern _key;
+ const bool _unique;
+
+ const ChunkMap _chunkMap;
+ const ChunkRangeManager _chunkRanges;
+
+ const set<Shard> _shards;
+
+ const ShardVersionMap _shardVersions; // max version per shard
+
+ ShardChunkVersion _version; // max version of any chunk
+
+ mutable mutex _mutex; // only used with _nsLock
+ mutable DistributedLock _nsLock;
+
+ const unsigned long long _sequenceNumber;
+
+ mutable TicketHolder _splitTickets; // number of concurrent splitVector we can do from a splitIfShould per collection
+
+ friend class Chunk;
+ friend class ChunkRangeManager; // only needed for CRM::assertValid()
+ static AtomicUInt NextSequenceNumber;
+ };
+
+ // like BSONObjCmp. for use as an STL comparison functor
+ // key-order in "order" argument must match key-order in shardkey
+ class ChunkCmp {
+ public:
+ ChunkCmp( const BSONObj &order = BSONObj() ) : _cmp( order ) {}
+ bool operator()( const Chunk &l, const Chunk &r ) const {
+ return _cmp(l.getMin(), r.getMin());
+ }
+ bool operator()( const ptr<Chunk> l, const ptr<Chunk> r ) const {
+ return operator()(*l, *r);
+ }
+
+ // Also support ChunkRanges
+ bool operator()( const ChunkRange &l, const ChunkRange &r ) const {
+ return _cmp(l.getMin(), r.getMin());
+ }
+ bool operator()( const shared_ptr<ChunkRange> l, const shared_ptr<ChunkRange> r ) const {
+ return operator()(*l, *r);
+ }
+ private:
+ BSONObjCmp _cmp;
+ };
+
+ /*
+ struct chunk_lock {
+ chunk_lock( const Chunk* c ){
+
+ }
+
+ Chunk _c;
+ };
+ */
+ inline string Chunk::genID() const { return genID(_manager->getns(), _min); }
+
+ bool setShardVersion( DBClientBase & conn , const string& ns , ShardChunkVersion version , bool authoritative , BSONObj& result );
+
+} // namespace mongo
diff --git a/src/mongo/s/client.cpp b/src/mongo/s/client.cpp
new file mode 100644
index 00000000000..36063347d85
--- /dev/null
+++ b/src/mongo/s/client.cpp
@@ -0,0 +1,326 @@
+// s/client.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "server.h"
+
+#include "../db/commands.h"
+#include "../db/dbmessage.h"
+#include "../db/stats/counters.h"
+
+#include "../client/connpool.h"
+
+#include "client.h"
+#include "request.h"
+#include "config.h"
+#include "chunk.h"
+#include "stats.h"
+#include "cursors.h"
+#include "grid.h"
+#include "s/writeback_listener.h"
+
+namespace mongo {
+
+    /* todo: rename this file; clientinfo.cpp would be more intuitive? */
+
+ ClientInfo::ClientInfo() {
+ _cur = &_a;
+ _prev = &_b;
+ _autoSplitOk = true;
+ newRequest();
+ }
+
+ ClientInfo::~ClientInfo() {
+ }
+
+ void ClientInfo::addShard( const string& shard ) {
+ _cur->insert( shard );
+ _sinceLastGetError.insert( shard );
+ }
+
+ void ClientInfo::newRequest( AbstractMessagingPort* p ) {
+
+ if ( p ) {
+ HostAndPort r = p->remote();
+ if ( ! _remote.hasPort() )
+ _remote = r;
+ else if ( _remote != r ) {
+ stringstream ss;
+ ss << "remotes don't match old [" << _remote.toString() << "] new [" << r.toString() << "]";
+ throw UserException( 13134 , ss.str() );
+ }
+ }
+
+ _lastAccess = (int) time(0);
+
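+        // flip the current/previous shard sets: shards touched by the request that just
+        // ended become the "prev" set consulted by getLastError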
+ set<string> * temp = _cur;
+ _cur = _prev;
+ _prev = temp;
+ _cur->clear();
+ }
+
+ ClientInfo * ClientInfo::get() {
+ ClientInfo * info = _tlInfo.get();
+ if ( ! info ) {
+ info = new ClientInfo();
+ _tlInfo.reset( info );
+ info->newRequest();
+ }
+ return info;
+ }
+
+ ClientBasic* ClientBasic::getCurrent() {
+ return ClientInfo::get();
+ }
+
+
+ void ClientInfo::disconnect() {
+ // should be handled by TL cleanup
+ _lastAccess = 0;
+ }
+
+ void ClientInfo::_addWriteBack( vector<WBInfo>& all , const BSONObj& gle ) {
+ BSONElement w = gle["writeback"];
+
+ if ( w.type() != jstOID )
+ return;
+
+ BSONElement cid = gle["connectionId"];
+
+ if ( cid.eoo() ) {
+ error() << "getLastError writeback can't work because of version mismatch" << endl;
+ return;
+ }
+
+ string ident = "";
+ if ( gle["instanceIdent"].type() == String )
+ ident = gle["instanceIdent"].String();
+
+ all.push_back( WBInfo( WriteBackListener::ConnectionIdent( ident , cid.numberLong() ) , w.OID() ) );
+ }
+
+ vector<BSONObj> ClientInfo::_handleWriteBacks( vector<WBInfo>& all , bool fromWriteBackListener ) {
+ vector<BSONObj> res;
+
+ if ( all.size() == 0 )
+ return res;
+
+ if ( fromWriteBackListener ) {
+ LOG(1) << "not doing recursive writeback" << endl;
+ return res;
+ }
+
+ for ( unsigned i=0; i<all.size(); i++ ) {
+ res.push_back( WriteBackListener::waitFor( all[i].ident , all[i].id ) );
+ }
+
+ return res;
+ }
+
+
+
+ bool ClientInfo::getLastError( const BSONObj& options , BSONObjBuilder& result , bool fromWriteBackListener ) {
+ set<string> * shards = getPrev();
+
+ if ( shards->size() == 0 ) {
+ result.appendNull( "err" );
+ return true;
+ }
+
+ vector<WBInfo> writebacks;
+
+ // handle single server
+ if ( shards->size() == 1 ) {
+ string theShard = *(shards->begin() );
+
+ ShardConnection conn( theShard , "" );
+
+ BSONObj res;
+ bool ok = false;
+ try{
+ ok = conn->runCommand( "admin" , options , res );
+ }
+ catch( std::exception &e ){
+
+ warning() << "could not get last error from shard " << theShard << causedBy( e ) << endl;
+
+ // Catch everything that happens here, since we need to ensure we return our connection when we're
+ // finished.
+ conn.done();
+
+ return false;
+ }
+
+ res = res.getOwned();
+ conn.done();
+
+
+ _addWriteBack( writebacks , res );
+
+ // hit other machines just to block
+ for ( set<string>::const_iterator i=sinceLastGetError().begin(); i!=sinceLastGetError().end(); ++i ) {
+ string temp = *i;
+ if ( temp == theShard )
+ continue;
+
+ ShardConnection conn( temp , "" );
+
+ try {
+ _addWriteBack( writebacks , conn->getLastErrorDetailed() );
+ }
+ catch( std::exception &e ){
+ warning() << "could not clear last error from shard " << temp << causedBy( e ) << endl;
+ }
+
+ conn.done();
+ }
+ clearSinceLastGetError();
+
+ if ( writebacks.size() ){
+ vector<BSONObj> v = _handleWriteBacks( writebacks , fromWriteBackListener );
+ if ( v.size() == 0 && fromWriteBackListener ) {
+ // ok
+ }
+ else {
+                    // this will usually be 1,
+                    // but it can be greater than 1 if a write to a different shard
+                    // than the last write op had a writeback.
+                    // all we're going to report is the first,
+                    // since that's the current write,
+                    // but we block for all of them
+ assert( v.size() >= 1 );
+ result.appendElements( v[0] );
+ result.appendElementsUnique( res );
+ result.append( "writebackGLE" , v[0] );
+ result.append( "initialGLEHost" , theShard );
+ }
+ }
+ else {
+ result.append( "singleShard" , theShard );
+ result.appendElements( res );
+ }
+
+ return ok;
+ }
+
+ BSONArrayBuilder bbb( result.subarrayStart( "shards" ) );
+ BSONObjBuilder shardRawGLE;
+
+ long long n = 0;
+
+        int updatedExistingStat = 0; // 0 = unset, -1 = some shard reported false, 1 = some shard reported true
+
+ // hit each shard
+ vector<string> errors;
+ vector<BSONObj> errorObjects;
+ for ( set<string>::iterator i = shards->begin(); i != shards->end(); i++ ) {
+ string theShard = *i;
+ bbb.append( theShard );
+ ShardConnection conn( theShard , "" );
+ BSONObj res;
+ bool ok = false;
+ try {
+ ok = conn->runCommand( "admin" , options , res );
+ shardRawGLE.append( theShard , res );
+ }
+ catch( std::exception &e ){
+
+ // Safe to return here, since we haven't started any extra processing yet, just collecting
+ // responses.
+
+ warning() << "could not get last error from a shard " << theShard << causedBy( e ) << endl;
+ conn.done();
+
+ return false;
+ }
+
+ _addWriteBack( writebacks, res );
+
+ string temp = DBClientWithCommands::getLastErrorString( res );
+ if ( conn->type() != ConnectionString::SYNC && ( ok == false || temp.size() ) ) {
+ errors.push_back( temp );
+ errorObjects.push_back( res );
+ }
+
+ n += res["n"].numberLong();
+ if ( res["updatedExisting"].type() ) {
+ if ( res["updatedExisting"].trueValue() )
+ updatedExistingStat = 1;
+ else if ( updatedExistingStat == 0 )
+ updatedExistingStat = -1;
+ }
+
+ conn.done();
+ }
+
+ bbb.done();
+ result.append( "shardRawGLE" , shardRawGLE.obj() );
+
+ result.appendNumber( "n" , n );
+ if ( updatedExistingStat )
+ result.appendBool( "updatedExisting" , updatedExistingStat > 0 );
+
+ // hit other machines just to block
+ for ( set<string>::const_iterator i=sinceLastGetError().begin(); i!=sinceLastGetError().end(); ++i ) {
+ string temp = *i;
+ if ( shards->count( temp ) )
+ continue;
+
+ ShardConnection conn( temp , "" );
+ try {
+ _addWriteBack( writebacks, conn->getLastErrorDetailed() );
+ }
+ catch( std::exception &e ){
+ warning() << "could not clear last error from a shard " << temp << causedBy( e ) << endl;
+ }
+ conn.done();
+ }
+ clearSinceLastGetError();
+
+ if ( errors.size() == 0 ) {
+ result.appendNull( "err" );
+ _handleWriteBacks( writebacks , fromWriteBackListener );
+ return true;
+ }
+
+ result.append( "err" , errors[0].c_str() );
+
+ {
+ // errs
+ BSONArrayBuilder all( result.subarrayStart( "errs" ) );
+ for ( unsigned i=0; i<errors.size(); i++ ) {
+ all.append( errors[i].c_str() );
+ }
+ all.done();
+ }
+
+ {
+ // errObjects
+ BSONArrayBuilder all( result.subarrayStart( "errObjects" ) );
+ for ( unsigned i=0; i<errorObjects.size(); i++ ) {
+ all.append( errorObjects[i] );
+ }
+ all.done();
+ }
+ _handleWriteBacks( writebacks , fromWriteBackListener );
+ return true;
+ }
+
+ boost::thread_specific_ptr<ClientInfo> ClientInfo::_tlInfo;
+
+} // namespace mongo
diff --git a/src/mongo/s/client.h b/src/mongo/s/client.h
new file mode 100644
index 00000000000..1237f66b88a
--- /dev/null
+++ b/src/mongo/s/client.h
@@ -0,0 +1,128 @@
+// @file s/client.h
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include "../pch.h"
+#include "writeback_listener.h"
+#include "../db/security.h"
+#include "../db/client_common.h"
+
+namespace mongo {
+
+ /**
+ * holds information about a client connected to a mongos
+ * 1 per client socket
+ * currently implemented with a thread local
+ */
+ class ClientInfo : public ClientBasic {
+ public:
+ ClientInfo();
+ ~ClientInfo();
+
+ /** new request from client, adjusts internal state */
+ void newRequest( AbstractMessagingPort* p = 0 );
+
+ /** client disconnected */
+ void disconnect();
+
+ bool hasRemote() const { return true; }
+
+ /**
+ * @return remote socket address of the client
+ */
+ HostAndPort getRemote() const { return _remote; }
+
+ /**
+ * notes that this client use this shard
+ * keeps track of all shards accessed this request
+ */
+ void addShard( const string& shard );
+
+ /**
+ * gets shards used on the previous request
+ */
+ set<string> * getPrev() const { return _prev; };
+
+ /**
+ * gets all shards we've accessed since the last time we called clearSinceLastGetError
+ */
+ const set<string>& sinceLastGetError() const { return _sinceLastGetError; }
+
+ /**
+ * clears list of shards we've talked to
+ */
+ void clearSinceLastGetError() { _sinceLastGetError.clear(); }
+
+
+ /**
+ * resets the list of shards using to process the current request
+ */
+         * resets the list of shards used to process the current request
+
+ /**
+ * calls getLastError
+ * resets shards since get last error
+ * @return if the command was ok or if there was an error
+ */
+ bool getLastError( const BSONObj& options , BSONObjBuilder& result , bool fromWriteBackListener = false );
+
+        /** @return if it's ok to auto split from this client */
+ bool autoSplitOk() const { return _autoSplitOk; }
+
+ void noAutoSplit() { _autoSplitOk = false; }
+
+ static ClientInfo * get();
+ const AuthenticationInfo* getAuthenticationInfo() const { return (AuthenticationInfo*)&_ai; }
+ AuthenticationInfo* getAuthenticationInfo() { return (AuthenticationInfo*)&_ai; }
+ bool isAdmin() { return _ai.isAuthorized( "admin" ); }
+ private:
+ AuthenticationInfo _ai;
+ struct WBInfo {
+ WBInfo( const WriteBackListener::ConnectionIdent& c , OID o ) : ident( c ) , id( o ) {}
+ WriteBackListener::ConnectionIdent ident;
+ OID id;
+ };
+
+ // for getLastError
+ void _addWriteBack( vector<WBInfo>& all , const BSONObj& o );
+ vector<BSONObj> _handleWriteBacks( vector<WBInfo>& all , bool fromWriteBackListener );
+
+
+ int _id; // unique client id
+ HostAndPort _remote; // server:port of remote socket end
+
+ // we use _a and _b to store shards we've talked to on the current request and the previous
+ // we use 2 so we can flip for getLastError type operations
+
+ set<string> _a; // actual set for _cur or _prev
+ set<string> _b; // "
+
+ set<string> * _cur; // pointer to _a or _b depending on state
+ set<string> * _prev; // ""
+
+
+ set<string> _sinceLastGetError; // all shards accessed since last getLastError
+
+ int _lastAccess;
+ bool _autoSplitOk;
+
+ static boost::thread_specific_ptr<ClientInfo> _tlInfo;
+ };
+
+
+}
diff --git a/src/mongo/s/commands_admin.cpp b/src/mongo/s/commands_admin.cpp
new file mode 100644
index 00000000000..cbe306f47a8
--- /dev/null
+++ b/src/mongo/s/commands_admin.cpp
@@ -0,0 +1,1239 @@
+// s/commands_admin.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* TODO
+ _ concurrency control.
+ _ limit() works right?
+ _ KillCursors
+
+ later
+ _ secondary indexes
+*/
+
+#include "pch.h"
+#include "../util/net/message.h"
+#include "../util/net/listen.h"
+#include "../util/processinfo.h"
+#include "../util/stringutils.h"
+#include "../util/version.h"
+#include "../util/timer.h"
+
+#include "../client/connpool.h"
+
+#include "../db/dbmessage.h"
+#include "../db/commands.h"
+#include "../db/stats/counters.h"
+
+#include "config.h"
+#include "chunk.h"
+#include "grid.h"
+#include "strategy.h"
+#include "stats.h"
+#include "writeback_listener.h"
+#include "client.h"
+#include "../util/ramlog.h"
+
+namespace mongo {
+
+ namespace dbgrid_cmds {
+
+ class GridAdminCmd : public Command {
+ public:
+ GridAdminCmd( const char * n ) : Command( n , false, tolowerString(n).c_str() ) {
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual bool adminOnly() const {
+ return true;
+ }
+
+ // all grid commands are designed not to lock
+ virtual LockType locktype() const { return NONE; }
+
+ bool okForConfigChanges( string& errmsg ) {
+ string e;
+ if ( ! configServer.allUp(e) ) {
+ errmsg = str::stream() << "not all config servers are up: " << e;
+ return false;
+ }
+ return true;
+ }
+ };
+
+ // --------------- misc commands ----------------------
+
+ class NetStatCmd : public GridAdminCmd {
+ public:
+ NetStatCmd() : GridAdminCmd("netstat") { }
+ virtual void help( stringstream& help ) const {
+ help << " shows status/reachability of servers in the cluster";
+ }
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ result.append("configserver", configServer.getPrimary().getConnString() );
+ result.append("isdbgrid", 1);
+ return true;
+ }
+ } netstat;
+
+ class FlushRouterConfigCmd : public GridAdminCmd {
+ public:
+ FlushRouterConfigCmd() : GridAdminCmd("flushRouterConfig") { }
+ virtual void help( stringstream& help ) const {
+ help << "flush all router config";
+ }
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ grid.flushConfig();
+ result.appendBool( "flushed" , true );
+ return true;
+ }
+ } flushRouterConfigCmd;
+
+
+ class ServerStatusCmd : public Command {
+ public:
+ ServerStatusCmd() : Command( "serverStatus" , true ) {
+ _started = time(0);
+ }
+
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ result.append( "host" , prettyHostName() );
+ result.append("version", versionString);
+ result.append("process","mongos");
+ result.append("uptime",(double) (time(0)-_started));
+ result.appendDate( "localTime" , jsTime() );
+
+ {
+ BSONObjBuilder t( result.subobjStart( "mem" ) );
+
+ ProcessInfo p;
+ if ( p.supported() ) {
+ t.appendNumber( "resident" , p.getResidentSize() );
+ t.appendNumber( "virtual" , p.getVirtualMemorySize() );
+ t.appendBool( "supported" , true );
+ }
+ else {
+                    result.append( "note" , "not all mem info is supported on this platform" );
+ t.appendBool( "supported" , false );
+ }
+
+ t.done();
+ }
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "connections" ) );
+ bb.append( "current" , connTicketHolder.used() );
+ bb.append( "available" , connTicketHolder.available() );
+ bb.done();
+ }
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "extra_info" ) );
+ bb.append("note", "fields vary by platform");
+ ProcessInfo p;
+ p.getExtraInfo(bb);
+ bb.done();
+ }
+
+ result.append( "opcounters" , globalOpCounters.getObj() );
+ {
+ BSONObjBuilder bb( result.subobjStart( "ops" ) );
+ bb.append( "sharded" , opsSharded.getObj() );
+ bb.append( "notSharded" , opsNonSharded.getObj() );
+ bb.done();
+ }
+
+ result.append( "shardCursorType" , shardedCursorTypes.getObj() );
+
+ {
+ BSONObjBuilder asserts( result.subobjStart( "asserts" ) );
+ asserts.append( "regular" , assertionCount.regular );
+ asserts.append( "warning" , assertionCount.warning );
+ asserts.append( "msg" , assertionCount.msg );
+ asserts.append( "user" , assertionCount.user );
+ asserts.append( "rollovers" , assertionCount.rollovers );
+ asserts.done();
+ }
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "network" ) );
+ networkCounter.append( bb );
+ bb.done();
+ }
+
+ {
+ RamLog* rl = RamLog::get( "warnings" );
+ verify(15879, rl);
+
+ if (rl->lastWrite() >= time(0)-(10*60)){ // only show warnings from last 10 minutes
+ vector<const char*> lines;
+ rl->get( lines );
+
+ BSONArrayBuilder arr( result.subarrayStart( "warnings" ) );
+ for ( unsigned i=std::max(0,(int)lines.size()-10); i<lines.size(); i++ )
+ arr.append( lines[i] );
+ arr.done();
+ }
+ }
+
+ return 1;
+ }
+
+ time_t _started;
+ } cmdServerStatus;
+
+ class FsyncCommand : public GridAdminCmd {
+ public:
+ FsyncCommand() : GridAdminCmd( "fsync" ) {}
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ if ( cmdObj["lock"].trueValue() ) {
+ errmsg = "can't do lock through mongos";
+ return false;
+ }
+
+ BSONObjBuilder sub;
+
+ bool ok = true;
+ int numFiles = 0;
+
+ vector<Shard> shards;
+ Shard::getAllShards( shards );
+ for ( vector<Shard>::iterator i=shards.begin(); i!=shards.end(); i++ ) {
+ Shard s = *i;
+
+ BSONObj x = s.runCommand( "admin" , "fsync" );
+ sub.append( s.getName() , x );
+
+ if ( ! x["ok"].trueValue() ) {
+ ok = false;
+ errmsg = x["errmsg"].String();
+ }
+
+ numFiles += x["numFiles"].numberInt();
+ }
+
+ result.append( "numFiles" , numFiles );
+ result.append( "all" , sub.obj() );
+ return ok;
+ }
+ } fsyncCmd;
+
+ // ------------ database level commands -------------
+
+ class MoveDatabasePrimaryCommand : public GridAdminCmd {
+ public:
+ MoveDatabasePrimaryCommand() : GridAdminCmd("movePrimary") { }
+ virtual void help( stringstream& help ) const {
+ help << " example: { moveprimary : 'foo' , to : 'localhost:9999' }";
+ }
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string dbname = cmdObj.firstElement().valuestrsafe();
+
+ if ( dbname.size() == 0 ) {
+ errmsg = "no db";
+ return false;
+ }
+
+ if ( dbname == "config" ) {
+ errmsg = "can't move config db";
+ return false;
+ }
+
+ DBConfigPtr config = grid.getDBConfig( dbname , false );
+ if ( ! config ) {
+ errmsg = "can't find db!";
+ return false;
+ }
+
+ string to = cmdObj["to"].valuestrsafe();
+ if ( ! to.size() ) {
+ errmsg = "you have to specify where you want to move it";
+ return false;
+ }
+ Shard s = Shard::make( to );
+
+ if ( config->getPrimary() == s.getConnString() ) {
+ errmsg = "it is already the primary";
+ return false;
+ }
+
+ if ( ! grid.knowAboutShard( s.getConnString() ) ) {
+ errmsg = "that server isn't known to me";
+ return false;
+ }
+
+ log() << "Moving " << dbname << " primary from: " << config->getPrimary().toString()
+ << " to: " << s.toString() << endl;
+
+            // take a distributed lock to serialize movePrimary attempts for this database
+ DistributedLock lockSetup( configServer.getConnectionString(), dbname + "-movePrimary" );
+ dist_lock_try dlk;
+
+ try{
+ dlk = dist_lock_try( &lockSetup , string("Moving primary shard of ") + dbname );
+ }
+ catch( LockException& e ){
+ errmsg = str::stream() << "error locking distributed lock to move primary shard of " << dbname << causedBy( e );
+ warning() << errmsg << endl;
+ return false;
+ }
+
+ if ( ! dlk.got() ) {
+ errmsg = (string)"metadata lock is already taken for moving " + dbname;
+ return false;
+ }
+
+ ScopedDbConnection toconn( s.getConnString() );
+
+ // TODO ERH - we need a clone command which replays operations from clone start to now
+ // can just use local.oplog.$main
+ BSONObj cloneRes;
+ bool worked = toconn->runCommand( dbname.c_str() , BSON( "clone" << config->getPrimary().getConnString() ) , cloneRes );
+ toconn.done();
+
+ if ( ! worked ) {
+ log() << "clone failed" << cloneRes << endl;
+ errmsg = "clone failed";
+ return false;
+ }
+
+ ScopedDbConnection fromconn( config->getPrimary() );
+
+ config->setPrimary( s.getConnString() );
+
+ log() << "movePrimary: dropping " << dbname << " from old" << endl;
+
+ fromconn->dropDatabase( dbname.c_str() );
+ fromconn.done();
+
+ result << "primary " << s.toString();
+
+ return true;
+ }
+ } movePrimary;
+
+ class EnableShardingCmd : public GridAdminCmd {
+ public:
+ EnableShardingCmd() : GridAdminCmd( "enableSharding" ) {}
+ virtual void help( stringstream& help ) const {
+ help
+ << "Enable sharding for a db. (Use 'shardcollection' command afterwards.)\n"
+ << " { enablesharding : \"<dbname>\" }\n";
+ }
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string dbname = cmdObj.firstElement().valuestrsafe();
+ if ( dbname.size() == 0 ) {
+ errmsg = "no db";
+ return false;
+ }
+
+ if ( dbname == "admin" ) {
+ errmsg = "can't shard the admin db";
+ return false;
+ }
+ if ( dbname == "local" ) {
+ errmsg = "can't shard the local db";
+ return false;
+ }
+
+ DBConfigPtr config = grid.getDBConfig( dbname );
+ if ( config->isShardingEnabled() ) {
+ errmsg = "already enabled";
+ return false;
+ }
+
+ if ( ! okForConfigChanges( errmsg ) )
+ return false;
+
+ log() << "enabling sharding on: " << dbname << endl;
+
+ config->enableSharding();
+
+ return true;
+ }
+ } enableShardingCmd;
+
+ // ------------ collection level commands -------------
+
+ class ShardCollectionCmd : public GridAdminCmd {
+ public:
+ ShardCollectionCmd() : GridAdminCmd( "shardCollection" ) {}
+
+ virtual void help( stringstream& help ) const {
+ help
+ << "Shard a collection. Requires key. Optional unique. Sharding must already be enabled for the database.\n"
+                    << "  { shardcollection : \"<namespace>\" , key : <shardkeypattern> }\n";
+ }
+
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string ns = cmdObj.firstElement().valuestrsafe();
+ if ( ns.size() == 0 ) {
+ errmsg = "no ns";
+ return false;
+ }
+
+ DBConfigPtr config = grid.getDBConfig( ns );
+ if ( ! config->isShardingEnabled() ) {
+ errmsg = "sharding not enabled for db";
+ return false;
+ }
+
+ if ( config->isSharded( ns ) ) {
+ errmsg = "already sharded";
+ return false;
+ }
+
+ BSONObj key = cmdObj.getObjectField( "key" );
+ if ( key.isEmpty() ) {
+ errmsg = "no shard key";
+ return false;
+ }
+
+ BSONForEach(e, key) {
+ if (!e.isNumber() || e.number() != 1.0) {
+ errmsg = "shard keys must all be ascending";
+ return false;
+ }
+ }
+
+ if ( ns.find( ".system." ) != string::npos ) {
+ errmsg = "can't shard system namespaces";
+ return false;
+ }
+
+ if ( ! okForConfigChanges( errmsg ) )
+ return false;
+
+ // Sharding interacts with indexing in at least three ways:
+ //
+ // 1. A unique index must have the sharding key as its prefix. Otherwise maintaining uniqueness would
+ // require coordinated access to all shards. Trying to shard a collection with such an index is not
+ // allowed.
+ //
+        // 2. Sharding a collection requires an index over the sharding key. That index must be created upfront.
+ // The rationale is that sharding a non-empty collection would need to create the index and that could
+ // be slow. Requiring the index upfront allows the admin to plan before sharding and perhaps use
+ // background index construction. One exception to the rule: empty collections. It's fairly easy to
+ // create the index as part of the sharding process.
+ //
+ // 3. If unique : true is specified, we require that the sharding index be unique or created as unique.
+ //
+        // We enforce these conditions in what comes next.
+
+ bool careAboutUnique = cmdObj["unique"].trueValue();
+
+ {
+ ShardKeyPattern proposedKey( key );
+ bool hasShardIndex = false;
+ bool hasUniqueShardIndex = false;
+
+ ScopedDbConnection conn( config->getPrimary() );
+ BSONObjBuilder b;
+ b.append( "ns" , ns );
+
+ BSONArrayBuilder allIndexes;
+
+ auto_ptr<DBClientCursor> cursor = conn->query( config->getName() + ".system.indexes" , b.obj() );
+ while ( cursor->more() ) {
+ BSONObj idx = cursor->next();
+
+ allIndexes.append( idx );
+
+ bool idIndex = ! idx["name"].eoo() && idx["name"].String() == "_id_";
+ bool uniqueIndex = ( ! idx["unique"].eoo() && idx["unique"].trueValue() ) ||
+ idIndex;
+
+ // Is index key over the sharding key? Remember that.
+ if ( key.woCompare( idx["key"].embeddedObjectUserCheck() ) == 0 ) {
+
+ if( idx["sparse"].trueValue() ){
+ errmsg = (string)"can't shard collection " + ns + " with sparse shard key index";
+ conn.done();
+ return false;
+ }
+
+ hasShardIndex = true;
+ hasUniqueShardIndex = uniqueIndex;
+ continue;
+ }
+
+ // Not a unique index? Move on.
+ if ( ! uniqueIndex || idIndex )
+ continue;
+
+ // Shard key is prefix of unique index? Move on.
+ if ( proposedKey.isPrefixOf( idx["key"].embeddedObjectUserCheck() ) )
+ continue;
+
+                    errmsg = str::stream() << "can't shard collection '" << ns << "' with unique index on: " + idx.toString()
+                                           << ", uniqueness can't be maintained across shards unless the shard key index is a prefix";
+ conn.done();
+ return false;
+ }
+
+ if( careAboutUnique && hasShardIndex && ! hasUniqueShardIndex ){
+ errmsg = (string)"can't shard collection " + ns + ", shard key index not unique and unique index explicitly specified";
+ conn.done();
+ return false;
+ }
+
+ BSONObj res = conn->findOne( config->getName() + ".system.namespaces" , BSON( "name" << ns ) );
+ if ( res["options"].type() == Object && res["options"].embeddedObject()["capped"].trueValue() ) {
+ errmsg = "can't shard capped collection";
+ conn.done();
+ return false;
+ }
+
+ if ( hasShardIndex ) {
+ // make sure there are no null entries in the sharding index
+ BSONObjBuilder cmd;
+ cmd.append( "checkShardingIndex" , ns );
+ cmd.append( "keyPattern" , key );
+ BSONObj cmdObj = cmd.obj();
+ if ( ! conn->runCommand( "admin" , cmdObj , res )) {
+ errmsg = res["errmsg"].str();
+ conn.done();
+ return false;
+ }
+ }
+
+ if ( ! hasShardIndex && ( conn->count( ns ) != 0 ) ) {
+ errmsg = "please create an index over the sharding key before sharding.";
+ result.append( "proposedKey" , key );
+ result.appendArray( "curIndexes" , allIndexes.done() );
+ conn.done();
+ return false;
+ }
+
+ conn.done();
+ }
+
+ tlog() << "CMD: shardcollection: " << cmdObj << endl;
+
+// vector<BSONObj> pts;
+// if (cmdObj.hasField("splitPoints")) {
+// if ( cmdObj.getField("splitPoints").type() != Array ) {
+// errmsg = "Value of splitPoints must be an array of objects";
+// return false;
+// }
+//
+// vector<BSONElement> elmts = cmdObj.getField("splitPoints").Array();
+// for ( unsigned i = 0 ; i < elmts.size() ; ++i) {
+// if ( elmts[i].type() != Object ) {
+// errmsg = "Elements in the splitPoints array must be objects";
+// return false;
+// }
+// pts.push_back( elmts[i].Obj() );
+// }
+// }
+ config->shardCollection( ns , key , careAboutUnique );
+
+ result << "collectionsharded" << ns;
+ return true;
+ }
+ } shardCollectionCmd;
+
+ class GetShardVersion : public GridAdminCmd {
+ public:
+ GetShardVersion() : GridAdminCmd( "getShardVersion" ) {}
+ virtual void help( stringstream& help ) const {
+ help << " example: { getShardVersion : 'alleyinsider.foo' } ";
+ }
+
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string ns = cmdObj.firstElement().valuestrsafe();
+ if ( ns.size() == 0 ) {
+                errmsg = "need to specify full namespace";
+ return false;
+ }
+
+ DBConfigPtr config = grid.getDBConfig( ns );
+ if ( ! config->isSharded( ns ) ) {
+ errmsg = "ns not sharded.";
+ return false;
+ }
+
+ ChunkManagerPtr cm = config->getChunkManagerIfExists( ns );
+ if ( ! cm ) {
+ errmsg = "no chunk manager?";
+ return false;
+ }
+ cm->_printChunks();
+ result.appendTimestamp( "version" , cm->getVersion().toLong() );
+
+ return 1;
+ }
+ } getShardVersionCmd;
+
+ class SplitCollectionCmd : public GridAdminCmd {
+ public:
+ SplitCollectionCmd() : GridAdminCmd( "split" ) {}
+ virtual void help( stringstream& help ) const {
+ help
+                << " example: - split the shard that contains the given key \n"
+ << " { split : 'alleyinsider.blog.posts' , find : { ts : 1 } }\n"
+ << " example: - split the shard that contains the key with this as the middle \n"
+ << " { split : 'alleyinsider.blog.posts' , middle : { ts : 1 } }\n"
+                << " NOTE: this does not move the chunks, it merely creates a logical separation \n"
+ ;
+ }
+
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+
+ if ( ! okForConfigChanges( errmsg ) )
+ return false;
+
+ ShardConnection::sync();
+
+ string ns = cmdObj.firstElement().valuestrsafe();
+ if ( ns.size() == 0 ) {
+ errmsg = "no ns";
+ return false;
+ }
+
+ DBConfigPtr config = grid.getDBConfig( ns );
+ if ( ! config->isSharded( ns ) ) {
+ config->reload();
+ if ( ! config->isSharded( ns ) ) {
+                    errmsg = "ns not sharded. have to shard before you can split";
+ return false;
+ }
+ }
+
+ BSONObj find = cmdObj.getObjectField( "find" );
+ if ( find.isEmpty() ) {
+ find = cmdObj.getObjectField( "middle" );
+
+ if ( find.isEmpty() ) {
+ errmsg = "need to specify find or middle";
+ return false;
+ }
+ }
+
+ ChunkManagerPtr info = config->getChunkManager( ns );
+ ChunkPtr chunk = info->findChunk( find );
+ BSONObj middle = cmdObj.getObjectField( "middle" );
+
+ assert( chunk.get() );
+            log() << "splitting: " << ns << " chunk: " << chunk << endl;
+
+ BSONObj res;
+ bool worked;
+ if ( middle.isEmpty() ) {
+ BSONObj ret = chunk->singleSplit( true /* force a split even if not enough data */ , res );
+ worked = !ret.isEmpty();
+ }
+ else {
+ // sanity check if the key provided is a valid split point
+ if ( ( middle == chunk->getMin() ) || ( middle == chunk->getMax() ) ) {
+ errmsg = "cannot split on initial or final chunk's key";
+ return false;
+ }
+
+ if (!fieldsMatch(middle, info->getShardKey().key())){
+ errmsg = "middle has different fields (or different order) than shard key";
+ return false;
+ }
+
+ vector<BSONObj> splitPoints;
+ splitPoints.push_back( middle );
+ worked = chunk->multiSplit( splitPoints , res );
+ }
+
+ if ( !worked ) {
+ errmsg = "split failed";
+ result.append( "cause" , res );
+ return false;
+ }
+ config->getChunkManager( ns , true );
+ return true;
+ }
+ } splitCollectionCmd;
+
+ class MoveChunkCmd : public GridAdminCmd {
+ public:
+ MoveChunkCmd() : GridAdminCmd( "moveChunk" ) {}
+ virtual void help( stringstream& help ) const {
+ help << "{ movechunk : 'test.foo' , find : { num : 1 } , to : 'localhost:30001' }";
+ }
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+
+ if ( ! okForConfigChanges( errmsg ) )
+ return false;
+
+ ShardConnection::sync();
+
+ Timer t;
+ string ns = cmdObj.firstElement().valuestrsafe();
+ if ( ns.size() == 0 ) {
+ errmsg = "no ns";
+ return false;
+ }
+
+ DBConfigPtr config = grid.getDBConfig( ns );
+ if ( ! config->isSharded( ns ) ) {
+ config->reload();
+ if ( ! config->isSharded( ns ) ) {
+ errmsg = "ns not sharded. have to shard before we can move a chunk";
+ return false;
+ }
+ }
+
+ BSONObj find = cmdObj.getObjectField( "find" );
+ if ( find.isEmpty() ) {
+ errmsg = "need to specify find. see help";
+ return false;
+ }
+
+ string toString = cmdObj["to"].valuestrsafe();
+ if ( ! toString.size() ) {
+ errmsg = "you have to specify where you want to move the chunk";
+ return false;
+ }
+
+ Shard to = Shard::make( toString );
+
+ // so far, chunk size serves test purposes; it may or may not become a supported parameter
+ long long maxChunkSizeBytes = cmdObj["maxChunkSizeBytes"].numberLong();
+ if ( maxChunkSizeBytes == 0 ) {
+ maxChunkSizeBytes = Chunk::MaxChunkSize;
+ }
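+            // e.g. a hypothetical invocation overriding the default chunk size for testing:
+            //   db.adminCommand({ movechunk : "test.foo" , find : { num : 1 } ,
+            //                     to : "localhost:30001" , maxChunkSizeBytes : 1048576 })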
+
+ tlog() << "CMD: movechunk: " << cmdObj << endl;
+
+ ChunkManagerPtr info = config->getChunkManager( ns );
+ ChunkPtr c = info->findChunk( find );
+ const Shard& from = c->getShard();
+
+ if ( from == to ) {
+ errmsg = "that chunk is already on that shard";
+ return false;
+ }
+
+ BSONObj res;
+ if ( ! c->moveAndCommit( to , maxChunkSizeBytes , res ) ) {
+ errmsg = "move failed";
+ result.append( "cause" , res );
+ return false;
+ }
+
+ // preemptively reload the config to get new version info
+ config->getChunkManager( ns , true );
+
+ result.append( "millis" , t.millis() );
+ return true;
+ }
+ } moveChunkCmd;
+
+ // ------------ server level commands -------------
+
+ class ListShardsCmd : public GridAdminCmd {
+ public:
+ ListShardsCmd() : GridAdminCmd("listShards") { }
+ virtual void help( stringstream& help ) const {
+ help << "list all shards of the system";
+ }
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ ScopedDbConnection conn( configServer.getPrimary() );
+
+ vector<BSONObj> all;
+ auto_ptr<DBClientCursor> cursor = conn->query( "config.shards" , BSONObj() );
+ while ( cursor->more() ) {
+ BSONObj o = cursor->next();
+ all.push_back( o );
+ }
+
+ result.append("shards" , all );
+ conn.done();
+
+ return true;
+ }
+ } listShardsCmd;
+
+ /* a shard is a single mongod server or a replica pair. add it (them) to the cluster as a storage partition. */
+ class AddShard : public GridAdminCmd {
+ public:
+ AddShard() : GridAdminCmd("addShard") { }
+ virtual void help( stringstream& help ) const {
+ help << "add a new shard to the system";
+ }
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ errmsg.clear();
+
+ // get replica set component hosts
+ ConnectionString servers = ConnectionString::parse( cmdObj.firstElement().valuestrsafe() , errmsg );
+ if ( ! errmsg.empty() ) {
+ log() << "addshard request " << cmdObj << " failed:" << errmsg << endl;
+ return false;
+ }
+
+ // using localhost in server names implies every other process must use localhost addresses too
+ vector<HostAndPort> serverAddrs = servers.getServers();
+ for ( size_t i = 0 ; i < serverAddrs.size() ; i++ ) {
+ if ( serverAddrs[i].isLocalHost() != grid.allowLocalHost() ) {
+                    errmsg = str::stream() <<
+                        "can't use localhost as a shard since all shards need to communicate. " <<
+                        "either use localhost for every shard and configdb, or actual IPs for all. " <<
+                        " host: " << serverAddrs[i].toString() << " isLocalHost:" << serverAddrs[i].isLocalHost();
+
+ log() << "addshard request " << cmdObj << " failed: attempt to mix localhosts and IPs" << endl;
+ return false;
+ }
+
+ // it's fine if mongods of a set all use default port
+ if ( ! serverAddrs[i].hasPort() ) {
+ serverAddrs[i].setPort( CmdLine::ShardServerPort );
+ }
+ }
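+            // illustrative connection strings accepted above (hostnames hypothetical):
+            //   "shardhost:27018" for a single mongod, or
+            //   "rsName/host1:27018,host2:27018" for a replica set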
+
+ // name is optional; addShard will provide one if needed
+ string name = "";
+ if ( cmdObj["name"].type() == String ) {
+ name = cmdObj["name"].valuestrsafe();
+ }
+
+ // maxSize is the space usage cap in a shard in MBs
+ long long maxSize = 0;
+ if ( cmdObj[ ShardFields::maxSize.name() ].isNumber() ) {
+ maxSize = cmdObj[ ShardFields::maxSize.name() ].numberLong();
+ }
+
+ if ( ! grid.addShard( &name , servers , maxSize , errmsg ) ) {
+ log() << "addshard request " << cmdObj << " failed: " << errmsg << endl;
+ return false;
+ }
+
+ result << "shardAdded" << name;
+ return true;
+ }
+
+ } addServer;
+
+ /* See usage docs at:
+ * http://www.mongodb.org/display/DOCS/Configuring+Sharding#ConfiguringSharding-Removingashard
+ */
+ class RemoveShardCmd : public GridAdminCmd {
+ public:
+ RemoveShardCmd() : GridAdminCmd("removeShard") { }
+ virtual void help( stringstream& help ) const {
+            help << "remove a shard from the system.";
+ }
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string target = cmdObj.firstElement().valuestrsafe();
+ Shard s = Shard::make( target );
+ if ( ! grid.knowAboutShard( s.getConnString() ) ) {
+ errmsg = "unknown shard";
+ return false;
+ }
+
+ ScopedDbConnection conn( configServer.getPrimary() );
+
+ if (conn->count("config.shards", BSON("_id" << NE << s.getName() << ShardFields::draining(true)))){
+ conn.done();
+ errmsg = "Can't have more than one draining shard at a time";
+ return false;
+ }
+
+ if (conn->count("config.shards", BSON("_id" << NE << s.getName())) == 0){
+ conn.done();
+ errmsg = "Can't remove last shard";
+ return false;
+ }
+
+ BSONObj primaryDoc = BSON( "_id" << NE << "local" << "primary" << s.getName() );
+ BSONObj dbInfo; // appended at end of result on success
+ {
+ boost::scoped_ptr<DBClientCursor> cursor (conn->query("config.databases", primaryDoc));
+ if (cursor->more()) { // skip block and allocations if empty
+ BSONObjBuilder dbInfoBuilder;
+ dbInfoBuilder.append("note", "you need to drop or movePrimary these databases");
+ BSONArrayBuilder dbs(dbInfoBuilder.subarrayStart("dbsToMove"));
+
+ while (cursor->more()){
+ BSONObj db = cursor->nextSafe();
+ dbs.append(db["_id"]);
+ }
+ dbs.doneFast();
+
+ dbInfo = dbInfoBuilder.obj();
+ }
+ }
+
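+            // removeShard is effectively a state machine driven by re-invocation:
+            //   not draining yet        -> mark draining, report state "started"
+            //   chunks or dbs remaining -> report state "ongoing" with counts
+            //   fully drained           -> drop the shard entry, report "completed"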
+ // If the server is not yet draining chunks, put it in draining mode.
+ BSONObj searchDoc = BSON( "_id" << s.getName() );
+ BSONObj drainingDoc = BSON( "_id" << s.getName() << ShardFields::draining(true) );
+ BSONObj shardDoc = conn->findOne( "config.shards", drainingDoc );
+ if ( shardDoc.isEmpty() ) {
+
+ // TODO prevent move chunks to this shard.
+
+ log() << "going to start draining shard: " << s.getName() << endl;
+ BSONObj newStatus = BSON( "$set" << BSON( ShardFields::draining(true) ) );
+            conn->update( "config.shards" , searchDoc , newStatus, false /* do not upsert */);
+
+ errmsg = conn->getLastError();
+ if ( errmsg.size() ) {
+ log() << "error starting remove shard: " << s.getName() << " err: " << errmsg << endl;
+ return false;
+ }
+
+ BSONObj primaryLocalDoc = BSON("_id" << "local" << "primary" << s.getName() );
+ PRINT(primaryLocalDoc);
+ if (conn->count("config.databases", primaryLocalDoc)) {
+ log() << "This shard is listed as primary of local db. Removing entry." << endl;
+ conn->remove("config.databases", BSON("_id" << "local"));
+ errmsg = conn->getLastError();
+ if ( errmsg.size() ) {
+ log() << "error removing local db: " << errmsg << endl;
+ return false;
+ }
+ }
+
+ Shard::reloadShardInfo();
+
+ result.append( "msg" , "draining started successfully" );
+ result.append( "state" , "started" );
+ result.append( "shard" , s.getName() );
+ result.appendElements(dbInfo);
+ conn.done();
+ return true;
+ }
+
+ // If the server has been completely drained, remove it from the ConfigDB.
+ // Check not only for chunks but also databases.
+ BSONObj shardIDDoc = BSON( "shard" << shardDoc[ "_id" ].str() );
+ long long chunkCount = conn->count( "config.chunks" , shardIDDoc );
+ long long dbCount = conn->count( "config.databases" , primaryDoc );
+ if ( ( chunkCount == 0 ) && ( dbCount == 0 ) ) {
+ log() << "going to remove shard: " << s.getName() << endl;
+ conn->remove( "config.shards" , searchDoc );
+
+ errmsg = conn->getLastError();
+ if ( errmsg.size() ) {
+ log() << "error concluding remove shard: " << s.getName() << " err: " << errmsg << endl;
+ return false;
+ }
+
+ Shard::removeShard( shardDoc[ "_id" ].str() );
+ Shard::reloadShardInfo();
+
+ result.append( "msg" , "removeshard completed successfully" );
+ result.append( "state" , "completed" );
+ result.append( "shard" , s.getName() );
+ conn.done();
+ return true;
+ }
+
+ // If the server is already in draining mode, just report on its progress.
+ // Report on databases (not just chunks) that are left too.
+ result.append( "msg" , "draining ongoing" );
+ result.append( "state" , "ongoing" );
+ BSONObjBuilder inner;
+ inner.append( "chunks" , chunkCount );
+ inner.append( "dbs" , dbCount );
+ result.append( "remaining" , inner.obj() );
+ result.appendElements(dbInfo);
+
+ conn.done();
+ return true;
+ }
+ } removeShardCmd;
+
+
+ // --------------- public commands ----------------
+
+ class IsDbGridCmd : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool requiresAuth() { return false; }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ IsDbGridCmd() : Command("isdbgrid") { }
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ result.append("isdbgrid", 1);
+ result.append("hostname", getHostNameCached());
+ return true;
+ }
+ } isdbgrid;
+
+ class CmdIsMaster : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool requiresAuth() { return false; }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "test if this is master half of a replica pair";
+ }
+ CmdIsMaster() : Command("isMaster" , false , "ismaster") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ result.appendBool("ismaster", true );
+ result.append("msg", "isdbgrid");
+ result.appendNumber("maxBsonObjectSize", BSONObjMaxUserSize);
+ return true;
+ }
+ } ismaster;
+
+ class CmdWhatsMyUri : public Command {
+ public:
+ CmdWhatsMyUri() : Command("whatsmyuri") { }
+ virtual bool logTheOp() {
+            return false; // whatsmyuri performs no modifications, so there is nothing to log
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "{whatsmyuri:1}";
+ }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ result << "you" << ClientInfo::get()->getRemote();
+ return true;
+ }
+ } cmdWhatsMyUri;
+
+
+ class CmdShardingGetPrevError : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool requiresAuth() { return false; }
+
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "get previous error (since last reseterror command)";
+ }
+ CmdShardingGetPrevError() : Command( "getPrevError" , false , "getpreverror") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ errmsg += "getpreverror not supported for sharded environments";
+ return false;
+ }
+ } cmdGetPrevError;
+
+ class CmdShardingGetLastError : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool requiresAuth() { return false; }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "check for an error on the last command executed";
+ }
+ CmdShardingGetLastError() : Command("getLastError" , false , "getlasterror") { }
+
+ virtual bool run(const string& dbName, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ LastError *le = lastError.disableForCommand();
+ {
+ assert( le );
+ if ( le->msg.size() && le->nPrev == 1 ) {
+ le->appendSelf( result );
+ return true;
+ }
+ }
+
+ ClientInfo * client = ClientInfo::get();
+ return client->getLastError( cmdObj , result );
+ }
+ } cmdGetLastError;
+
+ }
+
+ class CmdShardingResetError : public Command {
+ public:
+ CmdShardingResetError() : Command( "resetError" , false , "reseterror" ) {}
+
+ virtual LockType locktype() const { return NONE; }
+ virtual bool slaveOk() const {
+ return true;
+ }
+
+ bool run(const string& dbName , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+ LastError *le = lastError.get();
+ if ( le )
+ le->reset();
+
+ ClientInfo * client = ClientInfo::get();
+ set<string> * shards = client->getPrev();
+
+ for ( set<string>::iterator i = shards->begin(); i != shards->end(); i++ ) {
+ string theShard = *i;
+ ShardConnection conn( theShard , "" );
+ BSONObj res;
+ conn->runCommand( dbName , cmdObj , res );
+ conn.done();
+ }
+
+ return true;
+ }
+ } cmdShardingResetError;
+
+ class CmdListDatabases : public Command {
+ public:
+ CmdListDatabases() : Command("listDatabases", true , "listdatabases" ) {}
+
+ virtual bool logTheOp() { return false; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool slaveOverrideOk() { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream& help ) const { help << "list databases on cluster"; }
+
+ bool run(const string& , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+ vector<Shard> shards;
+ Shard::getAllShards( shards );
+
+ map<string,long long> sizes;
+ map< string,shared_ptr<BSONObjBuilder> > dbShardInfo;
+
+ for ( vector<Shard>::iterator i=shards.begin(); i!=shards.end(); i++ ) {
+ Shard s = *i;
+ BSONObj x = s.runCommand( "admin" , "listDatabases" );
+
+ BSONObjIterator j( x["databases"].Obj() );
+ while ( j.more() ) {
+ BSONObj theDB = j.next().Obj();
+
+ string name = theDB["name"].String();
+ long long size = theDB["sizeOnDisk"].numberLong();
+
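+                    // mongod reports sizeOnDisk == 1 for an empty database; keep the
+                    // sentinel when summing so the "empty" flag below stays accurate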
+ long long& totalSize = sizes[name];
+ if ( size == 1 ) {
+ if ( totalSize <= 1 )
+ totalSize = 1;
+ }
+ else
+ totalSize += size;
+
+ shared_ptr<BSONObjBuilder>& bb = dbShardInfo[name];
+ if ( ! bb.get() )
+ bb.reset( new BSONObjBuilder() );
+ bb->appendNumber( s.getName() , size );
+ }
+
+ }
+
+ long long totalSize = 0;
+
+ BSONArrayBuilder bb( result.subarrayStart( "databases" ) );
+ for ( map<string,long long>::iterator i=sizes.begin(); i!=sizes.end(); ++i ) {
+ string name = i->first;
+
+ if ( name == "local" ) {
+ // we don't return local
+ // since all shards have their own independent local
+ continue;
+ }
+
+ long long size = i->second;
+ totalSize += size;
+
+ BSONObjBuilder temp;
+ temp.append( "name" , name );
+ temp.appendNumber( "sizeOnDisk" , size );
+ temp.appendBool( "empty" , size == 1 );
+ temp.append( "shards" , dbShardInfo[name]->obj() );
+
+ bb.append( temp.obj() );
+ }
+
+ if ( sizes.find( "config" ) == sizes.end() ){
+ ScopedDbConnection conn( configServer.getPrimary() );
+ BSONObj x;
+ if ( conn->simpleCommand( "config" , &x , "dbstats" ) ){
+ BSONObjBuilder b;
+ b.append( "name" , "config" );
+ b.appendBool( "empty" , false );
+ if ( x["fileSize"].type() )
+ b.appendAs( x["fileSize"] , "sizeOnDisk" );
+ else
+ b.append( "sizeOnDisk" , 1 );
+ bb.append( b.obj() );
+ }
+ else {
+ bb.append( BSON( "name" << "config" ) );
+ }
+ conn.done();
+ }
+
+ bb.done();
+
+ result.appendNumber( "totalSize" , totalSize );
+ result.appendNumber( "totalSizeMb" , totalSize / ( 1024 * 1024 ) );
+
+            return true;
+ }
+
+ } cmdListDatabases;
+
+ class CmdCloseAllDatabases : public Command {
+ public:
+ CmdCloseAllDatabases() : Command("closeAllDatabases", false , "closeAllDatabases" ) {}
+ virtual bool logTheOp() { return false; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool slaveOverrideOk() { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+        virtual void help( stringstream& help ) const { help << "Not supported through mongos"; }
+
+ bool run(const string& , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& /*result*/, bool /*fromRepl*/) {
+ errmsg = "closeAllDatabases isn't supported through mongos";
+ return false;
+ }
+ } cmdCloseAllDatabases;
+
+
+ class CmdReplSetGetStatus : public Command {
+ public:
+ CmdReplSetGetStatus() : Command("replSetGetStatus"){}
+ virtual bool logTheOp() { return false; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream& help ) const { help << "Not supported through mongos"; }
+
+ bool run(const string& , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+ if ( jsobj["forShell"].trueValue() )
+ lastError.disableForCommand();
+
+ errmsg = "replSetGetStatus is not supported through mongos";
+ result.append("info", "mongos"); // see sayReplSetMemberState
+ return false;
+ }
+ } cmdReplSetGetStatus;
+
+ CmdShutdown cmdShutdown;
+
+ void CmdShutdown::help( stringstream& help ) const {
+        help << "shutdown the database. must be run against the admin db and "
+             << "either (1) run from localhost or (2) authenticated.";
+ }
+
+ bool CmdShutdown::run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ return shutdownHelper();
+ }
+
+} // namespace mongo
diff --git a/src/mongo/s/commands_public.cpp b/src/mongo/s/commands_public.cpp
new file mode 100644
index 00000000000..375c4f6feec
--- /dev/null
+++ b/src/mongo/s/commands_public.cpp
@@ -0,0 +1,1565 @@
+// s/commands_public.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../util/net/message.h"
+#include "../db/dbmessage.h"
+#include "../client/connpool.h"
+#include "../client/parallel.h"
+#include "../db/commands.h"
+#include "../db/commands/pipeline.h"
+#include "../db/pipeline/document_source.h"
+#include "../db/pipeline/expression_context.h"
+#include "../db/queryutil.h"
+#include "../scripting/engine.h"
+#include "../util/timer.h"
+
+
+#include "config.h"
+#include "chunk.h"
+#include "strategy.h"
+#include "grid.h"
+#include "mr_shard.h"
+#include "client.h"
+
+namespace mongo {
+
+ bool setParmsMongodSpecific(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl )
+ {
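+        // mongod-specific parameters do not apply on mongos, so this is a no-op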
+ return true;
+ }
+
+ namespace dbgrid_pub_cmds {
+
+ class PublicGridCommand : public Command {
+ public:
+ PublicGridCommand( const char* n, const char* oldname=NULL ) : Command( n, false, oldname ) {
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual bool adminOnly() const {
+ return false;
+ }
+
+        // Override if passthrough should also send query options.
+        // Safer to leave off by default; enable gradually as more tests are added.
+ virtual bool passOptions() const { return false; }
+
+ // all grid commands are designed not to lock
+ virtual LockType locktype() const { return NONE; }
+
+ protected:
+
+ bool passthrough( DBConfigPtr conf, const BSONObj& cmdObj , BSONObjBuilder& result ) {
+ return _passthrough(conf->getName(), conf, cmdObj, 0, result);
+ }
+ bool adminPassthrough( DBConfigPtr conf, const BSONObj& cmdObj , BSONObjBuilder& result ) {
+ return _passthrough("admin", conf, cmdObj, 0, result);
+ }
+
+ bool passthrough( DBConfigPtr conf, const BSONObj& cmdObj , int options, BSONObjBuilder& result ) {
+ return _passthrough(conf->getName(), conf, cmdObj, options, result);
+ }
+ bool adminPassthrough( DBConfigPtr conf, const BSONObj& cmdObj , int options, BSONObjBuilder& result ) {
+ return _passthrough("admin", conf, cmdObj, options, result);
+ }
+
+ private:
+ bool _passthrough(const string& db, DBConfigPtr conf, const BSONObj& cmdObj , int options , BSONObjBuilder& result ) {
+ ShardConnection conn( conf->getPrimary() , "" );
+ BSONObj res;
+ bool ok = conn->runCommand( db , cmdObj , res , passOptions() ? options : 0 );
+ if ( ! ok && res["code"].numberInt() == SendStaleConfigCode ) {
+ conn.done();
+ throw RecvStaleConfigException( res["ns"].toString(),"command failed because of stale config");
+ }
+ result.appendElements( res );
+ conn.done();
+ return ok;
+ }
+ };
+
+ class RunOnAllShardsCommand : public Command {
+ public:
+ RunOnAllShardsCommand(const char* n, const char* oldname=NULL) : Command(n, false, oldname) {}
+
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return false; }
+
+ // all grid commands are designed not to lock
+ virtual LockType locktype() const { return NONE; }
+
+
+ // default impl uses all shards for DB
+ virtual void getShards(const string& dbName , BSONObj& cmdObj, set<Shard>& shards) {
+ DBConfigPtr conf = grid.getDBConfig( dbName , false );
+ conf->getAllShards(shards);
+ }
+
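+        // Subclasses override this to fold the per-shard replies (collected under
+        // "raw" by run() below) into the aggregated output; the default keeps
+        // only the raw results.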
+ virtual void aggregateResults(const vector<BSONObj>& results, BSONObjBuilder& output) {}
+
+ // don't override
+ virtual bool run(const string& dbName , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& output, bool) {
+ LOG(1) << "RunOnAllShardsCommand db: " << dbName << " cmd:" << cmdObj << endl;
+ set<Shard> shards;
+ getShards(dbName, cmdObj, shards);
+
+ list< shared_ptr<Future::CommandResult> > futures;
+ for ( set<Shard>::const_iterator i=shards.begin(), end=shards.end() ; i != end ; i++ ) {
+ futures.push_back( Future::spawnCommand( i->getConnString() , dbName , cmdObj, 0 ) );
+ }
+
+ vector<BSONObj> results;
+ BSONObjBuilder subobj (output.subobjStart("raw"));
+ BSONObjBuilder errors;
+ for ( list< shared_ptr<Future::CommandResult> >::iterator i=futures.begin(); i!=futures.end(); i++ ) {
+ shared_ptr<Future::CommandResult> res = *i;
+ if ( ! res->join() ) {
+ errors.appendAs(res->result()["errmsg"], res->getServer());
+ }
+ results.push_back( res->result() );
+ subobj.append( res->getServer() , res->result() );
+ }
+
+ subobj.done();
+
+ BSONObj errobj = errors.done();
+ if (! errobj.isEmpty()) {
+ errmsg = errobj.toString(false, true);
+ return false;
+ }
+
+ aggregateResults(results, output);
+ return true;
+ }
+
+ };
+
+ class AllShardsCollectionCommand : public RunOnAllShardsCommand {
+ public:
+ AllShardsCollectionCommand(const char* n, const char* oldname=NULL) : RunOnAllShardsCommand(n, oldname) {}
+
+ virtual void getShards(const string& dbName , BSONObj& cmdObj, set<Shard>& shards) {
+ string fullns = dbName + '.' + cmdObj.firstElement().valuestrsafe();
+
+ DBConfigPtr conf = grid.getDBConfig( dbName , false );
+
+ if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) {
+ shards.insert(conf->getShard(fullns));
+ }
+ else {
+ conf->getChunkManager(fullns)->getAllShards(shards);
+ }
+ }
+ };
+
+
+ class NotAllowedOnShardedCollectionCmd : public PublicGridCommand {
+ public:
+ NotAllowedOnShardedCollectionCmd( const char * n ) : PublicGridCommand( n ) {}
+
+ virtual string getFullNS( const string& dbName , const BSONObj& cmdObj ) = 0;
+
+ virtual bool run(const string& dbName , BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool) {
+ string fullns = getFullNS( dbName , cmdObj );
+
+ DBConfigPtr conf = grid.getDBConfig( dbName , false );
+
+ if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) {
+ return passthrough( conf , cmdObj , options, result );
+ }
+ errmsg = "can't do command: " + name + " on sharded collection";
+ return false;
+ }
+ };
+
+ // ----
+
+ class DropIndexesCmd : public AllShardsCollectionCommand {
+ public:
+ DropIndexesCmd() : AllShardsCollectionCommand("dropIndexes", "deleteIndexes") {}
+ } dropIndexesCmd;
+
+ class ReIndexCmd : public AllShardsCollectionCommand {
+ public:
+ ReIndexCmd() : AllShardsCollectionCommand("reIndex") {}
+ } reIndexCmd;
+
+ class ProfileCmd : public PublicGridCommand {
+ public:
+ ProfileCmd() : PublicGridCommand("profile") {}
+ virtual bool run(const string& dbName , BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool) {
+ errmsg = "profile currently not supported via mongos";
+ return false;
+ }
+ } profileCmd;
+
+
+ class ValidateCmd : public AllShardsCollectionCommand {
+ public:
+ ValidateCmd() : AllShardsCollectionCommand("validate") {}
+ virtual void aggregateResults(const vector<BSONObj>& results, BSONObjBuilder& output) {
+ for (vector<BSONObj>::const_iterator it(results.begin()), end(results.end()); it!=end; it++){
+ const BSONObj& result = *it;
+ const BSONElement valid = result["valid"];
+ if (!valid.eoo()){
+ if (!valid.trueValue()) {
+ output.appendBool("valid", false);
+ return;
+ }
+ }
+ else {
+ // Support pre-1.9.0 output with everything in a big string
+ const char* s = result["result"].valuestrsafe();
+ if (strstr(s, "exception") || strstr(s, "corrupt")){
+ output.appendBool("valid", false);
+ return;
+ }
+ }
+ }
+
+ output.appendBool("valid", true);
+ }
+ } validateCmd;
+
+ class RepairDatabaseCmd : public RunOnAllShardsCommand {
+ public:
+ RepairDatabaseCmd() : RunOnAllShardsCommand("repairDatabase") {}
+ } repairDatabaseCmd;
+
+ class DBStatsCmd : public RunOnAllShardsCommand {
+ public:
+ DBStatsCmd() : RunOnAllShardsCommand("dbStats", "dbstats") {}
+
+ virtual void aggregateResults(const vector<BSONObj>& results, BSONObjBuilder& output) {
+ long long objects = 0;
+ long long dataSize = 0;
+ long long storageSize = 0;
+ long long numExtents = 0;
+ long long indexes = 0;
+ long long indexSize = 0;
+ long long fileSize = 0;
+
+ for (vector<BSONObj>::const_iterator it(results.begin()), end(results.end()); it != end; ++it) {
+ const BSONObj& b = *it;
+ objects += b["objects"].numberLong();
+ dataSize += b["dataSize"].numberLong();
+ storageSize += b["storageSize"].numberLong();
+ numExtents += b["numExtents"].numberLong();
+ indexes += b["indexes"].numberLong();
+ indexSize += b["indexSize"].numberLong();
+ fileSize += b["fileSize"].numberLong();
+ }
+
+ //result.appendNumber( "collections" , ncollections ); //TODO: need to find a good way to get this
+ output.appendNumber( "objects" , objects );
+                output.append( "avgObjSize" , objects == 0 ? 0.0 : double(dataSize) / double(objects) );
+ output.appendNumber( "dataSize" , dataSize );
+ output.appendNumber( "storageSize" , storageSize);
+ output.appendNumber( "numExtents" , numExtents );
+ output.appendNumber( "indexes" , indexes );
+ output.appendNumber( "indexSize" , indexSize );
+ output.appendNumber( "fileSize" , fileSize );
+ }
+ } DBStatsCmdObj;
+
+ class DropCmd : public PublicGridCommand {
+ public:
+ DropCmd() : PublicGridCommand( "drop" ) {}
+ bool run(const string& dbName , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string collection = cmdObj.firstElement().valuestrsafe();
+ string fullns = dbName + "." + collection;
+
+ DBConfigPtr conf = grid.getDBConfig( dbName , false );
+
+ log() << "DROP: " << fullns << endl;
+
+ if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) {
+ return passthrough( conf , cmdObj , result );
+ }
+
+ ChunkManagerPtr cm = conf->getChunkManager( fullns );
+ massert( 10418 , "how could chunk manager be null!" , cm );
+
+ cm->drop( cm );
+ uassert( 13512 , "drop collection attempted on non-sharded collection" , conf->removeSharding( fullns ) );
+
+            return true;
+ }
+ } dropCmd;
+
+ class DropDBCmd : public PublicGridCommand {
+ public:
+ DropDBCmd() : PublicGridCommand( "dropDatabase" ) {}
+ bool run(const string& dbName , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+
+ BSONElement e = cmdObj.firstElement();
+
+ if ( ! e.isNumber() || e.number() != 1 ) {
+ errmsg = "invalid params";
+                return false;
+ }
+
+ DBConfigPtr conf = grid.getDBConfig( dbName , false );
+
+ log() << "DROP DATABASE: " << dbName << endl;
+
+ if ( ! conf ) {
+ result.append( "info" , "database didn't exist" );
+ return true;
+ }
+
+ if ( ! conf->dropDatabase( errmsg ) )
+ return false;
+
+ result.append( "dropped" , dbName );
+ return true;
+ }
+ } dropDBCmd;
+
+ class RenameCollectionCmd : public PublicGridCommand {
+ public:
+ RenameCollectionCmd() : PublicGridCommand( "renameCollection" ) {}
+ bool run(const string& dbName, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string fullnsFrom = cmdObj.firstElement().valuestrsafe();
+ string dbNameFrom = nsToDatabase( fullnsFrom.c_str() );
+ DBConfigPtr confFrom = grid.getDBConfig( dbNameFrom , false );
+
+ string fullnsTo = cmdObj["to"].valuestrsafe();
+ string dbNameTo = nsToDatabase( fullnsTo.c_str() );
+ DBConfigPtr confTo = grid.getDBConfig( dbNameTo , false );
+
+ uassert(13140, "Don't recognize source or target DB", confFrom && confTo);
+ uassert(13138, "You can't rename a sharded collection", !confFrom->isSharded(fullnsFrom));
+ uassert(13139, "You can't rename to a sharded collection", !confTo->isSharded(fullnsTo));
+
+ const Shard& shardTo = confTo->getShard(fullnsTo);
+ const Shard& shardFrom = confFrom->getShard(fullnsFrom);
+
+ uassert(13137, "Source and destination collections must be on same shard", shardFrom == shardTo);
+
+ return adminPassthrough( confFrom , cmdObj , result );
+ }
+ } renameCollectionCmd;
+
+ class CopyDBCmd : public PublicGridCommand {
+ public:
+ CopyDBCmd() : PublicGridCommand( "copydb" ) {}
+ bool run(const string& dbName, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string todb = cmdObj.getStringField("todb");
+ uassert(13402, "need a todb argument", !todb.empty());
+
+ DBConfigPtr confTo = grid.getDBConfig( todb );
+            uassert(13398, "can't copy to a sharded DB", !confTo->isShardingEnabled());
+
+ string fromhost = cmdObj.getStringField("fromhost");
+ if (!fromhost.empty()) {
+ return adminPassthrough( confTo , cmdObj , result );
+ }
+ else {
+ string fromdb = cmdObj.getStringField("fromdb");
+ uassert(13399, "need a fromdb argument", !fromdb.empty());
+
+ DBConfigPtr confFrom = grid.getDBConfig( fromdb , false );
+ uassert(13400, "don't know where source DB is", confFrom);
+                uassert(13401, "can't copy from a sharded DB", !confFrom->isShardingEnabled());
+
+ BSONObjBuilder b;
+ BSONForEach(e, cmdObj) {
+ if (strcmp(e.fieldName(), "fromhost") != 0)
+ b.append(e);
+ }
+ b.append("fromhost", confFrom->getPrimary().getConnString());
+ BSONObj fixed = b.obj();
+
+ return adminPassthrough( confTo , fixed , result );
+ }
+
+ }
+ } copyDBCmd;
+
+ class CountCmd : public PublicGridCommand {
+ public:
+ CountCmd() : PublicGridCommand("count") { }
+ virtual bool passOptions() const { return true; }
+ bool run(const string& dbName, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool) {
+ string collection = cmdObj.firstElement().valuestrsafe();
+ string fullns = dbName + "." + collection;
+
+ BSONObj filter;
+ if ( cmdObj["query"].isABSONObj() )
+ filter = cmdObj["query"].Obj();
+
+ DBConfigPtr conf = grid.getDBConfig( dbName , false );
+ if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) {
+ ShardConnection conn( conf->getPrimary() , fullns );
+
+ BSONObj temp;
+ bool ok = false;
+ try{
+ ok = conn->runCommand( dbName , cmdObj , temp, options );
+ }
+ catch( RecvStaleConfigException& e ){
+ conn.done();
+ throw e;
+ }
+ conn.done();
+
+ if ( ok ) {
+ result.append( temp["n"] );
+ return true;
+ }
+
+ if ( temp["code"].numberInt() != SendStaleConfigCode ) {
+ errmsg = temp["errmsg"].String();
+ result.appendElements( temp );
+ return false;
+ }
+
+ // this collection got sharded
+ ChunkManagerPtr cm = conf->getChunkManagerIfExists( fullns , true );
+ if ( ! cm ) {
+ errmsg = "should be sharded now";
+ result.append( "root" , temp );
+ return false;
+ }
+ }
+
+ long long total = 0;
+ map<string,long long> shardCounts;
+ int numTries = 0;
+ bool hadToBreak = false;
+
+ ChunkManagerPtr cm = conf->getChunkManagerIfExists( fullns );
+ while ( numTries < 5 ) {
+ numTries++;
+
+ // This all should eventually be replaced by new pcursor framework, but for now match query
+ // retry behavior manually
+ if( numTries >= 2 ) sleepsecs( numTries - 1 );
+
+ if ( ! cm ) {
+ // probably unsharded now
+ return run( dbName , cmdObj , options , errmsg , result, false );
+ }
+
+ set<Shard> shards;
+ cm->getShardsForQuery( shards , filter );
+ assert( shards.size() );
+
+ hadToBreak = false;
+
+ for (set<Shard>::iterator it=shards.begin(), end=shards.end(); it != end; ++it) {
+ ShardConnection conn(*it, fullns);
+ if ( conn.setVersion() ){
+                        ChunkManagerPtr newCM = conf->getChunkManagerIfExists( fullns );
+                        // newCM may be null if the collection was dropped; treat that as a version change
+                        if( !newCM || newCM->getVersion() != cm->getVersion() ){
+ cm = newCM;
+ total = 0;
+ shardCounts.clear();
+ conn.done();
+ hadToBreak = true;
+ break;
+ }
+ }
+
+ BSONObj temp;
+ bool ok = false;
+ try{
+ ok = conn->runCommand( dbName , BSON( "count" << collection << "query" << filter ) , temp, options );
+ }
+ catch( RecvStaleConfigException& e ){
+ conn.done();
+ throw e;
+ }
+ conn.done();
+
+ if ( ok ) {
+ long long mine = temp["n"].numberLong();
+ total += mine;
+ shardCounts[it->getName()] = mine;
+ continue;
+
+ }
+
+ if ( SendStaleConfigCode == temp["code"].numberInt() ) {
+ // my version is old
+ total = 0;
+ shardCounts.clear();
+ cm = conf->getChunkManagerIfExists( fullns , true, numTries > 2 ); // Force reload on third attempt
+ hadToBreak = true;
+ break;
+ }
+
+ // command failed :(
+ errmsg = "failed on : " + it->getName();
+ result.append( "cause" , temp );
+ return false;
+ }
+ if ( ! hadToBreak )
+ break;
+ }
+ if (hadToBreak) {
+ errmsg = "Tried 5 times without success to get count for " + fullns + " from all shards";
+ return false;
+ }
+
+ total = applySkipLimit( total , cmdObj );
+ result.appendNumber( "n" , total );
+ BSONObjBuilder temp( result.subobjStart( "shards" ) );
+ for ( map<string,long long>::iterator i=shardCounts.begin(); i!=shardCounts.end(); ++i )
+ temp.appendNumber( i->first , i->second );
+ temp.done();
+ return true;
+ }
+ } countCmd;
+
+ class CollectionStats : public PublicGridCommand {
+ public:
+ CollectionStats() : PublicGridCommand("collStats", "collstats") { }
+ bool run(const string& dbName , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string collection = cmdObj.firstElement().valuestrsafe();
+ string fullns = dbName + "." + collection;
+
+ DBConfigPtr conf = grid.getDBConfig( dbName , false );
+
+ if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) {
+ result.appendBool("sharded", false);
+ result.append( "primary" , conf->getPrimary().getName() );
+ return passthrough( conf , cmdObj , result);
+ }
+ result.appendBool("sharded", true);
+
+ ChunkManagerPtr cm = conf->getChunkManager( fullns );
+ massert( 12594 , "how could chunk manager be null!" , cm );
+
+ set<Shard> servers;
+ cm->getAllShards(servers);
+
+ BSONObjBuilder shardStats;
+ map<string,long long> counts;
+ map<string,long long> indexSizes;
+ /*
+ long long count=0;
+ long long size=0;
+ long long storageSize=0;
+ */
+ int nindexes=0;
+ bool warnedAboutIndexes = false;
+ for ( set<Shard>::iterator i=servers.begin(); i!=servers.end(); i++ ) {
+ ScopedDbConnection conn( *i );
+ BSONObj res;
+ if ( ! conn->runCommand( dbName , cmdObj , res ) ) {
+ errmsg = "failed on shard: " + res.toString();
+ return false;
+ }
+ conn.done();
+
+ BSONObjIterator j( res );
+ while ( j.more() ) {
+ BSONElement e = j.next();
+
+ if ( str::equals( e.fieldName() , "ns" ) ||
+ str::equals( e.fieldName() , "ok" ) ||
+ str::equals( e.fieldName() , "avgObjSize" ) ||
+ str::equals( e.fieldName() , "lastExtentSize" ) ||
+ str::equals( e.fieldName() , "paddingFactor" ) ) {
+ continue;
+ }
+ else if ( str::equals( e.fieldName() , "count" ) ||
+ str::equals( e.fieldName() , "size" ) ||
+ str::equals( e.fieldName() , "storageSize" ) ||
+ str::equals( e.fieldName() , "numExtents" ) ||
+ str::equals( e.fieldName() , "totalIndexSize" ) ) {
+ counts[e.fieldName()] += e.numberLong();
+ }
+ else if ( str::equals( e.fieldName() , "indexSizes" ) ) {
+ BSONObjIterator k( e.Obj() );
+ while ( k.more() ) {
+ BSONElement temp = k.next();
+ indexSizes[temp.fieldName()] += temp.numberLong();
+ }
+ }
+ else if ( str::equals( e.fieldName() , "flags" ) ) {
+ if ( ! result.hasField( e.fieldName() ) )
+ result.append( e );
+ }
+ else if ( str::equals( e.fieldName() , "nindexes" ) ) {
+ int myIndexes = e.numberInt();
+
+ if ( nindexes == 0 ) {
+ nindexes = myIndexes;
+ }
+ else if ( nindexes == myIndexes ) {
+ // no-op
+ }
+ else {
+ // hopefully this means we're building an index
+
+ if ( myIndexes > nindexes )
+ nindexes = myIndexes;
+
+ if ( ! warnedAboutIndexes ) {
+ result.append( "warning" , "indexes don't all match - ok if ensureIndex is running" );
+ warnedAboutIndexes = true;
+ }
+ }
+ }
+ else {
+ warning() << "mongos collstats doesn't know about: " << e.fieldName() << endl;
+ }
+
+ }
+ shardStats.append(i->getName(), res);
+ }
+
+ result.append("ns", fullns);
+
+ for ( map<string,long long>::iterator i=counts.begin(); i!=counts.end(); ++i )
+ result.appendNumber( i->first , i->second );
+
+ {
+ BSONObjBuilder ib( result.subobjStart( "indexSizes" ) );
+ for ( map<string,long long>::iterator i=indexSizes.begin(); i!=indexSizes.end(); ++i )
+ ib.appendNumber( i->first , i->second );
+ ib.done();
+ }
+
+ if ( counts["count"] > 0 )
+ result.append("avgObjSize", (double)counts["size"] / (double)counts["count"] );
+ else
+ result.append( "avgObjSize", 0.0 );
+
+ result.append("nindexes", nindexes);
+
+ result.append("nchunks", cm->numChunks());
+ result.append("shards", shardStats.obj());
+
+ return true;
+ }
+ } collectionStatsCmd;
+
+ class FindAndModifyCmd : public PublicGridCommand {
+ public:
+ FindAndModifyCmd() : PublicGridCommand("findAndModify", "findandmodify") { }
+ bool run(const string& dbName, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string collection = cmdObj.firstElement().valuestrsafe();
+ string fullns = dbName + "." + collection;
+
+ DBConfigPtr conf = grid.getDBConfig( dbName , false );
+
+ if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) {
+ return passthrough( conf , cmdObj , result);
+ }
+
+ ChunkManagerPtr cm = conf->getChunkManager( fullns );
+ massert( 13002 , "shard internal error chunk manager should never be null" , cm );
+
+ BSONObj filter = cmdObj.getObjectField("query");
+ uassert(13343, "query for sharded findAndModify must have shardkey", cm->hasShardKey(filter));
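+                // e.g. (hypothetical) with shard key { x : 1 }, a query of
+                // { x : 42 , state : "A" } includes the key and routes to a single
+                // chunk, while { state : "A" } alone would trip the assert above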
+
+ //TODO with upsert consider tracking for splits
+
+ ChunkPtr chunk = cm->findChunk(filter);
+ ShardConnection conn( chunk->getShard() , fullns );
+ BSONObj res;
+ bool ok = conn->runCommand( conf->getName() , cmdObj , res );
+ conn.done();
+
+ if (!ok && res.getIntField("code") == RecvStaleConfigCode) { // code for RecvStaleConfigException
+ throw RecvStaleConfigException(fullns, "FindAndModify"); // Command code traps this and re-runs
+ }
+
+ result.appendElements(res);
+ return ok;
+ }
+
+ } findAndModifyCmd;
+
+ class DataSizeCmd : public PublicGridCommand {
+ public:
+ DataSizeCmd() : PublicGridCommand("dataSize", "datasize") { }
+ bool run(const string& dbName, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string fullns = cmdObj.firstElement().String();
+
+ DBConfigPtr conf = grid.getDBConfig( dbName , false );
+
+ if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) {
+ return passthrough( conf , cmdObj , result);
+ }
+
+ ChunkManagerPtr cm = conf->getChunkManager( fullns );
+ massert( 13407 , "how could chunk manager be null!" , cm );
+
+ BSONObj min = cmdObj.getObjectField( "min" );
+ BSONObj max = cmdObj.getObjectField( "max" );
+ BSONObj keyPattern = cmdObj.getObjectField( "keyPattern" );
+
+ uassert(13408, "keyPattern must equal shard key", cm->getShardKey().key() == keyPattern);
+
+ // yes these are doubles...
+ double size = 0;
+ double numObjects = 0;
+ int millis = 0;
+
+ set<Shard> shards;
+ cm->getShardsForRange(shards, min, max);
+ for ( set<Shard>::iterator i=shards.begin(), end=shards.end() ; i != end; ++i ) {
+ ScopedDbConnection conn( *i );
+ BSONObj res;
+ bool ok = conn->runCommand( conf->getName() , cmdObj , res );
+ conn.done();
+
+ if ( ! ok ) {
+ result.appendElements( res );
+ return false;
+ }
+
+ size += res["size"].number();
+ numObjects += res["numObjects"].number();
+ millis += res["millis"].numberInt();
+
+ }
+
+ result.append( "size", size );
+ result.append( "numObjects" , numObjects );
+ result.append( "millis" , millis );
+ return true;
+ }
+
+ } DataSizeCmd;
+
+ class ConvertToCappedCmd : public NotAllowedOnShardedCollectionCmd {
+ public:
+ ConvertToCappedCmd() : NotAllowedOnShardedCollectionCmd("convertToCapped") {}
+
+ virtual string getFullNS( const string& dbName , const BSONObj& cmdObj ) {
+ return dbName + "." + cmdObj.firstElement().valuestrsafe();
+ }
+
+ } convertToCappedCmd;
+
+
+ class GroupCmd : public NotAllowedOnShardedCollectionCmd {
+ public:
+ GroupCmd() : NotAllowedOnShardedCollectionCmd("group") {}
+ virtual bool passOptions() const { return true; }
+ virtual string getFullNS( const string& dbName , const BSONObj& cmdObj ) {
+ return dbName + "." + cmdObj.firstElement().embeddedObjectUserCheck()["ns"].valuestrsafe();
+ }
+
+ } groupCmd;
+
+ class DistinctCmd : public PublicGridCommand {
+ public:
+ DistinctCmd() : PublicGridCommand("distinct") {}
+ virtual void help( stringstream &help ) const {
+ help << "{ distinct : 'collection name' , key : 'a.b' , query : {} }";
+ }
+ virtual bool passOptions() const { return true; }
+ bool run(const string& dbName , BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool) {
+ string collection = cmdObj.firstElement().valuestrsafe();
+ string fullns = dbName + "." + collection;
+
+ DBConfigPtr conf = grid.getDBConfig( dbName , false );
+
+ if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) {
+ return passthrough( conf , cmdObj , options, result );
+ }
+
+ ChunkManagerPtr cm = conf->getChunkManager( fullns );
+ massert( 10420 , "how could chunk manager be null!" , cm );
+
+ BSONObj query = getQuery(cmdObj);
+ set<Shard> shards;
+ cm->getShardsForQuery(shards, query);
+
+ set<BSONObj,BSONObjCmp> all;
+ int size = 32;
+
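+                // each shard returns its own "values" array; funneling them through
+                // an ordered set dedupes and sorts the union across shards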
+ for ( set<Shard>::iterator i=shards.begin(), end=shards.end() ; i != end; ++i ) {
+ ShardConnection conn( *i , fullns );
+ BSONObj res;
+ bool ok = conn->runCommand( conf->getName() , cmdObj , res, options );
+ conn.done();
+
+ if ( ! ok ) {
+ result.appendElements( res );
+ return false;
+ }
+
+ BSONObjIterator it( res["values"].embeddedObject() );
+ while ( it.more() ) {
+ BSONElement nxt = it.next();
+ BSONObjBuilder temp(32);
+ temp.appendAs( nxt , "" );
+ all.insert( temp.obj() );
+ }
+
+ }
+
+ BSONObjBuilder b( size );
+ int n=0;
+ for ( set<BSONObj,BSONObjCmp>::iterator i = all.begin() ; i != all.end(); i++ ) {
+ b.appendAs( i->firstElement() , b.numStr( n++ ) );
+ }
+
+ result.appendArray( "values" , b.obj() );
+ return true;
+ }
+        } distinctCmd;
+
+ class FileMD5Cmd : public PublicGridCommand {
+ public:
+ FileMD5Cmd() : PublicGridCommand("filemd5") {}
+ virtual void help( stringstream &help ) const {
+ help << " example: { filemd5 : ObjectId(aaaaaaa) , root : \"fs\" }";
+ }
+ bool run(const string& dbName , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string fullns = dbName;
+ fullns += ".";
+ {
+ string root = cmdObj.getStringField( "root" );
+ if ( root.size() == 0 )
+ root = "fs";
+ fullns += root;
+ }
+ fullns += ".chunks";
+
+ DBConfigPtr conf = grid.getDBConfig( dbName , false );
+
+ if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) {
+ return passthrough( conf , cmdObj , result );
+ }
+
+ ChunkManagerPtr cm = conf->getChunkManager( fullns );
+ massert( 13091 , "how could chunk manager be null!" , cm );
+ uassert( 13092 , "GridFS chunks collection can only be sharded on files_id", cm->getShardKey().key() == BSON("files_id" << 1));
+
+ ChunkPtr chunk = cm->findChunk( BSON("files_id" << cmdObj.firstElement()) );
+
+ ShardConnection conn( chunk->getShard() , fullns );
+ BSONObj res;
+ bool ok = conn->runCommand( conf->getName() , cmdObj , res );
+ conn.done();
+
+ result.appendElements(res);
+ return ok;
+ }
+ } fileMD5Cmd;
+
+ class Geo2dFindNearCmd : public PublicGridCommand {
+ public:
+ Geo2dFindNearCmd() : PublicGridCommand( "geoNear" ) {}
+ void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/Geospatial+Indexing#GeospatialIndexing-geoNearCommand"; }
+ virtual bool passOptions() const { return true; }
+ bool run(const string& dbName , BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool) {
+ string collection = cmdObj.firstElement().valuestrsafe();
+ string fullns = dbName + "." + collection;
+
+ DBConfigPtr conf = grid.getDBConfig( dbName , false );
+
+ if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) {
+ return passthrough( conf , cmdObj , options, result );
+ }
+
+ ChunkManagerPtr cm = conf->getChunkManager( fullns );
+ massert( 13500 , "how could chunk manager be null!" , cm );
+
+ BSONObj query = getQuery(cmdObj);
+ set<Shard> shards;
+ cm->getShardsForQuery(shards, query);
+
+ int limit = 100;
+ if (cmdObj["num"].isNumber())
+ limit = cmdObj["num"].numberInt();
+
+ list< shared_ptr<Future::CommandResult> > futures;
+ BSONArrayBuilder shardArray;
+ for ( set<Shard>::const_iterator i=shards.begin(), end=shards.end() ; i != end ; i++ ) {
+ futures.push_back( Future::spawnCommand( i->getConnString() , dbName , cmdObj, options ) );
+ shardArray.append(i->getName());
+ }
+
+ multimap<double, BSONObj> results; // TODO: maybe use merge-sort instead
+ string nearStr;
+ double time = 0;
+ double btreelocs = 0;
+ double nscanned = 0;
+ double objectsLoaded = 0;
+ for ( list< shared_ptr<Future::CommandResult> >::iterator i=futures.begin(); i!=futures.end(); i++ ) {
+ shared_ptr<Future::CommandResult> res = *i;
+ if ( ! res->join() ) {
+ errmsg = res->result()["errmsg"].String();
+ return false;
+ }
+
+ nearStr = res->result()["near"].String();
+ time += res->result()["stats"]["time"].Number();
+ btreelocs += res->result()["stats"]["btreelocs"].Number();
+ nscanned += res->result()["stats"]["nscanned"].Number();
+ objectsLoaded += res->result()["stats"]["objectsLoaded"].Number();
+
+ BSONForEach(obj, res->result()["results"].embeddedObject()) {
+ results.insert(make_pair(obj["dis"].Number(), obj.embeddedObject().getOwned()));
+ }
+
+ // TODO: maybe shrink results if size() > limit
+ }
+
+ result.append("ns" , fullns);
+ result.append("near", nearStr);
+
+ int outCount = 0;
+ double totalDistance = 0;
+ double maxDistance = 0;
+ {
+ BSONArrayBuilder sub (result.subarrayStart("results"));
+ for (multimap<double, BSONObj>::const_iterator it(results.begin()), end(results.end()); it!= end && outCount < limit; ++it, ++outCount) {
+ totalDistance += it->first;
+ maxDistance = it->first; // guaranteed to be highest so far
+
+ sub.append(it->second);
+ }
+ sub.done();
+ }
+
+ {
+ BSONObjBuilder sub (result.subobjStart("stats"));
+ sub.append("time", time);
+ sub.append("btreelocs", btreelocs);
+ sub.append("nscanned", nscanned);
+ sub.append("objectsLoaded", objectsLoaded);
+ sub.append("avgDistance", totalDistance / outCount);
+ sub.append("maxDistance", maxDistance);
+ sub.append("shards", shardArray.arr());
+ sub.done();
+ }
+
+ return true;
+ }
+ } geo2dFindNearCmd;
+
+ class MRCmd : public PublicGridCommand {
+ public:
+ AtomicUInt JOB_NUMBER;
+
+ MRCmd() : PublicGridCommand( "mapreduce" ) {}
+
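+        // builds a per-job temp collection name on each shard, e.g.
+        // "tmp.mrs.<coll>_<unixtime>_<jobnumber>" (rendered values vary per run)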
+ string getTmpName( const string& coll ) {
+ stringstream ss;
+ ss << "tmp.mrs." << coll << "_" << time(0) << "_" << JOB_NUMBER++;
+ return ss.str();
+ }
+
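+        // rewrites the client's m/r command for the shard phase: known-safe fields
+        // are copied through, "out" is redirected to the temp collection (with
+        // "finalize" deferred to the final pass), and any unrecognized field is
+        // reported back via badShardedField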
+ BSONObj fixForShards( const BSONObj& orig , const string& output , string& badShardedField , int maxChunkSizeBytes ) {
+ BSONObjBuilder b;
+ BSONObjIterator i( orig );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ string fn = e.fieldName();
+ if ( fn == "map" ||
+ fn == "mapreduce" ||
+ fn == "mapparams" ||
+ fn == "reduce" ||
+ fn == "query" ||
+ fn == "sort" ||
+ fn == "scope" ||
+ fn == "verbose" ) {
+ b.append( e );
+ }
+ else if ( fn == "out" ||
+ fn == "finalize" ) {
+ // we don't want to copy these
+ }
+ else {
+ badShardedField = fn;
+ return BSONObj();
+ }
+ }
+ b.append( "out" , output );
+
+ if ( maxChunkSizeBytes > 0 ) {
+ // will need to figure out chunks, ask shards for points
+ b.append("splitInfo", maxChunkSizeBytes);
+ }
+
+ return b.obj();
+ }
+
+ ChunkPtr insertSharded( ChunkManagerPtr manager, const char* ns, BSONObj& o, int flags, bool safe ) {
+ // note here, the MR output process requires no splitting / migration during process, hence StaleConfigException should not happen
+ Strategy* s = SHARDED;
+ ChunkPtr c = manager->findChunk( o );
+ LOG(4) << " server:" << c->getShard().toString() << " " << o << endl;
+ s->insert( c->getShard() , ns , o , flags, safe);
+ return c;
+ }
+
+ bool run(const string& dbName , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ return run( dbName, cmdObj, errmsg, result, 0 );
+ }
+
+ bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, int retry ) {
+ Timer t;
+
+ string collection = cmdObj.firstElement().valuestrsafe();
+ string fullns = dbName + "." + collection;
+
+ // Abort after two retries, m/r is an expensive operation
+ if( retry > 2 ){
+ errmsg = "shard version errors preventing parallel mapreduce, check logs for further info";
+ return false;
+ }
+ // Re-check shard version after 1st retry
+ if( retry > 0 ){
+ versionManager.forceRemoteCheckShardVersionCB( fullns );
+ }
+
+ const string shardResultCollection = getTmpName( collection );
+
+ BSONObj customOut;
+ string finalColShort;
+ string finalColLong;
+ bool customOutDB = false;
+ string outDB = dbName;
+ BSONElement outElmt = cmdObj.getField("out");
+ if (outElmt.type() == Object) {
+ // check if there is a custom output
+ BSONObj out = outElmt.embeddedObject();
+ customOut = out;
+ // mode must be 1st element
+ finalColShort = out.firstElement().str();
+ if (customOut.hasField( "db" )) {
+ customOutDB = true;
+ outDB = customOut.getField("db").str();
+ }
+ finalColLong = outDB + "." + finalColShort;
+ }
+
+ DBConfigPtr confIn = grid.getDBConfig( dbName , false );
+ DBConfigPtr confOut = confIn;
+ if (customOutDB) {
+ confOut = grid.getDBConfig( outDB , true );
+ }
+
+ bool shardedInput = confIn && confIn->isShardingEnabled() && confIn->isSharded( fullns );
+ bool shardedOutput = customOut.getBoolField("sharded");
+
+ if (!shardedOutput)
+                uassert( 15920 , "cannot output to a non-sharded collection when a sharded collection with the same name exists" , !confOut->isSharded(finalColLong) );
+ // should we also prevent going from non-sharded to sharded? during the transition client may see partial data
+
+ long long maxChunkSizeBytes = 0;
+ if (shardedOutput) {
+ // will need to figure out chunks, ask shards for points
+ maxChunkSizeBytes = cmdObj["maxChunkSizeBytes"].numberLong();
+ if ( maxChunkSizeBytes == 0 ) {
+ maxChunkSizeBytes = Chunk::MaxChunkSize;
+ }
+ }
+
+ // modify command to run on shards with output to tmp collection
+ string badShardedField;
+ assert( maxChunkSizeBytes < 0x7fffffff );
+ BSONObj shardedCommand = fixForShards( cmdObj , shardResultCollection , badShardedField, static_cast<int>(maxChunkSizeBytes) );
+
+ if ( ! shardedInput && ! shardedOutput && ! customOutDB ) {
+ LOG(1) << "simple MR, just passthrough" << endl;
+ return passthrough( confIn , cmdObj , result );
+ }
+
+ if ( badShardedField.size() ) {
+ errmsg = str::stream() << "unknown m/r field for sharding: " << badShardedField;
+ return false;
+ }
+
+ BSONObjBuilder timingBuilder;
+ BSONObj q;
+ if ( cmdObj["query"].type() == Object ) {
+ q = cmdObj["query"].embeddedObjectUserCheck();
+ }
+
+ set<Shard> shards;
+ set<ServerAndQuery> servers;
+ map<Shard,BSONObj> results;
+
+ BSONObjBuilder shardCountsB;
+ BSONObjBuilder aggCountsB;
+ map<string,long long> countsMap;
+ set< BSONObj > splitPts;
+
+ {
+ // take distributed lock to prevent split / migration
+ ConnectionString config = configServer.getConnectionString();
+ DistributedLock lockSetup( config , fullns );
+ dist_lock_try dlk;
+
+ if (shardedInput) {
+ try{
+ int tryc = 0;
+ while ( !dlk.got() ) {
+ dlk = dist_lock_try( &lockSetup , (string)"mr-parallel" );
+ if ( ! dlk.got() ) {
+ if ( ++tryc % 100 == 0 )
+ warning() << "the collection metadata could not be locked for mapreduce, already locked by " << dlk.other() << endl;
+ sleepmillis(100);
+ }
+ }
+ }
+ catch( LockException& e ){
+ errmsg = str::stream() << "error locking distributed lock for mapreduce " << causedBy( e );
+ return false;
+ }
+ }
+
+ try {
+ SHARDED->commandOp( dbName, shardedCommand, 0, fullns, q, results );
+ }
+ catch( DBException& e ){
+ e.addContext( str::stream() << "could not run map command on all shards for ns " << fullns << " and query " << q );
+ throw;
+ }
+
+ for ( map<Shard,BSONObj>::iterator i = results.begin(); i != results.end(); ++i ){
+
+ BSONObj mrResult = i->second;
+ string server = i->first.getConnString();
+
+ BSONObj counts = mrResult["counts"].embeddedObjectUserCheck();
+ shardCountsB.append( server , counts );
+ servers.insert( server );
+
+ // add up the counts for each shard
+ // some of them will be fixed later like output and reduce
+ BSONObjIterator j( counts );
+ while ( j.more() ) {
+ BSONElement temp = j.next();
+ countsMap[temp.fieldName()] += temp.numberLong();
+ }
+
+ if (mrResult.hasField("splitKeys")) {
+ BSONElement splitKeys = mrResult.getField("splitKeys");
+ vector<BSONElement> pts = splitKeys.Array();
+ for (vector<BSONElement>::iterator it = pts.begin(); it != pts.end(); ++it) {
+ splitPts.insert(it->Obj().getOwned());
+ }
+ }
+ }
+ }
+
+ // build the sharded finish command
+ BSONObjBuilder finalCmd;
+ finalCmd.append( "mapreduce.shardedfinish" , cmdObj );
+ finalCmd.append( "inputNS" , dbName + "." + shardResultCollection );
+
+ BSONObj shardCounts = shardCountsB.done();
+ finalCmd.append( "shardCounts" , shardCounts );
+ timingBuilder.append( "shardProcessing" , t.millis() );
+
+ for ( map<string,long long>::iterator i=countsMap.begin(); i!=countsMap.end(); i++ ) {
+ aggCountsB.append( i->first , i->second );
+ }
+ BSONObj aggCounts = aggCountsB.done();
+ finalCmd.append( "counts" , aggCounts );
+
+ Timer t2;
+ BSONObj singleResult;
+ bool ok = false;
+ long long reduceCount = 0;
+ long long outputCount = 0;
+ BSONObjBuilder postCountsB;
+
+ if (!shardedOutput) {
+ LOG(1) << "MR with single shard output, NS=" << finalColLong << " primary=" << confOut->getPrimary() << endl;
+ ShardConnection conn( confOut->getPrimary() , finalColLong );
+ ok = conn->runCommand( outDB , finalCmd.obj() , singleResult );
+
+ BSONObj counts = singleResult.getObjectField("counts");
+ postCountsB.append(conn->getServerAddress(), counts);
+ reduceCount = counts.getIntField("reduce");
+ outputCount = counts.getIntField("output");
+
+ conn.done();
+ } else {
+
+ LOG(1) << "MR with sharded output, NS=" << finalColLong << endl;
+
+ // create the sharded collection if needed
+ if (!confOut->isSharded(finalColLong)) {
+ // enable sharding on db
+ confOut->enableSharding();
+
+ // shard collection according to split points
+ BSONObj sortKey = BSON( "_id" << 1 );
+ vector<BSONObj> sortedSplitPts;
+ // points will be properly sorted using the set
+ for ( set<BSONObj>::iterator it = splitPts.begin() ; it != splitPts.end() ; ++it )
+ sortedSplitPts.push_back( *it );
+ confOut->shardCollection( finalColLong, sortKey, true, &sortedSplitPts );
+ }
+
+ map<BSONObj, int> chunkSizes;
+ {
+ // take distributed lock to prevent split / migration
+ ConnectionString config = configServer.getConnectionString();
+ DistributedLock lockSetup( config , finalColLong );
+ dist_lock_try dlk;
+
+ try{
+ int tryc = 0;
+ while ( !dlk.got() ) {
+ dlk = dist_lock_try( &lockSetup , (string)"mr-post-process" );
+ if ( ! dlk.got() ) {
+ if ( ++tryc % 100 == 0 )
+ warning() << "the collection metadata could not be locked for mapreduce, already locked by " << dlk.other() << endl;
+ sleepmillis(100);
+ }
+ }
+ }
+ catch( LockException& e ){
+ errmsg = str::stream() << "error locking distributed lock for mapreduce " << causedBy( e );
+ return false;
+ }
+
+ BSONObj finalCmdObj = finalCmd.obj();
+ results.clear();
+
+ try {
+ SHARDED->commandOp( outDB, finalCmdObj, 0, finalColLong, BSONObj(), results );
+ ok = true;
+ }
+ catch( DBException& e ){
+ e.addContext( str::stream() << "could not run final reduce command on all shards for ns " << fullns << ", output " << finalColLong );
+ throw;
+ }
+
+ for ( map<Shard,BSONObj>::iterator i = results.begin(); i != results.end(); ++i ){
+
+ string server = i->first.getConnString();
+ singleResult = i->second;
+
+ BSONObj counts = singleResult.getObjectField("counts");
+ reduceCount += counts.getIntField("reduce");
+ outputCount += counts.getIntField("output");
+ postCountsB.append(server, counts);
+
+ // get the size inserted for each chunk
+ // split cannot be called here since we already have the distributed lock
+ if (singleResult.hasField("chunkSizes")) {
+ vector<BSONElement> sizes = singleResult.getField("chunkSizes").Array();
+ for (unsigned int i = 0; i < sizes.size(); i += 2) {
+ BSONObj key = sizes[i].Obj().getOwned();
+ long long size = sizes[i+1].numberLong();
+ assert( size < 0x7fffffff );
+ chunkSizes[key] = static_cast<int>(size);
+ }
+ }
+ }
+ }
+
+ // do the splitting round
+ ChunkManagerPtr cm = confOut->getChunkManagerIfExists( finalColLong );
+ for ( map<BSONObj, int>::iterator it = chunkSizes.begin() ; it != chunkSizes.end() ; ++it ) {
+ BSONObj key = it->first;
+ int size = it->second;
+ assert( size < 0x7fffffff );
+
+ // key reported should be the chunk's minimum
+ ChunkPtr c = cm->findChunk(key);
+ if ( !c ) {
+ warning() << "Mongod reported " << size << " bytes inserted for key " << key << " but can't find chunk" << endl;
+ } else {
+ c->splitIfShould( size );
+ }
+ }
+ }
+
+ try {
+ // drop collections with tmp results on each shard
+ for ( set<ServerAndQuery>::iterator i=servers.begin(); i!=servers.end(); i++ ) {
+ ScopedDbConnection conn( i->_server );
+ conn->dropCollection( dbName + "." + shardResultCollection );
+ conn.done();
+ }
+        } catch ( std::exception& e ) {
+            log() << "Cannot clean up shard results" << causedBy( e ) << endl;
+ }
+
+ if ( ! ok ) {
+ errmsg = "final reduce failed: ";
+ errmsg += singleResult.toString();
+ return 0;
+ }
+
+ // copy some elements from a single result
+ // annoying that we have to copy all results for inline, but no way around it
+ if (singleResult.hasField("result"))
+ result.append(singleResult.getField("result"));
+ else if (singleResult.hasField("results"))
+ result.append(singleResult.getField("results"));
+
+ BSONObjBuilder countsB(32);
+ // input stat is determined by aggregate MR job
+ countsB.append("input", aggCounts.getField("input").numberLong());
+ countsB.append("emit", aggCounts.getField("emit").numberLong());
+
+ // reduce count is sum of all reduces that happened
+ countsB.append("reduce", aggCounts.getField("reduce").numberLong() + reduceCount);
+
+        // output is determined by post processing on each shard
+ countsB.append("output", outputCount);
+ result.append( "counts" , countsB.done() );
+
+ timingBuilder.append( "postProcessing" , t2.millis() );
+
+ result.append( "timeMillis" , t.millis() );
+ result.append( "timing" , timingBuilder.done() );
+ result.append("shardCounts", shardCounts);
+ result.append("postProcessCounts", postCountsB.done());
+ return 1;
+ }
+ } mrCmd;
+
+ class ApplyOpsCmd : public PublicGridCommand {
+ public:
+ ApplyOpsCmd() : PublicGridCommand( "applyOps" ) {}
+ virtual bool run(const string& dbName , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ errmsg = "applyOps not allowed through mongos";
+ return false;
+ }
+ } applyOpsCmd;
+
+
+ class CompactCmd : public PublicGridCommand {
+ public:
+ CompactCmd() : PublicGridCommand( "compact" ) {}
+ virtual bool run(const string& dbName , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ errmsg = "compact not allowed through mongos";
+ return false;
+ }
+ } compactCmd;
+
+
+ /*
+ Note these are in the pub_grid_cmds namespace, so they don't
+ conflict with those in db/commands/pipeline_command.cpp.
+ */
+ class PipelineCommand :
+ public PublicGridCommand {
+ public:
+ PipelineCommand();
+
+ // virtuals from Command
+ virtual bool run(const string &dbName , BSONObj &cmdObj,
+ int options, string &errmsg,
+ BSONObjBuilder &result, bool fromRepl);
+
+ private:
+
+ };
+
+
+ /* -------------------- PipelineCommand ----------------------------- */
+
+ static const PipelineCommand pipelineCommand;
+
+ PipelineCommand::PipelineCommand():
+ PublicGridCommand(Pipeline::commandName) {
+ }
+
+ bool PipelineCommand::run(const string &dbName , BSONObj &cmdObj,
+ int options, string &errmsg,
+ BSONObjBuilder &result, bool fromRepl) {
+ //const string shardedOutputCollection = getTmpName( collection );
+
+ intrusive_ptr<ExpressionContext> pCtx(
+ ExpressionContext::create());
+ pCtx->setInRouter(true);
+
+ /* parse the pipeline specification */
+ boost::shared_ptr<Pipeline> pPipeline(
+ Pipeline::parseCommand(errmsg, cmdObj, pCtx));
+ if (!pPipeline.get())
+ return false; // there was some parsing error
+
+ string fullns(dbName + "." + pPipeline->getCollectionName());
+
+ /*
+ If the system isn't running sharded, or the target collection
+ isn't sharded, pass this on to a mongod.
+ */
+ DBConfigPtr conf(grid.getDBConfig(dbName , false));
+ if (!conf || !conf->isShardingEnabled() || !conf->isSharded(fullns))
+ return passthrough(conf, cmdObj, result);
+
+ /* split the pipeline into pieces for mongods and this mongos */
+ boost::shared_ptr<Pipeline> pShardPipeline(
+ pPipeline->splitForSharded());
+
+ /* create the command for the shards */
+ BSONObjBuilder commandBuilder;
+ pShardPipeline->toBson(&commandBuilder);
+ BSONObj shardedCommand(commandBuilder.done());
+
+ BSONObjBuilder shardQueryBuilder;
+ BSONObjBuilder shardSortBuilder;
+ pShardPipeline->getCursorMods(
+ &shardQueryBuilder, &shardSortBuilder);
+ BSONObj shardQuery(shardQueryBuilder.done());
+ BSONObj shardSort(shardSortBuilder.done());
+
+ ChunkManagerPtr cm(conf->getChunkManager(fullns));
+ set<Shard> shards;
+ cm->getShardsForQuery(shards, shardQuery);
+
+ /*
+ From MRCmd::Run: "we need to use our connections to the shard
+ so filtering is done correctly for un-owned docs so we allocate
+ them in our thread and hand off"
+ */
+ vector<boost::shared_ptr<ShardConnection> > shardConns;
+ list<boost::shared_ptr<Future::CommandResult> > futures;
+ for (set<Shard>::iterator i=shards.begin(), end=shards.end();
+ i != end; i++) {
+ boost::shared_ptr<ShardConnection> temp(
+ new ShardConnection(i->getConnString(), fullns));
+ assert(temp->get());
+ futures.push_back(
+ Future::spawnCommand(i->getConnString(), dbName,
+ shardedCommand , 0, temp->get()));
+ shardConns.push_back(temp);
+ }
+
+ /* wrap the list of futures with a source */
+ intrusive_ptr<DocumentSourceCommandFutures> pSource(
+ DocumentSourceCommandFutures::create(errmsg, &futures));
+
+ /* run the pipeline */
+ bool failed = pPipeline->run(result, errmsg, pSource);
+
+/*
+ BSONObjBuilder shardresults;
+ for (list<boost::shared_ptr<Future::CommandResult> >::iterator i(
+ futures.begin()); i!=futures.end(); ++i) {
+ boost::shared_ptr<Future::CommandResult> res(*i);
+ if (!res->join()) {
+ error() << "sharded pipeline failed on shard: " <<
+ res->getServer() << " error: " << res->result() << endl;
+ result.append( "cause" , res->result() );
+ errmsg = "mongod pipeline failed: ";
+ errmsg += res->result().toString();
+ failed = true;
+ continue;
+ }
+
+ shardresults.append( res->getServer() , res->result() );
+ }
+*/
+
+ for(unsigned i = 0; i < shardConns.size(); ++i)
+ shardConns[i]->done();
+
+ if (failed && (errmsg.length() > 0))
+ return false;
+
+ return true;
+ }
+
+ } // namespace pub_grid_cmds
+
+ bool Command::runAgainstRegistered(const char *ns, BSONObj& jsobj, BSONObjBuilder& anObjBuilder, int queryOptions) {
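+        // commands arrive as queries against the "<db>.$cmd" pseudo-collection,
+        // so anything that doesn't match that namespace shape is not a command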
+ const char *p = strchr(ns, '.');
+ if ( !p ) return false;
+ if ( strcmp(p, ".$cmd") != 0 ) return false;
+
+ bool ok = false;
+
+ BSONElement e = jsobj.firstElement();
+ map<string,Command*>::iterator i;
+
+ if ( e.eoo() )
+ ;
+ // check for properly registered command objects.
+ else if ( (i = _commands->find(e.fieldName())) != _commands->end() ) {
+ string errmsg;
+ Command *c = i->second;
+ ClientInfo *client = ClientInfo::get();
+ AuthenticationInfo *ai = client->getAuthenticationInfo();
+
+ char cl[256];
+ nsToDatabase(ns, cl);
+ if( c->requiresAuth() && !ai->isAuthorizedForLock(cl, c->locktype())) {
+ ok = false;
+ errmsg = "unauthorized";
+ anObjBuilder.append( "note" , str::stream() << "need to authorized on db: " << cl << " for command: " << e.fieldName() );
+ }
+ else if( c->adminOnly() && c->localHostOnlyIfNoAuth( jsobj ) && noauth && !ai->isLocalHost ) {
+ ok = false;
+ errmsg = "unauthorized: this command must run from localhost when running db without auth";
+ log() << "command denied: " << jsobj.toString() << endl;
+ }
+ else if ( c->adminOnly() && !startsWith(ns, "admin.") ) {
+ ok = false;
+ errmsg = "access denied - use admin db";
+ }
+ else if ( jsobj.getBoolField( "help" ) ) {
+ stringstream help;
+ help << "help for: " << e.fieldName() << " ";
+ c->help( help );
+ anObjBuilder.append( "help" , help.str() );
+ }
+ else {
+ try {
+ ok = c->run( nsToDatabase( ns ) , jsobj, queryOptions, errmsg, anObjBuilder, false );
+ }
+ catch (DBException& e) {
+ int code = e.getCode();
+ if (code == RecvStaleConfigCode) { // code for StaleConfigException
+ throw;
+ }
+
+ {
+ stringstream ss;
+ ss << "exception: " << e.what();
+ anObjBuilder.append( "errmsg" , ss.str() );
+ anObjBuilder.append( "code" , code );
+ }
+ }
+ }
+
+ BSONObj tmp = anObjBuilder.asTempObj();
+ bool have_ok = tmp.hasField("ok");
+ bool have_errmsg = tmp.hasField("errmsg");
+
+ if (!have_ok)
+ anObjBuilder.append( "ok" , ok ? 1.0 : 0.0 );
+
+ if ( !ok && !have_errmsg) {
+ anObjBuilder.append("errmsg", errmsg);
+ uassert_nothrow(errmsg.c_str());
+ }
+ return true;
+ }
+
+ return false;
+ }
+
+} // namespace mongo
+
diff --git a/src/mongo/s/config.cpp b/src/mongo/s/config.cpp
new file mode 100644
index 00000000000..b4923b56a1f
--- /dev/null
+++ b/src/mongo/s/config.cpp
@@ -0,0 +1,879 @@
+// config.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../util/net/message.h"
+#include "../util/stringutils.h"
+#include "../util/unittest.h"
+#include "../client/connpool.h"
+#include "../client/model.h"
+#include "../db/pdfile.h"
+#include "../db/cmdline.h"
+
+#include "chunk.h"
+#include "config.h"
+#include "grid.h"
+#include "server.h"
+
+namespace mongo {
+
+ int ConfigServer::VERSION = 3;
+ Shard Shard::EMPTY;
+
+ string ShardNS::shard = "config.shards";
+ string ShardNS::database = "config.databases";
+ string ShardNS::collection = "config.collections";
+ string ShardNS::chunk = "config.chunks";
+
+ string ShardNS::mongos = "config.mongos";
+ string ShardNS::settings = "config.settings";
+
+ BSONField<bool> ShardFields::draining("draining");
+ BSONField<long long> ShardFields::maxSize ("maxSize");
+
+ OID serverID;
+
+ /* --- DBConfig --- */
+
+ DBConfig::CollectionInfo::CollectionInfo( const BSONObj& in ) {
+ _dirty = false;
+ _dropped = in["dropped"].trueValue();
+ if ( in["key"].isABSONObj() ) {
+ _key = in["key"].Obj().getOwned();
+ _unqiue = in["unique"].trueValue();
+ shard( in["_id"].String() , _key , _unqiue );
+ }
+ _dirty = false;
+ }
+
+
+ void DBConfig::CollectionInfo::shard( const string& ns , const ShardKeyPattern& key , bool unique ) {
+ _cm.reset( new ChunkManager( ns , key , unique ) );
+ _key = key.key().getOwned();
+        _unique = unique;
+ _dirty = true;
+ _dropped = false;
+ }
+
+ void DBConfig::CollectionInfo::unshard() {
+ _cm.reset();
+ _dropped = true;
+ _dirty = true;
+ _key = BSONObj();
+ }
+
+ void DBConfig::CollectionInfo::save( const string& ns , DBClientBase* conn ) {
+ BSONObj key = BSON( "_id" << ns );
+
+ BSONObjBuilder val;
+ val.append( "_id" , ns );
+ val.appendDate( "lastmod" , time(0) );
+ val.appendBool( "dropped" , _dropped );
+ if ( _cm )
+ _cm->getInfo( val );
+
+ conn->update( ShardNS::collection , key , val.obj() , true );
+ string err = conn->getLastError();
+ uassert( 13473 , (string)"failed to save collection (" + ns + "): " + err , err.size() == 0 );
+
+ _dirty = false;
+ }
+
+ bool DBConfig::isSharded( const string& ns ) {
+ if ( ! _shardingEnabled )
+ return false;
+ scoped_lock lk( _lock );
+ return _isSharded( ns );
+ }
+
+ bool DBConfig::_isSharded( const string& ns ) {
+ if ( ! _shardingEnabled )
+ return false;
+ Collections::iterator i = _collections.find( ns );
+ if ( i == _collections.end() )
+ return false;
+ return i->second.isSharded();
+ }
+
+ ShardPtr DBConfig::getShardIfExists( const string& ns ){
+ try{
+ // TODO: this function assumes the _primary will not change under-the-covers, but so does
+ // getShard() in general
+ return ShardPtr( new Shard( getShard( ns ) ) );
+ }
+ catch( AssertionException& e ){
+ warning() << "primary not found for " << ns << causedBy( e ) << endl;
+ return ShardPtr();
+ }
+ }
+
+ const Shard& DBConfig::getShard( const string& ns ) {
+ if ( isSharded( ns ) )
+ return Shard::EMPTY;
+
+ uassert( 10178 , "no primary!" , _primary.ok() );
+ return _primary;
+ }
+
+ void DBConfig::enableSharding() {
+ if ( _shardingEnabled )
+ return;
+
+ assert( _name != "config" );
+
+ scoped_lock lk( _lock );
+ _shardingEnabled = true;
+ _save();
+ }
+
+    /**
+     * Shard a collection: record the shard key, create the initial chunk(s) on the
+     * primary shard (honoring any initial split points / shards), and persist the
+     * new metadata to the config servers.
+     */
+ ChunkManagerPtr DBConfig::shardCollection( const string& ns , ShardKeyPattern fieldsAndOrder , bool unique , vector<BSONObj>* initPoints, vector<Shard>* initShards ) {
+ uassert( 8042 , "db doesn't have sharding enabled" , _shardingEnabled );
+ uassert( 13648 , str::stream() << "can't shard collection because not all config servers are up" , configServer.allUp() );
+
+
+ {
+ scoped_lock lk( _lock );
+
+ CollectionInfo& ci = _collections[ns];
+ uassert( 8043 , "collection already sharded" , ! ci.isSharded() );
+
+ log() << "enable sharding on: " << ns << " with shard key: " << fieldsAndOrder << endl;
+
+ ci.shard( ns , fieldsAndOrder , unique );
+ ChunkManagerPtr cm = ci.getCM();
+            uassert( 13449 , "collection already sharded" , (cm->numChunks() == 0) );
+
+ cm->createFirstChunks( getPrimary() , initPoints, initShards );
+ _save();
+ }
+
+ ChunkManagerPtr manager = getChunkManager(ns,true,true);
+
+        // Tell the primary mongod to refresh its data
+ // TODO: Think the real fix here is for mongos to just assume all collections sharded, when we get there
+ for( int i = 0; i < 4; i++ ){
+ if( i == 3 ){
+ warning() << "too many tries updating initial version of " << ns << " on shard primary " << getPrimary() <<
+ ", other mongoses may not see the collection as sharded immediately" << endl;
+ break;
+ }
+ try {
+ ShardConnection conn( getPrimary(), ns );
+ conn.setVersion();
+ conn.done();
+ break;
+ }
+ catch( DBException& e ){
+ warning() << "could not update initial version of " << ns << " on shard primary " << getPrimary() <<
+ causedBy( e ) << endl;
+ }
+ sleepsecs( i );
+ }
+
+ return manager;
+ }
+
+ bool DBConfig::removeSharding( const string& ns ) {
+ if ( ! _shardingEnabled ) {
+ return false;
+ }
+
+ scoped_lock lk( _lock );
+
+ Collections::iterator i = _collections.find( ns );
+
+ if ( i == _collections.end() )
+ return false;
+
+ CollectionInfo& ci = _collections[ns];
+ if ( ! ci.isSharded() )
+ return false;
+
+ ci.unshard();
+ _save( false, true );
+ return true;
+ }
+
+ ChunkManagerPtr DBConfig::getChunkManagerIfExists( const string& ns, bool shouldReload, bool forceReload ){
+ try{
+ return getChunkManager( ns, shouldReload, forceReload );
+ }
+ catch( AssertionException& e ){
+ warning() << "chunk manager not found for " << ns << causedBy( e ) << endl;
+ return ChunkManagerPtr();
+ }
+ }
+
+ ChunkManagerPtr DBConfig::getChunkManager( const string& ns , bool shouldReload, bool forceReload ) {
+ BSONObj key;
+ bool unique;
+ ShardChunkVersion oldVersion;
+
+ {
+ scoped_lock lk( _lock );
+
+ CollectionInfo& ci = _collections[ns];
+
+ bool earlyReload = ! ci.isSharded() && ( shouldReload || forceReload );
+ if ( earlyReload ) {
+                // this is to catch cases where this is a new sharded collection
+ _reload();
+ ci = _collections[ns];
+ }
+ massert( 10181 , (string)"not sharded:" + ns , ci.isSharded() );
+ assert( ! ci.key().isEmpty() );
+
+ if ( ! ( shouldReload || forceReload ) || earlyReload )
+ return ci.getCM();
+
+ key = ci.key().copy();
+ unique = ci.unique();
+ if ( ci.getCM() )
+ oldVersion = ci.getCM()->getVersion();
+ }
+
+ assert( ! key.isEmpty() );
+
+ BSONObj newest;
+ if ( oldVersion > 0 && ! forceReload ) {
+ ScopedDbConnection conn( configServer.modelServer() , 30.0 );
+ newest = conn->findOne( ShardNS::chunk ,
+ Query( BSON( "ns" << ns ) ).sort( "lastmod" , -1 ) );
+ conn.done();
+
+ if ( ! newest.isEmpty() ) {
+ ShardChunkVersion v = newest["lastmod"];
+ if ( v == oldVersion ) {
+ scoped_lock lk( _lock );
+ CollectionInfo& ci = _collections[ns];
+ massert( 15885 , str::stream() << "not sharded after reloading from chunks : " << ns , ci.isSharded() );
+ return ci.getCM();
+ }
+ }
+
+ }
+ else if( oldVersion == 0 ){
+ warning() << "version 0 found when " << ( forceReload ? "reloading" : "checking" ) << " chunk manager"
+ << ", collection '" << ns << "' initially detected as sharded" << endl;
+ }
+
+ // we are not locked now, and want to load a new ChunkManager
+
+ auto_ptr<ChunkManager> temp;
+
+ {
+ scoped_lock lll ( _hitConfigServerLock );
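+            // _hitConfigServerLock serializes reloads so that only one thread at a
+            // time pays the cost of hitting the config servers for this db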
+
+ if ( ! newest.isEmpty() && ! forceReload ) {
+ // if we have a target we're going for
+ // see if we've hit already
+
+ scoped_lock lk( _lock );
+ CollectionInfo& ci = _collections[ns];
+ if ( ci.isSharded() && ci.getCM() ) {
+ ShardChunkVersion currentVersion = newest["lastmod"];
+ if ( currentVersion == ci.getCM()->getVersion() ) {
+ return ci.getCM();
+ }
+ }
+
+ }
+
+ temp.reset( new ChunkManager( ns , key , unique ) );
+ if ( temp->numChunks() == 0 ) {
+ // maybe we're not sharded any more
+ reload(); // this is a full reload
+ return getChunkManager( ns , false );
+ }
+ }
+
+ scoped_lock lk( _lock );
+
+ CollectionInfo& ci = _collections[ns];
+ massert( 14822 , (string)"state changed in the middle: " + ns , ci.isSharded() );
+
+ bool forced = false;
+ if ( temp->getVersion() > ci.getCM()->getVersion() ||
+ (forced = (temp->getVersion() == ci.getCM()->getVersion() && forceReload ) ) ) {
+
+ if( forced ){
+ warning() << "chunk manager reload forced for collection '" << ns << "', config version is " << temp->getVersion() << endl;
+ }
+
+ // we only want to reset if we're newer or equal and forced
+ // otherwise we go into a bad cycle
+ ci.resetCM( temp.release() );
+ }
+
+ massert( 15883 , str::stream() << "not sharded after chunk manager reset : " << ns , ci.isSharded() );
+ return ci.getCM();
+ }
+
+ void DBConfig::setPrimary( string s ) {
+ scoped_lock lk( _lock );
+ _primary.reset( s );
+ _save();
+ }
+
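+    // a config.databases entry, as produced by serialize() below, looks like
+    // { _id : "<dbname>" , partitioned : <bool> , primary : "<shard name>" }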
+ void DBConfig::serialize(BSONObjBuilder& to) {
+ to.append("_id", _name);
+ to.appendBool("partitioned", _shardingEnabled );
+ to.append("primary", _primary.getName() );
+ }
+
+ void DBConfig::unserialize(const BSONObj& from) {
+ LOG(1) << "DBConfig unserialize: " << _name << " " << from << endl;
+ assert( _name == from["_id"].String() );
+
+ _shardingEnabled = from.getBoolField("partitioned");
+ _primary.reset( from.getStringField("primary") );
+
+ // In the 1.5.x series, we used to have collection metadata nested in the database entry. The 1.6.x series
+ // had migration code that ported that info to where it belongs now: the 'collections' collection. We now
+ // just assert that we're not migrating from a 1.5.x directly into a 1.7.x without first converting.
+ BSONObj sharded = from.getObjectField( "sharded" );
+ if ( ! sharded.isEmpty() )
+ uasserted( 13509 , "can't migrate from 1.5.x release to the current one; need to upgrade to 1.6.x first");
+ }
+
+ bool DBConfig::load() {
+ scoped_lock lk( _lock );
+ return _load();
+ }
+
+ bool DBConfig::_load() {
+ ScopedDbConnection conn( configServer.modelServer(), 30.0 );
+
+ BSONObj o = conn->findOne( ShardNS::database , BSON( "_id" << _name ) );
+
+ if ( o.isEmpty() ) {
+ conn.done();
+ return false;
+ }
+
+ unserialize( o );
+
+ BSONObjBuilder b;
+ b.appendRegex( "_id" , (string)"^" + _name + "\\." );
+
+ auto_ptr<DBClientCursor> cursor = conn->query( ShardNS::collection ,b.obj() );
+ assert( cursor.get() );
+ while ( cursor->more() ) {
+ BSONObj o = cursor->next();
+ if( o["dropped"].trueValue() ) _collections.erase( o["_id"].String() );
+ else _collections[o["_id"].String()] = CollectionInfo( o );
+ }
+
+ conn.done();
+
+ return true;
+ }
+
+ void DBConfig::_save( bool db, bool coll ) {
+ ScopedDbConnection conn( configServer.modelServer(), 30.0 );
+
+ if( db ){
+
+ BSONObj n;
+ {
+ BSONObjBuilder b;
+ serialize(b);
+ n = b.obj();
+ }
+
+ conn->update( ShardNS::database , BSON( "_id" << _name ) , n , true );
+ string err = conn->getLastError();
+ uassert( 13396 , (string)"DBConfig save failed: " + err , err.size() == 0 );
+
+ }
+
+ if( coll ){
+
+ for ( Collections::iterator i=_collections.begin(); i!=_collections.end(); ++i ) {
+ if ( ! i->second.isDirty() )
+ continue;
+ i->second.save( i->first , conn.get() );
+ }
+
+ }
+
+ conn.done();
+ }
+
+ bool DBConfig::reload() {
+ scoped_lock lk( _lock );
+ return _reload();
+ }
+
+ bool DBConfig::_reload() {
+        // TODO: I don't think this is 100% correct
+ return _load();
+ }
+
+ bool DBConfig::dropDatabase( string& errmsg ) {
+ /**
+ * 1) make sure everything is up
+ * 2) update config server
+ * 3) drop and reset sharded collections
+ * 4) drop and reset primary
+ * 5) drop everywhere to clean up loose ends
+ */
+
+ log() << "DBConfig::dropDatabase: " << _name << endl;
+ configServer.logChange( "dropDatabase.start" , _name , BSONObj() );
+
+ // 1
+ if ( ! configServer.allUp( errmsg ) ) {
+ LOG(1) << "\t DBConfig::dropDatabase not all up" << endl;
+ return 0;
+ }
+
+ // 2
+ grid.removeDB( _name );
+ {
+ ScopedDbConnection conn( configServer.modelServer(), 30.0 );
+ conn->remove( ShardNS::database , BSON( "_id" << _name ) );
+ errmsg = conn->getLastError();
+ if ( ! errmsg.empty() ) {
+ log() << "could not drop '" << _name << "': " << errmsg << endl;
+ conn.done();
+ return false;
+ }
+
+ conn.done();
+ }
+
+ if ( ! configServer.allUp( errmsg ) ) {
+ log() << "error removing from config server even after checking!" << endl;
+ return 0;
+ }
+ LOG(1) << "\t removed entry from config server for: " << _name << endl;
+
+ set<Shard> allServers;
+
+ // 3
+ while ( true ) {
+ int num = 0;
+ if ( ! _dropShardedCollections( num , allServers , errmsg ) )
+ return 0;
+ log() << " DBConfig::dropDatabase: " << _name << " dropped sharded collections: " << num << endl;
+ if ( num == 0 )
+ break;
+ }
+
+ // 4
+ {
+ ScopedDbConnection conn( _primary, 30.0 );
+ BSONObj res;
+ if ( ! conn->dropDatabase( _name , &res ) ) {
+ errmsg = res.toString();
+ return 0;
+ }
+ conn.done();
+ }
+
+ // 5
+ for ( set<Shard>::iterator i=allServers.begin(); i!=allServers.end(); i++ ) {
+ ScopedDbConnection conn( *i, 30.0 );
+ BSONObj res;
+ if ( ! conn->dropDatabase( _name , &res ) ) {
+ errmsg = res.toString();
+ return 0;
+ }
+ conn.done();
+ }
+
+ LOG(1) << "\t dropped primary db for: " << _name << endl;
+
+ configServer.logChange( "dropDatabase" , _name , BSONObj() );
+ return true;
+ }
+
+ bool DBConfig::_dropShardedCollections( int& num, set<Shard>& allServers , string& errmsg ) {
+ num = 0;
+ set<string> seen;
+ while ( true ) {
+ Collections::iterator i = _collections.begin();
+ for ( ; i != _collections.end(); ++i ) {
+ // log() << "coll : " << i->first << " and " << i->second.isSharded() << endl;
+ if ( i->second.isSharded() )
+ break;
+ }
+
+ if ( i == _collections.end() )
+ break;
+
+ if ( seen.count( i->first ) ) {
+ errmsg = "seen a collection twice!";
+ return false;
+ }
+
+ seen.insert( i->first );
+ LOG(1) << "\t dropping sharded collection: " << i->first << endl;
+
+ i->second.getCM()->getAllShards( allServers );
+ i->second.getCM()->drop( i->second.getCM() );
+ uassert( 10176 , str::stream() << "shard state missing for " << i->first , removeSharding( i->first ) );
+
+ num++;
+ uassert( 10184 , "_dropShardedCollections too many collections - bailing" , num < 100000 );
+ LOG(2) << "\t\t dropped " << num << " so far" << endl;
+ }
+
+ return true;
+ }
+
+ void DBConfig::getAllShards(set<Shard>& shards) const {
+ scoped_lock lk( _lock );
+ shards.insert(getPrimary());
+ for (Collections::const_iterator it(_collections.begin()), end(_collections.end()); it != end; ++it) {
+ if (it->second.isSharded()) {
+ it->second.getCM()->getAllShards(shards);
+ } // TODO: handle collections on non-primary shard
+ }
+ }
+
+ /* --- ConfigServer ---- */
+
+ ConfigServer::ConfigServer() : DBConfig( "config" ) {
+ _shardingEnabled = false;
+ }
+
+ ConfigServer::~ConfigServer() {
+ }
+
+ bool ConfigServer::init( string s ) {
+ vector<string> configdbs;
+ splitStringDelim( s, &configdbs, ',' );
+ return init( configdbs );
+ }
+
+ bool ConfigServer::init( vector<string> configHosts ) {
+
+ uassert( 10187 , "need configdbs" , configHosts.size() );
+
+ string hn = getHostName();
+ if ( hn.empty() ) {
+ sleepsecs(5);
+ dbexit( EXIT_BADOPTIONS );
+ }
+
+ set<string> hosts;
+ for ( size_t i=0; i<configHosts.size(); i++ ) {
+ string host = configHosts[i];
+ hosts.insert( getHost( host , false ) );
+ configHosts[i] = getHost( host , true );
+ }
+
+ for ( set<string>::iterator i=hosts.begin(); i!=hosts.end(); i++ ) {
+ string host = *i;
+ bool ok = false;
+ for ( int x=10; x>0; x-- ) {
+ if ( ! hostbyname( host.c_str() ).empty() ) {
+ ok = true;
+ break;
+ }
+ log() << "can't resolve DNS for [" << host << "] sleeping and trying " << x << " more times" << endl;
+ sleepsecs( 10 );
+ }
+ if ( ! ok )
+ return false;
+ }
+
+ _config = configHosts;
+
+ string fullString;
+ joinStringDelim( configHosts, &fullString, ',' );
+ _primary.setAddress( ConnectionString( fullString , ConnectionString::SYNC ) );
+ LOG(1) << " config string : " << fullString << endl;
+
+ return true;
+ }
+
+ bool ConfigServer::checkConfigServersConsistent( string& errmsg , int tries ) const {
+ if ( tries <= 0 )
+ return false;
+
+ unsigned firstGood = 0;
+ int up = 0;
+ vector<BSONObj> res;
+ for ( unsigned i=0; i<_config.size(); i++ ) {
+ BSONObj x;
+ try {
+ ScopedDbConnection conn( _config[i], 30.0 );
+
+ // check auth
+ conn->update("config.foo.bar", BSONObj(), BSON("x" << 1));
+ conn->simpleCommand( "admin", &x, "getlasterror");
+ if (x["err"].type() == String && x["err"].String() == "unauthorized") {
+ errmsg = "not authorized, did you start with --keyFile?";
+ return false;
+ }
+
+ if ( ! conn->simpleCommand( "config" , &x , "dbhash" ) )
+ x = BSONObj();
+ else {
+ x = x.getOwned();
+ if ( up == 0 )
+ firstGood = i;
+ up++;
+ }
+ conn.done();
+ }
+ catch ( SocketException& e ) {
+ warning() << " couldn't check on config server:" << _config[i] << " ok for now : " << e.toString() << endl;
+ }
+ res.push_back(x);
+ }
+
+ if ( _config.size() == 1 )
+ return true;
+
+ if ( up == 0 ) {
+ errmsg = "no config servers reachable";
+ return false;
+ }
+
+ if ( up == 1 ) {
+ log( LL_WARNING ) << "only 1 config server reachable, continuing" << endl;
+ return true;
+ }
+
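+        // compare each reachable server's "config" dbhash (specifically the hashes
+        // of the chunks and databases collections) against the first good one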
+ BSONObj base = res[firstGood];
+ for ( unsigned i=firstGood+1; i<res.size(); i++ ) {
+ if ( res[i].isEmpty() )
+ continue;
+
+ string c1 = base.getFieldDotted( "collections.chunks" );
+ string c2 = res[i].getFieldDotted( "collections.chunks" );
+
+ string d1 = base.getFieldDotted( "collections.databases" );
+ string d2 = res[i].getFieldDotted( "collections.databases" );
+
+ if ( c1 == c2 && d1 == d2 )
+ continue;
+
+ stringstream ss;
+ ss << "config servers " << _config[firstGood] << " and " << _config[i] << " differ";
+ log( LL_WARNING ) << ss.str();
+ if ( tries <= 1 ) {
+ ss << "\n" << c1 << "\t" << c2 << "\n" << d1 << "\t" << d2;
+ errmsg = ss.str();
+ return false;
+ }
+
+ return checkConfigServersConsistent( errmsg , tries - 1 );
+ }
+
+ return true;
+ }
+
+ bool ConfigServer::ok( bool checkConsistency ) {
+ if ( ! _primary.ok() )
+ return false;
+
+ if ( checkConsistency ) {
+ string errmsg;
+ if ( ! checkConfigServersConsistent( errmsg ) ) {
+ log( LL_ERROR ) << "config servers not in sync! " << errmsg << warnings;
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ bool ConfigServer::allUp() {
+ string errmsg;
+ return allUp( errmsg );
+ }
+
+ bool ConfigServer::allUp( string& errmsg ) {
+ try {
+ ScopedDbConnection conn( _primary, 30.0 );
+ conn->getLastError();
+ conn.done();
+ return true;
+ }
+ catch ( DBException& ) {
+ log() << "ConfigServer::allUp : " << _primary.toString() << " seems down!" << endl;
+ errmsg = _primary.toString() + " seems down";
+ return false;
+ }
+
+ }
+
+ int ConfigServer::dbConfigVersion() {
+ ScopedDbConnection conn( _primary, 30.0 );
+ int version = dbConfigVersion( conn.conn() );
+ conn.done();
+ return version;
+ }
+
+ int ConfigServer::dbConfigVersion( DBClientBase& conn ) {
+ auto_ptr<DBClientCursor> c = conn.query( "config.version" , BSONObj() );
+ int version = 0;
+ if ( c->more() ) {
+ BSONObj o = c->next();
+ version = o["version"].numberInt();
+ uassert( 10189 , "should only have 1 thing in config.version" , ! c->more() );
+ }
+ else {
+ if ( conn.count( ShardNS::shard ) || conn.count( ShardNS::database ) ) {
+ version = 1;
+ }
+ }
+
+ return version;
+ }
+
+ void ConfigServer::reloadSettings() {
+ set<string> got;
+
+ ScopedDbConnection conn( _primary, 30.0 );
+ auto_ptr<DBClientCursor> c = conn->query( ShardNS::settings , BSONObj() );
+ assert( c.get() );
+ while ( c->more() ) {
+ BSONObj o = c->next();
+ string name = o["_id"].valuestrsafe();
+ got.insert( name );
+ if ( name == "chunksize" ) {
+ int csize = o["value"].numberInt();
+
+ // validate chunksize before proceeding
+ if ( csize == 0 ) {
+ // setting was not modified; mark as such
+ got.erase(name);
+ log() << "warning: invalid chunksize (" << csize << ") ignored" << endl;
+ } else {
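+                    // the settings document stores the chunk size in MB; convert to bytes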
+ LOG(1) << "MaxChunkSize: " << csize << endl;
+ Chunk::MaxChunkSize = csize * 1024 * 1024;
+ }
+ }
+ else if ( name == "balancer" ) {
+ // ones we ignore here
+ }
+ else {
+ log() << "warning: unknown setting [" << name << "]" << endl;
+ }
+ }
+
+ if ( ! got.count( "chunksize" ) ) {
+ conn->insert( ShardNS::settings , BSON( "_id" << "chunksize" <<
+ "value" << (Chunk::MaxChunkSize / ( 1024 * 1024 ) ) ) );
+ }
+
+
+ // indexes
+ try {
+ conn->ensureIndex( ShardNS::chunk , BSON( "ns" << 1 << "min" << 1 ) , true );
+ conn->ensureIndex( ShardNS::chunk , BSON( "ns" << 1 << "shard" << 1 << "min" << 1 ) , true );
+ conn->ensureIndex( ShardNS::chunk , BSON( "ns" << 1 << "lastmod" << 1 ) , true );
+ conn->ensureIndex( ShardNS::shard , BSON( "host" << 1 ) , true );
+ }
+ catch ( std::exception& e ) {
+ log( LL_WARNING ) << "couldn't create indexes on config db: " << e.what() << endl;
+ }
+
+ conn.done();
+ }
+
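+    // e.g. getHost( "cfg1" , true ) -> "cfg1:<CmdLine::ConfigServerPort>" and
+    // getHost( "cfg1:27019" , false ) -> "cfg1" (host names here are illustrative)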
+ string ConfigServer::getHost( string name , bool withPort ) {
+ if ( name.find( ":" ) != string::npos ) {
+ if ( withPort )
+ return name;
+ return name.substr( 0 , name.find( ":" ) );
+ }
+
+ if ( withPort ) {
+ stringstream ss;
+ ss << name << ":" << CmdLine::ConfigServerPort;
+ return ss.str();
+ }
+
+ return name;
+ }
+
+ /* must never throw */
+ void ConfigServer::logChange( const string& what , const string& ns , const BSONObj& detail ) {
+ string changeID;
+
+ try {
+ // get this entry's ID so we can use on the exception code path too
+ stringstream id;
+ static AtomicUInt num;
+ id << getHostNameCached() << "-" << terseCurrentTime() << "-" << num++;
+ changeID = id.str();
+
+ // send a copy of the message to the log in case it doesn't manage to reach config.changelog
+ Client* c = currentClient.get();
+ BSONObj msg = BSON( "_id" << changeID << "server" << getHostNameCached() << "clientAddr" << (c ? c->clientAddress(true) : "N/A")
+ << "time" << DATENOW << "what" << what << "ns" << ns << "details" << detail );
+ log() << "about to log metadata event: " << msg << endl;
+
+ assert( _primary.ok() );
+
+ ScopedDbConnection conn( _primary, 30.0 );
+
+ static bool createdCapped = false;
+ if ( ! createdCapped ) {
+ try {
+ conn->createCollection( "config.changelog" , 1024 * 1024 * 10 , true );
+ }
+ catch ( UserException& e ) {
+ LOG(1) << "couldn't create changelog (like race condition): " << e << endl;
+ // don't care
+ }
+ createdCapped = true;
+ }
+
+ conn->insert( "config.changelog" , msg );
+
+ conn.done();
+
+ }
+
+ catch ( std::exception& e ) {
+ // if we got here, it means the config change is only in the log; it didn't make it to config.changelog
+ log() << "not logging config change: " << changeID << " " << e.what() << endl;
+ }
+ }
+
+ void ConfigServer::replicaSetChange( const ReplicaSetMonitor * monitor ) {
+ try {
+ Shard s = Shard::lookupRSName(monitor->getName());
+ if (s == Shard::EMPTY) {
+ log(1) << "replicaSetChange: shard not found for set: " << monitor->getServerAddress() << endl;
+ return;
+ }
+ ScopedDbConnection conn( configServer.getConnectionString(), 30.0 );
+ conn->update( ShardNS::shard , BSON( "_id" << s.getName() ) , BSON( "$set" << BSON( "host" << monitor->getServerAddress() ) ) );
+ conn.done();
+ }
+ catch ( DBException & ) {
+ error() << "RSChangeWatcher: could not update config db for set: " << monitor->getName() << " to: " << monitor->getServerAddress() << endl;
+ }
+ }
+
+ DBConfigPtr configServerPtr (new ConfigServer());
+ ConfigServer& configServer = dynamic_cast<ConfigServer&>(*configServerPtr);
+
+}
diff --git a/src/mongo/s/config.h b/src/mongo/s/config.h
new file mode 100644
index 00000000000..650371c4fa8
--- /dev/null
+++ b/src/mongo/s/config.h
@@ -0,0 +1,268 @@
+// config.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* This file is things related to the "grid configuration":
+ - what machines make up the db component of our cloud
+ - where various ranges of things live
+*/
+
+#pragma once
+
+#include "../db/namespace.h"
+#include "../client/dbclient.h"
+#include "../client/model.h"
+
+#include "chunk.h"
+#include "shard.h"
+#include "shardkey.h"
+
+namespace mongo {
+
+ struct ShardNS {
+ static string shard;
+
+ static string database;
+ static string collection;
+ static string chunk;
+
+ static string mongos;
+ static string settings;
+ };
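+    // (the namespace strings above, e.g. ShardNS::chunk == "config.chunks", are
+    // defined in config.cpp)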
+
+ /**
+ * Field names used in the 'shards' collection.
+ */
+ struct ShardFields {
+ static BSONField<bool> draining; // is it draining chunks?
+ static BSONField<long long> maxSize; // max allowed disk space usage
+ };
+
+ class ConfigServer;
+
+ class DBConfig;
+ typedef boost::shared_ptr<DBConfig> DBConfigPtr;
+ typedef shared_ptr<Shard> ShardPtr;
+
+ extern DBConfigPtr configServerPtr;
+ extern ConfigServer& configServer;
+
+ /**
+ * top level configuration for a database
+ */
+ class DBConfig {
+
+ struct CollectionInfo {
+ CollectionInfo() {
+ _dirty = false;
+ _dropped = false;
+ }
+
+ CollectionInfo( const BSONObj& in );
+
+ bool isSharded() const {
+ return _cm.get();
+ }
+
+ ChunkManagerPtr getCM() const {
+ return _cm;
+ }
+
+ void resetCM( ChunkManager * cm ) {
+ assert(cm);
+ assert(_cm); // this has to be already sharded
+ _cm.reset( cm );
+ }
+
+ void shard( const string& ns , const ShardKeyPattern& key , bool unique );
+ void unshard();
+
+ bool isDirty() const { return _dirty; }
+ bool wasDropped() const { return _dropped; }
+
+ void save( const string& ns , DBClientBase* conn );
+
+            bool unique() const { return _unique; }
+ BSONObj key() const { return _key; }
+
+
+ private:
+ BSONObj _key;
+            bool _unique;
+ ChunkManagerPtr _cm;
+ bool _dirty;
+ bool _dropped;
+ };
+
+ typedef map<string,CollectionInfo> Collections;
+
+ public:
+
+ DBConfig( string name )
+ : _name( name ) ,
+ _primary("config","") ,
+ _shardingEnabled(false),
+ _lock("DBConfig") ,
+ _hitConfigServerLock( "DBConfig::_hitConfigServerLock" ) {
+ assert( name.size() );
+ }
+ virtual ~DBConfig() {}
+
+ string getName() { return _name; };
+
+ /**
+ * @return if anything in this db is partitioned or not
+ */
+ bool isShardingEnabled() {
+ return _shardingEnabled;
+ }
+
+ void enableSharding();
+ ChunkManagerPtr shardCollection( const string& ns , ShardKeyPattern fieldsAndOrder , bool unique , vector<BSONObj>* initPoints=0, vector<Shard>* initShards=0 );
+
+ /**
+ @return true if there was sharding info to remove
+ */
+ bool removeSharding( const string& ns );
+
+ /**
+ * @return whether or not the 'ns' collection is partitioned
+ */
+ bool isSharded( const string& ns );
+
+ ChunkManagerPtr getChunkManager( const string& ns , bool reload = false, bool forceReload = false );
+ ChunkManagerPtr getChunkManagerIfExists( const string& ns , bool reload = false, bool forceReload = false );
+
+ const Shard& getShard( const string& ns );
+ /**
+         * @return the correct shard for the ns
+ * if the namespace is sharded, will return NULL
+ */
+ ShardPtr getShardIfExists( const string& ns );
+
+ const Shard& getPrimary() const {
+ uassert( 8041 , (string)"no primary shard configured for db: " + _name , _primary.ok() );
+ return _primary;
+ }
+
+ void setPrimary( string s );
+
+ bool load();
+ bool reload();
+
+ bool dropDatabase( string& errmsg );
+
+ // model stuff
+
+ // lockless loading
+ void serialize(BSONObjBuilder& to);
+
+ void unserialize(const BSONObj& from);
+
+ void getAllShards(set<Shard>& shards) const;
+
+ protected:
+
+ /**
+ lockless
+ */
+ bool _isSharded( const string& ns );
+
+ bool _dropShardedCollections( int& num, set<Shard>& allServers , string& errmsg );
+
+ bool _load();
+ bool _reload();
+ void _save( bool db = true, bool coll = true );
+
+ string _name; // e.g. "alleyinsider"
+ Shard _primary; // e.g. localhost , mongo.foo.com:9999
+ bool _shardingEnabled;
+
+ //map<string,CollectionInfo> _sharded; // { "alleyinsider.blog.posts" : { ts : 1 } , ... ] - all ns that are sharded
+ //map<string,ChunkManagerPtr> _shards; // this will only have entries for things that have been looked at
+
+ Collections _collections;
+
+ mutable mongo::mutex _lock; // TODO: change to r/w lock ??
+ mutable mongo::mutex _hitConfigServerLock;
+ };
+
+ class ConfigServer : public DBConfig {
+ public:
+
+ ConfigServer();
+ ~ConfigServer();
+
+ bool ok( bool checkConsistency = false );
+
+ virtual string modelServer() {
+ uassert( 10190 , "ConfigServer not setup" , _primary.ok() );
+ return _primary.getConnString();
+ }
+
+ /**
+           call at startup; this will initiate the connection to the grid db
+ */
+ bool init( vector<string> configHosts );
+
+ bool init( string s );
+
+ bool allUp();
+ bool allUp( string& errmsg );
+
+ int dbConfigVersion();
+ int dbConfigVersion( DBClientBase& conn );
+
+ void reloadSettings();
+
+ /**
+ * @return 0 = ok, otherwise error #
+ */
+ int checkConfigVersion( bool upgrade );
+
+ /**
+ * Create a metadata change log entry in the config.changelog collection.
+ *
+ * @param what e.g. "split" , "migrate"
+ * @param ns to which collection the metadata change is being applied
+         * @param detail additional info about the metadata change
+ *
+ * This call is guaranteed never to throw.
+ */
+ void logChange( const string& what , const string& ns , const BSONObj& detail = BSONObj() );
+
+ ConnectionString getConnectionString() const {
+ return ConnectionString( _primary.getConnString() , ConnectionString::SYNC );
+ }
+
+ void replicaSetChange( const ReplicaSetMonitor * monitor );
+
+ static int VERSION;
+
+
+ /**
+ * check to see if all config servers have the same state
+         * will retry up to 'tries' times to make sure we're not catching the servers in a transient bad state
+ */
+ bool checkConfigServersConsistent( string& errmsg , int tries = 4 ) const;
+
+ private:
+ string getHost( string name , bool withPort );
+ vector<string> _config;
+ };
+
+} // namespace mongo
diff --git a/src/mongo/s/config_migrate.cpp b/src/mongo/s/config_migrate.cpp
new file mode 100644
index 00000000000..fff023cfb5b
--- /dev/null
+++ b/src/mongo/s/config_migrate.cpp
@@ -0,0 +1,196 @@
+// config_migrate.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../util/net/message.h"
+#include "../util/unittest.h"
+#include "../client/connpool.h"
+#include "../client/model.h"
+#include "../db/pdfile.h"
+#include "../db/cmdline.h"
+
+#include "server.h"
+#include "config.h"
+#include "chunk.h"
+
+namespace mongo {
+
+ int ConfigServer::checkConfigVersion( bool upgrade ) {
+ int cur = dbConfigVersion();
+ if ( cur == VERSION )
+ return 0;
+
+ if ( cur == 0 ) {
+ ScopedDbConnection conn( _primary );
+ conn->insert( "config.version" , BSON( "_id" << 1 << "version" << VERSION ) );
+ pool.flush();
+ assert( VERSION == dbConfigVersion( conn.conn() ) );
+ conn.done();
+ return 0;
+ }
+
+ if ( cur == 2 ) {
+
+ // need to upgrade
+ assert( VERSION == 3 );
+ if ( ! upgrade ) {
+ log() << "newer version of mongo meta data\n"
+ << "need to --upgrade after shutting all mongos down"
+ << endl;
+ return -9;
+ }
+
+ ScopedDbConnection conn( _primary );
+
+ // do a backup
+ string backupName;
+ {
+ stringstream ss;
+ ss << "config-backup-" << terseCurrentTime(false);
+ backupName = ss.str();
+ }
+ log() << "backing up config to: " << backupName << endl;
+ conn->copyDatabase( "config" , backupName );
+
+ map<string,string> hostToShard;
+ set<string> shards;
+ // shards
+ {
+ unsigned n = 0;
+ auto_ptr<DBClientCursor> c = conn->query( ShardNS::shard , BSONObj() );
+ while ( c->more() ) {
+ BSONObj o = c->next();
+ string host = o["host"].String();
+
+ string name = "";
+
+ BSONElement id = o["_id"];
+ if ( id.type() == String ) {
+ name = id.String();
+ }
+ else {
+ stringstream ss;
+ ss << "shard" << hostToShard.size();
+ name = ss.str();
+ }
+
+ hostToShard[host] = name;
+ shards.insert( name );
+ n++;
+ }
+
+ assert( n == hostToShard.size() );
+ assert( n == shards.size() );
+
+ conn->remove( ShardNS::shard , BSONObj() );
+
+ for ( map<string,string>::iterator i=hostToShard.begin(); i != hostToShard.end(); i++ ) {
+ conn->insert( ShardNS::shard , BSON( "_id" << i->second << "host" << i->first ) );
+ }
+ }
+
+ // databases
+ {
+ auto_ptr<DBClientCursor> c = conn->query( ShardNS::database , BSONObj() );
+ map<string,BSONObj> newDBs;
+ unsigned n = 0;
+ while ( c->more() ) {
+ BSONObj old = c->next();
+ n++;
+
+ if ( old["name"].eoo() ) {
+ // already done
+ newDBs[old["_id"].String()] = old;
+ continue;
+ }
+
+ BSONObjBuilder b(old.objsize());
+ b.appendAs( old["name"] , "_id" );
+
+ BSONObjIterator i(old);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( strcmp( "_id" , e.fieldName() ) == 0 ||
+ strcmp( "name" , e.fieldName() ) == 0 ) {
+ continue;
+ }
+
+ b.append( e );
+ }
+
+ BSONObj x = b.obj();
+ log() << old << "\n\t" << x << endl;
+ newDBs[old["name"].String()] = x;
+ }
+
+ assert( n == newDBs.size() );
+
+ conn->remove( ShardNS::database , BSONObj() );
+
+ for ( map<string,BSONObj>::iterator i=newDBs.begin(); i!=newDBs.end(); i++ ) {
+ conn->insert( ShardNS::database , i->second );
+ }
+
+ }
+
+ // chunks
+ {
+ unsigned num = 0;
+ map<string,BSONObj> chunks;
+ auto_ptr<DBClientCursor> c = conn->query( ShardNS::chunk , BSONObj() );
+ while ( c->more() ) {
+ BSONObj x = c->next();
+ BSONObjBuilder b;
+
+ string id = Chunk::genID( x["ns"].String() , x["min"].Obj() );
+ b.append( "_id" , id );
+
+ BSONObjIterator i(x);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( strcmp( e.fieldName() , "_id" ) == 0 )
+ continue;
+ b.append( e );
+ }
+
+ BSONObj n = b.obj();
+ log() << x << "\n\t" << n << endl;
+ chunks[id] = n;
+ num++;
+ }
+
+ assert( num == chunks.size() );
+
+ conn->remove( ShardNS::chunk , BSONObj() );
+ for ( map<string,BSONObj>::iterator i=chunks.begin(); i!=chunks.end(); i++ ) {
+ conn->insert( ShardNS::chunk , i->second );
+ }
+
+ }
+
+ conn->update( "config.version" , BSONObj() , BSON( "_id" << 1 << "version" << VERSION ) );
+ conn.done();
+ pool.flush();
+ return 1;
+ }
+
+ log() << "don't know how to upgrade " << cur << " to " << VERSION << endl;
+ return -8;
+ }
+
+}
diff --git a/src/mongo/s/cursors.cpp b/src/mongo/s/cursors.cpp
new file mode 100644
index 00000000000..241c2cfdb8d
--- /dev/null
+++ b/src/mongo/s/cursors.cpp
@@ -0,0 +1,316 @@
+// cursors.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "pch.h"
+#include "cursors.h"
+#include "../client/connpool.h"
+#include "../db/queryutil.h"
+#include "../db/commands.h"
+#include "../util/concurrency/task.h"
+#include "../util/net/listen.h"
+
+namespace mongo {
+
+ // -------- ShardedCursor -----------
+
+ ShardedClientCursor::ShardedClientCursor( QueryMessage& q , ClusteredCursor * cursor ) {
+ assert( cursor );
+ _cursor = cursor;
+
+ _skip = q.ntoskip;
+ _ntoreturn = q.ntoreturn;
+
+ _totalSent = 0;
+ _done = false;
+
+ _id = 0;
+
+ if ( q.queryOptions & QueryOption_NoCursorTimeout ) {
+ _lastAccessMillis = 0;
+ }
+ else
+ _lastAccessMillis = Listener::getElapsedTimeMillis();
+ }
+
+ ShardedClientCursor::~ShardedClientCursor() {
+ assert( _cursor );
+ delete _cursor;
+ _cursor = 0;
+ }
+
+ long long ShardedClientCursor::getId() {
+ if ( _id <= 0 ) {
+ _id = cursorCache.genId();
+ assert( _id >= 0 );
+ }
+ return _id;
+ }
+
+ void ShardedClientCursor::accessed() {
+ if ( _lastAccessMillis > 0 )
+ _lastAccessMillis = Listener::getElapsedTimeMillis();
+ }
+
+ long long ShardedClientCursor::idleTime( long long now ) {
+ if ( _lastAccessMillis == 0 )
+ return 0;
+ return now - _lastAccessMillis;
+ }
+
+ bool ShardedClientCursor::sendNextBatch( Request& r , int ntoreturn ) {
+ uassert( 10191 , "cursor already done" , ! _done );
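+        // ntoreturn follows the wire protocol convention: 0 means no limit, a
+        // positive value is a soft per-batch limit, and a negative value is a
+        // hard limit on the total number of documents to return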
+
+ int maxSize = 1024 * 1024;
+ if ( _totalSent > 0 )
+ maxSize *= 3;
+
+ BufBuilder b(32768);
+
+ int num = 0;
+ bool sendMore = true;
+
+ while ( _cursor->more() ) {
+ BSONObj o = _cursor->next();
+
+ b.appendBuf( (void*)o.objdata() , o.objsize() );
+ num++;
+
+ if ( b.len() > maxSize ) {
+ break;
+ }
+
+ if ( num == ntoreturn ) {
+ // soft limit aka batch size
+ break;
+ }
+
+ if ( ntoreturn != 0 && ( -1 * num + _totalSent ) == ntoreturn ) {
+ // hard limit - total to send
+ sendMore = false;
+ break;
+ }
+
+ if ( ntoreturn == 0 && _totalSent == 0 && num > 100 ) {
+ // first batch should be max 100 unless batch size specified
+ break;
+ }
+ }
+
+ bool hasMore = sendMore && _cursor->more();
+ LOG(6) << "\t hasMore:" << hasMore << " wouldSendMoreIfHad: " << sendMore << " id:" << getId() << " totalSent: " << _totalSent << endl;
+
+ replyToQuery( 0 , r.p() , r.m() , b.buf() , b.len() , num , _totalSent , hasMore ? getId() : 0 );
+ _totalSent += num;
+ _done = ! hasMore;
+
+ return hasMore;
+ }
+
+ // ---- CursorCache -----
+
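+    // idle cursor timeout in milliseconds (600000 ms = 10 minutes)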
+ long long CursorCache::TIMEOUT = 600000;
+
+ CursorCache::CursorCache()
+ :_mutex( "CursorCache" ), _shardedTotal(0) {
+ }
+
+ CursorCache::~CursorCache() {
+ // TODO: delete old cursors?
+ bool print = logLevel > 0;
+ if ( _cursors.size() || _refs.size() )
+ print = true;
+
+ if ( print )
+ cout << " CursorCache at shutdown - "
+ << " sharded: " << _cursors.size()
+ << " passthrough: " << _refs.size()
+ << endl;
+ }
+
+ ShardedClientCursorPtr CursorCache::get( long long id ) const {
+ LOG(_myLogLevel) << "CursorCache::get id: " << id << endl;
+ scoped_lock lk( _mutex );
+ MapSharded::const_iterator i = _cursors.find( id );
+ if ( i == _cursors.end() ) {
+ OCCASIONALLY log() << "Sharded CursorCache missing cursor id: " << id << endl;
+ return ShardedClientCursorPtr();
+ }
+ i->second->accessed();
+ return i->second;
+ }
+
+ void CursorCache::store( ShardedClientCursorPtr cursor ) {
+ LOG(_myLogLevel) << "CursorCache::store cursor " << " id: " << cursor->getId() << endl;
+ assert( cursor->getId() );
+ scoped_lock lk( _mutex );
+ _cursors[cursor->getId()] = cursor;
+ _shardedTotal++;
+ }
+ void CursorCache::remove( long long id ) {
+ assert( id );
+ scoped_lock lk( _mutex );
+ _cursors.erase( id );
+ }
+
+ void CursorCache::storeRef( const string& server , long long id ) {
+ LOG(_myLogLevel) << "CursorCache::storeRef server: " << server << " id: " << id << endl;
+ assert( id );
+ scoped_lock lk( _mutex );
+ _refs[id] = server;
+ }
+
+ string CursorCache::getRef( long long id ) const {
+ LOG(_myLogLevel) << "CursorCache::getRef id: " << id << endl;
+ assert( id );
+ scoped_lock lk( _mutex );
+ MapNormal::const_iterator i = _refs.find( id );
+ if ( i == _refs.end() )
+ return "";
+ return i->second;
+ }
+
+
+ long long CursorCache::genId() {
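+        // generate a positive id that is not already in use by either a sharded
+        // cursor or a passthrough ref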
+ while ( true ) {
+ long long x = Security::getNonce();
+ if ( x == 0 )
+ continue;
+ if ( x < 0 )
+ x *= -1;
+
+ scoped_lock lk( _mutex );
+ MapSharded::iterator i = _cursors.find( x );
+ if ( i != _cursors.end() )
+ continue;
+
+ MapNormal::iterator j = _refs.find( x );
+ if ( j != _refs.end() )
+ continue;
+
+ return x;
+ }
+ }
+
+ void CursorCache::gotKillCursors(Message& m ) {
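+        // OP_KILL_CURSORS body: int32 reserved (zero), int32 numberOfCursorIDs,
+        // then numberOfCursorIDs int64 cursor ids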
+ int *x = (int *) m.singleData()->_data;
+ x++; // reserved
+ int n = *x++;
+
+ if ( n > 2000 ) {
+ log( n < 30000 ? LL_WARNING : LL_ERROR ) << "receivedKillCursors, n=" << n << endl;
+ }
+
+
+ uassert( 13286 , "sent 0 cursors to kill" , n >= 1 );
+ uassert( 13287 , "too many cursors to kill" , n < 30000 );
+
+ long long * cursors = (long long *)x;
+ for ( int i=0; i<n; i++ ) {
+ long long id = cursors[i];
+ LOG(_myLogLevel) << "CursorCache::gotKillCursors id: " << id << endl;
+
+ if ( ! id ) {
+ log( LL_WARNING ) << " got cursor id of 0 to kill" << endl;
+ continue;
+ }
+
+ string server;
+ {
+ scoped_lock lk( _mutex );
+
+ MapSharded::iterator i = _cursors.find( id );
+ if ( i != _cursors.end() ) {
+ _cursors.erase( i );
+ continue;
+ }
+
+ MapNormal::iterator j = _refs.find( id );
+ if ( j == _refs.end() ) {
+ log( LL_WARNING ) << "can't find cursor: " << id << endl;
+ continue;
+ }
+ server = j->second;
+ _refs.erase( j );
+ }
+
+ LOG(_myLogLevel) << "CursorCache::found gotKillCursors id: " << id << " server: " << server << endl;
+
+ assert( server.size() );
+ ScopedDbConnection conn( server );
+ conn->killCursor( id );
+ conn.done();
+ }
+ }
+
+ void CursorCache::appendInfo( BSONObjBuilder& result ) const {
+ scoped_lock lk( _mutex );
+ result.append( "sharded" , (int)_cursors.size() );
+ result.appendNumber( "shardedEver" , _shardedTotal );
+ result.append( "refs" , (int)_refs.size() );
+ result.append( "totalOpen" , (int)(_cursors.size() + _refs.size() ) );
+ }
+
+ void CursorCache::doTimeouts() {
+ long long now = Listener::getElapsedTimeMillis();
+ scoped_lock lk( _mutex );
+ for ( MapSharded::iterator i=_cursors.begin(); i!=_cursors.end(); ++i ) {
+ long long idleFor = i->second->idleTime( now );
+ if ( idleFor < TIMEOUT ) {
+ continue;
+ }
+ log() << "killing old cursor " << i->second->getId() << " idle for: " << idleFor << "ms" << endl; // TODO: make log(1)
+ _cursors.erase( i );
+ i = _cursors.begin(); // possible 2nd entry will get skipped, will get on next pass
+ if ( i == _cursors.end() )
+ break;
+ }
+ }
+
+ CursorCache cursorCache;
+
+ const int CursorCache::_myLogLevel = 3;
+
+ class CursorTimeoutTask : public task::Task {
+ public:
+ virtual string name() const { return "cursorTimeout"; }
+ virtual void doWork() {
+ cursorCache.doTimeouts();
+ }
+ } cursorTimeoutTask;
+
+ void CursorCache::startTimeoutThread() {
+ task::repeat( &cursorTimeoutTask , 400 );
+ }
+
+ class CmdCursorInfo : public Command {
+ public:
+ CmdCursorInfo() : Command( "cursorInfo", true ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual void help( stringstream& help ) const {
+ help << " example: { cursorInfo : 1 }";
+ }
+ virtual LockType locktype() const { return NONE; }
+ bool run(const string&, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ cursorCache.appendInfo( result );
+ if ( jsobj["setTimeout"].isNumber() )
+ CursorCache::TIMEOUT = jsobj["setTimeout"].numberLong();
+ return true;
+ }
+ } cmdCursorInfo;
+
+}
diff --git a/src/mongo/s/cursors.h b/src/mongo/s/cursors.h
new file mode 100644
index 00000000000..862f3731031
--- /dev/null
+++ b/src/mongo/s/cursors.h
@@ -0,0 +1,106 @@
+// cursors.h
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+#include "../pch.h"
+
+#include "../db/jsobj.h"
+#include "../db/dbmessage.h"
+#include "../client/dbclient.h"
+#include "../client/parallel.h"
+
+#include "request.h"
+
+namespace mongo {
+
+ class ShardedClientCursor : boost::noncopyable {
+ public:
+ ShardedClientCursor( QueryMessage& q , ClusteredCursor * cursor );
+ virtual ~ShardedClientCursor();
+
+ long long getId();
+
+ /**
+ * @return whether there is more data left
+ */
+ bool sendNextBatch( Request& r ) { return sendNextBatch( r , _ntoreturn ); }
+ bool sendNextBatch( Request& r , int ntoreturn );
+
+ void accessed();
+ /** @return idle time in ms */
+ long long idleTime( long long now );
+
+ protected:
+
+ ClusteredCursor * _cursor;
+
+ int _skip;
+ int _ntoreturn;
+
+ int _totalSent;
+ bool _done;
+
+ long long _id;
+ long long _lastAccessMillis; // 0 means no timeout
+
+ };
+
+ typedef boost::shared_ptr<ShardedClientCursor> ShardedClientCursorPtr;
+
+ class CursorCache {
+ public:
+
+ static long long TIMEOUT;
+
+ typedef map<long long,ShardedClientCursorPtr> MapSharded;
+ typedef map<long long,string> MapNormal;
+
+ CursorCache();
+ ~CursorCache();
+
+ ShardedClientCursorPtr get( long long id ) const;
+ void store( ShardedClientCursorPtr cursor );
+ void remove( long long id );
+
+ void storeRef( const string& server , long long id );
+
+ /** @return the server for id or "" */
+ string getRef( long long id ) const ;
+
+ void gotKillCursors(Message& m );
+
+ void appendInfo( BSONObjBuilder& result ) const ;
+
+ long long genId();
+
+ void doTimeouts();
+ void startTimeoutThread();
+ private:
+ mutable mongo::mutex _mutex;
+
+ MapSharded _cursors;
+ MapNormal _refs;
+
+ long long _shardedTotal;
+
+ static const int _myLogLevel;
+ };
+
+ extern CursorCache cursorCache;
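+
+    // a minimal usage sketch (hypothetical caller) of the cache above:
+    //   ShardedClientCursorPtr c( new ShardedClientCursor( q , clusteredCursor ) );
+    //   cursorCache.store( c );                          // keyed by c->getId()
+    //   ShardedClientCursorPtr found = cursorCache.get( c->getId() );
+    //   if ( found ) { /* send another batch */ }
+    //   cursorCache.remove( c->getId() );                // once the cursor is exhausted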
+}
diff --git a/src/mongo/s/d_chunk_manager.cpp b/src/mongo/s/d_chunk_manager.cpp
new file mode 100644
index 00000000000..82a06f61f2c
--- /dev/null
+++ b/src/mongo/s/d_chunk_manager.cpp
@@ -0,0 +1,339 @@
+// @file d_chunk_manager.cpp
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "../client/connpool.h"
+#include "../client/dbclientmockcursor.h"
+#include "../db/instance.h"
+#include "../db/clientcursor.h"
+
+#include "d_chunk_manager.h"
+
+namespace mongo {
+
+ ShardChunkManager::ShardChunkManager( const string& configServer , const string& ns , const string& shardName ) {
+
+ // have to get a connection to the config db
+        // special case: if I'm the configdb, since I'm locked, connecting to myself
+        // would be a deadlock
+ scoped_ptr<ScopedDbConnection> scoped;
+ scoped_ptr<DBDirectClient> direct;
+ DBClientBase * conn;
+ if ( configServer.empty() ) {
+ direct.reset( new DBDirectClient() );
+ conn = direct.get();
+ }
+ else {
+ scoped.reset( new ScopedDbConnection( configServer ) );
+ conn = scoped->get();
+ }
+
+ // get this collection's sharding key
+ BSONObj collectionDoc = conn->findOne( "config.collections", BSON( "_id" << ns ) );
+ uassert( 13539 , str::stream() << ns << " does not exist" , !collectionDoc.isEmpty() );
+ uassert( 13540 , str::stream() << ns << " collection config entry corrupted" , collectionDoc["dropped"].type() );
+ uassert( 13541 , str::stream() << ns << " dropped. Re-shard collection first." , !collectionDoc["dropped"].Bool() );
+ _fillCollectionKey( collectionDoc );
+
+ // query for all the chunks for 'ns' that live in this shard, sorting so we can efficiently bucket them
+ BSONObj q = BSON( "ns" << ns << "shard" << shardName );
+ auto_ptr<DBClientCursor> cursor = conn->query( "config.chunks" , Query(q).sort( "min" ) );
+ _fillChunks( cursor.get() );
+ _fillRanges();
+
+ if ( scoped.get() )
+ scoped->done();
+
+ if ( _chunksMap.empty() )
+ log() << "no chunk for collection " << ns << " on shard " << shardName << endl;
+ }
+
+ ShardChunkManager::ShardChunkManager( const BSONObj& collectionDoc , const BSONArray& chunksArr ) {
+ _fillCollectionKey( collectionDoc );
+
+ scoped_ptr<DBClientMockCursor> c ( new DBClientMockCursor( chunksArr ) );
+ _fillChunks( c.get() );
+ _fillRanges();
+ }
+
+ void ShardChunkManager::_fillCollectionKey( const BSONObj& collectionDoc ) {
+ BSONElement e = collectionDoc["key"];
+ uassert( 13542 , str::stream() << "collection doesn't have a key: " << collectionDoc , ! e.eoo() && e.isABSONObj() );
+
+ BSONObj keys = e.Obj().getOwned();
+ BSONObjBuilder b;
+ BSONForEach( key , keys ) {
+ b.append( key.fieldName() , 1 );
+ }
+ _key = b.obj();
+ }
+
+ void ShardChunkManager::_fillChunks( DBClientCursorInterface* cursor ) {
+ assert( cursor );
+
+ ShardChunkVersion version;
+ while ( cursor->more() ) {
+ BSONObj d = cursor->next();
+ _chunksMap.insert( make_pair( d["min"].Obj().getOwned() , d["max"].Obj().getOwned() ) );
+
+ ShardChunkVersion currVersion( d["lastmod"] );
+ if ( currVersion > version ) {
+ version = currVersion;
+ }
+ }
+ _version = version;
+ }
+
+ void ShardChunkManager::_fillRanges() {
+ if ( _chunksMap.empty() )
+ return;
+
+        // load the chunk information, coalescing their ranges
+        // the version for this shard is the highest version among any of its chunks
+ RangeMap::const_iterator it = _chunksMap.begin();
+ BSONObj min,max;
+ while ( it != _chunksMap.end() ) {
+ BSONObj currMin = it->first;
+ BSONObj currMax = it->second;
+ ++it;
+
+ // coalesce the chunk's bounds in ranges if they are adjacent chunks
+ if ( min.isEmpty() ) {
+ min = currMin;
+ max = currMax;
+ continue;
+ }
+ if ( max == currMin ) {
+ max = currMax;
+ continue;
+ }
+
+ _rangesMap.insert( make_pair( min , max ) );
+
+ min = currMin;
+ max = currMax;
+ }
+ assert( ! min.isEmpty() );
+
+ _rangesMap.insert( make_pair( min , max ) );
+ }
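+
+    // e.g. (illustrative): chunks [{x:0},{x:10}) , [{x:10},{x:20}) , [{x:30},{x:40})
+    // coalesce into the two ranges [{x:0},{x:20}) and [{x:30},{x:40}): the first two
+    // are adjacent (max of one equals min of the next), the third is not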
+
+ static bool contains( const BSONObj& min , const BSONObj& max , const BSONObj& point ) {
+ return point.woCompare( min ) >= 0 && point.woCompare( max ) < 0;
+ }
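+
+    // note the bounds are half-open: with min {x:0} and max {x:10}, contains() is
+    // true for {x:0} and {x:9} but false for {x:10}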
+
+ bool ShardChunkManager::belongsToMe( ClientCursor* cc ) const {
+ verify( 15851 , cc );
+ if ( _rangesMap.size() == 0 )
+ return false;
+
+ return _belongsToMe( cc->extractFields( _key , true ) );
+ }
+
+ bool ShardChunkManager::belongsToMe( const BSONObj& obj ) const {
+ if ( _rangesMap.size() == 0 )
+ return false;
+
+ return _belongsToMe( obj.extractFields( _key , true ) );
+ }
+
+ bool ShardChunkManager::_belongsToMe( const BSONObj& x ) const {
+ RangeMap::const_iterator it = _rangesMap.upper_bound( x );
+ if ( it != _rangesMap.begin() )
+ it--;
+
+ bool good = contains( it->first , it->second , x );
+
+#if 0
+ if ( ! good ) {
+ log() << "bad: " << x << " " << it->first << " " << x.woCompare( it->first ) << " " << x.woCompare( it->second ) << endl;
+ for ( RangeMap::const_iterator i=_rangesMap.begin(); i!=_rangesMap.end(); ++i ) {
+ log() << "\t" << i->first << "\t" << i->second << "\t" << endl;
+ }
+ }
+#endif
+
+ return good;
+ }
+
+ bool ShardChunkManager::getNextChunk( const BSONObj& lookupKey, BSONObj* foundMin , BSONObj* foundMax ) const {
+ assert( foundMin );
+ assert( foundMax );
+ *foundMin = BSONObj();
+ *foundMax = BSONObj();
+
+ if ( _chunksMap.empty() ) {
+ return true;
+ }
+
+ RangeMap::const_iterator it;
+ if ( lookupKey.isEmpty() ) {
+ it = _chunksMap.begin();
+ *foundMin = it->first;
+ *foundMax = it->second;
+ return _chunksMap.size() == 1;
+ }
+
+ it = _chunksMap.upper_bound( lookupKey );
+ if ( it != _chunksMap.end() ) {
+ *foundMin = it->first;
+ *foundMax = it->second;
+ return false;
+ }
+
+ return true;
+ }
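+
+    // a sketch (hypothetical caller) of walking this shard's chunks in min-key order:
+    //   BSONObj lookup;                       // empty doc: start at the first chunk
+    //   for ( ;; ) {
+    //       BSONObj min , max;
+    //       bool last = mgr.getNextChunk( lookup , &min , &max );
+    //       if ( min.isEmpty() ) break;       // no chunks, or walked past the end
+    //       /* use the half-open range [min, max) */
+    //       if ( last ) break;
+    //       lookup = min;
+    //   }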
+
+ void ShardChunkManager::_assertChunkExists( const BSONObj& min , const BSONObj& max ) const {
+ RangeMap::const_iterator it = _chunksMap.find( min );
+ if ( it == _chunksMap.end() ) {
+ uasserted( 13586 , str::stream() << "couldn't find chunk " << min << "->" << max );
+ }
+
+ if ( it->second.woCompare( max ) != 0 ) {
+ ostringstream os;
+ os << "ranges differ, "
+ << "requested: " << min << " -> " << max << " "
+ << "existing: " << (it == _chunksMap.end()) ? "<empty>" : it->first.toString() + " -> " + it->second.toString();
+ uasserted( 13587 , os.str() );
+ }
+ }
+
+ ShardChunkManager* ShardChunkManager::cloneMinus( const BSONObj& min, const BSONObj& max, const ShardChunkVersion& version ) {
+
+ // check that we have the exact chunk that will be subtracted
+ _assertChunkExists( min , max );
+
+ auto_ptr<ShardChunkManager> p( new ShardChunkManager );
+ p->_key = this->_key;
+
+ if ( _chunksMap.size() == 1 ) {
+ // if left with no chunks, just reset version
+ uassert( 13590 , str::stream() << "setting version to " << version << " on removing last chunk", version == 0 );
+
+ p->_version = 0;
+
+ }
+ else {
+ // can't move version backwards when subtracting chunks
+ // this is what guarantees that no read or write would be taken once we subtract data from the current shard
+ if ( version <= _version ) {
+ uasserted( 13585 , str::stream() << "version " << version.toString() << " not greater than " << _version.toString() );
+ }
+
+ p->_chunksMap = this->_chunksMap;
+ p->_chunksMap.erase( min );
+ p->_version = version;
+ p->_fillRanges();
+ }
+
+ return p.release();
+ }
+
+ static bool overlap( const BSONObj& l1 , const BSONObj& h1 , const BSONObj& l2 , const BSONObj& h2 ) {
+ return ! ( ( h1.woCompare( l2 ) <= 0 ) || ( h2.woCompare( l1 ) <= 0 ) );
+ }
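+
+    // e.g. (illustrative): [{x:0},{x:10}) and [{x:10},{x:20}) do NOT overlap since the
+    // ranges are half-open; [{x:0},{x:11}) and [{x:10},{x:20}) do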
+
+ ShardChunkManager* ShardChunkManager::clonePlus( const BSONObj& min , const BSONObj& max , const ShardChunkVersion& version ) {
+
+ // it is acceptable to move version backwards (e.g., undoing a migration that went bad during commit)
+ // but only cloning away the last chunk may reset the version to 0
+ uassert( 13591 , "version can't be set to zero" , version > 0 );
+
+ if ( ! _chunksMap.empty() ) {
+
+ // check that there isn't any chunk on the interval to be added
+ RangeMap::const_iterator it = _chunksMap.lower_bound( max );
+ if ( it != _chunksMap.begin() ) {
+ --it;
+ }
+ if ( overlap( min , max , it->first , it->second ) ) {
+ ostringstream os;
+ os << "ranges overlap, "
+ << "requested: " << min << " -> " << max << " "
+ << "existing: " << it->first.toString() + " -> " + it->second.toString();
+ uasserted( 13588 , os.str() );
+ }
+ }
+
+ auto_ptr<ShardChunkManager> p( new ShardChunkManager );
+
+ p->_key = this->_key;
+ p->_chunksMap = this->_chunksMap;
+ p->_chunksMap.insert( make_pair( min.getOwned() , max.getOwned() ) );
+ p->_version = version;
+ p->_fillRanges();
+
+ return p.release();
+ }
+
+ ShardChunkManager* ShardChunkManager::cloneSplit( const BSONObj& min , const BSONObj& max , const vector<BSONObj>& splitKeys ,
+ const ShardChunkVersion& version ) {
+
+ // the version required in both resulting chunks could be simply an increment in the minor portion of the current version
+ // however, we are enforcing uniqueness over the attributes <ns, lastmod> of the configdb collection 'chunks'
+ // so in practice, a migrate somewhere may force this split to pick up a version that has the major portion higher
+ // than the one that this shard has been using
+ //
+        // TODO drop the uniqueness constraint and tighten the check below so that only the minor portion of version changes
+ if ( version <= _version ) {
+ uasserted( 14039 , str::stream() << "version " << version.toString() << " not greater than " << _version.toString() );
+ }
+
+ // check that we have the exact chunk that will be split and that the split point is valid
+ _assertChunkExists( min , max );
+ for ( vector<BSONObj>::const_iterator it = splitKeys.begin() ; it != splitKeys.end() ; ++it ) {
+ if ( ! contains( min , max , *it ) ) {
+                uasserted( 14040 , str::stream() << "cannot split " << min << " -> " << max << " on " << *it );
+ }
+ }
+
+ auto_ptr<ShardChunkManager> p( new ShardChunkManager );
+
+ p->_key = this->_key;
+ p->_chunksMap = this->_chunksMap;
+ p->_version = version; // will increment second, third, ... chunks below
+
+ BSONObj startKey = min;
+ for ( vector<BSONObj>::const_iterator it = splitKeys.begin() ; it != splitKeys.end() ; ++it ) {
+ BSONObj split = *it;
+            p->_chunksMap[startKey] = split.getOwned(); // startKey, not min: each iteration re-splits the remainder
+ p->_chunksMap.insert( make_pair( split.getOwned() , max.getOwned() ) );
+ p->_version.incMinor();
+ startKey = split;
+ }
+ p->_fillRanges();
+
+ return p.release();
+ }
+
+ string ShardChunkManager::toString() const {
+ StringBuilder ss;
+ ss << " ShardChunkManager version: " << _version << " key: " << _key;
+ bool first = true;
+ for ( RangeMap::const_iterator i=_rangesMap.begin(); i!=_rangesMap.end(); ++i ) {
+ if ( first ) first = false;
+ else ss << " , ";
+
+ ss << i->first << " -> " << i->second;
+ }
+ return ss.str();
+ }
+
+} // namespace mongo
diff --git a/src/mongo/s/d_chunk_manager.h b/src/mongo/s/d_chunk_manager.h
new file mode 100644
index 00000000000..fd5974e4953
--- /dev/null
+++ b/src/mongo/s/d_chunk_manager.h
@@ -0,0 +1,167 @@
+// @file d_chunk_manager.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+
+#include "../db/jsobj.h"
+#include "util.h"
+
+namespace mongo {
+
+ class ClientCursor;
+
+ /**
+ * Controls the boundaries of all the chunks for a given collection that live in this shard.
+ *
+ * ShardChunkManager instances never change after construction. There are methods provided that would generate a
+ * new manager if new chunks are added, subtracted, or split.
+ *
+ * TODO
+ * The responsibility of maintaining the version for a shard is still shared between this class and its caller. The
+ * manager does check corner cases (e.g. cloning out the last chunk generates a manager with version 0) but ultimately
+ * still cannot be responsible to set all versions. Currently, they are a function of the global state as opposed to
+ * the per-shard one.
+ */
+ class ShardChunkManager : public boost::noncopyable {
+ public:
+
+ /**
+ * Loads the ShardChunkManager with all boundaries for chunks of a given collection that live in an given
+ * shard.
+ *
+ * @param configServer name of the server where the configDB currently is. Can be empty to indicate
+ * that the configDB is running locally
+ * @param ns namespace for the collections whose chunks we're interested
+ * @param shardName name of the shard that this chunk matcher should track
+ *
+ * This constructor throws if collection is dropped/malformed and on connectivity errors
+ */
+ ShardChunkManager( const string& configServer , const string& ns , const string& shardName );
+
+ /**
+ * Same as the regular constructor but used in unittest (no access to configDB required).
+ *
+         * @param collectionDoc simulates config.collections' entry for one collection
+         * @param chunksArr simulates config.chunks' entries for one collection's shard
+ */
+        ShardChunkManager( const BSONObj& collectionDoc , const BSONArray& chunksArr );
+
+ ~ShardChunkManager() {}
+
+ /**
+ * Generates a new manager based on 'this's state minus a given chunk.
+ *
+ * @param min max chunk boundaries for the chunk to subtract
+ * @param version that the resulting manager should be at. The version has to be higher than the current one.
+         * When cloning away the last chunk, version must be 0.
+ * @return a new ShardChunkManager, to be owned by the caller
+ */
+ ShardChunkManager* cloneMinus( const BSONObj& min , const BSONObj& max , const ShardChunkVersion& version );
+
+ /**
+ * Generates a new manager based on 'this's state plus a given chunk.
+ *
+ * @param min max chunk boundaries for the chunk to add
+         * @param version that the resulting manager should be at. It can never be 0, though (see cloneMinus).
+ * @return a new ShardChunkManager, to be owned by the caller
+ */
+ ShardChunkManager* clonePlus( const BSONObj& min , const BSONObj& max , const ShardChunkVersion& version );
+
+ /**
+ * Generates a new manager by splitting an existing chunk at one or more points.
+ *
+ * @param min max boundaries of chunk to be split
+ * @param splitKeys points to split original chunk at
+ * @param version to be used in first chunk. The subsequent chunks would increment the minor version.
+ * @return a new ShardChunkManager with the chunk split, to be owned by the caller
+ */
+ ShardChunkManager* cloneSplit( const BSONObj& min , const BSONObj& max , const vector<BSONObj>& splitKeys ,
+ const ShardChunkVersion& version );
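+        // e.g. (illustrative): splitting [{x:0},{x:30}) at splitKeys {x:10},{x:20} with
+        // version 5|0 yields chunks [0,10), [10,20), [20,30) and leaves the manager's
+        // version at 5|2 -- the minor portion is bumped once per split point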
+
+ /**
+ * Checks whether a document belongs to this shard.
+ *
+ * @param obj document containing sharding keys (and, optionally, other attributes)
+         * @return true if this shard holds the object
+ */
+ bool belongsToMe( const BSONObj& obj ) const;
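+        // e.g. (illustrative): with key pattern {x:1} and a single range [{x:0},{x:100}),
+        // belongsToMe( BSON( "x" << 42 << "y" << "abc" ) ) is true, {x:100} is not;
+        // non-key fields are stripped via extractFields() before the range lookup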
+
+ /**
+ * Checks whether a document belongs to this shard.
+ *
+ * @param obj document containing sharding keys (and, optionally, other attributes)
+         * @return true if this shard holds the object
+ */
+ bool belongsToMe( ClientCursor* cc ) const;
+
+ /**
+         * Given a chunk's min key (or the empty doc), gets the boundaries of the chunk following it (or of the first chunk, when given the empty doc).
+ *
+ * @param lookupKey is the min key for a previously obtained chunk or the empty document
+         * @param foundMin OUT min key of the chunk following the one starting at lookupKey
+         * @param foundMax OUT max key of that chunk
+ * @return true if the chunk returned is the last one
+ */
+ bool getNextChunk( const BSONObj& lookupKey, BSONObj* foundMin , BSONObj* foundMax ) const;
+
+ // accessors
+
+ ShardChunkVersion getVersion() const { return _version; }
+ BSONObj getKey() const { return _key.getOwned(); }
+ unsigned getNumChunks() const { return _chunksMap.size(); }
+
+ string toString() const;
+ private:
+
+ /**
+         * same as belongsToMe, but the key must already be the extracted shard key
+ */
+ bool _belongsToMe( const BSONObj& key ) const;
+
+
+ // highest ShardChunkVersion for which this ShardChunkManager's information is accurate
+ ShardChunkVersion _version;
+
+ // key pattern for chunks under this range
+ BSONObj _key;
+
+ // a map from a min key into the chunk's (or range's) max boundary
+ typedef map< BSONObj, BSONObj , BSONObjCmp > RangeMap;
+ RangeMap _chunksMap;
+
+        // a map from a min key into a range of contiguous chunks
+        // redundant, but we expect high chunk contiguity, especially in small installations
+ RangeMap _rangesMap;
+
+ /** constructors helpers */
+ void _fillCollectionKey( const BSONObj& collectionDoc );
+ void _fillChunks( DBClientCursorInterface* cursor );
+ void _fillRanges();
+
+ /** throws if the exact chunk is not in the chunks' map */
+ void _assertChunkExists( const BSONObj& min , const BSONObj& max ) const;
+
+ /** can only be used in the cloning calls */
+ ShardChunkManager() {}
+ };
+
+ typedef shared_ptr<ShardChunkManager> ShardChunkManagerPtr;
+
+} // namespace mongo
diff --git a/src/mongo/s/d_logic.cpp b/src/mongo/s/d_logic.cpp
new file mode 100644
index 00000000000..7350856e91a
--- /dev/null
+++ b/src/mongo/s/d_logic.cpp
@@ -0,0 +1,121 @@
+// @file d_logic.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+/**
+ these are commands that live in mongod
+ mostly around shard management and checking
+ */
+
+#include "pch.h"
+#include <map>
+#include <string>
+
+#include "../db/commands.h"
+#include "../db/jsobj.h"
+#include "../db/dbmessage.h"
+#include "../db/ops/query.h"
+
+#include "../client/connpool.h"
+
+#include "../util/queue.h"
+
+#include "shard.h"
+#include "d_logic.h"
+#include "d_writeback.h"
+
+using namespace std;
+
+namespace mongo {
+
+ bool _handlePossibleShardedMessage( Message &m, DbResponse* dbresponse ) {
+ DEV assert( shardingState.enabled() );
+
+ int op = m.operation();
+ if ( op < 2000
+ || op >= 3000
+ || op == dbGetMore // cursors are weird
+ )
+ return false;
+
+ DbMessage d(m);
+ const char *ns = d.getns();
+ string errmsg;
+ if ( shardVersionOk( ns , errmsg ) ) {
+ return false;
+ }
+
+ LOG(1) << "connection meta data too old - will retry ns:(" << ns << ") op:(" << opToString(op) << ") " << errmsg << endl;
+
+ if ( doesOpGetAResponse( op ) ) {
+ assert( dbresponse );
+ BufBuilder b( 32768 );
+ b.skip( sizeof( QueryResult ) );
+ {
+ BSONObj obj = BSON( "$err" << errmsg << "ns" << ns );
+ b.appendBuf( obj.objdata() , obj.objsize() );
+ }
+
+ QueryResult *qr = (QueryResult*)b.buf();
+ qr->_resultFlags() = ResultFlag_ErrSet | ResultFlag_ShardConfigStale;
+ qr->len = b.len();
+ qr->setOperation( opReply );
+ qr->cursorId = 0;
+ qr->startingFrom = 0;
+ qr->nReturned = 1;
+ b.decouple();
+
+ Message * resp = new Message();
+ resp->setData( qr , true );
+
+ dbresponse->response = resp;
+ dbresponse->responseTo = m.header()->id;
+ return true;
+ }
+
+ uassert( 9517 , "writeback" , ( d.reservedField() & DbMessage::Reserved_FromWriteback ) == 0 );
+
+ OID writebackID;
+ writebackID.init();
+ lastError.getSafe()->writeback( writebackID );
+
+ const OID& clientID = ShardedConnectionInfo::get(false)->getID();
+ massert( 10422 , "write with bad shard config and no server id!" , clientID.isSet() );
+
+ LOG(1) << "got write with an old config - writing back ns: " << ns << endl;
+ LOG(1) << m.toString() << endl;
+
+ BSONObjBuilder b;
+ b.appendBool( "writeBack" , true );
+ b.append( "ns" , ns );
+ b.append( "id" , writebackID );
+ b.append( "connectionId" , cc().getConnectionId() );
+ b.append( "instanceIdent" , prettyHostName() );
+ b.appendTimestamp( "version" , shardingState.getVersion( ns ) );
+
+ ShardedConnectionInfo* info = ShardedConnectionInfo::get( false );
+ b.appendTimestamp( "yourVersion" , info ? info->getVersion(ns) : (ConfigVersion)0 );
+
+ b.appendBinData( "msg" , m.header()->len , bdtCustom , (char*)(m.singleData()) );
+ LOG(2) << "writing back msg with len: " << m.header()->len << " op: " << m.operation() << endl;
+ writeBackManager.queueWriteBack( clientID.str() , b.obj() );
+
+ return true;
+ }
+
+}
diff --git a/src/mongo/s/d_logic.h b/src/mongo/s/d_logic.h
new file mode 100644
index 00000000000..6cbdfadf6af
--- /dev/null
+++ b/src/mongo/s/d_logic.h
@@ -0,0 +1,246 @@
+// @file d_logic.h
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+#include "../pch.h"
+
+#include "../db/jsobj.h"
+
+#include "d_chunk_manager.h"
+#include "util.h"
+
+namespace mongo {
+
+ class Database;
+ class DiskLoc;
+
+ typedef ShardChunkVersion ConfigVersion;
+
+ // --------------
+ // --- global state ---
+ // --------------
+
+ class ShardingState {
+ public:
+ ShardingState();
+
+ bool enabled() const { return _enabled; }
+ const string& getConfigServer() const { return _configServer; }
+ void enable( const string& server );
+
+ void gotShardName( const string& name );
+ void gotShardHost( string host );
+
+ string getShardName() { return _shardName; }
+ string getShardHost() { return _shardHost; }
+
+ /** Reverts back to a state where this mongod is not sharded. */
+ void resetShardingState();
+
+ // versioning support
+
+ bool hasVersion( const string& ns );
+ bool hasVersion( const string& ns , ConfigVersion& version );
+ const ConfigVersion getVersion( const string& ns ) const;
+
+ /**
+ * Uninstalls the manager for a given collection. This should be used when the collection is dropped.
+ *
+ * NOTE:
+         * An existing collection with no chunks on this shard will have a manager on version 0, which is different
+         * from a dropped collection, which will not have a manager.
+ *
+ * TODO
+ * When sharding state is enabled, absolutely all collections should have a manager. (The non-sharded ones are
+         * a degenerate case of one-chunk collections).
+         * For now, a dropped collection and a non-sharded one are indistinguishable (SERVER-1849)
+ *
+ * @param ns the collection to be dropped
+ */
+ void resetVersion( const string& ns );
+
+ /**
+ * Requests to access a collection at a certain version. If the collection's manager is not at that version it
+ * will try to update itself to the newest version. The request is only granted if the version is the current or
+ * the newest one.
+ *
+ * @param ns collection to be accessed
+         * @param version (IN) the version the client believes this collection is at and (OUT) the version the manager is actually at
+ * @return true if the access can be allowed at the provided version
+ */
+ bool trySetVersion( const string& ns , ConfigVersion& version );
+
+ void appendInfo( BSONObjBuilder& b );
+
+ // querying support
+
+ bool needShardChunkManager( const string& ns ) const;
+ ShardChunkManagerPtr getShardChunkManager( const string& ns );
+
+ // chunk migrate and split support
+
+ /**
+ * Creates and installs a new chunk manager for a given collection by "forgetting" about one of its chunks.
+ * The new manager uses the provided version, which has to be higher than the current manager's.
+ * One exception: if the forgotten chunk is the last one in this shard for the collection, version has to be 0.
+ *
+ * If it runs successfully, clients need to grab the new version to access the collection.
+ *
+ * @param ns the collection
+ * @param min max the chunk to eliminate from the current manager
+ * @param version at which the new manager should be at
+ */
+ void donateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ShardChunkVersion version );
+
+ /**
+ * Creates and installs a new chunk manager for a given collection by reclaiming a previously donated chunk.
+ * The previous manager's version has to be provided.
+ *
+ * If it runs successfully, clients that became stale by the previous donateChunk will be able to access the
+ * collection again.
+ *
+ * @param ns the collection
+ * @param min max the chunk to reclaim and add to the current manager
+ * @param version at which the new manager should be at
+ */
+ void undoDonateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ShardChunkVersion version );
+
+ /**
+ * Creates and installs a new chunk manager for a given collection by splitting one of its chunks in two or more.
+ * The version for the first split chunk should be provided. The subsequent chunks' version would be the latter with the
+ * minor portion incremented.
+ *
+ * The effect on clients will depend on the version used. If the major portion is the same as the current shards,
+ * clients shouldn't perceive the split.
+ *
+ * @param ns the collection
+ * @param min max the chunk that should be split
+ * @param splitKeys point in which to split
+ * @param version at which the new manager should be at
+ */
+ void splitChunk( const string& ns , const BSONObj& min , const BSONObj& max , const vector<BSONObj>& splitKeys ,
+ ShardChunkVersion version );
+
+ bool inCriticalMigrateSection();
+
+ private:
+ bool _enabled;
+
+ string _configServer;
+
+ string _shardName;
+ string _shardHost;
+
+ // protects state below
+ mutable mongo::mutex _mutex;
+
+ // map from a namespace into the ensemble of chunk ranges that are stored in this mongod
+ // a ShardChunkManager carries all state we need for a collection at this shard, including its version information
+ typedef map<string,ShardChunkManagerPtr> ChunkManagersMap;
+ ChunkManagersMap _chunks;
+ };
+
+ extern ShardingState shardingState;
+
+ /**
+ * one per connection from mongos
+     * holds version state for each namespace
+ */
+ class ShardedConnectionInfo {
+ public:
+ ShardedConnectionInfo();
+
+ const OID& getID() const { return _id; }
+ bool hasID() const { return _id.isSet(); }
+ void setID( const OID& id );
+
+ const ConfigVersion getVersion( const string& ns ) const;
+ void setVersion( const string& ns , const ConfigVersion& version );
+
+ static ShardedConnectionInfo* get( bool create );
+ static void reset();
+ static void addHook();
+
+ bool inForceVersionOkMode() const {
+ return _forceVersionOk;
+ }
+
+ void enterForceVersionOkMode() { _forceVersionOk = true; }
+ void leaveForceVersionOkMode() { _forceVersionOk = false; }
+
+ private:
+
+ OID _id;
+        bool _forceVersionOk; // if this is true, then chunk version #s aren't checked, and all ops are allowed
+
+ typedef map<string,ConfigVersion> NSVersionMap;
+ NSVersionMap _versions;
+
+ static boost::thread_specific_ptr<ShardedConnectionInfo> _tl;
+ };
+
+ struct ShardForceVersionOkModeBlock {
+ ShardForceVersionOkModeBlock() {
+ info = ShardedConnectionInfo::get( false );
+ if ( info )
+ info->enterForceVersionOkMode();
+ }
+ ~ShardForceVersionOkModeBlock() {
+ if ( info )
+ info->leaveForceVersionOkMode();
+ }
+
+ ShardedConnectionInfo * info;
+ };
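+
+    // RAII usage (see OldDataCleanup::doRemove in d_migrate.cpp): constructing a
+    // ShardForceVersionOkModeBlock on the stack suspends shard version checking for
+    // this connection until the end of the enclosing scope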
+
+ // -----------------
+ // --- core ---
+ // -----------------
+
+ unsigned long long extractVersion( BSONElement e , string& errmsg );
+
+
+ /**
+ * @return true if we have any shard info for the ns
+ */
+ bool haveLocalShardingInfo( const string& ns );
+
+ /**
+ * @return true if the current threads shard version is ok, or not in sharded version
+ */
+ bool shardVersionOk( const string& ns , string& errmsg );
+
+ /**
+ * @return true if we took care of the message and nothing else should be done
+ */
+ struct DbResponse;
+
+ bool _handlePossibleShardedMessage( Message &m, DbResponse * dbresponse );
+
+    /** if sharding is enabled, checks the message's shard version; @return true if the message was handled here (stale-version error reply or writeback) and needs no further processing */
+ inline bool handlePossibleShardedMessage( Message &m, DbResponse * dbresponse ) {
+ if( !shardingState.enabled() )
+ return false;
+ return _handlePossibleShardedMessage(m, dbresponse);
+ }
+
+ void logOpForSharding( const char * opstr , const char * ns , const BSONObj& obj , BSONObj * patt );
+ void aboutToDeleteForSharding( const Database* db , const DiskLoc& dl );
+
+}
diff --git a/src/mongo/s/d_migrate.cpp b/src/mongo/s/d_migrate.cpp
new file mode 100644
index 00000000000..5e62661ec7e
--- /dev/null
+++ b/src/mongo/s/d_migrate.cpp
@@ -0,0 +1,1728 @@
+// d_migrate.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+/**
+ these are commands that live in mongod
+ mostly around shard management and checking
+ */
+
+#include "pch.h"
+#include <map>
+#include <string>
+#include <algorithm>
+
+#include "../db/commands.h"
+#include "../db/jsobj.h"
+#include "../db/cmdline.h"
+#include "../db/queryoptimizer.h"
+#include "../db/btree.h"
+#include "../db/repl_block.h"
+#include "../db/dur.h"
+#include "../db/clientcursor.h"
+
+#include "../client/connpool.h"
+#include "../client/distlock.h"
+
+#include "../util/queue.h"
+#include "../util/unittest.h"
+#include "../util/processinfo.h"
+#include "../util/ramlog.h"
+
+#include "shard.h"
+#include "d_logic.h"
+#include "config.h"
+#include "chunk.h"
+
+using namespace std;
+
+namespace mongo {
+
+ Tee* migrateLog = new RamLog( "migrate" );
+
+ class MoveTimingHelper {
+ public:
+ MoveTimingHelper( const string& where , const string& ns , BSONObj min , BSONObj max , int total )
+ : _where( where ) , _ns( ns ) , _next( 0 ) , _total( total ) {
+ _nextNote = 0;
+ _b.append( "min" , min );
+ _b.append( "max" , max );
+ }
+
+ ~MoveTimingHelper() {
+ // even if logChange doesn't throw, bson does
+ // sigh
+ try {
+ if ( _next != _total ) {
+ note( "aborted" );
+ }
+ configServer.logChange( (string)"moveChunk." + _where , _ns, _b.obj() );
+ }
+ catch ( const std::exception& e ) {
+ warning() << "couldn't record timing for moveChunk '" << _where << "': " << e.what() << migrateLog;
+ }
+ }
+
+ void done( int step ) {
+ assert( step == ++_next );
+ assert( step <= _total );
+
+ stringstream ss;
+ ss << "step" << step;
+ string s = ss.str();
+
+ CurOp * op = cc().curop();
+ if ( op )
+ op->setMessage( s.c_str() );
+ else
+ warning() << "op is null in MoveTimingHelper::done" << migrateLog;
+
+ _b.appendNumber( s , _t.millis() );
+ _t.reset();
+
+#if 0
+ // debugging for memory leak?
+ ProcessInfo pi;
+ ss << " v:" << pi.getVirtualMemorySize()
+ << " r:" << pi.getResidentSize();
+ log() << ss.str() << migrateLog;
+#endif
+ }
+
+
+ void note( const string& s ) {
+ string field = "note";
+ if ( _nextNote > 0 ) {
+ StringBuilder buf;
+ buf << "note" << _nextNote;
+ field = buf.str();
+ }
+ _nextNote++;
+
+ _b.append( field , s );
+ }
+
+ private:
+ Timer _t;
+
+ string _where;
+ string _ns;
+
+ int _next;
+ int _total; // expected # of steps
+ int _nextNote;
+
+ BSONObjBuilder _b;
+
+ };
+
+ struct OldDataCleanup {
+ static AtomicUInt _numThreads; // how many threads are doing async cleanup
+
+ string ns;
+ BSONObj min;
+ BSONObj max;
+ set<CursorId> initial;
+
+ OldDataCleanup(){
+ _numThreads++;
+ }
+ OldDataCleanup( const OldDataCleanup& other ) {
+ ns = other.ns;
+ min = other.min.getOwned();
+ max = other.max.getOwned();
+ initial = other.initial;
+ _numThreads++;
+ }
+ ~OldDataCleanup(){
+ _numThreads--;
+ }
+
+ string toString() const {
+ return str::stream() << ns << " from " << min << " -> " << max;
+ }
+
+ void doRemove() {
+ ShardForceVersionOkModeBlock sf;
+ {
+ writelock lk(ns);
+ RemoveSaver rs("moveChunk",ns,"post-cleanup");
+ long long numDeleted = Helpers::removeRange( ns , min , max , true , false , cmdLine.moveParanoia ? &rs : 0 );
+ log() << "moveChunk deleted: " << numDeleted << migrateLog;
+ }
+
+
+ ReplTime lastOpApplied = cc().getLastOp().asDate();
+ Timer t;
+ for ( int i=0; i<3600; i++ ) {
+ if ( opReplicatedEnough( lastOpApplied , ( getSlaveCount() / 2 ) + 1 ) ) {
+ LOG(t.seconds() < 30 ? 1 : 0) << "moveChunk repl sync took " << t.seconds() << " seconds" << migrateLog;
+ return;
+ }
+ sleepsecs(1);
+ }
+
+ warning() << "moveChunk repl sync timed out after " << t.seconds() << " seconds" << migrateLog;
+ }
+
+ };
+
+ AtomicUInt OldDataCleanup::_numThreads = 0;
+
+ static const char * const cleanUpThreadName = "cleanupOldData";
+
+ class ChunkCommandHelper : public Command {
+ public:
+ ChunkCommandHelper( const char * name )
+ : Command( name ) {
+ }
+
+ virtual void help( stringstream& help ) const {
+ help << "internal - should not be called directly" << migrateLog;
+ }
+ virtual bool slaveOk() const { return false; }
+ virtual bool adminOnly() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+
+ };
+
+ bool isInRange( const BSONObj& obj , const BSONObj& min , const BSONObj& max ) {
+ BSONObj k = obj.extractFields( min, true );
+
+ return k.woCompare( min ) >= 0 && k.woCompare( max ) < 0;
+ }
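+
+    // e.g. (illustrative): with min {x:0} and max {x:100}, the document
+    // { _id : 1 , x : 42 , y : "abc" } is in range -- extractFields() first reduces
+    // it to the key fields, {x:42}, and the bounds are half-open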
+
+
+ class MigrateFromStatus {
+ public:
+
+ MigrateFromStatus() : _m("MigrateFromStatus") , _workLock("MigrateFromStatus::workLock") {
+ _active = false;
+ _inCriticalSection = false;
+ _memoryUsed = 0;
+ }
+
+ void start( string ns , const BSONObj& min , const BSONObj& max ) {
+ scoped_lock ll(_workLock);
+ scoped_lock l(_m); // reads and writes _active
+
+ assert( ! _active );
+
+ assert( ! min.isEmpty() );
+ assert( ! max.isEmpty() );
+ assert( ns.size() );
+
+ _ns = ns;
+ _min = min;
+ _max = max;
+
+ assert( _cloneLocs.size() == 0 );
+ assert( _deleted.size() == 0 );
+ assert( _reload.size() == 0 );
+ assert( _memoryUsed == 0 );
+
+ _active = true;
+ }
+
+ void done() {
+ readlock lk( _ns );
+
+ {
+ scoped_spinlock lk( _trackerLocks );
+ _deleted.clear();
+ _reload.clear();
+ _cloneLocs.clear();
+ }
+ _memoryUsed = 0;
+
+ scoped_lock l(_m);
+ _active = false;
+ _inCriticalSection = false;
+ }
+
+ void logOp( const char * opstr , const char * ns , const BSONObj& obj , BSONObj * patt ) {
+ if ( ! _getActive() )
+ return;
+
+ if ( _ns != ns )
+ return;
+
+ // no need to log if this is not an insertion, an update, or an actual deletion
+ // note: opstr 'db' isn't a deletion but a mention that a database exists (for replication
+ // machinery mostly)
+ char op = opstr[0];
+ if ( op == 'n' || op =='c' || ( op == 'd' && opstr[1] == 'b' ) )
+ return;
+
+ BSONElement ide;
+ if ( patt )
+ ide = patt->getField( "_id" );
+ else
+ ide = obj["_id"];
+
+ if ( ide.eoo() ) {
+ warning() << "logOpForSharding got mod with no _id, ignoring obj: " << obj << migrateLog;
+ return;
+ }
+
+ BSONObj it;
+
+ switch ( opstr[0] ) {
+
+ case 'd': {
+
+ if ( getThreadName() == cleanUpThreadName ) {
+ // we don't want to xfer things we're cleaning
+ // as then they'll be deleted on TO
+ // which is bad
+ return;
+ }
+
+ // can't filter deletes :(
+ _deleted.push_back( ide.wrap() );
+ _memoryUsed += ide.size() + 5;
+ return;
+ }
+
+ case 'i':
+ it = obj;
+ break;
+
+ case 'u':
+ if ( ! Helpers::findById( cc() , _ns.c_str() , ide.wrap() , it ) ) {
+ warning() << "logOpForSharding couldn't find: " << ide << " even though should have" << migrateLog;
+ return;
+ }
+ break;
+
+ }
+
+ if ( ! isInRange( it , _min , _max ) )
+ return;
+
+ _reload.push_back( ide.wrap() );
+ _memoryUsed += ide.size() + 5;
+ }
+
+ void xfer( list<BSONObj> * l , BSONObjBuilder& b , const char * name , long long& size , bool explode ) {
+ const long long maxSize = 1024 * 1024;
+
+ if ( l->size() == 0 || size > maxSize )
+ return;
+
+ BSONArrayBuilder arr(b.subarrayStart(name));
+
+ list<BSONObj>::iterator i = l->begin();
+
+ while ( i != l->end() && size < maxSize ) {
+ BSONObj t = *i;
+ if ( explode ) {
+ BSONObj it;
+ if ( Helpers::findById( cc() , _ns.c_str() , t, it ) ) {
+ arr.append( it );
+ size += it.objsize();
+ }
+ }
+ else {
+ arr.append( t );
+ }
+ i = l->erase( i );
+ size += t.objsize();
+ }
+
+ arr.done();
+ }
+
+ /**
+ * called from the dest of a migrate
+ * transfers mods from src to dest
+ */
+ bool transferMods( string& errmsg , BSONObjBuilder& b ) {
+ if ( ! _getActive() ) {
+ errmsg = "no active migration!";
+ return false;
+ }
+
+ long long size = 0;
+
+ {
+ readlock rl( _ns );
+ Client::Context cx( _ns );
+
+ xfer( &_deleted , b , "deleted" , size , false );
+ xfer( &_reload , b , "reload" , size , true );
+ }
+
+ b.append( "size" , size );
+
+ return true;
+ }
+
+ /**
+ * Get the disklocs that belong to the chunk migrated and sort them in _cloneLocs (to avoid seeking disk later)
+ *
+ * @param maxChunkSize number of bytes beyond which a chunk's base data (no indices) is considered too large to move
+ * @param errmsg filled with textual description of error if this call return false
+ * @return false if approximate chunk size is too big to move or true otherwise
+ */
+ bool storeCurrentLocs( long long maxChunkSize , string& errmsg , BSONObjBuilder& result ) {
+ readlock l( _ns );
+ Client::Context ctx( _ns );
+ NamespaceDetails *d = nsdetails( _ns.c_str() );
+ if ( ! d ) {
+ errmsg = "ns not found, should be impossible";
+ return false;
+ }
+
+ BSONObj keyPattern;
+ // the copies are needed because the indexDetailsForRange destroys the input
+ BSONObj min = _min.copy();
+ BSONObj max = _max.copy();
+ IndexDetails *idx = indexDetailsForRange( _ns.c_str() , errmsg , min , max , keyPattern );
+ if ( idx == NULL ) {
+ errmsg = (string)"can't find index in storeCurrentLocs" + causedBy( errmsg );
+ return false;
+ }
+
+ auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout ,
+ shared_ptr<Cursor>( BtreeCursor::make( d , d->idxNo(*idx) , *idx , min , max , false , 1 ) ) ,
+ _ns ) );
+
+ // use the average object size to estimate how many objects a full chunk would carry
+ // do that while traversing the chunk's range using the sharding index, below
+ // there's a fair amount of slack before we determine a chunk is too large because object sizes will vary
+ unsigned long long maxRecsWhenFull;
+ long long avgRecSize;
+ const long long totalRecs = d->stats.nrecords;
+ if ( totalRecs > 0 ) {
+ avgRecSize = d->stats.datasize / totalRecs;
+ maxRecsWhenFull = maxChunkSize / avgRecSize;
+ maxRecsWhenFull = 130 * maxRecsWhenFull / 100; // slack
+ }
+ else {
+ avgRecSize = 0;
+ maxRecsWhenFull = numeric_limits<long long>::max();
+ }
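+
+            // e.g. (illustrative): with maxChunkSize = 64MB and avgRecSize = 1KB,
+            // maxRecsWhenFull = 1.3 * 67108864 / 1024, i.e. about 85k records before
+            // the chunk is deemed too large to move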
+
+ // do a full traversal of the chunk and don't stop even if we think it is a large chunk
+ // we want the number of records to better report, in that case
+ bool isLargeChunk = false;
+            unsigned long long recCount = 0;
+ while ( cc->ok() ) {
+ DiskLoc dl = cc->currLoc();
+ if ( ! isLargeChunk ) {
+ scoped_spinlock lk( _trackerLocks );
+ _cloneLocs.insert( dl );
+ }
+ cc->advance();
+
+ // we can afford to yield here because any change to the base data that we might miss is already being
+ // queued and will be migrated in the 'transferMods' stage
+ if ( ! cc->yieldSometimes( ClientCursor::DontNeed ) ) {
+ cc.release();
+ break;
+ }
+
+ if ( ++recCount > maxRecsWhenFull ) {
+ isLargeChunk = true;
+ }
+ }
+
+ if ( isLargeChunk ) {
+ warning() << "can't move chunk of size (approximately) " << recCount * avgRecSize
+ << " because maximum size allowed to move is " << maxChunkSize
+ << " ns: " << _ns << " " << _min << " -> " << _max
+ << migrateLog;
+ result.appendBool( "chunkTooBig" , true );
+ result.appendNumber( "estimatedChunkSize" , (long long)(recCount * avgRecSize) );
+ errmsg = "chunk too big to move";
+ return false;
+ }
+
+ {
+ scoped_spinlock lk( _trackerLocks );
+ log() << "moveChunk number of documents: " << _cloneLocs.size() << migrateLog;
+ }
+ return true;
+ }
+
+ bool clone( string& errmsg , BSONObjBuilder& result ) {
+ if ( ! _getActive() ) {
+ errmsg = "not active";
+ return false;
+ }
+
+ ElapsedTracker tracker (128, 10); // same as ClientCursor::_yieldSometimesTracker
+
+ int allocSize;
+ {
+ readlock l(_ns);
+ Client::Context ctx( _ns );
+ NamespaceDetails *d = nsdetails( _ns.c_str() );
+ assert( d );
+ scoped_spinlock lk( _trackerLocks );
+ allocSize = std::min(BSONObjMaxUserSize, (int)((12 + d->averageObjectSize()) * _cloneLocs.size()));
+ }
+ BSONArrayBuilder a (allocSize);
+
+ while ( 1 ) {
+ bool filledBuffer = false;
+
+ readlock l( _ns );
+ Client::Context ctx( _ns );
+ scoped_spinlock lk( _trackerLocks );
+ set<DiskLoc>::iterator i = _cloneLocs.begin();
+ for ( ; i!=_cloneLocs.end(); ++i ) {
+ if (tracker.intervalHasElapsed()) // should I yield?
+ break;
+
+ DiskLoc dl = *i;
+ BSONObj o = dl.obj();
+
+ // use the builder size instead of accumulating 'o's size so that we take into consideration
+ // the overhead of BSONArray indices
+ if ( a.len() + o.objsize() + 1024 > BSONObjMaxUserSize ) {
+ filledBuffer = true; // break out of outer while loop
+ break;
+ }
+
+ a.append( o );
+ }
+
+ _cloneLocs.erase( _cloneLocs.begin() , i );
+
+ if ( _cloneLocs.empty() || filledBuffer )
+ break;
+ }
+
+ result.appendArray( "objects" , a.arr() );
+ return true;
+ }
+
+ void aboutToDelete( const Database* db , const DiskLoc& dl ) {
+ d.dbMutex.assertWriteLocked();
+
+ if ( ! _getActive() )
+ return;
+
+ if ( ! db->ownsNS( _ns ) )
+ return;
+
+
+ // not needed right now
+ // but trying to prevent a future bug
+ scoped_spinlock lk( _trackerLocks );
+
+ _cloneLocs.erase( dl );
+ }
+
+ long long mbUsed() const { return _memoryUsed / ( 1024 * 1024 ); }
+
+ bool getInCriticalSection() const { scoped_lock l(_m); return _inCriticalSection; }
+ void setInCriticalSection( bool b ) { scoped_lock l(_m); _inCriticalSection = b; }
+
+ bool isActive() const { return _getActive(); }
+
+ void doRemove( OldDataCleanup& cleanup ) {
+ int it = 0;
+ while ( true ) {
+                if ( it > 20 && it % 10 == 0 ) log() << "doRemove iteration " << it << " for: " << cleanup << endl;
+ {
+ scoped_lock ll(_workLock);
+ if ( ! _active ) {
+ cleanup.doRemove();
+ return;
+ }
+ }
+ sleepmillis( 1000 );
+ }
+ }
+
+ private:
+ mutable mongo::mutex _m; // protect _inCriticalSection and _active
+ bool _inCriticalSection;
+ bool _active;
+
+ string _ns;
+ BSONObj _min;
+ BSONObj _max;
+
+ // we need the lock in case there is a malicious _migrateClone for example
+ // even though it shouldn't be needed under normal operation
+ SpinLock _trackerLocks;
+
+ // disk locs yet to be transferred from here to the other side
+ // no locking needed because built initially by 1 thread in a read lock
+ // emptied by 1 thread in a read lock
+ // updates applied by 1 thread in a write lock
+ set<DiskLoc> _cloneLocs;
+
+ list<BSONObj> _reload; // objects that were modified that must be recloned
+ list<BSONObj> _deleted; // objects deleted during clone that should be deleted later
+ long long _memoryUsed; // bytes in _reload + _deleted
+
+ mutable mongo::mutex _workLock; // this is used to make sure only 1 thread is doing serious work
+ // for now, this means migrate or removing old chunk data
+
+ bool _getActive() const { scoped_lock l(_m); return _active; }
+ void _setActive( bool b ) { scoped_lock l(_m); _active = b; }
+
+ } migrateFromStatus;
+
+ struct MigrateStatusHolder {
+ MigrateStatusHolder( string ns , const BSONObj& min , const BSONObj& max ) {
+ migrateFromStatus.start( ns , min , max );
+ }
+ ~MigrateStatusHolder() {
+ migrateFromStatus.done();
+ }
+ };
+
+ void _cleanupOldData( OldDataCleanup cleanup ) {
+ Client::initThread( cleanUpThreadName );
+ if (!noauth) {
+ cc().getAuthenticationInfo()->authorize("local", internalSecurity.user);
+ }
+ log() << " (start) waiting to cleanup " << cleanup << " # cursors:" << cleanup.initial.size() << migrateLog;
+
+ int loops = 0;
+ Timer t;
+ while ( t.seconds() < 900 ) { // 15 minutes
+ assert( d.dbMutex.getState() == 0 );
+ sleepmillis( 20 );
+
+ set<CursorId> now;
+ ClientCursor::find( cleanup.ns , now );
+
+ set<CursorId> left;
+ for ( set<CursorId>::iterator i=cleanup.initial.begin(); i!=cleanup.initial.end(); ++i ) {
+ CursorId id = *i;
+ if ( now.count(id) )
+ left.insert( id );
+ }
+
+ if ( left.size() == 0 )
+ break;
+ cleanup.initial = left;
+
+ if ( ( loops++ % 200 ) == 0 ) {
+ log() << " (looping " << loops << ") waiting to cleanup " << cleanup.ns << " from " << cleanup.min << " -> " << cleanup.max << " # cursors:" << cleanup.initial.size() << migrateLog;
+
+ stringstream ss;
+ for ( set<CursorId>::iterator i=cleanup.initial.begin(); i!=cleanup.initial.end(); ++i ) {
+ CursorId id = *i;
+ ss << id << " ";
+ }
+ log() << " cursors: " << ss.str() << migrateLog;
+ }
+ }
+
+ migrateFromStatus.doRemove( cleanup );
+
+ cc().shutdown();
+ }
+
+ void cleanupOldData( OldDataCleanup cleanup ) {
+ try {
+ _cleanupOldData( cleanup );
+ }
+ catch ( std::exception& e ) {
+ log() << " error cleaning old data:" << e.what() << migrateLog;
+ }
+ catch ( ... ) {
+ log() << " unknown error cleaning old data" << migrateLog;
+ }
+ }
+
+ void logOpForSharding( const char * opstr , const char * ns , const BSONObj& obj , BSONObj * patt ) {
+ migrateFromStatus.logOp( opstr , ns , obj , patt );
+ }
+
+ void aboutToDeleteForSharding( const Database* db , const DiskLoc& dl ) {
+ migrateFromStatus.aboutToDelete( db , dl );
+ }
+
+ class TransferModsCommand : public ChunkCommandHelper {
+ public:
+ TransferModsCommand() : ChunkCommandHelper( "_transferMods" ) {}
+
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ return migrateFromStatus.transferMods( errmsg, result );
+ }
+ } transferModsCommand;
+
+
+ class InitialCloneCommand : public ChunkCommandHelper {
+ public:
+ InitialCloneCommand() : ChunkCommandHelper( "_migrateClone" ) {}
+
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ return migrateFromStatus.clone( errmsg, result );
+ }
+ } initialCloneCommand;
+
+
+ /**
+ * this is the main entry for moveChunk
+     * called to initiate a move
+ * usually by a mongos
+ * this is called on the "from" side
+ */
+ class MoveChunkCommand : public Command {
+ public:
+ MoveChunkCommand() : Command( "moveChunk" ) {}
+ virtual void help( stringstream& help ) const {
+ help << "should not be calling this directly" << migrateLog;
+ }
+
+ virtual bool slaveOk() const { return false; }
+ virtual bool adminOnly() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+
+
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ // 1. parse options
+ // 2. make sure my view is complete and lock
+ // 3. start migrate
+ // in a read lock, get all DiskLoc and sort so we can do as little seeking as possible
+ // tell to start transferring
+ // 4. pause till migrate caught up
+ // 5. LOCK
+ // a) update my config, essentially locking
+ // b) finish migrate
+ // c) update config server
+ // d) logChange to config server
+ // 6. wait for all current cursors to expire
+ // 7. remove data locally
+
+ // -------------------------------
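+
+            // illustrative cmdObj shape (hypothetical values), matching the parsing below:
+            //   { moveChunk : "test.foo" , from : "<this shard's address>" , to : "<target shard's address>" ,
+            //     min : { x : 0 } , max : { x : 100 } , shardId : <chunk id> ,
+            //     maxChunkSizeBytes : 67108864 , configdb : "<config server>" }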
+
+ // 1.
+ string ns = cmdObj.firstElement().str();
+ string to = cmdObj["to"].str();
+ string from = cmdObj["from"].str(); // my public address, a tad redundant, but safe
+ BSONObj min = cmdObj["min"].Obj();
+ BSONObj max = cmdObj["max"].Obj();
+ BSONElement shardId = cmdObj["shardId"];
+ BSONElement maxSizeElem = cmdObj["maxChunkSizeBytes"];
+
+ if ( ns.empty() ) {
+ errmsg = "need to specify namespace in command";
+ return false;
+ }
+
+ if ( to.empty() ) {
+ errmsg = "need to specify server to move chunk to";
+ return false;
+ }
+ if ( from.empty() ) {
+ errmsg = "need to specify server to move chunk from";
+ return false;
+ }
+
+ if ( min.isEmpty() ) {
+ errmsg = "need to specify a min";
+ return false;
+ }
+
+ if ( max.isEmpty() ) {
+ errmsg = "need to specify a max";
+ return false;
+ }
+
+ if ( shardId.eoo() ) {
+ errmsg = "need shardId";
+ return false;
+ }
+
+ if ( maxSizeElem.eoo() || ! maxSizeElem.isNumber() ) {
+ errmsg = "need to specify maxChunkSizeBytes";
+ return false;
+ }
+ const long long maxChunkSize = maxSizeElem.numberLong(); // in bytes
+
+ if ( ! shardingState.enabled() ) {
+ if ( cmdObj["configdb"].type() != String ) {
+ errmsg = "sharding not enabled";
+ return false;
+ }
+ string configdb = cmdObj["configdb"].String();
+ shardingState.enable( configdb );
+ configServer.init( configdb );
+ }
+
+ MoveTimingHelper timing( "from" , ns , min , max , 6 /* steps */);
+
+ Shard fromShard( from );
+ Shard toShard( to );
+
+ log() << "received moveChunk request: " << cmdObj << migrateLog;
+
+ timing.done(1);
+
+ // 2.
+
+ if ( migrateFromStatus.isActive() ) {
+ errmsg = "migration already in progress";
+ return false;
+ }
+
+ DistributedLock lockSetup( ConnectionString( shardingState.getConfigServer() , ConnectionString::SYNC ) , ns );
+ dist_lock_try dlk;
+
+ try{
+ dlk = dist_lock_try( &lockSetup , (string)"migrate-" + min.toString() );
+ }
+ catch( LockException& e ){
+ errmsg = str::stream() << "error locking distributed lock for migration " << "migrate-" << min.toString() << causedBy( e );
+ return false;
+ }
+
+ if ( ! dlk.got() ) {
+ errmsg = str::stream() << "the collection metadata could not be locked with lock " << "migrate-" << min.toString();
+ result.append( "who" , dlk.other() );
+ return false;
+ }
+
+ BSONObj chunkInfo = BSON("min" << min << "max" << max << "from" << fromShard.getName() << "to" << toShard.getName());
+ configServer.logChange( "moveChunk.start" , ns , chunkInfo );
+
+ ShardChunkVersion maxVersion;
+ string myOldShard;
+ {
+ ScopedDbConnection conn( shardingState.getConfigServer() );
+
+ BSONObj x;
+ BSONObj currChunk;
+ try{
+ x = conn->findOne( ShardNS::chunk , Query( BSON( "ns" << ns ) ).sort( BSON( "lastmod" << -1 ) ) );
+ currChunk = conn->findOne( ShardNS::chunk , shardId.wrap( "_id" ) );
+ }
+ catch( DBException& e ){
+ errmsg = str::stream() << "aborted moveChunk because could not get chunk data from config server " << shardingState.getConfigServer() << causedBy( e );
+ warning() << errmsg << endl;
+ return false;
+ }
+
+ maxVersion = x["lastmod"];
+ assert( currChunk["shard"].type() );
+ assert( currChunk["min"].type() );
+ assert( currChunk["max"].type() );
+ myOldShard = currChunk["shard"].String();
+ conn.done();
+
+ BSONObj currMin = currChunk["min"].Obj();
+ BSONObj currMax = currChunk["max"].Obj();
+ if ( currMin.woCompare( min ) || currMax.woCompare( max ) ) {
+ errmsg = "boundaries are outdated (likely a split occurred)";
+ result.append( "currMin" , currMin );
+ result.append( "currMax" , currMax );
+ result.append( "requestedMin" , min );
+ result.append( "requestedMax" , max );
+
+ warning() << "aborted moveChunk because" << errmsg << ": " << min << "->" << max
+ << " is now " << currMin << "->" << currMax << migrateLog;
+ return false;
+ }
+
+ if ( myOldShard != fromShard.getName() ) {
+ errmsg = "location is outdated (likely balance or migrate occurred)";
+ result.append( "from" , fromShard.getName() );
+ result.append( "official" , myOldShard );
+
+ warning() << "aborted moveChunk because " << errmsg << ": chunk is at " << myOldShard
+ << " and not at " << fromShard.getName() << migrateLog;
+ return false;
+ }
+
+ if ( maxVersion < shardingState.getVersion( ns ) ) {
+ errmsg = "official version less than mine?";
+ result.appendTimestamp( "officialVersion" , maxVersion );
+ result.appendTimestamp( "myVersion" , shardingState.getVersion( ns ) );
+
+ warning() << "aborted moveChunk because " << errmsg << ": official " << maxVersion
+ << " mine: " << shardingState.getVersion(ns) << migrateLog;
+ return false;
+ }
+
+                // since this could be the first call that enables sharding, we also make sure the chunk manager is up to date
+ shardingState.gotShardName( myOldShard );
+ ShardChunkVersion shardVersion;
+ shardingState.trySetVersion( ns , shardVersion /* will return updated */ );
+
+ log() << "moveChunk request accepted at version " << shardVersion << migrateLog;
+ }
+
+ timing.done(2);
+
+ // 3.
+ MigrateStatusHolder statusHolder( ns , min , max );
+ {
+ // this gets a read lock, so we know we have a checkpoint for mods
+ if ( ! migrateFromStatus.storeCurrentLocs( maxChunkSize , errmsg , result ) )
+ return false;
+
+ ScopedDbConnection connTo( to );
+ BSONObj res;
+ bool ok;
+ try{
+ ok = connTo->runCommand( "admin" ,
+ BSON( "_recvChunkStart" << ns <<
+ "from" << from <<
+ "min" << min <<
+ "max" << max <<
+ "configServer" << configServer.modelServer()
+ ) ,
+ res );
+ }
+ catch( DBException& e ){
+ errmsg = str::stream() << "moveChunk could not contact to: shard " << to << " to start transfer" << causedBy( e );
+ warning() << errmsg << endl;
+ return false;
+ }
+
+ connTo.done();
+
+ if ( ! ok ) {
+ errmsg = "moveChunk failed to engage TO-shard in the data transfer: ";
+ assert( res["errmsg"].type() );
+ errmsg += res["errmsg"].String();
+ result.append( "cause" , res );
+ return false;
+ }
+
+ }
+ timing.done( 3 );
+
+ // 4.
+ for ( int i=0; i<86400; i++ ) { // don't want a single chunk move to take more than a day
+ assert( d.dbMutex.getState() == 0 );
+ sleepsecs( 1 );
+ ScopedDbConnection conn( to );
+ BSONObj res;
+ bool ok;
+ try {
+ ok = conn->runCommand( "admin" , BSON( "_recvChunkStatus" << 1 ) , res );
+ res = res.getOwned();
+ }
+ catch( DBException& e ){
+                    errmsg = str::stream() << "moveChunk could not contact TO-shard " << to << " to monitor transfer" << causedBy( e );
+ warning() << errmsg << endl;
+ return false;
+ }
+
+ conn.done();
+
+ log(0) << "moveChunk data transfer progress: " << res << " my mem used: " << migrateFromStatus.mbUsed() << migrateLog;
+
+ if ( ! ok || res["state"].String() == "fail" ) {
+ warning() << "moveChunk error transferring data caused migration abort: " << res << migrateLog;
+ errmsg = "data transfer error";
+ result.append( "cause" , res );
+ return false;
+ }
+
+ if ( res["state"].String() == "steady" )
+ break;
+
+ if ( migrateFromStatus.mbUsed() > (500 * 1024 * 1024) ) {
+ // this is too much memory for us to use for this
+ // so we're going to abort the migrate
+ ScopedDbConnection conn( to );
+ BSONObj res;
+ conn->runCommand( "admin" , BSON( "_recvChunkAbort" << 1 ) , res );
+ res = res.getOwned();
+ conn.done();
+ error() << "aborting migrate because too much memory used res: " << res << migrateLog;
+ errmsg = "aborting migrate because too much memory used";
+ result.appendBool( "split" , true );
+ return false;
+ }
+
+ killCurrentOp.checkForInterrupt();
+ }
+ timing.done(4);
+
+ // 5.
+ {
+ // 5.a
+ // we're under the collection lock here, so no other migrate can change maxVersion or ShardChunkManager state
+ migrateFromStatus.setInCriticalSection( true );
+ ShardChunkVersion currVersion = maxVersion;
+ ShardChunkVersion myVersion = currVersion;
+ myVersion.incMajor();
+
+ {
+ writelock lk( ns );
+ assert( myVersion > shardingState.getVersion( ns ) );
+
+ // bump the chunks manager's version up and "forget" about the chunk being moved
+                // this is not the commit point, but in practice the state in this shard won't change until the commit is done
+ shardingState.donateChunk( ns , min , max , myVersion );
+ }
+
+ log() << "moveChunk setting version to: " << myVersion << migrateLog;
+
+ // 5.b
+ // we're under the collection lock here, too, so we can undo the chunk donation because no other state change
+ // could be ongoing
+ {
+ BSONObj res;
+ ScopedDbConnection connTo( to );
+ bool ok;
+
+ try{
+ ok = connTo->runCommand( "admin" ,
+ BSON( "_recvChunkCommit" << 1 ) ,
+ res );
+ }
+ catch( DBException& e ){
+ errmsg = str::stream() << "moveChunk could not contact to: shard " << to << " to commit transfer" << causedBy( e );
+ warning() << errmsg << endl;
+ return false;
+ }
+
+ connTo.done();
+
+ if ( ! ok ) {
+ {
+ writelock lk( ns );
+
+ // revert the chunk manager back to the state before "forgetting" about the chunk
+ shardingState.undoDonateChunk( ns , min , max , currVersion );
+ }
+
+ log() << "moveChunk migrate commit not accepted by TO-shard: " << res
+ << " resetting shard version to: " << currVersion << migrateLog;
+
+ errmsg = "_recvChunkCommit failed!";
+ result.append( "cause" , res );
+ return false;
+ }
+
+ log() << "moveChunk migrate commit accepted by TO-shard: " << res << migrateLog;
+ }
+
+ // 5.c
+
+ // version at which the next highest lastmod will be set
+ // if the chunk being moved is the last in the shard, nextVersion is that chunk's lastmod
+ // otherwise the highest version is from the chunk being bumped on the FROM-shard
+ ShardChunkVersion nextVersion;
+
+            // we want to go only once to the configDB but perhaps change two chunks, the one being migrated and another
+            // local one (so as to bump the version for the entire shard)
+ // we use the 'applyOps' mechanism to group the two updates and make them safer
+ // TODO pull config update code to a module
+
+ BSONObjBuilder cmdBuilder;
+
+ BSONArrayBuilder updates( cmdBuilder.subarrayStart( "applyOps" ) );
+ {
+ // update for the chunk being moved
+ BSONObjBuilder op;
+ op.append( "op" , "u" );
+ op.appendBool( "b" , false /* no upserting */ );
+ op.append( "ns" , ShardNS::chunk );
+
+ BSONObjBuilder n( op.subobjStart( "o" ) );
+ n.append( "_id" , Chunk::genID( ns , min ) );
+ n.appendTimestamp( "lastmod" , myVersion /* same as used on donateChunk */ );
+ n.append( "ns" , ns );
+ n.append( "min" , min );
+ n.append( "max" , max );
+ n.append( "shard" , toShard.getName() );
+ n.done();
+
+ BSONObjBuilder q( op.subobjStart( "o2" ) );
+ q.append( "_id" , Chunk::genID( ns , min ) );
+ q.done();
+
+ updates.append( op.obj() );
+ }
+
+ nextVersion = myVersion;
+
+ // if we have chunks left on the FROM shard, update the version of one of them as well
+ // we can figure that out by grabbing the chunkManager installed on 5.a
+ // TODO expose that manager when installing it
+
+ ShardChunkManagerPtr chunkManager = shardingState.getShardChunkManager( ns );
+ if( chunkManager->getNumChunks() > 0 ) {
+
+ // get another chunk on that shard
+ BSONObj lookupKey;
+ BSONObj bumpMin, bumpMax;
+ do {
+ chunkManager->getNextChunk( lookupKey , &bumpMin , &bumpMax );
+ lookupKey = bumpMin;
+ }
+ while( bumpMin == min );
+
+ BSONObjBuilder op;
+ op.append( "op" , "u" );
+ op.appendBool( "b" , false );
+ op.append( "ns" , ShardNS::chunk );
+
+ nextVersion.incMinor(); // same as used on donateChunk
+ BSONObjBuilder n( op.subobjStart( "o" ) );
+ n.append( "_id" , Chunk::genID( ns , bumpMin ) );
+ n.appendTimestamp( "lastmod" , nextVersion );
+ n.append( "ns" , ns );
+ n.append( "min" , bumpMin );
+ n.append( "max" , bumpMax );
+ n.append( "shard" , fromShard.getName() );
+ n.done();
+
+ BSONObjBuilder q( op.subobjStart( "o2" ) );
+ q.append( "_id" , Chunk::genID( ns , bumpMin ) );
+ q.done();
+
+ updates.append( op.obj() );
+
+ log() << "moveChunk updating self version to: " << nextVersion << " through "
+ << bumpMin << " -> " << bumpMax << " for collection '" << ns << "'" << migrateLog;
+
+ }
+ else {
+
+ log() << "moveChunk moved last chunk out for collection '" << ns << "'" << migrateLog;
+ }
+
+ updates.done();
+
+ BSONArrayBuilder preCond( cmdBuilder.subarrayStart( "preCondition" ) );
+ {
+ BSONObjBuilder b;
+ b.append( "ns" , ShardNS::chunk );
+ b.append( "q" , BSON( "query" << BSON( "ns" << ns ) << "orderby" << BSON( "lastmod" << -1 ) ) );
+ {
+ BSONObjBuilder bb( b.subobjStart( "res" ) );
+ bb.appendTimestamp( "lastmod" , maxVersion );
+ bb.done();
+ }
+ preCond.append( b.obj() );
+ }
+
+ preCond.done();
+
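+            // For illustration, the assembled command has roughly this shape (values elided;
+            // the exact contents come from the builders above):
+            //   { applyOps : [ { op:"u", b:false, ns:<chunks collection>,
+            //                    o:{ _id, lastmod, ns, min, max, shard }, o2:{ _id } }, ... ],
+            //     preCondition : [ { ns:<chunks collection>, q:{ query, orderby }, res:{ lastmod:<maxVersion> } } ] }
+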
+ BSONObj cmd = cmdBuilder.obj();
+ LOG(7) << "moveChunk update: " << cmd << migrateLog;
+
+ bool ok = false;
+ BSONObj cmdResult;
+ try {
+ ScopedDbConnection conn( shardingState.getConfigServer() );
+ ok = conn->runCommand( "config" , cmd , cmdResult );
+ conn.done();
+ }
+ catch ( DBException& e ) {
+ warning() << e << migrateLog;
+ ok = false;
+ BSONObjBuilder b;
+ e.getInfo().append( b );
+ cmdResult = b.obj();
+ }
+
+ if ( ! ok ) {
+
+ // this could be a blip in the connectivity
+ // wait out a few seconds and check if the commit request made it
+ //
+                // if the commit made it to the config, we'll see the chunk in the new shard and there's no further action
+                // if the commit did not make it, currently the only way to fix this state is to bounce the mongod so
+                // that the old state (before migrating) is brought in
+
+ warning() << "moveChunk commit outcome ongoing: " << cmd << " for command :" << cmdResult << migrateLog;
+ sleepsecs( 10 );
+
+ try {
+ ScopedDbConnection conn( shardingState.getConfigServer() );
+
+ // look for the chunk in this shard whose version got bumped
+ // we assume that if that mod made it to the config, the applyOps was successful
+ BSONObj doc = conn->findOne( ShardNS::chunk , Query(BSON( "ns" << ns )).sort( BSON("lastmod" << -1)));
+ ShardChunkVersion checkVersion = doc["lastmod"];
+
+ if ( checkVersion == nextVersion ) {
+ log() << "moveChunk commit confirmed" << migrateLog;
+
+ }
+ else {
+ error() << "moveChunk commit failed: version is at"
+ << checkVersion << " instead of " << nextVersion << migrateLog;
+ error() << "TERMINATING" << migrateLog;
+ dbexit( EXIT_SHARDING_ERROR );
+ }
+
+ conn.done();
+
+ }
+ catch ( ... ) {
+ error() << "moveChunk failed to get confirmation of commit" << migrateLog;
+ error() << "TERMINATING" << migrateLog;
+ dbexit( EXIT_SHARDING_ERROR );
+ }
+ }
+
+ migrateFromStatus.setInCriticalSection( false );
+
+ // 5.d
+ configServer.logChange( "moveChunk.commit" , ns , chunkInfo );
+ }
+
+ migrateFromStatus.done();
+ timing.done(5);
+
+ {
+ // 6.
+ OldDataCleanup c;
+ c.ns = ns;
+ c.min = min.getOwned();
+ c.max = max.getOwned();
+ ClientCursor::find( ns , c.initial );
+ if ( c.initial.size() ) {
+ log() << "forking for cleaning up chunk data" << migrateLog;
+ boost::thread t( boost::bind( &cleanupOldData , c ) );
+ }
+ else {
+ log() << "doing delete inline" << migrateLog;
+ // 7.
+ c.doRemove();
+ }
+
+ }
+ timing.done(6);
+
+ return true;
+
+ }
+
+ } moveChunkCmd;
+
+ bool ShardingState::inCriticalMigrateSection() {
+ return migrateFromStatus.getInCriticalSection();
+ }
+
+ /* -----
+ below this are the "to" side commands
+
+ command to initiate
+ worker thread
+ does initial clone
+ pulls initial change set
+ keeps pulling
+ keeps state
+ command to get state
+ commend to "commit"
+ */
+
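+    /* A rough sketch of the TO-side lifecycle driven by those commands (states from the
+       State enum in MigrateStatus below; FAIL and ABORT can be entered from most steps):
+
+           READY -> CLONE -> CATCHUP -> STEADY -> COMMIT_START -> DONE
+    */
+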
+ class MigrateStatus {
+ public:
+
+ MigrateStatus() : m_active("MigrateStatus") { active = false; }
+
+ void prepare() {
+ scoped_lock l(m_active); // reading and writing 'active'
+
+ assert( ! active );
+ state = READY;
+ errmsg = "";
+
+ numCloned = 0;
+ clonedBytes = 0;
+ numCatchup = 0;
+ numSteady = 0;
+
+ active = true;
+ }
+
+ void go() {
+ try {
+ _go();
+ }
+ catch ( std::exception& e ) {
+ state = FAIL;
+ errmsg = e.what();
+ error() << "migrate failed: " << e.what() << migrateLog;
+ }
+ catch ( ... ) {
+ state = FAIL;
+ errmsg = "UNKNOWN ERROR";
+ error() << "migrate failed with unknown exception" << migrateLog;
+ }
+ setActive( false );
+ }
+
+ void _go() {
+ assert( getActive() );
+ assert( state == READY );
+ assert( ! min.isEmpty() );
+ assert( ! max.isEmpty() );
+
+ slaveCount = ( getSlaveCount() / 2 ) + 1;
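+            // a simple majority of the current slaves, e.g. with 4 slaves this waits for (4/2)+1 = 3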
+
+ MoveTimingHelper timing( "to" , ns , min , max , 5 /* steps */ );
+
+ ScopedDbConnection conn( from );
+ conn->getLastError(); // just test connection
+
+ {
+ // 1. copy indexes
+ auto_ptr<DBClientCursor> indexes = conn->getIndexes( ns );
+ vector<BSONObj> all;
+ while ( indexes->more() ) {
+ all.push_back( indexes->next().getOwned() );
+ }
+
+ writelock lk( ns );
+ Client::Context ct( ns );
+
+ string system_indexes = cc().database()->name + ".system.indexes";
+ for ( unsigned i=0; i<all.size(); i++ ) {
+ BSONObj idx = all[i];
+ theDataFileMgr.insertAndLog( system_indexes.c_str() , idx );
+ }
+
+ timing.done(1);
+ }
+
+ {
+ // 2. delete any data already in range
+ writelock lk( ns );
+ RemoveSaver rs( "moveChunk" , ns , "preCleanup" );
+ long long num = Helpers::removeRange( ns , min , max , true , false , cmdLine.moveParanoia ? &rs : 0 );
+ if ( num )
+ warning() << "moveChunkCmd deleted data already in chunk # objects: " << num << migrateLog;
+
+ timing.done(2);
+ }
+
+
+ {
+ // 3. initial bulk clone
+ state = CLONE;
+
+ while ( true ) {
+ BSONObj res;
+ if ( ! conn->runCommand( "admin" , BSON( "_migrateClone" << 1 ) , res ) ) {
+ state = FAIL;
+ errmsg = "_migrateClone failed: ";
+ errmsg += res.toString();
+ error() << errmsg << migrateLog;
+ conn.done();
+ return;
+ }
+
+ BSONObj arr = res["objects"].Obj();
+ int thisTime = 0;
+
+ BSONObjIterator i( arr );
+ while( i.more() ) {
+ BSONObj o = i.next().Obj();
+ {
+ writelock lk( ns );
+ Helpers::upsert( ns , o );
+ }
+ thisTime++;
+ numCloned++;
+ clonedBytes += o.objsize();
+ }
+
+ if ( thisTime == 0 )
+ break;
+ }
+
+ timing.done(3);
+ }
+
+ // if running on a replicated system, we'll need to flush the docs we cloned to the secondaries
+ ReplTime lastOpApplied = cc().getLastOp().asDate();
+
+ {
+ // 4. do bulk of mods
+ state = CATCHUP;
+ while ( true ) {
+ BSONObj res;
+ if ( ! conn->runCommand( "admin" , BSON( "_transferMods" << 1 ) , res ) ) {
+ state = FAIL;
+ errmsg = "_transferMods failed: ";
+ errmsg += res.toString();
+ error() << "_transferMods failed: " << res << migrateLog;
+ conn.done();
+ return;
+ }
+ if ( res["size"].number() == 0 )
+ break;
+
+ apply( res , &lastOpApplied );
+
+ const int maxIterations = 3600*50;
+ int i;
+ for ( i=0;i<maxIterations; i++) {
+ if ( state == ABORT ) {
+ timing.note( "aborted" );
+ return;
+ }
+
+ if ( opReplicatedEnough( lastOpApplied ) )
+ break;
+
+ if ( i > 100 ) {
+ warning() << "secondaries having hard time keeping up with migrate" << migrateLog;
+ }
+
+ sleepmillis( 20 );
+ }
+
+ if ( i == maxIterations ) {
+ errmsg = "secondary can't keep up with migrate";
+ error() << errmsg << migrateLog;
+ conn.done();
+ state = FAIL;
+ return;
+ }
+ }
+
+ timing.done(4);
+ }
+
+ {
+ // pause to wait for replication
+ // this will prevent us from going into critical section until we're ready
+ Timer t;
+ while ( t.minutes() < 600 ) {
+ if ( flushPendingWrites( lastOpApplied ) )
+ break;
+ sleepsecs(1);
+ }
+ }
+
+ {
+ // 5. wait for commit
+
+ state = STEADY;
+ while ( state == STEADY || state == COMMIT_START ) {
+ BSONObj res;
+ if ( ! conn->runCommand( "admin" , BSON( "_transferMods" << 1 ) , res ) ) {
+ log() << "_transferMods failed in STEADY state: " << res << migrateLog;
+ errmsg = res.toString();
+ state = FAIL;
+ conn.done();
+ return;
+ }
+
+ if ( res["size"].number() > 0 && apply( res , &lastOpApplied ) )
+ continue;
+
+ if ( state == ABORT ) {
+ timing.note( "aborted" );
+ return;
+ }
+
+ if ( state == COMMIT_START ) {
+ if ( flushPendingWrites( lastOpApplied ) )
+ break;
+ }
+
+ sleepmillis( 10 );
+ }
+
+ if ( state == FAIL ) {
+ errmsg = "imted out waiting for commit";
+ return;
+ }
+
+ timing.done(5);
+ }
+
+ state = DONE;
+ conn.done();
+ }
+
+ void status( BSONObjBuilder& b ) {
+ b.appendBool( "active" , getActive() );
+
+ b.append( "ns" , ns );
+ b.append( "from" , from );
+ b.append( "min" , min );
+ b.append( "max" , max );
+
+ b.append( "state" , stateString() );
+ if ( state == FAIL )
+ b.append( "errmsg" , errmsg );
+ {
+ BSONObjBuilder bb( b.subobjStart( "counts" ) );
+ bb.append( "cloned" , numCloned );
+ bb.append( "clonedBytes" , clonedBytes );
+ bb.append( "catchup" , numCatchup );
+ bb.append( "steady" , numSteady );
+ bb.done();
+ }
+
+
+ }
+
+ bool apply( const BSONObj& xfer , ReplTime* lastOpApplied ) {
+ ReplTime dummy;
+ if ( lastOpApplied == NULL ) {
+ lastOpApplied = &dummy;
+ }
+
+ bool didAnything = false;
+
+ if ( xfer["deleted"].isABSONObj() ) {
+ writelock lk(ns);
+ Client::Context cx(ns);
+
+ RemoveSaver rs( "moveChunk" , ns , "removedDuring" );
+
+ BSONObjIterator i( xfer["deleted"].Obj() );
+ while ( i.more() ) {
+ BSONObj id = i.next().Obj();
+
+ // do not apply deletes if they do not belong to the chunk being migrated
+ BSONObj fullObj;
+ if ( Helpers::findById( cc() , ns.c_str() , id, fullObj ) ) {
+ if ( ! isInRange( fullObj , min , max ) ) {
+ log() << "not applying out of range deletion: " << fullObj << migrateLog;
+
+ continue;
+ }
+ }
+
+ Helpers::removeRange( ns , id , id, false , true , cmdLine.moveParanoia ? &rs : 0 );
+
+ *lastOpApplied = cx.getClient()->getLastOp().asDate();
+ didAnything = true;
+ }
+ }
+
+ if ( xfer["reload"].isABSONObj() ) {
+ writelock lk(ns);
+ Client::Context cx(ns);
+
+ BSONObjIterator i( xfer["reload"].Obj() );
+ while ( i.more() ) {
+ BSONObj it = i.next().Obj();
+
+ Helpers::upsert( ns , it );
+
+ *lastOpApplied = cx.getClient()->getLastOp().asDate();
+ didAnything = true;
+ }
+ }
+
+ return didAnything;
+ }
+
+ bool opReplicatedEnough( const ReplTime& lastOpApplied ) {
+ // if replication is on, try to force enough secondaries to catch up
+ // TODO opReplicatedEnough should eventually honor priorities and geo-awareness
+ // for now, we try to replicate to a sensible number of secondaries
+ return mongo::opReplicatedEnough( lastOpApplied , slaveCount );
+ }
+
+ bool flushPendingWrites( const ReplTime& lastOpApplied ) {
+ if ( ! opReplicatedEnough( lastOpApplied ) ) {
+ OpTime op( lastOpApplied );
+ OCCASIONALLY warning() << "migrate commit waiting for " << slaveCount
+ << " slaves for '" << ns << "' " << min << " -> " << max
+ << " waiting for: " << op
+ << migrateLog;
+ return false;
+ }
+
+ log() << "migrate commit succeeded flushing to secondaries for '" << ns << "' " << min << " -> " << max << migrateLog;
+
+ {
+ readlock lk(ns); // commitNow() currently requires it
+
+ // if durability is on, force a write to journal
+ if ( getDur().commitNow() ) {
+ log() << "migrate commit flushed to journal for '" << ns << "' " << min << " -> " << max << migrateLog;
+ }
+ }
+
+ return true;
+ }
+
+ string stateString() {
+ switch ( state ) {
+ case READY: return "ready";
+ case CLONE: return "clone";
+ case CATCHUP: return "catchup";
+ case STEADY: return "steady";
+ case COMMIT_START: return "commitStart";
+ case DONE: return "done";
+ case FAIL: return "fail";
+ case ABORT: return "abort";
+ }
+ assert(0);
+ return "";
+ }
+
+ bool startCommit() {
+ if ( state != STEADY )
+ return false;
+ state = COMMIT_START;
+
+ Timer t;
+ // we wait for the commit to succeed before giving up
+ while ( t.minutes() <= 5 ) {
+ sleepmillis(1);
+ if ( state == DONE )
+ return true;
+ }
+ state = FAIL;
+ log() << "startCommit never finished!" << migrateLog;
+ return false;
+ }
+
+ void abort() {
+ state = ABORT;
+ errmsg = "aborted";
+ }
+
+ bool getActive() const { scoped_lock l(m_active); return active; }
+ void setActive( bool b ) { scoped_lock l(m_active); active = b; }
+
+ mutable mongo::mutex m_active;
+ bool active;
+
+ string ns;
+ string from;
+
+ BSONObj min;
+ BSONObj max;
+
+ long long numCloned;
+ long long clonedBytes;
+ long long numCatchup;
+ long long numSteady;
+
+ int slaveCount;
+
+ enum State { READY , CLONE , CATCHUP , STEADY , COMMIT_START , DONE , FAIL , ABORT } state;
+ string errmsg;
+
+ } migrateStatus;
+
+ void migrateThread() {
+ Client::initThread( "migrateThread" );
+ if (!noauth) {
+ ShardedConnectionInfo::addHook();
+ cc().getAuthenticationInfo()->authorize("local", internalSecurity.user);
+ }
+ migrateStatus.go();
+ cc().shutdown();
+ }
+
+ class RecvChunkStartCommand : public ChunkCommandHelper {
+ public:
+ RecvChunkStartCommand() : ChunkCommandHelper( "_recvChunkStart" ) {}
+
+        virtual LockType locktype() const { return WRITE; } // this is so we don't have to do locking internally
+
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+
+ if ( migrateStatus.getActive() ) {
+ errmsg = "migrate already in progress";
+ return false;
+ }
+
+ if ( OldDataCleanup::_numThreads > 0 ) {
+ errmsg =
+ str::stream()
+ << "still waiting for a previous migrates data to get cleaned, can't accept new chunks, num threads: "
+ << OldDataCleanup::_numThreads;
+ return false;
+ }
+
+ if ( ! configServer.ok() )
+ configServer.init( cmdObj["configServer"].String() );
+
+ migrateStatus.prepare();
+
+ migrateStatus.ns = cmdObj.firstElement().String();
+ migrateStatus.from = cmdObj["from"].String();
+ migrateStatus.min = cmdObj["min"].Obj().getOwned();
+ migrateStatus.max = cmdObj["max"].Obj().getOwned();
+
+ boost::thread m( migrateThread );
+
+ result.appendBool( "started" , true );
+ return true;
+ }
+
+ } recvChunkStartCmd;
+
+ class RecvChunkStatusCommand : public ChunkCommandHelper {
+ public:
+ RecvChunkStatusCommand() : ChunkCommandHelper( "_recvChunkStatus" ) {}
+
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ migrateStatus.status( result );
+ return 1;
+ }
+
+ } recvChunkStatusCommand;
+
+ class RecvChunkCommitCommand : public ChunkCommandHelper {
+ public:
+ RecvChunkCommitCommand() : ChunkCommandHelper( "_recvChunkCommit" ) {}
+
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ bool ok = migrateStatus.startCommit();
+ migrateStatus.status( result );
+ return ok;
+ }
+
+ } recvChunkCommitCommand;
+
+ class RecvChunkAbortCommand : public ChunkCommandHelper {
+ public:
+ RecvChunkAbortCommand() : ChunkCommandHelper( "_recvChunkAbort" ) {}
+
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ migrateStatus.abort();
+ migrateStatus.status( result );
+ return true;
+ }
+
+    } recvChunkAbortCommand;
+
+
+ class IsInRangeTest : public UnitTest {
+ public:
+ void run() {
+ BSONObj min = BSON( "x" << 1 );
+ BSONObj max = BSON( "x" << 5 );
+
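+            // isInRange treats the chunk as a half-open interval: min is inclusive, max is exclusive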
+ assert( ! isInRange( BSON( "x" << 0 ) , min , max ) );
+ assert( isInRange( BSON( "x" << 1 ) , min , max ) );
+ assert( isInRange( BSON( "x" << 3 ) , min , max ) );
+ assert( isInRange( BSON( "x" << 4 ) , min , max ) );
+ assert( ! isInRange( BSON( "x" << 5 ) , min , max ) );
+ assert( ! isInRange( BSON( "x" << 6 ) , min , max ) );
+
+ LOG(1) << "isInRangeTest passed" << migrateLog;
+ }
+ } isInRangeTest;
+}
diff --git a/src/mongo/s/d_split.cpp b/src/mongo/s/d_split.cpp
new file mode 100644
index 00000000000..d0ba7b44c10
--- /dev/null
+++ b/src/mongo/s/d_split.cpp
@@ -0,0 +1,830 @@
+// @file d_split.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include <map>
+#include <string>
+
+#include "../db/btree.h"
+#include "../db/commands.h"
+#include "../db/jsobj.h"
+#include "../db/instance.h"
+#include "../db/queryoptimizer.h"
+#include "../db/clientcursor.h"
+
+#include "../client/connpool.h"
+#include "../client/distlock.h"
+#include "../util/timer.h"
+
+#include "chunk.h" // for static genID only
+#include "config.h"
+#include "d_logic.h"
+
+namespace mongo {
+
+ // TODO: Fold these checks into each command.
+ static IndexDetails *cmdIndexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern ) {
+ if ( ns[ 0 ] == '\0' || min.isEmpty() || max.isEmpty() ) {
+ errmsg = "invalid command syntax (note: min and max are required)";
+ return 0;
+ }
+ return indexDetailsForRange( ns, errmsg, min, max, keyPattern );
+ }
+
+
+ class CmdMedianKey : public Command {
+ public:
+ CmdMedianKey() : Command( "medianKey" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return READ; }
+ virtual void help( stringstream &help ) const {
+ help <<
+ "Internal command.\n"
+ "example: { medianKey:\"blog.posts\", keyPattern:{x:1}, min:{x:10}, max:{x:55} }\n"
+ "NOTE: This command may take a while to run";
+ }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ const char *ns = jsobj.getStringField( "medianKey" );
+ BSONObj min = jsobj.getObjectField( "min" );
+ BSONObj max = jsobj.getObjectField( "max" );
+ BSONObj keyPattern = jsobj.getObjectField( "keyPattern" );
+
+ Client::Context ctx( ns );
+
+ IndexDetails *id = cmdIndexDetailsForRange( ns, errmsg, min, max, keyPattern );
+ if ( id == 0 )
+ return false;
+
+ Timer timer;
+ int num = 0;
+ NamespaceDetails *d = nsdetails(ns);
+ int idxNo = d->idxNo(*id);
+
+            // only yielding on the first half for now
+            // after this it should be in RAM, so the 2nd pass should be fast
+ {
+ shared_ptr<Cursor> c( BtreeCursor::make( d, idxNo, *id, min, max, false, 1 ) );
+ auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) );
+ while ( c->ok() ) {
+ num++;
+ c->advance();
+ if ( ! cc->yieldSometimes( ClientCursor::DontNeed ) ) {
+ cc.release();
+ break;
+ }
+ }
+ }
+
+ num /= 2;
+
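+            // walk a fresh cursor forward over half the entries counted above; the key it lands on is the median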
+ auto_ptr<BtreeCursor> _c( BtreeCursor::make( d, idxNo, *id, min, max, false, 1 ) );
+ BtreeCursor& c = *_c;
+ for( ; num; c.advance(), --num );
+
+ ostringstream os;
+ os << "Finding median for index: " << keyPattern << " between " << min << " and " << max;
+ logIfSlow( timer , os.str() );
+
+ if ( !c.ok() ) {
+ errmsg = "no index entries in the specified range";
+ return false;
+ }
+
+ BSONObj median = c.prettyKey( c.currKey() );
+ result.append( "median", median );
+
+ int x = median.woCompare( min , BSONObj() , false );
+ int y = median.woCompare( max , BSONObj() , false );
+ if ( x == 0 || y == 0 ) {
+                // it's on an edge, ok
+ }
+ else if ( x < 0 && y < 0 ) {
+ log( LL_ERROR ) << "median error (1) min: " << min << " max: " << max << " median: " << median << endl;
+ errmsg = "median error 1";
+ return false;
+ }
+ else if ( x > 0 && y > 0 ) {
+ log( LL_ERROR ) << "median error (2) min: " << min << " max: " << max << " median: " << median << endl;
+ errmsg = "median error 2";
+ return false;
+ }
+
+ return true;
+ }
+ } cmdMedianKey;
+
+ class CheckShardingIndex : public Command {
+ public:
+ CheckShardingIndex() : Command( "checkShardingIndex" , false ) {}
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return READ; }
+ virtual void help( stringstream &help ) const {
+ help << "Internal command.\n";
+ }
+
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+
+ const char* ns = jsobj.getStringField( "checkShardingIndex" );
+ BSONObj keyPattern = jsobj.getObjectField( "keyPattern" );
+
+ if ( keyPattern.nFields() == 1 && str::equals( "_id" , keyPattern.firstElementFieldName() ) ) {
+ result.appendBool( "idskip" , true );
+ return true;
+ }
+
+ // If min and max are not provided use the "minKey" and "maxKey" for the sharding key pattern.
+ BSONObj min = jsobj.getObjectField( "min" );
+ BSONObj max = jsobj.getObjectField( "max" );
+ if ( min.isEmpty() && max.isEmpty() ) {
+ BSONObjBuilder minBuilder;
+ BSONObjBuilder maxBuilder;
+ BSONForEach(key, keyPattern) {
+ minBuilder.appendMinKey( key.fieldName() );
+ maxBuilder.appendMaxKey( key.fieldName() );
+ }
+ min = minBuilder.obj();
+ max = maxBuilder.obj();
+ }
+ else if ( min.isEmpty() || max.isEmpty() ) {
+ errmsg = "either provide both min and max or leave both empty";
+ return false;
+ }
+
+ Client::Context ctx( ns );
+ NamespaceDetails *d = nsdetails( ns );
+ if ( ! d ) {
+ errmsg = "ns not found";
+ return false;
+ }
+
+ IndexDetails *idx = cmdIndexDetailsForRange( ns , errmsg , min , max , keyPattern );
+ if ( idx == NULL ) {
+ errmsg = "couldn't find index over splitting key";
+ return false;
+ }
+
+ if( d->isMultikey( d->idxNo( *idx ) ) ) {
+ errmsg = "index is multikey, cannot use for sharding";
+ return false;
+ }
+
+ BtreeCursor * bc = BtreeCursor::make( d , d->idxNo(*idx) , *idx , min , max , false , 1 );
+ shared_ptr<Cursor> c( bc );
+ auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) );
+ if ( ! cc->ok() ) {
+ // range is empty
+ return true;
+ }
+
+ // for now, the only check is that all shard keys are filled
+ // null is ok,
+            // TODO if $exists for nulls were picking the index, it could be used instead efficiently
+ while ( cc->ok() ) {
+ BSONObj currKey = c->currKey();
+
+ BSONObjIterator i( currKey );
+ int n = 0;
+ while ( i.more() ) {
+ BSONElement key = i.next();
+ n++;
+
+ if ( key.type() && key.type() != jstNULL )
+ continue;
+
+ BSONObj obj = c->current();
+ BSONObjIterator j( keyPattern );
+ BSONElement real;
+ for ( int x=0; x<n; x++ )
+ real = j.next();
+
+ real = obj.getFieldDotted( real.fieldName() );
+
+ if ( real.type() )
+ continue;
+
+ ostringstream os;
+ os << "found null value in key " << bc->prettyKey( currKey ) << " for doc: " << ( obj["_id"].eoo() ? obj.toString() : obj["_id"].toString() );
+ log() << "checkShardingIndex for '" << ns << "' failed: " << os.str() << endl;
+
+ errmsg = os.str();
+ return false;
+ }
+ cc->advance();
+
+ if ( ! cc->yieldSometimes( ClientCursor::DontNeed ) ) {
+ cc.release();
+ break;
+ }
+ }
+
+ return true;
+ }
+ } cmdCheckShardingIndex;
+
+ class SplitVector : public Command {
+ public:
+ SplitVector() : Command( "splitVector" , false ) {}
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return READ; }
+ virtual void help( stringstream &help ) const {
+ help <<
+ "Internal command.\n"
+ "examples:\n"
+ " { splitVector : \"blog.post\" , keyPattern:{x:1} , min:{x:10} , max:{x:20}, maxChunkSize:200 }\n"
+ " maxChunkSize unit in MBs\n"
+ " May optionally specify 'maxSplitPoints' and 'maxChunkObjects' to avoid traversing the whole chunk\n"
+ " \n"
+ " { splitVector : \"blog.post\" , keyPattern:{x:1} , min:{x:10} , max:{x:20}, force: true }\n"
+ " 'force' will produce one split point even if data is small; defaults to false\n"
+ "NOTE: This command may take a while to run";
+ }
+
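+        // For illustration only (hypothetical numbers): on a chunk holding ~400MB of data,
+        // { splitVector: "test.foo", keyPattern: { x: 1 }, maxChunkSize: 200 } would return
+        // roughly one split point near the 200MB mark, e.g. { splitKeys: [ { x: 42 } ], ok: 1 }.
+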
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+
+ //
+            // 1.a We'll parse the parameters in two steps. First, make sure that we can use the split index to get
+ // a good approximation of the size of the chunk -- without needing to access the actual data.
+ //
+
+ const char* ns = jsobj.getStringField( "splitVector" );
+ BSONObj keyPattern = jsobj.getObjectField( "keyPattern" );
+
+ // If min and max are not provided use the "minKey" and "maxKey" for the sharding key pattern.
+ BSONObj min = jsobj.getObjectField( "min" );
+ BSONObj max = jsobj.getObjectField( "max" );
+ if ( min.isEmpty() && max.isEmpty() ) {
+ BSONObjBuilder minBuilder;
+ BSONObjBuilder maxBuilder;
+ BSONForEach(key, keyPattern) {
+ minBuilder.appendMinKey( key.fieldName() );
+ maxBuilder.appendMaxKey( key.fieldName() );
+ }
+ min = minBuilder.obj();
+ max = maxBuilder.obj();
+ }
+ else if ( min.isEmpty() || max.isEmpty() ) {
+ errmsg = "either provide both min and max or leave both empty";
+ return false;
+ }
+
+ long long maxSplitPoints = 0;
+ BSONElement maxSplitPointsElem = jsobj[ "maxSplitPoints" ];
+ if ( maxSplitPointsElem.isNumber() ) {
+ maxSplitPoints = maxSplitPointsElem.numberLong();
+ }
+
+ long long maxChunkObjects = Chunk::MaxObjectPerChunk;
+ BSONElement MaxChunkObjectsElem = jsobj[ "maxChunkObjects" ];
+ if ( MaxChunkObjectsElem.isNumber() ) {
+ maxChunkObjects = MaxChunkObjectsElem.numberLong();
+ }
+
+ vector<BSONObj> splitKeys;
+
+ {
+ // Get the size estimate for this namespace
+ Client::Context ctx( ns );
+ NamespaceDetails *d = nsdetails( ns );
+ if ( ! d ) {
+ errmsg = "ns not found";
+ return false;
+ }
+
+ IndexDetails *idx = cmdIndexDetailsForRange( ns , errmsg , min , max , keyPattern );
+ if ( idx == NULL ) {
+ errmsg = "couldn't find index over splitting key";
+ return false;
+ }
+
+ const long long recCount = d->stats.nrecords;
+ const long long dataSize = d->stats.datasize;
+
+ //
+ // 1.b Now that we have the size estimate, go over the remaining parameters and apply any maximum size
+ // restrictions specified there.
+ //
+
+ // 'force'-ing a split is equivalent to having maxChunkSize be the size of the current chunk, i.e., the
+ // logic below will split that chunk in half
+ long long maxChunkSize = 0;
+ bool force = false;
+ {
+ BSONElement maxSizeElem = jsobj[ "maxChunkSize" ];
+ BSONElement forceElem = jsobj[ "force" ];
+
+ if ( forceElem.trueValue() ) {
+ force = true;
+ maxChunkSize = dataSize;
+
+ }
+ else if ( maxSizeElem.isNumber() ) {
+ maxChunkSize = maxSizeElem.numberLong() * 1<<20;
+
+ }
+ else {
+ maxSizeElem = jsobj["maxChunkSizeBytes"];
+ if ( maxSizeElem.isNumber() ) {
+ maxChunkSize = maxSizeElem.numberLong();
+ }
+ }
+
+ if ( maxChunkSize <= 0 ) {
+ errmsg = "need to specify the desired max chunk size (maxChunkSize or maxChunkSizeBytes)";
+ return false;
+ }
+ }
+
+
+ // If there's not enough data for more than one chunk, no point continuing.
+ if ( dataSize < maxChunkSize || recCount == 0 ) {
+ vector<BSONObj> emptyVector;
+ result.append( "splitKeys" , emptyVector );
+ return true;
+ }
+
+ log() << "request split points lookup for chunk " << ns << " " << min << " -->> " << max << endl;
+
+                // We'll use the average object size and number of objects to find approximately how many keys
+ // each chunk should have. We'll split at half the maxChunkSize or maxChunkObjects, if
+ // provided.
+ const long long avgRecSize = dataSize / recCount;
+ long long keyCount = maxChunkSize / (2 * avgRecSize);
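+                // e.g. (hypothetical numbers) maxChunkSize = 64MB and avgRecSize = 1KB gives
+                // keyCount = 64MB / 2KB = 32768, i.e. one split candidate every ~32k keys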
+ if ( maxChunkObjects && ( maxChunkObjects < keyCount ) ) {
+ log() << "limiting split vector to " << maxChunkObjects << " (from " << keyCount << ") objects " << endl;
+ keyCount = maxChunkObjects;
+ }
+
+ //
+ // 2. Traverse the index and add the keyCount-th key to the result vector. If that key
+ // appeared in the vector before, we omit it. The invariant here is that all the
+ // instances of a given key value live in the same chunk.
+ //
+
+ Timer timer;
+ long long currCount = 0;
+ long long numChunks = 0;
+
+ BtreeCursor * bc = BtreeCursor::make( d , d->idxNo(*idx) , *idx , min , max , false , 1 );
+ shared_ptr<Cursor> c( bc );
+ auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) );
+ if ( ! cc->ok() ) {
+ errmsg = "can't open a cursor for splitting (desired range is possibly empty)";
+ return false;
+ }
+
+ // Use every 'keyCount'-th key as a split point. We add the initial key as a sentinel, to be removed
+ // at the end. If a key appears more times than entries allowed on a chunk, we issue a warning and
+ // split on the following key.
+ set<BSONObj> tooFrequentKeys;
+ splitKeys.push_back( c->currKey().getOwned() );
+ while ( 1 ) {
+ while ( cc->ok() ) {
+ currCount++;
+ BSONObj currKey = c->currKey();
+
+ DEV assert( currKey.woCompare( max ) <= 0 );
+
+ if ( currCount > keyCount ) {
+ // Do not use this split key if it is the same used in the previous split point.
+ if ( currKey.woCompare( splitKeys.back() ) == 0 ) {
+ tooFrequentKeys.insert( currKey.getOwned() );
+
+ }
+ else {
+ splitKeys.push_back( currKey.getOwned() );
+ currCount = 0;
+ numChunks++;
+
+ LOG(4) << "picked a split key: " << bc->prettyKey( currKey ) << endl;
+ }
+
+ }
+
+ cc->advance();
+
+ // Stop if we have enough split points.
+ if ( maxSplitPoints && ( numChunks >= maxSplitPoints ) ) {
+ log() << "max number of requested split points reached (" << numChunks
+ << ") before the end of chunk " << ns << " " << min << " -->> " << max
+ << endl;
+ break;
+ }
+
+ if ( ! cc->yieldSometimes( ClientCursor::DontNeed ) ) {
+                        // we were near the end and got pushed to it
+                        // returning the splits we've already found should be fine
+
+                        // don't use the btree cursor pointer to access keys beyond this point, but it's ok
+                        // to use it to format the keys we've got already
+ cc.release();
+ break;
+ }
+ }
+
+ if ( splitKeys.size() > 1 || ! force )
+ break;
+
+ force = false;
+ keyCount = currCount / 2;
+ currCount = 0;
+ log() << "splitVector doing another cycle because of force, keyCount now: " << keyCount << endl;
+
+ bc = BtreeCursor::make( d , d->idxNo(*idx) , *idx , min , max , false , 1 );
+ c.reset( bc );
+ cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) );
+ }
+
+ //
+ // 3. Format the result and issue any warnings about the data we gathered while traversing the
+ // index
+ //
+
+ // Warn for keys that are more numerous than maxChunkSize allows.
+ for ( set<BSONObj>::const_iterator it = tooFrequentKeys.begin(); it != tooFrequentKeys.end(); ++it ) {
+ warning() << "chunk is larger than " << maxChunkSize
+ << " bytes because of key " << bc->prettyKey( *it ) << endl;
+ }
+
+ // Remove the sentinel at the beginning before returning and add fieldnames.
+ splitKeys.erase( splitKeys.begin() );
+ assert( c.get() );
+ for ( vector<BSONObj>::iterator it = splitKeys.begin(); it != splitKeys.end() ; ++it ) {
+ *it = bc->prettyKey( *it );
+ }
+
+ if ( timer.millis() > cmdLine.slowMS ) {
+ warning() << "Finding the split vector for " << ns << " over "<< keyPattern
+ << " keyCount: " << keyCount << " numSplits: " << splitKeys.size()
+ << " lookedAt: " << currCount << " took " << timer.millis() << "ms"
+ << endl;
+ }
+
+ // Warning: we are sending back an array of keys but are currently limited to
+            // 4MB worth of 'result' size. This should be okay for now.
+
+ }
+
+ result.append( "splitKeys" , splitKeys );
+
+ return true;
+
+ }
+ } cmdSplitVector;
+
+ // ** temporary ** 2010-10-22
+    // ChunkInfo is a helper to collect and log information about the chunks generated in splitChunk.
+ // It should hold the chunk state for this module only, while we don't have min/max key info per chunk on the
+ // mongod side. Do not build on this; it will go away.
+ struct ChunkInfo {
+ BSONObj min;
+ BSONObj max;
+ ShardChunkVersion lastmod;
+
+ ChunkInfo() { }
+ ChunkInfo( BSONObj aMin , BSONObj aMax , ShardChunkVersion aVersion ) : min(aMin) , max(aMax) , lastmod(aVersion) {}
+ void appendShortVersion( const char* name, BSONObjBuilder& b ) const;
+ string toString() const;
+ };
+
+ void ChunkInfo::appendShortVersion( const char * name , BSONObjBuilder& b ) const {
+ BSONObjBuilder bb( b.subobjStart( name ) );
+ bb.append( "min" , min );
+ bb.append( "max" , max );
+ bb.appendTimestamp( "lastmod" , lastmod );
+ bb.done();
+ }
+
+ string ChunkInfo::toString() const {
+ ostringstream os;
+ os << "lastmod: " << lastmod.toString() << " min: " << min << " max: " << endl;
+ return os.str();
+ }
+ // ** end temporary **
+
+ class SplitChunkCommand : public Command {
+ public:
+ SplitChunkCommand() : Command( "splitChunk" ) {}
+ virtual void help( stringstream& help ) const {
+ help <<
+ "internal command usage only\n"
+ "example:\n"
+ " { splitChunk:\"db.foo\" , keyPattern: {a:1} , min : {a:100} , max: {a:200} { splitKeys : [ {a:150} , ... ]}";
+ }
+
+ virtual bool slaveOk() const { return false; }
+ virtual bool adminOnly() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+
+ //
+ // 1. check whether parameters passed to splitChunk are sound
+ //
+
+ const string ns = cmdObj.firstElement().str();
+ if ( ns.empty() ) {
+ errmsg = "need to specify namespace in command";
+ return false;
+ }
+
+ const BSONObj keyPattern = cmdObj["keyPattern"].Obj();
+ if ( keyPattern.isEmpty() ) {
+ errmsg = "need to specify the key pattern the collection is sharded over";
+ return false;
+ }
+
+ const BSONObj min = cmdObj["min"].Obj();
+ if ( min.isEmpty() ) {
+ errmsg = "need to specify the min key for the chunk";
+ return false;
+ }
+
+ const BSONObj max = cmdObj["max"].Obj();
+ if ( max.isEmpty() ) {
+ errmsg = "need to specify the max key for the chunk";
+ return false;
+ }
+
+ const string from = cmdObj["from"].str();
+ if ( from.empty() ) {
+ errmsg = "need specify server to split chunk at";
+ return false;
+ }
+
+ const BSONObj splitKeysElem = cmdObj["splitKeys"].Obj();
+ if ( splitKeysElem.isEmpty() ) {
+ errmsg = "need to provide the split points to chunk over";
+ return false;
+ }
+ vector<BSONObj> splitKeys;
+ BSONObjIterator it( splitKeysElem );
+ while ( it.more() ) {
+ splitKeys.push_back( it.next().Obj().getOwned() );
+ }
+
+ const BSONElement shardId = cmdObj["shardId"];
+ if ( shardId.eoo() ) {
+ errmsg = "need to provide shardId";
+ return false;
+ }
+
+            // It is possible that this is the first sharded command this mongod is asked to perform. If so,
+            // start the sharding apparatus. We'd still be missing some more shard-related info but we'll get it
+ // in step 2. below.
+ if ( ! shardingState.enabled() ) {
+ if ( cmdObj["configdb"].type() != String ) {
+ errmsg = "sharding not enabled";
+ return false;
+ }
+ string configdb = cmdObj["configdb"].String();
+ shardingState.enable( configdb );
+ configServer.init( configdb );
+ }
+
+ Shard myShard( from );
+
+ log() << "received splitChunk request: " << cmdObj << endl;
+
+ //
+ // 2. lock the collection's metadata and get highest version for the current shard
+ //
+
+ DistributedLock lockSetup( ConnectionString( shardingState.getConfigServer() , ConnectionString::SYNC) , ns );
+ dist_lock_try dlk;
+
+ try{
+ dlk = dist_lock_try( &lockSetup, string("split-") + min.toString() );
+ }
+ catch( LockException& e ){
+ errmsg = str::stream() << "Error locking distributed lock for split." << causedBy( e );
+ return false;
+ }
+
+ if ( ! dlk.got() ) {
+ errmsg = "the collection's metadata lock is taken";
+ result.append( "who" , dlk.other() );
+ return false;
+ }
+
+ // TODO This is a check migrate does to the letter. Factor it out and share. 2010-10-22
+
+ ShardChunkVersion maxVersion;
+ string shard;
+ ChunkInfo origChunk;
+ {
+ ScopedDbConnection conn( shardingState.getConfigServer() );
+
+ BSONObj x = conn->findOne( ShardNS::chunk , Query( BSON( "ns" << ns ) ).sort( BSON( "lastmod" << -1 ) ) );
+ maxVersion = x["lastmod"];
+
+ BSONObj currChunk = conn->findOne( ShardNS::chunk , shardId.wrap( "_id" ) ).getOwned();
+ assert( currChunk["shard"].type() );
+ assert( currChunk["min"].type() );
+ assert( currChunk["max"].type() );
+ shard = currChunk["shard"].String();
+ conn.done();
+
+ BSONObj currMin = currChunk["min"].Obj();
+ BSONObj currMax = currChunk["max"].Obj();
+ if ( currMin.woCompare( min ) || currMax.woCompare( max ) ) {
+ errmsg = "chunk boundaries are outdated (likely a split occurred)";
+ result.append( "currMin" , currMin );
+ result.append( "currMax" , currMax );
+ result.append( "requestedMin" , min );
+ result.append( "requestedMax" , max );
+
+ log( LL_WARNING ) << "aborted split because " << errmsg << ": " << min << "->" << max
+ << " is now " << currMin << "->" << currMax << endl;
+ return false;
+ }
+
+ if ( shard != myShard.getName() ) {
+ errmsg = "location is outdated (likely balance or migrate occurred)";
+ result.append( "from" , myShard.getName() );
+ result.append( "official" , shard );
+
+ log( LL_WARNING ) << "aborted split because " << errmsg << ": chunk is at " << shard
+ << " and not at " << myShard.getName() << endl;
+ return false;
+ }
+
+ if ( maxVersion < shardingState.getVersion( ns ) ) {
+ errmsg = "official version less than mine?";
+ result.appendTimestamp( "officialVersion" , maxVersion );
+ result.appendTimestamp( "myVersion" , shardingState.getVersion( ns ) );
+
+ log( LL_WARNING ) << "aborted split because " << errmsg << ": official " << maxVersion
+ << " mine: " << shardingState.getVersion(ns) << endl;
+ return false;
+ }
+
+ origChunk.min = currMin.getOwned();
+ origChunk.max = currMax.getOwned();
+ origChunk.lastmod = currChunk["lastmod"];
+
+                // since this could be the first call that enables sharding, we also make sure to have the chunk manager up to date
+ shardingState.gotShardName( shard );
+ ShardChunkVersion shardVersion;
+ shardingState.trySetVersion( ns , shardVersion /* will return updated */ );
+
+ log() << "splitChunk accepted at version " << shardVersion << endl;
+
+ }
+
+ //
+ // 3. create the batch of updates to metadata ( the new chunks ) to be applied via 'applyOps' command
+ //
+
+ BSONObjBuilder logDetail;
+ origChunk.appendShortVersion( "before" , logDetail );
+ LOG(1) << "before split on " << origChunk << endl;
+ vector<ChunkInfo> newChunks;
+
+ ShardChunkVersion myVersion = maxVersion;
+ BSONObj startKey = min;
+ splitKeys.push_back( max ); // makes it easier to have 'max' in the next loop. remove later.
+
+ BSONObjBuilder cmdBuilder;
+ BSONArrayBuilder updates( cmdBuilder.subarrayStart( "applyOps" ) );
+
+ for ( vector<BSONObj>::const_iterator it = splitKeys.begin(); it != splitKeys.end(); ++it ) {
+ BSONObj endKey = *it;
+
+ // splits only update the 'minor' portion of version
+ myVersion.incMinor();
+
+ // build an update operation against the chunks collection of the config database with
+ // upsert true
+ BSONObjBuilder op;
+ op.append( "op" , "u" );
+ op.appendBool( "b" , true );
+ op.append( "ns" , ShardNS::chunk );
+
+ // add the modified (new) chunk information as the update object
+ BSONObjBuilder n( op.subobjStart( "o" ) );
+ n.append( "_id" , Chunk::genID( ns , startKey ) );
+ n.appendTimestamp( "lastmod" , myVersion );
+ n.append( "ns" , ns );
+ n.append( "min" , startKey );
+ n.append( "max" , endKey );
+ n.append( "shard" , shard );
+ n.done();
+
+ // add the chunk's _id as the query part of the update statement
+ BSONObjBuilder q( op.subobjStart( "o2" ) );
+ q.append( "_id" , Chunk::genID( ns , startKey ) );
+ q.done();
+
+ updates.append( op.obj() );
+
+ // remember this chunk info for logging later
+ newChunks.push_back( ChunkInfo( startKey , endKey, myVersion ) );
+
+ startKey = endKey;
+ }
+
+ updates.done();
+
+ {
+ BSONArrayBuilder preCond( cmdBuilder.subarrayStart( "preCondition" ) );
+ BSONObjBuilder b;
+ b.append( "ns" , ShardNS::chunk );
+ b.append( "q" , BSON( "query" << BSON( "ns" << ns ) << "orderby" << BSON( "lastmod" << -1 ) ) );
+ {
+ BSONObjBuilder bb( b.subobjStart( "res" ) );
+ bb.appendTimestamp( "lastmod" , maxVersion );
+ bb.done();
+ }
+ preCond.append( b.obj() );
+ preCond.done();
+ }
+
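+            // As in the migrate path above, the assembled command is, in outline:
+            //   { applyOps : [ <one upsert per new chunk> ],
+            //     preCondition : [ { ns, q, res:{ lastmod:<maxVersion> } } ] }
+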
+ //
+ // 4. apply the batch of updates to metadata and to the chunk manager
+ //
+
+ BSONObj cmd = cmdBuilder.obj();
+
+ LOG(1) << "splitChunk update: " << cmd << endl;
+
+ bool ok;
+ BSONObj cmdResult;
+ {
+ ScopedDbConnection conn( shardingState.getConfigServer() );
+ ok = conn->runCommand( "config" , cmd , cmdResult );
+ conn.done();
+ }
+
+ if ( ! ok ) {
+ stringstream ss;
+ ss << "saving chunks failed. cmd: " << cmd << " result: " << cmdResult;
+ error() << ss.str() << endl;
+ msgasserted( 13593 , ss.str() ); // assert(13593)
+ }
+
+ // install a chunk manager with knowledge about newly split chunks in this shard's state
+ splitKeys.pop_back(); // 'max' was used as sentinel
+ maxVersion.incMinor();
+ shardingState.splitChunk( ns , min , max , splitKeys , maxVersion );
+
+ //
+ // 5. logChanges
+ //
+
+            // single splits are logged differently than multi-splits
+ if ( newChunks.size() == 2 ) {
+ newChunks[0].appendShortVersion( "left" , logDetail );
+ newChunks[1].appendShortVersion( "right" , logDetail );
+ configServer.logChange( "split" , ns , logDetail.obj() );
+
+ }
+ else {
+ BSONObj beforeDetailObj = logDetail.obj();
+ BSONObj firstDetailObj = beforeDetailObj.getOwned();
+ const int newChunksSize = newChunks.size();
+
+ for ( int i=0; i < newChunksSize; i++ ) {
+ BSONObjBuilder chunkDetail;
+ chunkDetail.appendElements( beforeDetailObj );
+ chunkDetail.append( "number", i+1 );
+ chunkDetail.append( "of" , newChunksSize );
+ newChunks[i].appendShortVersion( "chunk" , chunkDetail );
+ configServer.logChange( "multi-split" , ns , chunkDetail.obj() );
+ }
+ }
+
+ if (newChunks.size() == 2){
+ // If one of the chunks has only one object in it we should move it
+ static const BSONObj fields = BSON("_id" << 1 );
+ DBDirectClient conn;
+ for (int i=1; i >= 0 ; i--){ // high chunk more likely to have only one obj
+ ChunkInfo chunk = newChunks[i];
+ Query q = Query().minKey(chunk.min).maxKey(chunk.max);
+ scoped_ptr<DBClientCursor> c (conn.query(ns, q, /*limit*/-2, 0, &fields));
+ if (c && c->itcount() == 1) {
+ result.append("shouldMigrate", BSON("min" << chunk.min << "max" << chunk.max));
+ break;
+ }
+ }
+ }
+
+ return true;
+ }
+ } cmdSplitChunk;
+
+} // namespace mongo
diff --git a/src/mongo/s/d_state.cpp b/src/mongo/s/d_state.cpp
new file mode 100644
index 00000000000..39d84b6ff88
--- /dev/null
+++ b/src/mongo/s/d_state.cpp
@@ -0,0 +1,753 @@
+// @file d_state.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+/**
+ these are commands that live in mongod
+ mostly around shard management and checking
+ */
+
+#include "pch.h"
+#include <map>
+#include <string>
+
+#include "../db/commands.h"
+#include "../db/jsobj.h"
+#include "../db/db.h"
+#include "../db/replutil.h"
+#include "../client/connpool.h"
+
+#include "../util/queue.h"
+
+#include "shard.h"
+#include "d_logic.h"
+#include "config.h"
+
+using namespace std;
+
+namespace mongo {
+
+ // -----ShardingState START ----
+
+ ShardingState::ShardingState()
+ : _enabled(false) , _mutex( "ShardingState" ) {
+ }
+
+ void ShardingState::enable( const string& server ) {
+ _enabled = true;
+ assert( server.size() );
+ if ( _configServer.size() == 0 )
+ _configServer = server;
+ else {
+ assert( server == _configServer );
+ }
+ }
+
+ void ShardingState::gotShardName( const string& name ) {
+ scoped_lock lk(_mutex);
+ if ( _shardName.size() == 0 ) {
+ // TODO SERVER-2299 verify the name is sound w.r.t IPs
+ _shardName = name;
+ return;
+ }
+
+ if ( _shardName == name )
+ return;
+
+ stringstream ss;
+ ss << "gotShardName different than what i had before "
+ << " before [" << _shardName << "] "
+ << " got [" << name << "] "
+ ;
+ msgasserted( 13298 , ss.str() );
+ }
+
+ void ShardingState::gotShardHost( string host ) {
+ scoped_lock lk(_mutex);
+ size_t slash = host.find( '/' );
+ if ( slash != string::npos )
+ host = host.substr( 0 , slash );
+
+ if ( _shardHost.size() == 0 ) {
+ _shardHost = host;
+ return;
+ }
+
+ if ( _shardHost == host )
+ return;
+
+ stringstream ss;
+ ss << "gotShardHost different than what i had before "
+ << " before [" << _shardHost << "] "
+ << " got [" << host << "] "
+ ;
+ msgasserted( 13299 , ss.str() );
+ }
+
+ void ShardingState::resetShardingState() {
+ scoped_lock lk(_mutex);
+
+ _enabled = false;
+ _configServer.clear();
+ _shardName.clear();
+ _shardHost.clear();
+ _chunks.clear();
+ }
+
+ // TODO we shouldn't need three ways for checking the version. Fix this.
+ bool ShardingState::hasVersion( const string& ns ) {
+ scoped_lock lk(_mutex);
+
+ ChunkManagersMap::const_iterator it = _chunks.find(ns);
+ return it != _chunks.end();
+ }
+
+ bool ShardingState::hasVersion( const string& ns , ConfigVersion& version ) {
+ scoped_lock lk(_mutex);
+
+ ChunkManagersMap::const_iterator it = _chunks.find(ns);
+ if ( it == _chunks.end() )
+ return false;
+
+ ShardChunkManagerPtr p = it->second;
+ version = p->getVersion();
+ return true;
+ }
+
+ const ConfigVersion ShardingState::getVersion( const string& ns ) const {
+ scoped_lock lk(_mutex);
+
+ ChunkManagersMap::const_iterator it = _chunks.find( ns );
+ if ( it != _chunks.end() ) {
+ ShardChunkManagerPtr p = it->second;
+ return p->getVersion();
+ }
+ else {
+ return 0;
+ }
+ }
+
+ void ShardingState::donateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ShardChunkVersion version ) {
+ scoped_lock lk( _mutex );
+
+ ChunkManagersMap::const_iterator it = _chunks.find( ns );
+ assert( it != _chunks.end() ) ;
+ ShardChunkManagerPtr p = it->second;
+
+ // empty shards should have version 0
+ version = ( p->getNumChunks() > 1 ) ? version : ShardChunkVersion( 0 , 0 );
+
+ ShardChunkManagerPtr cloned( p->cloneMinus( min , max , version ) );
+ _chunks[ns] = cloned;
+ }
+
+ void ShardingState::undoDonateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ShardChunkVersion version ) {
+ scoped_lock lk( _mutex );
+
+ ChunkManagersMap::const_iterator it = _chunks.find( ns );
+ assert( it != _chunks.end() ) ;
+ ShardChunkManagerPtr p( it->second->clonePlus( min , max , version ) );
+ _chunks[ns] = p;
+ }
+
+ void ShardingState::splitChunk( const string& ns , const BSONObj& min , const BSONObj& max , const vector<BSONObj>& splitKeys ,
+ ShardChunkVersion version ) {
+ scoped_lock lk( _mutex );
+
+ ChunkManagersMap::const_iterator it = _chunks.find( ns );
+ assert( it != _chunks.end() ) ;
+ ShardChunkManagerPtr p( it->second->cloneSplit( min , max , splitKeys , version ) );
+ _chunks[ns] = p;
+ }
+
+ void ShardingState::resetVersion( const string& ns ) {
+ scoped_lock lk( _mutex );
+
+ _chunks.erase( ns );
+ }
+
+ bool ShardingState::trySetVersion( const string& ns , ConfigVersion& version /* IN-OUT */ ) {
+
+ // fast path - requested version is at the same version as this chunk manager
+ //
+ // cases:
+ // + this shard updated the version for a migrate's commit (FROM side)
+ // a client reloaded chunk state from config and picked the newest version
+ // + two clients reloaded
+ // one triggered the 'slow path' (below)
+ // when the second's request gets here, the version is already current
+ {
+ scoped_lock lk( _mutex );
+ ChunkManagersMap::const_iterator it = _chunks.find( ns );
+ if ( it != _chunks.end() && it->second->getVersion() == version )
+ return true;
+ }
+
+        // slow path - requested version is different than the current chunk manager's, if one exists, so we must check for
+ // newest version in the config server
+ //
+ // cases:
+ // + a chunk moved TO here
+ // (we don't bump up the version on the TO side but the commit to config does use higher version)
+        //       a client reloaded from config and issued the request
+ // + there was a take over from a secondary
+ // the secondary had no state (managers) at all, so every client request will fall here
+        //   + a stale client requests a version that's not current anymore
+
+ const string c = (_configServer == _shardHost) ? "" /* local */ : _configServer;
+ ShardChunkManagerPtr p( new ShardChunkManager( c , ns , _shardName ) );
+ {
+ scoped_lock lk( _mutex );
+
+            // since we loaded the chunk manager unlocked, another thread may have done the same
+ // make sure we keep the freshest config info only
+ ChunkManagersMap::const_iterator it = _chunks.find( ns );
+ if ( it == _chunks.end() || p->getVersion() >= it->second->getVersion() ) {
+ _chunks[ns] = p;
+ }
+
+ ShardChunkVersion oldVersion = version;
+ version = p->getVersion();
+ return oldVersion == version;
+ }
+ }
+
+ void ShardingState::appendInfo( BSONObjBuilder& b ) {
+ b.appendBool( "enabled" , _enabled );
+ if ( ! _enabled )
+ return;
+
+ b.append( "configServer" , _configServer );
+ b.append( "shardName" , _shardName );
+ b.append( "shardHost" , _shardHost );
+
+ {
+ BSONObjBuilder bb( b.subobjStart( "versions" ) );
+
+ scoped_lock lk(_mutex);
+
+ for ( ChunkManagersMap::iterator it = _chunks.begin(); it != _chunks.end(); ++it ) {
+ ShardChunkManagerPtr p = it->second;
+ bb.appendTimestamp( it->first , p->getVersion() );
+ }
+ bb.done();
+ }
+
+ }
+
+ bool ShardingState::needShardChunkManager( const string& ns ) const {
+ if ( ! _enabled )
+ return false;
+
+ if ( ! ShardedConnectionInfo::get( false ) )
+ return false;
+
+ return true;
+ }
+
+ ShardChunkManagerPtr ShardingState::getShardChunkManager( const string& ns ) {
+ scoped_lock lk( _mutex );
+
+ ChunkManagersMap::const_iterator it = _chunks.find( ns );
+ if ( it == _chunks.end() ) {
+ return ShardChunkManagerPtr();
+ }
+ else {
+ return it->second;
+ }
+ }
+
+ ShardingState shardingState;
+
+ // -----ShardingState END ----
+
+ // -----ShardedConnectionInfo START ----
+
+ boost::thread_specific_ptr<ShardedConnectionInfo> ShardedConnectionInfo::_tl;
+
+ ShardedConnectionInfo::ShardedConnectionInfo() {
+ _forceVersionOk = false;
+ _id.clear();
+ }
+
+ ShardedConnectionInfo* ShardedConnectionInfo::get( bool create ) {
+ ShardedConnectionInfo* info = _tl.get();
+ if ( ! info && create ) {
+ LOG(1) << "entering shard mode for connection" << endl;
+ info = new ShardedConnectionInfo();
+ _tl.reset( info );
+ }
+ return info;
+ }
+
+ void ShardedConnectionInfo::reset() {
+ _tl.reset();
+ }
+
+ const ConfigVersion ShardedConnectionInfo::getVersion( const string& ns ) const {
+ NSVersionMap::const_iterator it = _versions.find( ns );
+ if ( it != _versions.end() ) {
+ return it->second;
+ }
+ else {
+ return 0;
+ }
+ }
+
+ void ShardedConnectionInfo::setVersion( const string& ns , const ConfigVersion& version ) {
+ _versions[ns] = version;
+ }
+
+ void ShardedConnectionInfo::addHook() {
+ static bool done = false;
+ if (!done) {
+ LOG(1) << "adding sharding hook" << endl;
+ pool.addHook(new ShardingConnectionHook(false));
+ shardConnectionPool.addHook(new ShardingConnectionHook(true));
+ done = true;
+ }
+ }
+
+ void ShardedConnectionInfo::setID( const OID& id ) {
+ _id = id;
+ }
+
+ // -----ShardedConnectionInfo END ----
+
+ unsigned long long extractVersion( BSONElement e , string& errmsg ) {
+ if ( e.eoo() ) {
+ errmsg = "no version";
+ return 0;
+ }
+
+ if ( e.isNumber() )
+ return (unsigned long long)e.number();
+
+ if ( e.type() == Date || e.type() == Timestamp )
+ return e._numberLong();
+
+
+ errmsg = "version is not a numeric type";
+ return 0;
+ }
+
+ class MongodShardCommand : public Command {
+ public:
+ MongodShardCommand( const char * n ) : Command( n ) {
+ }
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual bool adminOnly() const {
+ return true;
+ }
+ };
+
+
+ bool haveLocalShardingInfo( const string& ns ) {
+ if ( ! shardingState.enabled() )
+ return false;
+
+ if ( ! shardingState.hasVersion( ns ) )
+ return false;
+
+        return ShardedConnectionInfo::get(false) != NULL;
+ }
+
+ class UnsetShardingCommand : public MongodShardCommand {
+ public:
+ UnsetShardingCommand() : MongodShardCommand("unsetSharding") {}
+
+ virtual void help( stringstream& help ) const {
+ help << " example: { unsetSharding : 1 } ";
+ }
+
+ virtual LockType locktype() const { return NONE; }
+
+ virtual bool slaveOk() const { return true; }
+
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ ShardedConnectionInfo::reset();
+ return true;
+ }
+
+ } unsetShardingCommand;
+
+ class SetShardVersion : public MongodShardCommand {
+ public:
+ SetShardVersion() : MongodShardCommand("setShardVersion") {}
+
+ virtual void help( stringstream& help ) const {
+ help << " example: { setShardVersion : 'alleyinsider.foo' , version : 1 , configdb : '' } ";
+ }
+
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+
+ bool checkConfigOrInit( const string& configdb , bool authoritative , string& errmsg , BSONObjBuilder& result , bool locked=false ) const {
+ if ( configdb.size() == 0 ) {
+ errmsg = "no configdb";
+ return false;
+ }
+
+ if ( shardingState.enabled() ) {
+ if ( configdb == shardingState.getConfigServer() )
+ return true;
+
+ result.append( "configdb" , BSON( "stored" << shardingState.getConfigServer() <<
+ "given" << configdb ) );
+ errmsg = "specified a different configdb!";
+ return false;
+ }
+
+ if ( ! authoritative ) {
+ result.appendBool( "need_authoritative" , true );
+ errmsg = "first setShardVersion";
+ return false;
+ }
+
+ if ( locked ) {
+ ShardedConnectionInfo::addHook();
+ shardingState.enable( configdb );
+ configServer.init( configdb );
+ return true;
+ }
+
+ dblock lk;
+ return checkConfigOrInit( configdb , authoritative , errmsg , result , true );
+ }
+
+ bool checkMongosID( ShardedConnectionInfo* info, const BSONElement& id, string& errmsg ) {
+ if ( id.type() != jstOID ) {
+ if ( ! info->hasID() ) {
+                    warning() << "bad serverID sent in setShardVersion and none in info: " << id << endl;
+ }
+ // TODO: fix this
+ //errmsg = "need serverID to be an OID";
+ //return 0;
+ return true;
+ }
+
+ OID clientId = id.__oid();
+ if ( ! info->hasID() ) {
+ info->setID( clientId );
+ return true;
+ }
+
+ if ( clientId != info->getID() ) {
+ errmsg = "server id has changed!";
+ return false;
+ }
+
+ return true;
+ }
+
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+
+ // Steps
+ // 1. check basic config
+ // 2. extract params from command
+ // 3. fast check
+ // 4. slow check (LOCKS)
+
+ // step 1
+
+ lastError.disableForCommand();
+ ShardedConnectionInfo* info = ShardedConnectionInfo::get( true );
+
+ // make sure we have the mongos id for writebacks
+ if ( ! checkMongosID( info , cmdObj["serverID"] , errmsg ) )
+ return false;
+
+ bool authoritative = cmdObj.getBoolField( "authoritative" );
+
+ // check config server is ok or enable sharding
+ if ( ! checkConfigOrInit( cmdObj["configdb"].valuestrsafe() , authoritative , errmsg , result ) )
+ return false;
+
+ // check shard name/hosts are correct
+ if ( cmdObj["shard"].type() == String ) {
+ shardingState.gotShardName( cmdObj["shard"].String() );
+ shardingState.gotShardHost( cmdObj["shardHost"].String() );
+ }
+
+
+ // Handle initial shard connection
+ if( cmdObj["version"].eoo() && cmdObj["init"].trueValue() ){
+ result.append( "initialized", true );
+ return true;
+ }
+
+ // we can run on a slave up to here
+ if ( ! isMaster( "admin" ) ) {
+ result.append( "errmsg" , "not master" );
+ result.append( "note" , "from post init in setShardVersion" );
+ return false;
+ }
+
+ // step 2
+
+ string ns = cmdObj["setShardVersion"].valuestrsafe();
+ if ( ns.size() == 0 ) {
+ errmsg = "need to specify namespace";
+ return false;
+ }
+
+ const ConfigVersion version = extractVersion( cmdObj["version"] , errmsg );
+ if ( errmsg.size() )
+ return false;
+
+ // step 3
+
+ const ConfigVersion oldVersion = info->getVersion(ns);
+ const ConfigVersion globalVersion = shardingState.getVersion(ns);
+
+ result.appendTimestamp( "oldVersion" , oldVersion );
+
+ if ( globalVersion > 0 && version > 0 ) {
+                // this means there is no reset going on on either side
+                // so it's safe to make some assumptions
+
+ if ( version == globalVersion ) {
+ // mongos and mongod agree!
+ if ( oldVersion != version ) {
+ if ( oldVersion < globalVersion ) {
+ info->setVersion( ns , version );
+ }
+ else if ( authoritative ) {
+ // this means there was a drop and our version is reset
+ info->setVersion( ns , version );
+ }
+ else {
+ result.append( "ns" , ns );
+ result.appendBool( "need_authoritative" , true );
+ errmsg = "verifying drop on '" + ns + "'";
+ return false;
+ }
+ }
+ return true;
+ }
+
+ }
+
+ // step 4
+
+ // this is because of a weird segfault I saw and I can't see why this should ever be set
+ massert( 13647 , str::stream() << "context should be empty here, is: " << cc().getContext()->ns() , cc().getContext() == 0 );
+
+ dblock setShardVersionLock; // TODO: can we get rid of this??
+
+ if ( oldVersion > 0 && globalVersion == 0 ) {
+ // this had been reset
+ info->setVersion( ns , 0 );
+ }
+
+ if ( version == 0 && globalVersion == 0 ) {
+ // this connection is cleaning itself
+ info->setVersion( ns , 0 );
+ return true;
+ }
+
+ if ( version == 0 && globalVersion > 0 ) {
+ if ( ! authoritative ) {
+ result.appendBool( "need_authoritative" , true );
+ result.append( "ns" , ns );
+ result.appendTimestamp( "globalVersion" , globalVersion );
+ errmsg = "dropping needs to be authoritative";
+ return false;
+ }
+ log() << "wiping data for: " << ns << endl;
+ result.appendTimestamp( "beforeDrop" , globalVersion );
+ // only setting global version on purpose
+ // need clients to re-find meta-data
+ shardingState.resetVersion( ns );
+ info->setVersion( ns , 0 );
+ return true;
+ }
+
+ if ( version < oldVersion ) {
+ errmsg = "this connection already had a newer version of collection '" + ns + "'";
+ result.append( "ns" , ns );
+ result.appendTimestamp( "newVersion" , version );
+ result.appendTimestamp( "globalVersion" , globalVersion );
+ return false;
+ }
+
+ if ( version < globalVersion ) {
+ while ( shardingState.inCriticalMigrateSection() ) {
+ dbtemprelease r;
+ sleepmillis(2);
+ OCCASIONALLY log() << "waiting till out of critical section" << endl;
+ }
+            errmsg = "shard global version for collection '" + ns + "' is higher than the version being set";
+ result.append( "ns" , ns );
+ result.appendTimestamp( "version" , version );
+ result.appendTimestamp( "globalVersion" , globalVersion );
+ result.appendBool( "reloadConfig" , true );
+ return false;
+ }
+
+ if ( globalVersion == 0 && ! authoritative ) {
+ // need authoritative for first look
+ result.append( "ns" , ns );
+ result.appendBool( "need_authoritative" , true );
+ errmsg = "first time for collection '" + ns + "'";
+ return false;
+ }
+
+ Timer relockTime;
+ {
+ dbtemprelease unlock;
+
+ ShardChunkVersion currVersion = version;
+ if ( ! shardingState.trySetVersion( ns , currVersion ) ) {
+ errmsg = str::stream() << "client version differs from config's for collection '" << ns << "'";
+ result.append( "ns" , ns );
+ result.appendTimestamp( "version" , version );
+ result.appendTimestamp( "globalVersion" , currVersion );
+ return false;
+ }
+ }
+ if ( relockTime.millis() >= ( cmdLine.slowMS - 10 ) ) {
+ log() << "setShardVersion - relocking slow: " << relockTime.millis() << endl;
+ }
+
+ info->setVersion( ns , version );
+ return true;
+ }
+
+ } setShardVersionCmd;
+
+ class GetShardVersion : public MongodShardCommand {
+ public:
+ GetShardVersion() : MongodShardCommand("getShardVersion") {}
+
+ virtual void help( stringstream& help ) const {
+ help << " example: { getShardVersion : 'alleyinsider.foo' } ";
+ }
+
+ virtual LockType locktype() const { return NONE; }
+
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string ns = cmdObj["getShardVersion"].valuestrsafe();
+ if ( ns.size() == 0 ) {
+ errmsg = "need to specify full namespace";
+ return false;
+ }
+
+ result.append( "configServer" , shardingState.getConfigServer() );
+
+ result.appendTimestamp( "global" , shardingState.getVersion(ns) );
+
+ ShardedConnectionInfo* info = ShardedConnectionInfo::get( false );
+ result.appendBool( "inShardedMode" , info != 0 );
+ if ( info )
+ result.appendTimestamp( "mine" , info->getVersion(ns) );
+ else
+ result.appendTimestamp( "mine" , 0 );
+
+ return true;
+ }
+
+ } getShardVersion;
+
+ class ShardingStateCmd : public MongodShardCommand {
+ public:
+ ShardingStateCmd() : MongodShardCommand( "shardingState" ) {}
+
+ virtual LockType locktype() const { return WRITE; } // TODO: figure out how to make this not need to lock
+
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ shardingState.appendInfo( result );
+ return true;
+ }
+
+ } shardingStateCmd;
+
+ /**
+     * @return true if not in sharded mode
+     *         or if version for this client is ok
+ */
+ bool shardVersionOk( const string& ns , string& errmsg ) {
+ if ( ! shardingState.enabled() )
+ return true;
+
+ if ( ! isMasterNs( ns.c_str() ) ) {
+ // right now connections to secondaries aren't versioned at all
+ return true;
+ }
+
+ ShardedConnectionInfo* info = ShardedConnectionInfo::get( false );
+
+ if ( ! info ) {
+ // this means the client has nothing sharded
+ // so this allows direct connections to do whatever they want
+            // which I think is the correct behavior
+ return true;
+ }
+
+ if ( info->inForceVersionOkMode() ) {
+ return true;
+ }
+
+        // TODO
+        // at some point all collections, sharded or not, will have a version (and a ShardChunkManager)
+        // for now we remove the sharding state of a dropped collection,
+        // so delayed requests may still come in. This has to be fixed.
+ ConfigVersion clientVersion = info->getVersion(ns);
+ ConfigVersion version;
+ if ( ! shardingState.hasVersion( ns , version ) && clientVersion == 0 ) {
+ return true;
+ }
+
+
+ if ( version == 0 && clientVersion > 0 ) {
+ stringstream ss;
+            ss << "collection was dropped or this shard is no longer valid; version: " << version << " clientVersion: " << clientVersion;
+ errmsg = ss.str();
+ return false;
+ }
+
+ if ( clientVersion >= version )
+ return true;
+
+
+ if ( clientVersion == 0 ) {
+ stringstream ss;
+ ss << "client in sharded mode, but doesn't have version set for this collection: " << ns << " myVersion: " << version;
+ errmsg = ss.str();
+ return false;
+ }
+
+ if ( version.majorVersion() == clientVersion.majorVersion() ) {
+ // this means there was just a split
+ // since on a split w/o a migrate this server is ok
+ // going to accept
+ return true;
+ }
+
+ stringstream ss;
+        ss << "your version is too old; ns: " << ns << " global: " << version << " client: " << clientVersion;
+ errmsg = ss.str();
+ return false;
+ }
+
+ void ShardingConnectionHook::onHandedOut( DBClientBase * conn ) {
+ // no-op for mongod
+ }
+}
diff --git a/src/mongo/s/d_writeback.cpp b/src/mongo/s/d_writeback.cpp
new file mode 100644
index 00000000000..01c0c14ac0a
--- /dev/null
+++ b/src/mongo/s/d_writeback.cpp
@@ -0,0 +1,179 @@
+// d_writeback.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "../db/commands.h"
+#include "../util/queue.h"
+#include "../util/net/listen.h"
+
+#include "d_writeback.h"
+
+using namespace std;
+
+namespace mongo {
+
+ // ---------- WriteBackManager class ----------
+
+ // TODO init at mongod startup
+ WriteBackManager writeBackManager;
+
+ WriteBackManager::WriteBackManager() : _writebackQueueLock("sharding:writebackQueueLock") {
+ }
+
+ WriteBackManager::~WriteBackManager() {
+ }
+
+ void WriteBackManager::queueWriteBack( const string& remote , const BSONObj& o ) {
+ getWritebackQueue( remote )->queue.push( o );
+ }
+
+ shared_ptr<WriteBackManager::QueueInfo> WriteBackManager::getWritebackQueue( const string& remote ) {
+ scoped_lock lk ( _writebackQueueLock );
+ shared_ptr<QueueInfo>& q = _writebackQueues[remote];
+ if ( ! q )
+ q.reset( new QueueInfo() );
+        q->lastCall = Listener::getElapsedTimeMillis(); // touch, so cleanupOldQueues() sees this queue as active
+ return q;
+ }
+
+ bool WriteBackManager::queuesEmpty() const {
+ scoped_lock lk( _writebackQueueLock );
+ for ( WriteBackQueuesMap::const_iterator it = _writebackQueues.begin(); it != _writebackQueues.end(); ++it ) {
+ const shared_ptr<QueueInfo> queue = it->second;
+ if (! queue->queue.empty() ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ void WriteBackManager::appendStats( BSONObjBuilder& b ) const {
+ BSONObjBuilder sub;
+ long long totalQueued = 0;
+ long long now = Listener::getElapsedTimeMillis();
+ {
+ scoped_lock lk( _writebackQueueLock );
+ for ( WriteBackQueuesMap::const_iterator it = _writebackQueues.begin(); it != _writebackQueues.end(); ++it ) {
+ const shared_ptr<QueueInfo> queue = it->second;
+
+ BSONObjBuilder t( sub.subobjStart( it->first ) );
+ t.appendNumber( "n" , queue->queue.size() );
+ t.appendNumber( "minutesSinceLastCall" , ( now - queue->lastCall ) / ( 1000 * 60 ) );
+ t.done();
+
+ totalQueued += queue->queue.size();
+ }
+ }
+
+ b.appendBool( "hasOpsQueued" , totalQueued > 0 );
+ b.appendNumber( "totalOpsQueued" , totalQueued );
+ b.append( "queues" , sub.obj() );
+ }
+
+ bool WriteBackManager::cleanupOldQueues() {
+ long long now = Listener::getElapsedTimeMillis();
+
+ scoped_lock lk( _writebackQueueLock );
+ for ( WriteBackQueuesMap::iterator it = _writebackQueues.begin(); it != _writebackQueues.end(); ++it ) {
+ const shared_ptr<QueueInfo> queue = it->second;
+ long long sinceMinutes = ( now - queue->lastCall ) / ( 1000 * 60 );
+
+ if ( sinceMinutes < 60 ) // minutes of inactivity.
+ continue;
+
+ log() << "deleting queue from: " << it->first
+ << " of size: " << queue->queue.size()
+                  << " after " << sinceMinutes << " minutes of inactivity"
+ << " (normal if any mongos has restarted)"
+ << endl;
+
+            // erasing invalidates 'it', so remove at most one queue per call;
+            // the Cleaner task keeps calling until this returns false
+            _writebackQueues.erase( it );
+            return true;
+ }
+ return false;
+ }
+
+ void WriteBackManager::Cleaner::taskDoWork() {
+ for ( int i=0; i<1000; i++ ) {
+ if ( ! writeBackManager.cleanupOldQueues() )
+ break;
+ }
+ }
+
+ // ---------- admin commands ----------
+
+    // Note, this command will block until there is something to write back
+ class WriteBackCommand : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+
+ WriteBackCommand() : Command( "writebacklisten" ) {}
+
+ void help(stringstream& h) const { h<<"internal"; }
+
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+
+ BSONElement e = cmdObj.firstElement();
+ if ( e.type() != jstOID ) {
+ errmsg = "need oid as first value";
+ return 0;
+ }
+
+ // get the command issuer's (a mongos) serverID
+ const OID id = e.__oid();
+
+ // the command issuer is blocked awaiting a response
+        // we want to return at least every 5 minutes so sockets don't time out
+ BSONObj z;
+ if ( writeBackManager.getWritebackQueue(id.str())->queue.blockingPop( z, 5 * 60 /* 5 minutes */ ) ) {
+ LOG(1) << "WriteBackCommand got : " << z << endl;
+ result.append( "data" , z );
+ }
+ else {
+ result.appendBool( "noop" , true );
+ }
+
+ return true;
+ }
+ } writeBackCommand;
+
+ class WriteBacksQueuedCommand : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+
+ WriteBacksQueuedCommand() : Command( "writeBacksQueued" ) {}
+
+ void help(stringstream& help) const {
+ help << "Returns whether there are operations in the writeback queue at the time the command was called. "
+ << "This is an internal command";
+ }
+
+ bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ writeBackManager.appendStats( result );
+ return true;
+ }
+
+ } writeBacksQueuedCommand;
+
+
+} // namespace mongo
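
Each mongos long-polls the writebacklisten command above from a dedicated thread. A
minimal sketch of such a loop, assuming 'conn' targets this shard and 'serverID' is
the mongos's own OID (the one-second backoff is invented for illustration):

    void writebackListenLoop( mongo::DBClientConnection& conn , const mongo::OID& serverID ) {
        while ( true ) {
            mongo::BSONObj res;
            if ( ! conn.runCommand( "admin" , BSON( "writebacklisten" << serverID ) , res ) ) {
                mongo::sleepsecs( 1 ); // back off briefly on a failed command
                continue;
            }
            if ( res["noop"].trueValue() )
                continue; // the 5 minute blockingPop timed out with nothing queued
            mongo::BSONObj op = res["data"].Obj();
            // ... re-route 'op' to the shard that owns the chunk now ...
        }
    }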
diff --git a/src/mongo/s/d_writeback.h b/src/mongo/s/d_writeback.h
new file mode 100644
index 00000000000..d3f36a14aca
--- /dev/null
+++ b/src/mongo/s/d_writeback.h
@@ -0,0 +1,106 @@
+// @file d_writeback.h
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+
+#include "../util/queue.h"
+#include "../util/background.h"
+
+namespace mongo {
+
+ /*
+ * The WriteBackManager keeps one queue of pending operations per mongos. The operations get here
+ * if they were directed to a chunk that is no longer in this mongod server. The operations are
+ * "written back" to the mongos server per its request (command 'writebacklisten').
+ *
+     * The class is thread-safe.
+ */
+ class WriteBackManager {
+ public:
+
+ class QueueInfo : boost::noncopyable {
+ public:
+ QueueInfo(){}
+
+ BlockingQueue<BSONObj> queue;
+            long long lastCall; // elapsed millis since startup
+ };
+
+ // a map from mongos's serverIDs to queues of "rejected" operations
+ // an operation is rejected if it targets data that does not live on this shard anymore
+ typedef map<string,shared_ptr<QueueInfo> > WriteBackQueuesMap;
+
+
+ public:
+ WriteBackManager();
+ ~WriteBackManager();
+
+ /*
+ * @param remote server ID this operation came from
+ * @param op the operation itself
+ *
+         * Enqueues operation 'op' in server 'remote's queue. The operation will be written back to
+         * remote at a later stage.
+ */
+ void queueWriteBack( const string& remote , const BSONObj& op );
+
+ /*
+ * @param remote server ID
+ * @return the queue for operations that came from 'remote'
+ *
+ * Gets access to server 'remote's queue, which is synchronized.
+ */
+ shared_ptr<QueueInfo> getWritebackQueue( const string& remote );
+
+ /*
+ * @return true if there is no operation queued for write back
+ */
+ bool queuesEmpty() const;
+
+ /**
+ * appends a number of statistics
+ */
+ void appendStats( BSONObjBuilder& b ) const;
+
+ /**
+         * removes queues that have been idle for an hour or more
+         * @return true if something was removed
+ */
+ bool cleanupOldQueues();
+
+ private:
+
+        // '_writebackQueueLock' protects only the map itself, since each queue is synchronized.
+ mutable mongo::mutex _writebackQueueLock;
+ WriteBackQueuesMap _writebackQueues;
+
+ class Cleaner : public PeriodicTask {
+ public:
+ virtual string taskName() const { return "WriteBackManager::cleaner"; }
+ virtual void taskDoWork();
+ };
+
+ Cleaner _cleaner;
+ };
+
+ // TODO collect global state in a central place and init during startup
+ extern WriteBackManager writeBackManager;
+
+} // namespace mongo
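
Putting the producer and consumer sides of the class together, a sketch of how the
manager is driven (the remote id and the queued document are invented; the 5 minute
wait matches what the writebacklisten command uses):

    void writeBackExample() {
        using namespace mongo;

        const std::string remote = OID::gen().str();    // a mongos serverID
        writeBackManager.queueWriteBack( remote , BSON( "writebackId" << OID::gen() ) );

        BSONObj op;
        if ( writeBackManager.getWritebackQueue( remote )->queue.blockingPop( op , 5 * 60 ) ) {
            // hand 'op' back to the mongos identified by 'remote'
        }
    }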
diff --git a/src/mongo/s/dbgrid.vcxproj b/src/mongo/s/dbgrid.vcxproj
new file mode 100644
index 00000000000..93edc46211e
--- /dev/null
+++ b/src/mongo/s/dbgrid.vcxproj
@@ -0,0 +1,691 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectName>mongos</ProjectName>
+ <ProjectGuid>{E03717ED-69B4-4D21-BC55-DF6690B585C6}</ProjectGuid>
+ <RootNamespace>dbgrid</RootNamespace>
+ <Keyword>Win32Proj</Keyword>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup>
+ <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</IntDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</LinkIncremental>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</LinkIncremental>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..;$(IncludePath)</IncludePath>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;C:\boost;\boost;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>_UNICODE;UNICODE;MONGO_EXPOSE_MACROS;XP_WIN;OLDJS;STATIC_JS_API;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <MinimalRebuild>No</MinimalRebuild>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;Psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_32;\boost\lib\vs2010_32;\boost\lib</AdditionalLibraryDirectories>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <TargetMachine>MachineX86</TargetMachine>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;C:\boost;\boost;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>_UNICODE;UNICODE;MONGO_EXPOSE_MACROS;XP_WIN;OLDJS;STATIC_JS_API;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;4267;4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MinimalRebuild>No</MinimalRebuild>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;Psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_64;\boost\lib\vs2010_64;\boost\lib</AdditionalLibraryDirectories>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;C:\boost;\boost;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>_UNICODE;UNICODE;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MinimalRebuild>No</MinimalRebuild>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;Psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_32;\boost\lib\vs2010_32;\boost\lib</AdditionalLibraryDirectories>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <TargetMachine>MachineX86</TargetMachine>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;C:\boost;\boost;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>_UNICODE;UNICODE;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;PCRE_STATIC;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;4267;4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MinimalRebuild>No</MinimalRebuild>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;Psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_64;\boost\lib\vs2010_64;\boost\lib</AdditionalLibraryDirectories>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClCompile Include="..\bson\oid.cpp" />
+ <ClCompile Include="..\client\dbclientcursor.cpp" />
+ <ClCompile Include="..\client\dbclient_rs.cpp" />
+ <ClCompile Include="..\client\distlock.cpp" />
+ <ClCompile Include="..\db\commands\cloud.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\db\commands\pipeline.cpp" />
+ <ClCompile Include="..\db\common.cpp">
+ <PrecompiledHeader>NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\db\dbmessage.cpp" />
+ <ClCompile Include="..\db\dbcommands_generic.cpp" />
+ <ClCompile Include="..\db\dbwebserver.cpp" />
+ <ClCompile Include="..\db\pipeline\accumulator.cpp" />
+ <ClCompile Include="..\db\pipeline\accumulator_add_to_set.cpp" />
+ <ClCompile Include="..\db\pipeline\accumulator_avg.cpp" />
+ <ClCompile Include="..\db\pipeline\accumulator_first.cpp" />
+ <ClCompile Include="..\db\pipeline\accumulator_last.cpp" />
+ <ClCompile Include="..\db\pipeline\accumulator_min_max.cpp" />
+ <ClCompile Include="..\db\pipeline\accumulator_push.cpp" />
+ <ClCompile Include="..\db\pipeline\accumulator_single_value.cpp" />
+ <ClCompile Include="..\db\pipeline\accumulator_sum.cpp" />
+ <ClCompile Include="..\db\pipeline\builder.cpp" />
+ <ClCompile Include="..\db\pipeline\document.cpp" />
+ <ClCompile Include="..\db\pipeline\document_source.cpp" />
+ <ClCompile Include="..\db\pipeline\document_source_bson_array.cpp" />
+ <ClCompile Include="..\db\pipeline\document_source_command_futures.cpp" />
+ <ClCompile Include="..\db\pipeline\document_source_filter.cpp" />
+ <ClCompile Include="..\db\pipeline\document_source_filter_base.cpp" />
+ <ClCompile Include="..\db\pipeline\document_source_group.cpp" />
+ <ClCompile Include="..\db\pipeline\document_source_limit.cpp" />
+ <ClCompile Include="..\db\pipeline\document_source_match.cpp" />
+ <ClCompile Include="..\db\pipeline\document_source_out.cpp" />
+ <ClCompile Include="..\db\pipeline\document_source_project.cpp" />
+ <ClCompile Include="..\db\pipeline\document_source_skip.cpp" />
+ <ClCompile Include="..\db\pipeline\document_source_sort.cpp" />
+ <ClCompile Include="..\db\pipeline\document_source_unwind.cpp" />
+ <ClCompile Include="..\db\pipeline\doc_mem_monitor.cpp" />
+ <ClCompile Include="..\db\pipeline\expression.cpp" />
+ <ClCompile Include="..\db\pipeline\expression_context.cpp" />
+ <ClCompile Include="..\db\pipeline\field_path.cpp" />
+ <ClCompile Include="..\db\pipeline\value.cpp" />
+ <ClCompile Include="..\db\querypattern.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\db\security_commands.cpp" />
+ <ClCompile Include="..\db\security_common.cpp" />
+ <ClCompile Include="..\db\stats\top.cpp" />
+ <ClCompile Include="..\scripting\bench.cpp" />
+ <ClCompile Include="..\util\alignedbuilder.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\concurrency\spin_lock.cpp" />
+ <ClCompile Include="..\util\concurrency\task.cpp" />
+ <ClCompile Include="..\util\concurrency\thread_pool.cpp" />
+ <ClCompile Include="..\util\concurrency\vars.cpp" />
+ <ClCompile Include="..\util\intrusive_counter.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\log.cpp" />
+ <ClCompile Include="..\util\net\miniwebserver.cpp" />
+ <ClCompile Include="..\util\net\listen.cpp" />
+ <ClCompile Include="..\util\processinfo.cpp" />
+ <ClCompile Include="..\util\ramlog.cpp" />
+ <ClCompile Include="..\util\signal_handlers.cpp" />
+ <ClCompile Include="..\util\stringutils.cpp" />
+ <ClCompile Include="..\util\systeminfo_win32.cpp" />
+ <ClCompile Include="..\util\text.cpp" />
+ <ClCompile Include="..\util\version.cpp" />
+ <ClCompile Include="balance.cpp" />
+ <ClCompile Include="balancer_policy.cpp" />
+ <ClCompile Include="chunk.cpp" />
+ <ClCompile Include="client.cpp" />
+ <ClCompile Include="commands_admin.cpp" />
+ <ClCompile Include="commands_public.cpp" />
+ <ClCompile Include="config.cpp" />
+ <ClCompile Include="config_migrate.cpp" />
+ <ClCompile Include="cursors.cpp" />
+ <ClCompile Include="..\pch.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\db\queryutil.cpp" />
+ <ClCompile Include="grid.cpp" />
+ <ClCompile Include="mr_shard.cpp" />
+ <ClCompile Include="request.cpp" />
+ <ClCompile Include="security.cpp" />
+ <ClCompile Include="shardconnection.cpp" />
+ <ClCompile Include="shard_version.cpp" />
+ <ClCompile Include="s_only.cpp" />
+ <ClCompile Include="server.cpp" />
+ <ClCompile Include="shard.cpp" />
+ <ClCompile Include="shardkey.cpp" />
+ <ClCompile Include="stats.cpp" />
+ <ClCompile Include="strategy.cpp" />
+ <ClCompile Include="strategy_shard.cpp" />
+ <ClCompile Include="strategy_single.cpp" />
+ <ClCompile Include="..\scripting\utils.cpp" />
+ <ClCompile Include="..\client\connpool.cpp" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcrecpp.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_chartables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_compile.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_config.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_dfa_exec.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_exec.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_fullinfo.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_get.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_globals.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_info.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_maketables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_newline.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ord2utf8.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_refcount.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_scanner.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_stringpiece.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_study.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_tables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_try_flipped.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ucp_searchfuncs.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_valid_utf8.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_version.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_xclass.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcreposix.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\client\dbclient.cpp" />
+ <ClCompile Include="..\client\model.cpp" />
+ <ClCompile Include="..\util\assert_util.cpp" />
+ <ClCompile Include="..\util\background.cpp" />
+ <ClCompile Include="..\util\base64.cpp" />
+ <ClCompile Include="..\db\cmdline.cpp" />
+ <ClCompile Include="..\db\commands.cpp" />
+ <ClCompile Include="..\db\stats\counters.cpp" />
+ <ClCompile Include="..\util\debug_util.cpp" />
+ <ClCompile Include="..\scripting\engine.cpp" />
+ <ClCompile Include="..\scripting\engine_spidermonkey.cpp" />
+ <ClCompile Include="..\db\indexkey.cpp" />
+ <ClCompile Include="..\db\jsobj.cpp" />
+ <ClCompile Include="..\db\json.cpp" />
+ <ClCompile Include="..\db\lasterror.cpp" />
+ <ClCompile Include="..\db\matcher.cpp" />
+ <ClCompile Include="..\util\md5.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\md5main.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Use</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Use</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\net\message.cpp" />
+ <ClCompile Include="..\util\net\message_port.cpp" />
+ <ClCompile Include="..\util\net\message_server_port.cpp" />
+ <ClCompile Include="..\util\mmap.cpp" />
+ <ClCompile Include="..\util\mmap_win.cpp" />
+ <ClCompile Include="..\shell\mongo.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\db\nonce.cpp" />
+ <ClCompile Include="..\client\parallel.cpp" />
+ <ClCompile Include="..\util\processinfo_win32.cpp" />
+ <ClCompile Include="..\util\net\sock.cpp" />
+ <ClCompile Include="..\client\syncclusterconnection.cpp" />
+ <ClCompile Include="..\util\util.cpp" />
+ <ClCompile Include="writeback_listener.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\db\commands\pipeline.h" />
+ <ClInclude Include="..\db\pipeline\accumulator.h" />
+ <ClInclude Include="..\db\pipeline\builder.h" />
+ <ClInclude Include="..\db\pipeline\document.h" />
+ <ClInclude Include="..\db\pipeline\document_source.h" />
+ <ClInclude Include="..\db\pipeline\doc_mem_monitor.h" />
+ <ClInclude Include="..\db\pipeline\expression.h" />
+ <ClInclude Include="..\db\pipeline\expression_context.h" />
+ <ClInclude Include="..\db\pipeline\field_path.h" />
+ <ClInclude Include="..\db\pipeline\value.h" />
+ <ClInclude Include="..\util\intrusive_counter.h" />
+ <ClInclude Include="..\util\processinfo.h" />
+ <ClInclude Include="..\util\signal_handlers.h" />
+ <ClInclude Include="..\util\systeminfo.h" />
+ <ClInclude Include="..\util\version.h" />
+ <ClInclude Include="balance.h" />
+ <ClInclude Include="balancer_policy.h" />
+ <ClInclude Include="chunk.h" />
+ <ClInclude Include="client.h" />
+ <ClInclude Include="config.h" />
+ <ClInclude Include="cursors.h" />
+ <ClInclude Include="d_chunk_manager.h" />
+ <ClInclude Include="d_logic.h" />
+ <ClInclude Include="d_writeback.h" />
+ <ClInclude Include="grid.h" />
+ <ClInclude Include="gridconfig.h" />
+ <ClInclude Include="griddatabase.h" />
+ <ClInclude Include="request.h" />
+ <ClInclude Include="server.h" />
+ <ClInclude Include="shard.h" />
+ <ClInclude Include="shardkey.h" />
+ <ClInclude Include="shard_version.h" />
+ <ClInclude Include="stats.h" />
+ <ClInclude Include="strategy.h" />
+ <ClInclude Include="..\util\background.h" />
+ <ClInclude Include="..\db\commands.h" />
+ <ClInclude Include="..\db\dbmessage.h" />
+ <ClInclude Include="..\util\goodies.h" />
+ <ClInclude Include="..\db\jsobj.h" />
+ <ClInclude Include="..\db\json.h" />
+ <ClInclude Include="..\pch.h" />
+ <ClInclude Include="..\..\boostw\boost_1_34_1\boost\config\auto_link.hpp" />
+ <ClInclude Include="..\..\boostw\boost_1_34_1\boost\version.hpp" />
+ <ClInclude Include="..\third_party\pcre-7.4\config.h" />
+ <ClInclude Include="..\third_party\pcre-7.4\pcre.h" />
+ <ClInclude Include="..\client\connpool.h" />
+ <ClInclude Include="..\client\dbclient.h" />
+ <ClInclude Include="..\client\model.h" />
+ <ClInclude Include="util.h" />
+ <ClInclude Include="writeback_listener.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <Library Include="..\..\js\js32d.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js32r.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js64d.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js64r.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project> \ No newline at end of file
diff --git a/src/mongo/s/dbgrid.vcxproj.filters b/src/mongo/s/dbgrid.vcxproj.filters
new file mode 100755
index 00000000000..02b6e9972e9
--- /dev/null
+++ b/src/mongo/s/dbgrid.vcxproj.filters
@@ -0,0 +1,614 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <Filter Include="Source Files">
+ <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+ <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+ </Filter>
+ <Filter Include="Header Files">
+ <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+ <Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
+ </Filter>
+ <Filter Include="libs_etc">
+ <UniqueIdentifier>{17d48ddf-5c49-4dfd-bafa-16d5fed290cd}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="libs_etc\pcre">
+ <UniqueIdentifier>{4c2dd526-4a57-4ff7-862f-2bd7ec4955b3}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="client">
+ <UniqueIdentifier>{b4f6635b-8c64-4ceb-8077-43203533d0b9}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="Shared Source Files">
+ <UniqueIdentifier>{e59da087-4433-46b9-862d-746cbed27b97}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="Header Shared">
+ <UniqueIdentifier>{4048b883-7255-40b3-b0e9-4c1044cff049}</UniqueIdentifier>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="balance.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="chunk.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="commands_admin.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="commands_public.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="config.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="config_migrate.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="cursors.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\queryutil.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="request.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="s_only.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="server.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="shard.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="shardkey.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="stats.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="strategy.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="strategy_shard.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="strategy_single.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\scripting\utils.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\client\dbclient.cpp">
+ <Filter>client</Filter>
+ </ClCompile>
+ <ClCompile Include="..\client\dbclientcursor.cpp">
+ <Filter>client</Filter>
+ </ClCompile>
+ <ClCompile Include="..\client\model.cpp">
+ <Filter>client</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\assert_util.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\background.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\base64.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\cmdline.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\commands.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\stats\counters.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\debug_util.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\scripting\engine.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\scripting\engine_spidermonkey.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\indexkey.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\jsobj.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\json.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\lasterror.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\matcher.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\md5.c">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\md5main.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\mmap.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\mmap_win.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\shell\mongo.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\nonce.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\client\parallel.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\processinfo_win32.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\client\syncclusterconnection.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\util.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\pch.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="shardconnection.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\concurrency\vars.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\concurrency\thread_pool.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\version.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\text.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="balancer_policy.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\stringutils.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\client\distlock.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\log.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="grid.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\processinfo.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\dbwebserver.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\concurrency\task.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\signal_handlers.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\scripting\bench.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="writeback_listener.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="shard_version.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\concurrency\spin_lock.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\alignedbuilder.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\bson\oid.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\client\dbclient_rs.cpp">
+ <Filter>client</Filter>
+ </ClCompile>
+ <ClCompile Include="client.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\dbcommands_generic.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\querypattern.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\ramlog.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="mr_shard.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\common.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\security_common.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="security.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\dbmessage.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\net\miniwebserver.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\net\listen.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcrecpp.cc">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_chartables.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_compile.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_config.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_dfa_exec.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_exec.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_fullinfo.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_get.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_globals.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_info.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_maketables.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_newline.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ord2utf8.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_refcount.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_scanner.cc">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_stringpiece.cc">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_study.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_tables.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_try_flipped.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ucp_searchfuncs.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_valid_utf8.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_version.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_xclass.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcreposix.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\net\message.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\net\message_port.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\net\message_server_port.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\net\sock.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\security_commands.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClInclude Include="gridconfig.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="griddatabase.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="shard.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="strategy.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\background.h">
+ <Filter>Header Shared</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\commands.h">
+ <Filter>Header Shared</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\dbmessage.h">
+ <Filter>Header Shared</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\goodies.h">
+ <Filter>Header Shared</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\jsobj.h">
+ <Filter>Header Shared</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\json.h">
+ <Filter>Header Shared</Filter>
+ </ClInclude>
+ <ClInclude Include="..\pch.h">
+ <Filter>Header Shared</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\boostw\boost_1_34_1\boost\config\auto_link.hpp">
+ <Filter>libs_etc</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\boostw\boost_1_34_1\boost\version.hpp">
+ <Filter>libs_etc</Filter>
+ </ClInclude>
+ <ClInclude Include="..\client\connpool.h">
+ <Filter>client</Filter>
+ </ClInclude>
+ <ClInclude Include="..\client\dbclient.h">
+ <Filter>client</Filter>
+ </ClInclude>
+ <ClInclude Include="..\client\model.h">
+ <Filter>client</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\version.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="balancer_policy.h">
+ <Filter>Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="grid.h">
+ <Filter>Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\processinfo.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\signal_handlers.h">
+ <Filter>Shared Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="writeback_listener.h">
+ <Filter>Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="balance.h">
+ <Filter>Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="chunk.h">
+ <Filter>Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="client.h">
+ <Filter>Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="config.h">
+ <Filter>Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="cursors.h">
+ <Filter>Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="d_chunk_manager.h">
+ <Filter>Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="d_logic.h">
+ <Filter>Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="d_writeback.h">
+ <Filter>Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="request.h">
+ <Filter>Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="server.h">
+ <Filter>Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="shard_version.h">
+ <Filter>Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="shardkey.h">
+ <Filter>Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="stats.h">
+ <Filter>Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="util.h">
+ <Filter>Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\pcre-7.4\config.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\pcre-7.4\pcre.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClCompile Include="..\client\connpool.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\commands\cloud.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\commands\pipeline.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\accumulator.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\accumulator_add_to_set.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\accumulator_avg.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\accumulator_first.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\accumulator_last.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\accumulator_min_max.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\accumulator_push.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\accumulator_single_value.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\accumulator_sum.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\builder.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\doc_mem_monitor.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\document.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\document_source.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\document_source_bson_array.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\document_source_command_futures.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\document_source_filter.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\document_source_filter_base.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\document_source_group.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\document_source_limit.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\document_source_match.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\document_source_out.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\document_source_project.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\document_source_skip.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\document_source_sort.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\document_source_unwind.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\expression.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\expression_context.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\field_path.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\pipeline\value.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\intrusive_counter.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\systeminfo_win32.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\stats\top.cpp">
+ <Filter>Shared Source Files</Filter>
+ </ClCompile>
+ </ItemGroup>
+ <ItemGroup>
+ <Library Include="..\..\js\js32d.lib" />
+ <Library Include="..\..\js\js32r.lib" />
+ <Library Include="..\..\js\js64d.lib" />
+ <Library Include="..\..\js\js64r.lib" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\db\commands\pipeline.h">
+ <Filter>Shared Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\pipeline\accumulator.h">
+ <Filter>Shared Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\pipeline\builder.h">
+ <Filter>Shared Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\pipeline\doc_mem_monitor.h">
+ <Filter>Shared Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\pipeline\document.h">
+ <Filter>Shared Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\pipeline\document_source.h">
+ <Filter>Shared Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\pipeline\expression.h">
+ <Filter>Shared Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\pipeline\expression_context.h">
+ <Filter>Shared Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\pipeline\field_path.h">
+ <Filter>Shared Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\db\pipeline\value.h">
+ <Filter>Shared Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\intrusive_counter.h">
+ <Filter>Shared Source Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\systeminfo.h">
+ <Filter>Shared Source Files</Filter>
+ </ClInclude>
+ </ItemGroup>
+</Project> \ No newline at end of file
diff --git a/src/mongo/s/default_version.cpp b/src/mongo/s/default_version.cpp
new file mode 100644
index 00000000000..82368672a2e
--- /dev/null
+++ b/src/mongo/s/default_version.cpp
@@ -0,0 +1,52 @@
+// @file default_version.cpp
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "s/util.h"
+#include "shard_version.h"
+
+namespace mongo {
+
+ // Global version manager
+ VersionManager versionManager;
+
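+    // Default no-op implementations of the shard-version callbacks; builds that
+    // participate in shard versioning presumably link the real implementations
+    // (e.g. shard_version.cpp) in place of this file.
+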
+ void VersionManager::resetShardVersionCB( DBClientBase * conn ) {
+ return;
+ }
+
+ bool VersionManager::isVersionableCB( DBClientBase* conn ){
+ return false;
+ }
+
+ bool VersionManager::initShardVersionCB( DBClientBase * conn_in, BSONObj& result ){
+ return false;
+ }
+
+ bool VersionManager::forceRemoteCheckShardVersionCB( const string& ns ){
+ return true;
+ }
+
+ bool VersionManager::checkShardVersionCB( DBClientBase* conn_in , const string& ns , bool authoritative , int tryNumber ) {
+ return false;
+ }
+
+ bool VersionManager::checkShardVersionCB( ShardConnection* conn_in , bool authoritative , int tryNumber ) {
+ return false;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/s/grid.cpp b/src/mongo/s/grid.cpp
new file mode 100644
index 00000000000..9d9c2e4555e
--- /dev/null
+++ b/src/mongo/s/grid.cpp
@@ -0,0 +1,531 @@
+// grid.cpp
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include <iomanip>
+#include "../client/connpool.h"
+#include "../util/stringutils.h"
+#include "../util/unittest.h"
+#include "../db/namespacestring.h"
+
+#include "grid.h"
+#include "shard.h"
+
+namespace mongo {
+
+ DBConfigPtr Grid::getDBConfig( string database , bool create , const string& shardNameHint ) {
+ {
+ string::size_type i = database.find( "." );
+ if ( i != string::npos )
+ database = database.substr( 0 , i );
+ }
+
+ if ( database == "config" )
+ return configServerPtr;
+
+ uassert( 15918 , str::stream() << "invalid database name: " << database , NamespaceString::validDBName( database ) );
+
+ scoped_lock l( _lock );
+
+ DBConfigPtr& cc = _databases[database];
+ if ( !cc ) {
+ cc.reset(new DBConfig( database ));
+ if ( ! cc->load() ) {
+ if ( create ) {
+ // note here that cc->primary == 0.
+ log() << "couldn't find database [" << database << "] in config db" << endl;
+
+ {
+                    // let's check whether a db differing only in case already exists
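+                    // e.g. if "Foo" is already in config.databases, adding "foo" must fail (illustrative)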
+ ScopedDbConnection conn( configServer.modelServer() );
+ BSONObjBuilder b;
+ b.appendRegex( "_id" , (string)"^" + database + "$" , "i" );
+ BSONObj d = conn->findOne( ShardNS::database , b.obj() );
+ conn.done();
+
+ if ( ! d.isEmpty() ) {
+ cc.reset();
+ stringstream ss;
+ ss << "can't have 2 databases that just differ on case "
+ << " have: " << d["_id"].String()
+ << " want to add: " << database;
+
+ uasserted( DatabaseDifferCaseCode ,ss.str() );
+ }
+ }
+
+ Shard primary;
+ if ( database == "admin" ) {
+ primary = configServer.getPrimary();
+
+ }
+ else if ( shardNameHint.empty() ) {
+ primary = Shard::pick();
+
+ }
+ else {
+ // use the shard name if provided
+ Shard shard;
+ shard.reset( shardNameHint );
+ primary = shard;
+ }
+
+ if ( primary.ok() ) {
+ cc->setPrimary( primary.getName() ); // saves 'cc' to configDB
+ log() << "\t put [" << database << "] on: " << primary << endl;
+ }
+ else {
+ cc.reset();
+ log() << "\t can't find a shard to put new db on" << endl;
+ uasserted( 10185 , "can't find a shard to put new db on" );
+ }
+ }
+ else {
+ cc.reset();
+ }
+ }
+
+ }
+
+ return cc;
+ }
+
+ void Grid::removeDB( string database ) {
+ uassert( 10186 , "removeDB expects db name" , database.find( '.' ) == string::npos );
+ scoped_lock l( _lock );
+ _databases.erase( database );
+
+ }
+
+ bool Grid::allowLocalHost() const {
+ return _allowLocalShard;
+ }
+
+ void Grid::setAllowLocalHost( bool allow ) {
+ _allowLocalShard = allow;
+ }
+
+ bool Grid::addShard( string* name , const ConnectionString& servers , long long maxSize , string& errMsg ) {
+ // name can be NULL, so provide a dummy one here to avoid testing it elsewhere
+ string nameInternal;
+ if ( ! name ) {
+ name = &nameInternal;
+ }
+
+ ReplicaSetMonitorPtr rsMonitor;
+
+ // Check whether the host (or set) exists and run several sanity checks on this request.
+        // There are two sets of sanity checks: making sure adding this particular shard is consistent
+        // with the replica set state (if it exists) and making sure this shard's databases can be
+ // brought into the grid without conflict.
+
+ vector<string> dbNames;
+ try {
+ ScopedDbConnection newShardConn( servers );
+ newShardConn->getLastError();
+
+ if ( newShardConn->type() == ConnectionString::SYNC ) {
+ newShardConn.done();
+ errMsg = "can't use sync cluster as a shard. for replica set, have to use <setname>/<server1>,<server2>,...";
+ return false;
+ }
+
+ BSONObj resIsMongos;
+ bool ok = newShardConn->runCommand( "admin" , BSON( "isdbgrid" << 1 ) , resIsMongos );
+
+ // should return ok=0, cmd not found if it's a normal mongod
+ if ( ok ) {
+ errMsg = "can't add a mongos process as a shard";
+ newShardConn.done();
+ return false;
+ }
+
+ BSONObj resIsMaster;
+ ok = newShardConn->runCommand( "admin" , BSON( "isMaster" << 1 ) , resIsMaster );
+ if ( !ok ) {
+ ostringstream ss;
+ ss << "failed running isMaster: " << resIsMaster;
+ errMsg = ss.str();
+ newShardConn.done();
+ return false;
+ }
+
+            // if no set name was given in the seed list, make sure the host is not part of a replica set
+ string setName = resIsMaster["setName"].str();
+ string commandSetName = servers.getSetName();
+ if ( commandSetName.empty() && ! setName.empty() ) {
+ ostringstream ss;
+ ss << "host is part of set: " << setName << " use replica set url format <setname>/<server1>,<server2>,....";
+ errMsg = ss.str();
+ newShardConn.done();
+ return false;
+ }
+ if ( !commandSetName.empty() && setName.empty() ) {
+ ostringstream ss;
+ ss << "host did not return a set name, is the replica set still initializing? " << resIsMaster;
+ errMsg = ss.str();
+ newShardConn.done();
+ return false;
+ }
+
+ // if the shard is part of replica set, make sure it is the right one
+ if ( ! commandSetName.empty() && ( commandSetName != setName ) ) {
+ ostringstream ss;
+ ss << "host is part of a different set: " << setName;
+ errMsg = ss.str();
+ newShardConn.done();
+ return false;
+ }
+
+ // if the shard is part of a replica set, make sure all the hosts mentioned in 'servers' are part of
+ // the set. It is fine if not all members of the set are present in 'servers'.
+ bool foundAll = true;
+ string offendingHost;
+ if ( ! commandSetName.empty() ) {
+ set<string> hostSet;
+ BSONObjIterator iter( resIsMaster["hosts"].Obj() );
+ while ( iter.more() ) {
+ hostSet.insert( iter.next().String() ); // host:port
+ }
+ if ( resIsMaster["passives"].isABSONObj() ) {
+ BSONObjIterator piter( resIsMaster["passives"].Obj() );
+ while ( piter.more() ) {
+ hostSet.insert( piter.next().String() ); // host:port
+ }
+ }
+ if ( resIsMaster["arbiters"].isABSONObj() ) {
+ BSONObjIterator piter( resIsMaster["arbiters"].Obj() );
+ while ( piter.more() ) {
+ hostSet.insert( piter.next().String() ); // host:port
+ }
+ }
+
+ vector<HostAndPort> hosts = servers.getServers();
+ for ( size_t i = 0 ; i < hosts.size() ; i++ ) {
+ if (!hosts[i].hasPort()) {
+ hosts[i].setPort(CmdLine::DefaultDBPort);
+ }
+ string host = hosts[i].toString(); // host:port
+ if ( hostSet.find( host ) == hostSet.end() ) {
+ offendingHost = host;
+ foundAll = false;
+ break;
+ }
+ }
+ }
+ if ( ! foundAll ) {
+ ostringstream ss;
+ ss << "in seed list " << servers.toString() << ", host " << offendingHost
+ << " does not belong to replica set " << setName;
+ errMsg = ss.str();
+ newShardConn.done();
+ return false;
+ }
+
+ // shard name defaults to the name of the replica set
+ if ( name->empty() && ! setName.empty() )
+ *name = setName;
+
+            // In order to be accepted as a new shard, that mongod must not have any database name that already
+            // exists on any other shard. If that test passes, the new shard's databases will be entered as
+            // non-sharded databases whose primary is the newly added shard.
+
+ BSONObj resListDB;
+ ok = newShardConn->runCommand( "admin" , BSON( "listDatabases" << 1 ) , resListDB );
+ if ( !ok ) {
+ ostringstream ss;
+ ss << "failed listing " << servers.toString() << "'s databases:" << resListDB;
+ errMsg = ss.str();
+ newShardConn.done();
+ return false;
+ }
+
+ BSONObjIterator i( resListDB["databases"].Obj() );
+ while ( i.more() ) {
+ BSONObj dbEntry = i.next().Obj();
+ const string& dbName = dbEntry["name"].String();
+ if ( _isSpecialLocalDB( dbName ) ) {
+ // 'local', 'admin', and 'config' are system DBs and should be excluded here
+ continue;
+ }
+ else {
+ dbNames.push_back( dbName );
+ }
+ }
+
+ if ( newShardConn->type() == ConnectionString::SET )
+ rsMonitor = ReplicaSetMonitor::get( setName );
+
+ newShardConn.done();
+ }
+ catch ( DBException& e ) {
+ ostringstream ss;
+ ss << "couldn't connect to new shard ";
+ ss << e.what();
+ errMsg = ss.str();
+ return false;
+ }
+
+        // check that none of the shard candidate's databases exist elsewhere
+ for ( vector<string>::const_iterator it = dbNames.begin(); it != dbNames.end(); ++it ) {
+ DBConfigPtr config = getDBConfig( *it , false );
+ if ( config.get() != NULL ) {
+ ostringstream ss;
+ ss << "can't add shard " << servers.toString() << " because a local database '" << *it;
+ ss << "' exists in another " << config->getPrimary().toString();
+ errMsg = ss.str();
+ return false;
+ }
+ }
+
+ // if a name for a shard wasn't provided, pick one.
+ if ( name->empty() && ! _getNewShardName( name ) ) {
+ errMsg = "error generating new shard name";
+ return false;
+ }
+
+ // build the ConfigDB shard document
+ BSONObjBuilder b;
+ b.append( "_id" , *name );
+ b.append( "host" , rsMonitor ? rsMonitor->getServerAddress() : servers.toString() );
+ if ( maxSize > 0 ) {
+ b.append( ShardFields::maxSize.name() , maxSize );
+ }
+ BSONObj shardDoc = b.obj();
+
+ {
+ ScopedDbConnection conn( configServer.getPrimary() );
+
+            // check whether the set of hosts (or single host) is not already a known shard
+ BSONObj old = conn->findOne( ShardNS::shard , BSON( "host" << servers.toString() ) );
+ if ( ! old.isEmpty() ) {
+ errMsg = "host already used";
+ conn.done();
+ return false;
+ }
+
+ log() << "going to add shard: " << shardDoc << endl;
+
+ conn->insert( ShardNS::shard , shardDoc );
+ errMsg = conn->getLastError();
+ if ( ! errMsg.empty() ) {
+ log() << "error adding shard: " << shardDoc << " err: " << errMsg << endl;
+ conn.done();
+ return false;
+ }
+
+ conn.done();
+ }
+
+ Shard::reloadShardInfo();
+
+ // add all databases of the new shard
+ for ( vector<string>::const_iterator it = dbNames.begin(); it != dbNames.end(); ++it ) {
+ DBConfigPtr config = getDBConfig( *it , true , *name );
+ if ( ! config ) {
+ log() << "adding shard " << servers << " even though could not add database " << *it << endl;
+ }
+ }
+
+ return true;
+ }
+
+ bool Grid::knowAboutShard( const string& name ) const {
+ ShardConnection conn( configServer.getPrimary() , "" );
+ BSONObj shard = conn->findOne( ShardNS::shard , BSON( "host" << name ) );
+ conn.done();
+ return ! shard.isEmpty();
+ }
+
+ bool Grid::_getNewShardName( string* name ) const {
+ DEV assert( name );
+
+ bool ok = false;
+ int count = 0;
+
+ ShardConnection conn( configServer.getPrimary() , "" );
+ BSONObj o = conn->findOne( ShardNS::shard , Query( fromjson ( "{_id: /^shard/}" ) ).sort( BSON( "_id" << -1 ) ) );
+ if ( ! o.isEmpty() ) {
+ string last = o["_id"].String();
+ istringstream is( last.substr( 5 ) );
+ is >> count;
+ count++;
+ }
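+        // e.g. if the newest existing shard _id is "shard0042", count is now 43 and the
+        // generated name below is "shard0043", zero-padded to four digits (illustrative)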
+ if (count < 9999) {
+ stringstream ss;
+ ss << "shard" << setfill('0') << setw(4) << count;
+ *name = ss.str();
+ ok = true;
+ }
+ conn.done();
+
+ return ok;
+ }
+
+ bool Grid::shouldBalance() const {
+ ShardConnection conn( configServer.getPrimary() , "" );
+
+ // look for the stop balancer marker
+ BSONObj balancerDoc = conn->findOne( ShardNS::settings, BSON( "_id" << "balancer" ) );
+ conn.done();
+
+ boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
+ if ( _balancerStopped( balancerDoc ) || ! _inBalancingWindow( balancerDoc , now ) ) {
+ return false;
+ }
+
+ return true;
+ }
+
+ bool Grid::_balancerStopped( const BSONObj& balancerDoc ) {
+        // check the 'stopped' marker
+ // if present, it is a simple bool
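+        // e.g. a settings doc of { _id: "balancer", stopped: true } disables balancing (illustrative)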
+ BSONElement stoppedElem = balancerDoc["stopped"];
+ return stoppedElem.trueValue();
+ }
+
+ bool Grid::_inBalancingWindow( const BSONObj& balancerDoc , const boost::posix_time::ptime& now ) {
+ // check the 'activeWindow' marker
+ // if present, it is an interval during the day when the balancer should be active
+ // { start: "08:00" , stop: "19:30" }, strftime format is %H:%M
+ BSONElement windowElem = balancerDoc["activeWindow"];
+ if ( windowElem.eoo() ) {
+ return true;
+ }
+
+ // check if both 'start' and 'stop' are present
+ if ( ! windowElem.isABSONObj() ) {
+ warning() << "'activeWindow' format is { start: \"hh:mm\" , stop: ... }" << balancerDoc << endl;
+ return true;
+ }
+ BSONObj intervalDoc = windowElem.Obj();
+ const string start = intervalDoc["start"].str();
+ const string stop = intervalDoc["stop"].str();
+ if ( start.empty() || stop.empty() ) {
+ warning() << "must specify both start and end of balancing window: " << intervalDoc << endl;
+ return true;
+ }
+
+ // check that both 'start' and 'stop' are valid time-of-day
+ boost::posix_time::ptime startTime, stopTime;
+ if ( ! toPointInTime( start , &startTime ) || ! toPointInTime( stop , &stopTime ) ) {
+ warning() << "cannot parse active window (use hh:mm 24hs format): " << intervalDoc << endl;
+ return true;
+ }
+
+ if ( logLevel ) {
+ stringstream ss;
+ ss << " now: " << now
+ << " startTime: " << startTime
+ << " stopTime: " << stopTime;
+ log() << "_inBalancingWindow: " << ss.str() << endl;
+ }
+
+        // allow balancing if we are currently within the activeWindow
+ // note that a window may be open during the night
+ if ( stopTime > startTime ) {
+ if ( ( now >= startTime ) && ( now <= stopTime ) ) {
+ return true;
+ }
+ }
+ else if ( startTime > stopTime ) {
+            if ( ( now >= startTime ) || ( now <= stopTime ) ) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ unsigned long long Grid::getNextOpTime() const {
+ ScopedDbConnection conn( configServer.getPrimary() );
+
+ BSONObj result;
+ massert( 10421 , "getoptime failed" , conn->simpleCommand( "admin" , &result , "getoptime" ) );
+ conn.done();
+
+ return result["optime"]._numberLong();
+ }
+
+ bool Grid::_isSpecialLocalDB( const string& dbName ) {
+ return ( dbName == "local" ) || ( dbName == "admin" ) || ( dbName == "config" );
+ }
+
+ void Grid::flushConfig() {
+ scoped_lock lk( _lock );
+ _databases.clear();
+ }
+
+ BSONObj Grid::getConfigSetting( string name ) const {
+ ScopedDbConnection conn( configServer.getPrimary() );
+ BSONObj result = conn->findOne( ShardNS::settings, BSON( "_id" << name ) );
+ conn.done();
+
+ return result;
+ }
+
+ Grid grid;
+
+
+ // unit tests
+
+ class BalancingWindowUnitTest : public UnitTest {
+ public:
+ void run() {
+
+ if ( ! cmdLine.isMongos() )
+ return;
+
+ // T0 < T1 < now < T2 < T3 and Error
+ const string T0 = "9:00";
+ const string T1 = "11:00";
+ boost::posix_time::ptime now( currentDate(), boost::posix_time::hours( 13 ) + boost::posix_time::minutes( 48 ) );
+ const string T2 = "17:00";
+ const string T3 = "21:30";
+ const string E = "28:35";
+
+ BSONObj w1 = BSON( "activeWindow" << BSON( "start" << T0 << "stop" << T1 ) ); // closed in the past
+ BSONObj w2 = BSON( "activeWindow" << BSON( "start" << T2 << "stop" << T3 ) ); // not opened until the future
+ BSONObj w3 = BSON( "activeWindow" << BSON( "start" << T1 << "stop" << T2 ) ); // open now
+ BSONObj w4 = BSON( "activeWindow" << BSON( "start" << T3 << "stop" << T2 ) ); // open since last day
+
+ assert( ! Grid::_inBalancingWindow( w1 , now ) );
+ assert( ! Grid::_inBalancingWindow( w2 , now ) );
+ assert( Grid::_inBalancingWindow( w3 , now ) );
+ assert( Grid::_inBalancingWindow( w4 , now ) );
+
+ // bad input should not stop the balancer
+
+ BSONObj w5; // empty window
+ BSONObj w6 = BSON( "activeWindow" << BSON( "start" << 1 ) ); // missing stop
+ BSONObj w7 = BSON( "activeWindow" << BSON( "stop" << 1 ) ); // missing start
+ BSONObj w8 = BSON( "wrongMarker" << 1 << "start" << 1 << "stop" << 1 ); // active window marker missing
+ BSONObj w9 = BSON( "activeWindow" << BSON( "start" << T3 << "stop" << E ) ); // garbage in window
+
+ assert( Grid::_inBalancingWindow( w5 , now ) );
+ assert( Grid::_inBalancingWindow( w6 , now ) );
+ assert( Grid::_inBalancingWindow( w7 , now ) );
+ assert( Grid::_inBalancingWindow( w8 , now ) );
+ assert( Grid::_inBalancingWindow( w9 , now ) );
+
+ LOG(1) << "BalancingWidowObjTest passed" << endl;
+ }
+ } BalancingWindowObjTest;
+
+}
diff --git a/src/mongo/s/grid.h b/src/mongo/s/grid.h
new file mode 100644
index 00000000000..9731ada518b
--- /dev/null
+++ b/src/mongo/s/grid.h
@@ -0,0 +1,135 @@
+// grid.h
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <boost/date_time/posix_time/posix_time.hpp>
+
+#include "../util/time_support.h"
+#include "../util/concurrency/mutex.h"
+
+#include "config.h" // DBConfigPtr
+
+namespace mongo {
+
+ /**
+ * stores meta-information about the grid
+     * TODO: use shared_ptr for DBConfig pointers
+ */
+ class Grid {
+ public:
+ Grid() : _lock( "Grid" ) , _allowLocalShard( true ) { }
+
+ /**
+         * gets the config for the db.
+         * will return an empty DBConfigPtr if the db is not in the config db already
+ */
+ DBConfigPtr getDBConfig( string ns , bool create=true , const string& shardNameHint="" );
+
+ /**
+         * removes the db entry.
+         * the next getDBConfig call will fetch it from the config db
+ */
+ void removeDB( string db );
+
+ /**
+ * @return true if shards and config servers are allowed to use 'localhost' in address
+ */
+ bool allowLocalHost() const;
+
+ /**
+         * @param allow whether to allow shards and config servers to use 'localhost' in address
+ */
+ void setAllowLocalHost( bool allow );
+
+ /**
+ *
+         * addShard will create a new shard in the grid. It expects a mongod process to be running
+ * on the provided address. Adding a shard that is a replica set is supported.
+ *
+         * @param name is an optional string with the name of the shard. if omitted, grid will
+ * generate one and update the parameter.
+ * @param servers is the connection string of the shard being added
+         * @param maxSize is the optional space quota in bytes. Zero means there is no limit on
+ * space usage
+ * @param errMsg is the error description in case the operation failed.
+ * @return true if shard was successfully added.
+ */
+ bool addShard( string* name , const ConnectionString& servers , long long maxSize , string& errMsg );
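+        //
+        // Illustrative usage sketch (the seed string and parse() call are assumptions,
+        // not part of this patch):
+        //   string name, errMsg;
+        //   ConnectionString cs = ConnectionString::parse( "rs0/h1:27017,h2:27017" , errMsg );
+        //   if ( grid.addShard( &name , cs , 0 /* no size quota */ , errMsg ) )
+        //       log() << "added shard " << name << endl;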
+
+ /**
+ * @return true if the config database knows about a host 'name'
+ */
+ bool knowAboutShard( const string& name ) const;
+
+ /**
+ * @return true if the chunk balancing functionality is enabled
+ */
+ bool shouldBalance() const;
+
+ /**
+ *
+ * Obtain grid configuration and settings data.
+ *
+ * @param name identifies a particular type of configuration data.
+ * @return a BSON object containing the requested data.
+ */
+ BSONObj getConfigSetting( string name ) const;
+
+ unsigned long long getNextOpTime() const;
+
+ void flushConfig();
+
+ // exposed methods below are for testing only
+
+ /**
+ * @param balancerDoc bson that may contain a window of time for the balancer to work
+ * format { ... , activeWindow: { start: "8:30" , stop: "19:00" } , ... }
+         * @return true if there is no window of time specified for the balancer or we're currently in it
+ */
+ static bool _inBalancingWindow( const BSONObj& balancerDoc , const boost::posix_time::ptime& now );
+
+ private:
+ mongo::mutex _lock; // protects _databases; TODO: change to r/w lock ??
+        map<string, DBConfigPtr > _databases; // maps db name to DBConfig
+ bool _allowLocalShard; // can 'localhost' be used in shard addresses?
+
+ /**
+         * @param name is the chosen name for the shard. Parameter is mandatory.
+         * @return true if it managed to generate a shard name. May return false if (currently)
+         * the 10000 possible shard names are exhausted.
+ */
+ bool _getNewShardName( string* name ) const;
+
+ /**
+         * @return whether a given dbname is used for shard "local" databases (e.g., admin or local)
+ */
+ static bool _isSpecialLocalDB( const string& dbName );
+
+ /**
+ * @param balancerDoc bson that may contain a marker to stop the balancer
+ * format { ... , stopped: [ "true" | "false" ] , ... }
+ * @return true if the marker is present and is set to true
+ */
+ static bool _balancerStopped( const BSONObj& balancerDoc );
+
+ };
+
+ extern Grid grid;
+
+} // namespace mongo
diff --git a/src/mongo/s/mr_shard.cpp b/src/mongo/s/mr_shard.cpp
new file mode 100644
index 00000000000..5bb83afedae
--- /dev/null
+++ b/src/mongo/s/mr_shard.cpp
@@ -0,0 +1,316 @@
+// mr_shard.cpp
+
+/**
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../util/net/message.h"
+#include "../db/dbmessage.h"
+#include "../scripting/engine.h"
+
+#include "mr_shard.h"
+
+namespace mongo {
+
+ namespace mr_shard {
+
+ AtomicUInt Config::JOB_NUMBER;
+
+ JSFunction::JSFunction( string type , const BSONElement& e ) {
+ _type = type;
+ _code = e._asCode();
+
+ if ( e.type() == CodeWScope )
+ _wantedScope = e.codeWScopeObject();
+ }
+
+ void JSFunction::init( State * state ) {
+ _scope = state->scope();
+ assert( _scope );
+ _scope->init( &_wantedScope );
+
+ _func = _scope->createFunction( _code.c_str() );
+ uassert( 14836 , str::stream() << "couldn't compile code for: " << _type , _func );
+
+ // install in JS scope so that it can be called in JS mode
+ _scope->setFunction(_type.c_str(), _code.c_str());
+ }
+
+ /**
+ * Applies the finalize function to a tuple obj (key, val)
+ * Returns tuple obj {_id: key, value: newval}
+ */
+ BSONObj JSFinalizer::finalize( const BSONObj& o ) {
+ Scope * s = _func.scope();
+
+ Scope::NoDBAccess no = s->disableDBAccess( "can't access db inside finalize" );
+ s->invokeSafe( _func.func() , &o, 0 );
+
+ // don't want to use o.objsize() to size b
+ // since there are many cases where the point of finalize
+ // is converting many fields to 1
+ BSONObjBuilder b;
+ b.append( o.firstElement() );
+ s->append( b , "value" , "return" );
+ return b.obj();
+ }
+
+ void JSReducer::init( State * state ) {
+ _func.init( state );
+ }
+
+ /**
+ * Reduces a list of tuple objects (key, value) to a single tuple {"0": key, "1": value}
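+         * e.g. with a summing reduce function, [ { _id : k, value : 1 }, { _id : k, value : 2 } ]
+         * reduces to { "0" : k, "1" : 3 } (illustrative)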
+ */
+ BSONObj JSReducer::reduce( const BSONList& tuples ) {
+ if (tuples.size() <= 1)
+ return tuples[0];
+ BSONObj key;
+ int endSizeEstimate = 16;
+ _reduce( tuples , key , endSizeEstimate );
+
+ BSONObjBuilder b(endSizeEstimate);
+ b.appendAs( key.firstElement() , "0" );
+ _func.scope()->append( b , "1" , "return" );
+ return b.obj();
+ }
+
+ /**
+         * Reduces a list of tuple objects (key, value) to a single tuple {_id: key, value: val}
+ * Also applies a finalizer method if present.
+ */
+ BSONObj JSReducer::finalReduce( const BSONList& tuples , Finalizer * finalizer ) {
+
+ BSONObj res;
+ BSONObj key;
+
+ if (tuples.size() == 1) {
+ // 1 obj, just use it
+ key = tuples[0];
+ BSONObjBuilder b(key.objsize());
+ BSONObjIterator it(key);
+ b.appendAs( it.next() , "_id" );
+ b.appendAs( it.next() , "value" );
+ res = b.obj();
+ }
+ else {
+ // need to reduce
+ int endSizeEstimate = 16;
+ _reduce( tuples , key , endSizeEstimate );
+ BSONObjBuilder b(endSizeEstimate);
+ b.appendAs( key.firstElement() , "_id" );
+ _func.scope()->append( b , "value" , "return" );
+ res = b.obj();
+ }
+
+ if ( finalizer ) {
+ res = finalizer->finalize( res );
+ }
+
+ return res;
+ }
+
+ /**
+         * actually applies a reduce to a list of tuples (key, value).
+ * After the call, tuples will hold a single tuple {"0": key, "1": value}
+ */
+ void JSReducer::_reduce( const BSONList& tuples , BSONObj& key , int& endSizeEstimate ) {
+ int sizeEstimate = ( tuples.size() * tuples.begin()->getField( "value" ).size() ) + 128;
+
+ // need to build the reduce args: ( key, [values] )
+ BSONObjBuilder reduceArgs( sizeEstimate );
+ boost::scoped_ptr<BSONArrayBuilder> valueBuilder;
+ int sizeSoFar = 0;
+ unsigned n = 0;
+ for ( ; n<tuples.size(); n++ ) {
+ BSONObjIterator j(tuples[n]);
+ BSONElement keyE = j.next();
+ if ( n == 0 ) {
+ reduceArgs.append( keyE );
+ key = keyE.wrap();
+ sizeSoFar = 5 + keyE.size();
+ valueBuilder.reset(new BSONArrayBuilder( reduceArgs.subarrayStart( "tuples" ) ));
+ }
+
+ BSONElement ee = j.next();
+
+ uassert( 14837 , "value too large to reduce" , ee.size() < ( BSONObjMaxUserSize / 2 ) );
+
+ if ( sizeSoFar + ee.size() > BSONObjMaxUserSize ) {
+ assert( n > 1 ); // if not, inf. loop
+ break;
+ }
+
+ valueBuilder->append( ee );
+ sizeSoFar += ee.size();
+ }
+ assert(valueBuilder);
+ valueBuilder->done();
+ BSONObj args = reduceArgs.obj();
+
+ Scope * s = _func.scope();
+
+ s->invokeSafe( _func.func() , &args, 0, 0, false, true, true );
+ ++numReduces;
+
+ if ( s->type( "return" ) == Array ) {
+ uasserted( 14838 , "reduce -> multiple not supported yet");
+ return;
+ }
+
+ endSizeEstimate = key.objsize() + ( args.objsize() / tuples.size() );
+
+ if ( n == tuples.size() )
+ return;
+
+            // the input list was too large; add the remaining elements to new tuples and reduce again
+            // note: it would be better to use a loop instead of recursion to avoid stack overflow
+ BSONList x;
+ for ( ; n < tuples.size(); n++ ) {
+ x.push_back( tuples[n] );
+ }
+ BSONObjBuilder temp( endSizeEstimate );
+ temp.append( key.firstElement() );
+ s->append( temp , "1" , "return" );
+ x.push_back( temp.obj() );
+ _reduce( x , key , endSizeEstimate );
+ }
+
+ Config::Config( const string& _dbname , const BSONObj& cmdObj ) {
+
+ dbname = _dbname;
+ ns = dbname + "." + cmdObj.firstElement().valuestr();
+
+ verbose = cmdObj["verbose"].trueValue();
+ jsMode = cmdObj["jsMode"].trueValue();
+
+ jsMaxKeys = 500000;
+ reduceTriggerRatio = 2.0;
+ maxInMemSize = 5 * 1024 * 1024;
+
+ uassert( 14841 , "outType is no longer a valid option" , cmdObj["outType"].eoo() );
+
+ if ( cmdObj["out"].type() == String ) {
+ finalShort = cmdObj["out"].String();
+ outType = REPLACE;
+ }
+ else if ( cmdObj["out"].type() == Object ) {
+ BSONObj o = cmdObj["out"].embeddedObject();
+
+ BSONElement e = o.firstElement();
+ string t = e.fieldName();
+
+ if ( t == "normal" || t == "replace" ) {
+ outType = REPLACE;
+ finalShort = e.String();
+ }
+ else if ( t == "merge" ) {
+ outType = MERGE;
+ finalShort = e.String();
+ }
+ else if ( t == "reduce" ) {
+ outType = REDUCE;
+ finalShort = e.String();
+ }
+ else if ( t == "inline" ) {
+ outType = INMEMORY;
+ }
+ else {
+ uasserted( 14839 , str::stream() << "unknown out specifier [" << t << "]" );
+ }
+
+ if (o.hasElement("db")) {
+ outDB = o["db"].String();
+ }
+
+ if (o.hasElement("nonAtomic")) {
+ outNonAtomic = o["nonAtomic"].Bool();
+ }
+ }
+ else {
+ uasserted( 14840 , "'out' has to be a string or an object" );
+ }
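+
+            // Illustrative 'out' specifiers accepted above (collection/db names assumed):
+            //   "coll"                              -> REPLACE into <db>.coll
+            //   { merge : "coll", db : "otherdb" }  -> MERGE into otherdb.coll
+            //   { reduce : "coll" }                 -> REDUCE duplicate keys into coll
+            //   { inline : 1 }                      -> INMEMORY, no collection written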
+
+ if ( outType != INMEMORY ) { // setup names
+ tempLong = str::stream() << (outDB.empty() ? dbname : outDB) << ".tmp.mr." << cmdObj.firstElement().String() << "_" << finalShort << "_" << JOB_NUMBER++;
+
+ incLong = tempLong + "_inc";
+
+ finalLong = str::stream() << (outDB.empty() ? dbname : outDB) << "." << finalShort;
+ }
+
+ {
+ // scope and code
+
+ if ( cmdObj["scope"].type() == Object )
+ scopeSetup = cmdObj["scope"].embeddedObjectUserCheck();
+
+ reducer.reset( new JSReducer( cmdObj["reduce"] ) );
+ if ( cmdObj["finalize"].type() && cmdObj["finalize"].trueValue() )
+ finalizer.reset( new JSFinalizer( cmdObj["finalize"] ) );
+
+ }
+
+ {
+ // query options
+ if ( cmdObj["limit"].isNumber() )
+ limit = cmdObj["limit"].numberLong();
+ else
+ limit = 0;
+ }
+ }
+
+ State::State( const Config& c ) : _config( c ) {
+ _onDisk = _config.outType != Config::INMEMORY;
+ }
+
+ State::~State() {
+ if ( _onDisk ) {
+ try {
+// _db.dropCollection( _config.tempLong );
+// _db.dropCollection( _config.incLong );
+ }
+ catch ( std::exception& e ) {
+ error() << "couldn't cleanup after map reduce: " << e.what() << endl;
+ }
+ }
+
+ if (_scope) {
+ // cleanup js objects
+ ScriptingFunction cleanup = _scope->createFunction("delete _emitCt; delete _keyCt; delete _mrMap;");
+ _scope->invoke(cleanup, 0, 0, 0, true);
+ }
+ }
+
+ /**
+         * Initialize the mapreduce operation: set up the JS scope, reducer, and finalizer
+ */
+ void State::init() {
+ // setup js
+ _scope.reset(globalScriptEngine->getPooledScope( _config.dbname ).release() );
+// _scope->localConnect( _config.dbname.c_str() );
+ _scope->externalSetup();
+
+ if ( ! _config.scopeSetup.isEmpty() )
+ _scope->init( &_config.scopeSetup );
+
+ _config.reducer->init( this );
+ if ( _config.finalizer )
+ _config.finalizer->init( this );
+ _scope->setBoolean("_doFinal", _config.finalizer);
+ }
+ }
+}
+
diff --git a/src/mongo/s/mr_shard.h b/src/mongo/s/mr_shard.h
new file mode 100644
index 00000000000..7f96b54587f
--- /dev/null
+++ b/src/mongo/s/mr_shard.h
@@ -0,0 +1,235 @@
+// mr_shard.h
+
+/**
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+namespace mongo {
+
+ namespace mr_shard {
+
+ typedef vector<BSONObj> BSONList;
+
+ class State;
+
+ // ------------ function interfaces -----------
+
+ class Finalizer : boost::noncopyable {
+ public:
+ virtual ~Finalizer() {}
+ virtual void init( State * state ) = 0;
+
+ /**
+ * this takes a tuple and returns a tuple
+ */
+ virtual BSONObj finalize( const BSONObj& tuple ) = 0;
+ };
+
+ class Reducer : boost::noncopyable {
+ public:
+ Reducer() : numReduces(0) {}
+ virtual ~Reducer() {}
+ virtual void init( State * state ) = 0;
+
+ virtual BSONObj reduce( const BSONList& tuples ) = 0;
+            /** this means it's a final reduce, even if there is no finalizer */
+ virtual BSONObj finalReduce( const BSONList& tuples , Finalizer * finalizer ) = 0;
+
+ long long numReduces;
+ };
+
+ // ------------ js function implementations -----------
+
+ /**
+ * used as a holder for Scope and ScriptingFunction
+         * visitor-like pattern, as the Scope is obtained on first access
+ */
+ class JSFunction : boost::noncopyable {
+ public:
+ /**
+ * @param type (map|reduce|finalize)
+ */
+ JSFunction( string type , const BSONElement& e );
+ virtual ~JSFunction() {}
+
+ virtual void init( State * state );
+
+ Scope * scope() const { return _scope; }
+ ScriptingFunction func() const { return _func; }
+
+ private:
+ string _type;
+ string _code; // actual javascript code
+ BSONObj _wantedScope; // this is for CodeWScope
+
+ Scope * _scope; // this is not owned by us, and might be shared
+ ScriptingFunction _func;
+ };
+
+ class JSReducer : public Reducer {
+ public:
+ JSReducer( const BSONElement& code ) : _func( "_reduce" , code ) {}
+ virtual void init( State * state );
+
+ virtual BSONObj reduce( const BSONList& tuples );
+ virtual BSONObj finalReduce( const BSONList& tuples , Finalizer * finalizer );
+
+ private:
+
+ /**
+ * result in "return"
+ * @param key OUT
+ * @param endSizeEstimate OUT
+ */
+ void _reduce( const BSONList& values , BSONObj& key , int& endSizeEstimate );
+
+ JSFunction _func;
+ };
+
+ class JSFinalizer : public Finalizer {
+ public:
+ JSFinalizer( const BSONElement& code ) : _func( "_finalize" , code ) {}
+ virtual BSONObj finalize( const BSONObj& o );
+ virtual void init( State * state ) { _func.init( state ); }
+ private:
+ JSFunction _func;
+
+ };
+
+ // -----------------
+
+ /**
+ * holds map/reduce config information
+ */
+ class Config {
+ public:
+ Config( const string& _dbname , const BSONObj& cmdObj );
+
+ string dbname;
+ string ns;
+
+ // options
+ bool verbose;
+ bool jsMode;
+
+ // query options
+
+ BSONObj filter;
+ BSONObj sort;
+ long long limit;
+
+ // functions
+ scoped_ptr<Reducer> reducer;
+ scoped_ptr<Finalizer> finalizer;
+
+ BSONObj mapParams;
+ BSONObj scopeSetup;
+
+ // output tables
+ string incLong;
+ string tempLong;
+
+ string finalShort;
+ string finalLong;
+
+ string outDB;
+
+ // max number of keys allowed in JS map before switching mode
+ long jsMaxKeys;
+ // ratio of duplicates vs unique keys before reduce is triggered in js mode
+ float reduceTriggerRatio;
+ // maximum size of map before it gets dumped to disk
+ long maxInMemSize;
+
+ enum { REPLACE , // atomically replace the collection
+ MERGE , // merge keys, override dups
+ REDUCE , // merge keys, reduce dups
+ INMEMORY // only store in memory, limited in size
+ } outType;
+
+ // if true, no lock during output operation
+ bool outNonAtomic;
+
+ static AtomicUInt JOB_NUMBER;
+        }; // end Config
+
+ /**
+ * stores information about intermediate map reduce state
+ * controls flow of data from map->reduce->finalize->output
+ */
+ class State {
+ public:
+ State( const Config& c );
+ ~State();
+
+ void init();
+
+ // ---- prep -----
+ bool sourceExists();
+
+ long long incomingDocuments();
+
+ // ---- map stage ----
+
+ /**
+             * stages an emitted object in in-memory storage
+ */
+ void emit( const BSONObj& a );
+
+ /**
+ * if size is big, run a reduce
+             * if it's still big, dump to temp collection
+ */
+ void checkSize();
+
+ /**
+ * run reduce on _temp
+ */
+ void reduceInMemory();
+
+ // ------ reduce stage -----------
+
+ void prepTempCollection();
+
+ void finalReduce( BSONList& values );
+
+ void finalReduce( CurOp * op , ProgressMeterHolder& pm );
+
+ // ------ simple accessors -----
+
+ /** State maintains ownership, do not use past the State's lifetime */
+ Scope* scope() { return _scope.get(); }
+
+ const Config& config() { return _config; }
+
+ bool isOnDisk() const { return _onDisk; }
+
+ long long numReduces() const { return _config.reducer->numReduces; }
+
+ const Config& _config;
+
+ protected:
+
+ scoped_ptr<Scope> _scope;
+ bool _onDisk; // if the end result of this map reduce is disk or not
+ };
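+
+ // Rough data flow through a State, per the interface above (illustrative
+ // sketch only; the actual call sites live in the mapReduce command):
+ // State st( config );
+ // st.init(); // set up scope and map/reduce/finalize functions
+ // // map phase: each map() output goes through st.emit( obj ); st.checkSize();
+ // st.prepTempCollection();
+ // st.finalReduce( op , pm ); // reduce + finalize into the output collections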
+
+ } // end mr namespace
+}
+
+
diff --git a/src/mongo/s/request.cpp b/src/mongo/s/request.cpp
new file mode 100644
index 00000000000..96cce96685d
--- /dev/null
+++ b/src/mongo/s/request.cpp
@@ -0,0 +1,164 @@
+// s/request.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "server.h"
+
+#include "../db/commands.h"
+#include "../db/dbmessage.h"
+#include "../db/stats/counters.h"
+
+#include "../client/connpool.h"
+
+#include "request.h"
+#include "config.h"
+#include "chunk.h"
+#include "stats.h"
+#include "cursors.h"
+#include "grid.h"
+#include "client.h"
+
+namespace mongo {
+
+ Request::Request( Message& m, AbstractMessagingPort* p ) :
+ _m(m) , _d( m ) , _p(p) , _didInit(false) {
+
+ assert( _d.getns() );
+ _id = _m.header()->id;
+
+ _clientInfo = ClientInfo::get();
+ _clientInfo->newRequest( p );
+ }
+
+ void Request::checkAuth( Auth::Level levelNeeded ) const {
+ char cl[256];
+ nsToDatabase(getns(), cl);
+ uassert( 15845 ,
+ str::stream() << "unauthorized for db:" << cl << " level: " << levelNeeded ,
+ _clientInfo->getAuthenticationInfo()->isAuthorizedForLevel(cl,levelNeeded) );
+ }
+
+ void Request::init() {
+ if ( _didInit )
+ return;
+ _didInit = true;
+ reset();
+ }
+
+ // Deprecated, will move to the strategy itself
+ void Request::reset() {
+ if ( _m.operation() == dbKillCursors ) {
+ return;
+ }
+
+ uassert( 13644 , "can't use 'local' database through mongos" , ! str::startsWith( getns() , "local." ) );
+
+ // TODO: Deprecated, keeping to preserve codepath for now
+ const string nsStr (getns()); // use in functions taking string rather than char*
+
+ _config = grid.getDBConfig( nsStr );
+
+ // TODO: In general, throwing an exception when the cm doesn't exist is really annoying
+ if ( _config->isSharded( nsStr ) ) {
+ _chunkManager = _config->getChunkManagerIfExists( nsStr );
+ }
+ else {
+ _chunkManager.reset();
+ }
+
+ _m.header()->id = _id;
+ _clientInfo->clearCurrentShards();
+ }
+
+ // Deprecated, will move to the strategy itself
+ Shard Request::primaryShard() const {
+ assert( _didInit );
+
+ if ( _chunkManager ) {
+ if ( _chunkManager->numChunks() > 1 )
+ throw UserException( 8060 , "can't call primaryShard on a sharded collection" );
+ return _chunkManager->findChunk( _chunkManager->getShardKey().globalMin() )->getShard();
+ }
+ Shard s = _config->getShard( getns() );
+ uassert( 10194 , "can't find primary shard for ns" , s.ok() );
+ return s;
+ }
+
+ void Request::process( int attempt ) {
+ init();
+ int op = _m.operation();
+ assert( op > dbMsg );
+
+ if ( op == dbKillCursors ) {
+ cursorCache.gotKillCursors( _m );
+ return;
+ }
+
+
+ LOG(3) << "Request::process ns: " << getns() << " msg id:" << (int)(_m.header()->id) << " attempt: " << attempt << endl;
+
+ Strategy * s = SHARDED;
+ _counter = &opsNonSharded;
+
+ _d.markSet();
+
+ bool iscmd = false;
+ if ( op == dbQuery ) {
+ iscmd = isCommand();
+ s->queryOp( *this );
+ }
+ else if ( op == dbGetMore ) {
+ checkAuth( Auth::READ ); // this is important so someone can't steal a cursor
+ s->getMore( *this );
+ }
+ else {
+ checkAuth( Auth::WRITE );
+ s->writeOp( op, *this );
+ }
+
+ globalOpCounters.gotOp( op , iscmd );
+ _counter->gotOp( op , iscmd );
+ }
+
+ bool Request::isCommand() const {
+ int x = _d.getQueryNToReturn();
+ return ( x == 1 || x == -1 ) && strstr( getns() , ".$cmd" );
+ }
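+
+ // e.g. a query on "admin.$cmd" with nToReturn of 1 or -1 is a command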
+
+ void Request::gotInsert() {
+ globalOpCounters.gotInsert();
+ _counter->gotInsert();
+ }
+
+ void Request::reply( Message & response , const string& fromServer ) {
+ assert( _didInit );
+ long long cursor = response.header()->getCursor();
+ if ( cursor ) {
+ if ( fromServer.size() ) {
+ cursorCache.storeRef( fromServer , cursor );
+ }
+ else {
+ // probably a getMore
+ // make sure we have a ref for this
+ assert( cursorCache.getRef( cursor ).size() );
+ }
+ }
+ _p->reply( _m , response , _id );
+ }
+
+} // namespace mongo
diff --git a/src/mongo/s/request.h b/src/mongo/s/request.h
new file mode 100644
index 00000000000..f41ae6f6a5d
--- /dev/null
+++ b/src/mongo/s/request.h
@@ -0,0 +1,114 @@
+// request.h
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+#include "../pch.h"
+#include "../util/net/message.h"
+#include "../db/dbmessage.h"
+#include "config.h"
+#include "util.h"
+
+namespace mongo {
+
+
+ class OpCounters;
+ class ClientInfo;
+
+ class Request : boost::noncopyable {
+ public:
+ Request( Message& m, AbstractMessagingPort* p );
+
+ // ---- message info -----
+
+
+ const char * getns() const {
+ return _d.getns();
+ }
+ int op() const {
+ return _m.operation();
+ }
+ bool expectResponse() const {
+ return op() == dbQuery || op() == dbGetMore;
+ }
+ bool isCommand() const;
+
+ MSGID id() const {
+ return _id;
+ }
+
+ DBConfigPtr getConfig() const {
+ assert( _didInit );
+ return _config;
+ }
+ bool isShardingEnabled() const {
+ assert( _didInit );
+ return _config->isShardingEnabled();
+ }
+
+ ChunkManagerPtr getChunkManager() const {
+ assert( _didInit );
+ return _chunkManager;
+ }
+
+ ClientInfo * getClientInfo() const {
+ return _clientInfo;
+ }
+
+ void checkAuth( Auth::Level levelNeeded ) const;
+
+ // ---- remote location info -----
+
+
+ Shard primaryShard() const ;
+
+ // ---- low level access ----
+
+ void reply( Message & response , const string& fromServer );
+
+ Message& m() { return _m; }
+ DbMessage& d() { return _d; }
+ AbstractMessagingPort* p() const { return _p; }
+
+ void process( int attempt = 0 );
+
+ void gotInsert();
+
+ void init();
+
+ void reset();
+
+ private:
+ Message& _m;
+ DbMessage _d;
+ AbstractMessagingPort* _p;
+
+ MSGID _id;
+ DBConfigPtr _config;
+ ChunkManagerPtr _chunkManager;
+
+ ClientInfo * _clientInfo;
+
+ OpCounters* _counter;
+
+ bool _didInit;
+ };
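+
+ // Typical lifecycle, as driven by ShardedMessageHandler::process in server.cpp:
+ // Request r( m , p );
+ // r.init();
+ // r.process(); // strategies call reply() when a response is expected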
+
+}
+
+#include "strategy.h"
diff --git a/src/mongo/s/s_only.cpp b/src/mongo/s/s_only.cpp
new file mode 100644
index 00000000000..05e652db57e
--- /dev/null
+++ b/src/mongo/s/s_only.cpp
@@ -0,0 +1,111 @@
+// s_only.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "request.h"
+#include "client.h"
+#include "../client/dbclient.h"
+#include "../db/dbhelpers.h"
+#include "../db/matcher.h"
+#include "../db/commands.h"
+
+/*
+ mostly a pile of hacks to make linking nicer
+
+ */
+namespace mongo {
+
+ TSP_DEFINE(Client,currentClient)
+
+ Client::LockStatus::LockStatus() {
+ // why is mongo::Client used in mongos? that is very weird.
+ // commenting this out until that is cleaned up or until someone puts a comment here
+ // explaining why it does make sense.
+ ////dassert(false);
+ }
+
+ Client::Client(const char *desc , AbstractMessagingPort *p) :
+ _context(0),
+ _shutdown(false),
+ _desc(desc),
+ _god(0),
+ _lastOp(0),
+ _mp(p) {
+ }
+ Client::~Client() {}
+ bool Client::shutdown() { return true; }
+
+ static unsigned long long nThreads = 0;
+ void assertStartingUp() {
+ dassert( nThreads <= 1 );
+ }
+ Client& Client::initThread(const char *desc, AbstractMessagingPort *mp) {
+ DEV nThreads++; // never decremented; only used for startup sanity asserts (see assertStartingUp)
+ setThreadName(desc);
+ assert( currentClient.get() == 0 );
+ Client *c = new Client(desc, mp);
+ currentClient.reset(c);
+ mongo::lastError.initThread();
+ return *c;
+ }
+
+ string Client::clientAddress(bool includePort) const {
+ ClientInfo * ci = ClientInfo::get();
+ if ( ci )
+ return ci->getRemote();
+ return "";
+ }
+
+ bool execCommand( Command * c ,
+ Client& client , int queryOptions ,
+ const char *ns, BSONObj& cmdObj ,
+ BSONObjBuilder& result,
+ bool fromRepl ) {
+ assert(c);
+
+ string dbname = nsToDatabase( ns );
+
+ if ( cmdObj["help"].trueValue() ) {
+ stringstream ss;
+ ss << "help for: " << c->name << " ";
+ c->help( ss );
+ result.append( "help" , ss.str() );
+ result.append( "lockType" , c->locktype() );
+ return true;
+ }
+
+ if ( c->adminOnly() ) {
+ if ( dbname != "admin" ) {
+ result.append( "errmsg" , "access denied- use admin db" );
+ log() << "command denied: " << cmdObj.toString() << endl;
+ return false;
+ }
+ log( 2 ) << "command: " << cmdObj << endl;
+ }
+
+ if (!client.getAuthenticationInfo()->isAuthorized(dbname)) {
+ result.append("errmsg" , "unauthorized");
+ return false;
+ }
+
+ string errmsg;
+ int ok = c->run( dbname , cmdObj , queryOptions, errmsg , result , fromRepl );
+ if ( ! ok )
+ result.append( "errmsg" , errmsg );
+ return ok;
+ }
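+
+ // e.g. sending { serverStatus: 1, help: 1 } returns the command's help text
+ // and lockType without running it (illustrative; any command name works)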
+}
diff --git a/src/mongo/s/security.cpp b/src/mongo/s/security.cpp
new file mode 100644
index 00000000000..a88c36b1d20
--- /dev/null
+++ b/src/mongo/s/security.cpp
@@ -0,0 +1,101 @@
+// security.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../db/security_common.h"
+#include "../db/security.h"
+#include "config.h"
+#include "client.h"
+#include "grid.h"
+
+// this is the _mongos only_ implementation of security.h
+
+namespace mongo {
+
+ bool AuthenticationInfo::_warned;
+
+ bool CmdAuthenticate::getUserObj(const string& dbname, const string& user, BSONObj& userObj, string& pwd) {
+ if (user == internalSecurity.user) {
+ uassert(15890, "key file must be used to log in with internal user", cmdLine.keyFile);
+ pwd = internalSecurity.pwd;
+ }
+ else {
+ string systemUsers = dbname + ".system.users";
+ DBConfigPtr config = grid.getDBConfig( systemUsers );
+ Shard s = config->getShard( systemUsers );
+
+ static BSONObj userPattern = BSON("user" << 1);
+
+ ShardConnection conn( s, systemUsers );
+ OCCASIONALLY conn->ensureIndex(systemUsers, userPattern, false, "user_1");
+ {
+ BSONObjBuilder b;
+ b << "user" << user;
+ BSONObj query = b.done();
+ userObj = conn->findOne(systemUsers, query);
+ if( userObj.isEmpty() ) {
+ log() << "auth: couldn't find user " << user << ", " << systemUsers << endl;
+ conn.done(); // return to pool
+ return false;
+ }
+ }
+
+ pwd = userObj.getStringField("pwd");
+
+ conn.done(); // return to pool
+ }
+ return true;
+ }
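+
+ // For reference, the fetched user document is expected to look roughly like
+ // { user: "alice", pwd: "<md5 of 'alice:mongo:<password>'>", readOnly: false }
+ // (illustrative shape; only the "user" and "pwd" fields are used here)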
+
+ bool AuthenticationInfo::_isAuthorizedSpecialChecks( const string& dbname ) const {
+ if ( !isLocalHost ) {
+ return false;
+ }
+
+ string adminNs = "admin.system.users";
+
+ DBConfigPtr config = grid.getDBConfig( adminNs );
+ Shard s = config->getShard( adminNs );
+
+ ShardConnection conn( s, adminNs );
+ BSONObj result = conn->findOne("admin.system.users", Query());
+ if( result.isEmpty() ) {
+ if( ! _warned ) {
+ // you could get a few of these in a race, but that's ok
+ _warned = true;
+ log() << "note: no users configured in admin.system.users, allowing localhost access" << endl;
+ }
+
+ // Must return conn to pool
+ // TODO: Check for errors during findOne(), or just let the conn die?
+ conn.done();
+ return true;
+ }
+
+ // Must return conn to pool
+ conn.done();
+ return false;
+ }
+
+ bool CmdLogout::run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ AuthenticationInfo *ai = ClientInfo::get()->getAuthenticationInfo();
+ ai->logout(dbname);
+ return true;
+ }
+}
diff --git a/src/mongo/s/server.cpp b/src/mongo/s/server.cpp
new file mode 100644
index 00000000000..63b3c368ab6
--- /dev/null
+++ b/src/mongo/s/server.cpp
@@ -0,0 +1,429 @@
+// server.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../util/net/message.h"
+#include "../util/unittest.h"
+#include "../client/connpool.h"
+#include "../util/net/message_server.h"
+#include "../util/stringutils.h"
+#include "../util/version.h"
+#include "../util/ramlog.h"
+#include "../util/signal_handlers.h"
+#include "../util/admin_access.h"
+#include "../util/concurrency/task.h"
+#include "../db/dbwebserver.h"
+#include "../scripting/engine.h"
+
+#include "server.h"
+#include "request.h"
+#include "client.h"
+#include "config.h"
+#include "chunk.h"
+#include "balance.h"
+#include "grid.h"
+#include "cursors.h"
+#include "shard_version.h"
+
+namespace mongo {
+
+ CmdLine cmdLine;
+ Database *database = 0;
+ string mongosCommand;
+ bool dbexitCalled = false;
+ static bool scriptingEnabled = true;
+
+ bool inShutdown() {
+ return dbexitCalled;
+ }
+
+ string getDbContext() {
+ return "?";
+ }
+
+ bool haveLocalShardingInfo( const string& ns ) {
+ assert( 0 );
+ return false;
+ }
+
+ void usage( char * argv[] ) {
+ out() << argv[0] << " usage:\n\n";
+ out() << " -v+ verbose 1: general 2: more 3: per request 4: more\n";
+ out() << " --port <portno>\n";
+ out() << " --configdb <configdbname>,[<configdbname>,<configdbname>]\n";
+ out() << endl;
+ }
+
+ void ShardingConnectionHook::onHandedOut( DBClientBase * conn ) {
+ ClientInfo::get()->addShard( conn->getServerAddress() );
+ }
+
+ class ShardedMessageHandler : public MessageHandler {
+ public:
+ virtual ~ShardedMessageHandler() {}
+
+ virtual void connected( AbstractMessagingPort* p ) {
+ ClientInfo *c = ClientInfo::get();
+ massert(15849, "client info not defined", c);
+ c->getAuthenticationInfo()->isLocalHost = p->remote().isLocalHost();
+ }
+
+ virtual void process( Message& m , AbstractMessagingPort* p , LastError * le) {
+ assert( p );
+ Request r( m , p );
+
+ assert( le );
+ lastError.startRequest( m , le );
+
+ try {
+ r.init();
+ r.process();
+ }
+ catch ( AssertionException & e ) {
+ log( e.isUserAssertion() ? 1 : 0 ) << "AssertionException while processing op type : " << m.operation() << " to : " << r.getns() << causedBy(e) << endl;
+
+ le->raiseError( e.getCode() , e.what() );
+
+ m.header()->id = r.id();
+
+ if ( r.expectResponse() ) {
+ BSONObj err = BSON( "$err" << e.what() << "code" << e.getCode() );
+ replyToQuery( ResultFlag_ErrSet, p , m , err );
+ }
+ }
+ catch ( DBException& e ) {
+ log() << "DBException in process: " << e.what() << endl;
+
+ le->raiseError( e.getCode() , e.what() );
+
+ m.header()->id = r.id();
+
+ if ( r.expectResponse() ) {
+ BSONObj err = BSON( "$err" << e.what() << "code" << e.getCode() );
+ replyToQuery( ResultFlag_ErrSet, p , m , err );
+ }
+ }
+ }
+
+ virtual void disconnected( AbstractMessagingPort* p ) {
+ // all things are thread local
+ }
+ };
+
+ void sighandler(int sig) {
+ dbexit(EXIT_CLEAN, (string("received signal ") + BSONObjBuilder::numStr(sig)).c_str());
+ }
+
+ // this gets called when new fails to allocate memory
+ void my_new_handler() {
+ rawOut( "out of memory, printing stack and exiting:" );
+ printStackTrace();
+ ::exit(EXIT_ABRUPT);
+ }
+
+ void setupSignals( bool inFork ) {
+ signal(SIGTERM, sighandler);
+ signal(SIGINT, sighandler);
+
+#if defined(SIGQUIT)
+ signal( SIGQUIT , printStackAndExit );
+#endif
+ signal( SIGSEGV , printStackAndExit );
+ signal( SIGABRT , printStackAndExit );
+ signal( SIGFPE , printStackAndExit );
+#if defined(SIGBUS)
+ signal( SIGBUS , printStackAndExit );
+#endif
+
+ set_new_handler( my_new_handler );
+ }
+
+ void init() {
+ serverID.init();
+ setupSIGTRAPforGDB();
+ setupCoreSignals();
+ setupSignals( false );
+ Logstream::get().addGlobalTee( new RamLog("global") );
+ }
+
+ void start( const MessageServer::Options& opts ) {
+ setThreadName( "mongosMain" );
+
+ balancer.go();
+ cursorCache.startTimeoutThread();
+ PeriodicTask::theRunner->go();
+
+ ShardedMessageHandler handler;
+ MessageServer * server = createServer( opts , &handler );
+ server->setAsTimeTracker();
+ server->run();
+ }
+
+ DBClientBase *createDirectClient() {
+ uassert( 10197 , "createDirectClient not implemented for sharding yet" , 0 );
+ return 0;
+ }
+
+ void printShardingVersionInfo(bool out) {
+ if (out) {
+ cout << mongosCommand << " " << mongodVersion() << " starting (--help for usage)" << endl;
+ cout << "git version: " << gitVersion() << endl;
+ cout << "build sys info: " << sysInfo() << endl;
+ } else {
+ log() << mongosCommand << " " << mongodVersion() << " starting (--help for usage)" << endl;
+ printGitVersion();
+ printSysInfo();
+ }
+ }
+
+ void cloudCmdLineParamIs(string cmd);
+
+} // namespace mongo
+
+using namespace mongo;
+
+#include <boost/program_options.hpp>
+
+namespace po = boost::program_options;
+
+int _main(int argc, char* argv[]) {
+ static StaticObserver staticObserver;
+ mongosCommand = argv[0];
+
+ po::options_description options("General options");
+ po::options_description sharding_options("Sharding options");
+ po::options_description hidden("Hidden options");
+ po::positional_options_description positional;
+
+ CmdLine::addGlobalOptions( options , hidden );
+
+ sharding_options.add_options()
+ ( "configdb" , po::value<string>() , "1 or 3 comma separated config servers" )
+ ( "test" , "just run unit tests" )
+ ( "upgrade" , "upgrade meta data version" )
+ ( "chunkSize" , po::value<int>(), "maximum amount of data per chunk" )
+ ( "ipv6", "enable IPv6 support (disabled by default)" )
+ ( "jsonp","allow JSONP access via http (has security implications)" )
+ ("noscripting", "disable scripting engine")
+ ;
+
+ options.add(sharding_options);
+ // parse options
+ po::variables_map params;
+ if ( ! CmdLine::store( argc , argv , options , hidden , positional , params ) )
+ return 0;
+
+ // The default value may vary depending on compile options, but for mongos
+ // we want durability to be disabled.
+ cmdLine.dur = false;
+
+ if ( params.count( "help" ) ) {
+ cout << options << endl;
+ return 0;
+ }
+
+ if ( params.count( "version" ) ) {
+ printShardingVersionInfo(true);
+ return 0;
+ }
+
+ if ( params.count( "chunkSize" ) ) {
+ int csize = params["chunkSize"].as<int>();
+
+ // validate chunksize before proceeding
+ if ( csize == 0 ) {
+ out() << "error: need a non-zero chunksize" << endl;
+ return 11;
+ }
+
+ Chunk::MaxChunkSize = csize * 1024 * 1024;
+ }
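+
+ // e.g. "--chunkSize 64" gives Chunk::MaxChunkSize = 64 * 1024 * 1024 bytes (64MB)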
+
+ if ( params.count( "ipv6" ) ) {
+ enableIPv6();
+ }
+
+ if ( params.count( "jsonp" ) ) {
+ cmdLine.jsonp = true;
+ }
+
+ if ( params.count( "test" ) ) {
+ logLevel = 5;
+ UnitTest::runTests();
+ cout << "tests passed" << endl;
+ return 0;
+ }
+
+ if (params.count("noscripting")) {
+ scriptingEnabled = false;
+ }
+
+ if ( ! params.count( "configdb" ) ) {
+ out() << "error: no args for --configdb" << endl;
+ return 4;
+ }
+
+ if( params.count("cloud") ) {
+ string s = params["cloud"].as<string>();
+ cloudCmdLineParamIs(s);
+ }
+
+ vector<string> configdbs;
+ splitStringDelim( params["configdb"].as<string>() , &configdbs , ',' );
+ if ( configdbs.size() != 1 && configdbs.size() != 3 ) {
+ out() << "need either 1 or 3 configdbs" << endl;
+ return 5;
+ }
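+
+ // e.g. --configdb cfg1.example.net:27019,cfg2.example.net:27019,cfg3.example.net:27019
+ // (hostnames illustrative; one config server is fine for testing, three for production)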
+
+ // we either have a setting where all processes are in localhost or none are
+ for ( vector<string>::const_iterator it = configdbs.begin() ; it != configdbs.end() ; ++it ) {
+ try {
+
+ HostAndPort configAddr( *it ); // will throw if address format is invalid
+
+ if ( it == configdbs.begin() ) {
+ grid.setAllowLocalHost( configAddr.isLocalHost() );
+ }
+
+ if ( configAddr.isLocalHost() != grid.allowLocalHost() ) {
+ out() << "cannot mix localhost and ip addresses in configdbs" << endl;
+ return 10;
+ }
+
+ }
+ catch ( DBException& e) {
+ out() << "configdb: " << e.what() << endl;
+ return 9;
+ }
+ }
+
+ // set some global state
+
+ pool.addHook( new ShardingConnectionHook( false ) );
+ pool.setName( "mongos connectionpool" );
+
+ shardConnectionPool.addHook( new ShardingConnectionHook( true ) );
+ shardConnectionPool.setName( "mongos shardconnection connectionpool" );
+
+
+ DBClientConnection::setLazyKillCursor( false );
+
+ ReplicaSetMonitor::setConfigChangeHook( boost::bind( &ConfigServer::replicaSetChange , &configServer , _1 ) );
+
+ if ( argc <= 1 ) {
+ usage( argv );
+ return 3;
+ }
+
+ bool ok = cmdLine.port != 0 && configdbs.size();
+
+ if ( !ok ) {
+ usage( argv );
+ return 1;
+ }
+
+ printShardingVersionInfo(false);
+
+ if ( ! configServer.init( configdbs ) ) {
+ cout << "couldn't resolve config db address" << endl;
+ return 7;
+ }
+
+ if ( ! configServer.ok( true ) ) {
+ cout << "configServer connection startup check failed" << endl;
+ return 8;
+ }
+
+ {
+ class CheckConfigServers : public task::Task {
+ virtual string name() const { return "CheckConfigServers"; }
+ virtual void doWork() { configServer.ok(true); }
+ };
+ static CheckConfigServers checkConfigServers;
+
+ task::repeat(&checkConfigServers, 60*1000);
+ }
+
+ int configError = configServer.checkConfigVersion( params.count( "upgrade" ) );
+ if ( configError ) {
+ if ( configError > 0 ) {
+ cout << "upgrade success!" << endl;
+ }
+ else {
+ cout << "config server error: " << configError << endl;
+ }
+ return configError;
+ }
+ configServer.reloadSettings();
+
+ init();
+
+#ifndef _WIN32
+ CmdLine::launchOk();
+#endif
+
+ boost::thread web( boost::bind(&webServerThread, new NoAdminAccess() /* takes ownership */) );
+
+ if ( scriptingEnabled ) {
+ ScriptEngine::setup();
+// globalScriptEngine->setCheckInterruptCallback( jsInterruptCallback );
+// globalScriptEngine->setGetInterruptSpecCallback( jsGetInterruptSpecCallback );
+ }
+
+ MessageServer::Options opts;
+ opts.port = cmdLine.port;
+ opts.ipList = cmdLine.bind_ip;
+ start(opts);
+
+ dbexit( EXIT_CLEAN );
+ return 0;
+}
+int main(int argc, char* argv[]) {
+ try {
+ doPreServerStartupInits();
+ return _main(argc, argv);
+ }
+ catch(DBException& e) {
+ cout << "uncaught exception in mongos main:" << endl;
+ cout << e.toString() << endl;
+ }
+ catch(std::exception& e) {
+ cout << "uncaught exception in mongos main:" << endl;
+ cout << e.what() << endl;
+ }
+ catch(...) {
+ cout << "uncaught exception in mongos main" << endl;
+ }
+ return 20;
+}
+
+#undef exit
+
+void mongo::exitCleanly( ExitCode code ) {
+ // TODO: do we need to add anything?
+ mongo::dbexit( code );
+}
+
+void mongo::dbexit( ExitCode rc, const char *why, bool tryToGetLock ) {
+ dbexitCalled = true;
+ log() << "dbexit: " << why
+ << " rc:" << rc
+ << " " << ( why ? why : "" )
+ << endl;
+ ::exit(rc);
+}
diff --git a/src/mongo/s/server.h b/src/mongo/s/server.h
new file mode 100644
index 00000000000..18e91e266fd
--- /dev/null
+++ b/src/mongo/s/server.h
@@ -0,0 +1,29 @@
+// server.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <string>
+#include "../util/net/message.h"
+#include "../db/jsobj.h"
+
+namespace mongo {
+
+ extern OID serverID;
+
+ // from request.cpp
+ void processRequest(Message& m, MessagingPort& p);
+}
diff --git a/src/mongo/s/shard.cpp b/src/mongo/s/shard.cpp
new file mode 100644
index 00000000000..81b41c7fcbc
--- /dev/null
+++ b/src/mongo/s/shard.cpp
@@ -0,0 +1,410 @@
+// shard.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "shard.h"
+#include "config.h"
+#include "request.h"
+#include "client.h"
+#include "../db/commands.h"
+#include <set>
+
+namespace mongo {
+
+ typedef shared_ptr<Shard> ShardPtr;
+
+ class StaticShardInfo {
+ public:
+ StaticShardInfo() : _mutex("StaticShardInfo"), _rsMutex("RSNameMap") { }
+ void reload() {
+
+ list<BSONObj> all;
+ {
+ ScopedDbConnection conn( configServer.getPrimary() );
+ auto_ptr<DBClientCursor> c = conn->query( ShardNS::shard , Query() );
+ massert( 13632 , "couldn't get updated shard list from config server" , c.get() );
+ while ( c->more() ) {
+ all.push_back( c->next().getOwned() );
+ }
+ conn.done();
+ }
+
+ scoped_lock lk( _mutex );
+
+ // We use the _lookup table for all shards and for the primary config DB. The config DB info,
+ // however, does not come from the ShardNS::shard. So when cleaning the _lookup table we leave
+ // the config state intact. The rationale is that this way we could drop shards that
+ // were removed without reinitializing the config DB information.
+
+ ShardMap::iterator i = _lookup.find( "config" );
+ if ( i != _lookup.end() ) {
+ ShardPtr config = i->second;
+ _lookup.clear();
+ _lookup[ "config" ] = config;
+ }
+ else {
+ _lookup.clear();
+ }
+ _rsLookup.clear();
+
+ for ( list<BSONObj>::iterator i=all.begin(); i!=all.end(); ++i ) {
+ BSONObj o = *i;
+ string name = o["_id"].String();
+ string host = o["host"].String();
+
+ long long maxSize = 0;
+ BSONElement maxSizeElem = o[ ShardFields::maxSize.name() ];
+ if ( ! maxSizeElem.eoo() ) {
+ maxSize = maxSizeElem.numberLong();
+ }
+
+ bool isDraining = false;
+ BSONElement isDrainingElem = o[ ShardFields::draining.name() ];
+ if ( ! isDrainingElem.eoo() ) {
+ isDraining = isDrainingElem.Bool();
+ }
+
+ ShardPtr s( new Shard( name , host , maxSize , isDraining ) );
+ _lookup[name] = s;
+ _installHost( host , s );
+ }
+
+ }
+
+ ShardPtr find( const string& ident ) {
+ string mykey = ident;
+
+ {
+ scoped_lock lk( _mutex );
+ ShardMap::iterator i = _lookup.find( mykey );
+
+ if ( i != _lookup.end() )
+ return i->second;
+ }
+
+ // not in our maps, re-load all
+ reload();
+
+ scoped_lock lk( _mutex );
+ ShardMap::iterator i = _lookup.find( mykey );
+ massert( 13129 , (string)"can't find shard for: " + mykey , i != _lookup.end() );
+ return i->second;
+ }
+
+ // Lookup shard by replica set name. Returns Shard::EMPTY if the name can't be found.
+ // Note: this doesn't refresh the table if the name isn't found, so it's possible that
+ // a newly added shard/Replica Set may not be found.
+ Shard lookupRSName( const string& name) {
+ scoped_lock lk( _rsMutex );
+ ShardMap::iterator i = _rsLookup.find( name );
+
+ return (i == _rsLookup.end()) ? Shard::EMPTY : i->second.get();
+ }
+
+ // Useful for ensuring our shard data will not be modified while we use it
+ Shard findCopy( const string& ident ){
+ ShardPtr found = find( ident );
+ scoped_lock lk( _mutex );
+ massert( 13128 , (string)"can't find shard for: " + ident , found.get() );
+ return *found.get();
+ }
+
+ void set( const string& name , const Shard& s , bool setName = true , bool setAddr = true ) {
+ scoped_lock lk( _mutex );
+ ShardPtr ss( new Shard( s ) );
+ if ( setName )
+ _lookup[name] = ss;
+ if ( setAddr )
+ _installHost( s.getConnString() , ss );
+ }
+
+ void _installHost( const string& host , const ShardPtr& s ) {
+ _lookup[host] = s;
+
+ const ConnectionString& cs = s->getAddress();
+ if ( cs.type() == ConnectionString::SET ) {
+ if ( cs.getSetName().size() ) {
+ scoped_lock lk( _rsMutex);
+ _rsLookup[ cs.getSetName() ] = s;
+ }
+ vector<HostAndPort> servers = cs.getServers();
+ for ( unsigned i=0; i<servers.size(); i++ ) {
+ _lookup[ servers[i].toString() ] = s;
+ }
+ }
+ }
+
+ void remove( const string& name ) {
+ scoped_lock lk( _mutex );
+ for ( ShardMap::iterator i = _lookup.begin(); i!=_lookup.end(); ) {
+ ShardPtr s = i->second;
+ if ( s->getName() == name ) {
+ _lookup.erase(i++);
+ }
+ else {
+ ++i;
+ }
+ }
+ for ( ShardMap::iterator i = _rsLookup.begin(); i!=_rsLookup.end(); ) {
+ ShardPtr s = i->second;
+ if ( s->getName() == name ) {
+ _rsLookup.erase(i++);
+ }
+ else {
+ ++i;
+ }
+ }
+ }
+
+ void getAllShards( vector<ShardPtr>& all ) const {
+ scoped_lock lk( _mutex );
+ std::set<string> seen;
+ for ( ShardMap::const_iterator i = _lookup.begin(); i!=_lookup.end(); ++i ) {
+ const ShardPtr& s = i->second;
+ if ( s->getName() == "config" )
+ continue;
+ if ( seen.count( s->getName() ) )
+ continue;
+ seen.insert( s->getName() );
+ all.push_back( s );
+ }
+ }
+
+ void getAllShards( vector<Shard>& all ) const {
+ scoped_lock lk( _mutex );
+ std::set<string> seen;
+ for ( ShardMap::const_iterator i = _lookup.begin(); i!=_lookup.end(); ++i ) {
+ const ShardPtr& s = i->second;
+ if ( s->getName() == "config" )
+ continue;
+ if ( seen.count( s->getName() ) )
+ continue;
+ seen.insert( s->getName() );
+ all.push_back( *s );
+ }
+ }
+
+
+ bool isAShardNode( const string& addr ) const {
+ scoped_lock lk( _mutex );
+
+ // check direct nodes or set names
+ ShardMap::const_iterator i = _lookup.find( addr );
+ if ( i != _lookup.end() )
+ return true;
+
+ // check for set nodes
+ for ( ShardMap::const_iterator i = _lookup.begin(); i!=_lookup.end(); ++i ) {
+ if ( i->first == "config" )
+ continue;
+
+ if ( i->second->containsNode( addr ) )
+ return true;
+ }
+
+ return false;
+ }
+
+ bool getShardMap( BSONObjBuilder& result , string& errmsg ) const {
+ scoped_lock lk( _mutex );
+
+ BSONObjBuilder b( _lookup.size() + 50 );
+
+ for ( ShardMap::const_iterator i = _lookup.begin(); i!=_lookup.end(); ++i ) {
+ b.append( i->first , i->second->getConnString() );
+ }
+
+ result.append( "map" , b.obj() );
+
+ return true;
+ }
+
+ private:
+ typedef map<string,ShardPtr> ShardMap;
+ ShardMap _lookup;
+ ShardMap _rsLookup; // Map from ReplSet name to shard
+ mutable mongo::mutex _mutex;
+ mutable mongo::mutex _rsMutex;
+ } staticShardInfo;
+
+
+ class CmdGetShardMap : public Command {
+ public:
+ CmdGetShardMap() : Command( "getShardMap" ){}
+ virtual void help( stringstream &help ) const { help<<"internal"; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+
+ virtual bool run(const string&, mongo::BSONObj&, int, std::string& errmsg , mongo::BSONObjBuilder& result, bool) {
+ return staticShardInfo.getShardMap( result , errmsg );
+ }
+ } cmdGetShardMap;
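+
+ // Illustrative response to { getShardMap: 1 } on the admin db (hostnames hypothetical):
+ // { map: { "config": "cfg1:27019", "shard0000": "host1:27018", ... }, ok: 1 }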
+
+
+ void Shard::_setAddr( const string& addr ) {
+ _addr = addr;
+ if ( !_addr.empty() ) {
+ _cs = ConnectionString( addr , ConnectionString::SET );
+ _rsInit();
+ }
+ }
+
+ void Shard::_rsInit() {
+ if ( _cs.type() == ConnectionString::SET ) {
+ string x = _cs.getSetName();
+ massert( 14807 , str::stream() << "no set name for shard: " << _name << " " << _cs.toString() , x.size() );
+ _rs = ReplicaSetMonitor::get( x , _cs.getServers() );
+ }
+ }
+
+ void Shard::setAddress( const ConnectionString& cs) {
+ assert( _name.size() );
+ _addr = cs.toString();
+ _cs = cs;
+ _rsInit();
+ staticShardInfo.set( _name , *this , true , false );
+ }
+
+ void Shard::reset( const string& ident ) {
+ *this = staticShardInfo.findCopy( ident );
+ _rs.reset();
+ _rsInit();
+ }
+
+ bool Shard::containsNode( const string& node ) const {
+ if ( _addr == node )
+ return true;
+
+ if ( _rs && _rs->contains( node ) )
+ return true;
+
+ return false;
+ }
+
+ void Shard::getAllShards( vector<Shard>& all ) {
+ staticShardInfo.getAllShards( all );
+ }
+
+ bool Shard::isAShardNode( const string& ident ) {
+ return staticShardInfo.isAShardNode( ident );
+ }
+
+ Shard Shard::lookupRSName( const string& name) {
+ return staticShardInfo.lookupRSName(name);
+ }
+
+ void Shard::printShardInfo( ostream& out ) {
+ vector<Shard> all;
+ staticShardInfo.getAllShards( all );
+ for ( unsigned i=0; i<all.size(); i++ )
+ out << all[i].toString() << "\n";
+ out.flush();
+ }
+
+ BSONObj Shard::runCommand( const string& db , const BSONObj& cmd ) const {
+ ScopedDbConnection conn( this );
+ BSONObj res;
+ bool ok = conn->runCommand( db , cmd , res );
+ if ( ! ok ) {
+ stringstream ss;
+ ss << "runCommand (" << cmd << ") on shard (" << _name << ") failed : " << res;
+ throw UserException( 13136 , ss.str() );
+ }
+ res = res.getOwned();
+ conn.done();
+ return res;
+ }
+
+ ShardStatus Shard::getStatus() const {
+ return ShardStatus( *this , runCommand( "admin" , BSON( "serverStatus" << 1 ) ) );
+ }
+
+ void Shard::reloadShardInfo() {
+ staticShardInfo.reload();
+ }
+
+
+ void Shard::removeShard( const string& name ) {
+ staticShardInfo.remove( name );
+ }
+
+ Shard Shard::pick( const Shard& current ) {
+ vector<Shard> all;
+ staticShardInfo.getAllShards( all );
+ if ( all.size() == 0 ) {
+ staticShardInfo.reload();
+ staticShardInfo.getAllShards( all );
+ if ( all.size() == 0 )
+ return EMPTY;
+ }
+
+ // if current shard was provided, pick a different shard only if it is a better choice
+ ShardStatus best = all[0].getStatus();
+ if ( current != EMPTY ) {
+ best = current.getStatus();
+ }
+
+ for ( size_t i=0; i<all.size(); i++ ) {
+ ShardStatus t = all[i].getStatus();
+ if ( t < best )
+ best = t;
+ }
+
+ LOG(1) << "best shard for new allocation is " << best << endl;
+ return best.shard();
+ }
+
+ ShardStatus::ShardStatus( const Shard& shard , const BSONObj& obj )
+ : _shard( shard ) {
+ _mapped = obj.getFieldDotted( "mem.mapped" ).numberLong();
+ _hasOpsQueued = obj["writeBacksQueued"].Bool();
+ _writeLock = 0; // TODO
+ }
+
+ void ShardingConnectionHook::onCreate( DBClientBase * conn ) {
+ if( !noauth ) {
+ string err;
+ LOG(2) << "calling onCreate auth for " << conn->toString() << endl;
+ uassert( 15847, "can't authenticate to shard server",
+ conn->auth("local", internalSecurity.user, internalSecurity.pwd, err, false));
+ }
+
+ if ( _shardedConnections && versionManager.isVersionableCB( conn ) ) {
+
+ // We must initialize sharding on all connections, so that we get exceptions if sharding is enabled on
+ // the collection.
+ BSONObj result;
+ bool ok = versionManager.initShardVersionCB( conn, result );
+
+ // assert that we actually successfully setup sharding
+ uassert( 15907, str::stream() << "could not initialize sharding on connection " << (*conn).toString() <<
+ ( result["errmsg"].type() == String ? causedBy( result["errmsg"].String() ) :
+ causedBy( (string)"unknown failure : " + result.toString() ) ), ok );
+
+ }
+ }
+
+ void ShardingConnectionHook::onDestroy( DBClientBase * conn ) {
+
+ if( _shardedConnections && versionManager.isVersionableCB( conn ) ){
+ versionManager.resetShardVersionCB( conn );
+ }
+
+ }
+}
diff --git a/src/mongo/s/shard.h b/src/mongo/s/shard.h
new file mode 100644
index 00000000000..6b52c58a932
--- /dev/null
+++ b/src/mongo/s/shard.h
@@ -0,0 +1,308 @@
+// @file shard.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "../client/connpool.h"
+
+namespace mongo {
+
+ class ShardConnection;
+ class ShardStatus;
+
+ /*
+ * A "shard" one partition of the overall database (and a replica set typically).
+ */
+
+ class Shard {
+ public:
+ Shard()
+ : _name("") , _addr("") , _maxSize(0) , _isDraining( false ) {
+ }
+
+ Shard( const string& name , const string& addr, long long maxSize = 0 , bool isDraining = false )
+ : _name(name) , _addr( addr ) , _maxSize( maxSize ) , _isDraining( isDraining ) {
+ _setAddr( addr );
+ }
+
+ Shard( const string& ident ) {
+ reset( ident );
+ }
+
+ Shard( const Shard& other )
+ : _name( other._name ) , _addr( other._addr ) , _cs( other._cs ) ,
+ _maxSize( other._maxSize ) , _isDraining( other._isDraining ) , _rs( other._rs ) {
+ }
+
+ Shard( const Shard* other )
+ : _name( other->_name ) , _addr( other->_addr ), _cs( other->_cs ) ,
+ _maxSize( other->_maxSize ) , _isDraining( other->_isDraining ) , _rs( other->_rs ) {
+ }
+
+ static Shard make( const string& ident ) {
+ Shard s;
+ s.reset( ident );
+ return s;
+ }
+
+ /**
+ * @param ident either name or address
+ */
+ void reset( const string& ident );
+
+ void setAddress( const ConnectionString& cs );
+
+ ConnectionString getAddress() const { return _cs; }
+
+ string getName() const {
+ assert( _name.size() );
+ return _name;
+ }
+
+ string getConnString() const {
+ assert( _addr.size() );
+ return _addr;
+ }
+
+ long long getMaxSize() const {
+ return _maxSize;
+ }
+
+ bool isDraining() const {
+ return _isDraining;
+ }
+
+ string toString() const {
+ return _name + ":" + _addr;
+ }
+
+ friend ostream& operator << (ostream& out, const Shard& s) {
+ return (out << s.toString());
+ }
+
+ bool operator==( const Shard& s ) const {
+ bool n = _name == s._name;
+ bool a = _addr == s._addr;
+
+ assert( n == a ); // names and addresses are 1-to-1
+ return n;
+ }
+
+ bool operator!=( const Shard& s ) const {
+ bool n = _name == s._name;
+ bool a = _addr == s._addr;
+ return ! ( n && a );
+ }
+
+
+ bool operator==( const string& s ) const {
+ return _name == s || _addr == s;
+ }
+
+ bool operator!=( const string& s ) const {
+ return _name != s && _addr != s;
+ }
+
+ bool operator<(const Shard& o) const {
+ return _name < o._name;
+ }
+
+ bool ok() const { return _addr.size() > 0; }
+
+ BSONObj runCommand( const string& db , const string& simple ) const {
+ return runCommand( db , BSON( simple << 1 ) );
+ }
+ BSONObj runCommand( const string& db , const BSONObj& cmd ) const ;
+
+ ShardStatus getStatus() const ;
+
+ /**
+ * mostly for replica sets
+ * returns true if node is this shard,
+ * or if the replica set contains node
+ */
+ bool containsNode( const string& node ) const;
+
+ static void getAllShards( vector<Shard>& all );
+ static void printShardInfo( ostream& out );
+ static Shard lookupRSName( const string& name);
+
+ /**
+ * @param current - shard where the chunk/database currently lives
+ * @return the emptiest shard; current if it is already the best choice, or EMPTY if there are none
+ */
+ static Shard pick( const Shard& current = EMPTY );
+
+ static void reloadShardInfo();
+
+ static void removeShard( const string& name );
+
+ static bool isAShardNode( const string& ident );
+
+ static Shard EMPTY;
+
+ private:
+
+ void _rsInit();
+ void _setAddr( const string& addr );
+
+ string _name;
+ string _addr;
+ ConnectionString _cs;
+ long long _maxSize; // in MBytes, 0 is unlimited
+ bool _isDraining; // shard is currently being removed
+ ReplicaSetMonitorPtr _rs;
+ };
+
+ class ShardStatus {
+ public:
+
+ ShardStatus( const Shard& shard , const BSONObj& obj );
+
+ friend ostream& operator << (ostream& out, const ShardStatus& s) {
+ out << s.toString();
+ return out;
+ }
+
+ string toString() const {
+ stringstream ss;
+ ss << "shard: " << _shard << " mapped: " << _mapped << " writeLock: " << _writeLock;
+ return ss.str();
+ }
+
+ bool operator<( const ShardStatus& other ) const {
+ return _mapped < other._mapped;
+ }
+
+ Shard shard() const {
+ return _shard;
+ }
+
+ long long mapped() const {
+ return _mapped;
+ }
+
+ bool hasOpsQueued() const {
+ return _hasOpsQueued;
+ }
+
+ private:
+ Shard _shard;
+ long long _mapped;
+ bool _hasOpsQueued; // true if 'writebacks' are pending
+ double _writeLock;
+ };
+
+ class ChunkManager;
+ typedef shared_ptr<const ChunkManager> ChunkManagerPtr;
+
+ class ShardConnection : public AScopedConnection {
+ public:
+ ShardConnection( const Shard * s , const string& ns, ChunkManagerPtr manager = ChunkManagerPtr() );
+ ShardConnection( const Shard& s , const string& ns, ChunkManagerPtr manager = ChunkManagerPtr() );
+ ShardConnection( const string& addr , const string& ns, ChunkManagerPtr manager = ChunkManagerPtr() );
+
+ ~ShardConnection();
+
+ void done();
+ void kill();
+
+ DBClientBase& conn() {
+ _finishInit();
+ assert( _conn );
+ return *_conn;
+ }
+
+ DBClientBase* operator->() {
+ _finishInit();
+ assert( _conn );
+ return _conn;
+ }
+
+ DBClientBase* get() {
+ _finishInit();
+ assert( _conn );
+ return _conn;
+ }
+
+ string getHost() const {
+ return _addr;
+ }
+
+ string getNS() const {
+ return _ns;
+ }
+
+ ChunkManagerPtr getManager() const {
+ return _manager;
+ }
+
+ bool setVersion() {
+ _finishInit();
+ return _setVersion;
+ }
+
+ static void sync();
+
+ void donotCheckVersion() {
+ _setVersion = false;
+ _finishedInit = true;
+ }
+
+ bool ok() const { return _conn > 0; }
+
+ /**
+ this just passes through, except it checks for stale configs
+ */
+ bool runCommand( const string& db , const BSONObj& cmd , BSONObj& res );
+
+ /** checks all of my thread local connections for the version of this ns */
+ static void checkMyConnectionVersions( const string & ns );
+
+ private:
+ void _init();
+ void _finishInit();
+
+ bool _finishedInit;
+
+ string _addr;
+ string _ns;
+ ChunkManagerPtr _manager;
+
+ DBClientBase* _conn;
+ bool _setVersion;
+ };
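+
+ // Typical usage (see e.g. security.cpp):
+ // ShardConnection conn( shard , ns );
+ // BSONObj doc = conn->findOne( ns , query ); // version check happens lazily in _finishInit
+ // conn.done(); // return the connection to the pool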
+
+
+ extern DBConnectionPool shardConnectionPool;
+
+ class ShardingConnectionHook : public DBConnectionHook {
+ public:
+
+ ShardingConnectionHook( bool shardedConnections )
+ : _shardedConnections( shardedConnections ) {
+ }
+
+ virtual void onCreate( DBClientBase * conn );
+ virtual void onHandedOut( DBClientBase * conn );
+ virtual void onDestroy( DBClientBase * conn );
+
+ bool _shardedConnections;
+ };
+}
diff --git a/src/mongo/s/shard_version.cpp b/src/mongo/s/shard_version.cpp
new file mode 100644
index 00000000000..a80b339d858
--- /dev/null
+++ b/src/mongo/s/shard_version.cpp
@@ -0,0 +1,269 @@
+// @file shard_version.cpp
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "chunk.h"
+#include "config.h"
+#include "grid.h"
+#include "util.h"
+#include "shard.h"
+#include "writeback_listener.h"
+
+#include "shard_version.h"
+
+namespace mongo {
+
+ // Global version manager
+ VersionManager versionManager;
+
+ // when running in sharded mode, use chunk shard version control
+ struct ConnectionShardStatus {
+
+ typedef unsigned long long S;
+
+ ConnectionShardStatus()
+ : _mutex( "ConnectionShardStatus" ) {
+ }
+
+ S getSequence( DBClientBase * conn , const string& ns ) {
+ scoped_lock lk( _mutex );
+ return _map[conn][ns];
+ }
+
+ void setSequence( DBClientBase * conn , const string& ns , const S& s ) {
+ scoped_lock lk( _mutex );
+ _map[conn][ns] = s;
+ }
+
+ void reset( DBClientBase * conn ) {
+ scoped_lock lk( _mutex );
+ _map.erase( conn );
+ }
+
+ // protects _map
+ mongo::mutex _mutex;
+
+ // a map from a connection to the ChunkManager's sequence number for each namespace
+ map<DBClientBase*, map<string,unsigned long long> > _map;
+
+ } connectionShardStatus;
+
+ void VersionManager::resetShardVersionCB( DBClientBase * conn ) {
+ connectionShardStatus.reset( conn );
+ }
+
+ bool VersionManager::isVersionableCB( DBClientBase* conn ){
+ return conn->type() == ConnectionString::MASTER || conn->type() == ConnectionString::SET;
+ }
+
+ DBClientBase* getVersionable( DBClientBase* conn ){
+
+ switch ( conn->type() ) {
+ case ConnectionString::INVALID:
+ massert( 15904, str::stream() << "cannot set version on invalid connection " << conn->toString(), false );
+ return NULL;
+ case ConnectionString::MASTER:
+ return conn;
+ case ConnectionString::PAIR:
+ massert( 15905, str::stream() << "cannot set version or shard on pair connection " << conn->toString(), false );
+ return NULL;
+ case ConnectionString::SYNC:
+ massert( 15906, str::stream() << "cannot set version or shard on sync connection " << conn->toString(), false );
+ return NULL;
+ case ConnectionString::SET:
+ DBClientReplicaSet* set = (DBClientReplicaSet*) conn;
+ return &( set->masterConn() );
+ }
+
+ assert( false );
+ return NULL;
+ }
+
+ extern OID serverID;
+
+ bool VersionManager::initShardVersionCB( DBClientBase * conn_in, BSONObj& result ){
+
+ WriteBackListener::init( *conn_in );
+
+ DBClientBase* conn = getVersionable( conn_in );
+ assert( conn ); // errors thrown above
+
+ BSONObjBuilder cmdBuilder;
+
+ cmdBuilder.append( "setShardVersion" , "" );
+ cmdBuilder.appendBool( "init", true );
+ cmdBuilder.append( "configdb" , configServer.modelServer() );
+ cmdBuilder.appendOID( "serverID" , &serverID );
+ cmdBuilder.appendBool( "authoritative" , true );
+
+ BSONObj cmd = cmdBuilder.obj();
+
+ LOG(1) << "initializing shard connection to " << conn->toString() << endl;
+ LOG(2) << "initial sharding settings : " << cmd << endl;
+
+ bool ok = conn->runCommand( "admin" , cmd , result );
+
+ // HACK for backwards compatibility with v1.8.x, v2.0.0 and v2.0.1
+ // Result is false, but will still initialize serverID and configdb
+ if( ! ok && ! result["errmsg"].eoo() && ( result["errmsg"].String() == "need to specify namespace"/* 2.0.1/2 */ ||
+ result["errmsg"].String() == "need to speciy namespace" /* 1.8 */ ))
+ {
+ ok = true;
+ }
+
+ LOG(3) << "initial sharding result : " << result << endl;
+
+ return ok;
+
+ }
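+
+ // For reference, the init command built above is:
+ // { setShardVersion: "", init: true, configdb: "<config server address>",
+ // serverID: <OID>, authoritative: true }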
+
+ bool VersionManager::forceRemoteCheckShardVersionCB( const string& ns ){
+
+ DBConfigPtr conf = grid.getDBConfig( ns );
+ if ( ! conf ) return false;
+ conf->reload();
+
+ ChunkManagerPtr manager = conf->getChunkManagerIfExists( ns, true, true );
+ if( ! manager ) return false;
+
+ return true;
+
+ }
+
+ /**
+ * @return true if had to do something
+ */
+ bool checkShardVersion( DBClientBase * conn_in , const string& ns , ChunkManagerPtr refManager, bool authoritative , int tryNumber ) {
+ // TODO: cache, optimize, etc...
+
+ WriteBackListener::init( *conn_in );
+
+ DBConfigPtr conf = grid.getDBConfig( ns );
+ if ( ! conf )
+ return false;
+
+ DBClientBase* conn = getVersionable( conn_in );
+ assert(conn); // errors thrown above
+
+ unsigned long long officialSequenceNumber = 0;
+
+ ChunkManagerPtr manager;
+ const bool isSharded = conf->isSharded( ns );
+ if ( isSharded ) {
+ manager = conf->getChunkManagerIfExists( ns , authoritative );
+ // It's possible the chunk manager was reset since we checked whether sharded was true,
+ // so must check this here.
+ if( manager ) officialSequenceNumber = manager->getSequenceNumber();
+ }
+
+ // Check this manager against the reference manager
+ if( isSharded && manager ){
+
+ Shard shard = Shard::make( conn->getServerAddress() );
+ if( refManager && ! refManager->compatibleWith( manager, shard ) ){
+ throw SendStaleConfigException( ns, str::stream() << "manager (" << manager->getVersion( shard ) << " : " << manager->getSequenceNumber() << ") "
+ << "not compatible with reference manager (" << refManager->getVersion( shard ) << " : " << refManager->getSequenceNumber() << ") "
+ << "on shard " << shard.getName() << " (" << shard.getAddress().toString() << ")" );
+ }
+ }
+ else if( refManager ){
+ throw SendStaleConfigException( ns, str::stream() << "not sharded (" << ( (manager.get() == 0) ? ( str::stream() << manager->getSequenceNumber() << ") " ) : (string)"<none>) " ) <<
+ "but has reference manager (" << refManager->getSequenceNumber() << ") "
+ << "on conn " << conn->getServerAddress() << " (" << conn_in->getServerAddress() << ")" );
+ }
+
+ // has the ChunkManager been reloaded since the last time we updated the connection-level version?
+ // (ie., last time we issued the setShardVersions below)
+ unsigned long long sequenceNumber = connectionShardStatus.getSequence(conn,ns);
+ if ( sequenceNumber == officialSequenceNumber ) {
+ return false;
+ }
+
+
+ ShardChunkVersion version = 0;
+ if ( isSharded && manager ) {
+ version = manager->getVersion( Shard::make( conn->getServerAddress() ) );
+ }
+
+ if( version == 0 ){
+ LOG(0) << "resetting shard version of " << ns << " on " << conn->getServerAddress() << ", " <<
+ ( ! isSharded ? "no longer sharded" :
+ ( ! manager ? "no chunk manager found" :
+ "version is zero" ) ) << endl;
+ }
+
+
+ LOG(2) << " have to set shard version for conn: " << conn << " ns:" << ns
+ << " my last seq: " << sequenceNumber << " current: " << officialSequenceNumber
+ << " version: " << version << " manager: " << manager.get()
+ << endl;
+
+ BSONObj result;
+ if ( setShardVersion( *conn , ns , version , authoritative , result ) ) {
+ // success!
+ LOG(1) << " setShardVersion success: " << result << endl;
+ connectionShardStatus.setSequence( conn , ns , officialSequenceNumber );
+ return true;
+ }
+
+ LOG(1) << " setShardVersion failed!\n" << result << endl;
+
+ if ( result["need_authoritative"].trueValue() )
+ massert( 10428 , "need_authoritative set but in authoritative mode already" , ! authoritative );
+
+ if ( ! authoritative ) {
+ checkShardVersion( conn , ns , refManager, true , tryNumber + 1 );
+ return true;
+ }
+
+ if ( result["reloadConfig"].trueValue() ) {
+ if( result["version"].timestampTime() == 0 ){
+ // reload db
+ conf->reload();
+ }
+ else {
+ // reload config
+ conf->getChunkManager( ns , true );
+ }
+ }
+
+ const int maxNumTries = 7;
+ if ( tryNumber < maxNumTries ) {
+ LOG( tryNumber < ( maxNumTries / 2 ) ? 1 : 0 )
+ << "going to retry checkShardVersion host: " << conn->getServerAddress() << " " << result << endl;
+ sleepmillis( 10 * tryNumber );
+ checkShardVersion( conn , ns , refManager, true , tryNumber + 1 );
+ return true;
+ }
+
+ string errmsg = str::stream() << "setShardVersion failed host: " << conn->getServerAddress() << " " << result;
+ log() << " " << errmsg << endl;
+ massert( 10429 , errmsg , 0 );
+ return true;
+ }
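+
+ // Retry summary: on a non-authoritative failure we immediately retry
+ // authoritatively; otherwise we retry up to maxNumTries (7) times with a
+ // 10ms * tryNumber backoff, and massert 10429 once retries are exhausted.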
+
+ bool VersionManager::checkShardVersionCB( DBClientBase* conn_in , const string& ns , bool authoritative , int tryNumber ) {
+ return checkShardVersion( conn_in, ns, ChunkManagerPtr(), authoritative, tryNumber );
+ }
+
+ bool VersionManager::checkShardVersionCB( ShardConnection* conn_in , bool authoritative , int tryNumber ) {
+ return checkShardVersion( conn_in->get(), conn_in->getNS(), conn_in->getManager(), authoritative, tryNumber );
+ }
+
+} // namespace mongo
diff --git a/src/mongo/s/shard_version.h b/src/mongo/s/shard_version.h
new file mode 100644
index 00000000000..98cacf67af2
--- /dev/null
+++ b/src/mongo/s/shard_version.h
@@ -0,0 +1,32 @@
+// @file shard_version.h
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ /*
+ * Install chunk shard version callbacks in shardconnection code. This activates
+ * the chunk shard version control that mongos needs.
+ *
+ * MUST be called before accepting any connections.
+ */
+ void installChunkShardVersioning();
+
+
+} // namespace mongo
diff --git a/src/mongo/s/shardconnection.cpp b/src/mongo/s/shardconnection.cpp
new file mode 100644
index 00000000000..5db8a67b736
--- /dev/null
+++ b/src/mongo/s/shardconnection.cpp
@@ -0,0 +1,248 @@
+// shardconnection.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "shard.h"
+#include "config.h"
+#include "request.h"
+#include <set>
+
+namespace mongo {
+
+ DBConnectionPool shardConnectionPool;
+
+ /**
+ * holds all the actual db connections for a client to various servers
+ * 1 per thread, so doesn't have to be thread safe
+ */
+ class ClientConnections : boost::noncopyable {
+ public:
+ struct Status : boost::noncopyable {
+ Status() : created(0), avail(0) {}
+
+ long long created;
+ DBClientBase* avail;
+ };
+
+
+ ClientConnections() {}
+
+ ~ClientConnections() {
+ for ( HostMap::iterator i=_hosts.begin(); i!=_hosts.end(); ++i ) {
+ string addr = i->first;
+ Status* ss = i->second;
+ assert( ss );
+ if ( ss->avail ) {
+ /* if we're shutting down, don't want to initiate release mechanism as it is slow,
+ and isn't needed since all connections will be closed anyway */
+ if ( inShutdown() ) {
+ if( versionManager.isVersionableCB( ss->avail ) ) versionManager.resetShardVersionCB( ss->avail );
+ delete ss->avail;
+ }
+ else
+ release( addr , ss->avail );
+ ss->avail = 0;
+ }
+ delete ss;
+ }
+ _hosts.clear();
+ }
+
+ DBClientBase * get( const string& addr , const string& ns ) {
+ _check( ns );
+
+ Status* &s = _hosts[addr];
+ if ( ! s )
+ s = new Status();
+
+ if ( s->avail ) {
+ DBClientBase* c = s->avail;
+ s->avail = 0;
+ try {
+ shardConnectionPool.onHandedOut( c );
+ }
+ catch ( std::exception& ) {
+ delete c;
+ throw;
+ }
+ return c;
+ }
+
+ s->created++;
+ return shardConnectionPool.get( addr );
+ }
+
+ void done( const string& addr , DBClientBase* conn ) {
+ Status* s = _hosts[addr];
+ assert( s );
+ if ( s->avail ) {
+ release( addr , conn );
+ return;
+ }
+ s->avail = conn;
+ }
+
+ void sync() {
+ for ( HostMap::iterator i=_hosts.begin(); i!=_hosts.end(); ++i ) {
+ string addr = i->first;
+ Status* ss = i->second;
+ if ( ss->avail )
+ ss->avail->getLastError();
+
+ }
+ }
+
+ void checkVersions( const string& ns ) {
+
+ vector<Shard> all;
+ Shard::getAllShards( all );
+
+ // Now only check top-level shard connections
+ for ( unsigned i=0; i<all.size(); i++ ) {
+
+ string sconnString = all[i].getConnString();
+ Status* &s = _hosts[sconnString];
+
+ if ( ! s ){
+ s = new Status();
+ }
+
+ if( ! s->avail )
+ s->avail = shardConnectionPool.get( sconnString );
+
+ versionManager.checkShardVersionCB( s->avail, ns, false, 1 );
+
+ }
+ }
+
+ void release( const string& addr , DBClientBase * conn ) {
+ shardConnectionPool.release( addr , conn );
+ }
+
+ void _check( const string& ns ) {
+ if ( ns.size() == 0 || _seenNS.count( ns ) )
+ return;
+ _seenNS.insert( ns );
+ checkVersions( ns );
+ }
+
+ typedef map<string,Status*,DBConnectionPool::serverNameCompare> HostMap;
+ HostMap _hosts;
+ set<string> _seenNS;
+ // -----
+
+ static thread_specific_ptr<ClientConnections> _perThread;
+
+ static ClientConnections* threadInstance() {
+ ClientConnections* cc = _perThread.get();
+ if ( ! cc ) {
+ cc = new ClientConnections();
+ _perThread.reset( cc );
+ }
+ return cc;
+ }
+ };
+
+ thread_specific_ptr<ClientConnections> ClientConnections::_perThread;
+
+ ShardConnection::ShardConnection( const Shard * s , const string& ns, ChunkManagerPtr manager )
+ : _addr( s->getConnString() ) , _ns( ns ), _manager( manager ) {
+ _init();
+ }
+
+ ShardConnection::ShardConnection( const Shard& s , const string& ns, ChunkManagerPtr manager )
+ : _addr( s.getConnString() ) , _ns( ns ), _manager( manager ) {
+ _init();
+ }
+
+ ShardConnection::ShardConnection( const string& addr , const string& ns, ChunkManagerPtr manager )
+ : _addr( addr ) , _ns( ns ), _manager( manager ) {
+ _init();
+ }
+
+ void ShardConnection::_init() {
+ assert( _addr.size() );
+ _conn = ClientConnections::threadInstance()->get( _addr , _ns );
+ _finishedInit = false;
+ }
+
+ void ShardConnection::_finishInit() {
+ if ( _finishedInit )
+ return;
+ _finishedInit = true;
+
+ if ( _ns.size() && versionManager.isVersionableCB( _conn ) ) {
+ // Make sure we specified a manager for the correct namespace
+ if( _manager ) assert( _manager->getns() == _ns );
+ _setVersion = versionManager.checkShardVersionCB( this , false , 1 );
+ }
+ else {
+ // Make sure we didn't specify a manager for an empty namespace
+ assert( ! _manager );
+ _setVersion = false;
+ }
+
+ }
+
+ void ShardConnection::done() {
+ if ( _conn ) {
+ ClientConnections::threadInstance()->done( _addr , _conn );
+ _conn = 0;
+ _finishedInit = true;
+ }
+ }
+
+ void ShardConnection::kill() {
+ if ( _conn ) {
+ if( versionManager.isVersionableCB( _conn ) ) versionManager.resetShardVersionCB( _conn );
+ delete _conn;
+ _conn = 0;
+ _finishedInit = true;
+ }
+ }
+
+ void ShardConnection::sync() {
+ ClientConnections::threadInstance()->sync();
+ }
+
+ bool ShardConnection::runCommand( const string& db , const BSONObj& cmd , BSONObj& res ) {
+ assert( _conn );
+ bool ok = _conn->runCommand( db , cmd , res );
+ if ( ! ok ) {
+ if ( res["code"].numberInt() == SendStaleConfigCode ) {
+ done();
+ throw RecvStaleConfigException( res["ns"].String() , res["errmsg"].String() );
+ }
+ }
+ return ok;
+ }
+
+ void ShardConnection::checkMyConnectionVersions( const string & ns ) {
+ ClientConnections::threadInstance()->checkVersions( ns );
+ }
+
+ ShardConnection::~ShardConnection() {
+ if ( _conn ) {
+ if ( ! _conn->isFailed() ) {
+                /* log this so connections that were never done()'d or kill()ed are visible */
+ log() << "~ScopedDBConnection: _conn != null" << endl;
+ }
+ kill();
+ }
+ }
+}
diff --git a/src/mongo/s/shardkey.cpp b/src/mongo/s/shardkey.cpp
new file mode 100644
index 00000000000..365435ef5ea
--- /dev/null
+++ b/src/mongo/s/shardkey.cpp
@@ -0,0 +1,273 @@
+// shardkey.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "chunk.h"
+#include "../db/jsobj.h"
+#include "../util/unittest.h"
+#include "../util/timer.h"
+
+namespace mongo {
+
+ ShardKeyPattern::ShardKeyPattern( BSONObj p ) : pattern( p.getOwned() ) {
+ pattern.getFieldNames(patternfields);
+
+ BSONObjBuilder min;
+ BSONObjBuilder max;
+
+ BSONObjIterator it(p);
+ while (it.more()) {
+ BSONElement e (it.next());
+ min.appendMinKey(e.fieldName());
+ max.appendMaxKey(e.fieldName());
+ }
+
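+        // e.g. for pattern { a:1, b:1 } this yields gMin = { a:MinKey, b:MinKey }
+        // and gMax = { a:MaxKey, b:MaxKey }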
+ gMin = min.obj();
+ gMax = max.obj();
+ }
+
+ int ShardKeyPattern::compare( const BSONObj& lObject , const BSONObj& rObject ) const {
+ BSONObj L = extractKey(lObject);
+ uassert( 10198 , str::stream() << "left object (" << lObject << ") doesn't have full shard key (" << pattern << ')',
+ L.nFields() == (int)patternfields.size());
+ BSONObj R = extractKey(rObject);
+ uassert( 10199 , str::stream() << "right object (" << rObject << ") doesn't have full shard key (" << pattern << ')',
+ R.nFields() == (int)patternfields.size());
+ return L.woCompare(R);
+ }
+
+ bool ShardKeyPattern::hasShardKey( const BSONObj& obj ) const {
+        /* this is written so that if obj has lots of fields but the shard key fields
+           come early, it is fast. hence a bit more work to try to be semi-fast.
+        */
+
+ for(set<string>::const_iterator it = patternfields.begin(); it != patternfields.end(); ++it) {
+ BSONElement e = obj.getFieldDotted(it->c_str());
+ if(e.eoo() || e.type() == Array)
+ return false;
+ }
+ return true;
+ }
+
+ bool ShardKeyPattern::isPrefixOf( const BSONObj& otherPattern ) const {
+ BSONObjIterator a( pattern );
+ BSONObjIterator b( otherPattern );
+
+ while ( a.more() && b.more() ) {
+ BSONElement x = a.next();
+ BSONElement y = b.next();
+ if ( strcmp( x.fieldName() , y.fieldName() ) )
+ return false;
+ }
+
+ return ! a.more();
+ }
+
+ string ShardKeyPattern::toString() const {
+ return pattern.toString();
+ }
+
+ BSONObj ShardKeyPattern::moveToFront(const BSONObj& obj) const {
+ vector<const char*> keysToMove;
+ keysToMove.push_back("_id");
+ BSONForEach(e, pattern) {
+ if (strchr(e.fieldName(), '.') == NULL && strcmp(e.fieldName(), "_id") != 0)
+ keysToMove.push_back(e.fieldName());
+ }
+
+ if (keysToMove.size() == 1) {
+ return obj;
+
+ }
+ else {
+ BufBuilder buf (obj.objsize());
+ buf.appendNum((unsigned)0); // refcount
+ buf.appendNum(obj.objsize());
+
+ vector<pair<const char*, size_t> > copies;
+            pair<const char*, size_t> toCopy ((const char*)NULL, 0); // bare NULL isn't a pointer type (no nullptr pre-C++11), hence the cast
+
+ BSONForEach(e, obj) {
+ bool moveToFront = false;
+ for (vector<const char*>::const_iterator it(keysToMove.begin()), end(keysToMove.end()); it!=end; ++it) {
+ if (strcmp(e.fieldName(), *it) == 0) {
+ moveToFront = true;
+ break;
+ }
+ }
+
+ if (moveToFront) {
+ buf.appendBuf(e.fieldName()-1, e.size());
+ if (toCopy.first) {
+ copies.push_back(toCopy);
+ toCopy.first = NULL;
+ }
+ }
+ else {
+ if (!toCopy.first) {
+ toCopy.first = e.fieldName()-1;
+ toCopy.second = e.size();
+ }
+ else {
+ toCopy.second += e.size();
+ }
+ }
+ }
+
+ for (vector<pair<const char*, size_t> >::const_iterator it(copies.begin()), end(copies.end()); it!=end; ++it) {
+ buf.appendBuf(it->first, it->second);
+ }
+
+ if (toCopy.first) {
+ buf.appendBuf(toCopy.first, toCopy.second);
+ }
+
+ buf.appendChar('\0');
+
+ BSONObj out ((BSONObj::Holder*)buf.buf());
+ buf.decouple();
+ return out;
+ }
+ }
+
+    /* things to test for compound keys:
+       - middle (deprecating?)
+    */
+ class ShardKeyUnitTest : public UnitTest {
+ public:
+
+ void testIsPrefixOf() {
+ {
+ ShardKeyPattern k( BSON( "x" << 1 ) );
+ assert( ! k.isPrefixOf( BSON( "a" << 1 ) ) );
+ assert( k.isPrefixOf( BSON( "x" << 1 ) ) );
+ assert( k.isPrefixOf( BSON( "x" << 1 << "a" << 1 ) ) );
+ assert( ! k.isPrefixOf( BSON( "a" << 1 << "x" << 1 ) ) );
+ }
+ {
+ ShardKeyPattern k( BSON( "x" << 1 << "y" << 1 ) );
+ assert( ! k.isPrefixOf( BSON( "x" << 1 ) ) );
+ assert( ! k.isPrefixOf( BSON( "x" << 1 << "z" << 1 ) ) );
+ assert( k.isPrefixOf( BSON( "x" << 1 << "y" << 1 ) ) );
+ assert( k.isPrefixOf( BSON( "x" << 1 << "y" << 1 << "z" << 1 ) ) );
+ }
+ }
+
+ void hasshardkeytest() {
+ BSONObj x = fromjson("{ zid : \"abcdefg\", num: 1.0, name: \"eliot\" }");
+ ShardKeyPattern k( BSON( "num" << 1 ) );
+ assert( k.hasShardKey(x) );
+ assert( !k.hasShardKey( fromjson("{foo:'a'}") ) );
+
+ // try compound key
+ {
+ ShardKeyPattern k( fromjson("{a:1,b:-1,c:1}") );
+ assert( k.hasShardKey( fromjson("{foo:'a',a:'b',c:'z',b:9,k:99}") ) );
+ assert( !k.hasShardKey( fromjson("{foo:'a',a:'b',c:'z',bb:9,k:99}") ) );
+ assert( !k.hasShardKey( fromjson("{k:99}") ) );
+ }
+
+ }
+
+ void extractkeytest() {
+ ShardKeyPattern k( fromjson("{a:1,'sub.b':-1,'sub.c':1}") );
+
+ BSONObj x = fromjson("{a:1,'sub.b':2,'sub.c':3}");
+ assert( k.extractKey( fromjson("{a:1,sub:{b:2,c:3}}") ).binaryEqual(x) );
+ assert( k.extractKey( fromjson("{sub:{b:2,c:3},a:1}") ).binaryEqual(x) );
+ }
+ void moveToFrontTest() {
+ ShardKeyPattern sk (BSON("a" << 1 << "b" << 1));
+
+ BSONObj ret;
+
+ ret = sk.moveToFront(BSON("z" << 1 << "_id" << 1 << "y" << 1 << "a" << 1 << "x" << 1 << "b" << 1 << "w" << 1));
+ assert(ret.binaryEqual(BSON("_id" << 1 << "a" << 1 << "b" << 1 << "z" << 1 << "y" << 1 << "x" << 1 << "w" << 1)));
+
+ ret = sk.moveToFront(BSON("_id" << 1 << "a" << 1 << "b" << 1 << "z" << 1 << "y" << 1 << "x" << 1 << "w" << 1));
+ assert(ret.binaryEqual(BSON("_id" << 1 << "a" << 1 << "b" << 1 << "z" << 1 << "y" << 1 << "x" << 1 << "w" << 1)));
+
+ ret = sk.moveToFront(BSON("z" << 1 << "y" << 1 << "a" << 1 << "b" << 1 << "Z" << 1 << "Y" << 1));
+ assert(ret.binaryEqual(BSON("a" << 1 << "b" << 1 << "z" << 1 << "y" << 1 << "Z" << 1 << "Y" << 1)));
+
+ }
+
+ void moveToFrontBenchmark(int numFields) {
+ BSONObjBuilder bb;
+ bb.append("_id", 1);
+ for (int i=0; i < numFields; i++)
+ bb.append(BSONObjBuilder::numStr(i), 1);
+ bb.append("key", 1);
+ BSONObj o = bb.obj();
+
+ ShardKeyPattern sk (BSON("key" << 1));
+
+ Timer t;
+ const int iterations = 100*1000;
+ for (int i=0; i< iterations; i++) {
+ sk.moveToFront(o);
+ }
+
+ const double secs = t.micros() / 1000000.0;
+ const double ops_per_sec = iterations / secs;
+
+ cout << "moveToFront (" << numFields << " fields) secs: " << secs << " ops_per_sec: " << ops_per_sec << endl;
+ }
+ void run() {
+ extractkeytest();
+
+ ShardKeyPattern k( BSON( "key" << 1 ) );
+
+ BSONObj min = k.globalMin();
+
+// cout << min.jsonString(TenGen) << endl;
+
+ BSONObj max = k.globalMax();
+
+ BSONObj k1 = BSON( "key" << 5 );
+
+ assert( k.compare( min , max ) < 0 );
+ assert( k.compare( min , k1 ) < 0 );
+ assert( k.compare( max , min ) > 0 );
+ assert( k.compare( min , min ) == 0 );
+
+ hasshardkeytest();
+ assert( k.hasShardKey( k1 ) );
+ assert( ! k.hasShardKey( BSON( "key2" << 1 ) ) );
+
+ BSONObj a = k1;
+ BSONObj b = BSON( "key" << 999 );
+
+ assert( k.compare(a,b) < 0 );
+
+ testIsPrefixOf();
+ // add middle multitype tests
+
+ moveToFrontTest();
+
+ if (0) { // toggle to run benchmark
+ moveToFrontBenchmark(0);
+ moveToFrontBenchmark(10);
+ moveToFrontBenchmark(100);
+ }
+
+ LOG(1) << "shardKeyTest passed" << endl;
+ }
+ } shardKeyTest;
+
+} // namespace mongo
diff --git a/src/mongo/s/shardkey.h b/src/mongo/s/shardkey.h
new file mode 100644
index 00000000000..976cff09591
--- /dev/null
+++ b/src/mongo/s/shardkey.h
@@ -0,0 +1,124 @@
+// shardkey.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../client/dbclient.h"
+
+namespace mongo {
+
+ class Chunk;
+
+ /* A ShardKeyPattern is a pattern indicating what data to extract from the object to make the shard key from.
+ Analogous to an index key pattern.
+ */
+ class ShardKeyPattern {
+ public:
+ ShardKeyPattern( BSONObj p = BSONObj() );
+
+ /**
+ global min is the lowest possible value for this key
+ e.g. { num : MinKey }
+ */
+ BSONObj globalMin() const { return gMin; }
+
+ /**
+ global max is the highest possible value for this key
+ */
+ BSONObj globalMax() const { return gMax; }
+
+ bool isGlobalMin( const BSONObj& k ) const {
+ return k.woCompare( globalMin() ) == 0;
+ }
+
+ bool isGlobalMax( const BSONObj& k ) const {
+ return k.woCompare( globalMax() ) == 0;
+ }
+
+ bool isGlobal( const BSONObj& k ) const {
+ return isGlobalMin( k ) || isGlobalMax( k );
+ }
+
+ /** compare shard keys from the objects specified
+ l < r negative
+ l == r 0
+ l > r positive
+ */
+ int compare( const BSONObj& l , const BSONObj& r ) const;
+
+ /**
+ @return whether or not obj has all fields in this shard key pattern
+ e.g.
+ ShardKey({num:1}).hasShardKey({ name:"joe", num:3 }) is true
+ */
+ bool hasShardKey( const BSONObj& obj ) const;
+
+ BSONObj key() const { return pattern; }
+
+ string toString() const;
+
+ BSONObj extractKey(const BSONObj& from) const;
+
+ bool partOfShardKey(const char* key ) const {
+ return pattern.hasField(key);
+ }
+ bool partOfShardKey(const string& key ) const {
+ return pattern.hasField(key.c_str());
+ }
+
+ /**
+ * @return
+         * true if the fields of 'this' pattern are a name-wise prefix of 'otherPattern'
+         * (field names are compared in order; their values are ignored).
+ */
+ bool isPrefixOf( const BSONObj& otherPattern ) const;
+
+ /**
+ * @return BSONObj with _id and shardkey at front. May return original object.
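+         * e.g. for pattern { a:1 }, moveToFront( { z:1, _id:7, a:5 } ) yields
+         * { _id:7, a:5, z:1 } (cf. moveToFrontTest in shardkey.cpp).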
+ */
+ BSONObj moveToFront(const BSONObj& obj) const;
+
+ private:
+ BSONObj pattern;
+ BSONObj gMin;
+ BSONObj gMax;
+
+        /* question: better to have patternfields precomputed or not? depends on whether we use the copy constructor often. */
+ set<string> patternfields;
+ };
+
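+    // e.g. for pattern { a:1, b:1 }, extractKey( fromjson("{b:2,a:1,c:3}") ) sees the
+    // field order differ from the pattern and falls back to extractFields(), giving
+    // { a:1, b:2 } (illustrative; cf. extractkeytest in shardkey.cpp)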
+ inline BSONObj ShardKeyPattern::extractKey(const BSONObj& from) const {
+ BSONObj k = from;
+ bool needExtraction = false;
+
+ BSONObjIterator a(from);
+ BSONObjIterator b(pattern);
+ while (a.more() && b.more()){
+ if (strcmp(a.next().fieldName(), b.next().fieldName()) != 0){
+ needExtraction = true;
+ break;
+ }
+ }
+
+ if (needExtraction || a.more() != b.more())
+ k = from.extractFields(pattern);
+
+ uassert(13334, "Shard Key must be less than 512 bytes", k.objsize() < 512);
+ return k;
+ }
+
+}
diff --git a/src/mongo/s/stats.cpp b/src/mongo/s/stats.cpp
new file mode 100644
index 00000000000..460ada3ccd6
--- /dev/null
+++ b/src/mongo/s/stats.cpp
@@ -0,0 +1,28 @@
+// stats.cpp
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "stats.h"
+
+namespace mongo {
+
+ OpCounters opsNonSharded;
+ OpCounters opsSharded;
+
+ GenericCounter shardedCursorTypes;
+}
diff --git a/src/mongo/s/stats.h b/src/mongo/s/stats.h
new file mode 100644
index 00000000000..a7cc784e981
--- /dev/null
+++ b/src/mongo/s/stats.h
@@ -0,0 +1,30 @@
+// stats.h
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../pch.h"
+#include "../db/stats/counters.h"
+
+namespace mongo {
+
+ extern OpCounters opsNonSharded;
+ extern OpCounters opsSharded;
+
+ extern GenericCounter shardedCursorTypes;
+}
diff --git a/src/mongo/s/strategy.cpp b/src/mongo/s/strategy.cpp
new file mode 100644
index 00000000000..6f02c4183b1
--- /dev/null
+++ b/src/mongo/s/strategy.cpp
@@ -0,0 +1,111 @@
+// @file strategy.cpp
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "../client/connpool.h"
+#include "../db/commands.h"
+
+#include "grid.h"
+#include "request.h"
+#include "server.h"
+#include "writeback_listener.h"
+
+#include "strategy.h"
+
+namespace mongo {
+
+ // ----- Strategy ------
+
+ void Strategy::doWrite( int op , Request& r , const Shard& shard , bool checkVersion ) {
+ ShardConnection conn( shard , r.getns() );
+ if ( ! checkVersion )
+ conn.donotCheckVersion();
+ else if ( conn.setVersion() ) {
+ conn.done();
+ throw RecvStaleConfigException( r.getns() , "doWrite" , true );
+ }
+ conn->say( r.m() );
+ conn.done();
+ }
+
+ void Strategy::doQuery( Request& r , const Shard& shard ) {
+
+ r.checkAuth( Auth::READ );
+
+ ShardConnection dbcon( shard , r.getns() );
+ DBClientBase &c = dbcon.conn();
+
+ string actualServer;
+
+ Message response;
+ bool ok = c.call( r.m(), response, true , &actualServer );
+ uassert( 10200 , "mongos: error calling db", ok );
+
+ {
+ QueryResult *qr = (QueryResult *) response.singleData();
+ if ( qr->resultFlags() & ResultFlag_ShardConfigStale ) {
+ dbcon.done();
+ throw RecvStaleConfigException( r.getns() , "Strategy::doQuery" );
+ }
+ }
+
+ r.reply( response , actualServer.size() ? actualServer : c.getServerAddress() );
+ dbcon.done();
+ }
+
+ void Strategy::insert( const Shard& shard , const char * ns , const BSONObj& obj , int flags, bool safe ) {
+ ShardConnection dbcon( shard , ns );
+ if ( dbcon.setVersion() ) {
+ dbcon.done();
+ throw RecvStaleConfigException( ns , "for insert" );
+ }
+ dbcon->insert( ns , obj , flags);
+ if (safe)
+ dbcon->getLastError();
+ dbcon.done();
+ }
+
+ void Strategy::insert( const Shard& shard , const char * ns , const vector<BSONObj>& v , int flags, bool safe ) {
+ ShardConnection dbcon( shard , ns );
+ if ( dbcon.setVersion() ) {
+ dbcon.done();
+ throw RecvStaleConfigException( ns , "for insert" );
+ }
+ dbcon->insert( ns , v , flags);
+ if (safe)
+ dbcon->getLastError();
+ dbcon.done();
+ }
+
+ void Strategy::update( const Shard& shard , const char * ns , const BSONObj& query , const BSONObj& toupdate , int flags, bool safe ) {
+ bool upsert = flags & UpdateOption_Upsert;
+ bool multi = flags & UpdateOption_Multi;
+
+ ShardConnection dbcon( shard , ns );
+ if ( dbcon.setVersion() ) {
+ dbcon.done();
+            throw RecvStaleConfigException( ns , "for update" );
+ }
+ dbcon->update( ns , query , toupdate, upsert, multi);
+ if (safe)
+ dbcon->getLastError();
+ dbcon.done();
+ }
+
+}
diff --git a/src/mongo/s/strategy.h b/src/mongo/s/strategy.h
new file mode 100644
index 00000000000..25c9b97630e
--- /dev/null
+++ b/src/mongo/s/strategy.h
@@ -0,0 +1,59 @@
+// strategy.h
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+#include "../pch.h"
+#include "chunk.h"
+#include "request.h"
+
+namespace mongo {
+
+ class Strategy {
+ public:
+ Strategy() {}
+ virtual ~Strategy() {}
+ virtual void queryOp( Request& r ) = 0;
+ virtual void getMore( Request& r ) = 0;
+ virtual void writeOp( int op , Request& r ) = 0;
+
+ void insert( const Shard& shard , const char * ns , const BSONObj& obj , int flags=0 , bool safe=false );
+
+ virtual void commandOp( const string& db, const BSONObj& command, int options,
+ const string& versionedNS, const BSONObj& filter,
+ map<Shard,BSONObj>& results )
+ {
+ // Only call this from sharded, for now.
+ // TODO: Refactor all this.
+ assert( false );
+ }
+
+ protected:
+ void doWrite( int op , Request& r , const Shard& shard , bool checkVersion = true );
+ void doQuery( Request& r , const Shard& shard );
+
+ void insert( const Shard& shard , const char * ns , const vector<BSONObj>& v , int flags=0 , bool safe=false );
+ void update( const Shard& shard , const char * ns , const BSONObj& query , const BSONObj& toupdate , int flags=0, bool safe=false );
+
+ };
+
+ extern Strategy * SINGLE;
+ extern Strategy * SHARDED;
+
+}
+
diff --git a/src/mongo/s/strategy_shard.cpp b/src/mongo/s/strategy_shard.cpp
new file mode 100644
index 00000000000..3f4c105204a
--- /dev/null
+++ b/src/mongo/s/strategy_shard.cpp
@@ -0,0 +1,414 @@
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// strategy_shard.cpp
+
+#include "pch.h"
+#include "request.h"
+#include "chunk.h"
+#include "cursors.h"
+#include "stats.h"
+#include "client.h"
+
+#include "../client/connpool.h"
+#include "../db/commands.h"
+
+// error codes 8010-8040
+
+namespace mongo {
+
+ class ShardStrategy : public Strategy {
+
+ virtual void queryOp( Request& r ) {
+
+ // TODO: These probably should just be handled here.
+ if ( r.isCommand() ) {
+ SINGLE->queryOp( r );
+ return;
+ }
+
+ QueryMessage q( r.d() );
+
+ r.checkAuth( Auth::READ );
+
+ LOG(3) << "shard query: " << q.ns << " " << q.query << endl;
+
+ if ( q.ntoreturn == 1 && strstr(q.ns, ".$cmd") )
+ throw UserException( 8010 , "something is wrong, shouldn't see a command here" );
+
+ QuerySpec qSpec( (string)q.ns, q.query, q.fields, q.ntoskip, q.ntoreturn, q.queryOptions );
+
+ ParallelSortClusteredCursor * cursor = new ParallelSortClusteredCursor( qSpec, CommandInfo() );
+ assert( cursor );
+
+ // TODO: Move out to Request itself, not strategy based
+ try {
+ long long start_millis = 0;
+ if ( qSpec.isExplain() ) start_millis = curTimeMillis64();
+ cursor->init();
+
+ LOG(5) << " cursor type: " << cursor->type() << endl;
+ shardedCursorTypes.hit( cursor->type() );
+
+ if ( qSpec.isExplain() ) {
+ // fetch elapsed time for the query
+ long long elapsed_millis = curTimeMillis64() - start_millis;
+ BSONObjBuilder explain_builder;
+ cursor->explain( explain_builder );
+ explain_builder.appendNumber( "millis", elapsed_millis );
+ BSONObj b = explain_builder.obj();
+
+ replyToQuery( 0 , r.p() , r.m() , b );
+ delete( cursor );
+ return;
+ }
+ }
+ catch(...) {
+ delete cursor;
+ throw;
+ }
+
+ if( cursor->isSharded() ){
+ ShardedClientCursorPtr cc (new ShardedClientCursor( q , cursor ));
+
+ if ( ! cc->sendNextBatch( r ) ) {
+ return;
+ }
+
+ LOG(6) << "storing cursor : " << cc->getId() << endl;
+ cursorCache.store( cc );
+ }
+ else{
+ // TODO: Better merge this logic. We potentially can now use the same cursor logic for everything.
+ ShardPtr primary = cursor->getPrimary();
+ DBClientCursorPtr shardCursor = cursor->getShardCursor( *primary );
+ r.reply( *(shardCursor->getMessage()) , primary->getConnString() );
+ }
+ }
+
+ virtual void commandOp( const string& db, const BSONObj& command, int options,
+ const string& versionedNS, const BSONObj& filter,
+ map<Shard,BSONObj>& results )
+ {
+
+ QuerySpec qSpec( db + ".$cmd", command, BSONObj(), 0, 1, options );
+
+ ParallelSortClusteredCursor cursor( qSpec, CommandInfo( versionedNS, filter ) );
+
+ // Initialize the cursor
+ cursor.init();
+
+ set<Shard> shards;
+ cursor.getQueryShards( shards );
+
+ for( set<Shard>::iterator i = shards.begin(), end = shards.end(); i != end; ++i ){
+ results[ *i ] = cursor.getShardCursor( *i )->peekFirst().getOwned();
+ }
+
+ }
+
+ virtual void getMore( Request& r ) {
+
+ // TODO: Handle stale config exceptions here from coll being dropped or sharded during op
+ // for now has same semantics as legacy request
+ ChunkManagerPtr info = r.getChunkManager();
+
+ if( ! info ){
+ SINGLE->getMore( r );
+ return;
+ }
+ else {
+ int ntoreturn = r.d().pullInt();
+ long long id = r.d().pullInt64();
+
+ LOG(6) << "want cursor : " << id << endl;
+
+ ShardedClientCursorPtr cursor = cursorCache.get( id );
+ if ( ! cursor ) {
+ LOG(6) << "\t invalid cursor :(" << endl;
+ replyToQuery( ResultFlag_CursorNotFound , r.p() , r.m() , 0 , 0 , 0 );
+ return;
+ }
+
+ if ( cursor->sendNextBatch( r , ntoreturn ) ) {
+ // still more data
+ cursor->accessed();
+ return;
+ }
+
+ // we've exhausted the cursor
+ cursorCache.remove( id );
+ }
+ }
+
+ void _insert( Request& r , DbMessage& d, ChunkManagerPtr manager ) {
+ const int flags = d.reservedField() | InsertOption_ContinueOnError; // ContinueOnError is always on when using sharding.
+ map<ChunkPtr, vector<BSONObj> > insertsForChunk; // Group bulk insert for appropriate shards
+ try {
+ while ( d.moreJSObjs() ) {
+ BSONObj o = d.nextJsObj();
+ if ( ! manager->hasShardKey( o ) ) {
+
+ bool bad = true;
+
+ if ( manager->getShardKey().partOfShardKey( "_id" ) ) {
+ BSONObjBuilder b;
+ b.appendOID( "_id" , 0 , true );
+ b.appendElements( o );
+ o = b.obj();
+ bad = ! manager->hasShardKey( o );
+ }
+
+ if ( bad ) {
+ log() << "tried to insert object with no valid shard key: " << r.getns() << " " << o << endl;
+ uasserted( 8011 , "tried to insert object with no valid shard key" );
+ }
+
+ }
+
+ // Many operations benefit from having the shard key early in the object
+ o = manager->getShardKey().moveToFront(o);
+ insertsForChunk[manager->findChunk(o)].push_back(o);
+ }
+ for (map<ChunkPtr, vector<BSONObj> >::iterator it = insertsForChunk.begin(); it != insertsForChunk.end(); ++it) {
+ ChunkPtr c = it->first;
+ vector<BSONObj> objs = it->second;
+ const int maxTries = 30;
+
+ bool gotThrough = false;
+ for ( int i=0; i<maxTries; i++ ) {
+ try {
+ LOG(4) << " server:" << c->getShard().toString() << " bulk insert " << objs.size() << " documents" << endl;
+ insert( c->getShard() , r.getns() , objs , flags);
+
+ int bytesWritten = 0;
+ for (vector<BSONObj>::iterator vecIt = objs.begin(); vecIt != objs.end(); ++vecIt) {
+ r.gotInsert(); // Record the correct number of individual inserts
+ bytesWritten += (*vecIt).objsize();
+ }
+ if ( r.getClientInfo()->autoSplitOk() )
+ c->splitIfShould( bytesWritten );
+ gotThrough = true;
+ break;
+ }
+ catch ( StaleConfigException& e ) {
+ int logLevel = i < ( maxTries / 2 );
+ LOG( logLevel ) << "retrying bulk insert of " << objs.size() << " documents because of StaleConfigException: " << e << endl;
+ r.reset();
+
+ manager = r.getChunkManager();
+ if( ! manager ) {
+ uasserted(14804, "collection no longer sharded");
+ }
+
+ unsigned long long old = manager->getSequenceNumber();
+
+ LOG( logLevel ) << " sequence number - old: " << old << " new: " << manager->getSequenceNumber() << endl;
+ }
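+                    // linear backoff between retries: 0 ms, 20 ms, 40 ms, ...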
+ sleepmillis( i * 20 );
+ }
+
+ assert( inShutdown() || gotThrough ); // not caught below
+ }
+ } catch (const UserException&){
+ if (!d.moreJSObjs()){
+ throw;
+ }
+ // Ignore and keep going. ContinueOnError is implied with sharding.
+ }
+ }
+
+ void _update( Request& r , DbMessage& d, ChunkManagerPtr manager ) {
+ int flags = d.pullInt();
+
+ BSONObj query = d.nextJsObj();
+ uassert( 13506 , "$atomic not supported sharded" , query["$atomic"].eoo() );
+ uassert( 10201 , "invalid update" , d.moreJSObjs() );
+ BSONObj toupdate = d.nextJsObj();
+ BSONObj chunkFinder = query;
+
+ bool upsert = flags & UpdateOption_Upsert;
+ bool multi = flags & UpdateOption_Multi;
+
+ if (upsert) {
+ uassert(8012, "can't upsert something without valid shard key",
+ (manager->hasShardKey(toupdate) ||
+ (toupdate.firstElementFieldName()[0] == '$' && manager->hasShardKey(query))));
+
+ BSONObj key = manager->getShardKey().extractKey(query);
+ BSONForEach(e, key) {
+ uassert(13465, "shard key in upsert query must be an exact match", getGtLtOp(e) == BSONObj::Equality);
+ }
+ }
+
+ bool save = false;
+ if ( ! manager->hasShardKey( query ) ) {
+ if ( multi ) {
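+                // multi updates without the shard key are allowed; they are
+                // broadcast to all matching shards below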
+ }
+ else if ( strcmp( query.firstElementFieldName() , "_id" ) || query.nFields() != 1 ) {
+ log() << "Query " << query << endl;
+ throw UserException( 8013 , "can't do non-multi update with query that doesn't have a valid shard key" );
+ }
+ else {
+ save = true;
+ chunkFinder = toupdate;
+ }
+ }
+
+
+ if ( ! save ) {
+ if ( toupdate.firstElementFieldName()[0] == '$' ) {
+ BSONObjIterator ops(toupdate);
+ while(ops.more()) {
+ BSONElement op(ops.next());
+ if (op.type() != Object)
+ continue;
+ BSONObjIterator fields(op.embeddedObject());
+ while(fields.more()) {
+ const string field = fields.next().fieldName();
+ uassert(13123,
+                                    str::stream() << "Can't modify shard key's value field " << field
+ << " for collection: " << manager->getns(),
+ ! manager->getShardKey().partOfShardKey(field));
+ }
+ }
+ }
+ else if ( manager->hasShardKey( toupdate ) ) {
+ uassert( 8014,
+ str::stream() << "cannot modify shard key for collection: " << manager->getns(),
+ manager->getShardKey().compare( query , toupdate ) == 0 );
+ }
+ else {
+ uasserted(12376,
+ str::stream() << "valid shard key must be in update object for collection: " << manager->getns() );
+ }
+ }
+
+ if ( multi ) {
+ set<Shard> shards;
+ manager->getShardsForQuery( shards , chunkFinder );
+ int * x = (int*)(r.d().afterNS());
+ x[0] |= UpdateOption_Broadcast;
+ for ( set<Shard>::iterator i=shards.begin(); i!=shards.end(); i++) {
+ doWrite( dbUpdate , r , *i , false );
+ }
+ }
+ else {
+ int left = 5;
+ while ( true ) {
+ try {
+ ChunkPtr c = manager->findChunk( chunkFinder );
+ doWrite( dbUpdate , r , c->getShard() );
+ if ( r.getClientInfo()->autoSplitOk() )
+ c->splitIfShould( d.msg().header()->dataLen() );
+ break;
+ }
+ catch ( StaleConfigException& e ) {
+ if ( left <= 0 )
+ throw e;
+ left--;
+ log() << "update will be retried b/c sharding config info is stale, "
+ << " left:" << left << " ns: " << r.getns() << " query: " << query << endl;
+ r.reset();
+ manager = r.getChunkManager();
+ uassert(14806, "collection no longer sharded", manager);
+ }
+ }
+ }
+ }
+
+ void _delete( Request& r , DbMessage& d, ChunkManagerPtr manager ) {
+
+ int flags = d.pullInt();
+ bool justOne = flags & 1;
+
+ uassert( 10203 , "bad delete message" , d.moreJSObjs() );
+ BSONObj pattern = d.nextJsObj();
+ uassert( 13505 , "$atomic not supported sharded" , pattern["$atomic"].eoo() );
+
+ set<Shard> shards;
+ int left = 5;
+
+ while ( true ) {
+ try {
+ manager->getShardsForQuery( shards , pattern );
+ LOG(2) << "delete : " << pattern << " \t " << shards.size() << " justOne: " << justOne << endl;
+ if ( shards.size() == 1 ) {
+ doWrite( dbDelete , r , *shards.begin() );
+ return;
+ }
+ break;
+ }
+ catch ( StaleConfigException& e ) {
+ if ( left <= 0 )
+ throw e;
+ left--;
+ log() << "delete failed b/c of StaleConfigException, retrying "
+ << " left:" << left << " ns: " << r.getns() << " patt: " << pattern << endl;
+ r.reset();
+ shards.clear();
+ manager = r.getChunkManager();
+ uassert(14805, "collection no longer sharded", manager);
+ }
+ }
+
+ if ( justOne && ! pattern.hasField( "_id" ) )
+ throw UserException( 8015 , "can only delete with a non-shard key pattern if can delete as many as we find" );
+
+ for ( set<Shard>::iterator i=shards.begin(); i!=shards.end(); i++) {
+ int * x = (int*)(r.d().afterNS());
+ x[0] |= RemoveOption_Broadcast;
+ doWrite( dbDelete , r , *i , false );
+ }
+ }
+
+ virtual void writeOp( int op , Request& r ) {
+
+ // TODO: Handle stale config exceptions here from coll being dropped or sharded during op
+ // for now has same semantics as legacy request
+ ChunkManagerPtr info = r.getChunkManager();
+
+ if( ! info ){
+ SINGLE->writeOp( op, r );
+ return;
+ }
+ else{
+ const char *ns = r.getns();
+ LOG(3) << "write: " << ns << endl;
+
+ DbMessage& d = r.d();
+
+ if ( op == dbInsert ) {
+ _insert( r , d , info );
+ }
+ else if ( op == dbUpdate ) {
+ _update( r , d , info );
+ }
+ else if ( op == dbDelete ) {
+ _delete( r , d , info );
+ }
+ else {
+ log() << "sharding can't do write op: " << op << endl;
+ throw UserException( 8016 , "can't do this write op on sharded collection" );
+ }
+ return;
+ }
+ }
+
+ };
+
+ Strategy * SHARDED = new ShardStrategy();
+}
diff --git a/src/mongo/s/strategy_single.cpp b/src/mongo/s/strategy_single.cpp
new file mode 100644
index 00000000000..d3cd958b6b1
--- /dev/null
+++ b/src/mongo/s/strategy_single.cpp
@@ -0,0 +1,272 @@
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// strategy_single.cpp
+
+#include "pch.h"
+#include "request.h"
+#include "cursors.h"
+#include "../client/connpool.h"
+#include "../db/commands.h"
+
+namespace mongo {
+
+ class SingleStrategy : public Strategy {
+
+ public:
+ SingleStrategy() {
+ _commandsSafeToPass.insert( "$eval" );
+ _commandsSafeToPass.insert( "create" );
+ }
+
+ private:
+ virtual void queryOp( Request& r ) {
+ QueryMessage q( r.d() );
+
+ LOG(3) << "single query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn << " options : " << q.queryOptions << endl;
+
+ if ( r.isCommand() ) {
+
+ if ( handleSpecialNamespaces( r , q ) )
+ return;
+
+ int loops = 5;
+ while ( true ) {
+ BSONObjBuilder builder;
+ try {
+ BSONObj cmdObj = q.query;
+ {
+ BSONElement e = cmdObj.firstElement();
+ if ( e.type() == Object && (e.fieldName()[0] == '$'
+ ? str::equals("query", e.fieldName()+1)
+ : str::equals("query", e.fieldName())))
+ cmdObj = e.embeddedObject();
+ }
+ bool ok = Command::runAgainstRegistered(q.ns, cmdObj, builder, q.queryOptions);
+ if ( ok ) {
+ BSONObj x = builder.done();
+ replyToQuery(0, r.p(), r.m(), x);
+ return;
+ }
+ break;
+ }
+ catch ( StaleConfigException& e ) {
+ if ( loops <= 0 )
+ throw e;
+
+ loops--;
+ log() << "retrying command: " << q.query << endl;
+ ShardConnection::checkMyConnectionVersions( e.getns() );
+ if( loops < 4 ) versionManager.forceRemoteCheckShardVersionCB( e.getns() );
+ }
+ catch ( AssertionException& e ) {
+ e.getInfo().append( builder , "assertion" , "assertionCode" );
+ builder.append( "errmsg" , "db assertion failure" );
+ builder.append( "ok" , 0 );
+ BSONObj x = builder.done();
+ replyToQuery(0, r.p(), r.m(), x);
+ return;
+ }
+ }
+
+ string commandName = q.query.firstElementFieldName();
+
+ uassert(13390, "unrecognized command: " + commandName, _commandsSafeToPass.count(commandName) != 0);
+ }
+
+ doQuery( r , r.primaryShard() );
+ }
+
+ virtual void getMore( Request& r ) {
+ const char *ns = r.getns();
+
+ LOG(3) << "single getmore: " << ns << endl;
+
+ long long id = r.d().getInt64( 4 );
+
+            // we use ScopedDbConnection here because we don't care about config versions
+            // (not deleting data is handled elsewhere)
+            // and we don't want to call setShardVersion
+ ScopedDbConnection conn( cursorCache.getRef( id ) );
+
+ Message response;
+ bool ok = conn->callRead( r.m() , response);
+ uassert( 10204 , "dbgrid: getmore: error calling db", ok);
+ r.reply( response , "" /*conn->getServerAddress() */ );
+
+ conn.done();
+
+ }
+
+ void handleIndexWrite( int op , Request& r ) {
+
+ DbMessage& d = r.d();
+
+ if ( op == dbInsert ) {
+ while( d.moreJSObjs() ) {
+ BSONObj o = d.nextJsObj();
+ const char * ns = o["ns"].valuestr();
+ if ( r.getConfig()->isSharded( ns ) ) {
+ BSONObj newIndexKey = o["key"].embeddedObjectUserCheck();
+
+ uassert( 10205 , (string)"can't use unique indexes with sharding ns:" + ns +
+ " key: " + o["key"].embeddedObjectUserCheck().toString() ,
+ IndexDetails::isIdIndexPattern( newIndexKey ) ||
+ ! o["unique"].trueValue() ||
+ r.getConfig()->getChunkManager( ns )->getShardKey().isPrefixOf( newIndexKey ) );
+
+ ChunkManagerPtr cm = r.getConfig()->getChunkManager( ns );
+ assert( cm );
+
+ set<Shard> shards;
+ cm->getAllShards(shards);
+ for (set<Shard>::const_iterator it=shards.begin(), end=shards.end(); it != end; ++it)
+ doWrite( op , r , *it );
+ }
+ else {
+ doWrite( op , r , r.primaryShard() );
+ }
+ r.gotInsert();
+ }
+ }
+ else if ( op == dbUpdate ) {
+ throw UserException( 8050 , "can't update system.indexes" );
+ }
+ else if ( op == dbDelete ) {
+ // TODO
+ throw UserException( 8051 , "can't delete indexes on sharded collection yet" );
+ }
+ else {
+ log() << "handleIndexWrite invalid write op: " << op << endl;
+ throw UserException( 8052 , "handleIndexWrite invalid write op" );
+ }
+
+ }
+
+ virtual void writeOp( int op , Request& r ) {
+ const char *ns = r.getns();
+
+ if ( r.isShardingEnabled() &&
+ strstr( ns , ".system.indexes" ) == strchr( ns , '.' ) &&
+ strchr( ns , '.' ) ) {
+ LOG(1) << " .system.indexes write for: " << ns << endl;
+ handleIndexWrite( op , r );
+ return;
+ }
+
+ LOG(3) << "single write: " << ns << endl;
+ doWrite( op , r , r.primaryShard() );
+            r.gotInsert(); // Won't handle multi-insert correctly. Not worth parsing the request.
+ }
+
+ bool handleSpecialNamespaces( Request& r , QueryMessage& q ) {
+ const char * ns = r.getns();
+ ns = strstr( r.getns() , ".$cmd.sys." );
+ if ( ! ns )
+ return false;
+ ns += 10;
+
+ r.checkAuth( Auth::WRITE );
+
+ BSONObjBuilder b;
+ vector<Shard> shards;
+
+ if ( strcmp( ns , "inprog" ) == 0 ) {
+ Shard::getAllShards( shards );
+
+ BSONArrayBuilder arr( b.subarrayStart( "inprog" ) );
+
+ for ( unsigned i=0; i<shards.size(); i++ ) {
+ Shard shard = shards[i];
+ ScopedDbConnection conn( shard );
+ BSONObj temp = conn->findOne( r.getns() , BSONObj() );
+ if ( temp["inprog"].isABSONObj() ) {
+ BSONObjIterator i( temp["inprog"].Obj() );
+ while ( i.more() ) {
+ BSONObjBuilder x;
+
+ BSONObjIterator j( i.next().Obj() );
+ while( j.more() ) {
+ BSONElement e = j.next();
+ if ( str::equals( e.fieldName() , "opid" ) ) {
+ stringstream ss;
+ ss << shard.getName() << ':' << e.numberInt();
+ x.append( "opid" , ss.str() );
+ }
+ else if ( str::equals( e.fieldName() , "client" ) ) {
+ x.appendAs( e , "client_s" );
+ }
+ else {
+ x.append( e );
+ }
+ }
+ arr.append( x.obj() );
+ }
+ }
+ conn.done();
+ }
+
+ arr.done();
+ }
+ else if ( strcmp( ns , "killop" ) == 0 ) {
+ BSONElement e = q.query["op"];
+ if ( strstr( r.getns() , "admin." ) == 0 ) {
+ b.append( "err" , "unauthorized" );
+ }
+ else if ( e.type() != String ) {
+ b.append( "err" , "bad op" );
+ b.append( e );
+ }
+ else {
+ b.append( e );
+ string s = e.String();
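+                // s has the form "<shard>:<opid>", as produced by the inprog listing above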
+ string::size_type i = s.find( ':' );
+ if ( i == string::npos ) {
+ b.append( "err" , "bad opid" );
+ }
+ else {
+ string shard = s.substr( 0 , i );
+ int opid = atoi( s.substr( i + 1 ).c_str() );
+ b.append( "shard" , shard );
+ b.append( "shardid" , opid );
+
+ log() << "want to kill op: " << e << endl;
+ Shard s(shard);
+
+ ScopedDbConnection conn( s );
+ conn->findOne( r.getns() , BSON( "op" << opid ) );
+ conn.done();
+ }
+ }
+ }
+ else if ( strcmp( ns , "unlock" ) == 0 ) {
+ b.append( "err" , "can't do unlock through mongos" );
+ }
+ else {
+ log( LL_WARNING ) << "unknown sys command [" << ns << "]" << endl;
+ return false;
+ }
+
+ BSONObj x = b.done();
+ replyToQuery(0, r.p(), r.m(), x);
+ return true;
+ }
+
+ set<string> _commandsSafeToPass;
+ };
+
+ Strategy * SINGLE = new SingleStrategy();
+}
diff --git a/src/mongo/s/util.h b/src/mongo/s/util.h
new file mode 100644
index 00000000000..cce2131ca55
--- /dev/null
+++ b/src/mongo/s/util.h
@@ -0,0 +1,183 @@
+// util.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "../client/dbclient.h"
+#include "../db/jsobj.h"
+
+/**
+ some generic sharding utils that can be used in mongod or mongos
+ */
+
+namespace mongo {
+
+ struct ShardChunkVersion {
+ union {
+ struct {
+ int _minor;
+ int _major;
+ };
+ unsigned long long _combined;
+ };
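+
+        // on a little-endian host (the layout this union assumes), _combined packs
+        // _major into the high 32 bits and _minor into the low 32 bits, so e.g.
+        // version 3|5 has toLong() == ( 3ULL << 32 ) | 5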
+
+ ShardChunkVersion( int major=0, int minor=0 )
+ : _minor(minor),_major(major) {
+ }
+
+ ShardChunkVersion( unsigned long long ll )
+ : _combined( ll ) {
+ }
+
+ ShardChunkVersion( const BSONElement& e ) {
+ if ( e.type() == Date || e.type() == Timestamp ) {
+ _combined = e._numberLong();
+ }
+ else if ( e.eoo() ) {
+ _combined = 0;
+ }
+ else {
+ _combined = 0;
+ log() << "ShardChunkVersion can't handle type (" << (int)(e.type()) << ") " << e << endl;
+ assert(0);
+ }
+ }
+
+ void inc( bool major ) {
+ if ( major )
+ incMajor();
+ else
+ incMinor();
+ }
+
+ void incMajor() {
+ _major++;
+ _minor = 0;
+ }
+
+ void incMinor() {
+ _minor++;
+ }
+
+ unsigned long long toLong() const {
+ return _combined;
+ }
+
+ bool isSet() const {
+ return _combined > 0;
+ }
+
+ string toString() const {
+ stringstream ss;
+ ss << _major << "|" << _minor;
+ return ss.str();
+ }
+
+ int majorVersion() const { return _major; }
+ int minorVersion() const { return _minor; }
+
+ operator unsigned long long() const { return _combined; }
+
+ ShardChunkVersion& operator=( const BSONElement& elem ) {
+ switch ( elem.type() ) {
+ case Timestamp:
+ case NumberLong:
+ case Date:
+ _combined = elem._numberLong();
+ break;
+ case EOO:
+ _combined = 0;
+ break;
+ default:
+ massert( 13657 , str::stream() << "unknown type for ShardChunkVersion: " << elem , 0 );
+ }
+ return *this;
+ }
+ };
+
+ inline ostream& operator<<( ostream &s , const ShardChunkVersion& v) {
+ s << v._major << "|" << v._minor;
+ return s;
+ }
+
+ /**
+ * your config info for a given shard/chunk is out of date
+ */
+ class StaleConfigException : public AssertionException {
+ public:
+ StaleConfigException( const string& ns , const string& raw , int code, bool justConnection = false )
+ : AssertionException( (string)"ns: " + ns + " " + raw , code ) ,
+ _justConnection(justConnection) ,
+ _ns(ns) {
+ }
+
+ virtual ~StaleConfigException() throw() {}
+
+ virtual void appendPrefix( stringstream& ss ) const { ss << "stale sharding config exception: "; }
+
+ bool justConnection() const { return _justConnection; }
+
+ string getns() const { return _ns; }
+
+ static bool parse( const string& big , string& ns , string& raw ) {
+ string::size_type start = big.find( '[' );
+ if ( start == string::npos )
+ return false;
+ string::size_type end = big.find( ']' ,start );
+ if ( end == string::npos )
+ return false;
+
+ ns = big.substr( start + 1 , ( end - start ) - 1 );
+ raw = big.substr( end + 1 );
+ return true;
+ }
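+
+        // e.g. parse( "stale config [test.foo] version mismatch", ns, raw ) sets
+        // ns == "test.foo" and raw == " version mismatch" (illustrative input; only
+        // the text after the closing ']' becomes raw)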
+ private:
+ bool _justConnection;
+ string _ns;
+ };
+
+ class SendStaleConfigException : public StaleConfigException {
+ public:
+ SendStaleConfigException( const string& ns , const string& raw , bool justConnection = false )
+ : StaleConfigException( ns, raw + "(send)", SendStaleConfigCode, justConnection ) {}
+ };
+
+ class RecvStaleConfigException : public StaleConfigException {
+ public:
+ RecvStaleConfigException( const string& ns , const string& raw , bool justConnection = false )
+ : StaleConfigException( ns, raw + "(recv)", RecvStaleConfigCode, justConnection ) {}
+ };
+
+ class ShardConnection;
+ class VersionManager {
+ public:
+        VersionManager() {}
+
+ bool isVersionableCB( DBClientBase* );
+ bool initShardVersionCB( DBClientBase*, BSONObj& );
+ bool forceRemoteCheckShardVersionCB( const string& );
+ bool checkShardVersionCB( DBClientBase*, const string&, bool, int );
+ bool checkShardVersionCB( ShardConnection*, bool, int );
+ void resetShardVersionCB( DBClientBase* );
+
+ };
+
+ extern VersionManager versionManager;
+
+}
diff --git a/src/mongo/s/writeback_listener.cpp b/src/mongo/s/writeback_listener.cpp
new file mode 100644
index 00000000000..ebdefb05785
--- /dev/null
+++ b/src/mongo/s/writeback_listener.cpp
@@ -0,0 +1,285 @@
+// @file writeback_listener.cpp
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "../util/timer.h"
+
+#include "config.h"
+#include "grid.h"
+#include "request.h"
+#include "server.h"
+#include "shard.h"
+#include "util.h"
+#include "client.h"
+
+#include "writeback_listener.h"
+
+namespace mongo {
+
+ map<string,WriteBackListener*> WriteBackListener::_cache;
+ set<string> WriteBackListener::_seenSets;
+ mongo::mutex WriteBackListener::_cacheLock("WriteBackListener");
+
+ map<WriteBackListener::ConnectionIdent,WriteBackListener::WBStatus> WriteBackListener::_seenWritebacks;
+ mongo::mutex WriteBackListener::_seenWritebacksLock("WriteBackListener::seen");
+
+ WriteBackListener::WriteBackListener( const string& addr ) : _addr( addr ) {
+ _name = str::stream() << "WriteBackListener-" << addr;
+ log() << "creating WriteBackListener for: " << addr << " serverID: " << serverID << endl;
+ }
+
+ /* static */
+ void WriteBackListener::init( DBClientBase& conn ) {
+
+ if ( conn.type() == ConnectionString::SYNC ) {
+ // don't want write back listeners for config servers
+ return;
+ }
+
+ if ( conn.type() != ConnectionString::SET ) {
+ init( conn.getServerAddress() );
+ return;
+ }
+
+
+ {
+ scoped_lock lk( _cacheLock );
+ if ( _seenSets.count( conn.getServerAddress() ) )
+ return;
+ }
+
+ // we want to do writebacks on all rs nodes
+ string errmsg;
+ ConnectionString cs = ConnectionString::parse( conn.getServerAddress() , errmsg );
+ uassert( 13641 , str::stream() << "can't parse host [" << conn.getServerAddress() << "]" , cs.isValid() );
+
+ vector<HostAndPort> hosts = cs.getServers();
+
+ for ( unsigned i=0; i<hosts.size(); i++ )
+ init( hosts[i].toString() );
+
+ }
+
+ /* static */
+ void WriteBackListener::init( const string& host ) {
+ scoped_lock lk( _cacheLock );
+ WriteBackListener*& l = _cache[host];
+ if ( l )
+ return;
+ l = new WriteBackListener( host );
+ l->go();
+ }
+
+ /* static */
+ BSONObj WriteBackListener::waitFor( const ConnectionIdent& ident, const OID& oid ) {
+ Timer t;
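+        // polls for up to ~100 seconds ( 10000 iterations x 10 ms sleep ) before asserting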
+ for ( int i=0; i<10000; i++ ) {
+ {
+ scoped_lock lk( _seenWritebacksLock );
+ WBStatus s = _seenWritebacks[ident];
+ if ( oid < s.id ) {
+ // this means we're waiting for a GLE that already passed.
+ // it should be impossible because once we call GLE, no other
+ // writebacks should happen with that connection id
+
+ msgasserted( 14041 , str::stream() << "got writeback waitfor for older id " <<
+ " oid: " << oid << " s.id: " << s.id << " ident: " << ident.toString() );
+ }
+ else if ( oid == s.id ) {
+ return s.gle;
+ }
+
+ }
+ sleepmillis( 10 );
+ }
+ uasserted( 13403 , str::stream() << "didn't get writeback for: " << oid << " after: " << t.millis() << " ms" );
+ throw 1; // never gets here
+ }
+
+ void WriteBackListener::run() {
+ int secsToSleep = 0;
+ while ( ! inShutdown() ) {
+
+ if ( ! Shard::isAShardNode( _addr ) ) {
+ LOG(1) << _addr << " is not a shard node" << endl;
+ sleepsecs( 60 );
+ continue;
+ }
+
+ try {
+ ScopedDbConnection conn( _addr );
+
+ BSONObj result;
+
+ {
+ BSONObjBuilder cmd;
+ cmd.appendOID( "writebacklisten" , &serverID ); // Command will block for data
+ if ( ! conn->runCommand( "admin" , cmd.obj() , result ) ) {
+ result = result.getOwned();
+ log() << "writebacklisten command failed! " << result << endl;
+ conn.done();
+ continue;
+ }
+
+ }
+
+ LOG(1) << "writebacklisten result: " << result << endl;
+
+ BSONObj data = result.getObjectField( "data" );
+ if ( data.getBoolField( "writeBack" ) ) {
+ string ns = data["ns"].valuestrsafe();
+
+ ConnectionIdent cid( "" , 0 );
+ OID wid;
+ if ( data["connectionId"].isNumber() && data["id"].type() == jstOID ) {
+ string s = "";
+ if ( data["instanceIdent"].type() == String )
+ s = data["instanceIdent"].String();
+ cid = ConnectionIdent( s , data["connectionId"].numberLong() );
+ wid = data["id"].OID();
+ }
+ else {
+ warning() << "mongos/mongod version mismatch (1.7.5 is the split)" << endl;
+ }
+
+ int len; // not used, but needed for next call
+ Message m( (void*)data["msg"].binData( len ) , false );
+ massert( 10427 , "invalid writeback message" , m.header()->valid() );
+
+ DBConfigPtr db = grid.getDBConfig( ns );
+ ShardChunkVersion needVersion( data["version"] );
+
+ // TODO: The logic here could be refactored, but keeping to the original codepath for safety for now
+ ChunkManagerPtr manager = db->getChunkManagerIfExists( ns );
+
+ LOG(1) << "connectionId: " << cid << " writebackId: " << wid << " needVersion : " << needVersion.toString()
+ << " mine : " << ( manager ? manager->getVersion().toString() : "(unknown)" )
+ << endl;
+
+ LOG(1) << m.toString() << endl;
+
+ if ( needVersion.isSet() && manager && needVersion <= manager->getVersion() ) {
+                        // this means the version was old when the write was originally sent
+                        // if we're here, we've already updated the config, so no need to do it again
+ //db->getChunkManager( ns , true ); // SERVER-1349
+ }
+ else {
+ // we received a writeback object that was sent to a previous version of a shard
+ // the actual shard may not have the object the writeback operation is for
+ // we need to reload the chunk manager and get the new shard versions
+ manager = db->getChunkManager( ns , true );
+ }
+
+ // do request and then call getLastError
+ // we have to call getLastError so we can return the right fields to the user if they decide to call getLastError
+
+ BSONObj gle;
+ int attempts = 0;
+ while ( true ) {
+ attempts++;
+
+ try {
+
+ Request r( m , 0 );
+ r.init();
+
+ r.d().reservedField() |= DbMessage::Reserved_FromWriteback;
+
+ ClientInfo * ci = r.getClientInfo();
+ if (!noauth) {
+ ci->getAuthenticationInfo()->authorize("admin", internalSecurity.user);
+ }
+ ci->noAutoSplit();
+
+ r.process();
+
+                            ci->newRequest(); // do this so we flip prev and cur shards
+
+ BSONObjBuilder b;
+ if ( ! ci->getLastError( BSON( "getLastError" << 1 ) , b , true ) ) {
+ b.appendBool( "commandFailed" , true );
+ }
+ gle = b.obj();
+
+ if ( gle["code"].numberInt() == 9517 ) {
+ log() << "writeback failed because of stale config, retrying attempts: " << attempts << endl;
+ if( ! db->getChunkManagerIfExists( ns , true, attempts > 2 ) ){
+ uassert( 15884, str::stream() << "Could not reload chunk manager after " << attempts << " attempts.", attempts <= 4 );
+ sleepsecs( attempts - 1 );
+ }
+ continue;
+ }
+
+ ci->clearSinceLastGetError();
+ }
+ catch ( DBException& e ) {
+ error() << "error processing writeback: " << e << endl;
+ BSONObjBuilder b;
+ b.append( "err" , e.toString() );
+ e.getInfo().append( b );
+ gle = b.obj();
+ }
+
+ break;
+ }
+
+ {
+ scoped_lock lk( _seenWritebacksLock );
+ WBStatus& s = _seenWritebacks[cid];
+ s.id = wid;
+ s.gle = gle;
+ }
+ }
+ else if ( result["noop"].trueValue() ) {
+ // no-op
+ }
+ else {
+ log() << "unknown writeBack result: " << result << endl;
+ }
+
+ conn.done();
+ secsToSleep = 0;
+ continue;
+ }
+ catch ( std::exception& e ) {
+
+ if ( inShutdown() ) {
+ // we're shutting down, so just clean up
+ return;
+ }
+
+ log() << "WriteBackListener exception : " << e.what() << endl;
+
+ // It's possible this shard was removed
+ Shard::reloadShardInfo();
+ }
+ catch ( ... ) {
+ log() << "WriteBackListener uncaught exception!" << endl;
+ }
+ secsToSleep++;
+ sleepsecs(secsToSleep);
+ if ( secsToSleep > 10 )
+ secsToSleep = 0;
+ }
+
+ log() << "WriteBackListener exiting : address no longer in cluster " << _addr;
+
+ }
+
+} // namespace mongo
diff --git a/src/mongo/s/writeback_listener.h b/src/mongo/s/writeback_listener.h
new file mode 100644
index 00000000000..1ef33dab1ec
--- /dev/null
+++ b/src/mongo/s/writeback_listener.h
@@ -0,0 +1,89 @@
+// @file writeback_listener.h
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+
+#include "../client/connpool.h"
+#include "../util/background.h"
+#include "../db/client.h"
+
+namespace mongo {
+
+ /*
+ * The writeback listener takes back write attempts that were made against a wrong shard.
+ * (Wrong here in the sense that the target chunk moved before this mongos had a chance to
+     * learn of it.) It is responsible for reapplying these writes to the correct shard.
+ *
+ * Runs (instantiated) on mongos.
+ * Currently, there is one writebacklistener per shard.
+ */
+ class WriteBackListener : public BackgroundJob {
+ public:
+
+ class ConnectionIdent {
+ public:
+ ConnectionIdent( const string& ii , ConnectionId id )
+ : instanceIdent( ii ) , connectionId( id ) {
+ }
+
+ bool operator<(const ConnectionIdent& other) const {
+ if ( instanceIdent == other.instanceIdent )
+ return connectionId < other.connectionId;
+
+ return instanceIdent < other.instanceIdent;
+ }
+
+ string toString() const { return str::stream() << instanceIdent << ":" << connectionId; }
+
+ string instanceIdent;
+ ConnectionId connectionId;
+ };
+
+ static void init( DBClientBase& conn );
+ static void init( const string& host );
+
+ static BSONObj waitFor( const ConnectionIdent& ident, const OID& oid );
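+ // Usage sketch (illustrative only; names are hypothetical): a mongos
+ // thread that forwarded a writeback can block until the listener has
+ // replayed it and collect the resulting GLE:
+ //     ConnectionIdent ident( "shard0000" , connId );
+ //     BSONObj gle = WriteBackListener::waitFor( ident , writebackId );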
+
+ protected:
+ WriteBackListener( const string& addr );
+
+ string name() const { return _name; }
+ void run();
+
+ private:
+ string _addr;
+ string _name;
+
+ static mongo::mutex _cacheLock; // protects _cache
+ static map<string,WriteBackListener*> _cache; // server to listener
+ static set<string> _seenSets; // cache of replica set URLs we've seen; note this grows without bound, since changes in ordering, case, or membership produce new strings
+
+ struct WBStatus {
+ OID id;
+ BSONObj gle;
+ };
+
+ static mongo::mutex _seenWritebacksLock; // protects _seenWritebacks
+ static map<ConnectionIdent,WBStatus> _seenWritebacks; // connectionId -> last write back GLE
+ };
+
+ void waitForWriteback( const OID& oid );
+
+} // namespace mongo
diff --git a/src/mongo/scripting/bench.cpp b/src/mongo/scripting/bench.cpp
new file mode 100644
index 00000000000..01291b1e1f0
--- /dev/null
+++ b/src/mongo/scripting/bench.cpp
@@ -0,0 +1,785 @@
+/** @file bench.cpp */
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "engine.h"
+#include "../util/md5.hpp"
+#include "../util/version.h"
+#include "../client/dbclient.h"
+#include "../client/connpool.h"
+#include <pcrecpp.h>
+
+// ---------------------------------
+// ---- benchmarking system --------
+// ---------------------------------
+
+// TODO: Maybe extract as library to avoid code duplication?
+namespace {
+ inline pcrecpp::RE_Options flags2options(const char* flags) {
+ pcrecpp::RE_Options options;
+ options.set_utf8(true);
+ while ( flags && *flags ) {
+ if ( *flags == 'i' )
+ options.set_caseless(true);
+ else if ( *flags == 'm' )
+ options.set_multiline(true);
+ else if ( *flags == 'x' )
+ options.set_extended(true);
+ flags++;
+ }
+ return options;
+ }
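+ // For example, flags2options( "im" ) yields RE_Options with utf8,
+ // caseless, and multiline set; unrecognized flag characters are ignored.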
+}
+
+namespace mongo {
+
+ struct BenchRunConfig {
+ BenchRunConfig() : _mutex( "BenchRunConfig" ) {
+ host = "localhost";
+ db = "test";
+ username = "";
+ password = "";
+
+ parallel = 1;
+ seconds = 1;
+ handleErrors = false;
+ hideErrors = false;
+ hideResults = true;
+
+ active = true;
+ threadsReady = 0;
+ error = false;
+ errCount = 0;
+ throwGLE = false;
+ breakOnTrap = true;
+ }
+
+ string host;
+ string db;
+ string username;
+ string password;
+
+ unsigned parallel;
+ double seconds;
+
+ bool hideResults;
+ bool handleErrors;
+ bool hideErrors;
+
+ shared_ptr< pcrecpp::RE > trapPattern;
+ shared_ptr< pcrecpp::RE > noTrapPattern;
+ shared_ptr< pcrecpp::RE > watchPattern;
+ shared_ptr< pcrecpp::RE > noWatchPattern;
+
+ BSONObj ops;
+
+ volatile bool active; // true at start; set to false when the run should stop
+ AtomicUInt threadsReady;
+
+ bool error;
+ bool throwGLE;
+ bool breakOnTrap;
+
+ AtomicUInt threadsActive;
+
+ mongo::mutex _mutex;
+ long long errCount;
+ BSONArrayBuilder trapped;
+ };
+
+ static bool _hasSpecial( const BSONObj& obj ) {
+ BSONObjIterator i( obj );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.fieldName()[0] == '#' )
+ return true;
+
+ if ( ! e.isABSONObj() )
+ continue;
+
+ if ( _hasSpecial( e.Obj() ) )
+ return true;
+ }
+ return false;
+ }
+
+ static void _fixField( BSONObjBuilder& b , const BSONElement& e ) {
+ assert( e.type() == Object );
+
+ BSONObj sub = e.Obj();
+ assert( sub.nFields() == 1 );
+
+ BSONElement f = sub.firstElement();
+ if ( str::equals( "#RAND_INT" , f.fieldName() ) ) {
+ BSONObjIterator i( f.Obj() );
+ int min = i.next().numberInt();
+ int max = i.next().numberInt();
+
+ int x = min + ( rand() % ( max - min ) );
+ b.append( e.fieldName() , x );
+ }
+ else {
+ uasserted( 14811 , str::stream() << "invalid bench dynamic piece: " << f.fieldName() );
+ }
+
+ }
+
+ static void fixQuery( BSONObjBuilder& b , const BSONObj& obj ) {
+ BSONObjIterator i( obj );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+
+ if ( ! e.isABSONObj() ) {
+ b.append( e );
+ continue;
+ }
+
+ BSONObj sub = e.Obj();
+ if ( sub.firstElement().fieldName()[0] == '#' ) {
+ _fixField( b , e );
+ }
+ else {
+ BSONObjBuilder xx( e.type() == Object ? b.subobjStart( e.fieldName() ) : b.subarrayStart( e.fieldName() ) );
+ fixQuery( xx , sub );
+ xx.done();
+ }
+
+ }
+ }
+
+ static BSONObj fixQuery( const BSONObj& obj ) {
+
+ if ( ! _hasSpecial( obj ) )
+ return obj;
+
+ BSONObjBuilder b( obj.objsize() + 128 );
+ fixQuery( b , obj );
+ return b.obj();
+ }
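+ // Example of the dynamic-query rewriting above (a sketch; the field name
+ // is hypothetical): given min < max, a template such as
+ //     BSON( "x" << BSON( "#RAND_INT" << BSON_ARRAY( 0 << 100 ) ) )
+ // is rewritten on each use into { x : n } with n drawn from [ 0 , 100 )
+ // via rand(); objects without a '#'-prefixed field pass through unchanged.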
+
+
+ static void _benchThread( BenchRunConfig * config, ScopedDbConnection& conn ){
+
+ long long count = 0;
+ while ( config->active ) {
+ BSONObjIterator i( config->ops );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+
+ string ns = e["ns"].String();
+ string op = e["op"].String();
+
+ int delay = e["delay"].eoo() ? 0 : e["delay"].Int();
+
+ auto_ptr<Scope> scope;
+ ScriptingFunction scopeFunc = 0;
+ BSONObj scopeObj;
+
+ if (config->username != "") {
+ string errmsg;
+ if (!conn.get()->auth(config->db, config->username, config->password, errmsg)) {
+ uasserted(15931, "Authenticating to connection for _benchThread failed: " + errmsg);
+ }
+ }
+
+ bool check = ! e["check"].eoo();
+ if( check ){
+ if ( e["check"].type() == CodeWScope || e["check"].type() == Code || e["check"].type() == String ) {
+ scope = globalScriptEngine->getPooledScope( ns );
+ assert( scope.get() );
+
+ if ( e["check"].type() == CodeWScope ) {
+ scopeFunc = scope->createFunction( e["check"].codeWScopeCode() );
+ scopeObj = BSONObj( e["check"].codeWScopeScopeData() );
+ }
+ else {
+ scopeFunc = scope->createFunction( e["check"].valuestr() );
+ }
+
+ scope->init( &scopeObj );
+ assert( scopeFunc );
+ }
+ else {
+ warning() << "Invalid check type detected in benchRun op : " << e << endl;
+ check = false;
+ }
+ }
+
+ try {
+ if ( op == "findOne" ) {
+
+ BSONObj result = conn->findOne( ns , fixQuery( e["query"].Obj() ) );
+
+ if( check ){
+ int err = scope->invoke( scopeFunc , 0 , &result, 1000 * 60 , false );
+ if( err ){
+ log() << "Error checking in benchRun thread [findOne]" << causedBy( scope->getError() ) << endl;
+ return;
+ }
+ }
+
+ if( ! config->hideResults || e["showResult"].trueValue() ) log() << "Result from benchRun thread [findOne] : " << result << endl;
+
+ }
+ else if ( op == "command" ) {
+
+ BSONObj result;
+ // TODO
+ /* bool ok = */ conn->runCommand( ns , fixQuery( e["command"].Obj() ), result, e["options"].numberInt() );
+
+ if( check ){
+ int err = scope->invoke( scopeFunc , 0 , &result, 1000 * 60 , false );
+ if( err ){
+ log() << "Error checking in benchRun thread [command]" << causedBy( scope->getError() ) << endl;
+ return;
+ }
+ }
+
+ if( ! config->hideResults || e["showResult"].trueValue() ) log() << "Result from benchRun thread [command] : " << result << endl;
+
+ }
+ else if( op == "find" || op == "query" ) {
+
+ int limit = e["limit"].eoo() ? 0 : e["limit"].numberInt();
+ int skip = e["skip"].eoo() ? 0 : e["skip"].Int();
+ int options = e["options"].eoo() ? 0 : e["options"].Int();
+ int batchSize = e["batchSize"].eoo() ? 0 : e["batchSize"].Int();
+ BSONObj filter = e["filter"].eoo() ? BSONObj() : e["filter"].Obj();
+
+ auto_ptr<DBClientCursor> cursor = conn->query( ns, fixQuery( e["query"].Obj() ), limit, skip, &filter, options, batchSize );
+
+ int count = cursor->itcount();
+
+ if( check ){
+ BSONObj thisValue = BSON( "count" << count );
+ int err = scope->invoke( scopeFunc , 0 , &thisValue, 1000 * 60 , false );
+ if( err ){
+ log() << "Error checking in benchRun thread [find]" << causedBy( scope->getError() ) << endl;
+ return;
+ }
+ }
+
+ if( ! config->hideResults || e["showResult"].trueValue() ) log() << "Result from benchRun thread [query] : " << count << endl;
+
+ }
+ else if( op == "update" ) {
+
+ bool multi = e["multi"].trueValue();
+ bool upsert = e["upsert"].trueValue();
+ BSONObj query = e["query"].eoo() ? BSONObj() : e["query"].Obj();
+ BSONObj update = e["update"].Obj();
+
+ conn->update( ns, fixQuery( query ), update, upsert , multi );
+
+ bool safe = e["safe"].trueValue();
+ if( safe ){
+ BSONObj result = conn->getLastErrorDetailed();
+
+ if( check ){
+ int err = scope->invoke( scopeFunc , 0 , &result, 1000 * 60 , false );
+ if( err ){
+ log() << "Error checking in benchRun thread [update]" << causedBy( scope->getError() ) << endl;
+ return;
+ }
+ }
+
+ if( ! config->hideResults || e["showResult"].trueValue() ) log() << "Result from benchRun thread [safe update] : " << result << endl;
+
+ if( ! result["err"].eoo() && result["err"].type() == String && ( config->throwGLE || e["throwGLE"].trueValue() ) )
+ throw DBException( (string)"From benchRun GLE" + causedBy( result["err"].String() ),
+ result["code"].eoo() ? 0 : result["code"].Int() );
+ }
+ }
+ else if( op == "insert" ) {
+
+ conn->insert( ns, fixQuery( e["doc"].Obj() ) );
+
+ bool safe = e["safe"].trueValue();
+ if( safe ){
+ BSONObj result = conn->getLastErrorDetailed();
+
+ if( check ){
+ int err = scope->invoke( scopeFunc , 0 , &result, 1000 * 60 , false );
+ if( err ){
+ log() << "Error checking in benchRun thread [insert]" << causedBy( scope->getError() ) << endl;
+ return;
+ }
+ }
+
+ if( ! config->hideResults || e["showResult"].trueValue() ) log() << "Result from benchRun thread [safe insert] : " << result << endl;
+
+ if( ! result["err"].eoo() && result["err"].type() == String && ( config->throwGLE || e["throwGLE"].trueValue() ) )
+ throw DBException( (string)"From benchRun GLE" + causedBy( result["err"].String() ),
+ result["code"].eoo() ? 0 : result["code"].Int() );
+ }
+ }
+ else if( op == "delete" || op == "remove" ) {
+
+ bool multi = e["multi"].eoo() ? true : e["multi"].trueValue();
+ BSONObj query = e["query"].eoo() ? BSONObj() : e["query"].Obj();
+
+ conn->remove( ns, fixQuery( query ), ! multi );
+
+ bool safe = e["safe"].trueValue();
+ if( safe ){
+ BSONObj result = conn->getLastErrorDetailed();
+
+ if( check ){
+ int err = scope->invoke( scopeFunc , 0 , &result, 1000 * 60 , false );
+ if( err ){
+ log() << "Error checking in benchRun thread [delete]" << causedBy( scope->getError() ) << endl;
+ return;
+ }
+ }
+
+ if( ! config->hideResults || e["showResult"].trueValue() ) log() << "Result from benchRun thread [safe remove] : " << result << endl;
+
+ if( ! result["err"].eoo() && result["err"].type() == String && ( config->throwGLE || e["throwGLE"].trueValue() ) )
+ throw DBException( (string)"From benchRun GLE " + causedBy( result["err"].String() ),
+ result["code"].eoo() ? 0 : result["code"].Int() );
+ }
+ }
+ else if ( op == "createIndex" ) {
+ conn->ensureIndex( ns , e["key"].Obj() , false , "" , false );
+ }
+ else if ( op == "dropIndex" ) {
+ conn->dropIndex( ns , e["key"].Obj() );
+ }
+ else {
+ log() << "don't understand op: " << op << endl;
+ config->error = true;
+ return;
+ }
+ }
+ catch( DBException& ex ){
+ if( ! config->hideErrors || e["showError"].trueValue() ){
+
+ bool yesWatch = ( config->watchPattern && config->watchPattern->FullMatch( ex.what() ) );
+ bool noWatch = ( config->noWatchPattern && config->noWatchPattern->FullMatch( ex.what() ) );
+
+ if( ( ! config->watchPattern && config->noWatchPattern && ! noWatch ) || // If we're just ignoring things
+ ( ! config->noWatchPattern && config->watchPattern && yesWatch ) || // If we're just watching things
+ ( config->watchPattern && config->noWatchPattern && yesWatch && ! noWatch ) )
+ log() << "Error in benchRun thread for op " << e << causedBy( ex ) << endl;
+ }
+
+ bool yesTrap = ( config->trapPattern && config->trapPattern->FullMatch( ex.what() ) );
+ bool noTrap = ( config->noTrapPattern && config->noTrapPattern->FullMatch( ex.what() ) );
+
+ if( ( ! config->trapPattern && config->noTrapPattern && ! noTrap ) ||
+ ( ! config->noTrapPattern && config->trapPattern && yesTrap ) ||
+ ( config->trapPattern && config->noTrapPattern && yesTrap && ! noTrap ) ){
+ {
+ scoped_lock lock( config->_mutex );
+ config->trapped.append( BSON( "error" << ex.what() << "op" << e << "count" << count ) );
+ }
+ if( config->breakOnTrap ) return;
+ }
+ if( ! config->handleErrors && ! e["handleError"].trueValue() ) return;
+
+ {
+ scoped_lock lock( config->_mutex );
+ config->errCount++;
+ }
+ }
+ catch( ... ){
+ if( ! config->hideErrors || e["showError"].trueValue() ) log() << "Error in benchRun thread caused by unknown error for op " << e << endl;
+ if( ! config->handleErrors && ! e["handleError"].trueValue() ) return;
+
+ {
+ scoped_lock lock( config->_mutex );
+ config->errCount++;
+ }
+ }
+
+ if ( ++count % 100 == 0 ) {
+ conn->getLastError();
+ }
+
+ sleepmillis( delay );
+
+ }
+ }
+ }
+
+ static void benchThread( BenchRunConfig * config ) {
+
+ ScopedDbConnection conn( config->host );
+ config->threadsReady++;
+ config->threadsActive++;
+
+ try {
+ if (config->username != "") {
+ string errmsg;
+ if (!conn.get()->auth(config->db, config->username, config->password, errmsg)) {
+ uasserted(15932, "Authenticating to connection for benchThread failed: " + errmsg);
+ }
+ }
+
+ _benchThread( config, conn );
+ }
+ catch( DBException& e ){
+ error() << "DBException not handled in benchRun thread" << causedBy( e ) << endl;
+ }
+ catch( std::exception& e ){
+ error() << "Exception not handled in benchRun thread" << causedBy( e ) << endl;
+ }
+ catch( ... ){
+ error() << "Exception not handled in benchRun thread." << endl;
+ }
+ conn->getLastError();
+ config->threadsActive--;
+ conn.done();
+
+ }
+
+
+ class BenchRunner {
+ public:
+
+ BenchRunner( ) {
+ }
+
+ ~BenchRunner() {
+ }
+
+ void init( BSONObj& args ){
+
+ oid.init();
+ activeRuns[ oid ] = this;
+
+ if ( args["host"].type() == String )
+ config.host = args["host"].String();
+ if ( args["db"].type() == String )
+ config.db = args["db"].String();
+ if ( args["username"].type() == String )
+ config.username = args["username"].String();
+ if ( args["password"].type() == String )
+ config.db = args["password"].String();
+
+ if ( args["parallel"].isNumber() )
+ config.parallel = args["parallel"].numberInt();
+ if ( args["seconds"].isNumber() )
+ config.seconds = args["seconds"].numberInt();
+ if ( ! args["hideResults"].eoo() )
+ config.hideResults = args["hideResults"].trueValue();
+ if ( ! args["handleErrors"].eoo() )
+ config.handleErrors = args["handleErrors"].trueValue();
+ if ( ! args["hideErrors"].eoo() )
+ config.hideErrors = args["hideErrors"].trueValue();
+ if ( ! args["throwGLE"].eoo() )
+ config.throwGLE = args["throwGLE"].trueValue();
+ if ( ! args["breakOnTrap"].eoo() )
+ config.breakOnTrap = args["breakOnTrap"].trueValue();
+
+
+ if ( ! args["trapPattern"].eoo() ){
+ const char* regex = args["trapPattern"].regex();
+ const char* flags = args["trapPattern"].regexFlags();
+ config.trapPattern = shared_ptr< pcrecpp::RE >( new pcrecpp::RE( regex, flags2options( flags ) ) );
+ }
+
+ if ( ! args["noTrapPattern"].eoo() ){
+ const char* regex = args["noTrapPattern"].regex();
+ const char* flags = args["noTrapPattern"].regexFlags();
+ config.noTrapPattern = shared_ptr< pcrecpp::RE >( new pcrecpp::RE( regex, flags2options( flags ) ) );
+ }
+
+ if ( ! args["watchPattern"].eoo() ){
+ const char* regex = args["watchPattern"].regex();
+ const char* flags = args["watchPattern"].regexFlags();
+ config.watchPattern = shared_ptr< pcrecpp::RE >( new pcrecpp::RE( regex, flags2options( flags ) ) );
+ }
+
+ if ( ! args["noWatchPattern"].eoo() ){
+ const char* regex = args["noWatchPattern"].regex();
+ const char* flags = args["noWatchPattern"].regexFlags();
+ config.noWatchPattern = shared_ptr< pcrecpp::RE >( new pcrecpp::RE( regex, flags2options( flags ) ) );
+ }
+
+ config.ops = args["ops"].Obj().getOwned();
+ conn = shared_ptr< ScopedDbConnection >( new ScopedDbConnection( config.host ) );
+
+ // Get initial stats
+ conn->get()->simpleCommand( "admin" , &before , "serverStatus" );
+
+ // Start threads
+ for ( unsigned i = 0; i < config.parallel; i++ )
+ threads.push_back( shared_ptr< boost::thread >( new boost::thread( boost::bind( benchThread , &config ) ) ) );
+
+ // Give them time to init
+ while ( config.threadsReady < config.parallel ) sleepmillis( 1 );
+
+ }
+
+ void done(){
+
+ log() << "Ending! (waiting for " << threads.size() << " threads)" << endl;
+
+ {
+ scoped_lock lock( config._mutex );
+ config.active = false;
+ }
+
+ for ( unsigned i = 0; i < threads.size(); i++ ) threads[i]->join();
+
+ // Get final stats
+ conn->get()->simpleCommand( "admin" , &after , "serverStatus" );
+ after = after.getOwned();
+
+ conn.get()->done();
+
+ activeRuns.erase( oid );
+
+ }
+
+ BSONObj status(){
+ scoped_lock lock( config._mutex );
+ return BSON( "errCount" << config.errCount <<
+ "trappedCount" << config.trapped.arrSize() <<
+ "threadsActive" << config.threadsActive.get() );
+ }
+
+ static BenchRunner* get( BSONObj args ){
+ BenchRunner* runner = new BenchRunner();
+ runner->init( args );
+ return runner;
+ }
+
+ static BenchRunner* get( OID oid ){
+ return activeRuns[ oid ];
+ }
+
+ static BSONObj finish( BenchRunner* runner ){
+
+ runner->done();
+
+ // vector<BSONObj> errors = runner->config.errors;
+ bool error = runner->config.error;
+
+ if ( error )
+ return BSON( "err" << 1 );
+
+ // compute actual ops/sec
+ BSONObj before = runner->before["opcounters"].Obj();
+ BSONObj after = runner->after["opcounters"].Obj();
+
+ BSONObjBuilder buf;
+ buf.append( "note" , "values per second" );
+ buf.append( "errCount", (long long) runner->config.errCount );
+ buf.append( "trapped", runner->config.trapped.arr() );
+ {
+ BSONObjIterator i( after );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ double x = e.number();
+ x = x - before[e.fieldName()].number();
+ buf.append( e.fieldName() , x / runner->config.seconds );
+ }
+ }
+
+ BSONObj zoo = buf.obj();
+
+ delete runner;
+ return zoo;
+ }
+
+ static map< OID, BenchRunner* > activeRuns;
+
+ OID oid;
+ BenchRunConfig config;
+ vector< shared_ptr< boost::thread > > threads;
+
+ shared_ptr< ScopedDbConnection > conn;
+ BSONObj before;
+ BSONObj after;
+
+ };
+
+ map< OID, BenchRunner* > BenchRunner::activeRuns;
+
+
+ /**
+ * benchRun( { ops : [] , host : XXX , db : XXXX , parallel : 5 , seconds : 5 } )
+ */
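+ // Shell-side sketch of a full argument document (hypothetical values):
+ //     benchRun( { ops : [ { ns : "test.foo" , op : "insert" ,
+ //                           doc : { x : { "#RAND_INT" : [ 0 , 100 ] } } } ] ,
+ //                parallel : 2 , seconds : 5 , host : "localhost:27017" } )
+ // With totals : true the raw opcounter deltas are returned instead of
+ // per-second rates.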
+ BSONObj benchRun( const BSONObj& argsFake, void* data ) {
+ assert( argsFake.firstElement().isABSONObj() );
+ BSONObj args = argsFake.firstElement().Obj();
+
+ // setup
+
+ BenchRunConfig config;
+
+ if ( args["host"].type() == String )
+ config.host = args["host"].String();
+ if ( args["db"].type() == String )
+ config.db = args["db"].String();
+ if ( args["username"].type() == String )
+ config.username = args["username"].String();
+ if ( args["password"].type() == String )
+ config.password = args["password"].String();
+
+ if ( args["parallel"].isNumber() )
+ config.parallel = args["parallel"].numberInt();
+ if ( args["seconds"].isNumber() )
+ config.seconds = args["seconds"].number();
+
+
+ config.ops = args["ops"].Obj();
+
+ // execute
+
+ ScopedDbConnection conn( config.host );
+
+ if (config.username != "") {
+ string errmsg;
+ if (!conn.get()->auth(config.db, config.username, config.password, errmsg)) {
+ uasserted(15930, "Authenticating to connection for bench run failed: " + errmsg);
+ }
+ }
+
+
+ // start threads
+ vector<boost::thread*> all;
+ for ( unsigned i=0; i<config.parallel; i++ )
+ all.push_back( new boost::thread( boost::bind( benchThread , &config ) ) );
+
+ // give them time to init
+ while ( config.threadsReady < config.parallel )
+ sleepmillis( 1 );
+
+ BSONObj before;
+ conn->simpleCommand( "admin" , &before , "serverStatus" );
+
+ sleepmillis( (int)(1000.0 * config.seconds) );
+
+ BSONObj after;
+ conn->simpleCommand( "admin" , &after , "serverStatus" );
+
+ conn.done();
+
+ config.active = false;
+
+ for ( unsigned i=0; i<all.size(); i++ )
+ all[i]->join();
+
+ if ( config.error )
+ return BSON( "err" << 1 );
+
+ // compute actual ops/sec
+
+ before = before["opcounters"].Obj().copy();
+ after = after["opcounters"].Obj().copy();
+
+ bool totals = args["totals"].trueValue();
+
+ BSONObjBuilder buf;
+ if ( ! totals )
+ buf.append( "note" , "values per second" );
+
+ {
+ BSONObjIterator i( after );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ double x = e.number();
+ x = x - before[e.fieldName()].number();
+ if ( ! totals )
+ x = x / config.seconds;
+ buf.append( e.fieldName() , x );
+ }
+ }
+ BSONObj zoo = buf.obj();
+ return BSON( "" << zoo );
+ }
+
+ /**
+ * benchRunSync( { ops : [] , host : XXX , db : XXXX , parallel : 5 , seconds : 5 } )
+ */
+ BSONObj benchRunSync( const BSONObj& argsFake, void* data ) {
+
+ assert( argsFake.firstElement().isABSONObj() );
+ BSONObj args = argsFake.firstElement().Obj();
+
+ // Get new BenchRunner object
+ BenchRunner* runner = BenchRunner::get( args );
+
+ sleepsecs( static_cast<int>( runner->config.seconds ) );
+
+ return BenchRunner::finish( runner );
+
+ }
+
+ /**
+ * benchStart( { ops : [] , host : XXX , db : XXXX , parallel : 5 , seconds : 5 } )
+ */
+ BSONObj benchStart( const BSONObj& argsFake, void* data ) {
+
+ assert( argsFake.firstElement().isABSONObj() );
+ BSONObj args = argsFake.firstElement().Obj();
+
+ // Get new BenchRunner object
+ BenchRunner* runner = BenchRunner::get( args );
+
+ log() << "Starting benchRun test " << runner->oid << endl;
+
+ return BSON( "" << runner->oid.toString() );
+ }
+
+ /**
+ * benchStatus( oidString ) where oidString was returned by benchStart
+ */
+ BSONObj benchStatus( const BSONObj& argsFake, void* data ) {
+
+ OID oid = OID( argsFake.firstElement().String() );
+
+ log() << "Getting status for benchRun test " << oid << endl;
+
+ // Get new BenchRunner object
+ BenchRunner* runner = BenchRunner::get( oid );
+
+ BSONObj statusObj = runner->status();
+
+ return BSON( "" << statusObj );
+ }
+
+ /**
+ * benchFinish( oidString ) where oidString was returned by benchStart
+ */
+ BSONObj benchFinish( const BSONObj& argsFake, void* data ) {
+
+ OID oid = OID( argsFake.firstElement().String() );
+
+ log() << "Finishing benchRun test " << oid << endl;
+
+ // Get new BenchRunner object
+ BenchRunner* runner = BenchRunner::get( oid );
+
+ BSONObj finalObj = BenchRunner::finish( runner );
+
+ return BSON( "" << finalObj );
+ }
+
+ void installBenchmarkSystem( Scope& scope ) {
+ scope.injectNative( "benchRun" , benchRun );
+ scope.injectNative( "benchRunSync" , benchRunSync );
+ scope.injectNative( "benchStart" , benchStart );
+ scope.injectNative( "benchStatus" , benchStatus );
+ scope.injectNative( "benchFinish" , benchFinish );
+ }
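+ // Async lifecycle sketch from the shell (hypothetical values):
+ //     var id = benchStart( { ops : [ ... ] , seconds : 30 } );
+ //     benchStatus( id );  // { errCount : ..., trappedCount : ..., threadsActive : ... }
+ //     benchFinish( id );  // stops the run and returns per-second rates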
+
+}
diff --git a/src/mongo/scripting/engine.cpp b/src/mongo/scripting/engine.cpp
new file mode 100644
index 00000000000..13fe681ebe5
--- /dev/null
+++ b/src/mongo/scripting/engine.cpp
@@ -0,0 +1,519 @@
+// engine.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "engine.h"
+#include "../util/file.h"
+#include "../client/dbclient.h"
+
+namespace mongo {
+
+ long long Scope::_lastVersion = 1;
+
+ int Scope::_numScopes = 0;
+
+ Scope::Scope() : _localDBName("") , _loadedVersion(0), _numTimeUsed(0) {
+ _numScopes++;
+ }
+
+ Scope::~Scope() {
+ _numScopes--;
+ }
+
+ ScriptEngine::ScriptEngine() : _scopeInitCallback() {
+ }
+
+ ScriptEngine::~ScriptEngine() {
+ }
+
+ void Scope::append( BSONObjBuilder & builder , const char * fieldName , const char * scopeName ) {
+ int t = type( scopeName );
+
+ switch ( t ) {
+ case Object:
+ builder.append( fieldName , getObject( scopeName ) );
+ break;
+ case Array:
+ builder.appendArray( fieldName , getObject( scopeName ) );
+ break;
+ case NumberDouble:
+ builder.append( fieldName , getNumber( scopeName ) );
+ break;
+ case NumberInt:
+ builder.append( fieldName , getNumberInt( scopeName ) );
+ break;
+ case NumberLong:
+ builder.append( fieldName , getNumberLongLong( scopeName ) );
+ break;
+ case String:
+ builder.append( fieldName , getString( scopeName ).c_str() );
+ break;
+ case Bool:
+ builder.appendBool( fieldName , getBoolean( scopeName ) );
+ break;
+ case jstNULL:
+ case Undefined:
+ builder.appendNull( fieldName );
+ break;
+ case Date:
+ // TODO: make signed
+ builder.appendDate( fieldName , Date_t((unsigned long long)getNumber( scopeName )) );
+ break;
+ case Code:
+ builder.appendCode( fieldName , getString( scopeName ) );
+ break;
+ default:
+ stringstream temp;
+ temp << "can't append type from:";
+ temp << t;
+ uassert( 10206 , temp.str() , 0 );
+ }
+
+ }
+
+ int Scope::invoke( const char* code , const BSONObj* args, const BSONObj* recv, int timeoutMs ) {
+ ScriptingFunction func = createFunction( code );
+ uassert( 10207 , "compile failed" , func );
+ return invoke( func , args, recv, timeoutMs );
+ }
+
+ bool Scope::execFile( const string& filename , bool printResult , bool reportError , bool assertOnError, int timeoutMs ) {
+
+ path p( filename );
+
+ if ( ! exists( p ) ) {
+ log() << "file [" << filename << "] doesn't exist" << endl;
+ if ( assertOnError )
+ assert( 0 );
+ return false;
+ }
+
+ // iterate directories and recurse using all *.js files in the directory
+ if ( is_directory( p ) ) {
+ directory_iterator end;
+ bool empty = true;
+ for (directory_iterator it (p); it != end; it++) {
+ empty = false;
+ path sub (*it);
+ if (!endsWith(sub.string().c_str(), ".js"))
+ continue;
+ if (!execFile(sub.string().c_str(), printResult, reportError, assertOnError, timeoutMs))
+ return false;
+ }
+
+ if (empty) {
+ log() << "directory [" << filename << "] doesn't have any *.js files" << endl;
+ if ( assertOnError )
+ assert( 0 );
+ return false;
+ }
+
+ return true;
+ }
+
+ File f;
+ f.open( filename.c_str() , true );
+
+ unsigned L;
+ {
+ fileofs fo = f.len();
+ assert( fo <= 0x7ffffffe );
+ L = (unsigned) fo;
+ }
+ boost::scoped_array<char> data (new char[L+1]);
+ data[L] = 0;
+ f.read( 0 , data.get() , L );
+
+ int offset = 0;
+ if (data[0] == '#' && data[1] == '!') {
+ const char* newline = strchr(data.get(), '\n');
+ if (! newline)
+ return true; // file of just shebang treated same as empty file
+ offset = newline - data.get();
+ }
+
+ StringData code (data.get() + offset, L - offset);
+
+ return exec( code , filename , printResult , reportError , assertOnError, timeoutMs );
+ }
+
+ void Scope::storedFuncMod() {
+ _lastVersion++;
+ }
+
+ void Scope::validateObjectIdString( const string &str ) {
+ massert( 10448 , "invalid object id: length", str.size() == 24 );
+
+ for ( string::size_type i=0; i<str.size(); i++ ) {
+ char c = str[i];
+ if ( ( c >= '0' && c <= '9' ) ||
+ ( c >= 'a' && c <= 'f' ) ||
+ ( c >= 'A' && c <= 'F' ) ) {
+ continue;
+ }
+ massert( 10430 , "invalid object id: not hex", false );
+ }
+ }
+
+ void Scope::loadStored( bool ignoreNotConnected ) {
+ if ( _localDBName.size() == 0 ) {
+ if ( ignoreNotConnected )
+ return;
+ uassert( 10208 , "need to have locallyConnected already" , _localDBName.size() );
+ }
+ if ( _loadedVersion == _lastVersion )
+ return;
+
+ _loadedVersion = _lastVersion;
+
+ string coll = _localDBName + ".system.js";
+
+ static DBClientBase * db = createDirectClient();
+ auto_ptr<DBClientCursor> c = db->query( coll , Query(), 0, 0, NULL, QueryOption_SlaveOk, 0 );
+ assert( c.get() );
+
+ set<string> thisTime;
+
+ while ( c->more() ) {
+ BSONObj o = c->nextSafe();
+
+ BSONElement n = o["_id"];
+ BSONElement v = o["value"];
+
+ uassert( 10209 , str::stream() << "name has to be a string: " << n , n.type() == String );
+ uassert( 10210 , "value has to be set" , v.type() != EOO );
+
+ setElement( n.valuestr() , v );
+
+ thisTime.insert( n.valuestr() );
+ _storedNames.insert( n.valuestr() );
+
+ }
+
+ // --- remove things from scope that were removed
+
+ list<string> toremove;
+
+ for ( set<string>::iterator i=_storedNames.begin(); i!=_storedNames.end(); i++ ) {
+ string n = *i;
+ if ( thisTime.count( n ) == 0 )
+ toremove.push_back( n );
+ }
+
+ for ( list<string>::iterator i=toremove.begin(); i!=toremove.end(); i++ ) {
+ string n = *i;
+ _storedNames.erase( n );
+ execSetup( (string)"delete " + n , "clean up scope" );
+ }
+
+ }
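+ // For example, a document stored as
+ //     { _id : "add" , value : function( x , y ){ return x + y; } }
+ // in <db>.system.js becomes a global "add" in the scope; entries removed
+ // from the collection are deleted from the scope on the next loadStored().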
+
+ ScriptingFunction Scope::createFunction( const char * code ) {
+ if ( code[0] == '/' && code [1] == '*' ) {
+ code += 2;
+ while ( code[0] && code[1] ) {
+ if ( code[0] == '*' && code[1] == '/' ) {
+ code += 2;
+ break;
+ }
+ code++;
+ }
+ }
+ map<string,ScriptingFunction>::iterator i = _cachedFunctions.find( code );
+ if ( i != _cachedFunctions.end() )
+ return i->second;
+ ScriptingFunction f = _createFunction( code );
+ _cachedFunctions[code] = f;
+ return f;
+ }
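+ // Note that compilation is memoized per scope: compiling the same source
+ // string twice returns the cached handle, e.g.
+ //     ScriptingFunction a = s->createFunction( "function(){ return 1; }" );
+ //     ScriptingFunction b = s->createFunction( "function(){ return 1; }" );
+ // leaves a == b (a leading /* ... */ comment is stripped before lookup).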
+
+ namespace JSFiles {
+ extern const JSFile collection;
+ extern const JSFile db;
+ extern const JSFile mongo;
+ extern const JSFile mr;
+ extern const JSFile query;
+ extern const JSFile utils;
+ extern const JSFile utils_sh;
+ }
+
+ void Scope::execCoreFiles() {
+ // keeping same order as in SConstruct
+ execSetup(JSFiles::utils);
+ execSetup(JSFiles::utils_sh);
+ execSetup(JSFiles::db);
+ execSetup(JSFiles::mongo);
+ execSetup(JSFiles::mr);
+ execSetup(JSFiles::query);
+ execSetup(JSFiles::collection);
+ }
+
+ typedef map< string , list<Scope*> > PoolToScopes;
+
+ class ScopeCache {
+ public:
+
+ ScopeCache() : _mutex("ScopeCache") {
+ _magic = 17;
+ }
+
+ ~ScopeCache() {
+ assert( _magic == 17 );
+ _magic = 1;
+
+ if ( inShutdown() )
+ return;
+
+ clear();
+ }
+
+ void done( const string& pool , Scope * s ) {
+ scoped_lock lk( _mutex );
+ list<Scope*> & l = _pools[pool];
+ bool oom = s->hasOutOfMemoryException();
+
+ // do not keep too many contexts, or use them for too long
+ if ( l.size() > 10 || s->getTimeUsed() > 100 || oom ) {
+ delete s;
+ }
+ else {
+ l.push_back( s );
+ s->reset();
+ }
+
+ if (oom) {
+ // out of mem, make some room
+ log() << "Clearing all idle JS contexts due to out of memory" << endl;
+ clear();
+ }
+ }
+
+ Scope * get( const string& pool ) {
+ scoped_lock lk( _mutex );
+ list<Scope*> & l = _pools[pool];
+ if ( l.size() == 0 )
+ return 0;
+
+ Scope * s = l.back();
+ l.pop_back();
+ s->reset();
+ s->incTimeUsed();
+ return s;
+ }
+
+ void clear() {
+ set<Scope*> seen;
+
+ for ( PoolToScopes::iterator i=_pools.begin() ; i != _pools.end(); i++ ) {
+ for ( list<Scope*>::iterator j=i->second.begin(); j != i->second.end(); j++ ) {
+ Scope * s = *j;
+ assert( ! seen.count( s ) );
+ delete s;
+ seen.insert( s );
+ }
+ }
+
+ _pools.clear();
+ }
+
+ private:
+ PoolToScopes _pools;
+ mongo::mutex _mutex;
+ int _magic;
+ };
+
+ thread_specific_ptr<ScopeCache> scopeCache;
+
+ class PooledScope : public Scope {
+ public:
+ PooledScope( const string& pool , Scope * real ) : _pool( pool ) , _real( real ) {
+ _real->loadStored( true );
+ };
+ virtual ~PooledScope() {
+ ScopeCache * sc = scopeCache.get();
+ if ( sc ) {
+ sc->done( _pool , _real );
+ _real = 0;
+ }
+ else {
+ // this means that the Scope was killed from a different thread
+ // for example a cursor got timed out that has a $where clause
+ log(3) << "warning: scopeCache is empty!" << endl;
+ delete _real;
+ _real = 0;
+ }
+ }
+
+ void reset() {
+ _real->reset();
+ }
+ void init( const BSONObj * data ) {
+ _real->init( data );
+ }
+
+ void localConnect( const char * dbName ) {
+ _real->localConnect( dbName );
+ }
+ void externalSetup() {
+ _real->externalSetup();
+ }
+
+ double getNumber( const char *field ) {
+ return _real->getNumber( field );
+ }
+ string getString( const char *field ) {
+ return _real->getString( field );
+ }
+ bool getBoolean( const char *field ) {
+ return _real->getBoolean( field );
+ }
+ BSONObj getObject( const char *field ) {
+ return _real->getObject( field );
+ }
+
+ int type( const char *field ) {
+ return _real->type( field );
+ }
+
+ void setElement( const char *field , const BSONElement& val ) {
+ _real->setElement( field , val );
+ }
+ void setNumber( const char *field , double val ) {
+ _real->setNumber( field , val );
+ }
+ void setString( const char *field , const char * val ) {
+ _real->setString( field , val );
+ }
+ void setObject( const char *field , const BSONObj& obj , bool readOnly=true ) {
+ _real->setObject( field , obj , readOnly );
+ }
+ void setBoolean( const char *field , bool val ) {
+ _real->setBoolean( field , val );
+ }
+// void setThis( const BSONObj * obj ) {
+// _real->setThis( obj );
+// }
+
+ void setFunction( const char *field , const char * code ) {
+ _real->setFunction(field, code);
+ }
+
+ ScriptingFunction createFunction( const char * code ) {
+ return _real->createFunction( code );
+ }
+
+ ScriptingFunction _createFunction( const char * code ) {
+ return _real->createFunction( code );
+ }
+
+ void rename( const char * from , const char * to ) {
+ _real->rename( from , to );
+ }
+
+ /**
+ * @return 0 on success
+ */
+ int invoke( ScriptingFunction func , const BSONObj* args, const BSONObj* recv, int timeoutMs , bool ignoreReturn, bool readOnlyArgs, bool readOnlyRecv ) {
+ return _real->invoke( func , args , recv, timeoutMs , ignoreReturn, readOnlyArgs, readOnlyRecv );
+ }
+
+ string getError() {
+ return _real->getError();
+ }
+
+ bool hasOutOfMemoryException() {
+ return _real->hasOutOfMemoryException();
+ }
+
+ bool exec( const StringData& code , const string& name , bool printResult , bool reportError , bool assertOnError, int timeoutMs = 0 ) {
+ return _real->exec( code , name , printResult , reportError , assertOnError , timeoutMs );
+ }
+ bool execFile( const string& filename , bool printResult , bool reportError , bool assertOnError, int timeoutMs = 0 ) {
+ return _real->execFile( filename , printResult , reportError , assertOnError , timeoutMs );
+ }
+
+ void injectNative( const char *field, NativeFunction func, void* data ) {
+ _real->injectNative( field , func, data );
+ }
+
+ void gc() {
+ _real->gc();
+ }
+
+ void append( BSONObjBuilder & builder , const char * fieldName , const char * scopeName ) {
+ _real->append(builder, fieldName, scopeName);
+ }
+
+ private:
+ string _pool;
+ Scope * _real;
+ };
+
+ auto_ptr<Scope> ScriptEngine::getPooledScope( const string& pool ) {
+ if ( ! scopeCache.get() ) {
+ scopeCache.reset( new ScopeCache() );
+ }
+
+ Scope * s = scopeCache->get( pool );
+ if ( ! s ) {
+ s = newScope();
+ }
+
+ auto_ptr<Scope> p;
+ p.reset( new PooledScope( pool , s ) );
+ return p;
+ }
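+ // Usage sketch (assuming a configured global engine):
+ //     auto_ptr<Scope> s = globalScriptEngine->getPooledScope( "mydb" );
+ //     s->invokeSafe( "function(){ return 1 + 1; }" , 0 , 0 );
+ // When s is destroyed, the wrapped Scope returns to this thread's
+ // ScopeCache (or is deleted if the pool is full; see ScopeCache::done).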
+
+ void ScriptEngine::threadDone() {
+ ScopeCache * sc = scopeCache.get();
+ if ( sc ) {
+ sc->clear();
+ }
+ }
+
+ void ( *ScriptEngine::_connectCallback )( DBClientWithCommands & ) = 0;
+ const char * ( *ScriptEngine::_checkInterruptCallback )() = 0;
+ unsigned ( *ScriptEngine::_getInterruptSpecCallback )() = 0;
+
+ ScriptEngine * globalScriptEngine = 0;
+
+ bool hasJSReturn( const string& code ) {
+ size_t x = code.find( "return" );
+ if ( x == string::npos )
+ return false;
+
+ return
+ ( x == 0 || ! isalpha( code[x-1] ) ) &&
+ ! isalpha( code[x+6] );
+ }
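+ // e.g. hasJSReturn( "return 5;" ) is true, while hasJSReturn( "returns" )
+ // and hasJSReturn( "noreturn" ) are false, since "return" must not be
+ // bordered by an alphabetic character.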
+
+ const char * jsSkipWhiteSpace( const char * raw ) {
+ while ( raw[0] ) {
+ while (isspace(*raw)) {
+ raw++;
+ }
+
+ if ( raw[0] != '/' || raw[1] != '/' )
+ break;
+
+ while ( raw[0] && raw[0] != '\n' )
+ raw++;
+ }
+ return raw;
+ }
+}
+
diff --git a/src/mongo/scripting/engine.h b/src/mongo/scripting/engine.h
new file mode 100644
index 00000000000..f4b39740001
--- /dev/null
+++ b/src/mongo/scripting/engine.h
@@ -0,0 +1,235 @@
+// engine.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../pch.h"
+#include "../db/jsobj.h"
+
+namespace mongo {
+
+ struct JSFile {
+ const char* name;
+ const StringData& source;
+ };
+
+ typedef unsigned long long ScriptingFunction;
+ typedef BSONObj (*NativeFunction) ( const BSONObj &args, void* data );
+
+ class Scope : boost::noncopyable {
+ public:
+ Scope();
+ virtual ~Scope();
+
+ virtual void reset() = 0;
+ virtual void init( const BSONObj * data ) = 0;
+ void init( const char * data ) {
+ BSONObj o( data );
+ init( &o );
+ }
+
+ virtual void localConnect( const char * dbName ) = 0;
+ virtual void externalSetup() = 0;
+
+ class NoDBAccess {
+ Scope * _s;
+ public:
+ NoDBAccess( Scope * s ) {
+ _s = s;
+ }
+ ~NoDBAccess() {
+ _s->rename( "____db____" , "db" );
+ }
+ };
+ NoDBAccess disableDBAccess( const char * why ) {
+ rename( "db" , "____db____" );
+ return NoDBAccess( this );
+ }
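+ // Usage sketch: hide "db" from user code for the duration of a block:
+ //     {
+ //         Scope::NoDBAccess no = s->disableDBAccess( "$where" );
+ //         // ... run user JS here without database access ...
+ //     } // NoDBAccess::~NoDBAccess renames ____db____ back to db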
+
+ virtual double getNumber( const char *field ) = 0;
+ virtual int getNumberInt( const char *field ) { return (int)getNumber( field ); }
+ virtual long long getNumberLongLong( const char *field ) { return (long long)getNumber( field ); }
+ virtual string getString( const char *field ) = 0;
+ virtual bool getBoolean( const char *field ) = 0;
+ virtual BSONObj getObject( const char *field ) = 0;
+
+ virtual int type( const char *field ) = 0;
+
+ virtual void append( BSONObjBuilder & builder , const char * fieldName , const char * scopeName );
+
+ virtual void setElement( const char *field , const BSONElement& e ) = 0;
+ virtual void setNumber( const char *field , double val ) = 0;
+ virtual void setString( const char *field , const char * val ) = 0;
+ virtual void setObject( const char *field , const BSONObj& obj , bool readOnly=true ) = 0;
+ virtual void setBoolean( const char *field , bool val ) = 0;
+ virtual void setFunction( const char *field , const char * code ) = 0;
+// virtual void setThis( const BSONObj * obj ) = 0;
+
+ virtual ScriptingFunction createFunction( const char * code );
+
+ virtual void rename( const char * from , const char * to ) = 0;
+ /**
+ * @return 0 on success
+ */
+ virtual int invoke( ScriptingFunction func , const BSONObj* args, const BSONObj* recv, int timeoutMs = 0 , bool ignoreReturn = false, bool readOnlyArgs = false, bool readOnlyRecv = false ) = 0;
+ void invokeSafe( ScriptingFunction func , const BSONObj* args, const BSONObj* recv, int timeoutMs = 0 , bool ignoreReturn = false, bool readOnlyArgs = false, bool readOnlyRecv = false ) {
+ int res = invoke( func , args , recv, timeoutMs, ignoreReturn, readOnlyArgs, readOnlyRecv );
+ if ( res == 0 )
+ return;
+ throw UserException( 9004 , (string)"invoke failed: " + getError() );
+ }
+ virtual string getError() = 0;
+ virtual bool hasOutOfMemoryException() = 0;
+
+ int invoke( const char* code , const BSONObj* args, const BSONObj* recv, int timeoutMs = 0 );
+ void invokeSafe( const char* code , const BSONObj* args, const BSONObj* recv, int timeoutMs = 0 ) {
+ if ( invoke( code , args , recv, timeoutMs ) == 0 )
+ return;
+ throw UserException( 9005 , (string)"invoke failed: " + getError() );
+ }
+
+ virtual bool exec( const StringData& code , const string& name , bool printResult , bool reportError , bool assertOnError, int timeoutMs = 0 ) = 0;
+ virtual void execSetup( const StringData& code , const string& name = "setup" ) {
+ exec( code , name , false , true , true , 0 );
+ }
+
+ void execSetup( const JSFile& file) {
+ execSetup(file.source, file.name);
+ }
+
+ void execCoreFiles();
+
+ virtual bool execFile( const string& filename , bool printResult , bool reportError , bool assertOnError, int timeoutMs = 0 );
+
+ virtual void injectNative( const char *field, NativeFunction func, void* data = 0 ) = 0;
+
+ virtual void gc() = 0;
+
+ void loadStored( bool ignoreNotConnected = false );
+
+ /**
+ if any changes are made to .system.js, call this
+ right now its just global - slightly inefficient, but a lot simpler
+ */
+ static void storedFuncMod();
+
+ static int getNumScopes() {
+ return _numScopes;
+ }
+
+ static void validateObjectIdString( const string &str );
+
+ /** increments the number of times a scope was used */
+ void incTimeUsed() { ++_numTimeUsed; }
+ /** gets the number of times a scope was used */
+ int getTimeUsed() { return _numTimeUsed; }
+
+ protected:
+
+ virtual ScriptingFunction _createFunction( const char * code ) = 0;
+
+ string _localDBName;
+ long long _loadedVersion;
+ set<string> _storedNames;
+ static long long _lastVersion;
+ map<string,ScriptingFunction> _cachedFunctions;
+ int _numTimeUsed;
+
+ static int _numScopes;
+ };
+
+ void installGlobalUtils( Scope& scope );
+
+ class DBClientWithCommands;
+
+ class ScriptEngine : boost::noncopyable {
+ public:
+ ScriptEngine();
+ virtual ~ScriptEngine();
+
+ virtual Scope * newScope() {
+ Scope *s = createScope();
+ if ( s && _scopeInitCallback )
+ _scopeInitCallback( *s );
+ installGlobalUtils( *s );
+ return s;
+ }
+
+ virtual void runTest() = 0;
+
+ virtual bool utf8Ok() const = 0;
+
+ static void setup();
+
+ /** gets a scope from the pool or a new one if pool is empty
+ * @param pool An identifier for the pool, usually the db name
+ * @return the scope */
+ auto_ptr<Scope> getPooledScope( const string& pool );
+
+ /** call this method to release some JS resources when a thread is done */
+ void threadDone();
+
+ struct Unlocker { virtual ~Unlocker() {} };
+ virtual auto_ptr<Unlocker> newThreadUnlocker() { return auto_ptr< Unlocker >( new Unlocker ); }
+
+ void setScopeInitCallback( void ( *func )( Scope & ) ) { _scopeInitCallback = func; }
+ static void setConnectCallback( void ( *func )( DBClientWithCommands& ) ) { _connectCallback = func; }
+ static void runConnectCallback( DBClientWithCommands &c ) {
+ if ( _connectCallback )
+ _connectCallback( c );
+ }
+
+ // engine implementation may either respond to interrupt events or
+ // poll for interrupts
+
+ // the interrupt functions must not wait indefinitely on a lock
+ virtual void interrupt( unsigned opSpec ) {}
+ virtual void interruptAll() {}
+
+ static void setGetInterruptSpecCallback( unsigned ( *func )() ) { _getInterruptSpecCallback = func; }
+ static bool haveGetInterruptSpecCallback() { return _getInterruptSpecCallback; }
+ static unsigned getInterruptSpec() {
+ massert( 13474, "no _getInterruptSpecCallback", _getInterruptSpecCallback );
+ return _getInterruptSpecCallback();
+ }
+
+ static void setCheckInterruptCallback( const char * ( *func )() ) { _checkInterruptCallback = func; }
+ static bool haveCheckInterruptCallback() { return _checkInterruptCallback; }
+ static const char * checkInterrupt() {
+ return _checkInterruptCallback ? _checkInterruptCallback() : "";
+ }
+ static bool interrupted() {
+ const char *r = checkInterrupt();
+ return r && r[ 0 ];
+ }
+
+ protected:
+ virtual Scope * createScope() = 0;
+
+ private:
+ void ( *_scopeInitCallback )( Scope & );
+ static void ( *_connectCallback )( DBClientWithCommands & );
+ static const char * ( *_checkInterruptCallback )();
+ static unsigned ( *_getInterruptSpecCallback )();
+ };
+
+ bool hasJSReturn( const string& s );
+
+ const char * jsSkipWhiteSpace( const char * raw );
+
+ extern ScriptEngine * globalScriptEngine;
+}
diff --git a/src/mongo/scripting/engine_java.cpp b/src/mongo/scripting/engine_java.cpp
new file mode 100644
index 00000000000..57388166e98
--- /dev/null
+++ b/src/mongo/scripting/engine_java.cpp
@@ -0,0 +1,764 @@
+// java.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "pch.h"
+#include "engine_java.h"
+#include <iostream>
+#include <map>
+#include <list>
+
+#include "../db/jsobj.h"
+#include "../db/db.h"
+
+using namespace boost::filesystem;
+
+namespace mongo {
+
+//#define JNI_DEBUG 1
+
+#ifdef JNI_DEBUG
+#undef JNI_DEBUG
+#define JNI_DEBUG(x) cout << x << endl
+#else
+#undef JNI_DEBUG
+#define JNI_DEBUG(x)
+#endif
+
+} // namespace mongo
+
+
+
+#include "../util/net/message.h"
+#include "../db/db.h"
+
+using namespace std;
+
+namespace mongo {
+
+#if defined(_WIN32)
+ /* [dm] this being undefined without us adding it here means there is
+ no tss cleanup on windows for boost lib?
+ we don't care for now esp on windows only
+
+ the boost source says:
+
+ This function's sole purpose is to cause a link error in cases where
+ automatic tss cleanup is not implemented by Boost.Threads as a
+ reminder that user code is responsible for calling the necessary
+ functions at the appropriate times (and for implementing a
+ tss_cleanup_implemented() function to eliminate the linker's
+ missing symbol error).
+
+ If Boost.Threads later implements automatic tss cleanup in cases
+ where it currently doesn't (which is the plan), the duplicate
+ symbol error will warn the user that their custom solution is no
+ longer needed and can be removed.
+ */
+ extern "C" void tss_cleanup_implemented(void) {
+ //out() << "tss_cleanup_implemented called" << endl;
+ }
+#endif
+
+ JavaJSImpl * JavaJS = 0;
+ extern string dbExecCommand;
+
+#if !defined(NOJNI)
+
+ void myJNIClean( JNIEnv * env ) {
+ JavaJS->detach( env );
+ }
+
+#if defined(_WIN32)
+ const char SYSTEM_COLON = ';';
+#else
+ const char SYSTEM_COLON = ':';
+#endif
+
+
+ void _addClassPath( const char * ed , stringstream & ss , const char * subdir ) {
+ path includeDir(ed);
+ includeDir /= subdir;
+ directory_iterator end;
+ try {
+ directory_iterator i(includeDir);
+ while ( i != end ) {
+ path p = *i;
+ ss << SYSTEM_COLON << p.string();
+ i++;
+ }
+ }
+ catch (...) {
+ problem() << "exception looking for ed class path includeDir: " << includeDir.string() << endl;
+ sleepsecs(3);
+ dbexit( EXIT_JAVA );
+ }
+ }
+
+
+ JavaJSImpl::JavaJSImpl(const char *appserverPath) {
+ _jvm = 0;
+ _mainEnv = 0;
+ _dbhook = 0;
+
+ stringstream ss;
+ string edTemp;
+
+ const char * ed = 0;
+ ss << "-Djava.class.path=.";
+
+ if ( appserverPath ) {
+ ed = findEd(appserverPath);
+ assert( ed );
+
+ ss << SYSTEM_COLON << ed << "/build/";
+
+ _addClassPath( ed , ss , "include" );
+ _addClassPath( ed , ss , "include/jython/" );
+ _addClassPath( ed , ss , "include/jython/javalib" );
+ }
+ else {
+ const string jars = findJars();
+ _addClassPath( jars.c_str() , ss , "jars" );
+
+ edTemp += (string)jars + "/jars/mongojs-js.jar";
+ ed = edTemp.c_str();
+ }
+
+
+
+#if defined(_WIN32)
+ ss << SYSTEM_COLON << "C:\\Program Files\\Java\\jdk\\lib\\tools.jar";
+#else
+ ss << SYSTEM_COLON << "/opt/java/lib/tools.jar";
+#endif
+
+ if ( getenv( "CLASSPATH" ) )
+ ss << SYSTEM_COLON << getenv( "CLASSPATH" );
+
+ string s = ss.str();
+ char * p = (char *)malloc( s.size() * 4 );
+ strcpy( p , s.c_str() );
+ char *q = p;
+#if defined(_WIN32)
+ while ( *p ) {
+ if ( *p == '/' ) *p = '\\';
+ p++;
+ }
+#endif
+
+ log(1) << "classpath: " << q << endl;
+
+ JavaVMOption * options = new JavaVMOption[4];
+ options[0].optionString = q;
+ options[1].optionString = (char*)"-Djava.awt.headless=true";
+ options[2].optionString = (char*)"-Xmx300m";
+
+ // Prevent JVM from using async signals internally, since on linux the pre-installed handlers for these
+ // signals don't seem to be respected by JNI.
+ options[3].optionString = (char*)"-Xrs";
+ // -Xcheck:jni
+
+ _vmArgs = new JavaVMInitArgs();
+ _vmArgs->version = JNI_VERSION_1_4;
+ _vmArgs->options = options;
+ _vmArgs->nOptions = 4;
+ _vmArgs->ignoreUnrecognized = JNI_FALSE;
+
+ log(1) << "loading JVM" << endl;
+ jint res = JNI_CreateJavaVM( &_jvm, (void**)&_mainEnv, _vmArgs );
+
+ if ( res ) {
+ log() << "using classpath: " << q << endl;
+ log()
+ << " res : " << (unsigned) res << " "
+ << "_jvm : " << _jvm << " "
+ << "_env : " << _mainEnv << " "
+ << endl;
+ problem() << "Couldn't create JVM res:" << (int) res << " terminating" << endl;
+ log() << "(try --nojni if you do not require that functionality)" << endl;
+ exit(22);
+ }
+ jassert( res == 0 );
+ jassert( _jvm > 0 );
+ jassert( _mainEnv > 0 );
+
+ _envs = new boost::thread_specific_ptr<JNIEnv>( myJNIClean );
+ assert( ! _envs->get() );
+ _envs->reset( _mainEnv );
+
+ _dbhook = findClass( "ed/db/JSHook" );
+ if ( _dbhook == 0 ) {
+ log() << "using classpath: " << q << endl;
+ printException();
+ }
+ jassert( _dbhook );
+
+ if ( ed ) {
+ jmethodID init = _mainEnv->GetStaticMethodID( _dbhook , "init" , "(Ljava/lang/String;)V" );
+ jassert( init );
+ _mainEnv->CallStaticVoidMethod( _dbhook , init , _getEnv()->NewStringUTF( ed ) );
+ }
+
+ _dbjni = findClass( "ed/db/DBJni" );
+ jassert( _dbjni );
+
+ _scopeCreate = _mainEnv->GetStaticMethodID( _dbhook , "scopeCreate" , "()J" );
+ _scopeInit = _mainEnv->GetStaticMethodID( _dbhook , "scopeInit" , "(JLjava/nio/ByteBuffer;)Z" );
+ _scopeSetThis = _mainEnv->GetStaticMethodID( _dbhook , "scopeSetThis" , "(JLjava/nio/ByteBuffer;)Z" );
+ _scopeReset = _mainEnv->GetStaticMethodID( _dbhook , "scopeReset" , "(J)Z" );
+ _scopeFree = _mainEnv->GetStaticMethodID( _dbhook , "scopeFree" , "(J)V" );
+
+ _scopeGetNumber = _mainEnv->GetStaticMethodID( _dbhook , "scopeGetNumber" , "(JLjava/lang/String;)D" );
+ _scopeGetString = _mainEnv->GetStaticMethodID( _dbhook , "scopeGetString" , "(JLjava/lang/String;)Ljava/lang/String;" );
+ _scopeGetBoolean = _mainEnv->GetStaticMethodID( _dbhook , "scopeGetBoolean" , "(JLjava/lang/String;)Z" );
+ _scopeGetType = _mainEnv->GetStaticMethodID( _dbhook , "scopeGetType" , "(JLjava/lang/String;)B" );
+ _scopeGetObject = _mainEnv->GetStaticMethodID( _dbhook , "scopeGetObject" , "(JLjava/lang/String;Ljava/nio/ByteBuffer;)I" );
+ _scopeGuessObjectSize = _mainEnv->GetStaticMethodID( _dbhook , "scopeGuessObjectSize" , "(JLjava/lang/String;)J" );
+
+ _scopeSetNumber = _mainEnv->GetStaticMethodID( _dbhook , "scopeSetNumber" , "(JLjava/lang/String;D)Z" );
+ _scopeSetBoolean = _mainEnv->GetStaticMethodID( _dbhook , "scopeSetBoolean" , "(JLjava/lang/String;Z)Z" );
+ _scopeSetString = _mainEnv->GetStaticMethodID( _dbhook , "scopeSetString" , "(JLjava/lang/String;Ljava/lang/String;)Z" );
+ _scopeSetObject = _mainEnv->GetStaticMethodID( _dbhook , "scopeSetObject" , "(JLjava/lang/String;Ljava/nio/ByteBuffer;)Z" );
+
+ _functionCreate = _mainEnv->GetStaticMethodID( _dbhook , "functionCreate" , "(Ljava/lang/String;)J" );
+ _invoke = _mainEnv->GetStaticMethodID( _dbhook , "invoke" , "(JJ)I" );
+
+ jassert( _scopeCreate );
+ jassert( _scopeInit );
+ jassert( _scopeSetThis );
+ jassert( _scopeReset );
+ jassert( _scopeFree );
+
+ jassert( _scopeGetNumber );
+ jassert( _scopeGetString );
+ jassert( _scopeGetObject );
+ jassert( _scopeGetBoolean );
+ jassert( _scopeGetType );
+ jassert( _scopeGuessObjectSize );
+
+ jassert( _scopeSetNumber );
+ jassert( _scopeSetBoolean );
+ jassert( _scopeSetString );
+ jassert( _scopeSetObject );
+
+ jassert( _functionCreate );
+ jassert( _invoke );
+
+ JNINativeMethod * nativeSay = new JNINativeMethod();
+ nativeSay->name = (char*)"native_say";
+ nativeSay->signature = (char*)"(Ljava/nio/ByteBuffer;)V";
+ nativeSay->fnPtr = (void*)java_native_say;
+ _mainEnv->RegisterNatives( _dbjni , nativeSay , 1 );
+
+
+ JNINativeMethod * nativeCall = new JNINativeMethod();
+ nativeCall->name = (char*)"native_call";
+ nativeCall->signature = (char*)"(Ljava/nio/ByteBuffer;Ljava/nio/ByteBuffer;)I";
+ nativeCall->fnPtr = (void*)java_native_call;
+ _mainEnv->RegisterNatives( _dbjni , nativeCall , 1 );
+
+ }
+
+ JavaJSImpl::~JavaJSImpl() {
+ if ( _jvm ) {
+ _jvm->DestroyJavaVM();
+ cout << "Destroying JVM" << endl;
+ }
+ }
+
+// scope
+
+ jlong JavaJSImpl::scopeCreate() {
+ return _getEnv()->CallStaticLongMethod( _dbhook , _scopeCreate );
+ }
+
+ jboolean JavaJSImpl::scopeReset( jlong id ) {
+ return _getEnv()->CallStaticBooleanMethod( _dbhook , _scopeReset , id );
+ }
+
+ void JavaJSImpl::scopeFree( jlong id ) {
+ _getEnv()->CallStaticVoidMethod( _dbhook , _scopeFree , id );
+ }
+
+// scope setters
+
+ int JavaJSImpl::scopeSetBoolean( jlong id , const char * field , jboolean val ) {
+ jstring fieldString = _getEnv()->NewStringUTF( field );
+ int res = _getEnv()->CallStaticBooleanMethod( _dbhook , _scopeSetBoolean , id , fieldString , val );
+ _getEnv()->DeleteLocalRef( fieldString );
+ return res;
+ }
+
+ int JavaJSImpl::scopeSetNumber( jlong id , const char * field , double val ) {
+ jstring fieldString = _getEnv()->NewStringUTF( field );
+ int res = _getEnv()->CallStaticBooleanMethod( _dbhook , _scopeSetNumber , id , fieldString , val );
+ _getEnv()->DeleteLocalRef( fieldString );
+ return res;
+ }
+
+ int JavaJSImpl::scopeSetString( jlong id , const char * field , const char * val ) {
+ jstring s1 = _getEnv()->NewStringUTF( field );
+ jstring s2 = _getEnv()->NewStringUTF( val );
+ int res = _getEnv()->CallStaticBooleanMethod( _dbhook , _scopeSetString , id , s1 , s2 );
+ _getEnv()->DeleteLocalRef( s1 );
+ _getEnv()->DeleteLocalRef( s2 );
+ return res;
+ }
+
+ int JavaJSImpl::scopeSetObject( jlong id , const char * field , const BSONObj * obj ) {
+ jobject bb = 0;
+ if ( obj ) {
+ bb = _getEnv()->NewDirectByteBuffer( (void*)(obj->objdata()) , (jlong)(obj->objsize()) );
+ jassert( bb );
+ }
+
+ jstring s1 = _getEnv()->NewStringUTF( field );
+ int res = _getEnv()->CallStaticBooleanMethod( _dbhook , _scopeSetObject , id , s1 , bb );
+ _getEnv()->DeleteLocalRef( s1 );
+ if ( bb )
+ _getEnv()->DeleteLocalRef( bb );
+
+ return res;
+ }
+
+ int JavaJSImpl::scopeInit( jlong id , const BSONObj * obj ) {
+ if ( ! obj )
+ return 0;
+
+ jobject bb = _getEnv()->NewDirectByteBuffer( (void*)(obj->objdata()) , (jlong)(obj->objsize()) );
+ jassert( bb );
+
+ int res = _getEnv()->CallStaticBooleanMethod( _dbhook , _scopeInit , id , bb );
+ _getEnv()->DeleteLocalRef( bb );
+ return res;
+ }
+
+ int JavaJSImpl::scopeSetThis( jlong id , const BSONObj * obj ) {
+ if ( ! obj )
+ return 0;
+
+ jobject bb = _getEnv()->NewDirectByteBuffer( (void*)(obj->objdata()) , (jlong)(obj->objsize()) );
+ jassert( bb );
+
+ int res = _getEnv()->CallStaticBooleanMethod( _dbhook , _scopeSetThis , id , bb );
+ _getEnv()->DeleteLocalRef( bb );
+ return res;
+ }
+
+// scope getters
+
+ char JavaJSImpl::scopeGetType( jlong id , const char * field ) {
+ jstring s1 = _getEnv()->NewStringUTF( field );
+ int res =_getEnv()->CallStaticByteMethod( _dbhook , _scopeGetType , id , s1 );
+ _getEnv()->DeleteLocalRef( s1 );
+ return res;
+ }
+
+ double JavaJSImpl::scopeGetNumber( jlong id , const char * field ) {
+ jstring s1 = _getEnv()->NewStringUTF( field );
+ double res = _getEnv()->CallStaticDoubleMethod( _dbhook , _scopeGetNumber , id , s1 );
+ _getEnv()->DeleteLocalRef( s1 );
+ return res;
+ }
+
+ jboolean JavaJSImpl::scopeGetBoolean( jlong id , const char * field ) {
+ jstring s1 = _getEnv()->NewStringUTF( field );
+ jboolean res = _getEnv()->CallStaticBooleanMethod( _dbhook , _scopeGetBoolean , id , s1 );
+ _getEnv()->DeleteLocalRef( s1 );
+ return res;
+ }
+
+ string JavaJSImpl::scopeGetString( jlong id , const char * field ) {
+ jstring s1 = _getEnv()->NewStringUTF( field );
+ jstring s = (jstring)_getEnv()->CallStaticObjectMethod( _dbhook , _scopeGetString , id , s1 );
+ _getEnv()->DeleteLocalRef( s1 );
+
+ if ( ! s )
+ return "";
+
+ const char * c = _getEnv()->GetStringUTFChars( s , 0 );
+ string retStr(c);
+ _getEnv()->ReleaseStringUTFChars( s , c );
+ return retStr;
+ }
+
+ BSONObj JavaJSImpl::scopeGetObject( jlong id , const char * field ) {
+ jstring s1 = _getEnv()->NewStringUTF( field );
+ int guess = _getEnv()->CallStaticIntMethod( _dbhook , _scopeGuessObjectSize , id , s1 );
+
+ if ( guess == 0 ) {
+ _getEnv()->DeleteLocalRef( s1 );
+ return BSONObj();
+ }
+
+ BSONObj::Holder* holder = (BSONObj::Holder*) malloc(guess + sizeof(unsigned));
+ holder->zero();
+
+ jobject bb = _getEnv()->NewDirectByteBuffer( (void*)holder->data , guess );
+ jassert( bb );
+
+ int len = _getEnv()->CallStaticIntMethod( _dbhook , _scopeGetObject , id , s1 , bb );
+ _getEnv()->DeleteLocalRef( bb );
+ _getEnv()->DeleteLocalRef( s1 );
+ jassert( len > 0 && len < guess );
+
+ BSONObj obj(holder);
+ assert( obj.objsize() <= guess );
+ return obj;
+ }
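+
+ // scopeGetObject is a two-step protocol: first ask Java for an upper bound on
+ // the serialized size (_scopeGuessObjectSize), then hand it a direct buffer of
+ // that size to fill (_scopeGetObject). The jassert on len is what enforces
+ // that the guess really was an upper bound.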
+
+// other
+
+ jlong JavaJSImpl::functionCreate( const char * code ) {
+ jstring s = _getEnv()->NewStringUTF( code );
+ jassert( s );
+ jlong id = _getEnv()->CallStaticLongMethod( _dbhook , _functionCreate , s );
+ _getEnv()->DeleteLocalRef( s );
+ return id;
+ }
+
+ int JavaJSImpl::invoke( jlong scope , jlong function ) {
+ return _getEnv()->CallStaticIntMethod( _dbhook , _invoke , scope , function );
+ }
+
+// --- fun run method
+
+ void JavaJSImpl::run( const char * js ) {
+ jclass c = findClass( "ed/js/JS" );
+ jassert( c );
+
+ jmethodID m = _getEnv()->GetStaticMethodID( c , "eval" , "(Ljava/lang/String;)Ljava/lang/Object;" );
+ jassert( m );
+
+ jstring s = _getEnv()->NewStringUTF( js );
+ log() << _getEnv()->CallStaticObjectMethod( c , m , s ) << endl;
+ _getEnv()->DeleteLocalRef( s );
+ }
+
+ void JavaJSImpl::printException() {
+ jthrowable exc = _getEnv()->ExceptionOccurred();
+ if ( exc ) {
+ _getEnv()->ExceptionDescribe();
+ _getEnv()->ExceptionClear();
+ }
+
+ }
+
+ JNIEnv * JavaJSImpl::_getEnv() {
+ JNIEnv * env = _envs->get();
+ if ( env )
+ return env;
+
+ int res = _jvm->AttachCurrentThread( (void**)&env , (void*)&_vmArgs );
+ if ( res ) {
+ out() << "ERROR javajs attachcurrentthread fails res:" << res << '\n';
+ assert(false);
+ }
+
+ _envs->reset( env );
+ return env;
+ }
+
+ Scope * JavaJSImpl::createScope() {
+ return new JavaScope();
+ }
+
+ void ScriptEngine::setup() {
+ if ( ! JavaJS ) {
+ JavaJS = new JavaJSImpl();
+ globalScriptEngine = JavaJS;
+ }
+ }
+
+ void jasserted(const char *msg, const char *file, unsigned line) {
+ log() << "jassert failed " << msg << " " << file << " " << line << endl;
+ if ( JavaJS ) JavaJS->printException();
+ throw AssertionException();
+ }
+
+
+ const char* findEd(const char *path) {
+
+#if defined(_WIN32)
+
+ if (!path) {
+ path = findEd();
+ }
+
+ // @TODO check validity
+
+ return path;
+#else
+
+ if (!path) {
+ return findEd();
+ }
+
+ log() << "Appserver location specified : " << path << endl;
+
+ DIR *testDir = opendir(path);
+
+ if (testDir) {
+ log(1) << " found directory for appserver : " << path << endl;
+ closedir(testDir);
+ return path;
+ }
+ else {
+ log() << " ERROR : not a directory for specified appserver location : " << path << " - prepare for bus error" << endl;
+ return 0;
+ }
+#endif
+ }
+
+ const char * findEd() {
+
+#if defined(_WIN32)
+ log() << "Appserver location will be WIN32 default : c:/l/ed/" << endl;
+ return "c:/l/ed";
+#else
+
+ static list<const char*> possibleEdDirs;
+ if ( ! possibleEdDirs.size() ) {
+ possibleEdDirs.push_back( "../../ed/ed/" ); // this one for dwight dev box
+ possibleEdDirs.push_back( "../ed/" );
+ possibleEdDirs.push_back( "../../ed/" );
+ possibleEdDirs.push_back( "../babble/" );
+ possibleEdDirs.push_back( "../../babble/" );
+ }
+
+ for ( list<const char*>::iterator i = possibleEdDirs.begin() ; i != possibleEdDirs.end(); i++ ) {
+ const char * temp = *i;
+ DIR * test = opendir( temp );
+ if ( ! test )
+ continue;
+
+ closedir( test );
+ log(1) << "found directory for appserver : " << temp << endl;
+ return temp;
+ }
+
+ return 0;
+#endif
+ }
+
+ const string findJars() {
+
+ static list<string> possible;
+ if ( ! possible.size() ) {
+ possible.push_back( "./" );
+ possible.push_back( "../" );
+
+ log(2) << "dbExecCommand: " << dbExecCommand << endl;
+
+ string dbDir = dbExecCommand;
+#ifdef WIN32
+ if ( dbDir.find( "\\" ) != string::npos ) {
+ dbDir = dbDir.substr( 0 , dbDir.find_last_of( "\\" ) );
+ }
+ else {
+ dbDir = ".";
+ }
+#else
+ if ( dbDir.find( "/" ) != string::npos ) {
+ dbDir = dbDir.substr( 0 , dbDir.find_last_of( "/" ) );
+ }
+ else {
+ bool found = false;
+
+ if ( getenv( "PATH" ) ) {
+ string s = getenv( "PATH" );
+ s += ":";
+ pcrecpp::StringPiece input( s );
+ string dir;
+ pcrecpp::RE re("(.*?):");
+ while ( re.Consume( &input, &dir ) ) {
+ string test = dir + "/" + dbExecCommand;
+ if ( boost::filesystem::exists( test ) ) {
+ while ( boost::filesystem::symbolic_link_exists( test ) ) {
+ char tmp[2048];
+ int len = readlink( test.c_str() , tmp , sizeof(tmp) - 1 );
+ if ( len < 0 )
+ break;
+ tmp[len] = 0;
+ log(5) << " symlink " << test << " -->> " << tmp << endl;
+ test = tmp;
+
+ dir = test.substr( 0 , test.rfind( "/" ) );
+ }
+ dbDir = dir;
+ found = true;
+ break;
+ }
+ }
+ }
+
+ if ( ! found )
+ dbDir = ".";
+ }
+#endif
+
+ log(2) << "dbDir [" << dbDir << "]" << endl;
+ possible.push_back( ( dbDir + "/../lib/mongo/" ));
+ possible.push_back( ( dbDir + "/../lib64/mongo/" ));
+ possible.push_back( ( dbDir + "/../lib32/mongo/" ));
+ possible.push_back( ( dbDir + "/" ));
+ possible.push_back( ( dbDir + "/lib64/mongo/" ));
+ possible.push_back( ( dbDir + "/lib32/mongo/" ));
+ }
+
+ for ( list<string>::iterator i = possible.begin() ; i != possible.end(); i++ ) {
+ const string temp = *i;
+ const string jarDir = ((string)temp) + "jars/";
+
+ log(5) << "possible jarDir [" << jarDir << "]" << endl;
+
+ path p(jarDir );
+ if ( ! boost::filesystem::exists( p) )
+ continue;
+
+ log(1) << "found directory for jars : " << jarDir << endl;
+ return temp;
+ }
+
+ problem() << "ERROR : can't find directory for jars - terminating" << endl;
+ exit(44);
+ return "";
+
+ }
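+
+ // findJars above falls back to scanning $PATH when dbExecCommand carries no
+ // directory component: the pcre loop peels one ":"-terminated entry per
+ // Consume() call (e.g. "/usr/bin:/bin:" yields "/usr/bin" then "/bin"),
+ // resolving symlinks so the jars are located relative to the real binary.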
+
+
+// ---
+
+ JNIEXPORT void JNICALL java_native_say(JNIEnv * env , jclass, jobject outBuffer ) {
+ JNI_DEBUG( "native say called!" );
+
+ Message out( env->GetDirectBufferAddress( outBuffer ) , false );
+ Message in;
+
+ jniCallback( out , in );
+ assert( ! out.doIFreeIt() );
+ curNs = 0;
+ }
+
+ JNIEXPORT jint JNICALL java_native_call(JNIEnv * env , jclass, jobject outBuffer , jobject inBuffer ) {
+ JNI_DEBUG( "native call called!" );
+
+ Message out( env->GetDirectBufferAddress( outBuffer ) , false );
+ Message in;
+
+ jniCallback( out , in );
+ curNs = 0;
+
+ JNI_DEBUG( "in.data : " << in.data );
+ if ( in.data && in.data->len > 0 ) {
+ JNI_DEBUG( "copying data of len :" << in.data->len );
+ assert( env->GetDirectBufferCapacity( inBuffer ) >= in.data->len );
+ memcpy( env->GetDirectBufferAddress( inBuffer ) , in.data , in.data->len );
+
+ assert( ! out.doIFreeIt() );
+ assert( in.doIFreeIt() );
+ return in.data->len;
+ }
+
+ return 0;
+ }
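+
+ // Both JNI entry points above bridge Java direct ByteBuffers to the server's
+ // Message type: say() is fire-and-forget, while call() copies the reply into
+ // the caller-supplied inBuffer and returns its length (0 when there is no
+ // reply). The GetDirectBufferCapacity assert is what guards the memcpy.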
+
+// ----
+
+ void JavaJSImpl::runTest() {
+
+ const int debug = 0;
+
+ JavaJSImpl& JavaJS = *mongo::JavaJS;
+
+ jlong scope = JavaJS.scopeCreate();
+ jassert( scope );
+ if ( debug ) out() << "got scope" << endl;
+
+
+ jlong func1 = JavaJS.functionCreate( "foo = 5.6; bar = \"eliot\"; abc = { foo : 517 }; " );
+ jassert( ! JavaJS.invoke( scope , func1 ) );
+
+
+ if ( debug ) out() << "func3 start" << endl;
+ jlong func3 = JavaJS.functionCreate( "function(){ z = true; } " );
+ jassert( func3 );
+ jassert( ! JavaJS.invoke( scope , func3 ) );
+ jassert( JavaJS.scopeGetBoolean( scope , "z" ) );
+ if ( debug ) out() << "func3 done" << endl;
+
+ if ( debug ) out() << "going to get object" << endl;
+ BSONObj obj = JavaJS.scopeGetObject( scope , "abc" );
+ if ( debug ) out() << "done getting object" << endl;
+
+ if ( debug ) {
+ out() << "obj : " << obj.toString() << endl;
+ }
+
+ {
+ time_t start = time(0);
+ for ( int i=0; i<5000; i++ ) {
+ JavaJS.scopeSetObject( scope , "obj" , &obj );
+ }
+ time_t end = time(0);
+
+ if ( debug )
+ out() << "time : " << (unsigned) ( end - start ) << endl;
+ }
+
+ if ( debug ) out() << "func4 start" << endl;
+ JavaJS.scopeSetObject( scope , "obj" , &obj );
+ if ( debug ) out() << "\t here 1" << endl;
+ jlong func4 = JavaJS.functionCreate( "tojson( obj );" );
+ if ( debug ) out() << "\t here 2" << endl;
+ jassert( ! JavaJS.invoke( scope , func4 ) );
+ if ( debug ) out() << "func4 end" << endl;
+
+ if ( debug ) out() << "func5 start" << endl;
+ jassert( JavaJS.scopeSetObject( scope , "c" , &obj ) );
+ jlong func5 = JavaJS.functionCreate( "assert.eq( 517 , c.foo );" );
+ jassert( func5 );
+ jassert( ! JavaJS.invoke( scope , func5 ) );
+ if ( debug ) out() << "func5 done" << endl;
+
+ if ( debug ) out() << "func6 start" << endl;
+ for ( int i=0; i<100; i++ ) {
+ double val = i + 5;
+ JavaJS.scopeSetNumber( scope , "zzz" , val );
+ jlong func6 = JavaJS.functionCreate( " xxx = zzz; " );
+ jassert( ! JavaJS.invoke( scope , func6 ) );
+ double n = JavaJS.scopeGetNumber( scope , "xxx" );
+ jassert( val == n );
+ }
+ if ( debug ) out() << "func6 done" << endl;
+
+ jlong func7 = JavaJS.functionCreate( "return 11;" );
+ jassert( ! JavaJS.invoke( scope , func7 ) );
+ assert( 11 == JavaJS.scopeGetNumber( scope , "return" ) );
+
+ scope = JavaJS.scopeCreate();
+ jlong func8 = JavaJS.functionCreate( "function(){ return 12; }" );
+ jassert( ! JavaJS.invoke( scope , func8 ) );
+ assert( 12 == JavaJS.scopeGetNumber( scope , "return" ) );
+
+ }
+
+#endif
+
+} // namespace mongo
diff --git a/src/mongo/scripting/engine_java.h b/src/mongo/scripting/engine_java.h
new file mode 100644
index 00000000000..b8245ba6f22
--- /dev/null
+++ b/src/mongo/scripting/engine_java.h
@@ -0,0 +1,223 @@
+// engine_java.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* this file contains code to call into java (into the 10gen sandbox) from inside the database */
+
+#pragma once
+
+#include "../pch.h"
+
+#include <jni.h>
+#include <errno.h>
+#include <sys/types.h>
+
+#if !defined(_WIN32)
+#include <dirent.h>
+#endif
+
+#include "../db/jsobj.h"
+
+#include "engine.h"
+
+namespace mongo {
+
+ void jasserted(const char *msg, const char *file, unsigned line);
+#define jassert(_Expression) if ( ! ( _Expression ) ){ jasserted(#_Expression, __FILE__, __LINE__); }
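+
+// jassert logs the failed expression, dumps any pending Java exception via
+// printException(), and throws AssertionException; it never aborts the
+// process. The macro expands to a bare if-statement, so brace it inside any
+// surrounding if/else, e.g.:
+//
+// if ( ok ) { jassert( bb ); } else { ... }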
+
+ const char * findEd();
+ const char * findEd(const char *);
+ const string findJars();
+
+ class BSONObj;
+
+ class JavaJSImpl : public ScriptEngine {
+ public:
+ JavaJSImpl(const char * = 0);
+ ~JavaJSImpl();
+
+ jlong scopeCreate();
+ int scopeInit( jlong id , const BSONObj * obj );
+ int scopeSetThis( jlong id , const BSONObj * obj );
+ jboolean scopeReset( jlong id );
+ void scopeFree( jlong id );
+
+ double scopeGetNumber( jlong id , const char * field );
+ string scopeGetString( jlong id , const char * field );
+ jboolean scopeGetBoolean( jlong id , const char * field );
+ BSONObj scopeGetObject( jlong id , const char * field );
+ char scopeGetType( jlong id , const char * field );
+
+ int scopeSetNumber( jlong id , const char * field , double val );
+ int scopeSetString( jlong id , const char * field , const char * val );
+ int scopeSetObject( jlong id , const char * field , const BSONObj * obj );
+ int scopeSetBoolean( jlong id , const char * field , jboolean val );
+
+ jlong functionCreate( const char * code );
+
+ /* return values:
+ public static final int NO_SCOPE = -1;
+ public static final int NO_FUNCTION = -2;
+ public static final int INVOKE_ERROR = -3;
+ public static final int INVOKE_SUCCESS = 0;
+ */
+ int invoke( jlong scope , jlong function );
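+
+ /* Typical call sequence (a sketch; mirrors runTest() in engine_java.cpp):
+ jlong scope = JavaJS->scopeCreate();
+ jlong f = JavaJS->functionCreate( "return 11;" );
+ int rc = JavaJS->invoke( scope , f ); // 0 == INVOKE_SUCCESS
+ double r = JavaJS->scopeGetNumber( scope , "return" ); // 11
+ */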
+
+ void printException();
+
+ void run( const char * js );
+
+ void detach( JNIEnv * env ) {
+ _jvm->DetachCurrentThread();
+ }
+
+ Scope * createScope();
+
+ void runTest();
+ private:
+
+ jobject create( const char * name ) {
+ jclass c = findClass( name );
+ if ( ! c )
+ return 0;
+
+ jmethodID cons = _getEnv()->GetMethodID( c , "<init>" , "()V" );
+ if ( ! cons )
+ return 0;
+
+ return _getEnv()->NewObject( c , cons );
+ }
+
+ jclass findClass( const char * name ) {
+ return _getEnv()->FindClass( name );
+ }
+
+
+ private:
+
+ JNIEnv * _getEnv();
+
+ JavaVM * _jvm;
+ JNIEnv * _mainEnv;
+ JavaVMInitArgs * _vmArgs;
+
+ boost::thread_specific_ptr<JNIEnv> * _envs;
+
+ jclass _dbhook;
+ jclass _dbjni;
+
+ jmethodID _scopeCreate;
+ jmethodID _scopeInit;
+ jmethodID _scopeSetThis;
+ jmethodID _scopeReset;
+ jmethodID _scopeFree;
+
+ jmethodID _scopeGetNumber;
+ jmethodID _scopeGetString;
+ jmethodID _scopeGetObject;
+ jmethodID _scopeGetBoolean;
+ jmethodID _scopeGuessObjectSize;
+ jmethodID _scopeGetType;
+
+ jmethodID _scopeSetNumber;
+ jmethodID _scopeSetString;
+ jmethodID _scopeSetObject;
+ jmethodID _scopeSetBoolean;
+
+ jmethodID _functionCreate;
+
+ jmethodID _invoke;
+
+ };
+
+ extern JavaJSImpl *JavaJS;
+
+// a javascript "scope"
+ class JavaScope : public Scope {
+ public:
+ JavaScope() {
+ s = JavaJS->scopeCreate();
+ }
+ virtual ~JavaScope() {
+ JavaJS->scopeFree(s);
+ s = 0;
+ }
+ void reset() {
+ JavaJS->scopeReset(s);
+ }
+
+ void init( BSONObj * o ) {
+ JavaJS->scopeInit( s , o );
+ }
+
+ void localConnect( const char * dbName ) {
+ setString("$client", dbName );
+ }
+
+ double getNumber(const char *field) {
+ return JavaJS->scopeGetNumber(s,field);
+ }
+ string getString(const char *field) {
+ return JavaJS->scopeGetString(s,field);
+ }
+ bool getBoolean(const char *field) {
+ return JavaJS->scopeGetBoolean(s,field);
+ }
+ BSONObj getObject(const char *field ) {
+ return JavaJS->scopeGetObject(s,field);
+ }
+ int type(const char *field ) {
+ return JavaJS->scopeGetType(s,field);
+ }
+
+ void setThis( const BSONObj * obj ) {
+ JavaJS->scopeSetThis( s , obj );
+ }
+
+ void setNumber(const char *field, double val ) {
+ JavaJS->scopeSetNumber(s,field,val);
+ }
+ void setString(const char *field, const char * val ) {
+ JavaJS->scopeSetString(s,field,val);
+ }
+ void setObject(const char *field, const BSONObj& obj , bool readOnly ) {
+ uassert( 10211 , "only readOnly setObject supported in java" , readOnly );
+ JavaJS->scopeSetObject(s,field,&obj);
+ }
+ void setBoolean(const char *field, bool val ) {
+ JavaJS->scopeSetBoolean(s,field,val);
+ }
+
+ ScriptingFunction createFunction( const char * code ) {
+ return JavaJS->functionCreate( code );
+ }
+
+ int invoke( ScriptingFunction function , const BSONObj& args ) {
+ setObject( "args" , args , true );
+ return JavaJS->invoke(s,function);
+ }
+
+ string getError() {
+ return getString( "error" );
+ }
+
+ jlong s;
+ };
+
+ JNIEXPORT void JNICALL java_native_say(JNIEnv *, jclass, jobject outBuffer );
+ JNIEXPORT jint JNICALL java_native_call(JNIEnv *, jclass, jobject outBuffer , jobject inBuffer );
+
+} // namespace mongo
diff --git a/src/mongo/scripting/engine_none.cpp b/src/mongo/scripting/engine_none.cpp
new file mode 100644
index 00000000000..d13dbecc06e
--- /dev/null
+++ b/src/mongo/scripting/engine_none.cpp
@@ -0,0 +1,24 @@
+// engine_none.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "engine.h"
+
+namespace mongo {
+ void ScriptEngine::setup() {
+ // noop
+ }
+}
diff --git a/src/mongo/scripting/engine_spidermonkey.cpp b/src/mongo/scripting/engine_spidermonkey.cpp
new file mode 100644
index 00000000000..70b89cddbb5
--- /dev/null
+++ b/src/mongo/scripting/engine_spidermonkey.cpp
@@ -0,0 +1,1766 @@
+// engine_spidermonkey.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "engine_spidermonkey.h"
+#include "../client/dbclient.h"
+
+#ifndef _WIN32
+#include <boost/date_time/posix_time/posix_time.hpp>
+#undef assert
+#define assert MONGO_assert
+#endif
+
+#define smuassert( cx , msg , val ) \
+ if ( ! ( val ) ){ \
+ JS_ReportError( cx , msg ); \
+ return JS_FALSE; \
+ }
+
+#define CHECKNEWOBJECT(xx,ctx,w) \
+ if ( ! xx ){ \
+ massert(13072,(string)"JS_NewObject failed: " + w ,xx); \
+ }
+
+#define CHECKJSALLOC( newthing ) \
+ massert( 13615 , "JS allocation failed, either memory leak or using too much memory" , newthing )
+
+namespace mongo {
+
+ class InvalidUTF8Exception : public UserException {
+ public:
+ InvalidUTF8Exception() : UserException( 9006 , "invalid utf8" ) {
+ }
+ };
+
+ string trim( string s ) {
+ while ( s.size() && isspace( s[0] ) )
+ s = s.substr( 1 );
+
+ while ( s.size() && isspace( s[s.size()-1] ) )
+ s = s.substr( 0 , s.size() - 1 );
+
+ return s;
+ }
+
+ boost::thread_specific_ptr<SMScope> currentScope( dontDeleteScope );
+ boost::recursive_mutex &smmutex = *( new boost::recursive_mutex );
+#define smlock recursive_scoped_lock ___lk( smmutex );
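+
+ // SpiderMonkey contexts are not used thread-safely here, so every public
+ // SMScope entry point below takes this single process-wide recursive mutex
+ // before touching the JS API.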
+
+#define GETHOLDER(x,o) ((BSONHolder*)JS_GetPrivate( x , o ))
+
+ class BSONFieldIterator;
+
+ class BSONHolder {
+ public:
+
+ BSONHolder( BSONObj obj ) {
+ _obj = obj.getOwned();
+ _inResolve = false;
+ _modified = false;
+ _magic = 17;
+ }
+
+ ~BSONHolder() {
+ _magic = 18;
+ }
+
+ void check() {
+ uassert( 10212 , "holder magic value is wrong" , _magic == 17 && _obj.isValid() );
+ }
+
+ BSONFieldIterator * it();
+
+ BSONObj _obj;
+ bool _inResolve;
+ char _magic;
+ list<string> _extra;
+ set<string> _removed;
+ bool _modified;
+ };
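+
+ // BSONHolder is the private slot behind bson_class/bson_ro_class objects.
+ // Fields stay in the original BSON until resolveBSONField() lazily copies one
+ // into the JS object; _extra tracks properties added from JS, _removed tracks
+ // deletes, and _modified tells toObject() whether the original bytes can be
+ // returned untouched.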
+
+ class BSONFieldIterator {
+ public:
+
+ BSONFieldIterator( BSONHolder * holder ) {
+
+ set<string> added;
+
+ BSONObjIterator it( holder->_obj );
+ while ( it.more() ) {
+ BSONElement e = it.next();
+ if ( holder->_removed.count( e.fieldName() ) )
+ continue;
+ _names.push_back( e.fieldName() );
+ added.insert( e.fieldName() );
+ }
+
+ for ( list<string>::iterator i = holder->_extra.begin(); i != holder->_extra.end(); i++ ) {
+ if ( ! added.count( *i ) )
+ _names.push_back( *i );
+ }
+
+ _it = _names.begin();
+ }
+
+ bool more() {
+ return _it != _names.end();
+ }
+
+ string next() {
+ string s = *_it;
+ _it++;
+ return s;
+ }
+
+ private:
+ list<string> _names;
+ list<string>::iterator _it;
+ };
+
+ BSONFieldIterator * BSONHolder::it() {
+ return new BSONFieldIterator( this );
+ }
+
+ class TraverseStack {
+ public:
+ TraverseStack() {
+ _o = 0;
+ _parent = 0;
+ }
+
+ TraverseStack( JSObject * o , const TraverseStack * parent ) {
+ _o = o;
+ _parent = parent;
+ }
+
+ TraverseStack dive( JSObject * o ) const {
+ if ( o ) {
+ uassert( 13076 , (string)"recursive toObject" , ! has( o ) );
+ }
+ return TraverseStack( o , this );
+ }
+
+ int depth() const {
+ int d = 0;
+ const TraverseStack * s = _parent;
+ while ( s ) {
+ s = s->_parent;
+ d++;
+ }
+ return d;
+ }
+
+ bool isTop() const {
+ return _parent == 0;
+ }
+
+ bool has( JSObject * o ) const {
+ if ( ! o )
+ return false;
+ const TraverseStack * s = this;
+ while ( s ) {
+ if ( s->_o == o )
+ return true;
+ s = s->_parent;
+ }
+ return false;
+ }
+
+ JSObject * _o;
+ const TraverseStack * _parent;
+ };
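+
+ // TraverseStack guards toObject()/append() against reference cycles: each
+ // recursion dive()s with the current JSObject and uassert 13076 fires if that
+ // object already appears on the parent chain. Without it, converting the
+ // result of "x = {}; x.self = x;" would recurse forever.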
+
+ class Convertor : boost::noncopyable {
+ public:
+ Convertor( JSContext * cx ) {
+ _context = cx;
+ }
+
+ string toString( JSString * so ) {
+ jschar * s = JS_GetStringChars( so );
+ size_t srclen = JS_GetStringLength( so );
+ if( srclen == 0 )
+ return "";
+
+ size_t len = srclen * 6; // we only need *3, but see note on len below
+ char * dst = (char*)malloc( len );
+
+ len /= 2;
+ // doc re weird JS_EncodeCharacters api claims len expected in 16bit
+ // units, but experiments suggest 8bit units expected. We allocate
+ // enough memory that either will work.
+
+ if ( !JS_EncodeCharacters( _context , s , srclen , dst , &len) ) {
+ StringBuilder temp;
+ temp << "Not proper UTF-16: ";
+ for ( size_t i=0; i<srclen; i++ ) {
+ if ( i > 0 )
+ temp << ",";
+ temp << s[i];
+ }
+ uasserted( 13498 , temp.str() );
+ }
+
+ string ss( dst , len );
+ free( dst );
+ if ( !JS_CStringsAreUTF8() )
+ for( string::const_iterator i = ss.begin(); i != ss.end(); ++i )
+ uassert( 10213 , "non ascii character detected", (unsigned char)(*i) <= 127 );
+ return ss;
+ }
+
+ string toString( jsval v ) {
+ return toString( JS_ValueToString( _context , v ) );
+ }
+
+ // NOTE No validation of passed in object
+ long long toNumberLongUnsafe( JSObject *o ) {
+ boost::uint64_t val;
+ if ( hasProperty( o, "top" ) ) {
+ val =
+ ( (boost::uint64_t)(boost::uint32_t)getNumber( o , "top" ) << 32 ) +
+ ( boost::uint32_t)( getNumber( o , "bottom" ) );
+ }
+ else {
+ val = (boost::uint64_t)(boost::int64_t) getNumber( o, "floatApprox" );
+ }
+ return val;
+ }
+
+ int toNumberInt( JSObject *o ) {
+ return (boost::uint32_t)(boost::int32_t) getNumber( o, "floatApprox" );
+ }
+
+ double toNumber( jsval v ) {
+ double d;
+ uassert( 10214 , "not a number" , JS_ValueToNumber( _context , v , &d ) );
+ return d;
+ }
+
+ bool toBoolean( jsval v ) {
+ JSBool b;
+ assert( JS_ValueToBoolean( _context, v , &b ) );
+ return b;
+ }
+
+ OID toOID( jsval v ) {
+ JSContext * cx = _context;
+ assert( JSVAL_IS_OID( v ) );
+
+ JSObject * o = JSVAL_TO_OBJECT( v );
+ OID oid;
+ oid.init( getString( o , "str" ) );
+ return oid;
+ }
+
+ BSONObj toObject( JSObject * o , const TraverseStack& stack=TraverseStack() ) {
+ if ( ! o )
+ return BSONObj();
+
+ if ( JS_InstanceOf( _context , o , &bson_ro_class , 0 ) ) {
+ BSONHolder * holder = GETHOLDER( _context , o );
+ assert( holder );
+ return holder->_obj.getOwned();
+ }
+
+ BSONObj orig;
+ if ( JS_InstanceOf( _context , o , &bson_class , 0 ) ) {
+ BSONHolder * holder = GETHOLDER(_context,o);
+ assert( holder );
+ if ( ! holder->_modified ) {
+ return holder->_obj;
+ }
+ orig = holder->_obj;
+ }
+
+ BSONObjBuilder b;
+
+ if ( ! appendSpecialDBObject( this , b , "value" , OBJECT_TO_JSVAL( o ) , o ) ) {
+
+ if ( stack.isTop() ) {
+ jsval theid = getProperty( o , "_id" );
+ if ( ! JSVAL_IS_VOID( theid ) ) {
+ append( b , "_id" , theid , EOO , stack.dive( o ) );
+ }
+ }
+
+ JSIdArray * properties = JS_Enumerate( _context , o );
+ assert( properties );
+
+ for ( jsint i=0; i<properties->length; i++ ) {
+ jsid id = properties->vector[i];
+ jsval nameval;
+ assert( JS_IdToValue( _context ,id , &nameval ) );
+ string name = toString( nameval );
+ if ( stack.isTop() && name == "_id" )
+ continue;
+
+ append( b , name , getProperty( o , name.c_str() ) , orig[name].type() , stack.dive( o ) );
+ }
+
+ JS_DestroyIdArray( _context , properties );
+ }
+
+ return b.obj();
+ }
+
+ BSONObj toObject( jsval v ) {
+ if ( JSVAL_IS_NULL( v ) ||
+ JSVAL_IS_VOID( v ) )
+ return BSONObj();
+
+ uassert( 10215 , "not an object" , JSVAL_IS_OBJECT( v ) );
+ return toObject( JSVAL_TO_OBJECT( v ) );
+ }
+
+ string getFunctionCode( JSFunction * func ) {
+ return toString( JS_DecompileFunction( _context , func , 0 ) );
+ }
+
+ string getFunctionCode( jsval v ) {
+ uassert( 10216 , "not a function" , JS_TypeOfValue( _context , v ) == JSTYPE_FUNCTION );
+ return getFunctionCode( JS_ValueToFunction( _context , v ) );
+ }
+
+ void appendRegex( BSONObjBuilder& b , const string& name , string s ) {
+ assert( s[0] == '/' );
+ s = s.substr(1);
+ string::size_type end = s.rfind( '/' );
+ b.appendRegex( name , s.substr( 0 , end ) , s.substr( end + 1 ) );
+ }
+
+ void append( BSONObjBuilder& b , string name , jsval val , BSONType oldType = EOO , const TraverseStack& stack=TraverseStack() ) {
+ //cout << "name: " << name << "\t" << typeString( val ) << " oldType: " << oldType << endl;
+ switch ( JS_TypeOfValue( _context , val ) ) {
+
+ case JSTYPE_VOID: b.appendUndefined( name ); break;
+ case JSTYPE_NULL: b.appendNull( name ); break;
+
+ case JSTYPE_NUMBER: {
+ double d = toNumber( val );
+ if ( oldType == NumberInt && ((int)d) == d )
+ b.append( name , (int)d );
+ else
+ b.append( name , d );
+ break;
+ }
+ case JSTYPE_STRING: b.append( name , toString( val ) ); break;
+ case JSTYPE_BOOLEAN: b.appendBool( name , toBoolean( val ) ); break;
+
+ case JSTYPE_OBJECT: {
+ JSObject * o = JSVAL_TO_OBJECT( val );
+ if ( ! o || o == JSVAL_NULL ) {
+ b.appendNull( name );
+ }
+ else if ( ! appendSpecialDBObject( this , b , name , val , o ) ) {
+ BSONObj sub = toObject( o , stack );
+ if ( JS_IsArrayObject( _context , o ) ) {
+ b.appendArray( name , sub );
+ }
+ else {
+ b.append( name , sub );
+ }
+ }
+ break;
+ }
+
+ case JSTYPE_FUNCTION: {
+ string s = toString(val);
+ if ( s[0] == '/' ) {
+ appendRegex( b , name , s );
+ }
+ else {
+ b.appendCode( name , getFunctionCode( val ) );
+ }
+ break;
+ }
+
+ default: uassert( 10217 , (string)"can't append field. name:" + name + " type: " + typeString( val ) , 0 );
+ }
+ }
+
+ // ---------- to spider monkey ---------
+
+ bool hasFunctionIdentifier( const string& code ) {
+ if ( code.size() < 9 || code.find( "function" ) != 0 )
+ return false;
+
+ return code[8] == ' ' || code[8] == '(';
+ }
+
+ bool isSimpleStatement( const string& code ) {
+ if ( hasJSReturn( code ) )
+ return false;
+
+ if ( code.find( ';' ) != string::npos &&
+ code.find( ';' ) != code.rfind( ';' ) )
+ return false;
+
+ if ( code.find( '\n') != string::npos )
+ return false;
+
+ if ( code.find( "for(" ) != string::npos ||
+ code.find( "for (" ) != string::npos ||
+ code.find( "while (" ) != string::npos ||
+ code.find( "while(" ) != string::npos )
+ return false;
+
+ return true;
+ }
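+
+ // These two predicates drive _compileFunction below: code without a leading
+ // "function" keyword is compiled as a function body, and a single simple
+ // statement gets "return " prepended, so e.g. "x + 1" behaves like
+ // "function(){ return x + 1; }".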
+
+ void addRoot( JSFunction * f , const char * name );
+
+ JSFunction * compileFunction( const char * code, JSObject * assoc = 0 ) {
+ const char * gcName = "unknown";
+ JSFunction * f = _compileFunction( code , assoc , gcName );
+ //addRoot( f , gcName );
+ return f;
+ }
+
+ JSFunction * _compileFunction( const char * raw , JSObject * assoc , const char *& gcName ) {
+ if ( ! assoc )
+ assoc = JS_GetGlobalObject( _context );
+
+ raw = jsSkipWhiteSpace( raw );
+
+ //cout << "RAW\n---\n" << raw << "\n---" << endl;
+
+ static int fnum = 1;
+ stringstream fname;
+ fname << "__cf__" << fnum++ << "__";
+
+ if ( ! hasFunctionIdentifier( raw ) ) {
+ string s = raw;
+ if ( isSimpleStatement( s ) ) {
+ s = "return " + s;
+ }
+ gcName = "cf anon";
+ fname << "anon";
+ return JS_CompileFunction( _context , assoc , fname.str().c_str() , 0 , 0 , s.c_str() , s.size() , "nofile_a" , 0 );
+ }
+
+ string code = raw;
+
+ size_t start = code.find( '(' );
+ assert( start != string::npos );
+
+ string fbase;
+ if ( start > 9 ) {
+ fbase = trim( code.substr( 9 , start - 9 ) );
+ }
+ if ( fbase.length() == 0 ) {
+ fbase = "anonymous_function";
+ }
+ fname << "f__" << fbase;
+
+ code = code.substr( start + 1 );
+ size_t end = code.find( ')' );
+ assert( end != string::npos );
+
+ string paramString = trim( code.substr( 0 , end ) );
+ code = code.substr( end + 1 );
+
+ vector<string> params;
+ while ( paramString.size() ) {
+ size_t c = paramString.find( ',' );
+ if ( c == string::npos ) {
+ params.push_back( paramString );
+ break;
+ }
+ params.push_back( trim( paramString.substr( 0 , c ) ) );
+ paramString = trim( paramString.substr( c + 1 ) );
+ }
+
+ boost::scoped_array<const char *> paramArray (new const char*[params.size()]);
+ for ( size_t i=0; i<params.size(); i++ )
+ paramArray[i] = params[i].c_str();
+
+ // avoid munging previously munged name (kludge; switching to v8 fixes underlying issue)
+ if ( fbase.find("__cf__") != 0 && fbase.find("__f__") == string::npos ) {
+ fbase = fname.str();
+ }
+
+ JSFunction * func = JS_CompileFunction( _context , assoc , fbase.c_str() , params.size() , paramArray.get() , code.c_str() , code.size() , "nofile_b" , 0 );
+
+ if ( ! func ) {
+ log() << "compile failed for: " << raw << endl;
+ return 0;
+ }
+ gcName = "cf normal";
+ return func;
+ }
+
+ jsval toval( double d ) {
+ jsval val;
+ assert( JS_NewNumberValue( _context, d , &val ) );
+ return val;
+ }
+
+ jsval toval( const char * c ) {
+ JSString * s = JS_NewStringCopyZ( _context , c );
+ if ( s )
+ return STRING_TO_JSVAL( s );
+
+ // possibly unicode, try manual
+
+ size_t len = strlen( c );
+ size_t dstlen = len * 4;
+ jschar * dst = (jschar*)malloc( dstlen );
+
+ JSBool res = JS_DecodeBytes( _context , c , len , dst, &dstlen );
+ if ( res ) {
+ s = JS_NewUCStringCopyN( _context , dst , dstlen );
+ }
+
+ free( dst );
+
+ if ( ! res ) {
+ tlog() << "decode failed. probably invalid utf-8 string [" << c << "]" << endl;
+ jsval v;
+ if ( JS_GetPendingException( _context , &v ) )
+ tlog() << "\t why: " << toString( v ) << endl;
+ throw InvalidUTF8Exception();
+ }
+
+ CHECKJSALLOC( s );
+ return STRING_TO_JSVAL( s );
+ }
+
+ JSObject * toJSObject( const BSONObj * obj , bool readOnly=false ) {
+ static string ref = "$ref";
+ if ( ref == obj->firstElementFieldName() ) {
+ JSObject * o = JS_NewObject( _context , &dbref_class , NULL, NULL);
+ CHECKNEWOBJECT(o,_context,"toJSObject1");
+ assert( JS_SetPrivate( _context , o , (void*)(new BSONHolder( obj->getOwned() ) ) ) );
+ return o;
+ }
+ JSObject * o = JS_NewObject( _context , readOnly ? &bson_ro_class : &bson_class , NULL, NULL);
+ CHECKNEWOBJECT(o,_context,"toJSObject2");
+ assert( JS_SetPrivate( _context , o , (void*)(new BSONHolder( obj->getOwned() ) ) ) );
+ return o;
+ }
+
+ jsval toval( const BSONObj* obj , bool readOnly=false ) {
+ JSObject * o = toJSObject( obj , readOnly );
+ return OBJECT_TO_JSVAL( o );
+ }
+
+ void makeLongObj( long long n, JSObject * o ) {
+ boost::uint64_t val = (boost::uint64_t)n;
+ CHECKNEWOBJECT(o,_context,"NumberLong1");
+ double floatApprox = (double)(boost::int64_t)val;
+ setProperty( o , "floatApprox" , toval( floatApprox ) );
+ if ( (boost::int64_t)val != (boost::int64_t)floatApprox ) {
+ // using 2 doubles here instead of a single double because certain double
+ // bit patterns represent undefined values and sm might trash them
+ setProperty( o , "top" , toval( (double)(boost::uint32_t)( val >> 32 ) ) );
+ setProperty( o , "bottom" , toval( (double)(boost::uint32_t)( val & 0x00000000ffffffff ) ) );
+ }
+ }
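+
+ // Worked example of the two-double encoding: for n = 2^53 + 1
+ // (9007199254740993, which is not exactly representable as a double),
+ // floatApprox stores 9007199254740992.0 while top = 0x00200000 and
+ // bottom = 0x00000001; toNumberLongUnsafe() then rebuilds
+ // (top << 32) + bottom == 2^53 + 1 exactly.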
+
+ jsval toval( long long n ) {
+ JSObject * o = JS_NewObject( _context , &numberlong_class , 0 , 0 );
+ makeLongObj( n, o );
+ return OBJECT_TO_JSVAL( o );
+ }
+
+ void makeIntObj( int n, JSObject * o ) {
+ boost::uint32_t val = (boost::uint32_t)n;
+ CHECKNEWOBJECT(o,_context,"NumberInt1");
+ double floatApprox = (double)(boost::int32_t)val;
+ setProperty( o , "floatApprox" , toval( floatApprox ) );
+ }
+
+ jsval toval( int n ) {
+ JSObject * o = JS_NewObject( _context , &numberint_class , 0 , 0 );
+ makeIntObj( n, o );
+ return OBJECT_TO_JSVAL( o );
+ }
+
+ jsval toval( const BSONElement& e ) {
+
+ switch( e.type() ) {
+ case EOO:
+ case jstNULL:
+ case Undefined:
+ return JSVAL_NULL;
+ case NumberDouble:
+ case NumberInt:
+ return toval( e.number() );
+// case NumberInt:
+// return toval( e.numberInt() );
+ case Symbol: // TODO: should we make a special class for this
+ case String:
+ return toval( e.valuestr() );
+ case Bool:
+ return e.boolean() ? JSVAL_TRUE : JSVAL_FALSE;
+ case Object: {
+ BSONObj embed = e.embeddedObject().getOwned();
+ return toval( &embed );
+ }
+ case Array: {
+
+ BSONObj embed = e.embeddedObject().getOwned();
+
+ if ( embed.isEmpty() ) {
+ return OBJECT_TO_JSVAL( JS_NewArrayObject( _context , 0 , 0 ) );
+ }
+
+ JSObject * array = JS_NewArrayObject( _context , 1 , 0 );
+ CHECKJSALLOC( array );
+
+ jsval myarray = OBJECT_TO_JSVAL( array );
+
+ BSONObjIterator i( embed );
+ while ( i.more() ){
+ const BSONElement& e = i.next();
+ jsval v = toval( e );
+ assert( JS_SetElement( _context , array , atoi(e.fieldName()) , &v ) );
+ }
+
+ return myarray;
+ }
+ case jstOID: {
+ OID oid = e.__oid();
+ JSObject * o = JS_NewObject( _context , &object_id_class , 0 , 0 );
+ CHECKNEWOBJECT(o,_context,"jstOID");
+ setProperty( o , "str" , toval( oid.str().c_str() ) );
+ return OBJECT_TO_JSVAL( o );
+ }
+ case RegEx: {
+ const char * flags = e.regexFlags();
+ uintN flagNumber = 0;
+ while ( *flags ) {
+ switch ( *flags ) {
+ case 'g': flagNumber |= JSREG_GLOB; break;
+ case 'i': flagNumber |= JSREG_FOLD; break;
+ case 'm': flagNumber |= JSREG_MULTILINE; break;
+ //case 'y': flagNumber |= JSREG_STICKY; break;
+
+ default:
+ log() << "warning: unknown regex flag:" << *flags << endl;
+ }
+ flags++;
+ }
+
+ JSObject * r = JS_NewRegExpObject( _context , (char*)e.regex() , strlen( e.regex() ) , flagNumber );
+ assert( r );
+ return OBJECT_TO_JSVAL( r );
+ }
+ case Code: {
+ JSFunction * func = compileFunction( e.valuestr() );
+ if ( func )
+ return OBJECT_TO_JSVAL( JS_GetFunctionObject( func ) );
+ return JSVAL_NULL;
+ }
+ case CodeWScope: {
+ JSFunction * func = compileFunction( e.codeWScopeCode() );
+ if ( !func )
+ return JSVAL_NULL;
+
+ BSONObj extraScope = e.codeWScopeObject();
+ if ( ! extraScope.isEmpty() ) {
+ log() << "warning: CodeWScope doesn't transfer to db.eval" << endl;
+ }
+
+ return OBJECT_TO_JSVAL( JS_GetFunctionObject( func ) );
+ }
+ case Date:
+ return OBJECT_TO_JSVAL( js_NewDateObjectMsec( _context , (jsdouble) ((long long)e.date().millis) ) );
+
+ case MinKey:
+ return OBJECT_TO_JSVAL( JS_NewObject( _context , &minkey_class , 0 , 0 ) );
+
+ case MaxKey:
+ return OBJECT_TO_JSVAL( JS_NewObject( _context , &maxkey_class , 0 , 0 ) );
+
+ case Timestamp: {
+ JSObject * o = JS_NewObject( _context , &timestamp_class , 0 , 0 );
+ CHECKNEWOBJECT(o,_context,"Timestamp1");
+ setProperty( o , "t" , toval( (double)(e.timestampTime()) ) );
+ setProperty( o , "i" , toval( (double)(e.timestampInc()) ) );
+ return OBJECT_TO_JSVAL( o );
+ }
+ case NumberLong: {
+ return toval( e.numberLong() );
+ }
+ case DBRef: {
+ JSObject * o = JS_NewObject( _context , &dbpointer_class , 0 , 0 );
+ CHECKNEWOBJECT(o,_context,"DBRef1");
+ setProperty( o , "ns" , toval( e.dbrefNS() ) );
+
+ JSObject * oid = JS_NewObject( _context , &object_id_class , 0 , 0 );
+ CHECKNEWOBJECT(oid,_context,"DBRef2");
+ setProperty( oid , "str" , toval( e.dbrefOID().str().c_str() ) );
+
+ setProperty( o , "id" , OBJECT_TO_JSVAL( oid ) );
+ return OBJECT_TO_JSVAL( o );
+ }
+ case BinData: {
+ JSObject * o = JS_NewObject( _context , &bindata_class , 0 , 0 );
+ CHECKNEWOBJECT(o,_context,"Bindata_BinData1");
+ int len;
+ const char * data = e.binData( len );
+ assert( data );
+ assert( JS_SetPrivate( _context , o , new BinDataHolder( data , len ) ) );
+
+ setProperty( o , "len" , toval( (double)len ) );
+ setProperty( o , "type" , toval( (double)e.binDataType() ) );
+ return OBJECT_TO_JSVAL( o );
+ }
+ }
+
+ log() << "toval: unknown type: " << (int) e.type() << endl;
+ uassert( 10218 , "not done: toval" , 0 );
+ return 0;
+ }
+
+ // ------- object helpers ------
+
+ JSObject * getJSObject( JSObject * o , const char * name ) {
+ jsval v;
+ assert( JS_GetProperty( _context , o , name , &v ) );
+ return JSVAL_TO_OBJECT( v );
+ }
+
+ JSObject * getGlobalObject( const char * name ) {
+ return getJSObject( JS_GetGlobalObject( _context ) , name );
+ }
+
+ JSObject * getGlobalPrototype( const char * name ) {
+ return getJSObject( getGlobalObject( name ) , "prototype" );
+ }
+
+ bool hasProperty( JSObject * o , const char * name ) {
+ JSBool res;
+ assert( JS_HasProperty( _context , o , name , & res ) );
+ return res;
+ }
+
+ jsval getProperty( JSObject * o , const char * field ) {
+ uassert( 10219 , "object passed to getProperty is null" , o );
+ jsval v;
+ assert( JS_GetProperty( _context , o , field , &v ) );
+ return v;
+ }
+
+ void setProperty( JSObject * o , const char * field , jsval v ) {
+ assert( JS_SetProperty( _context , o , field , &v ) );
+ }
+
+ string typeString( jsval v ) {
+ JSType t = JS_TypeOfValue( _context , v );
+ return JS_GetTypeName( _context , t );
+ }
+
+ bool getBoolean( JSObject * o , const char * field ) {
+ return toBoolean( getProperty( o , field ) );
+ }
+
+ double getNumber( JSObject * o , const char * field ) {
+ return toNumber( getProperty( o , field ) );
+ }
+
+ string getString( JSObject * o , const char * field ) {
+ return toString( getProperty( o , field ) );
+ }
+
+ JSClass * getClass( JSObject * o , const char * field ) {
+ jsval v;
+ assert( JS_GetProperty( _context , o , field , &v ) );
+ if ( ! JSVAL_IS_OBJECT( v ) )
+ return 0;
+ return JS_GET_CLASS( _context , JSVAL_TO_OBJECT( v ) );
+ }
+
+ JSContext * _context;
+
+
+ };
+
+
+ void bson_finalize( JSContext * cx , JSObject * obj ) {
+ BSONHolder * o = GETHOLDER( cx , obj );
+ if ( o ) {
+ delete o;
+ assert( JS_SetPrivate( cx , obj , 0 ) );
+ }
+ }
+
+ JSBool bson_enumerate( JSContext *cx, JSObject *obj, JSIterateOp enum_op, jsval *statep, jsid *idp ) {
+
+ BSONHolder * o = GETHOLDER( cx , obj );
+
+ if ( enum_op == JSENUMERATE_INIT ) {
+ if ( o ) {
+ BSONFieldIterator * it = o->it();
+ *statep = PRIVATE_TO_JSVAL( it );
+ }
+ else {
+ *statep = 0;
+ }
+ if ( idp )
+ *idp = JSVAL_ZERO;
+ return JS_TRUE;
+ }
+
+ BSONFieldIterator * it = (BSONFieldIterator*)JSVAL_TO_PRIVATE( *statep );
+ if ( ! it ) {
+ *statep = 0;
+ return JS_TRUE;
+ }
+
+ if ( enum_op == JSENUMERATE_NEXT ) {
+ if ( it->more() ) {
+ string name = it->next();
+ Convertor c(cx);
+ assert( JS_ValueToId( cx , c.toval( name.c_str() ) , idp ) );
+ }
+ else {
+ delete it;
+ *statep = 0;
+ }
+ return JS_TRUE;
+ }
+
+ if ( enum_op == JSENUMERATE_DESTROY ) {
+ if ( it )
+ delete it;
+ return JS_TRUE;
+ }
+
+ uassert( 10220 , "don't know what to do with this op" , 0 );
+ return JS_FALSE;
+ }
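+
+ // bson_enumerate implements SpiderMonkey's three-phase enumeration protocol:
+ // JSENUMERATE_INIT stashes a heap-allocated BSONFieldIterator in *statep,
+ // NEXT yields one field name per call, and DESTROY (or exhausting the
+ // iterator) frees it.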
+
+ JSBool noaccess( JSContext *cx, JSObject *obj, jsval idval, jsval *vp) {
+ BSONHolder * holder = GETHOLDER( cx , obj );
+ if ( ! holder ) {
+ // in init code still
+ return JS_TRUE;
+ }
+ if ( holder->_inResolve )
+ return JS_TRUE;
+ JS_ReportError( cx , "doing write op on read only operation" );
+ return JS_FALSE;
+ }
+
+ JSClass bson_ro_class = {
+ "bson_ro_object" , JSCLASS_HAS_PRIVATE | JSCLASS_NEW_RESOLVE | JSCLASS_NEW_ENUMERATE ,
+ noaccess, noaccess, JS_PropertyStub, noaccess,
+ (JSEnumerateOp)bson_enumerate, (JSResolveOp)(&resolveBSONField) , JS_ConvertStub, bson_finalize ,
+ JSCLASS_NO_OPTIONAL_MEMBERS
+ };
+
+ JSBool bson_cons( JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval ) {
+ cerr << "bson_cons : shouldn't be here!" << endl;
+ JS_ReportError( cx , "can't construct bson object" );
+ return JS_FALSE;
+ }
+
+ JSFunctionSpec bson_functions[] = {
+ { 0 }
+ };
+
+ JSBool bson_add_prop( JSContext *cx, JSObject *obj, jsval idval, jsval *vp) {
+ BSONHolder * holder = GETHOLDER( cx , obj );
+ if ( ! holder ) {
+ // static init
+ return JS_TRUE;
+ }
+ if ( ! holder->_inResolve ) {
+ Convertor c(cx);
+ string name = c.toString( idval );
+ if ( holder->_obj[name].eoo() ) {
+ holder->_extra.push_back( name );
+ }
+ holder->_modified = true;
+ }
+ return JS_TRUE;
+ }
+
+
+ JSBool mark_modified( JSContext *cx, JSObject *obj, jsval idval, jsval *vp) {
+ Convertor c(cx);
+ BSONHolder * holder = GETHOLDER( cx , obj );
+ if ( !holder ) // needed when we're messing with DBRef.prototype
+ return JS_TRUE;
+ if ( holder->_inResolve )
+ return JS_TRUE;
+ holder->_modified = true;
+ holder->_removed.erase( c.toString( idval ) );
+ return JS_TRUE;
+ }
+
+ JSBool mark_modified_remove( JSContext *cx, JSObject *obj, jsval idval, jsval *vp) {
+ Convertor c(cx);
+ BSONHolder * holder = GETHOLDER( cx , obj );
+ if ( holder->_inResolve )
+ return JS_TRUE;
+ holder->_modified = true;
+ holder->_removed.insert( c.toString( idval ) );
+ return JS_TRUE;
+ }
+
+ JSClass bson_class = {
+ "bson_object" , JSCLASS_HAS_PRIVATE | JSCLASS_NEW_RESOLVE | JSCLASS_NEW_ENUMERATE ,
+ bson_add_prop, mark_modified_remove, JS_PropertyStub, mark_modified,
+ (JSEnumerateOp)bson_enumerate, (JSResolveOp)(&resolveBSONField) , JS_ConvertStub, bson_finalize ,
+ JSCLASS_NO_OPTIONAL_MEMBERS
+ };
+
+ static JSClass global_class = {
+ "global", JSCLASS_GLOBAL_FLAGS,
+ JS_PropertyStub, JS_PropertyStub, JS_PropertyStub, JS_PropertyStub,
+ JS_EnumerateStub, JS_ResolveStub, JS_ConvertStub, JS_FinalizeStub,
+ JSCLASS_NO_OPTIONAL_MEMBERS
+ };
+
+ // --- global helpers ---
+
+ JSBool hexToBinData(JSContext * cx, jsval *rval, int subtype, string s) {
+ JSObject * o = JS_NewObject( cx , &bindata_class , 0 , 0 );
+ CHECKNEWOBJECT(o,cx,"hexToBinData");
+ int len = s.size() / 2;
+ char * data = new char[len];
+ char *p = data;
+ const char *src = s.c_str();
+ for( size_t i = 0; i+1 < s.size(); i += 2 ) {
+ *p++ = fromHex(src + i);
+ }
+ assert( JS_SetPrivate( cx , o , new BinDataHolder( data , len ) ) );
+ Convertor c(cx);
+ c.setProperty( o, "len", c.toval((double)len) );
+ c.setProperty( o, "type", c.toval((double)subtype) );
+ *rval = OBJECT_TO_JSVAL( o );
+ delete[] data;
+ return JS_TRUE;
+ }
+
+ JSBool _HexData( JSContext * cx , JSObject * obj , uintN argc, jsval *argv, jsval *rval ) {
+ Convertor c( cx );
+ if ( argc != 2 ) {
+ JS_ReportError( cx , "HexData needs 2 arguments -- HexData(subtype,hexstring)" );
+ return JS_FALSE;
+ }
+ int type = (int)c.toNumber( argv[ 0 ] );
+ if ( type == 2 ) {
+ JS_ReportError( cx , "BinData subtype 2 is deprecated" );
+ return JS_FALSE;
+ }
+ string s = c.toString(argv[1]);
+ return hexToBinData(cx, rval, type, s);
+ }
+
+ JSBool _UUID( JSContext * cx , JSObject * obj , uintN argc, jsval *argv, jsval *rval ) {
+ Convertor c( cx );
+ if ( argc != 1 ) {
+ JS_ReportError( cx , "UUID needs argument -- UUID(hexstring)" );
+ return JS_FALSE;
+ }
+ string s = c.toString(argv[0]);
+ if( s.size() != 32 ) {
+ JS_ReportError( cx , "bad UUID hex string len" );
+ return JS_FALSE;
+ }
+ return hexToBinData(cx, rval, 3, s);
+ }
+
+ JSBool _MD5( JSContext * cx , JSObject * obj , uintN argc, jsval *argv, jsval *rval ) {
+ Convertor c( cx );
+ if ( argc != 1 ) {
+ JS_ReportError( cx , "MD5 needs argument -- MD5(hexstring)" );
+ return JS_FALSE;
+ }
+ string s = c.toString(argv[0]);
+ if( s.size() != 32 ) {
+ JS_ReportError( cx , "bad MD5 hex string len" );
+ return JS_FALSE;
+ }
+ return hexToBinData(cx, rval, 5, s);
+ }
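+
+ // Shell-level usage of the three constructors above (sketch):
+ // HexData( 0 , "48656c6c6f" ) // 5 bytes of subtype-0 BinData
+ // UUID( "0123456789abcdef0123456789abcdef" ) // 16 bytes, subtype 3
+ // MD5( "d41d8cd98f00b204e9800998ecf8427e" ) // 16 bytes, subtype 5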
+
+ JSBool native_print( JSContext * cx , JSObject * obj , uintN argc, jsval *argv, jsval *rval ) {
+ stringstream ss;
+ Convertor c( cx );
+ for ( uintN i=0; i<argc; i++ ) {
+ if ( i > 0 )
+ ss << " ";
+ ss << c.toString( argv[i] );
+ }
+ ss << "\n";
+ Logstream::logLockless( ss.str() );
+ return JS_TRUE;
+ }
+
+ JSBool native_helper( JSContext *cx , JSObject *obj , uintN argc, jsval *argv , jsval *rval ) {
+ Convertor c(cx);
+
+ NativeFunction func = (NativeFunction)((long long)c.getNumber( obj , "x" ) );
+ void* data = (void*)((long long)c.getNumber( obj , "y" ) );
+ assert( func );
+
+ BSONObj a;
+ if ( argc > 0 ) {
+ BSONObjBuilder args;
+ for ( uintN i=0; i<argc; i++ ) {
+ c.append( args , args.numStr( i ) , argv[i] );
+ }
+
+ a = args.obj();
+ }
+
+ BSONObj out;
+ try {
+ out = func( a, data );
+ }
+ catch ( std::exception& e ) {
+ JS_ReportError( cx , e.what() );
+ return JS_FALSE;
+ }
+
+ if ( out.isEmpty() ) {
+ *rval = JSVAL_VOID;
+ }
+ else {
+ *rval = c.toval( out.firstElement() );
+ }
+
+ return JS_TRUE;
+ }
+
+ JSBool native_load( JSContext *cx , JSObject *obj , uintN argc, jsval *argv , jsval *rval );
+
+ JSBool native_gc( JSContext *cx , JSObject *obj , uintN argc, jsval *argv , jsval *rval ) {
+ JS_GC( cx );
+ return JS_TRUE;
+ }
+
+ JSFunctionSpec globalHelpers[] = {
+ { "print" , &native_print , 0 , 0 , 0 } ,
+ { "nativeHelper" , &native_helper , 1 , 0 , 0 } ,
+ { "load" , &native_load , 1 , 0 , 0 } ,
+ { "gc" , &native_gc , 1 , 0 , 0 } ,
+ { "UUID", &_UUID, 0, 0, 0 } ,
+ { "MD5", &_MD5, 0, 0, 0 } ,
+ { "HexData", &_HexData, 0, 0, 0 } ,
+ { 0 , 0 , 0 , 0 , 0 }
+ };
+
+ // ----END global helpers ----
+
+ // Object helpers
+
+ JSBool bson_get_size(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) {
+ if ( argc != 1 || !JSVAL_IS_OBJECT( argv[ 0 ] ) ) {
+ JS_ReportError( cx , "bsonsize requires one valid object" );
+ return JS_FALSE;
+ }
+
+ Convertor c(cx);
+
+ if ( argv[0] == JSVAL_VOID || argv[0] == JSVAL_NULL ) {
+ *rval = c.toval( 0.0 );
+ return JS_TRUE;
+ }
+
+ JSObject * o = JSVAL_TO_OBJECT( argv[0] );
+
+ double size = 0;
+
+ if ( JS_InstanceOf( cx , o , &bson_ro_class , 0 ) ||
+ JS_InstanceOf( cx , o , &bson_class , 0 ) ) {
+ BSONHolder * h = GETHOLDER( cx , o );
+ if ( h ) {
+ size = h->_obj.objsize();
+ }
+ }
+ else {
+ BSONObj temp = c.toObject( o );
+ size = temp.objsize();
+ }
+
+ *rval = c.toval( size );
+ return JS_TRUE;
+ }
+
+ JSFunctionSpec objectHelpers[] = {
+ { "bsonsize" , &bson_get_size , 1 , 0 , 0 } ,
+ { 0 , 0 , 0 , 0 , 0 }
+ };
+
+ // end Object helpers
+
+ JSBool resolveBSONField( JSContext *cx, JSObject *obj, jsval id, uintN flags, JSObject **objp ) {
+ assert( JS_EnterLocalRootScope( cx ) );
+ Convertor c( cx );
+
+ BSONHolder * holder = GETHOLDER( cx , obj );
+ if ( ! holder ) {
+ // static init
+ *objp = 0;
+ JS_LeaveLocalRootScope( cx );
+ return JS_TRUE;
+ }
+ holder->check();
+
+ string s = c.toString( id );
+
+ BSONElement e = holder->_obj[ s.c_str() ];
+
+ if ( e.type() == EOO || holder->_removed.count( s ) ) {
+ *objp = 0;
+ JS_LeaveLocalRootScope( cx );
+ return JS_TRUE;
+ }
+
+ jsval val;
+ try {
+ val = c.toval( e );
+ }
+ catch ( InvalidUTF8Exception& ) {
+ JS_LeaveLocalRootScope( cx );
+ JS_ReportError( cx , "invalid utf8" );
+ return JS_FALSE;
+ }
+
+ assert( ! holder->_inResolve );
+ holder->_inResolve = true;
+ assert( JS_SetProperty( cx , obj , s.c_str() , &val ) );
+ holder->_inResolve = false;
+
+ if ( val != JSVAL_NULL && val != JSVAL_VOID && JSVAL_IS_OBJECT( val ) ) {
+ // TODO: this is a hack to get around sub objects being modified
+ // basically right now whenever a sub object is read we mark whole obj as possibly modified
+ JSObject * oo = JSVAL_TO_OBJECT( val );
+ if ( JS_InstanceOf( cx , oo , &bson_class , 0 ) ||
+ JS_IsArrayObject( cx , oo ) ) {
+ holder->_modified = true;
+ }
+ }
+
+ *objp = obj;
+ JS_LeaveLocalRootScope( cx );
+ return JS_TRUE;
+ }
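+
+ // resolveBSONField is the lazy half of the BSON/JS bridge: a property lookup
+ // on a bson_(ro_)class object converts just that one BSONElement to a jsval
+ // and caches it via JS_SetProperty, with _inResolve set so the property hooks
+ // above don't mistake the cache fill for a user mutation.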
+
+
+ class SMScope;
+
+ class SMEngine : public ScriptEngine {
+ public:
+
+ SMEngine() {
+#ifdef SM18
+ JS_SetCStringsAreUTF8();
+#endif
+
+ _runtime = JS_NewRuntime(64L * 1024L * 1024L);
+ uassert( 10221 , "JS_NewRuntime failed" , _runtime );
+
+ if ( ! utf8Ok() ) {
+ log() << "*** warning: spider monkey build without utf8 support. consider rebuilding with utf8 support" << endl;
+ }
+
+ int x = 0;
+ assert( x = 1 ); // assignment is deliberate: detects assert() being compiled out
+ uassert( 10222 , "assert not being executed" , x == 1 );
+ }
+
+ ~SMEngine() {
+ JS_DestroyRuntime( _runtime );
+ JS_ShutDown();
+ }
+
+ Scope * createScope();
+
+ void runTest();
+
+ virtual bool utf8Ok() const { return JS_CStringsAreUTF8(); }
+
+#ifdef XULRUNNER
+ JSClass * _dateClass;
+ JSClass * _regexClass;
+#endif
+
+
+ private:
+ JSRuntime * _runtime;
+ friend class SMScope;
+ };
+
+ SMEngine * globalSMEngine;
+
+
+ void ScriptEngine::setup() {
+ globalSMEngine = new SMEngine();
+ globalScriptEngine = globalSMEngine;
+ }
+
+
+ // ------ scope ------
+
+
+ JSBool no_gc(JSContext *cx, JSGCStatus status) {
+ return JS_FALSE;
+ }
+
+ JSBool yes_gc(JSContext *cx, JSGCStatus status) {
+ return JS_TRUE;
+ }
+
+ class SMScope : public Scope {
+ public:
+ SMScope() : _this( 0 ) , _externalSetup( false ) , _localConnect( false ) {
+ smlock;
+ _context = JS_NewContext( globalSMEngine->_runtime , 8192 );
+ _convertor = new Convertor( _context );
+ massert( 10431 , "JS_NewContext failed" , _context );
+
+ JS_SetOptions( _context , JSOPTION_VAROBJFIX);
+ //JS_SetVersion( _context , JSVERSION_LATEST); TODO
+ JS_SetErrorReporter( _context , errorReporter );
+
+ _global = JS_NewObject( _context , &global_class, NULL, NULL);
+ massert( 10432 , "JS_NewObject failed for global" , _global );
+ JS_SetGlobalObject( _context , _global );
+ massert( 10433 , "js init failed" , JS_InitStandardClasses( _context , _global ) );
+
+ JS_SetOptions( _context , JS_GetOptions( _context ) | JSOPTION_VAROBJFIX );
+
+ JS_DefineFunctions( _context , _global , globalHelpers );
+
+ JS_DefineFunctions( _context , _convertor->getGlobalObject( "Object" ), objectHelpers );
+
+ //JS_SetGCCallback( _context , no_gc ); // this is useful for seeing if something is a gc problem
+
+ _postCreateHacks();
+ }
+
+ ~SMScope() {
+ smlock;
+ uassert( 10223 , "deleted SMScope twice?" , _convertor );
+
+ for ( list<void*>::iterator i=_roots.begin(); i != _roots.end(); i++ ) {
+ JS_RemoveRoot( _context , *i );
+ }
+ _roots.clear();
+
+ if ( _this ) {
+ JS_RemoveRoot( _context , &_this );
+ _this = 0;
+ }
+
+ if ( _convertor ) {
+ delete _convertor;
+ _convertor = 0;
+ }
+
+ if ( _context ) {
+ // This is expected to reclaim _global as well.
+ JS_DestroyContext( _context );
+ _context = 0;
+ }
+
+ }
+
+ void reset() {
+ smlock;
+ assert( _convertor );
+ return; // early return makes reset a no-op; the code below is currently unreachable
+ if ( _this ) {
+ JS_RemoveRoot( _context , &_this );
+ _this = 0;
+ }
+ currentScope.reset( this );
+ _error = "";
+ }
+
+ void addRoot( void * root , const char * name ) {
+ JS_AddNamedRoot( _context , root , name );
+ _roots.push_back( root );
+ }
+
+ void init( const BSONObj * data ) {
+ smlock;
+ if ( ! data )
+ return;
+
+ BSONObjIterator i( *data );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ _convertor->setProperty( _global , e.fieldName() , _convertor->toval( e ) );
+ _initFieldNames.insert( e.fieldName() );
+ }
+
+ }
+
+ bool hasOutOfMemoryException() {
+ string err = getError();
+ return err.find("out of memory") != string::npos;
+ }
+
+ void externalSetup() {
+ smlock;
+ uassert( 10224 , "already local connected" , ! _localConnect );
+ if ( _externalSetup )
+ return;
+ initMongoJS( this , _context , _global , false );
+ _externalSetup = true;
+ }
+
+ void localConnect( const char * dbName ) {
+ {
+ smlock;
+ uassert( 10225 , "already setup for external db" , ! _externalSetup );
+ if ( _localConnect ) {
+ uassert( 10226 , "connected to different db" , _localDBName == dbName );
+ return;
+ }
+
+ initMongoJS( this , _context , _global , true );
+
+ exec( "_mongo = new Mongo();" );
+ exec( ((string)"db = _mongo.getDB( \"" + dbName + "\" ); ").c_str() );
+
+ _localConnect = true;
+ _localDBName = dbName;
+ }
+ loadStored();
+ }
+
+ // ----- getters ------
+ double getNumber( const char *field ) {
+ smlock;
+ jsval val;
+ assert( JS_GetProperty( _context , _global , field , &val ) );
+ return _convertor->toNumber( val );
+ }
+
+ string getString( const char *field ) {
+ smlock;
+ jsval val;
+ assert( JS_GetProperty( _context , _global , field , &val ) );
+ JSString * s = JS_ValueToString( _context , val );
+ return _convertor->toString( s );
+ }
+
+ bool getBoolean( const char *field ) {
+ smlock;
+ return _convertor->getBoolean( _global , field );
+ }
+
+ BSONObj getObject( const char *field ) {
+ smlock;
+ return _convertor->toObject( _convertor->getProperty( _global , field ) );
+ }
+
+ JSObject * getJSObject( const char * field ) {
+ smlock;
+ return _convertor->getJSObject( _global , field );
+ }
+
+ int type( const char *field ) {
+ smlock;
+ jsval val;
+ assert( JS_GetProperty( _context , _global , field , &val ) );
+
+ switch ( JS_TypeOfValue( _context , val ) ) {
+ case JSTYPE_VOID: return Undefined;
+ case JSTYPE_NULL: return jstNULL;
+ case JSTYPE_OBJECT: {
+ if ( val == JSVAL_NULL )
+ return jstNULL;
+ JSObject * o = JSVAL_TO_OBJECT( val );
+ if ( JS_IsArrayObject( _context , o ) )
+ return Array;
+ if ( isDate( _context , o ) )
+ return Date;
+ return Object;
+ }
+ case JSTYPE_FUNCTION: return Code;
+ case JSTYPE_STRING: return String;
+ case JSTYPE_NUMBER: return NumberDouble;
+ case JSTYPE_BOOLEAN: return Bool;
+ default:
+ uassert( 10227 , "unknown type" , 0 );
+ }
+ return 0;
+ }
+
+ // ----- setters ------
+
+ void setElement( const char *field , const BSONElement& val ) {
+ smlock;
+ jsval v = _convertor->toval( val );
+ assert( JS_SetProperty( _context , _global , field , &v ) );
+ }
+
+ void setNumber( const char *field , double val ) {
+ smlock;
+ jsval v = _convertor->toval( val );
+ assert( JS_SetProperty( _context , _global , field , &v ) );
+ }
+
+ void setString( const char *field , const char * val ) {
+ smlock;
+ jsval v = _convertor->toval( val );
+ assert( JS_SetProperty( _context , _global , field , &v ) );
+ }
+
+ void setObject( const char *field , const BSONObj& obj , bool readOnly ) {
+ smlock;
+ jsval v = _convertor->toval( &obj , readOnly );
+ JS_SetProperty( _context , _global , field , &v );
+ }
+
+ void setBoolean( const char *field , bool val ) {
+ smlock;
+ jsval v = BOOLEAN_TO_JSVAL( val );
+ assert( JS_SetProperty( _context , _global , field , &v ) );
+ }
+
+ void setThis( const BSONObj * obj ) {
+ smlock;
+ if ( _this ) {
+ JS_RemoveRoot( _context , &_this );
+ _this = 0;
+ }
+
+ if ( obj ) {
+ _this = _convertor->toJSObject( obj );
+ JS_AddNamedRoot( _context , &_this , "scope this" );
+ }
+ }
+
+ void setFunction( const char *field , const char * code ) {
+ smlock;
+ jsval v = OBJECT_TO_JSVAL(JS_GetFunctionObject(_convertor->compileFunction(code)));
+ JS_SetProperty( _context , _global , field , &v );
+ }
+
+ void rename( const char * from , const char * to ) {
+ smlock;
+ jsval v;
+ assert( JS_GetProperty( _context , _global , from , &v ) );
+ assert( JS_SetProperty( _context , _global , to , &v ) );
+ v = JSVAL_VOID;
+ assert( JS_SetProperty( _context , _global , from , &v ) );
+ }
+
+ // ---- functions -----
+
+ ScriptingFunction _createFunction( const char * code ) {
+ smlock;
+ precall();
+ return (ScriptingFunction)_convertor->compileFunction( code );
+ }
+
+ struct TimeoutSpec {
+ boost::posix_time::ptime start;
+ boost::posix_time::time_duration timeout;
+ int count;
+ };
+
+ // should not generate exceptions, as those can be caught in
+ // javascript code; returning false without an exception exits
+ // immediately
+ static JSBool _interrupt( JSContext *cx ) {
+ TimeoutSpec &spec = *(TimeoutSpec *)( JS_GetContextPrivate( cx ) );
+ if ( ++spec.count % 1000 != 0 )
+ return JS_TRUE;
+ const char * interrupt = ScriptEngine::checkInterrupt();
+ if ( interrupt && interrupt[ 0 ] ) {
+ return JS_FALSE;
+ }
+ if ( spec.timeout.ticks() == 0 ) {
+ return JS_TRUE;
+ }
+ boost::posix_time::time_duration elapsed = ( boost::posix_time::microsec_clock::local_time() - spec.start );
+ if ( elapsed < spec.timeout ) {
+ return JS_TRUE;
+ }
+ return JS_FALSE;
+
+ }
+
+ static JSBool interrupt( JSContext *cx, JSScript *script ) {
+ return _interrupt( cx );
+ }
+
+ void installInterrupt( int timeoutMs ) {
+ if ( timeoutMs != 0 || ScriptEngine::haveCheckInterruptCallback() ) {
+ TimeoutSpec *spec = new TimeoutSpec;
+ spec->timeout = boost::posix_time::millisec( timeoutMs );
+ spec->start = boost::posix_time::microsec_clock::local_time();
+ spec->count = 0;
+ JS_SetContextPrivate( _context, (void*)spec );
+#if defined(SM181) && !defined(XULRUNNER190)
+ JS_SetOperationCallback( _context, _interrupt );
+#else
+ JS_SetBranchCallback( _context, interrupt );
+#endif
+ }
+ }
+
+ void uninstallInterrupt( int timeoutMs ) {
+ if ( timeoutMs != 0 || ScriptEngine::haveCheckInterruptCallback() ) {
+#if defined(SM181) && !defined(XULRUNNER190)
+ JS_SetOperationCallback( _context , 0 );
+#else
+ JS_SetBranchCallback( _context, 0 );
+#endif
+ delete (TimeoutSpec *)JS_GetContextPrivate( _context );
+ JS_SetContextPrivate( _context, 0 );
+ }
+ }
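+
+ // The interrupt hook throttles itself to every 1000th callback; returning
+ // JS_FALSE without setting a pending exception aborts the running script,
+ // which is how both checkInterrupt() kills and the timeoutMs budget are
+ // enforced on this SpiderMonkey build.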
+
+ void precall() {
+ _error = "";
+ currentScope.reset( this );
+ }
+
+ bool exec( const StringData& code , const string& name = "(anon)" , bool printResult = false , bool reportError = true , bool assertOnError = true, int timeoutMs = 0 ) {
+ smlock;
+ precall();
+
+ jsval ret = JSVAL_VOID;
+
+ installInterrupt( timeoutMs );
+ JSBool worked = JS_EvaluateScript( _context , _global , code.data() , code.size() , name.c_str() , 1 , &ret );
+ uninstallInterrupt( timeoutMs );
+
+ if ( ! worked && _error.size() == 0 ) {
+ jsval v;
+ if ( JS_GetPendingException( _context , &v ) ) {
+ _error = _convertor->toString( v );
+ if ( reportError )
+ cout << _error << endl;
+ }
+ }
+
+ uassert( 10228 , str::stream() << name + " exec failed: " << _error , worked || ! assertOnError );
+
+ if ( reportError && ! _error.empty() ) {
+                // the error was already printed above when reportError is set,
+                // so there is nothing more to do here; TODO: clean this up
+ }
+
+ if ( worked )
+ _convertor->setProperty( _global , "__lastres__" , ret );
+
+ if ( worked && printResult && ! JSVAL_IS_VOID( ret ) )
+ cout << _convertor->toString( ret ) << endl;
+
+ return worked;
+ }
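+        // Usage sketch (hypothetical values): evaluate a snippet with a timeout and
+        // pick the cached result back up from the global __lastres__ property:
+        //   SMScope s;
+        //   s.exec( "x = 1 + 1" , "demo" , false , true , true , 500 /*timeoutMs*/ );
+        //   // on success, __lastres__ holds the value of the last expression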
+
+ int invoke( JSFunction * func , const BSONObj* args, const BSONObj* recv, int timeoutMs , bool ignoreReturn, bool readOnlyArgs, bool readOnlyRecv ) {
+ smlock;
+ precall();
+
+ assert( JS_EnterLocalRootScope( _context ) );
+
+ int nargs = args ? args->nFields() : 0;
+ scoped_array<jsval> smargsPtr( new jsval[nargs] );
+ if ( nargs ) {
+ BSONObjIterator it( *args );
+ for ( int i=0; i<nargs; i++ ) {
+ smargsPtr[i] = _convertor->toval( it.next() );
+ }
+ }
+
+ if ( !args ) {
+ _convertor->setProperty( _global , "args" , JSVAL_NULL );
+ }
+ else {
+                setObject( "args" , *args , true ); // this is for backwards compatibility
+ }
+
+ JS_LeaveLocalRootScope( _context );
+
+ installInterrupt( timeoutMs );
+ jsval rval;
+ setThis(recv);
+ JSBool ret = JS_CallFunction( _context , _this ? _this : _global , func , nargs , smargsPtr.get() , &rval );
+ setThis(0);
+ uninstallInterrupt( timeoutMs );
+
+ if ( !ret ) {
+ return -3;
+ }
+
+ if ( ! ignoreReturn ) {
+ assert( JS_SetProperty( _context , _global , "return" , &rval ) );
+ }
+
+ return 0;
+ }
+
+        int invoke( ScriptingFunction funcAddr , const BSONObj* args, const BSONObj* recv, int timeoutMs = 0 , bool ignoreReturn = false, bool readOnlyArgs = false, bool readOnlyRecv = false ) {
+ return invoke( (JSFunction*)funcAddr , args , recv, timeoutMs , ignoreReturn, readOnlyArgs, readOnlyRecv);
+ }
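+        // Invocation sketch (hypothetical values, using the helpers above): compile a
+        // function, call it with BSON-packed positional args, and read the result back
+        // from the global "return" property that invoke() sets:
+        //   ScriptingFunction f = s._createFunction( "function( a ){ return a + 1; }" );
+        //   BSONObj args = BSON( "0" << 41 );
+        //   s.invoke( f , &args , 0 );   // recv == 0, so 'this' is the global object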
+
+ void gotError( string s ) {
+ _error = s;
+ }
+
+ string getError() {
+ return _error;
+ }
+
+ void injectNative( const char *field, NativeFunction func, void* data ) {
+ smlock;
+ string name = field;
+ _convertor->setProperty( _global , (name + "_").c_str() , _convertor->toval( (double)(long long)func ) );
+
+ stringstream code;
+ if (data) {
+ _convertor->setProperty( _global , (name + "_data_").c_str() , _convertor->toval( (double)(long long)data ) );
+ code << field << "_" << " = { x : " << field << "_ , y: " << field << "_data_ }; ";
+ } else {
+ code << field << "_" << " = { x : " << field << "_ }; ";
+ }
+ code << field << " = function(){ return nativeHelper.apply( " << field << "_ , arguments ); }";
+ exec( code.str() );
+ }
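+        // For a hypothetical field "myFunc" with data, the JS generated above is roughly:
+        //   myFunc_ = { x : myFunc_ , y : myFunc_data_ };
+        //   myFunc  = function(){ return nativeHelper.apply( myFunc_ , arguments ); }
+        // i.e. the native pointers are stashed as doubles and dispatched via nativeHelper.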
+
+ virtual void gc() {
+ smlock;
+ JS_GC( _context );
+ }
+
+ JSContext *SavedContext() const { return _context; }
+
+ private:
+
+ void _postCreateHacks() {
+#ifdef XULRUNNER
+ exec( "__x__ = new Date(1);" );
+ globalSMEngine->_dateClass = _convertor->getClass( _global , "__x__" );
+ exec( "__x__ = /abc/i" );
+ globalSMEngine->_regexClass = _convertor->getClass( _global , "__x__" );
+#endif
+ }
+
+ JSContext * _context;
+ Convertor * _convertor;
+
+ JSObject * _global;
+ JSObject * _this;
+
+ string _error;
+ list<void*> _roots;
+
+ bool _externalSetup;
+ bool _localConnect;
+
+ set<string> _initFieldNames;
+
+ };
+
+ /* used to make the logging not overly chatty in the mongo shell. */
+ extern bool isShell;
+
+ void errorReporter( JSContext *cx, const char *message, JSErrorReport *report ) {
+ stringstream ss;
+ if( !isShell )
+ ss << "JS Error: ";
+ ss << message;
+
+ if ( report && report->filename ) {
+ ss << " " << report->filename << ":" << report->lineno;
+ }
+
+ tlog() << ss.str() << endl;
+
+ if ( currentScope.get() ) {
+ currentScope->gotError( ss.str() );
+ }
+ }
+
+ JSBool native_load( JSContext *cx , JSObject *obj , uintN argc, jsval *argv , jsval *rval ) {
+ Convertor c(cx);
+
+ Scope * s = currentScope.get();
+
+ for ( uintN i=0; i<argc; i++ ) {
+ string filename = c.toString( argv[i] );
+ //cout << "load [" << filename << "]" << endl;
+
+ if ( ! s->execFile( filename , false , true , false ) ) {
+ JS_ReportError( cx , ((string)"error loading js file: " + filename ).c_str() );
+ return JS_FALSE;
+ }
+ }
+
+ return JS_TRUE;
+ }
+
+
+
+ void SMEngine::runTest() {
+ SMScope s;
+
+ s.localConnect( "foo" );
+
+ s.exec( "assert( db.getMongo() )" );
+ s.exec( "assert( db.bar , 'collection getting does not work' ); " );
+ s.exec( "assert.eq( db._name , 'foo' );" );
+ s.exec( "assert( _mongo == db.getMongo() ); " );
+ s.exec( "assert( _mongo == db._mongo ); " );
+ s.exec( "assert( typeof DB.bar == 'undefined' ); " );
+ s.exec( "assert( typeof DB.prototype.bar == 'undefined' , 'resolution is happening on prototype, not object' ); " );
+
+ s.exec( "assert( db.bar ); " );
+ s.exec( "assert( typeof db.addUser == 'function' )" );
+ s.exec( "assert( db.addUser == DB.prototype.addUser )" );
+ s.exec( "assert.eq( 'foo.bar' , db.bar._fullName ); " );
+ s.exec( "db.bar.verify();" );
+
+ s.exec( "db.bar.silly.verify();" );
+ s.exec( "assert.eq( 'foo.bar.silly' , db.bar.silly._fullName )" );
+ s.exec( "assert.eq( 'function' , typeof _mongo.find , 'mongo.find is not a function' )" );
+
+ assert( (string)"abc" == trim( "abc" ) );
+ assert( (string)"abc" == trim( " abc" ) );
+ assert( (string)"abc" == trim( "abc " ) );
+ assert( (string)"abc" == trim( " abc " ) );
+
+ }
+
+ Scope * SMEngine::createScope() {
+ return new SMScope();
+ }
+
+ void Convertor::addRoot( JSFunction * f , const char * name ) {
+ if ( ! f )
+ return;
+
+ SMScope * scope = currentScope.get();
+ uassert( 10229 , "need a scope" , scope );
+
+ JSObject * o = JS_GetFunctionObject( f );
+ assert( o );
+ scope->addRoot( &o , name );
+ }
+
+}
+
+#include "sm_db.cpp"
diff --git a/src/mongo/scripting/engine_spidermonkey.h b/src/mongo/scripting/engine_spidermonkey.h
new file mode 100644
index 00000000000..9fd430d853d
--- /dev/null
+++ b/src/mongo/scripting/engine_spidermonkey.h
@@ -0,0 +1,105 @@
+// engine_spidermonkey.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "engine.h"
+
+// START inc hacking
+
+#ifdef WIN32
+#include "jstypes.h"
+#undef JS_PUBLIC_API
+#undef JS_PUBLIC_DATA
+#define JS_PUBLIC_API(t) t __cdecl
+#define JS_PUBLIC_DATA(t) t
+#endif
+
+#include "jsapi.h"
+#include "jsobj.h"
+#include "jsdate.h"
+#include "jsregexp.h"
+
+// END inc hacking
+
+// -- SM 1.6 hacks ---
+#ifndef JSCLASS_GLOBAL_FLAGS
+#error Old version of SpiderMonkey (probably 1.6); you should upgrade to at least 1.7
+#endif
+// -- END SM 1.6 hacks ---
+
+#ifdef JSVAL_IS_TRACEABLE
+#define SM18
+#endif
+
+#ifdef XULRUNNER
+#define SM181
+#endif
+
+namespace mongo {
+
+ class SMScope;
+ class Convertor;
+
+ extern JSClass bson_class;
+ extern JSClass bson_ro_class;
+
+ extern JSClass object_id_class;
+ extern JSClass dbpointer_class;
+ extern JSClass dbref_class;
+ extern JSClass bindata_class;
+ extern JSClass timestamp_class;
+ extern JSClass numberlong_class;
+ extern JSClass numberint_class;
+ extern JSClass minkey_class;
+ extern JSClass maxkey_class;
+
+ // internal things
+    inline void dontDeleteScope( SMScope * s ) {}  // inline: defined in a header
+ void errorReporter( JSContext *cx, const char *message, JSErrorReport *report );
+ extern boost::thread_specific_ptr<SMScope> currentScope;
+
+ // bson
+ JSBool resolveBSONField( JSContext *cx, JSObject *obj, jsval id, uintN flags, JSObject **objp );
+
+
+ // mongo
+ void initMongoJS( SMScope * scope , JSContext * cx , JSObject * global , bool local );
+ bool appendSpecialDBObject( Convertor * c , BSONObjBuilder& b , const string& name , jsval val , JSObject * o );
+
+#define JSVAL_IS_OID(v) ( JSVAL_IS_OBJECT( v ) && JS_InstanceOf( cx , JSVAL_TO_OBJECT( v ) , &object_id_class , 0 ) )
+
+ bool isDate( JSContext * cx , JSObject * o );
+
+    // JS private data must be 2-byte aligned, so we use a holder to refer to an unaligned pointer.
+ struct BinDataHolder {
+ BinDataHolder( const char *c, int copyLen = -1 ) :
+ c_( const_cast< char * >( c ) ),
+ iFree_( copyLen != -1 ) {
+ if ( copyLen != -1 ) {
+ c_ = (char*)malloc( copyLen );
+ memcpy( c_, c, copyLen );
+ }
+ }
+ ~BinDataHolder() {
+ if ( iFree_ )
+ free( c_ );
+ }
+ char *c_;
+ bool iFree_;
+ };
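+    // Usage sketch (hypothetical call site, assuming a class with JSCLASS_HAS_PRIVATE):
+    // the holder guarantees the alignment JS private data needs and can own a copy:
+    //   JS_SetPrivate( cx , obj , (void*)new BinDataHolder( data , len ) ); // copies len bytes
+    //   const char * raw = ((BinDataHolder*)JS_GetPrivate( cx , obj ))->c_;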
+}
diff --git a/src/mongo/scripting/engine_v8.cpp b/src/mongo/scripting/engine_v8.cpp
new file mode 100644
index 00000000000..53539c2f75c
--- /dev/null
+++ b/src/mongo/scripting/engine_v8.cpp
@@ -0,0 +1,1634 @@
+//engine_v8.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined(_WIN32)
+/** this is a hack - v8stdint.h defines uint16_t etc. on _WIN32 only, and that collides with
+    our usage of boost */
+#include "boost/cstdint.hpp"
+using namespace boost;
+#define V8STDINT_H_
+#endif
+
+#include "engine_v8.h"
+
+#include "v8_wrapper.h"
+#include "v8_utils.h"
+#include "v8_db.h"
+
+#define V8_SIMPLE_HEADER v8::Isolate::Scope iscope(_isolate); v8::Locker l(_isolate); HandleScope handle_scope; Context::Scope context_scope( _context );
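+// The macro above pins the scope's isolate, takes the v8 lock, and opens handle and
+// context scopes in the order v8 requires; every public V8Scope method that touches
+// v8 state can then simply start with V8_SIMPLE_HEADER.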
+
+namespace mongo {
+
+ // guarded by v8 mutex
+ map< unsigned, int > __interruptSpecToThreadId;
+ map< unsigned, v8::Isolate* > __interruptSpecToIsolate;
+
+ /**
+ * Unwraps a BSONObj from the JS wrapper
+ */
+ static BSONObj* unwrapBSONObj(const Handle<v8::Object>& obj) {
+ Handle<External> field = Handle<External>::Cast(obj->GetInternalField(0));
+ if (field.IsEmpty() || !field->IsExternal())
+ return 0;
+ void* ptr = field->Value();
+ return (BSONObj*)ptr;
+ }
+
+ static void weakRefBSONCallback(v8::Persistent<v8::Value> p, void* scope) {
+        // unclear whether we need to lock here; v8 provides no documentation on this
+ HandleScope handle_scope;
+ if (!p.IsNearDeath())
+ return;
+ Handle<External> field = Handle<External>::Cast(p->ToObject()->GetInternalField(0));
+ BSONObj* data = (BSONObj*) field->Value();
+ delete data;
+ p.Dispose();
+ }
+
+ Persistent<v8::Object> V8Scope::wrapBSONObject(Local<v8::Object> obj, BSONObj* data) {
+ obj->SetInternalField(0, v8::External::New(data));
+ Persistent<v8::Object> p = Persistent<v8::Object>::New(obj);
+ p.MakeWeak(this, weakRefBSONCallback);
+ return p;
+ }
+
+ static void weakRefArrayCallback(v8::Persistent<v8::Value> p, void* scope) {
+        // unclear whether we need to lock here; v8 provides no documentation on this
+ HandleScope handle_scope;
+ if (!p.IsNearDeath())
+ return;
+ Handle<External> field = Handle<External>::Cast(p->ToObject()->GetInternalField(0));
+ char* data = (char*) field->Value();
+ delete [] data;
+ p.Dispose();
+ }
+
+ Persistent<v8::Object> V8Scope::wrapArrayObject(Local<v8::Object> obj, char* data) {
+ obj->SetInternalField(0, v8::External::New(data));
+ Persistent<v8::Object> p = Persistent<v8::Object>::New(obj);
+ p.MakeWeak(this, weakRefArrayCallback);
+ return p;
+ }
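+    // Ownership sketch for the two wrappers above (hypothetical caller): the Persistent
+    // handle is made weak, so the matching weakRef*Callback frees the C++ side once v8
+    // collects the wrapper:
+    //   char* buf = new char[n];                // ownership passes to v8
+    //   scope->wrapArrayObject( obj , buf );    // delete[]'d in weakRefArrayCallback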
+
+ static Handle<v8::Value> namedGet(Local<v8::String> name, const v8::AccessorInfo &info) {
+        // all properties should be set; a missing one means the property is builtin or was deleted
+ if (!(info.This()->HasRealNamedProperty(name)))
+ return v8::Handle<v8::Value>();
+
+ Handle<v8::Value> val = info.This()->GetRealNamedProperty(name);
+ if (!val->IsUndefined()) {
+ // value already cached
+ return val;
+ }
+
+ string key = toSTLString(name);
+ BSONObj *obj = unwrapBSONObj(info.Holder());
+ BSONElement elmt = obj->getField(key.c_str());
+ if (elmt.eoo())
+ return Handle<Value>();
+ Local< External > scp = External::Cast( *info.Data() );
+ V8Scope* scope = (V8Scope*)(scp->Value());
+ val = scope->mongoToV8Element(elmt, false);
+ info.This()->ForceSet(name, val);
+
+ if (elmt.type() == mongo::Object || elmt.type() == mongo::Array) {
+            // if a subobject is accessed it may be modified without the base object knowing,
+            // so mark the base as modified, which loses some optimization
+ info.This()->SetHiddenValue(scope->V8STR_MODIFIED, v8::Boolean::New(true));
+ }
+ return val;
+ }
+
+ static Handle<v8::Value> namedGetRO(Local<v8::String> name, const v8::AccessorInfo &info) {
+ string key = toSTLString(name);
+ BSONObj *obj = unwrapBSONObj(info.Holder());
+ BSONElement elmt = obj->getField(key.c_str());
+ if (elmt.eoo())
+ return Handle<Value>();
+ Local< External > scp = External::Cast( *info.Data() );
+ V8Scope* scope = (V8Scope*)(scp->Value());
+ Handle<v8::Value> val = scope->mongoToV8Element(elmt, true);
+ return val;
+ }
+
+ static Handle<v8::Value> namedSet(Local<v8::String> name, Local<v8::Value> value_obj, const v8::AccessorInfo& info) {
+ Local< External > scp = External::Cast( *info.Data() );
+ V8Scope* scope = (V8Scope*)(scp->Value());
+ info.This()->SetHiddenValue(scope->V8STR_MODIFIED, v8::Boolean::New(true));
+ return Handle<Value>();
+ }
+
+ static Handle<v8::Array> namedEnumerator(const AccessorInfo &info) {
+ BSONObj *obj = unwrapBSONObj(info.Holder());
+ Handle<v8::Array> arr = Handle<v8::Array>(v8::Array::New(obj->nFields()));
+ int i = 0;
+ Local< External > scp = External::Cast( *info.Data() );
+ V8Scope* scope = (V8Scope*)(scp->Value());
+        // note that if a key parses as a number, v8 will access it through the indexed handler
+ for ( BSONObjIterator it(*obj); it.more(); ++i) {
+ const BSONElement& f = it.next();
+// arr->Set(i, v8::String::NewExternal(new ExternalString(f.fieldName())));
+ Handle<v8::String> name = scope->getV8Str(f.fieldName());
+ arr->Set(i, name);
+ }
+ return arr;
+ }
+
+ Handle<Boolean> namedDelete( Local<v8::String> property, const AccessorInfo& info ) {
+ Local< External > scp = External::Cast( *info.Data() );
+ V8Scope* scope = (V8Scope*)(scp->Value());
+ info.This()->SetHiddenValue(scope->V8STR_MODIFIED, v8::Boolean::New(true));
+ return Handle<Boolean>();
+ }
+
+// v8::Handle<v8::Integer> namedQuery(Local<v8::String> property, const AccessorInfo& info) {
+// string key = ToString(property);
+// return v8::Integer::New(None);
+// }
+
+ static Handle<v8::Value> indexedGet(uint32_t index, const v8::AccessorInfo &info) {
+        // all properties should be set; a missing one means the property is builtin or was deleted
+ if (!(info.This()->HasRealIndexedProperty(index)))
+ return v8::Handle<v8::Value>();
+
+ StringBuilder ss;
+ ss << index;
+ string key = ss.str();
+ Local< External > scp = External::Cast( *info.Data() );
+ V8Scope* scope = (V8Scope*)(scp->Value());
+ // cannot get v8 to properly cache the indexed val in the js object
+// Handle<v8::String> name = scope->getV8Str(key);
+// // v8 API really confusing here, must check existence on index, but then fetch with name
+// if (info.This()->HasRealIndexedProperty(index)) {
+// Handle<v8::Value> val = info.This()->GetRealNamedProperty(name);
+// if (!val.IsEmpty() && !val->IsNull())
+// return val;
+// }
+ BSONObj *obj = unwrapBSONObj(info.Holder());
+ BSONElement elmt = obj->getField(key);
+ if (elmt.eoo())
+ return Handle<Value>();
+ Handle<Value> val = scope->mongoToV8Element(elmt, false);
+// info.This()->ForceSet(name, val);
+
+ if (elmt.type() == mongo::Object || elmt.type() == mongo::Array) {
+            // if a subobject is accessed it may be modified without the base object knowing,
+            // so mark the base as modified, which loses some optimization
+ info.This()->SetHiddenValue(scope->V8STR_MODIFIED, v8::Boolean::New(true));
+ }
+ return val;
+ }
+
+ Handle<Boolean> indexedDelete( uint32_t index, const AccessorInfo& info ) {
+ Local< External > scp = External::Cast( *info.Data() );
+ V8Scope* scope = (V8Scope*)(scp->Value());
+ info.This()->SetHiddenValue(scope->V8STR_MODIFIED, v8::Boolean::New(true));
+ return Handle<Boolean>();
+ }
+
+ static Handle<v8::Value> indexedGetRO(uint32_t index, const v8::AccessorInfo &info) {
+ StringBuilder ss;
+ ss << index;
+ string key = ss.str();
+ Local< External > scp = External::Cast( *info.Data() );
+ V8Scope* scope = (V8Scope*)(scp->Value());
+ // cannot get v8 to properly cache the indexed val in the js object
+// Handle<v8::String> name = scope->getV8Str(key);
+// // v8 API really confusing here, must check existence on index, but then fetch with name
+// if (info.This()->HasRealIndexedProperty(index)) {
+// Handle<v8::Value> val = info.This()->GetRealNamedProperty(name);
+// if (!val.IsEmpty() && !val->IsNull())
+// return val;
+// }
+ BSONObj *obj = unwrapBSONObj(info.Holder());
+ BSONElement elmt = obj->getField(key);
+ if (elmt.eoo())
+ return Handle<Value>();
+ Handle<Value> val = scope->mongoToV8Element(elmt, true);
+// info.This()->ForceSet(name, val);
+ return val;
+ }
+
+ static Handle<v8::Value> indexedSet(uint32_t index, Local<v8::Value> value_obj, const v8::AccessorInfo& info) {
+ Local< External > scp = External::Cast( *info.Data() );
+ V8Scope* scope = (V8Scope*)(scp->Value());
+ info.This()->SetHiddenValue(scope->V8STR_MODIFIED, v8::Boolean::New(true));
+ return Handle<Value>();
+ }
+
+// static Handle<v8::Array> indexedEnumerator(const AccessorInfo &info) {
+// BSONObj *obj = unwrapBSONObj(info.Holder());
+// Handle<v8::Array> arr = Handle<v8::Array>(v8::Array::New(obj->nFields()));
+// Local< External > scp = External::Cast( *info.Data() );
+// V8Scope* scope = (V8Scope*)(scp->Value());
+// int i = 0;
+// for ( BSONObjIterator it(*obj); it.more(); ++i) {
+// const BSONElement& f = it.next();
+//// arr->Set(i, v8::String::NewExternal(new ExternalString(f.fieldName())));
+// arr->Set(i, scope->getV8Str(f.fieldName()));
+// }
+// return arr;
+// }
+
+ Handle<Value> NamedReadOnlySet( Local<v8::String> property, Local<Value> value, const AccessorInfo& info ) {
+ string key = toSTLString(property);
+ cout << "cannot write property " << key << " to read-only object" << endl;
+ return value;
+ }
+
+ Handle<Boolean> NamedReadOnlyDelete( Local<v8::String> property, const AccessorInfo& info ) {
+ string key = toSTLString(property);
+ cout << "cannot delete property " << key << " from read-only object" << endl;
+ return Boolean::New( false );
+ }
+
+ Handle<Value> IndexedReadOnlySet( uint32_t index, Local<Value> value, const AccessorInfo& info ) {
+ cout << "cannot write property " << index << " to read-only array" << endl;
+ return value;
+ }
+
+ Handle<Boolean> IndexedReadOnlyDelete( uint32_t index, const AccessorInfo& info ) {
+ cout << "cannot delete property " << index << " from read-only array" << endl;
+ return Boolean::New( false );
+ }
+
+ // --- engine ---
+
+// void fatalHandler(const char* s1, const char* s2) {
+// cout << "Fatal handler " << s1 << " " << s2;
+// }
+
+ V8ScriptEngine::V8ScriptEngine() {
+ v8::V8::Initialize();
+ v8::Locker l;
+// v8::Locker::StartPreemption( 10 );
+
+ int K = 1024;
+ v8::ResourceConstraints rc;
+ rc.set_max_young_space_size(4 * K * K);
+ rc.set_max_old_space_size(64 * K * K);
+ v8::SetResourceConstraints(&rc);
+// v8::V8::IgnoreOutOfMemoryException();
+// v8::V8::SetFatalErrorHandler(fatalHandler);
+ }
+
+ V8ScriptEngine::~V8ScriptEngine() {
+ }
+
+ void ScriptEngine::setup() {
+ if ( !globalScriptEngine ) {
+ globalScriptEngine = new V8ScriptEngine();
+ }
+ }
+
+ void V8ScriptEngine::interrupt( unsigned opSpec ) {
+ v8::Locker l;
+ v8Locks::InterruptLock il;
+ if ( __interruptSpecToThreadId.count( opSpec ) ) {
+ int thread = __interruptSpecToThreadId[ opSpec ];
+ if ( thread == -2 || thread == -3) {
+ // just mark as interrupted
+ __interruptSpecToThreadId[ opSpec ] = -3;
+ return;
+ }
+
+ V8::TerminateExecution( __interruptSpecToIsolate[ opSpec ] );
+ }
+ }
+
+ void V8ScriptEngine::interruptAll() {
+ v8::Locker l;
+ v8Locks::InterruptLock il;
+ vector< Isolate* > toKill; // v8 mutex could potentially be yielded during the termination call
+
+ for( map< unsigned, Isolate* >::const_iterator i = __interruptSpecToIsolate.begin(); i != __interruptSpecToIsolate.end(); ++i ) {
+ toKill.push_back( i->second );
+ }
+ for( vector< Isolate* >::const_iterator i = toKill.begin(); i != toKill.end(); ++i ) {
+ V8::TerminateExecution( *i );
+ }
+ }
+
+ // --- scope ---
+
+ V8Scope::V8Scope( V8ScriptEngine * engine )
+ : _engine( engine ) ,
+ _connectState( NOT ) {
+
+ _isolate = v8::Isolate::New();
+ v8::Isolate::Scope iscope(_isolate);
+ v8::Locker l(_isolate);
+
+ HandleScope handleScope;
+ _context = Context::New();
+ Context::Scope context_scope( _context );
+ _global = Persistent< v8::Object >::New( _context->Global() );
+ _emptyObj = Persistent< v8::Object >::New( v8::Object::New() );
+
+ // initialize lazy object template
+ lzObjectTemplate = Persistent<ObjectTemplate>::New(ObjectTemplate::New());
+ lzObjectTemplate->SetInternalFieldCount( 1 );
+ lzObjectTemplate->SetNamedPropertyHandler(namedGet, namedSet, 0, namedDelete, 0, v8::External::New(this));
+ lzObjectTemplate->SetIndexedPropertyHandler(indexedGet, indexedSet, 0, indexedDelete, 0, v8::External::New(this));
+
+ roObjectTemplate = Persistent<ObjectTemplate>::New(ObjectTemplate::New());
+ roObjectTemplate->SetInternalFieldCount( 1 );
+ roObjectTemplate->SetNamedPropertyHandler(namedGetRO, NamedReadOnlySet, 0, NamedReadOnlyDelete, namedEnumerator, v8::External::New(this));
+ roObjectTemplate->SetIndexedPropertyHandler(indexedGetRO, IndexedReadOnlySet, 0, IndexedReadOnlyDelete, 0, v8::External::New(this));
+
+ // initialize lazy array template
+        // unfortunately it is not possible to create a true v8 array from a template,
+        // so we use an object template and copy the array methods over;
+        // this creates issues when calling certain methods that check the array type
+ lzArrayTemplate = Persistent<ObjectTemplate>::New(ObjectTemplate::New());
+ lzArrayTemplate->SetInternalFieldCount( 1 );
+ lzArrayTemplate->SetIndexedPropertyHandler(indexedGet, 0, 0, 0, 0, v8::External::New(this));
+
+ internalFieldObjects = Persistent<ObjectTemplate>::New(ObjectTemplate::New());
+ internalFieldObjects->SetInternalFieldCount( 1 );
+
+ V8STR_CONN = getV8Str( "_conn" );
+ V8STR_ID = getV8Str( "_id" );
+ V8STR_LENGTH = getV8Str( "length" );
+ V8STR_LEN = getV8Str( "len" );
+ V8STR_TYPE = getV8Str( "type" );
+ V8STR_ISOBJECTID = getV8Str( "isObjectId" );
+ V8STR_RETURN = getV8Str( "return" );
+ V8STR_ARGS = getV8Str( "args" );
+ V8STR_T = getV8Str( "t" );
+ V8STR_I = getV8Str( "i" );
+ V8STR_EMPTY = getV8Str( "" );
+ V8STR_MINKEY = getV8Str( "$MinKey" );
+ V8STR_MAXKEY = getV8Str( "$MaxKey" );
+ V8STR_NUMBERLONG = getV8Str( "__NumberLong" );
+ V8STR_NUMBERINT = getV8Str( "__NumberInt" );
+ V8STR_DBPTR = getV8Str( "__DBPointer" );
+ V8STR_BINDATA = getV8Str( "__BinData" );
+ V8STR_NATIVE_FUNC = getV8Str( "_native_function" );
+ V8STR_NATIVE_DATA = getV8Str( "_native_data" );
+ V8STR_V8_FUNC = getV8Str( "_v8_function" );
+ V8STR_RO = getV8Str( "_ro" );
+ V8STR_MODIFIED = getV8Str( "_mod" );
+ V8STR_FULLNAME = getV8Str( "_fullName" );
+
+ injectV8Function("print", Print);
+ injectV8Function("version", Version);
+ injectV8Function("load", load);
+
+ _wrapper = Persistent< v8::Function >::New( getObjectWrapperTemplate(this)->GetFunction() );
+
+ injectV8Function("gc", GCV8);
+
+ installDBTypes( this, _global );
+ }
+
+ V8Scope::~V8Scope() {
+        // make sure to disable the interrupt, otherwise a race condition can cause a segfault
+ disableV8Interrupt();
+
+ {
+ V8_SIMPLE_HEADER
+ _wrapper.Dispose();
+ _emptyObj.Dispose();
+ for( unsigned i = 0; i < _funcs.size(); ++i )
+ _funcs[ i ].Dispose();
+ _funcs.clear();
+ _global.Dispose();
+ std::map <string, v8::Persistent <v8::String> >::iterator it = _strCache.begin();
+ std::map <string, v8::Persistent <v8::String> >::iterator end = _strCache.end();
+ while (it != end) {
+ it->second.Dispose();
+ ++it;
+ }
+ lzObjectTemplate.Dispose();
+ lzArrayTemplate.Dispose();
+ roObjectTemplate.Dispose();
+ internalFieldObjects.Dispose();
+ _context.Dispose();
+ }
+
+ _isolate->Dispose();
+ }
+
+ bool V8Scope::hasOutOfMemoryException() {
+ if (!_context.IsEmpty())
+ return _context->HasOutOfMemoryException();
+ return false;
+ }
+
+ /**
+ * JS Callback that will call a c++ function with BSON arguments.
+ */
+ Handle< Value > V8Scope::nativeCallback( V8Scope* scope, const Arguments &args ) {
+ V8Lock l;
+ HandleScope handle_scope;
+ Local< External > f = External::Cast( *args.Callee()->Get( scope->V8STR_NATIVE_FUNC ) );
+ NativeFunction function = (NativeFunction)(f->Value());
+ Local< External > data = External::Cast( *args.Callee()->Get( scope->V8STR_NATIVE_DATA ) );
+ BSONObjBuilder b;
+ for( int i = 0; i < args.Length(); ++i ) {
+ stringstream ss;
+ ss << i;
+ scope->v8ToMongoElement( b, ss.str(), args[ i ] );
+ }
+ BSONObj nativeArgs = b.obj();
+ BSONObj ret;
+ try {
+ ret = function( nativeArgs, data->Value() );
+ }
+ catch( const std::exception &e ) {
+ return v8::ThrowException(v8::String::New(e.what()));
+ }
+ catch( ... ) {
+ return v8::ThrowException(v8::String::New("unknown exception"));
+ }
+ return handle_scope.Close( scope->mongoToV8Element( ret.firstElement() ) );
+ }
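+    // e.g. a hypothetical JS call myFunc( 1 , "a" ) reaches the native function as
+    // nativeArgs == { "0" : 1 , "1" : "a" }, and only the first element of the BSONObj
+    // it returns is converted back into the JS return value.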
+
+ Handle< Value > V8Scope::load( V8Scope* scope, const Arguments &args ) {
+ Context::Scope context_scope(scope->_context);
+ for (int i = 0; i < args.Length(); ++i) {
+ std::string filename(toSTLString(args[i]));
+ if (!scope->execFile(filename, false , true , false)) {
+ return v8::ThrowException(v8::String::New((std::string("error loading file: ") + filename).c_str()));
+ }
+ }
+ return v8::True();
+ }
+
+ /**
+ * JS Callback that will call a c++ function with the v8 scope and v8 arguments.
+ * Handles interrupts, exception handling, etc
+ *
+ * The implementation below assumes that SERVER-1816 has been fixed - in
+ * particular, interrupted() must return true if an interrupt was ever
+ * sent; currently that is not the case if a new killop overwrites the data
+ * for an old one
+ */
+ v8::Handle< v8::Value > V8Scope::v8Callback( const v8::Arguments &args ) {
+ Local< External > f = External::Cast( *args.Callee()->Get( v8::String::New( "_v8_function" ) ) );
+ v8Function function = (v8Function)(f->Value());
+ Local< External > scp = External::Cast( *args.Data() );
+ V8Scope* scope = (V8Scope*)(scp->Value());
+
+        // originally the v8 interrupt was disabled here, since we didn't want to audit all
+        // v8 calls for termination exceptions; but we do need to keep the interrupt because
+        // much time may be spent here (e.g. in sleep)
+ bool paused = scope->pauseV8Interrupt();
+
+ v8::Handle< v8::Value > ret;
+ string exception;
+ try {
+ ret = function( scope, args );
+ }
+ catch( const std::exception &e ) {
+ exception = e.what();
+ }
+ catch( ... ) {
+ exception = "unknown exception";
+ }
+ if (paused) {
+ bool resume = scope->resumeV8Interrupt();
+ if ( !resume || globalScriptEngine->interrupted() ) {
+ v8::V8::TerminateExecution(scope->_isolate);
+ return v8::ThrowException( v8::String::New( "Interruption in V8 native callback" ) );
+ }
+ }
+ if ( !exception.empty() ) {
+ return v8::ThrowException( v8::String::New( exception.c_str() ) );
+ }
+ return ret;
+ }
+
+ // ---- global stuff ----
+
+ void V8Scope::init( const BSONObj * data ) {
+ V8Lock l;
+ if ( ! data )
+ return;
+
+ BSONObjIterator i( *data );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ setElement( e.fieldName() , e );
+ }
+ }
+
+ void V8Scope::setNumber( const char * field , double val ) {
+ V8_SIMPLE_HEADER
+ _global->Set( getV8Str( field ) , v8::Number::New( val ) );
+ }
+
+ void V8Scope::setString( const char * field , const char * val ) {
+ V8_SIMPLE_HEADER
+ _global->Set( getV8Str( field ) , v8::String::New( val ) );
+ }
+
+ void V8Scope::setBoolean( const char * field , bool val ) {
+ V8_SIMPLE_HEADER
+ _global->Set( getV8Str( field ) , v8::Boolean::New( val ) );
+ }
+
+ void V8Scope::setElement( const char *field , const BSONElement& e ) {
+ V8_SIMPLE_HEADER
+ _global->Set( getV8Str( field ) , mongoToV8Element( e ) );
+ }
+
+ void V8Scope::setObject( const char *field , const BSONObj& obj , bool readOnly) {
+ V8_SIMPLE_HEADER
+ // Set() accepts a ReadOnly parameter, but this just prevents the field itself
+ // from being overwritten and doesn't protect the object stored in 'field'.
+ _global->Set( getV8Str( field ) , mongoToLZV8( obj, false, readOnly) );
+ }
+
+ int V8Scope::type( const char *field ) {
+ V8_SIMPLE_HEADER
+ Handle<Value> v = get( field );
+ if ( v->IsNull() )
+ return jstNULL;
+ if ( v->IsUndefined() )
+ return Undefined;
+ if ( v->IsString() )
+ return String;
+ if ( v->IsFunction() )
+ return Code;
+ if ( v->IsArray() )
+ return Array;
+ if ( v->IsBoolean() )
+ return Bool;
+ // needs to be explicit NumberInt to use integer
+// if ( v->IsInt32() )
+// return NumberInt;
+ if ( v->IsNumber() )
+ return NumberDouble;
+ if ( v->IsExternal() ) {
+ uassert( 10230 , "can't handle external yet" , 0 );
+ return -1;
+ }
+ if ( v->IsDate() )
+ return Date;
+ if ( v->IsObject() )
+ return Object;
+
+ throw UserException( 12509, (string)"don't know what this is: " + field );
+ }
+
+ v8::Handle<v8::Value> V8Scope::get( const char * field ) {
+ return _global->Get( getV8Str( field ) );
+ }
+
+ double V8Scope::getNumber( const char *field ) {
+ V8_SIMPLE_HEADER
+ return get( field )->ToNumber()->Value();
+ }
+
+ int V8Scope::getNumberInt( const char *field ) {
+ V8_SIMPLE_HEADER
+ return get( field )->ToInt32()->Value();
+ }
+
+ long long V8Scope::getNumberLongLong( const char *field ) {
+ V8_SIMPLE_HEADER
+ return get( field )->ToInteger()->Value();
+ }
+
+ string V8Scope::getString( const char *field ) {
+ V8_SIMPLE_HEADER
+ return toSTLString( get( field ) );
+ }
+
+ bool V8Scope::getBoolean( const char *field ) {
+ V8_SIMPLE_HEADER
+ return get( field )->ToBoolean()->Value();
+ }
+
+ BSONObj V8Scope::getObject( const char * field ) {
+ V8_SIMPLE_HEADER
+ Handle<Value> v = get( field );
+ if ( v->IsNull() || v->IsUndefined() )
+ return BSONObj();
+ uassert( 10231 , "not an object" , v->IsObject() );
+ return v8ToMongo( v->ToObject() );
+ }
+
+ // --- functions -----
+
+ bool hasFunctionIdentifier( const string& code ) {
+ if ( code.size() < 9 || code.find( "function" ) != 0 )
+ return false;
+
+ return code[8] == ' ' || code[8] == '(';
+ }
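+    // e.g. hasFunctionIdentifier( "function (x){ return x; }" ) == true
+    //      hasFunctionIdentifier( "functional" )                == false (9th char)
+    //      hasFunctionIdentifier( "x + 1" )                     == false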
+
+ Local< v8::Function > V8Scope::__createFunction( const char * raw ) {
+ raw = jsSkipWhiteSpace( raw );
+ string code = raw;
+ if ( !hasFunctionIdentifier( code ) ) {
+ if ( code.find( "\n" ) == string::npos &&
+ ! hasJSReturn( code ) &&
+ ( code.find( ";" ) == string::npos || code.find( ";" ) == code.size() - 1 ) ) {
+ code = "return " + code;
+ }
+ code = "function(){ " + code + "}";
+ }
+
+ int num = _funcs.size() + 1;
+
+ string fn;
+ {
+ stringstream ss;
+ ss << "_funcs" << num;
+ fn = ss.str();
+ }
+
+ code = fn + " = " + code;
+
+ TryCatch try_catch;
+ // this might be time consuming, consider allowing an interrupt
+ Handle<Script> script = v8::Script::Compile( v8::String::New( code.c_str() ) ,
+ v8::String::New( fn.c_str() ) );
+ if ( script.IsEmpty() ) {
+ _error = (string)"compile error: " + toSTLString( &try_catch );
+ log() << _error << endl;
+ return Local< v8::Function >();
+ }
+
+ Local<Value> result = script->Run();
+ if ( result.IsEmpty() ) {
+ _error = (string)"compile error: " + toSTLString( &try_catch );
+ log() << _error << endl;
+ return Local< v8::Function >();
+ }
+
+ return v8::Function::Cast( *_global->Get( v8::String::New( fn.c_str() ) ) );
+ }
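+    // Rewriting sketch for a hypothetical input (first function compiled in this scope):
+    //   raw:  "x + 1"
+    //   ->    "return x + 1"                           (no newline, no return, no ';')
+    //   ->    "function(){ return x + 1}"
+    //   ->    "_funcs1 = function(){ return x + 1}"    (bound to a numbered global)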
+
+ ScriptingFunction V8Scope::_createFunction( const char * raw ) {
+ V8_SIMPLE_HEADER
+ Local< Value > ret = __createFunction( raw );
+ if ( ret.IsEmpty() )
+ return 0;
+ Persistent<Value> f = Persistent< Value >::New( ret );
+ uassert( 10232, "not a func" , f->IsFunction() );
+ int num = _funcs.size() + 1;
+ _funcs.push_back( f );
+ return num;
+ }
+
+ void V8Scope::setFunction( const char *field , const char * code ) {
+ V8_SIMPLE_HEADER
+ _global->Set( getV8Str( field ) , __createFunction(code) );
+ }
+
+// void V8Scope::setThis( const BSONObj * obj ) {
+// V8_SIMPLE_HEADER
+// if ( ! obj ) {
+// _this = Persistent< v8::Object >::New( v8::Object::New() );
+// return;
+// }
+//
+// //_this = mongoToV8( *obj );
+// v8::Handle<v8::Value> argv[1];
+// argv[0] = v8::External::New( createWrapperHolder( this, obj , true , false ) );
+// _this = Persistent< v8::Object >::New( _wrapper->NewInstance( 1, argv ) );
+// }
+
+ void V8Scope::rename( const char * from , const char * to ) {
+ V8_SIMPLE_HEADER;
+ Handle<v8::String> f = getV8Str( from );
+ Handle<v8::String> t = getV8Str( to );
+ _global->Set( t , _global->Get( f ) );
+ _global->Set( f , v8::Undefined() );
+ }
+
+ int V8Scope::invoke( ScriptingFunction func , const BSONObj* argsObject, const BSONObj* recv, int timeoutMs , bool ignoreReturn, bool readOnlyArgs, bool readOnlyRecv ) {
+ V8_SIMPLE_HEADER
+ Handle<Value> funcValue = _funcs[func-1];
+
+ TryCatch try_catch;
+ int nargs = argsObject ? argsObject->nFields() : 0;
+ scoped_array< Handle<Value> > args;
+ if ( nargs ) {
+ args.reset( new Handle<Value>[nargs] );
+ BSONObjIterator it( *argsObject );
+ for ( int i=0; i<nargs; i++ ) {
+ BSONElement next = it.next();
+ args[i] = mongoToV8Element( next, readOnlyArgs );
+ }
+ setObject( "args", *argsObject, readOnlyArgs); // for backwards compatibility
+ }
+ else {
+ _global->Set( V8STR_ARGS, v8::Undefined() );
+ }
+ if ( globalScriptEngine->interrupted() ) {
+ stringstream ss;
+ ss << "error in invoke: " << globalScriptEngine->checkInterrupt();
+ _error = ss.str();
+ log() << _error << endl;
+ return 1;
+ }
+ Handle<v8::Object> v8recv;
+ if (recv != 0)
+ v8recv = mongoToLZV8(*recv, false, readOnlyRecv);
+ else
+ v8recv = _global;
+
+ enableV8Interrupt(); // because of v8 locker we can check interrupted, then enable
+ Local<Value> result = ((v8::Function*)(*funcValue))->Call( v8recv , nargs , nargs ? args.get() : 0 );
+ disableV8Interrupt();
+
+ if ( result.IsEmpty() ) {
+ stringstream ss;
+ if ( try_catch.HasCaught() && !try_catch.CanContinue() ) {
+ ss << "error in invoke: " << globalScriptEngine->checkInterrupt();
+ }
+ else {
+ ss << "error in invoke: " << toSTLString( &try_catch );
+ }
+ _error = ss.str();
+ log() << _error << endl;
+ return 1;
+ }
+
+ if ( ! ignoreReturn ) {
+ _global->Set( V8STR_RETURN , result );
+ }
+
+ return 0;
+ }
+
+ bool V8Scope::exec( const StringData& code , const string& name , bool printResult , bool reportError , bool assertOnError, int timeoutMs ) {
+ if ( timeoutMs ) {
+            // warn only once per process
+            static bool warnedOnce = false;
+            if ( !warnedOnce ) {
+                log() << "timeoutMs not supported for v8 yet, code: " << code << endl;
+                warnedOnce = true;
+            }
+ }
+
+ V8_SIMPLE_HEADER
+
+ TryCatch try_catch;
+
+ Handle<Script> script = v8::Script::Compile( v8::String::New( code.data() ) ,
+ v8::String::New( name.c_str() ) );
+ if (script.IsEmpty()) {
+ stringstream ss;
+ ss << "compile error: " << toSTLString( &try_catch );
+ _error = ss.str();
+ if (reportError)
+ log() << _error << endl;
+ if ( assertOnError )
+ uassert( 10233 , _error , 0 );
+ return false;
+ }
+
+ if ( globalScriptEngine->interrupted() ) {
+ _error = (string)"exec error: " + globalScriptEngine->checkInterrupt();
+ if ( reportError ) {
+ log() << _error << endl;
+ }
+ if ( assertOnError ) {
+ uassert( 13475 , _error , 0 );
+ }
+ return false;
+ }
+ enableV8Interrupt(); // because of v8 locker we can check interrupted, then enable
+ Handle<v8::Value> result = script->Run();
+ disableV8Interrupt();
+ if ( result.IsEmpty() ) {
+ if ( try_catch.HasCaught() && !try_catch.CanContinue() ) {
+ _error = (string)"exec error: " + globalScriptEngine->checkInterrupt();
+ }
+ else {
+ _error = (string)"exec error: " + toSTLString( &try_catch );
+ }
+ if ( reportError )
+ log() << _error << endl;
+ if ( assertOnError )
+ uassert( 10234 , _error , 0 );
+ return false;
+ }
+
+ _global->Set( getV8Str( "__lastres__" ) , result );
+
+ if ( printResult && ! result->IsUndefined() ) {
+ cout << toSTLString( result ) << endl;
+ }
+
+ return true;
+ }
+
+ void V8Scope::injectNative( const char *field, NativeFunction func, void* data ) {
+ injectNative(field, func, _global, data);
+ }
+
+ void V8Scope::injectNative( const char *field, NativeFunction func, Handle<v8::Object>& obj, void* data ) {
+ V8_SIMPLE_HEADER
+
+ Handle< FunctionTemplate > ft = createV8Function(nativeCallback);
+ ft->Set( this->V8STR_NATIVE_FUNC, External::New( (void*)func ) );
+ ft->Set( this->V8STR_NATIVE_DATA, External::New( data ) );
+ obj->Set( getV8Str( field ), ft->GetFunction() );
+ }
+
+ void V8Scope::injectV8Function( const char *field, v8Function func ) {
+ injectV8Function(field, func, _global);
+ }
+
+ void V8Scope::injectV8Function( const char *field, v8Function func, Handle<v8::Object>& obj ) {
+ V8_SIMPLE_HEADER
+
+ Handle< FunctionTemplate > ft = createV8Function(func);
+ Handle<v8::Function> f = ft->GetFunction();
+ obj->Set( getV8Str( field ), f );
+ }
+
+ void V8Scope::injectV8Function( const char *field, v8Function func, Handle<v8::Template>& t ) {
+ V8_SIMPLE_HEADER
+
+ Handle< FunctionTemplate > ft = createV8Function(func);
+ Handle<v8::Function> f = ft->GetFunction();
+ t->Set( getV8Str( field ), f );
+ }
+
+ Handle<FunctionTemplate> V8Scope::createV8Function( v8Function func ) {
+ Handle< FunctionTemplate > ft = v8::FunctionTemplate::New(v8Callback, External::New( this ));
+ ft->Set( this->V8STR_V8_FUNC, External::New( (void*)func ) );
+ return ft;
+ }
+
+ void V8Scope::gc() {
+ cout << "in gc" << endl;
+ V8Lock l;
+ V8::LowMemoryNotification();
+ }
+
+ // ----- db access -----
+
+ void V8Scope::localConnect( const char * dbName ) {
+ {
+ V8_SIMPLE_HEADER
+
+ if ( _connectState == EXTERNAL )
+            if ( _connectState == EXTERNAL )
+                throw UserException( 12510, "externalSetup already called, can't call localConnect" );
+ if ( _connectState == LOCAL ) {
+ if ( _localDBName == dbName )
+ return;
+ throw UserException( 12511, "localConnect called with a different name previously" );
+ }
+
+ //_global->Set( v8::String::New( "Mongo" ) , _engine->_externalTemplate->GetFunction() );
+ _global->Set( getV8Str( "Mongo" ) , getMongoFunctionTemplate( this, true )->GetFunction() );
+ execCoreFiles();
+ exec( "_mongo = new Mongo();" , "local connect 2" , false , true , true , 0 );
+ exec( (string)"db = _mongo.getDB(\"" + dbName + "\");" , "local connect 3" , false , true , true , 0 );
+ _connectState = LOCAL;
+ _localDBName = dbName;
+ }
+ loadStored();
+ }
+
+ void V8Scope::externalSetup() {
+ V8_SIMPLE_HEADER
+ if ( _connectState == EXTERNAL )
+ return;
+ if ( _connectState == LOCAL )
+ throw UserException( 12512, "localConnect already called, can't call externalSetup" );
+
+ installFork( this, _global, _context );
+ _global->Set( getV8Str( "Mongo" ) , getMongoFunctionTemplate( this, false )->GetFunction() );
+ execCoreFiles();
+ _connectState = EXTERNAL;
+ }
+
+ // ----- internal -----
+
+ void V8Scope::reset() {
+ _startCall();
+ }
+
+ void V8Scope::_startCall() {
+ _error = "";
+ }
+
+ Local< v8::Value > newFunction( const char *code ) {
+ stringstream codeSS;
+ codeSS << "____MontoToV8_newFunction_temp = " << code;
+ string codeStr = codeSS.str();
+ Local< Script > compiled = Script::New( v8::String::New( codeStr.c_str() ) );
+ Local< Value > ret = compiled->Run();
+ return ret;
+ }
+
+ Local< v8::Value > V8Scope::newId( const OID &id ) {
+ v8::Function * idCons = this->getObjectIdCons();
+ v8::Handle<v8::Value> argv[1];
+ argv[0] = v8::String::New( id.str().c_str() );
+ return idCons->NewInstance( 1 , argv );
+ }
+
+ Local<v8::Object> V8Scope::mongoToV8( const BSONObj& m , bool array, bool readOnly ) {
+
+ Local<v8::Object> o;
+
+        // handle DBRef; this check needs to come first, doesn't it? (metagoto)
+ static string ref = "$ref";
+ if ( ref == m.firstElement().fieldName() ) {
+ const BSONElement& id = m["$id"];
+            if (!id.eoo()) { // there's no check on $id existence in the sm implementation; risky?
+ v8::Function* dbRef = getNamedCons( "DBRef" );
+ o = dbRef->NewInstance();
+ }
+ }
+
+ Local< v8::ObjectTemplate > readOnlyObjects;
+
+ if ( !o.IsEmpty() ) {
+ readOnly = false;
+ }
+ else if ( array ) {
+            // NOTE Looks like it's impossible to add interceptors to v8 arrays,
+            // so the array itself will never be read-only, but its values can be
+ o = v8::Array::New();
+ }
+ else if ( !readOnly ) {
+ o = v8::Object::New();
+ }
+ else {
+            // NOTE Our readOnly implementation relies on undocumented ObjectTemplate
+ // functionality that may be fragile, but it still seems like the best option
+ // for now -- fwiw, the v8 docs are pretty sparse. I've determined experimentally
+ // that when property handlers are set for an object template, they will attach
+ // to objects previously created by that template. To get this to work, though,
+ // it is necessary to initialize the template's property handlers before
+ // creating objects from the template (as I have in the following few lines
+ // of code).
+ // NOTE In my first attempt, I configured the permanent property handlers before
+            // constructing the object and replaced the Set() calls below with ForceSet().
+ // However, it turns out that ForceSet() only bypasses handlers for named
+ // properties and not for indexed properties.
+ readOnlyObjects = v8::ObjectTemplate::New();
+ // NOTE This internal field will store type info for special db types. For
+ // regular objects the field is unnecessary - for simplicity I'm creating just
+ // one readOnlyObjects template for objects where the field is & isn't necessary,
+ // assuming that the overhead of an internal field is slight.
+ readOnlyObjects->SetInternalFieldCount( 1 );
+ readOnlyObjects->SetNamedPropertyHandler( 0 );
+ readOnlyObjects->SetIndexedPropertyHandler( 0 );
+ o = readOnlyObjects->NewInstance();
+ }
+
+ mongo::BSONObj sub;
+
+ for ( BSONObjIterator i(m); i.more(); ) {
+ const BSONElement& f = i.next();
+
+ Local<Value> v;
+ Handle<v8::String> name = getV8Str(f.fieldName());
+
+ switch ( f.type() ) {
+
+ case mongo::Code:
+ o->Set( name, newFunction( f.valuestr() ) );
+ break;
+
+ case CodeWScope:
+ if ( !f.codeWScopeObject().isEmpty() )
+ log() << "warning: CodeWScope doesn't transfer to db.eval" << endl;
+ o->Set( name, newFunction( f.codeWScopeCode() ) );
+ break;
+
+ case mongo::String:
+ o->Set( name , v8::String::New( f.valuestr() ) );
+ break;
+
+ case mongo::jstOID: {
+ v8::Function * idCons = getObjectIdCons();
+ v8::Handle<v8::Value> argv[1];
+ argv[0] = v8::String::New( f.__oid().str().c_str() );
+ o->Set( name ,
+ idCons->NewInstance( 1 , argv ) );
+ break;
+ }
+
+ case mongo::NumberDouble:
+ case mongo::NumberInt:
+ o->Set( name , v8::Number::New( f.number() ) );
+ break;
+
+// case mongo::NumberInt: {
+// Local<v8::Object> sub = readOnly ? readOnlyObjects->NewInstance() : internalFieldObjects->NewInstance();
+// int val = f.numberInt();
+// v8::Function* numberInt = getNamedCons( "NumberInt" );
+// v8::Handle<v8::Value> argv[1];
+// argv[0] = v8::Int32::New( val );
+// o->Set( name, numberInt->NewInstance( 1, argv ) );
+// break;
+// }
+
+ case mongo::Array:
+ sub = f.embeddedObject();
+ o->Set( name , mongoToV8( sub , true, readOnly ) );
+ break;
+ case mongo::Object:
+ sub = f.embeddedObject();
+ o->Set( name , mongoToLZV8( sub , false, readOnly ) );
+ break;
+
+ case mongo::Date:
+ o->Set( name , v8::Date::New( (double) ((long long)f.date().millis) ));
+ break;
+
+ case mongo::Bool:
+ o->Set( name , v8::Boolean::New( f.boolean() ) );
+ break;
+
+ case mongo::jstNULL:
+ case mongo::Undefined: // duplicate sm behavior
+ o->Set( name , v8::Null() );
+ break;
+
+ case mongo::RegEx: {
+ v8::Function * regex = getNamedCons( "RegExp" );
+
+ v8::Handle<v8::Value> argv[2];
+ argv[0] = v8::String::New( f.regex() );
+ argv[1] = v8::String::New( f.regexFlags() );
+
+ o->Set( name , regex->NewInstance( 2 , argv ) );
+ break;
+ }
+
+ case mongo::BinData: {
+ int len;
+ const char *data = f.binData( len );
+
+ v8::Function* binData = getNamedCons( "BinData" );
+ v8::Handle<v8::Value> argv[3];
+ argv[0] = v8::Number::New( len );
+ argv[1] = v8::Number::New( f.binDataType() );
+ argv[2] = v8::String::New( data, len );
+ o->Set( name, binData->NewInstance(3, argv) );
+ break;
+ }
+
+ case mongo::Timestamp: {
+ Local<v8::Object> sub = readOnly ? readOnlyObjects->NewInstance() : internalFieldObjects->NewInstance();
+
+ sub->Set( V8STR_T , v8::Number::New( f.timestampTime() ) );
+ sub->Set( V8STR_I , v8::Number::New( f.timestampInc() ) );
+ sub->SetInternalField( 0, v8::Uint32::New( f.type() ) );
+
+ o->Set( name , sub );
+ break;
+ }
+
+ case mongo::NumberLong: {
+ unsigned long long val = f.numberLong();
+ v8::Function* numberLong = getNamedCons( "NumberLong" );
+ double floatApprox = (double)(long long)val;
+ // values above 2^53 are not accurately represented in JS
+ if ( (long long)val == (long long)floatApprox && val < 9007199254740992ULL ) {
+ v8::Handle<v8::Value> argv[1];
+ argv[0] = v8::Number::New( floatApprox );
+ o->Set( name, numberLong->NewInstance( 1, argv ) );
+ }
+ else {
+ v8::Handle<v8::Value> argv[3];
+ argv[0] = v8::Number::New( floatApprox );
+ argv[1] = v8::Integer::New( val >> 32 );
+ argv[2] = v8::Integer::New( (unsigned long)(val & 0x00000000ffffffff) );
+ o->Set( name, numberLong->NewInstance(3, argv) );
+ }
+ break;
+ }
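+            // e.g. for a hypothetical val = 0x0020000000000123ULL (> 2^53, so floatApprox
+            // is lossy):
+            //   top    = val >> 32         = 0x00200000
+            //   bottom = val & 0xffffffff  = 0x00000123
+            // v8ToMongoElement() later reassembles val as (top << 32) + bottom.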
+
+ case mongo::MinKey: {
+ Local<v8::Object> sub = readOnly ? readOnlyObjects->NewInstance() : internalFieldObjects->NewInstance();
+ sub->Set( V8STR_MINKEY, v8::Boolean::New( true ) );
+ sub->SetInternalField( 0, v8::Uint32::New( f.type() ) );
+ o->Set( name , sub );
+ break;
+ }
+
+ case mongo::MaxKey: {
+ Local<v8::Object> sub = readOnly ? readOnlyObjects->NewInstance() : internalFieldObjects->NewInstance();
+ sub->Set( V8STR_MAXKEY, v8::Boolean::New( true ) );
+ sub->SetInternalField( 0, v8::Uint32::New( f.type() ) );
+ o->Set( name , sub );
+ break;
+ }
+
+ case mongo::DBRef: {
+ v8::Function* dbPointer = getNamedCons( "DBPointer" );
+ v8::Handle<v8::Value> argv[2];
+ argv[0] = getV8Str( f.dbrefNS() );
+ argv[1] = newId( f.dbrefOID() );
+ o->Set( name, dbPointer->NewInstance(2, argv) );
+ break;
+ }
+
+ default:
+ cout << "can't handle type: ";
+ cout << f.type() << " ";
+ cout << f.toString();
+ cout << endl;
+ break;
+ }
+
+ }
+
+ if ( !array && readOnly ) {
+ readOnlyObjects->SetNamedPropertyHandler( 0, NamedReadOnlySet, 0, NamedReadOnlyDelete );
+ readOnlyObjects->SetIndexedPropertyHandler( 0, IndexedReadOnlySet, 0, IndexedReadOnlyDelete );
+ }
+
+ return o;
+ }
+
+ /**
+ * converts a BSONObj to a Lazy V8 object
+ */
+ Handle<v8::Object> V8Scope::mongoToLZV8( const BSONObj& m , bool array, bool readOnly ) {
+ Local<v8::Object> o;
+
+ if (readOnly) {
+ o = roObjectTemplate->NewInstance();
+ o->SetHiddenValue(V8STR_RO, v8::Boolean::New(true));
+ } else {
+ if (array) {
+ o = lzArrayTemplate->NewInstance();
+ o->SetPrototype(v8::Array::New(1)->GetPrototype());
+ o->Set(V8STR_LENGTH, v8::Integer::New(m.nFields()), DontEnum);
+ // o->Set(ARRAY_STRING, v8::Boolean::New(true), DontEnum);
+ } else {
+ o = lzObjectTemplate->NewInstance();
+
+ static string ref = "$ref";
+ if ( ref == m.firstElement().fieldName() ) {
+ const BSONElement& id = m["$id"];
+ if (!id.eoo()) {
+ v8::Function* dbRef = getNamedCons( "DBRef" );
+ o->SetPrototype(dbRef->NewInstance()->GetPrototype());
+ }
+ }
+ }
+
+            // need to set all keys to dummy values so that key order is correct during enumeration;
+            // otherwise v8 lists any property newly set in JS before those of the underlying BSON obj
+ for (BSONObjIterator it(m); it.more();) {
+ const BSONElement& f = it.next();
+ o->ForceSet(getV8Str(f.fieldName()), v8::Undefined());
+ }
+ }
+
+ BSONObj* own = new BSONObj(m.getOwned());
+// BSONObj* own = new BSONObj(m);
+ Persistent<v8::Object> p = wrapBSONObject(o, own);
+ return p;
+ }
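+    // Behavior sketch (hypothetical object): for m = { a : 1 , b : 2 } the returned
+    // wrapper initially holds undefined for "a" and "b"; the first JS read of o.a goes
+    // through namedGet(), which unwraps the owned BSONObj copy and converts just that
+    // one element.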
+
+ Handle<v8::Value> V8Scope::mongoToV8Element( const BSONElement &f, bool readOnly ) {
+// Local< v8::ObjectTemplate > internalFieldObjects = v8::ObjectTemplate::New();
+// internalFieldObjects->SetInternalFieldCount( 1 );
+
+ switch ( f.type() ) {
+
+ case mongo::Code:
+ return newFunction( f.valuestr() );
+
+ case CodeWScope:
+ if ( !f.codeWScopeObject().isEmpty() )
+ log() << "warning: CodeWScope doesn't transfer to db.eval" << endl;
+ return newFunction( f.codeWScopeCode() );
+
+ case mongo::String:
+// return v8::String::NewExternal( new ExternalString( f.valuestr() ));
+ return v8::String::New( f.valuestr() );
+// return getV8Str( f.valuestr() );
+
+ case mongo::jstOID:
+ return newId( f.__oid() );
+
+ case mongo::NumberDouble:
+ case mongo::NumberInt:
+ return v8::Number::New( f.number() );
+
+ case mongo::Array:
+        // for arrays it's better to use a non-lazy object because:
+        // - the lazy array is not a true v8 array and requires v8 source changes for all methods to work
+ // - it made several tests about 1.5x slower
+ // - most times when an array is accessed, all its values will be used
+ return mongoToV8( f.embeddedObject() , true, readOnly );
+ case mongo::Object:
+ return mongoToLZV8( f.embeddedObject() , false, readOnly);
+
+ case mongo::Date:
+ return v8::Date::New( (double) ((long long)f.date().millis) );
+
+ case mongo::Bool:
+ return v8::Boolean::New( f.boolean() );
+
+ case mongo::EOO:
+ case mongo::jstNULL:
+ case mongo::Undefined: // duplicate sm behavior
+ return v8::Null();
+
+ case mongo::RegEx: {
+ v8::Function * regex = getNamedCons( "RegExp" );
+
+ v8::Handle<v8::Value> argv[2];
+ argv[0] = v8::String::New( f.regex() );
+ argv[1] = v8::String::New( f.regexFlags() );
+
+ return regex->NewInstance( 2 , argv );
+ break;
+ }
+
+ case mongo::BinData: {
+ int len;
+ const char *data = f.binData( len );
+
+ v8::Function* binData = getNamedCons( "BinData" );
+ v8::Handle<v8::Value> argv[3];
+ argv[0] = v8::Number::New( len );
+ argv[1] = v8::Number::New( f.binDataType() );
+ argv[2] = v8::String::New( data, len );
+ return binData->NewInstance( 3, argv );
+        }
+
+ case mongo::Timestamp: {
+ Local<v8::Object> sub = internalFieldObjects->NewInstance();
+
+ sub->Set( V8STR_T , v8::Number::New( f.timestampTime() ) );
+ sub->Set( V8STR_I , v8::Number::New( f.timestampInc() ) );
+ sub->SetInternalField( 0, v8::Uint32::New( f.type() ) );
+
+ return sub;
+ }
+
+ case mongo::NumberLong: {
+ unsigned long long val = f.numberLong();
+ v8::Function* numberLong = getNamedCons( "NumberLong" );
+ // values above 2^53 are not accurately represented in JS
+ if ( (long long)val == (long long)(double)(long long)(val) && val < 9007199254740992ULL ) {
+ v8::Handle<v8::Value> argv[1];
+ argv[0] = v8::Number::New( (double)(long long)( val ) );
+ return numberLong->NewInstance( 1, argv );
+ }
+ else {
+ v8::Handle<v8::Value> argv[3];
+ argv[0] = v8::Number::New( (double)(long long)( val ) );
+ argv[1] = v8::Integer::New( val >> 32 );
+ argv[2] = v8::Integer::New( (unsigned long)(val & 0x00000000ffffffff) );
+ return numberLong->NewInstance( 3, argv );
+ }
+ }
+
+// case mongo::NumberInt: {
+// Local<v8::Object> sub = internalFieldObjects->NewInstance();
+// int val = f.numberInt();
+// v8::Function* numberInt = getNamedCons( "NumberInt" );
+// v8::Handle<v8::Value> argv[1];
+// argv[0] = v8::Int32::New(val);
+// return numberInt->NewInstance( 1, argv );
+// }
+
+ case mongo::MinKey: {
+ Local<v8::Object> sub = internalFieldObjects->NewInstance();
+ sub->Set( V8STR_MINKEY, v8::Boolean::New( true ) );
+ sub->SetInternalField( 0, v8::Uint32::New( f.type() ) );
+ return sub;
+ }
+
+ case mongo::MaxKey: {
+ Local<v8::Object> sub = internalFieldObjects->NewInstance();
+ sub->Set( V8STR_MAXKEY, v8::Boolean::New( true ) );
+ sub->SetInternalField( 0, v8::Uint32::New( f.type() ) );
+ return sub;
+ }
+
+ case mongo::DBRef: {
+ v8::Function* dbPointer = getNamedCons( "DBPointer" );
+ v8::Handle<v8::Value> argv[2];
+ argv[0] = getV8Str( f.dbrefNS() );
+ argv[1] = newId( f.dbrefOID() );
+ return dbPointer->NewInstance(2, argv);
+ }
+
+ default:
+ cout << "can't handle type: ";
+ cout << f.type() << " ";
+ cout << f.toString();
+ cout << endl;
+ break;
+ }
+
+ return v8::Undefined();
+ }
+
+ void V8Scope::append( BSONObjBuilder & builder , const char * fieldName , const char * scopeName ) {
+ V8_SIMPLE_HEADER
+ Handle<v8::String> v8name = getV8Str(scopeName);
+ Handle<Value> value = _global->Get( v8name );
+ v8ToMongoElement(builder, fieldName, value);
+ }
+
+ void V8Scope::v8ToMongoElement( BSONObjBuilder & b , const string sname , v8::Handle<v8::Value> value , int depth, BSONObj* originalParent ) {
+
+ if ( value->IsString() ) {
+// Handle<v8::String> str = Handle<v8::String>::Cast(value);
+// ExternalString* es = (ExternalString*) (str->GetExternalAsciiStringResource());
+// b.append( sname , es->data() );
+ b.append( sname , toSTLString( value ).c_str() );
+ return;
+ }
+
+ if ( value->IsFunction() ) {
+ b.appendCode( sname , toSTLString( value ) );
+ return;
+ }
+
+ if ( value->IsNumber() ) {
+ double val = value->ToNumber()->Value();
+ // if previous type was integer, keep it
+ int intval = (int)val;
+ if (val == intval && originalParent) {
+ BSONElement elmt = originalParent->getField(sname);
+ if (elmt.type() == mongo::NumberInt) {
+ b.append( sname , intval );
+ return;
+ }
+ }
+
+ b.append( sname , val );
+ return;
+ }
+
+ if ( value->IsArray() ) {
+ BSONObj sub = v8ToMongo( value->ToObject() , depth );
+ b.appendArray( sname , sub );
+ return;
+ }
+
+ if ( value->IsDate() ) {
+ long long dateval = (long long)(v8::Date::Cast( *value )->NumberValue());
+ b.appendDate( sname , Date_t( (unsigned long long) dateval ) );
+ return;
+ }
+
+ if ( value->IsExternal() )
+ return;
+
+ if ( value->IsObject() ) {
+ // The user could potentially modify the fields of these special objects,
+ // wreaking havoc when we attempt to reinterpret them. Not doing any validation
+ // for now...
+ Local< v8::Object > obj = value->ToObject();
+ if ( obj->InternalFieldCount() && obj->GetInternalField( 0 )->IsNumber() ) {
+ switch( obj->GetInternalField( 0 )->ToInt32()->Value() ) { // NOTE Uint32's Value() gave me a linking error, so going with this instead
+ case Timestamp:
+ b.appendTimestamp( sname,
+ Date_t( (unsigned long long)(obj->Get( V8STR_T )->ToNumber()->Value() )),
+ obj->Get( V8STR_I )->ToInt32()->Value() );
+ return;
+ case MinKey:
+ b.appendMinKey( sname );
+ return;
+ case MaxKey:
+ b.appendMaxKey( sname );
+ return;
+ default:
+ assert( "invalid internal field" == 0 );
+ }
+ }
+ string s = toSTLString( value );
+ if ( s.size() && s[0] == '/' ) {
+ s = s.substr( 1 );
+ string r = s.substr( 0 , s.rfind( "/" ) );
+ string o = s.substr( s.rfind( "/" ) + 1 );
+ b.appendRegex( sname , r , o );
+ }
+ else if ( value->ToObject()->GetPrototype()->IsObject() &&
+ value->ToObject()->GetPrototype()->ToObject()->HasRealNamedProperty( V8STR_ISOBJECTID ) ) {
+ OID oid;
+ oid.init( toSTLString( value->ToObject()->Get(getV8Str("str")) ) );
+ b.appendOID( sname , &oid );
+ }
+ else if ( !value->ToObject()->GetHiddenValue( V8STR_NUMBERLONG ).IsEmpty() ) {
+ // TODO might be nice to potentially speed this up with an indexed internal
+ // field, but I don't yet know how to use an ObjectTemplate with a
+ // constructor.
+ v8::Handle< v8::Object > it = value->ToObject();
+ long long val;
+ if ( !it->Has( getV8Str( "top" ) ) ) {
+ val = (long long)( it->Get( getV8Str( "floatApprox" ) )->NumberValue() );
+ }
+ else {
+ val = (long long)
+ ( (unsigned long long)( it->Get( getV8Str( "top" ) )->ToInt32()->Value() ) << 32 ) +
+ (unsigned)( it->Get( getV8Str( "bottom" ) )->ToInt32()->Value() );
+ }
+
+ b.append( sname, val );
+ }
+ else if ( !value->ToObject()->GetHiddenValue( V8STR_NUMBERINT ).IsEmpty() ) {
+ v8::Handle< v8::Object > it = value->ToObject();
+ b.append(sname, it->GetHiddenValue(V8STR_NUMBERINT)->Int32Value());
+ }
+ else if ( !value->ToObject()->GetHiddenValue( V8STR_DBPTR ).IsEmpty() ) {
+ OID oid;
+ Local<Value> theid = value->ToObject()->Get( getV8Str( "id" ) );
+ oid.init( toSTLString( theid->ToObject()->Get(getV8Str("str")) ) );
+ string ns = toSTLString( value->ToObject()->Get( getV8Str( "ns" ) ) );
+ b.appendDBRef( sname, ns, oid );
+ }
+ else if ( !value->ToObject()->GetHiddenValue( V8STR_BINDATA ).IsEmpty() ) {
+ int len = obj->Get( getV8Str( "len" ) )->ToInt32()->Value();
+ Local<External> c = External::Cast( *(obj->GetInternalField( 0 )) );
+                const char* dataArray = (char*)(c->Value());
+ b.appendBinData( sname,
+ len,
+ mongo::BinDataType( obj->Get( getV8Str( "type" ) )->ToInt32()->Value() ),
+ dataArray );
+ }
+ else {
+ BSONObj sub = v8ToMongo( value->ToObject() , depth );
+ b.append( sname , sub );
+ }
+ return;
+ }
+
+ if ( value->IsBoolean() ) {
+ b.appendBool( sname , value->ToBoolean()->Value() );
+ return;
+ }
+
+ if ( value->IsUndefined() ) {
+ b.appendUndefined( sname );
+ return;
+ }
+
+ if ( value->IsNull() ) {
+ b.appendNull( sname );
+ return;
+ }
+
+ cout << "don't know how to convert to mongo field [" << sname << "]\t" << value << endl;
+ }
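+
+ // Usage sketch (illustrative only, not part of the build): appending one
+ // converted field. A plain JS number lands as a double unless the
+ // NumberInt handling above applies.
+ //
+ //     BSONObjBuilder b;
+ //     v8ToMongoElement( b , "count" , v8::Number::New( 42 ) );
+ //     BSONObj out = b.obj();   // { count: 42.0 }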
+
+ BSONObj V8Scope::v8ToMongo( v8::Handle<v8::Object> o , int depth ) {
+ BSONObj* originalBSON = 0;
+ if (o->InternalFieldCount() > 0) {
+ originalBSON = unwrapBSONObj(o);
+
+ if ( !o->GetHiddenValue( V8STR_RO ).IsEmpty() ||
+ (o->HasNamedLookupInterceptor() && o->GetHiddenValue( V8STR_MODIFIED ).IsEmpty()) ) {
+ // object was readonly, use bson as is
+ return *originalBSON;
+ }
+ }
+
+ BSONObjBuilder b;
+
+ if ( depth == 0 ) {
+ if ( o->HasRealNamedProperty( V8STR_ID ) ) {
+ v8ToMongoElement( b , "_id" , o->Get( V8STR_ID ), 0, originalBSON );
+ }
+ }
+
+ Local<v8::Array> names = o->GetPropertyNames();
+ for ( unsigned int i=0; i<names->Length(); i++ ) {
+ v8::Local<v8::String> name = names->Get( i )->ToString();
+
+// if ( o->GetPrototype()->IsObject() &&
+// o->GetPrototype()->ToObject()->HasRealNamedProperty( name ) )
+// continue;
+
+ v8::Local<v8::Value> value = o->Get( name );
+
+ const string sname = toSTLString( name );
+ if ( depth == 0 && sname == "_id" )
+ continue;
+
+ v8ToMongoElement( b , sname , value , depth + 1, originalBSON );
+ }
+ return b.obj();
+ }
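+
+ // Round-trip sketch (illustrative): read-only wrapped objects short-circuit
+ // above and return the original BSON without copying.
+ //
+ //     v8::Handle<v8::Object> jsObj = ...;       // some script-created object
+ //     BSONObj converted = v8ToMongo( jsObj );   // depth 0: "_id" emitted first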
+
+ // --- random utils ----
+
+ v8::Function * V8Scope::getNamedCons( const char * name ) {
+ return v8::Function::Cast( *(v8::Context::GetCurrent()->Global()->Get( getV8Str( name ) ) ) );
+ }
+
+ v8::Function * V8Scope::getObjectIdCons() {
+ return getNamedCons( "ObjectId" );
+ }
+
+ Handle<v8::Value> V8Scope::Print(V8Scope* scope, const Arguments& args) {
+ bool first = true;
+ for (int i = 0; i < args.Length(); i++) {
+ HandleScope handle_scope;
+ if (first) {
+ first = false;
+ }
+ else {
+ printf(" ");
+ }
+ v8::String::Utf8Value str(args[i]);
+ printf("%s", *str);
+ }
+ printf("\n");
+ return v8::Undefined();
+ }
+
+ Handle<v8::Value> V8Scope::Version(V8Scope* scope, const Arguments& args) {
+ HandleScope handle_scope;
+ return handle_scope.Close( v8::String::New(v8::V8::GetVersion()) );
+ }
+
+ Handle<v8::Value> V8Scope::GCV8(V8Scope* scope, const Arguments& args) {
+ V8Lock l;
+ v8::V8::LowMemoryNotification();
+ return v8::Undefined();
+ }
+
+ /**
+ * Gets a V8 string from the scope's cache, creating and caching one if needed
+ */
+ v8::Handle<v8::String> V8Scope::getV8Str(string str) {
+ Persistent<v8::String> ptr = _strCache[str];
+ if (ptr.IsEmpty()) {
+ ptr = Persistent<v8::String>::New(v8::String::New(str.c_str()));
+ _strCache[str] = ptr;
+// cout << "Adding str " + str << endl;
+ }
+// cout << "Returning str " + str << endl;
+ return ptr;
+ }
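+
+ // Call-pattern sketch (illustrative): repeated field names hit the cache;
+ // one-off strings should use the uncached variant from the header.
+ //
+ //     v8::Handle<v8::String> k1 = getV8Str( "floatApprox" );   // interned per scope
+ //     v8::Handle<v8::String> k2 = getLocalV8Str( "oneShot" );  // transient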
+
+ // to be called with v8 mutex
+ void V8Scope::enableV8Interrupt() {
+ v8Locks::InterruptLock l;
+ if ( globalScriptEngine->haveGetInterruptSpecCallback() ) {
+ unsigned op = globalScriptEngine->getInterruptSpec();
+ __interruptSpecToThreadId[ op ] = v8::V8::GetCurrentThreadId();
+ __interruptSpecToIsolate[ op ] = _isolate;
+ }
+ }
+
+ // to be called with v8 mutex
+ void V8Scope::disableV8Interrupt() {
+ v8Locks::InterruptLock l;
+ if ( globalScriptEngine->haveGetInterruptSpecCallback() ) {
+ unsigned op = globalScriptEngine->getInterruptSpec();
+ __interruptSpecToIsolate.erase( op );
+ __interruptSpecToThreadId.erase( op );
+ }
+ }
+
+ // to be called with v8 mutex
+ bool V8Scope::pauseV8Interrupt() {
+ v8Locks::InterruptLock l;
+ if ( globalScriptEngine->haveGetInterruptSpecCallback() ) {
+ unsigned op = globalScriptEngine->getInterruptSpec();
+ int thread = __interruptSpecToThreadId[ op ];
+ if ( thread == -2 || thread == -3) {
+ // already paused
+ return false;
+ }
+ __interruptSpecToThreadId[ op ] = -2;
+ }
+ return true;
+ }
+
+ // to be called with v8 mutex
+ bool V8Scope::resumeV8Interrupt() {
+ v8Locks::InterruptLock l;
+ if ( globalScriptEngine->haveGetInterruptSpecCallback() ) {
+ unsigned op = globalScriptEngine->getInterruptSpec();
+ if (__interruptSpecToThreadId[ op ] == -3) {
+ // was interrupted
+ return false;
+ }
+ __interruptSpecToThreadId[ op ] = v8::V8::GetCurrentThreadId();
+ }
+ return true;
+ }
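+
+ // Pairing sketch (illustrative): the -2/-3 sentinels above mark "paused"
+ // and "interrupted while paused" respectively.
+ //
+ //     if ( pauseV8Interrupt() ) {
+ //         // ... v8 work that must not be interrupted ...
+ //         if ( ! resumeV8Interrupt() )
+ //             ; // an interrupt arrived while paused; caller should unwind
+ //     }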
+
+} // namespace mongo
diff --git a/src/mongo/scripting/engine_v8.h b/src/mongo/scripting/engine_v8.h
new file mode 100644
index 00000000000..48a9858c63b
--- /dev/null
+++ b/src/mongo/scripting/engine_v8.h
@@ -0,0 +1,254 @@
+//engine_v8.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <vector>
+#include "engine.h"
+#include <v8.h>
+
+using namespace v8;
+
+namespace mongo {
+
+ class V8ScriptEngine;
+ class V8Scope;
+
+ typedef Handle< Value > (*v8Function) ( V8Scope* scope, const v8::Arguments& args );
+
+ // V8 allows preemption of its mutex, and some of our v8 usage is not
+ // preemption safe, so we add a second mutex that is never preempted. The
+ // V8Lock should be used in place of v8::Locker except in certain special
+ // cases involving interrupts.
+ namespace v8Locks {
+ struct InterruptLock {
+ InterruptLock();
+ ~InterruptLock();
+ };
+
+ // the implementations are quite simple - objects must be destroyed in
+ // reverse of the order created, and should not be shared between threads
+ struct RecursiveLock {
+ RecursiveLock();
+ ~RecursiveLock();
+ bool _unlock;
+ };
+ struct RecursiveUnlock {
+ RecursiveUnlock();
+ ~RecursiveUnlock();
+ bool _lock;
+ };
+ } // namespace v8Locks
+ class V8Lock {
+ public:
+ V8Lock() : _preemptionLock(Isolate::GetCurrent()){}
+
+ private:
+ v8Locks::RecursiveLock _noPreemptionLock;
+ v8::Locker _preemptionLock;
+ };
+ struct V8Unlock {
+ public:
+ V8Unlock() : _preemptionUnlock(Isolate::GetCurrent()){}
+
+ private:
+ v8::Unlocker _preemptionUnlock;
+ v8Locks::RecursiveUnlock _noPreemptionUnlock;
+ };
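+
+ // Usage sketch (illustrative): both guards are scoped (RAII), so a block
+ // that needs v8 simply declares a V8Lock and relies on reverse-order release.
+ //
+ //     {
+ //         V8Lock l;      // recursive no-preemption lock, then v8::Locker
+ //         // ... v8 calls ...
+ //     }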
+
+ class V8Scope : public Scope {
+ public:
+
+ V8Scope( V8ScriptEngine * engine );
+ ~V8Scope();
+
+ virtual void reset();
+ virtual void init( const BSONObj * data );
+
+ virtual void localConnect( const char * dbName );
+ virtual void externalSetup();
+
+ v8::Handle<v8::Value> get( const char * field ); // caller must create context and handle scopes
+ virtual double getNumber( const char *field );
+ virtual int getNumberInt( const char *field );
+ virtual long long getNumberLongLong( const char *field );
+ virtual string getString( const char *field );
+ virtual bool getBoolean( const char *field );
+ virtual BSONObj getObject( const char *field );
+ Handle<v8::Object> getGlobalObject() { return _global; }
+
+ virtual int type( const char *field );
+
+ virtual void setNumber( const char *field , double val );
+ virtual void setString( const char *field , const char * val );
+ virtual void setBoolean( const char *field , bool val );
+ virtual void setElement( const char *field , const BSONElement& e );
+ virtual void setObject( const char *field , const BSONObj& obj , bool readOnly);
+ virtual void setFunction( const char *field , const char * code );
+// virtual void setThis( const BSONObj * obj );
+
+ virtual void rename( const char * from , const char * to );
+
+ virtual ScriptingFunction _createFunction( const char * code );
+ Local< v8::Function > __createFunction( const char * code );
+ virtual int invoke( ScriptingFunction func , const BSONObj* args, const BSONObj* recv, int timeoutMs = 0 , bool ignoreReturn = false, bool readOnlyArgs = false, bool readOnlyRecv = false );
+ virtual bool exec( const StringData& code , const string& name , bool printResult , bool reportError , bool assertOnError, int timeoutMs );
+ virtual string getError() { return _error; }
+ virtual bool hasOutOfMemoryException();
+
+ virtual void injectNative( const char *field, NativeFunction func, void* data = 0 );
+ void injectNative( const char *field, NativeFunction func, Handle<v8::Object>& obj, void* data = 0 );
+ void injectV8Function( const char *field, v8Function func );
+ void injectV8Function( const char *field, v8Function func, Handle<v8::Object>& obj );
+ void injectV8Function( const char *field, v8Function func, Handle<v8::Template>& t );
+ Handle<v8::FunctionTemplate> createV8Function( v8Function func );
+
+ void gc();
+
+ Handle< Context > context() const { return _context; }
+
+ v8::Local<v8::Object> mongoToV8( const mongo::BSONObj & m , bool array = false , bool readOnly = false );
+ v8::Handle<v8::Object> mongoToLZV8( const mongo::BSONObj & m , bool array = false , bool readOnly = false );
+ mongo::BSONObj v8ToMongo( v8::Handle<v8::Object> o , int depth = 0 );
+
+ void v8ToMongoElement( BSONObjBuilder & b , const string sname , v8::Handle<v8::Value> value , int depth = 0, BSONObj* originalParent=0 );
+ v8::Handle<v8::Value> mongoToV8Element( const BSONElement &f, bool readOnly = false );
+ virtual void append( BSONObjBuilder & builder , const char * fieldName , const char * scopeName );
+
+ v8::Function * getNamedCons( const char * name );
+ v8::Function * getObjectIdCons();
+ Local< v8::Value > newId( const OID &id );
+
+ Persistent<v8::Object> wrapBSONObject(Local<v8::Object> obj, BSONObj* data);
+ Persistent<v8::Object> wrapArrayObject(Local<v8::Object> obj, char* data);
+
+ v8::Handle<v8::String> getV8Str(string str);
+// inline v8::Handle<v8::String> getV8Str(string str) { return v8::String::New(str.c_str()); }
+ inline v8::Handle<v8::String> getLocalV8Str(string str) { return v8::String::New(str.c_str()); }
+
+ v8::Isolate* getIsolate() { return _isolate; }
+ Persistent<Context> getContext() { return _context; }
+
+ // call with v8 mutex:
+ void enableV8Interrupt();
+ void disableV8Interrupt();
+ bool pauseV8Interrupt();
+ bool resumeV8Interrupt();
+
+ Handle<v8::String> V8STR_CONN;
+ Handle<v8::String> V8STR_ID;
+ Handle<v8::String> V8STR_LENGTH;
+ Handle<v8::String> V8STR_LEN;
+ Handle<v8::String> V8STR_TYPE;
+ Handle<v8::String> V8STR_ISOBJECTID;
+ Handle<v8::String> V8STR_NATIVE_FUNC;
+ Handle<v8::String> V8STR_NATIVE_DATA;
+ Handle<v8::String> V8STR_V8_FUNC;
+ Handle<v8::String> V8STR_RETURN;
+ Handle<v8::String> V8STR_ARGS;
+ Handle<v8::String> V8STR_T;
+ Handle<v8::String> V8STR_I;
+ Handle<v8::String> V8STR_EMPTY;
+ Handle<v8::String> V8STR_MINKEY;
+ Handle<v8::String> V8STR_MAXKEY;
+ Handle<v8::String> V8STR_NUMBERLONG;
+ Handle<v8::String> V8STR_NUMBERINT;
+ Handle<v8::String> V8STR_DBPTR;
+ Handle<v8::String> V8STR_BINDATA;
+ Handle<v8::String> V8STR_WRAPPER;
+ Handle<v8::String> V8STR_RO;
+ Handle<v8::String> V8STR_MODIFIED;
+ Handle<v8::String> V8STR_FULLNAME;
+
+ private:
+ void _startCall();
+
+ static Handle< Value > nativeCallback( V8Scope* scope, const Arguments &args );
+ static v8::Handle< v8::Value > v8Callback( const v8::Arguments &args );
+ static Handle< Value > load( V8Scope* scope, const Arguments &args );
+ static Handle< Value > Print(V8Scope* scope, const v8::Arguments& args);
+ static Handle< Value > Version(V8Scope* scope, const v8::Arguments& args);
+ static Handle< Value > GCV8(V8Scope* scope, const v8::Arguments& args);
+
+ V8ScriptEngine * _engine;
+
+ Persistent<Context> _context;
+ Persistent<v8::Object> _global;
+
+ string _error;
+ vector< Persistent<Value> > _funcs;
+ v8::Persistent<v8::Object> _emptyObj;
+
+ v8::Persistent<v8::Function> _wrapper;
+
+ enum ConnectState { NOT , LOCAL , EXTERNAL };
+ ConnectState _connectState;
+
+ std::map <string, v8::Persistent <v8::String> > _strCache;
+
+ Persistent<v8::ObjectTemplate> lzObjectTemplate;
+ Persistent<v8::ObjectTemplate> roObjectTemplate;
+ Persistent<v8::ObjectTemplate> lzArrayTemplate;
+ Persistent<v8::ObjectTemplate> internalFieldObjects;
+ v8::Isolate* _isolate;
+ };
+
+ class V8ScriptEngine : public ScriptEngine {
+ public:
+ V8ScriptEngine();
+ virtual ~V8ScriptEngine();
+
+ virtual Scope * createScope() { return new V8Scope( this ); }
+
+ virtual void runTest() {}
+
+ bool utf8Ok() const { return true; }
+
+ class V8UnlockForClient : public Unlocker {
+ V8Unlock u_;
+ };
+
+ virtual auto_ptr<Unlocker> newThreadUnlocker() { return auto_ptr< Unlocker >( new V8UnlockForClient ); }
+
+ virtual void interrupt( unsigned opSpec );
+ virtual void interruptAll();
+
+ private:
+ friend class V8Scope;
+ };
+
+ class ExternalString : public v8::String::ExternalAsciiStringResource {
+ public:
+ ExternalString( const std::string& str ) : _data( str ) {
+ }
+
+ ~ExternalString() {
+ }
+
+ const char* data () const { return _data.c_str(); }
+ size_t length () const { return _data.length(); }
+ private:
+ std::string _data;
+ };
+
+ extern ScriptEngine * globalScriptEngine;
+
+}
diff --git a/src/mongo/scripting/sm_db.cpp b/src/mongo/scripting/sm_db.cpp
new file mode 100644
index 00000000000..ea8780fa7c0
--- /dev/null
+++ b/src/mongo/scripting/sm_db.cpp
@@ -0,0 +1,1284 @@
+// sm_db.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// split out of engine_spidermonkey.cpp for now; still closely tied to it
+
+#include "../client/syncclusterconnection.h"
+#include "../util/base64.h"
+#include "../util/text.h"
+#include "../util/hex.h"
+
+#if ( BOOST_VERSION >= 104200 )
+//#include <boost/uuid/uuid.hpp>
+#define HAVE_UUID 1
+#endif
+
+namespace mongo {
+
+ bool haveLocalShardingInfo( const string& ns );
+
+ // ------------ some defs needed ---------------
+
+ JSObject * doCreateCollection( JSContext * cx , JSObject * db , const string& shortName );
+
+ // ------------ utils ------------------
+
+
+ bool isSpecialName( const string& name ) {
+ static set<string> names;
+ if ( names.size() == 0 ) {
+ names.insert( "tojson" );
+ names.insert( "toJson" );
+ names.insert( "toString" );
+ }
+
+ if ( name.length() == 0 )
+ return false;
+
+ if ( name[0] == '_' )
+ return true;
+
+ return names.count( name ) > 0;
+ }
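+
+ // e.g. isSpecialName( "_id" ) and isSpecialName( "tojson" ) are true,
+ // while isSpecialName( "users" ) is false, so "db.users" falls through
+ // to lazy collection creation in the resolve hooks below.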
+
+
+ // ------ cursor ------
+
+ class CursorHolder {
+ public:
+ CursorHolder( auto_ptr< DBClientCursor > &cursor, const shared_ptr< DBClientWithCommands > &connection ) :
+ connection_( connection ),
+ cursor_( cursor ) {
+ assert( cursor_.get() );
+ }
+ DBClientCursor *get() const { return cursor_.get(); }
+ private:
+ shared_ptr< DBClientWithCommands > connection_;
+ auto_ptr< DBClientCursor > cursor_;
+ };
+
+ DBClientCursor *getCursor( JSContext *cx, JSObject *obj ) {
+ CursorHolder * holder = (CursorHolder*)JS_GetPrivate( cx , obj );
+ uassert( 10235 , "no cursor!" , holder );
+ return holder->get();
+ }
+
+ JSBool internal_cursor_constructor( JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval ) {
+ uassert( 10236 , "no args to internal_cursor_constructor" , argc == 0 );
+ assert( JS_SetPrivate( cx , obj , 0 ) ); // just for safety
+ return JS_TRUE;
+ }
+
+ void internal_cursor_finalize( JSContext * cx , JSObject * obj ) {
+ CursorHolder * holder = (CursorHolder*)JS_GetPrivate( cx , obj );
+ if ( holder ) {
+ delete holder;
+ assert( JS_SetPrivate( cx , obj , 0 ) );
+ }
+ }
+
+ JSBool internal_cursor_hasNext(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) {
+ DBClientCursor *cursor = getCursor( cx, obj );
+ try {
+ *rval = cursor->more() ? JSVAL_TRUE : JSVAL_FALSE;
+ }
+ catch ( std::exception& e ) {
+ JS_ReportError( cx , e.what() );
+ return JS_FALSE;
+ }
+ return JS_TRUE;
+ }
+
+ JSBool internal_cursor_objsLeftInBatch(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) {
+ DBClientCursor *cursor = getCursor( cx, obj );
+ Convertor c(cx);
+ *rval = c.toval((double) cursor->objsLeftInBatch() );
+ return JS_TRUE;
+ }
+
+ JSBool internal_cursor_next(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) {
+ DBClientCursor *cursor = getCursor( cx, obj );
+
+ BSONObj n;
+
+ try {
+ if ( ! cursor->more() ) {
+ JS_ReportError( cx , "cursor at the end" );
+ return JS_FALSE;
+ }
+
+ n = cursor->next();
+ }
+ catch ( std::exception& e ) {
+ JS_ReportError( cx , e.what() );
+ return JS_FALSE;
+ }
+
+ Convertor c(cx);
+ *rval = c.toval( &n );
+ return JS_TRUE;
+ }
+
+ JSFunctionSpec internal_cursor_functions[] = {
+ { "hasNext" , internal_cursor_hasNext , 0 , JSPROP_READONLY | JSPROP_PERMANENT, 0 } ,
+ { "objsLeftInBatch" , internal_cursor_objsLeftInBatch , 0 , JSPROP_READONLY | JSPROP_PERMANENT, 0 } ,
+ { "next" , internal_cursor_next , 0 , JSPROP_READONLY | JSPROP_PERMANENT, 0 } ,
+ { 0 }
+ };
+
+ JSClass internal_cursor_class = {
+ "InternalCursor" , JSCLASS_HAS_PRIVATE ,
+ JS_PropertyStub, JS_PropertyStub, JS_PropertyStub, JS_PropertyStub,
+ JS_EnumerateStub, JS_ResolveStub , JS_ConvertStub, internal_cursor_finalize,
+ JSCLASS_NO_OPTIONAL_MEMBERS
+ };
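+
+ // Lifecycle sketch (illustrative): the private slot owns a CursorHolder,
+ // set when a query creates the object and reclaimed by the finalizer.
+ //
+ //     JSObject * cur = JS_NewObject( cx , &internal_cursor_class , 0 , 0 );
+ //     JS_SetPrivate( cx , cur , new CursorHolder( cursor , conn ) );
+ //     // GC later runs internal_cursor_finalize, which deletes the holder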
+
+
+ // ------ mongo stuff ------
+
+ JSBool mongo_constructor( JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval ) {
+ uassert( 10237 , "mongo_constructor not implemented yet" , 0 );
+ throw -1;
+ }
+
+ JSBool mongo_local_constructor( JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval ) {
+ Convertor c( cx );
+
+ shared_ptr< DBClientWithCommands > client( createDirectClient() );
+ assert( JS_SetPrivate( cx , obj , (void*)( new shared_ptr< DBClientWithCommands >( client ) ) ) );
+
+ jsval host = c.toval( "EMBEDDED" );
+ assert( JS_SetProperty( cx , obj , "host" , &host ) );
+
+ return JS_TRUE;
+ }
+
+ JSBool mongo_external_constructor( JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval ) {
+ Convertor c( cx );
+
+ smuassert( cx , "0 or 1 args to Mongo" , argc <= 1 );
+
+ string host = "127.0.0.1";
+ if ( argc > 0 )
+ host = c.toString( argv[0] );
+
+ string errmsg;
+
+ ConnectionString cs = ConnectionString::parse( host , errmsg );
+ if ( ! cs.isValid() ) {
+ JS_ReportError( cx , errmsg.c_str() );
+ return JS_FALSE;
+ }
+
+ shared_ptr< DBClientWithCommands > conn( cs.connect( errmsg ) );
+ if ( ! conn ) {
+ JS_ReportError( cx , errmsg.c_str() );
+ return JS_FALSE;
+ }
+
+ try{
+ ScriptEngine::runConnectCallback( *conn );
+ }
+ catch( std::exception& e ){
+ // Can happen if connection goes down while we're starting up here
+ // Catch so that we don't get a hard-to-trace segfault from SM
+ JS_ReportError( cx, ((string)( str::stream() << "Error during mongo startup." << causedBy( e ) )).c_str() );
+ return JS_FALSE;
+ }
+
+ assert( JS_SetPrivate( cx , obj , (void*)( new shared_ptr< DBClientWithCommands >( conn ) ) ) );
+ jsval host_val = c.toval( host.c_str() );
+ assert( JS_SetProperty( cx , obj , "host" , &host_val ) );
+ return JS_TRUE;
+ }
+
+ DBClientWithCommands *getConnection( JSContext *cx, JSObject *obj ) {
+ shared_ptr< DBClientWithCommands > * connHolder = (shared_ptr< DBClientWithCommands >*)JS_GetPrivate( cx , obj );
+ uassert( 10239 , "no connection!" , connHolder && connHolder->get() );
+ return connHolder->get();
+ }
+
+ void mongo_finalize( JSContext * cx , JSObject * obj ) {
+ shared_ptr< DBClientWithCommands > * connHolder = (shared_ptr< DBClientWithCommands >*)JS_GetPrivate( cx , obj );
+ if ( connHolder ) {
+ delete connHolder;
+ assert( JS_SetPrivate( cx , obj , 0 ) );
+ }
+ }
+
+ JSClass mongo_class = {
+ "Mongo" , JSCLASS_HAS_PRIVATE | JSCLASS_NEW_RESOLVE ,
+ JS_PropertyStub, JS_PropertyStub, JS_PropertyStub, JS_PropertyStub,
+ JS_EnumerateStub, JS_ResolveStub , JS_ConvertStub, mongo_finalize,
+ JSCLASS_NO_OPTIONAL_MEMBERS
+ };
+
+ JSBool mongo_auth(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) {
+ smuassert( cx , "mongo_auth needs 3 args" , argc == 3 );
+ shared_ptr< DBClientWithCommands > * connHolder = (shared_ptr< DBClientWithCommands >*)JS_GetPrivate( cx , obj );
+ smuassert( cx , "no connection!" , connHolder && connHolder->get() );
+ DBClientWithCommands *conn = connHolder->get();
+
+ Convertor c( cx );
+
+ string db = c.toString( argv[0] );
+ string username = c.toString( argv[1] );
+ string password = c.toString( argv[2] );
+ string errmsg = "";
+
+ try {
+ if (conn->auth(db, username, password, errmsg)) {
+ return JS_TRUE;
+ }
+ JS_ReportError( cx, errmsg.c_str() );
+ }
+ catch ( ... ) {
+ JS_ReportError( cx , "error doing query: unknown" );
+ }
+ return JS_FALSE;
+ }
+
+ JSBool mongo_find(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) {
+ smuassert( cx , "mongo_find needs 7 args" , argc == 7 );
+ shared_ptr< DBClientWithCommands > * connHolder = (shared_ptr< DBClientWithCommands >*)JS_GetPrivate( cx , obj );
+ smuassert( cx , "no connection!" , connHolder && connHolder->get() );
+ DBClientWithCommands *conn = connHolder->get();
+
+ Convertor c( cx );
+
+ string ns = c.toString( argv[0] );
+
+ BSONObj q = c.toObject( argv[1] );
+ BSONObj f = c.toObject( argv[2] );
+
+ int nToReturn = (int) c.toNumber( argv[3] );
+ int nToSkip = (int) c.toNumber( argv[4] );
+ int batchSize = (int) c.toNumber( argv[5] );
+ int options = (int)c.toNumber( argv[6] );
+
+ try {
+
+ auto_ptr<DBClientCursor> cursor = conn->query( ns , q , nToReturn , nToSkip , f.nFields() ? &f : 0 , options , batchSize );
+ if ( ! cursor.get() ) {
+ log() << "query failed : " << ns << " " << q << " to: " << conn->toString() << endl;
+ JS_ReportError( cx , "error doing query: failed" );
+ return JS_FALSE;
+ }
+ JSObject * mycursor = JS_NewObject( cx , &internal_cursor_class , 0 , 0 );
+ CHECKNEWOBJECT( mycursor, cx, "internal_cursor_class" );
+ assert( JS_SetPrivate( cx , mycursor , new CursorHolder( cursor, *connHolder ) ) );
+ *rval = OBJECT_TO_JSVAL( mycursor );
+ return JS_TRUE;
+ }
+ catch ( ... ) {
+ JS_ReportError( cx , "error doing query: unknown" );
+ return JS_FALSE;
+ }
+ }
+
+ JSBool mongo_update(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) {
+ smuassert( cx , "mongo_update needs at least 3 args" , argc >= 3 );
+ smuassert( cx , "2nd param to update has to be an object" , JSVAL_IS_OBJECT( argv[1] ) );
+ smuassert( cx , "3rd param to update has to be an object" , JSVAL_IS_OBJECT( argv[2] ) );
+
+ Convertor c( cx );
+ if ( c.getBoolean( obj , "readOnly" ) ) {
+ JS_ReportError( cx , "js db in read only mode - mongo_update" );
+ return JS_FALSE;
+ }
+
+ DBClientWithCommands * conn = getConnection( cx, obj );
+ uassert( 10245 , "no connection!" , conn );
+
+ string ns = c.toString( argv[0] );
+
+ bool upsert = argc > 3 && c.toBoolean( argv[3] );
+ bool multi = argc > 4 && c.toBoolean( argv[4] );
+
+ try {
+ conn->update( ns , c.toObject( argv[1] ) , c.toObject( argv[2] ) , upsert , multi );
+ return JS_TRUE;
+ }
+ catch ( ... ) {
+ JS_ReportError( cx , "error doing update" );
+ return JS_FALSE;
+ }
+ }
+
+ JSBool mongo_insert(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) {
+ smuassert( cx , "mongo_insert needs 2 args" , argc == 2 );
+ smuassert( cx , "2nd param to insert has to be an object" , JSVAL_IS_OBJECT( argv[1] ) );
+
+ Convertor c( cx );
+ if ( c.getBoolean( obj , "readOnly" ) ) {
+ JS_ReportError( cx , "js db in read only mode - mongo_insert" );
+ return JS_FALSE;
+ }
+
+ DBClientWithCommands * conn = getConnection( cx, obj );
+ uassert( 10248 , "no connection!" , conn );
+
+ string ns = c.toString( argv[0] );
+
+ try {
+ JSObject * insertObj = JSVAL_TO_OBJECT( argv[1] );
+
+ if( JS_IsArrayObject( cx, insertObj ) ){
+ vector<BSONObj> bos;
+
+ jsuint len;
+ JSBool gotLen = JS_GetArrayLength( cx, insertObj, &len );
+ smuassert( cx, "could not get length of array", gotLen );
+
+ for( jsuint i = 0; i < len; i++ ){
+
+ jsval el;
+ JSBool inserted = JS_GetElement( cx, insertObj, i, &el);
+ smuassert( cx, "could not find element in array object", inserted );
+
+ bos.push_back( c.toObject( el ) );
+ }
+
+ conn->insert( ns, bos );
+
+ return JS_TRUE;
+ }
+ else {
+ BSONObj o = c.toObject( argv[1] );
+ // TODO: add _id
+
+ conn->insert( ns , o );
+ return JS_TRUE;
+ }
+ }
+ catch ( std::exception& e ) {
+ stringstream ss;
+ ss << "error doing insert:" << e.what();
+ string s = ss.str();
+ JS_ReportError( cx , s.c_str() );
+ return JS_FALSE;
+ }
+ catch ( ... ) {
+ JS_ReportError( cx , "error doing insert" );
+ return JS_FALSE;
+ }
+ }
+
+ JSBool mongo_remove(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) {
+ smuassert( cx , "mongo_remove needs 2 or 3 arguments" , argc == 2 || argc == 3 );
+ smuassert( cx , "2nd param to insert has to be an object" , JSVAL_IS_OBJECT( argv[1] ) );
+
+ Convertor c( cx );
+ if ( c.getBoolean( obj , "readOnly" ) ) {
+ JS_ReportError( cx , "js db in read only mode - mongo_remove" );
+ return JS_FALSE;
+ }
+
+ DBClientWithCommands * conn = getConnection( cx, obj );
+ uassert( 10251 , "no connection!" , conn );
+
+ string ns = c.toString( argv[0] );
+ BSONObj o = c.toObject( argv[1] );
+ bool justOne = false;
+ if ( argc > 2 )
+ justOne = c.toBoolean( argv[2] );
+
+ try {
+ conn->remove( ns , o , justOne );
+ return JS_TRUE;
+ }
+ catch ( std::exception& e ) {
+ JS_ReportError( cx , e.what() );
+ return JS_FALSE;
+ }
+ catch ( ... ) {
+ JS_ReportError( cx , "error doing remove" );
+ return JS_FALSE;
+ }
+ }
+
+ JSFunctionSpec mongo_functions[] = {
+ { "auth" , mongo_auth , 0 , JSPROP_READONLY | JSPROP_PERMANENT, 0 } ,
+ { "find" , mongo_find , 0 , JSPROP_READONLY | JSPROP_PERMANENT, 0 } ,
+ { "update" , mongo_update , 0 , JSPROP_READONLY | JSPROP_PERMANENT, 0 } ,
+ { "insert" , mongo_insert , 0 , JSPROP_READONLY | JSPROP_PERMANENT, 0 } ,
+ { "remove" , mongo_remove , 0 , JSPROP_READONLY | JSPROP_PERMANENT, 0 } ,
+ { 0 }
+ };
+
+ // ------------- db_collection -------------
+
+ JSBool db_collection_constructor( JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval ) {
+ smuassert( cx , "db_collection_constructor wrong args" , argc == 4 );
+ assert( JS_SetProperty( cx , obj , "_mongo" , &(argv[0]) ) );
+ assert( JS_SetProperty( cx , obj , "_db" , &(argv[1]) ) );
+ assert( JS_SetProperty( cx , obj , "_shortName" , &(argv[2]) ) );
+ assert( JS_SetProperty( cx , obj , "_fullName" , &(argv[3]) ) );
+
+ Convertor c(cx);
+ if ( haveLocalShardingInfo( c.toString( argv[3] ) ) ) {
+ JS_ReportError( cx , "can't use sharded collection from db.eval" );
+ return JS_FALSE;
+ }
+
+ return JS_TRUE;
+ }
+
+ JSBool db_collection_resolve( JSContext *cx, JSObject *obj, jsval id, uintN flags, JSObject **objp ) {
+ if ( flags & JSRESOLVE_ASSIGNING )
+ return JS_TRUE;
+
+ Convertor c( cx );
+ string collname = c.toString( id );
+
+ if ( isSpecialName( collname ) )
+ return JS_TRUE;
+
+ if ( obj == c.getGlobalPrototype( "DBCollection" ) )
+ return JS_TRUE;
+
+ JSObject * proto = JS_GetPrototype( cx , obj );
+ if ( c.hasProperty( obj , collname.c_str() ) || ( proto && c.hasProperty( proto , collname.c_str() ) ) )
+ return JS_TRUE;
+
+ string name = c.toString( c.getProperty( obj , "_shortName" ) );
+ name += ".";
+ name += collname;
+
+ jsval db = c.getProperty( obj , "_db" );
+ if ( ! JSVAL_IS_OBJECT( db ) )
+ return JS_TRUE;
+
+ JSObject * coll = doCreateCollection( cx , JSVAL_TO_OBJECT( db ) , name );
+ if ( ! coll )
+ return JS_FALSE;
+ c.setProperty( obj , collname.c_str() , OBJECT_TO_JSVAL( coll ) );
+ *objp = obj;
+ return JS_TRUE;
+ }
+
+ JSClass db_collection_class = {
+ "DBCollection" , JSCLASS_HAS_PRIVATE | JSCLASS_NEW_RESOLVE ,
+ JS_PropertyStub, JS_PropertyStub, JS_PropertyStub, JS_PropertyStub,
+ JS_EnumerateStub, (JSResolveOp)(&db_collection_resolve) , JS_ConvertStub, JS_FinalizeStub,
+ JSCLASS_NO_OPTIONAL_MEMBERS
+ };
+
+
+ JSObject * doCreateCollection( JSContext * cx , JSObject * db , const string& shortName ) {
+ Convertor c(cx);
+
+ assert( c.hasProperty( db , "_mongo" ) );
+ assert( c.hasProperty( db , "_name" ) );
+
+ JSObject * coll = JS_NewObject( cx , &db_collection_class , 0 , 0 );
+ CHECKNEWOBJECT( coll, cx, "doCreateCollection" );
+ c.setProperty( coll , "_mongo" , c.getProperty( db , "_mongo" ) );
+ c.setProperty( coll , "_db" , OBJECT_TO_JSVAL( db ) );
+ c.setProperty( coll , "_shortName" , c.toval( shortName.c_str() ) );
+
+ string name = c.toString( c.getProperty( db , "_name" ) );
+ name += "." + shortName;
+ c.setProperty( coll , "_fullName" , c.toval( name.c_str() ) );
+
+ if ( haveLocalShardingInfo( name ) ) {
+ JS_ReportError( cx , "can't use sharded collection from db.eval" );
+ return 0;
+ }
+
+ return coll;
+ }
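+
+ // Resolution sketch (illustrative): a shell access like "db.foo" has no
+ // property yet, so db_resolve below calls doCreateCollection( cx , db , "foo" )
+ // and caches the result, making later accesses plain property reads.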
+
+ // -------------- DB ---------------
+
+
+ JSBool db_constructor( JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval ) {
+ smuassert( cx, "wrong number of arguments to DB" , argc == 2 );
+ assert( JS_SetProperty( cx , obj , "_mongo" , &(argv[0]) ) );
+ assert( JS_SetProperty( cx , obj , "_name" , &(argv[1]) ) );
+
+ return JS_TRUE;
+ }
+
+ JSBool db_resolve( JSContext *cx, JSObject *obj, jsval id, uintN flags, JSObject **objp ) {
+ if ( flags & JSRESOLVE_ASSIGNING )
+ return JS_TRUE;
+
+ Convertor c( cx );
+
+ if ( obj == c.getGlobalPrototype( "DB" ) )
+ return JS_TRUE;
+
+ string collname = c.toString( id );
+
+ if ( isSpecialName( collname ) )
+ return JS_TRUE;
+
+ JSObject * proto = JS_GetPrototype( cx , obj );
+ if ( proto && c.hasProperty( proto , collname.c_str() ) )
+ return JS_TRUE;
+
+ JSObject * coll = doCreateCollection( cx , obj , collname );
+ if ( ! coll )
+ return JS_FALSE;
+ c.setProperty( obj , collname.c_str() , OBJECT_TO_JSVAL( coll ) );
+
+ *objp = obj;
+ return JS_TRUE;
+ }
+
+ JSClass db_class = {
+ "DB" , JSCLASS_HAS_PRIVATE | JSCLASS_NEW_RESOLVE ,
+ JS_PropertyStub, JS_PropertyStub, JS_PropertyStub, JS_PropertyStub,
+ JS_EnumerateStub, (JSResolveOp)(&db_resolve) , JS_ConvertStub, JS_FinalizeStub,
+ JSCLASS_NO_OPTIONAL_MEMBERS
+ };
+
+
+ // -------------- object id -------------
+
+ JSBool object_id_constructor( JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval ) {
+ Convertor c( cx );
+
+ OID oid;
+ if ( argc == 0 ) {
+ oid.init();
+ }
+ else {
+ smuassert( cx , "object_id_constructor can't take more than 1 param" , argc == 1 );
+ string s = c.toString( argv[0] );
+
+ try {
+ Scope::validateObjectIdString( s );
+ }
+ catch ( const MsgAssertionException &m ) {
+ string error = m.toString(); // local copy; a static here would latch the first failure's message forever
+ JS_ReportError( cx, error.c_str() );
+ return JS_FALSE;
+ }
+ oid.init( s );
+ }
+
+ if ( ! JS_InstanceOf( cx , obj , &object_id_class , 0 ) ) {
+ obj = JS_NewObject( cx , &object_id_class , 0 , 0 );
+ CHECKNEWOBJECT( obj, cx, "object_id_constructor" );
+ *rval = OBJECT_TO_JSVAL( obj );
+ }
+
+ jsval v = c.toval( oid.str().c_str() );
+ assert( JS_SetProperty( cx , obj , "str" , &v ) );
+
+ return JS_TRUE;
+ }
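+
+ // Shell behavior sketch (illustrative): ObjectId() generates a fresh id;
+ // ObjectId( "<24 hex chars>" ) validates the string first and raises a
+ // JS error if it is malformed.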
+
+ JSClass object_id_class = {
+ "ObjectId" , JSCLASS_HAS_PRIVATE ,
+ JS_PropertyStub, JS_PropertyStub, JS_PropertyStub, JS_PropertyStub,
+ JS_EnumerateStub, JS_ResolveStub , JS_ConvertStub, JS_FinalizeStub,
+ JSCLASS_NO_OPTIONAL_MEMBERS
+ };
+
+ // dbpointer
+
+ JSBool dbpointer_constructor( JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval ) {
+ Convertor c( cx );
+ if ( ! JS_InstanceOf( cx , obj , &dbpointer_class , 0 ) ) {
+ obj = JS_NewObject( cx , &dbpointer_class , 0 , 0 );
+ CHECKNEWOBJECT( obj, cx, "dbpointer_constructor" );
+ *rval = OBJECT_TO_JSVAL( obj );
+ }
+
+ if ( argc == 2 ) {
+
+ if ( ! JSVAL_IS_OID( argv[1] ) ) {
+ JS_ReportError( cx , "2nd arg to DBPointer needs to be oid" );
+ return JS_FALSE;
+ }
+
+ assert( JS_SetProperty( cx , obj , "ns" , &(argv[0]) ) );
+ assert( JS_SetProperty( cx , obj , "id" , &(argv[1]) ) );
+ return JS_TRUE;
+ }
+ else {
+ JS_ReportError( cx , "DBPointer needs 2 arguments" );
+ return JS_FALSE;
+ }
+ }
+
+ JSClass dbpointer_class = {
+ "DBPointer" , JSCLASS_HAS_PRIVATE ,
+ JS_PropertyStub, JS_PropertyStub, JS_PropertyStub, JS_PropertyStub,
+ JS_EnumerateStub, JS_ResolveStub , JS_ConvertStub, JS_FinalizeStub,
+ JSCLASS_NO_OPTIONAL_MEMBERS
+ };
+
+ JSFunctionSpec dbpointer_functions[] = {
+ { 0 }
+ };
+
+
+ JSBool dbref_constructor( JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval ) {
+ Convertor c( cx );
+ if ( ! JS_InstanceOf( cx , obj , &dbref_class , 0 ) ) {
+ obj = JS_NewObject( cx , &dbref_class , 0 , 0 );
+ CHECKNEWOBJECT( obj, cx, "dbref_constructor" );
+ *rval = OBJECT_TO_JSVAL( obj );
+ }
+
+ if ( argc == 2 ) {
+ JSObject * o = JS_NewObject( cx , NULL , NULL, NULL );
+ CHECKNEWOBJECT( o, cx, "dbref_constructor" );
+ assert( JS_SetProperty( cx, o , "$ref" , &argv[ 0 ] ) );
+ assert( JS_SetProperty( cx, o , "$id" , &argv[ 1 ] ) );
+ BSONObj bo = c.toObject( o );
+ assert( JS_SetPrivate( cx , obj , (void*)(new BSONHolder( bo.getOwned() ) ) ) );
+ return JS_TRUE;
+ }
+ else {
+ JS_ReportError( cx , "DBRef needs 2 arguments" );
+ assert( JS_SetPrivate( cx , obj , (void*)(new BSONHolder( BSONObj().getOwned() ) ) ) );
+ return JS_FALSE;
+ }
+ }
+
+ JSClass dbref_class = bson_class; // the class name is overridden to "DBRef" in initMongoJS below
+
+ // UUID **************************
+
+#if 0
+ JSBool uuid_constructor( JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval ) {
+ Convertor c( cx );
+
+ if( argc == 0 ) {
+#if defined(HAVE_UUID)
+ //uuids::uuid
+#else
+#endif
+ JS_ReportError( cx , "UUID needs 1 argument -- UUID(hexstr)" );
+ return JS_FALSE;
+ }
+ else if ( argc == 1 ) {
+
+ string encoded = c.toString( argv[ 0 ] );
+ if( encoded.size() != 32 ) {
+ JS_ReportError( cx, "expect 32 char hex string to UUID()" );
+ return JS_FALSE;
+ }
+
+ char buf[16];
+ for( int i = 0; i < 16; i++ ) {
+ buf[i] = fromHex(encoded.c_str() + i * 2);
+ }
+
+ assert( JS_SetPrivate( cx, obj, new BinDataHolder( buf, 16 ) ) );
+ c.setProperty( obj, "len", c.toval( (double)16 ) );
+ c.setProperty( obj, "type", c.toval( (double)3 ) );
+
+ return JS_TRUE;
+ }
+ else {
+ JS_ReportError( cx , "UUID needs 1 argument -- UUID(hexstr)" );
+ return JS_FALSE;
+ }
+ }
+
+ JSBool uuid_tostring(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) {
+ Convertor c(cx);
+ void *holder = JS_GetPrivate( cx, obj );
+ assert( holder );
+ const char *data = ( ( BinDataHolder* )( holder ) )->c_;
+ stringstream ss;
+ ss << "UUID(\"" << toHex(data, 16);
+ ss << "\")";
+ string ret = ss.str();
+ return *rval = c.toval( ret.c_str() );
+ }
+
+ void uuid_finalize( JSContext * cx , JSObject * obj ) {
+ Convertor c(cx);
+ void *holder = JS_GetPrivate( cx, obj );
+ if ( holder ) {
+ delete ( BinDataHolder* )holder;
+ assert( JS_SetPrivate( cx , obj , 0 ) );
+ }
+ }
+
+ JSClass uuid_class = {
+ "UUID" , JSCLASS_HAS_PRIVATE ,
+ JS_PropertyStub, JS_PropertyStub, JS_PropertyStub, JS_PropertyStub,
+ JS_EnumerateStub, JS_ResolveStub , JS_ConvertStub, uuid_finalize,
+ JSCLASS_NO_OPTIONAL_MEMBERS
+ };
+
+ JSFunctionSpec uuid_functions[] = {
+ { "toString" , uuid_tostring , 0 , JSPROP_READONLY | JSPROP_PERMANENT, 0 } ,
+ { 0 }
+ };
+
+#endif
+
+ // BinData **************************
+
+ JSBool bindata_constructor( JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval ) {
+ Convertor c( cx );
+ if ( ! JS_InstanceOf( cx , obj , &bindata_class , 0 ) ) {
+ obj = JS_NewObject( cx , &bindata_class , 0 , 0 );
+ CHECKNEWOBJECT( obj, cx, "bindata_constructor" );
+ *rval = OBJECT_TO_JSVAL( obj );
+ }
+
+ if ( argc == 2 ) {
+
+ int type = (int)c.toNumber( argv[ 0 ] );
+ if( type < 0 || type > 255 ) {
+ JS_ReportError( cx , "invalid BinData subtype -- range is 0..255 see bsonspec.org" );
+ return JS_FALSE;
+ }
+ string encoded = c.toString( argv[ 1 ] );
+ string decoded;
+ try {
+ decoded = base64::decode( encoded );
+ }
+ catch(...) {
+ JS_ReportError(cx, "BinData could not decode base64 parameter");
+ return JS_FALSE;
+ }
+
+ assert( JS_SetPrivate( cx, obj, new BinDataHolder( decoded.data(), decoded.length() ) ) );
+ c.setProperty( obj, "len", c.toval( (double)decoded.length() ) );
+ c.setProperty( obj, "type", c.toval( (double)type ) );
+
+ return JS_TRUE;
+ }
+ else {
+ JS_ReportError( cx , "BinData needs 2 arguments -- BinData(subtype,data)" );
+ return JS_FALSE;
+ }
+ }
+
+ JSBool bindata_tostring(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) {
+ Convertor c(cx);
+ int type = (int)c.getNumber( obj , "type" );
+ int len = (int)c.getNumber( obj, "len" );
+ void *holder = JS_GetPrivate( cx, obj );
+ assert( holder );
+ const char *data = ( ( BinDataHolder* )( holder ) )->c_;
+ stringstream ss;
+ ss << "BinData(" << type << ",\"";
+ base64::encode( ss, (const char *)data, len );
+ ss << "\")";
+ string ret = ss.str();
+ return *rval = c.toval( ret.c_str() );
+ }
+
+ JSBool bindataBase64(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) {
+ Convertor c(cx);
+ int len = (int)c.getNumber( obj, "len" );
+ void *holder = JS_GetPrivate( cx, obj );
+ assert( holder );
+ const char *data = ( ( BinDataHolder* )( holder ) )->c_;
+ stringstream ss;
+ base64::encode( ss, (const char *)data, len );
+ string ret = ss.str();
+ return *rval = c.toval( ret.c_str() );
+ }
+
+ JSBool bindataAsHex(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) {
+ Convertor c(cx);
+ int len = (int)c.getNumber( obj, "len" );
+ void *holder = JS_GetPrivate( cx, obj );
+ assert( holder );
+ const char *data = ( ( BinDataHolder* )( holder ) )->c_;
+ stringstream ss;
+ ss.setf (ios_base::hex , ios_base::basefield);
+ ss.fill ('0');
+ ss.setf (ios_base::right , ios_base::adjustfield);
+ for( int i = 0; i < len; i++ ) {
+ unsigned v = (unsigned char) data[i];
+ ss << setw(2) << v;
+ }
+ string ret = ss.str();
+ return *rval = c.toval( ret.c_str() );
+ }
+
+ void bindata_finalize( JSContext * cx , JSObject * obj ) {
+ Convertor c(cx);
+ void *holder = JS_GetPrivate( cx, obj );
+ if ( holder ) {
+ delete ( BinDataHolder* )holder;
+ assert( JS_SetPrivate( cx , obj , 0 ) );
+ }
+ }
+
+ JSClass bindata_class = {
+ "BinData" , JSCLASS_HAS_PRIVATE ,
+ JS_PropertyStub, JS_PropertyStub, JS_PropertyStub, JS_PropertyStub,
+ JS_EnumerateStub, JS_ResolveStub , JS_ConvertStub, bindata_finalize,
+ JSCLASS_NO_OPTIONAL_MEMBERS
+ };
+
+ JSFunctionSpec bindata_functions[] = {
+ { "toString" , bindata_tostring , 0 , JSPROP_READONLY | JSPROP_PERMANENT, 0 } ,
+ { "hex", bindataAsHex, 0, JSPROP_READONLY | JSPROP_PERMANENT, 0 } ,
+ { "base64", bindataBase64, 0, JSPROP_READONLY | JSPROP_PERMANENT, 0 } ,
+ { 0 }
+ };
+
+ // Map
+
+ bool specialMapString( const string& s ) {
+ return s == "put" || s == "get" || s == "_get" || s == "values" || s == "_data" || s == "constructor" ;
+ }
+
+ JSBool map_constructor( JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval ) {
+ if ( argc > 0 ) {
+ JS_ReportError( cx , "Map takes no arguments" );
+ return JS_FALSE;
+ }
+
+ JSObject * array = JS_NewObject( cx , 0 , 0 , 0 );
+ CHECKNEWOBJECT( array, cx, "map_constructor" );
+
+ jsval a = OBJECT_TO_JSVAL( array );
+ JS_SetProperty( cx , obj , "_data" , &a );
+
+ return JS_TRUE;
+ }
+
+ JSBool map_prop( JSContext *cx, JSObject *obj, jsval idval, jsval *vp ) {
+ Convertor c(cx);
+ if ( specialMapString( c.toString( idval ) ) )
+ return JS_TRUE;
+
+ log() << "illegal prop access: " << c.toString( idval ) << endl;
+ JS_ReportError( cx , "can't use array access with Map" );
+ return JS_FALSE;
+ }
+
+ JSClass map_class = {
+ "Map" , JSCLASS_HAS_PRIVATE ,
+ map_prop, JS_PropertyStub, map_prop, map_prop,
+ JS_EnumerateStub, JS_ResolveStub , JS_ConvertStub, JS_FinalizeStub,
+ JSCLASS_NO_OPTIONAL_MEMBERS
+ };
+
+ JSFunctionSpec map_functions[] = {
+ { 0 }
+ };
+
+
+ // -----
+
+ JSClass timestamp_class = {
+ "Timestamp" , JSCLASS_HAS_PRIVATE ,
+ JS_PropertyStub, JS_PropertyStub, JS_PropertyStub, JS_PropertyStub,
+ JS_EnumerateStub, JS_ResolveStub , JS_ConvertStub, JS_FinalizeStub,
+ JSCLASS_NO_OPTIONAL_MEMBERS
+ };
+
+ JSBool timestamp_constructor( JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval ) {
+ smuassert( cx , "Timestamp needs 0 or 2 args" , argc == 0 || argc == 2 );
+
+ if ( ! JS_InstanceOf( cx , obj , &timestamp_class , 0 ) ) {
+ obj = JS_NewObject( cx , &timestamp_class , 0 , 0 );
+ CHECKNEWOBJECT( obj, cx, "timestamp_constructor" );
+ *rval = OBJECT_TO_JSVAL( obj );
+ }
+
+ Convertor c( cx );
+ if ( argc == 0 ) {
+ c.setProperty( obj, "t", c.toval( 0.0 ) );
+ c.setProperty( obj, "i", c.toval( 0.0 ) );
+ }
+ else {
+ c.setProperty( obj, "t", argv[ 0 ] );
+ c.setProperty( obj, "i", argv[ 1 ] );
+ }
+
+ return JS_TRUE;
+ }
+
+ JSClass numberlong_class = {
+ "NumberLong" , JSCLASS_HAS_PRIVATE ,
+ JS_PropertyStub, JS_PropertyStub, JS_PropertyStub, JS_PropertyStub,
+ JS_EnumerateStub, JS_ResolveStub , JS_ConvertStub, JS_FinalizeStub,
+ JSCLASS_NO_OPTIONAL_MEMBERS
+ };
+
+ JSBool numberlong_constructor( JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval ) {
+ smuassert( cx , "NumberLong needs 0 or 1 args" , argc == 0 || argc == 1 );
+
+ if ( ! JS_InstanceOf( cx , obj , &numberlong_class , 0 ) ) {
+ obj = JS_NewObject( cx , &numberlong_class , 0 , 0 );
+ CHECKNEWOBJECT( obj, cx, "numberlong_constructor" );
+ *rval = OBJECT_TO_JSVAL( obj );
+ }
+
+ Convertor c( cx );
+ if ( argc == 0 ) {
+ c.setProperty( obj, "floatApprox", c.toval( 0.0 ) );
+ }
+ else if ( JSVAL_IS_NUMBER( argv[ 0 ] ) ) {
+ c.setProperty( obj, "floatApprox", argv[ 0 ] );
+ }
+ else {
+ string num = c.toString( argv[ 0 ] );
+ //PRINT(num);
+ const char *numStr = num.c_str();
+ long long n;
+ try {
+ n = parseLL( numStr );
+ //PRINT(n);
+ }
+ catch ( const AssertionException & ) {
+ smuassert( cx , "could not convert string to long long" , false );
+ }
+ c.makeLongObj( n, obj );
+ }
+
+ return JS_TRUE;
+ }
+
+ JSBool numberlong_valueof(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) {
+ Convertor c(cx);
+ return *rval = c.toval( double( c.toNumberLongUnsafe( obj ) ) );
+ }
+
+ JSBool numberlong_tonumber(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) {
+ return numberlong_valueof( cx, obj, argc, argv, rval );
+ }
+
+ JSBool numberlong_tostring(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) {
+ Convertor c(cx);
+ stringstream ss;
+ long long val = c.toNumberLongUnsafe( obj );
+ const long long limit = 2LL << 30;
+
+ if ( val <= -limit || limit <= val )
+ ss << "NumberLong(\"" << val << "\")";
+ else
+ ss << "NumberLong(" << val << ")";
+
+ string ret = ss.str();
+ return *rval = c.toval( ret.c_str() );
+ }
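+
+ // The quoting threshold above is 2LL << 30 == 2^31: values at or beyond
+ // +/-2^31 print as NumberLong("...") so the exact value survives a
+ // tojson/eval round trip, e.g. NumberLong(5) vs NumberLong("5000000000").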
+
+ JSFunctionSpec numberlong_functions[] = {
+ { "valueOf" , numberlong_valueof , 0 , JSPROP_READONLY | JSPROP_PERMANENT, 0 } ,
+ { "toNumber" , numberlong_tonumber , 0 , JSPROP_READONLY | JSPROP_PERMANENT, 0 } ,
+ { "toString" , numberlong_tostring , 0 , JSPROP_READONLY | JSPROP_PERMANENT, 0 } ,
+ { 0 }
+ };
+
+ JSClass numberint_class = {
+ "NumberInt" , JSCLASS_HAS_PRIVATE ,
+ JS_PropertyStub, JS_PropertyStub, JS_PropertyStub, JS_PropertyStub,
+ JS_EnumerateStub, JS_ResolveStub , JS_ConvertStub, JS_FinalizeStub,
+ JSCLASS_NO_OPTIONAL_MEMBERS
+ };
+
+ JSBool numberint_constructor( JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval ) {
+ smuassert( cx , "NumberInt needs 0 or 1 args" , argc == 0 || argc == 1 );
+
+ if ( ! JS_InstanceOf( cx , obj , &numberint_class , 0 ) ) {
+ obj = JS_NewObject( cx , &numberint_class , 0 , 0 );
+ CHECKNEWOBJECT( obj, cx, "numberint_constructor" );
+ *rval = OBJECT_TO_JSVAL( obj );
+ }
+
+ Convertor c( cx );
+ if ( argc == 0 ) {
+ c.setProperty( obj, "floatApprox", c.toval( 0.0 ) );
+ }
+ else if ( JSVAL_IS_NUMBER( argv[ 0 ] ) ) {
+ c.setProperty( obj, "floatApprox", argv[ 0 ] );
+ }
+ else {
+ string num = c.toString( argv[ 0 ] );
+ //PRINT(num);
+ const char *numStr = num.c_str();
+ int n;
+ try {
+ n = (int) parseLL( numStr );
+ //PRINT(n);
+ }
+ catch ( const AssertionException & ) {
+ smuassert( cx , "could not convert string to integer" , false );
+ }
+ c.makeIntObj( n, obj );
+ }
+
+ return JS_TRUE;
+ }
+
+ JSBool numberint_valueof(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) {
+ Convertor c(cx);
+ return *rval = c.toval( double( c.toNumberInt( obj ) ) );
+ }
+
+ JSBool numberint_tonumber(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) {
+ return numberint_valueof( cx, obj, argc, argv, rval );
+ }
+
+ JSBool numberint_tostring(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) {
+ Convertor c(cx);
+ int val = c.toNumberInt( obj );
+ string ret = str::stream() << "NumberInt(" << val << ")";
+ return *rval = c.toval( ret.c_str() );
+ }
+
+ JSFunctionSpec numberint_functions[] = {
+ { "valueOf" , numberint_valueof , 0 , JSPROP_READONLY | JSPROP_PERMANENT, 0 } ,
+ { "toNumber" , numberint_tonumber , 0 , JSPROP_READONLY | JSPROP_PERMANENT, 0 } ,
+ { "toString" , numberint_tostring , 0 , JSPROP_READONLY | JSPROP_PERMANENT, 0 } ,
+ { 0 }
+ };
+
+ JSClass minkey_class = {
+ "MinKey" , JSCLASS_HAS_PRIVATE ,
+ JS_PropertyStub, JS_PropertyStub, JS_PropertyStub, JS_PropertyStub,
+ JS_EnumerateStub, JS_ResolveStub , JS_ConvertStub, JS_FinalizeStub,
+ JSCLASS_NO_OPTIONAL_MEMBERS
+ };
+
+ JSClass maxkey_class = {
+ "MaxKey" , JSCLASS_HAS_PRIVATE ,
+ JS_PropertyStub, JS_PropertyStub, JS_PropertyStub, JS_PropertyStub,
+ JS_EnumerateStub, JS_ResolveStub , JS_ConvertStub, JS_FinalizeStub,
+ JSCLASS_NO_OPTIONAL_MEMBERS
+ };
+
+ // dbquery
+
+ JSBool dbquery_constructor( JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval ) {
+ smuassert( cx , "DDQuery needs at least 4 args" , argc >= 4 );
+
+ Convertor c(cx);
+ c.setProperty( obj , "_mongo" , argv[0] );
+ c.setProperty( obj , "_db" , argv[1] );
+ c.setProperty( obj , "_collection" , argv[2] );
+ c.setProperty( obj , "_ns" , argv[3] );
+
+ if ( argc > 4 && JSVAL_IS_OBJECT( argv[4] ) )
+ c.setProperty( obj , "_query" , argv[4] );
+ else {
+ JSObject * temp = JS_NewObject( cx , 0 , 0 , 0 );
+ CHECKNEWOBJECT( temp, cx, "dbquery_constructor" );
+ c.setProperty( obj , "_query" , OBJECT_TO_JSVAL( temp ) );
+ }
+
+ if ( argc > 5 && JSVAL_IS_OBJECT( argv[5] ) )
+ c.setProperty( obj , "_fields" , argv[5] );
+ else
+ c.setProperty( obj , "_fields" , JSVAL_NULL );
+
+ if ( argc > 6 && JSVAL_IS_NUMBER( argv[6] ) )
+ c.setProperty( obj , "_limit" , argv[6] );
+ else
+ c.setProperty( obj , "_limit" , JSVAL_ZERO );
+
+ if ( argc > 7 && JSVAL_IS_NUMBER( argv[7] ) )
+ c.setProperty( obj , "_skip" , argv[7] );
+ else
+ c.setProperty( obj , "_skip" , JSVAL_ZERO );
+
+ if ( argc > 8 && JSVAL_IS_NUMBER( argv[8] ) )
+ c.setProperty( obj , "_batchSize" , argv[8] );
+ else
+ c.setProperty( obj , "_batchSize" , JSVAL_ZERO );
+
+ if ( argc > 9 && JSVAL_IS_NUMBER( argv[9] ) )
+ c.setProperty( obj , "_options" , argv[9] );
+ else
+ c.setProperty( obj , "_options" , JSVAL_ZERO );
+
+ c.setProperty( obj , "_cursor" , JSVAL_NULL );
+ c.setProperty( obj , "_numReturned" , JSVAL_ZERO );
+ c.setProperty( obj , "_special" , JSVAL_FALSE );
+
+ return JS_TRUE;
+ }
+
+ JSBool dbquery_resolve( JSContext *cx, JSObject *obj, jsval id, uintN flags, JSObject **objp ) {
+ if ( flags & JSRESOLVE_ASSIGNING )
+ return JS_TRUE;
+
+ if ( ! JSVAL_IS_NUMBER( id ) )
+ return JS_TRUE;
+
+ jsval val = JSVAL_VOID;
+ assert( JS_CallFunctionName( cx , obj , "arrayAccess" , 1 , &id , &val ) );
+ Convertor c(cx);
+ c.setProperty( obj , c.toString( id ).c_str() , val );
+ *objp = obj;
+ return JS_TRUE;
+ }
+
+ JSClass dbquery_class = {
+ "DBQuery" , JSCLASS_NEW_RESOLVE ,
+ JS_PropertyStub, JS_PropertyStub, JS_PropertyStub, JS_PropertyStub,
+ JS_EnumerateStub, (JSResolveOp)(&dbquery_resolve) , JS_ConvertStub, JS_FinalizeStub,
+ JSCLASS_NO_OPTIONAL_MEMBERS
+ };
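+
+ // Resolution sketch (illustrative): numeric access such as query[0] has
+ // no real slot, so dbquery_resolve forwards the index to the JS-level
+ // "arrayAccess" helper and memoizes the result on the object.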
+
+ // ---- other stuff ----
+
+ void initMongoJS( SMScope * scope , JSContext * cx , JSObject * global , bool local ) {
+
+ assert( JS_InitClass( cx , global , 0 , &mongo_class , local ? mongo_local_constructor : mongo_external_constructor , 0 , 0 , mongo_functions , 0 , 0 ) );
+
+ assert( JS_InitClass( cx , global , 0 , &object_id_class , object_id_constructor , 0 , 0 , 0 , 0 , 0 ) );
+ assert( JS_InitClass( cx , global , 0 , &db_class , db_constructor , 2 , 0 , 0 , 0 , 0 ) );
+ assert( JS_InitClass( cx , global , 0 , &db_collection_class , db_collection_constructor , 4 , 0 , 0 , 0 , 0 ) );
+ assert( JS_InitClass( cx , global , 0 , &internal_cursor_class , internal_cursor_constructor , 0 , 0 , internal_cursor_functions , 0 , 0 ) );
+ assert( JS_InitClass( cx , global , 0 , &dbquery_class , dbquery_constructor , 0 , 0 , 0 , 0 , 0 ) );
+ assert( JS_InitClass( cx , global , 0 , &dbpointer_class , dbpointer_constructor , 0 , 0 , dbpointer_functions , 0 , 0 ) );
+ assert( JS_InitClass( cx , global , 0 , &bindata_class , bindata_constructor , 0 , 0 , bindata_functions , 0 , 0 ) );
+// assert( JS_InitClass( cx , global , 0 , &uuid_class , uuid_constructor , 0 , 0 , uuid_functions , 0 , 0 ) );
+
+ assert( JS_InitClass( cx , global , 0 , &timestamp_class , timestamp_constructor , 0 , 0 , 0 , 0 , 0 ) );
+ assert( JS_InitClass( cx , global , 0 , &numberlong_class , numberlong_constructor , 0 , 0 , numberlong_functions , 0 , 0 ) );
+ assert( JS_InitClass( cx , global , 0 , &numberint_class , numberint_constructor , 0 , 0 , numberint_functions , 0 , 0 ) );
+ assert( JS_InitClass( cx , global , 0 , &minkey_class , 0 , 0 , 0 , 0 , 0 , 0 ) );
+ assert( JS_InitClass( cx , global , 0 , &maxkey_class , 0 , 0 , 0 , 0 , 0 , 0 ) );
+
+ assert( JS_InitClass( cx , global , 0 , &map_class , map_constructor , 0 , 0 , map_functions , 0 , 0 ) );
+
+ assert( JS_InitClass( cx , global , 0 , &bson_ro_class , bson_cons , 0 , 0 , bson_functions , 0 , 0 ) );
+ assert( JS_InitClass( cx , global , 0 , &bson_class , bson_cons , 0 , 0 , bson_functions , 0 , 0 ) );
+
+ static const char *dbrefName = "DBRef";
+ dbref_class.name = dbrefName;
+ assert( JS_InitClass( cx , global , 0 , &dbref_class , dbref_constructor , 2 , 0 , bson_functions , 0 , 0 ) );
+
+ scope->execCoreFiles();
+ }
+
+ bool appendSpecialDBObject( Convertor * c , BSONObjBuilder& b , const string& name , jsval val , JSObject * o ) {
+
+ if ( JS_InstanceOf( c->_context , o , &object_id_class , 0 ) ) {
+ OID oid;
+ oid.init( c->getString( o , "str" ) );
+ b.append( name , oid );
+ return true;
+ }
+
+ if ( JS_InstanceOf( c->_context , o , &minkey_class , 0 ) ) {
+ b.appendMinKey( name );
+ return true;
+ }
+
+ if ( JS_InstanceOf( c->_context , o , &maxkey_class , 0 ) ) {
+ b.appendMaxKey( name );
+ return true;
+ }
+
+ if ( JS_InstanceOf( c->_context , o , &timestamp_class , 0 ) ) {
+ b.appendTimestamp( name , (unsigned long long)c->getNumber( o , "t" ) , (unsigned int )c->getNumber( o , "i" ) );
+ return true;
+ }
+
+ if ( JS_InstanceOf( c->_context , o , &numberlong_class , 0 ) ) {
+ b.append( name , c->toNumberLongUnsafe( o ) );
+ return true;
+ }
+
+ if ( JS_InstanceOf( c->_context , o , &numberint_class , 0 ) ) {
+ b.append( name , c->toNumberInt( o ) );
+ return true;
+ }
+
+ if ( JS_InstanceOf( c->_context , o , &dbpointer_class , 0 ) ) {
+ b.appendDBRef( name , c->getString( o , "ns" ) , c->toOID( c->getProperty( o , "id" ) ) );
+ return true;
+ }
+
+ if ( JS_InstanceOf( c->_context , o , &bindata_class , 0 ) ) {
+ void *holder = JS_GetPrivate( c->_context , o );
+ const char *data = ( ( BinDataHolder * )( holder ) )->c_;
+ b.appendBinData( name ,
+ (int)(c->getNumber( o , "len" )) , (BinDataType)((char)(c->getNumber( o , "type" ) ) ) ,
+ data
+ );
+ return true;
+ }
+
+#if defined( SM16 ) || defined( MOZJS )
+#warning dates do not work in your version of spider monkey
+ {
+ jsdouble d = js_DateGetMsecSinceEpoch( c->_context , o );
+ if ( d ) {
+ b.appendDate( name , Date_t(d) );
+ return true;
+ }
+ }
+#elif defined( XULRUNNER )
+ if ( JS_InstanceOf( c->_context , o, globalSMEngine->_dateClass , 0 ) ) {
+ jsdouble d = js_DateGetMsecSinceEpoch( c->_context , o );
+ b.appendDate( name , Date_t(d) );
+ return true;
+ }
+#else
+ if ( JS_InstanceOf( c->_context , o, &js_DateClass , 0 ) ) {
+ jsdouble d = js_DateGetMsecSinceEpoch( c->_context , o );
+ long long d2 = (long long)d;
+ b.appendDate( name , Date_t((unsigned long long)d2) );
+ return true;
+ }
+#endif
+
+
+ if ( JS_InstanceOf( c->_context , o , &dbquery_class , 0 ) ||
+ JS_InstanceOf( c->_context , o , &mongo_class , 0 ) ||
+ JS_InstanceOf( c->_context , o , &db_collection_class , 0 ) ) {
+ b.append( name , c->toString( val ) );
+ return true;
+ }
+
+#if defined( XULRUNNER )
+ if ( JS_InstanceOf( c->_context , o , globalSMEngine->_regexClass , 0 ) ) {
+ c->appendRegex( b , name , c->toString( val ) );
+ return true;
+ }
+#elif defined( SM18 )
+ if ( JS_InstanceOf( c->_context , o , &js_RegExpClass , 0 ) ) {
+ c->appendRegex( b , name , c->toString( val ) );
+ return true;
+ }
+#endif
+
+ return false;
+ }
+
+ bool isDate( JSContext * cx , JSObject * o ) {
+#if defined( SM16 ) || defined( MOZJS ) || defined( XULRUNNER )
+ return js_DateGetMsecSinceEpoch( cx , o ) != 0;
+#else
+ return JS_InstanceOf( cx , o, &js_DateClass, 0 );
+#endif
+ }
+
+}
diff --git a/src/mongo/scripting/utils.cpp b/src/mongo/scripting/utils.cpp
new file mode 100644
index 00000000000..612b173fdf8
--- /dev/null
+++ b/src/mongo/scripting/utils.cpp
@@ -0,0 +1,77 @@
+// utils.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "pch.h"
+#include "engine.h"
+#include "../util/md5.hpp"
+#include "../util/version.h"
+
+namespace mongo {
+
+ void installBenchmarkSystem( Scope& scope );
+
+ BSONObj jsmd5( const BSONObj &a, void* data ) {
+ uassert( 10261 , "js md5 needs a string" , a.firstElement().type() == String );
+ const char * s = a.firstElement().valuestrsafe();
+
+ md5digest d;
+ md5_state_t st;
+ md5_init(&st);
+ md5_append( &st , (const md5_byte_t*)s , strlen( s ) );
+ md5_finish(&st, d);
+
+ return BSON( "" << digestToString( d ) );
+ }
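+
+ // Shell sketch (illustrative): installGlobalUtils below injects this as
+ // hex_md5, so scripts can call hex_md5( "abc" ) and receive the 32-char
+ // hex digest back as a string.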
+
+ BSONObj JSVersion( const BSONObj& args, void* data ) {
+ cout << "version: " << versionString << endl;
+ if ( strstr( versionString , "+" ) )
+ printGitVersion();
+ return BSONObj();
+ }
+
+
+ BSONObj JSSleep(const mongo::BSONObj &args, void* data) {
+ assert( args.nFields() == 1 );
+ assert( args.firstElement().isNumber() );
+ int ms = int( args.firstElement().number() );
+ {
+ auto_ptr< ScriptEngine::Unlocker > u = globalScriptEngine->newThreadUnlocker();
+ sleepmillis( ms );
+ }
+
+ BSONObjBuilder b;
+ b.appendUndefined( "" );
+ return b.obj();
+ }
+
+ // ---------------------------------
+ // ---- installer --------
+ // ---------------------------------
+
+ void installGlobalUtils( Scope& scope ) {
+ scope.injectNative( "hex_md5" , jsmd5 );
+ scope.injectNative( "version" , JSVersion );
+ scope.injectNative( "sleep" , JSSleep );
+
+ installBenchmarkSystem( scope );
+ }
+
+}
+
+
diff --git a/src/mongo/scripting/v8_db.cpp b/src/mongo/scripting/v8_db.cpp
new file mode 100644
index 00000000000..de419b368d9
--- /dev/null
+++ b/src/mongo/scripting/v8_db.cpp
@@ -0,0 +1,1128 @@
+// v8_db.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined(_WIN32)
+/** this is a hack - v8stdint.h defines uint16_t etc. on _WIN32 only, and that collides with
+ our usage of boost */
+#include "boost/cstdint.hpp"
+using namespace boost;
+#define V8STDINT_H_
+#endif
+
+#include "v8_wrapper.h"
+#include "v8_utils.h"
+#include "engine_v8.h"
+#include "v8_db.h"
+#include "util/base64.h"
+#include "util/text.h"
+#include "../client/syncclusterconnection.h"
+#include "../s/d_logic.h"
+#include <iostream>
+
+using namespace std;
+using namespace v8;
+
+namespace mongo {
+
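+// DDD(x): debug trace hook; expands to nothing in this build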
+#define DDD(x)
+
+ static v8::Handle<v8::Value> newInstance( v8::Function* f, const v8::Arguments& args ) {
+ // need to translate arguments into an array
+ int argc = args.Length();
+ scoped_array< Handle<Value> > argv( new Handle<Value>[argc] );
+ for (int i = 0; i < argc; ++i) {
+ argv[i] = args[i];
+ }
+ return f->NewInstance(argc, argv.get());
+ }
+
+ v8::Handle<v8::FunctionTemplate> getMongoFunctionTemplate( V8Scope* scope, bool local ) {
+ v8::Handle<v8::FunctionTemplate> mongo;
+ if ( local ) {
+ mongo = scope->createV8Function(mongoConsLocal);
+ }
+ else {
+ mongo = scope->createV8Function(mongoConsExternal);
+ }
+ mongo->InstanceTemplate()->SetInternalFieldCount( 1 );
+ v8::Handle<v8::Template> proto = mongo->PrototypeTemplate();
+ scope->injectV8Function("find", mongoFind, proto);
+ scope->injectV8Function("insert", mongoInsert, proto);
+ scope->injectV8Function("remove", mongoRemove, proto);
+ scope->injectV8Function("update", mongoUpdate, proto);
+ scope->injectV8Function("auth", mongoAuth, proto);
+
+ v8::Handle<FunctionTemplate> ic = scope->createV8Function(internalCursorCons);
+ ic->InstanceTemplate()->SetInternalFieldCount( 1 );
+ v8::Handle<v8::Template> icproto = ic->PrototypeTemplate();
+ scope->injectV8Function("next", internalCursorNext, icproto);
+ scope->injectV8Function("hasNext", internalCursorHasNext, icproto);
+ scope->injectV8Function("objsLeftInBatch", internalCursorObjsLeftInBatch, icproto);
+ scope->injectV8Function("readOnly", internalCursorReadOnly, icproto);
+ proto->Set( scope->getV8Str( "internalCursor" ) , ic );
+
+ return mongo;
+ }
+
+ v8::Handle<v8::FunctionTemplate> getNumberLongFunctionTemplate(V8Scope* scope) {
+ v8::Handle<v8::FunctionTemplate> numberLong = scope->createV8Function(numberLongInit);
+ v8::Local<v8::Template> proto = numberLong->PrototypeTemplate();
+ scope->injectV8Function("valueOf", numberLongValueOf, proto);
+ scope->injectV8Function("toNumber", numberLongToNumber, proto);
+ scope->injectV8Function("toString", numberLongToString, proto);
+
+ return numberLong;
+ }
+
+ v8::Handle<v8::FunctionTemplate> getNumberIntFunctionTemplate(V8Scope* scope) {
+ v8::Handle<v8::FunctionTemplate> numberInt = scope->createV8Function(numberIntInit);
+ v8::Local<v8::Template> proto = numberInt->PrototypeTemplate();
+ scope->injectV8Function("valueOf", numberIntValueOf, proto);
+ scope->injectV8Function("toNumber", numberIntToNumber, proto);
+ scope->injectV8Function("toString", numberIntToString, proto);
+
+ return numberInt;
+ }
+
+ v8::Handle<v8::FunctionTemplate> getBinDataFunctionTemplate(V8Scope* scope) {
+ v8::Handle<v8::FunctionTemplate> binData = scope->createV8Function(binDataInit);
+ binData->InstanceTemplate()->SetInternalFieldCount(1);
+ v8::Local<v8::Template> proto = binData->PrototypeTemplate();
+ scope->injectV8Function("toString", binDataToString, proto);
+ scope->injectV8Function("base64", binDataToBase64, proto);
+ scope->injectV8Function("hex", binDataToHex, proto);
+ return binData;
+ }
+
+ v8::Handle<v8::FunctionTemplate> getUUIDFunctionTemplate(V8Scope* scope) {
+ v8::Handle<v8::FunctionTemplate> templ = scope->createV8Function(uuidInit);
+ return templ;
+ }
+
+ v8::Handle<v8::FunctionTemplate> getMD5FunctionTemplate(V8Scope* scope) {
+ v8::Handle<v8::FunctionTemplate> templ = scope->createV8Function(md5Init);
+ return templ;
+ }
+
+ v8::Handle<v8::FunctionTemplate> getHexDataFunctionTemplate(V8Scope* scope) {
+ v8::Handle<v8::FunctionTemplate> templ = scope->createV8Function(hexDataInit);
+ return templ;
+ }
+
+ v8::Handle<v8::FunctionTemplate> getTimestampFunctionTemplate(V8Scope* scope) {
+ v8::Handle<v8::FunctionTemplate> ts = scope->createV8Function(dbTimestampInit);
+ ts->InstanceTemplate()->SetInternalFieldCount( 1 );
+ return ts;
+ }
+
+// void installDBTypes( V8Scope* scope, Handle<ObjectTemplate>& global ) {
+// v8::Handle<v8::FunctionTemplate> db = scope->createV8Function(dbInit);
+// db->InstanceTemplate()->SetNamedPropertyHandler( collectionFallback );
+// global->Set(v8::String::New("DB") , db );
+//
+// v8::Handle<v8::FunctionTemplate> dbCollection = scope->createV8Function(collectionInit);
+// dbCollection->InstanceTemplate()->SetNamedPropertyHandler( collectionFallback );
+// global->Set(v8::String::New("DBCollection") , dbCollection );
+//
+//
+// v8::Handle<v8::FunctionTemplate> dbQuery = scope->createV8Function(dbQueryInit);
+// dbQuery->InstanceTemplate()->SetIndexedPropertyHandler( dbQueryIndexAccess );
+// global->Set(v8::String::New("DBQuery") , dbQuery );
+//
+// global->Set( v8::String::New("ObjectId") , newV8Function< objectIdInit >(scope) );
+//
+// global->Set( v8::String::New("DBRef") , newV8Function< dbRefInit >(scope) );
+//
+// global->Set( v8::String::New("DBPointer") , newV8Function< dbPointerInit >(scope) );
+//
+// global->Set( v8::String::New("BinData") , getBinDataFunctionTemplate(scope) );
+//
+// global->Set( v8::String::New("NumberLong") , getNumberLongFunctionTemplate(scope) );
+//
+// global->Set( v8::String::New("Timestamp") , getTimestampFunctionTemplate(scope) );
+// }
+
+ void installDBTypes( V8Scope* scope, v8::Handle<v8::Object>& global ) {
+ v8::Handle<v8::FunctionTemplate> db = scope->createV8Function(dbInit);
+ db->InstanceTemplate()->SetNamedPropertyHandler( collectionGetter, collectionSetter );
+ global->Set(scope->getV8Str("DB") , db->GetFunction() );
+ v8::Handle<v8::FunctionTemplate> dbCollection = scope->createV8Function(collectionInit);
+ dbCollection->InstanceTemplate()->SetNamedPropertyHandler( collectionGetter, collectionSetter );
+ global->Set(scope->getV8Str("DBCollection") , dbCollection->GetFunction() );
+
+
+ v8::Handle<v8::FunctionTemplate> dbQuery = scope->createV8Function(dbQueryInit);
+ dbQuery->InstanceTemplate()->SetIndexedPropertyHandler( dbQueryIndexAccess );
+ global->Set(scope->getV8Str("DBQuery") , dbQuery->GetFunction() );
+
+ scope->injectV8Function("ObjectId", objectIdInit, global);
+ scope->injectV8Function("DBRef", dbRefInit, global);
+ scope->injectV8Function("DBPointer", dbPointerInit, global);
+
+ global->Set( scope->getV8Str("BinData") , getBinDataFunctionTemplate(scope)->GetFunction() );
+ global->Set( scope->getV8Str("UUID") , getUUIDFunctionTemplate(scope)->GetFunction() );
+ global->Set( scope->getV8Str("MD5") , getMD5FunctionTemplate(scope)->GetFunction() );
+ global->Set( scope->getV8Str("HexData") , getHexDataFunctionTemplate(scope)->GetFunction() );
+ global->Set( scope->getV8Str("NumberLong") , getNumberLongFunctionTemplate(scope)->GetFunction() );
+ global->Set( scope->getV8Str("NumberInt") , getNumberIntFunctionTemplate(scope)->GetFunction() );
+ global->Set( scope->getV8Str("Timestamp") , getTimestampFunctionTemplate(scope)->GetFunction() );
+
+ BSONObjBuilder b;
+ b.appendMaxKey( "" );
+ b.appendMinKey( "" );
+ BSONObj o = b.obj();
+ BSONObjIterator i( o );
+ global->Set( scope->getV8Str("MaxKey"), scope->mongoToV8Element( i.next() ) );
+ global->Set( scope->getV8Str("MinKey"), scope->mongoToV8Element( i.next() ) );
+
+ global->Get( scope->getV8Str( "Object" ) )->ToObject()->Set( scope->getV8Str("bsonsize") , scope->createV8Function(bsonsize)->GetFunction() );
+ }
+
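+    // weak-handle callback: run by the V8 GC when a Mongo connection object becomes unreachable, freeing the underlying DBClientBase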
+ void destroyConnection( Persistent<Value> self, void* parameter) {
+ delete static_cast<DBClientBase*>(parameter);
+ self.Dispose();
+ self.Clear();
+ }
+
+ Handle<Value> mongoConsExternal(V8Scope* scope, const Arguments& args) {
+
+ char host[255];
+
+ if ( args.Length() > 0 && args[0]->IsString() ) {
+ assert( args[0]->ToString()->Utf8Length() < 250 );
+ args[0]->ToString()->WriteAscii( host );
+ }
+ else {
+ strcpy( host , "127.0.0.1" );
+ }
+
+ string errmsg;
+ ConnectionString cs = ConnectionString::parse( host , errmsg );
+ if ( ! cs.isValid() )
+ return v8::ThrowException( v8::String::New( errmsg.c_str() ) );
+
+
+ DBClientWithCommands * conn;
+ {
+ V8Unlock ul;
+ conn = cs.connect( errmsg );
+ }
+ if ( ! conn )
+ return v8::ThrowException( v8::String::New( errmsg.c_str() ) );
+
+ Persistent<v8::Object> self = Persistent<v8::Object>::New( args.Holder() );
+ self.MakeWeak( conn , destroyConnection );
+
+ {
+ V8Unlock ul;
+ ScriptEngine::runConnectCallback( *conn );
+ }
+
+ args.This()->SetInternalField( 0 , External::New( conn ) );
+ args.This()->Set( scope->getV8Str( "slaveOk" ) , Boolean::New( false ) );
+ args.This()->Set( scope->getV8Str( "host" ) , scope->getV8Str( host ) );
+
+ return v8::Undefined();
+ }
+
+ Handle<Value> mongoConsLocal(V8Scope* scope, const Arguments& args) {
+
+ if ( args.Length() > 0 )
+ return v8::ThrowException( v8::String::New( "local Mongo constructor takes no args" ) );
+
+ DBClientBase * conn;
+ {
+ V8Unlock ul;
+ conn = createDirectClient();
+ }
+
+ Persistent<v8::Object> self = Persistent<v8::Object>::New( args.This() );
+ self.MakeWeak( conn , destroyConnection );
+
+ // NOTE I don't believe the conn object will ever be freed.
+ args.This()->SetInternalField( 0 , External::New( conn ) );
+ args.This()->Set( scope->getV8Str( "slaveOk" ) , Boolean::New( false ) );
+ args.This()->Set( scope->getV8Str( "host" ) , scope->getV8Str( "EMBEDDED" ) );
+
+ return v8::Undefined();
+ }
+
+
+ // ---
+
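+// GETNS: copies the namespace string (first JS argument) into a local buffer 'ns'.
+// The buffer is sized Utf8Length()+1 so WriteUtf8 has room for the NUL terminator;
+// the _WIN32 variant heap-allocates (and never frees) because MSVC lacks variable-length arrays.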
+#ifdef _WIN32
+#define GETNS char * ns = new char[args[0]->ToString()->Utf8Length() + 1]; args[0]->ToString()->WriteUtf8( ns );
+#else
+#define GETNS char ns[args[0]->ToString()->Utf8Length() + 1]; args[0]->ToString()->WriteUtf8( ns );
+#endif
+
+ DBClientBase * getConnection( const Arguments& args ) {
+ Local<External> c = External::Cast( *(args.This()->GetInternalField( 0 )) );
+ DBClientBase * conn = (DBClientBase*)(c->Value());
+ assert( conn );
+ return conn;
+ }
+
+ // ---- real methods
+
+ void destroyCursor( Persistent<Value> self, void* parameter) {
+ delete static_cast<mongo::DBClientCursor*>(parameter);
+ self.Dispose();
+ self.Clear();
+ }
+
+ /**
+ 0 - namespace
+ 1 - query
+ 2 - fields
+ 3 - limit
+ 4 - skip
+ */
+ Handle<Value> mongoFind(V8Scope* scope, const Arguments& args) {
+ HandleScope handle_scope;
+
+ jsassert( args.Length() == 7 , "find needs 7 args" );
+ jsassert( args[1]->IsObject() , "needs to be an object" );
+ DBClientBase * conn = getConnection( args );
+ GETNS;
+
+ BSONObj q = scope->v8ToMongo( args[1]->ToObject() );
+ DDD( "query:" << q );
+
+ BSONObj fields;
+ bool haveFields = args[2]->IsObject() && args[2]->ToObject()->GetPropertyNames()->Length() > 0;
+ if ( haveFields )
+ fields = scope->v8ToMongo( args[2]->ToObject() );
+
+ Local<v8::Object> mongo = args.This();
+
+ try {
+ auto_ptr<mongo::DBClientCursor> cursor;
+ int nToReturn = (int)(args[3]->ToNumber()->Value());
+ int nToSkip = (int)(args[4]->ToNumber()->Value());
+ int batchSize = (int)(args[5]->ToNumber()->Value());
+ int options = (int)(args[6]->ToNumber()->Value());
+ {
+ V8Unlock u;
+ cursor = conn->query( ns, q , nToReturn , nToSkip , haveFields ? &fields : 0, options , batchSize );
+ if ( ! cursor.get() )
+ return v8::ThrowException( v8::String::New( "error doing query: failed" ) );
+ }
+ v8::Function * cons = (v8::Function*)( *( mongo->Get( scope->getV8Str( "internalCursor" ) ) ) );
+ if ( !cons ) {
+ // may get here in case of thread termination
+ return v8::ThrowException( v8::String::New( "Could not create a cursor" ) );
+ }
+
+ Persistent<v8::Object> c = Persistent<v8::Object>::New( cons->NewInstance() );
+ c.MakeWeak( cursor.get() , destroyCursor );
+ c->SetInternalField( 0 , External::New( cursor.release() ) );
+ return handle_scope.Close(c);
+ }
+ catch ( ... ) {
+ return v8::ThrowException( v8::String::New( "socket error on query" ) );
+ }
+ }
+
+ v8::Handle<v8::Value> mongoInsert(V8Scope* scope, const v8::Arguments& args) {
+ jsassert( args.Length() == 2 , "insert needs 2 args" );
+ jsassert( args[1]->IsObject() , "have to insert an object" );
+
+ if ( args.This()->Get( scope->getV8Str( "readOnly" ) )->BooleanValue() )
+ return v8::ThrowException( v8::String::New( "js db in read only mode" ) );
+
+ DBClientBase * conn = getConnection( args );
+ GETNS;
+
+ v8::Handle<v8::Object> in = args[1]->ToObject();
+
+ if( args[1]->IsArray() ){
+
+ v8::Local<v8::Array> arr = v8::Array::Cast( *args[1] );
+ vector<BSONObj> bos;
+ uint32_t len = arr->Length();
+
+ for( uint32_t i = 0; i < len; i++ ){
+
+ v8::Local<v8::Object> el = arr->CloneElementAt( i );
+
+ // Set ID on the element if necessary
+ if ( ! el->Has( scope->getV8Str( "_id" ) ) ) {
+ v8::Handle<v8::Value> argv[1];
+ el->Set( scope->getV8Str( "_id" ) , scope->getObjectIdCons()->NewInstance( 0 , argv ) );
+ }
+
+                bos.push_back( scope->v8ToMongo( el ) ); // convert 'el' so the _id set above is kept
+ }
+
+            DDD( "want to save batch : " << bos.size() );
+ try {
+ V8Unlock u;
+ conn->insert( ns , bos );
+ }
+ catch ( ... ) {
+ return v8::ThrowException( v8::String::New( "socket error on bulk insert" ) );
+ }
+
+ }
+ else {
+
+ if ( ! in->Has( scope->getV8Str( "_id" ) ) ) {
+ v8::Handle<v8::Value> argv[1];
+ in->Set( scope->getV8Str( "_id" ) , scope->getObjectIdCons()->NewInstance( 0 , argv ) );
+ }
+
+ BSONObj o = scope->v8ToMongo( in );
+
+ DDD( "want to save : " << o.jsonString() );
+ try {
+ V8Unlock u;
+ conn->insert( ns , o );
+ }
+ catch ( ... ) {
+ return v8::ThrowException( v8::String::New( "socket error on insert" ) );
+ }
+
+ }
+
+ return v8::Undefined();
+ }
+
+ v8::Handle<v8::Value> mongoRemove(V8Scope* scope, const v8::Arguments& args) {
+        jsassert( args.Length() == 2 || args.Length() == 3 , "remove needs 2 or 3 args" );
+ jsassert( args[1]->IsObject() , "have to remove an object template" );
+
+ if ( args.This()->Get( scope->getV8Str( "readOnly" ) )->BooleanValue() )
+ return v8::ThrowException( v8::String::New( "js db in read only mode" ) );
+
+ DBClientBase * conn = getConnection( args );
+ GETNS;
+
+ v8::Handle<v8::Object> in = args[1]->ToObject();
+ BSONObj o = scope->v8ToMongo( in );
+
+ bool justOne = false;
+ if ( args.Length() > 2 ) {
+ justOne = args[2]->BooleanValue();
+ }
+
+ DDD( "want to remove : " << o.jsonString() );
+ try {
+ V8Unlock u;
+ conn->remove( ns , o , justOne );
+ }
+ catch ( ... ) {
+ return v8::ThrowException( v8::String::New( "socket error on remove" ) );
+ }
+
+ return v8::Undefined();
+ }
+
+ v8::Handle<v8::Value> mongoUpdate(V8Scope* scope, const v8::Arguments& args) {
+ jsassert( args.Length() >= 3 , "update needs at least 3 args" );
+ jsassert( args[1]->IsObject() , "1st param to update has to be an object" );
+ jsassert( args[2]->IsObject() , "2nd param to update has to be an object" );
+
+ if ( args.This()->Get( scope->getV8Str( "readOnly" ) )->BooleanValue() )
+ return v8::ThrowException( v8::String::New( "js db in read only mode" ) );
+
+ DBClientBase * conn = getConnection( args );
+ GETNS;
+
+ v8::Handle<v8::Object> q = args[1]->ToObject();
+ v8::Handle<v8::Object> o = args[2]->ToObject();
+
+ bool upsert = args.Length() > 3 && args[3]->IsBoolean() && args[3]->ToBoolean()->Value();
+ bool multi = args.Length() > 4 && args[4]->IsBoolean() && args[4]->ToBoolean()->Value();
+
+ try {
+ BSONObj q1 = scope->v8ToMongo( q );
+ BSONObj o1 = scope->v8ToMongo( o );
+ V8Unlock u;
+ conn->update( ns , q1 , o1 , upsert, multi );
+ }
+ catch ( ... ) {
+            return v8::ThrowException( v8::String::New( "socket error on update" ) );
+ }
+
+ return v8::Undefined();
+ }
+
+ v8::Handle<v8::Value> mongoAuth(V8Scope* scope, const v8::Arguments& args) {
+        jsassert( args.Length() >= 3 , "auth needs at least 3 args" );
+ DBClientBase * conn = getConnection( args );
+ string db = toSTLString(args[0]);
+ string username = toSTLString(args[1]);
+ string password = toSTLString(args[2]);
+ string errmsg = "";
+
+ try {
+ if (conn->auth(db, username, password, errmsg)) {
+ return v8::Boolean::New(true);
+ }
+ } catch ( ... ) {
+ }
+ return v8::ThrowException( v8::String::New( errmsg.c_str() ) );
+ }
+
+// + JSBool mongo_auth(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) {
+// + smuassert( cx , "mongo_auth needs 3 args" , argc == 3 );
+// + shared_ptr< DBClientWithCommands > * connHolder = (shared_ptr< DBClientWithCommands >*)JS_GetPrivate( cx , obj );
+// + smuassert( cx , "no connection!" , connHolder && connHolder->get() );
+// + DBClientWithCommands *conn = connHolder->get();
+// +
+// + Convertor c( cx );
+// +
+// + string db = c.toString( argv[0] );
+// + string username = c.toString( argv[1] );
+// + string password = c.toString( argv[2] );
+// + string errmsg = "";
+// +
+// + try {
+// + if (conn->auth(db, username, password, errmsg)) {
+// + return JS_TRUE;
+// + }
+// + JS_ReportError( cx, errmsg.c_str() );
+// + }
+// + catch ( ... ) {
+// + JS_ReportError( cx , "error doing query: unknown" );
+// + }
+// + return JS_FALSE;
+// + }
+
+
+ // --- cursor ---
+
+ mongo::DBClientCursor * getCursor( const Arguments& args ) {
+ Local<External> c = External::Cast( *(args.This()->GetInternalField( 0 ) ) );
+
+ mongo::DBClientCursor * cursor = (mongo::DBClientCursor*)(c->Value());
+ return cursor;
+ }
+
+ v8::Handle<v8::Value> internalCursorCons(V8Scope* scope, const v8::Arguments& args) {
+ return v8::Undefined();
+ }
+
+ v8::Handle<v8::Value> internalCursorNext(V8Scope* scope, const v8::Arguments& args) {
+ mongo::DBClientCursor * cursor = getCursor( args );
+ if ( ! cursor )
+ return v8::Undefined();
+ BSONObj o;
+ {
+ V8Unlock u;
+ o = cursor->next();
+ }
+ bool ro = false;
+ if (args.This()->Has(scope->V8STR_RO))
+ ro = args.This()->Get(scope->V8STR_RO)->BooleanValue();
+ return scope->mongoToLZV8( o, false, ro );
+ }
+
+ v8::Handle<v8::Value> internalCursorHasNext(V8Scope* scope, const v8::Arguments& args) {
+ mongo::DBClientCursor * cursor = getCursor( args );
+ if ( ! cursor )
+ return Boolean::New( false );
+ bool ret;
+ {
+ V8Unlock u;
+ ret = cursor->more();
+ }
+ return Boolean::New( ret );
+ }
+
+ v8::Handle<v8::Value> internalCursorObjsLeftInBatch(V8Scope* scope, const v8::Arguments& args) {
+ mongo::DBClientCursor * cursor = getCursor( args );
+ if ( ! cursor )
+ return v8::Number::New( (double) 0 );
+ int ret;
+ {
+ V8Unlock u;
+ ret = cursor->objsLeftInBatch();
+ }
+ return v8::Number::New( (double) ret );
+ }
+
+ v8::Handle<v8::Value> internalCursorReadOnly(V8Scope* scope, const v8::Arguments& args) {
+ Local<v8::Object> cursor = args.This();
+ cursor->Set(scope->V8STR_RO, v8::Boolean::New(true));
+ return cursor;
+ }
+
+ // --- DB ----
+
+ v8::Handle<v8::Value> dbInit(V8Scope* scope, const v8::Arguments& args) {
+ assert( args.Length() == 2 );
+
+ args.This()->Set( scope->getV8Str( "_mongo" ) , args[0] );
+ args.This()->Set( scope->getV8Str( "_name" ) , args[1] );
+
+ for ( int i=0; i<args.Length(); i++ )
+ assert( ! args[i]->IsUndefined() );
+
+ return v8::Undefined();
+ }
+
+ v8::Handle<v8::Value> collectionInit( V8Scope* scope, const v8::Arguments& args ) {
+ assert( args.Length() == 4 );
+
+ args.This()->Set( scope->getV8Str( "_mongo" ) , args[0] );
+ args.This()->Set( scope->getV8Str( "_db" ) , args[1] );
+ args.This()->Set( scope->getV8Str( "_shortName" ) , args[2] );
+ args.This()->Set( scope->V8STR_FULLNAME , args[3] );
+
+ if ( haveLocalShardingInfo( toSTLString( args[3] ) ) )
+ return v8::ThrowException( v8::String::New( "can't use sharded collection from db.eval" ) );
+
+ for ( int i=0; i<args.Length(); i++ )
+ assert( ! args[i]->IsUndefined() );
+
+ return v8::Undefined();
+ }
+
+ v8::Handle<v8::Value> dbQueryInit( V8Scope* scope, const v8::Arguments& args ) {
+
+ v8::Handle<v8::Object> t = args.This();
+
+ assert( args.Length() >= 4 );
+
+ t->Set( scope->getV8Str( "_mongo" ) , args[0] );
+ t->Set( scope->getV8Str( "_db" ) , args[1] );
+ t->Set( scope->getV8Str( "_collection" ) , args[2] );
+ t->Set( scope->getV8Str( "_ns" ) , args[3] );
+
+ if ( args.Length() > 4 && args[4]->IsObject() )
+ t->Set( scope->getV8Str( "_query" ) , args[4] );
+ else
+ t->Set( scope->getV8Str( "_query" ) , v8::Object::New() );
+
+ if ( args.Length() > 5 && args[5]->IsObject() )
+ t->Set( scope->getV8Str( "_fields" ) , args[5] );
+ else
+ t->Set( scope->getV8Str( "_fields" ) , v8::Null() );
+
+
+ if ( args.Length() > 6 && args[6]->IsNumber() )
+ t->Set( scope->getV8Str( "_limit" ) , args[6] );
+ else
+ t->Set( scope->getV8Str( "_limit" ) , Number::New( 0 ) );
+
+ if ( args.Length() > 7 && args[7]->IsNumber() )
+ t->Set( scope->getV8Str( "_skip" ) , args[7] );
+ else
+ t->Set( scope->getV8Str( "_skip" ) , Number::New( 0 ) );
+
+ if ( args.Length() > 8 && args[8]->IsNumber() )
+ t->Set( scope->getV8Str( "_batchSize" ) , args[8] );
+ else
+ t->Set( scope->getV8Str( "_batchSize" ) , Number::New( 0 ) );
+
+ if ( args.Length() > 9 && args[9]->IsNumber() )
+ t->Set( scope->getV8Str( "_options" ) , args[9] );
+ else
+ t->Set( scope->getV8Str( "_options" ) , Number::New( 0 ) );
+
+
+ t->Set( scope->getV8Str( "_cursor" ) , v8::Null() );
+ t->Set( scope->getV8Str( "_numReturned" ) , v8::Number::New(0) );
+ t->Set( scope->getV8Str( "_special" ) , Boolean::New(false) );
+
+ return v8::Undefined();
+ }
+
+ Handle<Value> collectionSetter( Local<v8::String> name, Local<Value> value, const AccessorInfo& info ) {
+        // a collection name cannot be overwritten by a variable
+        string sname = toSTLString( name );
+        if ( sname.length() == 0 || sname[0] == '_' ) {
+            // empty names and names starting with '_' may be overwritten
+            return Handle<Value>();
+        }
+        // otherwise intercept the assignment; don't set the property
+        return value;
+ }
+
+ v8::Handle<v8::Value> collectionGetter( v8::Local<v8::String> name, const v8::AccessorInfo &info) {
+ DDD( "collectionFallback [" << name << "]" );
+
+ // first look in prototype, may be a function
+ v8::Handle<v8::Value> real = info.This()->GetPrototype()->ToObject()->Get( name );
+ if ( !real->IsUndefined() )
+ return real;
+
+ // 2nd look into real values, may be cached collection object
+ string sname = toSTLString( name );
+ if (info.This()->HasRealNamedProperty(name)) {
+ v8::Local<v8::Value> prop = info.This()->GetRealNamedProperty( name );
+ if (prop->IsObject() && prop->ToObject()->HasRealNamedProperty(v8::String::New("_fullName"))) {
+ // need to check every time that the collection did not get sharded
+ if ( haveLocalShardingInfo( toSTLString( prop->ToObject()->GetRealNamedProperty(v8::String::New("_fullName")) ) ) )
+ return v8::ThrowException( v8::String::New( "can't use sharded collection from db.eval" ) );
+ }
+ return prop;
+ } else if ( sname.length() == 0 || sname[0] == '_' ) {
+            // names that are empty or start with '_' don't resolve to a collection; use getCollection() instead
+ return v8::Undefined();
+ }
+
+ // no hit, create new collection
+ v8::Handle<v8::Value> getCollection = info.This()->GetPrototype()->ToObject()->Get( v8::String::New( "getCollection" ) );
+ assert( getCollection->IsFunction() );
+
+ TryCatch tryCatch;
+ v8::Function * f = (v8::Function*)(*getCollection);
+ v8::Handle<v8::Value> argv[1];
+ argv[0] = name;
+ v8::Local<v8::Value> coll = f->Call( info.This() , 1 , argv );
+ if (coll.IsEmpty()) {
+ if (tryCatch.HasCaught()) {
+ return v8::ThrowException( tryCatch.Exception() );
+ }
+ return Handle<Value>();
+ }
+
+        // cache the collection for reuse; don't enumerate it
+ info.This()->ForceSet(name, coll, v8::DontEnum);
+ return coll;
+ }
+
+ v8::Handle<v8::Value> dbQueryIndexAccess( unsigned int index , const v8::AccessorInfo& info ) {
+ v8::Handle<v8::Value> arrayAccess = info.This()->GetPrototype()->ToObject()->Get( v8::String::New( "arrayAccess" ) );
+ assert( arrayAccess->IsFunction() );
+
+ v8::Function * f = (v8::Function*)(*arrayAccess);
+ v8::Handle<v8::Value> argv[1];
+ argv[0] = v8::Number::New( index );
+
+ return f->Call( info.This() , 1 , argv );
+ }
+
+ v8::Handle<v8::Value> objectIdInit( V8Scope* scope, const v8::Arguments& args ) {
+ v8::Handle<v8::Object> it = args.This();
+ if ( it->IsUndefined() || it == v8::Context::GetCurrent()->Global() ) {
+ v8::Function * f = scope->getObjectIdCons();
+ return newInstance(f, args);
+ }
+
+ OID oid;
+
+ if ( args.Length() == 0 ) {
+ oid.init();
+ }
+ else {
+ string s = toSTLString( args[0] );
+ try {
+ Scope::validateObjectIdString( s );
+ }
+ catch ( const MsgAssertionException &m ) {
+ string error = m.toString();
+ return v8::ThrowException( v8::String::New( error.c_str() ) );
+ }
+ oid.init( s );
+ }
+
+ it->Set( scope->getV8Str( "str" ) , v8::String::New( oid.str().c_str() ) );
+
+ return it;
+ }
+
+ v8::Handle<v8::Value> dbRefInit( V8Scope* scope, const v8::Arguments& args ) {
+ v8::Handle<v8::Object> it = args.This();
+ if ( it->IsUndefined() || it == v8::Context::GetCurrent()->Global() ) {
+ v8::Function * f = scope->getNamedCons( "DBRef" );
+ return newInstance(f, args);
+ }
+
+ if (args.Length() != 2 && args.Length() != 0) {
+ return v8::ThrowException( v8::String::New( "DBRef needs 2 arguments" ) );
+ }
+
+ if ( args.Length() == 2 ) {
+ it->Set( scope->getV8Str( "$ref" ) , args[0] );
+ it->Set( scope->getV8Str( "$id" ) , args[1] );
+ }
+
+ return it;
+ }
+
+ v8::Handle<v8::Value> dbPointerInit( V8Scope* scope, const v8::Arguments& args ) {
+ v8::Handle<v8::Object> it = args.This();
+ if ( it->IsUndefined() || it == v8::Context::GetCurrent()->Global() ) {
+ v8::Function * f = scope->getNamedCons( "DBPointer" );
+ return newInstance(f, args);
+ }
+
+ if (args.Length() != 2) {
+ return v8::ThrowException( v8::String::New( "DBPointer needs 2 arguments" ) );
+ }
+
+ it->Set( scope->getV8Str( "ns" ) , args[0] );
+ it->Set( scope->getV8Str( "id" ) , args[1] );
+ it->SetHiddenValue( scope->getV8Str( "__DBPointer" ), v8::Number::New( 1 ) );
+
+ return it;
+ }
+
+ v8::Handle<v8::Value> dbTimestampInit( V8Scope* scope, const v8::Arguments& args ) {
+ v8::Handle<v8::Object> it = args.This();
+ if ( it->IsUndefined() || it == v8::Context::GetCurrent()->Global() ) {
+ v8::Function * f = scope->getNamedCons( "Timestamp" );
+ return newInstance(f, args);
+ }
+
+ if ( args.Length() == 0 ) {
+ it->Set( scope->getV8Str( "t" ) , v8::Number::New( 0 ) );
+ it->Set( scope->getV8Str( "i" ) , v8::Number::New( 0 ) );
+ }
+ else if ( args.Length() == 2 ) {
+ it->Set( scope->getV8Str( "t" ) , args[0] );
+ it->Set( scope->getV8Str( "i" ) , args[1] );
+ }
+ else {
+ return v8::ThrowException( v8::String::New( "Timestamp needs 0 or 2 arguments" ) );
+ }
+
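+        // stash the BSON type code in the internal field so the value can be recognized when converting back to BSON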
+ it->SetInternalField( 0, v8::Uint32::New( Timestamp ) );
+
+ return it;
+ }
+
+
+ v8::Handle<v8::Value> binDataInit( V8Scope* scope, const v8::Arguments& args ) {
+ v8::Local<v8::Object> it = args.This();
+ if ( it->IsUndefined() || it == v8::Context::GetCurrent()->Global() ) {
+ v8::Function* f = scope->getNamedCons( "BinData" );
+ return newInstance(f, args);
+ }
+
+ Handle<Value> type;
+ Handle<Value> len;
+ int rlen;
+ char* data;
+ if (args.Length() == 3) {
+ // 3 args: len, type, data
+ len = args[0];
+ rlen = len->IntegerValue();
+ type = args[1];
+ v8::String::Utf8Value utf( args[ 2 ] );
+ char* tmp = *utf;
+ data = new char[rlen];
+ memcpy(data, tmp, rlen);
+ }
+ else if ( args.Length() == 2 ) {
+ // 2 args: type, base64 string
+ type = args[0];
+ v8::String::Utf8Value utf( args[ 1 ] );
+ string decoded = base64::decode( *utf );
+ const char* tmp = decoded.data();
+ rlen = decoded.length();
+ data = new char[rlen];
+ memcpy(data, tmp, rlen);
+ len = v8::Number::New(rlen);
+// it->Set( scope->getV8Str( "data" ), v8::String::New( decoded.data(), decoded.length() ) );
+ } else if (args.Length() == 0) {
+ // this is called by subclasses that will fill properties
+ return it;
+ } else {
+ return v8::ThrowException( v8::String::New( "BinData needs 2 or 3 arguments" ) );
+ }
+
+ it->Set( scope->getV8Str( "len" ) , len );
+ it->Set( scope->getV8Str( "type" ) , type );
+ it->SetHiddenValue( scope->V8STR_BINDATA, v8::Number::New( 1 ) );
+ Persistent<v8::Object> res = scope->wrapArrayObject(it, data);
+ return res;
+ }
+
+ v8::Handle<v8::Value> binDataToString( V8Scope* scope, const v8::Arguments& args ) {
+ v8::Handle<v8::Object> it = args.This();
+ int len = it->Get( scope->V8STR_LEN )->Int32Value();
+ int type = it->Get( scope->V8STR_TYPE )->Int32Value();
+ Local<External> c = External::Cast( *(it->GetInternalField( 0 )) );
+ char* data = (char*)(c->Value());
+
+ stringstream ss;
+ ss << "BinData(" << type << ",\"";
+ base64::encode( ss, data, len );
+ ss << "\")";
+ string ret = ss.str();
+ return v8::String::New( ret.c_str() );
+ }
+
+ v8::Handle<v8::Value> binDataToBase64( V8Scope* scope, const v8::Arguments& args ) {
+ v8::Handle<v8::Object> it = args.This();
+ int len = Handle<v8::Number>::Cast(it->Get(scope->V8STR_LEN))->Int32Value();
+ Local<External> c = External::Cast( *(it->GetInternalField( 0 )) );
+ char* data = (char*)(c->Value());
+ stringstream ss;
+ base64::encode( ss, (const char *)data, len );
+ return v8::String::New(ss.str().c_str());
+ }
+
+ v8::Handle<v8::Value> binDataToHex( V8Scope* scope, const v8::Arguments& args ) {
+ v8::Handle<v8::Object> it = args.This();
+ int len = Handle<v8::Number>::Cast(it->Get(scope->V8STR_LEN))->Int32Value();
+ Local<External> c = External::Cast( *(it->GetInternalField( 0 )) );
+ char* data = (char*)(c->Value());
+ stringstream ss;
+ ss.setf (ios_base::hex , ios_base::basefield);
+ ss.fill ('0');
+ ss.setf (ios_base::right , ios_base::adjustfield);
+ for( int i = 0; i < len; i++ ) {
+ unsigned v = (unsigned char) data[i];
+ ss << setw(2) << v;
+ }
+ return v8::String::New(ss.str().c_str());
+ }
+
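+    // helper: decodes a hex string into a newly allocated byte buffer and wraps it as a BinData object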
+ static v8::Handle<v8::Value> hexToBinData( V8Scope* scope, v8::Local<v8::Object> it, int type, string hexstr ) {
+ int len = hexstr.length() / 2;
+ char* data = new char[len];
+ const char* src = hexstr.c_str();
+        for( int i = 0; i < len; i++ ) {
+ data[i] = fromHex(src + i * 2);
+ }
+
+ it->Set( scope->V8STR_LEN , v8::Number::New(len) );
+ it->Set( scope->V8STR_TYPE , v8::Number::New(type) );
+ it->SetHiddenValue( scope->V8STR_BINDATA, v8::Number::New( 1 ) );
+ Persistent<v8::Object> res = scope->wrapArrayObject(it, data);
+ return res;
+ }
+
+ v8::Handle<v8::Value> uuidInit( V8Scope* scope, const v8::Arguments& args ) {
+ if (args.Length() != 1) {
+            return v8::ThrowException( v8::String::New( "UUID needs 1 argument" ) );
+ }
+ v8::String::Utf8Value utf( args[ 0 ] );
+ if( utf.length() != 32 ) {
+            return v8::ThrowException( v8::String::New( "UUID string must have 32 characters" ) );
+ }
+
+ v8::Function * f = scope->getNamedCons("BinData");
+ Local<v8::Object> it = f->NewInstance();
+ return hexToBinData(scope, it, bdtUUID, *utf);
+ }
+
+ v8::Handle<v8::Value> md5Init( V8Scope* scope, const v8::Arguments& args ) {
+ if (args.Length() != 1) {
+ return v8::ThrowException( v8::String::New( "MD5 needs 1 argument" ) );
+ }
+ v8::String::Utf8Value utf( args[ 0 ] );
+ if( utf.length() != 32 ) {
+ return v8::ThrowException( v8::String::New( "MD5 string must have 32 characters" ) );
+ }
+
+ v8::Function * f = scope->getNamedCons("BinData");
+ Local<v8::Object> it = f->NewInstance();
+ return hexToBinData(scope, it, MD5Type, *utf);
+ }
+
+ v8::Handle<v8::Value> hexDataInit( V8Scope* scope, const v8::Arguments& args ) {
+ if (args.Length() != 2) {
+ return v8::ThrowException( v8::String::New( "HexData needs 2 arguments" ) );
+ }
+ v8::String::Utf8Value utf( args[ 1 ] );
+ v8::Function * f = scope->getNamedCons("BinData");
+ Local<v8::Object> it = f->NewInstance();
+ return hexToBinData(scope, it, args[0]->IntegerValue(), *utf);
+ }
+
+ v8::Handle<v8::Value> numberLongInit( V8Scope* scope, const v8::Arguments& args ) {
+ v8::Handle<v8::Object> it = args.This();
+ if ( it->IsUndefined() || it == v8::Context::GetCurrent()->Global() ) {
+ v8::Function * f = scope->getNamedCons( "NumberLong" );
+ return newInstance(f, args);
+ }
+
+ if (args.Length() != 0 && args.Length() != 1 && args.Length() != 3) {
+ return v8::ThrowException( v8::String::New( "NumberLong needs 0, 1 or 3 arguments" ) );
+ }
+
+ if ( args.Length() == 0 ) {
+ it->Set( scope->getV8Str( "floatApprox" ), v8::Number::New( 0 ) );
+ }
+ else if ( args.Length() == 1 ) {
+ if ( args[ 0 ]->IsNumber() ) {
+ it->Set( scope->getV8Str( "floatApprox" ), args[ 0 ] );
+ }
+ else {
+ v8::String::Utf8Value data( args[ 0 ] );
+ string num = *data;
+ const char *numStr = num.c_str();
+ long long n;
+ try {
+ n = parseLL( numStr );
+ }
+ catch ( const AssertionException & ) {
+ return v8::ThrowException( v8::String::New( "could not convert string to long long" ) );
+ }
+ unsigned long long val = n;
+ // values above 2^53 are not accurately represented in JS
+ if ( (long long)val == (long long)(double)(long long)(val) && val < 9007199254740992ULL ) {
+ it->Set( scope->getV8Str( "floatApprox" ), v8::Number::New( (double)(long long)( val ) ) );
+ }
+ else {
+ it->Set( scope->getV8Str( "floatApprox" ), v8::Number::New( (double)(long long)( val ) ) );
+ it->Set( scope->getV8Str( "top" ), v8::Integer::New( val >> 32 ) );
+ it->Set( scope->getV8Str( "bottom" ), v8::Integer::New( (unsigned long)(val & 0x00000000ffffffff) ) );
+ }
+ }
+ }
+ else {
+ it->Set( scope->getV8Str( "floatApprox" ) , args[0] );
+ it->Set( scope->getV8Str( "top" ) , args[1] );
+ it->Set( scope->getV8Str( "bottom" ) , args[2] );
+ }
+ it->SetHiddenValue( scope->V8STR_NUMBERLONG, v8::Number::New( 1 ) );
+
+ return it;
+ }
+
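+    // reassemble the 64-bit value from the 'top'/'bottom' 32-bit halves, falling back to 'floatApprox' when only that is set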
+ long long numberLongVal( const v8::Handle< v8::Object > &it ) {
+ if ( !it->Has( v8::String::New( "top" ) ) )
+ return (long long)( it->Get( v8::String::New( "floatApprox" ) )->NumberValue() );
+ return
+ (long long)
+ ( (unsigned long long)( it->Get( v8::String::New( "top" ) )->ToInt32()->Value() ) << 32 ) +
+ (unsigned)( it->Get( v8::String::New( "bottom" ) )->ToInt32()->Value() );
+ }
+
+ v8::Handle<v8::Value> numberLongValueOf( V8Scope* scope, const v8::Arguments& args ) {
+ v8::Handle<v8::Object> it = args.This();
+ long long val = numberLongVal( it );
+ return v8::Number::New( double( val ) );
+ }
+
+ v8::Handle<v8::Value> numberLongToNumber( V8Scope* scope, const v8::Arguments& args ) {
+ return numberLongValueOf( scope, args );
+ }
+
+ v8::Handle<v8::Value> numberLongToString( V8Scope* scope, const v8::Arguments& args ) {
+ v8::Handle<v8::Object> it = args.This();
+
+ stringstream ss;
+ long long val = numberLongVal( it );
+ const long long limit = 2LL << 30;
+
+ if ( val <= -limit || limit <= val )
+ ss << "NumberLong(\"" << val << "\")";
+ else
+ ss << "NumberLong(" << val << ")";
+
+ string ret = ss.str();
+ return v8::String::New( ret.c_str() );
+ }
+
+ v8::Handle<v8::Value> numberIntInit( V8Scope* scope, const v8::Arguments& args ) {
+ v8::Handle<v8::Object> it = args.This();
+ if ( it->IsUndefined() || it == v8::Context::GetCurrent()->Global() ) {
+ v8::Function * f = scope->getNamedCons( "NumberInt" );
+ return newInstance(f, args);
+ }
+
+ if (args.Length() != 0 && args.Length() != 1) {
+            return v8::ThrowException( v8::String::New( "NumberInt needs 0 or 1 arguments" ) );
+ }
+
+ if ( args.Length() == 0 ) {
+ it->SetHiddenValue( scope->V8STR_NUMBERINT, v8::Number::New( 0 ) );
+ }
+ else if ( args.Length() == 1 ) {
+ it->SetHiddenValue( scope->V8STR_NUMBERINT, args[0]->ToInt32() );
+ }
+
+ return it;
+ }
+
+ v8::Handle<v8::Value> numberIntValueOf( V8Scope* scope, const v8::Arguments& args ) {
+ v8::Handle<v8::Object> it = args.This();
+ int val = it->GetHiddenValue( scope->V8STR_NUMBERINT )->Int32Value();
+ return v8::Number::New( double( val ) );
+ }
+
+ v8::Handle<v8::Value> numberIntToNumber( V8Scope* scope, const v8::Arguments& args ) {
+ return numberIntValueOf( scope, args );
+ }
+
+ v8::Handle<v8::Value> numberIntToString( V8Scope* scope, const v8::Arguments& args ) {
+ v8::Handle<v8::Object> it = args.This();
+
+ stringstream ss;
+ int val = it->GetHiddenValue( scope->V8STR_NUMBERINT )->Int32Value();
+ ss << "NumberInt(" << val << ")";
+
+ string ret = ss.str();
+ return v8::String::New( ret.c_str() );
+ }
+
+ v8::Handle<v8::Value> bsonsize( V8Scope* scope, const v8::Arguments& args ) {
+
+ if ( args.Length() != 1 )
+ return v8::ThrowException( v8::String::New( "bsonsize needs 1 argument" ) );
+
+ if ( args[0]->IsNull() )
+ return v8::Number::New(0);
+
+ if ( ! args[ 0 ]->IsObject() )
+ return v8::ThrowException( v8::String::New( "argument to bsonsize has to be an object" ) );
+
+ return v8::Number::New( scope->v8ToMongo( args[ 0 ]->ToObject() ).objsize() );
+ }
+
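+    // v8Locks: a single global mutex guards all V8 usage; a thread-local flag makes the lock
+    // recursive per thread, and RecursiveUnlock temporarily yields it (e.g. around blocking work)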
+ namespace v8Locks {
+ boost::mutex& __interruptMutex = *( new boost::mutex );
+
+ InterruptLock::InterruptLock() {
+ __interruptMutex.lock();
+ }
+
+ InterruptLock::~InterruptLock() {
+ __interruptMutex.unlock();
+ }
+
+ boost::mutex& __v8Mutex = *( new boost::mutex );
+ ThreadLocalValue< bool > __locked;
+
+ RecursiveLock::RecursiveLock() : _unlock() {
+ if ( !__locked.get() ) {
+ __v8Mutex.lock();
+ __locked.set( true );
+ _unlock = true;
+ }
+ }
+ RecursiveLock::~RecursiveLock() {
+ if ( _unlock ) {
+ __v8Mutex.unlock();
+ __locked.set( false );
+ }
+ }
+
+ RecursiveUnlock::RecursiveUnlock() : _lock() {
+ if ( __locked.get() ) {
+ __v8Mutex.unlock();
+ __locked.set( false );
+ _lock = true;
+ }
+ }
+ RecursiveUnlock::~RecursiveUnlock() {
+ if ( _lock ) {
+ __v8Mutex.lock();
+ __locked.set( true );
+ }
+ }
+ } // namespace v8Locks
+}
diff --git a/src/mongo/scripting/v8_db.h b/src/mongo/scripting/v8_db.h
new file mode 100644
index 00000000000..68946e0ed06
--- /dev/null
+++ b/src/mongo/scripting/v8_db.h
@@ -0,0 +1,94 @@
+// v8_db.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <v8.h>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+
+#include "engine_v8.h"
+#include "../client/dbclient.h"
+
+namespace mongo {
+
+ // These functions may depend on the caller creating a handle scope and context scope.
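+    // A minimal usage sketch (assumes 'scope' came from ScriptEngine::newScope()):
+    //     v8::HandleScope handle_scope;
+    //     v8::Context::Scope context_scope( scope->getContext() );
+    //     installDBTypes( scope, global );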
+
+ v8::Handle<v8::FunctionTemplate> getMongoFunctionTemplate( V8Scope * scope, bool local );
+// void installDBTypes( V8Scope * scope, v8::Handle<v8::ObjectTemplate>& global );
+ void installDBTypes( V8Scope * scope, v8::Handle<v8::Object>& global );
+
+ // the actual globals
+
+ mongo::DBClientBase * getConnection( const v8::Arguments& args );
+
+ // Mongo members
+ v8::Handle<v8::Value> mongoConsLocal(V8Scope* scope, const v8::Arguments& args);
+ v8::Handle<v8::Value> mongoConsExternal(V8Scope* scope, const v8::Arguments& args);
+
+ v8::Handle<v8::Value> mongoFind(V8Scope* scope, const v8::Arguments& args);
+ v8::Handle<v8::Value> mongoInsert(V8Scope* scope, const v8::Arguments& args);
+ v8::Handle<v8::Value> mongoRemove(V8Scope* scope, const v8::Arguments& args);
+ v8::Handle<v8::Value> mongoUpdate(V8Scope* scope, const v8::Arguments& args);
+ v8::Handle<v8::Value> mongoAuth(V8Scope* scope, const v8::Arguments& args);
+
+ v8::Handle<v8::Value> internalCursorCons(V8Scope* scope, const v8::Arguments& args);
+ v8::Handle<v8::Value> internalCursorNext(V8Scope* scope, const v8::Arguments& args);
+ v8::Handle<v8::Value> internalCursorHasNext(V8Scope* scope, const v8::Arguments& args);
+ v8::Handle<v8::Value> internalCursorObjsLeftInBatch(V8Scope* scope, const v8::Arguments& args);
+ v8::Handle<v8::Value> internalCursorReadOnly(V8Scope* scope, const v8::Arguments& args);
+
+ // DB members
+
+ v8::Handle<v8::Value> dbInit(V8Scope* scope, const v8::Arguments& args);
+ v8::Handle<v8::Value> collectionInit(V8Scope* scope, const v8::Arguments& args );
+ v8::Handle<v8::Value> objectIdInit( V8Scope* scope, const v8::Arguments& args );
+
+ v8::Handle<v8::Value> dbRefInit( V8Scope* scope, const v8::Arguments& args );
+ v8::Handle<v8::Value> dbPointerInit( V8Scope* scope, const v8::Arguments& args );
+ v8::Handle<v8::Value> dbTimestampInit( V8Scope* scope, const v8::Arguments& args );
+
+ v8::Handle<v8::Value> binDataInit( V8Scope* scope, const v8::Arguments& args );
+ v8::Handle<v8::Value> binDataToString( V8Scope* scope, const v8::Arguments& args );
+ v8::Handle<v8::Value> binDataToBase64( V8Scope* scope, const v8::Arguments& args );
+ v8::Handle<v8::Value> binDataToHex( V8Scope* scope, const v8::Arguments& args );
+
+ v8::Handle<v8::Value> uuidInit( V8Scope* scope, const v8::Arguments& args );
+ v8::Handle<v8::Value> md5Init( V8Scope* scope, const v8::Arguments& args );
+ v8::Handle<v8::Value> hexDataInit( V8Scope* scope, const v8::Arguments& args );
+
+ v8::Handle<v8::Value> numberLongInit( V8Scope* scope, const v8::Arguments& args );
+ v8::Handle<v8::Value> numberLongToNumber(V8Scope* scope, const v8::Arguments& args);
+ v8::Handle<v8::Value> numberLongValueOf(V8Scope* scope, const v8::Arguments& args);
+ v8::Handle<v8::Value> numberLongToString(V8Scope* scope, const v8::Arguments& args);
+
+ v8::Handle<v8::Value> numberIntInit( V8Scope* scope, const v8::Arguments& args );
+ v8::Handle<v8::Value> numberIntToNumber(V8Scope* scope, const v8::Arguments& args);
+ v8::Handle<v8::Value> numberIntValueOf(V8Scope* scope, const v8::Arguments& args);
+ v8::Handle<v8::Value> numberIntToString(V8Scope* scope, const v8::Arguments& args);
+
+ v8::Handle<v8::Value> dbQueryInit( V8Scope* scope, const v8::Arguments& args );
+ v8::Handle<v8::Value> dbQueryIndexAccess( uint32_t index , const v8::AccessorInfo& info );
+
+ v8::Handle<v8::Value> collectionGetter( v8::Local<v8::String> name, const v8::AccessorInfo &info);
+    v8::Handle<v8::Value> collectionSetter( v8::Local<v8::String> name, v8::Local<v8::Value> value, const v8::AccessorInfo& info );
+
+ v8::Handle<v8::Value> bsonsize( V8Scope* scope, const v8::Arguments& args );
+
+}
+
diff --git a/src/mongo/scripting/v8_utils.cpp b/src/mongo/scripting/v8_utils.cpp
new file mode 100644
index 00000000000..9e7e8072220
--- /dev/null
+++ b/src/mongo/scripting/v8_utils.cpp
@@ -0,0 +1,295 @@
+// v8_utils.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined(_WIN32)
+/** this is a hack - v8stdint.h defined uint16_t etc. on _WIN32 only, and that collides with
+ our usage of boost */
+#include "boost/cstdint.hpp"
+using namespace boost;
+#define V8STDINT_H_
+#endif
+
+#include "v8_utils.h"
+#include "v8_db.h"
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <vector>
+#include <boost/smart_ptr.hpp>
+#include <boost/thread/thread.hpp>
+#include <boost/thread/xtime.hpp>
+#include "engine_v8.h"
+
+using namespace std;
+using namespace v8;
+
+namespace mongo {
+
+ std::string toSTLString( const Handle<v8::Value> & o ) {
+ v8::String::Utf8Value str(o);
+ const char * foo = *str;
+ std::string s(foo);
+ return s;
+ }
+
+ std::string toSTLString( const v8::TryCatch * try_catch ) {
+
+ stringstream ss;
+
+ //while ( try_catch ){ // disabled for v8 bleeding edge
+
+ v8::String::Utf8Value exception(try_catch->Exception());
+ Handle<v8::Message> message = try_catch->Message();
+
+ if (message.IsEmpty()) {
+ ss << *exception << endl;
+ }
+ else {
+
+ v8::String::Utf8Value filename(message->GetScriptResourceName());
+ int linenum = message->GetLineNumber();
+ ss << *filename << ":" << linenum << " " << *exception << endl;
+
+ v8::String::Utf8Value sourceline(message->GetSourceLine());
+ ss << *sourceline << endl;
+
+ int start = message->GetStartColumn();
+ for (int i = 0; i < start; i++)
+ ss << " ";
+
+ int end = message->GetEndColumn();
+ for (int i = start; i < end; i++)
+ ss << "^";
+
+ ss << endl;
+ }
+
+ //try_catch = try_catch->next_;
+ //}
+
+ return ss.str();
+ }
+
+
+ std::ostream& operator<<( std::ostream &s, const Handle<v8::Value> & o ) {
+ v8::String::Utf8Value str(o);
+ s << *str;
+ return s;
+ }
+
+ std::ostream& operator<<( std::ostream &s, const v8::TryCatch * try_catch ) {
+ HandleScope handle_scope;
+ v8::String::Utf8Value exception(try_catch->Exception());
+ Handle<v8::Message> message = try_catch->Message();
+
+ if (message.IsEmpty()) {
+ s << *exception << endl;
+ }
+        else {
+
+            v8::String::Utf8Value filename(message->GetScriptResourceName());
+            int linenum = message->GetLineNumber();
+            s << *filename << ":" << linenum << " " << *exception << endl;
+
+            v8::String::Utf8Value sourceline(message->GetSourceLine());
+            s << *sourceline << endl;
+
+            int start = message->GetStartColumn();
+            for (int i = 0; i < start; i++)
+                s << " ";
+
+            int end = message->GetEndColumn();
+            for (int i = start; i < end; i++)
+                s << "^";
+
+            s << endl;
+        }
+
+ //if ( try_catch->next_ ) // disabled for v8 bleeding edge
+ // s << try_catch->next_;
+
+ return s;
+ }
+
+ void ReportException(v8::TryCatch* try_catch) {
+ cout << try_catch << endl;
+ }
+
+ Handle< Context > baseContext_;
+
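+    // Backs the shell's Thread()/ScopedThread() objects. Each spawned thread runs in its own
+    // V8Scope (its own isolate), so arguments and return values are shuttled across as BSON
+    // rather than as V8 handles.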
+ class JSThreadConfig {
+ public:
+ JSThreadConfig( V8Scope* scope, const Arguments &args, bool newScope = false ) : started_(), done_(), newScope_( newScope ) {
+ jsassert( args.Length() > 0, "need at least one argument" );
+ jsassert( args[ 0 ]->IsFunction(), "first argument must be a function" );
+
+ // arguments need to be copied into the isolate, go through bson
+ BSONObjBuilder b;
+ for( int i = 0; i < args.Length(); ++i ) {
+                stringstream ss; ss << "arg" << i; // "arg" + i would be pointer arithmetic, not concatenation
+                scope->v8ToMongoElement(b, ss.str(), args[i]);
+ }
+ args_ = b.obj();
+ }
+
+ ~JSThreadConfig() {
+ }
+
+ void start() {
+ jsassert( !started_, "Thread already started" );
+ // obtain own scope for execution
+ // do it here, not in constructor, otherwise it creates an infinite recursion from ScopedThread
+ _scope.reset( dynamic_cast< V8Scope * >( globalScriptEngine->newScope() ) );
+
+ JSThread jt( *this );
+ thread_.reset( new boost::thread( jt ) );
+ started_ = true;
+ }
+ void join() {
+ jsassert( started_ && !done_, "Thread not running" );
+ thread_->join();
+ done_ = true;
+ }
+
+ BSONObj returnData() {
+ if ( !done_ )
+ join();
+ return returnData_;
+ }
+
+ private:
+ class JSThread {
+ public:
+ JSThread( JSThreadConfig &config ) : config_( config ) {}
+
+ void operator()() {
+ V8Scope* scope = config_._scope.get();
+ v8::Isolate::Scope iscope(scope->getIsolate());
+ v8::Locker l(scope->getIsolate());
+ HandleScope handle_scope;
+ Context::Scope context_scope( scope->getContext() );
+
+ BSONObj args = config_.args_;
+ Local< v8::Function > f = v8::Function::Cast( *(scope->mongoToV8Element(args.firstElement(), true)) );
+ int argc = args.nFields() - 1;
+
+ boost::scoped_array< Local< Value > > argv( new Local< Value >[ argc ] );
+ BSONObjIterator it(args);
+ it.next();
+ for( int i = 0; i < argc; ++i ) {
+ argv[ i ] = Local< Value >::New( scope->mongoToV8Element(*it, true) );
+ it.next();
+ }
+ TryCatch try_catch;
+ Handle< Value > ret = f->Call( scope->getContext()->Global(), argc, argv.get() );
+ if ( ret.IsEmpty() ) {
+ string e = toSTLString( &try_catch );
+ log() << "js thread raised exception: " << e << endl;
+ // v8 probably does something sane if ret is empty, but not going to assume that for now
+ ret = v8::Undefined();
+ }
+ // ret is translated to BSON to switch isolate
+ BSONObjBuilder b;
+ scope->v8ToMongoElement(b, "ret", ret);
+ config_.returnData_ = b.obj();
+ }
+
+ private:
+ JSThreadConfig &config_;
+ };
+
+ bool started_;
+ bool done_;
+ bool newScope_;
+ BSONObj args_;
+ scoped_ptr< boost::thread > thread_;
+ scoped_ptr< V8Scope > _scope;
+ BSONObj returnData_;
+ };
+
+ Handle< Value > ThreadInit( V8Scope* scope, const Arguments &args ) {
+ Handle<v8::Object> it = args.This();
+ // NOTE I believe the passed JSThreadConfig will never be freed. If this
+ // policy is changed, JSThread may no longer be able to store JSThreadConfig
+ // by reference.
+ it->SetHiddenValue( v8::String::New( "_JSThreadConfig" ), External::New( new JSThreadConfig( scope, args ) ) );
+ return v8::Undefined();
+ }
+
+ Handle< Value > ScopedThreadInit( V8Scope* scope, const Arguments &args ) {
+ Handle<v8::Object> it = args.This();
+ // NOTE I believe the passed JSThreadConfig will never be freed. If this
+ // policy is changed, JSThread may no longer be able to store JSThreadConfig
+ // by reference.
+ it->SetHiddenValue( v8::String::New( "_JSThreadConfig" ), External::New( new JSThreadConfig( scope, args, true ) ) );
+ return v8::Undefined();
+ }
+
+ JSThreadConfig *thisConfig( V8Scope* scope, const Arguments &args ) {
+ Local< External > c = External::Cast( *(args.This()->GetHiddenValue( v8::String::New( "_JSThreadConfig" ) ) ) );
+ JSThreadConfig *config = (JSThreadConfig *)( c->Value() );
+ return config;
+ }
+
+ Handle< Value > ThreadStart( V8Scope* scope, const Arguments &args ) {
+ thisConfig( scope, args )->start();
+ return v8::Undefined();
+ }
+
+ Handle< Value > ThreadJoin( V8Scope* scope, const Arguments &args ) {
+ thisConfig( scope, args )->join();
+ return v8::Undefined();
+ }
+
+ Handle< Value > ThreadReturnData( V8Scope* scope, const Arguments &args ) {
+ BSONObj data = thisConfig( scope, args )->returnData();
+ return scope->mongoToV8Element(data.firstElement(), true);
+ }
+
+ Handle< Value > ThreadInject( V8Scope* scope, const Arguments &args ) {
+ jsassert( args.Length() == 1 , "threadInject takes exactly 1 argument" );
+ jsassert( args[0]->IsObject() , "threadInject needs to be passed a prototype" );
+
+ Local<v8::Object> o = args[0]->ToObject();
+
+ // install method on the Thread object
+ scope->injectV8Function("init", ThreadInit, o);
+ scope->injectV8Function("start", ThreadStart, o);
+ scope->injectV8Function("join", ThreadJoin, o);
+ scope->injectV8Function("returnData", ThreadReturnData, o);
+ return v8::Undefined();
+ }
+
+ Handle< Value > ScopedThreadInject( V8Scope* scope, const Arguments &args ) {
+ jsassert( args.Length() == 1 , "threadInject takes exactly 1 argument" );
+ jsassert( args[0]->IsObject() , "threadInject needs to be passed a prototype" );
+
+ Local<v8::Object> o = args[0]->ToObject();
+
+ scope->injectV8Function("init", ScopedThreadInit, o);
+ // inheritance takes care of other member functions
+
+ return v8::Undefined();
+ }
+
+ void installFork( V8Scope* scope, v8::Handle< v8::Object > &global, v8::Handle< v8::Context > &context ) {
+ if ( baseContext_.IsEmpty() ) // if this is the shell, first call will be with shell context, otherwise don't expect to use fork() anyway
+ baseContext_ = context;
+ scope->injectV8Function("_threadInject", ThreadInject, global);
+ scope->injectV8Function("_scopedThreadInject", ScopedThreadInject, global);
+ }
+
+}
diff --git a/src/mongo/scripting/v8_utils.h b/src/mongo/scripting/v8_utils.h
new file mode 100644
index 00000000000..ca5d317885f
--- /dev/null
+++ b/src/mongo/scripting/v8_utils.h
@@ -0,0 +1,43 @@
+// v8_utils.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <v8.h>
+
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <assert.h>
+#include <iostream>
+
+namespace mongo {
+
+ void ReportException(v8::TryCatch* handler);
+
+#define jsassert(x,msg) assert(x)
+
+ std::ostream& operator<<( std::ostream &s, const v8::Handle<v8::Value> & o );
+    std::ostream& operator<<( std::ostream &s, const v8::TryCatch * try_catch );
+
+ std::string toSTLString( const v8::Handle<v8::Value> & o );
+ std::string toSTLString( const v8::TryCatch * try_catch );
+
+ class V8Scope;
+ void installFork( V8Scope* scope, v8::Handle< v8::Object > &global, v8::Handle< v8::Context > &context );
+}
+
diff --git a/src/mongo/scripting/v8_wrapper.cpp b/src/mongo/scripting/v8_wrapper.cpp
new file mode 100644
index 00000000000..7c28a39cceb
--- /dev/null
+++ b/src/mongo/scripting/v8_wrapper.cpp
@@ -0,0 +1,99 @@
+// v8_wrapper.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined(_WIN32)
+/** this is a hack - v8stdint.h defined uint16_t etc. on _WIN32 only, and that collides with
+ our usage of boost */
+#include "boost/cstdint.hpp"
+using namespace boost;
+#define V8STDINT_H_
+#endif
+
+#include "v8_wrapper.h"
+#include "v8_utils.h"
+#include "v8_db.h"
+#include "engine_v8.h"
+
+#include <iostream>
+
+using namespace std;
+using namespace v8;
+
+namespace mongo {
+
+#define DDD(x)
+
+ // --- object wrapper ---
+
+ class WrapperHolder {
+ public:
+ WrapperHolder( V8Scope* scope, const BSONObj * o , bool readOnly , bool iDelete )
+ : _scope(scope), _o(o), _readOnly( readOnly ), _iDelete( iDelete ) {
+ }
+
+ ~WrapperHolder() {
+ if ( _o && _iDelete ) {
+ delete _o;
+ }
+ _o = 0;
+ }
+
+ v8::Handle<v8::Value> get( v8::Local<v8::String> name ) {
+ const string& s = toSTLString( name );
+ const BSONElement& e = _o->getField( s );
+ return _scope->mongoToV8Element(e);
+ }
+
+ V8Scope* _scope;
+ const BSONObj * _o;
+ bool _readOnly;
+ bool _iDelete;
+ };
+
+ WrapperHolder * createWrapperHolder( V8Scope* scope, const BSONObj * o , bool readOnly , bool iDelete ) {
+ return new WrapperHolder( scope, o , readOnly , iDelete );
+ }
+
+ WrapperHolder * getWrapper( v8::Handle<v8::Object> o ) {
+ Handle<v8::Value> t = o->GetRealNamedProperty( v8::String::New( "_wrapper" ) );
+ assert( t->IsExternal() );
+ Local<External> c = External::Cast( *t );
+ WrapperHolder * w = (WrapperHolder*)(c->Value());
+ assert( w );
+ return w;
+ }
+
+
+ Handle<Value> wrapperCons(V8Scope* scope, const Arguments& args) {
+ if ( ! ( args.Length() == 1 && args[0]->IsExternal() ) )
+ return v8::ThrowException( v8::String::New( "wrapperCons needs 1 External arg" ) );
+
+ args.This()->Set( v8::String::New( "_wrapper" ) , args[0] );
+
+ return v8::Undefined();
+ }
+
+ v8::Handle<v8::Value> wrapperGetHandler( v8::Local<v8::String> name, const v8::AccessorInfo &info) {
+ return getWrapper( info.This() )->get( name );
+ }
+
+ v8::Handle<v8::FunctionTemplate> getObjectWrapperTemplate(V8Scope* scope) {
+ v8::Handle<v8::FunctionTemplate> t = scope->createV8Function(wrapperCons);
+ t->InstanceTemplate()->SetNamedPropertyHandler( wrapperGetHandler );
+ return t;
+ }
+}
diff --git a/src/mongo/scripting/v8_wrapper.h b/src/mongo/scripting/v8_wrapper.h
new file mode 100644
index 00000000000..22f14e6ae94
--- /dev/null
+++ b/src/mongo/scripting/v8_wrapper.h
@@ -0,0 +1,34 @@
+// v8_wrapper.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <v8.h>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include "../db/jsobj.h"
+#include "engine_v8.h"
+
+namespace mongo {
+
+ v8::Handle<v8::FunctionTemplate> getObjectWrapperTemplate(V8Scope* scope);
+
+ class WrapperHolder;
+ WrapperHolder * createWrapperHolder( V8Scope* scope, const BSONObj * o , bool readOnly , bool iDelete );
+
+}
diff --git a/src/mongo/server.h b/src/mongo/server.h
new file mode 100644
index 00000000000..d9a711ef780
--- /dev/null
+++ b/src/mongo/server.h
@@ -0,0 +1,46 @@
+/** @file server.h
+
+ This file contains includes commonly needed in the server files (mongod, mongos, test). It is *NOT* included in the C++ client; i.e.
+    this is a very good place for global-ish things that you don't want in the client lib.
+
+ Over time we should move more here, and more out of pch.h. And get rid of pch.h at some point.
+*/
+
+#pragma once
+
+#include <map>
+#include <vector>
+#include <set>
+
+#include "bson/inline_decls.h"
+
+//using namespace std;
+//using namespace bson;
+
+/* Note: do not clutter code with these -- ONLY use in hot spots / significant loops. */
+
+// branch prediction. indicate we expect to be true
+#define likely MONGO_likely
+
+// branch prediction. indicate we expect to be false
+#define unlikely MONGO_unlikely
+
+// prefetch data from memory
+//#define PREFETCH MONGOPREFETCH
+
+#if defined(__GNUC__)
+
+#define CACHEALIGN __attribute__((aligned(64)))
+
+#elif defined(_MSC_VER)
+
+#define CACHEALIGN __declspec(align(64))
+
+#else
+
+#define CACHEALIGN
+
+#endif
+
+// log, but not too fast: at most once every 5 seconds. this is rather simplistic; we can do something fancier later
+#define LOGSOME static time_t __last; time_t __now = time(0); if ( __last + 5 < __now && ( __last = __now ) ) log()
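+
+// usage sketch (hypothetical call sites, for illustration only):
+//     if ( likely( cursorOk ) ) { /* hot path */ }
+//     LOGSOME << "periodic status message" << endl;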
diff --git a/src/mongo/shell/collection.js b/src/mongo/shell/collection.js
new file mode 100644
index 00000000000..df3fa516f86
--- /dev/null
+++ b/src/mongo/shell/collection.js
@@ -0,0 +1,893 @@
+// @file collection.js - DBCollection support in the mongo shell
+// db.colName is a DBCollection object
+// or db["colName"]
+
+if ( ( typeof DBCollection ) == "undefined" ){
+ DBCollection = function( mongo , db , shortName , fullName ){
+ this._mongo = mongo;
+ this._db = db;
+ this._shortName = shortName;
+ this._fullName = fullName;
+
+ this.verify();
+ }
+}
+
+DBCollection.prototype.verify = function(){
+ assert( this._fullName , "no fullName" );
+ assert( this._shortName , "no shortName" );
+ assert( this._db , "no db" );
+
+ assert.eq( this._fullName , this._db._name + "." + this._shortName , "name mismatch" );
+
+ assert( this._mongo , "no mongo in DBCollection" );
+}
+
+DBCollection.prototype.getName = function(){
+ return this._shortName;
+}
+
+DBCollection.prototype.help = function () {
+ var shortName = this.getName();
+ print("DBCollection help");
+ print("\tdb." + shortName + ".find().help() - show DBCursor help");
+ print("\tdb." + shortName + ".count()");
+ print("\tdb." + shortName + ".copyTo(newColl) - duplicates collection by copying all documents to newColl; no indexes are copied.");
+ print("\tdb." + shortName + ".convertToCapped(maxBytes) - calls {convertToCapped:'" + shortName + "', size:maxBytes}} command");
+ print("\tdb." + shortName + ".dataSize()");
+ print("\tdb." + shortName + ".distinct( key ) - eg. db." + shortName + ".distinct( 'x' )");
+ print("\tdb." + shortName + ".drop() drop the collection");
+ print("\tdb." + shortName + ".dropIndex(name)");
+ print("\tdb." + shortName + ".dropIndexes()");
+ print("\tdb." + shortName + ".ensureIndex(keypattern[,options]) - options is an object with these possible fields: name, unique, dropDups");
+ print("\tdb." + shortName + ".reIndex()");
+ print("\tdb." + shortName + ".find([query],[fields]) - query is an optional query filter. fields is optional set of fields to return.");
+ print("\t e.g. db." + shortName + ".find( {x:77} , {name:1, x:1} )");
+ print("\tdb." + shortName + ".find(...).count()");
+ print("\tdb." + shortName + ".find(...).limit(n)");
+ print("\tdb." + shortName + ".find(...).skip(n)");
+ print("\tdb." + shortName + ".find(...).sort(...)");
+ print("\tdb." + shortName + ".findOne([query])");
+ print("\tdb." + shortName + ".findAndModify( { update : ... , remove : bool [, query: {}, sort: {}, 'new': false] } )");
+ print("\tdb." + shortName + ".getDB() get DB object associated with collection");
+ print("\tdb." + shortName + ".getIndexes()");
+ print("\tdb." + shortName + ".group( { key : ..., initial: ..., reduce : ...[, cond: ...] } )");
+ print("\tdb." + shortName + ".insert(obj)");
+ print("\tdb." + shortName + ".mapReduce( mapFunction , reduceFunction , <optional params> )");
+ print("\tdb." + shortName + ".remove(query)");
+ print("\tdb." + shortName + ".renameCollection( newName , <dropTarget> ) renames the collection.");
+ print("\tdb." + shortName + ".runCommand( name , <options> ) runs a db command with the given name where the first param is the collection name");
+ print("\tdb." + shortName + ".save(obj)");
+ print("\tdb." + shortName + ".stats()");
+ print("\tdb." + shortName + ".storageSize() - includes free space allocated to this collection");
+ print("\tdb." + shortName + ".totalIndexSize() - size in bytes of all the indexes");
+ print("\tdb." + shortName + ".totalSize() - storage allocated for all data and indexes");
+ print("\tdb." + shortName + ".update(query, object[, upsert_bool, multi_bool]) - instead of two flags, you can pass an object with fields: upsert, multi");
+ print("\tdb." + shortName + ".validate( <full> ) - SLOW");;
+ print("\tdb." + shortName + ".getShardVersion() - only for use with sharding");
+ print("\tdb." + shortName + ".getShardDistribution() - prints statistics about data distribution in the cluster");
+ print("\tdb." + shortName + ".getSplitKeysForChunks( <maxChunkSize> ) - calculates split points over all chunks and returns splitter function");
+ return __magicNoPrint;
+}
+
+DBCollection.prototype.getFullName = function(){
+ return this._fullName;
+}
+DBCollection.prototype.getMongo = function(){
+ return this._db.getMongo();
+}
+DBCollection.prototype.getDB = function(){
+ return this._db;
+}
+
+DBCollection.prototype._dbCommand = function( cmd , params ){
+ if ( typeof( cmd ) == "object" )
+ return this._db._dbCommand( cmd );
+
+ var c = {};
+ c[cmd] = this.getName();
+ if ( params )
+ Object.extend( c , params );
+ return this._db._dbCommand( c );
+}
+
+DBCollection.prototype.runCommand = DBCollection.prototype._dbCommand;
+
+DBCollection.prototype._massageObject = function( q ){
+ if ( ! q )
+ return {};
+
+ var type = typeof q;
+
+ if ( type == "function" )
+ return { $where : q };
+
+ if ( q.isObjectId )
+ return { _id : q };
+
+ if ( type == "object" )
+ return q;
+
+ if ( type == "string" ){
+ if ( q.length == 24 )
+ return { _id : q };
+
+ return { $where : q };
+ }
+
+ throw "don't know how to massage : " + type;
+
+}
+
+
+DBCollection.prototype._validateObject = function( o ){
+ if ( o._ensureSpecial && o._checkModify )
+ throw "can't save a DBQuery object";
+}
+
+DBCollection._allowedFields = { $id : 1 , $ref : 1 , $db : 1 , $MinKey : 1, $MaxKey : 1 };
+
+DBCollection.prototype._validateForStorage = function( o ){
+ this._validateObject( o );
+ for ( var k in o ){
+ if ( k.indexOf( "." ) >= 0 ) {
+ throw "can't have . in field names [" + k + "]" ;
+ }
+
+ if ( k.indexOf( "$" ) == 0 && ! DBCollection._allowedFields[k] ) {
+ throw "field names cannot start with $ [" + k + "]";
+ }
+
+ if ( o[k] !== null && typeof( o[k] ) === "object" ) {
+ this._validateForStorage( o[k] );
+ }
+ }
+};
+
+
+DBCollection.prototype.find = function( query , fields , limit , skip, batchSize, options ){
+ return new DBQuery( this._mongo , this._db , this ,
+ this._fullName , this._massageObject( query ) , fields , limit , skip , batchSize , options || this.getQueryOptions() );
+}
+
+DBCollection.prototype.findOne = function( query , fields, options ){
+ var cursor = this._mongo.find( this._fullName , this._massageObject( query ) || {} , fields ,
+ -1 /* limit */ , 0 /* skip*/, 0 /* batchSize */ , options || this.getQueryOptions() /* options */ );
+ if ( ! cursor.hasNext() )
+ return null;
+ var ret = cursor.next();
+ if ( cursor.hasNext() ) throw "findOne has more than 1 result!";
+ if ( ret.$err )
+ throw "error " + tojson( ret );
+ return ret;
+}
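+
+// usage sketch (collection and field names are illustrative):
+//     var doc = db.users.findOne( { name : "alice" } , { name : 1 , age : 1 } );
+//     if ( doc == null ) print( "no match" );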
+
+DBCollection.prototype.insert = function( obj , _allow_dot ){
+ if ( ! obj )
+ throw "no object passed to insert!";
+ if ( ! _allow_dot ) {
+ this._validateForStorage( obj );
+ }
+ if ( typeof( obj._id ) == "undefined" && ! Array.isArray( obj ) ){
+ var tmp = obj; // don't want to modify input
+ obj = {_id: new ObjectId()};
+ for (var key in tmp){
+ obj[key] = tmp[key];
+ }
+ }
+ this._db._initExtraInfo();
+ this._mongo.insert( this._fullName , obj );
+ this._lastID = obj._id;
+ this._db._getExtraInfo("Inserted");
+}
+
+DBCollection.prototype.remove = function( t , justOne ){
+ for ( var k in t ){
+ if ( k == "_id" && typeof( t[k] ) == "undefined" ){
+ throw "can't have _id set to undefined in a remove expression"
+ }
+ }
+ this._db._initExtraInfo();
+ this._mongo.remove( this._fullName , this._massageObject( t ) , justOne ? true : false );
+ this._db._getExtraInfo("Removed");
+}
+
+DBCollection.prototype.update = function( query , obj , upsert , multi ){
+ assert( query , "need a query" );
+ assert( obj , "need an object" );
+
+ var firstKey = null;
+ for (var k in obj) { firstKey = k; break; }
+
+ if (firstKey != null && firstKey[0] == '$') {
+ // for mods we only validate partially, for example keys may have dots
+ this._validateObject( obj );
+ } else {
+ // we're basically inserting a brand new object, do full validation
+ this._validateForStorage( obj );
+ }
+
+ // can pass options via object for improved readability
+ if ( typeof(upsert) === 'object' ) {
+ assert( multi === undefined, "Fourth argument must be empty when specifying upsert and multi with an object." );
+
+        var opts = upsert;
+ multi = opts.multi;
+ upsert = opts.upsert;
+ }
+
+ this._db._initExtraInfo();
+ this._mongo.update( this._fullName , query , obj , upsert ? true : false , multi ? true : false );
+ this._db._getExtraInfo("Updated");
+}
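+
+// usage sketch (illustrative): the flag form and the options-object form are equivalent:
+//     db.users.update( { _id : 1 } , { $set : { x : 2 } } , true , false );
+//     db.users.update( { _id : 1 } , { $set : { x : 2 } } , { upsert : true , multi : false } );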
+
+DBCollection.prototype.save = function( obj ){
+ if ( obj == null || typeof( obj ) == "undefined" )
+ throw "can't save a null";
+
+ if ( typeof( obj ) == "number" || typeof( obj) == "string" )
+ throw "can't save a number or string"
+
+ if ( typeof( obj._id ) == "undefined" ){
+ obj._id = new ObjectId();
+ return this.insert( obj );
+ }
+ else {
+ return this.update( { _id : obj._id } , obj , true );
+ }
+}
+
+DBCollection.prototype._genIndexName = function( keys ){
+ var name = "";
+ for ( var k in keys ){
+ var v = keys[k];
+ if ( typeof v == "function" )
+ continue;
+
+ if ( name.length > 0 )
+ name += "_";
+ name += k + "_";
+
+ if ( typeof v == "number" )
+ name += v;
+ }
+ return name;
+}
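+
+// e.g. (illustrative): _genIndexName( { a : 1 , b : -1 } ) returns "a_1_b_-1"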
+
+DBCollection.prototype._indexSpec = function( keys, options ) {
+ var ret = { ns : this._fullName , key : keys , name : this._genIndexName( keys ) };
+
+ if ( ! options ){
+ }
+ else if ( typeof ( options ) == "string" )
+ ret.name = options;
+ else if ( typeof ( options ) == "boolean" )
+ ret.unique = true;
+ else if ( typeof ( options ) == "object" ){
+ if ( options.length ){
+ var nb = 0;
+ for ( var i=0; i<options.length; i++ ){
+ if ( typeof ( options[i] ) == "string" )
+ ret.name = options[i];
+ else if ( typeof( options[i] ) == "boolean" ){
+ if ( options[i] ){
+ if ( nb == 0 )
+ ret.unique = true;
+ if ( nb == 1 )
+ ret.dropDups = true;
+ }
+ nb++;
+ }
+ }
+ }
+ else {
+ Object.extend( ret , options );
+ }
+ }
+ else {
+ throw "can't handle: " + typeof( options );
+ }
+ /*
+ return ret;
+
+ var name;
+ var nTrue = 0;
+
+ if ( ! isObject( options ) ) {
+ options = [ options ];
+ }
+
+ if ( options.length ){
+ for( var i = 0; i < options.length; ++i ) {
+ var o = options[ i ];
+ if ( isString( o ) ) {
+ ret.name = o;
+ } else if ( typeof( o ) == "boolean" ) {
+ if ( o ) {
+ ++nTrue;
+ }
+ }
+ }
+ if ( nTrue > 0 ) {
+ ret.unique = true;
+ }
+ if ( nTrue > 1 ) {
+ ret.dropDups = true;
+ }
+ }
+*/
+ return ret;
+}
+
+DBCollection.prototype.createIndex = function( keys , options ){
+ var o = this._indexSpec( keys, options );
+ this._db.getCollection( "system.indexes" ).insert( o , true );
+}
+
+DBCollection.prototype.ensureIndex = function( keys , options ){
+ var name = this._indexSpec( keys, options ).name;
+ this._indexCache = this._indexCache || {};
+ if ( this._indexCache[ name ] ){
+ return;
+ }
+
+ this.createIndex( keys , options );
+ if ( this.getDB().getLastError() == "" ) {
+ this._indexCache[name] = true;
+ }
+}
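+
+// usage sketch (illustrative): repeat calls are cheap thanks to the index cache:
+//     db.users.ensureIndex( { email : 1 } , { unique : true } );
+//     db.users.ensureIndex( { email : 1 } , { unique : true } ); // no-op, name found in cache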
+
+DBCollection.prototype.resetIndexCache = function(){
+ this._indexCache = {};
+}
+
+DBCollection.prototype.reIndex = function() {
+ return this._db.runCommand({ reIndex: this.getName() });
+}
+
+DBCollection.prototype.dropIndexes = function(){
+ this.resetIndexCache();
+
+ var res = this._db.runCommand( { deleteIndexes: this.getName(), index: "*" } );
+ assert( res , "no result from dropIndex result" );
+ if ( res.ok )
+ return res;
+
+ if ( res.errmsg.match( /not found/ ) )
+ return res;
+
+ throw "error dropping indexes : " + tojson( res );
+}
+
+
+DBCollection.prototype.drop = function(){
+ if ( arguments.length > 0 )
+ throw "drop takes no argument";
+ this.resetIndexCache();
+ var ret = this._db.runCommand( { drop: this.getName() } );
+ if ( ! ret.ok ){
+ if ( ret.errmsg == "ns not found" )
+ return false;
+ throw "drop failed: " + tojson( ret );
+ }
+ return true;
+}
+
+DBCollection.prototype.findAndModify = function(args){
+ var cmd = { findandmodify: this.getName() };
+ for (var key in args){
+ cmd[key] = args[key];
+ }
+
+ var ret = this._db.runCommand( cmd );
+ if ( ! ret.ok ){
+ if (ret.errmsg == "No matching object found"){
+ return null;
+ }
+ throw "findAndModifyFailed failed: " + tojson( ret.errmsg );
+ }
+ return ret.value;
+}
+
+DBCollection.prototype.renameCollection = function( newName , dropTarget ){
+ return this._db._adminCommand( { renameCollection : this._fullName ,
+ to : this._db._name + "." + newName ,
+ dropTarget : dropTarget } )
+}
+
+DBCollection.prototype.validate = function(full) {
+ var cmd = { validate: this.getName() };
+
+ if (typeof(full) == 'object') // support arbitrary options here
+ Object.extend(cmd, full);
+ else
+ cmd.full = full;
+
+ var res = this._db.runCommand( cmd );
+
+ if (typeof(res.valid) == 'undefined') {
+ // old-style format just put everything in a string. Now using proper fields
+
+ res.valid = false;
+
+ var raw = res.result || res.raw;
+
+ if ( raw ){
+ var str = "-" + tojson( raw );
+ res.valid = ! ( str.match( /exception/ ) || str.match( /corrupt/ ) );
+
+ var p = /lastExtentSize:(\d+)/;
+ var r = p.exec( str );
+ if ( r ){
+ res.lastExtentSize = Number( r[1] );
+ }
+ }
+ }
+
+ return res;
+}
+
+DBCollection.prototype.getShardVersion = function(){
+ return this._db._adminCommand( { getShardVersion : this._fullName } );
+}
+
+DBCollection.prototype.getIndexes = function(){
+ return this.getDB().getCollection( "system.indexes" ).find( { ns : this.getFullName() } ).toArray();
+}
+
+DBCollection.prototype.getIndices = DBCollection.prototype.getIndexes;
+DBCollection.prototype.getIndexSpecs = DBCollection.prototype.getIndexes;
+
+DBCollection.prototype.getIndexKeys = function(){
+ return this.getIndexes().map(
+ function(i){
+ return i.key;
+ }
+ );
+}
+
+
+DBCollection.prototype.count = function( x ){
+ return this.find( x ).count();
+}
+
+/**
+ * Drop free lists. Normally not used.
+ * Note this only does the collection itself, not the namespaces of its indexes (see cleanAll).
+ */
+DBCollection.prototype.clean = function() {
+ return this._dbCommand( { clean: this.getName() } );
+}
+
+
+
+/**
+ * <p>Drop a specified index.</p>
+ *
+ * <p>
+ * Name is the name of the index in the system.indexes name field. (Run db.system.indexes.find() to
+ * see example data.)
+ * </p>
+ *
+ * <p>Note : alpha: space is not reclaimed </p>
+ * @param {String} name of index to delete.
+ * @return A result object. result.ok will be true if successful.
+ */
+DBCollection.prototype.dropIndex = function(index) {
+ assert(index , "need to specify index to dropIndex" );
+
+ if ( ! isString( index ) && isObject( index ) )
+ index = this._genIndexName( index );
+
+ var res = this._dbCommand( "deleteIndexes" ,{ index: index } );
+ this.resetIndexCache();
+ return res;
+}
+
+DBCollection.prototype.copyTo = function( newName ){
+ return this.getDB().eval(
+ function( collName , newName ){
+ var from = db[collName];
+ var to = db[newName];
+ to.ensureIndex( { _id : 1 } );
+ var count = 0;
+
+ var cursor = from.find();
+ while ( cursor.hasNext() ){
+ var o = cursor.next();
+ count++;
+ to.save( o );
+ }
+
+ return count;
+ } , this.getName() , newName
+ );
+}
+
+DBCollection.prototype.getCollection = function( subName ){
+ return this._db.getCollection( this._shortName + "." + subName );
+}
+
+DBCollection.prototype.stats = function( scale ){
+ return this._db.runCommand( { collstats : this._shortName , scale : scale } );
+}
+
+DBCollection.prototype.dataSize = function(){
+ return this.stats().size;
+}
+
+DBCollection.prototype.storageSize = function(){
+ return this.stats().storageSize;
+}
+
+DBCollection.prototype.totalIndexSize = function( verbose ){
+ var stats = this.stats();
+ if (verbose){
+ for (var ns in stats.indexSizes){
+ print( ns + "\t" + stats.indexSizes[ns] );
+ }
+ }
+ return stats.totalIndexSize;
+}
+
+
+DBCollection.prototype.totalSize = function(){
+ var total = this.storageSize();
+ var mydb = this._db;
+ var shortName = this._shortName;
+ this.getIndexes().forEach(
+ function( spec ){
+ var coll = mydb.getCollection( shortName + ".$" + spec.name );
+ var mysize = coll.storageSize();
+ //print( coll + "\t" + mysize + "\t" + tojson( coll.validate() ) );
+ total += coll.dataSize();
+ }
+ );
+ return total;
+}
+
+
+DBCollection.prototype.convertToCapped = function( bytes ){
+ if ( ! bytes )
+ throw "have to specify # of bytes";
+ return this._dbCommand( { convertToCapped : this._shortName , size : bytes } )
+}
+
+DBCollection.prototype.exists = function(){
+ return this._db.system.namespaces.findOne( { name : this._fullName } );
+}
+
+DBCollection.prototype.isCapped = function(){
+ var e = this.exists();
+ return ( e && e.options && e.options.capped ) ? true : false;
+}
+
+DBCollection.prototype._distinct = function( keyString , query ){
+ return this._dbCommand( { distinct : this._shortName , key : keyString , query : query || {} } );
+}
+
+DBCollection.prototype.distinct = function( keyString , query ){
+ var res = this._distinct( keyString , query );
+ if ( ! res.ok )
+ throw "distinct failed: " + tojson( res );
+ return res.values;
+}
+
+
+DBCollection.prototype.aggregate = function( ops ) {
+
+ var arr = ops;
+
+ if ( ! ops.length ) {
+ arr = [];
+ for ( var i=0; i<arguments.length; i++ ) {
+ arr.push( arguments[i] )
+ }
+ }
+
+ return this.runCommand( "aggregate" , { pipeline : arr } );
+}
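+
+// usage sketch (illustrative): stages may be passed as an array or as separate arguments:
+//     db.orders.aggregate( [ { $match : { status : "A" } } , { $group : { _id : "$cust" , n : { $sum : 1 } } } ] );
+//     db.orders.aggregate( { $match : { status : "A" } } , { $group : { _id : "$cust" , n : { $sum : 1 } } } );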
+
+DBCollection.prototype.group = function( params ){
+ params.ns = this._shortName;
+ return this._db.group( params );
+}
+
+DBCollection.prototype.groupcmd = function( params ){
+ params.ns = this._shortName;
+ return this._db.groupcmd( params );
+}
+
+MapReduceResult = function( db , o ){
+ Object.extend( this , o );
+ this._o = o;
+ this._keys = Object.keySet( o );
+ this._db = db;
+ if ( this.result != null ) {
+ this._coll = this._db.getCollection( this.result );
+ }
+}
+
+MapReduceResult.prototype._simpleKeys = function(){
+ return this._o;
+}
+
+MapReduceResult.prototype.find = function(){
+ if ( this.results )
+ return this.results;
+ return DBCollection.prototype.find.apply( this._coll , arguments );
+}
+
+MapReduceResult.prototype.drop = function(){
+ if ( this._coll ) {
+ return this._coll.drop();
+ }
+}
+
+/**
+* just for debugging really
+*/
+MapReduceResult.prototype.convertToSingleObject = function(){
+ var z = {};
+ var it = this.results != null ? this.results : this._coll.find();
+ it.forEach( function(a){ z[a._id] = a.value; } );
+ return z;
+}
+
+DBCollection.prototype.convertToSingleObject = function(valueField){
+ var z = {};
+ this.find().forEach( function(a){ z[a._id] = a[valueField]; } );
+ return z;
+}
+
+/**
+* @param optional object of optional fields;
+*/
+DBCollection.prototype.mapReduce = function( map , reduce , optionsOrOutString ){
+ var c = { mapreduce : this._shortName , map : map , reduce : reduce };
+ assert( optionsOrOutString , "need to supply an optionsOrOutString" )
+
+ if ( typeof( optionsOrOutString ) == "string" )
+ c["out"] = optionsOrOutString;
+ else
+ Object.extend( c , optionsOrOutString );
+
+ var raw = this._db.runCommand( c );
+ if ( ! raw.ok ){
+ __mrerror__ = raw;
+ throw "map reduce failed:" + tojson(raw);
+ }
+ return new MapReduceResult( this._db , raw );
+
+}
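+
+// usage sketch (map/reduce bodies and collection names are illustrative):
+//     var res = db.events.mapReduce(
+//         function(){ emit( this.type , 1 ); } ,
+//         function( key , vals ){ return Array.sum( vals ); } ,
+//         { out : "event_counts" } );
+//     res.find().forEach( printjson );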
+
+DBCollection.prototype.toString = function(){
+ return this.getFullName();
+}
+
+
+DBCollection.prototype.tojson = DBCollection.prototype.toString;
+
+DBCollection.prototype.shellPrint = DBCollection.prototype.toString;
+
+DBCollection.autocomplete = function(obj){
+ var colls = DB.autocomplete(obj.getDB());
+ var ret = [];
+ for (var i=0; i<colls.length; i++){
+ var c = colls[i];
+ if (c.length <= obj.getName().length) continue;
+ if (c.slice(0,obj.getName().length+1) != obj.getName()+'.') continue;
+
+ ret.push(c.slice(obj.getName().length+1));
+ }
+ return ret;
+}
+
+
+// Sharding additions
+
+/*
+Usage :
+
+mongo <mongos>
+> load('path-to-file/shardingAdditions.js')
+Loading custom sharding extensions...
+true
+
+> var collection = db.getMongo().getCollection("foo.bar")
+> collection.getShardDistribution() // prints statistics related to the collection's data distribution
+
+> collection.getSplitKeysForChunks() // generates split points for all chunks in the collection, based on the
+ // default maxChunkSize or alternately a specified chunk size
+> collection.getSplitKeysForChunks( 10 ) // Mb
+
+> var splitter = collection.getSplitKeysForChunks() // by default, the chunks are not split, the keys are just
+ // found. A splitter function is returned which will actually
+ // do the splits.
+
+> splitter() // ! Actually executes the splits on the cluster !
+
+*/
+
+DBCollection.prototype.getShardDistribution = function(){
+
+ var stats = this.stats()
+
+ if( ! stats.sharded ){
+ print( "Collection " + this + " is not sharded." )
+ return
+ }
+
+ var config = this.getMongo().getDB("config")
+
+ var numChunks = 0
+
+ for( var shard in stats.shards ){
+
+ var shardDoc = config.shards.findOne({ _id : shard })
+
+ print( "\nShard " + shard + " at " + shardDoc.host )
+
+ var shardStats = stats.shards[ shard ]
+
+ var chunks = config.chunks.find({ _id : sh._collRE( this ), shard : shard }).toArray()
+
+ numChunks += chunks.length
+
+ var estChunkData = shardStats.size / chunks.length
+ var estChunkCount = Math.floor( shardStats.count / chunks.length )
+
+ print( " data : " + sh._dataFormat( shardStats.size ) +
+ " docs : " + shardStats.count +
+ " chunks : " + chunks.length )
+ print( " estimated data per chunk : " + sh._dataFormat( estChunkData ) )
+ print( " estimated docs per chunk : " + estChunkCount )
+
+ }
+
+ print( "\nTotals" )
+ print( " data : " + sh._dataFormat( stats.size ) +
+ " docs : " + stats.count +
+ " chunks : " + numChunks )
+ for( var shard in stats.shards ){
+
+ var shardStats = stats.shards[ shard ]
+
+ var estDataPercent = Math.floor( shardStats.size / stats.size * 10000 ) / 100
+ var estDocPercent = Math.floor( shardStats.count / stats.count * 10000 ) / 100
+
+ print( " Shard " + shard + " contains " + estDataPercent + "% data, " + estDocPercent + "% docs in cluster, " +
+ "avg obj size on shard : " + sh._dataFormat( stats.shards[ shard ].avgObjSize ) )
+ }
+
+ print( "\n" )
+
+}
+
+
+DBCollection.prototype.getSplitKeysForChunks = function( chunkSize ){
+
+ var stats = this.stats()
+
+ if( ! stats.sharded ){
+ print( "Collection " + this + " is not sharded." )
+ return
+ }
+
+ var config = this.getMongo().getDB("config")
+
+ if( ! chunkSize ){
+ chunkSize = config.settings.findOne({ _id : "chunksize" }).value
+ print( "Chunk size not set, using default of " + chunkSize + "Mb" )
+ }
+ else{
+ print( "Using chunk size of " + chunkSize + "Mb" )
+ }
+
+ var shardDocs = config.shards.find().toArray()
+
+ var allSplitPoints = {}
+ var numSplits = 0
+
+ for( var i = 0; i < shardDocs.length; i++ ){
+
+ var shardDoc = shardDocs[i]
+ var shard = shardDoc._id
+ var host = shardDoc.host
+ var sconn = new Mongo( host )
+
+ var chunks = config.chunks.find({ _id : sh._collRE( this ), shard : shard }).toArray()
+
+ print( "\nGetting split points for chunks on shard " + shard + " at " + host )
+
+ var splitPoints = []
+
+ for( var j = 0; j < chunks.length; j++ ){
+ var chunk = chunks[j]
+ var result = sconn.getDB("admin").runCommand({ splitVector : this + "", min : chunk.min, max : chunk.max, maxChunkSize : chunkSize })
+ if( ! result.ok ){
+ print( " Had trouble getting split keys for chunk " + sh._pchunk( chunk ) + " :\n" )
+ printjson( result )
+ }
+ else{
+ splitPoints = splitPoints.concat( result.splitKeys )
+
+ if( result.splitKeys.length > 0 )
+ print( " Added " + result.splitKeys.length + " split points for chunk " + sh._pchunk( chunk ) )
+ }
+ }
+
+ print( "Total splits for shard " + shard + " : " + splitPoints.length )
+
+ numSplits += splitPoints.length
+ allSplitPoints[ shard ] = splitPoints
+
+ }
+
+ // Get most recent migration
+ var migration = config.changelog.find({ what : /^move.*/ }).sort({ time : -1 }).limit( 1 ).toArray()
+ if( migration.length == 0 )
+ print( "\nNo migrations found in changelog." )
+ else {
+ migration = migration[0]
+ print( "\nMost recent migration activity was on " + migration.ns + " at " + migration.time )
+ }
+
+ var admin = this.getMongo().getDB("admin")
+ var coll = this
+ var splitFunction = function(){
+
+ // Turn off the balancer, just to be safe
+ print( "Turning off balancer..." )
+ config.settings.update({ _id : "balancer" }, { $set : { stopped : true } }, true )
+ print( "Sleeping for 30s to allow balancers to detect change. To be extra safe, check config.changelog" +
+ " for recent migrations." )
+ sleep( 30000 )
+
+ for( shard in allSplitPoints ){
+ for( var i = 0; i < allSplitPoints[ shard ].length; i++ ){
+ var splitKey = allSplitPoints[ shard ][i]
+ print( "Splitting at " + tojson( splitKey ) )
+ printjson( admin.runCommand({ split : coll + "", middle : splitKey }) )
+ }
+ }
+
+ print( "Turning the balancer back on." )
+ config.settings.update({ _id : "balancer" }, { $set : { stopped : false } } )
+ sleep( 1 )
+ }
+
+ splitFunction.getSplitPoints = function(){ return allSplitPoints; }
+
+ print( "\nGenerated " + numSplits + " split keys, run output function to perform splits.\n" +
+ " ex : \n" +
+ " > var splitter = <collection>.getSplitKeysForChunks()\n" +
+ " > splitter() // Execute splits on cluster !\n" )
+
+ return splitFunction
+
+}
+
+DBCollection.prototype.setSlaveOk = function( value ) {
+ if( value == undefined ) value = true;
+ this._slaveOk = value;
+}
+
+DBCollection.prototype.getSlaveOk = function() {
+ if (this._slaveOk != undefined) return this._slaveOk;
+ return this._db.getSlaveOk();
+}
+
+DBCollection.prototype.getQueryOptions = function() {
+ var options = 0;
+ if (this.getSlaveOk()) options |= 4;
+ return options;
+}
+
diff --git a/src/mongo/shell/db.js b/src/mongo/shell/db.js
new file mode 100644
index 00000000000..6414e0351e7
--- /dev/null
+++ b/src/mongo/shell/db.js
@@ -0,0 +1,881 @@
+// db.js
+
+if ( typeof DB == "undefined" ){
+ DB = function( mongo , name ){
+ this._mongo = mongo;
+ this._name = name;
+ }
+}
+
+DB.prototype.getMongo = function(){
+ assert( this._mongo , "why no mongo!" );
+ return this._mongo;
+}
+
+DB.prototype.getSiblingDB = function( name ){
+ return this.getMongo().getDB( name );
+}
+
+DB.prototype.getSisterDB = DB.prototype.getSiblingDB;
+
+DB.prototype.getName = function(){
+ return this._name;
+}
+
+DB.prototype.stats = function(scale){
+ return this.runCommand( { dbstats : 1 , scale : scale } );
+}
+
+DB.prototype.getCollection = function( name ){
+ return new DBCollection( this._mongo , this , name , this._name + "." + name );
+}
+
+DB.prototype.commandHelp = function( name ){
+ var c = {};
+ c[name] = 1;
+ c.help = true;
+ var res = this.runCommand( c );
+ if ( ! res.ok )
+ throw res.errmsg;
+ return res.help;
+}
+
+DB.prototype.runCommand = function( obj ){
+ if ( typeof( obj ) == "string" ){
+ var n = {};
+ n[obj] = 1;
+ obj = n;
+ }
+ return this.getCollection( "$cmd" ).findOne( obj );
+}
+
+DB.prototype._dbCommand = DB.prototype.runCommand;
+
+DB.prototype.adminCommand = function( obj ){
+ if ( this._name == "admin" )
+ return this.runCommand( obj );
+ return this.getSiblingDB( "admin" ).runCommand( obj );
+}
+
+DB.prototype._adminCommand = DB.prototype.adminCommand; // alias old name
+
+DB.prototype.addUser = function( username , pass, readOnly ){
+ if ( pass == null || pass.length == 0 )
+ throw "password can't be empty";
+
+ readOnly = readOnly || false;
+ var c = this.getCollection( "system.users" );
+
+ var u = c.findOne( { user : username } ) || { user : username };
+ u.readOnly = readOnly;
+ u.pwd = hex_md5( username + ":mongo:" + pass );
+
+ c.save( u );
+ print( tojson( u ) );
+
+ // in mongod version 2.1.0-, this worked
+ var le = {};
+ try {
+ le = this.getLastErrorObj();
+ printjson( le )
+ }
+ catch (e) {}
+
+ if ( le.err )
+ throw "couldn't add user: " + le.err
+}
+
+DB.prototype.logout = function(){
+ return this.runCommand({logout : 1});
+}
+
+DB.prototype.removeUser = function( username ){
+ this.getCollection( "system.users" ).remove( { user : username } );
+}
+
+DB.prototype.__pwHash = function( nonce, username, pass ) {
+ return hex_md5( nonce + username + hex_md5( username + ":mongo:" + pass ) );
+}
+
+DB.prototype.auth = function( username , pass ){
+ var result = 0;
+ try {
+ result = this.getMongo().auth(this.getName(), username, pass);
+ }
+ catch (e) {
+ print(e);
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ Create a new collection in the database. Normally, collection creation is automatic. You would
+ use this function if you wish to specify special options on creation.
+
+ If the collection already exists, no action occurs.
+
+ <p>Options:</p>
+ <ul>
+ <li>
+ size: desired initial extent size for the collection. Must be <= 1000000000.
+ for fixed size (capped) collections, this size is the total/max size of the
+ collection.
+ </li>
+ <li>
+ capped: if true, this is a capped collection (where old data rolls out).
+ </li>
+ <li> max: maximum number of objects if capped (optional).</li>
+ </ul>
+
+ <p>Example: </p>
+
+ <code>db.createCollection("movies", { size: 10 * 1024 * 1024, capped:true } );</code>
+
+ * @param {String} name Name of new collection to create
+ * @param {Object} options Object with options for call. Options are listed above.
+ * @return SOMETHING_FIXME
+*/
+DB.prototype.createCollection = function(name, opt) {
+ var options = opt || {};
+ var cmd = { create: name, capped: options.capped, size: options.size };
+ if (options.max != undefined)
+ cmd.max = options.max;
+ if (options.autoIndexId != undefined)
+ cmd.autoIndexId = options.autoIndexId;
+ var res = this._dbCommand(cmd);
+ return res;
+}
+
+/**
+ * @deprecated use getProfilingStatus
+ * Returns the current profiling level of this database
+ * @return SOMETHING_FIXME or null on error
+ */
+DB.prototype.getProfilingLevel = function() {
+ var res = this._dbCommand( { profile: -1 } );
+ return res ? res.was : null;
+}
+
+/**
+ * @return the current profiling status
+ * example { was : 0, slowms : 100 }
+ * @return SOMETHING_FIXME or null on error
+ */
+DB.prototype.getProfilingStatus = function() {
+ var res = this._dbCommand( { profile: -1 } );
+ if ( ! res.ok )
+ throw "profile command failed: " + tojson( res );
+ delete res.ok
+ return res;
+}
+
+
+/**
+ Erase the entire database. (!)
+
+ * @return Object returned has member ok set to true if operation succeeds, false otherwise.
+*/
+DB.prototype.dropDatabase = function() {
+ if ( arguments.length )
+ throw "dropDatabase doesn't take arguments";
+ return this._dbCommand( { dropDatabase: 1 } );
+}
+
+/**
+ * Shuts down the database. Must be run while using the admin database.
+ * @param opts Options for shutdown. Possible options are:
+ * - force: (boolean) if the server should shut down, even if there is no
+ * up-to-date slave
+ * - timeoutSecs: (number) the server will continue checking over timeoutSecs
+ * if any other servers have caught up enough for it to shut down.
+ */
+DB.prototype.shutdownServer = function(opts) {
+ if( "admin" != this._name ){
+ return "shutdown command only works with the admin database; try 'use admin'";
+ }
+
+ cmd = {"shutdown" : 1};
+ opts = opts || {};
+ for (var o in opts) {
+ cmd[o] = opts[o];
+ }
+
+ try {
+ var res = this.runCommand(cmd);
+ if( res )
+ throw "shutdownServer failed: " + res.errmsg;
+ throw "shutdownServer failed";
+ }
+ catch ( e ){
+ assert( tojson( e ).indexOf( "error doing query: failed" ) >= 0 , "unexpected error: " + tojson( e ) );
+ print( "server should be down..." );
+ }
+}
+
+/**
+ Clone database on another server to here.
+ <p>
+ Generally, you should dropDatabase() first as otherwise the cloned information will MERGE
+ into whatever data is already present in this database. (That is however a valid way to use
+ clone if you are trying to do something intentionally, such as union three non-overlapping
+ databases into one.)
+ <p>
+ This is a low level administrative function that is not typically used.
+
+ * @param {String} from Where to clone from (dbhostname[:port]). May not be this database
+ (self) as you cannot clone to yourself.
+ * @return Object returned has member ok set to true if operation succeeds, false otherwise.
+ * See also: db.copyDatabase()
+*/
+DB.prototype.cloneDatabase = function(from) {
+ assert( isString(from) && from.length );
+ //this.resetIndexCache();
+ return this._dbCommand( { clone: from } );
+}
+
+
+/**
+ Clone collection on another server to here.
+ <p>
+ Generally, you should drop() first as otherwise the cloned information will MERGE
+ into whatever data is already present in this collection. (That is however a valid way to use
+ clone if you are trying to do something intentionally, such as union three non-overlapping
+ collections into one.)
+ <p>
+ This is a low level administrative function that is not typically used.
+
+ * @param {String} from mongod instance from which to clone (dbhostname:port). May
+ not be this mongod instance, as clone from self is not allowed.
+ * @param {String} collection name of collection to clone.
+ * @param {Object} query query specifying which elements of collection are to be cloned.
+ * @return Object returned has member ok set to true if operation succeeds, false otherwise.
+ * See also: db.cloneDatabase()
+ */
+DB.prototype.cloneCollection = function(from, collection, query) {
+ assert( isString(from) && from.length );
+ assert( isString(collection) && collection.length );
+ collection = this._name + "." + collection;
+ query = query || {};
+ //this.resetIndexCache();
+ return this._dbCommand( { cloneCollection:collection, from:from, query:query } );
+}
+
+
+/**
+ Copy database from one server or name to another server or name.
+
+ Generally, you should dropDatabase() first as otherwise the copied information will MERGE
+ into whatever data is already present in this database (and you will get duplicate objects
+ in collections potentially.)
+
+ For security reasons this function only works when executed on the "admin" db. However,
+ if you have access to said db, you can copy any database from one place to another.
+
+ This method provides a way to "rename" a database by copying it to a new db name and
+ location. Additionally, it effectively provides a repair facility.
+
+ * @param {String} fromdb database name from which to copy.
+ * @param {String} todb database name to copy to.
+ * @param {String} fromhost hostname of the database (and optionally, ":port") from which to
+ copy the data. default if unspecified is to copy from self.
+ * @return Object returned has member ok set to true if operation succeeds, false otherwise.
+ * See also: db.clone()
+*/
+DB.prototype.copyDatabase = function(fromdb, todb, fromhost, username, password) {
+ assert( isString(fromdb) && fromdb.length );
+ assert( isString(todb) && todb.length );
+ fromhost = fromhost || "";
+ if ( username && password ) {
+ var n = this._adminCommand( { copydbgetnonce : 1, fromhost:fromhost } );
+ return this._adminCommand( { copydb:1, fromhost:fromhost, fromdb:fromdb, todb:todb, username:username, nonce:n.nonce, key:this.__pwHash( n.nonce, username, password ) } );
+ } else {
+ return this._adminCommand( { copydb:1, fromhost:fromhost, fromdb:fromdb, todb:todb } );
+ }
+}
+
+/**
+ Repair database.
+
+ * @return Object returned has member ok set to true if operation succeeds, false otherwise.
+*/
+DB.prototype.repairDatabase = function() {
+ return this._dbCommand( { repairDatabase: 1 } );
+}
+
+
+DB.prototype.help = function() {
+ print("DB methods:");
+ print("\tdb.addUser(username, password[, readOnly=false])");
+ print("\tdb.adminCommand(nameOrDocument) - switches to 'admin' db, and runs command [ just calls db.runCommand(...) ]");
+ print("\tdb.auth(username, password)");
+ print("\tdb.cloneDatabase(fromhost)");
+ print("\tdb.commandHelp(name) returns the help for the command");
+ print("\tdb.copyDatabase(fromdb, todb, fromhost)");
+ print("\tdb.createCollection(name, { size : ..., capped : ..., max : ... } )");
+ print("\tdb.currentOp() displays currently executing operations in the db");
+ print("\tdb.dropDatabase()");
+ print("\tdb.eval(func, args) run code server-side");
+ print("\tdb.fsyncLock() flush data to disk and lock server for backups");
+ print("\tdb.fsyncUnlock() unlocks server following a db.fsyncLock()");
+ print("\tdb.getCollection(cname) same as db['cname'] or db.cname");
+ print("\tdb.getCollectionNames()");
+ print("\tdb.getLastError() - just returns the err msg string");
+ print("\tdb.getLastErrorObj() - return full status object");
+ print("\tdb.getMongo() get the server connection object");
+ print("\tdb.getMongo().setSlaveOk() allow queries on a replication slave server");
+ print("\tdb.getName()");
+ print("\tdb.getPrevError()");
+ print("\tdb.getProfilingLevel() - deprecated");
+ print("\tdb.getProfilingStatus() - returns if profiling is on and slow threshold");
+ print("\tdb.getReplicationInfo()");
+ print("\tdb.getSiblingDB(name) get the db at the same server as this one");
+ print("\tdb.isMaster() check replica primary status");
+ print("\tdb.killOp(opid) kills the current operation in the db");
+ print("\tdb.listCommands() lists all the db commands");
+ print("\tdb.loadServerScripts() loads all the scripts in db.system.js");
+ print("\tdb.logout()");
+ print("\tdb.printCollectionStats()");
+ print("\tdb.printReplicationInfo()");
+ print("\tdb.printShardingStatus()");
+ print("\tdb.printSlaveReplicationInfo()");
+ print("\tdb.removeUser(username)");
+ print("\tdb.repairDatabase()");
+ print("\tdb.resetError()");
+ print("\tdb.runCommand(cmdObj) run a database command. if cmdObj is a string, turns it into { cmdObj : 1 }");
+ print("\tdb.serverStatus()");
+ print("\tdb.setProfilingLevel(level,<slowms>) 0=off 1=slow 2=all");
+ print("\tdb.setVerboseShell(flag) display extra information in shell output");
+ print("\tdb.shutdownServer()");
+ print("\tdb.stats()");
+ print("\tdb.version() current version of the server");
+
+ return __magicNoPrint;
+}
+
+DB.prototype.printCollectionStats = function(){
+ var mydb = this;
+ this.getCollectionNames().forEach(
+ function(z){
+ print( z );
+ printjson( mydb.getCollection(z).stats() );
+ print( "---" );
+ }
+ );
+}
+
+/**
+ * <p> Set profiling level for your db. Profiling gathers stats on query performance. </p>
+ *
+ * <p>Default is off, and resets to off on a database restart -- so if you want it on,
+ * turn it on periodically. </p>
+ *
+ * <p>Levels :</p>
+ * <ul>
+ * <li>0=off</li>
+ * <li>1=log very slow operations; optional argument slowms specifies slowness threshold</li>
+ * <li>2=log all</li>
+ * @param {String} level Desired level of profiling
+ * @param {String} slowms For slow logging, query duration that counts as slow (default 100ms)
+ * @return SOMETHING_FIXME or null on error
+ */
+DB.prototype.setProfilingLevel = function(level,slowms) {
+
+ if (level < 0 || level > 2) {
+ throw { dbSetProfilingException : "input level " + level + " is out of range [0..2]" };
+ }
+
+ var cmd = { profile: level };
+ if ( slowms )
+ cmd["slowms"] = slowms;
+ return this._dbCommand( cmd );
+}
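+
+// usage sketch (illustrative): log operations slower than 200ms:
+//     db.setProfilingLevel( 1 , 200 );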
+
+DB.prototype._initExtraInfo = function() {
+ if ( typeof _verboseShell === 'undefined' || !_verboseShell ) return;
+ this.startTime = new Date().getTime();
+}
+
+DB.prototype._getExtraInfo = function(action) {
+ if ( typeof _verboseShell === 'undefined' || !_verboseShell ) {
+ __callLastError = true;
+ return;
+ }
+
+ // explicit w:1 so that replset getLastErrorDefaults aren't used here which would be bad.
+ var res = this.getLastErrorCmd(1);
+ if (res) {
+ if (res.err != undefined && res.err != null) {
+ // error occured, display it
+ print(res.err);
+ return;
+ }
+
+ var info = action + " ";
+ // hack for inserted because res.n is 0
+ info += action != "Inserted" ? res.n : 1;
+ if (res.n > 0 && res.updatedExisting != undefined) info += " " + (res.updatedExisting ? "existing" : "new")
+ info += " record(s)";
+ var time = new Date().getTime() - this.startTime;
+ info += " in " + time + "ms";
+ print(info);
+ }
+}
+
+/**
+ * <p> Evaluate a js expression at the database server.</p>
+ *
+ * <p>Useful if you need to touch a lot of data lightly; in such a scenario
+ * the network transfer of the data could be a bottleneck. A good example
+ * is "select count(*)" -- can be done server side via this mechanism.
+ * </p>
+ *
+ * <p>
+ * If the eval fails, an exception is thrown of the form:
+ * </p>
+ * <code>{ dbEvalException: { retval: functionReturnValue, ok: num [, errno: num] [, errmsg: str] } }</code>
+ *
+ * <p>Example: </p>
+ * <code>print( "mycount: " + db.eval( function(){db.mycoll.find({},{_id:ObjId()}).length();} );</code>
+ *
+ * @param {Function} jsfunction Javascript function to run on server. Note this it not a closure, but rather just "code".
+ * @return result of your function, or null if error
+ *
+ */
+DB.prototype.eval = function(jsfunction) {
+ var cmd = { $eval : jsfunction };
+ if ( arguments.length > 1 ) {
+ cmd.args = argumentsToArray( arguments ).slice(1);
+ }
+
+ var res = this._dbCommand( cmd );
+
+ if (!res.ok)
+ throw tojson( res );
+
+ return res.retval;
+}
+
+DB.prototype.dbEval = DB.prototype.eval;
+
+
+/**
+ *
+ * <p>
+ * Similar to SQL group by. For example: </p>
+ *
+ * <code>select a,b,sum(c) csum from coll where active=1 group by a,b</code>
+ *
+ * <p>
+ * corresponds to the following in MongoDB:
+ * </p>
+ *
+ * <code>
+ db.group(
+ {
+ ns: "coll",
+ key: { a:true, b:true },
+ // keyf: ...,
+ cond: { active:1 },
+ reduce: function(obj,prev) { prev.csum += obj.c; } ,
+ initial: { csum: 0 }
+ });
+ </code>
+ *
+ *
+ * <p>
+ * An array of grouped items is returned. The array must fit in RAM, thus this function is not
+ * suitable when the return set is extremely large.
+ * </p>
+ * <p>
+ * To order the grouped data, simply sort it client side upon return.
+ * <p>
+ Defaults
+ cond may be null if you want to run against all rows in the collection
+ keyf is a function which takes an object and returns the desired key. set either key or keyf (not both).
+ * </p>
+*/
+DB.prototype.groupeval = function(parmsObj) {
+
+ var groupFunction = function() {
+ var parms = args[0];
+ var c = db[parms.ns].find(parms.cond||{});
+ var map = new Map();
+ var pks = parms.key ? Object.keySet( parms.key ) : null;
+ var pkl = pks ? pks.length : 0;
+ var key = {};
+
+ while( c.hasNext() ) {
+ var obj = c.next();
+ if ( pks ) {
+ for( var i=0; i<pkl; i++ ){
+ var k = pks[i];
+ key[k] = obj[k];
+ }
+ }
+ else {
+ key = parms.$keyf(obj);
+ }
+
+ var aggObj = map.get(key);
+ if( aggObj == null ) {
+ var newObj = Object.extend({}, key); // clone
+ aggObj = Object.extend(newObj, parms.initial)
+ map.put( key , aggObj );
+ }
+ parms.$reduce(obj, aggObj);
+ }
+
+ return map.values();
+ }
+
+ return this.eval(groupFunction, this._groupFixParms( parmsObj ));
+}
+
+DB.prototype.groupcmd = function( parmsObj ){
+ var ret = this.runCommand( { "group" : this._groupFixParms( parmsObj ) } );
+ if ( ! ret.ok ){
+ throw "group command failed: " + tojson( ret );
+ }
+ return ret.retval;
+}
+
+DB.prototype.group = DB.prototype.groupcmd;
+
+DB.prototype._groupFixParms = function( parmsObj ){
+ var parms = Object.extend({}, parmsObj);
+
+ if( parms.reduce ) {
+ parms.$reduce = parms.reduce; // must have $ to pass to db
+ delete parms.reduce;
+ }
+
+ if( parms.keyf ) {
+ parms.$keyf = parms.keyf;
+ delete parms.keyf;
+ }
+
+ return parms;
+}
+
+DB.prototype.resetError = function(){
+ return this.runCommand( { reseterror : 1 } );
+}
+
+DB.prototype.forceError = function(){
+ return this.runCommand( { forceerror : 1 } );
+}
+
+DB.prototype.getLastError = function( w , wtimeout ){
+ var res = this.getLastErrorObj( w , wtimeout );
+ if ( ! res.ok )
+ throw "getlasterror failed: " + tojson( res );
+ return res.err;
+}
+DB.prototype.getLastErrorObj = function( w , wtimeout ){
+ var cmd = { getlasterror : 1 };
+ if ( w ){
+ cmd.w = w;
+ if ( wtimeout )
+ cmd.wtimeout = wtimeout;
+ }
+ var res = this.runCommand( cmd );
+
+ if ( ! res.ok )
+ throw "getlasterror failed: " + tojson( res );
+ return res;
+}
+DB.prototype.getLastErrorCmd = DB.prototype.getLastErrorObj;
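+
+// usage sketch (illustrative): wait for the last write to reach 2 members, up to 5 seconds:
+//     var gle = db.getLastErrorObj( 2 , 5000 );
+//     if ( gle.err ) print( "write failed: " + gle.err );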
+
+
+/* Return the last error which has occurred, even if not the very last error.
+
+ Returns:
+ { err : <error message>, nPrev : <how_many_ops_back_occurred>, ok : 1 }
+
+ result.err will be null if no error has occurred.
+ */
+DB.prototype.getPrevError = function(){
+ return this.runCommand( { getpreverror : 1 } );
+}
+
+DB.prototype.getCollectionNames = function(){
+ var all = [];
+
+ var nsLength = this._name.length + 1;
+
+ var c = this.getCollection( "system.namespaces" ).find();
+ while ( c.hasNext() ){
+ var name = c.next().name;
+
+ if ( name.indexOf( "$" ) >= 0 && name.indexOf( ".oplog.$" ) < 0 )
+ continue;
+
+ all.push( name.substring( nsLength ) );
+ }
+
+ return all.sort();
+}
+
+DB.prototype.tojson = function(){
+ return this._name;
+}
+
+DB.prototype.toString = function(){
+ return this._name;
+}
+
+DB.prototype.isMaster = function () { return this.runCommand("isMaster"); }
+
+DB.prototype.currentOp = function( arg ){
+ var q = {}
+ if ( arg ) {
+ if ( typeof( arg ) == "object" )
+ Object.extend( q , arg );
+ else if ( arg )
+ q["$all"] = true;
+ }
+ return this.$cmd.sys.inprog.findOne( q );
+}
+DB.prototype.currentOP = DB.prototype.currentOp;
+
+DB.prototype.killOp = function(op) {
+ if( !op )
+ throw "no opNum to kill specified";
+ return this.$cmd.sys.killop.findOne({'op':op});
+}
+DB.prototype.killOP = DB.prototype.killOp;
+
+DB.tsToSeconds = function(x){
+ if ( x.t && x.i )
+ return x.t / 1000;
+ return x / 4294967296; // low 32 bits are ordinal #s within a second
+}
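+
+// e.g. (illustrative): DB.tsToSeconds( op.ts ) yields seconds since the epoch for an
+// oplog entry's timestamp, whether ts is a { t , i } object or a packed 64-bit value.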
+
+/**
+ Get a replication log information summary.
+ <p>
+    This command is for the database/cloud administrator and not applicable to most databases.
+ It is only used with the local database. One might invoke from the JS shell:
+ <pre>
+ use local
+ db.getReplicationInfo();
+ </pre>
+ It is assumed that this database is a replication master -- the information returned is
+ about the operation log stored at local.oplog.$main on the replication master. (It also
+ works on a machine in a replica pair: for replica pairs, both machines are "masters" from
+    an internal database perspective.)
+ <p>
+ * @return Object timeSpan: time span of the oplog from start to end if slave is more out
+ * of date than that, it can't recover without a complete resync
+*/
+DB.prototype.getReplicationInfo = function() {
+ var db = this.getSiblingDB("local");
+
+ var result = { };
+ var oplog;
+ if (db.system.namespaces.findOne({name:"local.oplog.rs"}) != null) {
+ oplog = 'oplog.rs';
+ }
+ else if (db.system.namespaces.findOne({name:"local.oplog.$main"}) != null) {
+ oplog = 'oplog.$main';
+ }
+ else {
+ result.errmsg = "neither master/slave nor replica set replication detected";
+ return result;
+ }
+
+ var ol_entry = db.system.namespaces.findOne({name:"local."+oplog});
+ if( ol_entry && ol_entry.options ) {
+ result.logSizeMB = ol_entry.options.size / ( 1024 * 1024 );
+ } else {
+ result.errmsg = "local."+oplog+", or its options, not found in system.namespaces collection";
+ return result;
+ }
+ ol = db.getCollection(oplog);
+
+ result.usedMB = ol.stats().size / ( 1024 * 1024 );
+ result.usedMB = Math.ceil( result.usedMB * 100 ) / 100;
+
+ var firstc = ol.find().sort({$natural:1}).limit(1);
+ var lastc = ol.find().sort({$natural:-1}).limit(1);
+ if( !firstc.hasNext() || !lastc.hasNext() ) {
+ result.errmsg = "objects not found in local.oplog.$main -- is this a new and empty db instance?";
+ result.oplogMainRowCount = ol.count();
+ return result;
+ }
+
+ var first = firstc.next();
+ var last = lastc.next();
+ {
+ var tfirst = first.ts;
+ var tlast = last.ts;
+
+ if( tfirst && tlast ) {
+ tfirst = DB.tsToSeconds( tfirst );
+ tlast = DB.tsToSeconds( tlast );
+ result.timeDiff = tlast - tfirst;
+ result.timeDiffHours = Math.round(result.timeDiff / 36)/100;
+ result.tFirst = (new Date(tfirst*1000)).toString();
+ result.tLast = (new Date(tlast*1000)).toString();
+ result.now = Date();
+ }
+ else {
+ result.errmsg = "ts element not found in oplog objects";
+ }
+ }
+
+ return result;
+};
+
+DB.prototype.printReplicationInfo = function() {
+ var result = this.getReplicationInfo();
+ if( result.errmsg ) {
+ if (!this.isMaster().ismaster) {
+ print("this is a slave, printing slave replication info.");
+ this.printSlaveReplicationInfo();
+ return;
+ }
+ print(tojson(result));
+ return;
+ }
+ print("configured oplog size: " + result.logSizeMB + "MB");
+ print("log length start to end: " + result.timeDiff + "secs (" + result.timeDiffHours + "hrs)");
+ print("oplog first event time: " + result.tFirst);
+ print("oplog last event time: " + result.tLast);
+ print("now: " + result.now);
+}
+
+DB.prototype.printSlaveReplicationInfo = function() {
+ function getReplLag(st) {
+ var now = new Date();
+ print("\t syncedTo: " + st.toString() );
+ var ago = (now-st)/1000;
+ var hrs = Math.round(ago/36)/100;
+ print("\t\t = " + Math.round(ago) + " secs ago (" + hrs + "hrs)");
+ };
+
+ function g(x) {
+ assert( x , "how could this be null (printSlaveReplicationInfo gx)" )
+ print("source: " + x.host);
+ if ( x.syncedTo ){
+ var st = new Date( DB.tsToSeconds( x.syncedTo ) * 1000 );
+ getReplLag(st);
+ }
+ else {
+ print( "\t doing initial sync" );
+ }
+ };
+
+ function r(x) {
+ assert( x , "how could this be null (printSlaveReplicationInfo rx)" );
+ if ( x.state == 1 ) {
+ return;
+ }
+
+ print("source: " + x.name);
+ if ( x.optime ) {
+ getReplLag(x.optimeDate);
+ }
+ else {
+ print( "\t no replication info, yet. State: " + x.stateStr );
+ }
+ };
+
+ var L = this.getSiblingDB("local");
+
+ if (L.system.replset.count() != 0) {
+ var status = this.adminCommand({'replSetGetStatus' : 1});
+ status.members.forEach(r);
+ }
+ else if( L.sources.count() != 0 ) {
+ L.sources.find().forEach(g);
+ }
+ else {
+ print("local.sources is empty; is this db a --slave?");
+ return;
+ }
+}
+
+DB.prototype.serverBuildInfo = function(){
+ return this._adminCommand( "buildinfo" );
+}
+
+DB.prototype.serverStatus = function(){
+ return this._adminCommand( "serverStatus" );
+}
+
+DB.prototype.serverCmdLineOpts = function(){
+ return this._adminCommand( "getCmdLineOpts" );
+}
+
+DB.prototype.version = function(){
+ return this.serverBuildInfo().version;
+}
+
+DB.prototype.serverBits = function(){
+ return this.serverBuildInfo().bits;
+}
+
+DB.prototype.listCommands = function(){
+ var x = this.runCommand( "listCommands" );
+ for ( var name in x.commands ){
+ var c = x.commands[name];
+
+ var s = name + ": ";
+
+ switch ( c.lockType ){
+ case -1: s += "read-lock"; break;
+ case 0: s += "no-lock"; break;
+ case 1: s += "write-lock"; break;
+ default: s += c.lockType;
+ }
+
+ if (c.adminOnly) s += " adminOnly ";
+ if (c.adminOnly) s += " slaveOk ";
+
+ s += "\n ";
+ s += c.help.replace(/\n/g, '\n ');
+ s += "\n";
+
+ print( s );
+ }
+}
+
+DB.prototype.printShardingStatus = function( verbose ){
+ printShardingStatus( this.getSiblingDB( "config" ) , verbose );
+}
+
+DB.prototype.fsyncLock = function() {
+ return this.adminCommand({fsync:1, lock:true});
+}
+
+DB.prototype.fsyncUnlock = function() {
+ return this.getSiblingDB("admin").$cmd.sys.unlock.findOne()
+}
+
+DB.autocomplete = function(obj){
+ var colls = obj.getCollectionNames();
+ var ret=[];
+ for (var i=0; i<colls.length; i++){
+ if (colls[i].match(/^[a-zA-Z0-9_.\$]+$/))
+ ret.push(colls[i]);
+ }
+ return ret;
+}
+
+DB.prototype.setSlaveOk = function( value ) {
+ if( value == undefined ) value = true;
+ this._slaveOk = value;
+}
+
+DB.prototype.getSlaveOk = function() {
+ if (this._slaveOk != undefined) return this._slaveOk;
+ return this._mongo.getSlaveOk();
+}
+
+/* Loads any scripts contained in db.system.js into the client shell.
+*/
+DB.prototype.loadServerScripts = function(){
+    this.getCollection( "system.js" ).find().forEach( function(u){ eval(u._id + " = " + u.value); } );
+} \ No newline at end of file
diff --git a/src/mongo/shell/dbshell.cpp b/src/mongo/shell/dbshell.cpp
new file mode 100644
index 00000000000..998301d1ee6
--- /dev/null
+++ b/src/mongo/shell/dbshell.cpp
@@ -0,0 +1,962 @@
+// dbshell.cpp
+/*
+ * Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include <stdio.h>
+#include <string.h>
+
+#include "../third_party/linenoise/linenoise.h"
+#include "../scripting/engine.h"
+#include "../client/dbclient.h"
+#include "../util/unittest.h"
+#include "../db/cmdline.h"
+#include "utils.h"
+#include "../util/password.h"
+#include "../util/version.h"
+#include "../util/goodies.h"
+#include "../util/file.h"
+#include "../db/repl/rs_member.h"
+
+using namespace std;
+using namespace boost::filesystem;
+using namespace mongo;
+
+string historyFile;
+bool gotInterrupted = false;
+bool inMultiLine = false;
+static volatile bool atPrompt = false; // can eval before getting to prompt
+bool autoKillOp = false;
+
+#if !defined(__freebsd__) && !defined(__openbsd__) && !defined(_WIN32)
+// this is for ctrl-c handling
+#include <setjmp.h>
+jmp_buf jbuf;
+#endif
+
+namespace mongo {
+
+ Scope * shellMainScope;
+
+ extern bool dbexitCalled;
+}
+
+void generateCompletions( const string& prefix , vector<string>& all ) {
+ if ( prefix.find( '"' ) != string::npos )
+ return;
+
+ try {
+ BSONObj args = BSON( "0" << prefix );
+ shellMainScope->invokeSafe( "function callShellAutocomplete(x) {shellAutocomplete(x)}", &args, 0, 1000 );
+ BSONObjBuilder b;
+ shellMainScope->append( b , "" , "__autocomplete__" );
+ BSONObj res = b.obj();
+ BSONObj arr = res.firstElement().Obj();
+
+ BSONObjIterator i( arr );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ all.push_back( e.String() );
+ }
+ }
+ catch ( ... ) {
+ }
+}
+
+void completionHook( const char* text , linenoiseCompletions* lc ) {
+ vector<string> all;
+ generateCompletions( text , all );
+
+ for ( unsigned i = 0; i < all.size(); ++i )
+ linenoiseAddCompletion( lc , (char*)all[i].c_str() );
+}
+
+void shellHistoryInit() {
+ stringstream ss;
+ const char * h = shellUtils::getUserDir();
+ if ( h )
+ ss << h << "/";
+ ss << ".dbshell";
+ historyFile = ss.str();
+
+ linenoiseHistoryLoad( historyFile.c_str() );
+ linenoiseSetCompletionCallback( completionHook );
+}
+
+void shellHistoryDone() {
+ linenoiseHistorySave( historyFile.c_str() );
+ linenoiseHistoryFree();
+}
+void shellHistoryAdd( const char * line ) {
+ if ( line[0] == '\0' )
+ return;
+
+ // dont record duplicate lines
+ static string lastLine;
+ if ( lastLine == line )
+ return;
+ lastLine = line;
+
+ if ( strstr( line, ".auth") == NULL )
+ linenoiseHistoryAdd( line );
+}
+
+#ifdef CTRLC_HANDLE
+void intr( int sig ) {
+ longjmp( jbuf , 1 );
+}
+#endif
+
+void killOps() {
+ if ( mongo::shellUtils::_nokillop || mongo::shellUtils::_allMyUris.size() == 0 )
+ return;
+
+ if ( atPrompt )
+ return;
+
+ sleepmillis(10); // give current op a chance to finish
+
+ for( map< string, set<string> >::const_iterator i = shellUtils::_allMyUris.begin(); i != shellUtils::_allMyUris.end(); ++i ) {
+ string errmsg;
+ ConnectionString cs = ConnectionString::parse( i->first, errmsg );
+ if (!cs.isValid()) continue;
+ boost::scoped_ptr<DBClientWithCommands> conn( cs.connect( errmsg ) );
+ if (!conn) continue;
+
+ const set<string>& uris = i->second;
+
+ BSONObj inprog = conn->findOne( "admin.$cmd.sys.inprog", Query() )["inprog"].embeddedObject().getOwned();
+ BSONForEach( op, inprog ) {
+ if ( uris.count( op["client"].String() ) ) {
+ ONCE if ( !autoKillOp ) {
+ cout << endl << "do you want to kill the current op(s) on the server? (y/n): ";
+ cout.flush();
+
+ char yn;
+ cin >> yn;
+
+ if ( yn != 'y' && yn != 'Y' )
+ return;
+ }
+
+ conn->findOne( "admin.$cmd.sys.killop", QUERY( "op"<< op["opid"] ) );
+ }
+ }
+ }
+}
+
+void quitNicely( int sig ) {
+ mongo::dbexitCalled = true;
+ if ( sig == SIGINT && inMultiLine ) {
+        gotInterrupted = true;
+ return;
+ }
+
+#if !defined(_WIN32)
+ if ( sig == SIGPIPE )
+ mongo::rawOut( "mongo got signal SIGPIPE\n" );
+#endif
+
+ killOps();
+ shellHistoryDone();
+ exit(0);
+}
+
+// the returned string is allocated with strdup() or malloc() and must be freed by calling free()
+char * shellReadline( const char * prompt , int handlesigint = 0 ) {
+ atPrompt = true;
+
+#ifdef CTRLC_HANDLE
+ if ( ! handlesigint ) {
+ char* ret = linenoise( prompt );
+ atPrompt = false;
+ return ret;
+ }
+ if ( setjmp( jbuf ) ) {
+        gotInterrupted = true;
+ sigrelse(SIGINT);
+ signal( SIGINT , quitNicely );
+ return 0;
+ }
+ signal( SIGINT , intr );
+#endif
+
+ char * ret = linenoise( prompt );
+ if ( ! ret ) {
+ gotInterrupted = true; // got ^C, break out of multiline
+ }
+
+ signal( SIGINT , quitNicely );
+ atPrompt = false;
+ return ret;
+}
+
+#ifdef _WIN32
+const char * strsignal(int sig){
+ switch (sig){
+ case SIGINT: return "SIGINT";
+ case SIGTERM: return "SIGTERM";
+ case SIGABRT: return "SIGABRT";
+ case SIGSEGV: return "SIGSEGV";
+ case SIGFPE: return "SIGFPE";
+ default: return "unknown";
+ }
+}
+#endif
+
+void quitAbruptly( int sig ) {
+ ostringstream ossSig;
+ ossSig << "mongo got signal " << sig << " (" << strsignal( sig ) << "), stack trace: " << endl;
+ mongo::rawOut( ossSig.str() );
+
+ ostringstream ossBt;
+ mongo::printStackTrace( ossBt );
+ mongo::rawOut( ossBt.str() );
+
+ mongo::shellUtils::KillMongoProgramInstances();
+ exit( 14 );
+}
+
+// this will be called in certain C++ error cases, for example if there are
+// two active exceptions
+void myterminate() {
+ mongo::rawOut( "terminate() called in shell, printing stack:" );
+ mongo::printStackTrace();
+ exit( 14 );
+}
+
+void setupSignals() {
+ signal( SIGINT , quitNicely );
+ signal( SIGTERM , quitNicely );
+ signal( SIGABRT , quitAbruptly );
+ signal( SIGSEGV , quitAbruptly );
+ signal( SIGFPE , quitAbruptly );
+
+#if !defined(_WIN32) // surprisingly these are the only ones that don't work on windows
+ signal( SIGPIPE , quitNicely ); // Maybe just log and continue?
+ signal( SIGBUS , quitAbruptly );
+#endif
+
+ set_terminate( myterminate );
+}
+
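+// Combine the positional db address with any --host/--port options into a
+// single "host:port/db" connection string. A few examples of the mapping:
+//   fixHost( "foo", "", "" )           -> "foo"
+//   fixHost( "192.168.0.5", "", "" )   -> "192.168.0.5/test"
+//   fixHost( "foo", "remote", "9999" ) -> "remote:9999/foo"
+//   fixHost( "foo", "", "9999" )       -> "127.0.0.1:9999/foo"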
+string fixHost( string url , string host , string port ) {
+ //cout << "fixHost url: " << url << " host: " << host << " port: " << port << endl;
+
+ if ( host.size() == 0 && port.size() == 0 ) {
+ if ( url.find( "/" ) == string::npos ) {
+ // check for ips
+ if ( url.find( "." ) != string::npos )
+ return url + "/test";
+
+ if ( url.rfind( ":" ) != string::npos &&
+ isdigit( url[url.rfind(":")+1] ) )
+ return url + "/test";
+ }
+ return url;
+ }
+
+ if ( url.find( "/" ) != string::npos ) {
+ cerr << "url can't have host or port if you specify them individually" << endl;
+ exit(-1);
+ }
+
+ if ( host.size() == 0 )
+ host = "127.0.0.1";
+
+ string newurl = host;
+ if ( port.size() > 0 )
+ newurl += ":" + port;
+ else if ( host.find(':') == string::npos ) {
+        // append the default port; skipped when the host already contains ':'
+        // (e.g. a bare IPv6 address), where the right place to add it is ambiguous
+ newurl += ":27017";
+ }
+
+ newurl += "/" + url;
+
+ return newurl;
+}
+
+static string OpSymbols = "~!%^&*-+=|:,<>/?.";
+
+bool isOpSymbol( char c ) {
+ for ( size_t i = 0; i < OpSymbols.size(); i++ )
+ if ( OpSymbols[i] == c ) return true;
+ return false;
+}
+
+bool isUseCmd( string code ) {
+ string cmd = code;
+    if ( cmd.find( " " ) != string::npos )
+ cmd = cmd.substr( 0 , cmd.find( " " ) );
+ return cmd == "use";
+}
+
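+// Heuristic used by the multi-line prompt: input is considered complete when
+// braces and parens balance out and the text doesn't end in a dangling
+// operator. Line comments, string literals, and regex-escaped slashes are
+// skipped so that delimiters inside them don't count; "x ++" is complete
+// while "x +" is not (see BalancedTest below).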
+bool isBalanced( string code ) {
+ if (isUseCmd( code ))
+ return true; // don't balance "use <dbname>" in case dbname contains special chars
+ int brackets = 0;
+ int parens = 0;
+ bool danglingOp = false;
+
+ for ( size_t i=0; i<code.size(); i++ ) {
+ switch( code[i] ) {
+ case '/':
+ if ( i + 1 < code.size() && code[i+1] == '/' ) {
+                while ( i < code.size() && code[i] != '\n' )
+ i++;
+ }
+ continue;
+ case '{': brackets++; break;
+ case '}': if ( brackets <= 0 ) return true; brackets--; break;
+ case '(': parens++; break;
+ case ')': if ( parens <= 0 ) return true; parens--; break;
+ case '"':
+ i++;
+ while ( i < code.size() && code[i] != '"' ) i++;
+ break;
+ case '\'':
+ i++;
+ while ( i < code.size() && code[i] != '\'' ) i++;
+ break;
+ case '\\':
+ if ( i + 1 < code.size() && code[i+1] == '/' ) i++;
+ break;
+ case '+':
+ case '-':
+ if ( i + 1 < code.size() && code[i+1] == code[i] ) {
+ i++;
+ continue; // postfix op (++/--) can't be a dangling op
+ }
+ break;
+ }
+ if ( i >= code.size() ) {
+ danglingOp = false;
+ break;
+ }
+ if ( isOpSymbol( code[i] ) ) danglingOp = true;
+ else if ( !std::isspace( code[i] ) ) danglingOp = false;
+ }
+
+ return brackets == 0 && parens == 0 && !danglingOp;
+}
+
+using mongo::asserted;
+
+struct BalancedTest : public mongo::UnitTest {
+public:
+ void run() {
+ assert( isBalanced( "x = 5" ) );
+ assert( isBalanced( "function(){}" ) );
+ assert( isBalanced( "function(){\n}" ) );
+ assert( ! isBalanced( "function(){" ) );
+ assert( isBalanced( "x = \"{\";" ) );
+ assert( isBalanced( "// {" ) );
+ assert( ! isBalanced( "// \n {" ) );
+ assert( ! isBalanced( "\"//\" {" ) );
+ assert( isBalanced( "{x:/x\\//}" ) );
+ assert( ! isBalanced( "{ \\/// }" ) );
+ assert( isBalanced( "x = 5 + y ") );
+ assert( ! isBalanced( "x = ") );
+ assert( ! isBalanced( "x = // hello") );
+ assert( ! isBalanced( "x = 5 +") );
+ assert( isBalanced( " x ++") );
+ assert( isBalanced( "-- x") );
+ assert( !isBalanced( "a.") );
+ assert( !isBalanced( "a. ") );
+ assert( isBalanced( "a.b") );
+ }
+} balanced_test;
+
+string finishCode( string code ) {
+ while ( ! isBalanced( code ) ) {
+ inMultiLine = true;
+ code += "\n";
+ // cancel multiline if two blank lines are entered
+ if ( code.find( "\n\n\n" ) != string::npos )
+ return ";";
+ char * line = shellReadline( "... " , 1 );
+ if ( gotInterrupted ) {
+ if ( line )
+ free( line );
+ return "";
+ }
+ if ( ! line )
+ return "";
+
+        // skip any echoed continuation prompt; advance a copy of the pointer
+        // so that free() below still receives the original allocation
+        char * lineContent = line;
+        while ( startsWith( lineContent, "... " ) )
+            lineContent += 4;
+
+        code += lineContent;
+        free( line );
+ }
+ return code;
+}
+
+#include <boost/program_options.hpp>
+namespace po = boost::program_options;
+
+void show_help_text( const char* name, po::options_description options ) {
+ cout << "MongoDB shell version: " << mongo::versionString << endl;
+ cout << "usage: " << name << " [options] [db address] [file names (ending in .js)]" << endl
+ << "db address can be:" << endl
+ << " foo foo database on local machine" << endl
+         << "  192.168.0.5/foo       foo database on 192.168.0.5 machine" << endl
+         << "  192.168.0.5:9999/foo  foo database on 192.168.0.5 machine on port 9999" << endl
+         << options << endl
+         << "file names: a list of files to run. files must end in .js and the shell will "
+         << "exit after running them unless --shell is specified" << endl;
+}
+
+bool fileExists( string file ) {
+ try {
+ path p( file );
+        return boost::filesystem::exists( p );
+ }
+ catch ( ... ) {
+ return false;
+ }
+}
+
+namespace mongo {
+ extern bool isShell;
+ extern DBClientWithCommands *latestConn;
+}
+
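+// Build the "setname:STATE" prefix used in the prompt (e.g. "rs0:PRIMARY")
+// by running { replSetGetStatus: 1, forShell: 1 } on the last connection;
+// falls back to the server's short "info" string (e.g. "mongos") or "".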
+string sayReplSetMemberState() {
+ try {
+ if( latestConn ) {
+ BSONObj info;
+ if( latestConn->runCommand( "admin", BSON( "replSetGetStatus" << 1 << "forShell" << 1 ) , info ) ) {
+ stringstream ss;
+ ss << info["set"].String() << ':';
+
+ int s = info["myState"].Int();
+ MemberState ms( s );
+ ss << ms.toString();
+
+ return ss.str();
+ }
+ else {
+ string s = info.getStringField( "info" );
+ if( s.size() < 20 )
+ return s; // "mongos", "configsvr"
+ }
+ }
+ }
+ catch( std::exception& e ) {
+        log( 1 ) << "error in sayReplSetMemberState: " << e.what() << endl;
+ }
+ return "";
+}
+
+/**
+ * Edit a variable in an external editor -- EDITOR must be defined
+ *
+ * @param var Name of JavaScript variable to be edited
+ */
+static void edit( const string& var ) {
+
+ // EDITOR must be defined in the environment
+ static const char * editor = getenv( "EDITOR" );
+ if ( !editor ) {
+ cout << "please define the EDITOR environment variable" << endl;
+ return;
+ }
+
+ // "var" must look like a variable/property name
+ for ( const char* p=var.c_str(); *p; ++p ) {
+ if ( ! ( isalnum( *p ) || *p == '_' || *p == '.' ) ) {
+            cout << "can only edit a variable or property" << endl;
+ return;
+ }
+ }
+
+ // Convert "var" to JavaScript (JSON) text
+ if ( !shellMainScope->exec( "__jsout__ = tojson(" + var + ")", "tojs", false, false, false ) )
+ return; // Error already printed
+
+ const string js = shellMainScope->getString( "__jsout__" );
+
+ if ( strstr( js.c_str(), "[native code]" ) ) {
+ cout << "can't edit native functions" << endl;
+ return;
+ }
+
+ // Pick a name to use for the temp file
+ string filename;
+ const int maxAttempts = 10;
+ int i;
+ for ( i = 0; i < maxAttempts; ++i ) {
+ StringBuilder sb;
+#ifdef _WIN32
+ char tempFolder[MAX_PATH];
+ GetTempPathA( sizeof tempFolder, tempFolder );
+ sb << tempFolder << "mongo_edit" << time( 0 ) + i << ".js";
+#else
+ sb << "/tmp/mongo_edit" << time( 0 ) + i << ".js";
+#endif
+ filename = sb.str();
+ if ( ! fileExists( filename ) )
+ break;
+ }
+ if ( i == maxAttempts ) {
+ cout << "couldn't create unique temp file after " << maxAttempts << " attempts" << endl;
+ return;
+ }
+
+ // Create the temp file
+ FILE * tempFileStream;
+ tempFileStream = fopen( filename.c_str(), "wt" );
+ if ( ! tempFileStream ) {
+ cout << "couldn't create temp file (" << filename << "): " << errnoWithDescription() << endl;
+ return;
+ }
+
+ // Write JSON into the temp file
+ size_t fileSize = js.size();
+ if ( fwrite( js.data(), sizeof( char ), fileSize, tempFileStream ) != fileSize ) {
+ int systemErrno = errno;
+ cout << "failed to write to temp file: " << errnoWithDescription( systemErrno ) << endl;
+ fclose( tempFileStream );
+ remove( filename.c_str() );
+ return;
+ }
+ fclose( tempFileStream );
+
+ // Pass file to editor
+ StringBuilder sb;
+ sb << editor << " " << filename;
+ int ret = ::system( sb.str().c_str() );
+ if ( ret ) {
+ if ( ret == -1 ) {
+ int systemErrno = errno;
+ cout << "failed to launch $EDITOR (" << editor << "): " << errnoWithDescription( systemErrno ) << endl;
+ }
+ else
+ cout << "editor exited with error (" << ret << "), not applying changes" << endl;
+ remove( filename.c_str() );
+ return;
+ }
+
+ // The editor gave return code zero, so read the file back in
+ tempFileStream = fopen( filename.c_str(), "rt" );
+ if ( ! tempFileStream ) {
+ cout << "couldn't open temp file on return from editor: " << errnoWithDescription() << endl;
+ remove( filename.c_str() );
+ return;
+ }
+ sb.reset();
+ sb << var << " = ";
+ int bytes;
+ do {
+ char buf[1024];
+ bytes = fread( buf, sizeof( char ), sizeof buf, tempFileStream );
+ if ( ferror( tempFileStream ) ) {
+ cout << "failed to read temp file: " << errnoWithDescription() << endl;
+ fclose( tempFileStream );
+ remove( filename.c_str() );
+ return;
+ }
+ sb.append( StringData( buf, bytes ) );
+ } while ( bytes );
+
+ // Done with temp file, close and delete it
+ fclose( tempFileStream );
+ remove( filename.c_str() );
+
+ // Try to execute assignment to copy edited value back into the variable
+ const string code = sb.str();
+ if ( !shellMainScope->exec( code, "tojs", false, false, false ) )
+ return; // Error already printed
+}
+
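+// Shell entry point: parse the command line, connect unless --nodb was given,
+// run any --eval script and .js files, then drop into the interactive REPL
+// when --shell was requested or nothing else was asked for.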
+int _main( int argc, char* argv[] ) {
+ mongo::isShell = true;
+ setupSignals();
+
+ mongo::shellUtils::RecordMyLocation( argv[ 0 ] );
+
+ string url = "test";
+ string dbhost;
+ string port;
+ vector<string> files;
+
+ string username;
+ string password;
+
+ bool runShell = false;
+ bool nodb = false;
+ bool norc = false;
+
+ string script;
+
+ po::options_description shell_options( "options" );
+ po::options_description hidden_options( "Hidden options" );
+ po::options_description cmdline_options( "Command line options" );
+ po::positional_options_description positional_options;
+
+ shell_options.add_options()
+ ( "shell", "run the shell after executing files" )
+ ( "nodb", "don't connect to mongod on startup - no 'db address' arg expected" )
+ ( "norc", "will not run the \".mongorc.js\" file on start up" )
+ ( "quiet", "be less chatty" )
+ ( "port", po::value<string>( &port ), "port to connect to" )
+ ( "host", po::value<string>( &dbhost ), "server to connect to" )
+ ( "eval", po::value<string>( &script ), "evaluate javascript" )
+ ( "username,u", po::value<string>(&username), "username for authentication" )
+ ( "password,p", new mongo::PasswordValue( &password ), "password for authentication" )
+ ( "help,h", "show this usage information" )
+ ( "version", "show version information" )
+ ( "verbose", "increase verbosity" )
+ ( "ipv6", "enable IPv6 support (disabled by default)" )
+#ifdef MONGO_SSL
+ ( "ssl", "use all for connections" )
+#endif
+ ;
+
+ hidden_options.add_options()
+ ( "dbaddress", po::value<string>(), "dbaddress" )
+ ( "files", po::value< vector<string> >(), "files" )
+ ( "nokillop", "nokillop" ) // for testing, kill op will also be disabled automatically if the tests starts a mongo program
+ ( "autokillop", "autokillop" ) // for testing, will kill op without prompting
+ ;
+
+ positional_options.add( "dbaddress", 1 );
+ positional_options.add( "files", -1 );
+
+ cmdline_options.add( shell_options ).add( hidden_options );
+
+ po::variables_map params;
+
+ /* using the same style as db.cpp uses because eventually we're going
+ * to merge some of this stuff. */
+ int command_line_style = (((po::command_line_style::unix_style ^
+ po::command_line_style::allow_guessing) |
+ po::command_line_style::allow_long_disguise) ^
+ po::command_line_style::allow_sticky);
+
+ try {
+ po::store(po::command_line_parser(argc, argv).options(cmdline_options).
+ positional(positional_options).
+ style(command_line_style).run(), params);
+ po::notify( params );
+ }
+ catch ( po::error &e ) {
+ cout << "ERROR: " << e.what() << endl << endl;
+ show_help_text( argv[0], shell_options );
+ return mongo::EXIT_BADOPTIONS;
+ }
+
+ // hide password from ps output
+ for ( int i = 0; i < (argc-1); ++i ) {
+ if ( !strcmp(argv[i], "-p") || !strcmp( argv[i], "--password" ) ) {
+ char* arg = argv[i + 1];
+ while ( *arg ) {
+ *arg++ = 'x';
+ }
+ }
+ }
+
+ if ( params.count( "shell" ) ) {
+ runShell = true;
+ }
+ if ( params.count( "nodb" ) ) {
+ nodb = true;
+ }
+ if ( params.count( "norc" ) ) {
+ norc = true;
+ }
+ if ( params.count( "help" ) ) {
+ show_help_text( argv[0], shell_options );
+ return mongo::EXIT_CLEAN;
+ }
+ if ( params.count( "files" ) ) {
+ files = params["files"].as< vector<string> >();
+ }
+ if ( params.count( "version" ) ) {
+ cout << "MongoDB shell version: " << mongo::versionString << endl;
+ return mongo::EXIT_CLEAN;
+ }
+ if ( params.count( "quiet" ) ) {
+ mongo::cmdLine.quiet = true;
+ }
+#ifdef MONGO_SSL
+ if ( params.count( "ssl" ) ) {
+ mongo::cmdLine.sslOnNormalPorts = true;
+ }
+#endif
+ if ( params.count( "nokillop" ) ) {
+ mongo::shellUtils::_nokillop = true;
+ }
+ if ( params.count( "autokillop" ) ) {
+ autoKillOp = true;
+ }
+
+ /* This is a bit confusing, here are the rules:
+ *
+ * if nodb is set then all positional parameters are files
+ * otherwise the first positional parameter might be a dbaddress, but
+ * only if one of these conditions is met:
+ * - it contains no '.' after the last appearance of '\' or '/'
+ * - it doesn't end in '.js' and it doesn't specify a path to an existing file */
+ if ( params.count( "dbaddress" ) ) {
+ string dbaddress = params["dbaddress"].as<string>();
+ if (nodb) {
+ files.insert( files.begin(), dbaddress );
+ }
+ else {
+ string basename = dbaddress.substr( dbaddress.find_last_of( "/\\" ) + 1 );
+ if (basename.find_first_of( '.' ) == string::npos ||
+ ( basename.find( ".js", basename.size() - 3 ) == string::npos && !fileExists( dbaddress ) ) ) {
+ url = dbaddress;
+ }
+ else {
+ files.insert( files.begin(), dbaddress );
+ }
+ }
+ }
+ if ( params.count( "ipv6" ) ) {
+ mongo::enableIPv6();
+ }
+ if ( params.count( "verbose" ) ) {
+ logLevel = 1;
+ }
+
+ if ( url == "*" ) {
+ cout << "ERROR: " << "\"*\" is an invalid db address" << endl << endl;
+ show_help_text( argv[0], shell_options );
+ return mongo::EXIT_BADOPTIONS;
+ }
+
+ if ( ! mongo::cmdLine.quiet )
+ cout << "MongoDB shell version: " << mongo::versionString << endl;
+
+ mongo::UnitTest::runTests();
+
+ if ( !nodb ) { // connect to db
+ //if ( ! mongo::cmdLine.quiet ) cout << "url: " << url << endl;
+
+ stringstream ss;
+ if ( mongo::cmdLine.quiet )
+ ss << "__quiet = true;";
+ ss << "db = connect( \"" << fixHost( url , dbhost , port ) << "\")";
+
+ mongo::shellUtils::_dbConnect = ss.str();
+
+ if ( params.count( "password" ) && password.empty() )
+ password = mongo::askPassword();
+
+ if ( username.size() && password.size() ) {
+ stringstream ss;
+ ss << "if ( ! db.auth( \"" << username << "\" , \"" << password << "\" ) ){ throw 'login failed'; }";
+ mongo::shellUtils::_dbAuth = ss.str();
+ }
+ }
+
+ mongo::ScriptEngine::setConnectCallback( mongo::shellUtils::onConnect );
+ mongo::ScriptEngine::setup();
+ mongo::globalScriptEngine->setScopeInitCallback( mongo::shellUtils::initScope );
+ auto_ptr< mongo::Scope > scope( mongo::globalScriptEngine->newScope() );
+ shellMainScope = scope.get();
+
+ if( runShell )
+ cout << "type \"help\" for help" << endl;
+
+ if ( !script.empty() ) {
+ mongo::shellUtils::MongoProgramScope s;
+ if ( ! scope->exec( script , "(shell eval)" , true , true , false ) )
+ return -4;
+ }
+
+ for (size_t i = 0; i < files.size(); ++i) {
+ mongo::shellUtils::MongoProgramScope s;
+
+ if ( files.size() > 1 )
+ cout << "loading file: " << files[i] << endl;
+
+ if ( ! scope->execFile( files[i] , false , true , false ) ) {
+ cout << "failed to load: " << files[i] << endl;
+ return -3;
+ }
+ }
+
+ if ( files.size() == 0 && script.empty() )
+ runShell = true;
+
+ if ( runShell ) {
+
+ mongo::shellUtils::MongoProgramScope s;
+
+ if ( !norc ) {
+ string rcLocation;
+#ifndef _WIN32
+ if ( getenv( "HOME" ) != NULL )
+ rcLocation = str::stream() << getenv( "HOME" ) << "/.mongorc.js" ;
+#else
+ if ( getenv( "HOMEDRIVE" ) != NULL && getenv( "HOMEPATH" ) != NULL )
+ rcLocation = str::stream() << getenv( "HOMEDRIVE" ) << getenv( "HOMEPATH" ) << "\\.mongorc.js";
+#endif
+ if ( !rcLocation.empty() && fileExists(rcLocation) ) {
+ if ( ! scope->execFile( rcLocation , false , true , false , 0 ) ) {
+ cout << "The \".mongorc.js\" file located in your home folder could not be executed" << endl;
+ return -5;
+ }
+ }
+ }
+
+ shellHistoryInit();
+
+ string prompt;
+ int promptType;
+
+ //v8::Handle<v8::Object> shellHelper = baseContext_->Global()->Get( v8::String::New( "shellHelper" ) )->ToObject();
+
+ while ( 1 ) {
+ inMultiLine = false;
+ gotInterrupted = false;
+// shellMainScope->localConnect;
+ //DBClientWithCommands *c = getConnection( JSContext *cx, JSObject *obj );
+
+ bool haveStringPrompt = false;
+ promptType = scope->type( "prompt" );
+ if( promptType == String ) {
+ prompt = scope->getString( "prompt" );
+ haveStringPrompt = true;
+ }
+ else if( promptType == Code ) {
+ scope->exec( "delete __prompt__;", "", false, false, false, 0 );
+ scope->exec( "__prompt__ = prompt();", "", false, false, false, 0 );
+ if( scope->type( "__prompt__" ) == String ) {
+ prompt = scope->getString( "__prompt__" );
+ haveStringPrompt = true;
+ }
+ }
+ if( !haveStringPrompt )
+ prompt = sayReplSetMemberState() + "> ";
+
+ char * line = shellReadline( prompt.c_str() );
+
+ char * linePtr = line; // can't clobber 'line', we need to free() it later
+ if ( linePtr ) {
+ while ( linePtr[0] == ' ' )
+ ++linePtr;
+ int lineLen = strlen( linePtr );
+ while ( lineLen > 0 && linePtr[lineLen - 1] == ' ' )
+ linePtr[--lineLen] = 0;
+ }
+
+ if ( ! linePtr || ( strlen( linePtr ) == 4 && strstr( linePtr , "exit" ) ) ) {
+ if ( ! mongo::cmdLine.quiet )
+ cout << "bye" << endl;
+ if ( line )
+ free( line );
+ break;
+ }
+
+ string code = linePtr;
+ if ( code == "exit" || code == "exit;" ) {
+ free( line );
+ break;
+ }
+ if ( code == "cls" ) {
+ free( line );
+ linenoiseClearScreen();
+ continue;
+ }
+
+ if ( code.size() == 0 ) {
+ free( line );
+ continue;
+ }
+
+ if ( startsWith( linePtr, "edit " ) ) {
+ shellHistoryAdd( linePtr );
+
+ const char* s = linePtr + 5; // skip "edit "
+ while( *s && isspace( *s ) )
+ s++;
+
+ edit( s );
+ free( line );
+ continue;
+ }
+
+ gotInterrupted = false;
+ code = finishCode( code );
+ if ( gotInterrupted ) {
+ cout << endl;
+ free( line );
+ continue;
+ }
+
+ if ( code.size() == 0 ) {
+ free( line );
+ break;
+ }
+
+ bool wascmd = false;
+ {
+ string cmd = linePtr;
+                if ( cmd.find( " " ) != string::npos )
+ cmd = cmd.substr( 0 , cmd.find( " " ) );
+
+ if ( cmd.find( "\"" ) == string::npos ) {
+ try {
+ scope->exec( (string)"__iscmd__ = shellHelper[\"" + cmd + "\"];" , "(shellhelp1)" , false , true , true );
+ if ( scope->getBoolean( "__iscmd__" ) ) {
+ scope->exec( (string)"shellHelper( \"" + cmd + "\" , \"" + code.substr( cmd.size() ) + "\");" , "(shellhelp2)" , false , true , false );
+ wascmd = true;
+ }
+ }
+ catch ( std::exception& e ) {
+ cout << "error2:" << e.what() << endl;
+ wascmd = true;
+ }
+ }
+ }
+
+ if ( ! wascmd ) {
+ try {
+ if ( scope->exec( code.c_str() , "(shell)" , false , true , false ) )
+ scope->exec( "shellPrintHelper( __lastres__ );" , "(shell2)" , true , true , false );
+ }
+ catch ( std::exception& e ) {
+ cout << "error:" << e.what() << endl;
+ }
+ }
+
+ shellHistoryAdd( code.c_str() );
+ free( line );
+ }
+
+ shellHistoryDone();
+ }
+
+ mongo::dbexitCalled = true;
+ return 0;
+}
+
+int main( int argc, char* argv[] ) {
+ static mongo::StaticObserver staticObserver;
+ try {
+ return _main( argc , argv );
+ }
+ catch ( mongo::DBException& e ) {
+ cerr << "exception: " << e.what() << endl;
+ return -1;
+ }
+}
diff --git a/src/mongo/shell/mongo.js b/src/mongo/shell/mongo.js
new file mode 100644
index 00000000000..5e18f38fb63
--- /dev/null
+++ b/src/mongo/shell/mongo.js
@@ -0,0 +1,102 @@
+// mongo.js
+
+// NOTE 'Mongo' may be defined here or in MongoJS.cpp. Add code to init, not to this constructor.
+if ( typeof Mongo == "undefined" ){
+ Mongo = function( host ){
+ this.init( host );
+ }
+}
+
+if ( ! Mongo.prototype ){
+ throw "Mongo.prototype not defined";
+}
+
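+// Stubs for the wire-protocol primitives; the native shell (or mongoInject
+// below) is expected to replace these with real implementations.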
+if ( ! Mongo.prototype.find )
+ Mongo.prototype.find = function( ns , query , fields , limit , skip , batchSize , options ){ throw "find not implemented"; }
+if ( ! Mongo.prototype.insert )
+ Mongo.prototype.insert = function( ns , obj ){ throw "insert not implemented"; }
+if ( ! Mongo.prototype.remove )
+    Mongo.prototype.remove = function( ns , pattern ){ throw "remove not implemented"; }
+if ( ! Mongo.prototype.update )
+    Mongo.prototype.update = function( ns , query , obj , upsert ){ throw "update not implemented"; }
+
+if ( typeof mongoInject == "function" ){
+ mongoInject( Mongo.prototype );
+}
+
+Mongo.prototype.setSlaveOk = function( value ) {
+ if( value == undefined ) value = true;
+ this.slaveOk = value;
+}
+
+Mongo.prototype.getSlaveOk = function() {
+ return this.slaveOk || false;
+}
+
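+// When the test harness runs with a keyFile, authenticate this connection once
+// before handing out DB handles.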
+Mongo.prototype.getDB = function( name ){
+ if (jsTest.options().keyFile && ((typeof this.authenticated == 'undefined') || !this.authenticated)) {
+ jsTest.authenticate(this)
+ }
+ return new DB( this , name );
+}
+
+Mongo.prototype.getDBs = function(){
+ var res = this.getDB( "admin" ).runCommand( { "listDatabases" : 1 } );
+ if ( ! res.ok )
+ throw "listDatabases failed:" + tojson( res );
+ return res;
+}
+
+Mongo.prototype.adminCommand = function( cmd ){
+ return this.getDB( "admin" ).runCommand( cmd );
+}
+
+Mongo.prototype.setLogLevel = function( logLevel ){
+ return this.adminCommand({ setParameter : 1, logLevel : logLevel })
+}
+
+Mongo.prototype.getDBNames = function(){
+ return this.getDBs().databases.map(
+ function(z){
+ return z.name;
+ }
+ );
+}
+
+Mongo.prototype.getCollection = function(ns){
+ var idx = ns.indexOf( "." );
+ if ( idx < 0 )
+ throw "need . in ns";
+ var db = ns.substring( 0 , idx );
+ var c = ns.substring( idx + 1 );
+ return this.getDB( db ).getCollection( c );
+}
+
+Mongo.prototype.toString = function(){
+ return "connection to " + this.host;
+}
+Mongo.prototype.tojson = Mongo.prototype.toString;
+
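+// Accepts the same "db address" forms as the shell command line, e.g.:
+//   connect( "test" )                       // test db on the local machine
+//   connect( "192.168.0.5:9999/foo" )       // foo db on a remote host/port
+//   connect( "host/foo", "user", "pass" )   // authenticate after connecting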
+connect = function( url , user , pass ){
+ chatty( "connecting to: " + url )
+
+ if ( user && ! pass )
+ throw "you specified a user and not a password. either you need a password, or you're using the old connect api";
+
+ var idx = url.lastIndexOf( "/" );
+
+ var db;
+
+ if ( idx < 0 )
+ db = new Mongo().getDB( url );
+ else
+ db = new Mongo( url.substring( 0 , idx ) ).getDB( url.substring( idx + 1 ) );
+
+ if ( user && pass ){
+ if ( ! db.auth( user , pass ) ){
+ throw "couldn't login";
+ }
+ }
+
+ return db;
+}
diff --git a/src/mongo/shell/mr.js b/src/mongo/shell/mr.js
new file mode 100644
index 00000000000..7b0814dd557
--- /dev/null
+++ b/src/mongo/shell/mr.js
@@ -0,0 +1,95 @@
+// mr.js
+
+MR = {};
+
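+// Map/reduce helpers: emit() buffers values per key in $arr (slot chosen by
+// the native get_num_ key hash), doReduce() folds each key's buffered values
+// with the user-supplied $reduce (optionally merging a partial result already
+// saved in tempcoll), and check() decides when the buffers have grown enough
+// to reduce in memory and/or spill to tempcoll.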
+MR.init = function(){
+ $max = 0;
+ $arr = [];
+ emit = MR.emit;
+ $numEmits = 0;
+ $numReduces = 0;
+ $numReducesToDB = 0;
+    gc(); // this is just to keep the memory size sane
+}
+
+MR.cleanup = function(){
+ MR.init();
+ gc();
+}
+
+MR.emit = function(k,v){
+ $numEmits++;
+ var num = nativeHelper.apply( get_num_ , [ k ] );
+ var data = $arr[num];
+ if ( ! data ){
+ data = { key : k , values : new Array(1000) , count : 0 };
+ $arr[num] = data;
+ }
+ data.values[data.count++] = v;
+ $max = Math.max( $max , data.count );
+}
+
+MR.doReduce = function( useDB ){
+ $numReduces++;
+ if ( useDB )
+ $numReducesToDB++;
+ $max = 0;
+ for ( var i=0; i<$arr.length; i++){
+ var data = $arr[i];
+ if ( ! data )
+ continue;
+
+ if ( useDB ){
+ var x = tempcoll.findOne( { _id : data.key } );
+ if ( x ){
+ data.values[data.count++] = x.value;
+ }
+ }
+
+ var r = $reduce( data.key , data.values.slice( 0 , data.count ) );
+ if ( r && r.length && r[0] ){
+ data.values = r;
+ data.count = r.length;
+ }
+ else{
+ data.values[0] = r;
+ data.count = 1;
+ }
+
+ $max = Math.max( $max , data.count );
+
+ if ( useDB ){
+ if ( data.count == 1 ){
+ tempcoll.save( { _id : data.key , value : data.values[0] } );
+ }
+ else {
+ tempcoll.save( { _id : data.key , value : data.values.slice( 0 , data.count ) } );
+ }
+ }
+ }
+}
+
+MR.check = function(){
+ if ( $max < 2000 && $arr.length < 1000 ){
+ return 0;
+ }
+ MR.doReduce();
+ if ( $max < 2000 && $arr.length < 1000 ){
+ return 1;
+ }
+ MR.doReduce( true );
+ $arr = [];
+ $max = 0;
+ reset_num();
+ gc();
+ return 2;
+}
+
+MR.finalize = function(){
+ tempcoll.find().forEach(
+ function(z){
+ z.value = $finalize( z._id , z.value );
+ tempcoll.save( z );
+ }
+ );
+}
diff --git a/src/mongo/shell/msvc/createCPPfromJavaScriptFiles.js b/src/mongo/shell/msvc/createCPPfromJavaScriptFiles.js
new file mode 100644
index 00000000000..ff6f2a54b12
--- /dev/null
+++ b/src/mongo/shell/msvc/createCPPfromJavaScriptFiles.js
@@ -0,0 +1,105 @@
+// createCPPfromJavaScriptFiles.js
+
+/* Copyright 2011 10gen Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+// This JavaScript file is run under Windows Script Host from the Visual Studio build.
+// It creates .CPP files from JavaScript files and is intended to duplicate the functionality
+// of the jsToH Python function in SConstruct. By using only standard Windows components
+// (Windows Script Host, JScript) we avoid the need for Visual Studio builders to install
+// Python, and we don't need to include the generated files in Git because they can be
+// recreated as required.
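+//
+// For example, a one-line input file shell/mongo.js containing
+//     print( "hi" );
+// would cause jsToH() to emit:
+//     #include "bson/stringdata.h"
+//     namespace mongo {
+//     struct JSFile{ const char* name; const StringData& source; };
+//     namespace JSFiles{
+//     const StringData _jscode_raw_mongo =
+//     "print( \"hi\" );\n"
+//     ;
+//     extern const JSFile mongo;
+//     const JSFile mongo = { "shell/mongo.js" , _jscode_raw_mongo };
+//     } // namespace JSFiles
+//     } // namespace mongo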
+
+var whitespace = " \t";
+function cppEscape( s ) {
+ for ( var i = 0, len = s.length; i < len; ++i ) {
+ if ( whitespace.indexOf( s.charAt( i ) ) === -1 ) {
+ s = s.substring( i );
+ break;
+ }
+ }
+ if ( i == len )
+ return "";
+ for ( i = s.length - 1; i >= 0; --i ) {
+ if ( whitespace.indexOf( s.charAt( i ) ) === -1 ) {
+ s = s.substr( 0, i + 1 );
+ break;
+ }
+ }
+ s = s.replace( /\\/g, "\\\\" );
+ s = s.replace( /"/g, '\\"' );
+ return s;
+};
+
+function jsToH( fso, outputFileNameString, inputFileNameStringArray ) {
+ var displayString = 'jsToH( "' + outputFileNameString + '", [';
+ var i, len = inputFileNameStringArray.length;
+ for ( i = 0; i < len; ++i ) {
+ displayString += '"' + inputFileNameStringArray[i] + '"';
+ if ( i < len - 1 )
+ displayString += ', ';
+ }
+ displayString += '] );'
+ WScript.Echo( displayString );
+ var h = ['#include "bson/stringdata.h"'
+ , 'namespace mongo {'
+ , 'struct JSFile{ const char* name; const StringData& source; };'
+ , 'namespace JSFiles{'
+ ];
+ for ( i = 0; i < len; ++i ) {
+ var filename = inputFileNameStringArray[i];
+ var objname = filename.substring( 0, filename.lastIndexOf( '.' ) ).substr( 1 + filename.lastIndexOf('/') );
+ var stringname = '_jscode_raw_' + objname;
+ h.push( 'const StringData ' + stringname + ' = ' );
+ var inputFile = fso.GetFile( filename );
+ var inputStream = inputFile.OpenAsTextStream( 1 /* ForReading */, 0 /* TristateFalse == ASCII */ );
+ while ( !inputStream.AtEndOfStream )
+ h.push( '"' + cppEscape(inputStream.ReadLine()) + '\\n" ' );
+ inputStream.Close();
+ h.push( ';' );
+        h.push( 'extern const JSFile ' + objname + ';' ); // symbols aren't exported without this
+ h.push( 'const JSFile ' + objname + ' = { "' + filename + '" , ' + stringname + ' };' );
+ }
+ h.push( "} // namespace JSFiles" );
+ h.push( "} // namespace mongo" );
+ h.push( "" );
+ var out = fso.CreateTextFile( outputFileNameString, true /* overwrite */ );
+ out.Write( h.join( '\n' ) );
+ out.Close();
+};
+
+function rebuildIfNeeded( fso, outputFileNameString, inputFileNameStringArray ) {
+ var rebuildNeeded = false;
+ if ( !fso.FileExists( outputFileNameString ) ) {
+ rebuildNeeded = true;
+ } else {
+ var outputFileDate = fso.GetFile( outputFileNameString ).DateLastModified;
+ for ( var i = 0, len = inputFileNameStringArray.length; i < len; ++i ) {
+ if ( fso.GetFile( inputFileNameStringArray[i] ).DateLastModified > outputFileDate ) {
+ rebuildNeeded = true;
+ break;
+ }
+ }
+ }
+ if ( rebuildNeeded )
+ jsToH( fso, outputFileNameString, inputFileNameStringArray );
+};
+
+var shell = new ActiveXObject( "WScript.Shell" );
+shell.CurrentDirectory = WScript.Arguments.Unnamed.Item( 0 );
+
+var fso = new ActiveXObject( "Scripting.FileSystemObject" );
+rebuildIfNeeded( fso, "shell/mongo.cpp", ["shell/utils.js", "shell/utils_sh.js", "shell/db.js", "shell/mongo.js", "shell/mr.js", "shell/query.js", "shell/collection.js"] );
+rebuildIfNeeded( fso, "shell/mongo-server.cpp", ["shell/servers.js"] );
diff --git a/src/mongo/shell/msvc/mongo.ico b/src/mongo/shell/msvc/mongo.ico
new file mode 100755
index 00000000000..1eba9ed5131
--- /dev/null
+++ b/src/mongo/shell/msvc/mongo.ico
Binary files differ
diff --git a/src/mongo/shell/msvc/mongo.sln b/src/mongo/shell/msvc/mongo.sln
new file mode 100644
index 00000000000..01c9e1e6e40
--- /dev/null
+++ b/src/mongo/shell/msvc/mongo.sln
@@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongo", "mongo.vcxproj", "{FE959BD8-8EE2-4555-AE59-9FA14FFD410E}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Win32 = Debug|Win32
+ Release|Win32 = Release|Win32
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Win32.ActiveCfg = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Win32.Build.0 = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Win32.ActiveCfg = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Win32.Build.0 = Release|Win32
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
diff --git a/src/mongo/shell/msvc/mongo.vcxproj b/src/mongo/shell/msvc/mongo.vcxproj
new file mode 100644
index 00000000000..968215d5b8d
--- /dev/null
+++ b/src/mongo/shell/msvc/mongo.vcxproj
@@ -0,0 +1,272 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{FE959BD8-8EE2-4555-AE59-9FA14FFD410E}</ProjectGuid>
+ <Keyword>Win32Proj</Keyword>
+ <RootNamespace>mongo</RootNamespace>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <LinkIncremental>true</LinkIncremental>
+ <LibraryPath>\boost\lib\vs2010_32\;$(VCInstallDir)lib;$(VCInstallDir)atlmfc\lib;$(WindowsSdkDir)lib;$(FrameworkSDKDir)\lib</LibraryPath>
+ <ExecutablePath>$(VCInstallDir)bin;$(WindowsSdkDir)bin\NETFX 4.0 Tools;$(WindowsSdkDir)bin;$(VSInstallDir)Common7\Tools\bin;$(VSInstallDir)Common7\tools;$(VSInstallDir)Common7\ide;$(ProgramFiles)\HTML Help Workshop;$(FrameworkSDKDir)\bin;$(MSBuildToolsPath32);$(VSInstallDir);$(SystemRoot)\SysWow64;$(FxCopDir);$(PATH);</ExecutablePath>
+ <IncludePath>..\..\..\readline\include;..\..\..\js\src\;..\..\third_party/pcre-7.4;..\..\;$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSdkDir)include;$(FrameworkSDKDir)\include</IncludePath>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <IncludePath>..\..\..\readline\include;..\..\..\js\src\;..\..\third_party/pcre-7.4;..\..\;$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSdkDir)include;$(FrameworkSDKDir)\include</IncludePath>
+ <LinkIncremental>false</LinkIncremental>
+ <LibraryPath>\boost\lib\vs2010_32\;$(VCInstallDir)lib;$(VCInstallDir)atlmfc\lib;$(WindowsSdkDir)lib;$(FrameworkSDKDir)\lib</LibraryPath>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>XP_WIN;HAVE_CONFIG_H;OLDJS;MONGO_EXPOSE_MACROS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>\boost\</AdditionalIncludeDirectories>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <DisableSpecificWarnings>4355;4800;4267;4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalDependencies>ws2_32.lib;psapi.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo createCPPfromJavaScriptFiles.js "$(ProjectDir)..\.."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>USE_READLINE;XP_WIN;HAVE_CONFIG_H;OLDJS;MONGO_EXPOSE_MACROS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>\boost\</AdditionalIncludeDirectories>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ <DisableSpecificWarnings>4355;4800;4267;4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <AdditionalDependencies>ws2_32.lib;psapi.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo createCPPfromJavaScriptFiles.js "$(ProjectDir)..\.."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClCompile Include="..\..\bson\oid.cpp" />
+ <ClCompile Include="..\..\client\clientOnly.cpp" />
+ <ClCompile Include="..\..\client\connpool.cpp" />
+ <ClCompile Include="..\..\client\dbclient_rs.cpp" />
+ <ClCompile Include="..\..\client\syncclusterconnection.cpp" />
+ <ClCompile Include="..\..\db\commands.cpp" />
+ <ClCompile Include="..\..\db\lasterror.cpp" />
+ <ClCompile Include="..\..\db\nonce.cpp" />
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcrecpp.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_compile.c">
+ <PrecompiledHeader>NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_config.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_chartables.c">
+ <PrecompiledHeader>NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_stringpiece.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\scripting\bench.cpp" />
+ <ClCompile Include="..\..\scripting\engine_spidermonkey.cpp" />
+ <ClCompile Include="..\..\scripting\utils.cpp" />
+ <ClCompile Include="..\..\third_party\linenoise\linenoise.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">_CRT_SECURE_NO_WARNINGS;XP_WIN;HAVE_CONFIG_H;OLDJS;MONGO_EXPOSE_MACROS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">_CRT_SECURE_NO_WARNINGS;USE_READLINE;XP_WIN;HAVE_CONFIG_H;OLDJS;MONGO_EXPOSE_MACROS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\util\background.cpp" />
+ <ClCompile Include="..\..\util\concurrency\spin_lock.cpp" />
+ <ClCompile Include="..\..\util\log.cpp" />
+ <ClCompile Include="..\..\util\mmap.cpp" />
+ <ClCompile Include="..\..\util\net\listen.cpp" />
+ <ClCompile Include="..\..\util\net\message.cpp" />
+ <ClCompile Include="..\..\util\net\message_port.cpp" />
+ <ClCompile Include="..\..\util\net\sock.cpp" />
+ <ClCompile Include="..\..\util\password.cpp" />
+ <ClCompile Include="..\..\util\ramlog.cpp" />
+ <ClCompile Include="..\..\util\text.cpp" />
+ <ClCompile Include="..\..\util\mmap_win.cpp" />
+ <ClCompile Include="..\..\util\processinfo_win32.cpp" />
+ <ClCompile Include="..\..\util\assert_util.cpp" />
+ <ClCompile Include="..\..\util\md5main.cpp" />
+ <ClCompile Include="..\..\util\md5.c">
+ <PrecompiledHeader>NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\util\base64.cpp" />
+ <ClCompile Include="..\..\util\debug_util.cpp" />
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_dfa_exec.c">
+ <PrecompiledHeader>NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_exec.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_fullinfo.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_get.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_globals.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_info.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_maketables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_newline.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_ord2utf8.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_refcount.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_study.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_tables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_try_flipped.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_ucp_searchfuncs.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_valid_utf8.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_version.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party/pcre-7.4\pcre_xclass.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\..\client\dbclient.cpp" />
+ <ClCompile Include="..\..\client\dbclientcursor.cpp" />
+ <ClCompile Include="..\..\db\jsobj.cpp" />
+ <ClCompile Include="..\..\db\json.cpp" />
+ <ClCompile Include="..\..\pch.cpp">
+ <PrecompiledHeader>Create</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ </ClCompile>
+ <ClCompile Include="..\..\scripting\engine.cpp" />
+ <ClCompile Include="..\..\util\concurrency\vars.cpp" />
+ <ClCompile Include="..\..\util\util.cpp" />
+ <ClCompile Include="..\..\util\version.cpp" />
+ <ClCompile Include="..\dbshell.cpp" />
+ <ClCompile Include="..\mongo-server.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\mongo.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\shell_utils.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <None Include="..\..\SConstruct" />
+ <None Include="..\collection.js" />
+ <None Include="..\db.js" />
+ <None Include="..\mongo.js" />
+ <None Include="..\mr.js" />
+ <None Include="..\query.js" />
+ <None Include="..\servers.js" />
+ <None Include="..\utils.js" />
+ <None Include="..\utils_sh.js" />
+ </ItemGroup>
+ <ItemGroup>
+ <Library Include="..\..\..\js\js32d.lib">
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\..\js\js32r.lib">
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ </Library>
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\..\db\lasterror.h" />
+ <ClInclude Include="..\..\third_party\linenoise\linenoise.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <ResourceCompile Include="..\..\db\db.rc" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project> \ No newline at end of file
diff --git a/src/mongo/shell/msvc/mongo.vcxproj.filters b/src/mongo/shell/msvc/mongo.vcxproj.filters
new file mode 100644
index 00000000000..dcf9bcd7550
--- /dev/null
+++ b/src/mongo/shell/msvc/mongo.vcxproj.filters
@@ -0,0 +1,285 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <Filter Include="Resource Files">
+ <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+ <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+ </Filter>
+ <Filter Include="util">
+ <UniqueIdentifier>{2a0d6120-434d-4732-ac31-2a7bf077f6ee}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="util\concurrency">
+ <UniqueIdentifier>{a1e59094-b70c-463a-8dc1-691efe337f14}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="scripting">
+ <UniqueIdentifier>{2d0fd975-0cc9-43dc-ac8e-53cb8c3a0040}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="bson">
+ <UniqueIdentifier>{a33442e2-39da-4c70-8310-6de9fa70cd71}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="db">
+ <UniqueIdentifier>{1044ce7b-72c4-4892-82c0-f46d8708a6ff}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="client">
+ <UniqueIdentifier>{fc0f6c1a-9627-4254-9b5e-0bcb8b3257f3}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="shared source files">
+ <UniqueIdentifier>{30b62472-d7a7-4b8a-8a07-d7d341bc6252}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="pcre">
+ <UniqueIdentifier>{291e0d72-13ca-42d7-b0fd-2e7b5f89639f}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="shell">
+ <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+ <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+ </Filter>
+ <Filter Include="_js files">
+ <UniqueIdentifier>{473e7192-9f2a-47c5-ad95-e5b75d4f48f9}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="shell\generated_from_js">
+ <UniqueIdentifier>{96e4c411-7ab4-4bcd-b7c6-a33059f5d492}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="thirdparty">
+ <UniqueIdentifier>{5eca87ab-5987-4fb0-97be-e80cc721e328}</UniqueIdentifier>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="..\dbshell.cpp">
+ <Filter>shell</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\version.cpp">
+ <Filter>util</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\concurrency\vars.cpp">
+ <Filter>util\concurrency</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\scripting\engine.cpp">
+ <Filter>scripting</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\db\jsobj.cpp">
+ <Filter>db</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\client\dbclient.cpp">
+ <Filter>client</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\client\dbclientcursor.cpp">
+ <Filter>client</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\pch.cpp" />
+ <ClCompile Include="..\..\db\json.cpp">
+ <Filter>shared source files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\debug_util.cpp">
+ <Filter>shell</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\db\lasterror.cpp">
+ <Filter>shared source files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\db\nonce.cpp">
+ <Filter>shared source files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\client\connpool.cpp">
+ <Filter>client</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\processinfo_win32.cpp">
+ <Filter>util</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\db\commands.cpp">
+ <Filter>db</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\scripting\utils.cpp">
+ <Filter>scripting</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\assert_util.cpp">
+ <Filter>util</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\background.cpp">
+ <Filter>util</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\base64.cpp">
+ <Filter>util</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\client\clientOnly.cpp">
+ <Filter>client</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\mmap.cpp">
+ <Filter>util</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\md5main.cpp">
+ <Filter>util</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\util.cpp">
+ <Filter>util</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\client\syncclusterconnection.cpp">
+ <Filter>client</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\md5.c">
+ <Filter>shell</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\mmap_win.cpp">
+ <Filter>util</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\text.cpp">
+ <Filter>shell</Filter>
+ </ClCompile>
+ <ClCompile Include="..\shell_utils.cpp">
+ <Filter>shell</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\scripting\engine_spidermonkey.cpp">
+ <Filter>scripting</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\password.cpp">
+ <Filter>util</Filter>
+ </ClCompile>
+ <ClCompile Include="..\mongo.cpp">
+ <Filter>shell\generated_from_js</Filter>
+ </ClCompile>
+ <ClCompile Include="..\mongo-server.cpp">
+ <Filter>shell\generated_from_js</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\log.cpp">
+ <Filter>shared source files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\scripting\bench.cpp">
+ <Filter>scripting</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\bson\oid.cpp">
+ <Filter>bson</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\client\dbclient_rs.cpp">
+ <Filter>client</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\third_party\linenoise\linenoise.cpp">
+ <Filter>thirdparty</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\concurrency\spin_lock.cpp">
+ <Filter>util\concurrency</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\net\listen.cpp">
+ <Filter>util</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\net\message.cpp">
+ <Filter>util</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\net\sock.cpp">
+ <Filter>util</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\util\net\message_port.cpp">
+ <Filter>util</Filter>
+ </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcrecpp.cc">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_compile.c">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_config.c">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_chartables.c">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_stringpiece.cc">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_dfa_exec.c">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_exec.c">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_fullinfo.c">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_get.c">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_globals.c">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_info.c">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_maketables.c">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_newline.c">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_ord2utf8.c">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_refcount.c">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_study.c">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_tables.c">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_try_flipped.c">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_ucp_searchfuncs.c">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_valid_utf8.c">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_version.c">
+      <Filter>shell</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\pcre-7.4\pcre_xclass.c">
+      <Filter>shell</Filter>
+    </ClCompile>
+ <ClCompile Include="..\..\util\ramlog.cpp">
+ <Filter>util</Filter>
+ </ClCompile>
+ </ItemGroup>
+ <ItemGroup>
+ <None Include="..\..\SConstruct" />
+ <None Include="..\collection.js">
+ <Filter>_js files</Filter>
+ </None>
+ <None Include="..\db.js">
+ <Filter>_js files</Filter>
+ </None>
+ <None Include="..\mongo.js">
+ <Filter>_js files</Filter>
+ </None>
+ <None Include="..\mr.js">
+ <Filter>_js files</Filter>
+ </None>
+ <None Include="..\query.js">
+ <Filter>_js files</Filter>
+ </None>
+ <None Include="..\servers.js">
+ <Filter>_js files</Filter>
+ </None>
+ <None Include="..\utils.js">
+ <Filter>_js files</Filter>
+ </None>
+ <None Include="..\utils_sh.js">
+ <Filter>_js files</Filter>
+ </None>
+ </ItemGroup>
+ <ItemGroup>
+ <Library Include="..\..\..\js\js32d.lib" />
+ <Library Include="..\..\..\js\js32r.lib" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\..\db\lasterror.h">
+ <Filter>db</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\third_party\linenoise\linenoise.h">
+ <Filter>thirdparty</Filter>
+ </ClInclude>
+ </ItemGroup>
+ <ItemGroup>
+ <ResourceCompile Include="..\..\db\db.rc">
+ <Filter>Resource Files</Filter>
+ </ResourceCompile>
+ </ItemGroup>
+</Project> \ No newline at end of file
diff --git a/src/mongo/shell/query.js b/src/mongo/shell/query.js
new file mode 100644
index 00000000000..51b9fd8e6a1
--- /dev/null
+++ b/src/mongo/shell/query.js
@@ -0,0 +1,344 @@
+// query.js
+
+if ( typeof DBQuery == "undefined" ){
+ DBQuery = function( mongo , db , collection , ns , query , fields , limit , skip , batchSize , options ){
+
+ this._mongo = mongo; // 0
+ this._db = db; // 1
+ this._collection = collection; // 2
+ this._ns = ns; // 3
+
+ this._query = query || {}; // 4
+ this._fields = fields; // 5
+ this._limit = limit || 0; // 6
+ this._skip = skip || 0; // 7
+ this._batchSize = batchSize || 0;
+ this._options = options || 0;
+
+ this._cursor = null;
+ this._numReturned = 0;
+ this._special = false;
+ this._prettyShell = false;
+ }
+ print( "DBQuery probably won't have array access " );
+}
+
+DBQuery.prototype.help = function () {
+ print("find() modifiers")
+ print("\t.sort( {...} )")
+ print("\t.limit( n )")
+ print("\t.skip( n )")
+ print("\t.count() - total # of objects matching query, ignores skip,limit")
+ print("\t.size() - total # of objects cursor would return, honors skip,limit")
+ print("\t.explain([verbose])")
+ print("\t.hint(...)")
+ print("\t.addOption(n) - adds op_query options -- see wire protocol")
+ print("\t._addSpecial(name, value) - http://www.mongodb.org/display/DOCS/Advanced%20Queries#AdvancedQueries-Metaqueryoperators")
+ print("\t.batchSize(n) - sets the number of docs to return per getMore")
+ print("\t.showDiskLoc() - adds a $diskLoc field to each returned object")
+ print("\t.min(idxDoc)")
+ print("\t.max(idxDoc)")
+
+ print("\nCursor methods");
+ print("\t.toArray() - iterates through docs and returns an array of the results")
+ print("\t.forEach( func )")
+ print("\t.map( func )")
+ print("\t.hasNext()")
+ print("\t.next()")
+ print("\t.objsLeftInBatch() - returns count of docs left in current batch (when exhausted, a new getMore will be issued)")
+ print("\t.count(applySkipLimit) - runs command at server")
+ print("\t.itcount() - iterates through documents and counts them")
+}
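+
+// Example (illustrative; the collection name "foo" is hypothetical). The
+// modifiers above chain, each returning the query object:
+//   db.foo.find( { x : { $gt : 1 } } ).sort( { x : 1 } ).skip( 2 ).limit( 5 ).toArray()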
+
+DBQuery.prototype.clone = function(){
+ var q = new DBQuery( this._mongo , this._db , this._collection , this._ns ,
+ this._query , this._fields ,
+ this._limit , this._skip , this._batchSize , this._options );
+ q._special = this._special;
+ return q;
+}
+
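+// _ensureSpecial wraps the plain query so that meta operators can be attached
+// alongside it: { x : 1 } becomes { query : { x : 1 } }, and _addSpecial can
+// then add siblings such as orderby, $hint, or $explain.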
+DBQuery.prototype._ensureSpecial = function(){
+ if ( this._special )
+ return;
+
+ var n = { query : this._query };
+ this._query = n;
+ this._special = true;
+}
+
+DBQuery.prototype._checkModify = function(){
+ if ( this._cursor )
+ throw "query already executed";
+}
+
+DBQuery.prototype._exec = function(){
+ if ( ! this._cursor ){
+ assert.eq( 0 , this._numReturned );
+ this._cursor = this._mongo.find( this._ns , this._query , this._fields , this._limit , this._skip , this._batchSize , this._options );
+ this._cursorSeen = 0;
+ }
+ return this._cursor;
+}
+
+DBQuery.prototype.limit = function( limit ){
+ this._checkModify();
+ this._limit = limit;
+ return this;
+}
+
+DBQuery.prototype.batchSize = function( batchSize ){
+ this._checkModify();
+ this._batchSize = batchSize;
+ return this;
+}
+
+
+DBQuery.prototype.addOption = function( option ){
+ this._options |= option;
+ return this;
+}
+
+DBQuery.prototype.skip = function( skip ){
+ this._checkModify();
+ this._skip = skip;
+ return this;
+}
+
+DBQuery.prototype.hasNext = function(){
+ this._exec();
+
+ if ( this._limit > 0 && this._cursorSeen >= this._limit )
+ return false;
+ var o = this._cursor.hasNext();
+ return o;
+}
+
+DBQuery.prototype.next = function(){
+ this._exec();
+
+ var o = this._cursor.hasNext();
+ if ( o )
+ this._cursorSeen++;
+ else
+ throw "error hasNext: " + o;
+
+ var ret = this._cursor.next();
+ if ( ret.$err && this._numReturned == 0 && ! this.hasNext() )
+ throw "error: " + tojson( ret );
+
+ this._numReturned++;
+ return ret;
+}
+
+DBQuery.prototype.objsLeftInBatch = function(){
+ this._exec();
+
+ var ret = this._cursor.objsLeftInBatch();
+ if ( ret.$err )
+ throw "error: " + tojson( ret );
+
+ return ret;
+}
+
+DBQuery.prototype.readOnly = function(){
+ this._exec();
+ this._cursor.readOnly();
+ return this;
+}
+
+DBQuery.prototype.toArray = function(){
+ if ( this._arr )
+ return this._arr;
+
+ var a = [];
+ while ( this.hasNext() )
+ a.push( this.next() );
+ this._arr = a;
+ return a;
+}
+
+DBQuery.prototype.count = function( applySkipLimit ){
+ var cmd = { count: this._collection.getName() };
+ if ( this._query ){
+ if ( this._special )
+ cmd.query = this._query.query;
+ else
+ cmd.query = this._query;
+ }
+ cmd.fields = this._fields || {};
+
+ if ( applySkipLimit ){
+ if ( this._limit )
+ cmd.limit = this._limit;
+ if ( this._skip )
+ cmd.skip = this._skip;
+ }
+
+ var res = this._db.runCommand( cmd );
+ if( res && res.n != null ) return res.n;
+ throw "count failed: " + tojson( res );
+}
+
+DBQuery.prototype.size = function(){
+ return this.count( true );
+}
+
+DBQuery.prototype.countReturn = function(){
+ var c = this.count();
+
+ if ( this._skip )
+ c = c - this._skip;
+
+ if ( this._limit > 0 && this._limit < c )
+ return this._limit;
+
+ return c;
+}
+
+/**
+* iterative count - only for testing
+*/
+DBQuery.prototype.itcount = function(){
+ var num = 0;
+ while ( this.hasNext() ){
+ num++;
+ this.next();
+ }
+ return num;
+}
+
+DBQuery.prototype.length = function(){
+ return this.toArray().length;
+}
+
+DBQuery.prototype._addSpecial = function( name , value ){
+ this._ensureSpecial();
+ this._query[name] = value;
+ return this;
+}
+
+DBQuery.prototype.sort = function( sortBy ){
+ return this._addSpecial( "orderby" , sortBy );
+}
+
+DBQuery.prototype.hint = function( hint ){
+ return this._addSpecial( "$hint" , hint );
+}
+
+DBQuery.prototype.min = function( min ) {
+ return this._addSpecial( "$min" , min );
+}
+
+DBQuery.prototype.max = function( max ) {
+ return this._addSpecial( "$max" , max );
+}
+
+DBQuery.prototype.showDiskLoc = function() {
+ return this._addSpecial( "$showDiskLoc" , true);
+}
+
+DBQuery.prototype.forEach = function( func ){
+ while ( this.hasNext() )
+ func( this.next() );
+}
+
+DBQuery.prototype.map = function( func ){
+ var a = [];
+ while ( this.hasNext() )
+ a.push( func( this.next() ) );
+ return a;
+}
+
+DBQuery.prototype.arrayAccess = function( idx ){
+ return this.toArray()[idx];
+}
+DBQuery.prototype.comment = function (comment) {
+    var n = this.clone();
+    n._ensureSpecial();
+    n._addSpecial("$comment", comment);
+    return n.next();
+}
+
+DBQuery.prototype.explain = function (verbose) {
+ /* verbose=true --> include allPlans, oldPlan fields */
+ var n = this.clone();
+ n._ensureSpecial();
+ n._query.$explain = true;
+ n._limit = Math.abs(n._limit) * -1;
+ var e = n.next();
+
+ function cleanup(obj){
+ if (typeof(obj) != 'object'){
+ return;
+ }
+
+ delete obj.allPlans;
+ delete obj.oldPlan;
+
+ if (typeof(obj.length) == 'number'){
+ for (var i=0; i < obj.length; i++){
+ cleanup(obj[i]);
+ }
+ }
+
+ if (obj.shards){
+ for (var key in obj.shards){
+ cleanup(obj.shards[key]);
+ }
+ }
+
+ if (obj.clauses){
+ cleanup(obj.clauses);
+ }
+ }
+
+ if (!verbose)
+ cleanup(e);
+
+ return e;
+}
+
+DBQuery.prototype.snapshot = function(){
+ this._ensureSpecial();
+ this._query.$snapshot = true;
+ return this;
+}
+
+DBQuery.prototype.pretty = function(){
+ this._prettyShell = true;
+ return this;
+}
+
+DBQuery.prototype.shellPrint = function(){
+ try {
+ var start = new Date().getTime();
+ var n = 0;
+ while ( this.hasNext() && n < DBQuery.shellBatchSize ){
+ var s = this._prettyShell ? tojson( this.next() ) : tojson( this.next() , "" , true );
+ print( s );
+ n++;
+ }
+ if (typeof _verboseShell !== 'undefined' && _verboseShell) {
+ var time = new Date().getTime() - start;
+ print("Fetched " + n + " record(s) in " + time + "ms");
+ }
+ if ( this.hasNext() ){
+ print( "Type \"it\" for more" );
+ ___it___ = this;
+ }
+ else {
+ ___it___ = null;
+ }
+ }
+ catch ( e ){
+ print( e );
+ }
+
+}
+
+DBQuery.prototype.toString = function(){
+    return "DBQuery: " + this._ns + " -> " + tojson( this._query );
+}
+
+DBQuery.shellBatchSize = 20;
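+
+// shellBatchSize controls how many documents shellPrint displays before
+// prompting with "it"; e.g. DBQuery.shellBatchSize = 100 shows 100 per batch.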
diff --git a/src/mongo/shell/servers.js b/src/mongo/shell/servers.js
new file mode 100755
index 00000000000..6b12918db2e
--- /dev/null
+++ b/src/mongo/shell/servers.js
@@ -0,0 +1,2618 @@
+_parsePath = function() {
+ var dbpath = "";
+ for( var i = 0; i < arguments.length; ++i )
+ if ( arguments[ i ] == "--dbpath" )
+ dbpath = arguments[ i + 1 ];
+
+ if ( dbpath == "" )
+ throw "No dbpath specified";
+
+ return dbpath;
+}
+
+_parsePort = function() {
+ var port = "";
+ for( var i = 0; i < arguments.length; ++i )
+ if ( arguments[ i ] == "--port" )
+ port = arguments[ i + 1 ];
+
+ if ( port == "" )
+ throw "No port specified";
+ return port;
+}
+
+connectionURLTheSame = function( a , b ){
+
+ if ( a == b )
+ return true;
+
+ if ( ! a || ! b )
+ return false;
+
+ if( a.host ) return connectionURLTheSame( a.host, b )
+ if( b.host ) return connectionURLTheSame( a, b.host )
+
+ if( a.name ) return connectionURLTheSame( a.name, b )
+ if( b.name ) return connectionURLTheSame( a, b.name )
+
+ if( a.indexOf( "/" ) < 0 && b.indexOf( "/" ) < 0 ){
+ a = a.split( ":" )
+ b = b.split( ":" )
+
+ if( a.length != b.length ) return false
+
+ if( a.length == 2 && a[1] != b[1] ) return false
+
+ if( a[0] == "localhost" || a[0] == "127.0.0.1" ) a[0] = getHostName()
+ if( b[0] == "localhost" || b[0] == "127.0.0.1" ) b[0] = getHostName()
+
+ return a[0] == b[0]
+ }
+ else {
+        var a0 = a.split( "/" )[0]
+        var b0 = b.split( "/" )[0]
+        return a0 == b0
+ }
+}
+
+assert( connectionURLTheSame( "foo" , "foo" ) )
+assert( ! connectionURLTheSame( "foo" , "bar" ) )
+
+assert( connectionURLTheSame( "foo/a,b" , "foo/b,a" ) )
+assert( ! connectionURLTheSame( "foo/a,b" , "bar/a,b" ) )
+
+createMongoArgs = function( binaryName , args ){
+ var fullArgs = [ binaryName ];
+
+ if ( args.length == 1 && isObject( args[0] ) ){
+ var o = args[0];
+ for ( var k in o ){
+ if ( o.hasOwnProperty(k) ){
+ if ( k == "v" && isNumber( o[k] ) ){
+ var n = o[k];
+ if ( n > 0 ){
+ if ( n > 10 ) n = 10;
+ var temp = "-";
+ while ( n-- > 0 ) temp += "v";
+ fullArgs.push( temp );
+ }
+ }
+ else {
+ fullArgs.push( "--" + k );
+ if ( o[k] != "" )
+ fullArgs.push( "" + o[k] );
+ }
+ }
+ }
+ }
+ else {
+ for ( var i=0; i<args.length; i++ )
+ fullArgs.push( args[i] )
+ }
+
+ return fullArgs;
+}
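+
+// For example (illustrative): createMongoArgs( "mongod", [ { port : 30000, v : 2 } ] )
+// yields [ "mongod", "--port", "30000", "-vv" ]: "v" counts become repeated -v
+// flags, and every other key becomes a --key value pair.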
+
+
+MongoRunner = function(){}
+
+MongoRunner.dataDir = "/data/db"
+MongoRunner.dataPath = "/data/db/"
+MongoRunner.usedPortMap = {}
+MongoRunner.logicalOptions = { runId : true,
+ pathOpts : true,
+ remember : true,
+ noRemember : true,
+ appendOptions : true,
+ restart : true,
+ noCleanData : true,
+ cleanData : true,
+ startClean : true,
+ forceLock : true,
+ useLogFiles : true,
+ useHostName : true,
+ useHostname : true,
+ noReplSet : true,
+ forgetPort : true,
+ arbiter : true,
+ noJournalPrealloc : true,
+ noJournal : true }
+
+MongoRunner.toRealPath = function( path, pathOpts ){
+
+ // Replace all $pathOptions with actual values
+ pathOpts = pathOpts || {}
+ path = path.replace( /\$dataPath/g, MongoRunner.dataPath )
+ path = path.replace( /\$dataDir/g, MongoRunner.dataDir )
+ for( key in pathOpts ){
+ path = path.replace( RegExp( "\\$" + key, "g" ), pathOpts[ key ] )
+ }
+
+ // Relative path
+ if( ! path.startsWith( "/" ) ){
+ if( path != "" && ! path.endsWith( "/" ) )
+ path += "/"
+
+ path = MongoRunner.dataPath + path
+ }
+
+ return path
+
+}
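+
+// For example (illustrative): MongoRunner.toRealPath( "$dataDir/mongod-$port",
+// { port : 30000 } ) resolves to "/data/db/mongod-30000".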
+
+MongoRunner.toRealDir = function( path, pathOpts ){
+
+ path = MongoRunner.toRealPath( path, pathOpts )
+
+ if( path.endsWith( "/" ) )
+ path = path.substring( 0, path.length - 1 )
+
+ return path
+}
+
+MongoRunner.toRealFile = MongoRunner.toRealDir
+
+MongoRunner.nextOpenPort = function(){
+
+ var i = 0;
+ while( MongoRunner.usedPortMap[ "" + ( 27000 + i ) ] ) i++;
+ MongoRunner.usedPortMap[ "" + ( 27000 + i ) ] = true
+
+ return 27000 + i
+
+}
+
+MongoRunner.arrOptions = function( binaryName , args ){
+
+ var fullArgs = [ binaryName ]
+
+ if ( isObject( args ) || ( args.length == 1 && isObject( args[0] ) ) ){
+
+ var o = isObject( args ) ? args : args[0]
+ for ( var k in o ){
+
+ if( ! o.hasOwnProperty(k) || k in MongoRunner.logicalOptions ) continue
+
+ if ( ( k == "v" || k == "verbose" ) && isNumber( o[k] ) ){
+ var n = o[k]
+ if ( n > 0 ){
+ if ( n > 10 ) n = 10
+ var temp = "-"
+ while ( n-- > 0 ) temp += "v"
+ fullArgs.push( temp )
+ }
+ }
+ else {
+ if( o[k] == undefined || o[k] == null ) continue
+ fullArgs.push( "--" + k )
+ if ( o[k] != "" )
+ fullArgs.push( "" + o[k] )
+ }
+ }
+ }
+ else {
+ for ( var i=0; i<args.length; i++ )
+ fullArgs.push( args[i] )
+ }
+
+ return fullArgs
+}
+
+MongoRunner.arrToOpts = function( arr ){
+
+ var opts = {}
+ for( var i = 1; i < arr.length; i++ ){
+ if( arr[i].startsWith( "-" ) ){
+ var opt = arr[i].replace( /^-/, "" ).replace( /^-/, "" )
+
+ if( arr.length > i + 1 && ! arr[ i + 1 ].startsWith( "-" ) ){
+ opts[ opt ] = arr[ i + 1 ]
+ i++
+ }
+ else{
+ opts[ opt ] = ""
+ }
+
+ if( opt.replace( /v/g, "" ) == "" ){
+ opts[ "verbose" ] = opt.length
+ }
+ }
+ }
+
+ return opts
+}
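+
+// For example (illustrative): MongoRunner.arrToOpts( [ "mongod", "--port", "30000", "-vv" ] )
+// returns { port : "30000", vv : "", verbose : 2 }.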
+
+MongoRunner.savedOptions = {}
+
+MongoRunner.mongoOptions = function( opts ){
+
+ // Initialize and create a copy of the opts
+ opts = Object.merge( opts || {}, {} )
+
+ if( ! opts.restart ) opts.restart = false
+
+ // RunId can come from a number of places
+ if( isObject( opts.restart ) ){
+ opts.runId = opts.restart
+ opts.restart = true
+ }
+
+ if( isObject( opts.remember ) ){
+ opts.runId = opts.remember
+ opts.remember = true
+ }
+ else if( opts.remember == undefined ){
+ // Remember by default if we're restarting
+ opts.remember = opts.restart
+ }
+
+ // If we passed in restart : <conn> or runId : <conn>
+ if( isObject( opts.runId ) && opts.runId.runId ) opts.runId = opts.runId.runId
+
+ if( opts.restart && opts.remember ) opts = Object.merge( MongoRunner.savedOptions[ opts.runId ], opts )
+
+ // Create a new runId
+ opts.runId = opts.runId || ObjectId()
+
+ // Save the port if required
+ if( ! opts.forgetPort ) opts.port = opts.port || MongoRunner.nextOpenPort()
+
+ var shouldRemember = ( ! opts.restart && ! opts.noRemember ) || ( opts.restart && opts.appendOptions )
+
+ if ( shouldRemember ){
+ MongoRunner.savedOptions[ opts.runId ] = Object.merge( opts, {} )
+ }
+
+ opts.port = opts.port || MongoRunner.nextOpenPort()
+ MongoRunner.usedPortMap[ "" + parseInt( opts.port ) ] = true
+
+ opts.pathOpts = Object.merge( opts.pathOpts || {}, { port : "" + opts.port, runId : "" + opts.runId } )
+
+ return opts
+}
+
+MongoRunner.mongodOptions = function( opts ){
+
+ opts = MongoRunner.mongoOptions( opts )
+
+ opts.dbpath = MongoRunner.toRealDir( opts.dbpath || "$dataDir/mongod-$port",
+ opts.pathOpts )
+
+ opts.pathOpts = Object.merge( opts.pathOpts, { dbpath : opts.dbpath } )
+
+ if( ! opts.logFile && opts.useLogFiles ){
+ opts.logFile = opts.dbpath + "/mongod.log"
+ }
+ else if( opts.logFile ){
+ opts.logFile = MongoRunner.toRealFile( opts.logFile, opts.pathOpts )
+ }
+
+ if( jsTestOptions().noJournalPrealloc || opts.noJournalPrealloc )
+ opts.nopreallocj = ""
+
+ if( jsTestOptions().noJournal || opts.noJournal )
+ opts.nojournal = ""
+
+ if( jsTestOptions().keyFile && !opts.keyFile) {
+ opts.keyFile = jsTestOptions().keyFile
+ }
+
+ if( opts.noReplSet ) opts.replSet = null
+ if( opts.arbiter ) opts.oplogSize = 1
+
+ return opts
+}
+
+MongoRunner.mongosOptions = function( opts ){
+
+ opts = MongoRunner.mongoOptions( opts )
+
+ opts.pathOpts = Object.merge( opts.pathOpts,
+ { configdb : opts.configdb.replace( /:|,/g, "-" ) } )
+
+ if( ! opts.logFile && opts.useLogFiles ){
+ opts.logFile = MongoRunner.toRealFile( "$dataDir/mongos-$configdb-$port.log",
+ opts.pathOpts )
+ }
+ else if( opts.logFile ){
+ opts.logFile = MongoRunner.toRealFile( opts.logFile, opts.pathOpts )
+ }
+
+ if( jsTestOptions().keyFile && !opts.keyFile) {
+ opts.keyFile = jsTestOptions().keyFile
+ }
+
+ return opts
+}
+
+MongoRunner.runMongod = function( opts ){
+
+ var useHostName = false
+ var runId = null
+ if( isObject( opts ) ) {
+
+ opts = MongoRunner.mongodOptions( opts )
+
+ useHostName = opts.useHostName || opts.useHostname
+ runId = opts.runId
+
+ if( opts.forceLock ) removeFile( opts.dbpath + "/mongod.lock" )
+ if( ( opts.cleanData || opts.startClean ) || ( ! opts.restart && ! opts.noCleanData ) ){
+ print( "Resetting db path '" + opts.dbpath + "'" )
+ resetDbpath( opts.dbpath )
+ }
+
+ opts = MongoRunner.arrOptions( "mongod", opts )
+ }
+
+ var mongod = startMongoProgram.apply( null, opts )
+ mongod.commandLine = MongoRunner.arrToOpts( opts )
+ mongod.name = (useHostName ? getHostName() : "localhost") + ":" + mongod.commandLine.port
+ mongod.host = mongod.name
+ mongod.port = parseInt( mongod.commandLine.port )
+ mongod.runId = runId || ObjectId()
+ mongod.savedOptions = MongoRunner.savedOptions[ mongod.runId ]
+
+ return mongod
+}
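+
+// Typical flow (illustrative; the option values are hypothetical):
+//   var conn = MongoRunner.runMongod( { smallfiles : "", oplogSize : 10 } );
+//   // ... exercise conn ...
+//   MongoRunner.stopMongod( conn.port );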
+
+MongoRunner.runMongos = function( opts ){
+
+ var useHostName = false
+ var runId = null
+ if( isObject( opts ) ) {
+
+ opts = MongoRunner.mongosOptions( opts )
+
+ useHostName = opts.useHostName || opts.useHostname
+ runId = opts.runId
+
+ opts = MongoRunner.arrOptions( "mongos", opts )
+ }
+
+ var mongos = startMongoProgram.apply( null, opts )
+ mongos.commandLine = MongoRunner.arrToOpts( opts )
+ mongos.name = (useHostName ? getHostName() : "localhost") + ":" + mongos.commandLine.port
+ mongos.host = mongos.name
+ mongos.port = parseInt( mongos.commandLine.port )
+ mongos.runId = runId || ObjectId()
+ mongos.savedOptions = MongoRunner.savedOptions[ mongos.runId ]
+
+ return mongos
+}
+
+MongoRunner.stopMongod = function( port, signal ){
+
+ if( ! port ) {
+ print( "Cannot stop mongo process " + port )
+ return
+ }
+
+ signal = signal || 15
+
+ if( port.port )
+ port = parseInt( port.port )
+
+ if( port instanceof ObjectId ){
+        var opts = MongoRunner.savedOptions[ port ]
+ if( opts ) port = parseInt( opts.port )
+ }
+
+ var exitCode = stopMongod( parseInt( port ), parseInt( signal ) )
+
+ delete MongoRunner.usedPortMap[ "" + parseInt( port ) ]
+
+ return exitCode
+}
+
+MongoRunner.stopMongos = MongoRunner.stopMongod
+
+MongoRunner.isStopped = function( port ){
+
+ if( ! port ) {
+ print( "Cannot detect if process " + port + " is stopped." )
+ return
+ }
+
+ if( port.port )
+ port = parseInt( port.port )
+
+ if( port instanceof ObjectId ){
+        var opts = MongoRunner.savedOptions[ port ]
+ if( opts ) port = parseInt( opts.port )
+ }
+
+ return MongoRunner.usedPortMap[ "" + parseInt( port ) ] ? false : true
+}
+
+__nextPort = 27000;
+startMongodTest = function (port, dirname, restart, extraOptions ) {
+ if (!port)
+ port = __nextPort++;
+ var f = startMongodEmpty;
+ if (restart)
+ f = startMongodNoReset;
+ if (!dirname)
+ dirname = "" + port; // e.g., data/db/27000
+
+ var useHostname = false;
+ if (extraOptions) {
+ useHostname = extraOptions.useHostname;
+ delete extraOptions.useHostname;
+ }
+
+
+ var options =
+ {
+ port: port,
+ dbpath: "/data/db/" + dirname,
+ noprealloc: "",
+ smallfiles: "",
+ oplogSize: "40",
+ nohttpinterface: ""
+ };
+
+ if( jsTestOptions().noJournal ) options["nojournal"] = ""
+ if( jsTestOptions().noJournalPrealloc ) options["nopreallocj"] = ""
+ if( jsTestOptions().auth ) options["auth"] = ""
+ if( jsTestOptions().keyFile && (!extraOptions || !extraOptions['keyFile']) ) options['keyFile'] = jsTestOptions().keyFile
+
+ if ( extraOptions )
+ Object.extend( options , extraOptions );
+
+ var conn = f.apply(null, [ options ] );
+
+ conn.name = (useHostname ? getHostName() : "localhost") + ":" + port;
+
+ if (options['auth'] || options['keyFile']) {
+ if (!this.shardsvr && !options.replSet) {
+ jsTest.addAuth(conn);
+ }
+ jsTest.authenticate(conn);
+ }
+ return conn;
+}
+
+// Start a mongod instance and return a 'Mongo' object connected to it.
+// This function's arguments are passed as command line arguments to mongod.
+// The specified 'dbpath' is cleared if it exists, created if not.
+// var conn = startMongodEmpty("--port", 30000, "--dbpath", "asdf");
+startMongodEmpty = function () {
+ var args = createMongoArgs("mongod", arguments);
+
+ var dbpath = _parsePath.apply(null, args);
+ resetDbpath(dbpath);
+
+ return startMongoProgram.apply(null, args);
+}
+startMongod = function () {
+ print("startMongod WARNING DELETES DATA DIRECTORY THIS IS FOR TESTING ONLY");
+ return startMongodEmpty.apply(null, arguments);
+}
+startMongodNoReset = function(){
+ var args = createMongoArgs( "mongod" , arguments );
+ return startMongoProgram.apply( null, args );
+}
+
+startMongos = function(args){
+ return MongoRunner.runMongos(args);
+}
+
+/* Start mongod or mongos and return a Mongo() object connected to it.
+   This function's first argument is the program name, "mongod" or "mongos",
+   and subsequent arguments are passed as command line arguments to that program.
+*/
+startMongoProgram = function(){
+ var port = _parsePort.apply( null, arguments );
+
+ _startMongoProgram.apply( null, arguments );
+
+ var m;
+ assert.soon
+ ( function() {
+ try {
+ m = new Mongo( "127.0.0.1:" + port );
+ return true;
+ } catch( e ) {
+ }
+ return false;
+ }, "unable to connect to mongo program on port " + port, 600 * 1000 );
+
+ return m;
+}
+
+// Start a mongo program instance. This function's first argument is the
+// program name, and subsequent arguments to this function are passed as
+// command line arguments to the program. Returns pid of the spawned program.
+startMongoProgramNoConnect = function() {
+ return _startMongoProgram.apply( null, arguments );
+}
+
+myPort = function() {
+ var m = db.getMongo();
+ if ( m.host.match( /:/ ) )
+ return m.host.match( /:(.*)/ )[ 1 ];
+ else
+ return 27017;
+}
+
+/**
+ * otherParams can be:
+ * * useHostname to use the hostname (instead of localhost)
+ */
+ShardingTest = function( testName , numShards , verboseLevel , numMongos , otherParams ){
+
+ // Check if testName is an object, if so, pull params from there
+ var keyFile = undefined
+ otherParams = Object.merge( otherParams || {}, {} )
+ otherParams.extraOptions = otherParams.extraOptions || {}
+
+ if( isObject( testName ) ){
+
+ var params = Object.merge( testName, {} )
+
+ testName = params.name || "test"
+
+ otherParams = Object.merge( params.other || {}, {} )
+ otherParams.extraOptions = otherParams.extraOptions || {}
+
+ numShards = params.shards || 2
+ verboseLevel = params.verbose || 0
+ numMongos = params.mongos || 1
+
+ keyFile = params.keyFile || otherParams.keyFile || otherParams.extraOptions.keyFile
+ otherParams.nopreallocj = params.nopreallocj || otherParams.nopreallocj
+ otherParams.rs = params.rs || ( params.other ? params.other.rs : undefined )
+ otherParams.chunksize = params.chunksize || ( params.other ? params.other.chunksize : undefined )
+
+ // Allow specifying options like :
+ // { mongos : [ { noprealloc : "" } ], config : [ { smallfiles : "" } ], shards : { rs : true, d : true } }
+ if( isObject( numShards ) ){
+ var len = 0
+ for( var i in numShards ){
+ otherParams[ "" + i ] = numShards[i]
+ len++
+ }
+ numShards = len
+ }
+
+ if( isObject( numMongos ) ){
+ var len = 0
+ for( var i in numMongos ){
+ otherParams[ "" + i ] = numMongos[i]
+ len++
+ }
+ numMongos = len
+ }
+ else if( Array.isArray( numMongos ) ){
+ for( var i = 0; i < numMongos.length; i++ )
+ otherParams[ "s" + i ] = numMongos[i]
+ numMongos = numMongos.length
+ }
+
+ if( isObject( params.config ) ){
+ var len = 0
+ for( var i in params.config ){
+ otherParams[ "" + i ] = params.config[i]
+ len++
+ }
+
+ // If we're specifying explicit config options, we need separate config servers
+ otherParams.separateConfig = true
+ if( len == 3 ) otherParams.sync = true
+ else otherParams.sync = false
+ }
+ else if( Array.isArray( params.config ) ){
+ for( var i = 0; i < params.config.length; i++ )
+ otherParams[ "c" + i ] = params.config[i]
+
+ // If we're specifying explicit config options, we need separate config servers
+ otherParams.separateConfig = true
+ if( params.config.length == 3 ) otherParams.sync = true
+ else otherParams.sync = false
+ }
+ else if( params.config ) {
+
+ if( params.config == 3 ){
+ otherParams.separateConfig = otherParams.separateConfig || true
+ otherParams.sync = true
+ }
+
+ }
+ }
+ else {
+ // Handle legacy stuff
+ keyFile = otherParams.extraOptions.keyFile
+ }
+
+ this._testName = testName
+ this._otherParams = otherParams
+
+ var pathOpts = this.pathOpts = { testName : testName }
+
+ var hasRS = false
+ for( var k in otherParams ){
+ if( k.startsWith( "rs" ) ){
+ hasRS = true
+ break
+ }
+ }
+
+ if( hasRS ){
+ otherParams.separateConfig = true
+ otherParams.useHostname = otherParams.useHostname == undefined ? true : otherParams.useHostname
+ }
+
+ var localhost = otherParams.useHostname ? getHostName() : "localhost";
+
+ this._alldbpaths = []
+ this._connections = []
+ this._shardServers = this._connections
+ this._rs = []
+ this._rsObjects = []
+
+ for ( var i = 0; i < numShards; i++ ) {
+ if( otherParams.rs || otherParams["rs" + i] ){
+
+ otherParams.separateConfig = true
+
+ var setName = testName + "-rs" + i;
+
+ rsDefaults = { useHostname : otherParams.useHostname,
+ noJournalPrealloc : otherParams.nopreallocj,
+ oplogSize : 40,
+ nodes : 3,
+ pathOpts : Object.merge( pathOpts, { shard : i } )}
+
+ rsDefaults = Object.merge( rsDefaults, otherParams.rs )
+ rsDefaults = Object.merge( rsDefaults, otherParams.rsOptions )
+ rsDefaults = Object.merge( rsDefaults, otherParams["rs" + i] )
+
+ var numReplicas = rsDefaults.nodes || otherParams.numReplicas || 3
+ delete rsDefaults.nodes
+
+ print( "Replica set test!" )
+
+ var rs = new ReplSetTest( { name : setName , nodes : numReplicas , startPort : 31100 + ( i * 100 ), useHostName : otherParams.useHostname, keyFile : keyFile, shardSvr : true } );
+ this._rs[i] = { setName : setName , test : rs , nodes : rs.startSet( rsDefaults ) , url : rs.getURL() };
+ rs.initiate();
+ this["rs" + i] = rs
+
+ this._rsObjects[i] = rs
+
+ this._alldbpaths.push( null )
+ this._connections.push( null )
+ }
+ else {
+ var options = { useHostname : otherParams.useHostname,
+ noJournalPrealloc : otherParams.nopreallocj,
+ port : 30000 + i,
+ pathOpts : Object.merge( pathOpts, { shard : i } ),
+ dbpath : "$testName$shard",
+ keyFile : keyFile
+ }
+
+ options = Object.merge( options, otherParams.shardOptions )
+ options = Object.merge( options, otherParams["d" + i] )
+
+ var conn = MongoRunner.runMongod( options );
+
+ this._alldbpaths.push( testName +i )
+ this._connections.push( conn );
+ this["shard" + i] = conn
+ this["d" + i] = conn
+
+ this._rs[i] = null
+ this._rsObjects[i] = null
+ }
+ }
+
+ // Do replication on replica sets if required
+ for ( var i = 0; i < numShards; i++ ){
+ if( ! otherParams.rs && ! otherParams["rs" + i] ) continue
+
+ var rs = this._rs[i].test;
+
+ rs.getMaster().getDB( "admin" ).foo.save( { x : 1 } )
+ rs.awaitReplication();
+
+ var rsConn = new Mongo( rs.getURL() );
+ rsConn.name = rs.getURL();
+ this._connections[i] = rsConn
+ this["shard" + i] = rsConn
+ rsConn.rs = rs
+ }
+
+ this._configServers = []
+ this._configNames = []
+
+ if ( otherParams.sync && ! otherParams.separateConfig && numShards < 3 )
+ throw "if you want sync, you need at least 3 servers";
+
+ for ( var i = 0; i < ( otherParams.sync ? 3 : 1 ) ; i++ ) {
+
+ var conn = null
+
+ if( otherParams.separateConfig ){
+
+ var options = { useHostname : otherParams.useHostname,
+ noJournalPrealloc : otherParams.nopreallocj,
+ port : 40000 + i,
+ pathOpts : Object.merge( pathOpts, { config : i } ),
+ dbpath : "$testName-config$config",
+ keyFile : keyFile
+ }
+
+ options = Object.merge( options, otherParams.configOptions )
+ options = Object.merge( options, otherParams["c" + i] )
+
+ var conn = MongoRunner.runMongod( options )
+
+ // TODO: Needed?
+ this._alldbpaths.push( testName + "-config" + i )
+ }
+ else{
+ conn = this["shard" + i]
+ }
+
+ this._configServers.push( conn );
+ this._configNames.push( conn.name )
+ this["config" + i] = conn
+ this["c" + i] = conn
+ }
+
+ printjson( this._configDB = this._configNames.join( "," ) )
+ this._configConnection = new Mongo( this._configDB )
+ if ( ! otherParams.noChunkSize ) {
+ this._configConnection.getDB( "config" ).settings.insert( { _id : "chunksize" , value : otherParams.chunksize || otherParams.chunkSize || 50 } )
+ }
+
+ print( "ShardingTest " + this._testName + " :\n" + tojson( { config : this._configDB, shards : this._connections } ) );
+
+ this._mongos = []
+ this._mongoses = this._mongos
+ for ( var i = 0; i < ( ( numMongos == 0 ? -1 : numMongos ) || 1 ); i++ ){
+
+ var options = { useHostname : otherParams.useHostname,
+ port : 31000 - i - 1,
+ pathOpts : Object.merge( pathOpts, { mongos : i } ),
+ configdb : this._configDB,
+ verbose : verboseLevel || 0,
+ keyFile : keyFile
+ }
+
+ options = Object.merge( options, otherParams.mongosOptions )
+ options = Object.merge( options, otherParams.extraOptions )
+ options = Object.merge( options, otherParams["s" + i] )
+
+ var conn = MongoRunner.runMongos( options )
+
+ this._mongos.push( conn );
+ if ( i == 0 ) this.s = conn
+ this["s" + i] = conn
+ }
+
+ var admin = this.admin = this.s.getDB( "admin" );
+ this.config = this.s.getDB( "config" );
+
+ if ( ! otherParams.manualAddShard ){
+ this._shardNames = []
+ var shardNames = this._shardNames
+ this._connections.forEach(
+ function(z){
+ var n = z.name;
+ if ( ! n ){
+ n = z.host;
+ if ( ! n )
+ n = z;
+ }
+ print( "ShardingTest " + this._testName + " going to add shard : " + n )
+                var x = admin.runCommand( { addshard : n } );
+ printjson( x )
+ shardNames.push( x.shardAdded )
+ z.shardName = x.shardAdded
+ }
+ );
+ }
+
+ if (jsTestOptions().keyFile && !keyFile) {
+ jsTest.addAuth(this._mongos[0]);
+ jsTest.authenticateNodes(this._connections);
+ jsTest.authenticateNodes(this._configServers);
+ jsTest.authenticateNodes(this._mongos);
+ }
+}
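+
+// Typical usage (illustrative; names and counts are hypothetical):
+//   var st = new ShardingTest( { name : "mytest", shards : 2, mongos : 1 } );
+//   st.adminCommand( { enablesharding : "test" } );
+//   // ... run the test against st.s ...
+//   st.stop();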
+
+ShardingTest.prototype.getRSEntry = function( setName ){
+ for ( var i=0; i<this._rs.length; i++ )
+ if ( this._rs[i].setName == setName )
+ return this._rs[i];
+ throw "can't find rs: " + setName;
+}
+
+ShardingTest.prototype.getConfigIndex = function( config ){
+
+ // Assume config is a # if not a conn object
+ if( ! isObject( config ) ) config = getHostName() + ":" + config
+
+ for( var i = 0; i < this._configServers.length; i++ ){
+ if( connectionURLTheSame( this._configServers[i], config ) ) return i
+ }
+
+ return -1
+}
+
+ShardingTest.prototype.getDB = function( name ){
+ return this.s.getDB( name );
+}
+
+ShardingTest.prototype.getServerName = function( dbname ){
+ var x = this.config.databases.findOne( { _id : "" + dbname } );
+ if ( x )
+ return x.primary;
+ this.config.databases.find().forEach( printjson );
+ throw "couldn't find dbname: " + dbname + " total: " + this.config.databases.count();
+}
+
+
+ShardingTest.prototype.getNonPrimaries = function( dbname ){
+ var x = this.config.databases.findOne( { _id : dbname } );
+ if ( ! x ){
+ this.config.databases.find().forEach( printjson );
+ throw "couldn't find dbname: " + dbname + " total: " + this.config.databases.count();
+ }
+
+ return this.config.shards.find( { _id : { $ne : x.primary } } ).map( function(z){ return z._id; } )
+}
+
+
+ShardingTest.prototype.getConnNames = function(){
+ var names = [];
+ for ( var i=0; i<this._connections.length; i++ ){
+ names.push( this._connections[i].name );
+ }
+ return names;
+}
+
+ShardingTest.prototype.getServer = function( dbname ){
+ var name = this.getServerName( dbname );
+
+ var x = this.config.shards.findOne( { _id : name } );
+ if ( x )
+ name = x.host;
+
+ var rsName = null;
+ if ( name.indexOf( "/" ) > 0 )
+ rsName = name.substring( 0 , name.indexOf( "/" ) );
+
+ for ( var i=0; i<this._connections.length; i++ ){
+ var c = this._connections[i];
+ if ( connectionURLTheSame( name , c.name ) ||
+ connectionURLTheSame( rsName , c.name ) )
+ return c;
+ }
+
+ throw "can't find server for: " + dbname + " name:" + name;
+
+}
+
+ShardingTest.prototype.normalize = function( x ){
+ var z = this.config.shards.findOne( { host : x } );
+ if ( z )
+ return z._id;
+ return x;
+}
+
+ShardingTest.prototype.getOther = function( one ){
+ if ( this._connections.length < 2 )
+ throw "getOther only works with 2 servers";
+
+ if ( one._mongo )
+ one = one._mongo
+
+ for( var i = 0; i < this._connections.length; i++ ){
+ if( this._connections[i] != one ) return this._connections[i]
+ }
+
+ return null
+}
+
+ShardingTest.prototype.getAnother = function( one ){
+ if(this._connections.length < 2)
+ throw "getAnother() only works with multiple servers";
+
+ if ( one._mongo )
+ one = one._mongo
+
+ for(var i = 0; i < this._connections.length; i++){
+ if(this._connections[i] == one)
+ return this._connections[(i + 1) % this._connections.length];
+ }
+}
+
+ShardingTest.prototype.getFirstOther = function( one ){
+ for ( var i=0; i<this._connections.length; i++ ){
+ if ( this._connections[i] != one )
+ return this._connections[i];
+ }
+ throw "impossible";
+}
+
+ShardingTest.prototype.stop = function(){
+ for ( var i=0; i<this._mongos.length; i++ ){
+ stopMongoProgram( 31000 - i - 1 );
+ }
+ for ( var i=0; i<this._connections.length; i++){
+ stopMongod( 30000 + i );
+ }
+ if ( this._rs ){
+ for ( var i=0; i<this._rs.length; i++ ){
+ if( this._rs[i] ) this._rs[i].test.stopSet( 15 );
+ }
+ }
+ if( this._otherParams.separateConfig ){
+ for ( var i=0; i<this._configServers.length; i++ ){
+ MongoRunner.stopMongod( this._configServers[i] )
+ }
+ }
+ if ( this._alldbpaths ){
+        for( var i=0; i<this._alldbpaths.length; i++ ){
+ resetDbpath( "/data/db/" + this._alldbpaths[i] );
+ }
+ }
+
+ print('*** ShardingTest ' + this._testName + " completed successfully ***");
+}
+
+ShardingTest.prototype.adminCommand = function(cmd){
+ var res = this.admin.runCommand( cmd );
+ if ( res && res.ok == 1 )
+ return true;
+
+ throw "command " + tojson( cmd ) + " failed: " + tojson( res );
+}
+
+ShardingTest.prototype._rangeToString = function(r){
+ return tojsononeline( r.min ) + " -> " + tojsononeline( r.max );
+}
+
+ShardingTest.prototype.printChangeLog = function(){
+ var s = this;
+ this.config.changelog.find().forEach(
+ function(z){
+ var msg = z.server + "\t" + z.time + "\t" + z.what;
+            for ( var i=z.what.length; i<15; i++ )
+ msg += " ";
+ msg += " " + z.ns + "\t";
+ if ( z.what == "split" ){
+ msg += s._rangeToString( z.details.before ) + " -->> (" + s._rangeToString( z.details.left ) + "),(" + s._rangeToString( z.details.right ) + ")";
+ }
+ else if (z.what == "multi-split" ){
+ msg += s._rangeToString( z.details.before ) + " -->> (" + z.details.number + "/" + z.details.of + " " + s._rangeToString( z.details.chunk ) + ")";
+ }
+ else {
+ msg += tojsononeline( z.details );
+ }
+
+ print( "ShardingTest " + msg )
+ }
+ );
+
+}
+
+ShardingTest.prototype.getChunksString = function( ns ){
+ var q = {}
+ if ( ns )
+ q.ns = ns;
+
+ var s = "";
+ this.config.chunks.find( q ).sort( { ns : 1 , min : 1 } ).forEach(
+ function(z){
+ s += " " + z._id + "\t" + z.lastmod.t + "|" + z.lastmod.i + "\t" + tojson(z.min) + " -> " + tojson(z.max) + " " + z.shard + " " + z.ns + "\n";
+ }
+ );
+
+ return s;
+}
+
+ShardingTest.prototype.printChunks = function( ns ){
+ print( "ShardingTest " + this.getChunksString( ns ) );
+}
+
+ShardingTest.prototype.printShardingStatus = function(){
+ printShardingStatus( this.config );
+}
+
+ShardingTest.prototype.printCollectionInfo = function( ns , msg ){
+ var out = "";
+ if ( msg )
+ out += msg + "\n";
+ out += "sharding collection info: " + ns + "\n";
+ for ( var i=0; i<this._connections.length; i++ ){
+ var c = this._connections[i];
+ out += " mongod " + c + " " + tojson( c.getCollection( ns ).getShardVersion() , " " , true ) + "\n";
+ }
+ for ( var i=0; i<this._mongos.length; i++ ){
+ var c = this._mongos[i];
+ out += " mongos " + c + " " + tojson( c.getCollection( ns ).getShardVersion() , " " , true ) + "\n";
+ }
+
+ out += this.getChunksString( ns );
+
+ print( "ShardingTest " + out );
+}
+
+printShardingStatus = function( configDB , verbose ){
+ if (configDB === undefined)
+ configDB = db.getSisterDB('config')
+
+ var version = configDB.getCollection( "version" ).findOne();
+ if ( version == null ){
+ print( "printShardingStatus: this db does not have sharding enabled. be sure you are connecting to a mongos from the shell and not to a mongod." );
+ return;
+ }
+
+ var raw = "";
+ var output = function(s){
+ raw += s + "\n";
+ }
+ output( "--- Sharding Status --- " );
+ output( " sharding version: " + tojson( configDB.getCollection( "version" ).findOne() ) );
+
+ output( " shards:" );
+ configDB.shards.find().sort( { _id : 1 } ).forEach(
+ function(z){
+ output( "\t" + tojsononeline( z ) );
+ }
+ );
+
+ output( " databases:" );
+ configDB.databases.find().sort( { name : 1 } ).forEach(
+ function(db){
+ output( "\t" + tojsononeline(db,"",true) );
+
+ if (db.partitioned){
+ configDB.collections.find( { _id : new RegExp( "^" + db._id + "\\." ) } ).sort( { _id : 1 } ).forEach(
+ function( coll ){
+ if ( coll.dropped == false ){
+ output("\t\t" + coll._id + " chunks:");
+
+                        var res = configDB.chunks.group( { cond : { ns : coll._id } , key : { shard : 1 } , reduce : function( doc , out ){ out.nChunks++; } , initial : { nChunks : 0 } } );
+ var totalChunks = 0;
+ res.forEach( function(z){
+ totalChunks += z.nChunks;
+ output( "\t\t\t\t" + z.shard + "\t" + z.nChunks );
+ } )
+
+ if ( totalChunks < 20 || verbose ){
+ configDB.chunks.find( { "ns" : coll._id } ).sort( { min : 1 } ).forEach(
+ function(chunk){
+ output( "\t\t\t" + tojson( chunk.min ) + " -->> " + tojson( chunk.max ) +
+ " on : " + chunk.shard + " " + tojson( chunk.lastmod ) + " " +
+ ( chunk.jumbo ? "jumbo " : "" ) );
+ }
+ );
+ }
+ else {
+ output( "\t\t\ttoo many chunks to print, use verbose if you want to force print" );
+ }
+ }
+ }
+ )
+ }
+ }
+ );
+
+ print( raw );
+}
+
+printShardingSizes = function(){
+    var configDB = db.getSisterDB('config')
+
+ var version = configDB.getCollection( "version" ).findOne();
+ if ( version == null ){
+ print( "printShardingSizes : not a shard db!" );
+ return;
+ }
+
+ var raw = "";
+ var output = function(s){
+ raw += s + "\n";
+ }
+ output( "--- Sharding Status --- " );
+ output( " sharding version: " + tojson( configDB.getCollection( "version" ).findOne() ) );
+
+ output( " shards:" );
+ var shards = {};
+ configDB.shards.find().forEach(
+ function(z){
+ shards[z._id] = new Mongo(z.host);
+ output( " " + tojson(z) );
+ }
+ );
+
+ var saveDB = db;
+ output( " databases:" );
+ configDB.databases.find().sort( { name : 1 } ).forEach(
+ function(db){
+ output( "\t" + tojson(db,"",true) );
+
+ if (db.partitioned){
+            configDB.collections.find( { _id : new RegExp( "^" + db._id + "\\." ) } ).sort( { _id : 1 } ).forEach(
+ function( coll ){
+ output("\t\t" + coll._id + " chunks:");
+ configDB.chunks.find( { "ns" : coll._id } ).sort( { min : 1 } ).forEach(
+ function(chunk){
+ var mydb = shards[chunk.shard].getDB(db._id)
+ var out = mydb.runCommand({dataSize: coll._id,
+ keyPattern: coll.key,
+ min: chunk.min,
+ max: chunk.max });
+ delete out.millis;
+ delete out.ok;
+
+ output( "\t\t\t" + tojson( chunk.min ) + " -->> " + tojson( chunk.max ) +
+ " on : " + chunk.shard + " " + tojson( out ) );
+
+ }
+ );
+ }
+ )
+ }
+ }
+ );
+
+ print( raw );
+}
+
+ShardingTest.prototype.sync = function(){
+ this.adminCommand( "connpoolsync" );
+}
+
+ShardingTest.prototype.onNumShards = function( collName , dbName ){
+ this.sync(); // we should sync since we're going directly to mongod here
+ dbName = dbName || "test";
+ var num=0;
+ for ( var i=0; i<this._connections.length; i++ )
+ if ( this._connections[i].getDB( dbName ).getCollection( collName ).count() > 0 )
+ num++;
+ return num;
+}
+
+
+ShardingTest.prototype.shardCounts = function( collName , dbName ){
+ this.sync(); // we should sync since we're going directly to mongod here
+ dbName = dbName || "test";
+ var counts = {}
+ for ( var i=0; i<this._connections.length; i++ )
+ counts[i] = this._connections[i].getDB( dbName ).getCollection( collName ).count();
+ return counts;
+}
+
+ShardingTest.prototype.chunkCounts = function( collName , dbName ){
+ dbName = dbName || "test";
+ var x = {}
+
+    this.config.shards.find().forEach(
+ function(z){
+ x[z._id] = 0;
+ }
+ );
+
+    this.config.chunks.find( { ns : dbName + "." + collName } ).forEach(
+ function(z){
+ if ( x[z.shard] )
+ x[z.shard]++
+ else
+ x[z.shard] = 1;
+ }
+ );
+ return x;
+
+}
+
+ShardingTest.prototype.chunkDiff = function( collName , dbName ){
+ var c = this.chunkCounts( collName , dbName );
+ var min = 100000000;
+ var max = 0;
+ for ( var s in c ){
+ if ( c[s] < min )
+ min = c[s];
+ if ( c[s] > max )
+ max = c[s];
+ }
+ print( "ShardingTest input: " + tojson( c ) + " min: " + min + " max: " + max );
+ return max - min;
+}
+
+ShardingTest.prototype.getShard = function( coll, query, includeEmpty ){
+ var shards = this.getShards( coll, query, includeEmpty )
+ assert.eq( shards.length, 1 )
+ return shards[0]
+}
+
+// Returns the shards on which documents matching a particular query reside
+ShardingTest.prototype.getShards = function( coll, query, includeEmpty ){
+ if( ! coll.getDB )
+ coll = this.s.getCollection( coll )
+
+ var explain = coll.find( query ).explain()
+ var shards = []
+
+ if( explain.shards ){
+
+ for( var shardName in explain.shards ){
+ for( var i = 0; i < explain.shards[shardName].length; i++ ){
+ if( includeEmpty || ( explain.shards[shardName][i].n && explain.shards[shardName][i].n > 0 ) )
+ shards.push( shardName )
+ }
+ }
+
+ }
+
+ for( var i = 0; i < shards.length; i++ ){
+ for( var j = 0; j < this._connections.length; j++ ){
+ if ( connectionURLTheSame( this._connections[j] , shards[i] ) ){
+ shards[i] = this._connections[j]
+ break;
+ }
+ }
+ }
+
+ return shards
+}
+
+ShardingTest.prototype.isSharded = function( collName ){
+
+    var dbName = undefined
+
+    // A DB object may be passed in place of a collection name
+    if( collName && typeof collName.getCollectionNames == 'function' ){
+        dbName = "" + collName
+        collName = undefined
+    }
+    else {
+        collName = "" + collName
+    }
+
+ if( dbName ){
+        var x = this.config.databases.findOne( { _id : dbName } )
+ if( x ) return x.partitioned
+ else return false
+ }
+
+ if( collName ){
+ var x = this.config.collections.findOne( { _id : collName } )
+ if( x ) return true
+ else return false
+ }
+
+}
+
+ShardingTest.prototype.shardGo = function( collName , key , split , move , dbName ){
+
+ split = ( split != false ? ( split || key ) : split )
+ move = ( split != false && move != false ? ( move || split ) : false )
+
+ if( collName.getDB )
+ dbName = "" + collName.getDB()
+ else dbName = dbName || "test";
+
+ var c = dbName + "." + collName;
+ if( collName.getDB )
+ c = "" + collName
+
+ var isEmpty = this.s.getCollection( c ).count() == 0
+
+ if( ! this.isSharded( dbName ) )
+ this.s.adminCommand( { enableSharding : dbName } )
+
+ var result = this.s.adminCommand( { shardcollection : c , key : key } )
+ if( ! result.ok ){
+ printjson( result )
+ assert( false )
+ }
+
+ if( split == false ) return
+
+ result = this.s.adminCommand( { split : c , middle : split } );
+ if( ! result.ok ){
+ printjson( result )
+ assert( false )
+ }
+
+ if( move == false ) return
+
+ var result = null
+ for( var i = 0; i < 5; i++ ){
+ result = this.s.adminCommand( { movechunk : c , find : move , to : this.getOther( this.getServer( dbName ) ).name } );
+ if( result.ok ) break;
+ sleep( 5 * 1000 );
+ }
+ printjson( result )
+ assert( result.ok )
+
+};
+
+ShardingTest.prototype.shardColl = ShardingTest.prototype.shardGo
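+
+// Example (illustrative): shard test.foo on { _id : 1 }, split at { _id : 0 },
+// and move the chunk containing { _id : 0 } to the other shard:
+//   st.shardColl( "foo", { _id : 1 }, { _id : 0 }, { _id : 0 } )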
+
+ShardingTest.prototype.setBalancer = function( balancer ){
+ if( balancer || balancer == undefined ){
+ this.config.settings.update( { _id: "balancer" }, { $set : { stopped: false } } , true )
+ }
+ else if( balancer == false ){
+ this.config.settings.update( { _id: "balancer" }, { $set : { stopped: true } } , true )
+ }
+}
+
+ShardingTest.prototype.stopBalancer = function( timeout, interval ) {
+ this.setBalancer( false )
+
+ if( typeof db == "undefined" ) db = undefined
+ var oldDB = db
+
+ db = this.config
+ sh.waitForBalancer( false, timeout, interval )
+ db = oldDB
+}
+
+ShardingTest.prototype.startBalancer = function( timeout, interval ) {
+ this.setBalancer( true )
+
+ if( typeof db == "undefined" ) db = undefined
+ var oldDB = db
+
+ db = this.config
+ sh.waitForBalancer( true, timeout, interval )
+ db = oldDB
+}
+
+/**
+ * Run a mongod process.
+ *
+ * After initializing a MongodRunner, you must call start() on it.
+ * @param {int} port port to run db on, use allocatePorts(num) to requisition
+ * @param {string} dbpath path to use
+ * @param {boolean} peer pass in false (DEPRECATED, was used for replica pair host)
+ * @param {boolean} arbiter pass in false (DEPRECATED, was used for replica pair host)
+ * @param {array} extraArgs other arguments for the command line
+ * @param {object} options other options; set no_bind to avoid binding to 127.0.0.1
+ *     (necessary for replica set testing)
+ */
+MongodRunner = function( port, dbpath, peer, arbiter, extraArgs, options ) {
+ this.port_ = port;
+ this.dbpath_ = dbpath;
+ this.peer_ = peer;
+ this.arbiter_ = arbiter;
+ this.extraArgs_ = extraArgs;
+ this.options_ = options ? options : {};
+};
+
+/**
+ * Start this mongod process.
+ *
+ * @param {boolean} reuseData If the data directory should be left intact (default is to wipe it)
+ */
+MongodRunner.prototype.start = function( reuseData ) {
+ var args = [];
+ if ( reuseData ) {
+ args.push( "mongod" );
+ }
+ args.push( "--port" );
+ args.push( this.port_ );
+ args.push( "--dbpath" );
+ args.push( this.dbpath_ );
+ args.push( "--nohttpinterface" );
+ args.push( "--noprealloc" );
+ args.push( "--smallfiles" );
+ if (!this.options_.no_bind) {
+ args.push( "--bind_ip" );
+ args.push( "127.0.0.1" );
+ }
+ if ( this.extraArgs_ ) {
+ args = args.concat( this.extraArgs_ );
+ }
+ removeFile( this.dbpath_ + "/mongod.lock" );
+ if ( reuseData ) {
+ return startMongoProgram.apply( null, args );
+ } else {
+ return startMongod.apply( null, args );
+ }
+}
+
+MongodRunner.prototype.port = function() { return this.port_; }
+
+MongodRunner.prototype.toString = function() { return [ this.port_, this.dbpath_, this.peer_, this.arbiter_ ].toString(); }
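+
+// Example (illustrative; the dbpath is hypothetical):
+//   var runner = new MongodRunner( allocatePorts( 1 )[ 0 ], "/data/db/example" );
+//   var conn = runner.start();  // wipes the dbpath unless start( true ) is used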
+
+ToolTest = function( name ){
+ this.name = name;
+ this.port = allocatePorts(1)[0];
+ this.baseName = "jstests_tool_" + name;
+ this.root = "/data/db/" + this.baseName;
+ this.dbpath = this.root + "/";
+ this.ext = this.root + "_external/";
+ this.extFile = this.root + "_external/a";
+ resetDbpath( this.dbpath );
+ resetDbpath( this.ext );
+}
+
+ToolTest.prototype.startDB = function( coll ){
+ assert( ! this.m , "db already running" );
+
+ this.m = startMongoProgram( "mongod" , "--port", this.port , "--dbpath" , this.dbpath , "--nohttpinterface", "--noprealloc" , "--smallfiles" , "--bind_ip", "127.0.0.1" );
+ this.db = this.m.getDB( this.baseName );
+ if ( coll )
+ return this.db.getCollection( coll );
+ return this.db;
+}
+
+ToolTest.prototype.stop = function(){
+ if ( ! this.m )
+ return;
+ stopMongod( this.port );
+ this.m = null;
+ this.db = null;
+
+ print('*** ' + this.name + " completed successfully ***");
+}
+
+ToolTest.prototype.runTool = function(){
+ var a = [ "mongo" + arguments[0] ];
+
+ var hasdbpath = false;
+
+ for ( var i=1; i<arguments.length; i++ ){
+ a.push( arguments[i] );
+ if ( arguments[i] == "--dbpath" )
+ hasdbpath = true;
+ }
+
+ if ( ! hasdbpath ){
+ a.push( "--host" );
+ a.push( "127.0.0.1:" + this.port );
+ }
+
+ return runMongoProgram.apply( null , a );
+}
+
+
+ReplTest = function( name, ports ){
+ this.name = name;
+ this.ports = ports || allocatePorts( 2 );
+}
+
+ReplTest.prototype.getPort = function( master ){
+ if ( master )
+ return this.ports[ 0 ];
+ return this.ports[ 1 ]
+}
+
+ReplTest.prototype.getPath = function( master ){
+ var p = "/data/db/" + this.name + "-";
+ if ( master )
+ p += "master";
+ else
+ p += "slave"
+ return p;
+}
+
+ReplTest.prototype.getOptions = function( master , extra , putBinaryFirst, norepl ){
+
+ if ( ! extra )
+ extra = {};
+
+ if ( ! extra.oplogSize )
+ extra.oplogSize = "40";
+
+ var a = []
+ if ( putBinaryFirst )
+ a.push( "mongod" )
+ a.push( "--nohttpinterface", "--noprealloc", "--bind_ip" , "127.0.0.1" , "--smallfiles" );
+
+ a.push( "--port" );
+ a.push( this.getPort( master ) );
+
+ a.push( "--dbpath" );
+ a.push( this.getPath( master ) );
+
+ if( jsTestOptions().noJournal ) a.push( "--nojournal" )
+ if( jsTestOptions().noJournalPrealloc ) a.push( "--nopreallocj" )
+ if( jsTestOptions().keyFile ) {
+ a.push( "--keyFile" )
+ a.push( jsTestOptions().keyFile )
+ }
+
+ if ( !norepl ) {
+ if ( master ){
+ a.push( "--master" );
+ }
+ else {
+ a.push( "--slave" );
+ a.push( "--source" );
+ a.push( "127.0.0.1:" + this.ports[0] );
+ }
+ }
+
+ for ( var k in extra ){
+ var v = extra[k];
+ if( k in MongoRunner.logicalOptions ) continue
+ a.push( "--" + k );
+ if ( v != null )
+ a.push( v );
+ }
+
+ return a;
+}
+
+ReplTest.prototype.start = function( master , options , restart, norepl ){
+ var lockFile = this.getPath( master ) + "/mongod.lock";
+ removeFile( lockFile );
+ var o = this.getOptions( master , options , restart, norepl );
+
+
+ if ( restart )
+ return startMongoProgram.apply( null , o );
+ else
+ return startMongod.apply( null , o );
+}
+
+ReplTest.prototype.stop = function( master , signal ){
+ if ( arguments.length == 0 ){
+ this.stop( true );
+ this.stop( false );
+ return;
+ }
+
+ print('*** ' + this.name + " completed successfully ***");
+ return stopMongod( this.getPort( master ) , signal || 15 );
+}
+
+allocatePorts = function( n , startPort ) {
+ var ret = [];
+ var start = startPort || 31000;
+ for( var i = start; i < start + n; ++i )
+ ret.push( i );
+ return ret;
+}
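+
+// For example, allocatePorts( 3 ) returns [ 31000, 31001, 31002 ]; the ports are
+// reserved only by convention here, not checked for actual availability.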
+
+
+SyncCCTest = function( testName , extraMongodOptions ){
+ this._testName = testName;
+ this._connections = [];
+
+ for ( var i=0; i<3; i++ ){
+ this._connections.push( startMongodTest( 30000 + i , testName + i , false, extraMongodOptions ) );
+ }
+
+ this.url = this._connections.map( function(z){ return z.name; } ).join( "," );
+ this.conn = new Mongo( this.url );
+}
+
+SyncCCTest.prototype.stop = function(){
+ for ( var i=0; i<this._connections.length; i++){
+ stopMongod( 30000 + i );
+ }
+
+ print('*** ' + this._testName + " completed successfully ***");
+}
+
+SyncCCTest.prototype.checkHashes = function( dbname , msg ){
+ var hashes = this._connections.map(
+ function(z){
+ return z.getDB( dbname ).runCommand( "dbhash" );
+ }
+ );
+
+ for ( var i=1; i<hashes.length; i++ ){
+ assert.eq( hashes[0].md5 , hashes[i].md5 , "checkHash on " + dbname + " " + msg + "\n" + tojson( hashes ) )
+ }
+}
+
+SyncCCTest.prototype.tempKill = function( num ){
+ num = num || 0;
+ stopMongod( 30000 + num );
+}
+
+SyncCCTest.prototype.tempStart = function( num ){
+ num = num || 0;
+ this._connections[num] = startMongodTest( 30000 + num , this._testName + num , true );
+}
+
+
+function startParallelShell( jsCode, port ){
+ var x;
+
+ var args = ["mongo"];
+ if (port) {
+ args.push("--port", port);
+ }
+
+ if (TestData) {
+ jsCode = "TestData = " + tojson(TestData) + ";jsTest.authenticate(db.getMongo());" + jsCode;
+ }
+
+ args.push("--eval", jsCode);
+
+ if (typeof db == "object") {
+ args.push(db.getMongo().host);
+ }
+
+ x = startMongoProgramNoConnect.apply(null, args);
+ return function(){
+ waitProgram( x );
+ };
+}
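+
+// Example (illustrative; the eval'd insert is hypothetical):
+//   var join = startParallelShell( "db.foo.insert( { x : 1 } );" );
+//   join();  // blocks until the parallel shell exits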
+
+var testingReplication = false;
+
+function skipIfTestingReplication(){
+ if (testingReplication) {
+ print("skipIfTestingReplication skipping");
+ quit(0);
+ }
+}
+
+ReplSetTest = function( opts ){
+ this.name = opts.name || "testReplSet";
+ this.host = opts.host || getHostName();
+ this.useHostName = opts.useHostName
+ this.numNodes = opts.nodes || 0;
+ this.oplogSize = opts.oplogSize || 40;
+ this.useSeedList = opts.useSeedList || false;
+ this.bridged = opts.bridged || false;
+ this.ports = [];
+ this.keyFile = opts.keyFile
+ this.shardSvr = opts.shardSvr || false;
+
+ this.startPort = opts.startPort || 31000;
+
+ this.nodeOptions = {}
+ if( isObject( this.numNodes ) ){
+ var len = 0
+ for( var i in this.numNodes ){
+ var options = this.nodeOptions[ "n" + len ] = this.numNodes[i]
+ if( i.startsWith( "a" ) ) options.arbiter = true
+ len++
+ }
+ this.numNodes = len
+ }
+ else if( Array.isArray( this.numNodes ) ){
+ for( var i = 0; i < this.numNodes.length; i++ )
+ this.nodeOptions[ "n" + i ] = this.numNodes[i]
+ this.numNodes = this.numNodes.length
+ }
+
+ if(this.bridged) {
+ this.bridgePorts = [];
+
+ var allPorts = allocatePorts( this.numNodes * 2 , this.startPort );
+ for(var i=0; i < this.numNodes; i++) {
+ this.ports[i] = allPorts[i*2];
+ this.bridgePorts[i] = allPorts[i*2 + 1];
+ }
+
+ this.initBridges();
+ }
+ else {
+ this.ports = allocatePorts( this.numNodes , this.startPort );
+ }
+
+ this.nodes = []
+ this.initLiveNodes()
+
+ Object.extend( this, ReplSetTest.Health )
+ Object.extend( this, ReplSetTest.State )
+
+}
+
+ReplSetTest.prototype.initBridges = function() {
+ for(var i=0; i<this.ports.length; i++) {
+ startMongoProgram( "mongobridge", "--port", this.bridgePorts[i], "--dest", this.host + ":" + this.ports[i] );
+ }
+}
+
+// List of nodes as host:port strings.
+ReplSetTest.prototype.nodeList = function() {
+ var list = [];
+ for(var i=0; i<this.ports.length; i++) {
+ list.push( this.host + ":" + this.ports[i]);
+ }
+
+ return list;
+}
+
+// Here we store a reference to all reachable nodes.
+ReplSetTest.prototype.initLiveNodes = function() {
+ this.liveNodes = { master: null, slaves: [] }
+}
+
+ReplSetTest.prototype.getNodeId = function(node) {
+
+ if( node.toFixed ) return parseInt( node )
+
+ for( var i = 0; i < this.nodes.length; i++ ){
+ if( this.nodes[i] == node ) return i
+ }
+
+ if( node instanceof ObjectId ){
+ for( var i = 0; i < this.nodes.length; i++ ){
+ if( this.nodes[i].runId == node ) return i
+ }
+ }
+
+ if( node.nodeId ) return parseInt( node.nodeId )
+
+ return undefined
+
+}
+
+ReplSetTest.prototype.getPort = function( n ){
+
+ n = this.getNodeId( n )
+
+ print( "ReplSetTest n: " + n + " ports: " + tojson( this.ports ) + "\t" + this.ports[n] + " " + typeof(n) );
+ return this.ports[ n ];
+}
+
+ReplSetTest.prototype.getPath = function( n ){
+
+ if( n.host )
+ n = this.getNodeId( n )
+
+ var p = "/data/db/" + this.name + "-"+n;
+ if ( ! this._alldbpaths )
+ this._alldbpaths = [ p ];
+ else
+ this._alldbpaths.push( p );
+ return p;
+}
+
+ReplSetTest.prototype.getReplSetConfig = function() {
+ var cfg = {};
+
+ cfg['_id'] = this.name;
+ cfg.members = [];
+
+    for(var i=0; i<this.ports.length; i++) {
+        var member = {};
+ member['_id'] = i;
+
+ if(this.bridged)
+ var port = this.bridgePorts[i];
+ else
+ var port = this.ports[i];
+
+ member['host'] = this.host + ":" + port;
+ if( this.nodeOptions[ "n" + i ] && this.nodeOptions[ "n" + i ].arbiter )
+ member['arbiterOnly'] = true
+
+ cfg.members.push(member);
+ }
+
+ return cfg;
+}
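+
+// Produces a document of the form (illustrative, for a two-node unbridged set):
+//   { _id : "testReplSet", members : [ { _id : 0, host : "<host>:31000" },
+//                                      { _id : 1, host : "<host>:31001" } ] }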
+
+ReplSetTest.prototype.getURL = function(){
+ var hosts = [];
+
+    for(var i=0; i<this.ports.length; i++) {
+
+ var port;
+ // Connect on the right port
+ if(this.bridged) {
+ port = this.bridgePorts[i];
+ }
+ else {
+ port = this.ports[i];
+ }
+
+ var str = this.host + ":" + port;
+ hosts.push(str);
+ }
+
+ return this.name + "/" + hosts.join(",");
+}
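+
+// e.g. a three-node set named "demo" on host "h" yields the seed-list URL
+// "demo/h:31000,h:31001,h:31002" (assuming default port allocation).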
+
+ReplSetTest.prototype.getOptions = function( n , extra , putBinaryFirst ){
+
+ if ( ! extra )
+ extra = {};
+
+ if ( ! extra.oplogSize )
+ extra.oplogSize = this.oplogSize;
+
+ var a = []
+
+
+ if ( putBinaryFirst )
+ a.push( "mongod" );
+
+ if ( extra.noReplSet ) {
+ delete extra.noReplSet;
+ }
+ else {
+ a.push( "--replSet" );
+
+ if( this.useSeedList ) {
+ a.push( this.getURL() );
+ }
+ else {
+ a.push( this.name );
+ }
+ }
+
+ a.push( "--noprealloc", "--smallfiles" );
+
+ a.push( "--rest" );
+
+ a.push( "--port" );
+ a.push( this.getPort( n ) );
+
+ a.push( "--dbpath" );
+ a.push( this.getPath( ( n.host ? this.getNodeId( n ) : n ) ) );
+
+ if( this.keyFile ){
+ a.push( "--keyFile" )
+ a.push( this.keyFile )
+ }
+
+ if( jsTestOptions().noJournal ) a.push( "--nojournal" )
+ if( jsTestOptions().noJournalPrealloc ) a.push( "--nopreallocj" )
+ if( jsTestOptions().keyFile && !this.keyFile) {
+ a.push( "--keyFile" )
+ a.push( jsTestOptions().keyFile )
+ }
+
+ for ( var k in extra ){
+ var v = extra[k];
+ if( k in MongoRunner.logicalOptions ) continue
+ a.push( "--" + k );
+ if ( v != null ){
+ if( v.replace ){
+ v = v.replace(/\$node/g, "" + ( n.host ? this.getNodeId( n ) : n ) )
+ v = v.replace(/\$set/g, this.name )
+ v = v.replace(/\$path/g, this.getPath( n ) )
+ }
+ a.push( v );
+ }
+ }
+
+ return a;
+}
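+
+// e.g. getOptions(0) for an unbridged set named "demo" yields roughly:
+// [ "--replSet", "demo", "--noprealloc", "--smallfiles", "--rest",
+// "--port", 31000, "--dbpath", "/data/db/demo-0" ]
+// plus whatever jsTestOptions() and the extra document contribute.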
+
+ReplSetTest.prototype.startSet = function( options ) {
+
+ var nodes = [];
+ print( "ReplSetTest Starting Set" );
+
+ for( var n = 0 ; n < this.ports.length; n++ ) {
+ var node = this.start(n, options)
+ nodes.push(node);
+ }
+
+ this.nodes = nodes;
+ return this.nodes;
+}
+
+ReplSetTest.prototype.callIsMaster = function() {
+
+ var master = null;
+ this.initLiveNodes();
+
+ for(var i=0; i<this.nodes.length; i++) {
+
+ try {
+ var n = this.nodes[i].getDB('admin').runCommand({ismaster:1});
+
+ if(n['ismaster'] == true) {
+ master = this.nodes[i]
+ this.liveNodes.master = master
+ }
+ else {
+ this.nodes[i].setSlaveOk();
+ this.liveNodes.slaves.push(this.nodes[i]);
+ }
+
+ }
+ catch(err) {
+ print("ReplSetTest Could not call ismaster on node " + i);
+ }
+ }
+
+ return master || false;
+}
+
+ReplSetTest.awaitRSClientHosts = function( conn, host, hostOk, rs ) {
+
+ if( host.length ){
+ for( var i = 0; i < host.length; i++ ) ReplSetTest.awaitRSClientHosts( conn, host[i], hostOk, rs )
+ return
+ }
+
+ if( hostOk == undefined ) hostOk = { ok : true }
+ if( host.host ) host = host.host
+ if( rs && rs.getMaster ) rs = rs.name
+
+ print( "Awaiting " + host + " to be " + tojson( hostOk ) + " for " + conn + " (rs: " + rs + ")" )
+
+ var tests = 0
+ assert.soon( function() {
+ var rsClientHosts = conn.getDB( "admin" ).runCommand( "connPoolStats" )[ "replicaSets" ]
+ if( tests++ % 10 == 0 )
+ printjson( rsClientHosts )
+
+ for ( var rsName in rsClientHosts ){
+ if( rs && rs != rsName ) continue
+ for ( var i = 0; i < rsClientHosts[rsName].hosts.length; i++ ){
+ var clientHost = rsClientHosts[rsName].hosts[ i ];
+ if( clientHost.addr != host ) continue
+
+ // Check that *all* host properties are set correctly
+ var propOk = true
+ for( var prop in hostOk ){
+ if( clientHost[prop] != hostOk[prop] ){
+ propOk = false
+ break
+ }
+ }
+
+ if( propOk ) return true;
+
+ }
+ }
+ return false;
+ }, "timed out waiting for replica set client to recognize hosts",
+ 3 * 20 * 1000 /* ReplicaSetMonitorWatcher updates every 20s */ )
+
+}
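+
+// Typical call (sketch): block until a connection's replica set monitor sees
+// the given host as up and master:
+// ReplSetTest.awaitRSClientHosts( mongosConn, { host: "h:31000" },
+// { ok: true, ismaster: true } )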
+
+ReplSetTest.prototype.awaitSecondaryNodes = function( timeout ) {
+ var master = this.getMaster();
+ var slaves = this.liveNodes.slaves;
+ var len = slaves.length;
+
+ jsTest.attempt({context: this, timeout: timeout || 60000, desc: "Awaiting secondaries"}, function() {
+ var ready = true;
+ for(var i=0; i<len; i++) {
+ ready = ready && slaves[i].getDB("admin").runCommand({ismaster: 1})['secondary'];
+ }
+
+ return ready;
+ });
+}
+
+ReplSetTest.prototype.getMaster = function( timeout ) {
+ return jsTest.attempt({context: this, timeout: timeout || 60000, desc: "Finding master"}, this.callIsMaster);
+}
+
+ReplSetTest.prototype.getPrimary = ReplSetTest.prototype.getMaster
+
+ReplSetTest.prototype.getSecondaries = function( timeout ){
+ var master = this.getMaster( timeout )
+ var secs = []
+ for( var i = 0; i < this.nodes.length; i++ ){
+ if( this.nodes[i] != master ){
+ secs.push( this.nodes[i] )
+ }
+ }
+ return secs
+}
+
+ReplSetTest.prototype.getSecondary = function( timeout ){
+ return this.getSecondaries( timeout )[0];
+}
+
+ReplSetTest.prototype.status = function( timeout ){
+ var master = this.callIsMaster()
+ if( ! master ) master = this.liveNodes.slaves[0]
+ return master.getDB("admin").runCommand({replSetGetStatus: 1})
+}
+
+// Add a node to the test set
+ReplSetTest.prototype.add = function( config ) {
+ if(this.ports.length == 0) {
+ var nextPort = allocatePorts( 1, this.startPort )[0];
+ }
+ else {
+ var nextPort = this.ports[this.ports.length-1] + 1;
+ }
+ print("ReplSetTest Next port: " + nextPort);
+ this.ports.push(nextPort);
+ printjson(this.ports);
+
+ var nextId = this.nodes.length;
+ printjson(this.nodes);
+ print("ReplSetTest nextId:" + nextId);
+ var newNode = this.start( nextId );
+
+ return newNode;
+}
+
+ReplSetTest.prototype.remove = function( nodeId ) {
+ nodeId = this.getNodeId( nodeId )
+ this.nodes.splice( nodeId, 1 );
+ this.ports.splice( nodeId, 1 );
+}
+
+ReplSetTest.prototype.initiate = function( cfg , initCmd , timeout ) {
+ var master = this.nodes[0].getDB("admin");
+ var config = cfg || this.getReplSetConfig();
+ var cmd = {};
+ var cmdKey = initCmd || 'replSetInitiate';
+ var timeout = timeout || 30000;
+ cmd[cmdKey] = config;
+ printjson(cmd);
+
+ jsTest.attempt({context:this, timeout: timeout, desc: "Initiate replica set"}, function() {
+ var result = master.runCommand(cmd);
+ printjson(result);
+ return result['ok'] == 1;
+ });
+
+ // Setup authentication if running test with authentication
+ if (jsTestOptions().keyFile && !this.keyFile) {
+ if (!this.shardSvr) {
+ master = this.getMaster();
+ jsTest.addAuth(master);
+ }
+ jsTest.authenticateNodes(this.nodes);
+ }
+}
+
+ReplSetTest.prototype.reInitiate = function() {
+ var master = this.nodes[0];
+ var c = master.getDB("local")['system.replset'].findOne();
+ var config = this.getReplSetConfig();
+ config.version = c.version + 1;
+ this.initiate( config , 'replSetReconfig' );
+}
+
+ReplSetTest.prototype.getLastOpTimeWritten = function() {
+ this.getMaster();
+ jsTest.attempt({context : this, desc : "awaiting oplog query"},
+ function() {
+ try {
+ this.latest = this.liveNodes.master.getDB("local")['oplog.rs'].find({}).sort({'$natural': -1}).limit(1).next()['ts'];
+ }
+ catch(e) {
+ print("ReplSetTest caught exception " + e);
+ return false;
+ }
+ return true;
+ });
+};
+
+ReplSetTest.prototype.awaitReplication = function(timeout) {
+ timeout = timeout || 30000;
+
+ this.getLastOpTimeWritten();
+
+ print("ReplSetTest " + this.latest);
+
+ jsTest.attempt({context: this, timeout: timeout, desc: "awaiting replication"},
+ function() {
+ try {
+ var synced = true;
+ for(var i=0; i<this.liveNodes.slaves.length; i++) {
+ var slave = this.liveNodes.slaves[i];
+
+ // Skip this slave if it's an arbiter (myState 7)
+ var res = slave.getDB("admin").runCommand({replSetGetStatus: 1});
+ if(res && res.myState == 7) {
+ continue;
+ }
+
+ slave.getDB("admin").getMongo().setSlaveOk();
+ var log = slave.getDB("local")['oplog.rs'];
+ if(log.find({}).sort({'$natural': -1}).limit(1).hasNext()) {
+ var entry = log.find({}).sort({'$natural': -1}).limit(1).next();
+ printjson( entry );
+ var ts = entry['ts'];
+ print("ReplSetTest await TS for " + slave + " is " + ts.t+":"+ts.i + " and latest is " + this.latest.t+":"+this.latest.i);
+
+ if (this.latest.t < ts.t || (this.latest.t == ts.t && this.latest.i < ts.i)) {
+ this.latest = this.liveNodes.master.getDB("local")['oplog.rs'].find({}).sort({'$natural': -1}).limit(1).next()['ts'];
+ }
+
+ print("ReplSetTest await oplog size for " + slave + " is " + log.count());
+ synced = (synced && friendlyEqual(this.latest,ts))
+ }
+ else {
+ synced = false;
+ }
+ }
+
+ if(synced) {
+ print("ReplSetTest await synced=" + synced);
+ }
+ return synced;
+ }
+ catch (e) {
+ print("ReplSetTest.awaitReplication: caught exception "+e);
+
+ // we might have a new master now
+ this.getLastOpTimeWritten();
+
+ return false;
+ }
+ });
+}
+
+ReplSetTest.prototype.getHashes = function( db ){
+ this.getMaster();
+ var res = {};
+ res.master = this.liveNodes.master.getDB( db ).runCommand( "dbhash" )
+ res.slaves = this.liveNodes.slaves.map( function(z){ return z.getDB( db ).runCommand( "dbhash" ); } )
+ return res;
+}
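+
+// Consistency-check sketch built on getHashes (md5 is the top-level field
+// returned by the dbhash command):
+// var h = rt.getHashes( "test" );
+// h.slaves.forEach( function(s){ assert.eq( h.master.md5, s.md5 ); } );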
+
+/**
+ * Starts up a server. Options are saved by default for subsequent starts.
+ *
+ * Options { remember : true } re-applies the saved options from a prior start.
+ * Options { noRemember : true } prevents the current options from being saved.
+ * Options { appendOptions : true } appends the current options to those remembered.
+ * Options { startClean : true } clears the data directory before starting.
+ *
+ * @param {int|conn|[int|conn]} n array or single server number (0, 1, 2, ...) or conn
+ * @param {object} [options]
+ * @param {boolean} [restart] If false, the data directory will be cleared
+ * before the server starts. Defaults to false.
+ */
+ReplSetTest.prototype.start = function( n , options , restart , wait ){
+
+ if( n.length ){
+
+ var nodes = n
+ var started = []
+
+ for( var i = 0; i < nodes.length; i++ ){
+ if( this.start( nodes[i], Object.merge({}, options), restart, wait ) ){
+ started.push( nodes[i] )
+ }
+ }
+
+ return started
+
+ }
+
+ print( "ReplSetTest n is : " + n )
+
+ var defaults = { useHostName : this.useHostName,
+ oplogSize : this.oplogSize,
+ keyFile : this.keyFile,
+ port : this.getPort( n ),
+ noprealloc : "",
+ smallfiles : "",
+ rest : "",
+ replSet : this.useSeedList ? this.getURL() : this.name,
+ dbpath : "$set-$node" }
+
+ // TODO : should we do something special if we don't currently know about this node?
+ n = this.getNodeId( n )
+
+ options = Object.merge( defaults, options )
+ options = Object.merge( options, this.nodeOptions[ "n" + n ] )
+
+ options.restart = restart
+
+ var pathOpts = { node : n, set : this.name }
+ options.pathOpts = Object.merge( options.pathOpts || {}, pathOpts )
+
+ if( tojson(options) != tojson({}) )
+ printjson(options)
+
+ // make sure to call getPath, otherwise folders won't be cleaned
+ this.getPath(n);
+
+ print("ReplSetTest " + (restart ? "(Re)" : "") + "Starting....");
+
+ var rval = this.nodes[n] = MongoRunner.runMongod( options )
+
+ if( ! rval ) return rval
+
+ // Add replica set specific attributes
+ this.nodes[n].nodeId = n
+
+ printjson( this.nodes )
+
+ wait = wait || false
+ if( ! wait.toFixed ){
+ if( wait ) wait = 0
+ else wait = -1
+ }
+
+ if( wait < 0 ) return rval
+
+ // Wait for startup
+ this.waitForHealth( rval, this.UP, wait )
+
+ return rval
+
+}
+
+
+/**
+ * Restarts a db without clearing the data directory by default. If the server is not
+ * stopped first, this function will not work.
+ *
+ * Option { startClean : true } forces clearing the data directory.
+ *
+ * @param {int|conn|[int|conn]} n array or single server number (0, 1, 2, ...) or conn
+ */
+ReplSetTest.prototype.restart = function( n , options, signal, wait ){
+ // Can specify wait as third parameter, if using default signal
+ if( signal == true || signal == false ){
+ wait = signal
+ signal = undefined
+ }
+
+ this.stop( n, signal, wait && wait.toFixed ? wait : true )
+ var started = this.start( n , options , true, wait );
+
+ if (jsTestOptions().keyFile && !this.keyFile) {
+ if (started.length) {
+ // if n was an array of conns, start will return an array of connections
+ for (var i = 0; i < started.length; i++) {
+ jsTest.authenticate(started[i]);
+ }
+ } else {
+ jsTest.authenticate(started);
+ }
+ }
+ return started;
+}
+
+ReplSetTest.prototype.stopMaster = function( signal , wait ) {
+ var master = this.getMaster();
+ var master_id = this.getNodeId( master );
+ return this.stop( master_id , signal , wait );
+}
+
+// Stops a particular node or nodes, specified by conn or id
+ReplSetTest.prototype.stop = function( n , signal, wait /* wait for stop */ ){
+
+ // Flatten array of nodes to stop
+ if( n.length ){
+ var nodes = n
+
+ var stopped = []
+ for( var i = 0; i < nodes.length; i++ ){
+ if( this.stop( nodes[i], signal, wait ) )
+ stopped.push( nodes[i] )
+ }
+
+ return stopped
+ }
+
+ // Can specify wait as second parameter, if using default signal
+ if( signal == true || signal == false ){
+ wait = signal
+ signal = undefined
+ }
+
+ wait = wait || false
+ if( ! wait.toFixed ){
+ if( wait ) wait = 0
+ else wait = -1
+ }
+
+ var port = this.getPort( n );
+ print('ReplSetTest stop *** Shutting down mongod on port ' + port + ' ***');
+ var ret = MongoRunner.stopMongod( port , signal );
+
+ if( ! ret || wait < 0 ) return ret
+
+ // Wait for shutdown
+ this.waitForHealth( n, this.DOWN, wait )
+
+ return true
+}
+
+
+ReplSetTest.prototype.stopSet = function( signal , forRestart ) {
+ for(i=0; i < this.ports.length; i++) {
+ this.stop( i, signal );
+ }
+ if ( ! forRestart && this._alldbpaths ){
+ print("ReplSetTest stopSet deleting all dbpaths");
+ for( i=0; i<this._alldbpaths.length; i++ ){
+ resetDbpath( this._alldbpaths[i] );
+ }
+ }
+
+ print('ReplSetTest stopSet *** Shut down repl set - test worked ****' )
+};
+
+
+/**
+ * Waits until there is a master node
+ */
+ReplSetTest.prototype.waitForMaster = function( timeout ){
+
+ var master = undefined
+
+ jsTest.attempt({context: this, timeout: timeout, desc: "waiting for master"}, function() {
+ return ( master = this.getMaster() )
+ });
+
+ return master
+}
+
+
+/**
+ * Wait for a health indicator to go to a particular state or states.
+ *
+ * @param node is a single node or list of nodes, by id or conn
+ * @param state is a single state or list of states
+ *
+ */
+ReplSetTest.prototype.waitForHealth = function( node, state, timeout ){
+ this.waitForIndicator( node, state, "health", timeout )
+}
+
+/**
+ * Wait for a state indicator to go to a particular state or states.
+ *
+ * @param node is a single node or list of nodes, by id or conn
+ * @param state is a single state or list of states
+ *
+ */
+ReplSetTest.prototype.waitForState = function( node, state, timeout ){
+ this.waitForIndicator( node, state, "state", timeout )
+}
+
+/**
+ * Wait for a rs indicator to go to a particular state or states.
+ *
+ * @param node is a single node or list of nodes, by id or conn
+ * @param states is a single state or list of states
+ * @param ind is the indicator specified
+ *
+ */
+ReplSetTest.prototype.waitForIndicator = function( node, states, ind, timeout ){
+
+ if( node.length ){
+
+ var nodes = node
+ for( var i = 0; i < nodes.length; i++ ){
+ if( states.length )
+ this.waitForIndicator( nodes[i], states[i], ind, timeout )
+ else
+ this.waitForIndicator( nodes[i], states, ind, timeout )
+ }
+
+ return;
+ }
+
+ timeout = timeout || 30000;
+
+ if( ! node.getDB ){
+ node = this.nodes[node]
+ }
+
+ if( ! states.length ) states = [ states ]
+
+ print( "ReplSetTest waitForIndicator " + ind + " on " + node )
+ printjson( states )
+ print( "ReplSetTest waitForIndicator from node " + node )
+
+ var lastTime = null
+ var currTime = new Date().getTime()
+ var status = undefined
+
+ jsTest.attempt({context: this, timeout: timeout, desc: "waiting for state indicator " + ind + " for " + timeout + "ms" }, function() {
+
+ status = this.status()
+
+ var printStatus = false
+ if( lastTime == null || ( currTime = new Date().getTime() ) - (1000 * 5) > lastTime ){
+ if( lastTime == null ) print( "ReplSetTest waitForIndicator Initial status ( timeout : " + timeout + " ) :" )
+ printjson( status )
+ lastTime = new Date().getTime()
+ printStatus = true
+ }
+
+ if (typeof status.members == 'undefined') {
+ return false;
+ }
+
+ for( var i = 0; i < status.members.length; i++ ){
+ if( printStatus ) print( "Status for : " + status.members[i].name + ", checking " + node.host + "/" + node.name )
+ if( status.members[i].name == node.host || status.members[i].name == node.name ){
+ for( var j = 0; j < states.length; j++ ){
+ if( printStatus ) print( "Status " + " : " + status.members[i][ind] + " target state : " + states[j] )
+ if( status.members[i][ind] == states[j] ) return true;
+ }
+ }
+ }
+
+ return false
+
+ });
+
+ print( "ReplSetTest waitForIndicator final status:" )
+ printjson( status )
+
+}
+
+ReplSetTest.Health = {}
+ReplSetTest.Health.UP = 1
+ReplSetTest.Health.DOWN = 0
+
+ReplSetTest.State = {}
+ReplSetTest.State.PRIMARY = 1
+ReplSetTest.State.SECONDARY = 2
+ReplSetTest.State.RECOVERING = 3
+
+/**
+ * Overflows a replica set secondary or secondaries, specified by id or conn.
+ */
+ReplSetTest.prototype.overflow = function( secondaries ){
+
+ // Create a new collection to overflow, allow secondaries to replicate
+ var master = this.getMaster()
+ var overflowColl = master.getCollection( "_overflow.coll" )
+ overflowColl.insert({ replicated : "value" })
+ this.awaitReplication()
+
+ this.stop( secondaries, undefined, 5 * 60 * 1000 )
+
+ var count = master.getDB("local").oplog.rs.count();
+ var prevCount = -1;
+
+ // Keep inserting till we hit our capped coll limits
+ while (count != prevCount) {
+
+ print("ReplSetTest overflow inserting 10000");
+
+ for (var i = 0; i < 10000; i++) {
+ overflowColl.insert({ overflow : "value" });
+ }
+ prevCount = count;
+ this.awaitReplication();
+
+ count = master.getDB("local").oplog.rs.count();
+
+ print( "ReplSetTest overflow count : " + count + " prev : " + prevCount );
+
+ }
+
+ // Restart all our secondaries and wait for recovery state
+ this.start( secondaries, { remember : true }, true, true )
+ this.waitForState( secondaries, this.RECOVERING, 5 * 60 * 1000 )
+
+}
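+
+// e.g. rt.overflow( rt.getSecondary() ) rolls the primary's oplog past the
+// point the stopped secondary can resync from, leaving it in RECOVERING.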
+
+
+
+
+/**
+ * Bridging allows you to test network partitioning. For example, you can set
+ * up a replica set, run bridge(), then kill the connection between any two
+ * nodes x and y with partition(x, y).
+ *
+ * Once you have called bridging, you cannot reconfigure the replica set.
+ */
+ReplSetTest.prototype.bridge = function() {
+ if (this.bridges) {
+ print("ReplSetTest bridge bridges have already been created!");
+ return;
+ }
+
+ var n = this.nodes.length;
+
+ // create bridges
+ this.bridges = [];
+ for (var i=0; i<n; i++) {
+ var nodeBridges = [];
+ for (var j=0; j<n; j++) {
+ if (i == j) {
+ continue;
+ }
+ nodeBridges[j] = new ReplSetBridge(this, i, j);
+ }
+ this.bridges.push(nodeBridges);
+ }
+ print("ReplSetTest bridge bridges: " + this.bridges);
+
+ // restart everyone independently
+ this.stopSet(null, true);
+ for (var i=0; i<n; i++) {
+ this.restart(i, {noReplSet : true});
+ }
+
+ // create new configs
+ for (var i=0; i<n; i++) {
+ var config = this.nodes[i].getDB("local").system.replset.findOne();
+
+ if (!config) {
+ print("ReplSetTest bridge couldn't find config for "+this.nodes[i]);
+ printjson(this.nodes[i].getDB("local").system.namespaces.find().toArray());
+ assert(false);
+ }
+
+ var updateMod = {"$set" : {}};
+ for (var j = 0; j<config.members.length; j++) {
+ if (config.members[j].host == this.host+":"+this.ports[i]) {
+ continue;
+ }
+
+ updateMod['$set']["members."+j+".host"] = this.bridges[i][j].host;
+ }
+ print("ReplSetTest bridge for node " + i + ":");
+ printjson(updateMod);
+ this.nodes[i].getDB("local").system.replset.update({},updateMod);
+ }
+
+ this.stopSet(null, true);
+
+ // start set
+ for (var i=0; i<n; i++) {
+ this.restart(i);
+ }
+
+ return this.getMaster();
+};
+
+/**
+ * This kills the bridge between two nodes. As parameters, specify the from and
+ * to node numbers.
+ *
+ * For example, with a three-member replica set, we'd have nodes 0, 1, and 2,
+ * with the following bridges: 0->1, 0->2, 1->0, 1->2, 2->0, 2->1. We can kill
+ * the connection between nodes 0 and 2 by calling replTest.partition(0,2) or
+ * replTest.partition(2,0) (either way is identical). Then the replica set would
+ * have the following bridges: 0->1, 1->0, 1->2, 2->1.
+ */
+ReplSetTest.prototype.partition = function(from, to) {
+ this.bridges[from][to].stop();
+ this.bridges[to][from].stop();
+};
+
+/**
+ * This reverses a partition created by partition() above.
+ */
+ReplSetTest.prototype.unPartition = function(from, to) {
+ this.bridges[from][to].start();
+ this.bridges[to][from].start();
+};
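+
+// Partition test sketch (node indexes are illustrative):
+// var rt = new ReplSetTest({ name: "bridgeDemo", nodes: 3 });
+// rt.startSet(); rt.initiate();
+// rt.bridge(); // reroute all inter-node traffic through mongobridge
+// rt.partition(0, 2); // sever traffic between nodes 0 and 2
+// // ... exercise failover behavior ...
+// rt.unPartition(0, 2); // heal the partition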
+
+ReplSetBridge = function(rst, from, to) {
+ var n = rst.nodes.length;
+
+ var startPort = rst.startPort+n;
+ this.port = (startPort+(from*n+to));
+ this.host = rst.host+":"+this.port;
+
+ this.dest = rst.host+":"+rst.ports[to];
+ this.start();
+};
+
+ReplSetBridge.prototype.start = function() {
+ var args = ["mongobridge", "--port", this.port, "--dest", this.dest];
+ print("ReplSetBridge starting: "+tojson(args));
+ this.bridge = startMongoProgram.apply( null , args );
+ print("ReplSetBridge started " + this.bridge);
+};
+
+ReplSetBridge.prototype.stop = function() {
+ print("ReplSetBridge stopping: " + this.port);
+ stopMongod(this.port);
+};
+
+ReplSetBridge.prototype.toString = function() {
+ return this.host+" -> "+this.dest;
+};
diff --git a/src/mongo/shell/shell_utils.cpp b/src/mongo/shell/shell_utils.cpp
new file mode 100644
index 00000000000..f3283ab0ca1
--- /dev/null
+++ b/src/mongo/shell/shell_utils.cpp
@@ -0,0 +1,985 @@
+// shell_utils.cpp
+/*
+ * Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+
+#include <boost/thread/xtime.hpp>
+
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <assert.h>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <vector>
+#include <fcntl.h>
+
+#ifdef _WIN32
+# include <io.h>
+# define SIGKILL 9
+#else
+# include <sys/socket.h>
+# include <netinet/in.h>
+# include <signal.h>
+# include <sys/stat.h>
+# include <sys/wait.h>
+#endif
+
+#include "utils.h"
+#include "../client/dbclient.h"
+#include "../util/md5.hpp"
+#include "../util/processinfo.h"
+#include "../util/text.h"
+#include "../util/heapcheck.h"
+#include "../util/time_support.h"
+#include "../util/file.h"
+
+namespace mongo {
+
+ DBClientWithCommands *latestConn = 0;
+ extern bool dbexitCalled;
+
+#ifdef _WIN32
+ inline int close(int fd) { return _close(fd); }
+ inline int read(int fd, void* buf, size_t size) { return _read(fd, buf, size); }
+ inline int pipe(int fds[2]) { return _pipe(fds, 4096, _O_TEXT | _O_NOINHERIT); }
+#endif
+
+ namespace JSFiles {
+ extern const JSFile servers;
+ }
+
+ // these functions have not been audited for thread safety - currently they are called with an exclusive js mutex
+ namespace shellUtils {
+
+ Scope* theScope = 0;
+
+ std::string _dbConnect;
+ std::string _dbAuth;
+
+ const char *argv0 = 0;
+ void RecordMyLocation( const char *_argv0 ) { argv0 = _argv0; }
+
+ // helpers
+
+ BSONObj makeUndefined() {
+ BSONObjBuilder b;
+ b.appendUndefined( "" );
+ return b.obj();
+ }
+ const BSONObj undefined_ = makeUndefined();
+
+ BSONObj encapsulate( const BSONObj &obj ) {
+ return BSON( "" << obj );
+ }
+
+ // real methods
+
+ void goingAwaySoon();
+ BSONObj Quit(const BSONObj& args, void* data) {
+ // If no arguments are given, the first element is EOO, which
+ // converts to the integer value 0.
+ goingAwaySoon();
+ int exit_code = int( args.firstElement().number() );
+ ::exit(exit_code);
+ return undefined_;
+ }
+
+ BSONObj JSGetMemInfo( const BSONObj& args, void* data ) {
+ ProcessInfo pi;
+ uassert( 10258 , "processinfo not supported" , pi.supported() );
+
+ BSONObjBuilder e;
+ e.append( "virtual" , pi.getVirtualMemorySize() );
+ e.append( "resident" , pi.getResidentSize() );
+
+ BSONObjBuilder b;
+ b.append( "ret" , e.obj() );
+
+ return b.obj();
+ }
+
+
+#ifndef MONGO_SAFE_SHELL
+
+ BSONObj listFiles(const BSONObj& _args, void* data) {
+ static BSONObj cd = BSON( "0" << "." );
+ BSONObj args = _args.isEmpty() ? cd : _args;
+
+ uassert( 10257 , "need to specify 1 argument to listFiles" , args.nFields() == 1 );
+
+ BSONArrayBuilder lst;
+
+ string rootname = args.firstElement().valuestrsafe();
+ path root( rootname );
+ stringstream ss;
+ ss << "listFiles: no such directory: " << rootname;
+ string msg = ss.str();
+ uassert( 12581, msg.c_str(), boost::filesystem::exists( root ) );
+
+ directory_iterator end;
+ directory_iterator i( root);
+
+ while ( i != end ) {
+ path p = *i;
+ BSONObjBuilder b;
+ b << "name" << p.string();
+ b.appendBool( "isDirectory", is_directory( p ) );
+ if ( ! is_directory( p ) ) {
+ try {
+ b.append( "size" , (double)file_size( p ) );
+ }
+ catch ( ... ) {
+ i++;
+ continue;
+ }
+ }
+
+ lst.append( b.obj() );
+ i++;
+ }
+
+ BSONObjBuilder ret;
+ ret.appendArray( "", lst.done() );
+ return ret.obj();
+ }
+
+ BSONObj ls(const BSONObj& args, void* data) {
+ BSONObj o = listFiles(args, data);
+ if( !o.isEmpty() ) {
+ for( BSONObj::iterator i = o.firstElement().Obj().begin(); i.more(); ) {
+ BSONObj f = i.next().Obj();
+ cout << f["name"].String();
+ if( f["isDirectory"].trueValue() ) cout << '/';
+ cout << '\n';
+ }
+ cout.flush();
+ }
+ return BSONObj();
+ }
+
+ BSONObj cd(const BSONObj& args, void* data) {
+#if defined(_WIN32)
+ std::wstring dir = toWideString( args.firstElement().String().c_str() );
+ if( SetCurrentDirectory(dir.c_str()) )
+ return BSONObj();
+#else
+ string dir = args.firstElement().String();
+ // TODO: implement for POSIX, e.g.:
+ // if( chdir( dir.c_str() ) == 0 )
+ // return BSONObj();
+ if( 1 ) return BSON( "" << "implementation not done for posix" );
+#endif
+ return BSON( "" << "change directory failed" );
+ }
+
+ BSONObj pwd(const BSONObj&, void* data) {
+ boost::filesystem::path p = boost::filesystem::current_path();
+ return BSON( "" << p.string() );
+ }
+
+ BSONObj hostname(const BSONObj&, void* data) {
+ return BSON( "" << getHostName() );
+ }
+
+ static BSONElement oneArg(const BSONObj& args) {
+ uassert( 12597 , "need to specify 1 argument" , args.nFields() == 1 );
+ return args.firstElement();
+ }
+
+ const int CANT_OPEN_FILE = 13300;
+
+ BSONObj cat(const BSONObj& args, void* data) {
+ BSONElement e = oneArg(args);
+ stringstream ss;
+ ifstream f(e.valuestrsafe());
+ uassert(CANT_OPEN_FILE, "couldn't open file", f.is_open() );
+
+ streamsize sz = 0;
+ while( 1 ) {
+ char ch = 0;
+ // slow...maybe change one day
+ f.get(ch);
+ if( ch == 0 ) break;
+ ss << ch;
+ sz += 1;
+ uassert(13301, "cat() : file too big to load as a variable", sz < 1024 * 1024 * 16);
+ }
+ return BSON( "" << ss.str() );
+ }
+
+ BSONObj md5sumFile(const BSONObj& args, void* data) {
+ BSONElement e = oneArg(args);
+ stringstream ss;
+ FILE* f = fopen(e.valuestrsafe(), "rb");
+ uassert(CANT_OPEN_FILE, "couldn't open file", f );
+
+ md5digest d;
+ md5_state_t st;
+ md5_init(&st);
+
+ enum {BUFLEN = 4*1024};
+ char buffer[BUFLEN];
+ int bytes_read;
+ while( (bytes_read = fread(buffer, 1, BUFLEN, f)) ) {
+ md5_append( &st , (const md5_byte_t*)(buffer) , bytes_read );
+ }
+
+ md5_finish(&st, d);
+ return BSON( "" << digestToString( d ) );
+ }
+
+ BSONObj mkdir(const BSONObj& args, void* data) {
+ boost::filesystem::create_directories(args.firstElement().String());
+ return BSON( "" << true );
+ }
+
+ BSONObj removeFile(const BSONObj& args, void* data) {
+ BSONElement e = oneArg(args);
+ bool found = false;
+
+ path root( e.valuestrsafe() );
+ if ( boost::filesystem::exists( root ) ) {
+ found = true;
+ boost::filesystem::remove_all( root );
+ }
+
+ BSONObjBuilder b;
+ b.appendBool( "removed" , found );
+ return b.obj();
+ }
+
+ /**
+ * @param args - [ name, byte index ]
+ * In this initial implementation, all bits in the specified byte are flipped.
+ */
+ BSONObj fuzzFile(const BSONObj& args, void* data) {
+ uassert( 13619, "fuzzFile takes 2 arguments", args.nFields() == 2 );
+ shared_ptr< File > f( new File() );
+ f->open( args.getStringField( "0" ) );
+ uassert( 13620, "couldn't open file to fuzz", !f->bad() && f->is_open() );
+
+ char c;
+ f->read( args.getIntField( "1" ), &c, 1 );
+ c = ~c;
+ f->write( args.getIntField( "1" ), &c, 1 );
+
+ return undefined_;
+ // f close is implicit
+ }
+
+ map< int, pair< pid_t, int > > dbs;
+ map< pid_t, int > shells;
+#ifdef _WIN32
+ map< pid_t, HANDLE > handles;
+#endif
+
+ mongo::mutex mongoProgramOutputMutex("mongoProgramOutputMutex");
+ stringstream mongoProgramOutput_;
+
+ void goingAwaySoon() {
+ mongo::mutex::scoped_lock lk( mongoProgramOutputMutex );
+ mongo::dbexitCalled = true;
+ }
+
+ void writeMongoProgramOutputLine( int port, int pid, const char *line ) {
+ mongo::mutex::scoped_lock lk( mongoProgramOutputMutex );
+ if( mongo::dbexitCalled ) throw "program is terminating";
+ stringstream buf;
+ if ( port > 0 )
+ buf << " m" << port << "| " << line;
+ else
+ buf << "sh" << pid << "| " << line;
+ cout << buf.str() << endl;
+ mongoProgramOutput_ << buf.str() << endl;
+ }
+
+ // only returns last 100000 characters
+ BSONObj RawMongoProgramOutput( const BSONObj &args, void* data ) {
+ mongo::mutex::scoped_lock lk( mongoProgramOutputMutex );
+ string out = mongoProgramOutput_.str();
+ size_t len = out.length();
+ if ( len > 100000 )
+ out = out.substr( len - 100000, 100000 );
+ return BSON( "" << out );
+ }
+
+ BSONObj ClearRawMongoProgramOutput( const BSONObj &args, void* data ) {
+ mongo::mutex::scoped_lock lk( mongoProgramOutputMutex );
+ mongoProgramOutput_.str( "" );
+ return undefined_;
+ }
+
+ class ProgramRunner {
+ vector<string> argv_;
+ int port_;
+ int pipe_;
+ pid_t pid_;
+ public:
+ pid_t pid() const { return pid_; }
+ int port() const { return port_; }
+
+ boost::filesystem::path find(string prog) {
+ boost::filesystem::path p = prog;
+#ifdef _WIN32
+ p = change_extension(p, ".exe");
+#endif
+
+ if( boost::filesystem::exists(p) ) {
+#ifndef _WIN32
+ p = boost::filesystem::initial_path() / p;
+#endif
+ return p;
+ }
+
+ {
+ boost::filesystem::path t = boost::filesystem::current_path() / p;
+ if( boost::filesystem::exists(t) ) return t;
+ }
+ try {
+ if( theScope->type("_path") == String ) {
+ string path = theScope->getString("_path");
+ if( !path.empty() ) {
+ boost::filesystem::path t = boost::filesystem::path(path) / p;
+ if( boost::filesystem::exists(t) ) return t;
+ }
+ }
+ }
+ catch(...) { }
+ {
+ boost::filesystem::path t = boost::filesystem::initial_path() / p;
+ if( boost::filesystem::exists(t) ) return t;
+ }
+ return p; // not found; might find via system path
+ }
+
+ ProgramRunner( const BSONObj &args , bool isMongoProgram=true) {
+ assert( !args.isEmpty() );
+
+ string program( args.firstElement().valuestrsafe() );
+ assert( !program.empty() );
+ boost::filesystem::path programPath = find(program);
+
+ if (isMongoProgram) {
+#if 0
+ if (program == "mongos") {
+ argv_.push_back("valgrind");
+ argv_.push_back("--log-file=/tmp/mongos-%p.valgrind");
+ argv_.push_back("--leak-check=yes");
+ argv_.push_back("--suppressions=valgrind.suppressions");
+ //argv_.push_back("--error-exitcode=1");
+ argv_.push_back("--");
+ }
+#endif
+ }
+
+ argv_.push_back( programPath.native_file_string() );
+
+ port_ = -1;
+
+ BSONObjIterator j( args );
+ j.next(); // skip program name (handled above)
+ while(j.more()) {
+ BSONElement e = j.next();
+ string str;
+ if ( e.isNumber() ) {
+ stringstream ss;
+ ss << e.number();
+ str = ss.str();
+ }
+ else {
+ assert( e.type() == mongo::String );
+ str = e.valuestr();
+ }
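+ // Sniff the server's port out of the argument list: "--port" sets the
+ // sentinel value -2, and the next token is then parsed as the port number.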
+ if ( str == "--port" )
+ port_ = -2;
+ else if ( port_ == -2 )
+ port_ = strtol( str.c_str(), 0, 10 );
+ argv_.push_back(str);
+ }
+
+ if ( program != "mongod" && program != "mongos" && program != "mongobridge" )
+ port_ = 0;
+ else {
+ if ( port_ <= 0 )
+ cout << "error: a port number is expected when running mongod (etc.) from the shell" << endl;
+ assert( port_ > 0 );
+ }
+ if ( port_ > 0 && dbs.count( port_ ) != 0 ) {
+ cerr << "count for port: " << port_ << " is not 0 is: " << dbs.count( port_ ) << endl;
+ assert( dbs.count( port_ ) == 0 );
+ }
+ }
+
+ void start() {
+ int pipeEnds[ 2 ];
+ assert( pipe( pipeEnds ) != -1 );
+
+ fflush( 0 );
+ launch_process(pipeEnds[1]); //sets pid_
+
+ {
+ stringstream ss;
+ ss << "shell: started program";
+ for (unsigned i=0; i < argv_.size(); i++)
+ ss << " " << argv_[i];
+ ss << '\n';
+ cout << ss.str(); cout.flush();
+ }
+
+ if ( port_ > 0 )
+ dbs.insert( make_pair( port_, make_pair( pid_, pipeEnds[ 1 ] ) ) );
+ else
+ shells.insert( make_pair( pid_, pipeEnds[ 1 ] ) );
+ pipe_ = pipeEnds[ 0 ];
+ }
+
+ // Continue reading output
+ void operator()() {
+ try {
+ // This assumes there aren't any 0's in the mongo program output.
+ // Hope that's ok.
+ const unsigned bufSize = 128 * 1024;
+ char buf[ bufSize ];
+ char temp[ bufSize ];
+ char *start = buf;
+ while( 1 ) {
+ int lenToRead = ( bufSize - 1 ) - ( start - buf );
+ if ( lenToRead <= 0 ) {
+ cout << "error: lenToRead: " << lenToRead << endl;
+ cout << "first 300: " << string(buf,0,300) << endl;
+ }
+ assert( lenToRead > 0 );
+ int ret = read( pipe_, (void *)start, lenToRead );
+ if( mongo::dbexitCalled )
+ break;
+ assert( ret != -1 );
+ start[ ret ] = '\0';
+ if ( strlen( start ) != unsigned( ret ) )
+ writeMongoProgramOutputLine( port_, pid_, "WARNING: mongod wrote null bytes to output" );
+ char *last = buf;
+ for( char *i = strchr( buf, '\n' ); i; last = i + 1, i = strchr( last, '\n' ) ) {
+ *i = '\0';
+ writeMongoProgramOutputLine( port_, pid_, last );
+ }
+ if ( ret == 0 ) {
+ if ( *last )
+ writeMongoProgramOutputLine( port_, pid_, last );
+ close( pipe_ );
+ break;
+ }
+ if ( last != buf ) {
+ strcpy( temp, last );
+ strcpy( buf, temp );
+ }
+ else {
+ assert( strlen( buf ) < bufSize );
+ }
+ start = buf + strlen( buf );
+ }
+ }
+ catch(...) {
+ }
+ }
+ void launch_process(int child_stdout) {
+#ifdef _WIN32
+ stringstream ss;
+ for( unsigned i=0; i < argv_.size(); i++ ) {
+ if (i) ss << ' ';
+ if (argv_[i].find(' ') == string::npos)
+ ss << argv_[i];
+ else {
+ ss << '"';
+ // escape all embedded quotes
+ for (size_t j=0; j<argv_[i].size(); ++j) {
+ if (argv_[i][j]=='"') ss << '"';
+ ss << argv_[i][j];
+ }
+ ss << '"';
+ }
+ }
+
+ string args = ss.str();
+
+ boost::scoped_array<TCHAR> args_tchar (new TCHAR[args.size() + 1]);
+ size_t i;
+ for(i=0; i < args.size(); i++)
+ args_tchar[i] = args[i];
+ args_tchar[i] = 0;
+
+ HANDLE h = (HANDLE)_get_osfhandle(child_stdout);
+ assert(h != INVALID_HANDLE_VALUE);
+ assert(SetHandleInformation(h, HANDLE_FLAG_INHERIT, 1));
+
+ STARTUPINFO si;
+ ZeroMemory(&si, sizeof(si));
+ si.cb = sizeof(si);
+ si.hStdError = h;
+ si.hStdOutput = h;
+ si.dwFlags |= STARTF_USESTDHANDLES;
+
+ PROCESS_INFORMATION pi;
+ ZeroMemory(&pi, sizeof(pi));
+
+ bool success = CreateProcess( NULL, args_tchar.get(), NULL, NULL, true, 0, NULL, NULL, &si, &pi) != 0;
+ if (!success) {
+ LPSTR lpMsgBuf=0;
+ DWORD dw = GetLastError();
+ FormatMessageA(
+ FORMAT_MESSAGE_ALLOCATE_BUFFER |
+ FORMAT_MESSAGE_FROM_SYSTEM |
+ FORMAT_MESSAGE_IGNORE_INSERTS,
+ NULL,
+ dw,
+ MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+ (LPSTR)&lpMsgBuf,
+ 0, NULL );
+ stringstream ss;
+ ss << "couldn't start process " << argv_[0] << "; " << lpMsgBuf;
+ uassert(14042, ss.str(), success);
+ LocalFree(lpMsgBuf);
+ }
+
+ CloseHandle(pi.hThread);
+
+ pid_ = pi.dwProcessId;
+ handles.insert( make_pair( pid_, pi.hProcess ) );
+
+#else
+
+ pid_ = fork();
+ assert( pid_ != -1 );
+
+ if ( pid_ == 0 ) {
+ // DON'T ASSERT IN THIS BLOCK - very bad things will happen
+
+ const char** argv = new const char* [argv_.size()+1]; // don't need to free - in child
+ for (unsigned i=0; i < argv_.size(); i++) {
+ argv[i] = argv_[i].c_str();
+ }
+ argv[argv_.size()] = 0;
+
+ if ( dup2( child_stdout, STDOUT_FILENO ) == -1 ||
+ dup2( child_stdout, STDERR_FILENO ) == -1 ) {
+ cout << "Unable to dup2 child output: " << errnoWithDescription() << endl;
+ ::_Exit(-1); //do not pass go, do not call atexit handlers
+ }
+
+ const char** env = new const char* [2]; // don't need to free - in child
+ env[0] = NULL;
+#if defined(HEAP_CHECKING)
+ env[0] = "HEAPCHECK=normal";
+ env[1] = NULL;
+
+ // Heap-check for mongos only. 'argv[0]' must be in the path format.
+ if ( argv_[0].find("mongos") != string::npos) {
+ execvpe( argv[ 0 ], const_cast<char**>(argv) , const_cast<char**>(env) );
+ }
+#endif // HEAP_CHECKING
+
+ execvp( argv[ 0 ], const_cast<char**>(argv) );
+
+ cout << "Unable to start program " << argv[0] << ' ' << errnoWithDescription() << endl;
+ ::_Exit(-1);
+ }
+
+#endif
+ }
+ };
+
+ //returns true if process exited
+ bool wait_for_pid(pid_t pid, bool block=true, int* exit_code=NULL) {
+#ifdef _WIN32
+ assert(handles.count(pid));
+ HANDLE h = handles[pid];
+
+ if (block)
+ WaitForSingleObject(h, INFINITE);
+
+ DWORD tmp;
+ if(GetExitCodeProcess(h, &tmp)) {
+ if ( tmp == STILL_ACTIVE ) {
+ return false;
+ }
+ CloseHandle(h);
+ handles.erase(pid);
+ if (exit_code)
+ *exit_code = tmp;
+ return true;
+ }
+ else {
+ return false;
+ }
+#else
+ int tmp;
+ bool ret = (pid == waitpid(pid, &tmp, (block ? 0 : WNOHANG)));
+ if (exit_code)
+ *exit_code = WEXITSTATUS(tmp);
+ return ret;
+
+#endif
+ }
+
+ BSONObj WaitProgram( const BSONObj& a, void* data ) {
+ int pid = oneArg( a ).numberInt();
+ BSONObj x = BSON( "" << wait_for_pid( pid ) );
+ shells.erase( pid );
+ return x;
+ }
+
+ BSONObj WaitMongoProgramOnPort( const BSONObj &a, void* data ) {
+ int port = oneArg( a ).numberInt();
+ uassert( 13621, "no known mongo program on port", dbs.count( port ) != 0 );
+ log() << "waiting port: " << port << ", pid: " << dbs[ port ].first << endl;
+ bool ret = wait_for_pid( dbs[ port ].first );
+ if ( ret ) {
+ dbs.erase( port );
+ }
+ return BSON( "" << ret );
+ }
+
+ BSONObj StartMongoProgram( const BSONObj &a, void* data ) {
+ _nokillop = true;
+ ProgramRunner r( a );
+ r.start();
+ boost::thread t( r );
+ return BSON( string( "" ) << int( r.pid() ) );
+ }
+
+ BSONObj RunMongoProgram( const BSONObj &a, void* data ) {
+ ProgramRunner r( a );
+ r.start();
+ boost::thread t( r );
+ int exit_code;
+ wait_for_pid( r.pid(), true, &exit_code );
+ if ( r.port() > 0 ) {
+ dbs.erase( r.port() );
+ }
+ else {
+ shells.erase( r.pid() );
+ }
+ return BSON( string( "" ) << exit_code );
+ }
+
+ BSONObj RunProgram(const BSONObj &a, void* data) {
+ ProgramRunner r( a, false );
+ r.start();
+ boost::thread t( r );
+ int exit_code;
+ wait_for_pid(r.pid(), true, &exit_code);
+ shells.erase( r.pid() );
+ return BSON( string( "" ) << exit_code );
+ }
+
+ BSONObj ResetDbpath( const BSONObj &a, void* data ) {
+ assert( a.nFields() == 1 );
+ string path = a.firstElement().valuestrsafe();
+ assert( !path.empty() );
+ if ( boost::filesystem::exists( path ) )
+ boost::filesystem::remove_all( path );
+ boost::filesystem::create_directory( path );
+ return undefined_;
+ }
+
+ void copyDir( const path &from, const path &to ) {
+ directory_iterator end;
+ directory_iterator i( from );
+ while( i != end ) {
+ path p = *i;
+ if ( p.leaf() != "mongod.lock" ) {
+ if ( is_directory( p ) ) {
+ path newDir = to / p.leaf();
+ boost::filesystem::create_directory( newDir );
+ copyDir( p, newDir );
+ }
+ else {
+ boost::filesystem::copy_file( p, to / p.leaf() );
+ }
+ }
+ ++i;
+ }
+ }
+
+ // NOTE target dbpath will be cleared first
+ BSONObj CopyDbpath( const BSONObj &a, void* data ) {
+ assert( a.nFields() == 2 );
+ BSONObjIterator i( a );
+ string from = i.next().str();
+ string to = i.next().str();
+ assert( !from.empty() );
+ assert( !to.empty() );
+ if ( boost::filesystem::exists( to ) )
+ boost::filesystem::remove_all( to );
+ boost::filesystem::create_directory( to );
+ copyDir( from, to );
+ return undefined_;
+ }
+
+ inline void kill_wrapper(pid_t pid, int sig, int port) {
+#ifdef _WIN32
+ if (sig == SIGKILL || port == 0) {
+ assert( handles.count(pid) );
+ TerminateProcess(handles[pid], 1); // returns failure for "zombie" processes.
+ }
+ else {
+ DBClientConnection conn;
+ conn.connect("127.0.0.1:" + BSONObjBuilder::numStr(port));
+ try {
+ conn.simpleCommand("admin", NULL, "shutdown");
+ }
+ catch (...) {
+ //Do nothing. This command never returns data to the client and the driver doesn't like that.
+ }
+ }
+#else
+ int x = kill( pid, sig );
+ if ( x ) {
+ if ( errno == ESRCH ) {
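+ // no such process: it already exited, which is fine here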
+ }
+ else {
+ cout << "killFailed: " << errnoWithDescription() << endl;
+ assert( x == 0 );
+ }
+ }
+
+#endif
+ }
+
+ int killDb( int port, pid_t _pid, int signal ) {
+ pid_t pid;
+ int exitCode = 0;
+ if ( port > 0 ) {
+ if( dbs.count( port ) != 1 ) {
+ cout << "No db started on port: " << port << endl;
+ return 0;
+ }
+ pid = dbs[ port ].first;
+ }
+ else {
+ pid = _pid;
+ }
+
+ kill_wrapper( pid, signal, port );
+
+ int i = 0;
+ for( ; i < 130; ++i ) {
+ if ( i == 30 ) {
+ char now[64];
+ time_t_to_String(time(0), now);
+ now[ 20 ] = 0;
+ cout << now << " process on port " << port << ", with pid " << pid << " not terminated, sending sigkill" << endl;
+ kill_wrapper( pid, SIGKILL, port );
+ }
+ if(wait_for_pid(pid, false, &exitCode))
+ break;
+ sleepmillis( 1000 );
+ }
+ if ( i == 130 ) {
+ char now[64];
+ time_t_to_String(time(0), now);
+ now[ 20 ] = 0;
+ cout << now << " failed to terminate process on port " << port << ", with pid " << pid << endl;
+ assert( "Failed to terminate process" == 0 );
+ }
+
+ if ( port > 0 ) {
+ close( dbs[ port ].second );
+ dbs.erase( port );
+ }
+ else {
+ close( shells[ pid ] );
+ shells.erase( pid );
+ }
+ // FIXME I think the intention here is to do an extra sleep only when SIGKILL is sent to the child process.
+ // We may want to change the 4 below to 29, since values of i greater than that indicate we sent a SIGKILL.
+ if ( i > 4 || signal == SIGKILL ) {
+ sleepmillis( 4000 ); // allow operating system to reclaim resources
+ }
+
+ return exitCode;
+ }
+
+ int getSignal( const BSONObj &a ) {
+ int ret = SIGTERM;
+ if ( a.nFields() == 2 ) {
+ BSONObjIterator i( a );
+ i.next();
+ BSONElement e = i.next();
+ assert( e.isNumber() );
+ ret = int( e.number() );
+ }
+ return ret;
+ }
+
+ /** stopMongoProgram(port[, signal]) */
+ BSONObj StopMongoProgram( const BSONObj &a, void* data ) {
+ assert( a.nFields() == 1 || a.nFields() == 2 );
+ uassert( 15853 , "stopMongo needs a number" , a.firstElement().isNumber() );
+ int port = int( a.firstElement().number() );
+ int code = killDb( port, 0, getSignal( a ) );
+ cout << "shell: stopped mongo program on port " << port << endl;
+ return BSON( "" << (double)code );
+ }
+
+ BSONObj StopMongoProgramByPid( const BSONObj &a, void* data ) {
+ assert( a.nFields() == 1 || a.nFields() == 2 );
+ uassert( 15852 , "stopMongoByPid needs a number" , a.firstElement().isNumber() );
+ int pid = int( a.firstElement().number() );
+ int code = killDb( 0, pid, getSignal( a ) );
+ cout << "shell: stopped mongo program on pid " << pid << endl;
+ return BSON( "" << (double)code );
+ }
+
+ void KillMongoProgramInstances() {
+ vector< int > ports;
+ for( map< int, pair< pid_t, int > >::iterator i = dbs.begin(); i != dbs.end(); ++i )
+ ports.push_back( i->first );
+ for( vector< int >::iterator i = ports.begin(); i != ports.end(); ++i )
+ killDb( *i, 0, SIGTERM );
+ vector< pid_t > pids;
+ for( map< pid_t, int >::iterator i = shells.begin(); i != shells.end(); ++i )
+ pids.push_back( i->first );
+ for( vector< pid_t >::iterator i = pids.begin(); i != pids.end(); ++i )
+ killDb( 0, *i, SIGTERM );
+ }
+#else // ndef MONGO_SAFE_SHELL
+ void KillMongoProgramInstances() {}
+#endif
+
+ MongoProgramScope::~MongoProgramScope() {
+ DESTRUCTOR_GUARD(
+ KillMongoProgramInstances();
+ ClearRawMongoProgramOutput( BSONObj(), 0 );
+ )
+ }
+
+ unsigned _randomSeed;
+
+ BSONObj JSSrand( const BSONObj &a, void* data ) {
+ uassert( 12518, "srand requires a single numeric argument",
+ a.nFields() == 1 && a.firstElement().isNumber() );
+ _randomSeed = (unsigned)a.firstElement().numberLong(); // grab least significant digits
+ return undefined_;
+ }
+
+ BSONObj JSRand( const BSONObj &a, void* data ) {
+ uassert( 12519, "rand accepts no arguments", a.nFields() == 0 );
+ unsigned r;
+#if !defined(_WIN32)
+ r = rand_r( &_randomSeed );
+#else
+ r = rand(); // seed not used in this case
+#endif
+ return BSON( "" << double( r ) / ( double( RAND_MAX ) + 1 ) );
+ }
+
+ BSONObj isWindows(const BSONObj& a, void* data) {
+ uassert( 13006, "isWindows accepts no arguments", a.nFields() == 0 );
+#ifdef _WIN32
+ return BSON( "" << true );
+#else
+ return BSON( "" << false );
+#endif
+ }
+
+ const char* getUserDir() {
+#ifdef _WIN32
+ return getenv( "USERPROFILE" );
+#else
+ return getenv( "HOME" );
+#endif
+ }
+ BSONObj getHostName(const BSONObj& a, void* data) {
+ uassert( 13411, "getHostName accepts no arguments", a.nFields() == 0 );
+ char buf[260]; // HOST_NAME_MAX is usually 255
+ assert(gethostname(buf, 260) == 0);
+ buf[259] = '\0';
+ return BSON("" << buf);
+
+ }
+
+ void installShellUtils( Scope& scope ) {
+ theScope = &scope;
+ scope.injectNative( "quit", Quit );
+ scope.injectNative( "getMemInfo" , JSGetMemInfo );
+ scope.injectNative( "_srand" , JSSrand );
+ scope.injectNative( "_rand" , JSRand );
+ scope.injectNative( "_isWindows" , isWindows );
+
+#ifndef MONGO_SAFE_SHELL
+ //can't launch programs
+ scope.injectNative( "_startMongoProgram", StartMongoProgram );
+ scope.injectNative( "runProgram", RunProgram );
+ scope.injectNative( "run", RunProgram );
+ scope.injectNative( "runMongoProgram", RunMongoProgram );
+ scope.injectNative( "stopMongod", StopMongoProgram );
+ scope.injectNative( "stopMongoProgram", StopMongoProgram );
+ scope.injectNative( "stopMongoProgramByPid", StopMongoProgramByPid );
+ scope.injectNative( "rawMongoProgramOutput", RawMongoProgramOutput );
+ scope.injectNative( "clearRawMongoProgramOutput", ClearRawMongoProgramOutput );
+ scope.injectNative( "waitProgram" , WaitProgram );
+ scope.injectNative( "waitMongoProgramOnPort" , WaitMongoProgramOnPort );
+
+ scope.injectNative( "getHostName" , getHostName );
+ scope.injectNative( "removeFile" , removeFile );
+ scope.injectNative( "fuzzFile" , fuzzFile );
+ scope.injectNative( "listFiles" , listFiles );
+ scope.injectNative( "ls" , ls );
+ scope.injectNative( "pwd", pwd );
+ scope.injectNative( "cd", cd );
+ scope.injectNative( "cat", cat );
+ scope.injectNative( "hostname", hostname);
+ scope.injectNative( "resetDbpath", ResetDbpath );
+ scope.injectNative( "copyDbpath", CopyDbpath );
+ scope.injectNative( "md5sumFile", md5sumFile );
+ scope.injectNative( "mkdir" , mkdir );
+#endif
+ }
+
+ void initScope( Scope &scope ) {
+ scope.externalSetup();
+ mongo::shellUtils::installShellUtils( scope );
+ scope.execSetup(JSFiles::servers);
+
+ if ( !_dbConnect.empty() ) {
+ uassert( 12513, "connect failed", scope.exec( _dbConnect , "(connect)" , false , true , false ) );
+ if ( !_dbAuth.empty() ) {
+ installGlobalUtils( scope );
+ uassert( 12514, "login failed", scope.exec( _dbAuth , "(auth)" , true , true , false ) );
+ }
+ }
+ }
+
+ // connstr, myuris
+ map< string, set<string> > _allMyUris;
+ mongo::mutex _allMyUrisMutex("_allMyUrisMutex");
+ bool _nokillop = false;
+ void onConnect( DBClientWithCommands &c ) {
+ latestConn = &c;
+ if ( _nokillop ) {
+ return;
+ }
+ BSONObj info;
+ if ( c.runCommand( "admin", BSON( "whatsmyuri" << 1 ), info ) ) {
+ string connstr = dynamic_cast<DBClientBase&>(c).getServerAddress();
+ mongo::mutex::scoped_lock lk( _allMyUrisMutex );
+ _allMyUris[connstr].insert(info[ "you" ].str());
+ }
+ }
+ }
+}
diff --git a/src/mongo/shell/utils.h b/src/mongo/shell/utils.h
new file mode 100644
index 00000000000..433fe7b7d25
--- /dev/null
+++ b/src/mongo/shell/utils.h
@@ -0,0 +1,48 @@
+// utils.h
+/*
+ * Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#pragma once
+
+#include "../scripting/engine.h"
+
+namespace mongo {
+
+ namespace shellUtils {
+
+ extern std::string _dbConnect;
+ extern std::string _dbAuth;
+ extern map< string, set<string> > _allMyUris;
+ extern bool _nokillop;
+
+ void RecordMyLocation( const char *_argv0 );
+ void installShellUtils( Scope& scope );
+
+ // Scoped management of mongo program instances. Simple implementation:
+ // destructor kills all mongod instances created by the shell.
+ struct MongoProgramScope {
+ MongoProgramScope() {} // Avoid 'unused variable' warning.
+ ~MongoProgramScope();
+ };
+ void KillMongoProgramInstances();
+
+ void initScope( Scope &scope );
+ void onConnect( DBClientWithCommands &c );
+
+ const char* getUserDir();
+ }
+}
diff --git a/src/mongo/shell/utils.js b/src/mongo/shell/utils.js
new file mode 100644
index 00000000000..adc763e3893
--- /dev/null
+++ b/src/mongo/shell/utils.js
@@ -0,0 +1,1896 @@
+__quiet = false;
+__magicNoPrint = { __magicNoPrint : 1111 }
+__callLastError = false;
+_verboseShell = false;
+
+chatty = function(s){
+ if ( ! __quiet )
+ print( s );
+}
+
+friendlyEqual = function( a , b ){
+ if ( a == b )
+ return true;
+
+ a = tojson(a,false,true);
+ b = tojson(b,false,true);
+
+ if ( a == b )
+ return true;
+
+ var clean = function( s ){
+ s = s.replace( /NumberInt\((\-?\d+)\)/g , "$1" );
+ return s;
+ }
+
+ a = clean(a);
+ b = clean(b);
+
+ if ( a == b )
+ return true;
+
+ return false;
+}
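+
+// e.g. friendlyEqual( { x : NumberInt(1) } , { x : 1 } ) is true: both sides
+// are serialized with tojson and NumberInt wrappers are stripped first.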
+
+printStackTrace = function(){
+ try{
+ throw new Error("Printing Stack Trace");
+ } catch (e) {
+ print(e.stack);
+ }
+}
+
+/**
+ * <p> Set the shell verbosity. If verbose, the shell will display more information about command results. </p>
+ * <p> Default is off. </p>
+ * @param {Bool} value verbosity on / off
+ */
+setVerboseShell = function( value ) {
+ if( value == undefined ) value = true;
+ _verboseShell = value;
+}
+
+doassert = function (msg) {
+ if (msg.indexOf("assert") == 0)
+ print(msg);
+ else
+ print("assert: " + msg);
+ printStackTrace();
+ throw msg;
+}
+
+assert = function( b , msg ){
+ if ( assert._debug && msg ) print( "in assert for: " + msg );
+ if ( b )
+ return;
+ doassert( msg == undefined ? "assert failed" : "assert failed : " + msg );
+}
+
+assert.automsg = function( b ) {
+ assert( eval( b ), b );
+}
+
+assert._debug = false;
+
+assert.eq = function( a , b , msg ){
+ if ( assert._debug && msg ) print( "in assert for: " + msg );
+
+ if ( a == b )
+ return;
+
+ if ( ( a != null && b != null ) && friendlyEqual( a , b ) )
+ return;
+
+ doassert( "[" + tojson( a ) + "] != [" + tojson( b ) + "] are not equal : " + msg );
+}
+
+assert.eq.automsg = function( a, b ) {
+ assert.eq( eval( a ), eval( b ), "[" + a + "] != [" + b + "]" );
+}
+
+assert.neq = function( a , b , msg ){
+ if ( assert._debug && msg ) print( "in assert for: " + msg );
+ if ( a != b )
+ return;
+
+ doassert( "[" + a + "] != [" + b + "] are equal : " + msg );
+}
+
+assert.contains = function( o, arr, msg ){
+ var wasIn = false
+
+ if( ! arr.length ){
+ for( var i in arr ){
+ wasIn = arr[i] == o || ( ( arr[i] != null && o != null ) && friendlyEqual( arr[i] , o ) )
+ if( wasIn ) break
+ }
+ }
+ else {
+ for( var i = 0; i < arr.length; i++ ){
+ wasIn = arr[i] == o || ( ( arr[i] != null && o != null ) && friendlyEqual( arr[i] , o ) )
+ if( wasIn ) break
+ }
+ }
+
+ if( ! wasIn ) doassert( tojson( o ) + " was not in " + tojson( arr ) + " : " + msg )
+}
+
+assert.repeat = function( f, msg, timeout, interval ) {
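+ // unlike assert.soon, this never throws; it just re-runs f until the timeout elapses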
+ if ( assert._debug && msg ) print( "in assert for: " + msg );
+
+ var start = new Date();
+ timeout = timeout || 30000;
+ interval = interval || 200;
+ var last;
+ while( 1 ) {
+
+ if ( typeof( f ) == "string" ){
+ if ( eval( f ) )
+ return;
+ }
+ else {
+ if ( f() )
+ return;
+ }
+
+ if ( ( new Date() ).getTime() - start.getTime() > timeout )
+ break;
+ sleep( interval );
+ }
+}
+
+assert.soon = function( f, msg, timeout /*ms*/, interval ) {
+ if ( assert._debug && msg ) print( "in assert for: " + msg );
+
+ var start = new Date();
+ timeout = timeout || 30000;
+ interval = interval || 200;
+ var last;
+ while( 1 ) {
+
+ if ( typeof( f ) == "string" ){
+ if ( eval( f ) )
+ return;
+ }
+ else {
+ if ( f() )
+ return;
+ }
+
+ var diff = ( new Date() ).getTime() - start.getTime();
+ if ( diff > timeout )
+ doassert( "assert.soon failed: " + f + ", msg:" + msg );
+ sleep( interval );
+ }
+}
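+
+// e.g. (with an illustrative collection):
+// assert.soon( function(){ return coll.count() == 1; } , "doc never arrived" , 30 * 1000 )
+// polls every 200ms and fails via doassert once the 30s budget is spent.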
+
+assert.time = function( f, msg, timeout /*ms*/ ) {
+ if ( assert._debug && msg ) print( "in assert for: " + msg );
+
+ var start = new Date();
+ timeout = timeout || 30000;
+
+ if ( typeof( f ) == "string" ){
+ res = eval( f );
+ }
+ else {
+ res = f();
+ }
+
+ diff = ( new Date() ).getTime() - start.getTime();
+ if ( diff > timeout )
+ doassert( "assert.time failed timeout " + timeout + "ms took " + diff + "ms : " + f + ", msg:" + msg );
+ return res;
+}
+
+assert.throws = function( func , params , msg ){
+ if ( assert._debug && msg ) print( "in assert for: " + msg );
+
+ if ( params && typeof( params ) == "string" )
+ throw "2nd argument to assert.throws has to be an array"
+
+ try {
+ func.apply( null , params );
+ }
+ catch ( e ){
+ return e;
+ }
+
+ doassert( "did not throw exception: " + msg );
+}
+
+assert.throws.automsg = function( func, params ) {
+ assert.throws( func, params, func.toString() );
+}
+
+assert.commandWorked = function( res , msg ){
+ if ( assert._debug && msg ) print( "in assert for: " + msg );
+
+ if ( res.ok == 1 )
+ return;
+
+ doassert( "command failed: " + tojson( res ) + " : " + msg );
+}
+
+assert.commandFailed = function( res , msg ){
+ if ( assert._debug && msg ) print( "in assert for: " + msg );
+
+ if ( res.ok == 0 )
+ return;
+
+ doassert( "command worked when it should have failed: " + tojson( res ) + " : " + msg );
+}
+
+assert.isnull = function( what , msg ){
+ if ( assert._debug && msg ) print( "in assert for: " + msg );
+
+ if ( what == null )
+ return;
+
+ doassert( "supposed to null (" + ( msg || "" ) + ") was: " + tojson( what ) );
+}
+
+assert.lt = function( a , b , msg ){
+ if ( assert._debug && msg ) print( "in assert for: " + msg );
+
+ if ( a < b )
+ return;
+ doassert( a + " is not less than " + b + " : " + msg );
+}
+
+assert.gt = function( a , b , msg ){
+ if ( assert._debug && msg ) print( "in assert for: " + msg );
+
+ if ( a > b )
+ return;
+ doassert( a + " is not greater than " + b + " : " + msg );
+}
+
+assert.lte = function( a , b , msg ){
+ if ( assert._debug && msg ) print( "in assert for: " + msg );
+
+ if ( a <= b )
+ return;
+ doassert( a + " is not less than or eq " + b + " : " + msg );
+}
+
+assert.gte = function( a , b , msg ){
+ if ( assert._debug && msg ) print( "in assert for: " + msg );
+
+ if ( a >= b )
+ return;
+ doassert( a + " is not greater than or eq " + b + " : " + msg );
+}
+
+assert.between = function( a, b, c, msg, inclusive ){
+ if ( assert._debug && msg ) print( "in assert for: " + msg );
+
+ if( ( inclusive == undefined || inclusive == true ) &&
+ a <= b && b <= c ) return;
+ else if( a < b && b < c ) return;
+
+ doassert( b + " is not between " + a + " and " + c + " : " + msg );
+}
+
+assert.betweenIn = function( a, b, c, msg ){ assert.between( a, b, c, msg, true ) }
+assert.betweenEx = function( a, b, c, msg ){ assert.between( a, b, c, msg, false ) }
+
+assert.close = function( a , b , msg , places ){
+ if (places === undefined) {
+ places = 4;
+ }
+ if (Math.round((a - b) * Math.pow(10, places)) === 0) {
+ return;
+ }
+ doassert( a + " is not equal to " + b + " within " + places +
+ " places, diff: " + (a-b) + " : " + msg );
+};
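+
+// e.g. assert.close( 3.14159 , 3.1416 ) passes: the difference rounds to 0
+// at the default 4 decimal places.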
+
+Object.extend = function( dst , src , deep ){
+ for ( var k in src ){
+ var v = src[k];
+ if ( deep && typeof(v) == "object" ){
+ if ( "floatApprox" in v ) { // convert NumberLong properly
+ eval( "v = " + tojson( v ) );
+ } else {
+ v = Object.extend( typeof ( v.length ) == "number" ? [] : {} , v , true );
+ }
+ }
+ dst[k] = v;
+ }
+ return dst;
+}
+
+Object.merge = function( dst, src, deep ){
+ var clone = Object.extend( {}, dst, deep )
+ return Object.extend( clone, src, deep )
+}
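+
+// e.g. Object.merge( { a : 1 , b : 1 } , { b : 2 } ) yields { a : 1 , b : 2 }
+// without mutating either argument.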
+
+argumentsToArray = function( a ){
+ var arr = [];
+ for ( var i=0; i<a.length; i++ )
+ arr[i] = a[i];
+ return arr;
+}
+
+isString = function( x ){
+ return typeof( x ) == "string";
+}
+
+isNumber = function(x){
+ return typeof( x ) == "number";
+}
+
+isObject = function( x ){
+ return typeof( x ) == "object";
+}
+
+String.prototype.trim = function() {
+ return this.replace(/^\s+|\s+$/g,"");
+}
+String.prototype.ltrim = function() {
+ return this.replace(/^\s+/,"");
+}
+String.prototype.rtrim = function() {
+ return this.replace(/\s+$/,"");
+}
+
+String.prototype.startsWith = function (str){
+ return this.indexOf(str) == 0
+}
+
+String.prototype.endsWith = function (str){
+    // compare the suffix directly; building a RegExp from str would misfire on special characters
+    return this.length >= str.length && this.lastIndexOf(str) == this.length - str.length;
+}
+
+Number.prototype.zeroPad = function(width) {
+ var str = this + '';
+ while (str.length < width)
+ str = '0' + str;
+ return str;
+}
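+
+// Illustrative sketch: zeroPad left-pads the decimal representation to the given width
+// and never truncates, e.g.:
+//   assert.eq( "007", (7).zeroPad(3) );
+//   assert.eq( "2012", (2012).zeroPad(2) );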
+
+Date.timeFunc = function( theFunc , numTimes ){
+
+ var start = new Date();
+
+ numTimes = numTimes || 1;
+ for ( var i=0; i<numTimes; i++ ){
+ theFunc.apply( null , argumentsToArray( arguments ).slice( 2 ) );
+ }
+
+ return (new Date()).getTime() - start.getTime();
+}
+
+Date.prototype.tojson = function(){
+
+ var UTC = Date.printAsUTC ? 'UTC' : '';
+
+ var year = this['get'+UTC+'FullYear']().zeroPad(4);
+ var month = (this['get'+UTC+'Month']() + 1).zeroPad(2);
+ var date = this['get'+UTC+'Date']().zeroPad(2);
+ var hour = this['get'+UTC+'Hours']().zeroPad(2);
+ var minute = this['get'+UTC+'Minutes']().zeroPad(2);
+ var sec = this['get'+UTC+'Seconds']().zeroPad(2)
+
+ if (this['get'+UTC+'Milliseconds']())
+ sec += '.' + this['get'+UTC+'Milliseconds']().zeroPad(3)
+
+ var ofs = 'Z';
+    if (!Date.printAsUTC){
+        var ofsmin = this.getTimezoneOffset();
+        if (ofsmin != 0){
+            ofs = ofsmin > 0 ? '-' : '+'; // getTimezoneOffset() is positive when behind UTC
+            ofsmin = Math.abs(ofsmin);
+            ofs += Math.floor(ofsmin/60).zeroPad(2)
+            ofs += (ofsmin%60).zeroPad(2)
+        }
+    }
+ }
+
+ return 'ISODate("'+year+'-'+month+'-'+date+'T'+hour+':'+minute+':'+sec+ofs+'")';
+}
+
+Date.printAsUTC = true;
+
+
+ISODate = function(isoDateStr){
+ if (!isoDateStr)
+ return new Date();
+
+ var isoDateRegex = /(\d{4})-?(\d{2})-?(\d{2})([T ](\d{2})(:?(\d{2})(:?(\d{2}(\.\d+)?))?)?(Z|([+-])(\d{2}):?(\d{2})?)?)?/;
+ var res = isoDateRegex.exec(isoDateStr);
+
+ if (!res)
+ throw "invalid ISO date";
+
+ var year = parseInt(res[1],10) || 1970; // this should always be present
+ var month = (parseInt(res[2],10) || 1) - 1;
+ var date = parseInt(res[3],10) || 0;
+ var hour = parseInt(res[5],10) || 0;
+ var min = parseInt(res[7],10) || 0;
+ var sec = parseFloat(res[9]) || 0;
+ var ms = Math.round((sec%1) * 1000)
+ sec -= ms/1000
+
+ var time = Date.UTC(year, month, date, hour, min, sec, ms);
+
+ if (res[11] && res[11] != 'Z'){
+ var ofs = 0;
+ ofs += (parseInt(res[13],10) || 0) * 60*60*1000; // hours
+ ofs += (parseInt(res[14],10) || 0) * 60*1000; // mins
+ if (res[12] == '+') // if ahead subtract
+ ofs *= -1;
+
+ time += ofs
+ }
+
+ return new Date(time);
+}
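+
+// Illustrative round-trip sketch (assumed timestamp): ISODate accepts the same format
+// Date.prototype.tojson emits, including a numeric UTC offset, e.g.:
+//   var d = ISODate( "2012-01-02T10:30:00+0530" );
+//   assert.eq( 'ISODate("2012-01-02T05:00:00Z")', d.tojson() );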
+
+RegExp.prototype.tojson = RegExp.prototype.toString;
+
+Array.contains = function( a , x ){
+ for ( var i=0; i<a.length; i++ ){
+ if ( a[i] == x )
+ return true;
+ }
+ return false;
+}
+
+Array.unique = function( a ){
+ var u = [];
+ for ( var i=0; i<a.length; i++){
+ var o = a[i];
+ if ( ! Array.contains( u , o ) ){
+ u.push( o );
+ }
+ }
+ return u;
+}
+
+Array.shuffle = function( arr ){
+ for ( var i=0; i<arr.length-1; i++ ){
+ var pos = i+Random.randInt(arr.length-i);
+ var save = arr[i];
+ arr[i] = arr[pos];
+ arr[pos] = save;
+ }
+ return arr;
+}
+
+
+Array.tojson = function( a , indent , nolint ){
+ var lineEnding = nolint ? " " : "\n";
+
+ if (!indent)
+ indent = "";
+
+ if ( nolint )
+ indent = "";
+
+ if (a.length == 0) {
+ return "[ ]";
+ }
+
+ var s = "[" + lineEnding;
+ indent += "\t";
+ for ( var i=0; i<a.length; i++){
+ s += indent + tojson( a[i], indent , nolint );
+ if ( i < a.length - 1 ){
+ s += "," + lineEnding;
+ }
+ }
+
+ indent = indent.substring(1);
+ s += lineEnding+indent+"]";
+ return s;
+}
+
+Array.fetchRefs = function( arr , coll ){
+ var n = [];
+ for ( var i=0; i<arr.length; i ++){
+ var z = arr[i];
+ if ( coll && coll != z.getCollection() )
+ continue;
+ n.push( z.fetch() );
+ }
+
+ return n;
+}
+
+Array.sum = function( arr ){
+ if ( arr.length == 0 )
+ return null;
+ var s = arr[0];
+ for ( var i=1; i<arr.length; i++ )
+ s += arr[i];
+ return s;
+}
+
+Array.avg = function( arr ){
+ if ( arr.length == 0 )
+ return null;
+ return Array.sum( arr ) / arr.length;
+}
+
+Array.stdDev = function( arr ){
+ var avg = Array.avg( arr );
+ var sum = 0;
+
+ for ( var i=0; i<arr.length; i++ ){
+ sum += Math.pow( arr[i] - avg , 2 );
+ }
+
+ return Math.sqrt( sum / arr.length );
+}
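+
+// Illustrative sketch of the aggregate helpers (stdDev is the population form), e.g.:
+//   assert.eq( 10, Array.sum( [ 1, 2, 3, 4 ] ) );
+//   assert.eq( 2.5, Array.avg( [ 1, 2, 3, 4 ] ) );
+//   assert.eq( 2, Array.stdDev( [ 0, 4, 0, 4 ] ) ); // mean 2, every deviation is 2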
+
+if( typeof Array.isArray != "function" ){
+ Array.isArray = function( arr ){
+ return arr != undefined && arr.constructor == Array
+ }
+}
+
+//these two are helpers for Array.sort(func)
+compare = function(l, r){ return (l == r ? 0 : (l < r ? -1 : 1)); }
+
+// arr.sort(compareOn('name'))
+compareOn = function(field){
+ return function(l, r) { return compare(l[field], r[field]); }
+}
+
+Object.keySet = function( o ) {
+ var ret = new Array();
+    for( var i in o ) {
+ if ( !( i in o.__proto__ && o[ i ] === o.__proto__[ i ] ) ) {
+ ret.push( i );
+ }
+ }
+ return ret;
+}
+
+if ( ! NumberLong.prototype ) {
+ NumberLong.prototype = {}
+}
+
+NumberLong.prototype.tojson = function() {
+ return this.toString();
+}
+
+if ( ! NumberInt.prototype ) {
+ NumberInt.prototype = {}
+}
+
+NumberInt.prototype.tojson = function() {
+ return this.toString();
+}
+
+if ( ! ObjectId.prototype )
+ ObjectId.prototype = {}
+
+ObjectId.prototype.toString = function(){
+ return "ObjectId(" + tojson(this.str) + ")";
+}
+
+ObjectId.prototype.tojson = function(){
+ return this.toString();
+}
+
+ObjectId.prototype.valueOf = function(){
+ return this.str;
+}
+
+ObjectId.prototype.isObjectId = true;
+
+ObjectId.prototype.getTimestamp = function(){
+ return new Date(parseInt(this.valueOf().slice(0,8), 16)*1000);
+}
+
+ObjectId.prototype.equals = function( other){
+ return this.str == other.str;
+}
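+
+// Illustrative sketch (assumed hex value): the first 4 bytes of an ObjectId encode its
+// creation time in seconds since the epoch, e.g.:
+//   var o = new ObjectId( "4f0000000000000000000000" );
+//   assert.eq( 0x4f000000 * 1000, o.getTimestamp().getTime() );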
+
+if ( typeof( DBPointer ) != "undefined" ){
+ DBPointer.prototype.fetch = function(){
+ assert( this.ns , "need a ns" );
+ assert( this.id , "need an id" );
+
+ return db[ this.ns ].findOne( { _id : this.id } );
+ }
+
+ DBPointer.prototype.tojson = function(indent){
+ return this.toString();
+ }
+
+ DBPointer.prototype.getCollection = function(){
+ return this.ns;
+ }
+
+ DBPointer.prototype.getId = function(){
+ return this.id;
+ }
+
+ DBPointer.prototype.toString = function(){
+ return "DBPointer(" + tojson(this.ns) + ", " + tojson(this.id) + ")";
+ }
+}
+else {
+ print( "warning: no DBPointer" );
+}
+
+if ( typeof( DBRef ) != "undefined" ){
+ DBRef.prototype.fetch = function(){
+ assert( this.$ref , "need a ns" );
+ assert( this.$id , "need an id" );
+
+ return db[ this.$ref ].findOne( { _id : this.$id } );
+ }
+
+ DBRef.prototype.tojson = function(indent){
+ return this.toString();
+ }
+
+ DBRef.prototype.getCollection = function(){
+ return this.$ref;
+ }
+
+ DBRef.prototype.getRef = function(){
+ return this.$ref;
+ }
+
+ DBRef.prototype.getId = function(){
+ return this.$id;
+ }
+
+ DBRef.prototype.toString = function(){
+ return "DBRef(" + tojson(this.$ref) + ", " + tojson(this.$id) + ")";
+ }
+}
+else {
+ print( "warning: no DBRef" );
+}
+
+if ( typeof( Timestamp ) != "undefined" ){
+ Timestamp.prototype.tojson = function () {
+ return this.toString();
+ }
+
+ Timestamp.prototype.getTime = function () {
+ return this.t;
+ }
+
+ Timestamp.prototype.getInc = function () {
+ return this.i;
+ }
+
+ Timestamp.prototype.toString = function () {
+ return "Timestamp(" + this.t + ", " + this.i + ")";
+ }
+}
+else {
+ print( "warning: no Timestamp class" );
+}
+
+if ( typeof( BinData ) != "undefined" ){
+ BinData.prototype.tojson = function () {
+ return this.toString();
+ }
+
+ BinData.prototype.subtype = function () {
+ return this.type;
+ }
+
+ BinData.prototype.length = function () {
+ return this.len;
+ }
+}
+else {
+ print( "warning: no BinData class" );
+}
+
+if ( typeof _threadInject != "undefined" ){
+ print( "fork() available!" );
+
+ Thread = function(){
+ this.init.apply( this, arguments );
+ }
+ _threadInject( Thread.prototype );
+
+ ScopedThread = function() {
+ this.init.apply( this, arguments );
+ }
+ ScopedThread.prototype = new Thread( function() {} );
+ _scopedThreadInject( ScopedThread.prototype );
+
+ fork = function() {
+ var t = new Thread( function() {} );
+ Thread.apply( t, arguments );
+ return t;
+ }
+
+ // Helper class to generate a list of events which may be executed by a ParallelTester
+ EventGenerator = function( me, collectionName, mean ) {
+ this.mean = mean;
+ this.events = new Array( me, collectionName );
+ }
+
+ EventGenerator.prototype._add = function( action ) {
+ this.events.push( [ Random.genExp( this.mean ), action ] );
+ }
+
+ EventGenerator.prototype.addInsert = function( obj ) {
+ this._add( "t.insert( " + tojson( obj ) + " )" );
+ }
+
+ EventGenerator.prototype.addRemove = function( obj ) {
+ this._add( "t.remove( " + tojson( obj ) + " )" );
+ }
+
+ EventGenerator.prototype.addUpdate = function( objOld, objNew ) {
+ this._add( "t.update( " + tojson( objOld ) + ", " + tojson( objNew ) + " )" );
+ }
+
+ EventGenerator.prototype.addCheckCount = function( count, query, shouldPrint, checkQuery ) {
+ query = query || {};
+ shouldPrint = shouldPrint || false;
+ checkQuery = checkQuery || false;
+ var action = "assert.eq( " + count + ", t.count( " + tojson( query ) + " ) );"
+ if ( checkQuery ) {
+ action += " assert.eq( " + count + ", t.find( " + tojson( query ) + " ).toArray().length );"
+ }
+ if ( shouldPrint ) {
+ action += " print( me + ' ' + " + count + " );";
+ }
+ this._add( action );
+ }
+
+ EventGenerator.prototype.getEvents = function() {
+ return this.events;
+ }
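+
+    // Illustrative sketch: build a timed event list for EventGenerator.dispatch, e.g.:
+    //   var gen = new EventGenerator( "worker1", "mycoll", 100 /* assumed mean ms */ );
+    //   gen.addInsert( { _id : 1 } );
+    //   gen.addCheckCount( 1, { _id : 1 } );
+    //   var events = gen.getEvents(); // [ me, collectionName, [delay, action], ... ]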
+
+ EventGenerator.dispatch = function() {
+ var args = argumentsToArray( arguments );
+ var me = args.shift();
+ var collectionName = args.shift();
+ var m = new Mongo( db.getMongo().host );
+ var t = m.getDB( "test" )[ collectionName ];
+ for( var i in args ) {
+ sleep( args[ i ][ 0 ] );
+ eval( args[ i ][ 1 ] );
+ }
+ }
+
+ // Helper class for running tests in parallel. It assembles a set of tests
+    // and then calls assert.parallelTests to run them.
+ ParallelTester = function() {
+ this.params = new Array();
+ }
+
+ ParallelTester.prototype.add = function( fun, args ) {
+ args = args || [];
+ args.unshift( fun );
+ this.params.push( args );
+ }
+
+ ParallelTester.prototype.run = function( msg, newScopes ) {
+ newScopes = newScopes || false;
+ assert.parallelTests( this.params, msg, newScopes );
+ }
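+
+    // Illustrative sketch: queue two functions and run them in parallel threads, e.g.:
+    //   var pt = new ParallelTester();
+    //   pt.add( function( n ){ assert.eq( 1, n ); }, [ 1 ] );
+    //   pt.add( function(){ sleep( 100 ); } );
+    //   pt.run( "parallel smoke test" );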
+
+ // creates lists of tests from jstests dir in a format suitable for use by
+ // ParallelTester.fileTester. The lists will be in random order.
+ // n: number of lists to split these tests into
+ ParallelTester.createJstestsLists = function( n ) {
+ var params = new Array();
+ for( var i = 0; i < n; ++i ) {
+ params.push( [] );
+ }
+
+ var makeKeys = function( a ) {
+ var ret = {};
+ for( var i in a ) {
+ ret[ a[ i ] ] = 1;
+ }
+ return ret;
+ }
+
+ // some tests can't run in parallel with most others
+ var skipTests = makeKeys( [ "jstests/dbadmin.js",
+ "jstests/repair.js",
+ "jstests/cursor8.js",
+ "jstests/recstore.js",
+ "jstests/extent.js",
+ "jstests/indexb.js",
+ "jstests/profile1.js",
+ "jstests/mr3.js",
+ "jstests/indexh.js",
+ "jstests/apitest_db.js",
+ "jstests/evalb.js",
+ "jstests/evald.js",
+ "jstests/evalf.js",
+ "jstests/killop.js",
+ "jstests/run_program1.js",
+ "jstests/notablescan.js",
+ "jstests/drop2.js",
+ "jstests/dropdb_race.js",
+ "jstests/fsync2.js", // May be placed in serialTestsArr once SERVER-4243 is fixed.
+ "jstests/bench_test1.js"] );
+
+ // some tests can't be run in parallel with each other
+ var serialTestsArr = [ "jstests/fsync.js"
+// ,"jstests/fsync2.js" // SERVER-4243
+ ];
+ var serialTests = makeKeys( serialTestsArr );
+
+ params[ 0 ] = serialTestsArr;
+
+ var files = listFiles("jstests");
+ files = Array.shuffle( files );
+
+ var i = 0;
+ files.forEach(
+ function(x) {
+
+                if ( ( /[\/\\]_/.test(x.name) ) ||
+                     ( ! /\.js$/.test(x.name ) ) ||
+                     ( x.name in skipTests ) ||
+                     ( x.name in serialTests ) ){
+ print(" >>>>>>>>>>>>>>> skipping " + x.name);
+ return;
+ }
+
+ params[ i % n ].push( x.name );
+ ++i;
+ }
+ );
+
+ // randomize ordering of the serialTests
+ params[ 0 ] = Array.shuffle( params[ 0 ] );
+
+ for( var i in params ) {
+ params[ i ].unshift( i );
+ }
+
+ return params;
+ }
+
+ // runs a set of test files
+ // first argument is an identifier for this tester, remaining arguments are file names
+ ParallelTester.fileTester = function() {
+ var args = argumentsToArray( arguments );
+ var suite = args.shift();
+ args.forEach(
+ function( x ) {
+ print(" S" + suite + " Test : " + x + " ...");
+ var time = Date.timeFunc( function() { load(x); }, 1);
+ print(" S" + suite + " Test : " + x + " " + time + "ms" );
+ }
+ );
+ }
+
+ // params: array of arrays, each element of which consists of a function followed
+ // by zero or more arguments to that function. Each function and its arguments will
+ // be called in a separate thread.
+ // msg: failure message
+ // newScopes: if true, each thread starts in a fresh scope
+ assert.parallelTests = function( params, msg, newScopes ) {
+ newScopes = newScopes || false;
+ var wrapper = function( fun, argv ) {
+ eval (
+ "var z = function() {" +
+ "var __parallelTests__fun = " + fun.toString() + ";" +
+ "var __parallelTests__argv = " + tojson( argv ) + ";" +
+ "var __parallelTests__passed = false;" +
+ "try {" +
+ "__parallelTests__fun.apply( 0, __parallelTests__argv );" +
+ "__parallelTests__passed = true;" +
+ "} catch ( e ) {" +
+ "print( '********** Parallel Test FAILED: ' + tojson(e) );" +
+ "}" +
+ "return __parallelTests__passed;" +
+ "}"
+ );
+ return z;
+ }
+ var runners = new Array();
+ for( var i in params ) {
+ var param = params[ i ];
+ var test = param.shift();
+ var t;
+ if ( newScopes )
+ t = new ScopedThread( wrapper( test, param ) );
+ else
+ t = new Thread( wrapper( test, param ) );
+ runners.push( t );
+ }
+
+ runners.forEach( function( x ) { x.start(); } );
+ var nFailed = 0;
+ // v8 doesn't like it if we exit before all threads are joined (SERVER-529)
+ runners.forEach( function( x ) { if( !x.returnData() ) { ++nFailed; } } );
+ assert.eq( 0, nFailed, msg );
+ }
+}
+
+tojsononeline = function( x ){
+ return tojson( x , " " , true );
+}
+
+tojson = function( x, indent , nolint ){
+ if ( x === null )
+ return "null";
+
+ if ( x === undefined )
+ return "undefined";
+
+ if (!indent)
+ indent = "";
+
+ switch ( typeof x ) {
+ case "string": {
+ var s = "\"";
+ for ( var i=0; i<x.length; i++ ){
+ switch (x[i]){
+ case '"': s += '\\"'; break;
+ case '\\': s += '\\\\'; break;
+ case '\b': s += '\\b'; break;
+ case '\f': s += '\\f'; break;
+ case '\n': s += '\\n'; break;
+ case '\r': s += '\\r'; break;
+ case '\t': s += '\\t'; break;
+
+ default: {
+ var code = x.charCodeAt(i);
+ if (code < 0x20){
+ s += (code < 0x10 ? '\\u000' : '\\u00') + code.toString(16);
+ } else {
+ s += x[i];
+ }
+ }
+ }
+ }
+ return s + "\"";
+ }
+ case "number":
+ case "boolean":
+ return "" + x;
+ case "object":{
+ var s = tojsonObject( x, indent , nolint );
+ if ( ( nolint == null || nolint == true ) && s.length < 80 && ( indent == null || indent.length == 0 ) ){
+ s = s.replace( /[\s\r\n ]+/gm , " " );
+ }
+ return s;
+ }
+ case "function":
+ return x.toString();
+ default:
+ throw "tojson can't handle type " + ( typeof x );
+ }
+
+}
+
+tojsonObject = function( x, indent , nolint ){
+ var lineEnding = nolint ? " " : "\n";
+ var tabSpace = nolint ? "" : "\t";
+
+ assert.eq( ( typeof x ) , "object" , "tojsonObject needs object, not [" + ( typeof x ) + "]" );
+
+ if (!indent)
+ indent = "";
+
+ if ( typeof( x.tojson ) == "function" && x.tojson != tojson ) {
+ return x.tojson(indent,nolint);
+ }
+
+ if ( x.constructor && typeof( x.constructor.tojson ) == "function" && x.constructor.tojson != tojson ) {
+ return x.constructor.tojson( x, indent , nolint );
+ }
+
+ if ( x.toString() == "[object MaxKey]" )
+ return "{ $maxKey : 1 }";
+ if ( x.toString() == "[object MinKey]" )
+ return "{ $minKey : 1 }";
+
+ var s = "{" + lineEnding;
+
+ // push one level of indent
+ indent += tabSpace;
+
+ var total = 0;
+ for ( var k in x ) total++;
+ if ( total == 0 ) {
+ s += indent + lineEnding;
+ }
+
+ var keys = x;
+ if ( typeof( x._simpleKeys ) == "function" )
+ keys = x._simpleKeys();
+ var num = 1;
+ for ( var k in keys ){
+
+ var val = x[k];
+ if ( val == DB.prototype || val == DBCollection.prototype )
+ continue;
+
+ s += indent + "\"" + k + "\" : " + tojson( val, indent , nolint );
+ if (num != total) {
+ s += ",";
+ num++;
+ }
+ s += lineEnding;
+ }
+
+ // pop one level of indent
+ indent = indent.substring(1);
+ return s + indent + "}";
+}
+
+shellPrint = function( x ){
+ it = x;
+ if ( x != undefined )
+ shellPrintHelper( x );
+
+ if ( db ){
+ var e = db.getPrevError();
+ if ( e.err ) {
+ if ( e.nPrev <= 1 )
+ print( "error on last call: " + tojson( e.err ) );
+ else
+ print( "an error " + tojson( e.err ) + " occurred " + e.nPrev + " operations back in the command invocation" );
+ }
+ db.resetError();
+ }
+}
+
+printjson = function(x){
+ print( tojson( x ) );
+}
+
+printjsononeline = function(x){
+ print( tojsononeline( x ) );
+}
+
+if ( typeof TestData == "undefined" ){
+ TestData = undefined
+}
+
+jsTestName = function(){
+ if( TestData ) return TestData.testName
+ return "__unknown_name__"
+}
+
+jsTestFile = function(){
+ if( TestData ) return TestData.testFile
+ return "__unknown_file__"
+}
+
+jsTestPath = function(){
+ if( TestData ) return TestData.testPath
+ return "__unknown_path__"
+}
+
+jsTestOptions = function(){
+ if( TestData ) return { noJournal : TestData.noJournal,
+ noJournalPrealloc : TestData.noJournalPrealloc,
+ auth : TestData.auth,
+ keyFile : TestData.keyFile,
+ authUser : "__system",
+ authPassword : TestData.keyFileData,
+ adminUser : "admin",
+ adminPassword : "password" }
+ return {}
+}
+
+jsTestLog = function(msg){
+ print( "\n\n----\n" + msg + "\n----\n\n" )
+}
+
+jsTest = {}
+
+jsTest.name = jsTestName
+jsTest.file = jsTestFile
+jsTest.path = jsTestPath
+jsTest.options = jsTestOptions
+jsTest.log = jsTestLog
+
+jsTest.dir = function(){
+ return jsTest.path().replace( /\/[^\/]+$/, "/" )
+}
+
+jsTest.randomize = function( seed ) {
+ if( seed == undefined ) seed = new Date().getTime()
+ Random.srand( seed )
+ print( "Random seed for test : " + seed )
+}
+
+/**
+* Adds a user to the admin DB on the given connection. This is only used for running the test suite
+* with authentication enabled.
+*/
+jsTest.addAuth = function(conn) {
+ print ("Adding admin user on connection: " + conn);
+ return conn.getDB('admin').addUser(jsTestOptions().adminUser, jsTestOptions().adminPassword);
+}
+
+jsTest.authenticate = function(conn) {
+ conn.authenticated = true;
+    var result1 = null;
+    var result2 = null;
+ if (jsTest.options().auth) {
+ print ("Authenticating to admin user on connection: " + conn);
+ result1 = conn.getDB('admin').auth(jsTestOptions().adminUser, jsTestOptions().adminPassword);
+ }
+ if (jsTest.options().keyFile && !jsTest.isMongos(conn)) {
+ print ("Authenticating to system user on connection: " + conn);
+ result2 = conn.getDB('local').auth(jsTestOptions().authUser, jsTestOptions().authPassword);
+ }
+
+ if (result1 == 1 || result2 == 1) {
+ return 1;
+ }
+
+ return result2 != null ? result2 : result1;
+}
+
+jsTest.authenticateNodes = function(nodes) {
+ jsTest.attempt({timeout:30000, desc: "Authenticate to nodes: " + nodes}, function() {
+ for (var i = 0; i < nodes.length; i++) {
+ // Don't try to authenticate to arbiters
+            var res = nodes[i].getDB("admin").runCommand({replSetGetStatus: 1});
+ if(res.myState == 7) {
+ continue;
+ }
+ if(jsTest.authenticate(nodes[i]) != 1) {
+ return false;
+ }
+ }
+ return true;
+ });
+}
+
+jsTest.isMongos = function(conn) {
+ return conn.getDB('admin').isMaster().msg=='isdbgrid';
+}
+
+// Pass this method a function to call repeatedly until
+// that function returns true. Example:
+// jsTest.attempt({timeout: 20000, desc: "get master"}, function() { /* return false until success */ })
+jsTest.attempt = function( opts, func ) {
+ var timeout = opts.timeout || 1000;
+ var tries = 0;
+ var sleepTime = 500;
+ var result = null;
+ var context = opts.context || this;
+
+ while((result = func.apply(context)) == false) {
+ tries += 1;
+ sleep(sleepTime);
+ if( tries * sleepTime > timeout) {
+ throw('[' + opts['desc'] + ']' + " timed out after " + timeout + "ms ( " + tries + " tries )");
+ }
+ }
+
+ return result;
+}
+
+
+shellPrintHelper = function (x) {
+ if (typeof (x) == "undefined") {
+ // Make sure that we have a db var before we use it
+ // TODO: This implicit calling of GLE can cause subtle, hard to track issues - remove?
+ if (__callLastError && typeof( db ) != "undefined" && db.getMongo ) {
+ __callLastError = false;
+ // explicit w:1 so that replset getLastErrorDefaults aren't used here which would be bad.
+ var err = db.getLastError(1);
+ if (err != null) {
+ print(err);
+ }
+ }
+ return;
+ }
+
+ if (x == __magicNoPrint)
+ return;
+
+ if (x == null) {
+ print("null");
+ return;
+ }
+
+ if (typeof x != "object")
+ return print(x);
+
+ var p = x.shellPrint;
+ if (typeof p == "function")
+ return x.shellPrint();
+
+ var p = x.tojson;
+ if (typeof p == "function")
+ print(x.tojson());
+ else
+ print(tojson(x));
+}
+
+shellAutocomplete = function ( /*prefix*/ ) { // outer scope function called on init. Actual function at end
+
+ var universalMethods = "constructor prototype toString valueOf toLocaleString hasOwnProperty propertyIsEnumerable".split( ' ' );
+
+ var builtinMethods = {}; // uses constructor objects as keys
+ builtinMethods[Array] = "length concat join pop push reverse shift slice sort splice unshift indexOf lastIndexOf every filter forEach map some".split( ' ' );
+ builtinMethods[Boolean] = "".split( ' ' ); // nothing more than universal methods
+ builtinMethods[Date] = "getDate getDay getFullYear getHours getMilliseconds getMinutes getMonth getSeconds getTime getTimezoneOffset getUTCDate getUTCDay getUTCFullYear getUTCHours getUTCMilliseconds getUTCMinutes getUTCMonth getUTCSeconds getYear parse setDate setFullYear setHours setMilliseconds setMinutes setMonth setSeconds setTime setUTCDate setUTCFullYear setUTCHours setUTCMilliseconds setUTCMinutes setUTCMonth setUTCSeconds setYear toDateString toGMTString toLocaleDateString toLocaleTimeString toTimeString toUTCString UTC".split( ' ' );
+ builtinMethods[Math] = "E LN2 LN10 LOG2E LOG10E PI SQRT1_2 SQRT2 abs acos asin atan atan2 ceil cos exp floor log max min pow random round sin sqrt tan".split( ' ' );
+ builtinMethods[Number] = "MAX_VALUE MIN_VALUE NEGATIVE_INFINITY POSITIVE_INFINITY toExponential toFixed toPrecision".split( ' ' );
+ builtinMethods[RegExp] = "global ignoreCase lastIndex multiline source compile exec test".split( ' ' );
+ builtinMethods[String] = "length charAt charCodeAt concat fromCharCode indexOf lastIndexOf match replace search slice split substr substring toLowerCase toUpperCase".split( ' ' );
+ builtinMethods[Function] = "call apply".split( ' ' );
+ builtinMethods[Object] = "bsonsize".split( ' ' );
+
+ builtinMethods[Mongo] = "find update insert remove".split( ' ' );
+ builtinMethods[BinData] = "hex base64 length subtype".split( ' ' );
+
+ var extraGlobals = "Infinity NaN undefined null true false decodeURI decodeURIComponent encodeURI encodeURIComponent escape eval isFinite isNaN parseFloat parseInt unescape Array Boolean Date Math Number RegExp String print load gc MinKey MaxKey Mongo NumberLong ObjectId DBPointer UUID BinData Map".split( ' ' );
+
+ var isPrivate = function( name ) {
+ if ( shellAutocomplete.showPrivate ) return false;
+ if ( name == '_id' ) return false;
+ if ( name[0] == '_' ) return true;
+ if ( name[name.length - 1] == '_' ) return true; // some native functions have an extra name_ method
+ return false;
+ }
+
+ var customComplete = function( obj ) {
+ try {
+ if ( obj.__proto__.constructor.autocomplete ) {
+ var ret = obj.constructor.autocomplete( obj );
+ if ( ret.constructor != Array ) {
+ print( "\nautocompleters must return real Arrays" );
+ return [];
+ }
+ return ret;
+ } else {
+ return [];
+ }
+ } catch ( e ) {
+ // print( e ); // uncomment if debugging custom completers
+ return [];
+ }
+ }
+
+ var worker = function( prefix ) {
+ var global = ( function() { return this; } ).call(); // trick to get global object
+
+ var curObj = global;
+ var parts = prefix.split( '.' );
+ for ( var p = 0; p < parts.length - 1; p++ ) { // doesn't include last part
+ curObj = curObj[parts[p]];
+ if ( curObj == null )
+ return [];
+ }
+
+ var lastPrefix = parts[parts.length - 1] || '';
+ var lastPrefixLowercase = lastPrefix.toLowerCase()
+ var beginning = parts.slice( 0, parts.length - 1 ).join( '.' );
+ if ( beginning.length )
+ beginning += '.';
+
+ var possibilities = new Array().concat(
+ universalMethods,
+ Object.keySet( curObj ),
+ Object.keySet( curObj.__proto__ ),
+ builtinMethods[curObj] || [], // curObj is a builtin constructor
+ builtinMethods[curObj.__proto__.constructor] || [], // curObj is made from a builtin constructor
+ curObj == global ? extraGlobals : [],
+ customComplete( curObj )
+ );
+
+ var noDuplicates = {}; // see http://dreaminginjavascript.wordpress.com/2008/08/22/eliminating-duplicates/
+ for ( var i = 0; i < possibilities.length; i++ ) {
+ var p = possibilities[i];
+ if ( typeof ( curObj[p] ) == "undefined" && curObj != global ) continue; // extraGlobals aren't in the global object
+ if ( p.length == 0 || p.length < lastPrefix.length ) continue;
+ if ( lastPrefix[0] != '_' && isPrivate( p ) ) continue;
+            if ( p.match( /^[0-9]+$/ ) ) continue; // don't complete array number indexes
+ if ( p.substr( 0, lastPrefix.length ).toLowerCase() != lastPrefixLowercase ) continue;
+
+ var completion = beginning + p;
+ if ( curObj[p] && curObj[p].constructor == Function && p != 'constructor' )
+ completion += '(';
+
+ noDuplicates[completion] = 0;
+ }
+
+ var ret = [];
+ for ( i in noDuplicates )
+ ret.push( i );
+
+ return ret;
+ }
+
+ // this is the actual function that gets assigned to shellAutocomplete
+ return function( prefix ) {
+ try {
+ __autocomplete__ = worker( prefix ).sort();
+ } catch ( e ) {
+ print( "exception during autocomplete: " + tojson( e.message ) );
+ __autocomplete__ = [];
+ }
+ }
+} ();
+
+shellAutocomplete.showPrivate = false; // toggle to show (useful when working on internals)
+
+shellHelper = function( command , rest , shouldPrint ){
+ command = command.trim();
+    var args = rest.trim().replace(/\s*;$/,"").split( /\s+/ );
+
+ if ( ! shellHelper[command] )
+ throw "no command [" + command + "]";
+
+ var res = shellHelper[command].apply( null , args );
+ if ( shouldPrint ){
+ shellPrintHelper( res );
+ }
+ return res;
+}
+
+shellHelper.use = function (dbname) {
+ var s = "" + dbname;
+ if (s == "") {
+ print("bad use parameter");
+ return;
+ }
+ db = db.getMongo().getDB(dbname);
+ print("switched to db " + db.getName());
+}
+
+shellHelper.set = function (str) {
+ if (str == "") {
+ print("bad use parameter");
+ return;
+ }
+ tokens = str.split(" ");
+ param = tokens[0];
+ value = tokens[1];
+
+ if ( value == undefined ) value = true;
+ // value comes in as a string..
+ if ( value == "true" ) value = true;
+ if ( value == "false" ) value = false;
+
+ if (param == "verbose") {
+ _verboseShell = value;
+ }
+ print("set " + param + " to " + value);
+}
+
+shellHelper.it = function(){
+ if ( typeof( ___it___ ) == "undefined" || ___it___ == null ){
+ print( "no cursor" );
+ return;
+ }
+ shellPrintHelper( ___it___ );
+}
+
+shellHelper.show = function (what) {
+ assert(typeof what == "string");
+
+ var args = what.split( /\s+/ );
+ what = args[0]
+ args = args.splice(1)
+
+ if (what == "profile") {
+ if (db.system.profile.count() == 0) {
+ print("db.system.profile is empty");
+ print("Use db.setProfilingLevel(2) will enable profiling");
+ print("Use db.system.profile.find() to show raw profile entries");
+ }
+ else {
+ print();
+ db.system.profile.find({ millis: { $gt: 0} }).sort({ $natural: -1 }).limit(5).forEach(
+ function (x) {
+ print("" + x.op + "\t" + x.ns + " " + x.millis + "ms " + String(x.ts).substring(0, 24));
+ var l = "";
+ for ( var z in x ){
+ if ( z == "op" || z == "ns" || z == "millis" || z == "ts" )
+ continue;
+
+ var val = x[z];
+ var mytype = typeof(val);
+
+ if ( mytype == "string" ||
+ mytype == "number" )
+ l += z + ":" + val + " ";
+ else if ( mytype == "object" )
+ l += z + ":" + tojson(val ) + " ";
+ else if ( mytype == "boolean" )
+ l += z + " ";
+ else
+ l += z + ":" + val + " ";
+
+ }
+ print( l );
+ print("\n");
+ }
+ )
+ }
+ return "";
+ }
+
+ if (what == "users") {
+ db.system.users.find().forEach(printjson);
+ return "";
+ }
+
+ if (what == "collections" || what == "tables") {
+ db.getCollectionNames().forEach(function (x) { print(x) });
+ return "";
+ }
+
+ if (what == "dbs") {
+ var dbs = db.getMongo().getDBs();
+ var size = {};
+ dbs.databases.forEach(function (x) { size[x.name] = x.sizeOnDisk; });
+ var names = dbs.databases.map(function (z) { return z.name; }).sort();
+ names.forEach(function (n) {
+ if (size[n] > 1) {
+ print(n + "\t" + size[n] / 1024 / 1024 / 1024 + "GB");
+ } else {
+ print(n + "\t(empty)");
+ }
+ });
+ //db.getMongo().getDBNames().sort().forEach(function (x) { print(x) });
+ return "";
+ }
+
+ if (what == "log" ) {
+ var n = "global";
+ if ( args.length > 0 )
+ n = args[0]
+
+ var res = db.adminCommand( { getLog : n } )
+ for ( var i=0; i<res.log.length; i++){
+ print( res.log[i] )
+ }
+ return ""
+ }
+
+ if (what == "logs" ) {
+ var res = db.adminCommand( { getLog : "*" } )
+ for ( var i=0; i<res.names.length; i++){
+ print( res.names[i] )
+ }
+ return ""
+ }
+
+
+ throw "don't know how to show [" + what + "]";
+
+}
+
+if ( typeof( Map ) == "undefined" ){
+ Map = function(){
+ this._data = {};
+ }
+}
+
+Map.hash = function( val ){
+ if ( ! val )
+ return val;
+
+ switch ( typeof( val ) ){
+ case 'string':
+ case 'number':
+ case 'date':
+ return val.toString();
+ case 'object':
+ case 'array':
+ var s = "";
+ for ( var k in val ){
+ s += k + val[k];
+ }
+ return s;
+ }
+
+ throw "can't hash : " + typeof( val );
+}
+
+Map.prototype.put = function( key , value ){
+ var o = this._get( key );
+ var old = o.value;
+ o.value = value;
+ return old;
+}
+
+Map.prototype.get = function( key ){
+ return this._get( key ).value;
+}
+
+Map.prototype._get = function( key ){
+ var h = Map.hash( key );
+ var a = this._data[h];
+ if ( ! a ){
+ a = [];
+ this._data[h] = a;
+ }
+
+ for ( var i=0; i<a.length; i++ ){
+ if ( friendlyEqual( key , a[i].key ) ){
+ return a[i];
+ }
+ }
+ var o = { key : key , value : null };
+ a.push( o );
+ return o;
+}
+
+Map.prototype.values = function(){
+ var all = [];
+ for ( var k in this._data ){
+ this._data[k].forEach( function(z){ all.push( z.value ); } );
+ }
+ return all;
+}
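+
+// Illustrative sketch: keys are hashed structurally, so equal-by-value documents hit
+// the same slot and are disambiguated with friendlyEqual, e.g.:
+//   var m = new Map();
+//   m.put( { a : 1 } , "first" );
+//   assert.eq( "first", m.get( { a : 1 } ) );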
+
+if ( typeof( gc ) == "undefined" ){
+ gc = function(){
+ print( "warning: using noop gc()" );
+ }
+}
+
+
+Math.sigFig = function( x , N ){
+ if ( ! N ){
+ N = 3;
+ }
+ var p = Math.pow( 10, N - Math.ceil( Math.log( Math.abs(x) ) / Math.log( 10 )) );
+ return Math.round(x*p)/p;
+}
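+
+// Illustrative sketch: sigFig rounds to N significant figures (default 3), e.g.:
+//   assert.eq( 123000, Math.sigFig( 123456 ) );
+//   assert.eq( 0.00123, Math.sigFig( 0.0012345 ) );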
+
+Random = function() {}
+
+// set random seed
+Random.srand = function( s ) { _srand( s ); }
+
+// random number 0 <= r < 1
+Random.rand = function() { return _rand(); }
+
+// random integer 0 <= r < n
+Random.randInt = function( n ) { return Math.floor( Random.rand() * n ); }
+
+Random.setRandomSeed = function( s ) {
+ s = s || new Date().getTime();
+ print( "setting random seed: " + s );
+ Random.srand( s );
+}
+
+// generate a random value from the exponential distribution with the specified mean
+Random.genExp = function( mean ) {
+ return -Math.log( Random.rand() ) * mean;
+}
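+
+// Illustrative sketch: seed once for reproducibility, then draw values, e.g.:
+//   Random.srand( 42 );                  // assumed seed; any integer works
+//   var r = Random.randInt( 10 );        // integer in [0, 10)
+//   assert( r >= 0 && r < 10 );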
+
+Geo = {};
+Geo.distance = function( a , b ){
+ var ax = null;
+ var ay = null;
+ var bx = null;
+ var by = null;
+
+ for ( var key in a ){
+ if ( ax == null )
+ ax = a[key];
+ else if ( ay == null )
+ ay = a[key];
+ }
+
+ for ( var key in b ){
+ if ( bx == null )
+ bx = b[key];
+ else if ( by == null )
+ by = b[key];
+ }
+
+ return Math.sqrt( Math.pow( by - ay , 2 ) +
+ Math.pow( bx - ax , 2 ) );
+}
+
+Geo.sphereDistance = function( a , b ){
+ var ax = null;
+ var ay = null;
+ var bx = null;
+ var by = null;
+
+ // TODO swap order of x and y when done on server
+ for ( var key in a ){
+ if ( ax == null )
+ ax = a[key] * (Math.PI/180);
+ else if ( ay == null )
+ ay = a[key] * (Math.PI/180);
+ }
+
+ for ( var key in b ){
+ if ( bx == null )
+ bx = b[key] * (Math.PI/180);
+ else if ( by == null )
+ by = b[key] * (Math.PI/180);
+ }
+
+ var sin_x1=Math.sin(ax), cos_x1=Math.cos(ax);
+ var sin_y1=Math.sin(ay), cos_y1=Math.cos(ay);
+ var sin_x2=Math.sin(bx), cos_x2=Math.cos(bx);
+ var sin_y2=Math.sin(by), cos_y2=Math.cos(by);
+
+ var cross_prod =
+ (cos_y1*cos_x1 * cos_y2*cos_x2) +
+ (cos_y1*sin_x1 * cos_y2*sin_x2) +
+ (sin_y1 * sin_y2);
+
+ if (cross_prod >= 1 || cross_prod <= -1){
+ // fun with floats
+ assert( Math.abs(cross_prod)-1 < 1e-6 );
+ return cross_prod > 0 ? 0 : Math.PI;
+ }
+
+ return Math.acos(cross_prod);
+}
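+
+// Illustrative sketch: inputs are two-field { x , y } style documents in degrees and the
+// result is the central angle in radians, so multiply by a radius for a distance, e.g.:
+//   var rad = Geo.sphereDistance( { x : 0, y : 0 } , { x : 90, y : 0 } );
+//   assert.close( Math.PI / 2, rad );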
+
+rs = function () { return "try rs.help()"; }
+
+rs.help = function () {
+ print("\trs.status() { replSetGetStatus : 1 } checks repl set status");
+ print("\trs.initiate() { replSetInitiate : null } initiates set with default settings");
+ print("\trs.initiate(cfg) { replSetInitiate : cfg } initiates set with configuration cfg");
+ print("\trs.conf() get the current configuration object from local.system.replset");
+ print("\trs.reconfig(cfg) updates the configuration of a running replica set with cfg (disconnects)");
+ print("\trs.add(hostportstr) add a new member to the set with default attributes (disconnects)");
+ print("\trs.add(membercfgobj) add a new member to the set with extra attributes (disconnects)");
+ print("\trs.addArb(hostportstr) add a new member which is arbiterOnly:true (disconnects)");
+ print("\trs.stepDown([secs]) step down as primary (momentarily) (disconnects)");
+ print("\trs.freeze(secs) make a node ineligible to become primary for the time specified");
+ print("\trs.remove(hostportstr) remove a host from the replica set (disconnects)");
+ print("\trs.slaveOk() shorthand for db.getMongo().setSlaveOk()");
+ print();
+ print("\tdb.isMaster() check who is primary");
+ print();
+ print("\treconfiguration helpers disconnect from the database so the shell will display");
+ print("\tan error, even if the command succeeds.");
+ print("\tsee also http://<mongod_host>:28017/_replSet for additional diagnostic info");
+}
+rs.slaveOk = function (value) { return db.getMongo().setSlaveOk(value); }
+rs.status = function () { return db._adminCommand("replSetGetStatus"); }
+rs.isMaster = function () { return db.isMaster(); }
+rs.initiate = function (c) { return db._adminCommand({ replSetInitiate: c }); }
+rs._runCmd = function (c) {
+ // after the command, catch the disconnect and reconnect if necessary
+ var res = null;
+ try {
+ res = db.adminCommand(c);
+ }
+ catch (e) {
+ if (("" + e).indexOf("error doing query") >= 0) {
+ // closed connection. reconnect.
+ db.getLastErrorObj();
+ var o = db.getLastErrorObj();
+ if (o.ok) {
+ print("reconnected to server after rs command (which is normal)");
+ }
+ else {
+ printjson(o);
+ }
+ }
+ else {
+ print("shell got exception during repl set operation: " + e);
+ print("in some circumstances, the primary steps down and closes connections on a reconfig");
+ }
+ return "";
+ }
+ return res;
+}
+rs.reconfig = function (cfg, options) {
+ cfg.version = rs.conf().version + 1;
+    var cmd = { replSetReconfig: cfg };
+ for (var i in options) {
+ cmd[i] = options[i];
+ }
+ return this._runCmd(cmd);
+}
+rs.add = function (hostport, arb) {
+ var cfg = hostport;
+
+ var local = db.getSisterDB("local");
+ assert(local.system.replset.count() <= 1, "error: local.system.replset has unexpected contents");
+ var c = local.system.replset.findOne();
+ assert(c, "no config object retrievable from local.system.replset");
+
+ c.version++;
+
+ var max = 0;
+ for (var i in c.members)
+ if (c.members[i]._id > max) max = c.members[i]._id;
+ if (isString(hostport)) {
+ cfg = { _id: max + 1, host: hostport };
+ if (arb)
+ cfg.arbiterOnly = true;
+ }
+ c.members.push(cfg);
+ return this._runCmd({ replSetReconfig: c });
+}
+rs.stepDown = function (secs) { return db._adminCommand({ replSetStepDown:(secs === undefined) ? 60:secs}); }
+rs.freeze = function (secs) { return db._adminCommand({replSetFreeze:secs}); }
+rs.addArb = function (hn) { return this.add(hn, true); }
+rs.conf = function () { return db.getSisterDB("local").system.replset.findOne(); }
+rs.config = function () { return rs.conf(); }
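+
+// Illustrative sketch: rs.reconfig bumps the version itself, so a typical edit is just
+// fetch, tweak, resubmit (the shell may briefly disconnect), e.g.:
+//   var cfg = rs.conf();          // current config from local.system.replset
+//   cfg.members[0].priority = 2;  // assumed field tweak
+//   rs.reconfig( cfg );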
+
+rs.remove = function (hn) {
+ var local = db.getSisterDB("local");
+ assert(local.system.replset.count() <= 1, "error: local.system.replset has unexpected contents");
+ var c = local.system.replset.findOne();
+ assert(c, "no config object retrievable from local.system.replset");
+ c.version++;
+
+ for (var i in c.members) {
+ if (c.members[i].host == hn) {
+ c.members.splice(i, 1);
+ return db._adminCommand({ replSetReconfig : c});
+ }
+ }
+
+ return "error: couldn't find "+hn+" in "+tojson(c.members);
+};
+
+rs.debug = {};
+
+rs.debug.nullLastOpWritten = function(primary, secondary) {
+ var p = connect(primary+"/local");
+ var s = connect(secondary+"/local");
+ s.getMongo().setSlaveOk();
+
+ var secondToLast = s.oplog.rs.find().sort({$natural : -1}).limit(1).next();
+ var last = p.runCommand({findAndModify : "oplog.rs",
+ query : {ts : {$gt : secondToLast.ts}},
+ sort : {$natural : 1},
+ update : {$set : {op : "n"}}});
+
+ if (!last.value.o || !last.value.o._id) {
+ print("couldn't find an _id?");
+ }
+ else {
+ last.value.o = {_id : last.value.o._id};
+ }
+
+ print("nulling out this op:");
+ printjson(last);
+};
+
+rs.debug.getLastOpWritten = function(server) {
+ var s = db.getSisterDB("local");
+ if (server) {
+ s = connect(server+"/local");
+ }
+ s.getMongo().setSlaveOk();
+
+ return s.oplog.rs.find().sort({$natural : -1}).limit(1).next();
+};
+
+
+help = shellHelper.help = function (x) {
+ if (x == "mr") {
+ print("\nSee also http://www.mongodb.org/display/DOCS/MapReduce");
+ print("\nfunction mapf() {");
+ print(" // 'this' holds current document to inspect");
+ print(" emit(key, value);");
+ print("}");
+ print("\nfunction reducef(key,value_array) {");
+ print(" return reduced_value;");
+ print("}");
+ print("\ndb.mycollection.mapReduce(mapf, reducef[, options])");
+ print("\noptions");
+ print("{[query : <query filter object>]");
+ print(" [, sort : <sort the query. useful for optimization>]");
+ print(" [, limit : <number of objects to return from collection>]");
+ print(" [, out : <output-collection name>]");
+ print(" [, keeptemp: <true|false>]");
+ print(" [, finalize : <finalizefunction>]");
+ print(" [, scope : <object where fields go into javascript global scope >]");
+ print(" [, verbose : true]}\n");
+ return;
+ } else if (x == "connect") {
+ print("\nNormally one specifies the server on the mongo shell command line. Run mongo --help to see those options.");
+ print("Additional connections may be opened:\n");
+ print(" var x = new Mongo('host[:port]');");
+ print(" var mydb = x.getDB('mydb');");
+ print(" or");
+ print(" var mydb = connect('host[:port]/mydb');");
+ print("\nNote: the REPL prompt only auto-reports getLastError() for the shell command line connection.\n");
+ return;
+ }
+ else if (x == "keys") {
+ print("Tab completion and command history is available at the command prompt.\n");
+ print("Some emacs keystrokes are available too:");
+ print(" Ctrl-A start of line");
+ print(" Ctrl-E end of line");
+ print(" Ctrl-K del to end of line");
+ print("\nMulti-line commands");
+ print("You can enter a multi line javascript expression. If parens, braces, etc. are not closed, you will see a new line ");
+ print("beginning with '...' characters. Type the rest of your expression. Press Ctrl-C to abort the data entry if you");
+ print("get stuck.\n");
+ }
+ else if (x == "misc") {
+ print("\tb = new BinData(subtype,base64str) create a BSON BinData value");
+ print("\tb.subtype() the BinData subtype (0..255)");
+ print("\tb.length() length of the BinData data in bytes");
+ print("\tb.hex() the data as a hex encoded string");
+ print("\tb.base64() the data as a base 64 encoded string");
+ print("\tb.toString()");
+ print();
+ print("\tb = HexData(subtype,hexstr) create a BSON BinData value from a hex string");
+ print("\tb = UUID(hexstr) create a BSON BinData value of UUID subtype");
+ print("\tb = MD5(hexstr) create a BSON BinData value of MD5 subtype");
+ print();
+ print("\to = new ObjectId() create a new ObjectId");
+ print("\to.getTimestamp() return timestamp derived from first 32 bits of the OID");
+ print("\to.isObjectId()");
+ print("\to.toString()");
+ print("\to.equals(otherid)");
+ print();
+ print("\td = ISODate() like Date() but behaves more intuitively when used");
+ print("\td = ISODate('YYYY-MM-DD hh:mm:ss') without an explicit \"new \" prefix on construction");
+ return;
+ }
+ else if (x == "admin") {
+ print("\tls([path]) list files");
+ print("\tpwd() returns current directory");
+ print("\tlistFiles([path]) returns file list");
+ print("\thostname() returns name of this host");
+ print("\tcat(fname) returns contents of text file as a string");
+ print("\tremoveFile(f) delete a file or directory");
+ print("\tload(jsfilename) load and execute a .js file");
+ print("\trun(program[, args...]) spawn a program and wait for its completion");
+ print("\trunProgram(program[, args...]) same as run(), above");
+ print("\tsleep(m) sleep m milliseconds");
+ print("\tgetMemInfo() diagnostic");
+ return;
+ }
+ else if (x == "test") {
+ print("\tstartMongodEmpty(args) DELETES DATA DIR and then starts mongod");
+ print("\t returns a connection to the new server");
+ print("\tstartMongodTest(port,dir,options)");
+ print("\t DELETES DATA DIR");
+ print("\t automatically picks port #s starting at 27000 and increasing");
+ print("\t or you can specify the port as the first arg");
+ print("\t dir is /data/db/<port>/ if not specified as the 2nd arg");
+ print("\t returns a connection to the new server");
+ print("\tresetDbpath(dirpathstr) deletes everything under the dir specified including subdirs");
+ print("\tstopMongoProgram(port[, signal])");
+ return;
+ }
+ else if (x == "") {
+ print("\t" + "db.help() help on db methods");
+ print("\t" + "db.mycoll.help() help on collection methods");
+ print("\t" + "sh.help() sharding helpers");
+ print("\t" + "rs.help() replica set helpers");
+ print("\t" + "help admin administrative help");
+ print("\t" + "help connect connecting to a db help");
+ print("\t" + "help keys key shortcuts");
+ print("\t" + "help misc misc things to know");
+ print("\t" + "help mr mapreduce");
+ print();
+ print("\t" + "show dbs show database names");
+ print("\t" + "show collections show collections in current database");
+ print("\t" + "show users show users in current database");
+ print("\t" + "show profile show most recent system.profile entries with time >= 1ms");
+ print("\t" + "show logs show the accessible logger names");
+ print("\t" + "show log [name] prints out the last segment of log in memory, 'global' is default");
+ print("\t" + "use <db_name> set current database");
+ print("\t" + "db.foo.find() list objects in collection foo");
+ print("\t" + "db.foo.find( { a : 1 } ) list objects in foo where a == 1");
+ print("\t" + "it result of the last line evaluated; use to further iterate");
+ print("\t" + "DBQuery.shellBatchSize = x set default number of items to display on shell");
+ print("\t" + "exit quit the mongo shell");
+ }
+ else
+ print("unknown help option");
+}
diff --git a/src/mongo/shell/utils_sh.js b/src/mongo/shell/utils_sh.js
new file mode 100644
index 00000000000..5c7fbafa75d
--- /dev/null
+++ b/src/mongo/shell/utils_sh.js
@@ -0,0 +1,164 @@
+sh = function() { return "try sh.help();" }
+
+sh._checkMongos = function() {
+ var x = db.runCommand( "ismaster" );
+ if ( x.msg != "isdbgrid" )
+ throw "not connected to a mongos"
+}
+
+sh._checkFullName = function( fullName ) {
+ assert( fullName , "neeed a full name" )
+ assert( fullName.indexOf( "." ) > 0 , "name needs to be fully qualified <db>.<collection>'" )
+}
+
+sh._adminCommand = function( cmd , skipCheck ) {
+ if ( ! skipCheck ) sh._checkMongos();
+ var res = db.getSisterDB( "admin" ).runCommand( cmd );
+
+ if ( res == null || ! res.ok ) {
+ print( "command failed: " + tojson( res ) )
+ }
+
+ return res;
+}
+
+sh._dataFormat = function( bytes ){
+ if( bytes < 1024 ) return Math.floor( bytes ) + "b"
+ if( bytes < 1024 * 1024 ) return Math.floor( bytes / 1024 ) + "kb"
+ if( bytes < 1024 * 1024 * 1024 ) return Math.floor( ( Math.floor( bytes / 1024 ) / 1024 ) * 100 ) / 100 + "Mb"
+ return Math.floor( ( Math.floor( bytes / ( 1024 * 1024 ) ) / 1024 ) * 100 ) / 100 + "Gb"
+}
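+
+// Illustrative sketch of the size formatter used by the status helpers, e.g.:
+//   assert.eq( "512b", sh._dataFormat( 512 ) )
+//   assert.eq( "2kb", sh._dataFormat( 2048 ) )
+//   assert.eq( "1.5Mb", sh._dataFormat( 1.5 * 1024 * 1024 ) )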
+
+sh._collRE = function( coll ){
+ return RegExp( "^" + (coll + "").replace(/\./g, "\\.") + "-.*" )
+}
+
+sh._pchunk = function( chunk ){
+ return "[" + tojson( chunk.min ) + " -> " + tojson( chunk.max ) + "]"
+}
+
+sh.help = function() {
+ print( "\tsh.addShard( host ) server:port OR setname/server:port" )
+ print( "\tsh.enableSharding(dbname) enables sharding on the database dbname" )
+ print( "\tsh.shardCollection(fullName,key,unique) shards the collection" );
+
+ print( "\tsh.splitFind(fullName,find) splits the chunk that find is in at the median" );
+ print( "\tsh.splitAt(fullName,middle) splits the chunk that middle is in at middle" );
+ print( "\tsh.moveChunk(fullName,find,to) move the chunk where 'find' is to 'to' (name of shard)");
+
+ print( "\tsh.setBalancerState( <bool on or not> ) turns the balancer on or off true=on, false=off" );
+ print( "\tsh.getBalancerState() return true if on, off if not" );
+ print( "\tsh.isBalancerRunning() return true if the balancer is running on any mongos" );
+
+ print( "\tsh.status() prints a general overview of the cluster" )
+}
+
+sh.status = function( verbose , configDB ) {
+    // TODO: move the actual command here
+ printShardingStatus( configDB , verbose );
+}
+
+sh.addShard = function( url ){
+ sh._adminCommand( { addShard : url } , true )
+}
+
+sh.enableSharding = function( dbname ) {
+ assert( dbname , "need a valid dbname" )
+ sh._adminCommand( { enableSharding : dbname } )
+}
+
+sh.shardCollection = function( fullName , key , unique ) {
+ sh._checkFullName( fullName )
+ assert( key , "need a key" )
+ assert( typeof( key ) == "object" , "key needs to be an object" )
+
+ var cmd = { shardCollection : fullName , key : key }
+ if ( unique )
+ cmd.unique = true;
+
+ sh._adminCommand( cmd )
+}
+
+sh.splitFind = function( fullName , find ) {
+ sh._checkFullName( fullName )
+ sh._adminCommand( { split : fullName , find : find } )
+}
+
+sh.splitAt = function( fullName , middle ) {
+ sh._checkFullName( fullName )
+ sh._adminCommand( { split : fullName , middle : middle } )
+}
+
+sh.moveChunk = function( fullName , find , to ) {
+ sh._checkFullName( fullName );
+ return sh._adminCommand( { moveChunk : fullName , find : find , to : to } )
+}
+
+sh.setBalancerState = function( onOrNot ) {
+ db.getSisterDB( "config" ).settings.update({ _id: "balancer" }, { $set : { stopped: onOrNot ? false : true } }, true );
+}
+
+sh.getBalancerState = function() {
+ var x = db.getSisterDB( "config" ).settings.findOne({ _id: "balancer" } )
+ if ( x == null )
+ return true;
+ return ! x.stopped;
+}
+
+sh.isBalancerRunning = function () {
+ var x = db.getSisterDB("config").locks.findOne({ _id: "balancer" });
+ if (x == null) {
+ print("config.locks collection empty or missing. be sure you are connected to a mongos");
+ return false;
+ }
+ return x.state > 0;
+}
+
+sh.stopBalancer = function( timeout, interval ) {
+ sh.setBalancerState( false )
+ sh.waitForBalancer( false, timeout, interval )
+}
+
+sh.startBalancer = function( timeout, interval ) {
+ sh.setBalancerState( true )
+ sh.waitForBalancer( true, timeout, interval )
+}
+
+sh.waitForBalancer = function( onOrNot, timeout, interval ){
+
+ if( onOrNot != undefined ){
+
+ // Wait for balancer to be on or off
+ // Can also wait for particular balancer state
+ var state = null
+ if( ! onOrNot ) state = 0
+ else if( onOrNot == true ) state = 2
+ else state = onOrNot
+
+ assert.soon( function(){ var lock = db.getSisterDB( "config" ).locks.findOne( { _id : "balancer" } );
+ return ( lock == null && state == 0 ) || ( lock != null && lock.state == state )
+ },
+ "waited too long for balancer to " + ( state > 0 ? "start" : "stop" ) + " [ state : " + state + "]",
+ timeout,
+ interval
+ )
+
+ }
+ else{
+
+ // Wait for balancer to run at least once
+
+ var lock = db.getSisterDB( "config" ).locks.findOne({ _id : "balancer" })
+ var ts = lock ? lock.ts : ""
+
+ assert.soon( function(){ var lock = db.getSisterDB( "config" ).locks.findOne({ _id : "balancer" });
+ if( ! lock ) return false;
+ return lock.ts != ts
+ },
+ "waited too long for balancer to activate",
+ timeout,
+ interval
+ )
+ }
+}
+
diff --git a/src/mongo/targetver.h b/src/mongo/targetver.h
new file mode 100644
index 00000000000..eb1b69bceba
--- /dev/null
+++ b/src/mongo/targetver.h
@@ -0,0 +1,20 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+#ifndef _WIN32_WINNT // Allow use of features specific to Windows Vista or later.
+#define _WIN32_WINNT 0x0600 // Change this to the appropriate value to target other versions of Windows.
+#endif
diff --git a/src/mongo/tools/bridge.cpp b/src/mongo/tools/bridge.cpp
new file mode 100644
index 00000000000..341a1dae687
--- /dev/null
+++ b/src/mongo/tools/bridge.cpp
@@ -0,0 +1,166 @@
+// bridge.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../util/net/message.h"
+#include "../util/net/listen.h"
+#include "../client/dbclient.h"
+#include "../db/dbmessage.h"
+
+using namespace mongo;
+using namespace std;
+
+int port = 0;
+string destUri;
+
+class Forwarder {
+public:
+ Forwarder( MessagingPort &mp ) : mp_( mp ) {
+ }
+ void operator()() const {
+ DBClientConnection dest;
+ string errmsg;
+ while( !dest.connect( destUri, errmsg ) )
+ sleepmillis( 500 );
+ Message m;
+ while( 1 ) {
+ try {
+ m.reset();
+ if ( !mp_.recv( m ) ) {
+ cout << "end connection " << mp_.remoteString() << endl;
+ mp_.shutdown();
+ break;
+ }
+
+ int oldId = m.header()->id;
+ if ( m.operation() == dbQuery || m.operation() == dbMsg || m.operation() == dbGetMore ) {
+ bool exhaust = false;
+ if ( m.operation() == dbQuery ) {
+ DbMessage d( m );
+ QueryMessage q( d );
+ exhaust = q.queryOptions & QueryOption_Exhaust;
+ }
+ Message response;
+ dest.port().call( m, response );
+ mp_.reply( m, response, oldId );
+ while ( exhaust ) {
+ MsgData *header = response.header();
+ QueryResult *qr = (QueryResult *) header;
+ if ( qr->cursorId ) {
+ response.reset();
+ dest.port().recv( response );
+ mp_.reply( m, response ); // m argument is ignored anyway
+ }
+ else {
+ exhaust = false;
+ }
+ }
+ }
+ else {
+ dest.port().say( m, oldId );
+ }
+ }
+ catch ( ... ) {
+ log() << "caught exception in Forwarder, continuing" << endl;
+ }
+ }
+ }
+private:
+ MessagingPort &mp_;
+};
+
+set<MessagingPort*> ports;
+
+class MyListener : public Listener {
+public:
+ MyListener( int port ) : Listener( "bridge" , "", port ) {}
+ virtual void accepted(MessagingPort *mp) {
+ ports.insert( mp );
+ Forwarder f( *mp );
+ boost::thread t( f );
+ }
+};
+
+auto_ptr< MyListener > listener;
+
+#if !defined(_WIN32)
+void cleanup( int sig ) {
+ ListeningSockets::get()->closeAll();
+ for ( set<MessagingPort*>::iterator i = ports.begin(); i != ports.end(); i++ )
+ (*i)->shutdown();
+ ::exit( 0 );
+}
+
+void myterminate() {
+ rawOut( "bridge terminate() called, printing stack:" );
+ printStackTrace();
+ ::abort();
+}
+
+void setupSignals() {
+ signal( SIGINT , cleanup );
+ signal( SIGTERM , cleanup );
+ signal( SIGPIPE , cleanup );
+ signal( SIGABRT , cleanup );
+ signal( SIGSEGV , cleanup );
+ signal( SIGBUS , cleanup );
+ signal( SIGFPE , cleanup );
+ set_terminate( myterminate );
+}
+#else
+inline void setupSignals() {}
+#endif
+
+void helpExit() {
+ cout << "usage mongobridge --port <port> --dest <destUri>" << endl;
+ cout << " port: port to listen for mongo messages" << endl;
+ cout << " destUri: uri of remote mongod instance" << endl;
+ ::exit( -1 );
+}
+
+void check( bool b ) {
+ if ( !b )
+ helpExit();
+}
+
+int main( int argc, char **argv ) {
+ static StaticObserver staticObserver;
+
+ setupSignals();
+
+ check( argc == 5 );
+
+ for( int i = 1; i < 5; ++i ) {
+ check( i % 2 != 0 );
+ if ( strcmp( argv[ i ], "--port" ) == 0 ) {
+ port = strtol( argv[ ++i ], 0, 10 );
+ }
+ else if ( strcmp( argv[ i ], "--dest" ) == 0 ) {
+ destUri = argv[ ++i ];
+ }
+ else {
+ check( false );
+ }
+ }
+ check( port != 0 && !destUri.empty() );
+
+ listener.reset( new MyListener( port ) );
+ listener->initAndListen();
+
+ return 0;
+}
diff --git a/src/mongo/tools/bsondump.cpp b/src/mongo/tools/bsondump.cpp
new file mode 100644
index 00000000000..3825c071cd6
--- /dev/null
+++ b/src/mongo/tools/bsondump.cpp
@@ -0,0 +1,140 @@
+// bsondump.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "../pch.h"
+#include "../client/dbclient.h"
+#include "../util/mmap.h"
+#include "../util/text.h"
+#include "tool.h"
+
+#include <boost/program_options.hpp>
+
+#include <fcntl.h>
+
+using namespace mongo;
+
+namespace po = boost::program_options;
+
+class BSONDump : public BSONTool {
+
+ enum OutputType { JSON , DEBUG } _type;
+
+public:
+
+ BSONDump() : BSONTool( "bsondump", NONE ) {
+ add_options()
+ ("type" , po::value<string>()->default_value("json") , "type of output: json,debug" )
+ ;
+ add_hidden_options()
+ ("file" , po::value<string>() , ".bson file" )
+ ;
+ addPositionArg( "file" , 1 );
+ _noconnection = true;
+ }
+
+ virtual void printExtraHelp(ostream& out) {
+ out << "Display BSON objects in a data file.\n" << endl;
+ out << "usage: " << _name << " [options] <bson filename>" << endl;
+ }
+
+ virtual int doRun() {
+ {
+ string t = getParam( "type" );
+ if ( t == "json" )
+ _type = JSON;
+ else if ( t == "debug" )
+ _type = DEBUG;
+ else {
+ cerr << "bad type: " << t << endl;
+ return 1;
+ }
+ }
+
+ path root = getParam( "file" );
+ if ( root == "" ) {
+ printExtraHelp(cout);
+ return 1;
+ }
+
+ processFile( root );
+ return 0;
+ }
+
+ bool debug( const BSONObj& o , int depth=0) {
+ string prefix = "";
+ for ( int i=0; i<depth; i++ ) {
+ prefix += "\t\t\t";
+ }
+
+ int read = 4;
+
+ try {
+ cout << prefix << "--- new object ---\n";
+ cout << prefix << "\t size : " << o.objsize() << "\n";
+ BSONObjIterator i(o);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ cout << prefix << "\t\t " << e.fieldName() << "\n" << prefix << "\t\t\t type:" << setw(3) << e.type() << " size: " << e.size() << endl;
+ if ( ( read + e.size() ) > o.objsize() ) {
+ cout << prefix << " SIZE DOES NOT WORK" << endl;
+ return false;
+ }
+ read += e.size();
+ try {
+ e.validate();
+ if ( e.isABSONObj() ) {
+ if ( ! debug( e.Obj() , depth + 1 ) )
+ return false;
+ }
+ else if ( e.type() == String && ! isValidUTF8( e.valuestr() ) ) {
+ cout << prefix << "\t\t\t" << "bad utf8 String!" << endl;
+ }
+ else if ( logLevel > 0 ) {
+ cout << prefix << "\t\t\t" << e << endl;
+ }
+
+ }
+ catch ( std::exception& e ) {
+ cout << prefix << "\t\t\t bad value: " << e.what() << endl;
+ }
+ }
+ }
+ catch ( std::exception& e ) {
+ cout << prefix << "\t" << e.what() << endl;
+ }
+ return true;
+ }
+
+ virtual void gotObject( const BSONObj& o ) {
+ switch ( _type ) {
+ case JSON:
+ cout << o.jsonString( TenGen ) << endl;
+ break;
+ case DEBUG:
+ debug(o);
+ break;
+ default:
+ cerr << "bad type? : " << _type << endl;
+ }
+ }
+};
+
+int main( int argc , char ** argv ) {
+ BSONDump dump;
+ return dump.main( argc , argv );
+}
diff --git a/src/mongo/tools/dump.cpp b/src/mongo/tools/dump.cpp
new file mode 100644
index 00000000000..b6e0d2912e5
--- /dev/null
+++ b/src/mongo/tools/dump.cpp
@@ -0,0 +1,527 @@
+// dump.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "../pch.h"
+#include "../client/dbclient.h"
+#include "../db/db.h"
+#include "tool.h"
+
+#include <fcntl.h>
+#include <map>
+
+using namespace mongo;
+
+namespace po = boost::program_options;
+
+class Dump : public Tool {
+ class FilePtr : boost::noncopyable {
+ public:
+ /*implicit*/ FilePtr(FILE* f) : _f(f) {}
+ ~FilePtr() { fclose(_f); }
+ operator FILE*() { return _f; }
+ private:
+ FILE* _f;
+ };
+public:
+ Dump() : Tool( "dump" , ALL , "" , "" , true ) {
+ add_options()
+ ("out,o", po::value<string>()->default_value("dump"), "output directory or \"-\" for stdout")
+ ("query,q", po::value<string>() , "json query" )
+ ("oplog", "Use oplog for point-in-time snapshotting" )
+ ("repair", "try to recover a crashed database" )
+ ("forceTableScan", "force a table scan (do not use $snapshot)" )
+ ;
+ }
+
+ virtual void preSetup() {
+ string out = getParam("out");
+ if ( out == "-" ) {
+ // route log output to standard error so it doesn't corrupt the dump on stdout;
+ // this must happen early to avoid sending junk to stdout
+ useStandardOutput(false);
+ }
+ }
+
+ virtual void printExtraHelp(ostream& out) {
+ out << "Export MongoDB data to BSON files.\n" << endl;
+ }
+
+ // This is a functor that writes a BSONObj to a file
+ struct Writer {
+ Writer(FILE* out, ProgressMeter* m) :_out(out), _m(m) {}
+
+ void operator () (const BSONObj& obj) {
+ size_t toWrite = obj.objsize();
+ size_t written = 0;
+
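+ // fwrite may write fewer bytes than requested, so loop until the whole object is flushed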
+ while (toWrite) {
+ size_t ret = fwrite( obj.objdata()+written, 1, toWrite, _out );
+ uassert(14035, errnoWithPrefix("couldn't write to file"), ret);
+ toWrite -= ret;
+ written += ret;
+ }
+
+ // if there's a progress bar, hit it
+ if (_m) {
+ _m->hit();
+ }
+ }
+
+ FILE* _out;
+ ProgressMeter* _m;
+ };
+
+ void doCollection( const string coll , FILE* out , ProgressMeter *m ) {
+ Query q = _query;
+
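+ // oplog namespaces get the OplogReplay option for efficient ts-range scans; otherwise
+ // use $snapshot for consistency, unless reading a local dbpath or a table scan is forced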
+ int queryOptions = QueryOption_SlaveOk | QueryOption_NoCursorTimeout;
+ if (startsWith(coll.c_str(), "local.oplog."))
+ queryOptions |= QueryOption_OplogReplay;
+ else if ( _query.isEmpty() && !hasParam("dbpath") && !hasParam("forceTableScan") )
+ q.snapshot();
+
+ DBClientBase& connBase = conn(true);
+ Writer writer(out, m);
+
+ // use low-latency "exhaust" mode if going over the network
+ if (!_usingMongos && typeid(connBase) == typeid(DBClientConnection&)) {
+ DBClientConnection& conn = static_cast<DBClientConnection&>(connBase);
+ boost::function<void(const BSONObj&)> castedWriter(writer); // needed for overload resolution
+ conn.query( castedWriter, coll.c_str() , q , NULL, queryOptions | QueryOption_Exhaust);
+ }
+ else {
+ // this branch should only be taken with DBDirectClient or mongos, which don't support exhaust mode
+ scoped_ptr<DBClientCursor> cursor(connBase.query( coll.c_str() , q , 0 , 0 , 0 , queryOptions ));
+ while ( cursor->more() ) {
+ writer(cursor->next());
+ }
+ }
+ }
+
+ void writeCollectionFile( const string coll , path outputFile ) {
+ log() << "\t" << coll << " to " << outputFile.string() << endl;
+
+ FilePtr f (fopen(outputFile.string().c_str(), "wb"));
+ uassert(10262, errnoWithPrefix("couldn't open file"), f);
+
+ ProgressMeter m( conn( true ).count( coll.c_str() , BSONObj() , QueryOption_SlaveOk ) );
+ m.setUnits("objects");
+
+ doCollection(coll, f, &m);
+
+ log() << "\t\t " << m.done() << " objects" << endl;
+ }
+
+ void writeMetadataFile( const string coll, path outputFile, map<string, BSONObj> options, multimap<string, BSONObj> indexes ) {
+ log() << "\tMetadata for " << coll << " to " << outputFile.string() << endl;
+
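+ // the metadata file is a single JSON document: { options: {...}, indexes: [ {...}, ... ] }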
+ ofstream file (outputFile.string().c_str());
+ uassert(15933, "Couldn't open file: " + outputFile.string(), file.is_open());
+
+ bool hasOptions = options.count(coll) > 0;
+ bool hasIndexes = indexes.count(coll) > 0;
+
+ if (hasOptions) {
+ file << "{options : " << options.find(coll)->second.jsonString();
+
+ if (hasIndexes) {
+ file << ", ";
+ }
+ } else {
+ file << "{";
+ }
+
+ if (hasIndexes) {
+ file << "indexes:[";
+ for (multimap<string, BSONObj>::iterator it=indexes.equal_range(coll).first; it!=indexes.equal_range(coll).second; ++it) {
+ if (it != indexes.equal_range(coll).first) {
+ file << ", ";
+ }
+ file << (*it).second.jsonString();
+ }
+ file << "]";
+ }
+ file << "}";
+ }
+
+
+
+ void writeCollectionStdout( const string coll ) {
+ doCollection(coll, stdout, NULL);
+ }
+
+ void go( const string db , const path outdir ) {
+ log() << "DATABASE: " << db << "\t to \t" << outdir.string() << endl;
+
+ create_directories( outdir );
+
+ map <string, BSONObj> collectionOptions;
+ multimap <string, BSONObj> indexes;
+ vector <string> collections;
+
+ // Save indexes for database
+ string ins = db + ".system.indexes";
+ auto_ptr<DBClientCursor> cursor = conn( true ).query( ins.c_str() , Query() , 0 , 0 , 0 , QueryOption_SlaveOk | QueryOption_NoCursorTimeout );
+ while ( cursor->more() ) {
+ BSONObj obj = cursor->nextSafe();
+ const string name = obj.getField( "ns" ).valuestr();
+ indexes.insert( pair<string, BSONObj> (name, obj.getOwned()) );
+ }
+
+ string sns = db + ".system.namespaces";
+ cursor = conn( true ).query( sns.c_str() , Query() , 0 , 0 , 0 , QueryOption_SlaveOk | QueryOption_NoCursorTimeout );
+ while ( cursor->more() ) {
+ BSONObj obj = cursor->nextSafe();
+ const string name = obj.getField( "name" ).valuestr();
+ if (obj.hasField("options")) {
+ collectionOptions.insert( pair<string,BSONObj> (name, obj.getField("options").embeddedObject()) );
+ }
+
+ // skip namespaces with $ in them only if we don't specify a collection to dump
+ if ( _coll == "" && name.find( ".$" ) != string::npos ) {
+ log(1) << "\tskipping collection: " << name << endl;
+ continue;
+ }
+
+ const string filename = name.substr( db.size() + 1 );
+
+ // if a particular collection is specified and it's not this one, skip it
+ if ( _coll != "" && db + "." + _coll != name && _coll != name )
+ continue;
+
+ // raise error before writing collection with non-permitted filename chars in the name
+ size_t hasBadChars = name.find_first_of(string("/\0", 2)); // explicit length so the embedded NUL is actually searched
+ if (hasBadChars != string::npos){
+ error() << "Cannot dump " << name << ". Collection has '/' or null in the collection name." << endl;
+ continue;
+ }
+
+ // Don't dump indexes
+ if ( endsWith(name.c_str(), ".system.indexes") ) {
+ continue;
+ }
+
+ if ( _coll != "" && db + "." + _coll != name && _coll != name )
+ continue;
+
+ collections.push_back(name);
+ }
+
+ for (vector<string>::iterator it = collections.begin(); it != collections.end(); ++it) {
+ string name = *it;
+ const string filename = name.substr( db.size() + 1 );
+ writeCollectionFile( name , outdir / ( filename + ".bson" ) );
+ writeMetadataFile( name, outdir / (filename + ".metadata.json"), collectionOptions, indexes);
+ }
+
+ }
+
+ int repair() {
+ if ( ! hasParam( "dbpath" ) ){
+ log() << "repair mode only works with --dbpath" << endl;
+ return -1;
+ }
+
+ if ( ! hasParam( "db" ) ){
+ log() << "repair mode only works on 1 db right at a time right now" << endl;
+ return -1;
+ }
+
+ string dbname = getParam( "db" );
+ log() << "going to try and recover data from: " << dbname << endl;
+
+ return _repair( dbname );
+ }
+
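+ // Walks one extent's record chain (forward or backward), writing every document
+ // that still parses; returns the next extent to visit in that direction.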
+ DiskLoc _repairExtent( Database* db , string ns, bool forward , DiskLoc eLoc , Writer& w ){
+ LogIndentLevel lil;
+
+ if ( eLoc.getOfs() <= 0 ){
+ error() << "invalid extent ofs: " << eLoc.getOfs() << endl;
+ return DiskLoc();
+ }
+
+
+ MongoDataFile * mdf = db->getFile( eLoc.a() );
+
+ Extent * e = mdf->debug_getExtent( eLoc );
+ if ( ! e->isOk() ){
+ warning() << "Extent not ok magic: " << e->magic << " going to try to continue" << endl;
+ }
+
+ log() << "length:" << e->length << endl;
+
+ LogIndentLevel lil2;
+
+ set<DiskLoc> seen;
+
+ DiskLoc loc = forward ? e->firstRecord : e->lastRecord;
+ while ( ! loc.isNull() ){
+
+ if ( ! seen.insert( loc ).second ) {
+ error() << "infinite loop in extend, seen: " << loc << " before" << endl;
+ break;
+ }
+
+ if ( loc.getOfs() <= 0 ){
+ error() << "offset is 0 for record which should be impossible" << endl;
+ break;
+ }
+ log(1) << loc << endl;
+ Record* rec = loc.rec();
+ BSONObj obj;
+ try {
+ obj = loc.obj();
+ assert( obj.valid() );
+ LOG(1) << obj << endl;
+ w( obj );
+ }
+ catch ( std::exception& e ) {
+ log() << "found invalid document @ " << loc << " " << e.what() << endl;
+ if ( ! obj.isEmpty() ) {
+ try {
+ BSONElement e = obj.firstElement();
+ stringstream ss;
+ ss << "first element: " << e;
+ log() << ss.str();
+ }
+ catch ( std::exception& ) {
+ }
+ }
+ }
+ loc = forward ? rec->getNext( loc ) : rec->getPrev( loc );
+ }
+ return forward ? e->xnext : e->xprev;
+
+ }
+
+ void _repair( Database* db , string ns , path outfile ){
+ NamespaceDetails * nsd = nsdetails( ns.c_str() );
+ log() << "nrecords: " << nsd->stats.nrecords
+ << " datasize: " << nsd->stats.datasize
+ << " firstExtent: " << nsd->firstExtent
+ << endl;
+
+ if ( nsd->firstExtent.isNull() ){
+ log() << " ERROR fisrtExtent is null" << endl;
+ return;
+ }
+
+ if ( ! nsd->firstExtent.isValid() ){
+ log() << " ERROR fisrtExtent is not valid" << endl;
+ return;
+ }
+
+ outfile /= ( ns.substr( ns.find( "." ) + 1 ) + ".bson" );
+ log() << "writing to: " << outfile.string() << endl;
+
+ FilePtr f (fopen(outfile.string().c_str(), "wb"));
+
+ ProgressMeter m( nsd->stats.nrecords * 2 );
+ m.setUnits("objects");
+
+ Writer w( f , &m );
+
+ try {
+ log() << "forward extent pass" << endl;
+ LogIndentLevel lil;
+ DiskLoc eLoc = nsd->firstExtent;
+ while ( ! eLoc.isNull() ){
+ log() << "extent loc: " << eLoc << endl;
+ eLoc = _repairExtent( db , ns , true , eLoc , w );
+ }
+ }
+ catch ( DBException& e ){
+ error() << "forward extent pass failed:" << e.toString() << endl;
+ }
+
+ try {
+ log() << "backwards extent pass" << endl;
+ LogIndentLevel lil;
+ DiskLoc eLoc = nsd->lastExtent;
+ while ( ! eLoc.isNull() ){
+ log() << "extent loc: " << eLoc << endl;
+ eLoc = _repairExtent( db , ns , false , eLoc , w );
+ }
+ }
+ catch ( DBException& e ){
+ error() << "ERROR: backwards extent pass failed:" << e.toString() << endl;
+ }
+
+ log() << "\t\t " << m.done() << " objects" << endl;
+ }
+
+ int _repair( string dbname ) {
+ dblock lk;
+ Client::Context cx( dbname );
+ Database * db = cx.db();
+
+ list<string> namespaces;
+ db->namespaceIndex.getNamespaces( namespaces );
+
+ path root = getParam( "out" );
+ root /= dbname;
+ create_directories( root );
+
+ for ( list<string>::iterator i=namespaces.begin(); i!=namespaces.end(); ++i ){
+ LogIndentLevel lil;
+ string ns = *i;
+
+ if ( str::endsWith( ns , ".system.namespaces" ) )
+ continue;
+
+ if ( str::contains( ns , ".tmp.mr." ) )
+ continue;
+
+ if ( _coll != "" && ! str::endsWith( ns , _coll ) )
+ continue;
+
+ log() << "trying to recover: " << ns << endl;
+
+ LogIndentLevel lil2;
+ try {
+ _repair( db , ns , root );
+ }
+ catch ( DBException& e ){
+ log() << "ERROR recovering: " << ns << " " << e.toString() << endl;
+ }
+ }
+
+ return 0;
+ }
+
+ int run() {
+
+ if ( hasParam( "repair" ) ){
+ warning() << "repair is a work in progress" << endl;
+ return repair();
+ }
+
+ {
+ string q = getParam("query");
+ if ( q.size() )
+ _query = fromjson( q );
+ }
+
+ string opLogName = "";
+ unsigned long long opLogStart = 0;
+ if (hasParam("oplog")) {
+ if (hasParam("query") || hasParam("db") || hasParam("collection")) {
+ log() << "oplog mode is only supported on full dumps" << endl;
+ return -1;
+ }
+
+
+ BSONObj isMaster;
+ conn("true").simpleCommand("admin", &isMaster, "isMaster");
+
+ if (isMaster.hasField("hosts")) { // if connected to replica set member
+ opLogName = "local.oplog.rs";
+ }
+ else {
+ opLogName = "local.oplog.$main";
+ if ( ! isMaster["ismaster"].trueValue() ) {
+ log() << "oplog mode is only supported on master or replica set member" << endl;
+ return -1;
+ }
+ }
+
+ auth("local");
+
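+ // record the newest oplog timestamp before dumping; afterwards every op later than
+ // this is saved to oplog.bson, yielding a point-in-time snapshot when replayed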
+ BSONObj op = conn(true).findOne(opLogName, Query().sort("$natural", -1), 0, QueryOption_SlaveOk);
+ if (op.isEmpty()) {
+ log() << "No operations in oplog. Please ensure you are connecting to a master." << endl;
+ return -1;
+ }
+
+ assert(op["ts"].type() == Timestamp);
+ opLogStart = op["ts"]._numberLong();
+ }
+
+ // check if we're outputting to stdout
+ string out = getParam("out");
+ if ( out == "-" ) {
+ if ( _db != "" && _coll != "" ) {
+ writeCollectionStdout( _db+"."+_coll );
+ return 0;
+ }
+ else {
+ log() << "You must specify database and collection to print to stdout" << endl;
+ return -1;
+ }
+ }
+
+ _usingMongos = isMongos();
+
+ path root( out );
+ string db = _db;
+
+ if ( db == "" ) {
+ log() << "all dbs" << endl;
+ auth( "admin" );
+
+ BSONObj res = conn( true ).findOne( "admin.$cmd" , BSON( "listDatabases" << 1 ) );
+ if ( ! res["databases"].isABSONObj() ) {
+ error() << "output of listDatabases isn't what we expected, no 'databases' field:\n" << res << endl;
+ return -2;
+ }
+ BSONObj dbs = res["databases"].embeddedObjectUserCheck();
+ set<string> keys;
+ dbs.getFieldNames( keys );
+ for ( set<string>::iterator i = keys.begin() ; i != keys.end() ; i++ ) {
+ string key = *i;
+
+ if ( ! dbs[key].isABSONObj() ) {
+ error() << "database field not an object key: " << key << " value: " << dbs[key] << endl;
+ return -3;
+ }
+
+ BSONObj dbobj = dbs[key].embeddedObjectUserCheck();
+
+ const char * dbName = dbobj.getField( "name" ).valuestr();
+ if ( (string)dbName == "local" )
+ continue;
+
+ go ( dbName , root / dbName );
+ }
+ }
+ else {
+ auth( db );
+ go( db , root / db );
+ }
+
+ if (!opLogName.empty()) {
+ BSONObjBuilder b;
+ b.appendTimestamp("$gt", opLogStart);
+
+ _query = BSON("ts" << b.obj());
+
+ writeCollectionFile( opLogName , root / "oplog.bson" );
+ }
+
+ return 0;
+ }
+
+ bool _usingMongos;
+ BSONObj _query;
+};
+
+int main( int argc , char ** argv ) {
+ Dump d;
+ return d.main( argc , argv );
+}
diff --git a/src/mongo/tools/export.cpp b/src/mongo/tools/export.cpp
new file mode 100644
index 00000000000..0d9f0225da0
--- /dev/null
+++ b/src/mongo/tools/export.cpp
@@ -0,0 +1,248 @@
+// export.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "client/dbclient.h"
+#include "db/json.h"
+
+#include "tool.h"
+
+#include <fstream>
+#include <iostream>
+
+#include <boost/program_options.hpp>
+
+using namespace mongo;
+
+namespace po = boost::program_options;
+
+class Export : public Tool {
+public:
+ Export() : Tool( "export" ) {
+ addFieldOptions();
+ add_options()
+ ("query,q" , po::value<string>() , "query filter, as a JSON string" )
+ ("csv","export to csv instead of json")
+ ("out,o", po::value<string>(), "output file; if not specified, stdout is used")
+ ("jsonArray", "output to a json array rather than one object per line")
+ ("slaveOk,k", po::value<bool>()->default_value(true) , "use secondaries for export if available, default true")
+ ;
+ _usesstdout = false;
+ }
+
+ virtual void preSetup() {
+ string out = getParam("out");
+ if ( out == "-" ) {
+ // write output to standard error to avoid mangling output
+ // must happen early to avoid sending junk to stdout
+ useStandardOutput(false);
+ }
+ }
+
+ virtual void printExtraHelp( ostream & out ) {
+ out << "Export MongoDB data to CSV, TSV or JSON files.\n" << endl;
+ }
+
+ // Turn every double quote character into two double quote characters
+ // If hasSurroundingQuotes is true, doesn't escape the first and last
+ // characters of the string, if it's false, add a double quote character
+ // around the whole string.
+ string csvEscape(string str, bool hasSurroundingQuotes = false) {
+ size_t index = hasSurroundingQuotes ? 1 : 0;
+ while (((index = str.find('"', index)) != string::npos)
+ && (index < (hasSurroundingQuotes ? str.size() - 1 : str.size()))) {
+ str.replace(index, 1, "\"\"");
+ index += 2;
+ }
+ return hasSurroundingQuotes ? str : "\"" + str + "\"";
+ }
+
+ // Gets the string representation of a BSON object that can be correctly written to a CSV file
+ string csvString (const BSONElement& object) {
+ const char* binData; // Only used with BinData type
+
+ switch (object.type()) {
+ case MinKey:
+ return "$MinKey";
+ case MaxKey:
+ return "$MaxKey";
+ case NumberInt:
+ case NumberDouble:
+ case NumberLong:
+ case Bool:
+ return object.toString(false);
+ case String:
+ case Symbol:
+ return csvEscape(object.toString(false), true);
+ case Object:
+ return csvEscape(object.jsonString(Strict, false));
+ case Array:
+ return csvEscape(object.jsonString(Strict, false));
+ case BinData:
+ int len;
+ binData = object.binDataClean(len);
+ return toHex(binData, len);
+ case jstOID:
+ return "ObjectID(" + object.OID().toString() + ")"; // OIDs are always 24 bytes
+ case Date:
+ return timeToISOString(object.Date() / 1000);
+ case Timestamp:
+ return csvEscape(object.jsonString(Strict, false));
+ case RegEx:
+ return csvEscape("/" + string(object.regex()) + "/" + string(object.regexFlags()));
+ case Code:
+ return csvEscape(object.toString(false));
+ case CodeWScope:
+ if (string(object.codeWScopeScopeData()) == "") {
+ return csvEscape(object.toString(false));
+ } else {
+ return csvEscape(object.jsonString(Strict, false));
+ }
+ case EOO:
+ case Undefined:
+ case DBRef:
+ case jstNULL:
+ cerr << "Invalid BSON object type for CSV output: " << object.type() << endl;
+ return "";
+ }
+ // Can never get here
+ assert(false);
+ return "";
+ }
+
+ int run() {
+ string ns;
+ const bool csv = hasParam( "csv" );
+ const bool jsonArray = hasParam( "jsonArray" );
+ ostream *outPtr = &cout;
+ string outfile = getParam( "out" );
+ auto_ptr<ofstream> fileStream;
+ if ( hasParam( "out" ) ) {
+ size_t idx = outfile.rfind( "/" );
+ if ( idx != string::npos ) {
+ string dir = outfile.substr( 0 , idx + 1 );
+ create_directories( dir );
+ }
+ ofstream * s = new ofstream( outfile.c_str() , ios_base::out );
+ fileStream.reset( s );
+ outPtr = s;
+ if ( ! s->good() ) {
+ cerr << "couldn't open [" << outfile << "]" << endl;
+ return -1;
+ }
+ }
+ ostream &out = *outPtr;
+
+ BSONObj * fieldsToReturn = 0;
+ BSONObj realFieldsToReturn;
+
+ try {
+ ns = getNS();
+ }
+ catch (...) {
+ printHelp(cerr);
+ return 1;
+ }
+
+ auth();
+
+ if ( hasParam( "fields" ) || csv ) {
+ needFields();
+
+ // we can't use just _fieldsObj since we support everything getFieldDotted does
+
+ set<string> seen;
+ BSONObjBuilder b;
+
+ BSONObjIterator i( _fieldsObj );
+ while ( i.more() ){
+ BSONElement e = i.next();
+ string f = str::before( e.fieldName() , '.' );
+ if ( seen.insert( f ).second )
+ b.append( f , 1 );
+ }
+
+ realFieldsToReturn = b.obj();
+ fieldsToReturn = &realFieldsToReturn;
+ }
+
+
+ if ( csv && _fields.size() == 0 ) {
+ cerr << "csv mode requires a field list" << endl;
+ return -1;
+ }
+
+ Query q( getParam( "query" , "" ) );
+ if ( q.getFilter().isEmpty() && !hasParam("dbpath"))
+ q.snapshot();
+
+ bool slaveOk = _params["slaveOk"].as<bool>();
+
+ auto_ptr<DBClientCursor> cursor = conn().query( ns.c_str() , q , 0 , 0 , fieldsToReturn , ( slaveOk ? QueryOption_SlaveOk : 0 ) | QueryOption_NoCursorTimeout );
+
+ if ( csv ) {
+ for ( vector<string>::iterator i=_fields.begin(); i != _fields.end(); i++ ) {
+ if ( i != _fields.begin() )
+ out << ",";
+ out << *i;
+ }
+ out << endl;
+ }
+
+ if (jsonArray)
+ out << '[';
+
+ long long num = 0;
+ while ( cursor->more() ) {
+ num++;
+ BSONObj obj = cursor->next();
+ if ( csv ) {
+ for ( vector<string>::iterator i=_fields.begin(); i != _fields.end(); i++ ) {
+ if ( i != _fields.begin() )
+ out << ",";
+ const BSONElement & e = obj.getFieldDotted(i->c_str());
+ if ( ! e.eoo() ) {
+ out << csvString(e);
+ }
+ }
+ out << endl;
+ }
+ else {
+ if (jsonArray && num != 1)
+ out << ',';
+
+ out << obj.jsonString();
+
+ if (!jsonArray)
+ out << endl;
+ }
+ }
+
+ if (jsonArray)
+ out << ']' << endl;
+
+ cerr << "exported " << num << " records" << endl;
+
+ return 0;
+ }
+};
+
+int main( int argc , char ** argv ) {
+ Export e;
+ return e.main( argc , argv );
+}
diff --git a/src/mongo/tools/files.cpp b/src/mongo/tools/files.cpp
new file mode 100644
index 00000000000..06660361485
--- /dev/null
+++ b/src/mongo/tools/files.cpp
@@ -0,0 +1,164 @@
+// files.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "client/gridfs.h"
+#include "client/dbclient.h"
+
+#include "tool.h"
+
+#include <fstream>
+#include <iostream>
+
+#include <boost/program_options.hpp>
+
+using namespace mongo;
+
+namespace po = boost::program_options;
+
+class Files : public Tool {
+public:
+ Files() : Tool( "files" ) {
+ add_options()
+ ( "local,l", po::value<string>(), "local filename for put|get (default is to use the same name as 'gridfs filename')")
+ ( "type,t", po::value<string>(), "MIME type for put (default is to omit)")
+ ( "replace,r", "Remove other files with same name after PUT")
+ ;
+ add_hidden_options()
+ ( "command" , po::value<string>() , "command (list|search|put|get)" )
+ ( "file" , po::value<string>() , "filename for get|put" )
+ ;
+ addPositionArg( "command" , 1 );
+ addPositionArg( "file" , 2 );
+ }
+
+ virtual void printExtraHelp( ostream & out ) {
+ out << "Browse and modify a GridFS filesystem.\n" << endl;
+ out << "usage: " << _name << " [options] command [gridfs filename]" << endl;
+ out << "command:" << endl;
+ out << " one of (list|search|put|get)" << endl;
+ out << " list - list all files. 'gridfs filename' is an optional prefix " << endl;
+ out << " which listed filenames must begin with." << endl;
+ out << " search - search all files. 'gridfs filename' is a substring " << endl;
+ out << " which listed filenames must contain." << endl;
+ out << " put - add a file with filename 'gridfs filename'" << endl;
+ out << " get - get a file with filename 'gridfs filename'" << endl;
+ out << " delete - delete all files with filename 'gridfs filename'" << endl;
+ }
+
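+ // print the filename and length of every GridFS file matching the given filter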
+ void display( GridFS * grid , BSONObj obj ) {
+ auto_ptr<DBClientCursor> c = grid->list( obj );
+ while ( c->more() ) {
+ BSONObj obj = c->next();
+ cout
+ << obj["filename"].str() << "\t"
+ << (long)obj["length"].number()
+ << endl;
+ }
+ }
+
+ int run() {
+ string cmd = getParam( "command" );
+ if ( cmd.size() == 0 ) {
+ cerr << "ERROR: need command" << endl << endl;
+ printHelp(cout);
+ return -1;
+ }
+
+ GridFS g( conn() , _db );
+ auth();
+
+ string filename = getParam( "file" );
+
+ if ( cmd == "list" ) {
+ BSONObjBuilder b;
+ if ( filename.size() )
+ b.appendRegex( "filename" , ( (string)"^" + filename ) );
+ display( &g , b.obj() );
+ return 0;
+ }
+
+ if ( filename.size() == 0 ) {
+ cerr << "ERROR: need a filename" << endl << endl;
+ printHelp(cout);
+ return -1;
+ }
+
+ if ( cmd == "search" ) {
+ BSONObjBuilder b;
+ b.appendRegex( "filename" , filename );
+ display( &g , b.obj() );
+ return 0;
+ }
+
+ if ( cmd == "get" ) {
+ GridFile f = g.findFile( filename );
+ if ( ! f.exists() ) {
+ cerr << "ERROR: file not found" << endl;
+ return -2;
+ }
+
+ string out = getParam("local", f.getFilename());
+ f.write( out );
+
+ if (out != "-")
+ cout << "done write to: " << out << endl;
+
+ return 0;
+ }
+
+ if ( cmd == "put" ) {
+ const string& infile = getParam("local", filename);
+ const string& type = getParam("type", "");
+
+ BSONObj file = g.storeFile(infile, filename, type);
+ cout << "added file: " << file << endl;
+
+ if (hasParam("replace")) {
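+ // remove any pre-existing files with the same name, keeping only the one just stored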
+ auto_ptr<DBClientCursor> cursor = conn().query(_db+".fs.files", BSON("filename" << filename << "_id" << NE << file["_id"] ));
+ while (cursor->more()) {
+ BSONObj o = cursor->nextSafe();
+ conn().remove(_db+".fs.files", BSON("_id" << o["_id"]));
+ conn().remove(_db+".fs.chunks", BSON("_id" << o["_id"]));
+ cout << "removed file: " << o << endl;
+ }
+
+ }
+
+ conn().getLastError();
+ cout << "done!" << endl;
+ return 0;
+ }
+
+ if ( cmd == "delete" ) {
+ g.removeFile(filename);
+ conn().getLastError();
+ cout << "done!" << endl;
+ return 0;
+ }
+
+ cerr << "ERROR: unknown command '" << cmd << "'" << endl << endl;
+ printHelp(cout);
+ return -1;
+ }
+};
+
+int main( int argc , char ** argv ) {
+ Files f;
+ return f.main( argc , argv );
+}
diff --git a/src/mongo/tools/import.cpp b/src/mongo/tools/import.cpp
new file mode 100644
index 00000000000..24741ed46ad
--- /dev/null
+++ b/src/mongo/tools/import.cpp
@@ -0,0 +1,463 @@
+// import.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "client/dbclient.h"
+#include "db/json.h"
+
+#include "tool.h"
+#include "../util/text.h"
+
+#include <fstream>
+#include <iostream>
+
+#include <boost/program_options.hpp>
+#include <boost/algorithm/string.hpp>
+
+using namespace mongo;
+
+namespace po = boost::program_options;
+
+class Import : public Tool {
+
+ enum Type { JSON , CSV , TSV };
+ Type _type;
+
+ const char * _sep;
+ bool _ignoreBlanks;
+ bool _headerLine;
+ bool _upsert;
+ bool _doimport;
+ bool _jsonArray;
+ vector<string> _upsertFields;
+ static const int BUF_SIZE = 1024 * 1024 * 4;
+
+ void csvTokenizeRow(const string& row, vector<string>& tokens) {
+ bool inQuotes = false;
+ bool prevWasQuote = false;
+ bool tokenQuoted = false;
+ string curtoken = "";
+ for (string::const_iterator it = row.begin(); it != row.end(); ++it) {
+ char element = *it;
+ if (element == '"') {
+ if (!inQuotes) {
+ inQuotes = true;
+ tokenQuoted = true;
+ curtoken = "";
+ } else {
+ if (prevWasQuote) {
+ curtoken += "\"";
+ prevWasQuote = false;
+ } else {
+ prevWasQuote = true;
+ }
+ }
+ } else {
+ if (inQuotes && prevWasQuote) {
+ inQuotes = false;
+ prevWasQuote = false;
+ tokens.push_back(curtoken);
+ }
+
+ if (element == ',' && !inQuotes) {
+ if (!tokenQuoted) { // If token was quoted, it's already been added
+ boost::trim(curtoken);
+ tokens.push_back(curtoken);
+ }
+ curtoken = "";
+ tokenQuoted = false;
+ } else {
+ curtoken += element;
+ }
+ }
+ }
+ if (!tokenQuoted || (inQuotes && prevWasQuote)) {
+ boost::trim(curtoken);
+ tokens.push_back(curtoken);
+ }
+ }
+
+ void _append( BSONObjBuilder& b , const string& fieldName , const string& data ) {
+ if ( _ignoreBlanks && data.size() == 0 )
+ return;
+
+ if ( b.appendAsNumber( fieldName , data ) )
+ return;
+
+ // TODO: other types?
+ b.append ( fieldName , data );
+ }
+
+ /*
+ * Reads one line from in into buf.
+ * Returns the number of bytes that should be skipped - the caller should
+ * increment buf by this amount.
+ */
+ int getLine(istream* in, char* buf) {
+ if (_jsonArray) {
+ in->read(buf, BUF_SIZE);
+ uassert(13295, "JSONArray file too large", (in->rdstate() & ios_base::eofbit));
+ buf[ in->gcount() ] = '\0';
+ }
+ else {
+ in->getline( buf , BUF_SIZE );
+ log(1) << "got line:" << buf << endl;
+ }
+ uassert( 10263 , "unknown error reading file" ,
+ (!(in->rdstate() & ios_base::badbit)) &&
+ (!(in->rdstate() & ios_base::failbit) || (in->rdstate() & ios_base::eofbit)) );
+
+ int numBytesSkipped = 0;
+ if (strncmp("\xEF\xBB\xBF", buf, 3) == 0) { // UTF-8 BOM (notepad is stupid)
+ buf += 3;
+ numBytesSkipped += 3;
+ }
+
+ uassert(13289, "Invalid UTF8 character detected", isValidUTF8(buf));
+ return numBytesSkipped;
+ }
+
+ /*
+ * Parses a BSON object out of a JSON array.
+ * Returns number of bytes processed on success and -1 on failure.
+ */
+ int parseJSONArray(char* buf, BSONObj& o) {
+ int len = 0;
+ while (buf[0] != '{' && buf[0] != '\0') {
+ len++;
+ buf++;
+ }
+ if (buf[0] == '\0')
+ return -1;
+
+ int jslen;
+ o = fromjson(buf, &jslen);
+ len += jslen;
+
+ return len;
+ }
+
+ /*
+ * Parses one object from the input file. This usually corresponds to one line in the input
+ * file, unless the file is a CSV and contains a newline within a quoted string entry.
+ * Returns a true if a BSONObj was successfully created and false if not.
+ */
+ bool parseRow(istream* in, BSONObj& o, int& numBytesRead) {
+ boost::scoped_array<char> buffer(new char[BUF_SIZE+2]);
+ char* line = buffer.get();
+
+ numBytesRead = getLine(in, line);
+ line += numBytesRead;
+
+ if (line[0] == '\0') {
+ return false;
+ }
+ numBytesRead += strlen( line );
+
+ if (_type == JSON) {
+ // Strip out trailing whitespace
+ char * end = ( line + strlen( line ) ) - 1;
+ while ( end >= line && isspace(*end) ) {
+ *end = 0;
+ end--;
+ }
+ o = fromjson( line );
+ return true;
+ }
+
+ vector<string> tokens;
+ if (_type == CSV) {
+ string row;
+ bool inside_quotes = false;
+ size_t last_quote = 0;
+ while (true) {
+ string lineStr(line);
+ // Deal with line breaks in quoted strings
+ last_quote = lineStr.find_first_of('"');
+ while (last_quote != string::npos) {
+ inside_quotes = !inside_quotes;
+ last_quote = lineStr.find_first_of('"', last_quote+1);
+ }
+
+ row.append(lineStr);
+
+ if (inside_quotes) {
+ row.append("\n");
+ int num = getLine(in, line);
+ line += num;
+ numBytesRead += num;
+
+ uassert (15854, "CSV file ends while inside quoted field", line[0] != '\0');
+ numBytesRead += strlen( line );
+ } else {
+ break;
+ }
+ }
+ // now 'row' is string corresponding to one row of the CSV file
+ // (which may span multiple lines) and represents one BSONObj
+ csvTokenizeRow(row, tokens);
+ }
+ else { // _type == TSV
+ while (line[0] != '\t' && isspace(line[0])) { // Strip leading whitespace, but not tabs
+ line++;
+ }
+
+ boost::split(tokens, line, boost::is_any_of(_sep));
+ }
+
+ // Now that the row is tokenized, create a BSONObj out of it.
+ BSONObjBuilder b;
+ unsigned int pos=0;
+ for (vector<string>::iterator it = tokens.begin(); it != tokens.end(); ++it) {
+ string token = *it;
+ if ( _headerLine ) {
+ _fields.push_back(token);
+ }
+ else {
+ string name;
+ if ( pos < _fields.size() ) {
+ name = _fields[pos];
+ }
+ else {
+ stringstream ss;
+ ss << "field" << pos;
+ name = ss.str();
+ }
+ pos++;
+
+ _append( b , name , token );
+ }
+ }
+ o = b.obj();
+ return true;
+ }
+
+public:
+ Import() : Tool( "import" ) {
+ addFieldOptions();
+ add_options()
+ ("ignoreBlanks","if given, empty fields in csv and tsv will be ignored")
+ ("type",po::value<string>() , "type of file to import. default: json (json,csv,tsv)")
+ ("file",po::value<string>() , "file to import from; if not specified stdin is used" )
+ ("drop", "drop collection first " )
+ ("headerline","CSV,TSV only - use first line as headers")
+ ("upsert", "insert or update objects that already exist" )
+ ("upsertFields", po::value<string>(), "comma-separated fields for the query part of the upsert. You should make sure this is indexed" )
+ ("stopOnError", "stop importing at first error rather than continuing" )
+ ("jsonArray", "load a json array, not one item per line. Currently limited to 4MB." )
+ ;
+ add_hidden_options()
+ ("noimport", "don't actually import. useful for benchmarking parser" )
+ ;
+ addPositionArg( "file" , 1 );
+ _type = JSON;
+ _ignoreBlanks = false;
+ _headerLine = false;
+ _upsert = false;
+ _doimport = true;
+ _jsonArray = false;
+ }
+
+ virtual void printExtraHelp( ostream & out ) {
+ out << "Import CSV, TSV or JSON data into MongoDB.\n" << endl;
+ }
+
+ int run() {
+ string filename = getParam( "file" );
+ long long fileSize = 0;
+ int headerRows = 0;
+
+ istream * in = &cin;
+
+ ifstream file( filename.c_str() , ios_base::in);
+
+ if ( filename.size() > 0 && filename != "-" ) {
+ if ( ! exists( filename ) ) {
+ error() << "file doesn't exist: " << filename << endl;
+ return -1;
+ }
+ in = &file;
+ fileSize = file_size( filename );
+ }
+
+ // check if we're actually talking to a machine that can write
+ if (!isMaster()) {
+ return -1;
+ }
+
+ string ns;
+
+ try {
+ ns = getNS();
+ }
+ catch (...) {
+ printHelp(cerr);
+ return -1;
+ }
+
+ log(1) << "ns: " << ns << endl;
+
+ auth();
+
+ if ( hasParam( "drop" ) ) {
+ log() << "dropping: " << ns << endl;
+ conn().dropCollection( ns.c_str() );
+ }
+
+ if ( hasParam( "ignoreBlanks" ) ) {
+ _ignoreBlanks = true;
+ }
+
+ if ( hasParam( "upsert" ) || hasParam( "upsertFields" )) {
+ _upsert = true;
+
+ string uf = getParam("upsertFields");
+ if (uf.empty()) {
+ _upsertFields.push_back("_id");
+ }
+ else {
+ StringSplitter(uf.c_str(), ",").split(_upsertFields);
+ }
+ }
+
+ if ( hasParam( "noimport" ) ) {
+ _doimport = false;
+ }
+
+ if ( hasParam( "type" ) ) {
+ string type = getParam( "type" );
+ if ( type == "json" )
+ _type = JSON;
+ else if ( type == "csv" ) {
+ _type = CSV;
+ _sep = ",";
+ }
+ else if ( type == "tsv" ) {
+ _type = TSV;
+ _sep = "\t";
+ }
+ else {
+ error() << "don't know what type [" << type << "] is" << endl;
+ return -1;
+ }
+ }
+
+ if ( _type == CSV || _type == TSV ) {
+ _headerLine = hasParam( "headerline" );
+ if ( _headerLine ) {
+ headerRows = 1;
+ }
+ else {
+ needFields();
+ }
+ }
+
+ if (_type == JSON && hasParam("jsonArray")) {
+ _jsonArray = true;
+ }
+
+ time_t start = time(0);
+ log(1) << "filesize: " << fileSize << endl;
+ ProgressMeter pm( fileSize );
+ int num = 0;
+ int errors = 0;
+ int len = 0;
+ // buffer and line are only used when parsing a jsonArray
+ boost::scoped_array<char> buffer(new char[BUF_SIZE+2]);
+ char* line = buffer.get();
+
+ while ( _jsonArray || in->rdstate() == 0 ) {
+ try {
+ BSONObj o;
+ if (_jsonArray) {
+ int bytesProcessed = 0;
+ if (line == buffer.get()) { // Only read on first pass - the whole array must be on one line.
+ bytesProcessed = getLine(in, line);
+ line += bytesProcessed;
+ len += bytesProcessed;
+ }
+ if ((bytesProcessed = parseJSONArray(line, o)) < 0) {
+ len += bytesProcessed;
+ break;
+ }
+ len += bytesProcessed;
+ line += bytesProcessed;
+ }
+ else {
+ if (!parseRow(in, o, len)) {
+ continue;
+ }
+ }
+
+ if ( _headerLine ) {
+ _headerLine = false;
+ }
+ else if (_doimport) {
+ bool doUpsert = _upsert;
+ BSONObjBuilder b;
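+ // build the upsert query from the requested fields; if any field is
+ // missing from this document, fall back to a plain insert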
+ if (_upsert) {
+ for (vector<string>::const_iterator it=_upsertFields.begin(), end=_upsertFields.end(); it!=end; ++it) {
+ BSONElement e = o.getFieldDotted(it->c_str());
+ if (e.eoo()) {
+ doUpsert = false;
+ break;
+ }
+ b.appendAs(e, *it);
+ }
+ }
+
+ if (doUpsert) {
+ conn().update(ns, Query(b.obj()), o, true);
+ }
+ else {
+ conn().insert( ns.c_str() , o );
+ }
+ }
+
+ num++;
+ }
+ catch ( std::exception& e ) {
+ log() << "exception:" << e.what() << endl;
+ log() << line << endl;
+ errors++;
+
+ if (hasParam("stopOnError") || _jsonArray)
+ break;
+ }
+
+ if ( pm.hit( len + 1 ) ) {
+ log() << "\t\t\t" << num << "\t" << ( num / ( time(0) - start ) ) << "/second" << endl;
+ }
+ }
+
+ log() << "imported " << ( num - headerRows ) << " objects" << endl;
+
+ conn().getLastError();
+
+ if ( errors == 0 )
+ return 0;
+
+ error() << "encountered " << errors << " error" << ( errors == 1 ? "" : "s" ) << endl;
+ return -1;
+ }
+};
+
+int main( int argc , char ** argv ) {
+ Import import;
+ return import.main( argc , argv );
+}
diff --git a/src/mongo/tools/oplog.cpp b/src/mongo/tools/oplog.cpp
new file mode 100644
index 00000000000..1c09f37064f
--- /dev/null
+++ b/src/mongo/tools/oplog.cpp
@@ -0,0 +1,108 @@
+// oplog.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "client/dbclient.h"
+#include "db/json.h"
+#include "db/oplogreader.h"
+
+#include "tool.h"
+
+#include <fstream>
+#include <iostream>
+
+#include <boost/program_options.hpp>
+
+using namespace mongo;
+
+namespace po = boost::program_options;
+
+class OplogTool : public Tool {
+public:
+ OplogTool() : Tool( "oplog" ) {
+ addFieldOptions();
+ add_options()
+ ("seconds,s" , po::value<int>() , "seconds to go back default:86400" )
+ ("from", po::value<string>() , "host to pull from" )
+ ("oplogns", po::value<string>()->default_value( "local.oplog.rs" ) , "ns to pull from" )
+ ;
+ }
+
+ virtual void printExtraHelp(ostream& out) {
+ out << "Pull and replay a remote MongoDB oplog.\n" << endl;
+ }
+
+ int run() {
+
+ if ( ! hasParam( "from" ) ) {
+ log() << "need to specify --from" << endl;
+ return -1;
+ }
+
+ Client::initThread( "oplogreplay" );
+
+ log() << "going to connect" << endl;
+
+ OplogReader r;
+ r.connect( getParam( "from" ) );
+
+ log() << "connected" << endl;
+
+ OpTime start( time(0) - getParam( "seconds" , 86400 ) , 0 );
+ log() << "starting from " << start.toStringPretty() << endl;
+
+ string ns = getParam( "oplogns" );
+ r.tailingQueryGTE( ns.c_str() , start );
+
+ int num = 0;
+ while ( r.more() ) {
+ BSONObj o = r.next();
+
+ bool print = ++num % 100000 == 0;
+ if ( print )
+ cout << num << "\t" << o << endl;
+
+ if ( o["op"].String() == "n" )
+ continue;
+
+ if ( o["op"].String() == "c" ) {
+ cout << "skipping: " << o << endl;
+ continue;
+ }
+
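+ // wrap the single op in an applyOps command so the destination server replays it verbatim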
+ BSONObjBuilder b( o.objsize() + 32 );
+ BSONArrayBuilder updates( b.subarrayStart( "applyOps" ) );
+ updates.append( o );
+ updates.done();
+
+ BSONObj c = b.obj();
+
+ BSONObj res;
+ conn().runCommand( "admin" , c , res );
+ if ( print )
+ cout << res << endl;
+ }
+
+ return 0;
+ }
+};
+
+int main( int argc , char** argv ) {
+ OplogTool t;
+ return t.main( argc , argv );
+}
diff --git a/src/mongo/tools/restore.cpp b/src/mongo/tools/restore.cpp
new file mode 100644
index 00000000000..85e91ce6485
--- /dev/null
+++ b/src/mongo/tools/restore.cpp
@@ -0,0 +1,583 @@
+// @file restore.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "../pch.h"
+#include "../client/dbclient.h"
+#include "../util/mmap.h"
+#include "../util/version.h"
+#include "tool.h"
+
+#include <boost/program_options.hpp>
+
+#include <fcntl.h>
+#include <set>
+
+using namespace mongo;
+
+namespace po = boost::program_options;
+
+namespace {
+ const char* OPLOG_SENTINEL = "$oplog"; // compare by ptr not strcmp
+}
+
+class Restore : public BSONTool {
+public:
+
+ bool _drop;
+ bool _keepIndexVersion;
+ bool _restoreOptions;
+ bool _restoreIndexes;
+ bool _restoreShardingConfig;
+ int _w;
+ string _curns;
+ string _curdb;
+ string _curcoll;
+ set<string> _users; // For restoring users with --drop
+ auto_ptr<Matcher> _opmatcher; // For oplog replay
+ Restore() : BSONTool( "restore" ) , _drop(false) {
+ add_options()
+ ("drop" , "drop each collection before import" )
+ ("oplogReplay", "replay oplog for point-in-time restore")
+ ("oplogLimit", po::value<string>(), "exclude oplog entries newer than provided timestamp (epoch[:ordinal])")
+ ("keepIndexVersion" , "don't upgrade indexes to newest version")
+ ("noOptionsRestore" , "don't restore collection options")
+ ("noIndexRestore" , "don't restore indexes")
+ ("restoreShardingConfig", "restore sharding configuration before doing the full import")
+ ("w" , po::value<int>()->default_value(1) , "minimum number of replicas per write" )
+ ;
+ add_hidden_options()
+ ("dir", po::value<string>()->default_value("dump"), "directory to restore from")
+ ("indexesLast" , "wait to add indexes (now default)") // left in for backwards compatibility
+ ("forceConfigRestore", "don't use confirmation prompt when doing --restoreShardingConfig") // For testing
+ ;
+ addPositionArg("dir", 1);
+ }
+
+ virtual void printExtraHelp(ostream& out) {
+ out << "Import BSON files into MongoDB.\n" << endl;
+ out << "usage: " << _name << " [options] [directory or filename to restore from]" << endl;
+ }
+
+ virtual int doRun() {
+ auth();
+ path root = getParam("dir");
+
+ // check if we're actually talking to a machine that can write
+ if (!isMaster()) {
+ return -1;
+ }
+
+ _drop = hasParam( "drop" );
+ _keepIndexVersion = hasParam("keepIndexVersion");
+ _restoreOptions = !hasParam("noOptionsRestore");
+ _restoreIndexes = !hasParam("noIndexRestore");
+ _w = getParam( "w" , 1 );
+ _restoreShardingConfig = hasParam("restoreShardingConfig");
+ bool forceConfigRestore = hasParam("forceConfigRestore");
+
+ bool doOplog = hasParam( "oplogReplay" );
+
+ if (doOplog) {
+ // fail early if errors
+
+ if (_db != "") {
+ log() << "Can only replay oplog on full restore" << endl;
+ return -1;
+ }
+
+ if ( ! exists(root / "oplog.bson") ) {
+ log() << "No oplog file to replay. Make sure you run mongodump with --oplog." << endl;
+ return -1;
+ }
+
+
+ BSONObj out;
+ if (! conn().simpleCommand("admin", &out, "buildinfo")) {
+ log() << "buildinfo command failed: " << out["errmsg"].String() << endl;
+ return -1;
+ }
+
+ StringData version = out["version"].valuestr();
+ if (versionCmp(version, "1.7.4-pre-") < 0) {
+ log() << "Can only replay oplog to server version >= 1.7.4" << endl;
+ return -1;
+ }
+
+ string oplogLimit = getParam( "oplogLimit", "" );
+ string oplogInc = "0";
+
+ if(!oplogLimit.empty()) {
+ size_t i = oplogLimit.find_first_of(':');
+ if ( i != string::npos ) {
+ if ( i + 1 < oplogLimit.length() ) {
+ oplogInc = oplogLimit.substr(i + 1);
+ }
+
+ oplogLimit = oplogLimit.substr(0, i);
+ }
+
+ if ( ! oplogLimit.empty() ) {
+ _opmatcher.reset( new Matcher( fromjson( string("{ \"ts\": { \"$lt\": { \"$timestamp\": { \"t\": ") + oplogLimit + string(", \"i\": ") + oplogInc + string(" } } } }") ) ) );
+ }
+ }
+ }
+
+ if (_restoreShardingConfig) {
+ if (_db != "" && _db != "config") {
+ log() << "Can only setup sharding configuration on full restore or on restoring config database" << endl;
+ return -1;
+ }
+
+ // make sure we're talking to a mongos
+ if (!isMongos()) {
+ log() << "Can only use --restoreShardingConfig on a mongos" << endl;
+ return -1;
+ }
+
+ if ( ! exists(root / "config") ) {
+ log() << "No config directory to restore sharding setup from." << endl;
+ return -1;
+ }
+
+ // Make sure this isn't an active cluster.
+ log() << "WARNING: this will drop the config database, overriding any sharding configuration you currently have." << endl
+ << "DO NOT RUN THIS COMMAND WHILE YOUR CLUSTER IS ACTIVE" << endl;
+ if (forceConfigRestore) {
+ log() << "Running with --forceConfigRestore. Continuing." << endl;
+ } else {
+ if (!confirmAction()) {
+ log() << "aborting" << endl;
+ return -1;
+ }
+ }
+
+ // Restore config database
+ BSONObj info;
+ bool ok = conn().runCommand( "config" , BSON( "dropDatabase" << 1 ) , info );
+ if (!ok) {
+ log() << "Error dropping config database. Aborting" << endl;
+ return -1;
+ }
+ drillDown(root / "config", false, false);
+
+ log() << "Finished restoring config database." << endl
+ << "Calling flushRouterConfig on this connection" << endl;
+ conn().runCommand( "config" , BSON( "flushRouterConfig" << 1 ) , info );
+ }
+
+ /* If _db is not "" then the user specified a db name to restore as.
+ *
+ * In that case we better be given either a root directory that
+ * contains only .bson files or a single .bson file (a db).
+ *
+ * In the case where a collection name is specified we better be
+ * given either a root directory that contains only a single
+ * .bson file, or a single .bson file itself (a collection).
+ */
+ drillDown(root, _db != "", _coll != "", true);
+
+ if (_restoreShardingConfig) {
+ log() << "Flushing routing configuration from all mongos that we're aware of" << endl;
+ flushAllMongos();
+ log(1) << "Finished flushing the sharding configuration on all mongos' that the dumped config data knew of." << endl
+ << "If there are new mongos' that weren't just flushed, make sure to call flushRouterConfig on them." << endl;
+ }
+
+ // should this happen for oplog replay as well?
+ conn().getLastError();
+
+ if (doOplog) {
+ log() << "\t Replaying oplog" << endl;
+ _curns = OPLOG_SENTINEL;
+ processFile( root / "oplog.bson" );
+ }
+
+ return EXIT_CLEAN;
+ }
+
+ void drillDown( path root, bool use_db, bool use_coll, bool top_level=false ) {
+ log(2) << "drillDown: " << root.string() << endl;
+
+ // skip hidden files and directories
+ if (root.leaf()[0] == '.' && root.leaf() != ".")
+ return;
+
+ if ( is_directory( root ) ) {
+ directory_iterator end;
+ directory_iterator i(root);
+ path indexes;
+ while ( i != end ) {
+ path p = *i;
+ i++;
+
+ if (use_db) {
+ if (is_directory(p)) {
+ error() << "ERROR: root directory must be a dump of a single database" << endl;
+ error() << " when specifying a db name with --db" << endl;
+ printHelp(cout);
+ return;
+ }
+ }
+
+ if (use_coll) {
+ if (is_directory(p) || i != end) {
+ error() << "ERROR: root directory must be a dump of a single collection" << endl;
+ error() << " when specifying a collection name with --collection" << endl;
+ printHelp(cout);
+ return;
+ }
+ }
+
+ // don't insert oplog
+ if (top_level && !use_db && p.leaf() == "oplog.bson")
+ continue;
+
+ if ( p.leaf() == "system.indexes.bson" ) {
+ indexes = p;
+ }
+ else if (_restoreShardingConfig && is_directory(p) && p.leaf() == "config") {
+ // Config directory should have already been restored. Skip it here.
+ continue;
+ }
+ else {
+ drillDown(p, use_db, use_coll);
+ }
+ }
+
+ if (!indexes.empty())
+ drillDown(indexes, use_db, use_coll);
+
+ return;
+ }
+
+ if ( endsWith( root.string().c_str() , ".metadata.json" ) ) {
+ // Metadata files are handled when the corresponding .bson file is handled
+ return;
+ }
+
+ if ( ! ( endsWith( root.string().c_str() , ".bson" ) ||
+ endsWith( root.string().c_str() , ".bin" ) ) ) {
+ error() << "don't know what to do with file [" << root.string() << "]" << endl;
+ return;
+ }
+
+ log() << root.string() << endl;
+
+ if ( root.leaf() == "system.profile.bson" ) {
+ log() << "\t skipping" << endl;
+ return;
+ }
+
+ string ns;
+ if (use_db) {
+ ns += _db;
+ }
+ else {
+ string dir = root.branch_path().string();
+ if ( dir.find( "/" ) == string::npos )
+ ns += dir;
+ else
+ ns += dir.substr( dir.find_last_of( "/" ) + 1 );
+
+ if ( ns.size() == 0 )
+ ns = "test";
+ }
+
+ assert( ns.size() );
+
+ string oldCollName = root.leaf(); // Name of the collection that was dumped from
+ oldCollName = oldCollName.substr( 0 , oldCollName.find_last_of( "." ) );
+ if (use_coll) {
+ ns += "." + _coll;
+ }
+ else {
+ ns += "." + oldCollName;
+ }
+
+ log() << "\tgoing into namespace [" << ns << "]" << endl;
+
+ if ( _drop ) {
+ if (root.leaf() != "system.users.bson" ) {
+ log() << "\t dropping" << endl;
+ conn().dropCollection( ns );
+ } else {
+ // Create map of the users currently in the DB
+ BSONObj fields = BSON("user" << 1);
+ scoped_ptr<DBClientCursor> cursor(conn().query(ns, Query(), 0, 0, &fields));
+ while (cursor->more()) {
+ BSONObj user = cursor->next();
+ _users.insert(user["user"].String());
+ }
+ }
+ }
+
+ BSONObj metadataObject;
+ if (_restoreOptions || _restoreIndexes) {
+ path metadataFile = (root.branch_path() / (oldCollName + ".metadata.json"));
+ if (!exists(metadataFile.string())) {
+ // This is fine because dumps from before 2.1 won't have a metadata file, just print a warning.
+ // System collections shouldn't have metadata so don't warn if that file is missing.
+ if (!startsWith(metadataFile.leaf(), "system.")) {
+ log() << metadataFile.string() << " not found. Skipping." << endl;
+ }
+ } else {
+ metadataObject = parseMetadataFile(metadataFile.string());
+ }
+ }
+
+ _curns = ns.c_str();
+ _curdb = NamespaceString(_curns).db;
+ _curcoll = NamespaceString(_curns).coll;
+
+ if (_restoreOptions && metadataObject.hasField("options")) {
+ // Try to create collection with given options
+ createCollectionWithOptions(metadataObject["options"].Obj());
+ }
+
+ processFile( root );
+ if (_drop && root.leaf() == "system.users.bson") {
+ // Delete any users that used to exist but weren't in the dump file
+ for (set<string>::iterator it = _users.begin(); it != _users.end(); ++it) {
+ BSONObj userMatch = BSON("user" << *it);
+ conn().remove(ns, Query(userMatch));
+ }
+ _users.clear();
+ }
+
+ if (_restoreIndexes && metadataObject.hasField("indexes")) {
+ vector<BSONElement> indexes = metadataObject["indexes"].Array();
+ for (vector<BSONElement>::iterator it = indexes.begin(); it != indexes.end(); ++it) {
+ createIndex((*it).Obj(), false);
+ }
+ }
+ }
+
+ virtual void gotObject( const BSONObj& obj ) {
+ if (_curns == OPLOG_SENTINEL) { // intentional ptr compare
+ if (obj["op"].valuestr()[0] == 'n') // skip no-ops
+ return;
+
+ // exclude operations that don't meet (timestamp) criteria
+ if ( _opmatcher.get() && ! _opmatcher->matches ( obj ) ) {
+ return;
+ }
+
+ string db = obj["ns"].valuestr();
+ db = db.substr(0, db.find('.'));
+
+ BSONObj cmd = BSON( "applyOps" << BSON_ARRAY( obj ) );
+ BSONObj out;
+ conn().runCommand(db, cmd, out);
+
+ // wait for ops to propagate to "w" nodes (doesn't warn if w used without replset)
+ if ( _w > 1 ) {
+ conn().getLastError(false, false, _w);
+ }
+ }
+ else if ( endsWith( _curns.c_str() , ".system.indexes" )) {
+ createIndex(obj, true);
+ }
+ else if (_drop && endsWith(_curns.c_str(), ".system.users") && _users.count(obj["user"].String())) {
+ // Since system collections can't be dropped, we have to manually
+ // replace the contents of the system.users collection
+ BSONObj userMatch = BSON("user" << obj["user"].String());
+ conn().update(_curns, Query(userMatch), obj);
+ _users.erase(obj["user"].String());
+ } else {
+ conn().insert( _curns , obj );
+
+ // wait for insert to propagate to "w" nodes (doesn't warn if w used without replset)
+ if ( _w > 1 ) {
+ conn().getLastErrorDetailed(false, false, _w);
+ }
+ }
+ }
+
+private:
+
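+ // Reads an entire .metadata.json file and parses it as a single JSON document.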
+ BSONObj parseMetadataFile(string filePath) {
+ long long fileSize = file_size(filePath);
+ ifstream file(filePath.c_str(), ios_base::in);
+
+ scoped_ptr<char> buf(new char[fileSize]);
+ file.read(buf.get(), fileSize);
+ int objSize;
+ BSONObj obj;
+ obj = fromjson (buf.get(), &objSize);
+ uassert(15934, "JSON object size didn't match file size", objSize == fileSize);
+ return obj;
+ }
+
+ // Compares 2 BSONObj representing collection options. Returns true if the objects
+ // represent different options. Ignores the "create" field.
+ bool optionsSame(BSONObj obj1, BSONObj obj2) {
+ int nfields = 0;
+ BSONObjIterator i(obj1);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if (!obj2.hasField(e.fieldName())) {
+ if (strcmp(e.fieldName(), "create") == 0) {
+ continue;
+ } else {
+ return false;
+ }
+ }
+ nfields++;
+ if (e != obj2[e.fieldName()]) {
+ return false;
+ }
+ }
+ return nfields == obj2.nFields();
+ }
+
+ void createCollectionWithOptions(BSONObj cmdObj) {
+ if (!cmdObj.hasField("create") || cmdObj["create"].String() != _curcoll) {
+ BSONObjBuilder bo;
+ if (!cmdObj.hasField("create")) {
+ bo.append("create", _curcoll);
+ }
+
+ BSONObjIterator i(cmdObj);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if (strcmp(e.fieldName(), "create") == 0) {
+ bo.append("create", _curcoll);
+ }
+ else {
+ bo.append(e);
+ }
+ }
+ cmdObj = bo.obj();
+ }
+
+ BSONObj fields = BSON("options" << 1);
+ scoped_ptr<DBClientCursor> cursor(conn().query(_curdb + ".system.namespaces", Query(BSON("name" << _curns)), 0, 0, &fields));
+
+ bool createColl = true;
+ if (cursor->more()) {
+ createColl = false;
+ BSONObj obj = cursor->next();
+ if (!obj.hasField("options") || !optionsSame(cmdObj, obj["options"].Obj())) {
+ log() << "WARNING: collection " << _curns << " exists with different options than are in the metadata.json file and not using --drop. Options in the metadata file will be ignored." << endl;
+ }
+ }
+
+ if (!createColl) {
+ return;
+ }
+
+ BSONObj info;
+ if (!conn().runCommand(_curdb, cmdObj, info)) {
+ uasserted(15936, "Creating collection " + _curns + " failed. Errmsg: " + info["errmsg"].String());
+ } else {
+ log() << "\tCreated collection " << _curns << " with options: " << cmdObj.jsonString() << endl;
+ }
+ }
+
+ /* We must handle if the dbname or collection name is different at restore time than what was dumped.
+ If keepCollName is true, however, we keep the same collection name that's in the index object.
+ */
+ void createIndex(BSONObj indexObj, bool keepCollName) {
+ BSONObjBuilder bo;
+ BSONObjIterator i(indexObj);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if (strcmp(e.fieldName(), "ns") == 0) {
+ NamespaceString n(e.String());
+ string s = _curdb + "." + (keepCollName ? n.coll : _curcoll);
+ bo.append("ns", s);
+ }
+ else if (strcmp(e.fieldName(), "v") != 0 || _keepIndexVersion) { // Remove index version number
+ bo.append(e);
+ }
+ }
+ BSONObj o = bo.obj();
+ log(0) << "\tCreating index: " << o << endl;
+ conn().insert( _curdb + ".system.indexes" , o );
+
+ // We're stricter about errors for indexes than for regular data
+ BSONObj err = conn().getLastErrorDetailed(false, false, _w);
+
+ if ( ! ( err["err"].isNull() ) ) {
+ if (err["err"].String() == "norepl" && _w > 1) {
+ error() << "Cannot specify write concern for non-replicas" << endl;
+ }
+ else {
+ error() << "Error creating index " << o["ns"].String();
+ error() << ": " << err["code"].Int() << " " << err["err"].String() << endl;
+ error() << "To resume index restoration, run " << _name << " on file" << _fileName << " manually." << endl;
+ }
+
+ ::abort();
+ }
+ }
+
+ // Calls flushRouterConfig on every mongos listed in the config db's mongos collection, as well
+ // as on the one we're currently connected to.
+ void flushAllMongos() {
+ BSONObj info;
+
+ auto_ptr<DBClientCursor> cursor = conn().query ("config.mongos", Query());
+ while (cursor->more()) {
+ BSONObj obj = cursor->nextSafe();
+ string mongos = obj.getField("_id").valuestr();
+ string errmsg;
+ ConnectionString cs = ConnectionString::parse( mongos , errmsg );
+
+ if ( ! cs.isValid() ) {
+ error() << "invalid mongos hostname [" << mongos << "] " << errmsg << endl;
+ continue;
+ }
+
+ DBClientBase* mongosConn = cs.connect( errmsg );
+ if ( ! mongosConn ) {
+ error() << "Error connecting to mongos [" << mongos << "]: " << errmsg << endl;
+ continue;
+ }
+
+ log(1) << "Calling flushRouterConfig on mongos: " << mongos << endl;
+ mongosConn->runCommand( "config" , BSON( "flushRouterConfig" << 1 ) , info );
+ }
+ }
+
+ bool confirmAction() {
+ string userInput;
+ int attemptCount = 0;
+ while (attemptCount < 3) {
+ log() << "Are you sure you want to continue? [y/n]: ";
+ cin >> userInput;
+ if (userInput == "Y" || userInput == "y" || userInput == "yes" || userInput == "Yes" || userInput == "YES") {
+ return true;
+ }
+ else if (userInput == "N" || userInput == "n" || userInput == "no" || userInput == "No" || userInput == "NO") {
+ return false;
+ }
+ else {
+ log() << "Invalid input." << endl;
+ attemptCount++;
+ }
+ }
+ log() << "Entered invalid input 3 times in a row." << endl;
+ return false;
+ }
+};
+
+int main( int argc , char ** argv ) {
+ Restore restore;
+ return restore.main( argc , argv );
+}
diff --git a/src/mongo/tools/sniffer.cpp b/src/mongo/tools/sniffer.cpp
new file mode 100644
index 00000000000..aeab808cfed
--- /dev/null
+++ b/src/mongo/tools/sniffer.cpp
@@ -0,0 +1,566 @@
+// sniffer.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+/*
+ TODO:
+ large messages - need to track what's left and ignore
+ single object over packet size - can only display beginning of object
+
+ getmore
+ delete
+ killcursors
+
+ */
+#include "../pch.h"
+#include <pcap.h>
+
+#ifdef _WIN32
+#undef min
+#undef max
+#endif
+
+#include "../bson/util/builder.h"
+#include "../util/net/message.h"
+#include "../util/mmap.h"
+#include "../db/dbmessage.h"
+#include "../client/dbclient.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <errno.h>
+#include <sys/types.h>
+#ifndef _WIN32
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#endif
+
+#include <iostream>
+#include <map>
+#include <string>
+
+#include <boost/shared_ptr.hpp>
+
+using namespace std;
+using mongo::asserted;
+using mongo::Message;
+using mongo::MsgData;
+using mongo::DbMessage;
+using mongo::BSONObj;
+using mongo::BufBuilder;
+using mongo::DBClientConnection;
+using mongo::QueryResult;
+using mongo::MemoryMappedFile;
+
+mongo::CmdLine mongo::cmdLine;
+namespace mongo {
+ void setupSignals( bool inFork ){}
+}
+
+#define SNAP_LEN 65535
+
+int captureHeaderSize;
+set<int> serverPorts;
+string forwardAddress;
+bool objcheck = false;
+
+ostream *outPtr = &cout;
+ostream &out() { return *outPtr; }
+
+/* IP header */
+struct sniff_ip {
+ u_char ip_vhl; /* version << 4 | header length >> 2 */
+ u_char ip_tos; /* type of service */
+ u_short ip_len; /* total length */
+ u_short ip_id; /* identification */
+ u_short ip_off; /* fragment offset field */
+#define IP_RF 0x8000 /* reserved fragment flag */
+#define IP_DF 0x4000 /* dont fragment flag */
+#define IP_MF 0x2000 /* more fragments flag */
+#define IP_OFFMASK 0x1fff /* mask for fragmenting bits */
+ u_char ip_ttl; /* time to live */
+ u_char ip_p; /* protocol */
+ u_short ip_sum; /* checksum */
+ struct in_addr ip_src,ip_dst; /* source and dest address */
+};
+#define IP_HL(ip) (((ip)->ip_vhl) & 0x0f)
+#define IP_V(ip) (((ip)->ip_vhl) >> 4)
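+// IP_HL yields the header length in 32-bit words: e.g. ip_vhl == 0x45 decodes
+// to version 4 with a 5-word (20-byte) header.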
+
+/* TCP header */
+#ifdef _WIN32
+typedef unsigned __int32 uint32_t;
+#endif
+typedef uint32_t tcp_seq;
+
+struct sniff_tcp {
+ u_short th_sport; /* source port */
+ u_short th_dport; /* destination port */
+ tcp_seq th_seq; /* sequence number */
+ tcp_seq th_ack; /* acknowledgement number */
+ u_char th_offx2; /* data offset, rsvd */
+#define TH_OFF(th) (((th)->th_offx2 & 0xf0) >> 4)
+ u_char th_flags;
+#define TH_FIN 0x01
+#define TH_SYN 0x02
+#define TH_RST 0x04
+#define TH_PUSH 0x08
+#define TH_ACK 0x10
+#define TH_URG 0x20
+#define TH_ECE 0x40
+#define TH_CWR 0x80
+
+#ifndef TH_FLAGS
+#define TH_FLAGS (TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG|TH_ECE|TH_CWR)
+#endif
+
+ u_short th_win; /* window */
+ u_short th_sum; /* checksum */
+ u_short th_urp; /* urgent pointer */
+};
+
+#pragma pack( 1 )
+struct Connection {
+ struct in_addr srcAddr;
+ u_short srcPort;
+ struct in_addr dstAddr;
+ u_short dstPort;
+ bool operator<( const Connection &other ) const {
+ return memcmp( this, &other, sizeof( Connection ) ) < 0;
+ }
+ Connection reverse() const {
+ Connection c;
+ c.srcAddr = dstAddr;
+ c.srcPort = dstPort;
+ c.dstAddr = srcAddr;
+ c.dstPort = srcPort;
+ return c;
+ }
+};
+#pragma pack()
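+// The 1-byte packing above leaves no padding inside Connection, so operator<
+// can impose a total order with a raw memcmp over the whole struct.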
+
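+// Per-connection sniffer state: reassembly buffers, expected TCP sequence
+// numbers, forwarding connections, and sniffed-to-forwarded cursor id maps.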
+map< Connection, bool > seen;
+map< Connection, int > bytesRemainingInMessage;
+map< Connection, boost::shared_ptr< BufBuilder > > messageBuilder;
+map< Connection, unsigned > expectedSeq;
+map< Connection, boost::shared_ptr<DBClientConnection> > forwarder;
+map< Connection, long long > lastCursor;
+map< Connection, map< long long, long long > > mapCursor;
+
+void processMessage( Connection& c , Message& d );
+
+void got_packet(u_char *args, const struct pcap_pkthdr *header, const u_char *packet) {
+
+ const struct sniff_ip* ip = (struct sniff_ip*)(packet + captureHeaderSize);
+ int size_ip = IP_HL(ip)*4;
+ if ( size_ip < 20 ) {
+ cerr << "*** Invalid IP header length: " << size_ip << " bytes" << endl;
+ return;
+ }
+
+ assert( ip->ip_p == IPPROTO_TCP );
+
+ const struct sniff_tcp* tcp = (struct sniff_tcp*)(packet + captureHeaderSize + size_ip);
+ int size_tcp = TH_OFF(tcp)*4;
+ if (size_tcp < 20) {
+ cerr << "*** Invalid TCP header length: " << size_tcp << " bytes" << endl;
+ return;
+ }
+
+ if ( ! ( serverPorts.count( ntohs( tcp->th_sport ) ) ||
+ serverPorts.count( ntohs( tcp->th_dport ) ) ) ) {
+ return;
+ }
+
+ const u_char * payload = (const u_char*)(packet + captureHeaderSize + size_ip + size_tcp);
+
+ unsigned totalSize = ntohs(ip->ip_len);
+ assert( totalSize <= header->caplen );
+
+ int size_payload = totalSize - (size_ip + size_tcp);
+ if (size_payload <= 0 )
+ return;
+
+ Connection c;
+ c.srcAddr = ip->ip_src;
+ c.srcPort = tcp->th_sport;
+ c.dstAddr = ip->ip_dst;
+ c.dstPort = tcp->th_dport;
+
+ if ( seen[ c ] ) {
+ if ( expectedSeq[ c ] != ntohl( tcp->th_seq ) ) {
+ cerr << "Warning: sequence # mismatch, there may be dropped packets" << endl;
+ }
+ }
+ else {
+ seen[ c ] = true;
+ }
+
+ expectedSeq[ c ] = ntohl( tcp->th_seq ) + size_payload;
+
+ Message m;
+
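+ // Reassemble wire-protocol messages that may span TCP segments: a payload
+ // can hold one complete message, the start of a larger one (buffered in
+ // messageBuilder), or the continuation of one already being buffered.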
+ if ( bytesRemainingInMessage[ c ] == 0 ) {
+ m.setData( (MsgData*)payload , false );
+ if ( !m.header()->valid() ) {
+ cerr << "Invalid message start, skipping packet." << endl;
+ return;
+ }
+ if ( size_payload > m.header()->len ) {
+ cerr << "Multiple messages in packet, skipping packet." << endl;
+ return;
+ }
+ if ( size_payload < m.header()->len ) {
+ bytesRemainingInMessage[ c ] = m.header()->len - size_payload;
+ messageBuilder[ c ].reset( new BufBuilder() );
+ messageBuilder[ c ]->appendBuf( (void*)payload, size_payload );
+ return;
+ }
+ }
+ else {
+ bytesRemainingInMessage[ c ] -= size_payload;
+ messageBuilder[ c ]->appendBuf( (void*)payload, size_payload );
+ if ( bytesRemainingInMessage[ c ] < 0 ) {
+ cerr << "Received too many bytes to complete message, resetting buffer" << endl;
+ bytesRemainingInMessage[ c ] = 0;
+ messageBuilder[ c ].reset();
+ return;
+ }
+ if ( bytesRemainingInMessage[ c ] > 0 )
+ return;
+ m.setData( (MsgData*)messageBuilder[ c ]->buf(), true );
+ messageBuilder[ c ]->decouple();
+ messageBuilder[ c ].reset();
+ }
+
+ DbMessage d( m );
+
+ out() << inet_ntoa(ip->ip_src) << ":" << ntohs( tcp->th_sport )
+ << ( serverPorts.count( ntohs( tcp->th_dport ) ) ? " -->> " : " <<-- " )
+ << inet_ntoa(ip->ip_dst) << ":" << ntohs( tcp->th_dport )
+ << " " << d.getns()
+ << " " << m.header()->len << " bytes "
+ << " id:" << hex << m.header()->id << dec << "\t" << m.header()->id;
+
+ processMessage( c , m );
+}
+
+class AuditingDbMessage : public DbMessage {
+public:
+ AuditingDbMessage( const Message &m ) : DbMessage( m ) {}
+ BSONObj nextJsObj( const char *context ) {
+ BSONObj ret = DbMessage::nextJsObj();
+ if ( objcheck && !ret.valid() ) {
+ // TODO provide more debugging info
+ cout << "invalid object in " << context << ": " << ret.hexDump() << endl;
+ }
+ return ret;
+ }
+};
+
+void processMessage( Connection& c , Message& m ) {
+ AuditingDbMessage d(m);
+
+ if ( m.operation() == mongo::opReply )
+ out() << " - " << (unsigned)m.header()->responseTo;
+ out() << '\n';
+
+ try {
+ switch( m.operation() ) {
+ case mongo::opReply: {
+ mongo::QueryResult* r = (mongo::QueryResult*)m.singleData();
+ out() << "\treply" << " n:" << r->nReturned << " cursorId: " << r->cursorId << endl;
+ if ( r->nReturned ) {
+ mongo::BSONObj o( r->data() );
+ out() << "\t" << o << endl;
+ }
+ break;
+ }
+ case mongo::dbQuery: {
+ mongo::QueryMessage q(d);
+ out() << "\tquery: " << q.query << " ntoreturn: " << q.ntoreturn << " ntoskip: " << q.ntoskip;
+ if( !q.fields.isEmpty() )
+ out() << " hasfields";
+ if( q.queryOptions & mongo::QueryOption_SlaveOk )
+ out() << " SlaveOk";
+ if( q.queryOptions & mongo::QueryOption_NoCursorTimeout )
+ out() << " NoCursorTimeout";
+ if( q.queryOptions & ~(mongo::QueryOption_SlaveOk | mongo::QueryOption_NoCursorTimeout) )
+ out() << " queryOptions:" << hex << q.queryOptions;
+ out() << endl;
+ break;
+ }
+ case mongo::dbUpdate: {
+ int flags = d.pullInt();
+ BSONObj q = d.nextJsObj( "update" );
+ BSONObj o = d.nextJsObj( "update" );
+ out() << "\tupdate flags:" << flags << " q:" << q << " o:" << o << endl;
+ break;
+ }
+ case mongo::dbInsert: {
+ out() << "\tinsert: " << d.nextJsObj( "insert" ) << endl;
+ while ( d.moreJSObjs() ) {
+ out() << "\t\t" << d.nextJsObj( "insert" ) << endl;
+ }
+ break;
+ }
+ case mongo::dbGetMore: {
+ int nToReturn = d.pullInt();
+ long long cursorId = d.pullInt64();
+ out() << "\tgetMore nToReturn: " << nToReturn << " cursorId: " << cursorId << endl;
+ break;
+ }
+ case mongo::dbDelete: {
+ int flags = d.pullInt();
+ BSONObj q = d.nextJsObj( "delete" );
+ out() << "\tdelete flags: " << flags << " q: " << q << endl;
+ break;
+ }
+ case mongo::dbKillCursors: {
+ int *x = (int *) m.singleData()->_data;
+ x++; // reserved
+ int n = *x;
+ out() << "\tkillCursors n: " << n << endl;
+ break;
+ }
+ default:
+ out() << "\tunknown opcode " << m.operation() << endl;
+ cerr << "*** CANNOT HANDLE TYPE: " << m.operation() << endl;
+ }
+ }
+ catch ( ... ) {
+ cerr << "Error parsing message for operation: " << m.operation() << endl;
+ }
+
+
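+ // When --forward is set, replay each sniffed request against the target
+ // server. Cursor ids returned by the target differ from the sniffed ones,
+ // so mapCursor tracks sniffed-to-forwarded ids per connection and getMore
+ // requests are rewritten before being replayed.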
+ if ( !forwardAddress.empty() ) {
+ if ( m.operation() != mongo::opReply ) {
+ boost::shared_ptr<DBClientConnection> conn = forwarder[ c ];
+ if ( !conn ) {
+ conn.reset(new DBClientConnection( true ));
+ conn->connect( forwardAddress );
+ forwarder[ c ] = conn;
+ }
+ if ( m.operation() == mongo::dbQuery || m.operation() == mongo::dbGetMore ) {
+ if ( m.operation() == mongo::dbGetMore ) {
+ DbMessage d( m );
+ d.pullInt();
+ long long &cId = d.pullInt64();
+ cId = mapCursor[ c ][ cId ];
+ }
+ Message response;
+ conn->port().call( m, response );
+ QueryResult *qr = (QueryResult *) response.singleData();
+ if ( !( qr->resultFlags() & mongo::ResultFlag_CursorNotFound ) ) {
+ if ( qr->cursorId != 0 ) {
+ lastCursor[ c ] = qr->cursorId;
+ return;
+ }
+ }
+ lastCursor[ c ] = 0;
+ }
+ else {
+ conn->port().say( m );
+ }
+ }
+ else {
+ Connection r = c.reverse();
+ long long myCursor = lastCursor[ r ];
+ QueryResult *qr = (QueryResult *) m.singleData();
+ long long yourCursor = qr->cursorId;
+ if ( ( qr->resultFlags() & mongo::ResultFlag_CursorNotFound ) )
+ yourCursor = 0;
+ if ( myCursor && !yourCursor )
+ cerr << "Expected valid cursor in sniffed response, found none" << endl;
+ if ( !myCursor && yourCursor )
+ cerr << "Sniffed valid cursor when none expected" << endl;
+ if ( myCursor && yourCursor ) {
+ mapCursor[ r ][ qr->cursorId ] = lastCursor[ r ];
+ lastCursor[ r ] = 0;
+ }
+ }
+ }
+}
+
+void processDiagLog( const char * file ) {
+ Connection c;
+ MemoryMappedFile f;
+ long length;
+ unsigned long long L = 0;
+ char * root = (char*)f.map( file , L, MemoryMappedFile::SEQUENTIAL );
+ assert( L < 0x80000000 );
+ length = (long) L;
+ assert( root );
+ assert( length > 0 );
+
+ char * pos = root;
+
+ long read = 0;
+ while ( read < length ) {
+ Message m(pos,false);
+ int len = m.header()->len;
+ DbMessage d(m);
+ cout << len << " " << d.getns() << endl;
+
+ processMessage( c , m );
+
+ read += len;
+ pos += len;
+ }
+
+ f.close();
+}
+
+void usage() {
+ cout <<
+ "Usage: mongosniff [--help] [--forward host:port] [--source (NET <interface> | (FILE | DIAGLOG) <filename>)] [<port0> <port1> ... ]\n"
+ "--forward Forward all parsed request messages to mongod instance at \n"
+ " specified host:port\n"
+ "--source Source of traffic to sniff, either a network interface or a\n"
+ " file containing previously captured packets in pcap format,\n"
+ " or a file containing output from mongod's --diaglog option.\n"
+ " If no source is specified, mongosniff will attempt to sniff\n"
+ " from one of the machine's network interfaces.\n"
+ "--objcheck Log hex representation of invalid BSON objects and nothing\n"
+ " else. Spurious messages about invalid objects may result\n"
+ " when there are dropped tcp packets.\n"
+ "<port0>... These parameters are used to filter sniffing. By default, \n"
+ " only port 27017 is sniffed.\n"
+ "--help Print this help message.\n"
+ << endl;
+}
+
+int main(int argc, char **argv) {
+
+ stringstream nullStream;
+ nullStream.clear(ios::failbit);
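+ // A stream with failbit set silently discards formatted output; it stands in
+ // for cout under --objcheck so only invalid-object dumps get printed.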
+
+ const char *dev = NULL;
+ char errbuf[PCAP_ERRBUF_SIZE];
+ pcap_t *handle;
+
+ struct bpf_program fp;
+ bpf_u_int32 mask;
+ bpf_u_int32 net;
+
+ bool source = false;
+ bool replay = false;
+ bool diaglog = false;
+ const char *file = 0;
+
+ vector< const char * > args;
+ for( int i = 1; i < argc; ++i )
+ args.push_back( argv[ i ] );
+
+ try {
+ for( unsigned i = 0; i < args.size(); ++i ) {
+ const char *arg = args[ i ];
+ if ( arg == string( "--help" ) ) {
+ usage();
+ return 0;
+ }
+ else if ( arg == string( "--forward" ) ) {
+ forwardAddress = args[ ++i ];
+ }
+ else if ( arg == string( "--source" ) ) {
+ uassert( 10266 , "can't use --source twice" , source == false );
+ uassert( 10267 , "source needs more args" , args.size() > i + 2);
+ source = true;
+ replay = ( args[ ++i ] == string( "FILE" ) );
+ diaglog = ( args[ i ] == string( "DIAGLOG" ) );
+ if ( replay || diaglog )
+ file = args[ ++i ];
+ else
+ dev = args[ ++i ];
+ }
+ else if ( arg == string( "--objcheck" ) ) {
+ objcheck = true;
+ outPtr = &nullStream;
+ }
+ else {
+ serverPorts.insert( atoi( args[ i ] ) );
+ }
+ }
+ }
+ catch ( ... ) {
+ usage();
+ return -1;
+ }
+
+ if ( !serverPorts.size() )
+ serverPorts.insert( 27017 );
+
+ if ( diaglog ) {
+ processDiagLog( file );
+ return 0;
+ }
+ else if ( replay ) {
+ handle = pcap_open_offline(file, errbuf);
+ if ( ! handle ) {
+ cerr << "error opening capture file!" << endl;
+ return -1;
+ }
+ }
+ else {
+ if ( !dev ) {
+ dev = pcap_lookupdev(errbuf);
+ if ( ! dev ) {
+ cerr << "error finding device: " << errbuf << endl;
+ return -1;
+ }
+ cout << "found device: " << dev << endl;
+ }
+ if (pcap_lookupnet(dev, &net, &mask, errbuf) == -1) {
+ cerr << "can't get netmask: " << errbuf << endl;
+ return -1;
+ }
+ handle = pcap_open_live(dev, SNAP_LEN, 1, 1000, errbuf);
+ if ( ! handle ) {
+ cerr << "error opening device: " << errbuf << endl;
+ return -1;
+ }
+ }
+
+ switch ( pcap_datalink( handle ) ) {
+ case DLT_EN10MB:
+ captureHeaderSize = 14;
+ break;
+ case DLT_NULL:
+ captureHeaderSize = 4;
+ break;
+ default:
+ cerr << "don't know how to handle datalink type: " << pcap_datalink( handle ) << endl;
+ }
+
+ assert( pcap_compile(handle, &fp, const_cast< char * >( "tcp" ) , 0, net) != -1 );
+ assert( pcap_setfilter(handle, &fp) != -1 );
+
+ cout << "sniffing... ";
+ for ( set<int>::iterator i = serverPorts.begin(); i != serverPorts.end(); i++ )
+ cout << *i << " ";
+ cout << endl;
+
+ pcap_loop(handle, 0 , got_packet, NULL);
+
+ pcap_freecode(&fp);
+ pcap_close(handle);
+
+ return 0;
+}
+
diff --git a/src/mongo/tools/stat.cpp b/src/mongo/tools/stat.cpp
new file mode 100644
index 00000000000..f5c506308e2
--- /dev/null
+++ b/src/mongo/tools/stat.cpp
@@ -0,0 +1,544 @@
+// stat.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "client/dbclient.h"
+#include "db/json.h"
+#include "../util/net/httpclient.h"
+#include "../util/text.h"
+#include "tool.h"
+#include "stat_util.h"
+#include <fstream>
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace po = boost::program_options;
+
+namespace mongo {
+
+ class Stat : public Tool {
+ public:
+
+ Stat() : Tool( "stat" , REMOTE_SERVER , "admin" ) {
+ _http = false;
+ _many = false;
+
+ add_hidden_options()
+ ( "sleep" , po::value<int>() , "time to sleep between calls" )
+ ;
+ add_options()
+ ("noheaders", "don't output column names")
+ ("rowcount,n", po::value<int>()->default_value(0), "number of stats lines to print (0 for indefinite)")
+ ("http", "use http instead of raw db connection")
+ ("discover" , "discover nodes and display stats for all" )
+ ("all" , "all optional fields" )
+ ;
+
+ addPositionArg( "sleep" , 1 );
+
+ _autoreconnect = true;
+ }
+
+ virtual void printExtraHelp( ostream & out ) {
+ out << "View live MongoDB performance statistics.\n" << endl;
+ out << "usage: " << _name << " [options] [sleep time]" << endl;
+ out << "sleep time: time to wait (in seconds) between calls" << endl;
+ }
+
+ virtual void printExtraHelpAfter( ostream & out ) {
+ out << "\n";
+ out << " Fields\n";
+ out << " inserts \t- # of inserts per second (* means replicated op)\n";
+ out << " query \t- # of queries per second\n";
+ out << " update \t- # of updates per second\n";
+ out << " delete \t- # of deletes per second\n";
+ out << " getmore \t- # of get mores (cursor batch) per second\n";
+ out << " command \t- # of commands per second, on a slave its local|replicated\n";
+ out << " flushes \t- # of fsync flushes per second\n";
+ out << " mapped \t- amount of data mmaped (total data size) megabytes\n";
+ out << " vsize \t- virtual size of process in megabytes\n";
+ out << " res \t- resident size of process in megabytes\n";
+ out << " faults \t- # of pages faults per sec (linux only)\n";
+ out << " locked \t- percent of time in global write lock\n";
+ out << " idx miss \t- percent of btree page misses (sampled)\n";
+ out << " qr|qw \t- queue lengths for clients waiting (read|write)\n";
+ out << " ar|aw \t- active clients (read|write)\n";
+ out << " netIn \t- network traffic in - bits\n";
+ out << " netOut \t- network traffic out - bits\n";
+ out << " conn \t- number of open connections\n";
+ out << " set \t- replica set name\n";
+ out << " repl \t- replication type \n";
+ out << " \t PRI - primary (master)\n";
+ out << " \t SEC - secondary\n";
+ out << " \t REC - recovering\n";
+ out << " \t UNK - unknown\n";
+ out << " \t SLV - slave\n";
+ out << " \t RTR - mongos process (\"router\")\n";
+ }
+
+ BSONObj stats() {
+ if ( _http ) {
+ HttpClient c;
+ HttpClient::Result r;
+
+ string url;
+ {
+ stringstream ss;
+ ss << "http://" << _host;
+ if ( _host.find( ":" ) == string::npos )
+ ss << ":28017";
+ ss << "/_status";
+ url = ss.str();
+ }
+
+ if ( c.get( url , &r ) != 200 ) {
+ cout << "error (http): " << r.getEntireResponse() << endl;
+ return BSONObj();
+ }
+
+ BSONObj x = fromjson( r.getBody() );
+ BSONElement e = x["serverStatus"];
+ if ( e.type() != Object ) {
+ cout << "BROKEN: " << x << endl;
+ return BSONObj();
+ }
+ return e.embeddedObjectUserCheck();
+ }
+ BSONObj out;
+ if ( ! conn().simpleCommand( _db , &out , "serverStatus" ) ) {
+ cout << "error: " << out << endl;
+ return BSONObj();
+ }
+ return out.getOwned();
+ }
+
+
+ virtual void preSetup() {
+ if ( hasParam( "http" ) ) {
+ _http = true;
+ _noconnection = true;
+ }
+
+ if ( hasParam( "host" ) &&
+ getParam( "host" ).find( ',' ) != string::npos ) {
+ _noconnection = true;
+ _many = true;
+ }
+
+ if ( hasParam( "discover" ) ) {
+ _many = true;
+ }
+ }
+
+ int run() {
+ _statUtil.setSeconds( getParam( "sleep" , 1 ) );
+ _statUtil.setAll( hasParam( "all" ) );
+ if ( _many )
+ return runMany();
+ return runNormal();
+ }
+
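+ // Rows are BSON objects whose elements look like
+ // { <column>: { width: <int>, data: <value> } }, as built by
+ // StatUtil::_append; printHeaders prints the column names and printData
+ // prints the "data" values.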
+ static void printHeaders( const BSONObj& o ) {
+ BSONObjIterator i(o);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ BSONObj x = e.Obj();
+ cout << setw( x["width"].numberInt() ) << e.fieldName() << ' ';
+ }
+ cout << endl;
+ }
+
+ static void printData( const BSONObj& o , const BSONObj& headers ) {
+
+ BSONObjIterator i(headers);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ BSONObj h = e.Obj();
+ int w = h["width"].numberInt();
+
+ BSONElement data;
+ {
+ BSONElement temp = o[e.fieldName()];
+ if ( temp.isABSONObj() )
+ data = temp.Obj()["data"];
+ }
+
+ if ( data.type() == String )
+ cout << setw(w) << data.String();
+ else if ( data.type() == NumberDouble )
+ cout << setw(w) << setprecision(3) << data.number();
+ else if ( data.type() == NumberInt )
+ cout << setw(w) << data.numberInt();
+ else if ( data.eoo() )
+ cout << setw(w) << "";
+ else
+ cout << setw(w) << "???";
+
+ cout << ' ';
+ }
+ cout << endl;
+ }
+
+ int runNormal() {
+ bool showHeaders = ! hasParam( "noheaders" );
+ int rowCount = getParam( "rowcount" , 0 );
+ int rowNum = 0;
+
+ auth();
+
+ BSONObj prev = stats();
+ if ( prev.isEmpty() )
+ return -1;
+
+ while ( rowCount == 0 || rowNum < rowCount ) {
+ sleepsecs((int)ceil(_statUtil.getSeconds()));
+ BSONObj now;
+ try {
+ now = stats();
+ }
+ catch ( std::exception& e ) {
+ cout << "can't get data: " << e.what() << endl;
+ continue;
+ }
+
+ if ( now.isEmpty() )
+ return -2;
+
+ try {
+
+ BSONObj out = _statUtil.doRow( prev , now );
+
+ if ( showHeaders && rowNum % 10 == 0 ) {
+ printHeaders( out );
+ }
+
+ printData( out , out );
+
+ }
+ catch ( AssertionException& e ) {
+ cout << "\nerror: " << e.what() << "\n"
+ << now
+ << endl;
+ }
+
+ prev = now;
+ rowNum++;
+ }
+ return 0;
+ }
+
+ struct ServerState {
+ ServerState() : lock( "Stat::ServerState" ) {}
+ string host;
+ scoped_ptr<boost::thread> thr;
+
+ mongo::mutex lock;
+
+ BSONObj prev;
+ BSONObj now;
+ time_t lastUpdate;
+ vector<BSONObj> shards;
+
+ string error;
+ bool mongos;
+
+ string username;
+ string password;
+ };
+
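+ // One polling thread per monitored server: each thread refreshes its
+ // ServerState under the state lock about once a second, while runMany()
+ // reads the snapshots to render rows.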
+ static void serverThread( shared_ptr<ServerState> state ) {
+ try {
+ DBClientConnection conn( true );
+ conn._logLevel = 1;
+ string errmsg;
+ if ( ! conn.connect( state->host , errmsg ) )
+ state->error = errmsg;
+ long long cycleNumber = 0;
+
+ conn.auth("admin", state->username, state->password, errmsg);
+
+ while ( ++cycleNumber ) {
+ try {
+ BSONObj out;
+ if ( conn.simpleCommand( "admin" , &out , "serverStatus" ) ) {
+ scoped_lock lk( state->lock );
+ state->error = "";
+ state->lastUpdate = time(0);
+ state->prev = state->now;
+ state->now = out.getOwned();
+ }
+ else {
+ scoped_lock lk( state->lock );
+ state->error = "serverStatus failed";
+ state->lastUpdate = time(0);
+ }
+
+ if ( out["shardCursorType"].type() == Object ) {
+ state->mongos = true;
+ if ( cycleNumber % 10 == 1 ) {
+ auto_ptr<DBClientCursor> c = conn.query( "config.shards" , BSONObj() );
+ vector<BSONObj> shards;
+ while ( c->more() ) {
+ shards.push_back( c->next().getOwned() );
+ }
+ scoped_lock lk( state->lock );
+ state->shards = shards;
+ }
+ }
+ }
+ catch ( std::exception& e ) {
+ scoped_lock lk( state->lock );
+ state->error = e.what();
+ }
+
+ sleepsecs( 1 );
+ }
+
+
+ }
+ catch ( std::exception& e ) {
+ cout << "serverThread (" << state->host << ") fatal error : " << e.what() << endl;
+ }
+ catch ( ... ) {
+ cout << "serverThread (" << state->host << ") fatal error" << endl;
+ }
+ }
+
+ typedef map<string,shared_ptr<ServerState> > StateMap;
+
+ bool _add( StateMap& threads , string host ) {
+ shared_ptr<ServerState>& state = threads[host];
+ if ( state )
+ return false;
+
+ state.reset( new ServerState() );
+ state->host = host;
+ state->thr.reset( new boost::thread( boost::bind( serverThread , state ) ) );
+ state->username = _username;
+ state->password = _password;
+
+ return true;
+ }
+
+ /**
+ * @param hosts [ "a.foo.com" , "b.foo.com" ]
+ */
+ bool _addAll( StateMap& threads , const BSONObj& hosts ) {
+ BSONObjIterator i( hosts );
+ bool added = false;
+ while ( i.more() ) {
+ bool me = _add( threads , i.next().String() );
+ added = added || me;
+ }
+ return added;
+ }
+
+ bool _discover( StateMap& threads , const string& host , const shared_ptr<ServerState>& ss ) {
+
+ BSONObj info = ss->now;
+
+ bool found = false;
+
+ if ( info["repl"].isABSONObj() ) {
+ BSONObj x = info["repl"].Obj();
+ if ( x["hosts"].isABSONObj() )
+ if ( _addAll( threads , x["hosts"].Obj() ) )
+ found = true;
+ if ( x["passives"].isABSONObj() )
+ if ( _addAll( threads , x["passives"].Obj() ) )
+ found = true;
+ }
+
+ if ( ss->mongos ) {
+ for ( unsigned i=0; i<ss->shards.size(); i++ ) {
+ BSONObj x = ss->shards[i];
+
+ string errmsg;
+ ConnectionString cs = ConnectionString::parse( x["host"].String() , errmsg );
+ if ( errmsg.size() ) {
+ cerr << errmsg << endl;
+ continue;
+ }
+
+ vector<HostAndPort> v = cs.getServers();
+ for ( unsigned i=0; i<v.size(); i++ ) {
+ if ( _add( threads , v[i].toString() ) )
+ found = true;
+ }
+ }
+ }
+
+ return found;
+ }
+
+ int runMany() {
+ StateMap threads;
+
+ {
+ string orig = getParam( "host" );
+ if ( orig == "" )
+ orig = "localhost";
+
+ if ( orig.find( ":" ) == string::npos ) {
+ if ( hasParam( "port" ) )
+ orig += ":" + _params["port"].as<string>();
+ else
+ orig += ":27017";
+ }
+
+ StringSplitter ss( orig.c_str() , "," );
+ while ( ss.more() ) {
+ string host = ss.next();
+ _add( threads , host );
+ }
+ }
+
+ sleepsecs(1);
+
+ int row = 0;
+ bool discover = hasParam( "discover" );
+
+ while ( 1 ) {
+ sleepsecs( (int)ceil(_statUtil.getSeconds()) );
+
+ // collect data
+ vector<Row> rows;
+ for ( map<string,shared_ptr<ServerState> >::iterator i=threads.begin(); i!=threads.end(); ++i ) {
+ scoped_lock lk( i->second->lock );
+
+ if ( i->second->error.size() ) {
+ rows.push_back( Row( i->first , i->second->error ) );
+ }
+ else if ( i->second->prev.isEmpty() || i->second->now.isEmpty() ) {
+ rows.push_back( Row( i->first ) );
+ }
+ else {
+ BSONObj out = _statUtil.doRow( i->second->prev , i->second->now );
+ rows.push_back( Row( i->first , out ) );
+ }
+
+ if ( discover && ! i->second->now.isEmpty() ) {
+ if ( _discover( threads , i->first , i->second ) )
+ break;
+ }
+ }
+
+ // compute some stats
+ unsigned longestHost = 0;
+ BSONObj biggest;
+ for ( unsigned i=0; i<rows.size(); i++ ) {
+ if ( rows[i].host.size() > longestHost )
+ longestHost = rows[i].host.size();
+ if ( rows[i].data.nFields() > biggest.nFields() )
+ biggest = rows[i].data;
+ }
+
+ {
+ // check for any headers not in biggest
+
+ // TODO: we put any new headers at end,
+ // ideally we would interleave
+
+ set<string> seen;
+
+ BSONObjBuilder b;
+
+ {
+ // iterate biggest
+ BSONObjIterator i( biggest );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ seen.insert( e.fieldName() );
+ b.append( e );
+ }
+ }
+
+ // now do the rest
+ for ( unsigned j=0; j<rows.size(); j++ ) {
+ BSONObjIterator i( rows[j].data );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( seen.count( e.fieldName() ) )
+ continue;
+ seen.insert( e.fieldName() );
+ b.append( e );
+ }
+
+ }
+
+ biggest = b.obj();
+
+ }
+
+ // display data
+
+ cout << endl;
+
+ // header
+ if ( row++ % 5 == 0 && ! biggest.isEmpty() ) {
+ cout << setw( longestHost ) << "" << "\t";
+ printHeaders( biggest );
+ }
+
+ // rows
+ for ( unsigned i=0; i<rows.size(); i++ ) {
+ cout << setw( longestHost ) << rows[i].host << "\t";
+ if ( rows[i].err.size() )
+ cout << rows[i].err << endl;
+ else if ( rows[i].data.isEmpty() )
+ cout << "no data" << endl;
+ else
+ printData( rows[i].data , biggest );
+ }
+
+ }
+
+ return 0;
+ }
+
+ StatUtil _statUtil;
+ bool _http;
+ bool _many;
+
+ struct Row {
+ Row( string h , string e ) {
+ host = h;
+ err = e;
+ }
+
+ Row( string h ) {
+ host = h;
+ }
+
+ Row( string h , BSONObj d ) {
+ host = h;
+ data = d;
+ }
+ string host;
+ string err;
+ BSONObj data;
+ };
+ };
+
+}
+
+int main( int argc , char ** argv ) {
+ mongo::Stat stat;
+ return stat.main( argc , argv );
+}
+
diff --git a/src/mongo/tools/stat_util.cpp b/src/mongo/tools/stat_util.cpp
new file mode 100644
index 00000000000..38f780e8734
--- /dev/null
+++ b/src/mongo/tools/stat_util.cpp
@@ -0,0 +1,269 @@
+// stat_util.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "stat_util.h"
+#include "../util/mongoutils/str.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ StatUtil::StatUtil( double seconds , bool all ) :
+ _seconds( seconds ) ,
+ _all( all )
+
+ {
+
+ }
+
+ bool StatUtil::_in( const BSONElement& me , const BSONElement& arr ) {
+ if ( me.type() != String || arr.type() != Array )
+ return false;
+
+ string s = me.String();
+ BSONForEach(e, arr.Obj()) {
+ if ( e.type() == String && s == e.String() )
+ return true;
+ }
+ return false;
+ }
+
+ BSONObj StatUtil::doRow( const BSONObj& a , const BSONObj& b ) {
+ BSONObjBuilder result;
+
+ bool isMongos = b["shardCursorType"].type() == Object; // TODO: should have a better check
+
+ if ( a["opcounters"].isABSONObj() && b["opcounters"].isABSONObj() ) {
+ BSONObj ax = a["opcounters"].embeddedObject();
+ BSONObj bx = b["opcounters"].embeddedObject();
+
+ BSONObj ar = a["opcountersRepl"].isABSONObj() ? a["opcountersRepl"].embeddedObject() : BSONObj();
+ BSONObj br = b["opcountersRepl"].isABSONObj() ? b["opcountersRepl"].embeddedObject() : BSONObj();
+
+ BSONObjIterator i( bx );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( ar.isEmpty() || br.isEmpty() ) {
+ _append( result , e.fieldName() , 6 , (int)diff( e.fieldName() , ax , bx ) );
+ }
+ else {
+ string f = e.fieldName();
+
+ int m = (int)diff( f , ax , bx );
+ int r = (int)diff( f , ar , br );
+
+ string myout;
+
+ if ( f == "command" ) {
+ myout = str::stream() << m << "|" << r;
+ }
+ else if ( f == "getmore" ) {
+ myout = str::stream() << m;
+ }
+ else if ( m && r ) {
+ // this is weird...
+ myout = str::stream() << m << "|" << r;
+ }
+ else if ( m ) {
+ myout = str::stream() << m;
+ }
+ else if ( r ) {
+ myout = str::stream() << "*" << r;
+ }
+ else {
+ myout = "*0";
+ }
+
+ _append( result , f , 6 , myout );
+ }
+ }
+ }
+
+ if ( b["backgroundFlushing"].type() == Object ) {
+ BSONObj ax = a["backgroundFlushing"].embeddedObject();
+ BSONObj bx = b["backgroundFlushing"].embeddedObject();
+ _append( result , "flushes" , 6 , (int)diff( "flushes" , ax , bx ) );
+ }
+
+ if ( b.getFieldDotted("mem.supported").trueValue() ) {
+ BSONObj bx = b["mem"].embeddedObject();
+ BSONObjIterator i( bx );
+ if (!isMongos)
+ _appendMem( result , "mapped" , 6 , bx["mapped"].numberInt() );
+ _appendMem( result , "vsize" , 6 , bx["virtual"].numberInt() );
+ _appendMem( result , "res" , 6 , bx["resident"].numberInt() );
+
+ if ( !isMongos && _all )
+ _appendMem( result , "non-mapped" , 6 , bx["virtual"].numberInt() - bx["mapped"].numberInt() );
+ }
+
+ if ( b["extra_info"].type() == Object ) {
+ BSONObj ax = a["extra_info"].embeddedObject();
+ BSONObj bx = b["extra_info"].embeddedObject();
+ if ( ax["page_faults"].type() || ax["page_faults"].type() )
+ _append( result , "faults" , 6 , (int)diff( "page_faults" , ax , bx ) );
+ }
+
+ if (!isMongos) {
+ _append( result , "locked %" , 8 , percent( "globalLock.totalTime" , "globalLock.lockTime" , a , b ) );
+ _append( result , "idx miss %" , 8 , percent( "indexCounters.btree.accesses" , "indexCounters.btree.misses" , a , b ) );
+ }
+
+ if ( b.getFieldDotted( "globalLock.currentQueue" ).type() == Object ) {
+ int r = b.getFieldDotted( "globalLock.currentQueue.readers" ).numberInt();
+ int w = b.getFieldDotted( "globalLock.currentQueue.writers" ).numberInt();
+ stringstream temp;
+ temp << r << "|" << w;
+ _append( result , "qr|qw" , 9 , temp.str() );
+ }
+
+ if ( b.getFieldDotted( "globalLock.activeClients" ).type() == Object ) {
+ int r = b.getFieldDotted( "globalLock.activeClients.readers" ).numberInt();
+ int w = b.getFieldDotted( "globalLock.activeClients.writers" ).numberInt();
+ stringstream temp;
+ temp << r << "|" << w;
+ _append( result , "ar|aw" , 7 , temp.str() );
+ }
+
+ if ( a["network"].isABSONObj() && b["network"].isABSONObj() ) {
+ BSONObj ax = a["network"].embeddedObject();
+ BSONObj bx = b["network"].embeddedObject();
+ _appendNet( result , "netIn" , diff( "bytesIn" , ax , bx ) );
+ _appendNet( result , "netOut" , diff( "bytesOut" , ax , bx ) );
+ }
+
+ _append( result , "conn" , 5 , b.getFieldDotted( "connections.current" ).numberInt() );
+
+ if ( b["repl"].type() == Object ) {
+
+ BSONObj x = b["repl"].embeddedObject();
+ bool isReplSet = x["setName"].type() == String;
+
+ stringstream ss;
+
+ if ( isReplSet ) {
+ string setName = x["setName"].String();
+ _append( result , "set" , setName.size() , setName );
+ }
+
+ if ( x["ismaster"].trueValue() )
+ ss << "PRI";
+ else if ( x["secondary"].trueValue() )
+ ss << "SEC";
+ else if ( x["isreplicaset"].trueValue() )
+ ss << "REC";
+ else if ( x["arbiterOnly"].trueValue() )
+ ss << "ARB";
+ else if ( _in( x["me"] , x["passives"] ) )
+ ss << "PSV";
+ else if ( isReplSet )
+ ss << "UNK";
+ else
+ ss << "SLV";
+
+ _append( result , "repl" , 4 , ss.str() );
+
+ }
+ else if ( isMongos ) {
+ _append( result , "repl" , 4 , "RTR" );
+ }
+
+ {
+ struct tm t;
+ time_t_to_Struct( time(0), &t , true );
+ stringstream temp;
+ temp << setfill('0') << setw(2) << t.tm_hour
+ << ":"
+ << setfill('0') << setw(2) << t.tm_min
+ << ":"
+ << setfill('0') << setw(2) << t.tm_sec;
+ _append( result , "time" , 10 , temp.str() );
+ }
+ return result.obj();
+ }
+
+
+
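+ // Percentage of the "val" delta relative to the "outof" delta, truncated to
+ // one decimal place: e.g. deltas of 123 and 1000 yield 12.3.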
+ double StatUtil::percent( const char * outof , const char * val , const BSONObj& a , const BSONObj& b ) {
+ double x = ( b.getFieldDotted( val ).number() - a.getFieldDotted( val ).number() );
+ double y = ( b.getFieldDotted( outof ).number() - a.getFieldDotted( outof ).number() );
+ if ( y == 0 )
+ return 0;
+ double p = x / y;
+ p = (double)((int)(p * 1000)) / 10;
+ return p;
+ }
+
+
+
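+ // Per-second rate of change of a (possibly dotted) numeric field between
+ // two samples; returns -1 when either sample is missing or non-numeric.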
+ double StatUtil::diff( const string& name , const BSONObj& a , const BSONObj& b ) {
+ BSONElement x = a.getFieldDotted( name.c_str() );
+ BSONElement y = b.getFieldDotted( name.c_str() );
+ if ( ! x.isNumber() || ! y.isNumber() )
+ return -1;
+ return ( y.number() - x.number() ) / _seconds;
+ }
+
+
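+ // Scales a bytes/sec rate through b/k/m/g with a divisor of 1000:
+ // e.g. 1,500,000 becomes "1m" after the integer truncation below.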
+ void StatUtil::_appendNet( BSONObjBuilder& result , const string& name , double diff ) {
+ // I think 1000 is correct for megabit, but I've seen conflicting things (ERH 11/2010)
+ const double div = 1000;
+
+ string unit = "b";
+
+ if ( diff >= div ) {
+ unit = "k";
+ diff /= div;
+ }
+
+ if ( diff >= div ) {
+ unit = "m";
+ diff /= div;
+ }
+
+ if ( diff >= div ) {
+ unit = "g";
+ diff /= div;
+ }
+
+ string out = str::stream() << (int)diff << unit;
+ _append( result , name , 6 , out );
+ }
+
+
+
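+ // Formats a size given in megabytes; values above 1024 are shown in
+ // gigabytes, e.g. 2048 -> "2g" and 512 -> "512m".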
+ void StatUtil::_appendMem( BSONObjBuilder& result , const string& name , unsigned width , double sz ) {
+ string unit = "m";
+ if ( sz > 1024 ) {
+ unit = "g";
+ sz /= 1024;
+ }
+
+ if ( sz >= 1000 ) {
+ string s = str::stream() << (int)sz << unit;
+ _append( result , name , width , s );
+ return;
+ }
+
+ stringstream ss;
+ ss << setprecision(3) << sz << unit;
+ _append( result , name , width , ss.str() );
+ }
+
+}
+
diff --git a/src/mongo/tools/stat_util.h b/src/mongo/tools/stat_util.h
new file mode 100644
index 00000000000..4990e91624c
--- /dev/null
+++ b/src/mongo/tools/stat_util.h
@@ -0,0 +1,78 @@
+// stat_util.h
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../pch.h"
+#include "../db/jsobj.h"
+
+namespace mongo {
+
+ /**
+ * static methods useful for computing status from serverStatus type things
+ */
+ class StatUtil {
+ public:
+ /**
+ * @param seconds - seconds between calls to serverStatus
+ * @param all - show all fields
+ */
+ StatUtil( double seconds = 1 , bool all = false );
+
+ /**
+ * @param a older serverStatus
+ * @param b newer serverStatus
+ */
+ BSONObj doRow( const BSONObj& a , const BSONObj& b );
+
+ double getSeconds() const { return _seconds; }
+ bool getAll() const { return _all; }
+
+ void setSeconds( double seconds ) { _seconds = seconds; }
+ void setAll( bool all ) { _all = all; }
+
+ private:
+
+
+ double percent( const char * outof , const char * val , const BSONObj& a , const BSONObj& b );
+
+ double diff( const string& name , const BSONObj& a , const BSONObj& b );
+
+ void _appendMem( BSONObjBuilder& result , const string& name , unsigned width , double sz );
+
+ void _appendNet( BSONObjBuilder& result , const string& name , double diff );
+
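+ /**
+ * appends { <name>: { width: <w>, data: <t> } }, widening the column
+ * to fit the name itself
+ */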
+ template<typename T>
+ void _append( BSONObjBuilder& result , const string& name , unsigned width , const T& t ) {
+ if ( name.size() > width )
+ width = name.size();
+ result.append( name , BSON( "width" << (int)width << "data" << t ) );
+ }
+
+ bool _in( const BSONElement& me , const BSONElement& arr );
+
+
+ // -------
+
+ double _seconds;
+ bool _all;
+
+ };
+
+}
+
diff --git a/src/mongo/tools/tool.cpp b/src/mongo/tools/tool.cpp
new file mode 100644
index 00000000000..dc08625a545
--- /dev/null
+++ b/src/mongo/tools/tool.cpp
@@ -0,0 +1,526 @@
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// Tool.cpp
+
+#include "tool.h"
+
+#include <iostream>
+
+#include <boost/filesystem/operations.hpp>
+#include "pcrecpp.h"
+
+#include "util/file_allocator.h"
+#include "util/password.h"
+#include "util/version.h"
+
+using namespace std;
+using namespace mongo;
+
+namespace po = boost::program_options;
+
+namespace mongo {
+
+ CmdLine cmdLine;
+
+ Tool::Tool( string name , DBAccess access , string defaultDB ,
+ string defaultCollection , bool usesstdout ) :
+ _name( name ) , _db( defaultDB ) , _coll( defaultCollection ) ,
+ _usesstdout(usesstdout), _noconnection(false), _autoreconnect(false), _conn(0), _slaveConn(0), _paired(false) {
+
+ _options = new po::options_description( "options" );
+ _options->add_options()
+ ("help","produce help message")
+ ("verbose,v", "be more verbose (include multiple times for more verbosity e.g. -vvvvv)")
+ ("version", "print the program's version and exit" )
+ ;
+
+ if ( access & REMOTE_SERVER )
+ _options->add_options()
+ ("host,h",po::value<string>(), "mongo host to connect to ( <set name>/s1,s2 for sets)" )
+ ("port",po::value<string>(), "server port. Can also use --host hostname:port" )
+ ("ipv6", "enable IPv6 support (disabled by default)")
+#ifdef MONGO_SSL
+ ("ssl", "use all for connections")
+#endif
+
+ ("username,u",po::value<string>(), "username" )
+ ("password,p", new PasswordValue( &_password ), "password" )
+ ;
+
+ if ( access & LOCAL_SERVER )
+ _options->add_options()
+ ("dbpath",po::value<string>(), "directly access mongod database "
+ "files in the given path, instead of connecting to a mongod "
+ "server - needs to lock the data directory, so cannot be "
+ "used if a mongod is currently accessing the same path" )
+ ("directoryperdb", "if dbpath specified, each db is in a separate directory" )
+ ("journal", "enable journaling" )
+ ;
+
+ if ( access & SPECIFY_DBCOL )
+ _options->add_options()
+ ("db,d",po::value<string>(), "database to use" )
+ ("collection,c",po::value<string>(), "collection to use (some commands)" )
+ ;
+
+ _hidden_options = new po::options_description( name + " hidden options" );
+
+ /* support for -vv -vvvv etc. */
+ for (string s = "vv"; s.length() <= 10; s.append("v")) {
+ _hidden_options->add_options()(s.c_str(), "verbose");
+ }
+ }
+
+ Tool::~Tool() {
+ delete( _options );
+ delete( _hidden_options );
+ if ( _conn )
+ delete _conn;
+ }
+
+ void Tool::printHelp(ostream &out) {
+ printExtraHelp(out);
+ _options->print(out);
+ printExtraHelpAfter(out);
+ }
+
+ void Tool::printVersion(ostream &out) {
+ out << _name << " version " << mongo::versionString;
+ if (mongo::versionString[strlen(mongo::versionString)-1] == '-')
+ out << " (commit " << mongo::gitVersion() << ")";
+ out << endl;
+ }
+ int Tool::main( int argc , char ** argv ) {
+ static StaticObserver staticObserver;
+
+ cmdLine.prealloc = false;
+
+ // The default value may vary depending on compile options, but for tools
+ // we want durability to be disabled.
+ cmdLine.dur = false;
+
+#if( BOOST_VERSION >= 104500 )
+ boost::filesystem::path::default_name_check( boost::filesystem2::no_check );
+#else
+ boost::filesystem::path::default_name_check( boost::filesystem::no_check );
+#endif
+
+ _name = argv[0];
+
+ /* using the same style as db.cpp */
+ int command_line_style = (((po::command_line_style::unix_style ^
+ po::command_line_style::allow_guessing) |
+ po::command_line_style::allow_long_disguise) ^
+ po::command_line_style::allow_sticky);
+ try {
+ po::options_description all_options("all options");
+ all_options.add(*_options).add(*_hidden_options);
+
+ po::store( po::command_line_parser( argc , argv ).
+ options(all_options).
+ positional( _positonalOptions ).
+ style(command_line_style).run() , _params );
+
+ po::notify( _params );
+ }
+ catch (po::error &e) {
+ cerr << "ERROR: " << e.what() << endl << endl;
+ printHelp(cerr);
+ return EXIT_BADOPTIONS;
+ }
+
+ // hide password from ps output
+ for (int i=0; i < (argc-1); ++i) {
+ if (!strcmp(argv[i], "-p") || !strcmp(argv[i], "--password")) {
+ char* arg = argv[i+1];
+ while (*arg) {
+ *arg++ = 'x';
+ }
+ }
+ }
+
+ if ( _params.count( "help" ) ) {
+ printHelp(cout);
+ return 0;
+ }
+
+ if ( _params.count( "version" ) ) {
+ printVersion(cout);
+ return 0;
+ }
+
+ if ( _params.count( "verbose" ) ) {
+ logLevel = 1;
+ }
+
+ for (string s = "vv"; s.length() <= 10; s.append("v")) {
+ if (_params.count(s)) {
+ logLevel = s.length();
+ }
+ }
+
+
+#ifdef MONGO_SSL
+ if (_params.count("ssl")) {
+ mongo::cmdLine.sslOnNormalPorts = true;
+ }
+#endif
+
+ preSetup();
+
+ bool useDirectClient = hasParam( "dbpath" );
+
+ if ( ! useDirectClient ) {
+ _host = "127.0.0.1";
+ if ( _params.count( "host" ) )
+ _host = _params["host"].as<string>();
+
+ if ( _params.count( "port" ) )
+ _host += ':' + _params["port"].as<string>();
+
+ if ( _noconnection ) {
+ // do nothing
+ }
+ else {
+ string errmsg;
+
+ ConnectionString cs = ConnectionString::parse( _host , errmsg );
+ if ( ! cs.isValid() ) {
+ cerr << "invalid hostname [" << _host << "] " << errmsg << endl;
+ return -1;
+ }
+
+ _conn = cs.connect( errmsg );
+ if ( ! _conn ) {
+ cerr << "couldn't connect to [" << _host << "] " << errmsg << endl;
+ return -1;
+ }
+
+ (_usesstdout ? cout : cerr ) << "connected to: " << _host << endl;
+ }
+
+ }
+ else {
+ if ( _params.count( "directoryperdb" ) ) {
+ directoryperdb = true;
+ }
+ assert( lastError.get( true ) );
+
+ if (_params.count("journal")){
+ cmdLine.dur = true;
+ }
+
+ Client::initThread("tools");
+ _conn = new DBDirectClient();
+ _host = "DIRECT";
+ static string myDbpath = getParam( "dbpath" );
+ dbpath = myDbpath.c_str();
+ try {
+ acquirePathLock();
+ }
+ catch ( DBException& ) {
+ cerr << endl << "If you are running a mongod on the same "
+ "path you should connect to that instead of direct data "
+ "file access" << endl << endl;
+ dbexit( EXIT_CLEAN );
+ return -1;
+ }
+
+ FileAllocator::get()->start();
+
+ dur::startup();
+ }
+
+ if ( _params.count( "db" ) )
+ _db = _params["db"].as<string>();
+
+ if ( _params.count( "collection" ) )
+ _coll = _params["collection"].as<string>();
+
+ if ( _params.count( "username" ) )
+ _username = _params["username"].as<string>();
+
+ if ( _params.count( "password" )
+ && ( _password.empty() ) ) {
+ _password = askPassword();
+ }
+
+ if (_params.count("ipv6"))
+ enableIPv6();
+
+ int ret = -1;
+ try {
+ ret = run();
+ }
+ catch ( DBException& e ) {
+ cerr << "assertion: " << e.toString() << endl;
+ ret = -1;
+ }
+ catch(const boost::filesystem::filesystem_error &fse) {
+ /*
+ https://jira.mongodb.org/browse/SERVER-2904
+
+ Simple tools that don't access the database, such as
+ bsondump, aren't throwing DBExceptions, but are throwing
+ boost exceptions.
+
+ The currently available set of error codes don't seem to match
+ boost documentation. boost::filesystem::not_found_error
+ (from http://www.boost.org/doc/libs/1_31_0/libs/filesystem/doc/exception.htm)
+ doesn't seem to exist in our headers. Also, fse.code() isn't
+ boost::system::errc::no_such_file_or_directory when this
+ happens, as you would expect. We also determined experimentally
+ that the command-line argument gets turned into "\\?" instead of
+ "/?" !!!
+ */
+#if defined(_WIN32)
+ if (/*(fse.code() == boost::system::errc::no_such_file_or_directory) &&*/
+ (fse.path1() == "\\?"))
+ printHelp(cerr);
+ else
+#endif // _WIN32
+ cerr << "error: " << fse.what() << endl;
+
+ ret = -1;
+ }
+
+ if ( currentClient.get() )
+ currentClient.get()->shutdown();
+
+ if ( useDirectClient )
+ dbexit( EXIT_CLEAN );
+ return ret;
+ }
+
+ DBClientBase& Tool::conn( bool slaveIfPaired ) {
+ if ( slaveIfPaired && _conn->type() == ConnectionString::SET ) {
+ if (!_slaveConn)
+ _slaveConn = &((DBClientReplicaSet*)_conn)->slaveConn();
+ return *_slaveConn;
+ }
+ return *_conn;
+ }
+
+ bool Tool::isMaster() {
+ if ( hasParam("dbpath") ) {
+ return true;
+ }
+
+ BSONObj info;
+ bool isMaster;
+ bool ok = conn().isMaster(isMaster, &info);
+
+ if (ok && !isMaster) {
+ cerr << "ERROR: trying to write to non-master " << conn().toString() << endl;
+ cerr << "isMaster info: " << info << endl;
+ return false;
+ }
+
+ return true;
+ }
+
+ bool Tool::isMongos() {
+ // TODO: when mongos supports QueryOption_Exhaust add a version check (SERVER-2628)
+ BSONObj isdbgrid;
+ conn("true").simpleCommand("admin", &isdbgrid, "isdbgrid");
+ return isdbgrid["isdbgrid"].trueValue();
+ }
+
+ void Tool::addFieldOptions() {
+ add_options()
+ ("fields,f" , po::value<string>() , "comma separated list of field names e.g. -f name,age" )
+ ("fieldFile" , po::value<string>() , "file with fields names - 1 per line" )
+ ;
+ }
+
+ void Tool::needFields() {
+
+ if ( hasParam( "fields" ) ) {
+ BSONObjBuilder b;
+
+ string fields_arg = getParam("fields");
+ pcrecpp::StringPiece input(fields_arg);
+
+ string f;
+ pcrecpp::RE re("([#\\w\\.\\s\\-]+),?" );
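+ // e.g. "-f name,age" consumes "name" then "age", yielding
+ // _fieldsObj == { name: 1, age: 1 }.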
+ while ( re.Consume( &input, &f ) ) {
+ _fields.push_back( f );
+ b.append( f , 1 );
+ }
+
+ _fieldsObj = b.obj();
+ return;
+ }
+
+ if ( hasParam( "fieldFile" ) ) {
+ string fn = getParam( "fieldFile" );
+ if ( ! exists( fn ) )
+ throw UserException( 9999 , ((string)"file: " + fn ) + " doesn't exist" );
+
+ const int BUF_SIZE = 1024;
+ char line[ BUF_SIZE + 128 ];
+ ifstream file( fn.c_str() );
+
+ BSONObjBuilder b;
+ while ( file.rdstate() == ios_base::goodbit ) {
+ file.getline( line , BUF_SIZE );
+ const char * cur = line;
+ while ( isspace( cur[0] ) ) cur++;
+ if ( cur[0] == '\0' )
+ continue;
+
+ _fields.push_back( cur );
+ b.append( cur , 1 );
+ }
+ _fieldsObj = b.obj();
+ return;
+ }
+
+ throw UserException( 9998 , "you need to specify fields" );
+ }
+
+ void Tool::auth( string dbname ) {
+ if ( ! dbname.size() )
+ dbname = _db;
+
+ if ( ! ( _username.size() || _password.size() ) ) {
+ // Make sure that we don't need authentication to connect to this db
+ // findOne throws an AssertionException if it's not authenticated.
+ if (_coll.size() > 0) {
+ // BSONTools don't have a collection
+ conn().findOne(getNS(), Query("{}"), 0, QueryOption_SlaveOk);
+ }
+ return;
+ }
+
+ string errmsg;
+ if ( _conn->auth( dbname , _username , _password , errmsg ) )
+ return;
+
+ // try against the admin db
+ string err2;
+ if ( _conn->auth( "admin" , _username , _password , errmsg ) )
+ return;
+
+ throw UserException( 9997 , (string)"authentication failed: " + errmsg );
+ }
+
+ BSONTool::BSONTool( const char * name, DBAccess access , bool objcheck )
+ : Tool( name , access , "" , "" , false ) , _objcheck( objcheck ) {
+
+ add_options()
+ ("objcheck" , "validate object before inserting" )
+ ("filter" , po::value<string>() , "filter to apply before inserting" )
+ ;
+ }
+
+
+ int BSONTool::run() {
+ _objcheck = hasParam( "objcheck" );
+
+ if ( hasParam( "filter" ) )
+ _matcher.reset( new Matcher( fromjson( getParam( "filter" ) ) ) );
+
+ return doRun();
+ }
+
+ long long BSONTool::processFile( const path& root ) {
+ _fileName = root.string();
+
+ unsigned long long fileLength = file_size( root );
+
+ if ( fileLength == 0 ) {
+ out() << "file " << _fileName << " empty, skipping" << endl;
+ return 0;
+ }
+
+
+ FILE* file = fopen( _fileName.c_str() , "rb" );
+ if ( ! file ) {
+ log() << "error opening file: " << _fileName << " " << errnoWithDescription() << endl;
+ return 0;
+ }
+
+#if !defined(__sunos__) && defined(POSIX_FADV_SEQUENTIAL)
+ posix_fadvise(fileno(file), 0, fileLength, POSIX_FADV_SEQUENTIAL);
+#endif
+
+ log(1) << "\t file size: " << fileLength << endl;
+
+ unsigned long long read = 0;
+ unsigned long long num = 0;
+ unsigned long long processed = 0;
+
+ const int BUF_SIZE = BSONObjMaxUserSize + ( 1024 * 1024 );
+ boost::scoped_array<char> buf_holder(new char[BUF_SIZE]);
+ char * buf = buf_holder.get();
+
+ ProgressMeter m( fileLength );
+ m.setUnits( "bytes" );
+
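+ // Each BSON document is length-prefixed: read the 4-byte size first, then
+ // the remainder of the document into the same buffer.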
+ while ( read < fileLength ) {
+ size_t amt = fread(buf, 1, 4, file);
+ assert( amt == 4 );
+
+ int size = ((int*)buf)[0];
+ uassert( 10264 , str::stream() << "invalid object size: " << size , size < BUF_SIZE );
+
+ amt = fread(buf+4, 1, size-4, file);
+ assert( amt == (size_t)( size - 4 ) );
+
+ BSONObj o( buf );
+ if ( _objcheck && ! o.valid() ) {
+ cerr << "INVALID OBJECT - going try and pring out " << endl;
+ cerr << "size: " << size << endl;
+ BSONObjIterator i(o);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ try {
+ e.validate();
+ }
+ catch ( ... ) {
+ cerr << "\t\t NEXT ONE IS INVALID" << endl;
+ }
+ cerr << "\t name : " << e.fieldName() << " " << e.type() << endl;
+ cerr << "\t " << e << endl;
+ }
+ }
+
+ if ( _matcher.get() == 0 || _matcher->matches( o ) ) {
+ gotObject( o );
+ processed++;
+ }
+
+ read += o.objsize();
+ num++;
+
+ m.hit( o.objsize() );
+ }
+
+ fclose( file );
+
+ uassert( 10265 , "counts don't match" , m.done() == fileLength );
+ (_usesstdout ? cout : cerr ) << m.hits() << " objects found" << endl;
+ if ( _matcher.get() )
+ (_usesstdout ? cout : cerr ) << processed << " objects processed" << endl;
+ return processed;
+ }
+
+
+
+ void setupSignals( bool inFork ) {}
+}
diff --git a/src/mongo/tools/tool.h b/src/mongo/tools/tool.h
new file mode 100644
index 00000000000..e40109362c5
--- /dev/null
+++ b/src/mongo/tools/tool.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// Tool.h
+
+#pragma once
+
+#include <string>
+
+#include <boost/program_options.hpp>
+
+#if defined(_WIN32)
+#include <io.h>
+#endif
+
+#include "client/dbclient.h"
+#include "db/instance.h"
+#include "db/matcher.h"
+
+using std::string;
+
+namespace mongo {
+
+ class Tool {
+ public:
+ enum DBAccess {
+ NONE = 0 ,
+ REMOTE_SERVER = 1 << 1 ,
+ LOCAL_SERVER = 1 << 2 ,
+ SPECIFY_DBCOL = 1 << 3 ,
+ ALL = REMOTE_SERVER | LOCAL_SERVER | SPECIFY_DBCOL
+ };
+
+ Tool( string name , DBAccess access=ALL, string defaultDB="test" ,
+ string defaultCollection="", bool usesstdout=true);
+ virtual ~Tool();
+
+ int main( int argc , char ** argv );
+
+ boost::program_options::options_description_easy_init add_options() {
+ return _options->add_options();
+ }
+ boost::program_options::options_description_easy_init add_hidden_options() {
+ return _hidden_options->add_options();
+ }
+ void addPositionArg( const char * name , int pos ) {
+ _positonalOptions.add( name , pos );
+ }
+
+ string getParam( string name , string def="" ) {
+ if ( _params.count( name ) )
+ return _params[name.c_str()].as<string>();
+ return def;
+ }
+ int getParam( string name , int def ) {
+ if ( _params.count( name ) )
+ return _params[name.c_str()].as<int>();
+ return def;
+ }
+ bool hasParam( string name ) {
+ return _params.count( name );
+ }
+
+ string getNS() {
+ if ( _coll.size() == 0 ) {
+ cerr << "no collection specified!" << endl;
+ throw -1;
+ }
+ return _db + "." + _coll;
+ }
+
+ void useStandardOutput( bool mode ) {
+ _usesstdout = mode;
+ }
+
+ bool isMaster();
+ bool isMongos();
+
+ virtual void preSetup() {}
+
+ virtual int run() = 0;
+
+ virtual void printHelp(ostream &out);
+
+ virtual void printExtraHelp( ostream & out ) {}
+ virtual void printExtraHelpAfter( ostream & out ) {}
+
+ virtual void printVersion(ostream &out);
+
+ protected:
+
+ mongo::DBClientBase &conn( bool slaveIfPaired = false );
+ void auth( string db = "" );
+
+ string _name;
+
+ string _db;
+ string _coll;
+ string _fileName;
+
+ string _username;
+ string _password;
+
+ bool _usesstdout;
+ bool _noconnection;
+ bool _autoreconnect;
+
+ void addFieldOptions();
+ void needFields();
+
+ vector<string> _fields;
+ BSONObj _fieldsObj;
+
+
+ string _host;
+
+ protected:
+
+ mongo::DBClientBase * _conn;
+ mongo::DBClientBase * _slaveConn;
+ bool _paired;
+
+ boost::program_options::options_description * _options;
+ boost::program_options::options_description * _hidden_options;
+ boost::program_options::positional_options_description _positonalOptions;
+
+ boost::program_options::variables_map _params;
+
+ };
+
+ class BSONTool : public Tool {
+ bool _objcheck;
+ auto_ptr<Matcher> _matcher;
+
+ public:
+ BSONTool( const char * name , DBAccess access=ALL, bool objcheck = false );
+
+ virtual int doRun() = 0;
+ virtual void gotObject( const BSONObj& obj ) = 0;
+
+ virtual int run();
+
+ long long processFile( const path& file );
+
+ };
+
+}
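+
+// Illustrative sketch, not part of this change: a minimal Tool subclass.
+// The class name and option below are hypothetical; real tools follow the same shape.
+//
+// class HelloTool : public mongo::Tool {
+// public:
+//     HelloTool() : Tool( "hello" ) {
+//         add_options()
+//         ( "greeting" , boost::program_options::value<string>() , "text to print" )
+//         ;
+//     }
+//     virtual int run() {
+//         cout << getParam( "greeting" , "hi" ) << endl;
+//         return 0;
+//     }
+// };
+//
+// int main( int argc , char ** argv ) {
+//     HelloTool t;
+//     return t.main( argc , argv ); // parses options, connects, then calls run()
+// }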
diff --git a/src/mongo/tools/top.cpp b/src/mongo/tools/top.cpp
new file mode 100644
index 00000000000..6479bb2bd7a
--- /dev/null
+++ b/src/mongo/tools/top.cpp
@@ -0,0 +1,200 @@
+// top.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "client/dbclient.h"
+#include "db/json.h"
+#include "../util/text.h"
+#include "tool.h"
+#include <fstream>
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace po = boost::program_options;
+
+namespace mongo {
+
+ class TopTool : public Tool {
+ public:
+
+ TopTool() : Tool( "top" , REMOTE_SERVER , "admin" ) {
+ _sleep = 1;
+
+ add_hidden_options()
+ ( "sleep" , po::value<int>() , "time to sleep between calls" )
+ ;
+ addPositionArg( "sleep" , 1 );
+
+ _autoreconnect = true;
+ }
+
+ virtual void printExtraHelp( ostream & out ) {
+ out << "View live MongoDB collection statistics.\n" << endl;
+ }
+
+ BSONObj getData() {
+ BSONObj out;
+ if ( ! conn().simpleCommand( _db , &out , "top" ) ) {
+ cout << "error: " << out << endl;
+ return BSONObj();
+ }
+ return out.getOwned();
+ }
+
+ void printDiff( BSONObj prev , BSONObj now ) {
+ if ( ! prev["totals"].isABSONObj() ||
+ ! now["totals"].isABSONObj() ) {
+ cout << "." << endl;
+ return;
+ }
+
+ prev = prev["totals"].Obj();
+ now = now["totals"].Obj();
+
+ vector<NSInfo> data;
+
+ unsigned longest = 30;
+
+ BSONObjIterator i( now );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+
+ // invalid data (field names starting with '?'); fixed in 1.8.0
+ if ( e.fieldName()[0] == '?' )
+ continue;
+
+ if ( ! str::contains( e.fieldName() , '.' ) )
+ continue;
+
+ BSONElement old = prev[e.fieldName()];
+ if ( old.eoo() )
+ continue;
+
+ if ( strlen( e.fieldName() ) > longest )
+ longest = strlen(e.fieldName());
+
+ data.push_back( NSInfo( e.fieldName() , old.Obj() , e.Obj() ) );
+ }
+
+ std::sort( data.begin() , data.end() );
+
+ cout << "\n"
+ << setw(longest) << "ns"
+ << "\ttotal "
+ << "\tread "
+ << "\twrite "
+ << "\t\t" << terseCurrentTime()
+ << endl;
+ for ( int i=data.size()-1; i>=0 && data.size() - i < 10 ; i-- ) {
+ cout << setw(longest) << data[i].ns
+ << "\t" << setprecision(3) << data[i].diffTimeMS( "total" ) << "ms"
+ << "\t" << setprecision(3) << data[i].diffTimeMS( "readLock" ) << "ms"
+ << "\t" << setprecision(3) << data[i].diffTimeMS( "writeLock" ) << "ms"
+ << endl;
+ }
+ }
+
+ int run() {
+ _sleep = getParam( "sleep" , _sleep );
+
+ BSONObj prev = getData();
+
+ while ( true ) {
+ sleepsecs( _sleep );
+
+ BSONObj now;
+ try {
+ now = getData();
+ }
+ catch ( std::exception& e ) {
+ cout << "can't get data: " << e.what() << endl;
+ continue;
+ }
+
+ if ( now.isEmpty() )
+ return -2;
+
+ try {
+ printDiff( prev , now );
+ }
+ catch ( AssertionException& e ) {
+ cout << "\nerror: " << e.what() << "\n"
+ << now
+ << endl;
+ }
+
+
+ prev = now;
+ }
+
+ return 0;
+ }
+
+ struct NSInfo {
+ NSInfo( string thens , BSONObj a , BSONObj b ) {
+ ns = thens;
+ prev = a;
+ cur = b;
+
+ timeDiff = diffTime( "total" );
+ }
+
+
+ int diffTimeMS( const char * field ) const {
+ return (int)(diffTime( field ) / 1000);
+ }
+
+ double diffTime( const char * field ) const {
+ return diff( field , "time" );
+ }
+
+ double diffCount( const char * field ) const {
+ return diff( field , "count" );
+ }
+
+ /**
+ * @param field total,readLock, etc...
+ * @param type time or count
+ */
+ double diff( const char * field , const char * type ) const {
+ return cur[field].Obj()[type].number() - prev[field].Obj()[type].number();
+ }
+
+ bool operator<(const NSInfo& r) const {
+ return timeDiff < r.timeDiff;
+ }
+
+ string ns;
+
+ BSONObj prev;
+ BSONObj cur;
+
+ double timeDiff; // time diff between prev and cur
+ };
+
+ private:
+ int _sleep;
+ };
+
+}
+
+int main( int argc , char ** argv ) {
+ mongo::TopTool top;
+ return top.main( argc , argv );
+}
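+
+// usage sketch, not part of this change (binary name assumed): "mongotop 5"
+// polls the server every 5 seconds; with no argument the interval is 1 second.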
+
diff --git a/src/mongo/util/admin_access.h b/src/mongo/util/admin_access.h
new file mode 100644
index 00000000000..bb882b2b4c5
--- /dev/null
+++ b/src/mongo/util/admin_access.h
@@ -0,0 +1,52 @@
+/** @file admin_access.h
+ */
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ /*
+ * An AdminAccess is an interface used to determine whether a given user has
+ * privileges to a given resource.
+ *
+ */
+ class AdminAccess {
+ public:
+ virtual ~AdminAccess() { }
+
+ /** @return true if there are any privileged users. This should not
+ * block for long, and should throw if a needed lock cannot be acquired.
+ */
+ virtual bool haveAdminUsers() const = 0;
+
+ /** @return the privileged user with this name. This should not block
+ * for long, and should throw if a needed lock cannot be acquired.
+ */
+ virtual BSONObj getAdminUser( const string& username ) const = 0;
+ };
+
+ class NoAdminAccess : public AdminAccess {
+ public:
+ virtual ~NoAdminAccess() { }
+
+ virtual bool haveAdminUsers() const { return false; }
+ virtual BSONObj getAdminUser( const string& username ) const { return BSONObj(); }
+ };
+
+} // namespace mongo
diff --git a/src/mongo/util/alignedbuilder.cpp b/src/mongo/util/alignedbuilder.cpp
new file mode 100644
index 00000000000..b2e0461b733
--- /dev/null
+++ b/src/mongo/util/alignedbuilder.cpp
@@ -0,0 +1,141 @@
+// @file alignedbuilder.cpp
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "alignedbuilder.h"
+
+namespace mongo {
+
+ AlignedBuilder::AlignedBuilder(unsigned initSize) {
+ _len = 0;
+ _malloc(initSize);
+ uassert(13584, "out of memory AlignedBuilder", _p._allocationAddress);
+ }
+
+ BOOST_STATIC_ASSERT(sizeof(void*) == sizeof(size_t));
+
+ /** reset for a re-use. shrinks if > 128MB */
+ void AlignedBuilder::reset() {
+ _len = 0;
+ RARELY {
+ const unsigned sizeCap = 128*1024*1024;
+ if (_p._size > sizeCap)
+ _realloc(sizeCap, _len);
+ }
+ }
+
+ /** reset with a hint as to the upcoming needed size specified */
+ void AlignedBuilder::reset(unsigned sz) {
+ _len = 0;
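+ // round the requested size up to the next 32MB boundary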
+ unsigned Q = 32 * 1024 * 1024 - 1;
+ unsigned want = (sz+Q) & (~Q);
+ if( _p._size == want ) {
+ return;
+ }
+ if( _p._size > want ) {
+ if( _p._size <= 64 * 1024 * 1024 )
+ return;
+ bool downsize = false;
+ RARELY { downsize = true; }
+ if( !downsize )
+ return;
+ }
+ _realloc(want, _len);
+ }
+
+ void AlignedBuilder::mallocSelfAligned(unsigned sz) {
+ assert( sz == _p._size );
+ void *p = malloc(sz + Alignment - 1);
+ _p._allocationAddress = p;
+ size_t s = (size_t) p;
+ size_t sold = s;
+ s += Alignment - 1;
+ s = (s/Alignment)*Alignment;
+ assert( s >= sold ); // beginning
+ assert( (s + sz) <= (sold + sz + Alignment - 1) ); // end
+ _p._data = (char *) s;
+ }
+
+ /* "slow"/infrequent portion of 'grow()' */
+ void NOINLINE_DECL AlignedBuilder::growReallocate(unsigned oldLen) {
+ dassert( _len > _p._size );
+ unsigned a = _p._size;
+ assert( a );
+ while( 1 ) {
+ if( a < 128 * 1024 * 1024 )
+ a *= 2;
+ else if( sizeof(int*) == 4 )
+ a += 32 * 1024 * 1024;
+ else
+ a += 64 * 1024 * 1024;
+ DEV if( a > 256*1024*1024 ) {
+ log() << "dur AlignedBuilder too big, aborting in _DEBUG build" << endl;
+ abort();
+ }
+ wassert( a <= 256*1024*1024 );
+ assert( a <= 512*1024*1024 );
+ if( _len < a )
+ break;
+ }
+ _realloc(a, oldLen);
+ }
+
+ void AlignedBuilder::_malloc(unsigned sz) {
+ _p._size = sz;
+#if defined(_WIN32)
+ void *p = VirtualAlloc(0, sz, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
+ _p._allocationAddress = p;
+ _p._data = (char *) p;
+#elif defined(__linux__)
+ // in theory #ifdef _POSIX_VERSION should work, but it doesn't on OS X 10.4, and needs to be tested on Solaris.
+ // so for now, Linux only for this.
+ void *p = 0;
+ int res = posix_memalign(&p, Alignment, sz);
+ massert(13524, "out of memory AlignedBuilder", res == 0);
+ _p._allocationAddress = p;
+ _p._data = (char *) p;
+#else
+ mallocSelfAligned(sz);
+ assert( ((size_t) _p._data) % Alignment == 0 );
+#endif
+ }
+
+ void AlignedBuilder::_realloc(unsigned newSize, unsigned oldLen) {
+ // posix_memalign alignment is not maintained on reallocs, so we can't use realloc().
+ AllocationInfo old = _p;
+ _malloc(newSize);
+ assert( oldLen <= _len );
+ memcpy(_p._data, old._data, oldLen);
+ _free(old._allocationAddress);
+ }
+
+ void AlignedBuilder::_free(void *p) {
+#if defined(_WIN32)
+ VirtualFree(p, 0, MEM_RELEASE);
+#else
+ free(p);
+#endif
+ }
+
+ void AlignedBuilder::kill() {
+ _free(_p._allocationAddress);
+ _p._allocationAddress = 0;
+ _p._data = 0;
+ }
+
+}
diff --git a/src/mongo/util/alignedbuilder.h b/src/mongo/util/alignedbuilder.h
new file mode 100644
index 00000000000..1d246a9d78e
--- /dev/null
+++ b/src/mongo/util/alignedbuilder.h
@@ -0,0 +1,125 @@
+// @file alignedbuilder.h
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../bson/stringdata.h"
+
+namespace mongo {
+
+ /** a page-aligned BufBuilder. */
+ class AlignedBuilder {
+ public:
+ AlignedBuilder(unsigned init_size);
+ ~AlignedBuilder() { kill(); }
+
+ /** reset with a hint as to the upcoming needed size specified */
+ void reset(unsigned sz);
+
+ /** reset for a re-use. shrinks if > 128MB */
+ void reset();
+
+ /** note this may be deallocated (realloced) if you keep writing or reset(). */
+ const char* buf() const { return _p._data; }
+
+ /** leave room for some stuff later
+ @return offset in the buffer that was our current position
+ */
+ size_t skip(unsigned n) {
+ unsigned l = len();
+ grow(n);
+ return l;
+ }
+
+ /** if buffer grows pointer no longer valid */
+ char* atOfs(unsigned ofs) { return _p._data + ofs; }
+
+ /** if buffer grows pointer no longer valid */
+ char* cur() { return _p._data + _len; }
+
+ void appendChar(char j) {
+ *((char*)grow(sizeof(char))) = j;
+ }
+ void appendNum(char j) {
+ *((char*)grow(sizeof(char))) = j;
+ }
+ void appendNum(short j) {
+ *((short*)grow(sizeof(short))) = j;
+ }
+ void appendNum(int j) {
+ *((int*)grow(sizeof(int))) = j;
+ }
+ void appendNum(unsigned j) {
+ *((unsigned*)grow(sizeof(unsigned))) = j;
+ }
+ void appendNum(bool j) {
+ *((bool*)grow(sizeof(bool))) = j;
+ }
+ void appendNum(double j) {
+ *((double*)grow(sizeof(double))) = j;
+ }
+ void appendNum(long long j) {
+ *((long long*)grow(sizeof(long long))) = j;
+ }
+ void appendNum(unsigned long long j) {
+ *((unsigned long long*)grow(sizeof(unsigned long long))) = j;
+ }
+
+ void appendBuf(const void *src, size_t len) { memcpy(grow((unsigned) len), src, len); }
+
+ template<class T>
+ void appendStruct(const T& s) { appendBuf(&s, sizeof(T)); }
+
+ void appendStr(const StringData &str , bool includeEOO = true ) {
+ const unsigned len = str.size() + ( includeEOO ? 1 : 0 );
+ assert( len < (unsigned) BSONObjMaxUserSize );
+ memcpy(grow(len), str.data(), len);
+ }
+
+ /** @return the in-use length */
+ unsigned len() const { return _len; }
+
+ private:
+ static const unsigned Alignment = 8192;
+
+ /** returns the pre-grow write position */
+ inline char* grow(unsigned by) {
+ unsigned oldlen = _len;
+ _len += by;
+ if (MONGO_unlikely( _len > _p._size )) {
+ growReallocate(oldlen);
+ }
+ return _p._data + oldlen;
+ }
+
+ void growReallocate(unsigned oldLenInUse);
+ void kill();
+ void mallocSelfAligned(unsigned sz);
+ void _malloc(unsigned sz);
+ void _realloc(unsigned newSize, unsigned oldLenInUse);
+ void _free(void*);
+
+ struct AllocationInfo {
+ char *_data;
+ void *_allocationAddress;
+ unsigned _size;
+ } _p;
+ unsigned _len; // bytes in use
+ };
+
+}
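+
+// Illustrative usage sketch, not part of this change (writeToDisk is hypothetical):
+//
+// AlignedBuilder ab( 8192 );
+// ab.appendNum( (unsigned long long) 12345 );
+// ab.appendStr( "journal-entry" );       // includes the trailing NUL by default
+// writeToDisk( ab.buf() , ab.len() );    // buf() is Alignment (8192-byte) aligned
+// ab.reset();                            // reuse the buffer for the next batch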
diff --git a/src/mongo/util/allocator.h b/src/mongo/util/allocator.h
new file mode 100644
index 00000000000..a642e7cab56
--- /dev/null
+++ b/src/mongo/util/allocator.h
@@ -0,0 +1,39 @@
+// allocator.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace mongo {
+
+ inline void * ourmalloc(size_t size) {
+ void *x = malloc(size);
+ if ( x == 0 ) dbexit( EXIT_OOM_MALLOC , "malloc fails");
+ return x;
+ }
+
+ inline void * ourrealloc(void *ptr, size_t size) {
+ void *x = realloc(ptr, size);
+ if ( x == 0 ) dbexit( EXIT_OOM_REALLOC , "realloc fails");
+ return x;
+ }
+
+#define MONGO_malloc mongo::ourmalloc
+#define malloc MONGO_malloc
+#define MONGO_realloc mongo::ourrealloc
+#define realloc MONGO_realloc
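+
+// note: any translation unit that includes this header transparently gets the
+// OOM-checked versions; a failed allocation terminates via dbexit() rather
+// than returning NULL.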
+
+} // namespace mongo
diff --git a/src/mongo/util/array.h b/src/mongo/util/array.h
new file mode 100644
index 00000000000..12822252fd7
--- /dev/null
+++ b/src/mongo/util/array.h
@@ -0,0 +1,127 @@
+// array.h
+
+/*
+ * Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace mongo {
+
+ /*
+ * simple array class that allocates its fixed buffer once and never reallocates
+ * same API as vector
+ * asserts if capacity is exceeded
+ * meant to be re-used with clear()
+ */
+ template<typename T>
+ class FastArray {
+ public:
+ FastArray( int capacity=10000 )
+ : _capacity( capacity ) , _size(0) , _end(this,capacity) {
+ _data = new T[capacity];
+ }
+
+ ~FastArray() {
+ delete[] _data;
+ }
+
+ void clear() {
+ _size = 0;
+ }
+
+ T& operator[]( int x ) {
+ assert( x >= 0 && x < _capacity );
+ return _data[x];
+ }
+
+ T& getNext() {
+ return _data[_size++];
+ }
+
+ void push_back( const T& t ) {
+ assert( _size < _capacity );
+ _data[_size++] = t;
+ }
+
+ void sort( int (*comp)(const void *, const void *) ) {
+ qsort( _data , _size , sizeof(T) , comp );
+ }
+
+ int size() {
+ return _size;
+ }
+
+ bool hasSpace() {
+ return _size < _capacity;
+ }
+ class iterator {
+ public:
+ iterator() {
+ _it = 0;
+ _pos = 0;
+ }
+
+ iterator( FastArray * it , int pos=0 ) {
+ _it = it;
+ _pos = pos;
+ }
+
+ bool operator==(const iterator& other ) const {
+ return _pos == other._pos;
+ }
+
+ bool operator!=(const iterator& other ) const {
+ return _pos != other._pos;
+ }
+
+ void operator++() {
+ _pos++;
+ }
+
+ T& operator*() {
+ return _it->_data[_pos];
+ }
+
+ string toString() const {
+ stringstream ss;
+ ss << _pos;
+ return ss.str();
+ }
+ private:
+ FastArray * _it;
+ int _pos;
+
+ friend class FastArray;
+ };
+
+
+ iterator begin() {
+ return iterator(this);
+ }
+
+ iterator end() {
+ _end._pos = _size;
+ return _end;
+ }
+
+
+ private:
+ int _capacity;
+ int _size;
+
+ iterator _end;
+
+ T * _data;
+ };
+}
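+
+// Illustrative usage sketch, not part of this change:
+//
+// FastArray<int> a( 4 );      // capacity fixed at 4; push_back asserts past that
+// a.push_back( 7 );
+// a.push_back( 3 );
+// for ( FastArray<int>::iterator i = a.begin(); i != a.end(); ++i )
+//     cout << *i << endl;
+// a.clear();                  // size back to 0; the buffer is kept for reuse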
diff --git a/src/mongo/util/assert_util.cpp b/src/mongo/util/assert_util.cpp
new file mode 100644
index 00000000000..2199cb1ce11
--- /dev/null
+++ b/src/mongo/util/assert_util.cpp
@@ -0,0 +1,213 @@
+// assert_util.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "assert_util.h"
+#include "assert.h"
+//#include "file.h"
+#include <cmath>
+using namespace std;
+
+#ifndef _WIN32
+#include <cxxabi.h>
+#include <sys/file.h>
+#endif
+
+//#include "../bson/bson.h"
+#include "../db/jsobj.h"
+
+namespace mongo {
+
+ AssertionCount assertionCount;
+
+ AssertionCount::AssertionCount()
+ : regular(0),warning(0),msg(0),user(0),rollovers(0) {
+ }
+
+ void AssertionCount::rollover() {
+ rollovers++;
+ regular = 0;
+ warning = 0;
+ msg = 0;
+ user = 0;
+ }
+
+ void AssertionCount::condrollover( int newvalue ) {
+ static int max = (int)pow( 2.0 , 30 );
+ if ( newvalue >= max )
+ rollover();
+ }
+
+ bool DBException::traceExceptions = false;
+
+ void ExceptionInfo::append( BSONObjBuilder& b , const char * m , const char * c ) const {
+ if ( msg.empty() )
+ b.append( m , "unknown assertion" );
+ else
+ b.append( m , msg );
+
+ if ( code )
+ b.append( c , code );
+ }
+
+ string getDbContext();
+
+ /* "warning" assert -- safe to continue, so we don't throw exception. */
+ NOINLINE_DECL void wasserted(const char *msg, const char *file, unsigned line) {
+ static int rateLimited;
+ static time_t lastWhen;
+ static unsigned lastLine;
+ if( lastLine == line && time(0)-lastWhen < 5 ) {
+ if( rateLimited++ == 0 ) {
+ log() << "rate limiting wassert" << endl;
+ }
+ return;
+ }
+ lastWhen = time(0);
+ lastLine = line;
+
+ problem() << "warning assertion failure " << msg << ' ' << file << ' ' << dec << line << endl;
+ sayDbContext();
+ raiseError(0,msg && *msg ? msg : "wassertion failure");
+ assertionCount.condrollover( ++assertionCount.warning );
+#if defined(_DEBUG) || defined(_DURABLEDEFAULTON) || defined(_DURABLEDEFAULTOFF)
+ // this is so we notice in buildbot
+ log() << "\n\n***aborting after wassert() failure in a debug/test build\n\n" << endl;
+ abort();
+#endif
+ }
+
+ NOINLINE_DECL void asserted(const char *msg, const char *file, unsigned line) {
+ assertionCount.condrollover( ++assertionCount.regular );
+ problem() << "Assertion failure " << msg << ' ' << file << ' ' << dec << line << endl;
+ sayDbContext();
+ raiseError(0,msg && *msg ? msg : "assertion failure");
+ stringstream temp;
+ temp << "assertion " << file << ":" << line;
+ AssertionException e(temp.str(),0);
+ breakpoint();
+#if defined(_DEBUG) || defined(_DURABLEDEFAULTON) || defined(_DURABLEDEFAULTOFF)
+ // this is so we notice in buildbot
+ log() << "\n\n***aborting after assert() failure as this is a debug/test build\n\n" << endl;
+ abort();
+#endif
+ throw e;
+ }
+
+ NOINLINE_DECL void verifyFailed( int msgid ) {
+ assertionCount.condrollover( ++assertionCount.regular );
+ problem() << "Assertion failure " << msgid << endl;
+ sayDbContext();
+ raiseError(0,"assertion failure");
+ stringstream temp;
+ temp << msgid;
+ AssertionException e(temp.str(),0);
+ breakpoint();
+#if defined(_DEBUG) || defined(_DURABLEDEFAULTON) || defined(_DURABLEDEFAULTOFF)
+ // this is so we notice in buildbot
+ log() << "\n\n***aborting after verify() failure in a debug/test build\n\n" << endl;
+ abort();
+#endif
+ throw e;
+ }
+
+ void uassert_nothrow(const char *msg) {
+ raiseError(0,msg);
+ }
+
+ void uasserted(int msgid , const string &msg) {
+ uasserted(msgid, msg.c_str());
+ }
+
+ NOINLINE_DECL void uasserted(int msgid, const char *msg) {
+ assertionCount.condrollover( ++assertionCount.user );
+ LOG(1) << "User Assertion: " << msgid << ":" << msg << endl;
+ raiseError(msgid,msg);
+ throw UserException(msgid, msg);
+ }
+
+ void msgasserted(int msgid, const string &msg) {
+ msgasserted(msgid, msg.c_str());
+ }
+
+ NOINLINE_DECL void msgasserted(int msgid, const char *msg) {
+ assertionCount.condrollover( ++assertionCount.warning );
+ tlog() << "Assertion: " << msgid << ":" << msg << endl;
+ raiseError(msgid,msg && *msg ? msg : "massert failure");
+ breakpoint();
+ printStackTrace();
+ throw MsgAssertionException(msgid, msg);
+ }
+
+ NOINLINE_DECL void msgassertedNoTrace(int msgid, const char *msg) {
+ assertionCount.condrollover( ++assertionCount.warning );
+ log() << "Assertion: " << msgid << ":" << msg << endl;
+ raiseError(msgid,msg && *msg ? msg : "massert failure");
+ throw MsgAssertionException(msgid, msg);
+ }
+
+ NOINLINE_DECL void streamNotGood( int code , string msg , std::ios& myios ) {
+ stringstream ss;
+ // errno might not work on all systems for streams
+ // if it doesn't for a system should deal with here
+ ss << msg << " stream invalid: " << errnoWithDescription();
+ throw UserException( code , ss.str() );
+ }
+
+ string errnoWithPrefix( const char * prefix ) {
+ stringstream ss;
+ if ( prefix )
+ ss << prefix << ": ";
+ ss << errnoWithDescription();
+ return ss.str();
+ }
+
+ string demangleName( const type_info& typeinfo ) {
+#ifdef _WIN32
+ return typeinfo.name();
+#else
+ int status;
+
+ char * niceName = abi::__cxa_demangle(typeinfo.name(), 0, 0, &status);
+ if ( ! niceName )
+ return typeinfo.name();
+
+ string s = niceName;
+ free(niceName);
+ return s;
+#endif
+ }
+
+ NOINLINE_DECL ErrorMsg::ErrorMsg(const char *msg, char ch) {
+ int l = strlen(msg);
+ assert( l < 128);
+ memcpy(buf, msg, l);
+ char *p = buf + l;
+ p[0] = ch;
+ p[1] = 0;
+ }
+
+ NOINLINE_DECL ErrorMsg::ErrorMsg(const char *msg, unsigned val) {
+ int l = strlen(msg);
+ assert( l < 128);
+ memcpy(buf, msg, l);
+ char *p = buf + l;
+ sprintf(p, "%u", val);
+ }
+
+}
+
diff --git a/src/mongo/util/assert_util.h b/src/mongo/util/assert_util.h
new file mode 100644
index 00000000000..2e6b2a9732a
--- /dev/null
+++ b/src/mongo/util/assert_util.h
@@ -0,0 +1,275 @@
+// assert_util.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#pragma once
+
+#include "../db/lasterror.h"
+
+// MONGO_NORETURN is #undef'd at the end of this file
+#ifdef __GNUC__
+# define MONGO_NORETURN __attribute__((__noreturn__))
+#else
+# define MONGO_NORETURN
+#endif
+
+namespace mongo {
+
+ enum CommonErrorCodes {
+ DatabaseDifferCaseCode = 13297 ,
+ SendStaleConfigCode = 13388 ,
+ RecvStaleConfigCode = 9996
+ };
+
+ class AssertionCount {
+ public:
+ AssertionCount();
+ void rollover();
+ void condrollover( int newValue );
+
+ int regular;
+ int warning;
+ int msg;
+ int user;
+ int rollovers;
+ };
+
+ extern AssertionCount assertionCount;
+
+ struct ExceptionInfo {
+ ExceptionInfo() : msg(""),code(-1) {}
+ ExceptionInfo( const char * m , int c )
+ : msg( m ) , code( c ) {
+ }
+ ExceptionInfo( const string& m , int c )
+ : msg( m ) , code( c ) {
+ }
+ void append( BSONObjBuilder& b , const char * m = "$err" , const char * c = "code" ) const ;
+ string toString() const { stringstream ss; ss << "exception: " << code << " " << msg; return ss.str(); }
+ bool empty() const { return msg.empty(); }
+
+ void reset(){ msg = ""; code=-1; }
+
+ string msg;
+ int code;
+ };
+
+ /** helper class that builds error strings. lighter weight than a StringBuilder, albeit less flexible.
+ NOINLINE_DECL used in the constructor implementations as we are assuming this is a cold code path when used.
+
+ example:
+ throw UserException(123, ErrorMsg("blah", num_val));
+ */
+ class ErrorMsg {
+ public:
+ ErrorMsg(const char *msg, char ch);
+ ErrorMsg(const char *msg, unsigned val);
+ operator string() const { return buf; }
+ private:
+ char buf[256];
+ };
+
+ class DBException;
+ string causedBy( const DBException& e );
+ string causedBy( const string& e );
+
+ class DBException : public std::exception {
+ public:
+ DBException( const ExceptionInfo& ei ) : _ei(ei) { traceIfNeeded(*this); }
+ DBException( const char * msg , int code ) : _ei(msg,code) { traceIfNeeded(*this); }
+ DBException( const string& msg , int code ) : _ei(msg,code) { traceIfNeeded(*this); }
+ virtual ~DBException() throw() { }
+
+ virtual const char* what() const throw() { return _ei.msg.c_str(); }
+ virtual int getCode() const { return _ei.code; }
+
+ virtual void appendPrefix( stringstream& ss ) const { }
+ virtual void addContext( const string& str ) {
+ _ei.msg = str + causedBy( _ei.msg );
+ }
+
+ virtual string toString() const {
+ stringstream ss; ss << getCode() << " " << what();
+ return ss.str();
+ }
+
+ const ExceptionInfo& getInfo() const { return _ei; }
+
+ static void traceIfNeeded( const DBException& e ) {
+ if( traceExceptions && ! inShutdown() ){
+ warning() << "DBException thrown" << causedBy( e ) << endl;
+ printStackTrace();
+ }
+ }
+
+ static bool traceExceptions;
+
+ protected:
+ ExceptionInfo _ei;
+ };
+
+ class AssertionException : public DBException {
+ public:
+
+ AssertionException( const ExceptionInfo& ei ) : DBException(ei) {}
+ AssertionException( const char * msg , int code ) : DBException(msg,code) {}
+ AssertionException( const string& msg , int code ) : DBException(msg,code) {}
+
+ virtual ~AssertionException() throw() { }
+
+ virtual bool severe() { return true; }
+ virtual bool isUserAssertion() { return false; }
+
+ /* true if an interrupted exception - see KillCurrentOp */
+ bool interrupted() {
+ return _ei.code == 11600 || _ei.code == 11601;
+ }
+ };
+
+ /* UserExceptions are valid errors that a user can cause, like out of disk space or duplicate key */
+ class UserException : public AssertionException {
+ public:
+ UserException(int c , const string& m) : AssertionException( m , c ) {}
+
+ virtual bool severe() { return false; }
+ virtual bool isUserAssertion() { return true; }
+ virtual void appendPrefix( stringstream& ss ) const { ss << "userassert:"; }
+ };
+
+ class MsgAssertionException : public AssertionException {
+ public:
+ MsgAssertionException( const ExceptionInfo& ei ) : AssertionException( ei ) {}
+ MsgAssertionException(int c, const string& m) : AssertionException( m , c ) {}
+ virtual bool severe() { return false; }
+ virtual void appendPrefix( stringstream& ss ) const { ss << "massert:"; }
+ };
+
+ void asserted(const char *msg, const char *file, unsigned line) MONGO_NORETURN;
+ void wasserted(const char *msg, const char *file, unsigned line);
+ void verifyFailed( int msgid );
+
+ /** a "user assertion". throws UserException. logs. typically used for errors that a user
+ could cause, such as duplicate key, disk full, etc.
+ */
+ void uasserted(int msgid, const char *msg) MONGO_NORETURN;
+ void uasserted(int msgid , const string &msg);
+
+ /** reported via lasterror, but don't throw exception */
+ void uassert_nothrow(const char *msg);
+
+ /** msgassert and massert are for errors that are internal but have a well defined error text string.
+ a stack trace is logged.
+ */
+ void msgassertedNoTrace(int msgid, const char *msg) MONGO_NORETURN;
+ inline void msgassertedNoTrace(int msgid, const string& msg) { msgassertedNoTrace( msgid , msg.c_str() ); }
+ void msgasserted(int msgid, const char *msg) MONGO_NORETURN;
+ void msgasserted(int msgid, const string &msg);
+
+ /* convert various types of exceptions to strings */
+ inline string causedBy( const char* e ){ return (string)" :: caused by :: " + e; }
+ inline string causedBy( const DBException& e ){ return causedBy( e.toString().c_str() ); }
+ inline string causedBy( const std::exception& e ){ return causedBy( e.what() ); }
+ inline string causedBy( const string& e ){ return causedBy( e.c_str() ); }
+
+ /** in the mongodb source, use verify() instead of assert(). verify is always evaluated even in release builds. */
+ inline void verify( int msgid , bool testOK ) { if ( ! testOK ) verifyFailed( msgid ); }
+
+#ifdef assert
+#undef assert
+#endif
+
+#define MONGO_assert(_Expression) (void)( MONGO_likely(!!(_Expression)) || (mongo::asserted(#_Expression, __FILE__, __LINE__), 0) )
+#define assert MONGO_assert
+
+ /* "user assert". if asserts, user did something wrong, not our code */
+#define MONGO_uassert(msgid, msg, expr) (void)( MONGO_likely(!!(expr)) || (mongo::uasserted(msgid, msg), 0) )
+#define uassert MONGO_uassert
+
+ /* warning only - keeps going */
+#define MONGO_wassert(_Expression) (void)( MONGO_likely(!!(_Expression)) || (mongo::wasserted(#_Expression, __FILE__, __LINE__), 0) )
+#define wassert MONGO_wassert
+
+ /* display a message, no context, and throw assertionexception
+
+ easy way to throw an exception and log something without our stack trace
+ display happening.
+ */
+#define MONGO_massert(msgid, msg, expr) (void)( MONGO_likely(!!(expr)) || (mongo::msgasserted(msgid, msg), 0) )
+#define massert MONGO_massert
+
+ /* dassert is 'debug assert' -- might want to turn off for production as these
+ could be slow.
+ */
+#if defined(_DEBUG)
+# define MONGO_dassert assert
+#else
+# define MONGO_dassert(x)
+#endif
+#define dassert MONGO_dassert
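+
+ /* illustrative usage, not part of this change; the error ids are made up:
+
+ uassert( 20000 , "ns missing" , !ns.empty() ); // user error: throws UserException
+ massert( 20001 , "internal invariant broken" , ok ); // internal error: logs a stack trace and throws
+ wassert( x >= 0 ); // warning only: logs and continues
+ assert( p ); // MONGO_assert: evaluated even in release builds
+ */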
+
+ // some special ids that we want to duplicate
+
+ // > 10000 asserts
+ // < 10000 UserException
+
+ enum { ASSERT_ID_DUPKEY = 11000 };
+
+ /* throws a uassertion with an appropriate msg */
+ void streamNotGood( int code , string msg , std::ios& myios ) MONGO_NORETURN;
+
+ inline void assertStreamGood(unsigned msgid, string msg, std::ios& myios) {
+ if( !myios.good() ) streamNotGood(msgid, msg, myios);
+ }
+
+ string demangleName( const type_info& typeinfo );
+
+} // namespace mongo
+
+#define BOOST_CHECK_EXCEPTION MONGO_BOOST_CHECK_EXCEPTION
+#define MONGO_BOOST_CHECK_EXCEPTION( expression ) \
+ try { \
+ expression; \
+ } catch ( const std::exception &e ) { \
+ stringstream ss; \
+ ss << "caught boost exception: " << e.what() << ' ' << __FILE__ << ' ' << __LINE__; \
+ msgasserted( 13294 , ss.str() ); \
+ } catch ( ... ) { \
+ massert( 10437 , "unknown boost failed" , false ); \
+ }
+
+#define MONGO_BOOST_CHECK_EXCEPTION_WITH_MSG( expression, msg ) \
+ try { \
+ expression; \
+ } catch ( const std::exception &e ) { \
+ stringstream ss; \
+ ss << msg << " caught boost exception: " << e.what(); \
+ msgasserted( 14043 , ss.str() ); \
+ } catch ( ... ) { \
+ msgasserted( 14044 , string("unknown boost failed ") + msg ); \
+ }
+
+#define DESTRUCTOR_GUARD MONGO_DESTRUCTOR_GUARD
+#define MONGO_DESTRUCTOR_GUARD( expression ) \
+ try { \
+ expression; \
+ } catch ( const std::exception &e ) { \
+ problem() << "caught exception (" << e.what() << ") in destructor (" << __FUNCTION__ << ")" << endl; \
+ } catch ( ... ) { \
+ problem() << "caught unknown exception in destructor (" << __FUNCTION__ << ")" << endl; \
+ }
+
+#undef MONGO_NORETURN
diff --git a/src/mongo/util/background.cpp b/src/mongo/util/background.cpp
new file mode 100644
index 00000000000..ef3ee9426b9
--- /dev/null
+++ b/src/mongo/util/background.cpp
@@ -0,0 +1,190 @@
+// @file background.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+
+#include "concurrency/mutex.h"
+#include "concurrency/spin_lock.h"
+
+#include "background.h"
+#include "time_support.h"
+#include "timer.h"
+
+#include "mongoutils/str.h"
+
+namespace mongo {
+
+ // both the BackgroundJob and the internal thread point to JobStatus
+ struct BackgroundJob::JobStatus {
+ JobStatus( bool delFlag )
+ : deleteSelf(delFlag), m("backgroundJob"), state(NotStarted) { }
+
+ const bool deleteSelf;
+
+ mongo::mutex m; // protects state below
+ boost::condition finished; // means _state == Done
+ State state;
+ };
+
+ BackgroundJob::BackgroundJob( bool selfDelete ) {
+ _status.reset( new JobStatus( selfDelete ) );
+ }
+
+ // the BackgroundJob object can only be destroyed after jobBody() has run
+ void BackgroundJob::jobBody( boost::shared_ptr<JobStatus> status ) {
+ LOG(1) << "BackgroundJob starting: " << name() << endl;
+ {
+ scoped_lock l( status->m );
+ massert( 13643 , mongoutils::str::stream() << "backgroundjob already started: " << name() , status->state == NotStarted );
+ status->state = Running;
+ }
+
+ const string threadName = name();
+ if( ! threadName.empty() )
+ setThreadName( threadName.c_str() );
+
+ try {
+ run();
+ }
+ catch ( std::exception& e ) {
+ log( LL_ERROR ) << "backgroundjob " << name() << " error: " << e.what() << endl;
+ }
+ catch(...) {
+ log( LL_ERROR ) << "uncaught exception in BackgroundJob " << name() << endl;
+ }
+
+ {
+ scoped_lock l( status->m );
+ status->state = Done;
+ status->finished.notify_all();
+ }
+
+ if( status->deleteSelf )
+ delete this;
+ }
+
+ BackgroundJob& BackgroundJob::go() {
+ boost::thread t( boost::bind( &BackgroundJob::jobBody , this, _status ) );
+ return *this;
+ }
+
+ bool BackgroundJob::wait( unsigned msTimeOut ) {
+ assert( !_status->deleteSelf ); // you cannot call wait on a self-deleting job
+ scoped_lock l( _status->m );
+ while ( _status->state != Done ) {
+ if ( msTimeOut ) {
+ // add msTimeOut milliseconds to the current time
+ boost::xtime xt;
+ boost::xtime_get( &xt, boost::TIME_UTC );
+
+ unsigned long long ns = msTimeOut * 1000000ULL; // milli to nano
+ if ( xt.nsec + ns < 1000000000 ) {
+ xt.nsec = (boost::xtime::xtime_nsec_t) (xt.nsec + ns);
+ }
+ else {
+ xt.sec += 1 + ns / 1000000000;
+ xt.nsec = ( ns + xt.nsec ) % 1000000000;
+ }
+
+ if ( ! _status->finished.timed_wait( l.boost() , xt ) )
+ return false;
+
+ }
+ else {
+ _status->finished.wait( l.boost() );
+ }
+ }
+ return true;
+ }
+
+ BackgroundJob::State BackgroundJob::getState() const {
+ scoped_lock l( _status->m);
+ return _status->state;
+ }
+
+ bool BackgroundJob::running() const {
+ scoped_lock l( _status->m);
+ return _status->state == Running;
+ }
+
+ // -------------------------
+
+ PeriodicTask::PeriodicTask() {
+ if ( ! theRunner )
+ theRunner = new Runner();
+ theRunner->add( this );
+ }
+
+ PeriodicTask::~PeriodicTask() {
+ theRunner->remove( this );
+ }
+
+ void PeriodicTask::Runner::add( PeriodicTask* task ) {
+ scoped_spinlock lk( _lock );
+ _tasks.push_back( task );
+ }
+
+ void PeriodicTask::Runner::remove( PeriodicTask* task ) {
+ scoped_spinlock lk( _lock );
+ for ( size_t i=0; i<_tasks.size(); i++ ) {
+ if ( _tasks[i] == task ) {
+ _tasks[i] = 0;
+ break;
+ }
+ }
+ }
+
+ void PeriodicTask::Runner::run() {
+ int sleeptime = 60;
+ DEV sleeptime = 5; // to catch race conditions
+
+ while ( ! inShutdown() ) {
+
+ sleepsecs( sleeptime );
+
+ scoped_spinlock lk( _lock );
+
+ size_t size = _tasks.size();
+
+ for ( size_t i=0; i<size; i++ ) {
+ PeriodicTask * t = _tasks[i];
+ if ( ! t )
+ continue;
+
+ if ( inShutdown() )
+ break;
+
+ Timer timer;
+ try {
+ t->taskDoWork();
+ }
+ catch ( std::exception& e ) {
+ error() << "task: " << t->taskName() << " failed: " << e.what() << endl;
+ }
+ catch ( ... ) {
+ error() << "task: " << t->taskName() << " failed with unknown error" << endl;
+ }
+
+ int ms = timer.millis();
+ LOG( ms <= 3 ? 1 : 0 ) << "task: " << t->taskName() << " took: " << ms << "ms" << endl;
+ }
+ }
+ }
+
+ PeriodicTask::Runner* PeriodicTask::theRunner = 0;
+
+} // namespace mongo
diff --git a/src/mongo/util/background.h b/src/mongo/util/background.h
new file mode 100644
index 00000000000..496a1f44f88
--- /dev/null
+++ b/src/mongo/util/background.h
@@ -0,0 +1,155 @@
+// @file background.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "concurrency/spin_lock.h"
+
+namespace mongo {
+
+ /**
+ * Background thread dispatching.
+ * subclass and define run()
+ *
+ * It is ok to call go(), that is, run the job, more than once -- if the
+ * previous invocation has finished. Thus one pattern of use is to embed
+ * a backgroundjob in your object and reuse it (or same thing with
+ * inheritance). Each go() call spawns a new thread.
+ *
+ * Thread safety:
+ * note when job destructs, the thread is not terminated if still running.
+ * generally if the thread could still be running, allocate the job dynamically
+ * and set deleteSelf to true.
+ *
+ * go() and wait() are not thread safe
+ * run() will be executed on the background thread
+ * BackgroundJob object must exist for as long the background thread is running
+ */
+
+ class BackgroundJob : boost::noncopyable {
+ protected:
+ /**
+ * sub-class must instantiate the BackgroundJob
+ *
+ * @param selfDelete if set to true, the object destroys itself after run() finishes
+ * @note selfDelete instances cannot be wait()-ed upon
+ */
+ explicit BackgroundJob(bool selfDelete = false);
+
+ virtual string name() const = 0;
+
+ /**
+ * define this to do your work.
+ * after this returns, state is set to done.
+ * after this returns, deleted if deleteSelf true.
+ *
+ * NOTE:
+ * if run() throws, the exception will be caught within 'this' object and will ultimately lead to the
+ * BackgroundJob's thread being finished, as if run() returned.
+ *
+ */
+ virtual void run() = 0;
+
+ public:
+ enum State {
+ NotStarted,
+ Running,
+ Done
+ };
+
+ virtual ~BackgroundJob() { }
+
+ /**
+ * starts job.
+ * returns immediately after dispatching.
+ *
+ * @note the BackgroundJob object must live for as long as the thread is running, i.e.
+ * until getState() returns Done.
+ */
+ BackgroundJob& go();
+
+ /**
+ * wait for completion.
+ *
+ * @param msTimeOut maximum amount of time to wait, in milliseconds
+ * @return true if did not time out. false otherwise.
+ *
+ * @note you can call wait() more than once if the first call times out.
+ * but you cannot call wait on a self-deleting job.
+ */
+ bool wait( unsigned msTimeOut = 0 );
+
+ // accessors
+ State getState() const;
+ bool running() const;
+
+ private:
+ struct JobStatus;
+ boost::shared_ptr<JobStatus> _status; // shared between 'this' and body() thread
+
+ void jobBody( boost::shared_ptr<JobStatus> status );
+
+ };
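+
+ /* illustrative sketch, not part of this change: a self-deleting job.
+
+ class Flusher : public BackgroundJob {
+ public:
+ Flusher() : BackgroundJob( true ) { } // selfDelete: the thread frees the object
+ virtual string name() const { return "Flusher"; }
+ virtual void run() { doFlushing(); } // doFlushing is hypothetical
+ };
+ (new Flusher())->go(); // never wait() on a self-deleting job
+ */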
+
+ /**
+ * these run "roughly" every minute
+ * instantiate statically
+ * class MyTask : public PeriodicTask {
+ * public:
+ * virtual string taskName() const { return "MyTask"; }
+ * virtual void taskDoWork() { log() << "hi" << endl; }
+ * } myTask;
+ */
+ class PeriodicTask {
+ public:
+ PeriodicTask();
+ virtual ~PeriodicTask();
+
+ virtual void taskDoWork() = 0;
+ virtual string taskName() const = 0;
+
+ class Runner : public BackgroundJob {
+ public:
+ virtual ~Runner(){}
+
+ virtual string name() const { return "PeriodicTask::Runner"; }
+
+ virtual void run();
+
+ void add( PeriodicTask* task );
+ void remove( PeriodicTask* task );
+
+ private:
+
+ SpinLock _lock;
+
+ // these are NOT owned by Runner
+ // Runner will not delete these
+ // this never gets smaller
+ // only fields replaced with nulls
+ vector<PeriodicTask*> _tasks;
+
+ };
+
+ static Runner* theRunner;
+
+ };
+
+
+
+
+} // namespace mongo
diff --git a/src/mongo/util/base64.cpp b/src/mongo/util/base64.cpp
new file mode 100644
index 00000000000..aff06e26126
--- /dev/null
+++ b/src/mongo/util/base64.cpp
@@ -0,0 +1,109 @@
+// util/base64.cpp
+
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "base64.h"
+
+namespace mongo {
+ namespace base64 {
+
+ Alphabet alphabet;
+
+ void encode( stringstream& ss , const char * data , int size ) {
+ for ( int i=0; i<size; i+=3 ) {
+ int left = size - i;
+ const unsigned char * start = (const unsigned char*)data + i;
+
+ // byte 0
+ ss << alphabet.e(start[0]>>2);
+
+ // byte 1
+ unsigned char temp = ( start[0] << 4 );
+ if ( left == 1 ) {
+ ss << alphabet.e(temp);
+ break;
+ }
+ temp |= ( ( start[1] >> 4 ) & 0xF );
+ ss << alphabet.e(temp);
+
+ // byte 2
+ temp = ( start[1] & 0xF ) << 2;
+ if ( left == 2 ) {
+ ss << alphabet.e(temp);
+ break;
+ }
+ temp |= ( ( start[2] >> 6 ) & 0x3 );
+ ss << alphabet.e(temp);
+
+ // byte 3
+ ss << alphabet.e(start[2] & 0x3f);
+ }
+
+ int mod = size % 3;
+ if ( mod == 1 ) {
+ ss << "==";
+ }
+ else if ( mod == 2 ) {
+ ss << "=";
+ }
+ }
+
+
+ string encode( const char * data , int size ) {
+ stringstream ss;
+ encode( ss , data ,size );
+ return ss.str();
+ }
+
+ string encode( const string& s ) {
+ return encode( s.c_str() , s.size() );
+ }
+
+
+ void decode( stringstream& ss , const string& s ) {
+ uassert( 10270 , "invalid base64" , s.size() % 4 == 0 );
+ const unsigned char * data = (const unsigned char*)s.c_str();
+ int size = s.size();
+
+ unsigned char buf[3];
+ for ( int i=0; i<size; i+=4) {
+ const unsigned char * start = data + i;
+ buf[0] = ( ( alphabet.decode[start[0]] << 2 ) & 0xFC ) | ( ( alphabet.decode[start[1]] >> 4 ) & 0x3 );
+ buf[1] = ( ( alphabet.decode[start[1]] << 4 ) & 0xF0 ) | ( ( alphabet.decode[start[2]] >> 2 ) & 0xF );
+ buf[2] = ( ( alphabet.decode[start[2]] << 6 ) & 0xC0 ) | ( ( alphabet.decode[start[3]] & 0x3F ) );
+
+ int len = 3;
+ if ( start[3] == '=' ) {
+ len = 2;
+ if ( start[2] == '=' ) {
+ len = 1;
+ }
+ }
+ ss.write( (const char*)buf , len );
+ }
+ }
+
+ string decode( const string& s ) {
+ stringstream ss;
+ decode( ss , s );
+ return ss.str();
+ }
+
+ }
+}
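+
+// illustrative usage, not part of this change:
+// string e = mongo::base64::encode( "any carnal pleasure" ); // "YW55IGNhcm5hbCBwbGVhc3VyZQ=="
+// assert( mongo::base64::decode( e ) == "any carnal pleasure" );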
+
diff --git a/src/mongo/util/base64.h b/src/mongo/util/base64.h
new file mode 100644
index 00000000000..505b5d78cca
--- /dev/null
+++ b/src/mongo/util/base64.h
@@ -0,0 +1,68 @@
+// util/base64.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace mongo {
+ namespace base64 {
+
+ class Alphabet {
+ public:
+ Alphabet()
+ : encode((unsigned char*)
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789"
+ "+/")
+ , decode(new unsigned char[257]) {
+ memset( decode.get() , 0 , 256 );
+ for ( int i=0; i<64; i++ ) {
+ decode[ encode[i] ] = i;
+ }
+
+ test();
+ }
+ void test() {
+ assert( strlen( (char*)encode ) == 64 );
+ for ( int i=0; i<26; i++ )
+ assert( encode[i] == toupper( encode[i+26] ) );
+ }
+
+ char e( int x ) {
+ return encode[x&0x3f];
+ }
+
+ private:
+ const unsigned char * encode;
+ public:
+ boost::scoped_array<unsigned char> decode;
+ };
+
+ extern Alphabet alphabet;
+
+
+ void encode( stringstream& ss , const char * data , int size );
+ string encode( const char * data , int size );
+ string encode( const string& s );
+
+ void decode( stringstream& ss , const string& s );
+ string decode( const string& s );
+
+
+ void testAlphabet();
+ }
+}
diff --git a/src/mongo/util/bson_util.h b/src/mongo/util/bson_util.h
new file mode 100644
index 00000000000..973e31f1af1
--- /dev/null
+++ b/src/mongo/util/bson_util.h
@@ -0,0 +1,42 @@
+// bson_util.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../pch.h"
+
+namespace mongo {
+
+template <typename T>
+void bsonArrToNumVector(BSONElement el, vector<T>& results){
+
+ if(el.type() == Array){
+
+ vector<BSONElement> elements = el.Array();
+
+ for(vector<BSONElement>::iterator i = elements.begin(); i != elements.end(); ++i){
+ results.push_back( (T) (*i).Number() );
+ }
+ }
+ else if(el.isNumber()){
+ results.push_back( (T) el.Number() );
+ }
+
+}
+
+
+}
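+
+// illustrative usage, not part of this change:
+// BSONObj o = BSON( "a" << BSON_ARRAY( 1 << 2.5 << 3 ) );
+// vector<double> v;
+// bsonArrToNumVector<double>( o["a"] , v ); // v == { 1.0, 2.5, 3.0 }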
diff --git a/src/mongo/util/bufreader.h b/src/mongo/util/bufreader.h
new file mode 100644
index 00000000000..53f0ba744e2
--- /dev/null
+++ b/src/mongo/util/bufreader.h
@@ -0,0 +1,100 @@
+// @file bufreader.h parse a memory region into usable pieces
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ /** helper to read and parse a block of memory
+ methods throw the eof exception if the operation would pass the end of the
+ buffer with which we are working.
+ */
+ class BufReader : boost::noncopyable {
+ public:
+ class eof : public std::exception {
+ public:
+ eof() { }
+ virtual const char * what() const throw() { return "BufReader eof"; }
+ };
+
+ BufReader(const void *p, unsigned len) : _start(p), _pos(p), _end(((char *)_pos)+len) { }
+
+ bool atEof() const { return _pos == _end; }
+
+ /** read in the object specified, and advance buffer pointer */
+ template <typename T>
+ void read(T &t) {
+ T* cur = (T*) _pos;
+ T *next = cur + 1;
+ if( _end < next ) throw eof();
+ t = *cur;
+ _pos = next;
+ }
+
+ /** verify we can look at t, but do not advance */
+ template <typename T>
+ void peek(T &t) {
+ T* cur = (T*) _pos;
+ T *next = cur + 1;
+ if( _end < next ) throw eof();
+ t = *cur;
+ }
+
+ /** return current offset into buffer */
+ unsigned offset() const { return (char*)_pos - (char*)_start; }
+
+ /** return remaining bytes */
+ unsigned remaining() const { return (char*)_end -(char*)_pos; }
+
+ /** back up by nbytes */
+ void rewind(unsigned nbytes) {
+ _pos = ((char *) _pos) - nbytes;
+ assert( _pos >= _start );
+ }
+
+ /** return current position pointer, and advance by len */
+ const void* skip(unsigned len) {
+ const char *nxt = ((char *) _pos) + len;
+ if( _end < nxt ) throw eof();
+ const void *p = _pos;
+ _pos = nxt;
+ return p;
+ }
+
+ void readStr(string& s) {
+ StringBuilder b;
+ while( 1 ) {
+ char ch;
+ read(ch);
+ if( ch == 0 )
+ break;
+ b << ch;
+ }
+ s = b.str();
+ }
+
+ const void* pos() { return _pos; }
+ const void* start() { return _start; }
+
+ private:
+ const void *_start;
+ const void *_pos;
+ const void *_end;
+ };
+
+}
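+
+// illustrative usage, not part of this change:
+// BufReader br( p , len );
+// unsigned opcode;
+// br.read( opcode ); // consumes sizeof(unsigned) bytes, or throws BufReader::eof
+// string ns;
+// br.readStr( ns ); // reads through the NUL terminator
+// const void* rest = br.skip( br.remaining() ); // now br.atEof() is true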
diff --git a/src/mongo/util/checksum.h b/src/mongo/util/checksum.h
new file mode 100644
index 00000000000..009ab56fbeb
--- /dev/null
+++ b/src/mongo/util/checksum.h
@@ -0,0 +1,37 @@
+#pragma once
+#include "../pch.h"
+namespace mongo {
+ /** a simple, rather dumb, but very fast checksum. see perftests.cpp for unit tests. */
+ struct Checksum {
+ union {
+ unsigned char bytes[16];
+ unsigned long long words[2];
+ };
+
+ // if you change this you must bump dur::CurrentVersion
+ void gen(const void *buf, unsigned len) {
+ wassert( ((size_t)buf) % 8 == 0 ); // performance warning
+ unsigned n = len / 8 / 2;
+ const unsigned long long *p = (const unsigned long long *) buf;
+ unsigned long long a = 0;
+ for( unsigned i = 0; i < n; i++ ) {
+ a += (*p ^ i);
+ p++;
+ }
+ unsigned long long b = 0;
+ for( unsigned i = 0; i < n; i++ ) {
+ b += (*p ^ i);
+ p++;
+ }
+ unsigned long long c = 0;
+ for( unsigned i = n * 2 * 8; i < len; i++ ) { // 0-7 bytes left
+ c = (c << 8) | ((const char *)buf)[i];
+ }
+ words[0] = a ^ len;
+ words[1] = b ^ c;
+ }
+
+ bool operator==(const Checksum& rhs) const { return words[0]==rhs.words[0] && words[1]==rhs.words[1]; }
+ bool operator!=(const Checksum& rhs) const { return words[0]!=rhs.words[0] || words[1]!=rhs.words[1]; }
+ };
+}
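+
+// illustrative usage, not part of this change:
+// Checksum a, b;
+// a.gen( page , 8192 ); // buf should be 8-byte aligned, else the wassert fires
+// b.gen( pageCopy , 8192 );
+// assert( a == b );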
diff --git a/src/mongo/util/compress.cpp b/src/mongo/util/compress.cpp
new file mode 100644
index 00000000000..bcde488b88b
--- /dev/null
+++ b/src/mongo/util/compress.cpp
@@ -0,0 +1,31 @@
+// @file compress.cpp
+
+#include "../third_party/snappy/snappy.h"
+#include "compress.h"
+#include <string>
+#include <string.h>
+#include <assert.h>
+
+namespace mongo {
+
+ void rawCompress(const char* input,
+ size_t input_length,
+ char* compressed,
+ size_t* compressed_length)
+ {
+ snappy::RawCompress(input, input_length, compressed, compressed_length);
+ }
+
+ size_t maxCompressedLength(size_t source_len) {
+ return snappy::MaxCompressedLength(source_len);
+ }
+
+ size_t compress(const char* input, size_t input_length, std::string* output) {
+ return snappy::Compress(input, input_length, output);
+ }
+
+ bool uncompress(const char* compressed, size_t compressed_length, std::string* uncompressed) {
+ return snappy::Uncompress(compressed, compressed_length, uncompressed);
+ }
+
+}
diff --git a/src/mongo/util/compress.h b/src/mongo/util/compress.h
new file mode 100644
index 00000000000..5bc5a3392bb
--- /dev/null
+++ b/src/mongo/util/compress.h
@@ -0,0 +1,21 @@
+// @file compress.h
+
+#pragma once
+
+#include <string>
+
+namespace mongo {
+
+ size_t compress(const char* input, size_t input_length, std::string* output);
+
+ bool uncompress(const char* compressed, size_t compressed_length, std::string* uncompressed);
+
+ size_t maxCompressedLength(size_t source_len);
+ void rawCompress(const char* input,
+ size_t input_length,
+ char* compressed,
+ size_t* compressed_length);
+
+}
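+
+// illustrative usage, not part of this change:
+// std::string raw( 8192 , 'x' ) , packed , back;
+// mongo::compress( raw.data() , raw.size() , &packed );
+// bool ok = mongo::uncompress( packed.data() , packed.size() , &back );
+// assert( ok && back == raw );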
+
+
diff --git a/src/mongo/util/concurrency/README b/src/mongo/util/concurrency/README
new file mode 100644
index 00000000000..1a19264f4b6
--- /dev/null
+++ b/src/mongo/util/concurrency/README
@@ -0,0 +1,39 @@
+util/concurrency/ files
+
+msg.h - message passing between threads
+
+mutex.h - small enhancements that wrap boost::mutex
+ also SimpleMutex
+
+mvar.h
+ This is based on haskell's MVar synchronization primitive:
+ http://www.haskell.org/ghc/docs/latest/html/libraries/base/Control-Concurrent-MVar.html
+ It is a thread-safe queue that can hold at most one object.
+ You can also think of it as a box that can be either full or empty.
+
+race.h
+ RACECHECK
+
+rwlock.h - read/write locks (RWLock)
+ RWLock
+ RWLockRecursive
+ RWLockRecursiveNongreedy
+
+spin_lock.h
+
+synchronization.h
+ Notification, NotifyAll
+
+threadlocal.h
+
+thread_pool.h
+
+value.h
+ Guarded
+ DiagStr
+ mapsf
+
+goofy things that need reworking:
+ list.h
+ task.h
+
diff --git a/src/mongo/util/concurrency/list.h b/src/mongo/util/concurrency/list.h
new file mode 100644
index 00000000000..61bdd55f46f
--- /dev/null
+++ b/src/mongo/util/concurrency/list.h
@@ -0,0 +1,99 @@
+// list.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ /* DONT USE THIS. it was a dumb idea.
+
+ this class uses a mutex for writes, but not for reads.
+ we can get fancier later...
+
+ struct Member : public List1<Member>::Base {
+ const char *host;
+ int port;
+ };
+ List1<Member> _members;
+ _members.head()->next();
+
+ */
+ template<typename T>
+ class List1 : boost::noncopyable {
+ public:
+ /* next() and head() return 0 at end of list */
+
+ List1() : _head(0), _m("List1"), _orphans(0) { }
+
+ class Base {
+ friend class List1;
+ T *_next;
+ public:
+ Base() : _next(0){}
+ ~Base() { wassert(false); } // we never want this to happen
+ T* next() const { return _next; }
+ };
+
+ /** note this is safe:
+
+ T* p = mylist.head();
+ if( p )
+ use(p);
+
+ and this is not:
+
+ if( mylist.head() )
+ use( mylist.head() ); // could become 0
+ */
+ T* head() const { return (T*) _head; }
+
+ void push(T* t) {
+ assert( t->_next == 0 );
+ scoped_lock lk(_m);
+ t->_next = (T*) _head;
+ _head = t;
+ }
+
+ // intentionally leaks.
+ void orphanAll() {
+ scoped_lock lk(_m);
+ _head = 0;
+ }
+
+ /* t is not deleted, but is removed from the list. (orphaned) */
+ void orphan(T* t) {
+ scoped_lock lk(_m);
+ T *&prev = (T*&) _head;
+ T *n = prev;
+ while( n != t ) {
+ uassert( 14050 , "List1: item to orphan not in list", n );
+ prev = n->_next;
+ n = prev;
+ }
+ prev = t->_next;
+ if( ++_orphans > 500 )
+ log() << "warning List1 orphans=" << _orphans << '\n';
+ }
+
+ private:
+ volatile T *_head;
+ mongo::mutex _m;
+ int _orphans;
+ };
+
+};
diff --git a/src/mongo/util/concurrency/msg.h b/src/mongo/util/concurrency/msg.h
new file mode 100644
index 00000000000..0b9a7c5048c
--- /dev/null
+++ b/src/mongo/util/concurrency/msg.h
@@ -0,0 +1,61 @@
+// @file msg.h - interthread message passing
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <deque>
+#include "task.h"
+
+namespace mongo {
+
+ namespace task {
+
+ typedef boost::function<void()> lam;
+
+ /** typical usage is: task::fork( new Server("threadname") ); */
+ class Server : public Task {
+ public:
+ /** send a message to the port */
+ void send(lam);
+
+ Server(string name) : m("server"), _name(name), rq(false) { }
+ virtual ~Server() { }
+
+ /** send message but block until function completes */
+ void call(const lam&);
+
+ void requeue() { rq = true; }
+
+ protected:
+ /* REMINDER : for use in mongod, you will want to have this call Client::initThread(). */
+ virtual void starting() { }
+
+ private:
+ virtual bool initClient() { return true; }
+ virtual string name() const { return _name; }
+ void doWork();
+ deque<lam> d;
+ mongo::mutex m;
+ boost::condition c;
+ string _name;
+ bool rq;
+ };
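+
+        /* Usage sketch (illustrative, not part of the original header; doThing is a
+           hypothetical free function):
+
+               task::Server *s = new task::Server("worker");
+               task::fork(s);                     // start the server thread
+               s->send( boost::bind(&doThing) );  // asynchronous, returns immediately
+               s->call( boost::bind(&doThing) );  // blocks until doThing() has run
+        */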
+
+ }
+
+}
diff --git a/src/mongo/util/concurrency/mutex.h b/src/mongo/util/concurrency/mutex.h
new file mode 100644
index 00000000000..429f280b1cb
--- /dev/null
+++ b/src/mongo/util/concurrency/mutex.h
@@ -0,0 +1,228 @@
+// @file mutex.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../heapcheck.h"
+#include "threadlocal.h"
+#if defined(_DEBUG)
+#include "mutexdebugger.h"
+#endif
+
+namespace mongo {
+
+ void printStackTrace( ostream &o );
+
+ inline boost::xtime incxtimemillis( long long s ) {
+ boost::xtime xt;
+ boost::xtime_get(&xt, boost::TIME_UTC);
+ xt.sec += (int)( s / 1000 );
+ xt.nsec += (int)(( s % 1000 ) * 1000000);
+ if ( xt.nsec >= 1000000000 ) {
+ xt.nsec -= 1000000000;
+ xt.sec++;
+ }
+ return xt;
+ }
+
+ // If you create a local static instance of this class, that instance will be destroyed
+ // before all global static objects are destroyed, so _destroyingStatics will be set
+ // to true before the global static variables are destroyed.
+ class StaticObserver : boost::noncopyable {
+ public:
+ static bool _destroyingStatics;
+ ~StaticObserver() { _destroyingStatics = true; }
+ };
+
+ /** On pthread systems, it is an error to destroy a mutex while held (boost mutex
+ * may use pthread). Static global mutexes may be held upon shutdown in our
+ * implementation, and this way we avoid destroying them.
+ * NOT recursive.
+ */
+ class mutex : boost::noncopyable {
+ public:
+ const char * const _name;
+ mutex(const char *name) : _name(name)
+ {
+ _m = new boost::timed_mutex();
+ IGNORE_OBJECT( _m ); // Turn-off heap checking on _m
+ }
+ ~mutex() {
+ if( !StaticObserver::_destroyingStatics ) {
+ UNIGNORE_OBJECT( _m );
+ delete _m;
+ }
+ }
+
+ class try_lock : boost::noncopyable {
+ public:
+ try_lock( mongo::mutex &m , int millis = 0 )
+ : _l( m.boost() , incxtimemillis( millis ) ) ,
+#if BOOST_VERSION >= 103500
+ ok( _l.owns_lock() )
+#else
+ ok( _l.locked() )
+#endif
+ { }
+ private:
+ boost::timed_mutex::scoped_timed_lock _l;
+ public:
+ const bool ok;
+ };
+
+ class scoped_lock : boost::noncopyable {
+ public:
+#if defined(_DEBUG)
+ struct PostStaticCheck {
+ PostStaticCheck() {
+ if ( StaticObserver::_destroyingStatics ) {
+ cout << "_DEBUG warning trying to lock a mongo::mutex during static shutdown" << endl;
+ printStackTrace( cout );
+ }
+ }
+ } _check;
+ mongo::mutex * const _mut;
+#endif
+ scoped_lock( mongo::mutex &m ) :
+#if defined(_DEBUG)
+ _mut(&m),
+#endif
+ _l( m.boost() ) {
+#if defined(_DEBUG)
+ mutexDebugger.entering(_mut->_name);
+#endif
+ }
+ ~scoped_lock() {
+#if defined(_DEBUG)
+ mutexDebugger.leaving(_mut->_name);
+#endif
+ }
+ boost::timed_mutex::scoped_lock &boost() { return _l; }
+ private:
+ boost::timed_mutex::scoped_lock _l;
+ };
+ private:
+ boost::timed_mutex &boost() { return *_m; }
+ boost::timed_mutex *_m;
+ };
+
+ typedef mutex::scoped_lock scoped_lock;
+    typedef boost::recursive_mutex::scoped_lock recursive_scoped_lock;
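+
+    /* Usage sketch (illustrative, not part of the original header): RAII locking with
+       mongo::mutex; _DEBUG builds also record lock ordering via mutexDebugger.
+
+           mongo::mutex _m("myMutex");
+           void f() {
+               scoped_lock lk(_m);   // locks here
+               // ... critical section ...
+           }                         // unlocks when lk goes out of scope
+    */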
+
+ /** The concept with SimpleMutex is that it is a basic lock/unlock with no
+ special functionality (such as try and try timeout). Thus it can be
+ implemented using OS-specific facilities in all environments (if desired).
+ On Windows, the implementation below is faster than boost mutex.
+ */
+#if defined(_WIN32)
+ class SimpleMutex : boost::noncopyable {
+ CRITICAL_SECTION _cs;
+ public:
+ SimpleMutex(const char *name) { InitializeCriticalSection(&_cs); }
+ ~SimpleMutex() { DeleteCriticalSection(&_cs); }
+
+#if defined(_DEBUG)
+ ThreadLocalValue<int> _nlocksByMe;
+ void lock() {
+            assert( _nlocksByMe.get() == 0 ); // indicates you are trying to lock recursively
+ _nlocksByMe.set(1);
+ EnterCriticalSection(&_cs);
+ }
+ void dassertLocked() const {
+ assert( _nlocksByMe.get() == 1 );
+ }
+ void unlock() {
+ dassertLocked();
+ _nlocksByMe.set(0);
+ LeaveCriticalSection(&_cs);
+ }
+#else
+ void dassertLocked() const { }
+ void lock() {
+ EnterCriticalSection(&_cs);
+ }
+ void unlock() {
+ LeaveCriticalSection(&_cs);
+ }
+#endif
+
+ class scoped_lock : boost::noncopyable {
+ SimpleMutex& _m;
+ public:
+ scoped_lock( SimpleMutex &m ) : _m(m) { _m.lock(); }
+ ~scoped_lock() { _m.unlock(); }
+# if defined(_DEBUG)
+ const SimpleMutex& m() const { return _m; }
+# endif
+ };
+ };
+#else
+ class SimpleMutex : boost::noncopyable {
+ public:
+ void dassertLocked() const { }
+ SimpleMutex(const char* name) { assert( pthread_mutex_init(&_lock,0) == 0 ); }
+ ~SimpleMutex(){
+ if ( ! StaticObserver::_destroyingStatics ) {
+ assert( pthread_mutex_destroy(&_lock) == 0 );
+ }
+ }
+
+ void lock() { assert( pthread_mutex_lock(&_lock) == 0 ); }
+ void unlock() { assert( pthread_mutex_unlock(&_lock) == 0 ); }
+ public:
+ class scoped_lock : boost::noncopyable {
+ SimpleMutex& _m;
+ public:
+ scoped_lock( SimpleMutex &m ) : _m(m) { _m.lock(); }
+ ~scoped_lock() { _m.unlock(); }
+ const SimpleMutex& m() const { return _m; }
+ };
+
+ private:
+ pthread_mutex_t _lock;
+ };
+
+#endif
+
+ /** This can be used instead of boost recursive mutex. The advantage is the _DEBUG checks
+     * and the ability to check isLocked(). This has not yet been tested for speed vs. the boost one.
+ */
+ class RecursiveMutex : boost::noncopyable {
+ public:
+ RecursiveMutex(const char* name) : m(name) { }
+ bool isLocked() const { return n.get() > 0; }
+ class scoped_lock : boost::noncopyable {
+ RecursiveMutex& rm;
+ int& nLocksByMe;
+ public:
+ scoped_lock( RecursiveMutex &m ) : rm(m), nLocksByMe(rm.n.getRef()) {
+ if( nLocksByMe++ == 0 )
+ rm.m.lock();
+ }
+ ~scoped_lock() {
+ assert( nLocksByMe > 0 );
+ if( --nLocksByMe == 0 ) {
+ rm.m.unlock();
+ }
+ }
+ };
+ private:
+ SimpleMutex m;
+ ThreadLocalValue<int> n;
+ };
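+
+    /* Sketch (illustrative, not part of the original header): RecursiveMutex keeps a
+       per-thread count, so the same thread may re-enter without deadlocking.
+
+           RecursiveMutex rm("rm");
+           void inner() { RecursiveMutex::scoped_lock lk(rm); }  // re-locks, count 1 -> 2
+           void outer() {
+               RecursiveMutex::scoped_lock lk(rm);               // locks, count 0 -> 1
+               inner();
+           }                                                     // unlocks only when count returns to 0
+    */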
+
+}
diff --git a/src/mongo/util/concurrency/mutexdebugger.h b/src/mongo/util/concurrency/mutexdebugger.h
new file mode 100644
index 00000000000..7dc57f29e98
--- /dev/null
+++ b/src/mongo/util/concurrency/mutexdebugger.h
@@ -0,0 +1,117 @@
+#pragma once
+
+namespace mongo {
+
+ /** only used on _DEBUG builds.
+        MutexDebugger checks that we always acquire locks for multiple mutexes in a consistent (acyclic) order.
+ If we were inconsistent we could deadlock.
+ */
+ class MutexDebugger {
+ typedef const char * mid; // mid = mutex ID
+ typedef map<mid,int> Preceeding;
+ map< mid, int > maxNest;
+ boost::thread_specific_ptr< Preceeding > us;
+ map< mid, set<mid> > followers;
+ boost::mutex &x;
+ unsigned magic;
+ void aBreakPoint() { } // for debugging
+ public:
+ // set these to create an assert that
+ // b must never be locked before a
+ // so
+ // a.lock(); b.lock(); is fine
+ // b.lock(); alone is fine too
+ // only checked on _DEBUG builds.
+ string a,b;
+
+ /** outputs some diagnostic info on mutexes (on _DEBUG builds) */
+ void programEnding();
+
+ MutexDebugger();
+
+ string currentlyLocked() const {
+ Preceeding *_preceeding = us.get();
+ if( _preceeding == 0 )
+ return "";
+ Preceeding &preceeding = *_preceeding;
+ stringstream q;
+ for( Preceeding::const_iterator i = preceeding.begin(); i != preceeding.end(); i++ ) {
+ if( i->second > 0 )
+ q << " " << i->first << ' ' << i->second << '\n';
+ }
+ return q.str();
+ }
+
+ void entering(mid m) {
+ if( this == 0 || m == 0 ) return;
+ assert( magic == 0x12345678 );
+
+ Preceeding *_preceeding = us.get();
+ if( _preceeding == 0 )
+ us.reset( _preceeding = new Preceeding() );
+ Preceeding &preceeding = *_preceeding;
+
+ if( a == m ) {
+ aBreakPoint();
+ if( preceeding[b.c_str()] ) {
+ cout << "****** MutexDebugger error! warning " << b << " was locked before " << a << endl;
+ assert(false);
+ }
+ }
+
+ preceeding[m]++;
+ if( preceeding[m] > 1 ) {
+ // recursive re-locking.
+ if( preceeding[m] > maxNest[m] )
+ maxNest[m] = preceeding[m];
+ return;
+ }
+
+ bool failed = false;
+ string err;
+ {
+ boost::mutex::scoped_lock lk(x);
+ followers[m];
+ for( Preceeding::iterator i = preceeding.begin(); i != preceeding.end(); i++ ) {
+ if( m != i->first && i->second > 0 ) {
+ followers[i->first].insert(m);
+ if( followers[m].count(i->first) != 0 ) {
+ failed = true;
+ stringstream ss;
+ mid bad = i->first;
+ ss << "mutex problem" <<
+ "\n when locking " << m <<
+ "\n " << bad << " was already locked and should not be."
+ "\n set a and b above to debug.\n";
+ stringstream q;
+ for( Preceeding::iterator i = preceeding.begin(); i != preceeding.end(); i++ ) {
+ if( i->first != m && i->first != bad && i->second > 0 )
+ q << " " << i->first << '\n';
+ }
+ string also = q.str();
+ if( !also.empty() )
+ ss << "also locked before " << m << " in this thread (no particular order):\n" << also;
+ err = ss.str();
+ break;
+ }
+ }
+ }
+ }
+ if( failed ) {
+ cout << err << endl;
+ assert( 0 );
+ }
+ }
+ void leaving(mid m) {
+ if( this == 0 || m == 0 ) return; // still in startup pre-main()
+ Preceeding& preceeding = *us.get();
+ preceeding[m]--;
+ if( preceeding[m] < 0 ) {
+ cout << "ERROR: lock count for " << m << " is " << preceeding[m] << endl;
+ assert( preceeding[m] >= 0 );
+ }
+ }
+ };
+ extern MutexDebugger &mutexDebugger;
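+
+    /* Usage sketch (illustrative, not part of the original header; mA and mB are
+       hypothetical mongo::mutex instances). scoped_lock calls these for you; the point
+       is that orderings are recorded per thread and a cycle aborts in _DEBUG builds.
+
+           mutexDebugger.entering(mA._name);   // thread now holds A
+           mutexDebugger.entering(mB._name);   // records ordering A -> B
+           mutexDebugger.leaving(mB._name);
+           mutexDebugger.leaving(mA._name);
+           // a thread that later locks B then A would trip the cycle check.
+    */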
+
+}
diff --git a/src/mongo/util/concurrency/mvar.h b/src/mongo/util/concurrency/mvar.h
new file mode 100644
index 00000000000..bc1855a85cc
--- /dev/null
+++ b/src/mongo/util/concurrency/mvar.h
@@ -0,0 +1,118 @@
+// mvar.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace mongo {
+
+    /* This is based on Haskell's MVar synchronization primitive:
+ * http://www.haskell.org/ghc/docs/latest/html/libraries/base/Control-Concurrent-MVar.html
+ *
+ * It is a thread-safe queue that can hold at most one object.
+ * You can also think of it as a box that can be either full or empty.
+ */
+
+ template <typename T>
+ class MVar {
+ public:
+ enum State {EMPTY=0, FULL};
+
+ // create an empty MVar
+ MVar()
+ : _state(EMPTY)
+ {}
+
+ // creates a full MVar
+ MVar(const T& val)
+ : _state(FULL)
+ , _value(val)
+ {}
+
+ // puts val into the MVar and returns true or returns false if full
+ // never blocks
+ bool tryPut(const T& val) {
+ // intentionally repeat test before and after lock
+ if (_state == FULL) return false;
+ Mutex::scoped_lock lock(_mutex);
+ if (_state == FULL) return false;
+
+ _state = FULL;
+ _value = val;
+
+ // unblock threads waiting to 'take'
+ _condition.notify_all();
+
+ return true;
+ }
+
+ // puts val into the MVar
+ // will block if the MVar is already full
+ void put(const T& val) {
+ Mutex::scoped_lock lock(_mutex);
+ while (!tryPut(val)) {
+ // unlocks lock while waiting and relocks before returning
+ _condition.wait(lock);
+ }
+ }
+
+ // takes val out of the MVar and returns true or returns false if empty
+ // never blocks
+ bool tryTake(T& out) {
+ // intentionally repeat test before and after lock
+ if (_state == EMPTY) return false;
+ Mutex::scoped_lock lock(_mutex);
+ if (_state == EMPTY) return false;
+
+ _state = EMPTY;
+ out = _value;
+
+ // unblock threads waiting to 'put'
+ _condition.notify_all();
+
+ return true;
+ }
+
+ // takes val out of the MVar
+ // will block if the MVar is empty
+ T take() {
+ T ret = T();
+
+ Mutex::scoped_lock lock(_mutex);
+ while (!tryTake(ret)) {
+ // unlocks lock while waiting and relocks before returning
+ _condition.wait(lock);
+ }
+
+ return ret;
+ }
+
+
+ // Note: this is fast because there is no locking, but state could
+ // change before you get a chance to act on it.
+ // Mainly useful for sanity checks / asserts.
+ State getState() { return _state; }
+
+
+ private:
+ State _state;
+ T _value;
+ typedef boost::recursive_mutex Mutex;
+ Mutex _mutex;
+ boost::condition _condition;
+ };
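+
+    /* Usage sketch (illustrative, not part of the original header): a one-slot
+       handoff between two threads.
+
+           MVar<int> box;                 // starts EMPTY
+           boost::thread producer( boost::bind( &MVar<int>::put, &box, 42 ) );
+           int v = box.take();            // blocks until the producer's put(); v == 42
+           producer.join();               // box is EMPTY again after take()
+    */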
+
+}
diff --git a/src/mongo/util/concurrency/race.h b/src/mongo/util/concurrency/race.h
new file mode 100644
index 00000000000..837ae23ac13
--- /dev/null
+++ b/src/mongo/util/concurrency/race.h
@@ -0,0 +1,77 @@
+#pragma once
+
+#include "../goodies.h" // printStackTrace
+#include "mutexdebugger.h"
+
+namespace mongo {
+
+ namespace race {
+
+#ifdef _WIN32
+ typedef unsigned threadId_t;
+#else
+ typedef pthread_t threadId_t;
+#endif
+
+#if defined(_DEBUG)
+
+ class Block {
+ volatile int n;
+ unsigned ncalls;
+ const string file;
+ const unsigned line;
+ void fail() {
+ log() << "\n\n\nrace: synchronization (race condition) failure\ncurrent locks this thread (" << getThreadName() << "):\n"
+ << mutexDebugger.currentlyLocked() << endl;
+ printStackTrace();
+ ::abort();
+ }
+ void enter() {
+ if( ++n != 1 ) fail();
+ ncalls++;
+ if( ncalls < 100 ) {
+ sleepmillis(0);
+ }
+ else {
+ RARELY {
+ sleepmillis(0);
+ if( ncalls < 128 * 20 ) {
+ OCCASIONALLY {
+ sleepmillis(3);
+ }
+ }
+ }
+ }
+ }
+ void leave() {
+ if( --n != 0 ) fail();
+ }
+ public:
+ Block(string f, unsigned l) : n(0), ncalls(0), file(f), line(l) { }
+ ~Block() {
+ if( ncalls > 1000000 ) {
+ // just so we know if we are slowing things down
+ log() << "race::Block lots of calls " << file << ' ' << line << " n:" << ncalls << endl;
+ }
+ }
+ class Within {
+ Block& _s;
+ public:
+ Within(Block& s) : _s(s) { _s.enter(); }
+ ~Within() { _s.leave(); }
+ };
+ };
+
+    /* in a read/write-lock situation (multiple concurrent readers) this will fail,
+       so it is not appropriate for such cases. */
+# define RACECHECK \
+ static race::Block __cp(__FILE__, __LINE__); \
+ race::Block::Within __ck(__cp);
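+
+    /* Usage sketch (illustrative): drop RACECHECK at the top of a function that is
+       assumed to never run concurrently; concurrent entry aborts with a stack trace
+       in _DEBUG builds.
+
+           void notConcurrentlyCalled() {
+               RACECHECK   // static Block for this call site + scoped Within
+               // ... body ...
+           }
+    */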
+
+#else
+ /* !_DEBUG */
+# define RACECHECK
+
+#endif
+
+ }
+}
diff --git a/src/mongo/util/concurrency/rwlock.h b/src/mongo/util/concurrency/rwlock.h
new file mode 100644
index 00000000000..3dbfc35ed6e
--- /dev/null
+++ b/src/mongo/util/concurrency/rwlock.h
@@ -0,0 +1,271 @@
+// @file rwlock.h generic reader-writer lock (cross platform support)
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "mutex.h"
+#include "../time_support.h"
+#include "rwlockimpl.h"
+
+#if defined(_DEBUG)
+#include "mutexdebugger.h"
+#endif
+
+namespace mongo {
+
+    /** separated out as the implementation of this may later differ from RWLock
+        depending on the OS, since there is no upgrade etc. facility herein.
+ */
+ class SimpleRWLock : public RWLockBase {
+ public:
+ void lock() { RWLockBase::lock(); }
+ void unlock() { RWLockBase::unlock(); }
+ void lock_shared() { RWLockBase::lock_shared(); }
+ void unlock_shared() { RWLockBase::unlock_shared(); }
+ class Shared : boost::noncopyable {
+ SimpleRWLock& _r;
+ public:
+ Shared(SimpleRWLock& rwlock) : _r(rwlock) {_r.lock_shared(); }
+ ~Shared() { _r.unlock_shared(); }
+ };
+ class Exclusive : boost::noncopyable {
+ SimpleRWLock& _r;
+ public:
+ Exclusive(SimpleRWLock& rwlock) : _r(rwlock) {_r.lock(); }
+ ~Exclusive() { _r.unlock(); }
+ };
+ };
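+
+    /* Usage sketch (illustrative, not part of the original header):
+
+           SimpleRWLock lk;
+           void reader() { SimpleRWLock::Shared    s(lk); }  // many readers may overlap
+           void writer() { SimpleRWLock::Exclusive e(lk); }  // a writer is exclusive
+    */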
+
+ class RWLock : public RWLockBase {
+ enum { NilState, UpgradableState, Exclusive } x; // only bother to set when doing upgradable related things
+ public:
+ const char * const _name;
+ RWLock(const char *name) : _name(name) {
+ x = NilState;
+ }
+ void lock() {
+ RWLockBase::lock();
+#if defined(_DEBUG)
+ mutexDebugger.entering(_name);
+#endif
+ }
+ void unlock() {
+#if defined(_DEBUG)
+ mutexDebugger.leaving(_name);
+#endif
+ RWLockBase::unlock();
+ }
+
+ void lock_shared() { RWLockBase::lock_shared(); }
+ void unlock_shared() { RWLockBase::unlock_shared(); }
+ private:
+ void lockAsUpgradable() { RWLockBase::lockAsUpgradable(); }
+ void unlockFromUpgradable() { // upgradable -> unlocked
+ RWLockBase::unlockFromUpgradable();
+ }
+ public:
+ void upgrade() { // upgradable -> exclusive lock
+ assert( x == UpgradableState );
+ RWLockBase::upgrade();
+ x = Exclusive;
+ }
+
+ bool lock_shared_try( int millis ) { return RWLockBase::lock_shared_try(millis); }
+
+ bool lock_try( int millis = 0 ) {
+ if( RWLockBase::lock_try(millis) ) {
+#if defined(_DEBUG)
+ mutexDebugger.entering(_name);
+#endif
+ return true;
+ }
+ return false;
+ }
+
+ /** acquire upgradable state. You must be unlocked before creating.
+ unlocks on destruction, whether in upgradable state or upgraded to exclusive
+ in the interim.
+ */
+ class Upgradable : boost::noncopyable {
+ RWLock& _r;
+ public:
+ Upgradable(RWLock& r) : _r(r) {
+ r.lockAsUpgradable();
+ assert( _r.x == NilState );
+ _r.x = RWLock::UpgradableState;
+ }
+ ~Upgradable() {
+ if( _r.x == RWLock::UpgradableState ) {
+ _r.x = NilState;
+ _r.unlockFromUpgradable();
+ }
+ else {
+ //TEMP assert( _r.x == Exclusive ); // has been upgraded
+ _r.x = NilState;
+ _r.unlock();
+ }
+ }
+ };
+ };
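+
+    /* Usage sketch (illustrative, not part of the original header; needWrite is a
+       hypothetical predicate): read in upgradable state, escalating only if needed.
+
+           RWLock lk("mylock");
+           {
+               RWLock::Upgradable up(lk);  // upgradable: other readers still allowed
+               if( needWrite() )
+                   lk.upgrade();           // upgradable -> exclusive
+           }                               // releases whichever state is held
+    */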
+
+ /** throws on failure to acquire in the specified time period. */
+ class rwlock_try_write : boost::noncopyable {
+ public:
+ struct exception { };
+ rwlock_try_write(RWLock& l, int millis = 0) : _l(l) {
+ if( !l.lock_try(millis) )
+ throw exception();
+ }
+ ~rwlock_try_write() { _l.unlock(); }
+ private:
+ RWLock& _l;
+ };
+
+ class rwlock_shared : boost::noncopyable {
+ public:
+ rwlock_shared(RWLock& rwlock) : _r(rwlock) {_r.lock_shared(); }
+ ~rwlock_shared() { _r.unlock_shared(); }
+ private:
+ RWLock& _r;
+ };
+
+ /* scoped lock for RWLock */
+ class rwlock : boost::noncopyable {
+ public:
+ /**
+         * @param write acquire the write (exclusive) lock if true, the shared lock if false
+         * @param lowPriorityWaitMS if > 0, will try to get the lock non-greedily for that many ms
+ */
+ rwlock( const RWLock& lock , bool write, /* bool alreadyHaveLock = false , */int lowPriorityWaitMS = 0 )
+ : _lock( (RWLock&)lock ) , _write( write ) {
+ {
+ if ( _write ) {
+ _lock.lock();
+ }
+ else {
+ _lock.lock_shared();
+ }
+ }
+ }
+ ~rwlock() {
+ if ( _write )
+ _lock.unlock();
+ else
+ _lock.unlock_shared();
+ }
+ private:
+ RWLock& _lock;
+ const bool _write;
+ };
+
+ // ----------------------------------------------------------------------------------------
+
+ /** recursive on shared locks is ok for this implementation */
+ class RWLockRecursive : protected RWLockBase {
+ protected:
+ ThreadLocalValue<int> _state;
+ void lock(); // not implemented - Lock() should be used; didn't overload this name to avoid mistakes
+ virtual void Lock() { RWLockBase::lock(); }
+ public:
+ virtual ~RWLockRecursive() { }
+ const char * const _name;
+ RWLockRecursive(const char *name) : _name(name) { }
+
+ void assertExclusivelyLocked() {
+ assert( _state.get() < 0 );
+ }
+
+ class Exclusive : boost::noncopyable {
+ RWLockRecursive& _r;
+ public:
+ Exclusive(RWLockRecursive& r) : _r(r) {
+ int s = _r._state.get();
+ dassert( s <= 0 );
+ if( s == 0 )
+ _r.Lock();
+ _r._state.set(s-1);
+ }
+ ~Exclusive() {
+ int s = _r._state.get();
+ DEV wassert( s < 0 ); // wassert: don't throw from destructors
+ ++s;
+ _r._state.set(s);
+ if ( s == 0 )
+ _r.unlock();
+ }
+ };
+
+ class Shared : boost::noncopyable {
+ RWLockRecursive& _r;
+ bool _alreadyLockedExclusiveByUs;
+ public:
+ Shared(RWLockRecursive& r) : _r(r) {
+ int s = _r._state.get();
+ _alreadyLockedExclusiveByUs = s < 0;
+ if( !_alreadyLockedExclusiveByUs ) {
+ dassert( s >= 0 ); // -1 would mean exclusive
+ if( s == 0 )
+ _r.lock_shared();
+ _r._state.set(s+1);
+ }
+ }
+ ~Shared() {
+ if( _alreadyLockedExclusiveByUs ) {
+ DEV wassert( _r._state.get() < 0 );
+ }
+ else {
+ int s = _r._state.get() - 1;
+ DEV wassert( s >= 0 );
+ _r._state.set(s);
+ if( s == 0 )
+ _r.unlock_shared();
+ }
+ }
+ };
+ };
+
+ class RWLockRecursiveNongreedy : public RWLockRecursive {
+ virtual void Lock() {
+ bool got = false;
+ for ( int i=0; i<lowPriorityWaitMS; i++ ) {
+ if ( lock_try(0) ) {
+ got = true;
+ break;
+ }
+ int sleep = 1;
+ if ( i > ( lowPriorityWaitMS / 20 ) )
+ sleep = 10;
+ sleepmillis(sleep);
+ i += ( sleep - 1 );
+ }
+ if ( ! got ) {
+ log() << "couldn't lazily get rwlock" << endl;
+ RWLockBase::lock();
+ }
+ }
+
+ public:
+ const int lowPriorityWaitMS;
+ RWLockRecursiveNongreedy(const char *nm, int lpwaitms) : RWLockRecursive(nm), lowPriorityWaitMS(lpwaitms) { }
+ const char * implType() const { return RWLockRecursive::implType(); }
+
+ //just for testing:
+ bool __lock_try( int millis ) { return RWLockRecursive::lock_try(millis); }
+ };
+
+}
diff --git a/src/mongo/util/concurrency/rwlockimpl.h b/src/mongo/util/concurrency/rwlockimpl.h
new file mode 100644
index 00000000000..4e07231447b
--- /dev/null
+++ b/src/mongo/util/concurrency/rwlockimpl.h
@@ -0,0 +1,170 @@
+// @file rwlockimpl.h
+
+#pragma once
+
+#if defined(MONGO_USE_SRW_ON_WINDOWS) && defined(_WIN32)
+
+// windows slimreaderwriter version. newer windows versions only
+
+namespace mongo {
+ class RWLockBase : boost::noncopyable {
+ SRWLOCK _lock;
+ protected:
+ RWLockBase() { InitializeSRWLock(&_lock); }
+ ~RWLockBase() {
+ // no special action needed to destroy a SRWLOCK
+ }
+ void lock() { AcquireSRWLockExclusive(&_lock); }
+ void unlock() { ReleaseSRWLockExclusive(&_lock); }
+ void lock_shared() { AcquireSRWLockShared(&_lock); }
+ void unlock_shared() { ReleaseSRWLockShared(&_lock); }
+ bool lock_shared_try( int millis ) {
+ if( TryAcquireSRWLockShared(&_lock) )
+ return true;
+ if( millis == 0 )
+ return false;
+ unsigned long long end = curTimeMicros64() + millis*1000;
+ while( 1 ) {
+ Sleep(1);
+ if( TryAcquireSRWLockShared(&_lock) )
+ return true;
+ if( curTimeMicros64() >= end )
+ break;
+ }
+ return false;
+ }
+ bool lock_try( int millis = 0 ) {
+ if( TryAcquireSRWLockExclusive(&_lock) ) // quick check to optimistically avoid calling curTimeMicros64
+ return true;
+ if( millis == 0 )
+ return false;
+ unsigned long long end = curTimeMicros64() + millis*1000;
+ do {
+ Sleep(1);
+ if( TryAcquireSRWLockExclusive(&_lock) )
+ return true;
+ } while( curTimeMicros64() < end );
+ return false;
+ }
+ // no upgradable for this impl
+ void lockAsUpgradable() { lock(); }
+ void unlockFromUpgradable() { unlock(); }
+ void upgrade() { }
+ public:
+ const char * implType() const { return "WINSRW"; }
+ };
+}
+
+#elif( BOOST_VERSION < 103500 )
+
+# if defined(_WIN32)
+# error need boost >= 1.35 for windows
+# endif
+
+// pthreads version
+
+# include <pthread.h>
+
+namespace mongo {
+ class RWLockBase : boost::noncopyable {
+ pthread_rwlock_t _lock;
+ static void check( int x ) {
+ if( x == 0 ) return;
+ log() << "pthread rwlock failed: " << x << endl;
+ assert( x == 0 );
+ }
+
+ ~RWLockBase() {
+ if ( ! StaticObserver::_destroyingStatics ) {
+ wassert( pthread_rwlock_destroy( &_lock ) == 0 ); // wassert as don't want to throw from a destructor
+ }
+ }
+
+ protected:
+ RWLockBase() {
+ check( pthread_rwlock_init( &_lock , 0 ) );
+ }
+
+ void lock() { check( pthread_rwlock_wrlock( &_lock ) ); }
+ void unlock() { check( pthread_rwlock_unlock( &_lock ) ); }
+ void lock_shared() { check( pthread_rwlock_rdlock( &_lock ) ); }
+ void unlock_shared() { check( pthread_rwlock_unlock( &_lock ) ); }
+ bool lock_shared_try( int millis ) { return _try( millis , false ); }
+ bool lock_try( int millis = 0 ) { return _try( millis , true ); }
+ bool _try( int millis , bool write ) {
+ while ( true ) {
+ int x = write ?
+ pthread_rwlock_trywrlock( &_lock ) :
+ pthread_rwlock_tryrdlock( &_lock );
+ if ( x <= 0 )
+ return true;
+ if ( millis-- <= 0 )
+ return false;
+ if ( x == EBUSY ) {
+ sleepmillis(1);
+ continue;
+ }
+ check(x);
+ }
+ return false;
+ }
+ // no upgradable for this impl
+ void lockAsUpgradable() { lock(); }
+ void unlockFromUpgradable() { unlock(); }
+ void upgrade() { }
+ public:
+ const char * implType() const { return "posix"; }
+ };
+}
+
+#else
+
+// Boost version
+
+# if defined(_WIN32)
+# include "shared_mutex_win.hpp"
+namespace mongo { typedef boost::modified_shared_mutex shared_mutex; }
+# else
+# include <boost/thread/shared_mutex.hpp>
+namespace mongo { using boost::shared_mutex; }
+# endif
+# undef assert
+# define assert MONGO_assert
+
+namespace mongo {
+ class RWLockBase : boost::noncopyable {
+ shared_mutex _m;
+ protected:
+ void lock() {
+ _m.lock();
+ }
+ void unlock() {
+ _m.unlock();
+ }
+ void lockAsUpgradable() {
+ _m.lock_upgrade();
+ }
+ void unlockFromUpgradable() { // upgradable -> unlocked
+ _m.unlock_upgrade();
+ }
+ void upgrade() { // upgradable -> exclusive lock
+ _m.unlock_upgrade_and_lock();
+ }
+ void lock_shared() {
+ _m.lock_shared();
+ }
+ void unlock_shared() {
+ _m.unlock_shared();
+ }
+ bool lock_shared_try( int millis ) {
+ return _m.timed_lock_shared( boost::posix_time::milliseconds(millis) );
+ }
+ bool lock_try( int millis = 0 ) {
+ return _m.timed_lock( boost::posix_time::milliseconds(millis) );
+ }
+ public:
+ const char * implType() const { return "boost"; }
+ };
+}
+
+#endif
diff --git a/src/mongo/util/concurrency/shared_mutex_win.hpp b/src/mongo/util/concurrency/shared_mutex_win.hpp
new file mode 100644
index 00000000000..e850fc6bab4
--- /dev/null
+++ b/src/mongo/util/concurrency/shared_mutex_win.hpp
@@ -0,0 +1,594 @@
+#ifndef BOOST_THREAD_WIN32_SHARED_MUTEX_HPP_MODIFIED
+#define BOOST_THREAD_WIN32_SHARED_MUTEX_HPP_MODIFIED
+
+// (C) Copyright 2006-8 Anthony Williams
+//
+// Distributed under the Boost Software License, Version 1.0. (See
+// accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt)
+
+/* MongoDB :
+ Slightly modified boost file to not die above 127 pending writes
+ Here is what changed (from boost 1.42.0 shared_mutex.hpp):
+ 1,2c1,2
+ < #ifndef BOOST_THREAD_WIN32_SHARED_MUTEX_HPP
+ < #define BOOST_THREAD_WIN32_SHARED_MUTEX_HPP
+ ---
+ > #ifndef BOOST_THREAD_WIN32_SHARED_MUTEX_HPP_MODIFIED
+ > #define BOOST_THREAD_WIN32_SHARED_MUTEX_HPP_MODIFIED
+ 22c27
+ < class shared_mutex:
+ ---
+ > class modified_shared_mutex:
+ 73c78
+ < shared_mutex():
+ ---
+ > modified_shared_mutex():
+ 84c89
+ < ~shared_mutex()
+ ---
+ > ~modified_shared_mutex()
+ 283a289,290
+ > if( new_state.exclusive_waiting == 127 ) // the maximum already!
+ > break;
+*/
+
+#include <boost/assert.hpp>
+#include <boost/detail/interlocked.hpp>
+#include <boost/thread/win32/thread_primitives.hpp>
+#include <boost/static_assert.hpp>
+#include <limits.h>
+#include <boost/utility.hpp>
+#include <boost/thread/thread_time.hpp>
+
+#include <boost/config/abi_prefix.hpp>
+
+namespace boost
+{
+ class modified_shared_mutex:
+ private boost::noncopyable
+ {
+ private:
+ struct state_data
+ {
+ unsigned shared_count:11,
+ shared_waiting:11,
+ exclusive:1,
+ upgrade:1,
+ exclusive_waiting:7,
+ exclusive_waiting_blocked:1;
+
+ friend bool operator==(state_data const& lhs,state_data const& rhs)
+ {
+ return *reinterpret_cast<unsigned const*>(&lhs)==*reinterpret_cast<unsigned const*>(&rhs);
+ }
+ };
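+
+        // Annotation (added for clarity, not in the boost original): the bit widths
+        // bound the counts -- at most 2047 concurrent readers (shared_count:11) and
+        // 127 pending writers (exclusive_waiting:7), which is why timed_lock() below
+        // refuses to push exclusive_waiting past 127 (the MongoDB modification
+        // described above).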
+
+
+ template<typename T>
+ T interlocked_compare_exchange(T* target,T new_value,T comparand)
+ {
+ BOOST_STATIC_ASSERT(sizeof(T)==sizeof(long));
+ long const res=BOOST_INTERLOCKED_COMPARE_EXCHANGE(reinterpret_cast<long*>(target),
+ *reinterpret_cast<long*>(&new_value),
+ *reinterpret_cast<long*>(&comparand));
+ return *reinterpret_cast<T const*>(&res);
+ }
+
+ state_data state;
+ detail::win32::handle semaphores[2];
+ detail::win32::handle &unlock_sem;
+ detail::win32::handle &exclusive_sem;
+ detail::win32::handle upgrade_sem;
+
+ void release_waiters(state_data old_state)
+ {
+ if(old_state.exclusive_waiting)
+ {
+ BOOST_VERIFY(detail::win32::ReleaseSemaphore(exclusive_sem,1,0)!=0);
+ }
+
+ if(old_state.shared_waiting || old_state.exclusive_waiting)
+ {
+ BOOST_VERIFY(detail::win32::ReleaseSemaphore(unlock_sem,old_state.shared_waiting + (old_state.exclusive_waiting?1:0),0)!=0);
+ }
+ }
+
+
+ public:
+ modified_shared_mutex():
+ unlock_sem(semaphores[0]),
+ exclusive_sem(semaphores[1])
+ {
+ unlock_sem=detail::win32::create_anonymous_semaphore(0,LONG_MAX);
+ exclusive_sem=detail::win32::create_anonymous_semaphore(0,LONG_MAX);
+ upgrade_sem=detail::win32::create_anonymous_semaphore(0,LONG_MAX);
+ state_data state_={0};
+ state=state_;
+ }
+
+ ~modified_shared_mutex()
+ {
+ detail::win32::CloseHandle(upgrade_sem);
+ detail::win32::CloseHandle(unlock_sem);
+ detail::win32::CloseHandle(exclusive_sem);
+ }
+
+ bool try_lock_shared()
+ {
+ state_data old_state=state;
+ for(;;)
+ {
+ state_data new_state=old_state;
+ if(!new_state.exclusive && !new_state.exclusive_waiting_blocked)
+ {
+ ++new_state.shared_count;
+ }
+
+ state_data const current_state=interlocked_compare_exchange(&state,new_state,old_state);
+ if(current_state==old_state)
+ {
+ break;
+ }
+ old_state=current_state;
+ }
+ return !(old_state.exclusive| old_state.exclusive_waiting_blocked);
+ }
+
+ void lock_shared()
+ {
+ BOOST_VERIFY(timed_lock_shared(::boost::detail::get_system_time_sentinel()));
+ }
+
+ template<typename TimeDuration>
+ bool timed_lock_shared(TimeDuration const & relative_time)
+ {
+ return timed_lock_shared(get_system_time()+relative_time);
+ }
+
+ bool timed_lock_shared(boost::system_time const& wait_until)
+ {
+ for(;;)
+ {
+ state_data old_state=state;
+ for(;;)
+ {
+ state_data new_state=old_state;
+ if(new_state.exclusive || new_state.exclusive_waiting_blocked)
+ {
+ ++new_state.shared_waiting;
+ }
+ else
+ {
+ ++new_state.shared_count;
+ }
+
+ state_data const current_state=interlocked_compare_exchange(&state,new_state,old_state);
+ if(current_state==old_state)
+ {
+ break;
+ }
+ old_state=current_state;
+ }
+
+ if(!(old_state.exclusive| old_state.exclusive_waiting_blocked))
+ {
+ return true;
+ }
+
+ unsigned long const res=detail::win32::WaitForSingleObject(unlock_sem,::boost::detail::get_milliseconds_until(wait_until));
+ if(res==detail::win32::timeout)
+ {
+ for(;;)
+ {
+ state_data new_state=old_state;
+ if(new_state.exclusive || new_state.exclusive_waiting_blocked)
+ {
+ if(new_state.shared_waiting)
+ {
+ --new_state.shared_waiting;
+ }
+ }
+ else
+ {
+ ++new_state.shared_count;
+ }
+
+ state_data const current_state=interlocked_compare_exchange(&state,new_state,old_state);
+ if(current_state==old_state)
+ {
+ break;
+ }
+ old_state=current_state;
+ }
+
+ if(!(old_state.exclusive| old_state.exclusive_waiting_blocked))
+ {
+ return true;
+ }
+ return false;
+ }
+
+ BOOST_ASSERT(res==0);
+ }
+ }
+
+ void unlock_shared()
+ {
+ state_data old_state=state;
+ for(;;)
+ {
+ state_data new_state=old_state;
+ bool const last_reader=!--new_state.shared_count;
+
+ if(last_reader)
+ {
+ if(new_state.upgrade)
+ {
+ new_state.upgrade=false;
+ new_state.exclusive=true;
+ }
+ else
+ {
+ if(new_state.exclusive_waiting)
+ {
+ --new_state.exclusive_waiting;
+ new_state.exclusive_waiting_blocked=false;
+ }
+ new_state.shared_waiting=0;
+ }
+ }
+
+ state_data const current_state=interlocked_compare_exchange(&state,new_state,old_state);
+ if(current_state==old_state)
+ {
+ if(last_reader)
+ {
+ if(old_state.upgrade)
+ {
+ BOOST_VERIFY(detail::win32::ReleaseSemaphore(upgrade_sem,1,0)!=0);
+ }
+ else
+ {
+ release_waiters(old_state);
+ }
+ }
+ break;
+ }
+ old_state=current_state;
+ }
+ }
+
+ void lock()
+ {
+ BOOST_VERIFY(timed_lock(::boost::detail::get_system_time_sentinel()));
+ }
+
+ template<typename TimeDuration>
+ bool timed_lock(TimeDuration const & relative_time)
+ {
+ return timed_lock(get_system_time()+relative_time);
+ }
+
+ bool try_lock()
+ {
+ state_data old_state=state;
+ for(;;)
+ {
+ state_data new_state=old_state;
+ if(new_state.shared_count || new_state.exclusive)
+ {
+ return false;
+ }
+ else
+ {
+ new_state.exclusive=true;
+ }
+
+ state_data const current_state=interlocked_compare_exchange(&state,new_state,old_state);
+ if(current_state==old_state)
+ {
+ break;
+ }
+ old_state=current_state;
+ }
+ return true;
+ }
+
+
+ bool timed_lock(boost::system_time const& wait_until)
+ {
+ for(;;)
+ {
+ state_data old_state=state;
+
+ for(;;)
+ {
+ state_data new_state=old_state;
+ if(new_state.shared_count || new_state.exclusive)
+ {
+ if( new_state.exclusive_waiting == 127 ) // the maximum already!
+ break;
+ ++new_state.exclusive_waiting;
+ new_state.exclusive_waiting_blocked=true;
+ }
+ else
+ {
+ new_state.exclusive=true;
+ }
+
+ state_data const current_state=interlocked_compare_exchange(&state,new_state,old_state);
+ if(current_state==old_state)
+ {
+ break;
+ }
+ old_state=current_state;
+ }
+
+ if(!old_state.shared_count && !old_state.exclusive)
+ {
+ return true;
+ }
+ unsigned long const wait_res=detail::win32::WaitForMultipleObjects(2,semaphores,true,::boost::detail::get_milliseconds_until(wait_until));
+ if(wait_res==detail::win32::timeout)
+ {
+ for(;;)
+ {
+ state_data new_state=old_state;
+ if(new_state.shared_count || new_state.exclusive)
+ {
+ if(new_state.exclusive_waiting)
+ {
+ if(!--new_state.exclusive_waiting)
+ {
+ new_state.exclusive_waiting_blocked=false;
+ }
+ }
+ }
+ else
+ {
+ new_state.exclusive=true;
+ }
+
+ state_data const current_state=interlocked_compare_exchange(&state,new_state,old_state);
+ if(current_state==old_state)
+ {
+ break;
+ }
+ old_state=current_state;
+ }
+ if(!old_state.shared_count && !old_state.exclusive)
+ {
+ return true;
+ }
+ return false;
+ }
+ BOOST_ASSERT(wait_res<2);
+ }
+ }
+
+ void unlock()
+ {
+ state_data old_state=state;
+ for(;;)
+ {
+ state_data new_state=old_state;
+ new_state.exclusive=false;
+ if(new_state.exclusive_waiting)
+ {
+ --new_state.exclusive_waiting;
+ new_state.exclusive_waiting_blocked=false;
+ }
+ new_state.shared_waiting=0;
+
+ state_data const current_state=interlocked_compare_exchange(&state,new_state,old_state);
+ if(current_state==old_state)
+ {
+ break;
+ }
+ old_state=current_state;
+ }
+ release_waiters(old_state);
+ }
+
+ void lock_upgrade()
+ {
+ for(;;)
+ {
+ state_data old_state=state;
+ for(;;)
+ {
+ state_data new_state=old_state;
+ if(new_state.exclusive || new_state.exclusive_waiting_blocked || new_state.upgrade)
+ {
+ ++new_state.shared_waiting;
+ }
+ else
+ {
+ ++new_state.shared_count;
+ new_state.upgrade=true;
+ }
+
+ state_data const current_state=interlocked_compare_exchange(&state,new_state,old_state);
+ if(current_state==old_state)
+ {
+ break;
+ }
+ old_state=current_state;
+ }
+
+ if(!(old_state.exclusive|| old_state.exclusive_waiting_blocked|| old_state.upgrade))
+ {
+ return;
+ }
+
+ BOOST_VERIFY(!detail::win32::WaitForSingleObject(unlock_sem,detail::win32::infinite));
+ }
+ }
+
+ bool try_lock_upgrade()
+ {
+ state_data old_state=state;
+ for(;;)
+ {
+ state_data new_state=old_state;
+ if(new_state.exclusive || new_state.exclusive_waiting_blocked || new_state.upgrade)
+ {
+ return false;
+ }
+ else
+ {
+ ++new_state.shared_count;
+ new_state.upgrade=true;
+ }
+
+ state_data const current_state=interlocked_compare_exchange(&state,new_state,old_state);
+ if(current_state==old_state)
+ {
+ break;
+ }
+ old_state=current_state;
+ }
+ return true;
+ }
+
+ void unlock_upgrade()
+ {
+ state_data old_state=state;
+ for(;;)
+ {
+ state_data new_state=old_state;
+ new_state.upgrade=false;
+ bool const last_reader=!--new_state.shared_count;
+
+ if(last_reader)
+ {
+ if(new_state.exclusive_waiting)
+ {
+ --new_state.exclusive_waiting;
+ new_state.exclusive_waiting_blocked=false;
+ }
+ new_state.shared_waiting=0;
+ }
+
+ state_data const current_state=interlocked_compare_exchange(&state,new_state,old_state);
+ if(current_state==old_state)
+ {
+ if(last_reader)
+ {
+ release_waiters(old_state);
+ }
+ break;
+ }
+ old_state=current_state;
+ }
+ }
+
+ void unlock_upgrade_and_lock()
+ {
+ state_data old_state=state;
+ for(;;)
+ {
+ state_data new_state=old_state;
+ bool const last_reader=!--new_state.shared_count;
+
+ if(last_reader)
+ {
+ new_state.upgrade=false;
+ new_state.exclusive=true;
+ }
+
+ state_data const current_state=interlocked_compare_exchange(&state,new_state,old_state);
+ if(current_state==old_state)
+ {
+ if(!last_reader)
+ {
+ BOOST_VERIFY(!detail::win32::WaitForSingleObject(upgrade_sem,detail::win32::infinite));
+ }
+ break;
+ }
+ old_state=current_state;
+ }
+ }
+
+ void unlock_and_lock_upgrade()
+ {
+ state_data old_state=state;
+ for(;;)
+ {
+ state_data new_state=old_state;
+ new_state.exclusive=false;
+ new_state.upgrade=true;
+ ++new_state.shared_count;
+ if(new_state.exclusive_waiting)
+ {
+ --new_state.exclusive_waiting;
+ new_state.exclusive_waiting_blocked=false;
+ }
+ new_state.shared_waiting=0;
+
+ state_data const current_state=interlocked_compare_exchange(&state,new_state,old_state);
+ if(current_state==old_state)
+ {
+ break;
+ }
+ old_state=current_state;
+ }
+ release_waiters(old_state);
+ }
+
+ void unlock_and_lock_shared()
+ {
+ state_data old_state=state;
+ for(;;)
+ {
+ state_data new_state=old_state;
+ new_state.exclusive=false;
+ ++new_state.shared_count;
+ if(new_state.exclusive_waiting)
+ {
+ --new_state.exclusive_waiting;
+ new_state.exclusive_waiting_blocked=false;
+ }
+ new_state.shared_waiting=0;
+
+ state_data const current_state=interlocked_compare_exchange(&state,new_state,old_state);
+ if(current_state==old_state)
+ {
+ break;
+ }
+ old_state=current_state;
+ }
+ release_waiters(old_state);
+ }
+
+ void unlock_upgrade_and_lock_shared()
+ {
+ state_data old_state=state;
+ for(;;)
+ {
+ state_data new_state=old_state;
+ new_state.upgrade=false;
+ if(new_state.exclusive_waiting)
+ {
+ --new_state.exclusive_waiting;
+ new_state.exclusive_waiting_blocked=false;
+ }
+ new_state.shared_waiting=0;
+
+ state_data const current_state=interlocked_compare_exchange(&state,new_state,old_state);
+ if(current_state==old_state)
+ {
+ break;
+ }
+ old_state=current_state;
+ }
+ release_waiters(old_state);
+ }
+
+ };
+}
+
+#include <boost/config/abi_suffix.hpp>
+
+#endif
diff --git a/src/mongo/util/concurrency/spin_lock.cpp b/src/mongo/util/concurrency/spin_lock.cpp
new file mode 100644
index 00000000000..cbf517b2746
--- /dev/null
+++ b/src/mongo/util/concurrency/spin_lock.cpp
@@ -0,0 +1,107 @@
+// spin_lock.cpp
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h" // todo eliminate this include
+#include <time.h>
+#include "spin_lock.h"
+
+namespace mongo {
+
+ SpinLock::~SpinLock() {
+#if defined(_WIN32)
+ DeleteCriticalSection(&_cs);
+#elif defined(__USE_XOPEN2K)
+ pthread_spin_destroy(&_lock);
+#endif
+ }
+
+ SpinLock::SpinLock()
+#if defined(_WIN32)
+ { InitializeCriticalSectionAndSpinCount(&_cs, 4000); }
+#elif defined(__USE_XOPEN2K)
+ { pthread_spin_init( &_lock , 0 ); }
+#elif defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4)
+ : _locked( false ) { }
+#else
+ : _mutex( "SpinLock" ) { }
+#endif
+
+#if defined(__USE_XOPEN2K)
+ NOINLINE_DECL void SpinLock::_lk() {
+ /**
+         * this is designed to perform close to the default spin lock.
+         * the reason for the mild insanity is to prevent horrible performance
+         * when contention spikes; it allows spinlocks to be used in many more
+         * places, which is good because even with this change they are about
+         * 8x faster on linux.
+ */
+
+ for ( int i=0; i<1000; i++ ) {
+ if ( pthread_spin_trylock( &_lock ) == 0 )
+ return;
+ asm volatile ( "pause" ) ; // maybe trylock does this; just in case.
+ }
+
+ for ( int i=0; i<1000; i++ ) {
+ if ( pthread_spin_trylock( &_lock ) == 0 )
+ return;
+ pthread_yield();
+ }
+
+ struct timespec t;
+ t.tv_sec = 0;
+ t.tv_nsec = 5000000;
+
+ while ( pthread_spin_trylock( &_lock ) != 0 ) {
+ nanosleep(&t, NULL);
+ }
+ }
+#elif defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4)
+ void SpinLock::lock() {
+
+ // fast path
+ if (!_locked && !__sync_lock_test_and_set(&_locked, true)) {
+ return;
+ }
+
+ // wait for lock
+ int wait = 1000;
+ while ((wait-- > 0) && (_locked)) {
+ asm volatile ( "pause" ) ;
+ }
+
+ // if failed to grab lock, sleep
+ struct timespec t;
+ t.tv_sec = 0;
+ t.tv_nsec = 5000000;
+ while (__sync_lock_test_and_set(&_locked, true)) {
+ nanosleep(&t, NULL);
+ }
+ }
+#endif
+
+ bool SpinLock::isfast() {
+#if defined(_WIN32) || defined(__USE_XOPEN2K) || defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4)
+ return true;
+#else
+ return false;
+#endif
+ }
+
+
+} // namespace mongo
diff --git a/src/mongo/util/concurrency/spin_lock.h b/src/mongo/util/concurrency/spin_lock.h
new file mode 100644
index 00000000000..d90de51afac
--- /dev/null
+++ b/src/mongo/util/concurrency/spin_lock.h
@@ -0,0 +1,77 @@
+// spin_lock.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "mutex.h"
+
+namespace mongo {
+
+ /**
+     * The spinlock currently requires recent GCC support routines to be efficient.
+     * Other platforms default to a mutex implementation.
+ */
+ class SpinLock : boost::noncopyable {
+ public:
+ SpinLock();
+ ~SpinLock();
+
+ static bool isfast(); // true if a real spinlock on this platform
+
+ private:
+#if defined(_WIN32)
+ CRITICAL_SECTION _cs;
+ public:
+ void lock() {EnterCriticalSection(&_cs); }
+ void unlock() { LeaveCriticalSection(&_cs); }
+#elif defined(__USE_XOPEN2K)
+ pthread_spinlock_t _lock;
+ void _lk();
+ public:
+ void unlock() { pthread_spin_unlock(&_lock); }
+ void lock() {
+ if ( pthread_spin_trylock( &_lock ) == 0 )
+ return;
+ _lk();
+ }
+#elif defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4)
+ volatile bool _locked;
+ public:
+ void unlock() {__sync_lock_release(&_locked); }
+ void lock();
+#else
+ // default to a mutex if not implemented
+ SimpleMutex _mutex;
+ public:
+ void unlock() { _mutex.unlock(); }
+ void lock() { _mutex.lock(); }
+#endif
+ };
+
+ class scoped_spinlock : boost::noncopyable {
+ public:
+ scoped_spinlock( SpinLock& l ) : _l(l) {
+ _l.lock();
+ }
+ ~scoped_spinlock() {
+ _l.unlock();}
+ private:
+ SpinLock& _l;
+ };
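+
+    /* Usage sketch (illustrative, not part of the original header): prefer the RAII
+       wrapper to manual lock()/unlock(), and keep the critical section tiny.
+
+           static SpinLock slock;
+           void bumpCounter(int& counter) {
+               scoped_spinlock lk(slock);
+               counter++;                 // very short work; contending threads spin
+           }
+    */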
+
+} // namespace mongo
diff --git a/src/mongo/util/concurrency/synchronization.cpp b/src/mongo/util/concurrency/synchronization.cpp
new file mode 100644
index 00000000000..4186745dc16
--- /dev/null
+++ b/src/mongo/util/concurrency/synchronization.cpp
@@ -0,0 +1,81 @@
+// synchronization.cpp
+
+/* Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "synchronization.h"
+
+namespace mongo {
+
+ Notification::Notification() : _mutex ( "Notification" ){
+ lookFor = 1;
+ cur = 0;
+ }
+
+ Notification::~Notification() { }
+
+ void Notification::waitToBeNotified() {
+ scoped_lock lock( _mutex );
+ while ( lookFor != cur )
+ _condition.wait( lock.boost() );
+ lookFor++;
+ }
+
+ void Notification::notifyOne() {
+ scoped_lock lock( _mutex );
+ assert( cur != lookFor );
+ cur++;
+ _condition.notify_one();
+ }
+
+ /* --- NotifyAll --- */
+
+ NotifyAll::NotifyAll() : _mutex("NotifyAll") {
+ _lastDone = 0;
+ _lastReturned = 0;
+ _nWaiting = 0;
+ }
+
+ NotifyAll::When NotifyAll::now() {
+ scoped_lock lock( _mutex );
+ return ++_lastReturned;
+ }
+
+ void NotifyAll::waitFor(When e) {
+ scoped_lock lock( _mutex );
+ ++_nWaiting;
+ while( _lastDone < e ) {
+ _condition.wait( lock.boost() );
+ }
+ }
+
+ void NotifyAll::awaitBeyondNow() {
+ scoped_lock lock( _mutex );
+ ++_nWaiting;
+ When e = ++_lastReturned;
+ while( _lastDone <= e ) {
+ _condition.wait( lock.boost() );
+ }
+ }
+
+ void NotifyAll::notifyAll(When e) {
+ scoped_lock lock( _mutex );
+ _lastDone = e;
+ _nWaiting = 0;
+ _condition.notify_all();
+ }
+
+} // namespace mongo
diff --git a/src/mongo/util/concurrency/synchronization.h b/src/mongo/util/concurrency/synchronization.h
new file mode 100644
index 00000000000..f9a40cc3ab9
--- /dev/null
+++ b/src/mongo/util/concurrency/synchronization.h
@@ -0,0 +1,86 @@
+// synchronization.h
+
+/* Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <boost/thread/condition.hpp>
+#include "mutex.h"
+
+namespace mongo {
+
+ /*
+ * A class to establish a synchronization point between two threads. One thread is the waiter and one is
+ * the notifier. After the notification event, both proceed normally.
+ *
+ * This class is thread-safe.
+ */
+ class Notification {
+ public:
+ Notification();
+ ~Notification();
+
+ /*
+ * Blocks until the method 'notifyOne()' is called.
+ */
+ void waitToBeNotified();
+
+ /*
+ * Notifies the waiter of '*this' that it can proceed. Can only be called once.
+ */
+ void notifyOne();
+
+ private:
+ mongo::mutex _mutex; // protects state below
+ unsigned long long lookFor;
+ unsigned long long cur;
+        boost::condition _condition; // signaled when cur catches up to lookFor
+ };
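+
+    /* Usage sketch (illustrative, not part of the original header): one-shot handshake.
+
+           Notification n;
+           boost::thread t( boost::bind( &Notification::notifyOne, &n ) );
+           n.waitToBeNotified();   // blocks until t runs notifyOne()
+           t.join();
+    */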
+
+    /** establishes a synchronization point between threads. N threads are waiters and one is the notifier.
+ threadsafe.
+ */
+ class NotifyAll : boost::noncopyable {
+ public:
+ NotifyAll();
+
+ typedef unsigned long long When;
+
+ When now();
+
+ /** awaits the next notifyAll() call by another thread. notifications that precede this
+ call are ignored -- we are looking for a fresh event.
+ */
+ void waitFor(When);
+
+ /** a bit faster than waitFor( now() ) */
+ void awaitBeyondNow();
+
+ /** may be called multiple times. notifies all waiters */
+ void notifyAll(When);
+
+ /** indicates how many threads are waiting for a notify. */
+ unsigned nWaiting() const { return _nWaiting; }
+
+ private:
+ mongo::mutex _mutex;
+ boost::condition _condition;
+ When _lastDone;
+ When _lastReturned;
+ unsigned _nWaiting;
+ };
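+
+    /* Usage sketch (illustrative, not part of the original header): each waiter grabs
+       a ticket with now() and waits for a notifyAll() at or beyond that ticket.
+
+           NotifyAll evt;
+           // waiter thread:
+           NotifyAll::When e = evt.now();
+           evt.waitFor(e);                 // blocks until notifyAll(e') with e' >= e
+           // notifier thread:
+           evt.notifyAll( evt.now() );
+    */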
+
+} // namespace mongo
diff --git a/src/mongo/util/concurrency/task.cpp b/src/mongo/util/concurrency/task.cpp
new file mode 100644
index 00000000000..0b6ab166f19
--- /dev/null
+++ b/src/mongo/util/concurrency/task.cpp
@@ -0,0 +1,181 @@
+// @file task.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include <boost/thread/condition.hpp>
+
+#include "task.h"
+#include "../goodies.h"
+#include "../unittest.h"
+#include "../time_support.h"
+
+namespace mongo {
+
+ namespace task {
+
+ /*void foo() {
+ boost::mutex m;
+ boost::mutex::scoped_lock lk(m);
+ boost::condition cond;
+ cond.wait(lk);
+ cond.notify_one();
+ }*/
+
+ Task::Task()
+ : BackgroundJob( true /* deleteSelf */ ) {
+ n = 0;
+ repeat = 0;
+ }
+
+ void Task::halt() { repeat = 0; }
+
+ void Task::run() {
+ assert( n == 0 );
+ while( 1 ) {
+ n++;
+ try {
+ doWork();
+ }
+ catch(...) { }
+ if( repeat == 0 )
+ break;
+ sleepmillis(repeat);
+ if( inShutdown() )
+ break;
+ }
+ }
+
+ void Task::begin() {
+ go();
+ }
+
+ void fork(Task *t) {
+ t->begin();
+ }
+
+ void repeat(Task *t, unsigned millis) {
+ t->repeat = millis;
+ t->begin();
+ }
+
+ }
+}
+
+#include "msg.h"
+
+/* task::Server */
+
+namespace mongo {
+ namespace task {
+
+ /* to get back a return value */
+ struct Ret {
+ Ret() : done(false),m("Ret") { }
+ bool done;
+ mongo::mutex m;
+ boost::condition c;
+ const lam *msg;
+ void f() {
+ (*msg)();
+ done = true;
+ c.notify_one();
+ }
+ };
+
+ void Server::call( const lam& msg ) {
+ Ret r;
+ r.msg = &msg;
+ lam f = boost::bind(&Ret::f, &r);
+ send(f);
+ {
+ scoped_lock lk(r.m);
+ while( !r.done )
+ r.c.wait(lk.boost());
+ }
+ }
+
+ void Server::send( lam msg ) {
+ {
+ scoped_lock lk(m);
+ d.push_back(msg);
+ wassert( d.size() < 1024 );
+ }
+ c.notify_one();
+ }
+
+ void Server::doWork() {
+ starting();
+ while( 1 ) {
+ lam f;
+ try {
+ scoped_lock lk(m);
+ while( d.empty() )
+ c.wait(lk.boost());
+ f = d.front();
+ d.pop_front();
+ }
+ catch(...) {
+ log() << "ERROR exception in Server:doWork?" << endl;
+ }
+ try {
+ f();
+ if( rq ) {
+ rq = false;
+ {
+ scoped_lock lk(m);
+ d.push_back(f);
+ }
+ }
+ }
+ catch(std::exception& e) {
+ log() << "Server::doWork task:" << name() << " exception:" << e.what() << endl;
+ }
+ catch(const char *p) {
+ log() << "Server::doWork task:" << name() << " unknown c exception:" <<
+ ((p&&strlen(p)<800)?p:"?") << endl;
+ }
+ catch(...) {
+ log() << "Server::doWork unknown exception task:" << name() << endl;
+ }
+ }
+ }
+
+ static Server *s;
+ static void abc(int i) {
+ cout << "Hello " << i << endl;
+ s->requeue();
+ }
+ class TaskUnitTest : public mongo::UnitTest {
+ public:
+ virtual void run() {
+ lam f = boost::bind(abc, 3);
+ //f();
+
+ s = new Server("unittest");
+ fork(s);
+ s->send(f);
+
+ sleepsecs(30);
+ cout <<" done" << endl;
+
+ }
+ }; // not running. taskunittest;
+
+ }
+}
diff --git a/src/mongo/util/concurrency/task.h b/src/mongo/util/concurrency/task.h
new file mode 100644
index 00000000000..d7b45eeef24
--- /dev/null
+++ b/src/mongo/util/concurrency/task.h
@@ -0,0 +1,72 @@
+// @file task.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../background.h"
+
+namespace mongo {
+
+ namespace task {
+
+ /** abstraction around threads. simpler than BackgroundJob which is used behind the scenes.
+ allocate the Task dynamically. when the thread terminates, the Task object will delete itself.
+ */
+ class Task : private BackgroundJob {
+ protected:
+ virtual void doWork() = 0; // implement the task here.
+            virtual string name() const = 0; // name the thread
+ public:
+ Task();
+
+ /** for a repeating task, stop after current invocation ends. can be called by other threads
+ as long as the Task is still in scope.
+ */
+ void halt();
+ private:
+ unsigned n, repeat;
+ friend void fork(Task* t);
+ friend void repeat(Task* t, unsigned millis);
+ virtual void run();
+ //virtual void ending() { }
+ void begin();
+ };
+
+ /** run once */
+ void fork(Task *t);
+
+ /** run doWork() over and over, with a pause between runs of millis */
+ void repeat(Task *t, unsigned millis);
+
+        /*** Example ***
+        inline void sample() {
+            class Sample : public Task {
+            public:
+                int result;
+                virtual void doWork() { result = 1234; }
+                virtual string name() const { return "sample"; } // name() is pure virtual
+                Sample() : result(0) { }
+            };
+            Sample *q = new Sample();
+            fork(q);
+            cout << q->result << endl; // racy: could print 1234 or 0; q deletes itself when its thread ends.
+        }
+        */
+
+ }
+
+}
diff --git a/src/mongo/util/concurrency/thread_pool.cpp b/src/mongo/util/concurrency/thread_pool.cpp
new file mode 100644
index 00000000000..1c258847cb5
--- /dev/null
+++ b/src/mongo/util/concurrency/thread_pool.cpp
@@ -0,0 +1,141 @@
+/* threadpool.cpp
+*/
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "thread_pool.h"
+#include "mvar.h"
+
+namespace mongo {
+ namespace threadpool {
+
+ // Worker thread
+ class Worker : boost::noncopyable {
+ public:
+ explicit Worker(ThreadPool& owner)
+ : _owner(owner)
+ , _is_done(true)
+ , _thread(boost::bind(&Worker::loop, this))
+ {}
+
+ // destructor will block until current operation is completed
+ // Acts as a "join" on this thread
+ ~Worker() {
+ _task.put(Task());
+ _thread.join();
+ }
+
+ void set_task(Task& func) {
+ assert(!func.empty());
+ assert(_is_done);
+ _is_done = false;
+
+ _task.put(func);
+ }
+
+ private:
+ ThreadPool& _owner;
+ MVar<Task> _task;
+ bool _is_done; // only used for error detection
+ boost::thread _thread;
+
+ void loop() {
+ while (true) {
+ Task task = _task.take();
+ if (task.empty())
+ break; // ends the thread
+
+ try {
+ task();
+ }
+                catch (std::exception& e) {
+                    log() << "Unhandled exception in worker thread: " << e.what() << endl;
+ }
+ catch (...) {
+ log() << "Unhandled non-exception in worker thread" << endl;
+ }
+ _is_done = true;
+ _owner.task_done(this);
+ }
+ }
+ };
+
+ ThreadPool::ThreadPool(int nThreads)
+ : _mutex("ThreadPool"), _tasksRemaining(0)
+ , _nThreads(nThreads) {
+ scoped_lock lock(_mutex);
+ while (nThreads-- > 0) {
+ Worker* worker = new Worker(*this);
+ _freeWorkers.push_front(worker);
+ }
+ }
+
+ ThreadPool::~ThreadPool() {
+ join();
+
+ assert(_tasks.empty());
+
+ // O(n) but n should be small
+ assert(_freeWorkers.size() == (unsigned)_nThreads);
+
+ while(!_freeWorkers.empty()) {
+ delete _freeWorkers.front();
+ _freeWorkers.pop_front();
+ }
+ }
+
+ void ThreadPool::join() {
+ scoped_lock lock(_mutex);
+ while(_tasksRemaining) {
+ _condition.wait(lock.boost());
+ }
+ }
+
+ void ThreadPool::schedule(Task task) {
+ scoped_lock lock(_mutex);
+
+ _tasksRemaining++;
+
+ if (!_freeWorkers.empty()) {
+ _freeWorkers.front()->set_task(task);
+ _freeWorkers.pop_front();
+ }
+ else {
+ _tasks.push_back(task);
+ }
+ }
+
+ // should only be called by a worker from the worker thread
+ void ThreadPool::task_done(Worker* worker) {
+ scoped_lock lock(_mutex);
+
+ if (!_tasks.empty()) {
+ worker->set_task(_tasks.front());
+ _tasks.pop_front();
+ }
+ else {
+ _freeWorkers.push_front(worker);
+ }
+
+ _tasksRemaining--;
+
+ if(_tasksRemaining == 0)
+ _condition.notify_all();
+ }
+
+ } //namespace threadpool
+} //namespace mongo
diff --git a/src/mongo/util/concurrency/thread_pool.h b/src/mongo/util/concurrency/thread_pool.h
new file mode 100644
index 00000000000..b348ed1d01b
--- /dev/null
+++ b/src/mongo/util/concurrency/thread_pool.h
@@ -0,0 +1,82 @@
+// thread_pool.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <boost/function.hpp>
+#include <boost/bind.hpp>
+#undef assert
+#define assert MONGO_assert
+
+namespace mongo {
+
+ namespace threadpool {
+ class Worker;
+
+ typedef boost::function<void(void)> Task; //nullary function or functor
+
+ // exported to the mongo namespace
+ class ThreadPool : boost::noncopyable {
+ public:
+ explicit ThreadPool(int nThreads=8);
+
+ // blocks until all tasks are complete (tasks_remaining() == 0)
+ // You should not call schedule while in the destructor
+ ~ThreadPool();
+
+ // blocks until all tasks are complete (tasks_remaining() == 0)
+ // does not prevent new tasks from being scheduled so could wait forever.
+ // Also, new tasks could be scheduled after this returns.
+ void join();
+
+ // task will be copied a few times so make sure it's relatively cheap
+ void schedule(Task task);
+
+ // Helpers that wrap schedule and boost::bind.
+        // Functor and args will be copied a few times so make sure they're relatively cheap
+ template<typename F, typename A>
+ void schedule(F f, A a) { schedule(boost::bind(f,a)); }
+ template<typename F, typename A, typename B>
+ void schedule(F f, A a, B b) { schedule(boost::bind(f,a,b)); }
+ template<typename F, typename A, typename B, typename C>
+ void schedule(F f, A a, B b, C c) { schedule(boost::bind(f,a,b,c)); }
+ template<typename F, typename A, typename B, typename C, typename D>
+ void schedule(F f, A a, B b, C c, D d) { schedule(boost::bind(f,a,b,c,d)); }
+ template<typename F, typename A, typename B, typename C, typename D, typename E>
+ void schedule(F f, A a, B b, C c, D d, E e) { schedule(boost::bind(f,a,b,c,d,e)); }
+
+ int tasks_remaining() { return _tasksRemaining; }
+
+ private:
+ mongo::mutex _mutex;
+ boost::condition _condition;
+
+ list<Worker*> _freeWorkers; //used as LIFO stack (always front)
+ list<Task> _tasks; //used as FIFO queue (push_back, pop_front)
+ int _tasksRemaining; // in queue + currently processing
+ int _nThreads; // only used for sanity checking. could be removed in the future.
+
+ // should only be called by a worker from the worker's thread
+ void task_done(Worker* worker);
+ friend class Worker;
+ };
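+
+    /* Usage sketch (not from the original source; printSquare is a hypothetical
+       function). The bind helpers above turn (f, args...) into a nullary Task:
+
+        void printSquare(int i) { cout << i*i << endl; }
+
+        ThreadPool pool(4);                 // four worker threads
+        for (int i = 0; i < 100; i++)
+            pool.schedule(printSquare, i);  // wraps boost::bind(printSquare, i)
+        pool.join();                        // returns when tasks_remaining() == 0
+    */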
+
+ } //namespace threadpool
+
+ using threadpool::ThreadPool;
+
+} //namespace mongo
diff --git a/src/mongo/util/concurrency/threadlocal.h b/src/mongo/util/concurrency/threadlocal.h
new file mode 100644
index 00000000000..57a4f799dfa
--- /dev/null
+++ b/src/mongo/util/concurrency/threadlocal.h
@@ -0,0 +1,126 @@
+#pragma once
+
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <boost/thread/tss.hpp>
+
+namespace mongo {
+
+ using boost::thread_specific_ptr;
+
+ /* thread local "value" rather than a pointer
+ good for things which have copy constructors (and the copy constructor is fast enough)
+ e.g.
+ ThreadLocalValue<int> myint;
+ */
+ template<class T>
+ class ThreadLocalValue {
+ public:
+ ThreadLocalValue( T def = 0 ) : _default( def ) { }
+
+ T get() const {
+ T * val = _val.get();
+ if ( val )
+ return *val;
+ return _default;
+ }
+
+ void set( const T& i ) {
+ T *v = _val.get();
+ if( v ) {
+ *v = i;
+ return;
+ }
+ v = new T(i);
+ _val.reset( v );
+ }
+
+ T& getRef() {
+ T *v = _val.get();
+ if( v ) {
+ return *v;
+ }
+ v = new T(_default);
+ _val.reset( v );
+ return *v;
+ }
+
+ private:
+ boost::thread_specific_ptr<T> _val;
+ const T _default;
+ };
+
+ /* TSP
+       These macros use intrinsics which are faster than boost::thread_specific_ptr.
+       However, the intrinsics don't free up objects on thread closure. Thus we use
+       a combination here, on the assumption that resets are infrequent, so that
+       gets are fast.
+ */
+#if defined(_WIN32) || (defined(__GNUC__) && defined(__linux__))
+
+ template< class T >
+ struct TSP {
+ boost::thread_specific_ptr<T> tsp;
+ public:
+ T* get() const;
+ void reset(T* v);
+ };
+
+# if defined(_WIN32)
+
+# define TSP_DECLARE(T,p) extern TSP<T> p;
+
+# define TSP_DEFINE(T,p) __declspec( thread ) T* _ ## p; \
+ TSP<T> p; \
+ template<> T* TSP<T>::get() const { return _ ## p; } \
+ void TSP<T>::reset(T* v) { \
+ tsp.reset(v); \
+ _ ## p = v; \
+ }
+# else
+
+# define TSP_DECLARE(T,p) \
+ extern __thread T* _ ## p; \
+ template<> inline T* TSP<T>::get() const { return _ ## p; } \
+ extern TSP<T> p;
+
+# define TSP_DEFINE(T,p) \
+ __thread T* _ ## p; \
+ template<> void TSP<T>::reset(T* v) { \
+ tsp.reset(v); \
+ _ ## p = v; \
+ } \
+ TSP<T> p;
+# endif
+
+#else
+
+ template< class T >
+ struct TSP {
+ thread_specific_ptr<T> tsp;
+ public:
+ T* get() const { return tsp.get(); }
+ void reset(T* v) { tsp.reset(v); }
+ };
+
+# define TSP_DECLARE(T,p) extern TSP<T> p;
+
+# define TSP_DEFINE(T,p) TSP<T> p;
+
+#endif
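+
+    /* Usage sketch (not from the original source; Client and currentClient are
+       placeholder names) for the TSP macros above:
+
+        // in a header:
+        TSP_DECLARE(Client, currentClient)
+
+        // in exactly one .cpp:
+        TSP_DEFINE(Client, currentClient)
+
+        // at thread startup (boost tss still owns the object and frees it on thread exit):
+        currentClient.reset(new Client());
+
+        // on the hot path, get() compiles down to a __thread / __declspec(thread) read:
+        Client *c = currentClient.get();
+    */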
+
+}
diff --git a/src/mongo/util/concurrency/value.h b/src/mongo/util/concurrency/value.h
new file mode 100644
index 00000000000..fdd0d9bbb42
--- /dev/null
+++ b/src/mongo/util/concurrency/value.h
@@ -0,0 +1,139 @@
+/* @file value.h
+ concurrency helpers DiagStr, Guarded
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "spin_lock.h"
+
+namespace mongo {
+
+    /** declares a variable that is "guarded" by a mutex.
+
+ The decl documents the rule. For example "counta and countb are guarded by xyzMutex":
+
+ Guarded<int, xyzMutex> counta;
+ Guarded<int, xyzMutex> countb;
+
+ Upon use, specify the scoped_lock object. This makes it hard for someone
+        later to forget to be in the lock. A check that it is the right lock is made
+        at runtime in _DEBUG builds.
+ */
+ template <typename T, SimpleMutex& BY>
+ class Guarded {
+ T _val;
+ public:
+ T& ref(const SimpleMutex::scoped_lock& lk) {
+ dassert( &lk.m() == &BY );
+ return _val;
+ }
+ };
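+
+    /* Usage sketch (not from the original source; xyzMutex and touch are
+       placeholder names). BY is a reference template parameter, so the mutex
+       must live at namespace scope:
+
+        SimpleMutex xyzMutex("xyz");
+        Guarded<int, xyzMutex> counta;
+
+        void touch() {
+            SimpleMutex::scoped_lock lk(xyzMutex);
+            counta.ref(lk)++;   // ref() dasserts lk really locks xyzMutex
+        }
+    */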
+
+ // todo: rename this to ThreadSafeString or something
+ /** there is now one mutex per DiagStr. If you have hundreds or millions of
+ DiagStrs you'll need to do something different.
+ */
+ class DiagStr {
+ mutable SpinLock m;
+ string _s;
+ public:
+ DiagStr(const DiagStr& r) : _s(r.get()) { }
+ DiagStr(const string& r) : _s(r) { }
+ DiagStr() { }
+ bool empty() const {
+ scoped_spinlock lk(m);
+ return _s.empty();
+ }
+ string get() const {
+ scoped_spinlock lk(m);
+ return _s;
+ }
+ void set(const char *s) {
+ scoped_spinlock lk(m);
+ _s = s;
+ }
+ void set(const string& s) {
+ scoped_spinlock lk(m);
+ _s = s;
+ }
+ operator string() const { return get(); }
+ void operator=(const string& s) { set(s); }
+ void operator=(const DiagStr& rhs) {
+ set( rhs.get() );
+ }
+
+        // == is not defined; use get() == ... instead. done this way so one thinks about atomicity when composing multiple operations
+ bool operator==(const string& s) const;
+ };
+
+ /** Thread safe map.
+ Be careful not to use this too much or it could make things slow;
+ if not a hot code path no problem.
+
+ Examples:
+
+ mapsf<int,int> mp;
+
+       int x = mp.get( 2 );
+
+ map<int,int> two;
+ mp.swap(two);
+
+ {
+ mapsf<int,int>::ref r(mp);
+ r[9] = 1;
+ map<int,int>::iterator i = r.r.begin();
+ }
+
+ */
+ template< class K, class V >
+ struct mapsf : boost::noncopyable {
+ SimpleMutex m;
+ map<K,V> val;
+ friend struct ref;
+ public:
+ mapsf() : m("mapsf") { }
+ void swap(map<K,V>& rhs) {
+ SimpleMutex::scoped_lock lk(m);
+ val.swap(rhs);
+ }
+ bool empty() {
+ SimpleMutex::scoped_lock lk(m);
+ return val.empty();
+ }
+ // safe as we pass by value:
+ V get(K k) {
+ SimpleMutex::scoped_lock lk(m);
+ typename map<K,V>::iterator i = val.find(k);
+ if( i == val.end() )
+ return V();
+ return i->second;
+ }
+ // think about deadlocks when using ref. the other methods
+ // above will always be safe as they are "leaf" operations.
+ struct ref {
+ SimpleMutex::scoped_lock lk;
+ public:
+ map<K,V> &r;
+ ref(mapsf<K,V> &m) : lk(m.m), r(m.val) { }
+ V& operator[](const K& k) { return r[k]; }
+ };
+ };
+
+}
diff --git a/src/mongo/util/concurrency/vars.cpp b/src/mongo/util/concurrency/vars.cpp
new file mode 100644
index 00000000000..0b2fc960c04
--- /dev/null
+++ b/src/mongo/util/concurrency/vars.cpp
@@ -0,0 +1,56 @@
+// vars.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "mutex.h"
+#include "value.h"
+
+namespace mongo {
+
+#if defined(_DEBUG)
+
+ // intentional leak. otherwise destructor orders can be problematic at termination.
+ MutexDebugger &mutexDebugger = *(new MutexDebugger());
+
+ MutexDebugger::MutexDebugger() :
+ x( *(new boost::mutex()) ), magic(0x12345678) {
+ // optional way to debug lock order
+ /*
+ a = "a_lock";
+ b = "b_lock";
+ */
+ }
+
+ void MutexDebugger::programEnding() {
+ if( logLevel>=1 && followers.size() ) {
+ std::cout << followers.size() << " mutexes in program" << endl;
+ for( map< mid, set<mid> >::iterator i = followers.begin(); i != followers.end(); i++ ) {
+ cout << i->first;
+ if( maxNest[i->first] > 1 )
+ cout << " maxNest:" << maxNest[i->first];
+ cout << '\n';
+ for( set<mid>::iterator j = i->second.begin(); j != i->second.end(); j++ )
+ cout << " " << *j << '\n';
+ }
+ cout.flush();
+ }
+ }
+
+#endif
+
+}
diff --git a/src/mongo/util/debug_util.cpp b/src/mongo/util/debug_util.cpp
new file mode 100644
index 00000000000..8ba6534ef7c
--- /dev/null
+++ b/src/mongo/util/debug_util.cpp
@@ -0,0 +1,60 @@
+// debug_util.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "../db/cmdline.h"
+#include "../db/jsobj.h"
+
+namespace mongo {
+
+#if defined(USE_GDBSERVER)
+ /* Magic gdb trampoline
+ * Do not call directly! call setupSIGTRAPforGDB()
+ * Assumptions:
+ * 1) gdbserver is on your path
+ * 2) You have run "handle SIGSTOP noprint" in gdb
+ * 3) cmdLine.port + 2000 is free
+ */
+ void launchGDB(int) {
+ // Don't come back here
+ signal(SIGTRAP, SIG_IGN);
+
+ int newPort = cmdLine.port + 2000;
+ string newPortStr = "localhost:" + BSONObjBuilder::numStr(newPort);
+ string pidToDebug = BSONObjBuilder::numStr(getpid());
+
+ cout << "\n\n\t**** Launching gdbserver on " << newPortStr << " ****" << endl << endl;
+ if (fork() == 0) {
+ //child
+ execlp("gdbserver", "gdbserver", "--attach", newPortStr.c_str(), pidToDebug.c_str(), NULL);
+ perror(NULL);
+ }
+ else {
+ //parent
+ raise(SIGSTOP); // pause all threads until gdb connects and continues
+ raise(SIGTRAP); // break inside gdbserver
+ }
+ }
+
+ void setupSIGTRAPforGDB() {
+ assert( signal(SIGTRAP , launchGDB ) != SIG_ERR );
+ }
+#else
+ void setupSIGTRAPforGDB() {
+ }
+#endif
+}
diff --git a/src/mongo/util/debug_util.h b/src/mongo/util/debug_util.h
new file mode 100644
index 00000000000..abed8d94924
--- /dev/null
+++ b/src/mongo/util/debug_util.h
@@ -0,0 +1,106 @@
+// debug_util.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifndef _WIN32
+#include <signal.h>
+#endif
+
+namespace mongo {
+
+// for debugging
+ typedef struct _Ints {
+ int i[100];
+ } *Ints;
+ typedef struct _Chars {
+ char c[200];
+ } *Chars;
+
+ typedef char CHARS[400];
+
+ typedef struct _OWS {
+ int size;
+ char type;
+ char string[400];
+ } *OWS;
+
+#if defined(_DEBUG)
+ enum {DEBUG_BUILD = 1};
+#else
+ enum {DEBUG_BUILD = 0};
+#endif
+
+#define MONGO_DEV if( DEBUG_BUILD )
+#define DEV MONGO_DEV
+
+#define MONGO_DEBUGGING if( 0 )
+#define DEBUGGING MONGO_DEBUGGING
+
+// The following declare one unique counter per enclosing function.
+// NOTE The implementation double-increments on a match, but we don't really care.
+#define MONGO_SOMETIMES( occasion, howOften ) for( static unsigned occasion = 0; ++occasion % howOften == 0; )
+#define SOMETIMES MONGO_SOMETIMES
+
+#define MONGO_OCCASIONALLY SOMETIMES( occasionally, 16 )
+#define OCCASIONALLY MONGO_OCCASIONALLY
+
+#define MONGO_RARELY SOMETIMES( rarely, 128 )
+#define RARELY MONGO_RARELY
+
+#define MONGO_ONCE for( static bool undone = true; undone; undone = false )
+#define ONCE MONGO_ONCE
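+
+// Usage sketch (not from the original source). Each macro declares its own
+// static counter in the enclosing function:
+//
+//     OCCASIONALLY log() << "printed on ~1 of every 16 passes" << endl;
+//     RARELY       log() << "printed on ~1 of every 128 passes" << endl;
+//     ONCE         log() << "printed only on the first pass" << endl;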
+
+#if defined(_WIN32)
+ inline int strcasecmp(const char* s1, const char* s2) {return _stricmp(s1, s2);}
+#endif
+
+ // Sets SIGTRAP handler to launch GDB
+ // Noop unless on *NIX and compiled with _DEBUG
+ void setupSIGTRAPforGDB();
+
+ extern int tlogLevel;
+
+ inline void breakpoint() {
+ if ( tlogLevel < 0 )
+ return;
+#ifdef _WIN32
+ //DEV DebugBreak();
+#endif
+#ifndef _WIN32
+ // code to raise a breakpoint in GDB
+ ONCE {
+ //prevent SIGTRAP from crashing the program if default action is specified and we are not in gdb
+ struct sigaction current;
+ sigaction(SIGTRAP, NULL, &current);
+ if (current.sa_handler == SIG_DFL) {
+ signal(SIGTRAP, SIG_IGN);
+ }
+ }
+
+ raise(SIGTRAP);
+#endif
+ }
+
+
+ // conditional breakpoint
+ inline void breakif(bool test) {
+ if (test)
+ breakpoint();
+ }
+
+} // namespace mongo
diff --git a/src/mongo/util/embedded_builder.h b/src/mongo/util/embedded_builder.h
new file mode 100644
index 00000000000..abf518e2583
--- /dev/null
+++ b/src/mongo/util/embedded_builder.h
@@ -0,0 +1,92 @@
+// embedded_builder.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace mongo {
+
+ // utility class for assembling hierarchical objects
+ class EmbeddedBuilder {
+ public:
+ EmbeddedBuilder( BSONObjBuilder *b ) {
+ _builders.push_back( make_pair( "", b ) );
+ }
+ // It is assumed that the calls to prepareContext will be made with the 'name'
+ // parameter in lex ascending order.
+ void prepareContext( string &name ) {
+ int i = 1, n = _builders.size();
+ while( i < n &&
+ name.substr( 0, _builders[ i ].first.length() ) == _builders[ i ].first &&
+ ( name[ _builders[i].first.length() ] == '.' || name[ _builders[i].first.length() ] == 0 )
+ ) {
+ name = name.substr( _builders[ i ].first.length() + 1 );
+ ++i;
+ }
+ for( int j = n - 1; j >= i; --j ) {
+ popBuilder();
+ }
+ for( string next = splitDot( name ); !next.empty(); next = splitDot( name ) ) {
+ addBuilder( next );
+ }
+ }
+ void appendAs( const BSONElement &e, string name ) {
+ if ( e.type() == Object && e.valuesize() == 5 ) { // empty object -- this way we can add to it later
+ string dummyName = name + ".foo";
+ prepareContext( dummyName );
+ return;
+ }
+ prepareContext( name );
+ back()->appendAs( e, name );
+ }
+ BufBuilder &subarrayStartAs( string name ) {
+ prepareContext( name );
+ return back()->subarrayStart( name );
+ }
+ void done() {
+ while( ! _builderStorage.empty() )
+ popBuilder();
+ }
+
+ static string splitDot( string & str ) {
+ size_t pos = str.find( '.' );
+ if ( pos == string::npos )
+ return "";
+ string ret = str.substr( 0, pos );
+ str = str.substr( pos + 1 );
+ return ret;
+ }
+
+ private:
+ void addBuilder( const string &name ) {
+ shared_ptr< BSONObjBuilder > newBuilder( new BSONObjBuilder( back()->subobjStart( name ) ) );
+ _builders.push_back( make_pair( name, newBuilder.get() ) );
+ _builderStorage.push_back( newBuilder );
+ }
+ void popBuilder() {
+ back()->done();
+ _builders.pop_back();
+ _builderStorage.pop_back();
+ }
+
+ BSONObjBuilder *back() { return _builders.back().second; }
+
+ vector< pair< string, BSONObjBuilder * > > _builders;
+ vector< shared_ptr< BSONObjBuilder > > _builderStorage;
+
+ };
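+
+    /* Usage sketch (not from the original source; e1 and e2 stand for existing
+       BSONElements). Builds { a: { b: ..., c: ... } } from dotted names, which
+       must be fed in lexicographically ascending order per prepareContext():
+
+        BSONObjBuilder b;
+        EmbeddedBuilder eb( &b );
+        eb.appendAs( e1, "a.b" );
+        eb.appendAs( e2, "a.c" );
+        eb.done();
+        BSONObj obj = b.obj();
+    */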
+
+} //namespace mongo
diff --git a/src/mongo/util/file.h b/src/mongo/util/file.h
new file mode 100644
index 00000000000..368e6927b43
--- /dev/null
+++ b/src/mongo/util/file.h
@@ -0,0 +1,230 @@
+// file.h cross platform basic file class. supports 64 bit offsets and such.
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#if !defined(_WIN32)
+#include "errno.h"
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/statvfs.h>
+#endif
+#include "text.h"
+
+namespace mongo {
+
+#ifndef __sunos__
+ typedef uint64_t fileofs;
+#else
+ typedef boost::uint64_t fileofs;
+#endif
+
+    /* NOTE: not thread-safe. (At least the Windows implementation isn't.) */
+
+ class FileInterface {
+ public:
+ void open(const char *fn) {}
+ void write(fileofs o, const char *data, unsigned len) {}
+ void read(fileofs o, char *data, unsigned len) {}
+ bool bad() {return false;}
+ bool is_open() {return false;}
+ fileofs len() { return 0; }
+ void fsync() { assert(false); }
+
+ // shrink file to size bytes. No-op if file already smaller.
+ void truncate(fileofs size);
+
+ /** @return -1 if error or unavailable */
+ static boost::intmax_t freeSpace(const string &path) { assert(false); return -1; }
+ };
+
+#if defined(_WIN32)
+#include <io.h>
+
+ class File : public FileInterface {
+ HANDLE fd;
+ bool _bad;
+ string _name;
+ void err(BOOL b=false) { /* false = error happened */
+ if( !b && !_bad ) {
+ _bad = true;
+                log() << "File " << _name << " I/O error " << GetLastError() << '\n';
+ }
+ }
+ public:
+ File() {
+ fd = INVALID_HANDLE_VALUE;
+ _bad = true;
+ }
+ ~File() {
+ if( is_open() ) CloseHandle(fd);
+ fd = INVALID_HANDLE_VALUE;
+ }
+ void open(const char *filename, bool readOnly=false , bool direct=false) {
+ _name = filename;
+ fd = CreateFile(
+ toNativeString(filename).c_str(),
+ ( readOnly ? 0 : GENERIC_WRITE ) | GENERIC_READ, FILE_SHARE_WRITE|FILE_SHARE_READ,
+ NULL, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
+ if( !is_open() ) {
+ DWORD e = GetLastError();
+ log() << "Create/Open File failed " << filename << ' ' << errnoWithDescription(e) << endl;
+ }
+ else
+ _bad = false;
+ }
+ static boost::intmax_t freeSpace(const string &path) {
+ ULARGE_INTEGER avail;
+ if( GetDiskFreeSpaceEx(toNativeString(path.c_str()).c_str(), &avail, NULL, NULL) ) {
+ return avail.QuadPart;
+ }
+ DWORD e = GetLastError();
+ log() << "GetDiskFreeSpaceEx fails errno: " << e << endl;
+ return -1;
+ }
+ void write(fileofs o, const char *data, unsigned len) {
+ LARGE_INTEGER li;
+ li.QuadPart = o;
+ SetFilePointerEx(fd, li, NULL, FILE_BEGIN);
+ DWORD written;
+ err( WriteFile(fd, data, len, &written, NULL) );
+ }
+ void read(fileofs o, char *data, unsigned len) {
+ DWORD read;
+ LARGE_INTEGER li;
+ li.QuadPart = o;
+ SetFilePointerEx(fd, li, NULL, FILE_BEGIN);
+ int ok = ReadFile(fd, data, len, &read, 0);
+ if( !ok )
+ err(ok);
+ else
+ massert( 10438 , "ReadFile error - truncated file?", read == len);
+ }
+ bool bad() { return _bad; }
+ bool is_open() { return fd != INVALID_HANDLE_VALUE; }
+ fileofs len() {
+ LARGE_INTEGER li;
+ li.LowPart = GetFileSize(fd, (DWORD *) &li.HighPart);
+ if( li.HighPart == 0 && li.LowPart == INVALID_FILE_SIZE ) {
+ err( false );
+ return 0;
+ }
+ return li.QuadPart;
+ }
+ void fsync() { FlushFileBuffers(fd); }
+
+ void truncate(fileofs size) {
+ if (len() <= size)
+ return;
+
+ LARGE_INTEGER li;
+ li.QuadPart = size;
+ if (SetFilePointerEx(fd, li, NULL, FILE_BEGIN) == 0){
+ err(false);
+ return; //couldn't seek
+ }
+
+ err(SetEndOfFile(fd));
+ }
+ };
+
+#else
+
+ class File : public FileInterface {
+ public:
+ int fd;
+ private:
+ bool _bad;
+ void err(bool ok) {
+ if( !ok && !_bad ) {
+ _bad = true;
+ log() << "File I/O " << errnoWithDescription() << '\n';
+ }
+ }
+ public:
+ File() {
+ fd = -1;
+ _bad = true;
+ }
+ ~File() {
+ if( is_open() ) ::close(fd);
+ fd = -1;
+ }
+
+#ifndef O_NOATIME
+#define O_NOATIME 0
+#endif
+
+ void open(const char *filename, bool readOnly=false , bool direct=false) {
+ fd = ::open(filename,
+ O_CREAT | ( readOnly ? 0 : ( O_RDWR | O_NOATIME ) )
+#if defined(O_DIRECT)
+ | ( direct ? O_DIRECT : 0 )
+#endif
+ ,
+ S_IRUSR | S_IWUSR);
+ if ( fd <= 0 ) {
+ out() << "couldn't open " << filename << ' ' << errnoWithDescription() << endl;
+ return;
+ }
+ _bad = false;
+ }
+ void write(fileofs o, const char *data, unsigned len) {
+ err( ::pwrite(fd, data, len, o) == (int) len );
+ }
+ void read(fileofs o, char *data, unsigned len) {
+ ssize_t s = ::pread(fd, data, len, o);
+ if( s == -1 ) {
+ err(false);
+ }
+ else if( s != (int) len ) {
+ _bad = true;
+ log() << "File error read:" << s << " bytes, wanted:" << len << " ofs:" << o << endl;
+ }
+ }
+ bool bad() { return _bad; }
+ bool is_open() { return fd > 0; }
+ fileofs len() {
+ off_t o = lseek(fd, 0, SEEK_END);
+ if( o != (off_t) -1 )
+ return o;
+ err(false);
+ return 0;
+ }
+ void fsync() { ::fsync(fd); }
+ static boost::intmax_t freeSpace ( const string &path ) {
+ struct statvfs info;
+ assert( !statvfs( path.c_str() , &info ) );
+ return boost::intmax_t( info.f_bavail ) * info.f_frsize;
+ }
+
+ void truncate(fileofs size) {
+ if (len() <= size)
+ return;
+
+ err(ftruncate(fd, size) == 0);
+ }
+ };
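+
+    /* Usage sketch (not from the original source; the path is hypothetical).
+       The same calls work for both the Windows and POSIX implementations:
+
+        File f;
+        f.open( "/tmp/demo.dat" );
+        if( !f.bad() ) {
+            f.write( 0, "hello", 5 );   // write 5 bytes at offset 0
+            char buf[5];
+            f.read( 0, buf, 5 );
+            f.fsync();
+        }
+    */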
+
+
+#endif
+
+
+}
+
diff --git a/src/mongo/util/file_allocator.cpp b/src/mongo/util/file_allocator.cpp
new file mode 100644
index 00000000000..b0572f971bd
--- /dev/null
+++ b/src/mongo/util/file_allocator.cpp
@@ -0,0 +1,329 @@
+// @file file_allocator.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include <fcntl.h>
+#include <errno.h>
+
+#if defined(__freebsd__) || defined(__openbsd__)
+#include <sys/stat.h>
+#endif
+
+#include "timer.h"
+#include "mongoutils/str.h"
+using namespace mongoutils;
+
+#ifndef O_NOATIME
+#define O_NOATIME (0)
+#endif
+
+#include "file_allocator.h"
+#include "paths.h"
+
+namespace mongo {
+
+ boost::filesystem::path ensureParentDirCreated(const boost::filesystem::path& p){
+ const boost::filesystem::path parent = p.branch_path();
+
+ if (! boost::filesystem::exists(parent)){
+ ensureParentDirCreated(parent);
+ log() << "creating directory " << parent.string() << endl;
+ boost::filesystem::create_directory(parent);
+ flushMyDirectory(parent); // flushes grandparent to ensure parent exists after crash
+ }
+
+ assert(boost::filesystem::is_directory(parent));
+ return parent;
+ }
+
+#if defined(_WIN32)
+ FileAllocator::FileAllocator() {
+ }
+
+ void FileAllocator::start() {
+ }
+
+ void FileAllocator::requestAllocation( const string &name, long &size ) {
+ /* Some of the system calls in the file allocator don't work in win,
+ so no win support - 32 or 64 bit. Plus we don't seem to need preallocation
+ on windows anyway as we don't have to pre-zero the file there.
+ */
+ }
+
+ void FileAllocator::allocateAsap( const string &name, unsigned long long &size ) {
+ // no-op
+ }
+
+ void FileAllocator::waitUntilFinished() const {
+ // no-op
+ }
+
+ void FileAllocator::ensureLength(int fd , long size) {
+ // we don't zero on windows
+        // TODO: we should, to avoid fragmentation
+ }
+
+ bool FileAllocator::hasFailed() const {
+ return false;
+ }
+
+#else
+
+ FileAllocator::FileAllocator()
+ : _pendingMutex("FileAllocator"), _failed() {
+ }
+
+
+ void FileAllocator::start() {
+ boost::thread t( boost::bind( &FileAllocator::run , this ) );
+ }
+
+ void FileAllocator::requestAllocation( const string &name, long &size ) {
+ scoped_lock lk( _pendingMutex );
+ if ( _failed )
+ return;
+ long oldSize = prevSize( name );
+ if ( oldSize != -1 ) {
+ size = oldSize;
+ return;
+ }
+ _pending.push_back( name );
+ _pendingSize[ name ] = size;
+ _pendingUpdated.notify_all();
+ }
+
+ void FileAllocator::allocateAsap( const string &name, unsigned long long &size ) {
+ scoped_lock lk( _pendingMutex );
+ long oldSize = prevSize( name );
+ if ( oldSize != -1 ) {
+ size = oldSize;
+ if ( !inProgress( name ) )
+ return;
+ }
+ checkFailure();
+ _pendingSize[ name ] = size;
+ if ( _pending.size() == 0 )
+ _pending.push_back( name );
+ else if ( _pending.front() != name ) {
+ _pending.remove( name );
+ list< string >::iterator i = _pending.begin();
+ ++i;
+ _pending.insert( i, name );
+ }
+ _pendingUpdated.notify_all();
+ while( inProgress( name ) ) {
+ checkFailure();
+ _pendingUpdated.wait( lk.boost() );
+ }
+
+ }
+
+ void FileAllocator::waitUntilFinished() const {
+ if ( _failed )
+ return;
+ scoped_lock lk( _pendingMutex );
+ while( _pending.size() != 0 )
+ _pendingUpdated.wait( lk.boost() );
+ }
+
+ void FileAllocator::ensureLength(int fd , long size) {
+#if defined(__linux__)
+ int ret = posix_fallocate(fd,0,size);
+ if ( ret == 0 )
+ return;
+
+ log() << "FileAllocator: posix_fallocate failed: " << errnoWithDescription( ret ) << " falling back" << endl;
+#endif
+
+ off_t filelen = lseek(fd, 0, SEEK_END);
+ if ( filelen < size ) {
+ if (filelen != 0) {
+ stringstream ss;
+ ss << "failure creating new datafile; lseek failed for fd " << fd << " with errno: " << errnoWithDescription();
+ uassert( 10440 , ss.str(), filelen == 0 );
+ }
+ // Check for end of disk.
+
+ uassert( 10441 , str::stream() << "Unable to allocate new file of size " << size << ' ' << errnoWithDescription(),
+ size - 1 == lseek(fd, size - 1, SEEK_SET) );
+ uassert( 10442 , str::stream() << "Unable to allocate new file of size " << size << ' ' << errnoWithDescription(),
+ 1 == write(fd, "", 1) );
+ lseek(fd, 0, SEEK_SET);
+
+ const long z = 256 * 1024;
+ const boost::scoped_array<char> buf_holder (new char[z]);
+ char* buf = buf_holder.get();
+ memset(buf, 0, z);
+ long left = size;
+ while ( left > 0 ) {
+ long towrite = left;
+ if ( towrite > z )
+ towrite = z;
+
+ int written = write( fd , buf , towrite );
+ uassert( 10443 , errnoWithPrefix("FileAllocator: file write failed" ), written > 0 );
+ left -= written;
+ }
+ }
+ }
+
+ bool FileAllocator::hasFailed() const {
+ return _failed;
+ }
+
+ void FileAllocator::checkFailure() {
+ if (_failed) {
+            // we want to log the problem (diskfull.js expects it) but we do not want to dump a stack trace
+ msgassertedNoTrace( 12520, "new file allocation failure" );
+ }
+ }
+
+ long FileAllocator::prevSize( const string &name ) const {
+ if ( _pendingSize.count( name ) > 0 )
+ return _pendingSize[ name ];
+ if ( boost::filesystem::exists( name ) )
+ return boost::filesystem::file_size( name );
+ return -1;
+ }
+
+ // caller must hold _pendingMutex lock.
+ bool FileAllocator::inProgress( const string &name ) const {
+ for( list< string >::const_iterator i = _pending.begin(); i != _pending.end(); ++i )
+ if ( *i == name )
+ return true;
+ return false;
+ }
+
+ string makeTempFileName( path root ) {
+ while( 1 ) {
+ path p = root / "_tmp";
+ stringstream ss;
+ ss << (unsigned) rand();
+ p /= ss.str();
+ string fn = p.string();
+ if( !boost::filesystem::exists(p) )
+ return fn;
+ }
+ return "";
+ }
+
+ void FileAllocator::run( FileAllocator * fa ) {
+ setThreadName( "FileAllocator" );
+ while( 1 ) {
+ {
+ scoped_lock lk( fa->_pendingMutex );
+ if ( fa->_pending.size() == 0 )
+ fa->_pendingUpdated.wait( lk.boost() );
+ }
+ while( 1 ) {
+ string name;
+ long size;
+ {
+ scoped_lock lk( fa->_pendingMutex );
+ if ( fa->_pending.size() == 0 )
+ break;
+ name = fa->_pending.front();
+ size = fa->_pendingSize[ name ];
+ }
+
+ string tmp;
+ long fd = 0;
+ try {
+ log() << "allocating new datafile " << name << ", filling with zeroes..." << endl;
+
+ boost::filesystem::path parent = ensureParentDirCreated(name);
+ tmp = makeTempFileName( parent );
+ ensureParentDirCreated(tmp);
+
+ fd = open(tmp.c_str(), O_CREAT | O_RDWR | O_NOATIME, S_IRUSR | S_IWUSR);
+ if ( fd <= 0 ) {
+ log() << "FileAllocator: couldn't create " << name << " (" << tmp << ") " << errnoWithDescription() << endl;
+ uasserted(10439, "");
+ }
+
+#if defined(POSIX_FADV_DONTNEED)
+ if( posix_fadvise(fd, 0, size, POSIX_FADV_DONTNEED) ) {
+ log() << "warning: posix_fadvise fails " << name << " (" << tmp << ") " << errnoWithDescription() << endl;
+ }
+#endif
+
+ Timer t;
+
+ /* make sure the file is the full desired length */
+ ensureLength( fd , size );
+
+ close( fd );
+ fd = 0;
+
+ if( rename(tmp.c_str(), name.c_str()) ) {
+ log() << "error: couldn't rename " << tmp << " to " << name << ' ' << errnoWithDescription() << endl;
+ uasserted(13653, "");
+ }
+ flushMyDirectory(name);
+
+ log() << "done allocating datafile " << name << ", "
+ << "size: " << size/1024/1024 << "MB, "
+ << " took " << ((double)t.millis())/1000.0 << " secs"
+ << endl;
+
+ // no longer in a failed state. allow new writers.
+ fa->_failed = false;
+ }
+ catch ( ... ) {
+ if ( fd > 0 )
+ close( fd );
+ log() << "error failed to allocate new file: " << name
+ << " size: " << size << ' ' << errnoWithDescription() << warnings;
+ log() << " will try again in 10 seconds" << endl; // not going to warning logs
+ try {
+ if ( tmp.size() )
+ BOOST_CHECK_EXCEPTION( boost::filesystem::remove( tmp ) );
+ BOOST_CHECK_EXCEPTION( boost::filesystem::remove( name ) );
+ }
+ catch ( ... ) {
+ }
+ scoped_lock lk( fa->_pendingMutex );
+ fa->_failed = true;
+ // not erasing from pending
+ fa->_pendingUpdated.notify_all();
+
+
+ sleepsecs(10);
+ continue;
+ }
+
+ {
+ scoped_lock lk( fa->_pendingMutex );
+ fa->_pendingSize.erase( name );
+ fa->_pending.pop_front();
+ fa->_pendingUpdated.notify_all();
+ }
+ }
+ }
+ }
+
+#endif
+
+ FileAllocator* FileAllocator::_instance = 0;
+
+ FileAllocator* FileAllocator::get(){
+ if ( ! _instance )
+ _instance = new FileAllocator();
+ return _instance;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/util/file_allocator.h b/src/mongo/util/file_allocator.h
new file mode 100644
index 00000000000..7c3cacb2888
--- /dev/null
+++ b/src/mongo/util/file_allocator.h
@@ -0,0 +1,91 @@
+// @file file_allocator.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../pch.h"
+
+namespace mongo {
+
+ /*
+ * Handles allocation of contiguous files on disk. Allocation may be
+ * requested asynchronously or synchronously.
+ * singleton
+ */
+ class FileAllocator : boost::noncopyable {
+ /*
+ * The public functions may not be called concurrently. The allocation
+ * functions may be called multiple times per file, but only the first
+ * size specified per file will be used.
+ */
+ public:
+ void start();
+
+ /**
+ * May be called if file exists. If file exists, or its allocation has
+ * been requested, size is updated to match existing file size.
+ */
+ void requestAllocation( const string &name, long &size );
+
+
+ /**
+ * Returns when file has been allocated. If file exists, size is
+ * updated to match existing file size.
+ */
+ void allocateAsap( const string &name, unsigned long long &size );
+
+ void waitUntilFinished() const;
+
+ bool hasFailed() const;
+
+ static void ensureLength(int fd , long size);
+
+        /** @return the singleton */
+ static FileAllocator * get();
+
+ private:
+
+ FileAllocator();
+
+#if !defined(_WIN32)
+ void checkFailure();
+
+ // caller must hold pendingMutex_ lock. Returns size if allocated or
+ // allocation requested, -1 otherwise.
+ long prevSize( const string &name ) const;
+
+ // caller must hold pendingMutex_ lock.
+ bool inProgress( const string &name ) const;
+
+        /** called from the worker thread */
+ static void run( FileAllocator * fa );
+
+ mutable mongo::mutex _pendingMutex;
+ mutable boost::condition _pendingUpdated;
+
+ list< string > _pending;
+ mutable map< string, long > _pendingSize;
+
+ bool _failed;
+#endif
+
+ static FileAllocator* _instance;
+
+ };
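+
+    /* Usage sketch (not from the original source; "mydb.0" is a hypothetical
+       datafile name). On Windows these calls are no-ops, per the .cpp:
+
+        FileAllocator::get()->start();   // once, at startup
+        long size = 64 * 1024 * 1024;
+        FileAllocator::get()->requestAllocation( "mydb.0", size );  // async
+        // ...
+        FileAllocator::get()->waitUntilFinished();
+    */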
+
+ /** like "mkdir -p" but on parent dir of p rather than p itself */
+ boost::filesystem::path ensureParentDirCreated(const boost::filesystem::path& p);
+
+} // namespace mongo
diff --git a/src/mongo/util/goodies.h b/src/mongo/util/goodies.h
new file mode 100644
index 00000000000..9398b5c3f1d
--- /dev/null
+++ b/src/mongo/util/goodies.h
@@ -0,0 +1,475 @@
+// @file goodies.h
+// miscellaneous
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../bson/util/misc.h"
+#include "concurrency/mutex.h"
+
+namespace mongo {
+
+ /* @return a dump of the buffer as hex byte ascii output */
+ string hexdump(const char *data, unsigned len);
+
+ /**
+ * @return if this name has an increasing counter associated, return the value
+ * otherwise 0
+ */
+ unsigned setThreadName(const char * name);
+ string getThreadName();
+
+ template<class T>
+ inline string ToString(const T& t) {
+ stringstream s;
+ s << t;
+ return s.str();
+ }
+
+#if !defined(_WIN32) && !defined(NOEXECINFO) && !defined(__freebsd__) && !defined(__openbsd__) && !defined(__sun__)
+
+} // namespace mongo
+
+#include <pthread.h>
+#include <execinfo.h>
+
+namespace mongo {
+
+ inline pthread_t GetCurrentThreadId() {
+ return pthread_self();
+ }
+
+ /* use "addr2line -CFe <exe>" to parse. */
+ inline void printStackTrace( ostream &o = cout ) {
+ void *b[20];
+
+ int size = backtrace(b, 20);
+ for (int i = 0; i < size; i++)
+ o << hex << b[i] << dec << ' ';
+ o << endl;
+
+ char **strings;
+
+ strings = backtrace_symbols(b, size);
+ for (int i = 0; i < size; i++)
+ o << ' ' << strings[i] << '\n';
+ o.flush();
+ free (strings);
+ }
+#else
+ inline void printStackTrace( ostream &o = cout ) { }
+#endif
+
+ bool isPrime(int n);
+ int nextPrime(int n);
+
+ inline void dumpmemory(const char *data, int len) {
+ if ( len > 1024 )
+ len = 1024;
+ try {
+ const char *q = data;
+ const char *p = q;
+ while ( len > 0 ) {
+ for ( int i = 0; i < 16; i++ ) {
+ if ( *p >= 32 && *p <= 126 )
+ cout << *p;
+ else
+ cout << '.';
+ p++;
+ }
+ cout << " ";
+ p -= 16;
+ for ( int i = 0; i < 16; i++ )
+ cout << (unsigned) ((unsigned char)*p++) << ' ';
+ cout << endl;
+ len -= 16;
+ }
+ }
+ catch (...) {
+ }
+ }
+
+// PRINT(2+2); prints "2+2: 4"
+#define MONGO_PRINT(x) cout << #x ": " << (x) << endl
+#define PRINT MONGO_PRINT
+// PRINTFL; prints file:line
+#define MONGO_PRINTFL cout << __FILE__ ":" << __LINE__ << endl
+#define PRINTFL MONGO_PRINTFL
+#define MONGO_FLOG log() << __FILE__ ":" << __LINE__ << endl
+#define FLOG MONGO_FLOG
+
+#undef assert
+#define assert MONGO_assert
+
+ inline bool startsWith(const char *str, const char *prefix) {
+ size_t l = strlen(prefix);
+ if ( strlen(str) < l ) return false;
+ return strncmp(str, prefix, l) == 0;
+ }
+ inline bool startsWith(string s, string p) { return startsWith(s.c_str(), p.c_str()); }
+
+ inline bool endsWith(const char *p, const char *suffix) {
+ size_t a = strlen(p);
+ size_t b = strlen(suffix);
+ if ( b > a ) return false;
+ return strcmp(p + a - b, suffix) == 0;
+ }
+
+ inline unsigned long swapEndian(unsigned long x) {
+ return
+ ((x & 0xff) << 24) |
+ ((x & 0xff00) << 8) |
+ ((x & 0xff0000) >> 8) |
+ ((x & 0xff000000) >> 24);
+ }
+
+#if defined(BOOST_LITTLE_ENDIAN)
+ inline unsigned long fixEndian(unsigned long x) {
+ return x;
+ }
+#else
+ inline unsigned long fixEndian(unsigned long x) {
+ return swapEndian(x);
+ }
+#endif
+
+#if !defined(_WIN32)
+ typedef int HANDLE;
+ inline void strcpy_s(char *dst, unsigned len, const char *src) {
+ assert( strlen(src) < len );
+ strcpy(dst, src);
+ }
+#else
+ typedef void *HANDLE;
+#endif
+
+ class ProgressMeter : boost::noncopyable {
+ public:
+ ProgressMeter( unsigned long long total , int secondsBetween = 3 , int checkInterval = 100 , string units = "" ) : _units(units) {
+ reset( total , secondsBetween , checkInterval );
+ }
+
+ ProgressMeter() {
+ _active = 0;
+ _units = "";
+ }
+
+ // typically you do ProgressMeterHolder
+ void reset( unsigned long long total , int secondsBetween = 3 , int checkInterval = 100 ) {
+ _total = total;
+ _secondsBetween = secondsBetween;
+ _checkInterval = checkInterval;
+
+ _done = 0;
+ _hits = 0;
+ _lastTime = (int)time(0);
+
+ _active = 1;
+ }
+
+ void finished() {
+ _active = 0;
+ }
+
+ bool isActive() {
+ return _active;
+ }
+
+ /**
+ * @param n how far along we are relative to the total # we set in CurOp::setMessage
+ * @return if row was printed
+ */
+ bool hit( int n = 1 ) {
+ if ( ! _active ) {
+ cout << "warning: hit an inactive ProgressMeter" << endl;
+ return false;
+ }
+
+ _done += n;
+ _hits++;
+ if ( _hits % _checkInterval )
+ return false;
+
+ int t = (int) time(0);
+ if ( t - _lastTime < _secondsBetween )
+ return false;
+
+ if ( _total > 0 ) {
+ int per = (int)( ( (double)_done * 100.0 ) / (double)_total );
+ cout << "\t\t" << _done << "/" << _total << "\t" << per << "%";
+
+ if ( ! _units.empty() ) {
+ cout << "\t(" << _units << ")" << endl;
+ }
+ else {
+ cout << endl;
+ }
+ }
+ _lastTime = t;
+ return true;
+ }
+
+ void setUnits( string units ) {
+ _units = units;
+ }
+
+ void setTotalWhileRunning( unsigned long long total ) {
+ _total = total;
+ }
+
+ unsigned long long done() const { return _done; }
+
+ unsigned long long hits() const { return _hits; }
+
+ unsigned long long total() const { return _total; }
+
+ string toString() const {
+ if ( ! _active )
+ return "";
+ stringstream buf;
+ buf << _done << "/" << _total << " " << (_done*100)/_total << "%";
+
+ if ( ! _units.empty() ) {
+ buf << "\t(" << _units << ")" << endl;
+ }
+
+ return buf.str();
+ }
+
+ bool operator==( const ProgressMeter& other ) const {
+ return this == &other;
+ }
+ private:
+
+ bool _active;
+
+ unsigned long long _total;
+ int _secondsBetween;
+ int _checkInterval;
+
+ unsigned long long _done;
+ unsigned long long _hits;
+ int _lastTime;
+
+ string _units;
+ };
+
+ // e.g.:
+ // CurOp * op = cc().curop();
+ // ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->stats.nrecords , 10 ) );
+ // loop { pm.hit(); }
+ class ProgressMeterHolder : boost::noncopyable {
+ public:
+ ProgressMeterHolder( ProgressMeter& pm )
+ : _pm( pm ) {
+ }
+
+ ~ProgressMeterHolder() {
+ _pm.finished();
+ }
+
+ ProgressMeter* operator->() {
+ return &_pm;
+ }
+
+ bool hit( int n = 1 ) {
+ return _pm.hit( n );
+ }
+
+ void finished() {
+ _pm.finished();
+ }
+
+ bool operator==( const ProgressMeter& other ) {
+ return _pm == other;
+ }
+
+ private:
+ ProgressMeter& _pm;
+ };
+
+ class TicketHolder {
+ public:
+ TicketHolder( int num ) : _mutex("TicketHolder") {
+ _outof = num;
+ _num = num;
+ }
+
+ bool tryAcquire() {
+ scoped_lock lk( _mutex );
+ if ( _num <= 0 ) {
+ if ( _num < 0 ) {
+ cerr << "DISASTER! in TicketHolder" << endl;
+ }
+ return false;
+ }
+ _num--;
+ return true;
+ }
+
+ void release() {
+ scoped_lock lk( _mutex );
+ _num++;
+ }
+
+ void resize( int newSize ) {
+ scoped_lock lk( _mutex );
+ int used = _outof - _num;
+ if ( used > newSize ) {
+ cout << "ERROR: can't resize since we're using (" << used << ") more than newSize(" << newSize << ")" << endl;
+ return;
+ }
+
+ _outof = newSize;
+ _num = _outof - used;
+ }
+
+ int available() const {
+ return _num;
+ }
+
+ int used() const {
+ return _outof - _num;
+ }
+
+ int outof() const { return _outof; }
+
+ private:
+ int _outof;
+ int _num;
+ mongo::mutex _mutex;
+ };
+
+ class TicketHolderReleaser {
+ public:
+ TicketHolderReleaser( TicketHolder * holder ) {
+ _holder = holder;
+ }
+
+ ~TicketHolderReleaser() {
+ _holder->release();
+ }
+ private:
+ TicketHolder * _holder;
+ };
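+
+    // e.g. (sketch, not from the original source):
+    //   TicketHolder tickets( 20 );
+    //   if ( tickets.tryAcquire() ) {
+    //       TicketHolderReleaser releaser( &tickets );  // releases on scope exit
+    //       /* ... do rate-limited work ... */
+    //   }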
+
+
+ /**
+ * this is a thread safe string
+     * you will never get a bad pointer, though data may be munged
+ */
+ class ThreadSafeString : boost::noncopyable {
+ public:
+ ThreadSafeString( size_t size=256 )
+ : _size( size ) , _buf( new char[size] ) {
+ memset( _buf , 0 , _size );
+ }
+
+ ThreadSafeString( const ThreadSafeString& other )
+ : _size( other._size ) , _buf( new char[_size] ) {
+ strncpy( _buf , other._buf , _size );
+ }
+
+ ~ThreadSafeString() {
+ delete[] _buf;
+ _buf = 0;
+ }
+
+ string toString() const {
+ string s = _buf;
+ return s;
+ }
+
+ ThreadSafeString& operator=( const char * str ) {
+ size_t s = strlen(str);
+ if ( s >= _size - 2 )
+ s = _size - 2;
+ strncpy( _buf , str , s );
+ _buf[s] = 0;
+ return *this;
+ }
+
+ bool operator==( const ThreadSafeString& other ) const {
+ return strcmp( _buf , other._buf ) == 0;
+ }
+
+ bool operator==( const char * str ) const {
+ return strcmp( _buf , str ) == 0;
+ }
+
+ bool operator!=( const char * str ) const {
+ return strcmp( _buf , str ) != 0;
+ }
+
+ bool empty() const {
+ return _buf == 0 || _buf[0] == 0;
+ }
+
+ private:
+ size_t _size;
+ char * _buf;
+ };
+
+ ostream& operator<<( ostream &s, const ThreadSafeString &o );
+
+ /** A generic pointer type for function arguments.
+ * It will convert from any pointer type except auto_ptr.
+ * Semantics are the same as passing the pointer returned from get()
+ * const ptr<T> => T * const
+ * ptr<const T> => T const * or const T*
+ */
+ template <typename T>
+ struct ptr {
+
+ ptr() : _p(NULL) {}
+
+ // convert to ptr<T>
+ ptr(T* p) : _p(p) {} // needed for NULL
+ template<typename U> ptr(U* p) : _p(p) {}
+ template<typename U> ptr(const ptr<U>& p) : _p(p) {}
+ template<typename U> ptr(const boost::shared_ptr<U>& p) : _p(p.get()) {}
+ template<typename U> ptr(const boost::scoped_ptr<U>& p) : _p(p.get()) {}
+ //template<typename U> ptr(const auto_ptr<U>& p) : _p(p.get()) {}
+
+ // assign to ptr<T>
+ ptr& operator= (T* p) { _p = p; return *this; } // needed for NULL
+ template<typename U> ptr& operator= (U* p) { _p = p; return *this; }
+ template<typename U> ptr& operator= (const ptr<U>& p) { _p = p; return *this; }
+ template<typename U> ptr& operator= (const boost::shared_ptr<U>& p) { _p = p.get(); return *this; }
+ template<typename U> ptr& operator= (const boost::scoped_ptr<U>& p) { _p = p.get(); return *this; }
+ //template<typename U> ptr& operator= (const auto_ptr<U>& p) { _p = p.get(); return *this; }
+
+ // use
+ T* operator->() const { return _p; }
+ T& operator*() const { return *_p; }
+
+ // convert from ptr<T>
+ operator T* () const { return _p; }
+
+ private:
+ T* _p;
+ };
+
+
+
+ using boost::shared_ptr;
+ using boost::scoped_ptr;
+ using boost::scoped_array;
+ using boost::intrusive_ptr;
+ using boost::bad_lexical_cast;
+ using boost::dynamic_pointer_cast;
+} // namespace mongo
diff --git a/src/mongo/util/hashtab.h b/src/mongo/util/hashtab.h
new file mode 100644
index 00000000000..f1a33068e07
--- /dev/null
+++ b/src/mongo/util/hashtab.h
@@ -0,0 +1,179 @@
+/* hashtab.h
+
+ Simple, fixed size hash table. Darn simple.
+
+ Uses a contiguous block of memory, so you can put it in a memory mapped file very easily.
+*/
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../pch.h"
+#include <map>
+#include "../db/dur.h"
+
+namespace mongo {
+
+#pragma pack(1)
+
+    /* you should define:
+
+       int Key::hash() -- must return > 0, always.
+    */
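+
+    /* Example key (illustrative sketch, not from the original source; real keys
+       must also satisfy the sizeof(Node) check in the constructor below):
+
+        struct IntKey {
+            int i;
+            int hash() const { return (i & 0x7fffffff) | 1; } // always > 0
+            bool operator==(const IntKey& r) const { return i == r.i; }
+            void kill() { i = 0; }  // called by HashTable::kill() on removal
+        };
+    */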
+
+ template <
+ class Key,
+ class Type
+ >
+ class HashTable : boost::noncopyable {
+ public:
+ const char *name;
+ struct Node {
+ int hash;
+ Key k;
+ Type value;
+ bool inUse() {
+ return hash != 0;
+ }
+ void setUnused() {
+ hash = 0;
+ }
+ };
+ void* _buf;
+ int n; // number of hashtable buckets
+ int maxChain;
+
+ Node& nodes(int i) {
+ Node *nodes = (Node *) _buf;
+ return nodes[i];
+ }
+
+ int _find(const Key& k, bool& found) {
+ found = false;
+ int h = k.hash();
+ int i = h % n;
+ int start = i;
+ int chain = 0;
+ int firstNonUsed = -1;
+ while ( 1 ) {
+ if ( !nodes(i).inUse() ) {
+ if ( firstNonUsed < 0 )
+ firstNonUsed = i;
+ }
+
+ if ( nodes(i).hash == h && nodes(i).k == k ) {
+ if ( chain >= 200 )
+ out() << "warning: hashtable " << name << " long chain " << endl;
+ found = true;
+ return i;
+ }
+ chain++;
+ i = (i+1) % n;
+ if ( i == start ) {
+ // shouldn't get here / defensive for infinite loops
+ out() << "error: hashtable " << name << " is full n:" << n << endl;
+ return -1;
+ }
+ if( chain >= maxChain ) {
+ if ( firstNonUsed >= 0 )
+ return firstNonUsed;
+ out() << "error: hashtable " << name << " max chain reached:" << maxChain << endl;
+ return -1;
+ }
+ }
+ }
+
+ public:
+ /* buf must be all zeroes on initialization. */
+ HashTable(void* buf, int buflen, const char *_name) : name(_name) {
+ int m = sizeof(Node);
+ // out() << "hashtab init, buflen:" << buflen << " m:" << m << endl;
+ n = buflen / m;
+ if ( (n & 1) == 0 )
+ n--;
+ maxChain = (int) (n * 0.05);
+ _buf = buf;
+ //nodes = (Node *) buf;
+
+ if ( sizeof(Node) != 628 ) {
+ out() << "HashTable() " << _name << " sizeof(node):" << sizeof(Node) << " n:" << n << " sizeof(Key): " << sizeof(Key) << " sizeof(Type):" << sizeof(Type) << endl;
+ assert( sizeof(Node) == 628 );
+ }
+
+ }
+
+ Type* get(const Key& k) {
+ bool found;
+ int i = _find(k, found);
+ if ( found )
+ return &nodes(i).value;
+ return 0;
+ }
+
+ void kill(const Key& k) {
+ bool found;
+ int i = _find(k, found);
+ if ( i >= 0 && found ) {
+ Node* n = &nodes(i);
+ n = getDur().writing(n);
+ n->k.kill();
+ n->setUnused();
+ }
+ }
+
+ /** returns false if too full */
+ bool put(const Key& k, const Type& value) {
+ bool found;
+ int i = _find(k, found);
+ if ( i < 0 )
+ return false;
+ Node* n = getDur().writing( &nodes(i) );
+ if ( !found ) {
+ n->k = k;
+ n->hash = k.hash();
+ }
+ else {
+ assert( n->hash == k.hash() );
+ }
+ n->value = value;
+ return true;
+ }
+
+ typedef void (*IteratorCallback)( const Key& k , Type& v );
+ void iterAll( IteratorCallback callback ) {
+ for ( int i=0; i<n; i++ ) {
+ if ( nodes(i).inUse() ) {
+ callback( nodes(i).k , nodes(i).value );
+ }
+ }
+ }
+
+ // TODO: should probably use boost::bind for this, but didn't want to look at it
+ typedef void (*IteratorCallback2)( const Key& k , Type& v , void * extra );
+ void iterAll( IteratorCallback2 callback , void * extra ) {
+ for ( int i=0; i<n; i++ ) {
+ if ( nodes(i).inUse() ) {
+ callback( nodes(i).k , nodes(i).value , extra );
+ }
+ }
+ }
+
+ };
+
+#pragma pack()
+
+} // namespace mongo
diff --git a/src/mongo/util/heapcheck.h b/src/mongo/util/heapcheck.h
new file mode 100644
index 00000000000..95da9538db5
--- /dev/null
+++ b/src/mongo/util/heapcheck.h
@@ -0,0 +1,33 @@
+// @file heapcheck.h
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#if defined(HEAP_CHECKING)
+
+#include <google/heap-checker.h>
+
+#define IGNORE_OBJECT( a ) HeapLeakChecker::IgnoreObject( a )
+#define UNIGNORE_OBJECT( a ) HeapLeakChecker::UnIgnoreObject( a )
+
+#else
+
+#define IGNORE_OBJECT( a )
+#define UNIGNORE_OBJECT( a )
+
+#endif
diff --git a/src/mongo/util/hex.h b/src/mongo/util/hex.h
new file mode 100644
index 00000000000..8cf30f2d9d3
--- /dev/null
+++ b/src/mongo/util/hex.h
@@ -0,0 +1,67 @@
+// util/hex.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace mongo {
+ //can't use hex namespace because it conflicts with hex iostream function
+ inline int fromHex( char c ) {
+ if ( '0' <= c && c <= '9' )
+ return c - '0';
+ if ( 'a' <= c && c <= 'f' )
+ return c - 'a' + 10;
+ if ( 'A' <= c && c <= 'F' )
+ return c - 'A' + 10;
+ assert( false );
+ return 0xff;
+ }
+ inline char fromHex( const char *c ) {
+ return (char)(( fromHex( c[ 0 ] ) << 4 ) | fromHex( c[ 1 ] ));
+ }
+
+ inline string toHex(const void* inRaw, int len) {
+ static const char hexchars[] = "0123456789ABCDEF";
+
+ StringBuilder out;
+ const char* in = reinterpret_cast<const char*>(inRaw);
+ for (int i=0; i<len; ++i) {
+ char c = in[i];
+ char hi = hexchars[(c & 0xF0) >> 4];
+ char lo = hexchars[(c & 0x0F)];
+
+ out << hi << lo;
+ }
+
+ return out.str();
+ }
+
+ inline string toHexLower(const void* inRaw, int len) {
+ static const char hexchars[] = "0123456789abcdef";
+
+ StringBuilder out;
+ const char* in = reinterpret_cast<const char*>(inRaw);
+ for (int i=0; i<len; ++i) {
+ char c = in[i];
+ char hi = hexchars[(c & 0xF0) >> 4];
+ char lo = hexchars[(c & 0x0F)];
+
+ out << hi << lo;
+ }
+
+ return out.str();
+ }
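+
+    /* Usage sketch (not from the original source):
+
+        string h = toHex( "\x12\xAB", 2 );       // "12AB"
+        string l = toHexLower( "\x12\xAB", 2 );  // "12ab"
+        char c = fromHex( "12" );                // 0x12
+    */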
+}
diff --git a/src/mongo/util/histogram.cpp b/src/mongo/util/histogram.cpp
new file mode 100644
index 00000000000..17a85059d58
--- /dev/null
+++ b/src/mongo/util/histogram.cpp
@@ -0,0 +1,131 @@
+// histogram.cpp
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <iomanip>
+#include <limits>
+#include <sstream>
+
+#include "histogram.h"
+
+namespace mongo {
+
+ using std::ostringstream;
+ using std::setfill;
+ using std::setw;
+
+ Histogram::Histogram( const Options& opts )
+ : _initialValue( opts.initialValue )
+ , _numBuckets( opts.numBuckets )
+ , _boundaries( new uint32_t[_numBuckets] )
+ , _buckets( new uint64_t[_numBuckets] ) {
+
+ // TODO more sanity checks
+ // + not too few buckets
+ // + initialBucket and bucketSize fit within 32 bit ints
+
+ // _boundaries store the maximum value falling in that bucket.
+ if ( opts.exponential ) {
+ uint32_t twoPow = 1; // 2^0
+ for ( uint32_t i = 0; i < _numBuckets - 1; i++) {
+ _boundaries[i] = _initialValue + opts.bucketSize * twoPow;
+                twoPow *= 2; // now 2^(i+1)
+ }
+ }
+ else {
+ _boundaries[0] = _initialValue + opts.bucketSize;
+ for ( uint32_t i = 1; i < _numBuckets - 1; i++ ) {
+ _boundaries[i] = _boundaries[ i-1 ] + opts.bucketSize;
+ }
+ }
+ _boundaries[ _numBuckets-1 ] = std::numeric_limits<uint32_t>::max();
+
+ for ( uint32_t i = 0; i < _numBuckets; i++ ) {
+ _buckets[i] = 0;
+ }
+ }
+
+ Histogram::~Histogram() {
+ delete [] _boundaries;
+ delete [] _buckets;
+ }
+
+ void Histogram::insert( uint32_t element ) {
+ if ( element < _initialValue) return;
+
+ _buckets[ _findBucket(element) ] += 1;
+ }
+
+ string Histogram::toHTML() const {
+ uint64_t max = 0;
+ for ( uint32_t i = 0; i < _numBuckets; i++ ) {
+ if ( _buckets[i] > max ) {
+ max = _buckets[i];
+ }
+ }
+ if ( max == 0 ) {
+ return "histogram is empty\n";
+ }
+
+ // normalize buckets to max
+ const int maxBar = 20;
+ ostringstream ss;
+ for ( uint32_t i = 0; i < _numBuckets; i++ ) {
+ int barSize = _buckets[i] * maxBar / max;
+ ss << string( barSize,'*' )
+ << setfill(' ') << setw( maxBar-barSize + 12 )
+ << _boundaries[i] << '\n';
+ }
+
+ return ss.str();
+ }
+
+ uint64_t Histogram::getCount( uint32_t bucket ) const {
+ if ( bucket >= _numBuckets ) return 0;
+
+ return _buckets[ bucket ];
+ }
+
+ uint32_t Histogram::getBoundary( uint32_t bucket ) const {
+ if ( bucket >= _numBuckets ) return 0;
+
+ return _boundaries[ bucket ];
+ }
+
+ uint32_t Histogram::getBucketsNum() const {
+ return _numBuckets;
+ }
+
+ uint32_t Histogram::_findBucket( uint32_t element ) const {
+ // TODO assert not too small a value?
+
+ uint32_t low = 0;
+ uint32_t high = _numBuckets - 1;
+ while ( low < high ) {
+            // midpoint, i.e. low + ( (high - low) / 2 ); the sum cannot
+            // overflow here because bucket counts are small
+            uint32_t mid = ( low + high ) >> 1;
+ if ( element > _boundaries[ mid ] ) {
+ low = mid + 1;
+ }
+ else {
+ high = mid;
+ }
+ }
+ return low;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/util/histogram.h b/src/mongo/util/histogram.h
new file mode 100644
index 00000000000..40ec5628dda
--- /dev/null
+++ b/src/mongo/util/histogram.h
@@ -0,0 +1,128 @@
+// histogram.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef UTIL_HISTOGRAM_HEADER
+#define UTIL_HISTOGRAM_HEADER
+
+#include "../pch.h"
+
+#include <string>
+
+namespace mongo {
+
+ using std::string;
+
+ /**
+ * A histogram for a 32-bit integer range.
+ */
+ class Histogram {
+ public:
+ /**
+ * Construct a histogram with 'numBuckets' buckets, optionally
+ * having the first bucket start at 'initialValue' rather than
+ * 0. By default, the histogram buckets will be 'bucketSize' wide.
+ *
+ * Usage example:
+ * Histogram::Options opts;
+ * opts.numBuckets = 3;
+ * opts.bucketSize = 10;
+ * Histogram h( opts );
+ *
+ * Generates the bucket ranges [0..10],[11..20],[21..max_int]
+ *
+ * Alternatively, the flag 'exponential' could be turned on, in
+ * which case a bucket's maximum value will be
+ * initialValue + bucketSize * 2 ^ [0..numBuckets-1]
+ *
+ * Usage example:
+ * Histogram::Options opts;
+ * opts.numBuckets = 4;
+ * opts.bucketSize = 125;
+ * opts.exponential = true;
+ * Histogram h( opts );
+ *
+ * Generates the bucket ranges [0..125],[126..250],[251..500],[501..max_int]
+ */
+ struct Options {
+ boost::uint32_t numBuckets;
+ boost::uint32_t bucketSize;
+ boost::uint32_t initialValue;
+
+ // use exponential buckets?
+ bool exponential;
+
+ Options()
+ : numBuckets(0)
+ , bucketSize(0)
+ , initialValue(0)
+ , exponential(false) {}
+ };
+ explicit Histogram( const Options& opts );
+ ~Histogram();
+
+ /**
+ * Find the bucket that 'element' falls into and increment its count.
+ */
+ void insert( boost::uint32_t element );
+
+ /**
+ * Render the histogram as string that can be used inside an
+ * HTML doc.
+ */
+ string toHTML() const;
+
+ // testing interface below -- consider it private
+
+ /**
+ * Return the count for the 'bucket'-th bucket.
+ */
+ boost::uint64_t getCount( boost::uint32_t bucket ) const;
+
+ /**
+ * Return the maximum element that would fall in the
+ * 'bucket'-th bucket.
+ */
+ boost::uint32_t getBoundary( boost::uint32_t bucket ) const;
+
+ /**
+ * Return the number of buckets in this histogram.
+ */
+ boost::uint32_t getBucketsNum() const;
+
+ private:
+ /**
+         * Return the bucket that 'element' falls into. Currently
+         * assumes that 'element' is not below the minimum
+         * 'initialValue'.
+ */
+ boost::uint32_t _findBucket( boost::uint32_t element ) const;
+
+ boost::uint32_t _initialValue; // no value lower than it is recorded
+ boost::uint32_t _numBuckets; // total buckets in the histogram
+
+ // all below owned here
+ boost::uint32_t* _boundaries; // maximum element of each bucket
+ boost::uint64_t* _buckets; // current count of each bucket
+
+ Histogram( const Histogram& );
+ Histogram& operator=( const Histogram& );
+ };
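+
+    // usage sketch (linear buckets, mirroring the Options example above):
+    //     Histogram::Options opts;
+    //     opts.numBuckets = 3;
+    //     opts.bucketSize = 10;
+    //     Histogram h( opts );
+    //     h.insert( 17 );       // falls in the [11..20] bucket
+    //     h.getCount( 1 );      // == 1
+    //     h.getBoundary( 2 );   // == max_int for the last bucket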
+
+} // namespace mongo
+
+#endif // UTIL_HISTOGRAM_HEADER
diff --git a/src/mongo/util/intrusive_counter.cpp b/src/mongo/util/intrusive_counter.cpp
new file mode 100755
index 00000000000..fc01f40b41a
--- /dev/null
+++ b/src/mongo/util/intrusive_counter.cpp
@@ -0,0 +1,30 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "util/intrusive_counter.h"
+
+namespace mongo {
+
+ void IntrusiveCounterUnsigned::addRef() const {
+ ++counter;
+ }
+
+ void IntrusiveCounterUnsigned::release() const {
+ if (!--counter)
+ delete this;
+ }
+
+}
diff --git a/src/mongo/util/intrusive_counter.h b/src/mongo/util/intrusive_counter.h
new file mode 100755
index 00000000000..bcebb6288cf
--- /dev/null
+++ b/src/mongo/util/intrusive_counter.h
@@ -0,0 +1,79 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/noncopyable.hpp>
+
+namespace mongo {
+
+/*
+ IntrusiveCounter is a sharable implementation of a reference counter that
+ objects can use to be compatible with boost::intrusive_ptr<>.
+
+ Some objects that use IntrusiveCounter are immutable, and only have
+ const methods. This may require their pointers to be declared as
+ intrusive_ptr<const ClassName> . In order to be able to share pointers to
+ these immutables, the methods associated with IntrusiveCounter are declared
+ as const, and the counter itself is marked as mutable.
+
+ IntrusiveCounter itself is abstract, allowing for multiple implementations.
+ For example, IntrusiveCounterUnsigned uses ordinary unsigned integers for
+ the reference count, and is good for situations where thread safety is not
+ required. For others, other implementations using atomic integers should
+ be used. For static objects, the implementations of addRef() and release()
+ can be overridden to do nothing.
+ */
+ class IntrusiveCounter :
+ boost::noncopyable {
+ public:
+        virtual ~IntrusiveCounter() {}
+
+        // these are here for the boost intrusive_ptr<> class
+        friend inline void intrusive_ptr_add_ref(const IntrusiveCounter *pIC) {
+            pIC->addRef();
+        }
+        friend inline void intrusive_ptr_release(const IntrusiveCounter *pIC) {
+            pIC->release();
+        }
+
+ virtual void addRef() const = 0;
+ virtual void release() const = 0;
+ };
+
+ class IntrusiveCounterUnsigned :
+ public IntrusiveCounter {
+ public:
+ // virtuals from IntrusiveCounter
+ virtual void addRef() const;
+ virtual void release() const;
+
+ IntrusiveCounterUnsigned();
+
+ private:
+ mutable unsigned counter;
+ };
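+
+    // usage sketch ('Value' is hypothetical): inherit a counter implementation
+    // and boost::intrusive_ptr finds addRef()/release() through the friend
+    // functions declared above.
+    //     class Value : public IntrusiveCounterUnsigned { /* ... */ };
+    //     boost::intrusive_ptr<Value> p( new Value() );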
+
+} // namespace mongo
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline IntrusiveCounterUnsigned::IntrusiveCounterUnsigned():
+ counter(0) {
+ }
+
+} // namespace mongo
diff --git a/src/mongo/util/log.cpp b/src/mongo/util/log.cpp
new file mode 100644
index 00000000000..aa249597b57
--- /dev/null
+++ b/src/mongo/util/log.cpp
@@ -0,0 +1,197 @@
+/** @file log.cpp
+ */
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "assert_util.h"
+#include "assert.h"
+#include <cmath>
+#include "time_support.h"
+using namespace std;
+
+#ifdef _WIN32
+# include <io.h>
+# include <fcntl.h>
+#else
+# include <cxxabi.h>
+# include <sys/file.h>
+#endif
+
+#ifdef _WIN32
+# define dup2 _dup2 // Microsoft headers use ISO C names
+# define fileno _fileno
+#endif
+
+namespace mongo {
+
+ Nullstream nullstream;
+ vector<Tee*>* Logstream::globalTees = 0;
+
+ thread_specific_ptr<Logstream> Logstream::tsp;
+
+ class LoggingManager {
+ public:
+ LoggingManager()
+ : _enabled(0) , _file(0) {
+ }
+
+ void start( const string& lp , bool append ) {
+ uassert( 10268 , "LoggingManager already started" , ! _enabled );
+ _append = append;
+
+ bool exists = boost::filesystem::exists(lp);
+ bool isdir = boost::filesystem::is_directory(lp);
+ bool isreg = boost::filesystem::is_regular_file(lp);
+
+ if ( exists ) {
+ if ( isdir ) {
+ cout << "logpath [" << lp << "] should be a filename, not a directory" << endl;
+
+ dbexit( EXIT_BADOPTIONS );
+ assert( 0 );
+ }
+
+ if ( ! append ) {
+ // only attempt rename if log is regular file
+ if ( isreg ) {
+ stringstream ss;
+ ss << lp << "." << terseCurrentTime( false );
+ string s = ss.str();
+
+ if ( ! rename( lp.c_str() , s.c_str() ) ) {
+                            cout << "log file [" << lp << "] exists; renamed to [" << s << "]" << endl;
+ } else {
+ cout << "log file [" << lp << "] exists and couldn't make backup; run with --logappend or manually remove file (" << strerror(errno) << ")" << endl;
+
+ dbexit( EXIT_BADOPTIONS );
+ assert( 0 );
+ }
+ }
+ }
+ }
+ // test path
+ FILE * test = fopen( lp.c_str() , _append ? "a" : "w" );
+ if ( ! test ) {
+ cout << "can't open [" << lp << "] for log file: " << errnoWithDescription() << endl;
+ dbexit( EXIT_BADOPTIONS );
+ assert( 0 );
+ }
+
+ if (append && exists){
+ // two blank lines before and after
+ const string msg = "\n\n***** SERVER RESTARTED *****\n\n\n";
+ massert(14036, errnoWithPrefix("couldn't write to log file"),
+ fwrite(msg.data(), 1, msg.size(), test) == msg.size());
+ }
+
+ fclose( test );
+
+ _path = lp;
+ _enabled = 1;
+ rotate();
+ }
+
+ void rotate() {
+ if ( ! _enabled ) {
+ cout << "LoggingManager not enabled" << endl;
+ return;
+ }
+
+ if ( _file ) {
+
+#ifdef POSIX_FADV_DONTNEED
+ posix_fadvise(fileno(_file), 0, 0, POSIX_FADV_DONTNEED);
+#endif
+
+ // Rename the (open) existing log file to a timestamped name
+ stringstream ss;
+ ss << _path << "." << terseCurrentTime( false );
+ string s = ss.str();
+ rename( _path.c_str() , s.c_str() );
+ }
+
+ FILE* tmp = 0; // The new file using the original logpath name
+
+#if _WIN32
+ // We rename an open log file (above, on next rotation) and the trick to getting Windows to do that is
+ // to open the file with FILE_SHARE_DELETE. So, we can't use the freopen() call that non-Windows
+ // versions use because it would open the file without the FILE_SHARE_DELETE flag we need.
+ //
+ HANDLE newFileHandle = CreateFileA(
+ _path.c_str(),
+ GENERIC_WRITE,
+ FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+ NULL,
+ OPEN_ALWAYS,
+ FILE_ATTRIBUTE_NORMAL,
+ NULL
+ );
+ if ( INVALID_HANDLE_VALUE != newFileHandle ) {
+ int newFileDescriptor = _open_osfhandle( reinterpret_cast<intptr_t>(newFileHandle), _O_APPEND );
+ tmp = _fdopen( newFileDescriptor, _append ? "a" : "w" );
+ }
+#else
+ tmp = freopen(_path.c_str(), _append ? "a" : "w", stdout);
+#endif
+ if ( !tmp ) {
+ cerr << "can't open: " << _path.c_str() << " for log file" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ assert( 0 );
+ }
+
+ // redirect stdout and stderr to log file
+ dup2( fileno( tmp ), 1 ); // stdout
+ dup2( fileno( tmp ), 2 ); // stderr
+
+ Logstream::setLogFile(tmp); // after this point no thread will be using old file
+
+#if _WIN32
+ if ( _file )
+ fclose( _file ); // In Windows, we still have the old file open, close it now
+#endif
+
+#if 0 // enable to test redirection
+ cout << "written to cout" << endl;
+ cerr << "written to cerr" << endl;
+ log() << "written to log()" << endl;
+#endif
+
+ _file = tmp; // Save new file for next rotation
+ }
+
+ private:
+ bool _enabled;
+ string _path;
+ bool _append;
+ FILE * _file;
+
+ } loggingManager;
+
+ void initLogging( const string& lp , bool append ) {
+ cout << "all output going to: " << lp << endl;
+ loggingManager.start( lp , append );
+ }
+
+ void rotateLogs( int signal ) {
+ loggingManager.rotate();
+ }
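+
+    // usage sketch: initLogging() is called once at startup; rotateLogs()
+    // takes an int so it can be installed directly as a signal handler,
+    // e.g. signal( SIGUSR1, rotateLogs ) -- the wiring itself lives elsewhere.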
+
+ // done *before* static initialization
+ FILE* Logstream::logfile = stdout;
+ bool Logstream::isSyslog = false;
+
+}
diff --git a/src/mongo/util/log.h b/src/mongo/util/log.h
new file mode 100644
index 00000000000..a393d4d29a5
--- /dev/null
+++ b/src/mongo/util/log.h
@@ -0,0 +1,581 @@
+// @file log.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <string.h>
+#include <errno.h>
+#include "../bson/util/builder.h"
+
+#ifndef _WIN32
+#include <syslog.h>
+#endif
+
+namespace mongo {
+
+ enum LogLevel { LL_DEBUG , LL_INFO , LL_NOTICE , LL_WARNING , LL_ERROR , LL_SEVERE };
+
+ inline const char * logLevelToString( LogLevel l ) {
+ switch ( l ) {
+ case LL_DEBUG:
+ case LL_INFO:
+ case LL_NOTICE:
+ return "";
+ case LL_WARNING:
+ return "warning" ;
+ case LL_ERROR:
+ return "ERROR";
+ case LL_SEVERE:
+ return "SEVERE";
+ default:
+ return "UNKNOWN";
+ }
+ }
+
+#ifndef _WIN32
+    inline int logLevelToSysLogLevel( LogLevel l ) {
+ switch ( l ) {
+ case LL_DEBUG:
+ return LOG_DEBUG;
+ case LL_INFO:
+ return LOG_INFO;
+ case LL_NOTICE:
+ return LOG_NOTICE;
+ case LL_WARNING:
+ return LOG_WARNING;
+ case LL_ERROR:
+ return LOG_ERR;
+ case LL_SEVERE:
+ return LOG_CRIT;
+        default:
+            return LOG_INFO; // map to the syslog constant, not the mongo enum
+ }
+ }
+#endif
+
+ class LabeledLevel {
+ public:
+
+ LabeledLevel( int level ) : _level( level ) {}
+ LabeledLevel( const char* label, int level ) : _label( label ), _level( level ) {}
+ LabeledLevel( const string& label, int level ) : _label( label ), _level( level ) {}
+
+ LabeledLevel operator+( int i ) const {
+ return LabeledLevel( _label, _level + i );
+ }
+
+ LabeledLevel operator+( const char* label ) const {
+ if( _label == "" )
+ return LabeledLevel( label, _level );
+ return LabeledLevel( _label + string("::") + label, _level );
+ }
+
+        LabeledLevel operator+( const string& label ) const {
+ return LabeledLevel( _label + string("::") + label, _level );
+ }
+
+ LabeledLevel operator-( int i ) const {
+ return LabeledLevel( _label, _level - i );
+ }
+
+ const string& getLabel() const { return _label; }
+ int getLevel() const { return _level; }
+
+ private:
+ string _label;
+ int _level;
+ };
+
+ class LazyString {
+ public:
+ virtual ~LazyString() {}
+ virtual string val() const = 0;
+ };
+
+ // Utility class for stringifying object only when val() called.
+ template< class T >
+ class LazyStringImpl : public LazyString {
+ public:
+ LazyStringImpl( const T &t ) : t_( t ) {}
+ virtual string val() const { return t_.toString(); }
+ private:
+ const T& t_;
+ };
+
+ class Tee {
+ public:
+ virtual ~Tee() {}
+ virtual void write(LogLevel level , const string& str) = 0;
+ };
+
+ class Nullstream {
+ public:
+ virtual Nullstream& operator<< (Tee* tee) {
+ return *this;
+ }
+ virtual ~Nullstream() {}
+ virtual Nullstream& operator<<(const char *) {
+ return *this;
+ }
+ virtual Nullstream& operator<<(const string& ) {
+ return *this;
+ }
+ virtual Nullstream& operator<<(const StringData& ) {
+ return *this;
+ }
+ virtual Nullstream& operator<<(char *) {
+ return *this;
+ }
+ virtual Nullstream& operator<<(char) {
+ return *this;
+ }
+ virtual Nullstream& operator<<(int) {
+ return *this;
+ }
+ virtual Nullstream& operator<<(ExitCode) {
+ return *this;
+ }
+ virtual Nullstream& operator<<(unsigned long) {
+ return *this;
+ }
+ virtual Nullstream& operator<<(long) {
+ return *this;
+ }
+ virtual Nullstream& operator<<(unsigned) {
+ return *this;
+ }
+ virtual Nullstream& operator<<(unsigned short) {
+ return *this;
+ }
+ virtual Nullstream& operator<<(double) {
+ return *this;
+ }
+ virtual Nullstream& operator<<(void *) {
+ return *this;
+ }
+ virtual Nullstream& operator<<(const void *) {
+ return *this;
+ }
+ virtual Nullstream& operator<<(long long) {
+ return *this;
+ }
+ virtual Nullstream& operator<<(unsigned long long) {
+ return *this;
+ }
+ virtual Nullstream& operator<<(bool) {
+ return *this;
+ }
+ virtual Nullstream& operator<<(const LazyString&) {
+ return *this;
+ }
+ template< class T >
+ Nullstream& operator<<(T *t) {
+ return operator<<( static_cast<void*>( t ) );
+ }
+ template< class T >
+ Nullstream& operator<<(const T *t) {
+ return operator<<( static_cast<const void*>( t ) );
+ }
+ template< class T >
+ Nullstream& operator<<(const shared_ptr<T> p ) {
+ T * t = p.get();
+ if ( ! t )
+ *this << "null";
+ else
+ *this << *t;
+ return *this;
+ }
+ template< class T >
+ Nullstream& operator<<(const T &t) {
+ return operator<<( static_cast<const LazyString&>( LazyStringImpl< T >( t ) ) );
+ }
+
+ virtual Nullstream& operator<< (ostream& ( *endl )(ostream&)) {
+ return *this;
+ }
+ virtual Nullstream& operator<< (ios_base& (*hex)(ios_base&)) {
+ return *this;
+ }
+
+ virtual void flush(Tee *t = 0) {}
+ };
+ extern Nullstream nullstream;
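+
+    // design note: disabled calls stream into nullstream, whose operator<<
+    // overloads discard everything; LazyString defers toString() on class
+    // types so discarded objects are never stringified. The MONGO_LOG macro
+    // below goes further and skips evaluating the operands at all.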
+
+ class Logstream : public Nullstream {
+ static mongo::mutex mutex;
+ static int doneSetup;
+ stringstream ss;
+ int indent;
+ LogLevel logLevel;
+ static FILE* logfile;
+ static boost::scoped_ptr<ostream> stream;
+ static vector<Tee*> * globalTees;
+ static bool isSyslog;
+ public:
+ inline static void logLockless( const StringData& s );
+
+ static void setLogFile(FILE* f) {
+ scoped_lock lk(mutex);
+ logfile = f;
+ }
+#ifndef _WIN32
+ static void useSyslog(const char * name) {
+ cout << "using syslog ident: " << name << endl;
+
+            // openlog requires a heap-allocated pointer that never changes;
+            // this should only be called once per program execution
+
+ char * newName = (char *) malloc( strlen(name) + 1 );
+ strcpy( newName , name);
+ openlog( newName , LOG_ODELAY , LOG_USER );
+ isSyslog = true;
+ }
+#endif
+
+ static int magicNumber() {
+ return 1717;
+ }
+
+ static int getLogDesc() {
+ int fd = -1;
+ if (logfile != NULL)
+#if defined(_WIN32)
+ // the ISO C++ conformant name is _fileno
+ fd = _fileno( logfile );
+#else
+ fd = fileno( logfile );
+#endif
+ return fd;
+ }
+
+ inline void flush(Tee *t = 0);
+
+ inline Nullstream& setLogLevel(LogLevel l) {
+ logLevel = l;
+ return *this;
+ }
+
+ /** note these are virtual */
+ Logstream& operator<<(const char *x) { ss << x; return *this; }
+ Logstream& operator<<(const string& x) { ss << x; return *this; }
+ Logstream& operator<<(const StringData& x) { ss << x.data(); return *this; }
+ Logstream& operator<<(char *x) { ss << x; return *this; }
+ Logstream& operator<<(char x) { ss << x; return *this; }
+ Logstream& operator<<(int x) { ss << x; return *this; }
+ Logstream& operator<<(ExitCode x) { ss << x; return *this; }
+ Logstream& operator<<(long x) { ss << x; return *this; }
+ Logstream& operator<<(unsigned long x) { ss << x; return *this; }
+ Logstream& operator<<(unsigned x) { ss << x; return *this; }
+ Logstream& operator<<(unsigned short x){ ss << x; return *this; }
+ Logstream& operator<<(double x) { ss << x; return *this; }
+ Logstream& operator<<(void *x) { ss << x; return *this; }
+ Logstream& operator<<(const void *x) { ss << x; return *this; }
+ Logstream& operator<<(long long x) { ss << x; return *this; }
+ Logstream& operator<<(unsigned long long x) { ss << x; return *this; }
+ Logstream& operator<<(bool x) { ss << x; return *this; }
+
+ Logstream& operator<<(const LazyString& x) {
+ ss << x.val();
+ return *this;
+ }
+ Nullstream& operator<< (Tee* tee) {
+ ss << '\n';
+ flush(tee);
+ return *this;
+ }
+ Logstream& operator<< (ostream& ( *_endl )(ostream&)) {
+ ss << '\n';
+ flush(0);
+ return *this;
+ }
+ Logstream& operator<< (ios_base& (*_hex)(ios_base&)) {
+ ss << _hex;
+ return *this;
+ }
+
+ Logstream& prolog() {
+ return *this;
+ }
+
+ void addGlobalTee( Tee * t ) {
+ if ( ! globalTees )
+ globalTees = new vector<Tee*>();
+ globalTees->push_back( t );
+ }
+
+ void indentInc(){ indent++; }
+ void indentDec(){ indent--; }
+ int getIndent() const { return indent; }
+
+ private:
+ static thread_specific_ptr<Logstream> tsp;
+ Logstream() {
+ indent = 0;
+ _init();
+ }
+ void _init() {
+ ss.str("");
+ logLevel = LL_INFO;
+ }
+ public:
+ static Logstream& get() {
+ if ( StaticObserver::_destroyingStatics ) {
+ cout << "Logstream::get called in uninitialized state" << endl;
+ }
+ Logstream *p = tsp.get();
+ if( p == 0 )
+ tsp.reset( p = new Logstream() );
+ return *p;
+ }
+ };
+
+ extern int logLevel;
+ extern int tlogLevel;
+
+ inline Nullstream& out( int level = 0 ) {
+ if ( level > logLevel )
+ return nullstream;
+ return Logstream::get();
+ }
+
+    /* flush the log stream if the log level is
+       at the specified level or higher. */
+    inline void logflush(int level = 0) {
+        if( level <= logLevel )
+            Logstream::get().flush(0);
+    }
+
+ /* without prolog */
+ inline Nullstream& _log( int level = 0 ) {
+ if ( level > logLevel )
+ return nullstream;
+ return Logstream::get();
+ }
+
+ /** logging which we may not want during unit tests (dbtests) runs.
+ set tlogLevel to -1 to suppress tlog() output in a test program. */
+ inline Nullstream& tlog( int level = 0 ) {
+ if ( level > tlogLevel || level > logLevel )
+ return nullstream;
+ return Logstream::get().prolog();
+ }
+
+ // log if debug build or if at a certain level
+ inline Nullstream& dlog( int level ) {
+ if ( level <= logLevel || DEBUG_BUILD )
+ return Logstream::get().prolog();
+ return nullstream;
+ }
+
+ inline Nullstream& log( int level ) {
+ if ( level > logLevel )
+ return nullstream;
+ return Logstream::get().prolog();
+ }
+
+#define MONGO_LOG(level) if ( MONGO_likely(logLevel < (level)) ) { } else log( level )
+#define LOG MONGO_LOG
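+
+    // usage sketch: the if/else form above means the streamed operands are
+    // only evaluated when the level is enabled, e.g.
+    //     LOG(2) << "expensive: " << bigObject.toString() << endl;
+    // costs nothing beyond the comparison when logLevel < 2 ('bigObject' is
+    // hypothetical).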
+
+ inline Nullstream& log( LogLevel l ) {
+ return Logstream::get().prolog().setLogLevel( l );
+ }
+
+ inline Nullstream& log( const LabeledLevel& ll ) {
+ Nullstream& stream = log( ll.getLevel() );
+ if( ll.getLabel() != "" )
+ stream << "[" << ll.getLabel() << "] ";
+ return stream;
+ }
+
+ inline Nullstream& log() {
+ return Logstream::get().prolog();
+ }
+
+ inline Nullstream& error() {
+ return log( LL_ERROR );
+ }
+
+ inline Nullstream& warning() {
+ return log( LL_WARNING );
+ }
+
+ /* default impl returns "" -- mongod overrides */
+ extern const char * (*getcurns)();
+
+ inline Nullstream& problem( int level = 0 ) {
+ if ( level > logLevel )
+ return nullstream;
+ Logstream& l = Logstream::get().prolog();
+ l << ' ' << getcurns() << ' ';
+ return l;
+ }
+
+ /**
+ log to a file rather than stdout
+ defined in assert_util.cpp
+ */
+ void initLogging( const string& logpath , bool append );
+ void rotateLogs( int signal = 0 );
+
+ std::string toUtf8String(const std::wstring& wide);
+
+#if defined(_WIN32)
+ inline string errnoWithDescription(DWORD x = GetLastError()) {
+#else
+ inline string errnoWithDescription(int x = errno) {
+#endif
+ stringstream s;
+ s << "errno:" << x << ' ';
+
+#if defined(_WIN32)
+ LPTSTR errorText = NULL;
+ FormatMessage(
+ FORMAT_MESSAGE_FROM_SYSTEM
+ |FORMAT_MESSAGE_ALLOCATE_BUFFER
+ |FORMAT_MESSAGE_IGNORE_INSERTS,
+ NULL,
+ x, 0,
+ (LPTSTR) &errorText, // output
+ 0, // minimum size for output buffer
+ NULL);
+ if( errorText ) {
+            string msg = toUtf8String(errorText); // don't shadow the DWORD parameter 'x'
+            for( string::iterator i = msg.begin(); i != msg.end(); i++ ) {
+ if( *i == '\n' || *i == '\r' )
+ break;
+ s << *i;
+ }
+ LocalFree(errorText);
+ }
+ else
+ s << strerror(x);
+ /*
+ DWORD n = FormatMessage(
+ FORMAT_MESSAGE_ALLOCATE_BUFFER |
+ FORMAT_MESSAGE_FROM_SYSTEM |
+ FORMAT_MESSAGE_IGNORE_INSERTS,
+ NULL, x,
+ MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+ (LPTSTR) &lpMsgBuf, 0, NULL);
+ */
+#else
+ s << strerror(x);
+#endif
+ return s.str();
+ }
+
+ /** output the error # and error message with prefix.
+ handy for use as parm in uassert/massert.
+ */
+ string errnoWithPrefix( const char * prefix );
+
+ void Logstream::logLockless( const StringData& s ) {
+ if ( s.size() == 0 )
+ return;
+
+ if ( doneSetup == 1717 ) {
+#ifndef _WIN32
+ if ( isSyslog ) {
+ syslog( LOG_INFO , "%s" , s.data() );
+ } else
+#endif
+ if (fwrite(s.data(), s.size(), 1, logfile)) {
+ fflush(logfile);
+ }
+ else {
+ int x = errno;
+ cout << "Failed to write to logfile: " << errnoWithDescription(x) << endl;
+ }
+ }
+ else {
+ cout << s.data();
+ cout.flush();
+ }
+ }
+
+ void Logstream::flush(Tee *t) {
+ // this ensures things are sane
+ if ( doneSetup == 1717 ) {
+ string msg = ss.str();
+ string threadName = getThreadName();
+ const char * type = logLevelToString(logLevel);
+
+ int spaceNeeded = (int)(msg.size() + 64 + threadName.size());
+ int bufSize = 128;
+ while ( bufSize < spaceNeeded )
+ bufSize += 128;
+
+ BufBuilder b(bufSize);
+ time_t_to_String( time(0) , b.grow(20) );
+ if (!threadName.empty()) {
+ b.appendChar( '[' );
+ b.appendStr( threadName , false );
+ b.appendChar( ']' );
+ b.appendChar( ' ' );
+ }
+
+ for ( int i=0; i<indent; i++ )
+ b.appendChar( '\t' );
+
+ if ( type[0] ) {
+ b.appendStr( type , false );
+ b.appendStr( ": " , false );
+ }
+
+ b.appendStr( msg );
+
+ string out( b.buf() , b.len() - 1);
+
+ scoped_lock lk(mutex);
+
+ if( t ) t->write(logLevel,out);
+ if ( globalTees ) {
+ for ( unsigned i=0; i<globalTees->size(); i++ )
+ (*globalTees)[i]->write(logLevel,out);
+ }
+#ifndef _WIN32
+ if ( isSyslog ) {
+ syslog( logLevelToSysLogLevel(logLevel) , "%s" , out.data() );
+ } else
+#endif
+ if(fwrite(out.data(), out.size(), 1, logfile)) {
+ fflush(logfile);
+ }
+ else {
+ int x = errno;
+ cout << "Failed to write to logfile: " << errnoWithDescription(x) << ": " << out << endl;
+ }
+#ifdef POSIX_FADV_DONTNEED
+ // This only applies to pages that have already been flushed
+ RARELY posix_fadvise(fileno(logfile), 0, 0, POSIX_FADV_DONTNEED);
+#endif
+ }
+ _init();
+ }
+
+ struct LogIndentLevel {
+ LogIndentLevel(){
+ Logstream::get().indentInc();
+ }
+ ~LogIndentLevel(){
+ Logstream::get().indentDec();
+ }
+ };
+
+ extern Tee* const warnings; // Things put here go in serverStatus
+
+} // namespace mongo
diff --git a/src/mongo/util/logfile.cpp b/src/mongo/util/logfile.cpp
new file mode 100644
index 00000000000..7c362be08d1
--- /dev/null
+++ b/src/mongo/util/logfile.cpp
@@ -0,0 +1,253 @@
+// @file logfile.cpp simple file log writing / journaling
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "logfile.h"
+#include "text.h"
+#include "mongoutils/str.h"
+#include "unittest.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+ struct LogfileTest : public UnitTest {
+ LogfileTest() { }
+ void run() {
+ if( 0 && debug ) {
+ try {
+ LogFile f("logfile_test");
+ void *p = malloc(16384);
+ char *buf = (char*) p;
+ buf += 4095;
+ buf = (char*) (((size_t)buf)&(~0xfff));
+ memset(buf, 'z', 8192);
+ buf[8190] = '\n';
+ buf[8191] = 'B';
+ buf[0] = 'A';
+ f.synchronousAppend(buf, 8192);
+ f.synchronousAppend(buf, 8192);
+ free(p);
+ }
+ catch(DBException& e ) {
+ log() << "logfile.cpp test failed : " << e.what() << endl;
+ throw;
+ }
+ }
+ }
+ } __test;
+}
+
+#if defined(_WIN32)
+
+namespace mongo {
+
+ LogFile::LogFile(string name, bool readwrite) : _name(name) {
+ _fd = CreateFile(
+ toNativeString(name.c_str()).c_str(),
+ (readwrite?GENERIC_READ:0)|GENERIC_WRITE,
+ FILE_SHARE_READ,
+ NULL,
+ OPEN_ALWAYS,
+ FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH,
+ NULL);
+ if( _fd == INVALID_HANDLE_VALUE ) {
+ DWORD e = GetLastError();
+ uasserted(13518, str::stream() << "couldn't open file " << name << " for writing " << errnoWithDescription(e));
+ }
+ SetFilePointer(_fd, 0, 0, FILE_BEGIN);
+ }
+
+ LogFile::~LogFile() {
+ if( _fd != INVALID_HANDLE_VALUE )
+ CloseHandle(_fd);
+ }
+
+ void LogFile::truncate() {
+ verify(15870, _fd != INVALID_HANDLE_VALUE);
+
+ if (!SetEndOfFile(_fd)){
+ msgasserted(15871, "Couldn't truncate file: " + errnoWithDescription());
+ }
+ }
+
+ void LogFile::writeAt(unsigned long long offset, const void *_buf, size_t _len) {
+// TODO 64 bit offsets
+ OVERLAPPED o;
+ memset(&o,0,sizeof(o));
+ (unsigned long long&) o.Offset = offset;
+ BOOL ok= WriteFile(_fd, _buf, _len, 0, &o);
+ assert(ok);
+ }
+
+ void LogFile::readAt(unsigned long long offset, void *_buf, size_t _len) {
+// TODO 64 bit offsets
+ OVERLAPPED o;
+ memset(&o,0,sizeof(o));
+ (unsigned long long&) o.Offset = offset;
+ DWORD nr;
+ BOOL ok = ReadFile(_fd, _buf, _len, &nr, &o);
+ if( !ok ) {
+ string e = errnoWithDescription();
+ //DWORD e = GetLastError();
+ log() << "LogFile readAt(" << offset << ") len:" << _len << "errno:" << e << endl;
+ assert(false);
+ }
+ }
+
+ void LogFile::synchronousAppend(const void *_buf, size_t _len) {
+ const size_t BlockSize = 8 * 1024 * 1024;
+ assert(_fd);
+ assert(_len % 4096 == 0);
+ const char *buf = (const char *) _buf;
+ size_t left = _len;
+ while( left ) {
+ size_t toWrite = min(left, BlockSize);
+ DWORD written;
+ if( !WriteFile(_fd, buf, toWrite, &written, NULL) ) {
+ DWORD e = GetLastError();
+ if( e == 87 )
+ msgasserted(13519, "error 87 appending to file - invalid parameter");
+ else
+ uasserted(13517, str::stream() << "error appending to file " << _name << ' ' << _len << ' ' << toWrite << ' ' << errnoWithDescription(e));
+ }
+ else {
+ dassert( written == toWrite );
+ }
+ left -= written;
+ buf += written;
+ }
+ }
+
+}
+
+#else
+
+/// posix
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "paths.h"
+
+namespace mongo {
+
+ LogFile::LogFile(string name, bool readwrite) : _name(name) {
+ int options = O_CREAT
+ | (readwrite?O_RDWR:O_WRONLY)
+#if defined(O_DIRECT)
+ | O_DIRECT
+#endif
+#if defined(O_NOATIME)
+ | O_NOATIME
+#endif
+ ;
+
+ _fd = open(name.c_str(), options, S_IRUSR | S_IWUSR);
+
+#if defined(O_DIRECT)
+ _direct = true;
+ if( _fd < 0 ) {
+ _direct = false;
+ options &= ~O_DIRECT;
+ _fd = open(name.c_str(), options, S_IRUSR | S_IWUSR);
+ }
+#else
+ _direct = false;
+#endif
+
+ if( _fd < 0 ) {
+ uasserted(13516, str::stream() << "couldn't open file " << name << " for writing " << errnoWithDescription());
+ }
+
+ flushMyDirectory(name);
+ }
+
+ LogFile::~LogFile() {
+ if( _fd >= 0 )
+ close(_fd);
+ _fd = -1;
+ }
+
+ void LogFile::truncate() {
+ verify(15872, _fd >= 0);
+
+ BOOST_STATIC_ASSERT(sizeof(off_t) == 8); // we don't want overflow here
+ const off_t pos = lseek(_fd, 0, SEEK_CUR); // doesn't actually seek
+ if (ftruncate(_fd, pos) != 0){
+ msgasserted(15873, "Couldn't truncate file: " + errnoWithDescription());
+ }
+
+ fsync(_fd);
+ }
+
+ void LogFile::writeAt(unsigned long long offset, const void *buf, size_t len) {
+ assert(((size_t)buf)%4096==0); // aligned
+ ssize_t written = pwrite(_fd, buf, len, offset);
+ if( written != (ssize_t) len ) {
+ log() << "writeAt fails " << errnoWithDescription() << endl;
+ }
+#if defined(__linux__)
+ fdatasync(_fd);
+#else
+ fsync(_fd);
+#endif
+ }
+
+ void LogFile::readAt(unsigned long long offset, void *_buf, size_t _len) {
+ assert(((size_t)_buf)%4096==0); // aligned
+ ssize_t rd = pread(_fd, _buf, _len, offset);
+ assert( rd != -1 );
+ }
+
+ void LogFile::synchronousAppend(const void *b, size_t len) {
+#ifdef POSIX_FADV_DONTNEED
+ const off_t pos = lseek(_fd, 0, SEEK_CUR); // doesn't actually seek, just get current position
+#endif
+
+ const char *buf = (char *) b;
+ assert(_fd);
+ assert(((size_t)buf)%4096==0); // aligned
+ if( len % 4096 != 0 ) {
+ log() << len << ' ' << len % 4096 << endl;
+ assert(false);
+ }
+ ssize_t written = write(_fd, buf, len);
+ if( written != (ssize_t) len ) {
+ log() << "write fails written:" << written << " len:" << len << " buf:" << buf << ' ' << errnoWithDescription() << endl;
+ uasserted(13515, str::stream() << "error appending to file " << _fd << ' ' << errnoWithDescription());
+ }
+
+ if(
+#if defined(__linux__)
+ fdatasync(_fd) < 0
+#else
+ fsync(_fd)
+#endif
+ ) {
+ uasserted(13514, str::stream() << "error appending to file on fsync " << ' ' << errnoWithDescription());
+ }
+
+#ifdef POSIX_FADV_DONTNEED
+ if (!_direct)
+ posix_fadvise(_fd, pos, len, POSIX_FADV_DONTNEED);
+#endif
+ }
+
+}
+
+#endif
diff --git a/src/mongo/util/logfile.h b/src/mongo/util/logfile.h
new file mode 100644
index 00000000000..e41ecc2f6ec
--- /dev/null
+++ b/src/mongo/util/logfile.h
@@ -0,0 +1,58 @@
+// @file logfile.h simple file log writing / journaling
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ class LogFile {
+ public:
+        /** create the file if necessary and open it.
+            throws UserAssertion on i/o error
+        */
+ LogFile(string name, bool readwrite = false);
+
+ /** closes */
+ ~LogFile();
+
+ /** append to file. does not return until sync'd. uses direct i/o when possible.
+ throws UserAssertion on an i/o error
+ note direct i/o may have alignment requirements
+ */
+ void synchronousAppend(const void *buf, size_t len);
+
+        /** write at the specified offset. must be aligned. does not return until physically written. thread safe */
+        void writeAt(unsigned long long offset, const void *_buf, size_t _len);
+
+ void readAt(unsigned long long offset, void *_buf, size_t _len);
+
+ const string _name;
+
+ void truncate(); // Removes extra data after current position
+
+ private:
+#if defined(_WIN32)
+ typedef HANDLE fd_type;
+#else
+ typedef int fd_type;
+#endif
+ fd_type _fd;
+ bool _direct; // are we using direct I/O
+ };
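+
+    // usage sketch: with direct I/O in effect, buffers handed to
+    // synchronousAppend()/writeAt() must be 4096-byte aligned and a multiple
+    // of 4096 bytes long (see the assertions in logfile.cpp); the unit test
+    // there aligns a malloc'd block by rounding the pointer to a page boundary.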
+
+}
diff --git a/src/mongo/util/lruishmap.h b/src/mongo/util/lruishmap.h
new file mode 100644
index 00000000000..ba91bf6f0f6
--- /dev/null
+++ b/src/mongo/util/lruishmap.h
@@ -0,0 +1,78 @@
+// lru-ish map.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../pch.h"
+#include "../util/goodies.h"
+
+namespace mongo {
+
+ /* Your K object must define:
+ int hash() - must always return > 0.
+ operator==
+ */
+
+ template <class K, class V, int MaxChain>
+ class LRUishMap {
+ public:
+        LRUishMap(int _n) {
+            n = nextPrime(_n);
+            keys = new K[n];
+            hashes = new int[n];
+            values = new V[n]; // was never allocated; find() dereferences it
+            for ( int i = 0; i < n; i++ ) hashes[i] = 0;
+        }
+        ~LRUishMap() {
+            delete[] keys;
+            delete[] hashes;
+            delete[] values;
+        }
+
+ int _find(const K& k, bool& found) {
+ int h = k.hash();
+ assert( h > 0 );
+ int j = h % n;
+ int first = j;
+            for ( int i = 0; i < MaxChain; i++ ) {
+                if ( hashes[j] == h ) {
+                    if ( keys[j] == k ) {
+                        found = true;
+                        return j;
+                    }
+                }
+                else if ( hashes[j] == 0 ) {
+                    found = false;
+                    return j;
+                }
+                j = ( j + 1 ) % n; // advance the probe; otherwise the chain never moves
+            }
+ found = false;
+ return first;
+ }
+
+ V* find(const K& k) {
+ bool found;
+ int j = _find(k, found);
+ return found ? &values[j] : 0;
+ }
+
+ private:
+ int n;
+ K *keys;
+ int *hashes;
+ V *values;
+ };
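+
+    // usage sketch ('Key' is hypothetical; any type with a positive hash()
+    // and operator== satisfies the contract documented above):
+    //     struct Key {
+    //         int v;
+    //         int hash() const { return ( v % 1000 ) + 1; }
+    //         bool operator==( const Key& o ) const { return v == o.v; }
+    //     };
+    //     LRUishMap<Key, std::string, 4> m( 100 );
+    //     Key k = { 7 };
+    //     std::string* hit = m.find( k ); // 0 when absent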
+
+} // namespace mongo
diff --git a/src/mongo/util/md5.c b/src/mongo/util/md5.c
new file mode 100644
index 00000000000..c35d96c5ef5
--- /dev/null
+++ b/src/mongo/util/md5.c
@@ -0,0 +1,381 @@
+/*
+ Copyright (C) 1999, 2000, 2002 Aladdin Enterprises. All rights reserved.
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ L. Peter Deutsch
+ ghost@aladdin.com
+
+ */
+/* $Id: md5.c,v 1.6 2002/04/13 19:20:28 lpd Exp $ */
+/*
+ Independent implementation of MD5 (RFC 1321).
+
+ This code implements the MD5 Algorithm defined in RFC 1321, whose
+ text is available at
+ http://www.ietf.org/rfc/rfc1321.txt
+ The code is derived from the text of the RFC, including the test suite
+ (section A.5) but excluding the rest of Appendix A. It does not include
+ any code or documentation that is identified in the RFC as being
+ copyrighted.
+
+ The original and principal author of md5.c is L. Peter Deutsch
+ <ghost@aladdin.com>. Other authors are noted in the change history
+ that follows (in reverse chronological order):
+
+ 2002-04-13 lpd Clarified derivation from RFC 1321; now handles byte order
+ either statically or dynamically; added missing #include <string.h>
+ in library.
+ 2002-03-11 lpd Corrected argument list for main(), and added int return
+ type, in test program and T value program.
+ 2002-02-21 lpd Added missing #include <stdio.h> in test program.
+ 2000-07-03 lpd Patched to eliminate warnings about "constant is
+ unsigned in ANSI C, signed in traditional"; made test program
+ self-checking.
+ 1999-11-04 lpd Edited comments slightly for automatic TOC extraction.
+ 1999-10-18 lpd Fixed typo in header comment (ansi2knr rather than md5).
+ 1999-05-03 lpd Original version.
+ */
+
+#include "md5.h"
+#include <string.h>
+
+#undef BYTE_ORDER /* 1 = big-endian, -1 = little-endian, 0 = unknown */
+#ifdef ARCH_IS_BIG_ENDIAN
+# define BYTE_ORDER (ARCH_IS_BIG_ENDIAN ? 1 : -1)
+#else
+# define BYTE_ORDER 0
+#endif
+
+#define T_MASK ((md5_word_t)~0)
+#define T1 /* 0xd76aa478 */ (T_MASK ^ 0x28955b87)
+#define T2 /* 0xe8c7b756 */ (T_MASK ^ 0x173848a9)
+#define T3 0x242070db
+#define T4 /* 0xc1bdceee */ (T_MASK ^ 0x3e423111)
+#define T5 /* 0xf57c0faf */ (T_MASK ^ 0x0a83f050)
+#define T6 0x4787c62a
+#define T7 /* 0xa8304613 */ (T_MASK ^ 0x57cfb9ec)
+#define T8 /* 0xfd469501 */ (T_MASK ^ 0x02b96afe)
+#define T9 0x698098d8
+#define T10 /* 0x8b44f7af */ (T_MASK ^ 0x74bb0850)
+#define T11 /* 0xffff5bb1 */ (T_MASK ^ 0x0000a44e)
+#define T12 /* 0x895cd7be */ (T_MASK ^ 0x76a32841)
+#define T13 0x6b901122
+#define T14 /* 0xfd987193 */ (T_MASK ^ 0x02678e6c)
+#define T15 /* 0xa679438e */ (T_MASK ^ 0x5986bc71)
+#define T16 0x49b40821
+#define T17 /* 0xf61e2562 */ (T_MASK ^ 0x09e1da9d)
+#define T18 /* 0xc040b340 */ (T_MASK ^ 0x3fbf4cbf)
+#define T19 0x265e5a51
+#define T20 /* 0xe9b6c7aa */ (T_MASK ^ 0x16493855)
+#define T21 /* 0xd62f105d */ (T_MASK ^ 0x29d0efa2)
+#define T22 0x02441453
+#define T23 /* 0xd8a1e681 */ (T_MASK ^ 0x275e197e)
+#define T24 /* 0xe7d3fbc8 */ (T_MASK ^ 0x182c0437)
+#define T25 0x21e1cde6
+#define T26 /* 0xc33707d6 */ (T_MASK ^ 0x3cc8f829)
+#define T27 /* 0xf4d50d87 */ (T_MASK ^ 0x0b2af278)
+#define T28 0x455a14ed
+#define T29 /* 0xa9e3e905 */ (T_MASK ^ 0x561c16fa)
+#define T30 /* 0xfcefa3f8 */ (T_MASK ^ 0x03105c07)
+#define T31 0x676f02d9
+#define T32 /* 0x8d2a4c8a */ (T_MASK ^ 0x72d5b375)
+#define T33 /* 0xfffa3942 */ (T_MASK ^ 0x0005c6bd)
+#define T34 /* 0x8771f681 */ (T_MASK ^ 0x788e097e)
+#define T35 0x6d9d6122
+#define T36 /* 0xfde5380c */ (T_MASK ^ 0x021ac7f3)
+#define T37 /* 0xa4beea44 */ (T_MASK ^ 0x5b4115bb)
+#define T38 0x4bdecfa9
+#define T39 /* 0xf6bb4b60 */ (T_MASK ^ 0x0944b49f)
+#define T40 /* 0xbebfbc70 */ (T_MASK ^ 0x4140438f)
+#define T41 0x289b7ec6
+#define T42 /* 0xeaa127fa */ (T_MASK ^ 0x155ed805)
+#define T43 /* 0xd4ef3085 */ (T_MASK ^ 0x2b10cf7a)
+#define T44 0x04881d05
+#define T45 /* 0xd9d4d039 */ (T_MASK ^ 0x262b2fc6)
+#define T46 /* 0xe6db99e5 */ (T_MASK ^ 0x1924661a)
+#define T47 0x1fa27cf8
+#define T48 /* 0xc4ac5665 */ (T_MASK ^ 0x3b53a99a)
+#define T49 /* 0xf4292244 */ (T_MASK ^ 0x0bd6ddbb)
+#define T50 0x432aff97
+#define T51 /* 0xab9423a7 */ (T_MASK ^ 0x546bdc58)
+#define T52 /* 0xfc93a039 */ (T_MASK ^ 0x036c5fc6)
+#define T53 0x655b59c3
+#define T54 /* 0x8f0ccc92 */ (T_MASK ^ 0x70f3336d)
+#define T55 /* 0xffeff47d */ (T_MASK ^ 0x00100b82)
+#define T56 /* 0x85845dd1 */ (T_MASK ^ 0x7a7ba22e)
+#define T57 0x6fa87e4f
+#define T58 /* 0xfe2ce6e0 */ (T_MASK ^ 0x01d3191f)
+#define T59 /* 0xa3014314 */ (T_MASK ^ 0x5cfebceb)
+#define T60 0x4e0811a1
+#define T61 /* 0xf7537e82 */ (T_MASK ^ 0x08ac817d)
+#define T62 /* 0xbd3af235 */ (T_MASK ^ 0x42c50dca)
+#define T63 0x2ad7d2bb
+#define T64 /* 0xeb86d391 */ (T_MASK ^ 0x14792c6e)
+
+
+static void
+md5_process(md5_state_t *pms, const md5_byte_t *data /*[64]*/)
+{
+ md5_word_t
+ a = pms->abcd[0], b = pms->abcd[1],
+ c = pms->abcd[2], d = pms->abcd[3];
+ md5_word_t t;
+#if BYTE_ORDER > 0
+ /* Define storage only for big-endian CPUs. */
+ md5_word_t X[16];
+#else
+ /* Define storage for little-endian or both types of CPUs. */
+ md5_word_t xbuf[16];
+ const md5_word_t *X;
+#endif
+
+ {
+#if BYTE_ORDER == 0
+ /*
+ * Determine dynamically whether this is a big-endian or
+ * little-endian machine, since we can use a more efficient
+ * algorithm on the latter.
+ */
+ static const int w = 1;
+
+ if (*((const md5_byte_t *)&w)) /* dynamic little-endian */
+#endif
+#if BYTE_ORDER <= 0 /* little-endian */
+ {
+ /*
+ * On little-endian machines, we can process properly aligned
+ * data without copying it.
+ */
+ if (!((data - (const md5_byte_t *)0) & 3)) {
+ /* data are properly aligned */
+ X = (const md5_word_t *)data;
+ } else {
+ /* not aligned */
+ memcpy(xbuf, data, 64);
+ X = xbuf;
+ }
+ }
+#endif
+#if BYTE_ORDER == 0
+ else /* dynamic big-endian */
+#endif
+#if BYTE_ORDER >= 0 /* big-endian */
+ {
+ /*
+ * On big-endian machines, we must arrange the bytes in the
+ * right order.
+ */
+ const md5_byte_t *xp = data;
+ int i;
+
+# if BYTE_ORDER == 0
+ X = xbuf; /* (dynamic only) */
+# else
+# define xbuf X /* (static only) */
+# endif
+ for (i = 0; i < 16; ++i, xp += 4)
+ xbuf[i] = xp[0] + (xp[1] << 8) + (xp[2] << 16) + (xp[3] << 24);
+ }
+#endif
+ }
+
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+
+ /* Round 1. */
+ /* Let [abcd k s i] denote the operation
+ a = b + ((a + F(b,c,d) + X[k] + T[i]) <<< s). */
+#define F(x, y, z) (((x) & (y)) | (~(x) & (z)))
+#define SET(a, b, c, d, k, s, Ti)\
+ t = a + F(b,c,d) + X[k] + Ti;\
+ a = ROTATE_LEFT(t, s) + b
+ /* Do the following 16 operations. */
+ SET(a, b, c, d, 0, 7, T1);
+ SET(d, a, b, c, 1, 12, T2);
+ SET(c, d, a, b, 2, 17, T3);
+ SET(b, c, d, a, 3, 22, T4);
+ SET(a, b, c, d, 4, 7, T5);
+ SET(d, a, b, c, 5, 12, T6);
+ SET(c, d, a, b, 6, 17, T7);
+ SET(b, c, d, a, 7, 22, T8);
+ SET(a, b, c, d, 8, 7, T9);
+ SET(d, a, b, c, 9, 12, T10);
+ SET(c, d, a, b, 10, 17, T11);
+ SET(b, c, d, a, 11, 22, T12);
+ SET(a, b, c, d, 12, 7, T13);
+ SET(d, a, b, c, 13, 12, T14);
+ SET(c, d, a, b, 14, 17, T15);
+ SET(b, c, d, a, 15, 22, T16);
+#undef SET
+
+ /* Round 2. */
+ /* Let [abcd k s i] denote the operation
+ a = b + ((a + G(b,c,d) + X[k] + T[i]) <<< s). */
+#define G(x, y, z) (((x) & (z)) | ((y) & ~(z)))
+#define SET(a, b, c, d, k, s, Ti)\
+ t = a + G(b,c,d) + X[k] + Ti;\
+ a = ROTATE_LEFT(t, s) + b
+ /* Do the following 16 operations. */
+ SET(a, b, c, d, 1, 5, T17);
+ SET(d, a, b, c, 6, 9, T18);
+ SET(c, d, a, b, 11, 14, T19);
+ SET(b, c, d, a, 0, 20, T20);
+ SET(a, b, c, d, 5, 5, T21);
+ SET(d, a, b, c, 10, 9, T22);
+ SET(c, d, a, b, 15, 14, T23);
+ SET(b, c, d, a, 4, 20, T24);
+ SET(a, b, c, d, 9, 5, T25);
+ SET(d, a, b, c, 14, 9, T26);
+ SET(c, d, a, b, 3, 14, T27);
+ SET(b, c, d, a, 8, 20, T28);
+ SET(a, b, c, d, 13, 5, T29);
+ SET(d, a, b, c, 2, 9, T30);
+ SET(c, d, a, b, 7, 14, T31);
+ SET(b, c, d, a, 12, 20, T32);
+#undef SET
+
+ /* Round 3. */
+ /* Let [abcd k s t] denote the operation
+ a = b + ((a + H(b,c,d) + X[k] + T[i]) <<< s). */
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define SET(a, b, c, d, k, s, Ti)\
+ t = a + H(b,c,d) + X[k] + Ti;\
+ a = ROTATE_LEFT(t, s) + b
+ /* Do the following 16 operations. */
+ SET(a, b, c, d, 5, 4, T33);
+ SET(d, a, b, c, 8, 11, T34);
+ SET(c, d, a, b, 11, 16, T35);
+ SET(b, c, d, a, 14, 23, T36);
+ SET(a, b, c, d, 1, 4, T37);
+ SET(d, a, b, c, 4, 11, T38);
+ SET(c, d, a, b, 7, 16, T39);
+ SET(b, c, d, a, 10, 23, T40);
+ SET(a, b, c, d, 13, 4, T41);
+ SET(d, a, b, c, 0, 11, T42);
+ SET(c, d, a, b, 3, 16, T43);
+ SET(b, c, d, a, 6, 23, T44);
+ SET(a, b, c, d, 9, 4, T45);
+ SET(d, a, b, c, 12, 11, T46);
+ SET(c, d, a, b, 15, 16, T47);
+ SET(b, c, d, a, 2, 23, T48);
+#undef SET
+
+ /* Round 4. */
+ /* Let [abcd k s t] denote the operation
+ a = b + ((a + I(b,c,d) + X[k] + T[i]) <<< s). */
+#define I(x, y, z) ((y) ^ ((x) | ~(z)))
+#define SET(a, b, c, d, k, s, Ti)\
+ t = a + I(b,c,d) + X[k] + Ti;\
+ a = ROTATE_LEFT(t, s) + b
+ /* Do the following 16 operations. */
+ SET(a, b, c, d, 0, 6, T49);
+ SET(d, a, b, c, 7, 10, T50);
+ SET(c, d, a, b, 14, 15, T51);
+ SET(b, c, d, a, 5, 21, T52);
+ SET(a, b, c, d, 12, 6, T53);
+ SET(d, a, b, c, 3, 10, T54);
+ SET(c, d, a, b, 10, 15, T55);
+ SET(b, c, d, a, 1, 21, T56);
+ SET(a, b, c, d, 8, 6, T57);
+ SET(d, a, b, c, 15, 10, T58);
+ SET(c, d, a, b, 6, 15, T59);
+ SET(b, c, d, a, 13, 21, T60);
+ SET(a, b, c, d, 4, 6, T61);
+ SET(d, a, b, c, 11, 10, T62);
+ SET(c, d, a, b, 2, 15, T63);
+ SET(b, c, d, a, 9, 21, T64);
+#undef SET
+
+ /* Then perform the following additions. (That is increment each
+ of the four registers by the value it had before this block
+ was started.) */
+ pms->abcd[0] += a;
+ pms->abcd[1] += b;
+ pms->abcd[2] += c;
+ pms->abcd[3] += d;
+}
+
+void
+md5_init(md5_state_t *pms)
+{
+ pms->count[0] = pms->count[1] = 0;
+ pms->abcd[0] = 0x67452301;
+ pms->abcd[1] = /*0xefcdab89*/ T_MASK ^ 0x10325476;
+ pms->abcd[2] = /*0x98badcfe*/ T_MASK ^ 0x67452301;
+ pms->abcd[3] = 0x10325476;
+}
+
+void
+md5_append(md5_state_t *pms, const md5_byte_t *data, int nbytes)
+{
+ const md5_byte_t *p = data;
+ int left = nbytes;
+ int offset = (pms->count[0] >> 3) & 63;
+ md5_word_t nbits = (md5_word_t)(nbytes << 3);
+
+ if (nbytes <= 0)
+ return;
+
+ /* Update the message length. */
+ pms->count[1] += nbytes >> 29;
+ pms->count[0] += nbits;
+ if (pms->count[0] < nbits)
+ pms->count[1]++;
+
+ /* Process an initial partial block. */
+ if (offset) {
+ int copy = (offset + nbytes > 64 ? 64 - offset : nbytes);
+
+ memcpy(pms->buf + offset, p, copy);
+ if (offset + copy < 64)
+ return;
+ p += copy;
+ left -= copy;
+ md5_process(pms, pms->buf);
+ }
+
+ /* Process full blocks. */
+ for (; left >= 64; p += 64, left -= 64)
+ md5_process(pms, p);
+
+ /* Process a final partial block. */
+ if (left)
+ memcpy(pms->buf, p, left);
+}
+
+void
+md5_finish(md5_state_t *pms, md5_byte_t digest[16])
+{
+ static const md5_byte_t pad[64] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ };
+ md5_byte_t data[8];
+ int i;
+
+ /* Save the length before padding. */
+ for (i = 0; i < 8; ++i)
+ data[i] = (md5_byte_t)(pms->count[i >> 2] >> ((i & 3) << 3));
+ /* Pad to 56 bytes mod 64. */
+ md5_append(pms, pad, ((55 - (pms->count[0] >> 3)) & 63) + 1);
+ /* Append the length. */
+ md5_append(pms, data, 8);
+ for (i = 0; i < 16; ++i)
+ digest[i] = (md5_byte_t)(pms->abcd[i >> 2] >> ((i & 3) << 3));
+}
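+
+/* usage sketch (streaming interface; md5.hpp wraps this in one-shot helpers):
+ *     md5_state_t st;
+ *     md5_init(&st);
+ *     md5_append(&st, (const md5_byte_t *)"abc", 3);
+ *     md5_byte_t digest[16];
+ *     md5_finish(&st, digest);
+ * "abc" hashes to 900150983cd24fb0d6963f7d28e17f72 per the RFC 1321 suite.
+ */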
diff --git a/src/mongo/util/md5.h b/src/mongo/util/md5.h
new file mode 100644
index 00000000000..a3f3b6db0e2
--- /dev/null
+++ b/src/mongo/util/md5.h
@@ -0,0 +1,91 @@
+/*
+ Copyright (C) 1999, 2002 Aladdin Enterprises. All rights reserved.
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ L. Peter Deutsch
+ ghost@aladdin.com
+
+ */
+/* $Id: md5.h,v 1.4 2002/04/13 19:20:28 lpd Exp $ */
+/*
+ Independent implementation of MD5 (RFC 1321).
+
+ This code implements the MD5 Algorithm defined in RFC 1321, whose
+ text is available at
+ http://www.ietf.org/rfc/rfc1321.txt
+ The code is derived from the text of the RFC, including the test suite
+ (section A.5) but excluding the rest of Appendix A. It does not include
+ any code or documentation that is identified in the RFC as being
+ copyrighted.
+
+ The original and principal author of md5.h is L. Peter Deutsch
+ <ghost@aladdin.com>. Other authors are noted in the change history
+ that follows (in reverse chronological order):
+
+ 2002-04-13 lpd Removed support for non-ANSI compilers; removed
+ references to Ghostscript; clarified derivation from RFC 1321;
+ now handles byte order either statically or dynamically.
+ 1999-11-04 lpd Edited comments slightly for automatic TOC extraction.
+ 1999-10-18 lpd Fixed typo in header comment (ansi2knr rather than md5);
+ added conditionalization for C++ compilation from Martin
+ Purschke <purschke@bnl.gov>.
+ 1999-05-03 lpd Original version.
+ */
+
+#ifndef md5_INCLUDED
+# define md5_INCLUDED
+
+/*
+ * This package supports both compile-time and run-time determination of CPU
+ * byte order. If ARCH_IS_BIG_ENDIAN is defined as 0, the code will be
+ * compiled to run only on little-endian CPUs; if ARCH_IS_BIG_ENDIAN is
+ * defined as non-zero, the code will be compiled to run only on big-endian
+ * CPUs; if ARCH_IS_BIG_ENDIAN is not defined, the code will be compiled to
+ * run on either big- or little-endian CPUs, but will run slightly less
+ * efficiently on either one than if ARCH_IS_BIG_ENDIAN is defined.
+ */
+
+typedef unsigned char md5_byte_t; /* 8-bit byte */
+typedef unsigned int md5_word_t; /* 32-bit word */
+
+/* Define the state of the MD5 Algorithm. */
+typedef struct md5_state_s {
+ md5_word_t count[2]; /* message length in bits, lsw first */
+ md5_word_t abcd[4]; /* digest buffer */
+ md5_byte_t buf[64]; /* accumulate block */
+} md5_state_t;
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+ /* Initialize the algorithm. */
+ void md5_init(md5_state_t *pms);
+
+ /* Append a string to the message. */
+ void md5_append(md5_state_t *pms, const md5_byte_t *data, int nbytes);
+
+ /* Finish the message and return the digest. */
+ void md5_finish(md5_state_t *pms, md5_byte_t digest[16]);
+
+#ifdef __cplusplus
+} /* end extern "C" */
+#endif
+
+#endif /* md5_INCLUDED */
diff --git a/src/mongo/util/md5.hpp b/src/mongo/util/md5.hpp
new file mode 100644
index 00000000000..dc061719747
--- /dev/null
+++ b/src/mongo/util/md5.hpp
@@ -0,0 +1,58 @@
+// md5.hpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "md5.h"
+
+namespace mongo {
+
+ typedef unsigned char md5digest[16];
+
+ inline void md5(const void *buf, int nbytes, md5digest digest) {
+ md5_state_t st;
+ md5_init(&st);
+ md5_append(&st, (const md5_byte_t *) buf, nbytes);
+ md5_finish(&st, digest);
+ }
+
+    inline void md5(const char *str, md5digest digest) {
+        md5(str, (int)strlen(str), digest);
+    }
+
+    inline std::string digestToString( md5digest digest ){
+        static const char * letters = "0123456789abcdef";
+        std::stringstream ss;
+        for ( int i=0; i<16; i++){
+            unsigned char c = digest[i];
+            ss << letters[ ( c >> 4 ) & 0xf ] << letters[ c & 0xf ];
+        }
+        return ss.str();
+    }
+
+ inline std::string md5simpledigest( const void* buf, int nbytes){
+ md5digest d;
+ md5( buf, nbytes , d );
+ return digestToString( d );
+ }
+
+    inline std::string md5simpledigest( const std::string& s ){
+        return md5simpledigest(s.data(), (int)s.size());
+    }
+
+
+} // namespace mongo
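
A short sketch of the wrappers above: md5simpledigest performs the init/append/finish sequence in one call, and the expected hex string matches the RFC 1321 "abc" test vector used by md5main.cpp below (the include path is illustrative):

    #include <cassert>
    #include "mongo/util/md5.hpp"

    void md5hpp_demo() {
        std::string hex = mongo::md5simpledigest(std::string("abc"));
        assert( hex == "900150983cd24fb0d6963f7d28e17f72" );
    }
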
diff --git a/src/mongo/util/md5main.cpp b/src/mongo/util/md5main.cpp
new file mode 100644
index 00000000000..9995fee8fa7
--- /dev/null
+++ b/src/mongo/util/md5main.cpp
@@ -0,0 +1,142 @@
+/*
+ Copyright (C) 2002 Aladdin Enterprises. All rights reserved.
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ L. Peter Deutsch
+ ghost@aladdin.com
+
+ */
+/* $Id: md5main.c,v 1.1 2002/04/13 19:20:28 lpd Exp $ */
+/*
+ Independent implementation of MD5 (RFC 1321).
+
+ This code implements the MD5 Algorithm defined in RFC 1321, whose
+ text is available at
+ http://www.ietf.org/rfc/rfc1321.txt
+ The code is derived from the text of the RFC, including the test suite
+ (section A.5) but excluding the rest of Appendix A. It does not include
+ any code or documentation that is identified in the RFC as being
+ copyrighted.
+
+ The original and principal author of md5.c is L. Peter Deutsch
+ <ghost@aladdin.com>. Other authors are noted in the change history
+ that follows (in reverse chronological order):
+
+ 2002-04-13 lpd Splits off main program into a separate file, md5main.c.
+ */
+
+#include "pch.h"
+#include "md5.h"
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+/*
+ * This file builds an executable that performs various functions related
+ * to the MD5 library. Typical compilation:
+ * gcc -o md5main -lm md5main.c md5.c
+ */
+static const char *const usage = "\
+Usage:\n\
+ md5main --test # run the self-test (A.5 of RFC 1321)\n\
+ md5main --t-values # print the T values for the library\n\
+ md5main --version # print the version of the package\n\
+";
+static const char *const version = "2002-04-13";
+
+/* modified: not static, renamed */
+/* Run the self-test. */
+/*static*/ int
+//do_test(void)
+do_md5_test(void) {
+ static const char *const test[7*2] = {
+ "", "d41d8cd98f00b204e9800998ecf8427e",
+ "a", "0cc175b9c0f1b6a831c399e269772661",
+ "abc", "900150983cd24fb0d6963f7d28e17f72",
+ "message digest", "f96b697d7cb7938d525a2f31aaf161d0",
+ "abcdefghijklmnopqrstuvwxyz", "c3fcd3d76192e4007dfb496cca67e13b",
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",
+ "d174ab98d277d9f5a5611c2c9f419d9f",
+ "12345678901234567890123456789012345678901234567890123456789012345678901234567890", "57edf4a22be3c955ac49da2e2107b67a"
+ };
+ int i;
+ int status = 0;
+
+ for (i = 0; i < 7*2; i += 2) {
+ md5_state_t state;
+ md5_byte_t digest[16];
+ char hex_output[16*2 + 1];
+ int di;
+
+ md5_init(&state);
+ md5_append(&state, (const md5_byte_t *)test[i], strlen(test[i]));
+ md5_finish(&state, digest);
+ for (di = 0; di < 16; ++di)
+ sprintf(hex_output + di * 2, "%02x", digest[di]);
+ if (strcmp(hex_output, test[i + 1])) {
+ printf("MD5 (\"%s\") = ", test[i]);
+ puts(hex_output);
+ printf("**** ERROR, should be: %s\n", test[i + 1]);
+ status = 1;
+ }
+ }
+// if (status == 0)
+ /*modified commented out: puts("md5 self-test completed successfully."); */
+ return status;
+}
+
+/* Print the T values. */
+static int
+do_t_values(void) {
+ int i;
+ for (i = 1; i <= 64; ++i) {
+ unsigned long v = (unsigned long)(4294967296.0 * fabs(sin((double)i)));
+
+ /*
+ * The following nonsense is only to avoid compiler warnings about
+ * "integer constant is unsigned in ANSI C, signed with -traditional".
+ */
+ if (v >> 31) {
+ printf("#define T%d /* 0x%08lx */ (T_MASK ^ 0x%08lx)\n", i,
+ v, (unsigned long)(unsigned int)(~v));
+ }
+ else {
+ printf("#define T%d 0x%08lx\n", i, v);
+ }
+ }
+ return 0;
+}
+
+/* modified from original code changed function name main->md5main */
+/* Main program */
+int
+md5main(int argc, char *argv[]) {
+ if (argc == 2) {
+ if (!strcmp(argv[1], "--test"))
+ return do_md5_test();
+ if (!strcmp(argv[1], "--t-values"))
+ return do_t_values();
+ if (!strcmp(argv[1], "--version")) {
+ puts(version);
+ return 0;
+ }
+ }
+ puts(usage);
+ return 0;
+}
+
diff --git a/src/mongo/util/mmap.cpp b/src/mongo/util/mmap.cpp
new file mode 100755
index 00000000000..1eb0242e657
--- /dev/null
+++ b/src/mongo/util/mmap.cpp
@@ -0,0 +1,211 @@
+// mmap.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "mmap.h"
+#include "processinfo.h"
+#include "concurrency/rwlock.h"
+#include "../db/namespace.h"
+#include "../db/cmdline.h"
+
+namespace mongo {
+
+ set<MongoFile*> MongoFile::mmfiles;
+ map<string,MongoFile*> MongoFile::pathToFile;
+
+ /* Create. Must not exist.
+ @param zero fill file with zeros when true
+ */
+ void* MemoryMappedFile::create(string filename, unsigned long long len, bool zero) {
+        uassert( 13468, string("can't create file, already exists: ") + filename, !exists(filename) );
+ void *p = map(filename.c_str(), len);
+ if( p && zero ) {
+ size_t sz = (size_t) len;
+ assert( len == sz );
+ memset(p, 0, sz);
+ }
+ return p;
+ }
+
+ /*static*/ void MemoryMappedFile::updateLength( const char *filename, unsigned long long &length ) {
+ if ( !boost::filesystem::exists( filename ) )
+ return;
+ // make sure we map full length if preexisting file.
+ boost::uintmax_t l = boost::filesystem::file_size( filename );
+ length = l;
+ }
+
+ void* MemoryMappedFile::map(const char *filename) {
+ unsigned long long l;
+ try {
+ l = boost::filesystem::file_size( filename );
+ }
+ catch(boost::filesystem::filesystem_error& e) {
+ uasserted(15922, str::stream() << "couldn't get file length when opening mapping " << filename << ' ' << e.what() );
+ }
+ return map( filename , l );
+ }
+ void* MemoryMappedFile::mapWithOptions(const char *filename, int options) {
+ unsigned long long l;
+ try {
+ l = boost::filesystem::file_size( filename );
+ }
+ catch(boost::filesystem::filesystem_error& e) {
+ uasserted(15923, str::stream() << "couldn't get file length when opening mapping " << filename << ' ' << e.what() );
+ }
+ return map( filename , l, options );
+ }
+
+ /* --- MongoFile -------------------------------------------------
+ this is the administrative stuff
+ */
+
+ RWLockRecursiveNongreedy LockMongoFilesShared::mmmutex("mmmutex",10*60*1000 /* 10 minutes */);
+ unsigned LockMongoFilesShared::era = 99; // note this rolls over
+
+    /* subclass must call in destructor (or at close).
+       removes this from pathToFile and other maps.
+       safe to call more than once, albeit that might be wasted work.
+       ideally called near the close() itself, if the close happens well before object destruction.
+    */
+ void MongoFile::destroyed() {
+ LockMongoFilesShared::assertExclusivelyLocked();
+ mmfiles.erase(this);
+ pathToFile.erase( filename() );
+ }
+
+ /*static*/
+ void MongoFile::closeAllFiles( stringstream &message ) {
+ static int closingAllFiles = 0;
+ if ( closingAllFiles ) {
+ message << "warning closingAllFiles=" << closingAllFiles << endl;
+ return;
+ }
+ ++closingAllFiles;
+
+ LockMongoFilesExclusive lk;
+
+ ProgressMeter pm( mmfiles.size() , 2 , 1 );
+ set<MongoFile*> temp = mmfiles;
+ for ( set<MongoFile*>::iterator i = temp.begin(); i != temp.end(); i++ ) {
+ (*i)->close(); // close() now removes from mmfiles
+ pm.hit();
+ }
+ message << "closeAllFiles() finished";
+ --closingAllFiles;
+ }
+
+ /*static*/ long long MongoFile::totalMappedLength() {
+ unsigned long long total = 0;
+
+ LockMongoFilesShared lk;
+
+ for ( set<MongoFile*>::iterator i = mmfiles.begin(); i != mmfiles.end(); i++ )
+ total += (*i)->length();
+
+ return total;
+ }
+
+ void nullFunc() { }
+
+ // callback notifications
+ void (*MongoFile::notifyPreFlush)() = nullFunc;
+ void (*MongoFile::notifyPostFlush)() = nullFunc;
+
+ /*static*/ int MongoFile::flushAll( bool sync ) {
+ notifyPreFlush();
+ int x = _flushAll(sync);
+ notifyPostFlush();
+ return x;
+ }
+
+ /*static*/ int MongoFile::_flushAll( bool sync ) {
+ if ( ! sync ) {
+ int num = 0;
+ LockMongoFilesShared lk;
+ for ( set<MongoFile*>::iterator i = mmfiles.begin(); i != mmfiles.end(); i++ ) {
+ num++;
+ MongoFile * mmf = *i;
+ if ( ! mmf )
+ continue;
+
+ mmf->flush( sync );
+ }
+ return num;
+ }
+
+        // sync case: flush one file at a time, holding the lock only long enough
+        // to pick the next unflushed file and prepare its Flushable
+ set<MongoFile*> seen;
+ while ( true ) {
+ auto_ptr<Flushable> f;
+ {
+ LockMongoFilesShared lk;
+ for ( set<MongoFile*>::iterator i = mmfiles.begin(); i != mmfiles.end(); i++ ) {
+ MongoFile * mmf = *i;
+ if ( ! mmf )
+ continue;
+ if ( seen.count( mmf ) )
+ continue;
+ f.reset( mmf->prepareFlush() );
+ seen.insert( mmf );
+ break;
+ }
+ }
+ if ( ! f.get() )
+ break;
+
+ f->flush();
+ }
+ return seen.size();
+ }
+
+ void MongoFile::created() {
+ LockMongoFilesExclusive lk;
+ mmfiles.insert(this);
+ }
+
+ void MongoFile::setFilename(string fn) {
+ LockMongoFilesExclusive lk;
+ assert( _filename.empty() );
+ _filename = fn;
+ MongoFile *&ptf = pathToFile[fn];
+ massert(13617, "MongoFile : multiple opens of same filename", ptf == 0);
+ ptf = this;
+ }
+
+#if defined(_DEBUG)
+ void MongoFile::markAllWritable() {
+ if( cmdLine.dur )
+ return;
+ LockMongoFilesShared lk;
+ for ( set<MongoFile*>::iterator i = mmfiles.begin(); i != mmfiles.end(); i++ ) {
+ MongoFile * mmf = *i;
+ if (mmf) mmf->_lock();
+ }
+ }
+
+ void MongoFile::unmarkAllWritable() {
+ if( cmdLine.dur )
+ return;
+ LockMongoFilesShared lk;
+ for ( set<MongoFile*>::iterator i = mmfiles.begin(); i != mmfiles.end(); i++ ) {
+ MongoFile * mmf = *i;
+ if (mmf) mmf->_unlock();
+ }
+ }
+#endif
+} // namespace mongo
diff --git a/src/mongo/util/mmap.h b/src/mongo/util/mmap.h
new file mode 100644
index 00000000000..2d4454bbc7f
--- /dev/null
+++ b/src/mongo/util/mmap.h
@@ -0,0 +1,305 @@
+// mmap.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <boost/thread/xtime.hpp>
+#include "concurrency/rwlock.h"
+
+namespace mongo {
+
+ class MAdvise {
+ void *_p;
+ unsigned _len;
+ public:
+ enum Advice { Sequential=1 };
+ MAdvise(void *p, unsigned len, Advice a);
+ ~MAdvise(); // destructor resets the range to MADV_NORMAL
+ };
+
+ // lock order: lock dbMutex before this if you lock both
+ class LockMongoFilesShared {
+ friend class LockMongoFilesExclusive;
+ static RWLockRecursiveNongreedy mmmutex;
+ static unsigned era;
+ RWLockRecursive::Shared lk;
+ public:
+ LockMongoFilesShared() : lk(mmmutex) { }
+
+        /** era changes any time memory maps come and go. thus you can use this as a cheap way to verify
+            that things are still in the condition you expected. of course you must be shared locked,
+            otherwise someone's change could be in progress. if you have since unlocked, this is a
+            reasonable way to check that your memory mapped pointer is still good.
+        */
+ static unsigned getEra() { return era; }
+
+ static void assertExclusivelyLocked() { mmmutex.assertExclusivelyLocked(); }
+ };
+
+ class LockMongoFilesExclusive {
+ RWLockRecursive::Exclusive lk;
+ public:
+ LockMongoFilesExclusive() : lk(LockMongoFilesShared::mmmutex) {
+ LockMongoFilesShared::era++;
+ }
+ };
+
+ /* the administrative-ish stuff here */
+ class MongoFile : boost::noncopyable {
+ public:
+ /** Flushable has to fail nicely if the underlying object gets killed */
+ class Flushable {
+ public:
+ virtual ~Flushable() {}
+ virtual void flush() = 0;
+ };
+
+ virtual ~MongoFile() {}
+
+ enum Options {
+ SEQUENTIAL = 1, // hint - e.g. FILE_FLAG_SEQUENTIAL_SCAN on windows
+ READONLY = 2 // not contractually guaranteed, but if specified the impl has option to fault writes
+ };
+
+    /** @param fun is called for each MongoFile.
+        called from within a mutex that MongoFile uses, so be careful not to deadlock.
+    */
+ template < class F >
+ static void forEach( F fun );
+
+    /** note: you need to be in mmmutex when using this. forEach (above) handles that for you automatically.
+    */
+ static set<MongoFile*>& getAllFiles() { return mmfiles; }
+
+ // callbacks if you need them
+ static void (*notifyPreFlush)();
+ static void (*notifyPostFlush)();
+
+ static int flushAll( bool sync ); // returns n flushed
+ static long long totalMappedLength();
+ static void closeAllFiles( stringstream &message );
+
+#if defined(_DEBUG)
+ static void markAllWritable();
+ static void unmarkAllWritable();
+#else
+ static void markAllWritable() { }
+ static void unmarkAllWritable() { }
+#endif
+
+ static bool exists(boost::filesystem::path p) { return boost::filesystem::exists(p); }
+
+ virtual bool isMongoMMF() { return false; }
+
+ string filename() const { return _filename; }
+ void setFilename(string fn);
+
+ private:
+ string _filename;
+ static int _flushAll( bool sync ); // returns n flushed
+ protected:
+ virtual void close() = 0;
+ virtual void flush(bool sync) = 0;
+ /**
+ * returns a thread safe object that you can call flush on
+ * Flushable has to fail nicely if the underlying object gets killed
+ */
+ virtual Flushable * prepareFlush() = 0;
+
+ void created(); /* subclass must call after create */
+
+        /* subclass must call in destructor (or at close).
+           removes this from pathToFile and other maps.
+           safe to call more than once, albeit that might be wasted work.
+           ideally called near the close() itself, if the close happens well before object destruction.
+        */
+ void destroyed();
+
+ virtual unsigned long long length() const = 0;
+
+ // only supporting on posix mmap
+ virtual void _lock() {}
+ virtual void _unlock() {}
+
+ static set<MongoFile*> mmfiles;
+ public:
+ static map<string,MongoFile*> pathToFile;
+ };
+
+    /** look up a MongoFile by filename, using the scoped-mutex locking convention.
+        example:
+          MongoFileFinder finder;
+          MongoFile *a = finder.findByPath("file_name_a");
+          MongoFile *b = finder.findByPath("file_name_b");
+    */
+ class MongoFileFinder : boost::noncopyable {
+ public:
+ MongoFileFinder() { }
+
+ /** @return The MongoFile object associated with the specified file name. If no file is open
+ with the specified name, returns null.
+ */
+ MongoFile* findByPath(string path) {
+ map<string,MongoFile*>::iterator i = MongoFile::pathToFile.find(path);
+ return i == MongoFile::pathToFile.end() ? NULL : i->second;
+ }
+
+ private:
+ LockMongoFilesShared _lk;
+ };
+
+ struct MongoFileAllowWrites {
+ MongoFileAllowWrites() {
+ MongoFile::markAllWritable();
+ }
+ ~MongoFileAllowWrites() {
+ MongoFile::unmarkAllWritable();
+ }
+ };
+
+ class MemoryMappedFile : public MongoFile {
+ protected:
+ virtual void* viewForFlushing() {
+ if( views.size() == 0 )
+ return 0;
+ assert( views.size() == 1 );
+ return views[0];
+ }
+ public:
+ MemoryMappedFile();
+
+ virtual ~MemoryMappedFile() {
+ LockMongoFilesExclusive lk;
+ close();
+ }
+
+ virtual void close();
+
+ // Throws exception if file doesn't exist. (dm may2010: not sure if this is always true?)
+ void* map(const char *filename);
+
+ /** @param options see MongoFile::Options
+ */
+ void* mapWithOptions(const char *filename, int options);
+
+        /* Creates the file with the given length if it does not exist yet;
+           otherwise uses the existing file length and updates the passed
+           length parameter accordingly.
+           @param options MongoFile::Options bits
+        */
+ void* map(const char *filename, unsigned long long &length, int options = 0 );
+
+ /* Create. Must not exist.
+ @param zero fill file with zeros when true
+ */
+ void* create(string filename, unsigned long long len, bool zero);
+
+ void flush(bool sync);
+ virtual Flushable * prepareFlush();
+
+ long shortLength() const { return (long) len; }
+ unsigned long long length() const { return len; }
+
+ /** create a new view with the specified properties.
+ automatically cleaned up upon close/destruction of the MemoryMappedFile object.
+ */
+ void* createReadOnlyMap();
+ void* createPrivateMap();
+
+ /** make the private map range writable (necessary for our windows implementation) */
+ static void makeWritable(void *, unsigned len)
+#if defined(_WIN32)
+ ;
+#else
+ { }
+#endif
+
+ private:
+ static void updateLength( const char *filename, unsigned long long &length );
+
+ HANDLE fd;
+ HANDLE maphandle;
+ vector<void *> views;
+ unsigned long long len;
+
+#ifdef _WIN32
+ boost::shared_ptr<mutex> _flushMutex;
+ void clearWritableBits(void *privateView);
+ public:
+ static const unsigned ChunkSize = 64 * 1024 * 1024;
+ static const unsigned NChunks = 1024 * 1024;
+#else
+ void clearWritableBits(void *privateView) { }
+#endif
+
+ protected:
+ // only posix mmap implementations will support this
+ virtual void _lock();
+ virtual void _unlock();
+
+ /** close the current private view and open a new replacement */
+ void* remapPrivateView(void *oldPrivateAddr);
+ };
+
+ typedef MemoryMappedFile MMF;
+
+ /** p is called from within a mutex that MongoFile uses. so be careful not to deadlock. */
+ template < class F >
+ inline void MongoFile::forEach( F p ) {
+ LockMongoFilesShared lklk;
+ for ( set<MongoFile*>::iterator i = mmfiles.begin(); i != mmfiles.end(); i++ )
+ p(*i);
+ }
+
+#if defined(_WIN32)
+ class ourbitset {
+ volatile unsigned bits[MemoryMappedFile::NChunks]; // volatile as we are doing double check locking
+ public:
+ ourbitset() {
+ memset((void*) bits, 0, sizeof(bits));
+ }
+ bool get(unsigned i) const {
+ unsigned x = i / 32;
+ assert( x < MemoryMappedFile::NChunks );
+ return (bits[x] & (1 << (i%32))) != 0;
+ }
+ void set(unsigned i) {
+ unsigned x = i / 32;
+ wassert( x < (MemoryMappedFile::NChunks*2/3) ); // warn if getting close to limit
+ assert( x < MemoryMappedFile::NChunks );
+ bits[x] |= (1 << (i%32));
+ }
+ void clear(unsigned i) {
+ unsigned x = i / 32;
+ assert( x < MemoryMappedFile::NChunks );
+ bits[x] &= ~(1 << (i%32));
+ }
+ };
+ extern ourbitset writable;
+ void makeChunkWritable(size_t chunkno);
+ inline void MemoryMappedFile::makeWritable(void *_p, unsigned len) {
+ size_t p = (size_t) _p;
+ unsigned a = p/ChunkSize;
+ unsigned b = (p+len)/ChunkSize;
+ for( unsigned i = a; i <= b; i++ ) {
+ if( !writable.get(i) ) {
+ makeChunkWritable(i);
+ }
+ }
+ }
+
+#endif
+
+} // namespace mongo
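
As the forEach() comment notes, the functor runs while MongoFile's internal mutex is held, so it must not reacquire that lock or block. A minimal sketch under that constraint; the functor and helper names here are illustrative, not part of the header:

    #include <string>
    #include <vector>
    #include "mongo/util/mmap.h"

    namespace mongo {
        // collect the names of all currently mapped files
        struct CollectNames {
            std::vector<std::string> *names;
            explicit CollectNames(std::vector<std::string> *v) : names(v) { }
            void operator()(MongoFile *mf) { names->push_back(mf->filename()); }
        };

        inline std::vector<std::string> mappedFileNames() {
            std::vector<std::string> v;
            CollectNames f(&v);
            MongoFile::forEach(f);  // forEach copies the functor and invokes it under mmmutex
            return v;
        }
    }
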
diff --git a/src/mongo/util/mmap_mm.cpp b/src/mongo/util/mmap_mm.cpp
new file mode 100644
index 00000000000..ec2400e02d3
--- /dev/null
+++ b/src/mongo/util/mmap_mm.cpp
@@ -0,0 +1,52 @@
+// mmap_mm.cpp - in memory (no file) version
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "mmap.h"
+
+/* in memory (no file) version */
+
+namespace mongo {
+
+    MemoryMappedFile::MemoryMappedFile() {
+        fd = 0;
+        maphandle = 0;
+        len = 0;
+        created();
+    }
+
+    void MemoryMappedFile::close() {
+        // free every in-memory "view"; there is no file backing to sync or unmap
+        for( vector<void*>::iterator i = views.begin(); i != views.end(); i++ )
+            free( *i );
+        views.clear();
+        len = 0;
+    }
+
+    void* MemoryMappedFile::map(const char *filename, unsigned long long &length, int options ) {
+        assert( length );
+        void *view = malloc( (size_t)length );
+        if ( view )
+            views.push_back( view );
+        len = length;
+        return view;
+    }
+
+ void MemoryMappedFile::flush(bool sync) {
+ }
+
+ void MemoryMappedFile::_lock() {}
+ void MemoryMappedFile::_unlock() {}
+
+}
+
diff --git a/src/mongo/util/mmap_posix.cpp b/src/mongo/util/mmap_posix.cpp
new file mode 100644
index 00000000000..8097ef1b370
--- /dev/null
+++ b/src/mongo/util/mmap_posix.cpp
@@ -0,0 +1,214 @@
+// mmap_posix.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "mmap.h"
+#include "file_allocator.h"
+#include "../db/concurrency.h"
+#include <errno.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "../util/processinfo.h"
+#include "mongoutils/str.h"
+using namespace mongoutils;
+
+namespace mongo {
+
+ MemoryMappedFile::MemoryMappedFile() {
+ fd = 0;
+ maphandle = 0;
+ len = 0;
+ created();
+ }
+
+ void MemoryMappedFile::close() {
+ LockMongoFilesShared::assertExclusivelyLocked();
+ for( vector<void*>::iterator i = views.begin(); i != views.end(); i++ ) {
+ munmap(*i,len);
+ }
+ views.clear();
+
+ if ( fd )
+ ::close(fd);
+ fd = 0;
+ destroyed(); // cleans up from the master list of mmaps
+ }
+
+#ifndef O_NOATIME
+#define O_NOATIME (0)
+#endif
+
+#ifndef MAP_NORESERVE
+#define MAP_NORESERVE (0)
+#endif
+
+#if defined(__sunos__)
+ MAdvise::MAdvise(void *,unsigned, Advice) { }
+ MAdvise::~MAdvise() { }
+#else
+ MAdvise::MAdvise(void *p, unsigned len, Advice a) : _p(p), _len(len) {
+ assert( a == Sequential ); // more later
+ madvise(_p,_len,MADV_SEQUENTIAL);
+ }
+ MAdvise::~MAdvise() {
+ madvise(_p,_len,MADV_NORMAL);
+ }
+#endif
+
+ void* MemoryMappedFile::map(const char *filename, unsigned long long &length, int options) {
+ // length may be updated by callee.
+ setFilename(filename);
+ FileAllocator::get()->allocateAsap( filename, length );
+ len = length;
+
+ massert( 10446 , str::stream() << "mmap: can't map area of size 0 file: " << filename, length > 0 );
+
+ fd = open(filename, O_RDWR | O_NOATIME);
+ if ( fd <= 0 ) {
+ log() << "couldn't open " << filename << ' ' << errnoWithDescription() << endl;
+ fd = 0; // our sentinel for not opened
+ return 0;
+ }
+
+ unsigned long long filelen = lseek(fd, 0, SEEK_END);
+ uassert(10447, str::stream() << "map file alloc failed, wanted: " << length << " filelen: " << filelen << ' ' << sizeof(size_t), filelen == length );
+ lseek( fd, 0, SEEK_SET );
+
+ void * view = mmap(NULL, length, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+ if ( view == MAP_FAILED ) {
+ error() << " mmap() failed for " << filename << " len:" << length << " " << errnoWithDescription() << endl;
+ if ( errno == ENOMEM ) {
+ if( sizeof(void*) == 4 )
+ error() << "mmap failed with out of memory. You are using a 32-bit build and probably need to upgrade to 64" << endl;
+ else
+ error() << "mmap failed with out of memory. (64 bit build)" << endl;
+ }
+ return 0;
+ }
+
+
+#if defined(__sunos__)
+#warning madvise not supported on solaris yet
+#else
+ if ( options & SEQUENTIAL ) {
+ if ( madvise( view , length , MADV_SEQUENTIAL ) ) {
+ warning() << "map: madvise failed for " << filename << ' ' << errnoWithDescription() << endl;
+ }
+ }
+#endif
+
+ views.push_back( view );
+
+ DEV if (! d.dbMutex.info().isLocked()) {
+ _unlock();
+ }
+
+ return view;
+ }
+
+ void* MemoryMappedFile::createReadOnlyMap() {
+ void * x = mmap( /*start*/0 , len , PROT_READ , MAP_SHARED , fd , 0 );
+ if( x == MAP_FAILED ) {
+ if ( errno == ENOMEM ) {
+ if( sizeof(void*) == 4 )
+ error() << "mmap ro failed with out of memory. You are using a 32-bit build and probably need to upgrade to 64" << endl;
+ else
+ error() << "mmap ro failed with out of memory. (64 bit build)" << endl;
+ }
+ return 0;
+ }
+ return x;
+ }
+
+ void* MemoryMappedFile::createPrivateMap() {
+ void * x = mmap( /*start*/0 , len , PROT_READ|PROT_WRITE , MAP_PRIVATE|MAP_NORESERVE , fd , 0 );
+ if( x == MAP_FAILED ) {
+ if ( errno == ENOMEM ) {
+ if( sizeof(void*) == 4 ) {
+ error() << "mmap private failed with out of memory. You are using a 32-bit build and probably need to upgrade to 64" << endl;
+ }
+ else {
+ error() << "mmap private failed with out of memory. (64 bit build)" << endl;
+ }
+ }
+ else {
+ error() << "mmap private failed " << errnoWithDescription() << endl;
+ }
+ return 0;
+ }
+
+ views.push_back(x);
+ return x;
+ }
+
+ void* MemoryMappedFile::remapPrivateView(void *oldPrivateAddr) {
+ // don't unmap, just mmap over the old region
+ void * x = mmap( oldPrivateAddr, len , PROT_READ|PROT_WRITE , MAP_PRIVATE|MAP_NORESERVE|MAP_FIXED , fd , 0 );
+ if( x == MAP_FAILED ) {
+ int err = errno;
+ error() << "13601 Couldn't remap private view: " << errnoWithDescription(err) << endl;
+ log() << "aborting" << endl;
+ printMemInfo();
+ abort();
+ }
+ assert( x == oldPrivateAddr );
+ return x;
+ }
+
+ void MemoryMappedFile::flush(bool sync) {
+ if ( views.empty() || fd == 0 )
+ return;
+ if ( msync(viewForFlushing(), len, sync ? MS_SYNC : MS_ASYNC) )
+ problem() << "msync " << errnoWithDescription() << endl;
+ }
+
+ class PosixFlushable : public MemoryMappedFile::Flushable {
+ public:
+ PosixFlushable( void * view , HANDLE fd , long len )
+ : _view( view ) , _fd( fd ) , _len(len) {
+ }
+
+ void flush() {
+ if ( _view && _fd )
+ if ( msync(_view, _len, MS_SYNC ) )
+ problem() << "msync " << errnoWithDescription() << endl;
+
+ }
+
+ void * _view;
+ HANDLE _fd;
+ long _len;
+ };
+
+ MemoryMappedFile::Flushable * MemoryMappedFile::prepareFlush() {
+ return new PosixFlushable( viewForFlushing() , fd , len );
+ }
+
+ void MemoryMappedFile::_lock() {
+ if (! views.empty() && isMongoMMF() )
+ assert(mprotect(views[0], len, PROT_READ | PROT_WRITE) == 0);
+ }
+
+ void MemoryMappedFile::_unlock() {
+ if (! views.empty() && isMongoMMF() )
+ assert(mprotect(views[0], len, PROT_READ) == 0);
+ }
+
+} // namespace mongo
+
diff --git a/src/mongo/util/mmap_win.cpp b/src/mongo/util/mmap_win.cpp
new file mode 100644
index 00000000000..26115d096c1
--- /dev/null
+++ b/src/mongo/util/mmap_win.cpp
@@ -0,0 +1,202 @@
+// mmap_win.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "mmap.h"
+#include "text.h"
+#include "../db/mongommf.h"
+#include "../db/concurrency.h"
+
+namespace mongo {
+
+ mutex mapViewMutex("mapView");
+ ourbitset writable;
+
+ MAdvise::MAdvise(void *,unsigned, Advice) { }
+ MAdvise::~MAdvise() { }
+
+ /** notification on unmapping so we can clear writable bits */
+ void MemoryMappedFile::clearWritableBits(void *p) {
+ for( unsigned i = ((size_t)p)/ChunkSize; i <= (((size_t)p)+len)/ChunkSize; i++ ) {
+ writable.clear(i);
+ assert( !writable.get(i) );
+ }
+ }
+
+ MemoryMappedFile::MemoryMappedFile()
+ : _flushMutex(new mutex("flushMutex")) {
+ fd = 0;
+ maphandle = 0;
+ len = 0;
+ created();
+ }
+
+ void MemoryMappedFile::close() {
+ LockMongoFilesShared::assertExclusivelyLocked();
+ for( vector<void*>::iterator i = views.begin(); i != views.end(); i++ ) {
+ clearWritableBits(*i);
+ UnmapViewOfFile(*i);
+ }
+ views.clear();
+ if ( maphandle )
+ CloseHandle(maphandle);
+ maphandle = 0;
+ if ( fd )
+ CloseHandle(fd);
+ fd = 0;
+ destroyed(); // cleans up from the master list of mmaps
+ }
+
+ unsigned long long mapped = 0;
+
+ void* MemoryMappedFile::createReadOnlyMap() {
+ assert( maphandle );
+ scoped_lock lk(mapViewMutex);
+ void *p = MapViewOfFile(maphandle, FILE_MAP_READ, /*f ofs hi*/0, /*f ofs lo*/ 0, /*dwNumberOfBytesToMap 0 means to eof*/0);
+ if ( p == 0 ) {
+ DWORD e = GetLastError();
+ log() << "FILE_MAP_READ MapViewOfFile failed " << filename() << " " << errnoWithDescription(e) << endl;
+ }
+ else {
+ views.push_back(p);
+ }
+ return p;
+ }
+
+ void* MemoryMappedFile::map(const char *filenameIn, unsigned long long &length, int options) {
+ assert( fd == 0 && len == 0 ); // can't open more than once
+ setFilename(filenameIn);
+ /* big hack here: Babble uses db names with colons. doesn't seem to work on windows. temporary perhaps. */
+ char filename[256];
+ strncpy(filename, filenameIn, 255);
+ filename[255] = 0;
+ {
+ size_t len = strlen( filename );
+            for ( int i = (int)len - 1; i >= 0; i-- ) { // signed index: a size_t would wrap below zero
+ if ( filename[i] == '/' ||
+ filename[i] == '\\' )
+ break;
+
+ if ( filename[i] == ':' )
+ filename[i] = '_';
+ }
+ }
+
+ updateLength( filename, length );
+
+ {
+ DWORD createOptions = FILE_ATTRIBUTE_NORMAL;
+ if ( options & SEQUENTIAL )
+ createOptions |= FILE_FLAG_SEQUENTIAL_SCAN;
+ DWORD rw = GENERIC_READ | GENERIC_WRITE;
+ fd = CreateFile(
+ toNativeString(filename).c_str(),
+ rw, // desired access
+ FILE_SHARE_WRITE | FILE_SHARE_READ, // share mode
+ NULL, // security
+ OPEN_ALWAYS, // create disposition
+ createOptions , // flags
+ NULL); // hTempl
+ if ( fd == INVALID_HANDLE_VALUE ) {
+ DWORD e = GetLastError();
+ log() << "Create/OpenFile failed " << filename << " errno:" << e << endl;
+ return 0;
+ }
+ }
+
+ mapped += length;
+
+ {
+ DWORD flProtect = PAGE_READWRITE; //(options & READONLY)?PAGE_READONLY:PAGE_READWRITE;
+ maphandle = CreateFileMapping(fd, NULL, flProtect,
+ length >> 32 /*maxsizehigh*/,
+ (unsigned) length /*maxsizelow*/,
+ NULL/*lpName*/);
+ if ( maphandle == NULL ) {
+ DWORD e = GetLastError(); // log() call was killing lasterror before we get to that point in the stream
+ log() << "CreateFileMapping failed " << filename << ' ' << errnoWithDescription(e) << endl;
+ close();
+ return 0;
+ }
+ }
+
+ void *view = 0;
+ {
+ scoped_lock lk(mapViewMutex);
+ DWORD access = (options&READONLY)? FILE_MAP_READ : FILE_MAP_ALL_ACCESS;
+ view = MapViewOfFile(maphandle, access, /*f ofs hi*/0, /*f ofs lo*/ 0, /*dwNumberOfBytesToMap 0 means to eof*/0);
+ }
+ if ( view == 0 ) {
+ DWORD e = GetLastError();
+ log() << "MapViewOfFile failed " << filename << " " << errnoWithDescription(e) <<
+ ((sizeof(void*)==4)?" (32 bit build)":"") << endl;
+ close();
+ }
+ else {
+ views.push_back(view);
+ }
+ len = length;
+
+ return view;
+ }
+
+ class WindowsFlushable : public MemoryMappedFile::Flushable {
+ public:
+ WindowsFlushable( void * view , HANDLE fd , string filename , boost::shared_ptr<mutex> flushMutex )
+ : _view(view) , _fd(fd) , _filename(filename) , _flushMutex(flushMutex)
+ {}
+
+ void flush() {
+ if (!_view || !_fd)
+ return;
+
+ scoped_lock lk(*_flushMutex);
+
+ BOOL success = FlushViewOfFile(_view, 0); // 0 means whole mapping
+ if (!success) {
+ int err = GetLastError();
+ out() << "FlushViewOfFile failed " << err << " file: " << _filename << endl;
+ }
+
+ success = FlushFileBuffers(_fd);
+ if (!success) {
+ int err = GetLastError();
+ out() << "FlushFileBuffers failed " << err << " file: " << _filename << endl;
+ }
+ }
+
+ void * _view;
+ HANDLE _fd;
+ string _filename;
+ boost::shared_ptr<mutex> _flushMutex;
+ };
+
+ void MemoryMappedFile::flush(bool sync) {
+ uassert(13056, "Async flushing not supported on windows", sync);
+ if( !views.empty() ) {
+ WindowsFlushable f( viewForFlushing() , fd , filename() , _flushMutex);
+ f.flush();
+ }
+ }
+
+ MemoryMappedFile::Flushable * MemoryMappedFile::prepareFlush() {
+ return new WindowsFlushable( viewForFlushing() , fd , filename() , _flushMutex );
+ }
+ void MemoryMappedFile::_lock() {}
+ void MemoryMappedFile::_unlock() {}
+
+}
diff --git a/src/mongo/util/mongoutils/README b/src/mongo/util/mongoutils/README
new file mode 100755
index 00000000000..f61277c7409
--- /dev/null
+++ b/src/mongo/util/mongoutils/README
@@ -0,0 +1,15 @@
+ mongoutils namespace requirements:
+
+ (1) code is not database specific, rather, true utilities
+ (2) are cross platform
+ (3) may require boost headers, but not libs
+ (4) are clean and easy to use in any c++ project without pulling in lots of other stuff.
+ specifically, does not use anything in the mongo namespace!
+ (5) apache license
+ (6) must be documented! if you aren't going to bother (but don't do that), stick it in util.
+ (7) ideally header only (in the spirit of #3)
+
+ So basically, easy to use, general purpose stuff, with no arduous dependencies to drop into
+ any new project.
+
+ *** PLACE UNIT TESTS IN mongoutils/test.cpp ***
diff --git a/src/mongo/util/mongoutils/checksum.h b/src/mongo/util/mongoutils/checksum.h
new file mode 100644
index 00000000000..ea3d05131ce
--- /dev/null
+++ b/src/mongo/util/mongoutils/checksum.h
@@ -0,0 +1,32 @@
+/** @file checksum.h */
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace mongoutils {
+
+ /**
+ * this is a silly temporary implementation
+ */
+ inline int checksum( const char* x , int size ) {
+ int ck = 0;
+ for ( int i=0; i<size; i++ )
+ ck += ( (int)x[i] * ( i + 1 ) );
+ return ck;
+ }
+
+}
diff --git a/src/mongo/util/mongoutils/hash.h b/src/mongo/util/mongoutils/hash.h
new file mode 100644
index 00000000000..49f30b3242a
--- /dev/null
+++ b/src/mongo/util/mongoutils/hash.h
@@ -0,0 +1,41 @@
+/** @file hash.h */
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace mongoutils {
+
+ /** @return hash of a pointer to an unsigned. so you get a 32 bit hash out, regardless of whether
+ pointers are 32 or 64 bit on the particular platform.
+
+ is there a faster way to impl this that hashes just as well?
+ */
+ inline unsigned hashPointer(void *v) {
+ unsigned x = 0;
+ unsigned char *p = (unsigned char *) &v;
+ for( unsigned i = 0; i < sizeof(void*); i++ ) {
+ x = x * 131 + p[i];
+ }
+ return x;
+ }
+
+ inline unsigned hash(unsigned u) {
+ unsigned char *p = (unsigned char *) &u;
+ return (((((p[3] * 131) + p[2]) * 131) + p[1]) * 131) + p[0];
+ }
+
+}
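
A tiny sketch, assuming the header above: hashPointer() hashes the bytes of the pointer value itself, so it yields a 32-bit result whether pointers are 4 or 8 bytes wide (the include path is illustrative):

    #include <cstdio>
    #include "mongo/util/mongoutils/hash.h"

    void hash_demo() {
        int x = 0;
        unsigned h = mongoutils::hashPointer(&x);  // 32-bit hash of the pointer's bytes
        std::printf("%u\n", h);
    }
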
diff --git a/src/mongo/util/mongoutils/html.h b/src/mongo/util/mongoutils/html.h
new file mode 100644
index 00000000000..f79e6ca514f
--- /dev/null
+++ b/src/mongo/util/mongoutils/html.h
@@ -0,0 +1,158 @@
+// @file html.h
+
+#pragma once
+
+/* Things in the mongoutils namespace
+ (1) are not database specific, rather, true utilities
+ (2) are cross platform
+ (3) may require boost headers, but not libs
+ (4) are clean and easy to use in any c++ project without pulling in lots of other stuff
+*/
+
+/* Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <sstream>
+
+namespace mongoutils {
+
+ namespace html {
+
+ using namespace std;
+
+ inline string _end() { return "</body></html>"; }
+ inline string _table() { return "</table>\n\n"; }
+ inline string _tr() { return "</tr>\n"; }
+
+ inline string tr() { return "<tr>"; }
+ inline string tr(string a, string b) {
+ stringstream ss;
+ ss << "<tr><td>" << a << "</td><td>" << b << "</td></tr>\n";
+ return ss.str();
+ }
+ template <class T>
+ inline string td(T x) {
+ stringstream ss;
+ ss << "<td>" << x << "</td>";
+ return ss.str();
+ }
+ inline string td(string x) {
+ return "<td>" + x + "</td>";
+ }
+ inline string th(string x) {
+ return "<th>" + x + "</th>";
+ }
+
+ inline void tablecell( stringstream& ss , bool b ) {
+ ss << "<td>" << (b ? "<b>X</b>" : "") << "</td>";
+ }
+
+ template< typename T>
+ inline void tablecell( stringstream& ss , const T& t ) {
+ ss << "<td>" << t << "</td>";
+ }
+
+ inline string table(const char *headers[] = 0, bool border = true) {
+ stringstream ss;
+ ss << "\n<table "
+ << (border?"border=1 ":"")
+ << "cellpadding=2 cellspacing=0>\n";
+ if( headers ) {
+ ss << "<tr>";
+ while( *headers ) {
+ ss << "<th>" << *headers << "</th>";
+ headers++;
+ }
+ ss << "</tr>\n";
+ }
+ return ss.str();
+ }
+
+ inline string start(string title) {
+ stringstream ss;
+ ss << "<html><head>\n<title>";
+ ss << title;
+ ss << "</title>\n";
+
+ ss << "<style type=\"text/css\" media=\"screen\">"
+ "body { font-family: helvetica, arial, san-serif }\n"
+ "table { border-collapse:collapse; border-color:#999; margin-top:.5em }\n"
+ "th { background-color:#bbb; color:#000 }\n"
+ "td,th { padding:.25em }\n"
+ "</style>\n";
+
+ ss << "</head>\n<body>\n";
+ return ss.str();
+ }
+
+ inline string red(string contentHtml, bool color=true) {
+ if( !color ) return contentHtml;
+ stringstream ss;
+ ss << "<span style=\"color:#A00;\">" << contentHtml << "</span>";
+ return ss.str();
+ }
+ inline string grey(string contentHtml, bool color=true) {
+ if( !color ) return contentHtml;
+ stringstream ss;
+ ss << "<span style=\"color:#888;\">" << contentHtml << "</span>";
+ return ss.str();
+ }
+ inline string blue(string contentHtml, bool color=true) {
+ if( !color ) return contentHtml;
+ stringstream ss;
+ ss << "<span style=\"color:#00A;\">" << contentHtml << "</span>";
+ return ss.str();
+ }
+ inline string yellow(string contentHtml, bool color=true) {
+ if( !color ) return contentHtml;
+ stringstream ss;
+ ss << "<span style=\"color:#A80;\">" << contentHtml << "</span>";
+ return ss.str();
+ }
+ inline string green(string contentHtml, bool color=true) {
+ if( !color ) return contentHtml;
+ stringstream ss;
+ ss << "<span style=\"color:#0A0;\">" << contentHtml << "</span>";
+ return ss.str();
+ }
+
+ inline string p(string contentHtml) {
+ stringstream ss;
+ ss << "<p>" << contentHtml << "</p>\n";
+ return ss.str();
+ }
+
+ inline string h2(string contentHtml) {
+ stringstream ss;
+ ss << "<h2>" << contentHtml << "</h2>\n";
+ return ss.str();
+ }
+
+ /* does NOT escape the strings. */
+ inline string a(string href, string title="", string contentHtml = "") {
+ stringstream ss;
+ ss << "<a";
+ if( !href.empty() ) ss << " href=\"" << href << '"';
+ if( !title.empty() ) ss << " title=\"" << title << '"';
+ ss << '>';
+ if( !contentHtml.empty() ) {
+ ss << contentHtml << "</a>";
+ }
+ return ss.str();
+ }
+
+ }
+
+}
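
A short sketch composing a page with the helpers above: start() emits the head and opening body tag, table()/tr()/_table() build a bordered table, and _end() closes the document (the include path is illustrative):

    #include <iostream>
    #include "mongo/util/mongoutils/html.h"

    int main() {
        using namespace mongoutils::html;
        const char *headers[] = { "name", "value", 0 };
        std::stringstream ss;
        ss << start("status");
        ss << table(headers);        // <table ...> plus the header row
        ss << tr("uptime", "42s");   // one <tr><td>..</td><td>..</td></tr> row
        ss << _table();
        ss << p(green("all good"));
        ss << _end();
        std::cout << ss.str();
        return 0;
    }
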
diff --git a/src/mongo/util/mongoutils/mongoutils.vcxproj b/src/mongo/util/mongoutils/mongoutils.vcxproj
new file mode 100755
index 00000000000..f6ec0935ca9
--- /dev/null
+++ b/src/mongo/util/mongoutils/mongoutils.vcxproj
@@ -0,0 +1,75 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{7B84584E-92BC-4DB9-971B-A1A8F93E5053}</ProjectGuid>
+ <RootNamespace>mongoutils</RootNamespace>
+ <ProjectName>mongoutils test program</ProjectName>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup />
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>c:\boost;\boost</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <AdditionalIncludeDirectories>c:\boost;\boost</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClCompile Include="test.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="html.h" />
+ <ClInclude Include="str.h" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project> \ No newline at end of file
diff --git a/src/mongo/util/mongoutils/mongoutils.vcxproj.filters b/src/mongo/util/mongoutils/mongoutils.vcxproj.filters
new file mode 100755
index 00000000000..84ecbffede0
--- /dev/null
+++ b/src/mongo/util/mongoutils/mongoutils.vcxproj.filters
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <ClCompile Include="test.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="html.h" />
+ <ClInclude Include="str.h" />
+ </ItemGroup>
+</Project> \ No newline at end of file
diff --git a/src/mongo/util/mongoutils/str.h b/src/mongo/util/mongoutils/str.h
new file mode 100644
index 00000000000..97b121b0068
--- /dev/null
+++ b/src/mongo/util/mongoutils/str.h
@@ -0,0 +1,216 @@
+// @file str.h
+
+/* Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+/* Things in the mongoutils namespace
+ (1) are not database specific, rather, true utilities
+ (2) are cross platform
+ (3) may require boost headers, but not libs
+ (4) are clean and easy to use in any c++ project without pulling in lots of other stuff
+
+ Note: within this module, we use int for all offsets -- there are no unsigned offsets
+ and no size_t's. If you need 3 gigabyte long strings, don't use this module.
+*/
+
+#include <string>
+#include <sstream>
+
+// this violates the README rules for mongoutils:
+#include "../../bson/util/builder.h"
+
+namespace mongoutils {
+
+ namespace str {
+
+ typedef std::string string;
+
+ /** the idea here is to make one liners easy. e.g.:
+
+ return str::stream() << 1 << ' ' << 2;
+
+ since the following doesn't work:
+
+ (stringstream() << 1).str();
+ */
+ class stream {
+ public:
+ mongo::StringBuilder ss;
+ template<class T>
+ stream& operator<<(const T& v) {
+ ss << v;
+ return *this;
+ }
+ operator std::string () const { return ss.str(); }
+ };
+
+ inline bool startsWith(const char *str, const char *prefix) {
+ const char *s = str;
+ const char *p = prefix;
+ while( *p ) {
+ if( *p != *s ) return false;
+ p++; s++;
+ }
+ return true;
+ }
+ inline bool startsWith(string s, string p) { return startsWith(s.c_str(), p.c_str()); }
+
+        // while these are trivial today, they exist in case we do different wide-char handling later
+ inline bool startsWith(const char *p, char ch) { return *p == ch; }
+ inline bool startsWith(string s, char ch) { return startsWith(s.c_str(), ch); }
+
+ inline bool endsWith(string s, string p) {
+ int l = p.size();
+ int x = s.size();
+ if( x < l ) return false;
+ return strncmp(s.c_str()+x-l, p.c_str(), l) == 0;
+ }
+ inline bool endsWith(const char *s, char p) {
+ size_t len = strlen(s);
+ return len && s[len-1] == p;
+ }
+
+ inline bool equals( const char * a , const char * b ) { return strcmp( a , b ) == 0; }
+
+ /** find char x, and return rest of string thereafter, or "" if not found */
+ inline const char * after(const char *s, char x) {
+ const char *p = strchr(s, x);
+ return (p != 0) ? p+1 : "";
+ }
+ inline string after(const string& s, char x) {
+ const char *p = strchr(s.c_str(), x);
+ return (p != 0) ? string(p+1) : "";
+ }
+
+ /** find string x, and return rest of string thereafter, or "" if not found */
+ inline const char * after(const char *s, const char *x) {
+ const char *p = strstr(s, x);
+ return (p != 0) ? p+strlen(x) : "";
+ }
+ inline string after(string s, string x) {
+ const char *p = strstr(s.c_str(), x.c_str());
+ return (p != 0) ? string(p+x.size()) : "";
+ }
+
+ /** @return true if s contains x */
+ inline bool contains(string s, string x) {
+ return strstr(s.c_str(), x.c_str()) != 0;
+ }
+ inline bool contains(string s, char x) {
+ return strchr(s.c_str(), x) != 0;
+ }
+
+ /** @return everything before the character x, else entire string */
+ inline string before(const string& s, char x) {
+ const char *p = strchr(s.c_str(), x);
+ return (p != 0) ? s.substr(0, p-s.c_str()) : s;
+ }
+
+ /** @return everything before the string x, else entire string */
+ inline string before(const string& s, const string& x) {
+ const char *p = strstr(s.c_str(), x.c_str());
+ return (p != 0) ? s.substr(0, p-s.c_str()) : s;
+ }
+
+        /** check if two strings share a common starting prefix
+            @return offset of divergence (or length if equal). 0=nothing in common. */
+ inline int shareCommonPrefix(const char *p, const char *q) {
+ int ofs = 0;
+ while( 1 ) {
+ if( *p == 0 || *q == 0 )
+ break;
+ if( *p != *q )
+ break;
+ p++; q++; ofs++;
+ }
+ return ofs;
+ }
+ inline int shareCommonPrefix(const string &a, const string &b)
+ { return shareCommonPrefix(a.c_str(), b.c_str()); }
+
+ /** string to unsigned. zero if not a number. can end with non-num chars */
+ inline unsigned toUnsigned(const string& a) {
+ unsigned x = 0;
+ const char *p = a.c_str();
+ while( 1 ) {
+ if( !isdigit(*p) )
+ break;
+ x = x * 10 + (*p - '0');
+ p++;
+ }
+ return x;
+ }
+
+        /** split a string on a specific char. We don't split N times, just once
+            on the first occurrence. If the char is not present, the entire string
+            goes into L and R is left empty.
+            @return true if the char was found
+        */
+ inline bool splitOn(const string &s, char c, string& L, string& R) {
+ const char *start = s.c_str();
+ const char *p = strchr(start, c);
+ if( p == 0 ) {
+ L = s; R.clear();
+ return false;
+ }
+ L = string(start, p-start);
+ R = string(p+1);
+ return true;
+ }
+ /** split scanning reverse direction. Splits ONCE ONLY. */
+ inline bool rSplitOn(const string &s, char c, string& L, string& R) {
+ const char *start = s.c_str();
+ const char *p = strrchr(start, c);
+ if( p == 0 ) {
+ L = s; R.clear();
+ return false;
+ }
+ L = string(start, p-start);
+ R = string(p+1);
+ return true;
+ }
+
+ /** @return number of occurrences of c in s */
+ inline unsigned count( const string& s , char c ) {
+ unsigned n=0;
+ for ( unsigned i=0; i<s.size(); i++ )
+ if ( s[i] == c )
+ n++;
+ return n;
+ }
+
+ /** trim leading spaces. spaces only, not tabs etc. */
+ inline string ltrim(const string& s) {
+ const char *p = s.c_str();
+ while( *p == ' ' ) p++;
+ return p;
+ }
+
+ /** remove trailing chars in place */
+ inline void stripTrailing(string& s, const char *chars) {
+ string::iterator i = s.end();
+ while( s.begin() != i ) {
+ i--;
+ if( contains(chars, *i) ) {
+ s.erase(i);
+ }
+ }
+ }
+
+ }
+
+}
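
A small sketch exercising splitOn()/rSplitOn() from above; both split exactly once and report via the return value whether the separator was found (the include path is illustrative):

    #include <cassert>
    #include "mongo/util/mongoutils/str.h"

    void str_demo() {
        using namespace mongoutils;
        std::string L, R;
        assert( str::splitOn("key:value", ':', L, R) );
        assert( L == "key" && R == "value" );
        assert( !str::rSplitOn("noseparator", ':', L, R) );  // separator absent:
        assert( L == "noseparator" && R.empty() );           // everything lands in L
    }
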
diff --git a/src/mongo/util/mongoutils/test.cpp b/src/mongo/util/mongoutils/test.cpp
new file mode 100644
index 00000000000..45268c5ca49
--- /dev/null
+++ b/src/mongo/util/mongoutils/test.cpp
@@ -0,0 +1,45 @@
+/* @file test.cpp
+ utils/mongoutils/test.cpp
+ unit tests for mongoutils
+*/
+
+/*
+ * Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <assert.h>
+#include "str.h"
+#include "html.h"
+
+using namespace std;
+using namespace mongoutils;
+
+int main() {
+ {
+ string s = "abcde";
+ str::stripTrailing(s, "ef");
+ assert( s == "abcd" );
+ str::stripTrailing(s, "abcd");
+ assert( s.empty() );
+ s = "abcddd";
+ str::stripTrailing(s, "d");
+ assert( s == "abc" );
+ }
+
+ string x = str::after("abcde", 'c');
+ assert( x == "de" );
+ assert( str::after("abcde", 'x') == "" );
+ return 0;
+}
diff --git a/src/mongo/util/moveablebuffer.h b/src/mongo/util/moveablebuffer.h
new file mode 100644
index 00000000000..e01f2d8d9a4
--- /dev/null
+++ b/src/mongo/util/moveablebuffer.h
@@ -0,0 +1,51 @@
+/* moveablebuffer.h
+*/
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace mongo {
+
+ /** this is a sort of smart pointer class where we can move where something is and all the pointers will adjust.
+ not threadsafe.
+ */
+ struct MoveableBuffer {
+ MoveableBuffer();
+ MoveableBuffer(void *);
+ MoveableBuffer& operator=(const MoveableBuffer&);
+ ~MoveableBuffer();
+
+ void *p;
+ };
+
+ /* implementation (inlines) below */
+
+ // this is a temp stub implementation...not really done yet - just having everything compile & such for checkpointing into git
+
+ inline MoveableBuffer::MoveableBuffer() : p(0) { }
+
+ inline MoveableBuffer::MoveableBuffer(void *_p) : p(_p) { }
+
+ inline MoveableBuffer& MoveableBuffer::operator=(const MoveableBuffer& r) {
+ p = r.p;
+ return *this;
+ }
+
+ inline MoveableBuffer::~MoveableBuffer() {
+ }
+
+}
diff --git a/src/mongo/util/net/hostandport.h b/src/mongo/util/net/hostandport.h
new file mode 100644
index 00000000000..3f4a64b79a9
--- /dev/null
+++ b/src/mongo/util/net/hostandport.h
@@ -0,0 +1,239 @@
+// hostandport.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "sock.h"
+#include "../../db/cmdline.h"
+#include "../mongoutils/str.h"
+
+namespace mongo {
+
+ using namespace mongoutils;
+
+ void dynHostResolve(string& name, int& port);
+ string dynHostMyName();
+
+ /** helper for manipulating host:port connection endpoints.
+ */
+ struct HostAndPort {
+ HostAndPort() : _port(-1) { }
+
+ /** From a string hostname[:portnumber] or a #dynname
+ Throws user assertion if bad config string or bad port #.
+ */
+ HostAndPort(string s);
+
+ /** @param p port number. -1 is ok to use default. */
+ HostAndPort(string h, int p /*= -1*/) : _host(h), _port(p) {
+ assert( !str::startsWith(h, '#') );
+ }
+
+ HostAndPort(const SockAddr& sock ) : _host( sock.getAddr() ) , _port( sock.getPort() ) { }
+
+ static HostAndPort me() { return HostAndPort("localhost", cmdLine.port); }
+
+ /* uses real hostname instead of localhost */
+ static HostAndPort Me();
+
+ bool operator<(const HostAndPort& r) const {
+ string h = host();
+ string rh = r.host();
+ if( h < rh )
+ return true;
+ if( h == rh )
+ return port() < r.port();
+ return false;
+ }
+
+ bool operator==(const HostAndPort& r) const {
+ return host() == r.host() && port() == r.port();
+ }
+
+ bool operator!=(const HostAndPort& r) const { return !(*this == r); }
+
+ /* returns true if the host/port combo identifies this process instance. */
+ bool isSelf() const; // defined in isself.cpp
+
+ bool isLocalHost() const;
+
+ /**
+ * @param includePort host:port if true, host otherwise
+ */
+ string toString( bool includePort=true ) const;
+
+ // get the logical name if using a #dynhostname instead of resolving to current actual name
+ string dynString() const;
+ string toStringLong() const;
+
+ operator string() const { return toString(); }
+
+ bool empty() const {
+ return _dynName.empty() && _host.empty() && _port < 0;
+ }
+ string host() const {
+ if( !dyn() )
+ return _host;
+ string h = _dynName;
+ int p;
+ dynHostResolve(h, p);
+ return h;
+ }
+ int port() const {
+ int p = -2;
+ if( dyn() ) {
+ string h = _dynName;
+ dynHostResolve(h,p);
+ }
+ else {
+ p = _port;
+ }
+ return p >= 0 ? p : CmdLine::DefaultDBPort;
+ }
+ bool hasPort() const {
+ int p = -2;
+ if( dyn() ) {
+ string h = _dynName;
+ dynHostResolve(h,p);
+ }
+ else {
+ p = _port;
+ }
+ return p >= 0;
+ }
+ void setPort( int port ) {
+ if( dyn() ) {
+ log() << "INFO skipping setPort() HostAndPort dyn()=true" << endl;
+ return;
+ }
+ _port = port;
+ }
+
+ private:
+ bool dyn() const { return !_dynName.empty(); }
+ void init(const char *);
+ // invariant (except full obj assignment):
+        string _dynName; // when this is set, _host and _port aren't used; rather, we look up the dyn info every time.
+ string _host;
+ int _port; // -1 indicates unspecified
+ };
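+
+    /* usage sketch (hypothetical values):
+
+         HostAndPort hp("example.com:27017"); // parses host and port
+         hp.toString();                       // "example.com:27017"
+         HostAndPort local("localhost");      // no port: port() falls back to CmdLine::DefaultDBPort
+         local.isLocalHost();                 // true
+    */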
+
+ inline HostAndPort HostAndPort::Me() {
+ {
+ string s = dynHostMyName();
+ if( !s.empty() )
+ return HostAndPort(s);
+ }
+
+ const char* ips = cmdLine.bind_ip.c_str();
+ while(*ips) {
+ string ip;
+ const char * comma = strchr(ips, ',');
+ if (comma) {
+ ip = string(ips, comma - ips);
+ ips = comma + 1;
+ }
+ else {
+ ip = string(ips);
+ ips = "";
+ }
+ HostAndPort h = HostAndPort(ip, cmdLine.port);
+ if (!h.isLocalHost()) {
+ return h;
+ }
+ }
+
+ string h = getHostName();
+ assert( !h.empty() );
+ assert( h != "localhost" );
+ return HostAndPort(h, cmdLine.port);
+ }
+
+ inline string HostAndPort::dynString() const {
+ return dyn() ? _dynName : toString();
+ }
+
+ inline string HostAndPort::toStringLong() const {
+ return _dynName + ':' + toString();
+ }
+
+ inline string HostAndPort::toString( bool includePort ) const {
+ string h = host();
+ int p = port();
+
+ if ( ! includePort )
+ return h;
+
+ stringstream ss;
+ ss << h;
+ if ( p != -1 ) {
+ ss << ':';
+#if defined(_DEBUG)
+ if( p >= 44000 && p < 44100 ) {
+ log() << "warning: special debug port 44xxx used" << endl;
+ ss << p+1;
+ }
+ else
+ ss << p;
+#else
+ ss << p;
+#endif
+ }
+ return ss.str();
+ }
+
+    inline bool HostAndPort::isLocalHost() const {
+        string h = host(); // local copy; avoids shadowing the _host member
+        return ( h == "localhost"
+               || startsWith(h.c_str(), "127.")
+               || h == "::1"
+               || h == "anonymous unix socket"
+               || h.c_str()[0] == '/' // unix socket
+               );
+    }
+
+ inline void HostAndPort::init(const char *p) {
+ uassert(13110, "HostAndPort: bad host:port config string", *p);
+ assert( *p != '#' );
+ assert( _dynName.empty() );
+ const char *colon = strrchr(p, ':');
+ if( colon ) {
+ int port = atoi(colon+1);
+ uassert(13095, "HostAndPort: bad port #", port > 0);
+ _host = string(p,colon-p);
+ _port = port;
+ }
+ else {
+ // no port specified.
+ _host = p;
+ _port = -1;
+ }
+ }
+
+ inline HostAndPort::HostAndPort(string s) {
+ const char *p = s.c_str();
+ if( *p == '#' ) {
+ _dynName = s;
+ _port = -2;
+ _host = "invalid_hostname_dyn_in_use";
+ }
+ else {
+ init(p);
+ }
+ }
+
+}
diff --git a/src/mongo/util/net/httpclient.cpp b/src/mongo/util/net/httpclient.cpp
new file mode 100644
index 00000000000..16eaa0ae80a
--- /dev/null
+++ b/src/mongo/util/net/httpclient.cpp
@@ -0,0 +1,177 @@
+// httpclient.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "httpclient.h"
+#include "sock.h"
+#include "message.h"
+#include "message_port.h"
+#include "../mongoutils/str.h"
+#include "../../bson/util/builder.h"
+
+namespace mongo {
+
+ //#define HD(x) cout << x << endl;
+#define HD(x)
+
+
+ int HttpClient::get( string url , Result * result ) {
+ return _go( "GET" , url , 0 , result );
+ }
+
+ int HttpClient::post( string url , string data , Result * result ) {
+ return _go( "POST" , url , data.c_str() , result );
+ }
+
+ int HttpClient::_go( const char * command , string url , const char * body , Result * result ) {
+ bool ssl = false;
+ if ( url.find( "https://" ) == 0 ) {
+ ssl = true;
+ url = url.substr( 8 );
+ }
+ else {
+ uassert( 10271 , "invalid url" , url.find( "http://" ) == 0 );
+ url = url.substr( 7 );
+ }
+
+ string host , path;
+ if ( url.find( "/" ) == string::npos ) {
+ host = url;
+ path = "/";
+ }
+ else {
+ host = url.substr( 0 , url.find( "/" ) );
+ path = url.substr( url.find( "/" ) );
+ }
+
+
+ HD( "host [" << host << "]" );
+ HD( "path [" << path << "]" );
+
+ string server = host;
+ int port = ssl ? 443 : 80;
+
+ string::size_type idx = host.find( ":" );
+ if ( idx != string::npos ) {
+ server = host.substr( 0 , idx );
+ string t = host.substr( idx + 1 );
+ port = atoi( t.c_str() );
+ }
+
+ HD( "server [" << server << "]" );
+ HD( "port [" << port << "]" );
+
+ string req;
+ {
+ stringstream ss;
+ ss << command << " " << path << " HTTP/1.1\r\n";
+ ss << "Host: " << host << "\r\n";
+ ss << "Connection: Close\r\n";
+ ss << "User-Agent: mongodb http client\r\n";
+ if ( body ) {
+ ss << "Content-Length: " << strlen( body ) << "\r\n";
+ }
+ ss << "\r\n";
+ if ( body ) {
+ ss << body;
+ }
+
+ req = ss.str();
+ }
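+
+        // e.g. for get("http://localhost:28017/") (hypothetical URL) req becomes:
+        //   "GET / HTTP/1.1\r\nHost: localhost:28017\r\nConnection: Close\r\n..."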
+
+ SockAddr addr( server.c_str() , port );
+ HD( "addr: " << addr.toString() );
+
+ Socket sock;
+ if ( ! sock.connect( addr ) )
+ return -1;
+
+ if ( ssl ) {
+#ifdef MONGO_SSL
+ _checkSSLManager();
+ sock.secure( _sslManager.get() );
+#else
+ uasserted( 15862 , "no ssl support" );
+#endif
+ }
+
+ {
+ const char * out = req.c_str();
+ int toSend = req.size();
+ sock.send( out , toSend, "_go" );
+ }
+
+        char buf[4097]; // one extra byte so buf[got] = 0 is always in bounds
+        int got = sock.unsafe_recv( buf , 4096 );
+        buf[got] = 0;
+
+ int rc;
+ char version[32];
+        assert( sscanf( buf , "%31s %d" , version , &rc ) == 2 ); // %31s guards the 32-byte version buffer
+ HD( "rc: " << rc );
+
+ StringBuilder sb;
+ if ( result )
+ sb << buf;
+
+        while ( ( got = sock.unsafe_recv( buf , 4096 ) ) > 0) {
+            buf[got] = 0; // re-terminate each chunk before appending
+            if ( result )
+                sb << buf;
+        }
+
+ if ( result ) {
+ result->_init( rc , sb.str() );
+ }
+
+ return rc;
+ }
+
+ void HttpClient::Result::_init( int code , string entire ) {
+ _code = code;
+ _entireResponse = entire;
+
+ while ( true ) {
+ size_t i = entire.find( '\n' );
+ if ( i == string::npos ) {
+ // invalid
+ break;
+ }
+
+ string h = entire.substr( 0 , i );
+ entire = entire.substr( i + 1 );
+
+ if ( h.size() && h[h.size()-1] == '\r' )
+ h = h.substr( 0 , h.size() - 1 );
+
+ if ( h.size() == 0 )
+ break;
+
+ i = h.find( ':' );
+ if ( i != string::npos )
+ _headers[h.substr(0,i)] = str::ltrim(h.substr(i+1));
+ }
+
+ _body = entire;
+ }
+
+#ifdef MONGO_SSL
+ void HttpClient::_checkSSLManager() {
+ _sslManager.reset( new SSLManager( true ) );
+ }
+#endif
+
+}
diff --git a/src/mongo/util/net/httpclient.h b/src/mongo/util/net/httpclient.h
new file mode 100644
index 00000000000..126969f4f70
--- /dev/null
+++ b/src/mongo/util/net/httpclient.h
@@ -0,0 +1,78 @@
+// httpclient.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../../pch.h"
+#include "sock.h"
+
+namespace mongo {
+
+ class HttpClient : boost::noncopyable {
+ public:
+
+ typedef map<string,string> Headers;
+
+ class Result {
+ public:
+ Result() {}
+
+ const string& getEntireResponse() const {
+ return _entireResponse;
+ }
+
+ Headers getHeaders() const {
+ return _headers;
+ }
+
+ const string& getBody() const {
+ return _body;
+ }
+
+ private:
+
+ void _init( int code , string entire );
+
+ int _code;
+ string _entireResponse;
+
+ Headers _headers;
+ string _body;
+
+ friend class HttpClient;
+ };
+
+ /**
+ * @return response code
+ */
+ int get( string url , Result * result = 0 );
+
+ /**
+ * @return response code
+ */
+ int post( string url , string body , Result * result = 0 );
+
+ private:
+ int _go( const char * command , string url , const char * body , Result * result );
+
+#ifdef MONGO_SSL
+ void _checkSSLManager();
+
+ scoped_ptr<SSLManager> _sslManager;
+#endif
+ };
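+
+    /* usage sketch (hypothetical URL; get() returns the HTTP status code):
+
+         HttpClient c;
+         HttpClient::Result r;
+         if ( c.get( "http://localhost:28017/" , &r ) == 200 )
+             cout << r.getBody() << endl;
+    */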
+}
diff --git a/src/mongo/util/net/listen.cpp b/src/mongo/util/net/listen.cpp
new file mode 100644
index 00000000000..ec3e4fa0ee8
--- /dev/null
+++ b/src/mongo/util/net/listen.cpp
@@ -0,0 +1,394 @@
+// listen.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "pch.h"
+#include "listen.h"
+#include "message_port.h"
+
+#ifndef _WIN32
+
+# ifndef __sunos__
+# include <ifaddrs.h>
+# endif
+# include <sys/resource.h>
+# include <sys/stat.h>
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <errno.h>
+#include <netdb.h>
+#ifdef __openbsd__
+# include <sys/uio.h>
+#endif
+
+#else
+
+// errno doesn't work for winsock.
+#undef errno
+#define errno WSAGetLastError()
+
+#endif
+
+namespace mongo {
+
+
+ void checkTicketNumbers();
+
+
+ // ----- Listener -------
+
+ const Listener* Listener::_timeTracker;
+
+ vector<SockAddr> ipToAddrs(const char* ips, int port, bool useUnixSockets) {
+ vector<SockAddr> out;
+ if (*ips == '\0') {
+ out.push_back(SockAddr("0.0.0.0", port)); // IPv4 all
+
+ if (IPv6Enabled())
+ out.push_back(SockAddr("::", port)); // IPv6 all
+#ifndef _WIN32
+ if (useUnixSockets)
+ out.push_back(SockAddr(makeUnixSockPath(port).c_str(), port)); // Unix socket
+#endif
+ return out;
+ }
+
+ while(*ips) {
+ string ip;
+ const char * comma = strchr(ips, ',');
+ if (comma) {
+ ip = string(ips, comma - ips);
+ ips = comma + 1;
+ }
+ else {
+ ip = string(ips);
+ ips = "";
+ }
+
+ SockAddr sa(ip.c_str(), port);
+ out.push_back(sa);
+
+#ifndef _WIN32
+ if (useUnixSockets && (sa.getAddr() == "127.0.0.1" || sa.getAddr() == "0.0.0.0")) // only IPv4
+ out.push_back(SockAddr(makeUnixSockPath(port).c_str(), port));
+#endif
+ }
+ return out;
+
+ }
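+
+    // e.g. ipToAddrs("127.0.0.1,10.0.0.5", 27017, true) (hypothetical ips) yields
+    // 127.0.0.1:27017, the unix socket path for port 27017 (since 127.0.0.1 is
+    // local), and 10.0.0.5:27017.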
+
+ Listener::Listener(const string& name, const string &ip, int port, bool logConnect )
+ : _port(port), _name(name), _ip(ip), _logConnect(logConnect), _elapsedTime(0) {
+#ifdef MONGO_SSL
+ _ssl = 0;
+ _sslPort = 0;
+
+ if ( cmdLine.sslOnNormalPorts && cmdLine.sslServerManager ) {
+ secure( cmdLine.sslServerManager );
+ }
+#endif
+ }
+
+ Listener::~Listener() {
+ if ( _timeTracker == this )
+ _timeTracker = 0;
+ }
+
+#ifdef MONGO_SSL
+ void Listener::secure( SSLManager* manager ) {
+ _ssl = manager;
+ }
+
+ void Listener::addSecurePort( SSLManager* manager , int additionalPort ) {
+ _ssl = manager;
+ _sslPort = additionalPort;
+ }
+
+#endif
+
+ bool Listener::_setupSockets( const vector<SockAddr>& mine , vector<SOCKET>& socks ) {
+ for (vector<SockAddr>::const_iterator it=mine.begin(), end=mine.end(); it != end; ++it) {
+ const SockAddr& me = *it;
+
+ SOCKET sock = ::socket(me.getType(), SOCK_STREAM, 0);
+ massert( 15863 , str::stream() << "listen(): invalid socket? " << errnoWithDescription() , sock >= 0 );
+
+ if (me.getType() == AF_UNIX) {
+#if !defined(_WIN32)
+ if (unlink(me.getAddr().c_str()) == -1) {
+ int x = errno;
+ if (x != ENOENT) {
+ log() << "couldn't unlink socket file " << me << errnoWithDescription(x) << " skipping" << endl;
+ continue;
+ }
+ }
+#endif
+ }
+ else if (me.getType() == AF_INET6) {
+                // IPv6 sockets can also accept IPv4 connections as mapped addresses (::ffff:127.0.0.1),
+                // which conflicts with binding the same port over IPv4 unless we set IPV6_V6ONLY
+ const int one = 1;
+ setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, (const char*) &one, sizeof(one));
+ }
+
+#if !defined(_WIN32)
+ {
+ const int one = 1;
+ if ( setsockopt( sock , SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0 )
+ out() << "Failed to set socket opt, SO_REUSEADDR" << endl;
+ }
+#endif
+
+ if ( ::bind(sock, me.raw(), me.addressSize) != 0 ) {
+ int x = errno;
+ error() << "listen(): bind() failed " << errnoWithDescription(x) << " for socket: " << me.toString() << endl;
+ if ( x == EADDRINUSE )
+ error() << " addr already in use" << endl;
+ closesocket(sock);
+ return false;
+ }
+
+#if !defined(_WIN32)
+ if (me.getType() == AF_UNIX) {
+ if (chmod(me.getAddr().c_str(), 0777) == -1) {
+ error() << "couldn't chmod socket file " << me << errnoWithDescription() << endl;
+ }
+ ListeningSockets::get()->addPath( me.getAddr() );
+ }
+#endif
+
+ if ( ::listen(sock, 128) != 0 ) {
+ error() << "listen(): listen() failed " << errnoWithDescription() << endl;
+ closesocket(sock);
+ return false;
+ }
+
+ ListeningSockets::get()->add( sock );
+
+ socks.push_back(sock);
+ }
+
+ return true;
+ }
+
+ void Listener::initAndListen() {
+ checkTicketNumbers();
+ vector<SOCKET> socks;
+ set<int> sslSocks;
+
+ { // normal sockets
+ vector<SockAddr> mine = ipToAddrs(_ip.c_str(), _port, (!cmdLine.noUnixSocket && useUnixSockets()));
+ if ( ! _setupSockets( mine , socks ) )
+ return;
+ }
+
+#ifdef MONGO_SSL
+ if ( _ssl && _sslPort > 0 ) {
+ unsigned prev = socks.size();
+
+ vector<SockAddr> mine = ipToAddrs(_ip.c_str(), _sslPort, false );
+ if ( ! _setupSockets( mine , socks ) )
+ return;
+
+ for ( unsigned i=prev; i<socks.size(); i++ ) {
+ sslSocks.insert( socks[i] );
+ }
+
+ }
+#endif
+
+ SOCKET maxfd = 0; // needed for select()
+ for ( unsigned i=0; i<socks.size(); i++ ) {
+ if ( socks[i] > maxfd )
+ maxfd = socks[i];
+ }
+
+#ifdef MONGO_SSL
+ if ( _ssl == 0 ) {
+ _logListen( _port , false );
+ }
+ else if ( _sslPort == 0 ) {
+ _logListen( _port , true );
+ }
+ else {
+ // both
+ _logListen( _port , false );
+ _logListen( _sslPort , true );
+ }
+#else
+ _logListen( _port , false );
+#endif
+
+ static long connNumber = 0;
+ struct timeval maxSelectTime;
+ while ( ! inShutdown() ) {
+ fd_set fds[1];
+ FD_ZERO(fds);
+
+ for (vector<SOCKET>::iterator it=socks.begin(), end=socks.end(); it != end; ++it) {
+ FD_SET(*it, fds);
+ }
+
+ maxSelectTime.tv_sec = 0;
+ maxSelectTime.tv_usec = 10000;
+ const int ret = select(maxfd+1, fds, NULL, NULL, &maxSelectTime);
+
+ if (ret == 0) {
+#if defined(__linux__)
+ _elapsedTime += ( 10000 - maxSelectTime.tv_usec ) / 1000;
+#else
+ _elapsedTime += 10;
+#endif
+ continue;
+ }
+
+ if (ret < 0) {
+ int x = errno;
+#ifdef EINTR
+ if ( x == EINTR ) {
+ log() << "select() signal caught, continuing" << endl;
+ continue;
+ }
+#endif
+ if ( ! inShutdown() )
+ log() << "select() failure: ret=" << ret << " " << errnoWithDescription(x) << endl;
+ return;
+ }
+
+#if defined(__linux__)
+ _elapsedTime += max(ret, (int)(( 10000 - maxSelectTime.tv_usec ) / 1000));
+#else
+ _elapsedTime += ret; // assume 1ms to grab connection. very rough
+#endif
+
+ for (vector<SOCKET>::iterator it=socks.begin(), end=socks.end(); it != end; ++it) {
+ if (! (FD_ISSET(*it, fds)))
+ continue;
+
+ SockAddr from;
+ int s = accept(*it, from.raw(), &from.addressSize);
+ if ( s < 0 ) {
+ int x = errno; // so no global issues
+ if ( x == ECONNABORTED || x == EBADF ) {
+ log() << "Listener on port " << _port << " aborted" << endl;
+ return;
+ }
+ if ( x == 0 && inShutdown() ) {
+ return; // socket closed
+ }
+ if( !inShutdown() ) {
+ log() << "Listener: accept() returns " << s << " " << errnoWithDescription(x) << endl;
+ if (x == EMFILE || x == ENFILE) {
+ // Connection still in listen queue but we can't accept it yet
+ error() << "Out of file descriptors. Waiting one second before trying to accept more connections." << warnings;
+ sleepsecs(1);
+ }
+ }
+ continue;
+ }
+ if (from.getType() != AF_UNIX)
+ disableNagle(s);
+ if ( _logConnect && ! cmdLine.quiet ){
+ int conns = connTicketHolder.used()+1;
+ const char* word = (conns == 1 ? " connection" : " connections");
+ log() << "connection accepted from " << from.toString() << " #" << ++connNumber << " (" << conns << word << " now open)" << endl;
+ }
+
+ Socket newSock = Socket(s, from);
+#ifdef MONGO_SSL
+ if ( _ssl && ( _sslPort == 0 || sslSocks.count(*it) ) ) {
+ newSock.secureAccepted( _ssl );
+ }
+#endif
+ accepted( newSock );
+ }
+ }
+ }
+
+ void Listener::_logListen( int port , bool ssl ) {
+ log() << _name << ( _name.size() ? " " : "" ) << "waiting for connections on port " << port << ( ssl ? " ssl" : "" ) << endl;
+ }
+
+
+ void Listener::accepted(Socket socket) {
+ accepted( new MessagingPort(socket) );
+ }
+
+ void Listener::accepted(MessagingPort *mp) {
+        assert(!"You must override one of the accepted methods");
+ }
+
+ // ----- ListeningSockets -------
+
+ ListeningSockets* ListeningSockets::_instance = new ListeningSockets();
+
+ ListeningSockets* ListeningSockets::get() {
+ return _instance;
+ }
+
+ // ------ connection ticket and control ------
+
+ const int DEFAULT_MAX_CONN = 20000;
+ const int MAX_MAX_CONN = 20000;
+
+ int getMaxConnections() {
+#ifdef _WIN32
+ return DEFAULT_MAX_CONN;
+#else
+ struct rlimit limit;
+ assert( getrlimit(RLIMIT_NOFILE,&limit) == 0 );
+
+ int max = (int)(limit.rlim_cur * .8);
+
+ log(1) << "fd limit"
+ << " hard:" << limit.rlim_max
+ << " soft:" << limit.rlim_cur
+ << " max conn: " << max
+ << endl;
+
+ if ( max > MAX_MAX_CONN )
+ max = MAX_MAX_CONN;
+
+ return max;
+#endif
+ }
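+
+    // e.g. with a soft fd limit of 1024 (ulimit -n 1024), the computed max is
+    // 0.8 * 1024 = 819 connections; very large limits are capped at MAX_MAX_CONN.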
+
+ void checkTicketNumbers() {
+ int want = getMaxConnections();
+ int current = connTicketHolder.outof();
+ if ( current != DEFAULT_MAX_CONN ) {
+ if ( current < want ) {
+ // they want fewer than they can handle
+ // which is fine
+ log(1) << " only allowing " << current << " connections" << endl;
+ return;
+ }
+ if ( current > want ) {
+ log() << " --maxConns too high, can only handle " << want << endl;
+ }
+ }
+ connTicketHolder.resize( want );
+ }
+
+ TicketHolder connTicketHolder(DEFAULT_MAX_CONN);
+
+}
diff --git a/src/mongo/util/net/listen.h b/src/mongo/util/net/listen.h
new file mode 100644
index 00000000000..ca90e835b97
--- /dev/null
+++ b/src/mongo/util/net/listen.h
@@ -0,0 +1,190 @@
+// listen.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "sock.h"
+
+namespace mongo {
+
+ class MessagingPort;
+
+ class Listener : boost::noncopyable {
+ public:
+
+ Listener(const string& name, const string &ip, int port, bool logConnect=true );
+
+ virtual ~Listener();
+
+#ifdef MONGO_SSL
+ /**
+ * make this an ssl socket
+ * ownership of SSLManager remains with the caller
+ */
+ void secure( SSLManager* manager );
+
+ void addSecurePort( SSLManager* manager , int additionalPort );
+#endif
+
+ void initAndListen(); // never returns unless error (start a thread)
+
+ /* spawn a thread, etc., then return */
+ virtual void accepted(Socket socket);
+ virtual void accepted(MessagingPort *mp);
+
+ const int _port;
+
+ /**
+ * @return a rough estimate of elapsed time since the server started
+ */
+ long long getMyElapsedTimeMillis() const { return _elapsedTime; }
+
+ void setAsTimeTracker() {
+ _timeTracker = this;
+ }
+
+ static const Listener* getTimeTracker() {
+ return _timeTracker;
+ }
+
+ static long long getElapsedTimeMillis() {
+ if ( _timeTracker )
+ return _timeTracker->getMyElapsedTimeMillis();
+
+ // should this assert or throw? seems like callers may not expect to get zero back, certainly not forever.
+ return 0;
+ }
+
+ private:
+ string _name;
+ string _ip;
+ bool _logConnect;
+ long long _elapsedTime;
+
+#ifdef MONGO_SSL
+ SSLManager* _ssl;
+ int _sslPort;
+#endif
+
+ /**
+ * @return true iff everything went ok
+ */
+ bool _setupSockets( const vector<SockAddr>& mine , vector<SOCKET>& socks );
+
+ void _logListen( int port , bool ssl );
+
+ static const Listener* _timeTracker;
+
+ virtual bool useUnixSockets() const { return false; }
+ };
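+
+    /* usage sketch - a minimal subclass (hypothetical; accepted() must be overridden):
+
+         class EchoListener : public Listener {
+         public:
+             EchoListener( int port ) : Listener( "echo" , "" , port ) {}
+             virtual void accepted( MessagingPort *mp ) {
+                 // spawn a thread to service (and eventually delete) mp, then return promptly
+             }
+         };
+         // EchoListener( 27017 ).initAndListen(); // blocks; run from a dedicated thread
+    */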
+
+    /**
+     * keeps track of elapsed time; after a set number of hits or a set amount
+     * of time, tells you to do something.
+     * only in this file because it depends on Listener
+     */
+ class ElapsedTracker {
+ public:
+ ElapsedTracker( int hitsBetweenMarks , int msBetweenMarks )
+ : _h( hitsBetweenMarks ) , _ms( msBetweenMarks ) , _pings(0) {
+ _last = Listener::getElapsedTimeMillis();
+ }
+
+ /**
+ * call this for every iteration
+ * returns true if one of the triggers has gone off
+ */
+ bool intervalHasElapsed() {
+ if ( ( ++_pings % _h ) == 0 ) {
+ _last = Listener::getElapsedTimeMillis();
+ return true;
+ }
+
+ long long now = Listener::getElapsedTimeMillis();
+ if ( now - _last > _ms ) {
+ _last = now;
+ return true;
+ }
+
+ return false;
+ }
+
+ private:
+ const int _h;
+ const int _ms;
+
+ unsigned long long _pings;
+
+ long long _last;
+
+ };
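+
+    /* usage sketch: trigger at most every 128 hits or every 1000ms, whichever comes first:
+
+         static ElapsedTracker t( 128 , 1000 );
+         if ( t.intervalHasElapsed() ) {
+             // ...do periodic work...
+         }
+    */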
+
+ class ListeningSockets {
+ public:
+ ListeningSockets()
+ : _mutex("ListeningSockets")
+ , _sockets( new set<int>() )
+ , _socketPaths( new set<string>() )
+ { }
+ void add( int sock ) {
+ scoped_lock lk( _mutex );
+ _sockets->insert( sock );
+ }
+ void addPath( string path ) {
+ scoped_lock lk( _mutex );
+ _socketPaths->insert( path );
+ }
+ void remove( int sock ) {
+ scoped_lock lk( _mutex );
+ _sockets->erase( sock );
+ }
+ void closeAll() {
+ set<int>* sockets;
+ set<string>* paths;
+
+ {
+ scoped_lock lk( _mutex );
+ sockets = _sockets;
+ _sockets = new set<int>();
+ paths = _socketPaths;
+ _socketPaths = new set<string>();
+ }
+
+ for ( set<int>::iterator i=sockets->begin(); i!=sockets->end(); i++ ) {
+ int sock = *i;
+ log() << "closing listening socket: " << sock << endl;
+ closesocket( sock );
+ }
+
+ for ( set<string>::iterator i=paths->begin(); i!=paths->end(); i++ ) {
+ string path = *i;
+ log() << "removing socket file: " << path << endl;
+ ::remove( path.c_str() );
+ }
+ }
+ static ListeningSockets* get();
+ private:
+ mongo::mutex _mutex;
+ set<int>* _sockets;
+ set<string>* _socketPaths; // for unix domain sockets
+ static ListeningSockets* _instance;
+ };
+
+
+ extern TicketHolder connTicketHolder;
+
+}
diff --git a/src/mongo/util/net/message.cpp b/src/mongo/util/net/message.cpp
new file mode 100644
index 00000000000..a84e5c48c5c
--- /dev/null
+++ b/src/mongo/util/net/message.cpp
@@ -0,0 +1,64 @@
+// message.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+
+#include <fcntl.h>
+#include <errno.h>
+#include <time.h>
+
+#include "message.h"
+#include "message_port.h"
+#include "listen.h"
+
+#include "../goodies.h"
+#include "../../client/dbclient.h"
+
+namespace mongo {
+
+ void Message::send( MessagingPort &p, const char *context ) {
+ if ( empty() ) {
+ return;
+ }
+ if ( _buf != 0 ) {
+ p.send( (char*)_buf, _buf->len, context );
+ }
+ else {
+ p.send( _data, context );
+ }
+ }
+
+ MSGID NextMsgId;
+
+ /*struct MsgStart {
+ MsgStart() {
+ NextMsgId = (((unsigned) time(0)) << 16) ^ curTimeMillis();
+ assert(MsgDataHeaderSize == 16);
+ }
+ } msgstart;*/
+
+ MSGID nextMessageId() {
+ MSGID msgid = NextMsgId++;
+ return msgid;
+ }
+
+ bool doesOpGetAResponse( int op ) {
+ return op == dbQuery || op == dbGetMore;
+ }
+
+
+} // namespace mongo
diff --git a/src/mongo/util/net/message.h b/src/mongo/util/net/message.h
new file mode 100644
index 00000000000..14b0241f21d
--- /dev/null
+++ b/src/mongo/util/net/message.h
@@ -0,0 +1,312 @@
+// message.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "sock.h"
+#include "../../bson/util/atomic_int.h"
+#include "hostandport.h"
+
+namespace mongo {
+
+ class Message;
+ class MessagingPort;
+ class PiggyBackData;
+
+ typedef AtomicUInt MSGID;
+
+ enum Operations {
+ opReply = 1, /* reply. responseTo is set. */
+ dbMsg = 1000, /* generic msg command followed by a string */
+ dbUpdate = 2001, /* update object */
+ dbInsert = 2002,
+ //dbGetByOID = 2003,
+ dbQuery = 2004,
+ dbGetMore = 2005,
+ dbDelete = 2006,
+ dbKillCursors = 2007
+ };
+
+ bool doesOpGetAResponse( int op );
+
+ inline const char * opToString( int op ) {
+ switch ( op ) {
+ case 0: return "none";
+ case opReply: return "reply";
+ case dbMsg: return "msg";
+ case dbUpdate: return "update";
+ case dbInsert: return "insert";
+ case dbQuery: return "query";
+ case dbGetMore: return "getmore";
+ case dbDelete: return "remove";
+ case dbKillCursors: return "killcursors";
+ default:
+ PRINT(op);
+ assert(0);
+ return "";
+ }
+ }
+
+ inline bool opIsWrite( int op ) {
+ switch ( op ) {
+
+ case 0:
+ case opReply:
+ case dbMsg:
+ case dbQuery:
+ case dbGetMore:
+ case dbKillCursors:
+ return false;
+
+ case dbUpdate:
+ case dbInsert:
+ case dbDelete:
+ return true;
+
+ default:
+ PRINT(op);
+ assert(0);
+            return false;
+ }
+
+ }
+
+#pragma pack(1)
+ /* see http://www.mongodb.org/display/DOCS/Mongo+Wire+Protocol
+ */
+ struct MSGHEADER {
+ int messageLength; // total message size, including this
+ int requestID; // identifier for this message
+ int responseTo; // requestID from the original request
+                           //    (used in responses from db)
+ int opCode;
+ };
+ struct OP_GETMORE : public MSGHEADER {
+ MSGHEADER header; // standard message header
+ int ZERO_or_flags; // 0 - reserved for future use
+ //cstring fullCollectionName; // "dbname.collectionname"
+ //int32 numberToReturn; // number of documents to return
+ //int64 cursorID; // cursorID from the OP_REPLY
+ };
+#pragma pack()
+
+#pragma pack(1)
+ /* todo merge this with MSGHEADER (or inherit from it). */
+ struct MsgData {
+ int len; /* len of the msg, including this field */
+ MSGID id; /* request/reply id's match... */
+ MSGID responseTo; /* id of the message we are responding to */
+ short _operation;
+ char _flags;
+ char _version;
+ int operation() const {
+ return _operation;
+ }
+ void setOperation(int o) {
+ _flags = 0;
+ _version = 0;
+ _operation = o;
+ }
+ char _data[4];
+
+ int& dataAsInt() {
+ return *((int *) _data);
+ }
+
+ bool valid() {
+ if ( len <= 0 || len > ( 4 * BSONObjMaxInternalSize ) )
+ return false;
+ if ( _operation < 0 || _operation > 30000 )
+ return false;
+ return true;
+ }
+
+ long long getCursor() {
+ assert( responseTo > 0 );
+ assert( _operation == opReply );
+ long long * l = (long long *)(_data + 4);
+ return l[0];
+ }
+
+ int dataLen(); // len without header
+ };
+ const int MsgDataHeaderSize = sizeof(MsgData) - 4;
+ inline int MsgData::dataLen() {
+ return len - MsgDataHeaderSize;
+ }
+#pragma pack()
+
+ class Message {
+ public:
+ // we assume here that a vector with initial size 0 does no allocation (0 is the default, but wanted to make it explicit).
+ Message() : _buf( 0 ), _data( 0 ), _freeIt( false ) {}
+ Message( void * data , bool freeIt ) :
+ _buf( 0 ), _data( 0 ), _freeIt( false ) {
+ _setData( reinterpret_cast< MsgData* >( data ), freeIt );
+        }
+ Message(Message& r) : _buf( 0 ), _data( 0 ), _freeIt( false ) {
+ *this = r;
+ }
+ ~Message() {
+ reset();
+ }
+
+ SockAddr _from;
+
+ MsgData *header() const {
+ assert( !empty() );
+ return _buf ? _buf : reinterpret_cast< MsgData* > ( _data[ 0 ].first );
+ }
+ int operation() const { return header()->operation(); }
+
+ MsgData *singleData() const {
+ massert( 13273, "single data buffer expected", _buf );
+ return header();
+ }
+
+ bool empty() const { return !_buf && _data.empty(); }
+
+ int size() const {
+ int res = 0;
+ if ( _buf ) {
+ res = _buf->len;
+ }
+ else {
+ for (MsgVec::const_iterator it = _data.begin(); it != _data.end(); ++it) {
+ res += it->second;
+ }
+ }
+ return res;
+ }
+
+ int dataSize() const { return size() - sizeof(MSGHEADER); }
+
+ // concat multiple buffers - noop if <2 buffers already, otherwise can be expensive copy
+ // can get rid of this if we make response handling smarter
+ void concat() {
+ if ( _buf || empty() ) {
+ return;
+ }
+
+ assert( _freeIt );
+ int totalSize = 0;
+ for( vector< pair< char *, int > >::const_iterator i = _data.begin(); i != _data.end(); ++i ) {
+ totalSize += i->second;
+ }
+ char *buf = (char*)malloc( totalSize );
+ char *p = buf;
+ for( vector< pair< char *, int > >::const_iterator i = _data.begin(); i != _data.end(); ++i ) {
+ memcpy( p, i->first, i->second );
+ p += i->second;
+ }
+ reset();
+ _setData( (MsgData*)buf, true );
+ }
+
+ // vector swap() so this is fast
+ Message& operator=(Message& r) {
+ assert( empty() );
+ assert( r._freeIt );
+ _buf = r._buf;
+ r._buf = 0;
+ if ( r._data.size() > 0 ) {
+ _data.swap( r._data );
+ }
+ r._freeIt = false;
+ _freeIt = true;
+ return *this;
+ }
+
+ void reset() {
+ if ( _freeIt ) {
+ if ( _buf ) {
+ free( _buf );
+ }
+ for( vector< pair< char *, int > >::const_iterator i = _data.begin(); i != _data.end(); ++i ) {
+ free(i->first);
+ }
+ }
+ _buf = 0;
+ _data.clear();
+ _freeIt = false;
+ }
+
+ // use to add a buffer
+ // assumes message will free everything
+ void appendData(char *d, int size) {
+ if ( size <= 0 ) {
+ return;
+ }
+ if ( empty() ) {
+ MsgData *md = (MsgData*)d;
+ md->len = size; // can be updated later if more buffers added
+ _setData( md, true );
+ return;
+ }
+ assert( _freeIt );
+ if ( _buf ) {
+ _data.push_back( make_pair( (char*)_buf, _buf->len ) );
+ _buf = 0;
+ }
+ _data.push_back( make_pair( d, size ) );
+ header()->len += size;
+ }
+
+ // use to set first buffer if empty
+ void setData(MsgData *d, bool freeIt) {
+ assert( empty() );
+ _setData( d, freeIt );
+ }
+ void setData(int operation, const char *msgtxt) {
+ setData(operation, msgtxt, strlen(msgtxt)+1);
+ }
+ void setData(int operation, const char *msgdata, size_t len) {
+ assert( empty() );
+ size_t dataLen = len + sizeof(MsgData) - 4;
+ MsgData *d = (MsgData *) malloc(dataLen);
+ memcpy(d->_data, msgdata, len);
+ d->len = fixEndian(dataLen);
+ d->setOperation(operation);
+ _setData( d, true );
+ }
+
+ bool doIFreeIt() {
+ return _freeIt;
+ }
+
+ void send( MessagingPort &p, const char *context );
+
+ string toString() const;
+
+ private:
+ void _setData( MsgData *d, bool freeIt ) {
+ _freeIt = freeIt;
+ _buf = d;
+ }
+ // if just one buffer, keep it in _buf, otherwise keep a sequence of buffers in _data
+ MsgData * _buf;
+ // byte buffer(s) - the first must contain at least a full MsgData unless using _buf for storage instead
+ typedef vector< pair< char*, int > > MsgVec;
+ MsgVec _data;
+ bool _freeIt;
+ };
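+
+    /* usage sketch: build and inspect a simple message (hypothetical payload):
+
+         Message m;
+         m.setData( dbMsg , "hello" );  // allocates header + copies payload
+         m.header()->len;               // 16 (header) + strlen("hello") + 1 == 22
+         m.operation();                 // dbMsg
+    */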
+
+
+ MSGID nextMessageId();
+
+
+} // namespace mongo
diff --git a/src/mongo/util/net/message_port.cpp b/src/mongo/util/net/message_port.cpp
new file mode 100644
index 00000000000..c342ed3c8b7
--- /dev/null
+++ b/src/mongo/util/net/message_port.cpp
@@ -0,0 +1,303 @@
+// message_port.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+
+#include <fcntl.h>
+#include <errno.h>
+#include <time.h>
+
+#include "message.h"
+#include "message_port.h"
+#include "listen.h"
+
+#include "../goodies.h"
+#include "../background.h"
+#include "../time_support.h"
+#include "../../db/cmdline.h"
+#include "../../client/dbclient.h"
+
+
+#ifndef _WIN32
+# ifndef __sunos__
+# include <ifaddrs.h>
+# endif
+# include <sys/resource.h>
+# include <sys/stat.h>
+#else
+
+// errno doesn't work for winsock.
+#undef errno
+#define errno WSAGetLastError()
+
+#endif
+
+namespace mongo {
+
+
+// if you want trace output:
+#define mmm(x)
+
+ /* messagingport -------------------------------------------------------------- */
+
+ class PiggyBackData {
+ public:
+ PiggyBackData( MessagingPort * port ) {
+ _port = port;
+ _buf = new char[1300];
+ _cur = _buf;
+ }
+
+ ~PiggyBackData() {
+ DESTRUCTOR_GUARD (
+ flush();
+                delete[] _buf; // _buf owns the allocation (after flush(), _cur == _buf anyway)
+ );
+ }
+
+ void append( Message& m ) {
+ assert( m.header()->len <= 1300 );
+
+ if ( len() + m.header()->len > 1300 )
+ flush();
+
+ memcpy( _cur , m.singleData() , m.header()->len );
+ _cur += m.header()->len;
+ }
+
+ void flush() {
+ if ( _buf == _cur )
+ return;
+
+ _port->send( _buf , len(), "flush" );
+ _cur = _buf;
+ }
+
+ int len() const { return _cur - _buf; }
+
+ private:
+ MessagingPort* _port;
+ char * _buf;
+ char * _cur;
+ };
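+
+    // 1300 bytes is used above as a rough single-ethernet-packet (MTU) budget:
+    // messages smaller than that are coalesced in PiggyBackData and sent together.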
+
+ class Ports {
+ set<MessagingPort*> ports;
+ mongo::mutex m;
+ public:
+ Ports() : ports(), m("Ports") {}
+ void closeAll(unsigned skip_mask) {
+ scoped_lock bl(m);
+ for ( set<MessagingPort*>::iterator i = ports.begin(); i != ports.end(); i++ ) {
+ if( (*i)->tag & skip_mask )
+ continue;
+ (*i)->shutdown();
+ }
+ }
+ void insert(MessagingPort* p) {
+ scoped_lock bl(m);
+ ports.insert(p);
+ }
+ void erase(MessagingPort* p) {
+ scoped_lock bl(m);
+ ports.erase(p);
+ }
+ };
+
+    // we "new" this so it is still around when other automatic global vars
+ // are being destructed during termination.
+ Ports& ports = *(new Ports());
+
+ void MessagingPort::closeAllSockets(unsigned mask) {
+ ports.closeAll(mask);
+ }
+
+ MessagingPort::MessagingPort(int fd, const SockAddr& remote)
+ : Socket( fd , remote ) , piggyBackData(0) {
+ ports.insert(this);
+ }
+
+ MessagingPort::MessagingPort( double timeout, int ll )
+ : Socket( timeout, ll ) {
+ ports.insert(this);
+ piggyBackData = 0;
+ }
+
+ MessagingPort::MessagingPort( Socket& sock )
+ : Socket( sock ) , piggyBackData( 0 ) {
+ ports.insert(this);
+ }
+
+ void MessagingPort::shutdown() {
+ close();
+ }
+
+ MessagingPort::~MessagingPort() {
+ if ( piggyBackData )
+ delete( piggyBackData );
+ shutdown();
+ ports.erase(this);
+ }
+
+ bool MessagingPort::recv(Message& m) {
+ try {
+again:
+ mmm( log() << "* recv() sock:" << this->sock << endl; )
+ int len = -1;
+
+ char *lenbuf = (char *) &len;
+ int lft = 4;
+ Socket::recv( lenbuf, lft );
+
+ if ( len < 16 || len > 48000000 ) { // messages must be large enough for headers
+ if ( len == -1 ) {
+ // Endian check from the client, after connecting, to see what mode server is running in.
+ unsigned foo = 0x10203040;
+ send( (char *) &foo, 4, "endian" );
+ goto again;
+ }
+
+ if ( len == 542393671 ) {
+ // an http GET
+ log(_logLevel) << "looks like you're trying to access db over http on native driver port. please add 1000 for webserver" << endl;
+ string msg = "You are trying to access MongoDB on the native driver port. For http diagnostic access, add 1000 to the port number\n";
+ stringstream ss;
+ ss << "HTTP/1.0 200 OK\r\nConnection: close\r\nContent-Type: text/plain\r\nContent-Length: " << msg.size() << "\r\n\r\n" << msg;
+ string s = ss.str();
+ send( s.c_str(), s.size(), "http" );
+ return false;
+ }
+                log(0) << "recv(): message len " << len << " is too large" << endl;
+ return false;
+ }
+
+ int z = (len+1023)&0xfffffc00;
+ assert(z>=len);
+ MsgData *md = (MsgData *) malloc(z);
+ assert(md);
+ md->len = len;
+
+ char *p = (char *) &md->id;
+ int left = len -4;
+
+ try {
+ Socket::recv( p, left );
+ }
+ catch (...) {
+ free(md);
+ throw;
+ }
+
+ m.setData(md, true);
+ return true;
+
+ }
+ catch ( const SocketException & e ) {
+ log(_logLevel + (e.shouldPrint() ? 0 : 1) ) << "SocketException: remote: " << remote() << " error: " << e << endl;
+ m.reset();
+ return false;
+ }
+ }
+
+ void MessagingPort::reply(Message& received, Message& response) {
+ say(/*received.from, */response, received.header()->id);
+ }
+
+ void MessagingPort::reply(Message& received, Message& response, MSGID responseTo) {
+ say(/*received.from, */response, responseTo);
+ }
+
+ bool MessagingPort::call(Message& toSend, Message& response) {
+ mmm( log() << "*call()" << endl; )
+ say(toSend);
+ return recv( toSend , response );
+ }
+
+ bool MessagingPort::recv( const Message& toSend , Message& response ) {
+ while ( 1 ) {
+ bool ok = recv(response);
+ if ( !ok )
+ return false;
+ //log() << "got response: " << response.data->responseTo << endl;
+ if ( response.header()->responseTo == toSend.header()->id )
+ break;
+ error() << "MessagingPort::call() wrong id got:" << hex << (unsigned)response.header()->responseTo << " expect:" << (unsigned)toSend.header()->id << '\n'
+ << dec
+ << " toSend op: " << (unsigned)toSend.operation() << '\n'
+ << " response msgid:" << (unsigned)response.header()->id << '\n'
+ << " response len: " << (unsigned)response.header()->len << '\n'
+ << " response op: " << response.operation() << '\n'
+ << " remote: " << remoteString() << endl;
+ assert(false);
+ response.reset();
+ }
+ mmm( log() << "*call() end" << endl; )
+ return true;
+ }
+
+ void MessagingPort::assertStillConnected() {
+ uassert(15901, "client disconnected during operation", Socket::stillConnected());
+ }
+
+ void MessagingPort::say(Message& toSend, int responseTo) {
+ assert( !toSend.empty() );
+ mmm( log() << "* say() sock:" << this->sock << " thr:" << GetCurrentThreadId() << endl; )
+ toSend.header()->id = nextMessageId();
+ toSend.header()->responseTo = responseTo;
+
+ if ( piggyBackData && piggyBackData->len() ) {
+ mmm( log() << "* have piggy back" << endl; )
+ if ( ( piggyBackData->len() + toSend.header()->len ) > 1300 ) {
+ // won't fit in a packet - so just send it off
+ piggyBackData->flush();
+ }
+ else {
+ piggyBackData->append( toSend );
+ piggyBackData->flush();
+ return;
+ }
+ }
+
+ toSend.send( *this, "say" );
+ }
+
+ void MessagingPort::piggyBack( Message& toSend , int responseTo ) {
+
+ if ( toSend.header()->len > 1300 ) {
+            // not worth saving because it's almost an entire packet
+ say( toSend );
+ return;
+ }
+
+ // we're going to be storing this, so need to set it up
+ toSend.header()->id = nextMessageId();
+ toSend.header()->responseTo = responseTo;
+
+ if ( ! piggyBackData )
+ piggyBackData = new PiggyBackData( this );
+
+ piggyBackData->append( toSend );
+ }
+
+ HostAndPort MessagingPort::remote() const {
+ if ( ! _remoteParsed.hasPort() )
+ _remoteParsed = HostAndPort( remoteAddr() );
+ return _remoteParsed;
+ }
+
+
+} // namespace mongo
diff --git a/src/mongo/util/net/message_port.h b/src/mongo/util/net/message_port.h
new file mode 100644
index 00000000000..5d404d84f8a
--- /dev/null
+++ b/src/mongo/util/net/message_port.h
@@ -0,0 +1,108 @@
+// message_port.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "sock.h"
+#include "message.h"
+
+namespace mongo {
+
+ class MessagingPort;
+ class PiggyBackData;
+
+ typedef AtomicUInt MSGID;
+
+ class AbstractMessagingPort : boost::noncopyable {
+ public:
+ AbstractMessagingPort() : tag(0) {}
+ virtual ~AbstractMessagingPort() { }
+ virtual void reply(Message& received, Message& response, MSGID responseTo) = 0; // like the reply below, but doesn't rely on received.data still being available
+ virtual void reply(Message& received, Message& response) = 0;
+
+ virtual HostAndPort remote() const = 0;
+ virtual unsigned remotePort() const = 0;
+
+ virtual void assertStillConnected() = 0;
+
+ public:
+ // TODO make this private with some helpers
+
+ /* ports can be tagged with various classes. see closeAllSockets(tag). defaults to 0. */
+ unsigned tag;
+
+ };
+
+ class MessagingPort : public AbstractMessagingPort , public Socket {
+ public:
+ MessagingPort(int fd, const SockAddr& remote);
+
+ // in some cases the timeout will actually be 2x this value - eg we do a partial send,
+ // then the timeout fires, then we try to send again, then the timeout fires again with
+ // no data sent, then we detect that the other side is down
+ MessagingPort(double so_timeout = 0, int logLevel = 0 );
+
+ MessagingPort(Socket& socket);
+
+ virtual ~MessagingPort();
+
+ void shutdown();
+
+        /* it's assumed that if you reuse a Message object, it doesn't cross MessagingPorts.
+           also, the Message data will go out of scope on the subsequent recv call.
+ */
+ bool recv(Message& m);
+ void reply(Message& received, Message& response, MSGID responseTo);
+ void reply(Message& received, Message& response);
+ bool call(Message& toSend, Message& response);
+
+ void say(Message& toSend, int responseTo = -1);
+
+ /**
+ * this is used for doing 'async' queries
+ * instead of doing call( to , from )
+ * you would do
+ * say( to )
+ * recv( from )
+ * Note: if you fail to call recv and someone else uses this port,
+         * horrible things will happen
+ */
+ bool recv( const Message& sent , Message& response );
+
+ void piggyBack( Message& toSend , int responseTo = -1 );
+
+ unsigned remotePort() const { return Socket::remotePort(); }
+ virtual HostAndPort remote() const;
+
+ void assertStillConnected();
+
+ private:
+
+ PiggyBackData * piggyBackData;
+
+ // this is the parsed version of remote
+        // mutable because it's initialized only on call to remote()
+ mutable HostAndPort _remoteParsed;
+
+ public:
+ static void closeAllSockets(unsigned tagMask = 0xffffffff);
+
+ friend class PiggyBackData;
+ };
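+
+    /* usage sketch - synchronous round trip (hypothetical endpoint; assumes
+       Socket::connect(const SockAddr&) as used in httpclient.cpp, and that
+       toSend is an op that gets a response - see doesOpGetAResponse()):
+
+         MessagingPort p;
+         if ( p.connect( SockAddr( "localhost" , 27017 ) ) ) {
+             Message q , r;
+             // ...fill q with a dbQuery-style message...
+             p.call( q , r );  // say() then recv(); r's responseTo must match q's id
+         }
+    */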
+
+
+} // namespace mongo
diff --git a/src/mongo/util/net/message_server.h b/src/mongo/util/net/message_server.h
new file mode 100644
index 00000000000..ae77b97bb0f
--- /dev/null
+++ b/src/mongo/util/net/message_server.h
@@ -0,0 +1,66 @@
+// message_server.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ abstract database server
+ async io core, worker thread system
+ */
+
+#pragma once
+
+#include "../../pch.h"
+
+namespace mongo {
+
+ class MessageHandler {
+ public:
+ virtual ~MessageHandler() {}
+
+ /**
+ * called once when a socket is connected
+ */
+ virtual void connected( AbstractMessagingPort* p ) = 0;
+
+ /**
+ * called every time a message comes in
+ * handler is responsible for responding to client
+ */
+ virtual void process( Message& m , AbstractMessagingPort* p , LastError * err ) = 0;
+
+ /**
+ * called once when a socket is disconnected
+ */
+ virtual void disconnected( AbstractMessagingPort* p ) = 0;
+ };
+
+ class MessageServer {
+ public:
+ struct Options {
+ int port; // port to bind to
+ string ipList; // addresses to bind to
+
+ Options() : port(0), ipList("") {}
+ };
+
+ virtual ~MessageServer() {}
+ virtual void run() = 0;
+ virtual void setAsTimeTracker() = 0;
+ };
+
+ // TODO use a factory here to decide between port and asio variations
+ MessageServer * createServer( const MessageServer::Options& opts , MessageHandler * handler );
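+
+    /* usage sketch - a minimal handler (hypothetical):
+
+         class EchoHandler : public MessageHandler {
+             virtual void connected( AbstractMessagingPort* p ) {}
+             virtual void process( Message& m , AbstractMessagingPort* p , LastError * le ) {
+                 Message reply;
+                 // ...build reply, then respond to the client...
+                 p->reply( m , reply );
+             }
+             virtual void disconnected( AbstractMessagingPort* p ) {}
+         };
+         // MessageServer::Options opts; opts.port = 27017;
+         // createServer( opts , new EchoHandler() )->run();
+    */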
+}
diff --git a/src/mongo/util/net/message_server_asio.cpp b/src/mongo/util/net/message_server_asio.cpp
new file mode 100644
index 00000000000..0c6a7d925da
--- /dev/null
+++ b/src/mongo/util/net/message_server_asio.cpp
@@ -0,0 +1,261 @@
+// message_server_asio.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef USE_ASIO
+
+#include <boost/asio.hpp>
+#include <boost/bind.hpp>
+#include <boost/enable_shared_from_this.hpp>
+#include <boost/shared_ptr.hpp>
+
+#include <iostream>
+#include <vector>
+
+#include "message.h"
+#include "message_server.h"
+#include "../util/concurrency/mvar.h"
+
+using namespace boost;
+using namespace boost::asio;
+using namespace boost::asio::ip;
+
+namespace mongo {
+ class MessageServerSession;
+
+ namespace {
+ class StickyThread {
+ public:
+ StickyThread()
+ : _thread(boost::ref(*this))
+ {}
+
+ ~StickyThread() {
+ _mss.put(boost::shared_ptr<MessageServerSession>());
+ _thread.join();
+ }
+
+ void ready(boost::shared_ptr<MessageServerSession> mss) {
+ _mss.put(mss);
+ }
+
+ void operator() () {
+ boost::shared_ptr<MessageServerSession> mss;
+ while((mss = _mss.take())) { // intentionally not using ==
+ task(mss.get());
+ mss.reset();
+ }
+ }
+
+ private:
+ boost::thread _thread;
+ inline void task(MessageServerSession* mss); // must be defined after MessageServerSession
+
+ MVar<boost::shared_ptr<MessageServerSession> > _mss; // populated when given a task
+ };
+
+ vector<boost::shared_ptr<StickyThread> > thread_pool;
+ mongo::mutex tp_mutex; // this is only needed if io_service::run() is called from multiple threads
+ }
+
+ class MessageServerSession : public boost::enable_shared_from_this<MessageServerSession> , public AbstractMessagingPort {
+ public:
+ MessageServerSession( MessageHandler * handler , io_service& ioservice )
+ : _handler( handler )
+ , _socket( ioservice )
+ , _portCache(0)
+ { }
+
+ ~MessageServerSession() {
+ cout << "disconnect from: " << _socket.remote_endpoint() << endl;
+ }
+
+ tcp::socket& socket() {
+ return _socket;
+ }
+
+ void start() {
+ cout << "MessageServerSession start from:" << _socket.remote_endpoint() << endl;
+ _startHeaderRead();
+ }
+
+ void handleReadHeader( const boost::system::error_code& error ) {
+ if ( _inHeader.len == 0 )
+ return;
+
+ if ( ! _inHeader.valid() ) {
+                cout << " got invalid header from: " << _socket.remote_endpoint() << " closing connection" << endl;
+ return;
+ }
+
+ char * raw = (char*)malloc( _inHeader.len );
+
+ MsgData * data = (MsgData*)raw;
+ memcpy( data , &_inHeader , sizeof( _inHeader ) );
+ assert( data->len == _inHeader.len );
+
+            uassert( 10273 , "_cur not empty! pipelining requests not supported" , _cur.empty() );
+
+ _cur.setData( data , true );
+ async_read( _socket ,
+ buffer( raw + sizeof( _inHeader ) , _inHeader.len - sizeof( _inHeader ) ) ,
+ boost::bind( &MessageServerSession::handleReadBody , shared_from_this() , boost::asio::placeholders::error ) );
+ }
+
+ void handleReadBody( const boost::system::error_code& error ) {
+ if (!_myThread) {
+ mongo::mutex::scoped_lock(tp_mutex);
+ if (!thread_pool.empty()) {
+ _myThread = thread_pool.back();
+ thread_pool.pop_back();
+ }
+ }
+
+ if (!_myThread) // pool is empty
+ _myThread.reset(new StickyThread());
+
+ assert(_myThread);
+
+ _myThread->ready(shared_from_this());
+ }
+
+ void process() {
+            _handler->process( _cur , this , 0 ); // no LastError tracking in the asio path; pass 0
+
+            if ( !_reply.empty() ) {
+                async_write( _socket ,
+                             buffer( (char*)_reply.header() , _reply.header()->len ) ,
+ boost::bind( &MessageServerSession::handleWriteDone , shared_from_this() , boost::asio::placeholders::error ) );
+ }
+ else {
+ _cur.reset();
+ _startHeaderRead();
+ }
+ }
+
+ void handleWriteDone( const boost::system::error_code& error ) {
+ {
+ // return thread to pool after we have sent data to the client
+ mongo::mutex::scoped_lock(tp_mutex);
+ assert(_myThread);
+ thread_pool.push_back(_myThread);
+ _myThread.reset();
+ }
+ _cur.reset();
+ _reply.reset();
+ _startHeaderRead();
+ }
+
+ virtual void reply( Message& received, Message& response ) {
+            reply( received , response , received.header()->id );
+ }
+
+ virtual void reply( Message& query , Message& toSend, MSGID responseTo ) {
+ _reply = toSend;
+
+            _reply.header()->id = nextMessageId();
+            _reply.header()->responseTo = responseTo;
+            uassert( 10274 , "pipelining requests doesn't work yet" , query.header()->id == _cur.header()->id );
+ }
+
+
+ virtual unsigned remotePort() {
+ if (!_portCache)
+ _portCache = _socket.remote_endpoint().port(); //this is expensive
+ return _portCache;
+ }
+
+ private:
+
+ void _startHeaderRead() {
+ _inHeader.len = 0;
+ async_read( _socket ,
+ buffer( &_inHeader , sizeof( _inHeader ) ) ,
+ boost::bind( &MessageServerSession::handleReadHeader , shared_from_this() , boost::asio::placeholders::error ) );
+ }
+
+ MessageHandler * _handler;
+ tcp::socket _socket;
+ MsgData _inHeader;
+ Message _cur;
+ Message _reply;
+
+ unsigned _portCache;
+
+ boost::shared_ptr<StickyThread> _myThread;
+ };
+
+ void StickyThread::task(MessageServerSession* mss) {
+ mss->process();
+ }
+
+
+ class AsyncMessageServer : public MessageServer {
+ public:
+ // TODO accept an IP address to bind to
+ AsyncMessageServer( const MessageServer::Options& opts , MessageHandler * handler )
+ : _port( opts.port )
+ , _handler(handler)
+ , _endpoint( tcp::v4() , opts.port )
+ , _acceptor( _ioservice , _endpoint ) {
+ _accept();
+ }
+ virtual ~AsyncMessageServer() {
+
+ }
+
+ void run() {
+ cout << "AsyncMessageServer starting to listen on: " << _port << endl;
+ boost::thread other(boost::bind(&io_service::run, &_ioservice));
+ _ioservice.run();
+ cout << "AsyncMessageServer done listening on: " << _port << endl;
+ }
+
+ void handleAccept( shared_ptr<MessageServerSession> session ,
+ const boost::system::error_code& error ) {
+ if ( error ) {
+ cout << "handleAccept error!" << endl;
+ return;
+ }
+ session->start();
+ _accept();
+ }
+
+ void _accept( ) {
+ shared_ptr<MessageServerSession> session( new MessageServerSession( _handler , _ioservice ) );
+ _acceptor.async_accept( session->socket() ,
+ boost::bind( &AsyncMessageServer::handleAccept,
+ this,
+ session,
+ boost::asio::placeholders::error )
+ );
+ }
+
+ private:
+ int _port;
+ MessageHandler * _handler;
+ io_service _ioservice;
+ tcp::endpoint _endpoint;
+ tcp::acceptor _acceptor;
+ };
+
+ MessageServer * createServer( const MessageServer::Options& opts , MessageHandler * handler ) {
+ return new AsyncMessageServer( opts , handler );
+ }
+
+}
+
+#endif
diff --git a/src/mongo/util/net/message_server_port.cpp b/src/mongo/util/net/message_server_port.cpp
new file mode 100644
index 00000000000..7e6a731529b
--- /dev/null
+++ b/src/mongo/util/net/message_server_port.cpp
@@ -0,0 +1,204 @@
+// message_server_port.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+
+#ifndef USE_ASIO
+
+#include "message.h"
+#include "message_port.h"
+#include "message_server.h"
+#include "listen.h"
+
+#include "../../db/cmdline.h"
+#include "../../db/lasterror.h"
+#include "../../db/stats/counters.h"
+
+#ifdef __linux__ // TODO: consider making this ifndef _WIN32
+# include <sys/resource.h>
+#endif
+
+namespace mongo {
+
+ namespace pms {
+
+ MessageHandler * handler;
+
+ void threadRun( MessagingPort * inPort) {
+ TicketHolderReleaser connTicketReleaser( &connTicketHolder );
+
+ setThreadName( "conn" );
+
+ assert( inPort );
+ inPort->setLogLevel(1);
+ scoped_ptr<MessagingPort> p( inPort );
+
+ p->postFork();
+
+ string otherSide;
+
+ Message m;
+ try {
+ LastError * le = new LastError();
+ lastError.reset( le ); // lastError now has ownership
+
+ otherSide = p->remoteString();
+
+ handler->connected( p.get() );
+
+ while ( ! inShutdown() ) {
+ m.reset();
+ p->clearCounters();
+
+ if ( ! p->recv(m) ) {
+ if( !cmdLine.quiet ){
+ int conns = connTicketHolder.used()-1;
+ const char* word = (conns == 1 ? " connection" : " connections");
+ log() << "end connection " << otherSide << " (" << conns << word << " now open)" << endl;
+ }
+ p->shutdown();
+ break;
+ }
+
+ handler->process( m , p.get() , le );
+ networkCounter.hit( p->getBytesIn() , p->getBytesOut() );
+ }
+ }
+ catch ( AssertionException& e ) {
+ log() << "AssertionException handling request, closing client connection: " << e << endl;
+ p->shutdown();
+ }
+ catch ( SocketException& e ) {
+ log() << "SocketException handling request, closing client connection: " << e << endl;
+ p->shutdown();
+ }
+ catch ( const ClockSkewException & ) {
+ log() << "ClockSkewException - shutting down" << endl;
+ exitCleanly( EXIT_CLOCK_SKEW );
+ }
+ catch ( const DBException& e ) { // must be right above std::exception to avoid catching subclasses
+ log() << "DBException handling request, closing client connection: " << e << endl;
+ p->shutdown();
+ }
+ catch ( std::exception &e ) {
+ error() << "Uncaught std::exception: " << e.what() << ", terminating" << endl;
+ dbexit( EXIT_UNCAUGHT );
+ }
+ catch ( ... ) {
+ error() << "Uncaught exception, terminating" << endl;
+ dbexit( EXIT_UNCAUGHT );
+ }
+
+ handler->disconnected( p.get() );
+ }
+
+ }
+
+ class PortMessageServer : public MessageServer , public Listener {
+ public:
+ PortMessageServer( const MessageServer::Options& opts, MessageHandler * handler ) :
+ Listener( "" , opts.ipList, opts.port ) {
+
+ uassert( 10275 , "multiple PortMessageServer not supported" , ! pms::handler );
+ pms::handler = handler;
+ }
+
+ virtual void accepted(MessagingPort * p) {
+
+ if ( ! connTicketHolder.tryAcquire() ) {
+ log() << "connection refused because too many open connections: " << connTicketHolder.used() << endl;
+
+ // TODO: would be nice if we notified them...
+ p->shutdown();
+ delete p;
+
+ sleepmillis(2); // otherwise we'll hard loop
+ return;
+ }
+
+ try {
+#ifndef __linux__ // TODO: consider making this ifdef _WIN32
+ boost::thread thr( boost::bind( &pms::threadRun , p ) );
+#else
+ pthread_attr_t attrs;
+ pthread_attr_init(&attrs);
+ pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
+
+ static const size_t STACK_SIZE = 1024*1024; // if we change this we need to update the warning
+
+ struct rlimit limits;
+ verify(15887, getrlimit(RLIMIT_STACK, &limits) == 0);
+ if (limits.rlim_cur > STACK_SIZE) {
+ pthread_attr_setstacksize(&attrs, (DEBUG_BUILD
+ ? (STACK_SIZE / 2)
+ : STACK_SIZE));
+ } else if (limits.rlim_cur < 1024*1024) {
+ warning() << "Stack size set to " << (limits.rlim_cur/1024) << "KB. We suggest 1MB" << endl;
+ }
+
+
+ pthread_t thread;
+ int failed = pthread_create(&thread, &attrs, (void*(*)(void*)) &pms::threadRun, p);
+
+ pthread_attr_destroy(&attrs);
+
+ if (failed) {
+ log() << "pthread_create failed: " << errnoWithDescription(failed) << endl;
+ throw boost::thread_resource_error(); // for consistency with boost::thread
+ }
+#endif
+ }
+ catch ( boost::thread_resource_error& ) {
+ connTicketHolder.release();
+ log() << "can't create new thread, closing connection" << endl;
+
+ p->shutdown();
+ delete p;
+
+ sleepmillis(2);
+ }
+ catch ( ... ) {
+ connTicketHolder.release();
+ log() << "unknown error accepting new socket" << endl;
+
+ p->shutdown();
+ delete p;
+
+ sleepmillis(2);
+ }
+
+ }
+
+ virtual void setAsTimeTracker() {
+ Listener::setAsTimeTracker();
+ }
+
+ void run() {
+ initAndListen();
+ }
+
+ virtual bool useUnixSockets() const { return true; }
+ };
+
+
+ MessageServer * createServer( const MessageServer::Options& opts , MessageHandler * handler ) {
+ return new PortMessageServer( opts , handler );
+ }
+
+}
+
+#endif
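
For reference, a minimal sketch of driving this server. EchoHandler is hypothetical; the MessageHandler method signatures are inferred from the call sites in pms::threadRun() above (connected/process/disconnected), so treat this as an illustration rather than the actual interface:

    // hypothetical handler, wired through the createServer() factory above
    class EchoHandler : public MessageHandler {
    public:
        virtual void connected( MessagingPort* p ) {
            log() << "conn accepted from " << p->remoteString() << endl;
        }
        virtual void process( Message& m, MessagingPort* p, LastError* le ) {
            // a real handler would decode m and reply through p; we only log
            log() << "processing one request" << endl;
        }
        virtual void disconnected( MessagingPort* p ) {
            log() << "conn closed: " << p->remoteString() << endl;
        }
    };

    // MessageServer::Options opts;                  // port/ipList as configured
    // MessageServer* ms = createServer( opts, new EchoHandler() );
    // ms->run();                                    // blocks in initAndListen()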
diff --git a/src/mongo/util/net/miniwebserver.cpp b/src/mongo/util/net/miniwebserver.cpp
new file mode 100644
index 00000000000..f0b58569d22
--- /dev/null
+++ b/src/mongo/util/net/miniwebserver.cpp
@@ -0,0 +1,212 @@
+// miniwebserver.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "miniwebserver.h"
+#include "../hex.h"
+
+#include "pcrecpp.h"
+
+namespace mongo {
+
+ MiniWebServer::MiniWebServer(const string& name, const string &ip, int port)
+ : Listener(name, ip, port, false)
+ {}
+
+ string MiniWebServer::parseURL( const char * buf ) {
+ const char * urlStart = strchr( buf , ' ' );
+ if ( ! urlStart )
+ return "/";
+
+ urlStart++;
+
+ const char * end = strchr( urlStart , ' ' );
+ if ( ! end ) {
+ end = strchr( urlStart , '\r' );
+ if ( ! end ) {
+ end = strchr( urlStart , '\n' );
+ }
+ }
+
+ if ( ! end )
+ return "/";
+
+        int diff = (int)(end-urlStart);
+        if ( diff < 0 || diff > 255 )
+            return "/";
+
+        return string( urlStart , diff );
+ }
+
+ void MiniWebServer::parseParams( BSONObj & params , string query ) {
+ if ( query.size() == 0 )
+ return;
+
+ BSONObjBuilder b;
+ while ( query.size() ) {
+
+ string::size_type amp = query.find( "&" );
+
+ string cur;
+ if ( amp == string::npos ) {
+ cur = query;
+ query = "";
+ }
+ else {
+ cur = query.substr( 0 , amp );
+ query = query.substr( amp + 1 );
+ }
+
+ string::size_type eq = cur.find( "=" );
+ if ( eq == string::npos )
+ continue;
+
+ b.append( urlDecode(cur.substr(0,eq)) , urlDecode(cur.substr(eq+1) ) );
+ }
+
+ params = b.obj();
+ }
+
+ string MiniWebServer::parseMethod( const char * headers ) {
+ const char * end = strchr( headers , ' ' );
+ if ( ! end )
+ return "GET";
+ return string( headers , (int)(end-headers) );
+ }
+
+ const char *MiniWebServer::body( const char *buf ) {
+ const char *ret = strstr( buf, "\r\n\r\n" );
+ return ret ? ret + 4 : ret;
+ }
+
+ bool MiniWebServer::fullReceive( const char *buf ) {
+ const char *bod = body( buf );
+ if ( !bod )
+ return false;
+ const char *lenString = "Content-Length:";
+ const char *lengthLoc = strstr( buf, lenString );
+ if ( !lengthLoc )
+ return true;
+ lengthLoc += strlen( lenString );
+ long len = strtol( lengthLoc, 0, 10 );
+ if ( long( strlen( bod ) ) == len )
+ return true;
+ return false;
+ }
+
+ void MiniWebServer::accepted(Socket sock) {
+ sock.postFork();
+ sock.setTimeout(8);
+ char buf[4096];
+ int len = 0;
+ while ( 1 ) {
+ int left = sizeof(buf) - 1 - len;
+ if( left == 0 )
+ break;
+ int x = sock.unsafe_recv( buf + len , left );
+ if ( x <= 0 ) {
+ sock.close();
+ return;
+ }
+ len += x;
+ buf[ len ] = 0;
+ if ( fullReceive( buf ) ) {
+ break;
+ }
+ }
+ buf[len] = 0;
+
+ string responseMsg;
+ int responseCode = 599;
+ vector<string> headers;
+
+ try {
+ doRequest(buf, parseURL( buf ), responseMsg, responseCode, headers, sock.remoteAddr() );
+ }
+ catch ( std::exception& e ) {
+ responseCode = 500;
+ responseMsg = "error loading page: ";
+ responseMsg += e.what();
+ }
+ catch ( ... ) {
+ responseCode = 500;
+ responseMsg = "unknown error loading page";
+ }
+
+ stringstream ss;
+ ss << "HTTP/1.0 " << responseCode;
+ if ( responseCode == 200 ) ss << " OK";
+ ss << "\r\n";
+ if ( headers.empty() ) {
+ ss << "Content-Type: text/html\r\n";
+ }
+ else {
+ for ( vector<string>::iterator i = headers.begin(); i != headers.end(); i++ ) {
+ assert( strncmp("Content-Length", i->c_str(), 14) );
+ ss << *i << "\r\n";
+ }
+ }
+ ss << "Connection: close\r\n";
+ ss << "Content-Length: " << responseMsg.size() << "\r\n";
+ ss << "\r\n";
+ ss << responseMsg;
+ string response = ss.str();
+
+ try {
+ sock.send( response.c_str(), response.size() , "http response" );
+ sock.close();
+ }
+ catch ( SocketException& e ) {
+ log(1) << "couldn't send data to http client: " << e << endl;
+ }
+ }
+
+ string MiniWebServer::getHeader( const char * req , string wanted ) {
+ const char * headers = strchr( req , '\n' );
+ if ( ! headers )
+ return "";
+ pcrecpp::StringPiece input( headers + 1 );
+
+ string name;
+ string val;
+ pcrecpp::RE re("([\\w\\-]+): (.*?)\r?\n");
+ while ( re.Consume( &input, &name, &val) ) {
+ if ( name == wanted )
+ return val;
+ }
+ return "";
+ }
+
+ string MiniWebServer::urlDecode(const char* s) {
+ stringstream out;
+ while(*s) {
+ if (*s == '+') {
+ out << ' ';
+ }
+ else if (*s == '%') {
+ out << fromHex(s+1);
+ s+=2;
+ }
+ else {
+ out << *s;
+ }
+ s++;
+ }
+ return out.str();
+ }
+
+} // namespace mongo
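
A short demonstration of the static helpers above; the request line and query string are made up, but the results follow directly from the parsing code:

    #include "miniwebserver.h"   // sketch only

    void miniWebServerHelpersDemo() {
        BSONObj params;
        MiniWebServer::parseParams( params, "a=1&name=J%20Doe" );
        // params is now { a: "1", name: "J Doe" }: urlDecode() turns %20 (and '+') into a space

        string method = MiniWebServer::parseMethod( "GET /stats HTTP/1.0\r\n" ); // "GET"
        string url    = MiniWebServer::parseURL( "GET /stats HTTP/1.0\r\n" );    // "/stats"
    }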
diff --git a/src/mongo/util/net/miniwebserver.h b/src/mongo/util/net/miniwebserver.h
new file mode 100644
index 00000000000..1fb6b3f2e65
--- /dev/null
+++ b/src/mongo/util/net/miniwebserver.h
@@ -0,0 +1,60 @@
+// miniwebserver.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../../pch.h"
+#include "message.h"
+#include "message_port.h"
+#include "listen.h"
+#include "../../db/jsobj.h"
+
+namespace mongo {
+
+ class MiniWebServer : public Listener {
+ public:
+ MiniWebServer(const string& name, const string &ip, int _port);
+ virtual ~MiniWebServer() {}
+
+ virtual void doRequest(
+ const char *rq, // the full request
+ string url,
+ // set these and return them:
+ string& responseMsg,
+ int& responseCode,
+ vector<string>& headers, // if completely empty, content-type: text/html will be added
+ const SockAddr &from
+ ) = 0;
+
+ // --- static helpers ----
+
+ static void parseParams( BSONObj & params , string query );
+
+ static string parseURL( const char * buf );
+ static string parseMethod( const char * headers );
+ static string getHeader( const char * headers , string name );
+ static const char *body( const char *buf );
+
+ static string urlDecode(const char* s);
+ static string urlDecode(string s) {return urlDecode(s.c_str());}
+
+ private:
+ void accepted(Socket socket);
+ static bool fullReceive( const char *buf );
+ };
+
+} // namespace mongo
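
A hedged subclass sketch to show the doRequest() contract; StatusPage, its port, and its response body are invented, and initAndListen() is the Listener entry point seen in PortMessageServer::run() above:

    class StatusPage : public MiniWebServer {          // hypothetical example page
    public:
        StatusPage() : MiniWebServer( "StatusPage", "0.0.0.0", 28017 ) {}
        virtual void doRequest( const char *rq, string url,
                                string& responseMsg, int& responseCode,
                                vector<string>& headers, const SockAddr &from ) {
            responseCode = 200;
            responseMsg = "<html><body>ok</body></html>";
            // headers left empty, so accepted() adds "Content-Type: text/html"
        }
    };
    // StatusPage page; page.initAndListen();          // serve until shutdown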
diff --git a/src/mongo/util/net/sock.cpp b/src/mongo/util/net/sock.cpp
new file mode 100644
index 00000000000..bd08e6c64b9
--- /dev/null
+++ b/src/mongo/util/net/sock.cpp
@@ -0,0 +1,763 @@
+// @file sock.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "sock.h"
+#include "../background.h"
+#include "../concurrency/value.h"
+#include "../mongoutils/str.h"
+
+#if !defined(_WIN32)
+# include <sys/socket.h>
+# include <sys/types.h>
+# include <sys/socket.h>
+# include <sys/un.h>
+# include <netinet/in.h>
+# include <netinet/tcp.h>
+# include <arpa/inet.h>
+# include <errno.h>
+# include <netdb.h>
+# if defined(__openbsd__)
+# include <sys/uio.h>
+# endif
+#endif
+
+#ifdef MONGO_SSL
+#include <openssl/err.h>
+#include <openssl/ssl.h>
+#endif
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ void dynHostResolve(string& name, int& port);
+ string dynHostMyName();
+
+ static bool ipv6 = false;
+ void enableIPv6(bool state) { ipv6 = state; }
+ bool IPv6Enabled() { return ipv6; }
+
+ void setSockTimeouts(int sock, double secs) {
+ struct timeval tv;
+ tv.tv_sec = (int)secs;
+ tv.tv_usec = (int)((long long)(secs*1000*1000) % (1000*1000));
+ bool report = logLevel > 3; // solaris doesn't provide these
+ DEV report = true;
+#if defined(_WIN32)
+ tv.tv_sec *= 1000; // Windows timeout is a DWORD, in milliseconds.
+ int status = setsockopt( sock, SOL_SOCKET, SO_RCVTIMEO, (char *) &tv.tv_sec, sizeof(DWORD) ) == 0;
+ if( report && (status == SOCKET_ERROR) ) log() << "unable to set SO_RCVTIMEO" << endl;
+ status = setsockopt( sock, SOL_SOCKET, SO_SNDTIMEO, (char *) &tv.tv_sec, sizeof(DWORD) ) == 0;
+ DEV if( report && (status == SOCKET_ERROR) ) log() << "unable to set SO_SNDTIMEO" << endl;
+#else
+ bool ok = setsockopt( sock, SOL_SOCKET, SO_RCVTIMEO, (char *) &tv, sizeof(tv) ) == 0;
+ if( report && !ok ) log() << "unable to set SO_RCVTIMEO" << endl;
+ ok = setsockopt( sock, SOL_SOCKET, SO_SNDTIMEO, (char *) &tv, sizeof(tv) ) == 0;
+ DEV if( report && !ok ) log() << "unable to set SO_SNDTIMEO" << endl;
+#endif
+ }
+
+#if defined(_WIN32)
+ void disableNagle(int sock) {
+ int x = 1;
+ if ( setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, (char *) &x, sizeof(x)) )
+ error() << "disableNagle failed" << endl;
+ if ( setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *) &x, sizeof(x)) )
+ error() << "SO_KEEPALIVE failed" << endl;
+ }
+#else
+
+ void disableNagle(int sock) {
+ int x = 1;
+
+#ifdef SOL_TCP
+ int level = SOL_TCP;
+#else
+ int level = SOL_SOCKET;
+#endif
+
+ if ( setsockopt(sock, level, TCP_NODELAY, (char *) &x, sizeof(x)) )
+ error() << "disableNagle failed: " << errnoWithDescription() << endl;
+
+#ifdef SO_KEEPALIVE
+ if ( setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *) &x, sizeof(x)) )
+ error() << "SO_KEEPALIVE failed: " << errnoWithDescription() << endl;
+
+# ifdef __linux__
+ socklen_t len = sizeof(x);
+ if ( getsockopt(sock, level, TCP_KEEPIDLE, (char *) &x, &len) )
+ error() << "can't get TCP_KEEPIDLE: " << errnoWithDescription() << endl;
+
+ if (x > 300) {
+ x = 300;
+ if ( setsockopt(sock, level, TCP_KEEPIDLE, (char *) &x, sizeof(x)) ) {
+ error() << "can't set TCP_KEEPIDLE: " << errnoWithDescription() << endl;
+ }
+ }
+
+ len = sizeof(x); // just in case it changed
+ if ( getsockopt(sock, level, TCP_KEEPINTVL, (char *) &x, &len) )
+ error() << "can't get TCP_KEEPINTVL: " << errnoWithDescription() << endl;
+
+ if (x > 300) {
+ x = 300;
+ if ( setsockopt(sock, level, TCP_KEEPINTVL, (char *) &x, sizeof(x)) ) {
+ error() << "can't set TCP_KEEPINTVL: " << errnoWithDescription() << endl;
+ }
+ }
+# endif
+#endif
+
+ }
+
+#endif
+
+ string getAddrInfoStrError(int code) {
+#if !defined(_WIN32)
+ return gai_strerror(code);
+#else
+ /* gai_strerrorA is not threadsafe on windows. don't use it. */
+ return errnoWithDescription(code);
+#endif
+ }
+
+
+ // --- SockAddr
+
+ SockAddr::SockAddr(int sourcePort) {
+ memset(as<sockaddr_in>().sin_zero, 0, sizeof(as<sockaddr_in>().sin_zero));
+ as<sockaddr_in>().sin_family = AF_INET;
+ as<sockaddr_in>().sin_port = htons(sourcePort);
+ as<sockaddr_in>().sin_addr.s_addr = htonl(INADDR_ANY);
+ addressSize = sizeof(sockaddr_in);
+ }
+
+ SockAddr::SockAddr(const char * _iporhost , int port) {
+ string target = _iporhost;
+ bool cloudName = *_iporhost == '#';
+ if( target == "localhost" ) {
+ target = "127.0.0.1";
+ }
+ else if( cloudName ) {
+ dynHostResolve(target, port);
+ }
+
+ if( str::contains(target, '/') ) {
+#ifdef _WIN32
+ uassert(13080, "no unix socket support on windows", false);
+#endif
+ uassert(13079, "path to unix socket too long", target.size() < sizeof(as<sockaddr_un>().sun_path));
+ as<sockaddr_un>().sun_family = AF_UNIX;
+ strcpy(as<sockaddr_un>().sun_path, target.c_str());
+ addressSize = sizeof(sockaddr_un);
+ }
+ else {
+ addrinfo* addrs = NULL;
+ addrinfo hints;
+ memset(&hints, 0, sizeof(addrinfo));
+ hints.ai_socktype = SOCK_STREAM;
+ //hints.ai_flags = AI_ADDRCONFIG; // This is often recommended but don't do it. SERVER-1579
+ hints.ai_flags |= AI_NUMERICHOST; // first pass tries w/o DNS lookup
+ hints.ai_family = (IPv6Enabled() ? AF_UNSPEC : AF_INET);
+
+ StringBuilder ss;
+ ss << port;
+ int ret = getaddrinfo(target.c_str(), ss.str().c_str(), &hints, &addrs);
+
+ // old C compilers on IPv6-capable hosts return EAI_NODATA error
+#ifdef EAI_NODATA
+ int nodata = (ret == EAI_NODATA);
+#else
+ int nodata = false;
+#endif
+ if ( (ret == EAI_NONAME || nodata) && !cloudName ) {
+ // iporhost isn't an IP address, allow DNS lookup
+ hints.ai_flags &= ~AI_NUMERICHOST;
+ ret = getaddrinfo(target.c_str(), ss.str().c_str(), &hints, &addrs);
+ }
+
+ if (ret) {
+ // we were unsuccessful
+ if( target != "0.0.0.0" ) { // don't log if this as it is a CRT construction and log() may not work yet.
+ log() << "getaddrinfo(\"" << target << "\") failed: " << gai_strerror(ret) << endl;
+ }
+ *this = SockAddr(port);
+ }
+ else {
+            // TODO: handle the other addresses in the linked list
+ assert(addrs->ai_addrlen <= sizeof(sa));
+ memcpy(&sa, addrs->ai_addr, addrs->ai_addrlen);
+ addressSize = addrs->ai_addrlen;
+ freeaddrinfo(addrs);
+ }
+ }
+ }
+
+ bool SockAddr::isLocalHost() const {
+ switch (getType()) {
+ case AF_INET: return getAddr() == "127.0.0.1";
+ case AF_INET6: return getAddr() == "::1";
+ case AF_UNIX: return true;
+ default: return false;
+ }
+ assert(false);
+ return false;
+ }
+
+ string SockAddr::toString(bool includePort) const {
+ string out = getAddr();
+ if (includePort && getType() != AF_UNIX && getType() != AF_UNSPEC)
+ out += mongoutils::str::stream() << ':' << getPort();
+ return out;
+ }
+
+ sa_family_t SockAddr::getType() const {
+ return sa.ss_family;
+ }
+
+ unsigned SockAddr::getPort() const {
+ switch (getType()) {
+ case AF_INET: return ntohs(as<sockaddr_in>().sin_port);
+ case AF_INET6: return ntohs(as<sockaddr_in6>().sin6_port);
+ case AF_UNIX: return 0;
+ case AF_UNSPEC: return 0;
+ default: massert(SOCK_FAMILY_UNKNOWN_ERROR, "unsupported address family", false); return 0;
+ }
+ }
+
+ string SockAddr::getAddr() const {
+ switch (getType()) {
+ case AF_INET:
+ case AF_INET6: {
+ const int buflen=128;
+ char buffer[buflen];
+ int ret = getnameinfo(raw(), addressSize, buffer, buflen, NULL, 0, NI_NUMERICHOST);
+ massert(13082, str::stream() << "getnameinfo error " << getAddrInfoStrError(ret), ret == 0);
+ return buffer;
+ }
+
+ case AF_UNIX: return (addressSize > 2 ? as<sockaddr_un>().sun_path : "anonymous unix socket");
+ case AF_UNSPEC: return "(NONE)";
+ default: massert(SOCK_FAMILY_UNKNOWN_ERROR, "unsupported address family", false); return "";
+ }
+ }
+
+ bool SockAddr::operator==(const SockAddr& r) const {
+ if (getType() != r.getType())
+ return false;
+
+ if (getPort() != r.getPort())
+ return false;
+
+ switch (getType()) {
+ case AF_INET: return as<sockaddr_in>().sin_addr.s_addr == r.as<sockaddr_in>().sin_addr.s_addr;
+ case AF_INET6: return memcmp(as<sockaddr_in6>().sin6_addr.s6_addr, r.as<sockaddr_in6>().sin6_addr.s6_addr, sizeof(in6_addr)) == 0;
+ case AF_UNIX: return strcmp(as<sockaddr_un>().sun_path, r.as<sockaddr_un>().sun_path) == 0;
+ case AF_UNSPEC: return true; // assume all unspecified addresses are the same
+ default: massert(SOCK_FAMILY_UNKNOWN_ERROR, "unsupported address family", false);
+ }
+ return false;
+ }
+
+ bool SockAddr::operator!=(const SockAddr& r) const {
+ return !(*this == r);
+ }
+
+ bool SockAddr::operator<(const SockAddr& r) const {
+ if (getType() < r.getType())
+ return true;
+ else if (getType() > r.getType())
+ return false;
+
+ if (getPort() < r.getPort())
+ return true;
+ else if (getPort() > r.getPort())
+ return false;
+
+ switch (getType()) {
+ case AF_INET: return as<sockaddr_in>().sin_addr.s_addr < r.as<sockaddr_in>().sin_addr.s_addr;
+ case AF_INET6: return memcmp(as<sockaddr_in6>().sin6_addr.s6_addr, r.as<sockaddr_in6>().sin6_addr.s6_addr, sizeof(in6_addr)) < 0;
+ case AF_UNIX: return strcmp(as<sockaddr_un>().sun_path, r.as<sockaddr_un>().sun_path) < 0;
+ case AF_UNSPEC: return false;
+ default: massert(SOCK_FAMILY_UNKNOWN_ERROR, "unsupported address family", false);
+ }
+ return false;
+ }
+
+ SockAddr unknownAddress( "0.0.0.0", 0 );
+
+ // If an ip address is passed in, just return that. If a hostname is passed
+ // in, look up its ip and return that. Returns "" on failure.
+ string hostbyname(const char *hostname) {
+ if( *hostname == '#' ) {
+ string s = hostname;
+ int port;
+ dynHostResolve(s, port);
+ return s;
+ }
+
+ string addr = SockAddr(hostname, 0).getAddr();
+ if (addr == "0.0.0.0")
+ return "";
+ else
+ return addr;
+ }
+
+    // --- my hostname ---
+
+ DiagStr _hostNameCached;
+
+ string getHostName() {
+ {
+ string s = dynHostMyName();
+ if( !s.empty() )
+ return s;
+ }
+
+ char buf[256];
+ int ec = gethostname(buf, 127);
+ if ( ec || *buf == 0 ) {
+ log() << "can't get this server's hostname " << errnoWithDescription() << endl;
+ return "";
+ }
+ return buf;
+ }
+
+ static void _hostNameCachedInit() {
+ _hostNameCached = getHostName();
+ }
+ boost::once_flag _hostNameCachedInitFlags = BOOST_ONCE_INIT;
+
+ /** we store our host name once */
+ // ok w dynhosts map?
+ string getHostNameCached() {
+ boost::call_once( _hostNameCachedInit , _hostNameCachedInitFlags );
+ return _hostNameCached;
+ }
+
+ // --------- SocketException ----------
+
+#ifdef MSG_NOSIGNAL
+ const int portSendFlags = MSG_NOSIGNAL;
+ const int portRecvFlags = MSG_NOSIGNAL;
+#else
+ const int portSendFlags = 0;
+ const int portRecvFlags = 0;
+#endif
+
+ string SocketException::toString() const {
+ stringstream ss;
+ ss << _ei.code << " socket exception [" << _type << "] ";
+
+ if ( _server.size() )
+ ss << "server [" << _server << "] ";
+
+ if ( _extra.size() )
+ ss << _extra;
+
+ return ss.str();
+ }
+
+
+ // ------------ SSLManager -----------------
+
+#ifdef MONGO_SSL
+ SSLManager::SSLManager( bool client ) {
+ _client = client;
+ SSL_library_init();
+ SSL_load_error_strings();
+ ERR_load_crypto_strings();
+
+ _context = SSL_CTX_new( client ? SSLv23_client_method() : SSLv23_server_method() );
+ massert( 15864 , mongoutils::str::stream() << "can't create SSL Context: " << ERR_error_string(ERR_get_error(), NULL) , _context );
+
+ SSL_CTX_set_options( _context, SSL_OP_ALL);
+ }
+
+ void SSLManager::setupPubPriv( const string& privateKeyFile , const string& publicKeyFile ) {
+ massert( 15865 ,
+ mongoutils::str::stream() << "Can't read SSL certificate from file "
+ << publicKeyFile << ":" << ERR_error_string(ERR_get_error(), NULL) ,
+ SSL_CTX_use_certificate_file(_context, publicKeyFile.c_str(), SSL_FILETYPE_PEM) );
+
+
+ massert( 15866 ,
+ mongoutils::str::stream() << "Can't read SSL private key from file "
+ << privateKeyFile << " : " << ERR_error_string(ERR_get_error(), NULL) ,
+ SSL_CTX_use_PrivateKey_file(_context, privateKeyFile.c_str(), SSL_FILETYPE_PEM) );
+ }
+
+
+    int SSLManager::password_cb(char *buf, int num, int rwflag, void *userdata) {
+        SSLManager* sm = (SSLManager*)userdata;
+        const string& pass = sm->_password;
+        if ( num < 1 ) return 0;
+        // copy at most num-1 bytes: OpenSSL only guarantees a buffer of num bytes
+        size_t len = pass.size();
+        if ( len > size_t(num - 1) ) len = size_t(num - 1);
+        memcpy( buf, pass.c_str(), len );
+        buf[len] = 0;
+        return (int)len;
+    }
+
+ void SSLManager::setupPEM( const string& keyFile , const string& password ) {
+ _password = password;
+
+ massert( 15867 , "Can't read certificate file" , SSL_CTX_use_certificate_chain_file( _context , keyFile.c_str() ) );
+
+ SSL_CTX_set_default_passwd_cb_userdata( _context , this );
+ SSL_CTX_set_default_passwd_cb( _context, &SSLManager::password_cb );
+
+ massert( 15868 , "Can't read key file" , SSL_CTX_use_PrivateKey_file( _context , keyFile.c_str() , SSL_FILETYPE_PEM ) );
+ }
+
+ SSL * SSLManager::secure( int fd ) {
+ SSL * ssl = SSL_new( _context );
+ massert( 15861 , "can't create SSL" , ssl );
+ SSL_set_fd( ssl , fd );
+ return ssl;
+ }
+
+
+#endif
+
+ // ------------ Socket -----------------
+
+ Socket::Socket(int fd , const SockAddr& remote) :
+ _fd(fd), _remote(remote), _timeout(0) {
+ _logLevel = 0;
+ _init();
+ }
+
+ Socket::Socket( double timeout, int ll ) {
+ _logLevel = ll;
+ _fd = -1;
+ _timeout = timeout;
+ _init();
+ }
+
+ void Socket::_init() {
+ _bytesOut = 0;
+ _bytesIn = 0;
+#ifdef MONGO_SSL
+ _sslAccepted = 0;
+#endif
+ }
+
+ void Socket::close() {
+#ifdef MONGO_SSL
+ _ssl.reset();
+#endif
+ if ( _fd >= 0 ) {
+ closesocket( _fd );
+ _fd = -1;
+ }
+ }
+
+#ifdef MONGO_SSL
+ void Socket::secure( SSLManager * ssl ) {
+ assert( ssl );
+ assert( _fd >= 0 );
+ _ssl.reset( ssl->secure( _fd ) );
+ SSL_connect( _ssl.get() );
+ }
+
+ void Socket::secureAccepted( SSLManager * ssl ) {
+ _sslAccepted = ssl;
+ }
+#endif
+
+ void Socket::postFork() {
+#ifdef MONGO_SSL
+ if ( _sslAccepted ) {
+ assert( _fd );
+ _ssl.reset( _sslAccepted->secure( _fd ) );
+ SSL_accept( _ssl.get() );
+ _sslAccepted = 0;
+ }
+#endif
+ }
+
+ class ConnectBG : public BackgroundJob {
+ public:
+ ConnectBG(int sock, SockAddr remote) : _sock(sock), _remote(remote) { }
+
+ void run() { _res = ::connect(_sock, _remote.raw(), _remote.addressSize); }
+ string name() const { return "ConnectBG"; }
+ int inError() const { return _res; }
+
+ private:
+ int _sock;
+ int _res;
+ SockAddr _remote;
+ };
+
+ bool Socket::connect(SockAddr& remote) {
+ _remote = remote;
+
+ _fd = socket(remote.getType(), SOCK_STREAM, 0);
+ if ( _fd == INVALID_SOCKET ) {
+ log(_logLevel) << "ERROR: connect invalid socket " << errnoWithDescription() << endl;
+ return false;
+ }
+
+ if ( _timeout > 0 ) {
+ setTimeout( _timeout );
+ }
+
+ ConnectBG bg(_fd, remote);
+ bg.go();
+ if ( bg.wait(5000) ) {
+ if ( bg.inError() ) {
+ close();
+ return false;
+ }
+ }
+ else {
+ // time out the connect
+ close();
+ bg.wait(); // so bg stays in scope until bg thread terminates
+ return false;
+ }
+
+ if (remote.getType() != AF_UNIX)
+ disableNagle(_fd);
+
+#ifdef SO_NOSIGPIPE
+ // osx
+ const int one = 1;
+ setsockopt( _fd , SOL_SOCKET, SO_NOSIGPIPE, &one, sizeof(int));
+#endif
+
+ return true;
+ }
+
+ int Socket::_send( const char * data , int len ) {
+#ifdef MONGO_SSL
+ if ( _ssl ) {
+ return SSL_write( _ssl.get() , data , len );
+ }
+#endif
+ return ::send( _fd , data , len , portSendFlags );
+ }
+
+ bool Socket::stillConnected() {
+#ifdef MONGO_SSL
+ DEV log() << "TODO stillConnected() w/SSL" << endl;
+#else
+ int r = _send("", 0);
+ if( r < 0 ) {
+#if defined(_WIN32)
+ if ( WSAGetLastError() == WSAETIMEDOUT ) {
+#else
+ if ( ( errno == EAGAIN || errno == EWOULDBLOCK ) ) {
+#endif
+ ;
+ }
+ else {
+ return false;
+ }
+ }
+#endif
+ return true;
+ }
+
+ // sends all data or throws an exception
+ void Socket::send( const char * data , int len, const char *context ) {
+ while( len > 0 ) {
+ int ret = _send( data , len );
+ if ( ret == -1 ) {
+
+#ifdef MONGO_SSL
+ if ( _ssl ) {
+ log() << "SSL Error ret: " << ret << " err: " << SSL_get_error( _ssl.get() , ret )
+ << " " << ERR_error_string(ERR_get_error(), NULL)
+ << endl;
+ }
+#endif
+
+#if defined(_WIN32)
+ if ( WSAGetLastError() == WSAETIMEDOUT && _timeout != 0 ) {
+#else
+ if ( ( errno == EAGAIN || errno == EWOULDBLOCK ) && _timeout != 0 ) {
+#endif
+ log(_logLevel) << "Socket " << context << " send() timed out " << _remote.toString() << endl;
+ throw SocketException( SocketException::SEND_TIMEOUT , remoteString() );
+ }
+ else {
+ SocketException::Type t = SocketException::SEND_ERROR;
+ log(_logLevel) << "Socket " << context << " send() "
+ << errnoWithDescription() << ' ' << remoteString() << endl;
+ throw SocketException( t , remoteString() );
+ }
+ }
+ else {
+ _bytesOut += ret;
+
+ assert( ret <= len );
+ len -= ret;
+ data += ret;
+ }
+ }
+ }
+
+ void Socket::_send( const vector< pair< char *, int > > &data, const char *context ) {
+ for( vector< pair< char *, int > >::const_iterator i = data.begin(); i != data.end(); ++i ) {
+ char * data = i->first;
+ int len = i->second;
+ send( data, len, context );
+ }
+ }
+
+ /** sends all data or throws an exception
+ * @param context descriptive for logging
+ */
+ void Socket::send( const vector< pair< char *, int > > &data, const char *context ) {
+
+#ifdef MONGO_SSL
+ if ( _ssl ) {
+ _send( data , context );
+ return;
+ }
+#endif
+
+#if defined(_WIN32)
+ // TODO use scatter/gather api
+ _send( data , context );
+#else
+ vector< struct iovec > d( data.size() );
+ int i = 0;
+ for( vector< pair< char *, int > >::const_iterator j = data.begin(); j != data.end(); ++j ) {
+ if ( j->second > 0 ) {
+ d[ i ].iov_base = j->first;
+ d[ i ].iov_len = j->second;
+ ++i;
+ _bytesOut += j->second;
+ }
+ }
+ struct msghdr meta;
+ memset( &meta, 0, sizeof( meta ) );
+ meta.msg_iov = &d[ 0 ];
+ meta.msg_iovlen = d.size();
+
+ while( meta.msg_iovlen > 0 ) {
+ int ret = ::sendmsg( _fd , &meta , portSendFlags );
+ if ( ret == -1 ) {
+ if ( errno != EAGAIN || _timeout == 0 ) {
+ log(_logLevel) << "Socket " << context << " send() " << errnoWithDescription() << ' ' << remoteString() << endl;
+ throw SocketException( SocketException::SEND_ERROR , remoteString() );
+ }
+ else {
+ log(_logLevel) << "Socket " << context << " send() remote timeout " << remoteString() << endl;
+ throw SocketException( SocketException::SEND_TIMEOUT , remoteString() );
+ }
+ }
+ else {
+ struct iovec *& i = meta.msg_iov;
+ while( ret > 0 ) {
+ if ( i->iov_len > unsigned( ret ) ) {
+ i->iov_len -= ret;
+ i->iov_base = (char*)(i->iov_base) + ret;
+ ret = 0;
+ }
+ else {
+ ret -= i->iov_len;
+ ++i;
+ --(meta.msg_iovlen);
+ }
+ }
+ }
+ }
+#endif
+ }
+
+ void Socket::recv( char * buf , int len ) {
+ unsigned retries = 0;
+ while( len > 0 ) {
+ int ret = unsafe_recv( buf , len );
+ if ( ret > 0 ) {
+ if ( len <= 4 && ret != len )
+ log(_logLevel) << "Socket recv() got " << ret << " bytes wanted len=" << len << endl;
+ assert( ret <= len );
+ len -= ret;
+ buf += ret;
+ }
+ else if ( ret == 0 ) {
+ log(3) << "Socket recv() conn closed? " << remoteString() << endl;
+ throw SocketException( SocketException::CLOSED , remoteString() );
+ }
+ else { /* ret < 0 */
+#if defined(_WIN32)
+ int e = WSAGetLastError();
+#else
+ int e = errno;
+# if defined(EINTR)
+ if( e == EINTR ) {
+ if( ++retries == 1 ) {
+ log() << "EINTR retry" << endl;
+ continue;
+ }
+ }
+# endif
+#endif
+ if ( ( e == EAGAIN
+#if defined(_WIN32)
+ || e == WSAETIMEDOUT
+#endif
+ ) && _timeout > 0 )
+ {
+ // this is a timeout
+ log(_logLevel) << "Socket recv() timeout " << remoteString() <<endl;
+ throw SocketException( SocketException::RECV_TIMEOUT, remoteString() );
+ }
+
+ log(_logLevel) << "Socket recv() " << errnoWithDescription(e) << " " << remoteString() <<endl;
+ throw SocketException( SocketException::RECV_ERROR , remoteString() );
+ }
+ }
+ }
+
+ int Socket::unsafe_recv( char *buf, int max ) {
+ int x = _recv( buf , max );
+ _bytesIn += x;
+ return x;
+ }
+
+
+ int Socket::_recv( char *buf, int max ) {
+#ifdef MONGO_SSL
+ if ( _ssl ){
+ return SSL_read( _ssl.get() , buf , max );
+ }
+#endif
+ return ::recv( _fd , buf , max , portRecvFlags );
+ }
+
+ void Socket::setTimeout( double secs ) {
+ setSockTimeouts( _fd, secs );
+ }
+
+#if defined(_WIN32)
+ struct WinsockInit {
+ WinsockInit() {
+ WSADATA d;
+ if ( WSAStartup(MAKEWORD(2,2), &d) != 0 ) {
+ out() << "ERROR: wsastartup failed " << errnoWithDescription() << endl;
+ problem() << "ERROR: wsastartup failed " << errnoWithDescription() << endl;
+ dbexit( EXIT_NTSERVICE_ERROR );
+ }
+ }
+ } winsock_init;
+#endif
+
+} // namespace mongo
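
A client-side sketch of the Socket API defined here; the endpoint and payload are placeholders, and the error paths mirror what send()/recv() above actually throw:

    bool pingSketch() {
        SockAddr remote( "127.0.0.1", 27017 );            // placeholder endpoint
        Socket sock( 10.0 );                              // 10s timeout via setSockTimeouts()
        if ( !sock.connect( remote ) )
            return false;                                 // connect() already logged why
        try {
            const char msg[] = "ping";
            sock.send( msg, sizeof(msg), "pingSketch" );  // throws SocketException on error
            char reply[4];
            sock.recv( reply, sizeof(reply) );            // blocks until exactly 4 bytes
        }
        catch ( SocketException& e ) {
            log() << "pingSketch: " << e.toString() << endl;
            return false;
        }
        sock.close();
        return true;
    }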
diff --git a/src/mongo/util/net/sock.h b/src/mongo/util/net/sock.h
new file mode 100644
index 00000000000..2053768cbd5
--- /dev/null
+++ b/src/mongo/util/net/sock.h
@@ -0,0 +1,261 @@
+// @file sock.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../../pch.h"
+
+#include <stdio.h>
+#include <sstream>
+#include "../goodies.h"
+#include "../../db/cmdline.h"
+#include "../mongoutils/str.h"
+
+#ifndef _WIN32
+
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <errno.h>
+
+#ifdef __openbsd__
+# include <sys/uio.h>
+#endif
+
+#endif // _WIN32
+
+#ifdef MONGO_SSL
+#include <openssl/ssl.h>
+#endif
+
+namespace mongo {
+
+ const int SOCK_FAMILY_UNKNOWN_ERROR=13078;
+
+ void disableNagle(int sock);
+
+#if defined(_WIN32)
+
+ typedef short sa_family_t;
+ typedef int socklen_t;
+
+ // This won't actually be used on windows
+ struct sockaddr_un {
+ short sun_family;
+ char sun_path[108]; // length from unix header
+ };
+
+#else // _WIN32
+
+ inline void closesocket(int s) { close(s); }
+ const int INVALID_SOCKET = -1;
+ typedef int SOCKET;
+
+#endif // _WIN32
+
+ inline string makeUnixSockPath(int port) {
+ return mongoutils::str::stream() << cmdLine.socket << "/mongodb-" << port << ".sock";
+ }
+
+ // If an ip address is passed in, just return that. If a hostname is passed
+ // in, look up its ip and return that. Returns "" on failure.
+ string hostbyname(const char *hostname);
+
+ void enableIPv6(bool state=true);
+ bool IPv6Enabled();
+ void setSockTimeouts(int sock, double secs);
+
+ /**
+     * wrapper around the OS representation of a network address
+ */
+ struct SockAddr {
+ SockAddr() {
+ addressSize = sizeof(sa);
+ memset(&sa, 0, sizeof(sa));
+ sa.ss_family = AF_UNSPEC;
+ }
+ SockAddr(int sourcePort); /* listener side */
+ SockAddr(const char *ip, int port); /* EndPoint (remote) side, or if you want to specify which interface locally */
+
+ template <typename T> T& as() { return *(T*)(&sa); }
+ template <typename T> const T& as() const { return *(const T*)(&sa); }
+
+ string toString(bool includePort=true) const;
+
+ /**
+ * @return one of AF_INET, AF_INET6, or AF_UNIX
+ */
+ sa_family_t getType() const;
+
+ unsigned getPort() const;
+
+ string getAddr() const;
+
+ bool isLocalHost() const;
+
+ bool operator==(const SockAddr& r) const;
+
+ bool operator!=(const SockAddr& r) const;
+
+ bool operator<(const SockAddr& r) const;
+
+ const sockaddr* raw() const {return (sockaddr*)&sa;}
+ sockaddr* raw() {return (sockaddr*)&sa;}
+
+ socklen_t addressSize;
+ private:
+ struct sockaddr_storage sa;
+ };
+
+ extern SockAddr unknownAddress; // ( "0.0.0.0", 0 )
+
+    /** this is not cached and does a syscall */
+ string getHostName();
+
+    /** this is cached, so if the host name changes during the process
+     * lifetime the value will be stale */
+ string getHostNameCached();
+
+ /**
+ * thrown by Socket and SockAddr
+ */
+ class SocketException : public DBException {
+ public:
+ const enum Type { CLOSED , RECV_ERROR , SEND_ERROR, RECV_TIMEOUT, SEND_TIMEOUT, FAILED_STATE, CONNECT_ERROR } _type;
+
+ SocketException( Type t , string server , int code = 9001 , string extra="" )
+ : DBException( "socket exception" , code ) , _type(t) , _server(server), _extra(extra){ }
+ virtual ~SocketException() throw() {}
+
+ bool shouldPrint() const { return _type != CLOSED; }
+ virtual string toString() const;
+
+ private:
+ string _server;
+ string _extra;
+ };
+
+#ifdef MONGO_SSL
+ class SSLManager : boost::noncopyable {
+ public:
+ SSLManager( bool client );
+
+ void setupPEM( const string& keyFile , const string& password );
+ void setupPubPriv( const string& privateKeyFile , const string& publicKeyFile );
+
+ /**
+ * creates an SSL context to be used for this file descriptor
+ * caller should delete
+ */
+ SSL * secure( int fd );
+
+ static int password_cb( char *buf,int num, int rwflag,void *userdata );
+
+ private:
+ bool _client;
+ SSL_CTX* _context;
+ string _password;
+ };
+#endif
+
+ /**
+     * thin wrapper around a file descriptor and its system calls
+     * (ssl is handled via SSLManager when built with MONGO_SSL)
+ */
+ class Socket {
+ public:
+ Socket(int sock, const SockAddr& farEnd);
+
+ /** In some cases the timeout will actually be 2x this value - eg we do a partial send,
+ then the timeout fires, then we try to send again, then the timeout fires again with
+ no data sent, then we detect that the other side is down.
+
+            Generally you don't want a timeout; if you do set one, be prepared to handle errors.
+ */
+ Socket(double so_timeout = 0, int logLevel = 0 );
+
+ bool connect(SockAddr& farEnd);
+ void close();
+
+ void send( const char * data , int len, const char *context );
+ void send( const vector< pair< char *, int > > &data, const char *context );
+
+ // recv len or throw SocketException
+ void recv( char * data , int len );
+ int unsafe_recv( char *buf, int max );
+
+ int getLogLevel() const { return _logLevel; }
+ void setLogLevel( int ll ) { _logLevel = ll; }
+
+ SockAddr remoteAddr() const { return _remote; }
+ string remoteString() const { return _remote.toString(); }
+ unsigned remotePort() const { return _remote.getPort(); }
+
+ void clearCounters() { _bytesIn = 0; _bytesOut = 0; }
+ long long getBytesIn() const { return _bytesIn; }
+ long long getBytesOut() const { return _bytesOut; }
+
+ void setTimeout( double secs );
+
+ bool stillConnected();
+
+#ifdef MONGO_SSL
+ /** secures inline */
+ void secure( SSLManager * ssl );
+
+ void secureAccepted( SSLManager * ssl );
+#endif
+
+ /**
+ * call this after a fork for server sockets
+ */
+ void postFork();
+
+ private:
+ void _init();
+
+ /** raw send, same semantics as ::send */
+ public:
+ int _send( const char * data , int len );
+ private:
+
+ /** sends dumbly, just each buffer at a time */
+ void _send( const vector< pair< char *, int > > &data, const char *context );
+
+ /** raw recv, same semantics as ::recv */
+ int _recv( char * buf , int max );
+
+ int _fd;
+ SockAddr _remote;
+ double _timeout;
+
+ long long _bytesIn;
+ long long _bytesOut;
+
+#ifdef MONGO_SSL
+ shared_ptr<SSL> _ssl;
+ SSLManager * _sslAccepted;
+#endif
+
+ protected:
+ int _logLevel; // passed to log() when logging errors
+
+ };
+
+
+} // namespace mongo
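
SockAddr's comparison operators above make it usable as a map key; a small sketch of the value semantics (addresses are placeholders):

    SockAddr a( "127.0.0.1", 27017 );
    SockAddr b( "localhost", 27017 );   // the constructor rewrites localhost to 127.0.0.1
    assert( a == b );                   // family, port, and address all compare equal
    assert( a.isLocalHost() );
    log() << a.toString() << endl;      // "127.0.0.1:27017"

    std::map<SockAddr,int> connCounts;  // operator< makes this ordering well defined
    connCounts[a]++;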
diff --git a/src/mongo/util/ntservice.cpp b/src/mongo/util/ntservice.cpp
new file mode 100644
index 00000000000..93cfd4a2de0
--- /dev/null
+++ b/src/mongo/util/ntservice.cpp
@@ -0,0 +1,408 @@
+// ntservice.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "ntservice.h"
+#include "../db/client.h"
+#include "winutil.h"
+#include "text.h"
+#include <direct.h>
+
+#if defined(_WIN32)
+
+namespace mongo {
+
+ void shutdownServer();
+
+ SERVICE_STATUS_HANDLE ServiceController::_statusHandle = NULL;
+ std::wstring ServiceController::_serviceName;
+ ServiceCallback ServiceController::_serviceCallback = NULL;
+
+ ServiceController::ServiceController() {}
+
+ bool initService();
+
+ // returns true if the service is started.
+ bool serviceParamsCheck( boost::program_options::variables_map& params, const std::string dbpath, int argc, char* argv[] ) {
+ bool installService = false;
+ bool removeService = false;
+ bool reinstallService = false;
+ bool startService = false;
+
+ std::wstring windowsServiceName = L"MongoDB";
+ std::wstring windowsServiceDisplayName = L"Mongo DB";
+ std::wstring windowsServiceDescription = L"Mongo DB Server";
+ std::wstring windowsServiceUser = L"";
+ std::wstring windowsServicePassword = L"";
+
+ if (params.count("install")) {
+ if ( ! params.count( "logpath" ) ) {
+ cerr << "--install has to be used with --logpath" << endl;
+ ::exit(-1);
+ }
+ installService = true;
+ }
+ if (params.count("reinstall")) {
+ if ( ! params.count( "logpath" ) ) {
+ cerr << "--reinstall has to be used with --logpath" << endl;
+ ::exit(-1);
+ }
+ reinstallService = true;
+ }
+ if (params.count("remove")) {
+ removeService = true;
+ }
+ if (params.count("service")) {
+ startService = true;
+ }
+
+ if (params.count("serviceName")) {
+ string x = params["serviceName"].as<string>();
+ windowsServiceName = wstring(x.size(),L' ');
+ for ( size_t i=0; i<x.size(); i++) {
+ windowsServiceName[i] = x[i];
+ }
+ }
+ if (params.count("serviceDisplayName")) {
+ string x = params["serviceDisplayName"].as<string>();
+ windowsServiceDisplayName = wstring(x.size(),L' ');
+ for ( size_t i=0; i<x.size(); i++) {
+ windowsServiceDisplayName[i] = x[i];
+ }
+ }
+ if (params.count("serviceDescription")) {
+ string x = params["serviceDescription"].as<string>();
+ windowsServiceDescription = wstring(x.size(),L' ');
+ for ( size_t i=0; i<x.size(); i++) {
+ windowsServiceDescription[i] = x[i];
+ }
+ }
+ if (params.count("serviceUser")) {
+ string x = params["serviceUser"].as<string>();
+ windowsServiceUser = wstring(x.size(),L' ');
+ for ( size_t i=0; i<x.size(); i++) {
+ windowsServiceUser[i] = x[i];
+ }
+ }
+ if (params.count("servicePassword")) {
+ string x = params["servicePassword"].as<string>();
+ windowsServicePassword = wstring(x.size(),L' ');
+ for ( size_t i=0; i<x.size(); i++) {
+ windowsServicePassword[i] = x[i];
+ }
+ }
+
+ if ( reinstallService ) {
+ ServiceController::removeService( windowsServiceName );
+ }
+ if ( installService || reinstallService ) {
+ if ( !ServiceController::installService( windowsServiceName , windowsServiceDisplayName, windowsServiceDescription, windowsServiceUser, windowsServicePassword, dbpath, argc, argv ) )
+ dbexit( EXIT_NTSERVICE_ERROR );
+ dbexit( EXIT_CLEAN );
+ }
+ else if ( removeService ) {
+ if ( !ServiceController::removeService( windowsServiceName ) )
+ dbexit( EXIT_NTSERVICE_ERROR );
+ dbexit( EXIT_CLEAN );
+ }
+ else if ( startService ) {
+ if ( !ServiceController::startService( windowsServiceName , mongo::initService ) )
+ dbexit( EXIT_NTSERVICE_ERROR );
+ return true;
+ }
+ return false;
+ }
+
+ bool ServiceController::installService( const std::wstring& serviceName, const std::wstring& displayName, const std::wstring& serviceDesc, const std::wstring& serviceUser, const std::wstring& servicePassword, const std::string dbpath, int argc, char* argv[] ) {
+ assert(argc >= 1);
+
+ stringstream commandLine;
+
+ char exePath[1024];
+ GetModuleFileNameA( NULL, exePath, sizeof exePath );
+ commandLine << '"' << exePath << "\" ";
+
+ for ( int i = 1; i < argc; i++ ) {
+ std::string arg( argv[ i ] );
+ // replace install command to indicate process is being started as a service
+ if ( arg == "--install" || arg == "--reinstall" ) {
+ arg = "--service";
+ }
+ else if ( arg == "--dbpath" && i + 1 < argc ) {
+ commandLine << arg << " \"" << dbpath << "\" ";
+ i++;
+ continue;
+ }
+ else if ( arg == "--logpath" && i + 1 < argc ) {
+ commandLine << arg << " \"" << argv[i+1] << "\" ";
+ i++;
+ continue;
+ }
+ else if ( arg == "-f" && i + 1 < argc ) {
+ commandLine << arg << " \"" << argv[i+1] << "\" ";
+ i++;
+ continue;
+ }
+ else if ( arg == "--config" && i + 1 < argc ) {
+ commandLine << arg << " \"" << argv[i+1] << "\" ";
+ i++;
+ continue;
+ }
+ else if ( arg == "--pidfilepath" && i + 1 < argc ) {
+ commandLine << arg << " \"" << argv[i+1] << "\" ";
+ i++;
+ continue;
+ }
+ else if ( arg == "--repairpath" && i + 1 < argc ) {
+ commandLine << arg << " \"" << argv[i+1] << "\" ";
+ i++;
+ continue;
+ }
+ else if ( arg == "--keyfile" && i + 1 < argc ) {
+ commandLine << arg << " \"" << argv[i+1] << "\" ";
+ i++;
+ continue;
+ }
+ else if ( arg.length() > 9 && arg.substr(0, 9) == "--service" ) {
+ // Strip off --service(Name|User|Password) arguments
+ i++;
+ continue;
+ }
+ commandLine << arg << " ";
+ }
+
+ SC_HANDLE schSCManager = ::OpenSCManager( NULL, NULL, SC_MANAGER_ALL_ACCESS );
+ if ( schSCManager == NULL ) {
+ DWORD err = ::GetLastError();
+ cerr << "Error connecting to the Service Control Manager: " << GetWinErrMsg(err) << endl;
+ return false;
+ }
+
+        // Make sure the service doesn't already exist.
+ // TODO: Check to see if service is in "Deleting" status, suggest the user close down Services MMC snap-ins.
+ SC_HANDLE schService = ::OpenService( schSCManager, serviceName.c_str(), SERVICE_ALL_ACCESS );
+ if ( schService != NULL ) {
+ cerr << "There is already a service named " << toUtf8String(serviceName) << ". Aborting" << endl;
+ ::CloseServiceHandle( schService );
+ ::CloseServiceHandle( schSCManager );
+ return false;
+ }
+ std::basic_ostringstream< TCHAR > commandLineWide;
+ commandLineWide << commandLine.str().c_str();
+
+ cerr << "Creating service " << toUtf8String(serviceName) << "." << endl;
+
+ // create new service
+ schService = ::CreateService( schSCManager, serviceName.c_str(), displayName.c_str(),
+ SERVICE_ALL_ACCESS, SERVICE_WIN32_OWN_PROCESS,
+ SERVICE_AUTO_START, SERVICE_ERROR_NORMAL,
+ commandLineWide.str().c_str(), NULL, NULL, L"\0\0", NULL, NULL );
+ if ( schService == NULL ) {
+ DWORD err = ::GetLastError();
+ cerr << "Error creating service: " << GetWinErrMsg(err) << endl;
+ ::CloseServiceHandle( schSCManager );
+ return false;
+ }
+
+ cerr << "Service creation successful." << endl;
+ cerr << "Service can be started from the command line via 'net start \"" << toUtf8String(serviceName) << "\"'." << endl;
+
+ bool serviceInstalled;
+
+        // TODO: If necessary, grant the user the "Log on as a service" right.
+ if ( !serviceUser.empty() ) {
+ std::wstring actualServiceUser;
+ if ( serviceUser.find(L"\\") == string::npos ) {
+ actualServiceUser = L".\\" + serviceUser;
+ }
+ else {
+ actualServiceUser = serviceUser;
+ }
+
+ cerr << "Setting service login credentials. User: " << toUtf8String(actualServiceUser) << endl;
+ serviceInstalled = ::ChangeServiceConfig( schService, SERVICE_NO_CHANGE, SERVICE_NO_CHANGE, SERVICE_NO_CHANGE, NULL, NULL, NULL, NULL, actualServiceUser.c_str(), servicePassword.c_str(), NULL );
+ if ( !serviceInstalled ) {
+ cerr << "Setting service login failed. Service has 'LocalService' permissions." << endl;
+ }
+ }
+
+ // set the service description
+ SERVICE_DESCRIPTION serviceDescription;
+ serviceDescription.lpDescription = (LPTSTR)serviceDesc.c_str();
+ serviceInstalled = ::ChangeServiceConfig2( schService, SERVICE_CONFIG_DESCRIPTION, &serviceDescription );
+
+#if 1
+ if ( ! serviceInstalled ) {
+#else
+ // This code sets the mongod service to auto-restart, forever.
+ // This might be a fine thing to do except that when mongod or Windows has a crash, the mongo.lock
+ // file is still around, so any attempt at a restart will immediately fail. With auto-restart, we
+ // go into a loop, crashing and restarting, crashing and restarting, until someone comes in and
+ // disables the service or deletes the mongod.lock file.
+ //
+ // I'm leaving the old code here for now in case we solve this and are able to turn SC_ACTION_RESTART
+ // back on.
+ //
+ if ( serviceInstalled ) {
+ SC_ACTION aActions[ 3 ] = { { SC_ACTION_RESTART, 0 }, { SC_ACTION_RESTART, 0 }, { SC_ACTION_RESTART, 0 } };
+
+ SERVICE_FAILURE_ACTIONS serviceFailure;
+ ZeroMemory( &serviceFailure, sizeof( SERVICE_FAILURE_ACTIONS ) );
+ serviceFailure.cActions = 3;
+ serviceFailure.lpsaActions = aActions;
+
+ // set service recovery options
+ serviceInstalled = ::ChangeServiceConfig2( schService, SERVICE_CONFIG_FAILURE_ACTIONS, &serviceFailure );
+
+ }
+ else {
+#endif
+ cerr << "Could not set service description. Check the event log for more details." << endl;
+ }
+
+ ::CloseServiceHandle( schService );
+ ::CloseServiceHandle( schSCManager );
+
+ return serviceInstalled;
+ }
+
+ bool ServiceController::removeService( const std::wstring& serviceName ) {
+ SC_HANDLE schSCManager = ::OpenSCManager( NULL, NULL, SC_MANAGER_ALL_ACCESS );
+ if ( schSCManager == NULL ) {
+ DWORD err = ::GetLastError();
+ cerr << "Error connecting to the Service Control Manager: " << GetWinErrMsg(err) << endl;
+ return false;
+ }
+
+ SC_HANDLE schService = ::OpenService( schSCManager, serviceName.c_str(), SERVICE_ALL_ACCESS );
+ if ( schService == NULL ) {
+ cerr << "Could not find a service named " << toUtf8String(serviceName) << " to uninstall." << endl;
+ ::CloseServiceHandle( schSCManager );
+ return false;
+ }
+
+ SERVICE_STATUS serviceStatus;
+
+        // stop the service if it's running
+ if ( ::ControlService( schService, SERVICE_CONTROL_STOP, &serviceStatus ) ) {
+ cerr << "Service " << toUtf8String(serviceName) << " is currently running. Stopping service." << endl;
+ while ( ::QueryServiceStatus( schService, &serviceStatus ) ) {
+ if ( serviceStatus.dwCurrentState == SERVICE_STOP_PENDING ) {
+ Sleep( 1000 );
+ }
+ else { break; }
+ }
+ cerr << "Service stopped." << endl;
+ }
+
+ cerr << "Deleting service " << toUtf8String(serviceName) << "." << endl;
+ bool serviceRemoved = ::DeleteService( schService );
+
+ ::CloseServiceHandle( schService );
+ ::CloseServiceHandle( schSCManager );
+
+ if (serviceRemoved) {
+ cerr << "Service deleted successfully." << endl;
+ }
+ else {
+ cerr << "Failed to delete service." << endl;
+ }
+
+ return serviceRemoved;
+ }
+
+ bool ServiceController::startService( const std::wstring& serviceName, ServiceCallback startService ) {
+ _serviceName = serviceName;
+ _serviceCallback = startService;
+
+ SERVICE_TABLE_ENTRY dispTable[] = {
+ { (LPTSTR)serviceName.c_str(), (LPSERVICE_MAIN_FUNCTION)ServiceController::initService },
+ { NULL, NULL }
+ };
+
+ return StartServiceCtrlDispatcher( dispTable );
+ }
+
+ bool ServiceController::reportStatus( DWORD reportState, DWORD waitHint ) {
+ if ( _statusHandle == NULL )
+ return false;
+
+ static DWORD checkPoint = 1;
+
+ SERVICE_STATUS ssStatus;
+
+ DWORD dwControlsAccepted;
+ switch ( reportState ) {
+ case SERVICE_START_PENDING:
+ case SERVICE_STOP_PENDING:
+ case SERVICE_STOPPED:
+ dwControlsAccepted = 0;
+ break;
+ default:
+ dwControlsAccepted = SERVICE_ACCEPT_STOP | SERVICE_ACCEPT_SHUTDOWN;
+ break;
+ }
+
+ ssStatus.dwServiceType = SERVICE_WIN32_OWN_PROCESS;
+ ssStatus.dwServiceSpecificExitCode = 0;
+ ssStatus.dwControlsAccepted = dwControlsAccepted;
+ ssStatus.dwCurrentState = reportState;
+ ssStatus.dwWin32ExitCode = NO_ERROR;
+ ssStatus.dwWaitHint = waitHint;
+ ssStatus.dwCheckPoint = ( reportState == SERVICE_RUNNING || reportState == SERVICE_STOPPED ) ? 0 : checkPoint++;
+
+ return SetServiceStatus( _statusHandle, &ssStatus );
+ }
+
+ void WINAPI ServiceController::initService( DWORD argc, LPTSTR *argv ) {
+ _statusHandle = RegisterServiceCtrlHandler( _serviceName.c_str(), serviceCtrl );
+ if ( !_statusHandle )
+ return;
+
+ reportStatus( SERVICE_START_PENDING, 1000 );
+
+ _serviceCallback();
+ dbexit( EXIT_CLEAN );
+
+ reportStatus( SERVICE_STOPPED );
+ }
+
+ static void serviceShutdown( const char* controlCodeName ) {
+ Client::initThread( "serviceShutdown" );
+ log() << "got " << controlCodeName << " request from Windows Service Controller, " <<
+ ( inShutdown() ? "already in shutdown" : "will terminate after current cmd ends" ) << endl;
+ ServiceController::reportStatus( SERVICE_STOP_PENDING );
+ if ( ! inShutdown() ) {
+ exitCleanly( EXIT_WINDOWS_SERVICE_STOP );
+ ServiceController::reportStatus( SERVICE_STOPPED );
+ }
+ }
+
+ void WINAPI ServiceController::serviceCtrl( DWORD ctrlCode ) {
+ switch ( ctrlCode ) {
+ case SERVICE_CONTROL_STOP:
+ serviceShutdown( "SERVICE_CONTROL_STOP" );
+ break;
+ case SERVICE_CONTROL_SHUTDOWN:
+ serviceShutdown( "SERVICE_CONTROL_SHUTDOWN" );
+ break;
+ }
+ }
+
+} // namespace mongo
+
+#endif
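
How startup code is expected to call into this, as a sketch; realMain and the dbpath value are hypothetical, while the --install/--remove/--service options are the ones handled above:

    // Windows-only sketch of the intended call pattern:
    int main( int argc, char* argv[] ) {
        boost::program_options::variables_map params;
        // ... parse --install / --reinstall / --remove / --service into params ...

        if ( mongo::serviceParamsCheck( params, "C:\\data\\db", argc, argv ) ) {
            // true means StartServiceCtrlDispatcher ran initService(), which has
            // already executed the server via the registered ServiceCallback
            return 0;
        }
        // the install/remove paths call dbexit() themselves; anything else runs normally
        // return realMain( argc, argv );              // realMain is hypothetical
        return 0;
    }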
diff --git a/src/mongo/util/ntservice.h b/src/mongo/util/ntservice.h
new file mode 100644
index 00000000000..2570dfa9bef
--- /dev/null
+++ b/src/mongo/util/ntservice.h
@@ -0,0 +1,49 @@
+// ntservice.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#if defined(_WIN32)
+#include <windows.h>
+
+namespace mongo {
+
+ typedef bool ( *ServiceCallback )( void );
+ bool serviceParamsCheck( boost::program_options::variables_map& params, const std::string dbpath, int argc, char* argv[] );
+
+ class ServiceController {
+ public:
+ ServiceController();
+ virtual ~ServiceController() {}
+
+ static bool installService( const std::wstring& serviceName, const std::wstring& displayName, const std::wstring& serviceDesc, const std::wstring& serviceUser, const std::wstring& servicePassword, const std::string dbpath, int argc, char* argv[] );
+ static bool removeService( const std::wstring& serviceName );
+ static bool startService( const std::wstring& serviceName, ServiceCallback startService );
+ static bool reportStatus( DWORD reportState, DWORD waitHint = 0 );
+
+ static void WINAPI initService( DWORD argc, LPTSTR *argv );
+ static void WINAPI serviceCtrl( DWORD ctrlCode );
+
+ protected:
+ static std::wstring _serviceName;
+ static SERVICE_STATUS_HANDLE _statusHandle;
+ static ServiceCallback _serviceCallback;
+ };
+
+} // namespace mongo
+
+#endif
diff --git a/src/mongo/util/optime.h b/src/mongo/util/optime.h
new file mode 100644
index 00000000000..031ad960d20
--- /dev/null
+++ b/src/mongo/util/optime.h
@@ -0,0 +1,170 @@
+// optime.h - OpTime class
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+//#include "../db/concurrency.h"
+
+namespace mongo {
+ void exitCleanly( ExitCode code );
+
+ struct ClockSkewException : public DBException {
+ ClockSkewException() : DBException( "clock skew exception" , 20001 ) {}
+ };
+
+    /* replica sets used to use RSOpTime.
+       master/slave replication uses OpTime.
+       But this is usable from both.
+ */
+ typedef unsigned long long ReplTime;
+
+ /* Operation sequence #. A combination of current second plus an ordinal value.
+ */
+#pragma pack(4)
+ class OpTime {
+ unsigned i; // ordinal comes first so we can do a single 64 bit compare on little endian
+ unsigned secs;
+ static OpTime last;
+ static OpTime skewed();
+ public:
+ static void setLast(const Date_t &date) {
+ notifier().notify_all(); // won't really do anything until write-lock released
+
+ last = OpTime(date);
+ }
+ unsigned getSecs() const {
+ return secs;
+ }
+ unsigned getInc() const {
+ return i;
+ }
+ OpTime(Date_t date) {
+ reinterpret_cast<unsigned long long&>(*this) = date.millis;
+ dassert( (int)secs >= 0 );
+ }
+ OpTime(ReplTime x) {
+ reinterpret_cast<unsigned long long&>(*this) = x;
+ dassert( (int)secs >= 0 );
+ }
+ OpTime(unsigned a, unsigned b) {
+ secs = a;
+ i = b;
+ dassert( (int)secs >= 0 );
+ }
+ OpTime( const OpTime& other ) {
+ secs = other.secs;
+ i = other.i;
+ dassert( (int)secs >= 0 );
+ }
+ OpTime() {
+ secs = 0;
+ i = 0;
+ }
+        // it generally isn't safe to call this without being locked, so use now(); some tests use this directly.
+ static OpTime now_inlock() {
+ notifier().notify_all(); // won't really do anything until write-lock released
+
+ unsigned t = (unsigned) time(0);
+ if ( last.secs == t ) {
+ last.i++;
+ return last;
+ }
+ if ( t < last.secs ) {
+ return skewed(); // separate function to keep out of the hot code path
+ }
+ last = OpTime(t, 1);
+ return last;
+ }
+ static OpTime now();
+ static OpTime last_inlock();
+
+ // Waits for global OpTime to be different from *this
+ // Must be atLeastReadLocked
+ // Defined in instance.cpp (only current user) as it needs dbtemprelease
+ void waitForDifferent(unsigned millis);
+
+ /* We store OpTime's in the database as BSON Date datatype -- we needed some sort of
+ 64 bit "container" for these values. While these are not really "Dates", that seems a
+ better choice for now than say, Number, which is floating point. Note the BinData type
+ is perhaps the cleanest choice, lacking a true unsigned64 datatype, but BinData has 5
+ bytes of overhead.
+ */
+ unsigned long long asDate() const {
+ return reinterpret_cast<const unsigned long long*>(&i)[0];
+ }
+ long long asLL() const {
+ return reinterpret_cast<const long long*>(&i)[0];
+ }
+
+ bool isNull() const { return secs == 0; }
+
+ string toStringLong() const {
+ char buf[64];
+ time_t_to_String(secs, buf);
+ stringstream ss;
+ ss << time_t_to_String_short(secs) << ' ';
+ ss << hex << secs << ':' << i;
+ return ss.str();
+ }
+
+ string toStringPretty() const {
+ stringstream ss;
+ ss << time_t_to_String_short(secs) << ':' << hex << i;
+ return ss.str();
+ }
+
+ string toString() const {
+ stringstream ss;
+ ss << hex << secs << ':' << i;
+ return ss.str();
+ }
+
+ bool operator==(const OpTime& r) const {
+ return i == r.i && secs == r.secs;
+ }
+ bool operator!=(const OpTime& r) const {
+ return !(*this == r);
+ }
+ bool operator<(const OpTime& r) const {
+ if ( secs != r.secs )
+ return secs < r.secs;
+ return i < r.i;
+ }
+ bool operator<=(const OpTime& r) const {
+ return *this < r || *this == r;
+ }
+ bool operator>(const OpTime& r) const {
+ return !(*this <= r);
+ }
+ bool operator>=(const OpTime& r) const {
+ return !(*this < r);
+ }
+ private:
+
+ // The following functions are to get around the need to define class-level statics in a cpp
+ static boost::condition& notifier() {
+ static boost::condition* holder = new boost::condition();
+ return *holder;
+        }
+ static boost::mutex& notifyMutex() {
+ static boost::mutex* holder = new boost::mutex();
+ return *holder;
+ };
+ };
+#pragma pack()
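+
+ /* A minimal usage sketch (illustrative only; assumes the caller holds the
+ write lock and that Date_t is constructible from a millis value):
+
+ OpTime a = OpTime::now_inlock(); // e.g. {secs: t, i: 1}
+ OpTime b = OpTime::now_inlock(); // e.g. {secs: t, i: 2} within the same second
+ assert( a < b );
+
+ OpTime c( Date_t( a.asDate() ) ); // BSON Date round-trip
+ assert( c == a );
+ */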
+
+} // namespace mongo
diff --git a/src/mongo/util/password.cpp b/src/mongo/util/password.cpp
new file mode 100644
index 00000000000..18164c3aa0a
--- /dev/null
+++ b/src/mongo/util/password.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "password.h"
+#include <iostream>
+
+#ifndef _WIN32
+#include <termios.h>
+#endif
+
+using namespace std;
+
+namespace mongo {
+
+ string askPassword() {
+
+ std::string password;
+ cout << "Enter password: ";
+#ifndef _WIN32
+ const int stdinfd = 0;
+ termios termio;
+ tcflag_t old = 0;
+ if ( isatty( stdinfd ) ) {
+ int i = tcgetattr( stdinfd, &termio );
+ if( i == -1 ) {
+ cerr << "Cannot get terminal attributes " << errnoWithDescription() << endl;
+ return string();
+ }
+ old = termio.c_lflag;
+ termio.c_lflag &= ~ECHO;
+ i = tcsetattr( stdinfd, TCSANOW, &termio );
+ if( i == -1 ) {
+ cerr << "Cannot set terminal attributes " << errnoWithDescription() << endl;
+ return string();
+ }
+ }
+
+ getline( cin, password );
+
+ if ( isatty( stdinfd ) ) {
+ termio.c_lflag = old;
+ int i = tcsetattr( stdinfd, TCSANOW, &termio );
+ if( i == -1 ) {
+ cerr << "Cannot set terminal attributes " << errnoWithDescription() << endl;
+ return string();
+ }
+ }
+#else
+ HANDLE stdinh = GetStdHandle( STD_INPUT_HANDLE );
+ if ( stdinh == INVALID_HANDLE_VALUE) {
+ cerr << "Cannot get stdin handle " << GetLastError() << "\n";
+ return string();
+ }
+
+ DWORD old;
+ if ( !GetConsoleMode( stdinh, &old ) ) {
+ cerr << "Cannot get console mode " << GetLastError() << "\n";
+ return string();
+ }
+
+ DWORD noecho = ENABLE_LINE_INPUT | ENABLE_PROCESSED_INPUT;
+ if ( !SetConsoleMode( stdinh, noecho ) ) {
+ cerr << "Cannot set console mode " << GetLastError() << "\n";
+ return string();
+ }
+
+ getline( cin, password );
+
+ if ( !SetConsoleMode( stdinh, old ) ) {
+ cerr << "Cannot set console mode " << GetLastError() << "\n";
+ return string();
+ }
+#endif
+ cout << "\n";
+ return password;
+ }
+}
diff --git a/src/mongo/util/password.h b/src/mongo/util/password.h
new file mode 100644
index 00000000000..519f712ee7e
--- /dev/null
+++ b/src/mongo/util/password.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#pragma once
+
+#include <boost/program_options.hpp>
+#include <string>
+
+namespace mongo {
+
+ struct PasswordValue : public boost::program_options::typed_value<std::string> {
+
+ PasswordValue( std::string* val )
+ : boost::program_options::typed_value<std::string>( val ) { }
+
+ unsigned min_tokens() const {
+ return 0;
+ }
+
+ unsigned max_tokens() const {
+ return 1;
+ }
+
+ bool is_required() const {
+ return false;
+ }
+
+ void xparse( boost::any& value_store,
+ const std::vector<std::string>& new_tokens ) const {
+ if ( !value_store.empty() )
+#if BOOST_VERSION >= 104200
+ boost::throw_exception( boost::program_options::validation_error( boost::program_options::validation_error::multiple_values_not_allowed ) );
+#else
+ boost::throw_exception( boost::program_options::validation_error( "multiple values not allowed" ) );
+#endif
+ else if ( !new_tokens.empty() )
+ boost::program_options::typed_value<std::string>::xparse
+ (value_store, new_tokens);
+ else
+ value_store = std::string();
+ }
+
+ };
+
+ std::string askPassword();
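+
+ /* A minimal usage sketch (illustrative; the option name is hypothetical).
+ program_options takes ownership of the PasswordValue pointer:
+
+ namespace po = boost::program_options;
+ std::string password;
+ po::options_description opts;
+ opts.add_options()
+ ( "password,p", new PasswordValue( &password ), "password" );
+ // if "-p" is given with no argument, the stored value is the empty
+ // string, so the caller can then prompt on the terminal:
+ if ( password.empty() )
+ password = askPassword();
+ */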
+
+}
diff --git a/src/mongo/util/paths.h b/src/mongo/util/paths.h
new file mode 100644
index 00000000000..bb82df0c730
--- /dev/null
+++ b/src/mongo/util/paths.h
@@ -0,0 +1,124 @@
+// @file paths.h
+// file paths and directory handling
+
+/* Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "mongoutils/str.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+namespace mongo {
+
+ using namespace mongoutils;
+
+ extern string dbpath;
+
+ /** this is very much like a boost::path. however, we define a new type to get some type
+ checking. if you want to say "my param MUST be a relative path", use this.
+ */
+ struct RelativePath {
+ string _p;
+
+ bool empty() const { return _p.empty(); }
+
+ static RelativePath fromRelativePath(string f) {
+ RelativePath rp;
+ rp._p = f;
+ return rp;
+ }
+
+ /** from a full path */
+ static RelativePath fromFullPath(path f) {
+ path dbp(dbpath); // normalizes / and backslash
+ string fullpath = f.string();
+ string relative = str::after(fullpath, dbp.string());
+ if( relative.empty() ) {
+ log() << "warning file is not under db path? " << fullpath << ' ' << dbp.string() << endl;
+ RelativePath rp;
+ rp._p = fullpath;
+ return rp;
+ }
+ /*uassert(13600,
+ str::stream() << "file path is not under the db path? " << fullpath << ' ' << dbpath,
+ relative != fullpath);*/
+ if( str::startsWith(relative, "/") || str::startsWith(relative, "\\") ) {
+ relative.erase(0, 1);
+ }
+ RelativePath rp;
+ rp._p = relative;
+ return rp;
+ }
+
+ string toString() const { return _p; }
+
+ bool operator!=(const RelativePath& r) const { return _p != r._p; }
+ bool operator==(const RelativePath& r) const { return _p == r._p; }
+ bool operator<(const RelativePath& r) const { return _p < r._p; }
+
+ string asFullPath() const {
+ path x(dbpath);
+ x /= _p;
+ return x.string();
+ }
+
+ };
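+
+ /* A minimal usage sketch (illustrative; assumes dbpath == "/data/db"):
+
+ RelativePath rp = RelativePath::fromFullPath( "/data/db/local.0" );
+ // rp.toString() == "local.0"
+ // rp.asFullPath() == "/data/db/local.0"
+ */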
+
+ inline dev_t getPartition(const string& path){
+ struct stat stats;
+
+ if (stat(path.c_str(), &stats) != 0){
+ uasserted(13646, str::stream() << "stat() failed for file: " << path << " " << errnoWithDescription());
+ }
+
+ return stats.st_dev;
+ }
+
+ inline bool onSamePartition(const string& path1, const string& path2){
+ dev_t dev1 = getPartition(path1);
+ dev_t dev2 = getPartition(path2);
+
+ return dev1 == dev2;
+ }
+
+ inline void flushMyDirectory(const boost::filesystem::path& file){
+#ifdef __linux__ // this isn't needed elsewhere
+ // if called without a fully qualified path it asserts, which makes mongoperf fail; so just warn for now. needs a better solution longer term.
+ // massert(13652, str::stream() << "Couldn't find parent dir for file: " << file.string(), );
+ if( !file.has_branch_path() ) {
+ log() << "warning flushMyDirectory couldn't find parent dir for file: " << file.string() << endl;
+ return;
+ }
+
+
+ boost::filesystem::path dir = file.branch_path(); // parent_path in new boosts
+
+ log(1) << "flushing directory " << dir.string() << endl;
+
+ int fd = ::open(dir.string().c_str(), O_RDONLY); // DO NOT THROW OR ASSERT BEFORE CLOSING
+ massert(13650, str::stream() << "Couldn't open directory '" << dir.string() << "' for flushing: " << errnoWithDescription(), fd >= 0);
+ if (fsync(fd) != 0){
+ int e = errno;
+ close(fd);
+ massert(13651, str::stream() << "Couldn't fsync directory '" << dir.string() << "': " << errnoWithDescription(e), false);
+ }
+ close(fd);
+#endif
+ }
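+
+ /* Illustrative: after creating or renaming a file, flush its parent directory
+ so the directory entry itself is durable (Linux only; a no-op elsewhere).
+ The path below is hypothetical:
+
+ flushMyDirectory( boost::filesystem::path( "/data/db/journal/j._0" ) );
+ */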
+
+}
diff --git a/src/mongo/util/processinfo.cpp b/src/mongo/util/processinfo.cpp
new file mode 100644
index 00000000000..082d42b3bc0
--- /dev/null
+++ b/src/mongo/util/processinfo.cpp
@@ -0,0 +1,48 @@
+// processinfo.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "processinfo.h"
+#include "mmap.h"
+
+#include <iostream>
+using namespace std;
+
+namespace mongo {
+
+ class PidFileWiper {
+ public:
+ ~PidFileWiper() {
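+ // truncate the pid file on shutdown rather than deleting it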
+ ofstream out( path.c_str() , ios_base::out );
+ out.close();
+ }
+
+ void write( const string& p ) {
+ path = p;
+ ofstream out( path.c_str() , ios_base::out );
+ out << getpid() << endl;
+ out.close();
+ }
+
+ string path;
+ } pidFileWiper;
+
+ void writePidFile( const string& path ) {
+ pidFileWiper.write( path );
+ }
+
+}
diff --git a/src/mongo/util/processinfo.h b/src/mongo/util/processinfo.h
new file mode 100644
index 00000000000..5272831eb74
--- /dev/null
+++ b/src/mongo/util/processinfo.h
@@ -0,0 +1,67 @@
+// processinfo.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <sys/types.h>
+#include <string>
+
+#ifndef _WIN32
+#include <unistd.h>
+#else
+typedef int pid_t;
+int getpid();
+#endif
+
+namespace mongo {
+
+ class BSONObjBuilder;
+
+ class ProcessInfo {
+ public:
+ ProcessInfo( pid_t pid = getpid() );
+ ~ProcessInfo();
+
+ /**
+ * @return mbytes
+ */
+ int getVirtualMemorySize();
+
+ /**
+ * @return mbytes
+ */
+ int getResidentSize();
+
+ /**
+ * Append platform-specific data to obj
+ */
+ void getExtraInfo(BSONObjBuilder& info);
+
+ bool supported();
+
+ static bool blockCheckSupported();
+ static bool blockInMemory( char * start );
+
+ private:
+ pid_t _pid;
+ };
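+
+ /* A minimal usage sketch (illustrative, not part of this header):
+
+ ProcessInfo pi; // defaults to the current pid
+ if ( pi.supported() ) {
+ int vsz = pi.getVirtualMemorySize(); // megabytes
+ int rss = pi.getResidentSize(); // megabytes
+ }
+ */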
+
+ void writePidFile( const std::string& path );
+
+ void printMemInfo( const char * whereContextStr = 0 );
+
+}
diff --git a/src/mongo/util/processinfo_darwin.cpp b/src/mongo/util/processinfo_darwin.cpp
new file mode 100644
index 00000000000..9f73cbffd4f
--- /dev/null
+++ b/src/mongo/util/processinfo_darwin.cpp
@@ -0,0 +1,116 @@
+// processinfo_darwin.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../pch.h"
+#include "processinfo.h"
+#include "log.h"
+
+#include <mach/vm_statistics.h>
+#include <mach/task_info.h>
+#include <mach/mach_init.h>
+#include <mach/mach_host.h>
+#include <mach/mach_traps.h>
+#include <mach/task.h>
+#include <mach/vm_map.h>
+#include <mach/shared_region.h>
+#include <iostream>
+
+#include <sys/types.h>
+#include <sys/mman.h>
+
+using namespace std;
+
+namespace mongo {
+
+ ProcessInfo::ProcessInfo( pid_t pid ) : _pid( pid ) {
+ }
+
+ ProcessInfo::~ProcessInfo() {
+ }
+
+ bool ProcessInfo::supported() {
+ return true;
+ }
+
+ int ProcessInfo::getVirtualMemorySize() {
+ task_t result;
+
+ mach_port_t task;
+
+ if ( ( result = task_for_pid( mach_task_self() , _pid , &task) ) != KERN_SUCCESS ) {
+ cout << "error getting task\n";
+ return 0;
+ }
+
+#if !defined(__LP64__)
+ task_basic_info_32 ti;
+#else
+ task_basic_info_64 ti;
+#endif
+ mach_msg_type_number_t count = TASK_BASIC_INFO_COUNT;
+ if ( ( result = task_info( task , TASK_BASIC_INFO , (task_info_t)&ti, &count ) ) != KERN_SUCCESS ) {
+ cout << "error getting task_info: " << result << endl;
+ return 0;
+ }
+ return (int)((double)ti.virtual_size / (1024.0 * 1024 ) );
+ }
+
+ int ProcessInfo::getResidentSize() {
+ task_t result;
+
+ mach_port_t task;
+
+ if ( ( result = task_for_pid( mach_task_self() , _pid , &task) ) != KERN_SUCCESS ) {
+ cout << "error getting task\n";
+ return 0;
+ }
+
+
+#if !defined(__LP64__)
+ task_basic_info_32 ti;
+#else
+ task_basic_info_64 ti;
+#endif
+ mach_msg_type_number_t count = TASK_BASIC_INFO_COUNT;
+ if ( ( result = task_info( task , TASK_BASIC_INFO , (task_info_t)&ti, &count ) ) != KERN_SUCCESS ) {
+ cout << "error getting task_info: " << result << endl;
+ return 0;
+ }
+ return (int)( ti.resident_size / (1024 * 1024 ) );
+ }
+
+ void ProcessInfo::getExtraInfo(BSONObjBuilder& info) {}
+
+ bool ProcessInfo::blockCheckSupported() {
+ return true;
+ }
+
+ bool ProcessInfo::blockInMemory( char * start ) {
+ static long pageSize = 0;
+ if ( pageSize == 0 ) {
+ pageSize = sysconf( _SC_PAGESIZE );
+ }
+ start = start - ( (unsigned long long)start % pageSize );
+ char x = 0;
+ if ( mincore( start , 128 , &x ) ) {
+ log() << "mincore failed: " << errnoWithDescription() << endl;
+ return 1;
+ }
+ return x & 0x1;
+ }
+
+}
diff --git a/src/mongo/util/processinfo_linux2.cpp b/src/mongo/util/processinfo_linux2.cpp
new file mode 100644
index 00000000000..3eaccafd030
--- /dev/null
+++ b/src/mongo/util/processinfo_linux2.cpp
@@ -0,0 +1,244 @@
+// processinfo_linux2.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "processinfo.h"
+
+#include <iostream>
+#include <stdio.h>
+#include <malloc.h>
+#include <db/jsobj.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+using namespace std;
+
+#define KLONG long
+#define KLF "l"
+
+namespace mongo {
+
+ class LinuxProc {
+ public:
+ LinuxProc( pid_t pid = getpid() ) {
+ char name[128];
+ sprintf( name , "/proc/%d/stat" , pid );
+
+ FILE * f = fopen( name , "r");
+ if ( ! f ) {
+ stringstream ss;
+ ss << "couldn't open [" << name << "] " << errnoWithDescription();
+ string s = ss.str();
+ // uasserted( 13538 , s.c_str() ); // kept in a comment so the assert-number tooling still sees code 13538
+ msgassertedNoTrace( 13538 , s.c_str() );
+ }
+ int found = fscanf(f,
+ "%d %s %c "
+ "%d %d %d %d %d "
+ "%lu %lu %lu %lu %lu "
+ "%lu %lu %ld %ld " /* utime stime cutime cstime */
+ "%ld %ld "
+ "%ld "
+ "%ld "
+ "%lu " /* start_time */
+ "%lu "
+ "%ld " // rss
+ "%lu %"KLF"u %"KLF"u %"KLF"u %"KLF"u %"KLF"u "
+ /*
+ "%*s %*s %*s %*s "
+ "%"KLF"u %*lu %*lu "
+ "%d %d "
+ "%lu %lu"
+ */
+
+ ,
+
+ &_pid,
+ _comm,
+ &_state,
+ &_ppid, &_pgrp, &_session, &_tty, &_tpgid,
+ &_flags, &_min_flt, &_cmin_flt, &_maj_flt, &_cmaj_flt,
+ &_utime, &_stime, &_cutime, &_cstime,
+ &_priority, &_nice,
+ &_alarm,
+ &_nlwp,
+ &_start_time,
+ &_vsize,
+ &_rss,
+ &_rss_rlim, &_start_code, &_end_code, &_start_stack, &_kstk_esp, &_kstk_eip
+
+ /*
+ &_wchan,
+ &_exit_signal, &_processor,
+ &_rtprio, &_sched
+ */
+ );
+ if ( found == 0 ) {
+ cout << "system error: reading proc info" << endl;
+ }
+ fclose( f );
+ }
+
+ unsigned long getVirtualMemorySize() {
+ return _vsize;
+ }
+
+ unsigned long getResidentSize() {
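+ // _rss is a page count; this conversion assumes 4K pages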
+ return (unsigned long)_rss * 4 * 1024;
+ }
+
+ int _pid;
+ // The process ID.
+
+ char _comm[128];
+ // The filename of the executable, in parentheses. This is visible whether or not the executable is swapped out.
+
+ char _state;
+ // One character from the string "RSDZTW" where R is running, S is sleeping in an interruptible wait, D is waiting in uninterruptible
+ // disk sleep, Z is zombie, T is traced or stopped (on a signal), and W is paging.
+
+ int _ppid;
+ // The PID of the parent.
+
+ int _pgrp;
+ // The process group ID of the process.
+
+ int _session;
+ // The session ID of the process.
+
+ int _tty;
+ // The tty the process uses.
+
+ int _tpgid;
+ // The process group ID of the process which currently owns the tty that the process is connected to.
+
+ unsigned long _flags; // %lu
+ // The kernel flags word of the process. For bit meanings, see the PF_* defines in <linux/sched.h>. Details depend on the kernel version.
+
+ unsigned long _min_flt; // %lu
+ // The number of minor faults the process has made which have not required loading a memory page from disk.
+
+ unsigned long _cmin_flt; // %lu
+ // The number of minor faults that the process's waited-for children have made.
+
+ unsigned long _maj_flt; // %lu
+ // The number of major faults the process has made which have required loading a memory page from disk.
+
+ unsigned long _cmaj_flt; // %lu
+ // The number of major faults that the process's waited-for children have made.
+
+ unsigned long _utime; // %lu
+ // The number of jiffies that this process has been scheduled in user mode.
+
+ unsigned long _stime; // %lu
+ // The number of jiffies that this process has been scheduled in kernel mode.
+
+ long _cutime; // %ld
+ // The number of jiffies that this process's waited-for children have been scheduled in user mode.
+
+ long _cstime; // %ld
+ // The number of jiffies that this process's waited-for children have been scheduled in kernel mode.
+
+ long _priority;
+ long _nice;
+
+ long _nlwp; // %ld
+ // Number of threads in this process.
+
+ unsigned long _alarm;
+ // The time in jiffies before the next SIGALRM is sent to the process due to an interval timer.
+
+ unsigned long _start_time; // %lu
+ // The time in jiffies the process started after system boot.
+
+ unsigned long _vsize; // %lu
+ // Virtual memory size in bytes.
+
+ long _rss; // %ld
+ // Resident Set Size: number of pages the process has in real memory, minus 3 for administrative purposes. This is just the pages which
+ // count towards text, data, or stack space. This does not include pages which have not been demand-loaded in, or which are swapped out
+
+ unsigned long _rss_rlim; // %lu
+ // Current limit in bytes on the rss of the process (usually 4294967295 on i386).
+
+ unsigned long _start_code; // %lu
+ // The address above which program text can run.
+
+ unsigned long _end_code; // %lu
+ // The address below which program text can run.
+
+ unsigned long _start_stack; // %lu
+ // The address of the start of the stack.
+
+ unsigned long _kstk_esp; // %lu
+ // The current value of esp (stack pointer), as found in the kernel stack page for the process.
+
+ unsigned long _kstk_eip; // %lu
+ // The current EIP (instruction pointer).
+
+
+
+ };
+
+
+ ProcessInfo::ProcessInfo( pid_t pid ) : _pid( pid ) {
+ }
+
+ ProcessInfo::~ProcessInfo() {
+ }
+
+ bool ProcessInfo::supported() {
+ return true;
+ }
+
+ int ProcessInfo::getVirtualMemorySize() {
+ LinuxProc p(_pid);
+ return (int)( p.getVirtualMemorySize() / ( 1024.0 * 1024 ) );
+ }
+
+ int ProcessInfo::getResidentSize() {
+ LinuxProc p(_pid);
+ return (int)( p.getResidentSize() / ( 1024.0 * 1024 ) );
+ }
+
+ void ProcessInfo::getExtraInfo(BSONObjBuilder& info) {
+ // [dm] i don't think mallinfo works. (64 bit.) ??
+ struct mallinfo malloc_info = mallinfo(); // structure has same name as function that returns it. (see malloc.h)
+ info.append("heap_usage_bytes", malloc_info.uordblks/*main arena*/ + malloc_info.hblkhd/*mmap blocks*/);
+ //docs claim hblkhd is included in uordblks but it isn't
+
+ LinuxProc p(_pid);
+ info.append("page_faults", (int)p._maj_flt);
+ }
+
+ bool ProcessInfo::blockCheckSupported() {
+ return true;
+ }
+
+ bool ProcessInfo::blockInMemory( char * start ) {
+ static long pageSize = 0;
+ if ( pageSize == 0 ) {
+ pageSize = sysconf( _SC_PAGESIZE );
+ }
+ start = start - ( (unsigned long long)start % pageSize );
+ unsigned char x = 0;
+ if ( mincore( start , 128 , &x ) ) {
+ log() << "mincore failed: " << errnoWithDescription() << endl;
+ return 1;
+ }
+ return x & 0x1;
+ }
+
+
+}
diff --git a/src/mongo/util/processinfo_none.cpp b/src/mongo/util/processinfo_none.cpp
new file mode 100644
index 00000000000..7d1e84d377c
--- /dev/null
+++ b/src/mongo/util/processinfo_none.cpp
@@ -0,0 +1,55 @@
+// processinfo_none.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "processinfo.h"
+
+#include <iostream>
+using namespace std;
+
+namespace mongo {
+
+ ProcessInfo::ProcessInfo( pid_t pid ) {
+ }
+
+ ProcessInfo::~ProcessInfo() {
+ }
+
+ bool ProcessInfo::supported() {
+ return false;
+ }
+
+ int ProcessInfo::getVirtualMemorySize() {
+ return -1;
+ }
+
+ int ProcessInfo::getResidentSize() {
+ return -1;
+ }
+
+ void ProcessInfo::getExtraInfo(BSONObjBuilder& info) {}
+
+ bool ProcessInfo::blockCheckSupported() {
+ return false;
+ }
+
+ bool ProcessInfo::blockInMemory( char * start ) {
+ assert(0);
+ return true;
+ }
+
+}
diff --git a/src/mongo/util/processinfo_win32.cpp b/src/mongo/util/processinfo_win32.cpp
new file mode 100644
index 00000000000..87d92db7e18
--- /dev/null
+++ b/src/mongo/util/processinfo_win32.cpp
@@ -0,0 +1,102 @@
+// processinfo_win32.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "processinfo.h"
+#include <iostream>
+#include <psapi.h>
+#include "../bson/bsonobjbuilder.h"
+using namespace std;
+
+int getpid() {
+ return GetCurrentProcessId();
+}
+
+namespace mongo {
+
+ int _wconvertmtos( SIZE_T s ) {
+ return (int)( s / ( 1024 * 1024 ) );
+ }
+
+ ProcessInfo::ProcessInfo( pid_t pid ) {
+ }
+
+ ProcessInfo::~ProcessInfo() {
+ }
+
+ bool ProcessInfo::supported() {
+ return true;
+ }
+
+ int ProcessInfo::getVirtualMemorySize() {
+ MEMORYSTATUSEX mse;
+ mse.dwLength = sizeof(mse);
+ assert( GlobalMemoryStatusEx( &mse ) );
+ DWORDLONG x = (mse.ullTotalVirtual - mse.ullAvailVirtual) / (1024 * 1024) ;
+ assert( x <= 0x7fffffff );
+ return (int) x;
+ }
+
+ int ProcessInfo::getResidentSize() {
+ PROCESS_MEMORY_COUNTERS pmc;
+ assert( GetProcessMemoryInfo( GetCurrentProcess() , &pmc, sizeof(pmc) ) );
+ return _wconvertmtos( pmc.WorkingSetSize );
+ }
+
+ void ProcessInfo::getExtraInfo(BSONObjBuilder& info) {
+ MEMORYSTATUSEX mse;
+ mse.dwLength = sizeof(mse);
+ PROCESS_MEMORY_COUNTERS pmc;
+ if( GetProcessMemoryInfo( GetCurrentProcess() , &pmc, sizeof(pmc) ) ) {
+ info.append("page_faults", static_cast<int>(pmc.PageFaultCount));
+ info.append("usagePageFileMB", static_cast<int>(pmc.PagefileUsage / 1024 / 1024));
+ }
+ if( GlobalMemoryStatusEx( &mse ) ) {
+ info.append("totalPageFileMB", static_cast<int>(mse.ullTotalPageFile / 1024 / 1024));
+ info.append("availPageFileMB", static_cast<int>(mse.ullAvailPageFile / 1024 / 1024));
+ info.append("ramMB", static_cast<int>(mse.ullTotalPhys / 1024 / 1024));
+ }
+ }
+
+ bool ProcessInfo::blockCheckSupported() {
+ return true;
+ }
+
+ bool ProcessInfo::blockInMemory( char * start ) {
+#if 0
+ // code for printing out page fault addresses and pc's --
+ // this could be useful for targeting heavy pagefault locations in the code
+ static BOOL bstat = InitializeProcessForWsWatch( GetCurrentProcess() );
+ PSAPI_WS_WATCH_INFORMATION_EX wiex[30];
+ DWORD bufsize = sizeof(wiex);
+ bstat = GetWsChangesEx( GetCurrentProcess(), &wiex[0], &bufsize );
+ if (bstat) {
+ for (int i=0; i<30; i++) {
+ if (wiex[i].BasicInfo.FaultingPc == 0) break;
+ cout << "faulting pc = " << wiex[i].BasicInfo.FaultingPc << " address = " << wiex[i].BasicInfo.FaultingVa << " thread id = " << wiex[i].FaultingThreadId << endl;
+ }
+ }
+#endif
+ PSAPI_WORKING_SET_EX_INFORMATION wsinfo;
+ wsinfo.VirtualAddress = start;
+ BOOL result = QueryWorkingSetEx( GetCurrentProcess(), &wsinfo, sizeof(wsinfo) );
+ if ( result )
+ if ( wsinfo.VirtualAttributes.Valid )
+ return true;
+ return false;
+ }
+}
diff --git a/src/mongo/util/queue.h b/src/mongo/util/queue.h
new file mode 100644
index 00000000000..4223bd6c256
--- /dev/null
+++ b/src/mongo/util/queue.h
@@ -0,0 +1,106 @@
+// @file queue.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../pch.h"
+
+#include <queue>
+
+#include "../util/timer.h"
+
+namespace mongo {
+
+ /**
+ * simple blocking queue
+ */
+ template<typename T> class BlockingQueue : boost::noncopyable {
+ public:
+ BlockingQueue() : _lock("BlockingQueue") { }
+
+ void push(T const& t) {
+ scoped_lock l( _lock );
+ _queue.push( t );
+ _condition.notify_one();
+ }
+
+ bool empty() const {
+ scoped_lock l( _lock );
+ return _queue.empty();
+ }
+
+ size_t size() const {
+ scoped_lock l( _lock );
+ return _queue.size();
+ }
+
+
+ bool tryPop( T & t ) {
+ scoped_lock l( _lock );
+ if ( _queue.empty() )
+ return false;
+
+ t = _queue.front();
+ _queue.pop();
+
+ return true;
+ }
+
+ T blockingPop() {
+
+ scoped_lock l( _lock );
+ while( _queue.empty() )
+ _condition.wait( l.boost() );
+
+ T t = _queue.front();
+ _queue.pop();
+ return t;
+ }
+
+
+ /**
+ * blocks waiting for an object until maxSecondsToWait passes.
+ * if one arrives in time, returns true and stores it in t;
+ * otherwise returns false and t is left unchanged.
+ */
+ bool blockingPop( T& t , int maxSecondsToWait ) {
+
+ Timer timer;
+
+ boost::xtime xt;
+ boost::xtime_get(&xt, boost::TIME_UTC);
+ xt.sec += maxSecondsToWait;
+
+ scoped_lock l( _lock );
+ while( _queue.empty() ) {
+ if ( ! _condition.timed_wait( l.boost() , xt ) )
+ return false;
+ }
+
+ t = _queue.front();
+ _queue.pop();
+ return true;
+ }
+
+ private:
+ std::queue<T> _queue;
+
+ mutable mongo::mutex _lock;
+ boost::condition _condition;
+ };
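+
+ /* A minimal usage sketch (illustrative):
+
+ BlockingQueue<int> q;
+ q.push( 42 ); // wakes one waiting consumer
+ int x = q.blockingPop(); // blocks until an element is available
+ int y;
+ bool got = q.blockingPop( y, 5 ); // gives up after ~5 seconds
+ */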
+
+}
diff --git a/src/mongo/util/ramlog.cpp b/src/mongo/util/ramlog.cpp
new file mode 100644
index 00000000000..d7a839a3fff
--- /dev/null
+++ b/src/mongo/util/ramlog.cpp
@@ -0,0 +1,190 @@
+// ramlog.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "log.h"
+#include "ramlog.h"
+#include "mongoutils/html.h"
+#include "mongoutils/str.h"
+
+namespace mongo {
+
+ using namespace mongoutils;
+
+ RamLog::RamLog( string name ) : _name(name), _lastWrite(0) {
+ h = 0; n = 0;
+ for( int i = 0; i < N; i++ )
+ lines[i][C-1] = 0;
+
+ if ( name.size() ) {
+
+ if ( ! _namedLock )
+ _namedLock = new mongo::mutex("RamLog::_namedLock");
+
+ scoped_lock lk( *_namedLock );
+ if ( ! _named )
+ _named = new RM();
+ (*_named)[name] = this;
+ }
+
+ }
+
+ RamLog::~RamLog() {
+
+ }
+
+ void RamLog::write(LogLevel ll, const string& str) {
+ _lastWrite = time(0);
+
+ char *p = lines[(h+n)%N];
+
+ unsigned sz = str.size();
+ if( sz < C ) {
+ if ( sz > 0 && str.c_str()[sz-1] == '\n' ) {
+ memcpy(p, str.c_str(), sz-1);
+ p[sz-1] = 0;
+ }
+ else
+ strcpy(p, str.c_str());
+ }
+ else {
+ memcpy(p, str.c_str(), C-1);
+ }
+
+ if( n < N ) n++;
+ else h = (h+1) % N;
+ }
+
+ void RamLog::get( vector<const char*>& v) const {
+ for( unsigned x=0, i=h; x++ < n; i=(i+1)%N )
+ v.push_back(lines[i]);
+ }
+
+ int RamLog::repeats(const vector<const char *>& v, int i) {
+ for( int j = i-1; j >= 0 && j+8 > i; j-- ) {
+ if( strcmp(v[i]+20,v[j]+20) == 0 ) {
+ for( int x = 1; ; x++ ) {
+ if( j+x == i ) return j;
+ if( i+x>=(int) v.size() ) return -1;
+ if( strcmp(v[i+x]+20,v[j+x]+20) ) return -1;
+ }
+ return -1;
+ }
+ }
+ return -1;
+ }
+
+
+ string RamLog::clean(const vector<const char *>& v, int i, string line ) {
+ if( line.empty() ) line = v[i];
+ if( i > 0 && strncmp(v[i], v[i-1], 11) == 0 )
+ return string(" ") + line.substr(11);
+ return v[i];
+ }
+
+ string RamLog::color(string line) {
+ string s = str::after(line, "replSet ");
+ if( str::startsWith(s, "warning") || startsWith(s, "error") )
+ return html::red(line);
+ if( str::startsWith(s, "info") ) {
+ if( str::endsWith(s, " up\n") )
+ return html::green(line);
+ else if( str::contains(s, " down ") || str::endsWith(s, " down\n") )
+ return html::yellow(line);
+ return line; //html::blue(line);
+ }
+
+ return line;
+ }
+
+ /* turn http:... into an anchor */
+ string RamLog::linkify(const char *s) {
+ const char *p = s;
+ const char *h = strstr(p, "http://");
+ if( h == 0 ) return s;
+
+ const char *sp = h + 7;
+ while( *sp && *sp != ' ' ) sp++;
+
+ string url(h, sp-h);
+ stringstream ss;
+ ss << string(s, h-s) << "<a href=\"" << url << "\">" << url << "</a>" << sp;
+ return ss.str();
+ }
+
+ void RamLog::toHTML(stringstream& s) {
+ vector<const char*> v;
+ get( v );
+
+ s << "<pre>\n";
+ for( int i = 0; i < (int)v.size(); i++ ) {
+ assert( strlen(v[i]) > 20 );
+ int r = repeats(v, i);
+ if( r < 0 ) {
+ s << color( linkify( clean(v,i).c_str() ) ) << '\n';
+ }
+ else {
+ stringstream x;
+ x << string(v[i], 0, 20);
+ int nr = (i-r);
+ int last = i+nr-1;
+ for( ; r < i ; r++ ) x << '.';
+ if( 1 ) {
+ stringstream r;
+ if( nr == 1 ) r << "repeat last line";
+ else r << "repeats last " << nr << " lines; ends " << string(v[last]+4,0,15);
+ s << html::a("", r.str(), clean(v,i,x.str()));
+ }
+ else s << x.str();
+ s << '\n';
+ i = last;
+ }
+ }
+ s << "</pre>\n";
+ }
+
+ // ---------------
+ // static things
+ // ---------------
+
+ RamLog* RamLog::get( string name ) {
+ if ( ! _named )
+ return 0;
+
+ scoped_lock lk( *_namedLock );
+ RM::iterator i = _named->find( name );
+ if ( i == _named->end() )
+ return 0;
+ return i->second;
+ }
+
+ void RamLog::getNames( vector<string>& names ) {
+ if ( ! _named )
+ return;
+
+ scoped_lock lk( *_namedLock );
+ for ( RM::iterator i=_named->begin(); i!=_named->end(); ++i ) {
+ if ( i->second->n )
+ names.push_back( i->first );
+ }
+ }
+
+ mongo::mutex* RamLog::_namedLock;
+ RamLog::RM* RamLog::_named = 0;
+
+ Tee* const warnings = new RamLog("warnings"); // Things put here go in serverStatus
+}
diff --git a/src/mongo/util/ramlog.h b/src/mongo/util/ramlog.h
new file mode 100644
index 00000000000..d3d5c8fbb4e
--- /dev/null
+++ b/src/mongo/util/ramlog.h
@@ -0,0 +1,65 @@
+// ramlog.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "log.h"
+
+namespace mongo {
+
+ class RamLog : public Tee {
+ public:
+ RamLog( string name );
+
+ virtual void write(LogLevel ll, const string& str);
+
+ void get( vector<const char*>& v) const;
+
+ void toHTML(stringstream& s);
+
+ static RamLog* get( string name );
+ static void getNames( vector<string>& names );
+
+ time_t lastWrite() { return _lastWrite; } // 0 if no writes
+
+ protected:
+ static int repeats(const vector<const char *>& v, int i);
+ static string clean(const vector<const char *>& v, int i, string line="");
+ static string color(string line);
+
+ /* turn http:... into an anchor */
+ static string linkify(const char *s);
+
+ private:
+ ~RamLog(); // private: RamLogs are deliberately leaked so they remain usable until the very end of the process
+
+ enum {
+ N = 128, // number of lines
+ C = 256 // max size of line
+ };
+ char lines[N][C];
+ unsigned h; // current position
+ unsigned n; // number of lines stored, 0 to N
+ string _name;
+
+ typedef map<string,RamLog*> RM;
+ static mongo::mutex* _namedLock;
+ static RM* _named;
+ time_t _lastWrite;
+ };
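+
+ /* A minimal usage sketch (illustrative; assumes the LogLevel enum from log.h):
+
+ RamLog* rl = new RamLog( "mylog" ); // deliberately leaked; see ~RamLog()
+ rl->write( LL_INFO, "2011-01-01 [conn] something happened\n" );
+ stringstream ss;
+ rl->toHTML( ss ); // toHTML expects lines longer than 20 chars
+ */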
+
+}
diff --git a/src/mongo/util/scopeguard.h b/src/mongo/util/scopeguard.h
new file mode 100644
index 00000000000..b87a4b51871
--- /dev/null
+++ b/src/mongo/util/scopeguard.h
@@ -0,0 +1,427 @@
+////////////////////////////////////////////////////////////////////////////////
+// The Loki Library
+// Copyright (c) 2000 Andrei Alexandrescu
+// Copyright (c) 2000 Petru Marginean
+// Copyright (c) 2005 Joshua Lehrer
+//
+// Permission to use, copy, modify, distribute and sell this software for any
+// purpose is hereby granted without fee, provided that the above copyright
+// notice appear in all copies and that both that copyright notice and this
+// permission notice appear in supporting documentation.
+// The author makes no representations about the
+// suitability of this software for any purpose. It is provided "as is"
+// without express or implied warranty.
+////////////////////////////////////////////////////////////////////////////////
+#ifndef LOKI_SCOPEGUARD_H_
+#define LOKI_SCOPEGUARD_H_
+
+namespace mongo
+{
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// \class RefToValue
+ ///
+ /// Transports a reference as a value
+ /// Serves to implement the Colvin/Gibbons trick for SmartPtr/ScopeGuard
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template <class T>
+ class RefToValue
+ {
+ public:
+
+ RefToValue(T& ref) : ref_(ref)
+ {}
+
+ RefToValue(const RefToValue& rhs) : ref_(rhs.ref_)
+ {}
+
+ operator T& () const
+ {
+ return ref_;
+ }
+
+ private:
+ // Disable - not implemented
+ RefToValue();
+ RefToValue& operator=(const RefToValue&);
+
+ T& ref_;
+ };
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// RefToValue creator.
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template <class T>
+ inline RefToValue<T> ByRef(T& t)
+ {
+ return RefToValue<T>(t);
+ }
+
+
+
+
+ ////////////////////////////////////////////
+ /// ScopeGuard
+ /*
+ Trivial example for use:
+
+ FILE* f = fopen("myfile.txt", "w+");
+ if (!f)
+ return error;
+ ON_BLOCK_EXIT(fclose, f);
+
+
+ More complicated example:
+
+ ScopeGuard guard = MakeGuard(my_rollback_func, myparam);
+ ...
+ if (successful) {
+ guard.Dismiss();
+ return;
+ }
+ // guard is still active here and will fire at scope exit
+ ...
+
+
+ */
+
+
+ class ScopeGuardImplBase
+ {
+ ScopeGuardImplBase& operator =(const ScopeGuardImplBase&);
+
+ protected:
+
+ ~ScopeGuardImplBase()
+ {}
+
+ ScopeGuardImplBase(const ScopeGuardImplBase& other) throw()
+ : dismissed_(other.dismissed_)
+ {
+ other.Dismiss();
+ }
+
+ template <typename J>
+ static void SafeExecute(J& j) throw()
+ {
+ if (!j.dismissed_)
+ try
+ {
+ j.Execute();
+ }
+ catch(...)
+ {}
+ }
+
+ mutable bool dismissed_;
+
+ public:
+ ScopeGuardImplBase() throw() : dismissed_(false)
+ {}
+
+ void Dismiss() const throw()
+ {
+ dismissed_ = true;
+ }
+ };
+
+ ////////////////////////////////////////////////////////////////
+ ///
+ /// \typedef typedef const ScopeGuardImplBase& ScopeGuard
+ ///
+ /// See Andrei's and Petru Marginean's CUJ article
+ /// http://www.cuj.com/documents/s=8000/cujcexp1812alexandr/alexandr.htm
+ ///
+ /// Changes to the original code by Joshua Lehrer:
+ /// http://www.lehrerfamily.com/scopeguard.html
+ ////////////////////////////////////////////////////////////////
+
+ typedef const ScopeGuardImplBase& ScopeGuard;
+
+ template <typename F>
+ class ScopeGuardImpl0 : public ScopeGuardImplBase
+ {
+ public:
+ static ScopeGuardImpl0<F> MakeGuard(F fun)
+ {
+ return ScopeGuardImpl0<F>(fun);
+ }
+
+ ~ScopeGuardImpl0() throw()
+ {
+ SafeExecute(*this);
+ }
+
+ void Execute()
+ {
+ fun_();
+ }
+
+ protected:
+ ScopeGuardImpl0(F fun) : fun_(fun)
+ {}
+
+ F fun_;
+ };
+
+ template <typename F>
+ inline ScopeGuardImpl0<F> MakeGuard(F fun)
+ {
+ return ScopeGuardImpl0<F>::MakeGuard(fun);
+ }
+
+ template <typename F, typename P1>
+ class ScopeGuardImpl1 : public ScopeGuardImplBase
+ {
+ public:
+ static ScopeGuardImpl1<F, P1> MakeGuard(F fun, P1 p1)
+ {
+ return ScopeGuardImpl1<F, P1>(fun, p1);
+ }
+
+ ~ScopeGuardImpl1() throw()
+ {
+ SafeExecute(*this);
+ }
+
+ void Execute()
+ {
+ fun_(p1_);
+ }
+
+ protected:
+ ScopeGuardImpl1(F fun, P1 p1) : fun_(fun), p1_(p1)
+ {}
+
+ F fun_;
+ const P1 p1_;
+ };
+
+ template <typename F, typename P1>
+ inline ScopeGuardImpl1<F, P1> MakeGuard(F fun, P1 p1)
+ {
+ return ScopeGuardImpl1<F, P1>::MakeGuard(fun, p1);
+ }
+
+ template <typename F, typename P1, typename P2>
+ class ScopeGuardImpl2: public ScopeGuardImplBase
+ {
+ public:
+ static ScopeGuardImpl2<F, P1, P2> MakeGuard(F fun, P1 p1, P2 p2)
+ {
+ return ScopeGuardImpl2<F, P1, P2>(fun, p1, p2);
+ }
+
+ ~ScopeGuardImpl2() throw()
+ {
+ SafeExecute(*this);
+ }
+
+ void Execute()
+ {
+ fun_(p1_, p2_);
+ }
+
+ protected:
+ ScopeGuardImpl2(F fun, P1 p1, P2 p2) : fun_(fun), p1_(p1), p2_(p2)
+ {}
+
+ F fun_;
+ const P1 p1_;
+ const P2 p2_;
+ };
+
+ template <typename F, typename P1, typename P2>
+ inline ScopeGuardImpl2<F, P1, P2> MakeGuard(F fun, P1 p1, P2 p2)
+ {
+ return ScopeGuardImpl2<F, P1, P2>::MakeGuard(fun, p1, p2);
+ }
+
+ template <typename F, typename P1, typename P2, typename P3>
+ class ScopeGuardImpl3 : public ScopeGuardImplBase
+ {
+ public:
+ static ScopeGuardImpl3<F, P1, P2, P3> MakeGuard(F fun, P1 p1, P2 p2, P3 p3)
+ {
+ return ScopeGuardImpl3<F, P1, P2, P3>(fun, p1, p2, p3);
+ }
+
+ ~ScopeGuardImpl3() throw()
+ {
+ SafeExecute(*this);
+ }
+
+ void Execute()
+ {
+ fun_(p1_, p2_, p3_);
+ }
+
+ protected:
+ ScopeGuardImpl3(F fun, P1 p1, P2 p2, P3 p3) : fun_(fun), p1_(p1), p2_(p2), p3_(p3)
+ {}
+
+ F fun_;
+ const P1 p1_;
+ const P2 p2_;
+ const P3 p3_;
+ };
+
+ template <typename F, typename P1, typename P2, typename P3>
+ inline ScopeGuardImpl3<F, P1, P2, P3> MakeGuard(F fun, P1 p1, P2 p2, P3 p3)
+ {
+ return ScopeGuardImpl3<F, P1, P2, P3>::MakeGuard(fun, p1, p2, p3);
+ }
+
+ //************************************************************
+
+ template <class Obj, typename MemFun>
+ class ObjScopeGuardImpl0 : public ScopeGuardImplBase
+ {
+ public:
+ static ObjScopeGuardImpl0<Obj, MemFun> MakeObjGuard(Obj& obj, MemFun memFun)
+ {
+ return ObjScopeGuardImpl0<Obj, MemFun>(obj, memFun);
+ }
+
+ ~ObjScopeGuardImpl0() throw()
+ {
+ SafeExecute(*this);
+ }
+
+ void Execute()
+ {
+ (obj_.*memFun_)();
+ }
+
+ protected:
+ ObjScopeGuardImpl0(Obj& obj, MemFun memFun) : obj_(obj), memFun_(memFun)
+ {}
+
+ Obj& obj_;
+ MemFun memFun_;
+ };
+
+ template <class Obj, typename MemFun>
+ inline ObjScopeGuardImpl0<Obj, MemFun> MakeObjGuard(Obj& obj, MemFun memFun)
+ {
+ return ObjScopeGuardImpl0<Obj, MemFun>::MakeObjGuard(obj, memFun);
+ }
+
+ template <typename Ret, class Obj1, class Obj2>
+ inline ObjScopeGuardImpl0<Obj1,Ret(Obj2::*)()> MakeGuard(Ret(Obj2::*memFun)(), Obj1 &obj)
+ {
+ return ObjScopeGuardImpl0<Obj1,Ret(Obj2::*)()>::MakeObjGuard(obj,memFun);
+ }
+
+ template <typename Ret, class Obj1, class Obj2>
+ inline ObjScopeGuardImpl0<Obj1,Ret(Obj2::*)()> MakeGuard(Ret(Obj2::*memFun)(), Obj1 *obj)
+ {
+ return ObjScopeGuardImpl0<Obj1,Ret(Obj2::*)()>::MakeObjGuard(*obj,memFun);
+ }
+
+ template <class Obj, typename MemFun, typename P1>
+ class ObjScopeGuardImpl1 : public ScopeGuardImplBase
+ {
+ public:
+ static ObjScopeGuardImpl1<Obj, MemFun, P1> MakeObjGuard(Obj& obj, MemFun memFun, P1 p1)
+ {
+ return ObjScopeGuardImpl1<Obj, MemFun, P1>(obj, memFun, p1);
+ }
+
+ ~ObjScopeGuardImpl1() throw()
+ {
+ SafeExecute(*this);
+ }
+
+ void Execute()
+ {
+ (obj_.*memFun_)(p1_);
+ }
+
+ protected:
+ ObjScopeGuardImpl1(Obj& obj, MemFun memFun, P1 p1) : obj_(obj), memFun_(memFun), p1_(p1)
+ {}
+
+ Obj& obj_;
+ MemFun memFun_;
+ const P1 p1_;
+ };
+
+ template <class Obj, typename MemFun, typename P1>
+ inline ObjScopeGuardImpl1<Obj, MemFun, P1> MakeObjGuard(Obj& obj, MemFun memFun, P1 p1)
+ {
+ return ObjScopeGuardImpl1<Obj, MemFun, P1>::MakeObjGuard(obj, memFun, p1);
+ }
+
+ template <typename Ret, class Obj1, class Obj2, typename P1a, typename P1b>
+ inline ObjScopeGuardImpl1<Obj1,Ret(Obj2::*)(P1a),P1b> MakeGuard(Ret(Obj2::*memFun)(P1a), Obj1 &obj, P1b p1)
+ {
+ return ObjScopeGuardImpl1<Obj1,Ret(Obj2::*)(P1a),P1b>::MakeObjGuard(obj,memFun,p1);
+ }
+
+ template <typename Ret, class Obj1, class Obj2, typename P1a, typename P1b>
+ inline ObjScopeGuardImpl1<Obj1,Ret(Obj2::*)(P1a),P1b> MakeGuard(Ret(Obj2::*memFun)(P1a), Obj1 *obj, P1b p1)
+ {
+ return ObjScopeGuardImpl1<Obj1,Ret(Obj2::*)(P1a),P1b>::MakeObjGuard(*obj,memFun,p1);
+ }
+
+ template <class Obj, typename MemFun, typename P1, typename P2>
+ class ObjScopeGuardImpl2 : public ScopeGuardImplBase
+ {
+ public:
+ static ObjScopeGuardImpl2<Obj, MemFun, P1, P2> MakeObjGuard(Obj& obj, MemFun memFun, P1 p1, P2 p2)
+ {
+ return ObjScopeGuardImpl2<Obj, MemFun, P1, P2>(obj, memFun, p1, p2);
+ }
+
+ ~ObjScopeGuardImpl2() throw()
+ {
+ SafeExecute(*this);
+ }
+
+ void Execute()
+ {
+ (obj_.*memFun_)(p1_, p2_);
+ }
+
+ protected:
+ ObjScopeGuardImpl2(Obj& obj, MemFun memFun, P1 p1, P2 p2) : obj_(obj), memFun_(memFun), p1_(p1), p2_(p2)
+ {}
+
+ Obj& obj_;
+ MemFun memFun_;
+ const P1 p1_;
+ const P2 p2_;
+ };
+
+ template <class Obj, typename MemFun, typename P1, typename P2>
+ inline ObjScopeGuardImpl2<Obj, MemFun, P1, P2> MakeObjGuard(Obj& obj, MemFun memFun, P1 p1, P2 p2)
+ {
+ return ObjScopeGuardImpl2<Obj, MemFun, P1, P2>::MakeObjGuard(obj, memFun, p1, p2);
+ }
+
+ template <typename Ret, class Obj1, class Obj2, typename P1a, typename P1b, typename P2a, typename P2b>
+ inline ObjScopeGuardImpl2<Obj1,Ret(Obj2::*)(P1a,P2a),P1b,P2b> MakeGuard(Ret(Obj2::*memFun)(P1a,P2a), Obj1 &obj, P1b p1, P2b p2)
+ {
+ return ObjScopeGuardImpl2<Obj1,Ret(Obj2::*)(P1a,P2a),P1b,P2b>::MakeObjGuard(obj,memFun,p1,p2);
+ }
+
+ template <typename Ret, class Obj1, class Obj2, typename P1a, typename P1b, typename P2a, typename P2b>
+ inline ObjScopeGuardImpl2<Obj1,Ret(Obj2::*)(P1a,P2a),P1b,P2b> MakeGuard(Ret(Obj2::*memFun)(P1a,P2a), Obj1 *obj, P1b p1, P2b p2)
+ {
+ return ObjScopeGuardImpl2<Obj1,Ret(Obj2::*)(P1a,P2a),P1b,P2b>::MakeObjGuard(*obj,memFun,p1,p2);
+ }
+
+} // namespace mongo
+
+#define LOKI_CONCATENATE_DIRECT(s1, s2) s1##s2
+#define LOKI_CONCATENATE(s1, s2) LOKI_CONCATENATE_DIRECT(s1, s2)
+#define LOKI_ANONYMOUS_VARIABLE(str) LOKI_CONCATENATE(str, __LINE__)
+
+#define ON_BLOCK_EXIT ScopeGuard LOKI_ANONYMOUS_VARIABLE(scopeGuard) = MakeGuard
+#define ON_BLOCK_EXIT_OBJ ScopeGuard LOKI_ANONYMOUS_VARIABLE(scopeGuard) = MakeObjGuard
+
+#endif //LOKI_SCOPEGUARD_H_
diff --git a/src/mongo/util/signal_handlers.cpp b/src/mongo/util/signal_handlers.cpp
new file mode 100644
index 00000000000..0e9ec7a9b15
--- /dev/null
+++ b/src/mongo/util/signal_handlers.cpp
@@ -0,0 +1,122 @@
+// signal_handlers.cpp
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include <cstdarg>
+#include <cstdio>
+#include <cstdlib>
+
+#if !defined(_WIN32) // TODO: windows support
+#include <unistd.h>
+#endif
+
+#if !defined(_WIN32) && !defined(NOEXECINFO)
+#include <execinfo.h>
+#endif
+
+#include "log.h"
+#include "signal_handlers.h"
+
+namespace mongo {
+
+ /*
+ * WARNING: PLEASE READ BEFORE CHANGING THIS MODULE
+ *
+ * All code in this module should be signal-friendly. Before adding any system
+ * call or other dependency, please make sure that still holds.
+ *
+ */
+
+ static int rawWrite( int fd , char* c , int size ) {
+#if !defined(_WIN32)
+
+ int toWrite = size;
+ int writePos = 0;
+ int wrote;
+ while ( toWrite > 0 ) {
+ wrote = write( fd , &c[writePos] , toWrite );
+ if ( wrote < 1 ) break;
+ toWrite -= wrote;
+ writePos += wrote;
+ }
+ return writePos;
+
+#else
+
+ return -1;
+
+#endif
+ }
+
+ static int formattedWrite( int fd , const char* format, ... ) {
+ const int MAX_ENTRY = 256;
+ static char entryBuf[MAX_ENTRY];
+
+ va_list ap;
+ va_start( ap , format );
+ int entrySize = vsnprintf( entryBuf , MAX_ENTRY-1 , format , ap );
+ if ( entrySize < 0 ) {
+ return -1;
+ }
+
+ if ( rawWrite( fd , entryBuf , entrySize ) < 0 ) {
+ return -1;
+ }
+
+ return 0;
+ }
+
+ static void formattedBacktrace( int fd ) {
+
+#if !defined(_WIN32) && !defined(NOEXECINFO)
+
+ int numFrames;
+ const int MAX_DEPTH = 20;
+ void* stackFrames[MAX_DEPTH];
+
+ numFrames = backtrace( stackFrames , MAX_DEPTH );
+ for ( int i = 0; i < numFrames; i++ ) {
+ formattedWrite( fd , "%p " , stackFrames[i] );
+ }
+ formattedWrite( fd , "\n" );
+
+ backtrace_symbols_fd( stackFrames , numFrames , fd );
+
+#else
+
+ formattedWrite( fd, "backtracing not implemented for this platform yet\n" );
+
+#endif
+
+ }
+
+ void printStackAndExit( int signalNum ) {
+ int fd = Logstream::getLogDesc();
+
+ if ( fd >= 0 ) {
+ formattedWrite( fd , "Received signal %d\n" , signalNum );
+ formattedWrite( fd , "Backtrace: " );
+ formattedBacktrace( fd );
+ formattedWrite( fd , "===\n" );
+ }
+
+ ::exit( EXIT_ABRUPT );
+ }
+
+} // namespace mongo
diff --git a/src/mongo/util/signal_handlers.h b/src/mongo/util/signal_handlers.h
new file mode 100644
index 00000000000..9d3a735a723
--- /dev/null
+++ b/src/mongo/util/signal_handlers.h
@@ -0,0 +1,34 @@
+// signal_handlers.h
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+
+namespace mongo {
+
+ /**
+ * Obtains the log file handle and writes the current thread's stack trace to
+ * it. This call issues an exit(). The function can safely be called from within a
+ * signal handler.
+ *
+ * @param signalNum the signal that this handler is called for
+ */
+ void printStackAndExit( int signalNum );
+
+} // namespace mongo
diff --git a/src/mongo/util/string_writer.h b/src/mongo/util/string_writer.h
new file mode 100755
index 00000000000..e83881bf6f6
--- /dev/null
+++ b/src/mongo/util/string_writer.h
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+namespace mongo {
+
+ class StringWriter {
+ public:
+ virtual ~StringWriter() {};
+ virtual void writeString(stringstream &ss) const = 0;
+ };
+}
diff --git a/src/mongo/util/stringutils.cpp b/src/mongo/util/stringutils.cpp
new file mode 100644
index 00000000000..229f57bb3cb
--- /dev/null
+++ b/src/mongo/util/stringutils.cpp
@@ -0,0 +1,44 @@
+// stringutils.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+namespace mongo {
+
+ void splitStringDelim( const string& str , vector<string>* res , char delim ) {
+ if ( str.empty() )
+ return;
+
+ size_t beg = 0;
+ size_t pos = str.find( delim );
+ while ( pos != string::npos ) {
+ res->push_back( str.substr( beg, pos - beg) );
+ beg = ++pos;
+ pos = str.find( delim, beg );
+ }
+ res->push_back( str.substr( beg ) );
+ }
+
+ void joinStringDelim( const vector<string>& strs , string* res , char delim ) {
+ for ( vector<string>::const_iterator it = strs.begin(); it != strs.end(); ++it ) {
+ if ( it !=strs.begin() ) res->push_back( delim );
+ res->append( *it );
+ }
+ }
+
+} // namespace mongo
diff --git a/src/mongo/util/stringutils.h b/src/mongo/util/stringutils.h
new file mode 100644
index 00000000000..93598aa520b
--- /dev/null
+++ b/src/mongo/util/stringutils.h
@@ -0,0 +1,139 @@
+// stringutils.h
+
+/* Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace mongo {
+
+ // see also mongoutils/str.h - perhaps move these there?
+ // see also text.h
+
+ void splitStringDelim( const string& str , vector<string>* res , char delim );
+
+ void joinStringDelim( const vector<string>& strs , string* res , char delim );
+
+ inline string tolowerString( const string& input ) {
+ string::size_type sz = input.size();
+
+ boost::scoped_array<char> line(new char[sz+1]);
+ char * copy = line.get();
+
+ for ( string::size_type i=0; i<sz; i++ ) {
+ char c = input[i];
+ copy[i] = (char)tolower( (int)c );
+ }
+ copy[sz] = 0;
+ return string(copy);
+ }
+
+ /**
+ * Non numeric characters are compared lexicographically; numeric substrings
+ * are compared numerically; dots separate ordered comparable subunits.
+ * For convenience, character 255 is greater than anything else.
+ */
+ inline int lexNumCmp( const char *s1, const char *s2 ) {
+ //cout << "START : " << s1 << "\t" << s2 << endl;
+
+ bool startWord = true;
+
+ while( *s1 && *s2 ) {
+
+ bool d1 = ( *s1 == '.' );
+ bool d2 = ( *s2 == '.' );
+ if ( d1 && !d2 )
+ return -1;
+ if ( d2 && !d1 )
+ return 1;
+ if ( d1 && d2 ) {
+ ++s1; ++s2;
+ startWord = true;
+ continue;
+ }
+
+ bool p1 = ( *s1 == (char)255 );
+ bool p2 = ( *s2 == (char)255 );
+ //cout << "\t\t " << p1 << "\t" << p2 << endl;
+ if ( p1 && !p2 )
+ return 1;
+ if ( p2 && !p1 )
+ return -1;
+
+ bool n1 = isNumber( *s1 );
+ bool n2 = isNumber( *s2 );
+
+ if ( n1 && n2 ) {
+ // get rid of leading 0s
+ if ( startWord ) {
+ while ( *s1 == '0' ) s1++;
+ while ( *s2 == '0' ) s2++;
+ }
+
+ char * e1 = (char*)s1;
+ char * e2 = (char*)s2;
+
+ // find length
+ // if end of string, will break immediately ('\0')
+ while ( isNumber (*e1) ) e1++;
+ while ( isNumber (*e2) ) e2++;
+
+ int len1 = (int)(e1-s1);
+ int len2 = (int)(e2-s2);
+
+ int result;
+ // if one is longer than the other, return
+ if ( len1 > len2 ) {
+ return 1;
+ }
+ else if ( len2 > len1 ) {
+ return -1;
+ }
+ // if the lengths are equal, just strcmp
+ else if ( (result = strncmp(s1, s2, len1)) != 0 ) {
+ return result;
+ }
+
+ // otherwise, the numbers are equal
+ s1 = e1;
+ s2 = e2;
+ startWord = false;
+ continue;
+ }
+
+ if ( n1 )
+ return 1;
+
+ if ( n2 )
+ return -1;
+
+ if ( *s1 > *s2 )
+ return 1;
+
+ if ( *s2 > *s1 )
+ return -1;
+
+ s1++; s2++;
+ startWord = false;
+ }
+
+ if ( *s1 )
+ return 1;
+ if ( *s2 )
+ return -1;
+ return 0;
+ }
+
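+ /* Illustrative expectations (not part of this header):
+
+ lexNumCmp( "a2", "a10" ) < 0 // 2 < 10 numerically
+ lexNumCmp( "a.b", "ab" ) < 0 // '.' sorts before other characters
+ lexNumCmp( "011", "11" ) == 0 // leading zeros ignored at the start of a word
+ */
+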
+} // namespace mongo
diff --git a/src/mongo/util/systeminfo.h b/src/mongo/util/systeminfo.h
new file mode 100755
index 00000000000..be4404ff785
--- /dev/null
+++ b/src/mongo/util/systeminfo.h
@@ -0,0 +1,41 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <cstddef>
+
+namespace mongo {
+
+ class SystemInfo {
+ public:
+ /*
+ Get the amount of physical memory available on the host.
+
+ This should only be used for "advisory" purposes, and not as a hard
+ value, because this could be deceptive on virtual hosts, and because
+ this will return zero on platforms that do not support it.
+
+ @returns amount of physical memory, or zero
+ */
+ static size_t getPhysicalRam();
+
+ private:
+ // don't instantiate this class
+ SystemInfo(); // no implementation
+ };
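+
+ /* Illustrative: size_t ram = SystemInfo::getPhysicalRam(); // 0 when unsupported */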
+
+}
diff --git a/src/mongo/util/systeminfo_linux2.cpp b/src/mongo/util/systeminfo_linux2.cpp
new file mode 100755
index 00000000000..c1b7c861768
--- /dev/null
+++ b/src/mongo/util/systeminfo_linux2.cpp
@@ -0,0 +1,47 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "util/systeminfo.h"
+
+#include <unistd.h>
+
+namespace mongo {
+
+ size_t SystemInfo::getPhysicalRam() {
+ /*
+ The value of this should not be changing while the system is running,
+ so it should be safe to do this once for the lifetime of the
+ application.
+
+ This could present a race condition if multiple threads do this at
+ the same time, but all paths through here will produce the same
+ result, so it's not worth locking or worrying about it.
+ */
+ static bool unknown = true;
+ static size_t ramSize = 0;
+
+ if (unknown) {
+            long pages = sysconf(_SC_PHYS_PAGES);
+            long page_size = sysconf(_SC_PAGE_SIZE);
+            // sysconf() returns -1 on error; leave ramSize at zero ("unknown") in that case
+            if (pages > 0 && page_size > 0)
+                ramSize = static_cast<size_t>(pages) * static_cast<size_t>(page_size);
+ unknown = false;
+ }
+
+ return ramSize;
+ }
+
+}
diff --git a/src/mongo/util/systeminfo_none.cpp b/src/mongo/util/systeminfo_none.cpp
new file mode 100755
index 00000000000..d22ce17f6b9
--- /dev/null
+++ b/src/mongo/util/systeminfo_none.cpp
@@ -0,0 +1,26 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "util/systeminfo.h"
+
+namespace mongo {
+
+ size_t SystemInfo::getPhysicalRam() {
+ return 0;
+ }
+
+}
diff --git a/src/mongo/util/systeminfo_win32.cpp b/src/mongo/util/systeminfo_win32.cpp
new file mode 100755
index 00000000000..19c182878ee
--- /dev/null
+++ b/src/mongo/util/systeminfo_win32.cpp
@@ -0,0 +1,48 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "util/systeminfo.h"
+
+#include <windows.h>
+
+namespace mongo {
+
+ size_t SystemInfo::getPhysicalRam() {
+ /*
+ The value of this should not be changing while the system is running,
+ so it should be safe to do this once for the lifetime of the
+ application.
+
+ This could present a race condition if multiple threads do this at
+ the same time, but all paths through here will produce the same
+ result, so it's not worth locking or worrying about it.
+ */
+ static bool unknown = true;
+ static size_t ramSize = 0;
+
+ if (unknown) {
+ MEMORYSTATUSEX status;
+ status.dwLength = sizeof(status);
+            // GlobalMemoryStatusEx() returns zero on failure; leave ramSize at zero in that case
+            if (GlobalMemoryStatusEx(&status))
+                ramSize = static_cast<size_t>(status.ullTotalPhys);
+ unknown = false;
+ }
+
+ return ramSize;
+ }
+
+}
diff --git a/src/mongo/util/text.cpp b/src/mongo/util/text.cpp
new file mode 100644
index 00000000000..51a2556afdc
--- /dev/null
+++ b/src/mongo/util/text.cpp
@@ -0,0 +1,115 @@
+// text.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "text.h"
+#include "unittest.h"
+
+namespace mongo {
+
+ inline int leadingOnes(unsigned char c) {
+ if (c < 0x80) return 0;
+ static const char _leadingOnes[128] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 - 0x8F
+            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90 - 0x9F
+            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xA0 - 0xAF
+            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xB0 - 0xBF
+            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0 - 0xCF
+            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0 - 0xDF
+            3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0 - 0xEF
+ 4, 4, 4, 4, 4, 4, 4, 4, // 0xF0 - 0xF7
+ 5, 5, 5, 5, // 0xF8 - 0xFB
+ 6, 6, // 0xFC - 0xFD
+ 7, // 0xFE
+ 8, // 0xFF
+ };
+        return _leadingOnes[c & 0x7f];
+    }
+
+ bool isValidUTF8(const char *s) {
+ int left = 0; // how many bytes are left in the current codepoint
+ while (*s) {
+ const unsigned char c = (unsigned char) *(s++);
+ const int ones = leadingOnes(c);
+ if (left) {
+ if (ones != 1) return false; // should be a continuation byte
+ left--;
+ }
+ else {
+ if (ones == 0) continue; // ASCII byte
+ if (ones == 1) return false; // unexpected continuation byte
+                if (c > 0xF4) return false; // codepoint too large (max is U+10FFFF)
+ if (c == 0xC0 || c == 0xC1) return false; // codepoints <= 0x7F shouldn't be 2 bytes
+
+ // still valid
+ left = ones-1;
+ }
+ }
+ if (left!=0) return false; // string ended mid-codepoint
+ return true;
+ }
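+
+    /* Illustrative checks (a sketch of the rules above, not a full test suite):
+         isValidUTF8( "abc" )      == true    // plain ASCII
+         isValidUTF8( "\xC3\xA9" ) == true    // U+00E9, a two-byte sequence
+         isValidUTF8( "\xC3" )     == false   // string ends mid-codepoint
+         isValidUTF8( "\xC0\xAF" ) == false   // overlong two-byte encoding
+    */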
+
+#if defined(_WIN32)
+
+ std::string toUtf8String(const std::wstring& wide) {
+ if (wide.size() > boost::integer_traits<int>::const_max)
+ throw std::length_error(
+ "Wide string cannot be more than INT_MAX characters long.");
+ if (wide.size() == 0)
+ return "";
+
+ // Calculate necessary buffer size
+ int len = ::WideCharToMultiByte(
+ CP_UTF8, 0, wide.c_str(), static_cast<int>(wide.size()),
+ NULL, 0, NULL, NULL);
+
+ // Perform actual conversion
+ if (len > 0) {
+ std::vector<char> buffer(len);
+ len = ::WideCharToMultiByte(
+ CP_UTF8, 0, wide.c_str(), static_cast<int>(wide.size()),
+ &buffer[0], static_cast<int>(buffer.size()), NULL, NULL);
+ if (len > 0) {
+ assert(len == static_cast<int>(buffer.size()));
+ return std::string(&buffer[0], buffer.size());
+ }
+ }
+
+ throw boost::system::system_error(
+ ::GetLastError(), boost::system::system_category);
+ }
+
+#if defined(_UNICODE)
+ std::wstring toWideString(const char *s) {
+ std::basic_ostringstream<TCHAR> buf;
+ buf << s;
+ return buf.str();
+ }
+#endif
+
+#endif
+
+ struct TextUnitTest : public UnitTest {
+ void run() {
+ assert( parseLL("123") == 123 );
+ assert( parseLL("-123000000000") == -123000000000LL );
+ }
+ } textUnitTest;
+
+}
+
diff --git a/src/mongo/util/text.h b/src/mongo/util/text.h
new file mode 100644
index 00000000000..bf25c86fd39
--- /dev/null
+++ b/src/mongo/util/text.h
@@ -0,0 +1,148 @@
+// text.h
+/*
+ * Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace mongo {
+
+ class StringSplitter {
+ public:
+ /** @param big the string to be split
+ @param splitter the delimiter
+ */
+ StringSplitter( const char * big , const char * splitter )
+ : _big( big ) , _splitter( splitter ) {
+ }
+
+ /** @return true if more to be taken via next() */
+ bool more() {
+ return _big[0] != 0;
+ }
+
+ /** get next split string fragment */
+ string next() {
+ const char * foo = strstr( _big , _splitter );
+ if ( foo ) {
+ string s( _big , foo - _big );
+                // advance past the full delimiter, then skip any immediately repeated delimiters
+                _big = foo + strlen( _splitter );
+                while ( *_big && strstr( _big , _splitter ) == _big )
+                    _big += strlen( _splitter );
+ return s;
+ }
+
+ string s = _big;
+ _big += strlen( _big );
+ return s;
+ }
+
+ void split( vector<string>& l ) {
+ while ( more() ) {
+ l.push_back( next() );
+ }
+ }
+
+ vector<string> split() {
+ vector<string> l;
+ split( l );
+ return l;
+ }
+
+ static vector<string> split( const string& big , const string& splitter ) {
+ StringSplitter ss( big.c_str() , splitter.c_str() );
+ return ss.split();
+ }
+
+ static string join( vector<string>& l , const string& split ) {
+ stringstream ss;
+ for ( unsigned i=0; i<l.size(); i++ ) {
+ if ( i > 0 )
+ ss << split;
+ ss << l[i];
+ }
+ return ss.str();
+ }
+
+ private:
+ const char * _big;
+ const char * _splitter;
+ };
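+
+    /* Illustrative use (a sketch):
+
+       vector<string> parts = StringSplitter::split( "a,b,c" , "," );
+       // parts == { "a", "b", "c" }
+       string joined = StringSplitter::join( parts , "," );   // "a,b,c"
+    */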
+
+ /* This doesn't defend against ALL bad UTF8, but it will guarantee that the
+     * string can be converted to a sequence of codepoints. However, it doesn't
+ * guarantee that the codepoints are valid.
+ */
+ bool isValidUTF8(const char *s);
+    inline bool isValidUTF8(const string& s) { return isValidUTF8(s.c_str()); }
+
+#if defined(_WIN32)
+
+ std::string toUtf8String(const std::wstring& wide);
+
+ std::wstring toWideString(const char *s);
+
+ /* like toWideString but UNICODE macro sensitive */
+# if !defined(_UNICODE)
+ inline std::string toNativeString(const char *s) { return s; }
+# else
+ inline std::wstring toNativeString(const char *s) { return toWideString(s); }
+# endif
+
+#endif
+
+ // expect that n contains a base ten number and nothing else after it
+ // NOTE win version hasn't been tested directly
+ inline long long parseLL( const char *n ) {
+ long long ret;
+ uassert( 13307, "cannot convert empty string to long long", *n != 0 );
+#if !defined(_WIN32)
+ char *endPtr = 0;
+ errno = 0;
+ ret = strtoll( n, &endPtr, 10 );
+ uassert( 13305, "could not convert string to long long", *endPtr == 0 && errno == 0 );
+#elif _MSC_VER>=1600 // 1600 is VS2k10 1500 is VS2k8
+ size_t endLen = 0;
+ try {
+ ret = stoll( n, &endLen, 10 );
+ }
+ catch ( ... ) {
+ endLen = 0;
+ }
+ uassert( 13306, "could not convert string to long long", endLen != 0 && n[ endLen ] == 0 );
+#else // stoll() wasn't introduced until VS 2010.
+ char* endPtr = 0;
+ ret = _strtoi64( n, &endPtr, 10 );
+ uassert( 13310, "could not convert string to long long", (*endPtr == 0) && (ret != _I64_MAX) && (ret != _I64_MIN) );
+#endif // !defined(_WIN32)
+ return ret;
+ }
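+
+    /* Illustrative use (a sketch; parseLL uasserts on malformed input):
+
+       long long n = parseLL( "-123000000000" );   // ok
+       // parseLL( "12a" );   // would fail the uassert: trailing characters
+    */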
+}
diff --git a/src/mongo/util/time_support.h b/src/mongo/util/time_support.h
new file mode 100644
index 00000000000..18181eb805a
--- /dev/null
+++ b/src/mongo/util/time_support.h
@@ -0,0 +1,255 @@
+// @file time_support.h
+
+/* Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdio> // sscanf
+#include <ctime>
+#include <boost/date_time/posix_time/posix_time.hpp>
+#include <boost/thread/xtime.hpp>
+#undef assert
+#define assert MONGO_assert
+
+namespace mongo {
+
+ inline void time_t_to_Struct(time_t t, struct tm * buf , bool local = false ) {
+#if defined(_WIN32)
+ if ( local )
+ localtime_s( buf , &t );
+ else
+ gmtime_s(buf, &t);
+#else
+ if ( local )
+ localtime_r(&t, buf);
+ else
+ gmtime_r(&t, buf);
+#endif
+ }
+
+ // uses ISO 8601 dates without trailing Z
+ // colonsOk should be false when creating filenames
+ inline string terseCurrentTime(bool colonsOk=true) {
+ struct tm t;
+ time_t_to_Struct( time(0) , &t );
+
+ const char* fmt = (colonsOk ? "%Y-%m-%dT%H:%M:%S" : "%Y-%m-%dT%H-%M-%S");
+ char buf[32];
+ assert(strftime(buf, sizeof(buf), fmt, &t) == 19);
+ return buf;
+ }
+
+ inline string timeToISOString(time_t time) {
+ struct tm t;
+ time_t_to_Struct( time, &t );
+
+ const char* fmt = "%Y-%m-%dT%H:%M:%SZ";
+ char buf[32];
+ assert(strftime(buf, sizeof(buf), fmt, &t) == 20);
+ return buf;
+ }
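+
+    /* Illustrative outputs (a sketch, using an arbitrary example instant):
+       terseCurrentTime()      -> "2011-09-28T14:03:07"
+       terseCurrentTime(false) -> "2011-09-28T14-03-07"   (filename safe)
+       timeToISOString(t)      -> "2011-09-28T14:03:07Z"
+    */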
+
+ inline boost::gregorian::date currentDate() {
+ boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
+ return now.date();
+ }
+
+ // parses time of day in "hh:mm" format assuming 'hh' is 00-23
+ inline bool toPointInTime( const string& str , boost::posix_time::ptime* timeOfDay ) {
+ int hh = 0;
+ int mm = 0;
+ if ( 2 != sscanf( str.c_str() , "%d:%d" , &hh , &mm ) ) {
+ return false;
+ }
+
+ // verify that time is well formed
+ if ( ( hh / 24 ) || ( mm / 60 ) ) {
+ return false;
+ }
+
+ boost::posix_time::ptime res( currentDate() , boost::posix_time::hours( hh ) + boost::posix_time::minutes( mm ) );
+ *timeOfDay = res;
+ return true;
+ }
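+
+    /* Illustrative use (a sketch):
+
+       boost::posix_time::ptime t;
+       bool ok = toPointInTime( "13:45" , &t );   // true; today at 13:45
+       ok = toPointInTime( "25:00" , &t );        // false; hour out of range
+    */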
+
+#define MONGO_asctime _asctime_not_threadsafe_
+#define asctime MONGO_asctime
+#define MONGO_gmtime _gmtime_not_threadsafe_
+#define gmtime MONGO_gmtime
+#define MONGO_localtime _localtime_not_threadsafe_
+#define localtime MONGO_localtime
+#define MONGO_ctime _ctime_is_not_threadsafe_
+#define ctime MONGO_ctime
+
+#if defined(_WIN32)
+ inline void sleepsecs(int s) {
+ // todo : add an assert here that we are not locked in d.dbMutex. there may be debugging things where we
+ // are but otherwise it's quite likely that would be a mistake.
+ Sleep(s*1000);
+ }
+ inline void sleepmillis(long long s) {
+ assert( s <= 0xffffffff );
+ Sleep((DWORD) s);
+ }
+ inline void sleepmicros(long long s) {
+ if ( s <= 0 )
+ return;
+ boost::xtime xt;
+ boost::xtime_get(&xt, boost::TIME_UTC);
+ xt.sec += (int)( s / 1000000 );
+ xt.nsec += (int)(( s % 1000000 ) * 1000);
+ if ( xt.nsec >= 1000000000 ) {
+ xt.nsec -= 1000000000;
+ xt.sec++;
+ }
+ boost::thread::sleep(xt);
+ }
+#elif defined(__sunos__)
+ inline void sleepsecs(int s) {
+ boost::xtime xt;
+ boost::xtime_get(&xt, boost::TIME_UTC);
+ xt.sec += s;
+ boost::thread::sleep(xt);
+ }
+ inline void sleepmillis(long long s) {
+ boost::xtime xt;
+ boost::xtime_get(&xt, boost::TIME_UTC);
+ xt.sec += (int)( s / 1000 );
+ xt.nsec += (int)(( s % 1000 ) * 1000000);
+ if ( xt.nsec >= 1000000000 ) {
+ xt.nsec -= 1000000000;
+ xt.sec++;
+ }
+ boost::thread::sleep(xt);
+ }
+ inline void sleepmicros(long long s) {
+ if ( s <= 0 )
+ return;
+ boost::xtime xt;
+ boost::xtime_get(&xt, boost::TIME_UTC);
+ xt.sec += (int)( s / 1000000 );
+ xt.nsec += (int)(( s % 1000000 ) * 1000);
+ if ( xt.nsec >= 1000000000 ) {
+ xt.nsec -= 1000000000;
+ xt.sec++;
+ }
+ boost::thread::sleep(xt);
+ }
+#else
+ inline void sleepsecs(int s) {
+ struct timespec t;
+ t.tv_sec = s;
+ t.tv_nsec = 0;
+ if ( nanosleep( &t , 0 ) ) {
+ cout << "nanosleep failed" << endl;
+ }
+ }
+ inline void sleepmicros(long long s) {
+ if ( s <= 0 )
+ return;
+ struct timespec t;
+ t.tv_sec = (int)(s / 1000000);
+ t.tv_nsec = 1000 * ( s % 1000000 );
+ struct timespec out;
+ if ( nanosleep( &t , &out ) ) {
+ cout << "nanosleep failed" << endl;
+ }
+ }
+ inline void sleepmillis(long long s) {
+ sleepmicros( s * 1000 );
+ }
+#endif
+
+ extern long long jsTime_virtual_skew;
+ extern boost::thread_specific_ptr<long long> jsTime_virtual_thread_skew;
+
+ // DO NOT TOUCH except for testing
+ inline void jsTimeVirtualSkew( long long skew ){
+ jsTime_virtual_skew = skew;
+ }
+ inline long long getJSTimeVirtualSkew(){
+ return jsTime_virtual_skew;
+ }
+
+ inline void jsTimeVirtualThreadSkew( long long skew ){
+ jsTime_virtual_thread_skew.reset(new long long(skew));
+ }
+ inline long long getJSTimeVirtualThreadSkew(){
+ if(jsTime_virtual_thread_skew.get()){
+ return *(jsTime_virtual_thread_skew.get());
+ }
+ else return 0;
+ }
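+
+    /* Illustrative use in tests (a sketch; skews are in milliseconds):
+       jsTimeVirtualSkew( 5000 );         // shift jsTime() 5 seconds forward globally
+       jsTimeVirtualThreadSkew( -1000 );  // and 1 second back for this thread only
+    */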
+
+ /** Date_t is milliseconds since epoch */
+ inline Date_t jsTime();
+
+ /** warning this will wrap */
+ inline unsigned curTimeMicros();
+
+ inline unsigned long long curTimeMicros64();
+#ifdef _WIN32 // no gettimeofday on windows
+ inline unsigned long long curTimeMillis64() {
+ boost::xtime xt;
+ boost::xtime_get(&xt, boost::TIME_UTC);
+ return ((unsigned long long)xt.sec) * 1000 + xt.nsec / 1000000;
+ }
+ inline Date_t jsTime() {
+ boost::xtime xt;
+ boost::xtime_get(&xt, boost::TIME_UTC);
+ unsigned long long t = xt.nsec / 1000000;
+ return ((unsigned long long) xt.sec * 1000) + t + getJSTimeVirtualSkew() + getJSTimeVirtualThreadSkew();
+ }
+ inline unsigned long long curTimeMicros64() {
+ boost::xtime xt;
+ boost::xtime_get(&xt, boost::TIME_UTC);
+ unsigned long long t = xt.nsec / 1000;
+ return (((unsigned long long) xt.sec) * 1000000) + t;
+ }
+ inline unsigned curTimeMicros() {
+ boost::xtime xt;
+ boost::xtime_get(&xt, boost::TIME_UTC);
+ unsigned t = xt.nsec / 1000;
+ unsigned secs = xt.sec % 1024;
+ return secs*1000000 + t;
+ }
+#else
+# include <sys/time.h>
+ inline unsigned long long curTimeMillis64() {
+ timeval tv;
+ gettimeofday(&tv, NULL);
+ return ((unsigned long long)tv.tv_sec) * 1000 + tv.tv_usec / 1000;
+ }
+ inline Date_t jsTime() {
+ timeval tv;
+ gettimeofday(&tv, NULL);
+ unsigned long long t = tv.tv_usec / 1000;
+ return ((unsigned long long) tv.tv_sec * 1000) + t + getJSTimeVirtualSkew() + getJSTimeVirtualThreadSkew();
+ }
+ inline unsigned long long curTimeMicros64() {
+ timeval tv;
+ gettimeofday(&tv, NULL);
+ return (((unsigned long long) tv.tv_sec) * 1000*1000) + tv.tv_usec;
+ }
+ inline unsigned curTimeMicros() {
+ timeval tv;
+ gettimeofday(&tv, NULL);
+ unsigned secs = tv.tv_sec % 1024;
+ return secs*1000*1000 + tv.tv_usec;
+ }
+#endif
+
+} // namespace mongo
diff --git a/src/mongo/util/timer.h b/src/mongo/util/timer.h
new file mode 100644
index 00000000000..224651ac224
--- /dev/null
+++ b/src/mongo/util/timer.h
@@ -0,0 +1,115 @@
+// @file timer.h
+
+/* Copyright 2010 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "time_support.h"
+
+namespace mongo {
+
+#if !defined(_WIN32)
+
+ /**
+ * simple scoped timer
+ */
+ class Timer /*copyable*/ {
+ public:
+ Timer() { reset(); }
+ int seconds() const { return (int)(micros() / 1000000); }
+ int millis() const { return (int)(micros() / 1000); }
+ int minutes() const { return seconds() / 60; }
+
+
+ /** gets time interval and resets at the same time. this way we can call curTimeMicros
+ once instead of twice if one wanted millis() and then reset().
+ @return time in millis
+ */
+ int millisReset() {
+ unsigned long long now = curTimeMicros64();
+ int m = (int)((now-old)/1000);
+ old = now;
+ return m;
+ }
+
+        // note: it is dubious that the resolution is anywhere near as high as the method name implies!
+ unsigned long long micros() const {
+ unsigned long long n = curTimeMicros64();
+ return n - old;
+ }
+ unsigned long long micros(unsigned long long & n) const { // returns cur time in addition to timer result
+ n = curTimeMicros64();
+ return n - old;
+ }
+
+ void reset() { old = curTimeMicros64(); }
+ private:
+ unsigned long long old;
+ };
+
+#else
+
+ class Timer /*copyable*/ {
+ public:
+ Timer() { reset(); }
+
+ int seconds() const {
+ int s = static_cast<int>((now() - old) / countsPerSecond);
+ return s;
+ }
+
+ int millis() const {
+ return (int)
+ ((now() - old) * 1000.0 / countsPerSecond);
+ }
+
+ int minutes() const { return seconds() / 60; }
+
+ /** gets time interval and resets at the same time. this way we can call curTimeMicros
+ once instead of twice if one wanted millis() and then reset().
+ @return time in millis
+ */
+ int millisReset() {
+ unsigned long long nw = now();
+ int m = static_cast<int>((nw - old) * 1000.0 / countsPerSecond);
+ old = nw;
+ return m;
+ }
+
+ void reset() {
+ old = now();
+ }
+
+ unsigned long long micros() const {
+ return (unsigned long long)
+ ((now() - old) * 1000000.0 / countsPerSecond);
+ }
+
+ static unsigned long long countsPerSecond;
+
+ private:
+ unsigned long long now() const {
+ LARGE_INTEGER i;
+ QueryPerformanceCounter(&i);
+ return i.QuadPart;
+ }
+
+ unsigned long long old;
+ };
+
+#endif
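+
+    /* Illustrative use of Timer (a sketch; both variants share this interface):
+
+       Timer t;
+       // ... do some work ...
+       int ms = t.millis();         // elapsed milliseconds since construction
+       int m2 = t.millisReset();    // elapsed millis, and restarts the interval
+    */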
+
+} // namespace mongo
diff --git a/src/mongo/util/unittest.h b/src/mongo/util/unittest.h
new file mode 100644
index 00000000000..94be444363f
--- /dev/null
+++ b/src/mongo/util/unittest.h
@@ -0,0 +1,62 @@
+// unittest.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace mongo {
+
+ /* The idea here is to let all initialization of global variables (classes inheriting from UnitTest)
+       complete before we run the tests -- otherwise the arbitrary order of initialization may mess
+       us up. The app's main() function should call runTests().
+
+       To define a unit test, inherit from this and implement run(). Instantiate one object of the new
+       class as a global.
+
+       These tests are run on *every* startup of mongod, so they have to be very lightweight. But it is a
+ good quick check for a bad build.
+ */
+ struct UnitTest {
+ UnitTest() {
+ registerTest(this);
+ }
+ virtual ~UnitTest() {}
+
+ // assert if fails
+ virtual void run() = 0;
+
+ static bool testsInProgress() { return running; }
+ private:
+ static vector<UnitTest*> *tests;
+ static bool running;
+ public:
+ static void registerTest(UnitTest *t) {
+ if ( tests == 0 )
+ tests = new vector<UnitTest*>();
+ tests->push_back(t);
+ }
+
+ static void runTests() {
+ running = true;
+ for ( vector<UnitTest*>::iterator i = tests->begin(); i != tests->end(); i++ ) {
+ (*i)->run();
+ }
+ running = false;
+ }
+ };
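+
+    /* Illustrative use (a sketch):
+
+       struct MyTest : public UnitTest {
+           void run() { assert( 1 + 1 == 2 ); }
+       } myTest;   // the global instance registers itself via the base constructor
+
+       // main() then calls UnitTest::runTests() after static initialization.
+    */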
+
+
+} // namespace mongo
diff --git a/src/mongo/util/util.cpp b/src/mongo/util/util.cpp
new file mode 100644
index 00000000000..356c640f449
--- /dev/null
+++ b/src/mongo/util/util.cpp
@@ -0,0 +1,220 @@
+// @file util.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "goodies.h"
+#include "unittest.h"
+#include "file_allocator.h"
+#include "optime.h"
+#include "time_support.h"
+#include "mongoutils/str.h"
+#include "timer.h"
+
+namespace mongo {
+
+#if defined(_WIN32)
+ unsigned long long Timer::countsPerSecond;
+ struct AtStartup {
+ AtStartup() {
+ LARGE_INTEGER x;
+ bool ok = QueryPerformanceFrequency(&x);
+ assert(ok);
+ Timer::countsPerSecond = x.QuadPart;
+ }
+ } atstartuputil;
+#endif
+
+ string hexdump(const char *data, unsigned len) {
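+        // note: emits at most the first 4 bytes of 'data', space-separated, in hex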
+ assert( len < 1000000 );
+ const unsigned char *p = (const unsigned char *) data;
+ stringstream ss;
+ for( unsigned i = 0; i < 4 && i < len; i++ ) {
+ ss << std::hex << setw(2) << setfill('0');
+ unsigned n = p[i];
+ ss << n;
+ ss << ' ';
+ }
+ string s = ss.str();
+ return s;
+ }
+
+ boost::thread_specific_ptr<string> _threadName;
+
+ unsigned _setThreadName( const char * name ) {
+ if ( ! name ) name = "NONE";
+
+ static unsigned N = 0;
+
+ if ( strcmp( name , "conn" ) == 0 ) {
+ string* x = _threadName.get();
+ if ( x && mongoutils::str::startsWith( *x , "conn" ) ) {
+ int n = atoi( x->c_str() + 4 );
+ if ( n > 0 )
+ return n;
+ warning() << "unexpected thread name [" << *x << "] parsed to " << n << endl;
+ }
+ unsigned n = ++N;
+ stringstream ss;
+ ss << name << n;
+ _threadName.reset( new string( ss.str() ) );
+ return n;
+ }
+
+ _threadName.reset( new string(name) );
+ return 0;
+ }
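+
+    /* Illustrative behavior (a sketch):
+       setThreadName( "conn" );   // names the thread "conn1", "conn2", ... and returns that number
+       setThreadName( "repl" );   // any other name is used as-is; returns 0
+    */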
+
+#if defined(_WIN32)
+#define MS_VC_EXCEPTION 0x406D1388
+#pragma pack(push,8)
+ typedef struct tagTHREADNAME_INFO {
+ DWORD dwType; // Must be 0x1000.
+ LPCSTR szName; // Pointer to name (in user addr space).
+ DWORD dwThreadID; // Thread ID (-1=caller thread).
+ DWORD dwFlags; // Reserved for future use, must be zero.
+ } THREADNAME_INFO;
+#pragma pack(pop)
+
+ void setWinThreadName(const char *name) {
+ /* is the sleep here necessary???
+ Sleep(10);
+ */
+ THREADNAME_INFO info;
+ info.dwType = 0x1000;
+ info.szName = name;
+ info.dwThreadID = -1;
+ info.dwFlags = 0;
+ __try {
+ RaiseException( MS_VC_EXCEPTION, 0, sizeof(info)/sizeof(ULONG_PTR), (ULONG_PTR*)&info );
+ }
+ __except(EXCEPTION_EXECUTE_HANDLER) {
+ }
+ }
+
+ unsigned setThreadName(const char *name) {
+ unsigned n = _setThreadName( name );
+#if !defined(_DEBUG)
+ // naming might be expensive so don't do "conn*" over and over
+ if( string("conn") == name )
+ return n;
+#endif
+ setWinThreadName(name);
+ return n;
+ }
+
+#else
+
+ unsigned setThreadName(const char * name ) {
+ return _setThreadName( name );
+ }
+
+#endif
+
+ string getThreadName() {
+ string * s = _threadName.get();
+ if ( s )
+ return *s;
+ return "";
+ }
+
+ vector<UnitTest*> *UnitTest::tests = 0;
+ bool UnitTest::running = false;
+
+ const char *default_getcurns() { return ""; }
+ const char * (*getcurns)() = default_getcurns;
+
+ int logLevel = 0;
+ int tlogLevel = 0;
+ mongo::mutex Logstream::mutex("Logstream");
+ int Logstream::doneSetup = Logstream::magicNumber();
+
+ bool isPrime(int n) {
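+        // note: only meaningful for n >= 2; as written, n <= 1 returns true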
+ int z = 2;
+ while ( 1 ) {
+ if ( z*z > n )
+ break;
+ if ( n % z == 0 )
+ return false;
+ z++;
+ }
+ return true;
+ }
+
+ int nextPrime(int n) {
+ n |= 1; // 2 goes to 3...don't care...
+ while ( !isPrime(n) )
+ n += 2;
+ return n;
+ }
+
+ struct UtilTest : public UnitTest {
+ void run() {
+ assert( isPrime(3) );
+ assert( isPrime(2) );
+ assert( isPrime(13) );
+ assert( isPrime(17) );
+ assert( !isPrime(9) );
+ assert( !isPrime(6) );
+ assert( nextPrime(4) == 5 );
+ assert( nextPrime(8) == 11 );
+
+ assert( endsWith("abcde", "de") );
+ assert( !endsWith("abcde", "dasdfasdfashkfde") );
+
+ assert( swapEndian(0x01020304) == 0x04030201 );
+
+ }
+ } utilTest;
+
+ OpTime OpTime::last(0, 0);
+
+ /* this is a good place to set a breakpoint when debugging, as lots of warning things
+ (assert, wassert) call it.
+ */
+ void sayDbContext(const char *errmsg) {
+ if ( errmsg ) {
+ problem() << errmsg << endl;
+ }
+ printStackTrace();
+ }
+
+    /* note: can't use malloc here - we may be in a signal handler.
+       logLockless() likely does not comply and should still be fixed (todo);
+       likewise class string?
+ */
+ void rawOut( const string &s ) {
+ if( s.empty() ) return;
+
+ char buf[64];
+ time_t_to_String( time(0) , buf );
+ /* truncate / don't show the year: */
+ buf[19] = ' ';
+ buf[20] = 0;
+
+ Logstream::logLockless(buf);
+ Logstream::logLockless(s);
+ Logstream::logLockless("\n");
+ }
+
+ ostream& operator<<( ostream &s, const ThreadSafeString &o ) {
+ s << o.toString();
+ return s;
+ }
+
+ bool StaticObserver::_destroyingStatics = false;
+
+} // namespace mongo
diff --git a/src/mongo/util/version.cpp b/src/mongo/util/version.cpp
new file mode 100644
index 00000000000..1e4bc457f91
--- /dev/null
+++ b/src/mongo/util/version.cpp
@@ -0,0 +1,288 @@
+// @file version.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include <cstdlib>
+#include <iostream>
+#include <iomanip>
+#include <sstream>
+#include <string>
+#include "unittest.h"
+#include "version.h"
+#include "stringutils.h"
+#include "../db/jsobj.h"
+#include "file.h"
+#include "ramlog.h"
+#include "../db/cmdline.h"
+
+namespace mongo {
+
+ /* Approved formats for versionString:
+ * 1.2.3
+ * 1.2.3-pre-
+ * 1.2.3-rc4 (up to rc9)
+ * 1.2.3-rc4-pre-
+ * If you really need to do something else you'll need to fix _versionArray()
+ */
+ const char versionString[] = "2.1.0-pre-";
+
+ // See unit test for example outputs
+ static BSONArray _versionArray(const char* version){
+ // this is inefficient, but cached so it doesn't matter
+ BSONArrayBuilder b;
+ string curPart;
+ const char* c = version;
+ int finalPart = 0; // 0 = final release, -100 = pre, -10 to -1 = -10 + X for rcX
+ do { //walks versionString including NUL byte
+ if (!(*c == '.' || *c == '-' || *c == '\0')){
+ curPart += *c;
+ continue;
+ }
+
+ try {
+ unsigned num = stringToNum(curPart.c_str());
+ b.append((int) num);
+ }
+ catch (...){ // not a number
+ if (curPart.empty()){
+ assert(*c == '\0');
+ break;
+ }
+ else if (startsWith(curPart, "rc")){
+ finalPart = -10 + stringToNum(curPart.c_str()+2);
+ break;
+ }
+ else if (curPart == "pre"){
+ finalPart = -100;
+ break;
+ }
+ }
+
+ curPart = "";
+ } while (*c++);
+
+ b.append(finalPart);
+ return b.arr();
+ }
+
+ const BSONArray versionArray = _versionArray(versionString);
+
+ string mongodVersion() {
+ stringstream ss;
+ ss << "db version v" << versionString << ", pdfile version " << PDFILE_VERSION << "." << PDFILE_VERSION_MINOR;
+ return ss.str();
+ }
+
+#ifndef _SCONS
+ // only works in scons
+ const char * gitVersion() { return "not-scons"; }
+#endif
+
+ void printGitVersion() { log() << "git version: " << gitVersion() << endl; }
+
+#ifndef _SCONS
+#if defined(_WIN32)
+ string sysInfo() {
+ stringstream ss;
+ ss << "not-scons win";
+ ss << " mscver:" << _MSC_FULL_VER << " built:" << __DATE__;
+ ss << " boostver:" << BOOST_VERSION;
+#if( !defined(_MT) )
+#error _MT is not defined
+#endif
+        ss << ((sizeof(char *) == 8) ? " 64bit" : " 32bit");
+ return ss.str();
+ }
+#else
+ string sysInfo() { return ""; }
+#endif
+#endif
+
+ void printSysInfo() {
+ log() << "build info: " << sysInfo() << endl;
+ }
+
+
+ static Tee * startupWarningsLog = new RamLog("startupWarnings"); //intentionally leaked
+
+ //
+ // system warnings
+ //
+ void show_warnings() {
+ // each message adds a leading and a trailing newline
+
+ bool warned = false;
+ {
+ const char * foo = strchr( versionString , '.' ) + 1;
+ int bar = atoi( foo );
+ if ( ( 2 * ( bar / 2 ) ) != bar ) {
+ log() << startupWarningsLog;
+ log() << "** NOTE: This is a development version (" << versionString << ") of MongoDB." << startupWarningsLog;
+ log() << "** Not recommended for production." << startupWarningsLog;
+ warned = true;
+ }
+ }
+
+ if ( sizeof(int*) == 4 ) {
+ log() << startupWarningsLog;
+ log() << "** NOTE: when using MongoDB 32 bit, you are limited to about 2 gigabytes of data" << startupWarningsLog;
+ log() << "** see http://blog.mongodb.org/post/137788967/32-bit-limitations" << startupWarningsLog;
+ log() << "** with --journal, the limit is lower" << startupWarningsLog;
+ warned = true;
+ }
+
+#ifdef __linux__
+ if (boost::filesystem::exists("/proc/vz") && !boost::filesystem::exists("/proc/bc")) {
+ log() << startupWarningsLog;
+ log() << "** WARNING: You are running in OpenVZ. This is known to be broken!!!" << startupWarningsLog;
+ warned = true;
+ }
+
+ if (boost::filesystem::exists("/sys/devices/system/node/node1")){
+ // We are on a box with a NUMA enabled kernel and more than 1 numa node (they start at node0)
+ // Now we look at the first line of /proc/self/numa_maps
+ //
+ // Bad example:
+ // $ cat /proc/self/numa_maps
+ // 00400000 default file=/bin/cat mapped=6 N4=6
+ //
+ // Good example:
+ // $ numactl --interleave=all cat /proc/self/numa_maps
+ // 00400000 interleave:0-7 file=/bin/cat mapped=6 N4=6
+
+ File f;
+ f.open("/proc/self/numa_maps", /*read_only*/true);
+ if ( f.is_open() && ! f.bad() ) {
+ char line[100]; //we only need the first line
+ if (read(f.fd, line, sizeof(line)) < 0){
+ warning() << "failed to read from /proc/self/numa_maps: " << errnoWithDescription() << startupWarningsLog;
+ warned = true;
+ }
+ else {
+ // just in case...
+ line[98] = ' ';
+ line[99] = '\0';
+
+ // skip over pointer
+ const char* space = strchr(line, ' ');
+
+ if ( ! space ) {
+ log() << startupWarningsLog;
+ log() << "** WARNING: cannot parse numa_maps" << startupWarningsLog;
+ warned = true;
+ }
+ else if ( ! startsWith(space+1, "interleave") ) {
+ log() << startupWarningsLog;
+ log() << "** WARNING: You are running on a NUMA machine." << startupWarningsLog;
+ log() << "** We suggest launching mongod like this to avoid performance problems:" << startupWarningsLog;
+ log() << "** numactl --interleave=all mongod [other options]" << startupWarningsLog;
+ warned = true;
+ }
+ }
+ }
+ }
+
+ if (cmdLine.dur){
+ fstream f ("/proc/sys/vm/overcommit_memory", ios_base::in);
+ unsigned val;
+ f >> val;
+
+ if (val == 2) {
+ log() << startupWarningsLog;
+ log() << "** WARNING: /proc/sys/vm/overcommit_memory is " << val << startupWarningsLog;
+ log() << "** Journaling works best with it set to 0 or 1" << startupWarningsLog;
+ }
+ }
+
+ if (boost::filesystem::exists("/proc/sys/vm/zone_reclaim_mode")){
+ fstream f ("/proc/sys/vm/zone_reclaim_mode", ios_base::in);
+ unsigned val;
+ f >> val;
+
+ if (val != 0) {
+ log() << startupWarningsLog;
+ log() << "** WARNING: /proc/sys/vm/zone_reclaim_mode is " << val << startupWarningsLog;
+ log() << "** We suggest setting it to 0" << startupWarningsLog;
+ log() << "** http://www.kernel.org/doc/Documentation/sysctl/vm.txt" << startupWarningsLog;
+ }
+ }
+#endif
+
+ if (warned) {
+ log() << startupWarningsLog;
+ }
+ }
+
+    int versionCmp(StringData lhs, StringData rhs) {
+        if (strcmp(lhs.data(), rhs.data()) == 0)
+            return 0;
+
+        // handle "1.2.3-" and "1.2.3-pre"
+        if (lhs.size() < rhs.size()) {
+            if (strncmp(lhs.data(), rhs.data(), lhs.size()) == 0 && rhs.data()[lhs.size()] == '-')
+                return +1;
+        }
+        else if (lhs.size() > rhs.size()) {
+            if (strncmp(lhs.data(), rhs.data(), rhs.size()) == 0 && lhs.data()[rhs.size()] == '-')
+                return -1;
+        }
+
+        return lexNumCmp(lhs.data(), rhs.data());
+ }
+
+ class VersionCmpTest : public UnitTest {
+ public:
+ void run() {
+ assert( versionCmp("1.2.3", "1.2.3") == 0 );
+ assert( versionCmp("1.2.3", "1.2.4") < 0 );
+ assert( versionCmp("1.2.3", "1.2.20") < 0 );
+ assert( versionCmp("1.2.3", "1.20.3") < 0 );
+ assert( versionCmp("2.2.3", "10.2.3") < 0 );
+ assert( versionCmp("1.2.3", "1.2.3-") > 0 );
+ assert( versionCmp("1.2.3", "1.2.3-pre") > 0 );
+ assert( versionCmp("1.2.3", "1.2.4-") < 0 );
+ assert( versionCmp("1.2.3-", "1.2.3") < 0 );
+ assert( versionCmp("1.2.3-pre", "1.2.3") < 0 );
+
+ log(1) << "versionCmpTest passed" << endl;
+ }
+ } versionCmpTest;
+
+ class VersionArrayTest : public UnitTest {
+ public:
+ void run() {
+ assert( _versionArray("1.2.3") == BSON_ARRAY(1 << 2 << 3 << 0) );
+ assert( _versionArray("1.2.0") == BSON_ARRAY(1 << 2 << 0 << 0) );
+ assert( _versionArray("2.0.0") == BSON_ARRAY(2 << 0 << 0 << 0) );
+
+ assert( _versionArray("1.2.3-pre-") == BSON_ARRAY(1 << 2 << 3 << -100) );
+ assert( _versionArray("1.2.0-pre-") == BSON_ARRAY(1 << 2 << 0 << -100) );
+ assert( _versionArray("2.0.0-pre-") == BSON_ARRAY(2 << 0 << 0 << -100) );
+
+ assert( _versionArray("1.2.3-rc0") == BSON_ARRAY(1 << 2 << 3 << -10) );
+ assert( _versionArray("1.2.0-rc1") == BSON_ARRAY(1 << 2 << 0 << -9) );
+ assert( _versionArray("2.0.0-rc2") == BSON_ARRAY(2 << 0 << 0 << -8) );
+
+ // Note that the pre of an rc is the same as the rc itself
+ assert( _versionArray("1.2.3-rc3-pre-") == BSON_ARRAY(1 << 2 << 3 << -7) );
+ assert( _versionArray("1.2.0-rc4-pre-") == BSON_ARRAY(1 << 2 << 0 << -6) );
+ assert( _versionArray("2.0.0-rc5-pre-") == BSON_ARRAY(2 << 0 << 0 << -5) );
+
+ log(1) << "versionArrayTest passed" << endl;
+ }
+ } versionArrayTest;
+}
diff --git a/src/mongo/util/version.h b/src/mongo/util/version.h
new file mode 100644
index 00000000000..64f8b140fd5
--- /dev/null
+++ b/src/mongo/util/version.h
@@ -0,0 +1,27 @@
+#ifndef UTIL_VERSION_HEADER
+#define UTIL_VERSION_HEADER
+
+#include <string>
+
+namespace mongo {
+ struct BSONArray;
+
+ using std::string;
+
+ // mongo version
+ extern const char versionString[];
+ extern const BSONArray versionArray;
+ string mongodVersion();
+    int versionCmp(StringData lhs, StringData rhs); // like strcmp
+
+ const char * gitVersion();
+ void printGitVersion();
+
+ string sysInfo();
+ void printSysInfo();
+
+ void show_warnings();
+
+} // namespace mongo
+
+#endif // UTIL_VERSION_HEADER
diff --git a/src/mongo/util/winutil.h b/src/mongo/util/winutil.h
new file mode 100644
index 00000000000..b69b69a630d
--- /dev/null
+++ b/src/mongo/util/winutil.h
@@ -0,0 +1,44 @@
+// @file winutil.cpp : Windows related utility functions
+//
+// /**
+// * Copyright (C) 2008 10gen Inc.
+// *
+// * This program is free software: you can redistribute it and/or modify
+// * it under the terms of the GNU Affero General Public License, version 3,
+// * as published by the Free Software Foundation.
+// *
+// * This program is distributed in the hope that it will be useful,
+// * but WITHOUT ANY WARRANTY; without even the implied warranty of
+// * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// * GNU Affero General Public License for more details.
+// *
+// * You should have received a copy of the GNU Affero General Public License
+// * along with this program. If not, see <http://www.gnu.org/licenses/>.
+// */
+//
+// #include "pch.h"
+
+#pragma once
+
+#if defined(_WIN32)
+#include <windows.h>
+#include "text.h"
+
+namespace mongo {
+
+ inline string GetWinErrMsg(DWORD err) {
+ LPTSTR errMsg;
+ ::FormatMessage( FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM, NULL, err, 0, (LPTSTR)&errMsg, 0, NULL );
+ std::string errMsgStr = toUtf8String( errMsg );
+ ::LocalFree( errMsg );
+        // FormatMessage() appends a trailing CR/LF to the message; trim it off
+ errMsgStr = errMsgStr.erase( errMsgStr.length() - 2 );
+ std::ostringstream output;
+ output << errMsgStr << " (" << err << ")";
+
+ return output.str();
+ }
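+
+    /* Illustrative use (a sketch):
+
+       DWORD err = ::GetLastError();
+       std::string msg = GetWinErrMsg( err );   // e.g. "Access is denied. (5)"
+    */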
+}
+
+#endif
+