author     Dwight <dwight@10gen.com>   2011-08-17 16:55:13 -0400
committer  Dwight <dwight@10gen.com>   2011-08-17 16:55:13 -0400
commit     48977d2abc8ecffaa4c547d427603c7ff24895d3 (patch)
tree       d910534e1d623fdc72dbd063ebf4878b8efedd7c
parent     9ce68d36823c22f641f705928f1c1b22f6206d76 (diff)
parent     e4a084bdab0e2a61e81476068ed494e346715d41 (diff)
download   mongo-48977d2abc8ecffaa4c547d427603c7ff24895d3.tar.gz
merge
302 files changed, 10377 insertions(+), 2474 deletions(-)
diff --git a/.gitignore b/.gitignore
index 87449576069..8ffc0d3e496 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 .sconsign.dblite
 .sconf_temp
 perf.data
+massif.out.*
 *~
 *.swp
diff --git a/SConstruct b/SConstruct
index b8e8c3fc889..99e3189c2d0 100644
--- a/SConstruct
+++ b/SConstruct
@@ -126,10 +126,10 @@ add_option( "staticlibpath", "comma separated list of dirs to search for staticl
 add_option( "boost-compiler", "compiler used for boost (gcc41)" , 1 , True , "boostCompiler" )
 add_option( "boost-version", "boost version for linking(1_38)" , 1 , True , "boostVersion" )
-
 # experimental features
 add_option( "mm", "use main memory instead of memory mapped files" , 0 , True )
 add_option( "asio" , "Use Asynchronous IO (NOT READY YET)" , 0 , True )
+add_option( "ssl" , "Enable SSL" , 0 , True )
 
 # library choices
 add_option( "usesm" , "use spider monkey for javascript" , 0 , True )
@@ -138,12 +138,13 @@ add_option( "usev8" , "use v8 for javascript" , 0 , True )
 # mongo feature options
 add_option( "noshell", "don't build shell" , 0 , True )
 add_option( "safeshell", "don't let shell scripts run programs (still, don't run untrusted scripts)" , 0 , True )
-add_option( "osnew", "use newer operating system API features" , 0 , False )
+add_option( "win2008plus", "use newer operating system API features" , 0 , False )
 
 # dev tools
 add_option( "d", "debug build no optimization, etc..." , 0 , True , "debugBuild" )
 add_option( "dd", "debug build no optimization, additional debug logging, etc..." , 0 , False , "debugBuildAndLogging" )
 add_option( "durableDefaultOn" , "have durable default to on" , 0 , True )
+add_option( "durableDefaultOff" , "have durable default to off" , 0 , True )
 add_option( "pch" , "use precompiled headers to speed up the build (experimental)" , 0 , True , "usePCH" )
 add_option( "distcc" , "use distcc for distributing builds" , 0 , False )
@@ -234,6 +235,9 @@ if has_option( "safeshell" ):
 if has_option( "durableDefaultOn" ):
     env.Append( CPPDEFINES=[ "_DURABLEDEFAULTON" ] )
 
+if has_option( "durableDefaultOff" ):
+    env.Append( CPPDEFINES=[ "_DURABLEDEFAULTOFF" ] )
+
 boostCompiler = GetOption( "boostCompiler" )
 if boostCompiler is None:
     boostCompiler = ""
@@ -343,26 +347,28 @@ processInfoFiles = [ "util/processinfo.cpp" ]
 
 if os.path.exists( "util/processinfo_" + os.sys.platform + ".cpp" ):
     processInfoFiles += [ "util/processinfo_" + os.sys.platform + ".cpp" ]
+elif os.sys.platform == "linux3":
+    processInfoFiles += [ "util/processinfo_linux2.cpp" ]
 else:
     processInfoFiles += [ "util/processinfo_none.cpp" ]
 
 coreServerFiles += processInfoFiles
-
-
 if has_option( "asio" ):
     coreServerFiles += [ "util/net/message_server_asio.cpp" ]
 
 # mongod files - also files used in tools. present in dbtests, but not in mongos and not in client libs.
-serverOnlyFiles = Split( "db/key.cpp db/btreebuilder.cpp util/logfile.cpp util/alignedbuilder.cpp db/mongommf.cpp db/dur.cpp db/durop.cpp db/dur_writetodatafiles.cpp db/dur_preplogbuffer.cpp db/dur_commitjob.cpp db/dur_recover.cpp db/dur_journal.cpp db/introspect.cpp db/btree.cpp db/clientcursor.cpp db/tests.cpp db/repl.cpp db/repl/rs.cpp db/repl/consensus.cpp db/repl/rs_initiate.cpp db/repl/replset_commands.cpp db/repl/manager.cpp db/repl/health.cpp db/repl/heartbeat.cpp db/repl/rs_config.cpp db/repl/rs_rollback.cpp db/repl/rs_sync.cpp db/repl/rs_initialsync.cpp db/oplog.cpp db/repl_block.cpp db/btreecursor.cpp db/cloner.cpp db/namespace.cpp db/cap.cpp db/matcher_covered.cpp db/dbeval.cpp db/restapi.cpp db/dbhelpers.cpp db/instance.cpp db/client.cpp db/database.cpp db/pdfile.cpp db/record.cpp db/cursor.cpp db/security.cpp db/queryoptimizer.cpp db/queryoptimizercursor.cpp db/extsort.cpp db/cmdline.cpp" )
+serverOnlyFiles = Split( "util/compress.cpp db/key.cpp db/btreebuilder.cpp util/logfile.cpp util/alignedbuilder.cpp db/mongommf.cpp db/dur.cpp db/durop.cpp db/dur_writetodatafiles.cpp db/dur_preplogbuffer.cpp db/dur_commitjob.cpp db/dur_recover.cpp db/dur_journal.cpp db/introspect.cpp db/btree.cpp db/clientcursor.cpp db/tests.cpp db/repl.cpp db/repl/rs.cpp db/repl/consensus.cpp db/repl/rs_initiate.cpp db/repl/replset_commands.cpp db/repl/manager.cpp db/repl/health.cpp db/repl/heartbeat.cpp db/repl/rs_config.cpp db/repl/rs_rollback.cpp db/repl/rs_sync.cpp db/repl/rs_initialsync.cpp db/oplog.cpp db/repl_block.cpp db/btreecursor.cpp db/cloner.cpp db/namespace.cpp db/cap.cpp db/matcher_covered.cpp db/dbeval.cpp db/restapi.cpp db/dbhelpers.cpp db/instance.cpp db/client.cpp db/database.cpp db/pdfile.cpp db/record.cpp db/cursor.cpp db/security.cpp db/queryoptimizer.cpp db/queryoptimizercursor.cpp db/extsort.cpp db/cmdline.cpp" )
 
-serverOnlyFiles += [ "db/index.cpp" ] + Glob( "db/geo/*.cpp" ) + Glob( "db/ops/*.cpp" )
+serverOnlyFiles += [ "db/index.cpp" , "db/scanandorder.cpp" ] + Glob( "db/geo/*.cpp" ) + Glob( "db/ops/*.cpp" )
 
 serverOnlyFiles += [ "db/dbcommands.cpp" , "db/dbcommands_admin.cpp" ]
 serverOnlyFiles += Glob( "db/commands/*.cpp" )
 coreServerFiles += Glob( "db/stats/*.cpp" )
 serverOnlyFiles += [ "db/driverHelpers.cpp" ]
 
+snappyFiles = ["third_party/snappy/snappy.cc", "third_party/snappy/snappy-sinksource.cc"]
+
 scriptingFiles = [ "scripting/engine.cpp" , "scripting/utils.cpp" , "scripting/bench.cpp" ]
 
 if usesm:
@@ -474,7 +480,7 @@ if "darwin" == os.sys.platform:
     env.Append( CPPPATH=filterExists(["/sw/include" , "/opt/local/include"]) )
     env.Append( LIBPATH=filterExists(["/sw/lib/", "/opt/local/lib"]) )
 
-elif "linux2" == os.sys.platform:
+elif "linux2" == os.sys.platform or "linux3" == os.sys.platform:
     linux = True
     platform = "linux"
@@ -519,7 +525,7 @@ elif "win32" == os.sys.platform:
     #if force64:
     #    release = True
 
-    if has_option( "osnew" ):
+    if has_option( "win2008plus" ):
         env.Append( CPPDEFINES=[ "MONGO_USE_SRW_ON_WINDOWS" ] )
 
     for pathdir in env['ENV']['PATH'].split(os.pathsep):
@@ -689,6 +695,7 @@ if nix:
     if not has_option('clang'):
         env.Append( CPPFLAGS=" -fno-builtin-memcmp " ) # glibc's memcmp is faster than gcc's
 
+    env.Append( CPPDEFINES="_FILE_OFFSET_BITS=64" )
     env.Append( CXXFLAGS=" -Wnon-virtual-dtor " )
     env.Append( LINKFLAGS=" -fPIC -pthread -rdynamic" )
     env.Append( LIBS=[] )
@@ -704,7 +711,7 @@ if nix:
         env.Append( CPPFLAGS=" -O0 -fstack-protector " );
         env['ENV']['GLIBCXX_FORCE_NEW'] = 1; # play nice with valgrind
     else:
-        env.Append( CPPFLAGS=" -O3" )
+        env.Append( CPPFLAGS=" -O3 " )
         #env.Append( CPPFLAGS=" -fprofile-generate" )
         #env.Append( LINKFLAGS=" -fprofile-generate" )
         # then:
@@ -752,6 +759,10 @@ if "uname" in dir(os):
     if hacks is not None:
         hacks.insert( env , { "linux64" : linux64 } )
 
+if has_option( "ssl" ):
+    env.Append( CPPDEFINES=["MONGO_SSL"] )
+    env.Append( LIBS=["ssl"] )
+
 try:
     umask = os.umask(022)
 except OSError:
@@ -1107,6 +1118,12 @@ def checkErrorCodes():
 
 checkErrorCodes()
 
+snappyEnv = env.Clone()
+if not windows:
+    snappyEnv.Append(CPPFLAGS=" -Wno-sign-compare -Wno-unused-function ") #snappy doesn't compile cleanly
+serverOnlyFiles += [snappyEnv.Object(f) for f in snappyFiles]
+
+
 # main db target
 mongodOnlyFiles = [ "db/db.cpp", "db/compact.cpp" ]
 if windows:
diff --git a/bson/bson-inl.h b/bson/bson-inl.h
index 54431549852..b86d66784ed 100644
--- a/bson/bson-inl.h
+++ b/bson/bson-inl.h
@@ -172,7 +172,7 @@ dodouble:
     }
 
     inline BSONObj BSONElement::embeddedObjectUserCheck() const {
-        if ( isABSONObj() )
+        if ( MONGO_likely(isABSONObj()) )
             return BSONObj(value());
         stringstream ss;
         ss << "invalid parameter: expected an object (" << fieldName() << ")";
diff --git a/bson/bsonobj.h b/bson/bsonobj.h
index c65f1268cc4..b3258a2c1d7 100644
--- a/bson/bsonobj.h
+++ b/bson/bsonobj.h
@@ -308,7 +308,7 @@ namespace mongo {
        /** This is "shallow equality" -- ints and doubles won't match.
            for a deep equality test use woCompare (which is slower). */
-        bool shallowEqual(const BSONObj& r) const {
+        bool binaryEqual(const BSONObj& r) const {
            int os = objsize();
            if ( os == r.objsize() ) {
                return (os == 0 || memcmp(objdata(),r.objdata(),os)==0);
            }
diff --git a/bson/bsonobjbuilder.h b/bson/bsonobjbuilder.h
index 7d6965dd7fc..f61d45879f3 100644
--- a/bson/bsonobjbuilder.h
+++ b/bson/bsonobjbuilder.h
@@ -28,10 +28,10 @@
 #include "bsonobj.h"
 #include "bsonmisc.h"
 
-using namespace std;
-
 namespace mongo {
 
+    using namespace std;
+
 #if defined(_WIN32)
 // warning: 'this' : used in base member initializer list
 #pragma warning( disable : 4355 )
diff --git a/bson/bsonobjiterator.h b/bson/bsonobjiterator.h
index 0d2344e002e..39ae24d9b86 100644
--- a/bson/bsonobjiterator.h
+++ b/bson/bsonobjiterator.h
@@ -37,7 +37,7 @@ namespace mongo {
          */
        BSONObjIterator(const BSONObj& jso) {
            int sz = jso.objsize();
-            if ( sz == 0 ) {
+            if ( MONGO_unlikely(sz == 0) ) {
                _pos = _theend = 0;
                return;
            }
diff --git a/bson/inline_decls.h b/bson/inline_decls.h
index 433a67010cb..30da9b4560d 100644
--- a/bson/inline_decls.h
+++ b/bson/inline_decls.h
@@ -31,24 +31,38 @@
 #endif
 
+namespace mongo {
+
 /* Note: do not clutter code with these -- ONLY use in hot spots / significant loops. */
 
 #if !defined(__GNUC__)
 
-// branch prediction. indicate we expect to enter the if statement body
-# define MONGOIF(x) if( (x) )
+// branch prediction. indicate we expect to be true
+# define MONGO_likely(x) ((bool)(x))
 
-// branch prediction. indicate we expect to not enter the if statement body
-# define MONGO_IF(x) if( (x) )
+// branch prediction. indicate we expect to be false
+# define MONGO_unlikely(x) ((bool)(x))
 
-// prefetch data from memory
-# define MONGOPREFETCH(x) { /*just check we compile:*/ assert(sizeof(*x)); }
+# if defined(_WIN32)
+    // prefetch data from memory
+    inline void prefetch(const void *p) {
+#if defined(_MM_HINT_T0)
+        _mm_prefetch((char *) p, _MM_HINT_T0);
+#endif
+    }
+#else
+    inline void prefetch(void *p) { }
+#endif
 
 #else
 
-# define MONGOIF(x) if( __builtin_expect((x), 1) )
-# define MONGO_IF(x) if( __builtin_expect((x), 0) )
-# define MONGOPREFETCH(x) { /*just check we compile:*/ assert(sizeof(*x)); }
+# define MONGO_likely(x) ( __builtin_expect((bool)(x), 1) )
+# define MONGO_unlikely(x) ( __builtin_expect((bool)(x), 0) )
+
+    inline void prefetch(void *p) {
+        __builtin_prefetch(p);
+    }
 
 #endif
+
+}
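For readers unfamiliar with the pattern: on GCC/Clang, MONGO_likely/MONGO_unlikely expand to __builtin_expect, which only steers block layout and static branch prediction; program semantics are unchanged. A standalone sketch of typical use (illustrative only, names are hypothetical):

    // Same shape as the macros above, reduced to a self-contained example.
    #define LIKELY(x)   ( __builtin_expect((bool)(x), 1) )
    #define UNLIKELY(x) ( __builtin_expect((bool)(x), 0) )

    int sumValid(const int* v, int n) {
        int s = 0;
        for (int i = 0; i < n; ++i) {
            if (UNLIKELY(v[i] < 0))          // error path, expected to be rare
                continue;
            __builtin_prefetch(v + i + 16);  // like prefetch() above: a hint only
            s += v[i];
        }
        return s;
    }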
diff --git a/bson/stringdata.h b/bson/stringdata.h
index c4919e82d97..352dc51813f 100644
--- a/bson/stringdata.h
+++ b/bson/stringdata.h
@@ -60,7 +60,7 @@ namespace mongo {
            : _data(&val[0]), _size(N-1) {}
 
        // accessors
-        const char* const data() const { return _data; }
+        const char* data() const { return _data; }
        const unsigned size() const { return _size; }
 
    private:
diff --git a/buildscripts/errorcodes.py b/buildscripts/errorcodes.py
index ce1b3e465d0..dec1030ddad 100755
--- a/buildscripts/errorcodes.py
+++ b/buildscripts/errorcodes.py
@@ -32,9 +32,9 @@ def assignErrorCodes():
 codes = []
 
 def readErrorCodes( callback, replaceZero = False ):
-    ps = [ re.compile( "(([umsg]asser(t|ted))) *\( *(\d+)" ) ,
-           re.compile( "((User|Msg|MsgAssertion)Exceptio(n))\( *(\d+)" ) ,
-           re.compile( "(((verify))) *\( *(\d+)" )
+    ps = [ re.compile( "(([umsg]asser(t|ted))) *\(( *)(\d+)" ) ,
+           re.compile( "((User|Msg|MsgAssertion)Exceptio(n))\(( *)(\d+)" ) ,
+           re.compile( "(((verify))) *\(( *)(\d+)" )
           ]
 
    for x in utils.getAllSourceFiles():
@@ -52,7 +52,8 @@ def readErrorCodes( callback, replaceZero = False ):
            m = m.groups()
 
            start = m[0]
-            code = m[3]
+            spaces = m[3]
+            code = m[4]
            if code == '0' and replaceZero :
                code = getNextCode( lastCodes )
                lastCodes.append( code )
@@ -65,7 +66,7 @@ def readErrorCodes( callback, replaceZero = False ):
                codes.append( ( x , lineNum , line , code ) )
                callback( x , lineNum , line , code )
 
-            return start + "( " + code
+            return start + "(" + spaces + code
 
        line = re.sub( p, repl, line )
diff --git a/buildscripts/smoke.py b/buildscripts/smoke.py
index 128c01cc287..c46b5d1879d 100755
--- a/buildscripts/smoke.py
+++ b/buildscripts/smoke.py
@@ -110,7 +110,7 @@ class mongod(object):
        sock.connect(("localhost", int(port)))
        sock.close()
 
-    def did_mongod_start(self, port=mongod_port, timeout=90):
+    def did_mongod_start(self, port=mongod_port, timeout=300):
        while timeout > 0:
            time.sleep(1)
            try:
@@ -119,6 +119,7 @@ class mongod(object):
            except Exception,e:
                print >> sys.stderr, e
                timeout = timeout - 1
+        print >> sys.stderr, "timeout starting mongod"
        return False
 
    def start(self):
@@ -148,6 +149,10 @@ class mongod(object):
            argv += ["--master", "--oplogSize", "256"]
        if self.slave:
            argv += ['--slave', '--source', 'localhost:' + str(srcport)]
+        if self.kwargs.get('no_journal'):
+            argv += ['--nojournal']
+        if self.kwargs.get('no_preallocj'):
+            argv += ['--nopreallocj']
        print "running " + " ".join(argv)
        self.proc = Popen(argv)
        if not self.did_mongod_start(self.port):
@@ -280,6 +285,19 @@ def runTest(test):
    t1 = time.time()
    # FIXME: we don't handle the case where the subprocess
    # hangs... that's bad.
+    if argv[0].endswith( 'mongo' ) and not '--eval' in argv :
+        argv = argv + [ '--eval', 'TestData = new Object();' +
+                        'TestData.testPath = "' + path + '";' +
+                        'TestData.testFile = "' + os.path.basename( path ) + '";' +
+                        'TestData.testName = "' + re.sub( ".js$", "", os.path.basename( path ) ) + '";' +
+                        'TestData.noJournal = ' + ( 'true' if no_journal else 'false' ) + ";" +
+                        'TestData.noJournalPrealloc = ' + ( 'true' if no_preallocj else 'false' ) + ";" ]
+
+    if argv[0].endswith( 'test' ) and no_preallocj :
+        argv = argv + [ '--nopreallocj' ]
+
+
+    print argv
    r = call(argv, cwd=test_path)
    t2 = time.time()
    print " " + str((t2 - t1) * 1000) + "ms"
@@ -301,7 +319,7 @@ def run_tests(tests):
    # The reason we use with is so that we get __exit__ semantics
 
-    with mongod(small_oplog=small_oplog) as master:
+    with mongod(small_oplog=small_oplog,no_journal=no_journal,no_preallocj=no_preallocj) as master:
        with mongod(slave=True) if small_oplog else Nothing() as slave:
            if small_oplog:
                master.wait_for_repl()
@@ -421,7 +439,7 @@ def add_exe(e):
    return e
 
 def main():
-    global mongod_executable, mongod_port, shell_executable, continue_on_failure, small_oplog, smoke_db_prefix, test_path
+    global mongod_executable, mongod_port, shell_executable, continue_on_failure, small_oplog, no_journal, no_preallocj, smoke_db_prefix, test_path
    parser = OptionParser(usage="usage: smoke.py [OPTIONS] ARGS*")
    parser.add_option('--mode', dest='mode', default='suite',
                      help='If "files", ARGS are filenames; if "suite", ARGS are sets of tests (%default)')
@@ -447,6 +465,12 @@ def main():
    parser.add_option('--small-oplog', dest='small_oplog', default=False,
                      action="store_true",
                      help='Run tests with master/slave replication & use a small oplog')
+    parser.add_option('--nojournal', dest='no_journal', default=False,
+                      action="store_true",
+                      help='Do not turn on journaling in tests')
+    parser.add_option('--nopreallocj', dest='no_preallocj', default=False,
+                      action="store_true",
+                      help='Do not preallocate journal files in tests')
    global tests
    (options, tests) = parser.parse_args()
 
@@ -467,6 +491,8 @@ def main():
    continue_on_failure = options.continue_on_failure
    smoke_db_prefix = options.smoke_db_prefix
    small_oplog = options.small_oplog
+    no_journal = options.no_journal
+    no_preallocj = options.no_preallocj
 
    if options.File:
        if options.File == '-':
diff --git a/client/connpool.cpp b/client/connpool.cpp
index e94a78d1c45..2d7c37bfbda 100644
--- a/client/connpool.cpp
+++ b/client/connpool.cpp
@@ -238,13 +238,16 @@ namespace mongo {
    }
 
    void DBConnectionPool::appendInfo( BSONObjBuilder& b ) {
-        BSONObjBuilder bb( b.subobjStart( "hosts" ) );
+
        int avail = 0;
        long long created = 0;
 
        map<ConnectionString::ConnectionType,long long> createdByType;
 
+        set<string> replicaSets;
+
+        BSONObjBuilder bb( b.subobjStart( "hosts" ) );
        {
            scoped_lock lk( _mutex );
            for ( PoolMap::iterator i=_pools.begin(); i!=_pools.end(); ++i ) {
@@ -263,9 +266,33 @@ namespace mongo {
 
                long long& x = createdByType[i->second.type()];
                x += i->second.numCreated();
+
+                {
+                    string setName = i->first.ident;
+                    if ( setName.find( "/" ) != string::npos ) {
+                        setName = setName.substr( 0 , setName.find( "/" ) );
+                        replicaSets.insert( setName );
+                    }
+                }
            }
        }
        bb.done();
+
+
+        BSONObjBuilder setBuilder( b.subobjStart( "replicaSets" ) );
+        for ( set<string>::iterator i=replicaSets.begin(); i!=replicaSets.end(); ++i ) {
+            string rs = *i;
+            ReplicaSetMonitorPtr m = ReplicaSetMonitor::get( rs );
+            if ( ! m ) {
+                warning() << "no monitor for set: " << rs << endl;
+                continue;
+            }
+
+            BSONObjBuilder temp( setBuilder.subobjStart( rs ) );
+            m->appendInfo( temp );
+            temp.done();
+        }
+        setBuilder.done();
 
        {
            BSONObjBuilder temp( bb.subobjStart( "createdByType" ) );
@@ -280,20 +307,36 @@ namespace mongo {
    }
 
    bool DBConnectionPool::serverNameCompare::operator()( const string& a , const string& b ) const{
-        string ap = str::before( a , "/" );
-        string bp = str::before( b , "/" );
-
-        return ap < bp;
+        const char* ap = a.c_str();
+        const char* bp = b.c_str();
+
+        while (true){
+            if (*ap == '\0' || *ap == '/'){
+                if (*bp == '\0' || *bp == '/')
+                    return false; // equal strings
+                else
+                    return true; // a is shorter
+            }
+
+            if (*bp == '\0' || *bp == '/')
+                return false; // b is shorter
+
+            if ( *ap < *bp)
+                return true;
+            else if (*ap > *bp)
+                return false;
+
+            ++ap;
+            ++bp;
+        }
+        assert(false);
    }
 
    bool DBConnectionPool::poolKeyCompare::operator()( const PoolKey& a , const PoolKey& b ) const {
-        string ap = str::before( a.ident , "/" );
-        string bp = str::before( b.ident , "/" );
-
-        if ( ap < bp )
+        if (DBConnectionPool::serverNameCompare()( a.ident , b.ident ))
            return true;
-        if ( ap > bp )
+        if (DBConnectionPool::serverNameCompare()( b.ident , a.ident ))
            return false;
 
        return a.timeout < b.timeout;
@@ -366,7 +409,7 @@ namespace mongo {
        PoolFlushCmd() : Command( "connPoolSync" , false , "connpoolsync" ) {}
        virtual void help( stringstream &help ) const { help<<"internal"; }
        virtual LockType locktype() const { return NONE; }
-        virtual bool run(const string&, mongo::BSONObj&, std::string&, mongo::BSONObjBuilder& result, bool) {
+        virtual bool run(const string&, mongo::BSONObj&, int, std::string&, mongo::BSONObjBuilder& result, bool) {
            pool.flush();
            return true;
        }
@@ -381,7 +424,7 @@ namespace mongo {
        PoolStats() : Command( "connPoolStats" ) {}
        virtual void help( stringstream &help ) const { help<<"stats about connection pool"; }
        virtual LockType locktype() const { return NONE; }
-        virtual bool run(const string&, mongo::BSONObj&, std::string&, mongo::BSONObjBuilder& result, bool) {
+        virtual bool run(const string&, mongo::BSONObj&, int, std::string&, mongo::BSONObjBuilder& result, bool) {
            pool.appendInfo( result );
            result.append( "numDBClientConnection" , DBClientConnection::getNumConnections() );
            result.append( "numAScopedConnection" , AScopedConnection::getNumConnections() );
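The rewritten serverNameCompare above is an allocation-free equivalent of comparing the substrings before the first '/', so all pool keys for one replica set (e.g. "rs0/a,b" vs "rs0/b,a") compare equal and, in poolKeyCompare, then order by timeout. A behavioral sketch of the same ordering (illustrative, using std::string for clarity):

    #include <cassert>
    #include <string>

    // Equivalent ordering, written with substr instead of the raw pointer walk.
    bool setNameLess( const std::string& a , const std::string& b ) {
        return a.substr( 0 , a.find( '/' ) ) < b.substr( 0 , b.find( '/' ) );
    }

    int main() {
        // same set name => neither string orders before the other
        assert( ! setNameLess( "rs0/a:27017,b:27017" , "rs0/b:27017,a:27017" ) );
        assert( ! setNameLess( "rs0/b:27017,a:27017" , "rs0/a:27017,b:27017" ) );
        // different set names order lexicographically
        assert( setNameLess( "rs0/x:27017" , "rs1/x:27017" ) );
        return 0;
    }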
diff --git a/client/dbclient.cpp b/client/dbclient.cpp
index 9c907b01a00..dadf7e4f38a 100644
--- a/client/dbclient.cpp
+++ b/client/dbclient.cpp
@@ -80,7 +80,7 @@ namespace mongo {
        case PAIR:
        case SET: {
-            DBClientReplicaSet * set = new DBClientReplicaSet( _setName , _servers );
+            DBClientReplicaSet * set = new DBClientReplicaSet( _setName , _servers , socketTimeout );
            if( ! set->connect() ) {
                delete set;
                errmsg = "connect failed to set ";
@@ -589,6 +589,13 @@ namespace mongo {
            _failed = true;
            return false;
        }
+
+#ifdef MONGO_SSL
+        if ( cmdLine.sslOnNormalPorts ) {
+            p->secure( sslManager() );
+        }
+#endif
+
        return true;
    }
 
@@ -997,6 +1004,19 @@ namespace mongo {
        say(m);
    }
 
+#ifdef MONGO_SSL
+    SSLManager* DBClientConnection::sslManager() {
+        if ( _sslManager )
+            return _sslManager;
+
+        SSLManager* s = new SSLManager(true);
+        _sslManager = s;
+        return s;
+    }
+
+    SSLManager* DBClientConnection::_sslManager = 0;
+#endif
+
    AtomicUInt DBClientConnection::_numConnections;
    bool DBClientConnection::_lazyKillCursor = true;
diff --git a/client/dbclient.h b/client/dbclient.h
index f48f279e9f5..2b4bb857e2d 100644
--- a/client/dbclient.h
+++ b/client/dbclient.h
@@ -110,7 +110,7 @@ namespace mongo {
     */
    enum InsertOptions {
        /** With muli-insert keep processing inserts if one fails */
-        InsertOption_KeepGoing = 1 << 0
+        InsertOption_ContinueOnError = 1 << 0
    };
 
    class DBClientBase;
@@ -353,6 +353,7 @@ namespace mongo {
        virtual void checkResponse( const char* data, int nReturned, bool* retry = NULL, string* targetHost = NULL ) {
            if( retry ) *retry = false; if( targetHost ) *targetHost = "";
        }
+        virtual bool lazySupported() const = 0;
    };
 
    /**
@@ -921,13 +922,15 @@ namespace mongo {
        void setSoTimeout(double to) { _so_timeout = to; }
        double getSoTimeout() const { return _so_timeout; }
 
+        virtual bool lazySupported() const { return true; }
+
        static int getNumConnections() {
            return _numConnections;
        }
 
        static void setLazyKillCursor( bool lazy ) { _lazyKillCursor = lazy; }
        static bool getLazyKillCursor() { return _lazyKillCursor; }
-
+
    protected:
        friend class SyncClusterConnection;
        virtual void sayPiggyBack( Message &toSend );
@@ -951,6 +954,11 @@ namespace mongo {
 
        static AtomicUInt _numConnections;
        static bool _lazyKillCursor; // lazy means we piggy back kill cursors on next op
+
+#ifdef MONGO_SSL
+        static SSLManager* sslManager();
+        static SSLManager* _sslManager;
+#endif
    };
 
    /** pings server to check if it's up
diff --git a/client/dbclient_rs.cpp b/client/dbclient_rs.cpp
index bd108d75ba4..2cab1f7b0d5 100644
--- a/client/dbclient_rs.cpp
+++ b/client/dbclient_rs.cpp
@@ -54,9 +54,9 @@ namespace mongo {
        void run() {
            log() << "starting" << endl;
            while ( ! inShutdown() ) {
-                sleepsecs( 20 );
+                sleepsecs( 10 );
                try {
-                    ReplicaSetMonitor::checkAll();
+                    ReplicaSetMonitor::checkAll( true );
                }
                catch ( std::exception& e ) {
                    error() << "check failed: " << e.what() << endl;
@@ -99,17 +99,14 @@ namespace mongo {
            }
 
            _nodes.push_back( Node( servers[i] , conn.release() ) );
-
+
+            int myLoc = _nodes.size() - 1;
            string maybePrimary;
-            if (_checkConnection( _nodes[_nodes.size()-1].conn , maybePrimary, false)) {
-                break;
-            }
+            _checkConnection( _nodes[myLoc].conn.get() , maybePrimary, false, myLoc );
        }
    }
 
    ReplicaSetMonitor::~ReplicaSetMonitor() {
-        for ( unsigned i=0; i<_nodes.size(); i++ )
-            delete _nodes[i].conn;
        _nodes.clear();
        _master = -1;
    }
@@ -125,7 +122,16 @@ namespace mongo {
        return m;
    }
 
-    void ReplicaSetMonitor::checkAll() {
+    ReplicaSetMonitorPtr ReplicaSetMonitor::get( const string& name ) {
+        scoped_lock lk( _setsLock );
+        map<string,ReplicaSetMonitorPtr>::const_iterator i = _sets.find( name );
+        if ( i == _sets.end() )
+            return ReplicaSetMonitorPtr();
+        return i->second;
+    }
+
+
+    void ReplicaSetMonitor::checkAll( bool checkAllSecondaries ) {
        set<string> seen;
 
        while ( true ) {
@@ -146,7 +152,7 @@ namespace mongo {
            if ( ! m )
                break;
 
-            m->check();
+            m->check( checkAllSecondaries );
        }
 
    }
@@ -202,7 +208,7 @@ namespace mongo {
            return _nodes[_master].addr;
        }
 
-        _check();
+        _check( false );
 
        scoped_lock lk( _lock );
        uassert( 10009 , str::stream() << "ReplicaSetMonitor no master found for set: " << _name , _master >= 0 );
@@ -210,34 +216,70 @@ namespace mongo {
    }
 
    HostAndPort ReplicaSetMonitor::getSlave( const HostAndPort& prev ) {
-        // make sure its valid
-        if ( prev.port() > 0 ) {
+        // make sure its valid
+
+        bool wasFound = false;
+
+        // This is always true, since checked in port()
+        assert( prev.port() >= 0 );
+
+        if( prev.host().size() ){
            scoped_lock lk( _lock );
            for ( unsigned i=0; i<_nodes.size(); i++ ) {
                if ( prev != _nodes[i].addr )
                    continue;
 
-                if ( _nodes[i].ok )
+                wasFound = true;
+
+                if ( _nodes[i].okForSecondaryQueries() )
                    return prev;
+
                break;
            }
        }
 
+        if( prev.host().size() ){
+            if( wasFound ){ LOG(1) << "slave '" << prev << "' is no longer ok to use" << endl; }
+            else{ LOG(1) << "slave '" << prev << "' was not found in the replica set" << endl; }
+        }
+        else LOG(1) << "slave '" << prev << "' is not initialized or invalid" << endl;
+
        return getSlave();
    }
 
    HostAndPort ReplicaSetMonitor::getSlave() {
-        scoped_lock lk( _lock );
-        for ( unsigned i=0; i<_nodes.size(); i++ ) {
-            _nextSlave = ( _nextSlave + 1 ) % _nodes.size();
-            if ( _nextSlave == _master )
-                continue;
-            if ( _nodes[ _nextSlave ].ok )
-                return _nodes[ _nextSlave ].addr;
+        LOG(2) << "selecting new slave from replica set " << getServerAddress() << endl;
+
+        // Logic is to retry three times for any secondary node, if we can't find any secondary, we'll take
+        // any "ok" node
+        // TODO: Could this query hidden nodes?
+        const int MAX = 3;
+        for ( int xxx=0; xxx<MAX; xxx++ ) {
+
+            {
+                scoped_lock lk( _lock );
+
+                unsigned i = 0;
+                for ( ; i<_nodes.size(); i++ ) {
+                    _nextSlave = ( _nextSlave + 1 ) % _nodes.size();
+                    if ( _nextSlave == _master ){
+                        LOG(2) << "not selecting " << _nodes[_nextSlave] << " as it is the current master" << endl;
+                        continue;
+                    }
+                    if ( _nodes[ _nextSlave ].okForSecondaryQueries() || ( _nodes[ _nextSlave ].ok && ( xxx + 1 ) >= MAX ) )
+                        return _nodes[ _nextSlave ].addr;
+
+                    LOG(2) << "not selecting " << _nodes[_nextSlave] << " as it is not ok to use" << endl;
+                }
+
+            }
+
+            check(false);
        }
+
+        LOG(2) << "no suitable slave nodes found, returning default node " << _nodes[ 0 ] << endl;
 
-        return _nodes[ 0 ].addr;
+        return _nodes[0].addr;
    }
 
    /**
@@ -266,7 +308,7 @@ namespace mongo {
            string host = member["name"].String();
 
            int m = -1;
-            if ((m = _find(host)) <= 0) {
+            if ((m = _find(host)) < 0) {
                continue;
            }
 
@@ -309,16 +351,34 @@ namespace mongo {
 
 
-    bool ReplicaSetMonitor::_checkConnection( DBClientConnection * c , string& maybePrimary , bool verbose ) {
+    bool ReplicaSetMonitor::_checkConnection( DBClientConnection * c , string& maybePrimary , bool verbose , int nodesOffset ) {
        scoped_lock lk( _checkConnectionLock );
        bool isMaster = false;
        bool changed = false;
        try {
+            Timer t;
            BSONObj o;
            c->isMaster(isMaster, &o);
+
+            if ( o["setName"].type() != String || o["setName"].String() != _name ) {
+                warning() << "node: " << c->getServerAddress() << " isn't a part of set: " << _name
+                          << " ismaster: " << o << endl;
+                if ( nodesOffset >= 0 )
+                    _nodes[nodesOffset].ok = false;
+                return false;
+            }
 
-            log( ! verbose ) << "ReplicaSetMonitor::_checkConnection: " << c->toString() << ' ' << o << endl;
+            if ( nodesOffset >= 0 ) {
+                _nodes[nodesOffset].pingTimeMillis = t.millis();
+                _nodes[nodesOffset].hidden = o["hidden"].trueValue();
+                _nodes[nodesOffset].secondary = o["secondary"].trueValue();
+                _nodes[nodesOffset].ismaster = o["ismaster"].trueValue();
+
+                _nodes[nodesOffset].lastIsMaster = o.copy();
+            }
+
+            log( ! verbose ) << "ReplicaSetMonitor::_checkConnection: " << c->toString() << ' ' << o << endl;
+
            // add other nodes
            if ( o["hosts"].type() == Array ) {
                if ( o["primary"].type() == String )
@@ -329,11 +389,14 @@ namespace mongo {
            if (o.hasField("passives") && o["passives"].type() == Array) {
                _checkHosts(o["passives"].Obj(), changed);
            }
-
+
            _checkStatus(c);
+
+
        }
        catch ( std::exception& e ) {
            log( ! verbose ) << "ReplicaSetMonitor::_checkConnection: caught exception " << c->toString() << ' ' << e.what() << endl;
+            _nodes[nodesOffset].ok = false;
        }
 
        if ( changed && _hook )
@@ -342,24 +405,28 @@ namespace mongo {
        return isMaster;
    }
 
-    void ReplicaSetMonitor::_check() {
+    void ReplicaSetMonitor::_check( bool checkAllSecondaries ) {
 
        bool triedQuickCheck = false;
 
        LOG(1) << "_check : " << getServerAddress() << endl;
 
+        int newMaster = -1;
+
        for ( int retry = 0; retry < 2; retry++ ) {
            for ( unsigned i=0; i<_nodes.size(); i++ ) {
-                DBClientConnection * c;
+                shared_ptr<DBClientConnection> c;
                {
                    scoped_lock lk( _lock );
                    c = _nodes[i].conn;
                }
 
                string maybePrimary;
-                if ( _checkConnection( c , maybePrimary , retry ) ) {
+                if ( _checkConnection( c.get() , maybePrimary , retry , i ) ) {
                    _master = i;
-                    return;
+                    newMaster = i;
+                    if ( ! checkAllSecondaries )
+                        return;
                }
 
                if ( ! triedQuickCheck && maybePrimary.size() ) {
@@ -367,36 +434,44 @@ namespace mongo {
                    if ( x >= 0 ) {
                        triedQuickCheck = true;
                        string dummy;
-                        DBClientConnection * testConn;
+                        shared_ptr<DBClientConnection> testConn;
                        {
                            scoped_lock lk( _lock );
                            testConn = _nodes[x].conn;
                        }
-                        if ( _checkConnection( testConn , dummy , false ) ) {
+                        if ( _checkConnection( testConn.get() , dummy , false , x ) ) {
                            _master = x;
-                            return;
+                            newMaster = x;
+                            if ( ! checkAllSecondaries )
+                                return;
                        }
                    }
                }
 
            }
+
+            if ( newMaster >= 0 )
+                return;
+
            sleepsecs(1);
        }
 
    }
 
-    void ReplicaSetMonitor::check() {
+    void ReplicaSetMonitor::check( bool checkAllSecondaries ) {
        // first see if the current master is fine
        if ( _master >= 0 ) {
            string temp;
-            if ( _checkConnection( _nodes[_master].conn , temp , false ) ) {
-                // current master is fine, so we're done
-                return;
+            if ( _checkConnection( _nodes[_master].conn.get() , temp , false , _master ) ) {
+                if ( ! checkAllSecondaries ) {
+                    // current master is fine, so we're done
+                    return;
+                }
            }
        }
 
        // we either have no master, or the current is dead
-        _check();
+        _check( checkAllSecondaries );
    }
 
    int ReplicaSetMonitor::_find( const string& server ) const {
@@ -419,7 +494,26 @@ namespace mongo {
                return i;
        return -1;
    }
-
+
+    void ReplicaSetMonitor::appendInfo( BSONObjBuilder& b ) const {
+        scoped_lock lk( _lock );
+        BSONArrayBuilder hosts( b.subarrayStart( "hosts" ) );
+        for ( unsigned i=0; i<_nodes.size(); i++ ) {
+            hosts.append( BSON( "addr" << _nodes[i].addr <<
+                                // "lastIsMaster" << _nodes[i].lastIsMaster << // this is a potential race, so only used when debugging
+                                "ok" << _nodes[i].ok <<
+                                "ismaster" << _nodes[i].ismaster <<
+                                "hidden" << _nodes[i].hidden <<
+                                "secondary" << _nodes[i].secondary <<
+                                "pingTimeMillis" << _nodes[i].pingTimeMillis ) );
+
+        }
+        hosts.done();
+
+        b.append( "master" , _master );
+        b.append( "nextSlave" , _nextSlave );
+    }
+
    mongo::mutex ReplicaSetMonitor::_setsLock( "ReplicaSetMonitor" );
    map<string,ReplicaSetMonitorPtr> ReplicaSetMonitor::_sets;
 
@@ -428,8 +522,9 @@ namespace mongo {
    // ----- DBClientReplicaSet ---------
    // --------------------------------
 
-    DBClientReplicaSet::DBClientReplicaSet( const string& name , const vector<HostAndPort>& servers )
-        : _monitor( ReplicaSetMonitor::get( name , servers ) ) {
+    DBClientReplicaSet::DBClientReplicaSet( const string& name , const vector<HostAndPort>& servers, double so_timeout )
+        : _monitor( ReplicaSetMonitor::get( name , servers ) ),
+          _so_timeout( so_timeout ) {
    }
 
    DBClientReplicaSet::~DBClientReplicaSet() {
@@ -446,7 +541,7 @@ namespace mongo {
        }
 
        _masterHost = _monitor->getMaster();
-        _master.reset( new DBClientConnection( true , this ) );
+        _master.reset( new DBClientConnection( true , this , _so_timeout ) );
        string errmsg;
        if ( ! _master->connect( _masterHost , errmsg ) ) {
            _monitor->notifyFailure( _masterHost );
@@ -463,10 +558,13 @@ namespace mongo {
            if ( ! _slave->isFailed() )
                return _slave.get();
            _monitor->notifySlaveFailure( _slaveHost );
+            _slaveHost = _monitor->getSlave();
+        }
+        else {
+            _slaveHost = h;
        }
-
-        _slaveHost = _monitor->getSlave();
-        _slave.reset( new DBClientConnection( true , this ) );
+
+        _slave.reset( new DBClientConnection( true , this , _so_timeout ) );
        _slave->connect( _slaveHost );
        _auth( _slave.get() );
        return _slave.get();
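In sum, the new getSlave() walks the nodes round-robin, skipping the master and anything that fails okForSecondaryQueries() (healthy, secondary, not hidden), and only on the final pass falls back to any node still flagged ok. A condensed standalone sketch of that policy (illustrative; names are hypothetical and the node list is assumed non-empty):

    #include <string>
    #include <vector>

    struct Node {
        std::string addr;
        bool ok, secondary, hidden;
        bool okForSecondaryQueries() const { return ok && secondary && ! hidden; }
    };

    std::string pickSecondary( const std::vector<Node>& nodes , int master , int& next ) {
        const int MAX = 3; // passes, as in the patch
        for ( int pass = 0; pass < MAX; pass++ ) {
            for ( size_t i = 0; i < nodes.size(); i++ ) {
                next = ( next + 1 ) % (int)nodes.size();
                if ( next == master )
                    continue; // never route a secondary read to the master
                if ( nodes[next].okForSecondaryQueries() )
                    return nodes[next].addr;
                if ( nodes[next].ok && pass + 1 == MAX )
                    return nodes[next].addr; // last pass: settle for merely "ok"
            }
        }
        return nodes[0].addr; // give up, mirroring the default above
    }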
diff --git a/client/dbclient_rs.h b/client/dbclient_rs.h
index 4a0a832d9ca..b6948a05b80 100644
--- a/client/dbclient_rs.h
+++ b/client/dbclient_rs.h
@@ -43,10 +43,16 @@ namespace mongo {
        static ReplicaSetMonitorPtr get( const string& name , const vector<HostAndPort>& servers );
 
        /**
+         * gets a cached Monitor per name or will return none if it doesn't exist
+         */
+        static ReplicaSetMonitorPtr get( const string& name );
+
+
+        /**
         * checks all sets for current master and new secondaries
         * usually only called from a BackgroundJob
         */
-        static void checkAll();
+        static void checkAll( bool checkAllSecondaries );
 
        /**
         * this is called whenever the config of any repclia set changes
@@ -81,13 +87,15 @@ namespace mongo {
        /**
         * checks for current master and new secondaries
         */
-        void check();
+        void check( bool checkAllSecondaries );
 
        string getName() const { return _name; }
 
        string getServerAddress() const;
 
        bool contains( const string& server ) const;
+
+        void appendInfo( BSONObjBuilder& b ) const;
 
    private:
        /**
@@ -98,7 +106,7 @@ namespace mongo {
         */
        ReplicaSetMonitor( const string& name , const vector<HostAndPort>& servers );
 
-        void _check();
+        void _check( bool checkAllSecondaries );
 
        /**
         * Use replSetGetStatus command to make sure hosts in host list are up
@@ -119,9 +127,10 @@ namespace mongo {
         * @param c the connection to check
         * @param maybePrimary OUT
         * @param verbose
+         * @param nodesOffset - offset into _nodes array, -1 for not in it
         * @return if the connection is good
         */
-        bool _checkConnection( DBClientConnection * c , string& maybePrimary , bool verbose );
+        bool _checkConnection( DBClientConnection * c , string& maybePrimary , bool verbose , int nodesOffset );
 
        int _find( const string& server ) const ;
        int _find_inlock( const string& server ) const ;
@@ -132,14 +141,44 @@ namespace mongo {
        string _name;
 
        struct Node {
-            Node( const HostAndPort& a , DBClientConnection* c ) : addr( a ) , conn(c) , ok(true) {}
+            Node( const HostAndPort& a , DBClientConnection* c )
+                : addr( a ) , conn(c) , ok(true) ,
+                  ismaster(false), secondary( false ) , hidden( false ) , pingTimeMillis(0) {
+            }
+
+            bool okForSecondaryQueries() const {
+                return ok && secondary && ! hidden;
+            }
+
+            BSONObj toBSON() const {
+                return BSON( "addr" << addr.toString() <<
+                             "isMaster" << ismaster <<
+                             "secondary" << secondary <<
+                             "hidden" << hidden <<
+                             "ok" << ok );
+            }
+
+            string toString() const {
+                return toBSON().toString();
+            }
+
            HostAndPort addr;
-            DBClientConnection* conn;
+            shared_ptr<DBClientConnection> conn;
 
            // if this node is in a failure state
            // used for slave routing
            // this is too simple, should make it better
            bool ok;
+
+            // as reported by ismaster
+            BSONObj lastIsMaster;
+
+            bool ismaster;
+            bool secondary;
+            bool hidden;
+
+            int pingTimeMillis;
+
        };
 
        /**
@@ -168,7 +207,7 @@ namespace mongo {
    public:
        /** Call connect() after constructing. autoReconnect is
           always on for DBClientReplicaSet connections. */
-        DBClientReplicaSet( const string& name , const vector<HostAndPort>& servers );
+        DBClientReplicaSet( const string& name , const vector<HostAndPort>& servers, double so_timeout=0 );
        virtual ~DBClientReplicaSet();
 
        /** Returns false if nomember of the set were reachable, or neither is
@@ -228,16 +267,14 @@ namespace mongo {
 
        // ----- informational ----
 
-        /**
-         * timeout not supported in DBClientReplicaSet yet
-         */
-        double getSoTimeout() const { return 0; }
+        double getSoTimeout() const { return _so_timeout; }
 
        string toString() { return getServerAddress(); }
 
        string getServerAddress() const { return _monitor->getServerAddress(); }
 
        virtual ConnectionString::ConnectionType type() const { return ConnectionString::SET; }
+        virtual bool lazySupported() const { return true; }
 
        // ---- low level ------
 
@@ -265,6 +302,8 @@ namespace mongo {
 
        HostAndPort _slaveHost;
        scoped_ptr<DBClientConnection> _slave;
+
+        double _so_timeout;
 
        /**
         * for storing authentication info
diff --git a/client/dbclientcursor.cpp b/client/dbclientcursor.cpp
index f1685637311..5db360ef2c7 100644
--- a/client/dbclientcursor.cpp
+++ b/client/dbclientcursor.cpp
@@ -70,6 +70,7 @@ namespace mongo {
    }
 
    void DBClientCursor::initLazy( bool isRetry ) {
+        verify( 15875 , _client->lazySupported() );
        Message toSend;
        _assembleInit( toSend );
        _client->say( toSend, isRetry );
diff --git a/client/distlock.cpp b/client/distlock.cpp
index cd516494cf9..cb711590524 100644
--- a/client/distlock.cpp
+++ b/client/distlock.cpp
@@ -634,7 +634,9 @@ namespace mongo {
            // TODO: Clean up all the extra code to exit this method, probably with a refactor
            if ( ! errMsg.empty() || ! err["n"].type() || err["n"].numberInt() < 1 ) {
                ( errMsg.empty() ? log( logLvl - 1 ) : warning() ) << "Could not re-enter lock '" << lockName << "' "
-                        << ( !errMsg.empty() ? causedBy(errMsg) : string("(not sure lock is held)") ) << endl;
+                        << ( !errMsg.empty() ? causedBy(errMsg) : string("(not sure lock is held)") )
+                        << " gle: " << err
+                        << endl;
                *other = o; other->getOwned(); conn.done();
                return false;
            }
diff --git a/client/distlock_test.cpp b/client/distlock_test.cpp
index ab5183c7069..42a1c48cedb 100644
--- a/client/distlock_test.cpp
+++ b/client/distlock_test.cpp
@@ -86,7 +86,7 @@ namespace mongo {
            }
        }
 
-        bool run(const string&, BSONObj& cmdObj, string& errmsg,
+        bool run(const string&, BSONObj& cmdObj, int, string& errmsg,
                 BSONObjBuilder& result, bool) {
            Timer t;
            DistributedLock lk(ConnectionString(cmdObj["host"].String(),
@@ -288,7 +288,7 @@ namespace mongo {
            return;
        }
 
-        bool run(const string&, BSONObj& cmdObj, string& errmsg,
+        bool run(const string&, BSONObj& cmdObj, int, string& errmsg,
                 BSONObjBuilder& result, bool) {
 
            Timer t;
@@ -417,7 +417,7 @@ namespace mongo {
            return NONE;
        }
 
-        bool run(const string&, BSONObj& cmdObj, string& errmsg,
+        bool run(const string&, BSONObj& cmdObj, int, string& errmsg,
                 BSONObjBuilder& result, bool) {
 
            long long skew = (long long) number_field(cmdObj, "skew", 0);
diff --git a/client/examples/httpClientTest.cpp b/client/examples/httpClientTest.cpp
index fab3251ec49..4055d4492d5 100644
--- a/client/examples/httpClientTest.cpp
+++ b/client/examples/httpClientTest.cpp
@@ -22,20 +22,7 @@
 
 using namespace mongo;
 
-int main( int argc, const char **argv ) {
-
-    int port = 27017;
-    if ( argc != 1 ) {
-        if ( argc != 3 )
-            throw -12;
-        port = atoi( argv[ 2 ] );
-    }
-    port += 1000;
-
-    stringstream ss;
-    ss << "http://localhost:" << port << "/";
-    string url = ss.str();
-
+void play( string url ) {
    cout << "[" << url << "]" << endl;
 
    HttpClient c;
@@ -45,8 +32,27 @@ int main( int argc, const char **argv ) {
    HttpClient::Headers h = r.getHeaders();
    MONGO_assert( h["Content-Type"].find( "text/html" ) == 0 );
 
-    cout << "Headers" << endl;
+    cout << "\tHeaders" << endl;
    for ( HttpClient::Headers::iterator i = h.begin() ; i != h.end(); ++i ) {
-        cout << i->first << "\t" << i->second << endl;
+        cout << "\t\t" << i->first << "\t" << i->second << endl;
    }
+
+}
+
+int main( int argc, const char **argv ) {
+
+    int port = 27017;
+    if ( argc != 1 ) {
+        if ( argc != 3 )
+            throw -12;
+        port = atoi( argv[ 2 ] );
+    }
+    port += 1000;
+
+    play( str::stream() << "http://localhost:" << port << "/" );
+
+#ifdef MONGO_SSL
+    play( "https://www.10gen.com/" );
+#endif
+
 }
diff --git a/client/examples/rs.cpp b/client/examples/rs.cpp
index 65fff8d2948..3307d87b56b 100644
--- a/client/examples/rs.cpp
+++ b/client/examples/rs.cpp
@@ -57,14 +57,19 @@ int main( int argc , const char ** argv ) {
 
    unsigned nThreads = 1;
    bool print = false;
+    bool testTimeout = false;
 
    for ( int i=1; i<argc; i++ ) {
        if ( mongoutils::str::equals( "--threads" , argv[i] ) ) {
            nThreads = atoi( argv[++i] );
        }
-        else if ( mongoutils::str::equals( "--print" , argv[1] ) ) {
+        else if ( mongoutils::str::equals( "--print" , argv[i] ) ) {
            print = true;
        }
+        // Run a special mode to demonstrate the DBClientReplicaSet so_timeout option.
+        else if ( mongoutils::str::equals( "--testTimeout" , argv[i] ) ) {
+            testTimeout = true;
+        }
        else {
            cerr << "unknown option: " << argv[i] << endl;
            return 1;
@@ -79,7 +84,7 @@ int main( int argc , const char ** argv ) {
        return 1;
    }
 
-    DBClientReplicaSet * conn = (DBClientReplicaSet*)cs.connect( errmsg );
+    DBClientReplicaSet * conn = dynamic_cast<DBClientReplicaSet*>(cs.connect( errmsg, testTimeout ? 10 : 0 ));
    if ( ! conn ) {
        cout << "error connecting: " << errmsg << endl;
        return 2;
@@ -88,6 +93,17 @@ int main( int argc , const char ** argv ) {
    string collName = "test.rs1";
 
    conn->dropCollection( collName );
+
+    if ( testTimeout ) {
+        conn->insert( collName, BSONObj() );
+        try {
+            conn->count( collName, BSON( "$where" << "sleep(40000)" ) );
+        } catch( DBException& ) {
+            return 0;
+        }
+        cout << "expected socket exception" << endl;
+        return 1;
+    }
 
    vector<boost::shared_ptr<boost::thread> > threads;
    for ( unsigned i=0; i<nThreads; i++ ) {
diff --git a/client/parallel.cpp b/client/parallel.cpp
index f157927703f..76b0168be22 100644
--- a/client/parallel.cpp
+++ b/client/parallel.cpp
@@ -410,6 +410,7 @@ namespace mongo {
        }
    }
 
+    // TODO: Merge with futures API? We do a lot of error checking here that would be useful elsewhere.
    void ParallelSortClusteredCursor::_init() {
 
        // log() << "Starting parallel search..." << endl;
@@ -720,17 +721,23 @@ namespace mongo {
    // ----  Future  -----
    // -----------------
 
-    Future::CommandResult::CommandResult( const string& server , const string& db , const BSONObj& cmd , DBClientBase * conn )
-        :_server(server) ,_db(db) ,_cmd(cmd) ,_conn(conn) ,_done(false)
+    Future::CommandResult::CommandResult( const string& server , const string& db , const BSONObj& cmd , int options , DBClientBase * conn )
+        :_server(server) ,_db(db) , _options(options), _cmd(cmd) ,_conn(conn) ,_done(false)
    {
        try {
            if ( ! _conn ){
                _connHolder.reset( new ScopedDbConnection( _server ) );
                _conn = _connHolder->get();
            }
-
-            _cursor.reset( new DBClientCursor(_conn, _db + ".$cmd", _cmd, -1/*limit*/, 0, NULL, 0, 0));
-            _cursor->initLazy();
+
+            if ( _conn->lazySupported() ) {
+                _cursor.reset( new DBClientCursor(_conn, _db + ".$cmd", _cmd, -1/*limit*/, 0, NULL, _options, 0));
+                _cursor->initLazy();
+            }
+            else {
+                _done = true; // we set _done first because even if there is an error we're done
+                _ok = _conn->runCommand( db , cmd , _res , options );
+            }
        }
        catch ( std::exception& e ) {
            error() << "Future::spawnComand (part 1) exception: " << e.what() << endl;
@@ -768,8 +775,8 @@ namespace mongo {
        return _ok;
    }
 
-    shared_ptr<Future::CommandResult> Future::spawnCommand( const string& server , const string& db , const BSONObj& cmd , DBClientBase * conn ) {
-        shared_ptr<Future::CommandResult> res (new Future::CommandResult( server , db , cmd , conn ));
+    shared_ptr<Future::CommandResult> Future::spawnCommand( const string& server , const string& db , const BSONObj& cmd , int options , DBClientBase * conn ) {
+        shared_ptr<Future::CommandResult> res (new Future::CommandResult( server , db , cmd , options , conn ));
        return res;
    }
diff --git a/client/parallel.h b/client/parallel.h
index 332840edea1..869bff95a4a 100644
--- a/client/parallel.h
+++ b/client/parallel.h
@@ -280,10 +280,11 @@ namespace mongo {
 
        private:
 
-            CommandResult( const string& server , const string& db , const BSONObj& cmd , DBClientBase * conn );
+            CommandResult( const string& server , const string& db , const BSONObj& cmd , int options , DBClientBase * conn );
 
            string _server;
            string _db;
+            int _options;
            BSONObj _cmd;
            DBClientBase * _conn;
            scoped_ptr<ScopedDbConnection> _connHolder; // used if not provided a connection
@@ -304,7 +305,7 @@ namespace mongo {
         * @param cmd cmd to exec
         * @param conn optional connection to use. will use standard pooled if non-specified
         */
-        static shared_ptr<CommandResult> spawnCommand( const string& server , const string& db , const BSONObj& cmd , DBClientBase * conn = 0 );
+        static shared_ptr<CommandResult> spawnCommand( const string& server , const string& db , const BSONObj& cmd , int options , DBClientBase * conn = 0 );
    };
diff --git a/client/syncclusterconnection.h b/client/syncclusterconnection.h
index edd458fe683..68dd338a408 100644
--- a/client/syncclusterconnection.h
+++ b/client/syncclusterconnection.h
@@ -96,6 +96,7 @@ namespace mongo {
        virtual bool auth(const string &dbname, const string &username, const string &password_text, string& errmsg, bool digestPassword);
 
+        virtual bool lazySupported() const { return false; }
    private:
        SyncClusterConnection( SyncClusterConnection& prev, double socketTimeout = 0 );
        string _toString() const;
diff --git a/db/btree.cpp b/db/btree.cpp
index 232ac615470..e4753bef696 100644
--- a/db/btree.cpp
+++ b/db/btree.cpp
@@ -44,7 +44,7 @@ namespace mongo {
    }
 
    /** data check. like assert, but gives a reasonable error message to the user. */
-#define check(expr) _IF(!(expr)) { checkFailed(__LINE__); }
+#define check(expr) if(!(expr) ) { checkFailed(__LINE__); }
 
 #define VERIFYTHISLOC dassert( thisLoc.btree<V>() == this );
diff --git a/db/btree.h b/db/btree.h
index 2e47d69a221..9ffa54cddc0 100644
--- a/db/btree.h
+++ b/db/btree.h
@@ -1071,7 +1071,7 @@ namespace mongo {
         * Our btrees may (rarely) have "unused" keys when items are deleted.
         * Skip past them.
         */
-        virtual bool skipUnusedKeys( bool mayJump ) = 0;
+        virtual bool skipUnusedKeys() = 0;
 
        bool skipOutOfRangeKeysAndCheckEnd();
        void skipAndCheck();
diff --git a/db/btreecursor.cpp b/db/btreecursor.cpp
index cd145ef861f..f39d5bb0535 100644
--- a/db/btreecursor.cpp
+++ b/db/btreecursor.cpp
@@ -68,7 +68,7 @@ namespace mongo {
            return !currKeyNode().prevChildBucket.isNull();
        }
 
-        bool skipUnusedKeys( bool mayJump ) {
+        bool skipUnusedKeys() {
            int u = 0;
            while ( 1 ) {
                if ( !ok() )
@@ -80,9 +80,6 @@ namespace mongo {
                u++;
                //don't include unused keys in nscanned
                //++_nscanned;
-                if ( mayJump && ( u % 10 == 0 ) ) {
-                    skipOutOfRangeKeysAndCheckEnd();
-                }
            }
            if ( u > 10 )
                OCCASIONALLY log() << "btree unused skipped:" << u << '\n';
@@ -114,13 +111,13 @@ namespace mongo {
            while( 1 ) {
                //  if ( b->keyAt(keyOfs).woEqual(keyAtKeyOfs) &&
                //       b->k(keyOfs).recordLoc == locAtKeyOfs ) {
-                if ( keyAt(keyOfs).shallowEqual(keyAtKeyOfs) ) {
+                if ( keyAt(keyOfs).binaryEqual(keyAtKeyOfs) ) {
                    const _KeyNode& kn = keyNode(keyOfs);
                    if( kn.recordLoc == locAtKeyOfs ) {
                        if ( !kn.isUsed() ) {
                            // we were deleted but still exist as an unused
                            // marker key. advance.
-                            skipUnusedKeys( false );
+                            skipUnusedKeys();
                        }
                        return;
                    }
@@ -149,7 +146,7 @@ namespace mongo {
            bucket = _locate(keyAtKeyOfs, locAtKeyOfs);
            RARELY log() << "key seems to have moved in the index, refinding. " << bucket.toString() << endl;
            if ( ! bucket.isNull() )
-                skipUnusedKeys( false );
+                skipUnusedKeys();
 
        }
@@ -329,18 +326,24 @@ namespace mongo {
        if ( ok() ) {
            _nscanned = 1;
        }
-        skipUnusedKeys( false );
+        skipUnusedKeys();
        checkEnd();
    }
 
    void BtreeCursor::skipAndCheck() {
-        skipUnusedKeys( true );
+        int startNscanned = _nscanned;
+        skipUnusedKeys();
        while( 1 ) {
            if ( !skipOutOfRangeKeysAndCheckEnd() ) {
                break;
            }
-            while( skipOutOfRangeKeysAndCheckEnd() );
-            if ( !skipUnusedKeys( true ) ) {
+            do {
+                if ( _nscanned > startNscanned + 20 ) {
+                    skipUnusedKeys();
+                    return;
+                }
+            } while( skipOutOfRangeKeysAndCheckEnd() );
+            if ( !skipUnusedKeys() ) {
                break;
            }
        }
@@ -395,7 +398,7 @@ namespace mongo {
        bucket = _advance(bucket, keyOfs, _direction, "BtreeCursor::advance");
 
        if ( !_independentFieldRanges ) {
-            skipUnusedKeys( false );
+            skipUnusedKeys();
            checkEnd();
            if ( ok() ) {
                ++_nscanned;
diff --git a/db/client.cpp b/db/client.cpp
index be5dba9ae56..bf3aead75a6 100644
--- a/db/client.cpp
+++ b/db/client.cpp
@@ -122,10 +122,13 @@ namespace mongo {
            error() << "Client::shutdown not called: " << _desc << endl;
        }
 
-        scoped_lock bl(clientsMutex);
-        if ( ! _shutdown )
-            clients.erase(this);
-        delete _curOp;
+        if ( ! inShutdown() ) {
+            // we can't clean up safely once we're in shutdown
+            scoped_lock bl(clientsMutex);
+            if ( ! _shutdown )
+                clients.erase(this);
+            delete _curOp;
+        }
    }
 
    bool Client::shutdown() {
@@ -469,7 +472,7 @@ namespace mongo {
        virtual LockType locktype() const { return NONE; }
        virtual bool slaveOk() const { return true; }
        virtual bool adminOnly() const { return false; }
-        virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+        virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
            Client& c = cc();
            c.gotHandshake( cmdObj );
            return 1;
@@ -688,11 +691,14 @@ namespace mongo {
 #define OPDEBUG_APPEND_NUMBER(x) if( x ) b.append( #x , (x) )
 #define OPDEBUG_APPEND_BOOL(x) if( x ) b.appendBool( #x , (x) )
 
-    void OpDebug::append( BSONObjBuilder& b ) const {
+    void OpDebug::append( const CurOp& curop, BSONObjBuilder& b ) const {
        b.append( "op" , iscommand ? "command" : opToString( op ) );
        b.append( "ns" , ns.toString() );
        if ( ! query.isEmpty() )
            b.append( iscommand ? "command" : "query" , query );
+        else if ( ! iscommand && curop.haveQuery() )
+            curop.appendQuery( b , "query" );
+
        if ( ! updateobj.isEmpty() )
            b.append( "updateobj" , updateobj );
diff --git a/db/clientcursor.cpp b/db/clientcursor.cpp
index 615616e7a7c..e803afd459c 100644
--- a/db/clientcursor.cpp
+++ b/db/clientcursor.cpp
@@ -447,16 +447,29 @@ namespace mongo {
        return rec;
    }
 
-    bool ClientCursor::yieldSometimes( RecordNeeds need ) {
+    bool ClientCursor::yieldSometimes( RecordNeeds need, bool *yielded ) {
+        if ( yielded ) {
+            *yielded = false;
+        }
        if ( ! _yieldSometimesTracker.ping() ) {
            Record* rec = _recordForYield( need );
-            if ( rec )
+            if ( rec ) {
+                if ( yielded ) {
+                    *yielded = true;
+                }
                return yield( yieldSuggest() , rec );
+            }
            return true;
        }
 
        int micros = yieldSuggest();
-        return ( micros > 0 ) ? yield( micros , _recordForYield( need ) ) : true;
+        if ( micros > 0 ) {
+            if ( yielded ) {
+                *yielded = true;
+            }
+            return yield( micros , _recordForYield( need ) );
+        }
+        return true;
    }
 
    void ClientCursor::staticYield( int micros , const StringData& ns , Record * rec ) {
@@ -616,7 +629,7 @@ namespace mongo {
            help << " example: { cursorInfo : 1 }";
        }
        virtual LockType locktype() const { return NONE; }
-        bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+        bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
            ClientCursor::appendStats( result );
            return true;
        }
diff --git a/db/clientcursor.h b/db/clientcursor.h
index b3bd996c768..75c7da85cc6 100644
--- a/db/clientcursor.h
+++ b/db/clientcursor.h
@@ -186,9 +186,10 @@ namespace mongo {
        /**
         * @param needRecord whether or not the next record has to be read from disk for sure
         *                   if this is true, will yield of next record isn't in memory
+         * @param yielded true if a yield occurred, and potentially if a yield did not occur
         * @return same as yield()
         */
-        bool yieldSometimes( RecordNeeds need );
+        bool yieldSometimes( RecordNeeds need, bool *yielded = 0 );
 
        static int yieldSuggest();
        static void staticYield( int micros , const StringData& ns , Record * rec );
diff --git a/db/cloner.cpp b/db/cloner.cpp
index 2a46ea22cb4..8956133daa3 100644
--- a/db/cloner.cpp
+++ b/db/cloner.cpp
@@ -460,7 +460,7 @@ namespace mongo {
            help << "{ clone : \"host13\" }";
        }
        CmdClone() : Command("clone") { }
-        virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+        virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
            string from = cmdObj.getStringField("clone");
            if ( from.empty() )
                return false;
@@ -486,7 +486,7 @@ namespace mongo {
                 "Warning: the local copy of 'ns' is emptied before the copying begins. Any existing data will be lost there."
                 ;
        }
-        virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+        virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
            string fromhost = cmdObj.getStringField("from");
            if ( fromhost.empty() ) {
                errmsg = "missing 'from' parameter";
@@ -538,7 +538,7 @@ namespace mongo {
            help << "get a nonce for subsequent copy db request from secure server\n";
            help << "usage: {copydbgetnonce: 1, fromhost: <hostname>}";
        }
-        virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+        virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
            string fromhost = cmdObj.getStringField("fromhost");
            if ( fromhost.empty() ) {
                /* copy from self */
@@ -579,7 +579,7 @@ namespace mongo {
            help << "copy a database from another host to this host\n";
            help << "usage: {copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>[, slaveOk: <bool>, username: <username>, nonce: <nonce>, key: <key>]}";
        }
-        virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+        virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
            bool slaveOk = cmdObj["slaveOk"].trueValue();
            string fromhost = cmdObj.getStringField("fromhost");
            if ( fromhost.empty() ) {
@@ -633,7 +633,7 @@ namespace mongo {
        virtual void help( stringstream &help ) const {
            help << " example: { renameCollection: foo.a, to: bar.b }";
        }
-        virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+        virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
            string source = cmdObj.getStringField( name.c_str() );
            string target = cmdObj.getStringField( "to" );
            if ( source.empty() || target.empty() ) {
@@ -671,6 +671,8 @@ namespace mongo {
                nsToDatabase( target.c_str(), to );
                if ( strcmp( from, to ) == 0 ) {
                    renameNamespace( source.c_str(), target.c_str() );
+                    // make sure we drop counters etc
+                    Top::global.collectionDropped( source );
                    return true;
                }
            }
diff --git a/db/cmdline.cpp b/db/cmdline.cpp
index d0b80455ff2..06880c98829 100644
--- a/db/cmdline.cpp
+++ b/db/cmdline.cpp
@@ -19,6 +19,7 @@
 #include "pch.h"
 #include "cmdline.h"
 #include "commands.h"
+#include "../util/password.h"
 #include "../util/processinfo.h"
 #include "../util/net/listen.h"
 #include "security_common.h"
@@ -27,6 +28,8 @@
 #include <direct.h>
 #endif
 
+#define MAX_LINE_LENGTH 256
+
 namespace po = boost::program_options;
 namespace fs = boost::filesystem;
 
@@ -64,6 +67,14 @@ namespace mongo {
        ("fork" , "fork server process" )
 #endif
        ;
+
+        hidden.add_options()
+#ifdef MONGO_SSL
+        ("sslOnNormalPorts" , "use ssl on configured ports" )
+        ("sslPEMKeyFile" , po::value<string>(&cmdLine.sslPEMKeyFile), "PEM file for ssl" )
+        ("sslPEMKeyPassword" , new PasswordValue(&cmdLine.sslPEMKeyPassword) , "PEM file password" )
+#endif
+        ;
    }
@@ -85,6 +96,32 @@ namespace mongo {
    }
 #endif
 
+    void CmdLine::parseConfigFile( istream &f, stringstream &ss ) {
+        string s;
+        char line[MAX_LINE_LENGTH];
+
+        while ( f ) {
+            f.getline(line, MAX_LINE_LENGTH);
+            s = line;
+            std::remove(s.begin(), s.end(), ' ');
+            std::remove(s.begin(), s.end(), '\t');
+            boost::to_upper(s);
+
+            if ( s.find( "FASTSYNC" ) != string::npos )
+                cout << "warning \"fastsync\" should not be put in your configuration file" << endl;
+
+            if ( s.c_str()[0] == '#' ) {
+                // skipping commented line
+            } else if ( s.find( "=FALSE" ) == string::npos ) {
+                ss << line << endl;
+            } else {
+                cout << "warning: remove or comment out this line by starting it with \'#\', skipping now : " << line << endl;
+            }
+        }
+        return;
+    }
+
+
    bool CmdLine::store( int argc , char ** argv ,
                         boost::program_options::options_description& visible,
@@ -141,7 +178,9 @@ namespace mongo {
                return false;
            }
 
-            po::store( po::parse_config_file( f , all ) , params );
+            stringstream ss;
+            CmdLine::parseConfigFile( f, ss );
+            po::store( po::parse_config_file( ss , all ) , params );
            f.close();
        }
 
@@ -287,7 +326,25 @@ namespace mongo {
            noauth = false;
        }
 
+#ifdef MONGO_SSL
+        if (params.count("sslOnNormalPorts") ) {
+            cmdLine.sslOnNormalPorts = true;
+
+            if ( cmdLine.sslPEMKeyPassword.size() == 0 ) {
+                log() << "need sslPEMKeyPassword" << endl;
+                dbexit(EXIT_BADOPTIONS);
+            }
+
+            if ( cmdLine.sslPEMKeyFile.size() == 0 ) {
+                log() << "need sslPEMKeyFile" << endl;
+                dbexit(EXIT_BADOPTIONS);
+            }
+
+            cmdLine.sslServerManager = new SSLManager( false );
+            cmdLine.sslServerManager->setupPEM( cmdLine.sslPEMKeyFile , cmdLine.sslPEMKeyPassword );
+        }
+#endif
+
        {
            BSONObjBuilder b;
            for (po::variables_map::const_iterator it(params.begin()), end(params.end()); it != end; it++){
@@ -354,7 +411,7 @@ namespace mongo {
        virtual bool adminOnly() const { return true; }
        virtual bool slaveOk() const { return true; }
 
-        virtual bool run(const string&, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+        virtual bool run(const string&, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
            result.append("argv", argvArray);
            result.append("parsed", parsedOpts);
            return true;
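The parseConfigFile() hook above pre-filters the config file before boost::program_options sees it: commented lines are skipped and lines containing "=false" are dropped with a warning. A standalone restatement of the filter (illustrative; this sketch uses the full erase-remove idiom, whereas the patch calls std::remove alone and only reorders the scratch string):

    #include <algorithm>
    #include <cctype>
    #include <iostream>
    #include <sstream>
    #include <string>

    void filterConfig( std::istream& in , std::ostream& out ) {
        std::string line;
        while ( std::getline( in , line ) ) {
            std::string s = line;
            // strip whitespace from a scratch copy, then uppercase it
            s.erase( std::remove( s.begin() , s.end() , ' ' ) , s.end() );
            s.erase( std::remove( s.begin() , s.end() , '\t' ) , s.end() );
            std::transform( s.begin() , s.end() , s.begin() , ::toupper );

            if ( ! s.empty() && s[0] == '#' )
                continue; // commented line
            if ( s.find( "=FALSE" ) != std::string::npos ) {
                std::cerr << "warning: skipping: " << line << std::endl;
                continue; // "option = false" would still enable the option, so drop it
            }
            out << line << '\n'; // forward the original line unchanged
        }
    }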
todo move to cmdline.cpp? inline CmdLine::CmdLine() : - port(DefaultDBPort), rest(false), jsonp(false), quiet(false), noTableScan(false), prealloc(true), smallfiles(sizeof(int*) == 4), + port(DefaultDBPort), rest(false), jsonp(false), quiet(false), noTableScan(false), prealloc(true), preallocj(true), smallfiles(sizeof(int*) == 4), configsvr(false), quota(false), quotaFiles(8), cpu(false), durOptions(0), objcheck(false), oplogSize(0), defaultProfile(0), slowMS(100), pretouch(0), moveParanoia( true ), syncdelay(60), noUnixSocket(false), socket("/tmp") { - // default may change for this later. + journalCommitInterval = 0; // 0 means use default + dur = false; #if defined(_DURABLEDEFAULTON) dur = true; -#else +#endif + if( sizeof(void*) == 8 ) + dur = true; +#if defined(_DURABLEDEFAULTOFF) dur = false; #endif + +#ifdef MONGO_SSL + sslOnNormalPorts = false; + sslServerManager = 0; +#endif } extern CmdLine cmdLine; diff --git a/db/commands.h b/db/commands.h index 454e2277e06..c18621828f2 100644 --- a/db/commands.h +++ b/db/commands.h @@ -20,6 +20,7 @@ #include "../pch.h" #include "jsobj.h" #include "../util/timer.h" +#include "../client/dbclient.h" namespace mongo { @@ -45,7 +46,7 @@ namespace mongo { return value is true if succeeded. if false, set errmsg text. */ - virtual bool run(const string& db, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) = 0; + virtual bool run(const string& db, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl = false ) = 0; /* note: logTheTop() MUST be false if READ @@ -94,6 +95,11 @@ namespace mongo { */ virtual bool requiresAuth() { return true; } + /* Return true if a replica set secondary should go into "recovering" + (unreadable) state while running this command. + */ + virtual bool maintenanceMode() const { return false; } + /** @param webUI expose the command in the web ui as localhost:28017/<name> @param oldName an optional old, deprecated name for the command */ @@ -120,7 +126,7 @@ namespace mongo { static const map<string,Command*>* commandsByBestName() { return _commandsByBestName; } static const map<string,Command*>* webCommands() { return _webCommands; } /** @return if command was found and executed */ - static bool runAgainstRegistered(const char *ns, BSONObj& jsobj, BSONObjBuilder& anObjBuilder); + static bool runAgainstRegistered(const char *ns, BSONObj& jsobj, BSONObjBuilder& anObjBuilder, int queryOptions = 0); static LockType locktype( const string& name ); static Command * findCommand( const string& name ); }; @@ -139,7 +145,7 @@ namespace mongo { virtual LockType locktype() const { return NONE; } virtual void help( stringstream& help ) const; CmdShutdown() : Command("shutdown") {} - bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl); + bool run(const string& dbname, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl); private: bool shutdownHelper(); }; diff --git a/db/commands/distinct.cpp b/db/commands/distinct.cpp index 9a10e69d5a8..48f44050e49 100644 --- a/db/commands/distinct.cpp +++ b/db/commands/distinct.cpp @@ -32,7 +32,7 @@ namespace mongo { help << "{ distinct : 'collection name' , key : 'a.b' , query : {} }"; } - bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { Timer t; string ns = dbname + '.' 
+ cmdObj.firstElement().valuestr(); diff --git a/db/commands/find_and_modify.cpp b/db/commands/find_and_modify.cpp index 2856ab3d3f1..0cf766fcf87 100644 --- a/db/commands/find_and_modify.cpp +++ b/db/commands/find_and_modify.cpp @@ -37,7 +37,7 @@ namespace mongo { virtual bool logTheOp() { return false; } // the modifications will be logged directly virtual bool slaveOk() const { return false; } virtual LockType locktype() const { return WRITE; } - virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { static DBDirectClient db; string ns = dbname + '.' + cmdObj.firstElement().valuestr(); diff --git a/db/commands/group.cpp b/db/commands/group.cpp index 9d7acbdf7d4..d3e5839748c 100644 --- a/db/commands/group.cpp +++ b/db/commands/group.cpp @@ -20,6 +20,7 @@ #include "../instance.h" #include "../queryoptimizer.h" #include "../../scripting/engine.h" +#include "../clientcursor.h" namespace mongo { @@ -44,7 +45,7 @@ namespace mongo { uassert( 10042 , "return of $key has to be an object" , type == Object ); return s->getObject( "return" ); } - return obj.extractFields( keyPattern , true ); + return obj.extractFields( keyPattern , true ).getOwned(); } bool group( string realdbname , const string& ns , const BSONObj& query , @@ -88,14 +89,27 @@ namespace mongo { list<BSONObj> blah; shared_ptr<Cursor> cursor = NamespaceDetailsTransient::getCursor(ns.c_str() , query); + ClientCursor::CleanupPointer ccPointer; + ccPointer.reset( new ClientCursor( QueryOption_NoCursorTimeout, cursor, ns ) ); while ( cursor->ok() ) { + + if ( !ccPointer->yieldSometimes( ClientCursor::MaybeCovered ) || + !cursor->ok() ) { + break; + } + if ( ( cursor->matcher() && !cursor->matcher()->matchesCurrent( cursor.get() ) ) || cursor->getsetdup( cursor->currLoc() ) ) { cursor->advance(); continue; } + if ( !ccPointer->yieldSometimes( ClientCursor::WillNeed ) || + !cursor->ok() ) { + break; + } + BSONObj obj = cursor->current(); cursor->advance(); @@ -117,6 +131,7 @@ namespace mongo { throw UserException( 9010 , (string)"reduce invoke failed: " + s->getError() ); } } + ccPointer.reset(); if (!finalize.empty()) { s->exec( "$finalize = " + finalize , "finalize define" , false , true , true , 100 ); @@ -140,7 +155,7 @@ namespace mongo { return true; } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { if ( !globalScriptEngine ) { errmsg = "server-side JavaScript execution is disabled"; diff --git a/db/commands/isself.cpp b/db/commands/isself.cpp index cac8380dc20..5a868de919f 100644 --- a/db/commands/isself.cpp +++ b/db/commands/isself.cpp @@ -130,7 +130,7 @@ namespace mongo { help << "{ _isSelf : 1 } INTERNAL ONLY"; } - bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { init(); result.append( "id" , _id ); return true; diff --git a/db/commands/mr.cpp b/db/commands/mr.cpp index 75f5615b9f6..56e9770dff2 100644 --- a/db/commands/mr.cpp +++ b/db/commands/mr.cpp @@ -879,8 +879,6 @@ namespace mongo { } } -// boost::thread_specific_ptr<State*> _tl; - /** * emit that will be called by js function */ @@ -932,7 +930,7 @@ namespace mongo { help << 
"http://www.mongodb.org/display/DOCS/MapReduce"; } virtual LockType locktype() const { return NONE; } - bool run(const string& dbname , BSONObj& cmd, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + bool run(const string& dbname , BSONObj& cmd, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { Timer t; Client::GodScope cg; Client& client = cc(); @@ -968,12 +966,6 @@ namespace mongo { state.init(); state.prepTempCollection(); - { - State** s = new State*(); - s[0] = &state; -// _tl.reset( s ); - } - wassert( config.limit < 0x4000000 ); // see case on next line to 32 bit unsigned ProgressMeterHolder pm( op->setMessage( "m/r: (1/3) emit phase" , state.incomingDocuments() ) ); long long mapTime = 0; @@ -988,7 +980,9 @@ namespace mongo { // obtain cursor on data to apply mr to, sorted shared_ptr<Cursor> temp = NamespaceDetailsTransient::getCursor( config.ns.c_str(), config.filter, config.sort ); + uassert( 15876, str::stream() << "could not create cursor over " << config.ns << " for query : " << config.filter << " sort : " << config.sort, temp.get() ); auto_ptr<ClientCursor> cursor( new ClientCursor( QueryOption_NoCursorTimeout , temp , config.ns.c_str() ) ); + uassert( 15877, str::stream() << "could not create client cursor over " << config.ns << " for query : " << config.filter << " sort : " << config.sort, cursor.get() ); Timer mt; // go through each doc @@ -1065,11 +1059,19 @@ namespace mongo { countsBuilder.appendNumber( "reduce" , state.numReduces() ); timingBuilder.append( "reduceTime" , inReduce / 1000 ); timingBuilder.append( "mode" , state.jsMode() ? "js" : "mixed" ); - -// _tl.reset(); + } + // TODO: The error handling code for queries is v. fragile, + // *requires* rethrow AssertionExceptions - should probably fix. + catch ( AssertionException& e ){ + log() << "mr failed, removing collection" << causedBy(e) << endl; + throw e; + } + catch ( std::exception& e ){ + log() << "mr failed, removing collection" << causedBy(e) << endl; + throw e; } catch ( ... 
) { - log() << "mr failed, removing collection" << endl; + log() << "mr failed for unknown reason, removing collection" << endl; throw; } @@ -1116,7 +1118,7 @@ namespace mongo { virtual bool slaveOverrideOk() { return true; } virtual LockType locktype() const { return NONE; } - bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { string shardedOutputCollection = cmdObj["shardedOutputCollection"].valuestrsafe(); string postProcessCollection = cmdObj["postProcessCollection"].valuestrsafe(); bool postProcessOnly = !(postProcessCollection.empty()); diff --git a/db/compact.cpp b/db/compact.cpp index a1197460f4f..c6e5f77ee0e 100644 --- a/db/compact.cpp +++ b/db/compact.cpp @@ -263,6 +263,7 @@ namespace mongo { virtual LockType locktype() const { return NONE; } virtual bool adminOnly() const { return false; } virtual bool slaveOk() const { return true; } + virtual bool maintenanceMode() const { return true; } virtual bool logTheOp() { return false; } virtual void help( stringstream& help ) const { help << "compact collection\n" @@ -274,7 +275,7 @@ namespace mongo { virtual bool requiresAuth() { return true; } CompactCmd() : Command("compact") { } - virtual bool run(const string& db, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + virtual bool run(const string& db, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { string coll = cmdObj.firstElement().valuestr(); if( coll.empty() || db.empty() ) { errmsg = "no collection name specified"; diff --git a/db/curop.h b/db/curop.h index f261812d383..2717d78cc62 100644 --- a/db/curop.h +++ b/db/curop.h @@ -28,6 +28,8 @@ namespace mongo { + class CurOp; + /* lifespan is different than CurOp because of recursives with DBDirectClient */ class OpDebug { public: @@ -36,7 +38,7 @@ namespace mongo { void reset(); string toString() const; - void append( BSONObjBuilder& b ) const; + void append( const CurOp& curop, BSONObjBuilder& b ) const; // ------------------- @@ -119,7 +121,7 @@ namespace mongo { int size() const { return *_size; } bool have() const { return size() > 0; } - BSONObj get() { + BSONObj get() const { _lock.lock(); BSONObj o; try { @@ -133,7 +135,7 @@ namespace mongo { return o; } - void append( BSONObjBuilder& b , const StringData& name ) { + void append( BSONObjBuilder& b , const StringData& name ) const { scoped_spinlock lk(_lock); BSONObj temp = _get(); b.append( name , temp ); @@ -141,7 +143,7 @@ namespace mongo { private: /** you have to be locked when you call this */ - BSONObj _get() { + BSONObj _get() const { int sz = size(); if ( sz == 0 ) return BSONObj(); @@ -153,7 +155,7 @@ namespace mongo { /** you have to be locked when you call this */ void _reset( int sz ) { _size[0] = sz; } - SpinLock _lock; + mutable SpinLock _lock; int * _size; char _buf[512]; }; @@ -168,7 +170,8 @@ namespace mongo { bool haveQuery() const { return _query.have(); } BSONObj query() { return _query.get(); } - + void appendQuery( BSONObjBuilder& b , const StringData& name ) const { _query.append( b , name ); } + void ensureStarted() { if ( _start == 0 ) _start = _checkpoint = curTimeMicros64(); diff --git a/db/cursor.h b/db/cursor.h index ff9c9821ada..9639b2677b1 100644 --- a/db/cursor.h +++ b/db/cursor.h @@ -132,6 +132,8 @@ namespace mongo { virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { massert( 13285, "manual matcher config not allowed", 
false ); } + + virtual void explainDetails( BSONObjBuilder& b ) { return; } }; // strategy object implementing direction of traversal.
diff --git a/db/database.cpp b/db/database.cpp index 7906e9b435a..97b3fa011cb 100644 --- a/db/database.cpp +++ b/db/database.cpp @@ -192,22 +192,31 @@ namespace mongo { return ret; } + bool fileIndexExceedsQuota( const char *ns, int fileIndex, bool enforceQuota ) { + return + cmdLine.quota && + enforceQuota && + fileIndex >= cmdLine.quotaFiles && + // we don't enforce the quota on "special" namespaces as that could lead to problems -- e.g. + // rejecting an index insert after inserting the main record. + !NamespaceString::special( ns ) && + NamespaceString( ns ).db != "local"; + } + MongoDataFile* Database::suitableFile( const char *ns, int sizeNeeded, bool preallocate, bool enforceQuota ) { // check existing files for ( int i=numFiles()-1; i>=0; i-- ) { MongoDataFile* f = getFile( i ); if ( f->getHeader()->unusedLength >= sizeNeeded ) { - // we don't enforce the quota on "special" namespaces as that could lead to problems -- e.g. - // rejecting an index insert after inserting the main record. - if( cmdLine.quota && enforceQuota && i > cmdLine.quotaFiles && !NamespaceString::special(ns) ) + if ( fileIndexExceedsQuota( ns, i-1, enforceQuota ) ) // NOTE i-1 is the value used historically for this check. ; else return f; } } - if( cmdLine.quota && enforceQuota && numFiles() >= cmdLine.quotaFiles && !NamespaceString::special(ns) ) + if ( fileIndexExceedsQuota( ns, numFiles(), enforceQuota ) ) uasserted(12501, "quota exceeded"); // allocate files until we either get one big enough or hit maxSize @@ -261,8 +270,8 @@ namespace mongo { log() << "creating profile collection: " << profileName << endl; BSONObjBuilder spec; spec.appendBool( "capped", true ); - spec.append( "size", 131072.0 ); - if ( ! userCreateNS( profileName.c_str(), spec.done(), errmsg , true ) ) { + spec.append( "size", 1024*1024 ); + if ( ! userCreateNS( profileName.c_str(), spec.done(), errmsg , false /* we don't replicate profile messages */ ) ) { return false; } }
diff --git a/db/db.cpp b/db/db.cpp index 9f90b9ddd02..2d4970af044 100644 --- a/db/db.cpp +++ b/db/db.cpp @@ -70,7 +70,6 @@ namespace mongo { extern string repairpath; void setupSignals( bool inFork ); - void startReplSets(ReplSetCmdline*); void startReplication(); void exitCleanly( ExitCode code ); @@ -216,8 +215,6 @@ namespace mongo { void listen(int port) { //testTheDb(); - log() << "waiting for connections on port " << port << endl; - MessageServer::Options options; options.port = port; options.ipList = cmdLine.bind_ip; @@ -483,12 +480,6 @@ namespace mongo { clientCursorMonitor.go(); PeriodicTask::theRunner->go(); - if( !cmdLine._replSet.empty() ) { - replSet = true; - ReplSetCmdline *replSetCmdline = new ReplSetCmdline(cmdLine._replSet); - boost::thread t( boost::bind( &startReplSets, replSetCmdline) ); - } - listen(listenPort); // listen() will return when exit code closes its socket.
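Aside: the quota change above is a small refactor with a subtle edge. The two inline checks in Database::suitableFile are folded into one predicate, fileIndexExceedsQuota(), and the first call site passes i-1 to preserve the value used historically. A minimal standalone sketch of the same decision logic; QuotaConfig stands in for the real cmdLine globals, and the namespace tests are reduced to precomputed arguments:

    #include <iostream>
    #include <string>

    // Stand-in for the cmdLine fields the real predicate reads (assumption).
    struct QuotaConfig {
        bool quota;       // --quota enabled
        int  quotaFiles;  // --quotaFiles n
    };

    // Mirrors the predicate added to db/database.cpp: over quota only when quotas
    // are enabled and enforced, the file index has reached the limit, and the
    // namespace is neither "special" (e.g. index namespaces) nor in "local".
    bool fileIndexExceedsQuota( const QuotaConfig& cfg, bool specialNs,
                                const std::string& db, int fileIndex, bool enforceQuota ) {
        return cfg.quota &&
               enforceQuota &&
               fileIndex >= cfg.quotaFiles &&
               !specialNs &&
               db != "local";
    }

    int main() {
        QuotaConfig cfg = { true, 8 };
        // the 9th file (index 8) of an ordinary collection trips the quota:
        std::cout << fileIndexExceedsQuota( cfg, false, "test", 8, true ) << std::endl;  // 1
        // but the "local" database is exempt:
        std::cout << fileIndexExceedsQuota( cfg, false, "local", 8, true ) << std::endl; // 0
        return 0;
    }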
@@ -575,10 +566,12 @@ int main(int argc, char* argv[]) { ("directoryperdb", "each database will be stored in a separate directory") ("journal", "enable journaling") ("journalOptions", po::value<int>(), "journal diagnostic options") + ("journalCommitInterval", po::value<unsigned>(), "how often to group/batch commit (ms)") ("ipv6", "enable IPv6 support (disabled by default)") ("jsonp","allow JSONP access via http (has security implications)") ("noauth", "run without security") ("nohttpinterface", "disable http interface") + ("nojournal", "disable journaling (journaling is on by default for 64 bit)") ("noprealloc", "disable data file preallocation - will often hurt performance") ("noscripting", "disable scripting engine") ("notablescan", "do not allow table scans") @@ -631,12 +624,11 @@ int main(int argc, char* argv[]) { ("pretouch", po::value<int>(), "n pretouch threads for applying replicated operations") ("command", po::value< vector<string> >(), "command") ("cacheSize", po::value<long>(), "cache size (in MB) for rec store") - // these move to unhidden later: ("nodur", "disable journaling (currently the default)") - ("nojournal", "disable journaling (currently the default)") // things we don't want people to use ("nocursors", "diagnostic/debugging option that turns off cursors DO NOT USE IN PRODUCTION") ("nohints", "ignore query hints") + ("nopreallocj", "don't preallocate journal files") ("dur", "enable journaling") // deprecated version ("durOptions", po::value<int>(), "durability diagnostic options") // deprecated version // deprecated pairing command line options @@ -745,6 +737,15 @@ int main(int argc, char* argv[]) { if (params.count("durOptions")) { cmdLine.durOptions = params["durOptions"].as<int>(); } + if( params.count("journalCommitInterval") ) { + // don't check if dur is false here as many will just use the default, and will default to off on win32. + // ie no point making life a little more complex by giving an error on a dev environment. + cmdLine.journalCommitInterval = params["journalCommitInterval"].as<unsigned>(); + if( cmdLine.journalCommitInterval <= 1 || cmdLine.journalCommitInterval > 300 ) { + out() << "--journalCommitInterval out of allowed range (2-300ms)" << endl; + dbexit( EXIT_BADOPTIONS ); + } + } if (params.count("journalOptions")) { cmdLine.durOptions = params["journalOptions"].as<int>(); } @@ -761,6 +762,9 @@ int main(int argc, char* argv[]) { if (params.count("nohints")) { useHints = false; } + if (params.count("nopreallocj")) { + cmdLine.preallocj = false; + } if (params.count("nohttpinterface")) { noHttpInterface = true; }
diff --git a/db/db.vcxproj b/db/db.vcxproj index 685015ed7f6..8f831cb8559 100755 --- a/db/db.vcxproj +++ b/db/db.vcxproj @@ -459,9 +459,27 @@ <ClCompile Include="..\s\shard.cpp" />
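Aside: two related changes meet in the hunks above. The CmdLine constructor (db/cmdline.h, earlier in this diff) now defaults journaling on for 64-bit builds, and --journalCommitInterval is range-checked to 2-300 ms, with 0 reserved to mean "use the default" (chosen later by the journal thread). A compilable sketch of how those rules compose; the function and flag names here are illustrative, not the real option parser:

    #include <cstdlib>
    #include <iostream>

    struct JournalOpts {
        bool dur;                        // journaling enabled?
        unsigned journalCommitInterval;  // ms; 0 => "use default" (100ms, or 30ms when
                                         // journal and data sit on different partitions
                                         // -- see durThread later in this diff)
    };

    // Layered default, mirroring the order in the CmdLine constructor:
    // build flag, then word size, then build flag again, then the command line.
    JournalOpts resolveJournalOpts( bool durableDefaultOn, bool durableDefaultOff,
                                    bool nojournalFlag, unsigned intervalMs ) {
        JournalOpts o;
        o.dur = false;
        if( durableDefaultOn ) o.dur = true;
        if( sizeof(void*) == 8 ) o.dur = true;   // 64 bit: on by default
        if( durableDefaultOff ) o.dur = false;
        if( nojournalFlag ) o.dur = false;       // --nojournal wins at runtime
        if( intervalMs != 0 && ( intervalMs <= 1 || intervalMs > 300 ) ) {
            std::cerr << "--journalCommitInterval out of allowed range (2-300ms)" << std::endl;
            std::exit( EXIT_FAILURE );
        }
        o.journalCommitInterval = intervalMs;
        return o;
    }

    int main() {
        JournalOpts o = resolveJournalOpts( false, false, false, 0 );
        std::cout << "dur=" << o.dur << " interval=" << o.journalCommitInterval << std::endl;
        return 0;
    }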
<ClCompile Include="..\s\shardconnection.cpp" />
<ClCompile Include="..\s\shardkey.cpp" />
+ <ClCompile Include="..\third_party\snappy\snappy-sinksource.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
<ClCompile Include="..\util\alignedbuilder.cpp">
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
</ClCompile>
+ <ClCompile Include="..\util\compress.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
<ClCompile Include="..\util\concurrency\spin_lock.cpp" />
<ClCompile Include="..\util\concurrency\synchronization.cpp" />
<ClCompile Include="..\util\concurrency\task.cpp" />
@@ -561,6 +579,7 @@ <ClCompile Include="..\client\parallel.cpp" />
<ClCompile Include="pdfile.cpp" />
<ClCompile Include="queryoptimizer.cpp" />
+ <ClCompile Include="scanandorder.cpp" />
<ClCompile Include="security.cpp" />
<ClCompile Include="security_commands.cpp" />
<ClCompile Include="security_common.cpp" />
@@ -652,6 +671,8 @@ <ClInclude Include="..\targetver.h" />
<ClInclude Include="..\pcre-7.4\config.h" />
<ClInclude Include="..\pcre-7.4\pcre.h" />
+ <ClInclude Include="..\third_party\snappy\config.h" />
+ <ClInclude Include="..\third_party\snappy\snappy.h" />
<ClInclude Include="..\util\alignedbuilder.h" />
<ClInclude Include="..\util\concurrency\mutexdebugger.h" />
<ClInclude Include="..\util\concurrency\race.h" />
diff --git a/db/db.vcxproj.filters b/db/db.vcxproj.filters index d9e9def86f8..36b0df1ddc2 100755 --- a/db/db.vcxproj.filters +++ b/db/db.vcxproj.filters @@ -1,4 +1,4 @@ -{UTF-8 BOM}<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="..\bson\oid.cpp" />
@@ -166,6 +166,16 @@ <ClCompile Include="..\util\net\message_port.cpp" />
<ClCompile Include="dbmessage.cpp" />
<ClCompile Include="commands\find_and_modify.cpp" />
+ <ClCompile Include="..\util\compress.cpp">
+ <Filter>snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy-sinksource.cc">
+ <Filter>snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy.cc">
+ <Filter>snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="scanandorder.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\client\dbclientcursor.h" />
@@ -315,6 +325,12 @@ <ClInclude Include="..\util\net\sock.h" />
<ClInclude Include="..\util\concurrency\rwlockimpl.h" />
<ClInclude Include="..\util\concurrency\mutexdebugger.h" />
+ <ClInclude Include="..\third_party\snappy\config.h">
+ <Filter>snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\snappy\snappy.h">
+ <Filter>snappy</Filter>
+ </ClInclude>
</ItemGroup>
<ItemGroup>
<ResourceCompile Include="db.rc" />
@@ -349,4 +365,9 @@ <Library Include="..\..\js\js64d.lib" />
<Library Include="..\..\js\js64r.lib" />
</ItemGroup>
+ <ItemGroup>
+ <Filter Include="snappy">
+ <UniqueIdentifier>{bb99c086-7926-4f50-838d-f5f0c18397c0}</UniqueIdentifier>
+ </Filter>
+ </ItemGroup>
</Project>
\ No newline at end of file diff --git a/db/dbcommands.cpp b/db/dbcommands.cpp index 73c1004d4f2..2edd7684ff8 100644 --- a/db/dbcommands.cpp +++ b/db/dbcommands.cpp @@ -31,6 +31,7 @@ #include "../util/lruishmap.h" #include "../util/md5.hpp" #include "../util/processinfo.h" +#include "../util/ramlog.h" #include "json.h" #include "repl.h" #include "repl_block.h" @@ -53,14 +54,16 @@ namespace mongo { namespace dur { void setAgeOutJournalFiles(bool rotate); } + /** @return true if fields found */ bool setParmsMongodSpecific(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { BSONElement e = cmdObj["ageOutJournalFiles"]; if( !e.eoo() ) { bool r = e.trueValue(); log() << "ageOutJournalFiles " << r << endl; dur::setAgeOutJournalFiles(r); + return true; } - return true; + return false; } void flushDiagLog(); @@ -85,7 +88,7 @@ namespace mongo { help << "reset error state (used with getpreverror)"; } CmdResetError() : Command("resetError", false, "reseterror") {} - bool run(const string& db, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool run(const string& db, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { LastError *le = lastError.get(); assert( le ); le->reset(); @@ -116,7 +119,7 @@ namespace mongo { << " { w:n } - await replication to n servers (including self) before returning\n" << " { wtimeout:m} - timeout for w in m milliseconds"; } - bool run(const string& dbname, BSONObj& _cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool run(const string& dbname, BSONObj& _cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { LastError *le = lastError.disableForCommand(); bool err = false; @@ -246,7 +249,7 @@ namespace mongo { return true; } CmdGetPrevError() : Command("getPrevError", false, "getpreverror") {} - bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { LastError *le = lastError.disableForCommand(); le->appendSelf( result ); if ( le->valid ) @@ -268,14 +271,14 @@ namespace mongo { << "N to wait N seconds for other members to catch up."; } - bool CmdShutdown::run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool CmdShutdown::run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { bool force = cmdObj.hasField("force") && cmdObj["force"].trueValue(); if (!force && theReplSet && theReplSet->isPrimary()) { - int timeout, now, start; + long long timeout, now, start; timeout = now = start = curTimeMicros64()/1000000; if (cmdObj.hasField("timeoutSecs")) { - timeout += cmdObj["timeoutSecs"].numberInt(); + timeout += cmdObj["timeoutSecs"].numberLong(); } OpTime lastOp = theReplSet->lastOpTimeWritten; @@ -329,7 +332,7 @@ namespace mongo { } virtual LockType locktype() const { return WRITE; } CmdDropDatabase() : Command("dropDatabase") {} - bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { BSONElement e = cmdObj.firstElement(); log() << "dropDatabase " << dbname << endl; int p = (int) e.number(); @@ -349,12 +352,13 @@ namespace mongo { virtual bool slaveOk() const { return true; } + virtual bool maintenanceMode() const { return true; } 
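Aside: maintenanceMode() is the new Command hook (db/commands.h, earlier in this diff) that repairDatabase and compact opt into: while such a command runs, a replica-set secondary is put into the unreadable "recovering" state. The patch toggles theReplSet->setMaintenanceMode(true/false) by hand on each exit path of execCommand; a hypothetical RAII guard expressing the same protocol, with ReplSet stubbed in for the real global:

    // Hypothetical sketch, not part of the patch.
    struct ReplSet {                              // stub for the real replica set object
        bool isSecondary() const { return true; }
        void setMaintenanceMode( bool ) {}
    };
    static ReplSet* theReplSet = 0;               // stands in for the real global

    class MaintenanceModeGuard {
        bool _active;
    public:
        // enter maintenance mode only if the command asks for it and we are a secondary
        explicit MaintenanceModeGuard( bool commandWantsIt ) : _active(false) {
            if ( commandWantsIt && theReplSet && theReplSet->isSecondary() ) {
                theReplSet->setMaintenanceMode( true );
                _active = true;
            }
        }
        // leaves maintenance mode on every path out of the command, including throws
        ~MaintenanceModeGuard() {
            if ( _active )
                theReplSet->setMaintenanceMode( false );
        }
    };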
virtual void help( stringstream& help ) const { help << "repair database. also compacts. note: slow."; } virtual LockType locktype() const { return WRITE; } CmdRepairDatabase() : Command("repairDatabase") {} - bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { BSONElement e = cmdObj.firstElement(); log() << "repairDatabase " << dbname << endl; int p = (int) e.number(); @@ -388,7 +392,7 @@ namespace mongo { } virtual LockType locktype() const { return WRITE; } CmdProfile() : Command("profile") {} - bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { BSONElement e = cmdObj.firstElement(); result.append("was", cc().database()->profile); result.append("slowms", cmdLine.slowMS ); @@ -425,7 +429,7 @@ namespace mongo { help << "returns lots of administrative server statistics"; } - bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { long long start = Listener::getElapsedTimeMillis(); BSONObjBuilder timeBuilder(128); @@ -596,6 +600,21 @@ namespace mongo { timeBuilder.appendNumber( "after dur" , Listener::getElapsedTimeMillis() - start ); + { + RamLog* rl = RamLog::get( "warnings" ); + verify(15880, rl); + + if (rl->lastWrite() >= time(0)-(10*60)){ // only show warnings from last 10 minutes + vector<const char*> lines; + rl->get( lines ); + + BSONArrayBuilder arr( result.subarrayStart( "warnings" ) ); + for ( unsigned i=std::max(0,(int)lines.size()-10); i<lines.size(); i++ ) + arr.append( lines[i] ); + arr.done(); + } + } + if ( ! authed ) result.append( "note" , "run against admin for more info" ); @@ -619,7 +638,7 @@ namespace mongo { virtual void help( stringstream& help ) const { help << "internal"; } virtual LockType locktype() const { return NONE; } CmdGetOpTime() : Command("getoptime") { } - bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { writelock l( "" ); result.appendDate("optime", OpTime::now().asDate()); return true; @@ -648,7 +667,7 @@ namespace mongo { } void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/Monitoring+and+Diagnostics#MonitoringandDiagnostics-DatabaseRecord%2FReplay"; } virtual LockType locktype() const { return WRITE; } - bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { int was = _diaglog.setLevel( cmdObj.firstElement().numberInt() ); flushDiagLog(); if ( !cmdLine.quiet ) @@ -771,7 +790,7 @@ namespace mongo { } virtual void help( stringstream& help ) const { help << "drop a collection\n{drop : <collectionName>}"; } virtual LockType locktype() const { return WRITE; } - virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { string nsToDrop = dbname + '.' 
+ cmdObj.firstElement().valuestr(); NamespaceDetails *d = nsdetails(nsToDrop.c_str()); if ( !cmdLine.quiet ) @@ -805,7 +824,7 @@ namespace mongo { return false; } virtual void help( stringstream& help ) const { help << "count objects in collection"; } - virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { string ns = dbname + '.' + cmdObj.firstElement().valuestr(); string err; long long n = runCount(ns.c_str(), cmdObj, err); @@ -844,7 +863,8 @@ namespace mongo { help << "create a collection explicitly\n" "{ create: <ns>[, capped: <bool>, size: <collSizeInBytes>, max: <nDocs>] }"; } - virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + uassert(15888, "must pass name of collection to create", cmdObj.firstElement().valuestrsafe()[0] != '\0'); string ns = dbname + '.' + cmdObj.firstElement().valuestr(); string err; uassert(14832, "specify size:<n> when capped is true", !cmdObj["capped"].trueValue() || cmdObj["size"].isNumber() || cmdObj.hasField("$nExtents")); @@ -869,7 +889,7 @@ namespace mongo { help << "drop indexes for a collection"; } CmdDropIndexes() : Command("dropIndexes", false, "deleteIndexes") { } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& anObjBuilder, bool /*fromRepl*/) { + bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& anObjBuilder, bool /*fromRepl*/) { BSONElement e = jsobj.firstElement(); string toDeleteNs = dbname + '.' + e.valuestr(); NamespaceDetails *d = nsdetails(toDeleteNs.c_str()); @@ -914,7 +934,7 @@ namespace mongo { help << "re-index a collection"; } CmdReIndex() : Command("reIndex") { } - bool run(const string& dbname , BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { + bool run(const string& dbname , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { static DBDirectClient db; BSONElement e = jsobj.firstElement(); @@ -969,7 +989,7 @@ namespace mongo { virtual LockType locktype() const { return NONE; } virtual void help( stringstream& help ) const { help << "list databases on this server"; } CmdListDatabases() : Command("listDatabases" , true ) {} - bool run(const string& dbname , BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { + bool run(const string& dbname , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { vector< string > dbNames; getDatabaseNames( dbNames ); vector< BSONObj > dbInfos; @@ -1038,7 +1058,7 @@ namespace mongo { virtual LockType locktype() const { return WRITE; } CmdCloseAllDatabases() : Command( "closeAllDatabases" ) {} - bool run(const string& dbname , BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { + bool run(const string& dbname , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { bool ok; try { ok = dbHolder.closeAll( dbpath , result, false ); @@ -1065,7 +1085,7 @@ namespace mongo { help << " example: { filemd5 : ObjectId(aaaaaaa) , root : \"fs\" }"; } virtual LockType locktype() const { return READ; } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + bool run(const string& dbname, BSONObj& 
jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { string ns = dbname; ns += "."; { @@ -1164,7 +1184,7 @@ namespace mongo { "\nkeyPattern, min, and max parameters are optional." "\nnote: This command may take a while to run"; } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { Timer timer; string ns = jsobj.firstElement().String(); @@ -1282,7 +1302,7 @@ namespace mongo { help << "{ collStats:\"blog.posts\" , scale : 1 } scale divides sizes e.g. for KB use 1024\n" " avgObjSize - in bytes"; } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { string ns = dbname + "." + jsobj.firstElement().valuestr(); Client::Context cx( ns ); @@ -1351,7 +1371,7 @@ namespace mongo { "Get stats on a database. Not instantaneous. Slower for databases with large .ns files.\n" << "Example: { dbStats:1, scale:1 }"; } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { int scale = 1; if ( jsobj["scale"].isNumber() ) { scale = jsobj["scale"].numberInt(); @@ -1426,7 +1446,7 @@ namespace mongo { virtual void help( stringstream &help ) const { help << "{ cloneCollectionAsCapped:<fromName>, toCollection:<toName>, size:<sizeInBytes> }"; } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { string from = jsobj.getStringField( "cloneCollectionAsCapped" ); string to = jsobj.getStringField( "toCollection" ); long long size = (long long)jsobj.getField( "size" ).number(); @@ -1488,7 +1508,7 @@ namespace mongo { virtual void help( stringstream &help ) const { help << "{ convertToCapped:<fromCollectionName>, size:<sizeInBytes> }"; } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { BackgroundOperation::assertNoBgOpInProgForDb(dbname.c_str()); string from = jsobj.getStringField( "convertToCapped" ); @@ -1544,7 +1564,7 @@ namespace mongo { virtual void help( stringstream &help ) const { help << "{whatsmyuri:1}"; } - virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { BSONObj info = cc().curop()->infoNoauth(); result << "you" << info[ "client" ]; return true; @@ -1559,7 +1579,7 @@ namespace mongo { return true; } virtual bool slaveOk() const { - return false; + return true; } virtual LockType locktype() const { return WRITE; } virtual bool requiresAuth() { @@ -1568,7 +1588,7 @@ namespace mongo { virtual void help( stringstream &help ) const { help << "internal. 
for testing only."; } - virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { string coll = cmdObj[ "godinsert" ].valuestrsafe(); uassert( 13049, "godinsert must specify a collection", !coll.empty() ); string ns = dbname + "." + coll; @@ -1583,7 +1603,7 @@ namespace mongo { DBHashCmd() : Command( "dbHash", false, "dbhash" ) {} virtual bool slaveOk() const { return true; } virtual LockType locktype() const { return READ; } - virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { list<string> colls; Database* db = cc().database(); if ( db ) @@ -1629,9 +1649,8 @@ namespace mongo { cursor = findTableScan( c.c_str() , BSONObj() ); } else { - bb.done(); - errmsg = (string)"can't find _id index for: " + c; - return 0; + log() << "can't find _id index for: " << c << endl; + continue; } md5_state_t st; @@ -1677,7 +1696,7 @@ namespace mongo { help << "w:true write lock. secs:<seconds>"; } CmdSleep() : Command("sleep") { } - bool run(const string& ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { int secs = 100; if ( cmdObj["secs"].isNumber() ) secs = cmdObj["secs"].numberInt(); @@ -1700,7 +1719,7 @@ namespace mongo { virtual bool slaveOk() const { return false; } virtual LockType locktype() const { return WRITE; } virtual bool requiresAuth() { return true; } - virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { string coll = cmdObj[ "captrunc" ].valuestrsafe(); uassert( 13416, "captrunc must specify a collection", !coll.empty() ); string ns = dbname + "." + coll; @@ -1727,7 +1746,7 @@ namespace mongo { virtual bool slaveOk() const { return false; } virtual LockType locktype() const { return WRITE; } virtual bool requiresAuth() { return true; } - virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { string coll = cmdObj[ "emptycapped" ].valuestrsafe(); uassert( 13428, "emptycapped must specify a collection", !coll.empty() ); string ns = dbname + "." + coll; @@ -1792,13 +1811,22 @@ namespace mongo { if ( c->adminOnly() ) log( 2 ) << "command: " << cmdObj << endl; + if (c->maintenanceMode() && theReplSet && theReplSet->isSecondary()) { + theReplSet->setMaintenanceMode(true); + } + if ( c->locktype() == Command::NONE ) { // we also trust that this won't crash client.curop()->ensureStarted(); string errmsg; - int ok = c->run( dbname , cmdObj , errmsg , result , fromRepl ); + int ok = c->run( dbname , cmdObj , queryOptions, errmsg , result , fromRepl ); if ( ! ok ) result.append( "errmsg" , errmsg ); + + if (c->maintenanceMode() && theReplSet) { + theReplSet->setMaintenanceMode(false); + } + return ok; } @@ -1812,11 +1840,13 @@ namespace mongo { client.curop()->ensureStarted(); Client::Context ctx( dbname , dbpath , &lk , c->requiresAuth() ); + bool retval = true; + try { string errmsg; - if ( ! 
c->run(dbname, cmdObj, errmsg, result, fromRepl ) ) { + if ( ! c->run(dbname, cmdObj, queryOptions, errmsg, result, fromRepl ) ) { result.append( "errmsg" , errmsg ); - return false; + retval = false; } } catch ( DBException& e ) { @@ -1824,14 +1854,18 @@ namespace mongo { ss << "exception: " << e.what(); result.append( "errmsg" , ss.str() ); result.append( "code" , e.getCode() ); - return false; + retval = false; } - if ( c->logTheOp() && ! fromRepl ) { + if ( retval && c->logTheOp() && ! fromRepl ) { logOp("c", cmdns, cmdObj); } - return true; + if (c->maintenanceMode() && theReplSet) { + theReplSet->setMaintenanceMode(false); + } + + return retval; } diff --git a/db/dbcommands_admin.cpp b/db/dbcommands_admin.cpp index 47f6c691ab4..566027fc594 100644 --- a/db/dbcommands_admin.cpp +++ b/db/dbcommands_admin.cpp @@ -47,7 +47,7 @@ namespace mongo { virtual void help(stringstream& h) const { h << "internal"; } - bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { string dropns = dbname + "." + cmdObj.firstElement().valuestrsafe(); if ( !cmdLine.quiet ) @@ -82,7 +82,7 @@ namespace mongo { virtual bool adminOnly() const { return true; } virtual void help(stringstream& h) const { h << "test how long to write and fsync to a test file in the journal/ directory"; } - bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { filesystem::path p = dur::getJournalDir(); p /= "journalLatencyTest"; @@ -157,7 +157,7 @@ namespace mongo { virtual LockType locktype() const { return READ; } //{ validate: "collectionnamewithoutthedbpart" [, scandata: <bool>] [, full: <bool> } */ - bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { string ns = dbname + "." 
+ cmdObj.firstElement().valuestrsafe(); NamespaceDetails * d = nsdetails( ns.c_str() ); if ( !cmdLine.quiet ) @@ -473,7 +473,7 @@ namespace mongo { return !x.empty(); }*/ virtual void help(stringstream& h) const { h << url(); } - virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { bool sync = !cmdObj["async"].trueValue(); // async means do an fsync, but return immediately bool lock = cmdObj["lock"].trueValue(); log() << "CMD fsync: sync:" << sync << " lock:" << lock << endl; diff --git a/db/dbcommands_generic.cpp b/db/dbcommands_generic.cpp index 2e025b500ea..a9e13eab741 100644 --- a/db/dbcommands_generic.cpp +++ b/db/dbcommands_generic.cpp @@ -79,7 +79,7 @@ namespace mongo { virtual void help( stringstream &help ) const { help << "internal command facilitating running in certain cloud computing environments"; } - bool run(const string& dbname, BSONObj& obj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + bool run(const string& dbname, BSONObj& obj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { if( !obj.hasElement("servers") ) { vector<string> ips; obj["servers"].Obj().Vals(ips); @@ -106,7 +106,7 @@ namespace mongo { help << "get version #, etc.\n"; help << "{ buildinfo:1 }"; } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { result << "version" << versionString << "gitVersion" << gitVersion() << "sysInfo" << sysInfo(); result << "versionArray" << versionArray; result << "bits" << ( sizeof( int* ) == 4 ? 32 : 64 ); @@ -137,7 +137,7 @@ namespace mongo { help << " syncdelay\n"; help << "{ getParameter:'*' } to get everything\n"; } - bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { bool all = *cmdObj.firstElement().valuestrsafe() == '*'; int before = result.len(); @@ -166,11 +166,6 @@ namespace mongo { } } cmdGet; - // dev - experimental. so only in set command for now. 
may go away or change - namespace dur { - int groupCommitIntervalMs = 100; - } - // tempish bool setParmsMongodSpecific(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ); @@ -184,23 +179,24 @@ namespace mongo { help << "set administrative option(s)\n"; help << "{ setParameter:1, <param>:<value> }\n"; help << "supported so far:\n"; - help << " notablescan\n"; + help << " journalCommitInterval\n"; help << " logLevel\n"; + help << " notablescan\n"; help << " quiet\n"; help << " syncdelay\n"; } - bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { int s = 0; - setParmsMongodSpecific(dbname, cmdObj, errmsg, result, fromRepl); - if( cmdObj.hasElement("groupCommitIntervalMs") ) { + bool found = setParmsMongodSpecific(dbname, cmdObj, errmsg, result, fromRepl); + if( cmdObj.hasElement("journalCommitInterval") ) { if( !cmdLine.dur ) { errmsg = "journaling is off"; return false; } - int x = (int) cmdObj["groupCommitIntervalMs"].Number(); - assert( x > 0 && x < 500 ); - dur::groupCommitIntervalMs = x; - log() << "groupCommitIntervalMs " << x << endl; + int x = (int) cmdObj["journalCommitInterval"].Number(); + assert( x > 1 && x < 500 ); + cmdLine.journalCommitInterval = x; + log() << "setParameter journalCommitInterval=" << x << endl; s++; } if( cmdObj.hasElement("notablescan") ) { @@ -241,7 +237,7 @@ namespace mongo { s++; } - if( s == 0 ) { + if( s == 0 && !found ) { errmsg = "no option found to set, use help:true to see options "; return false; } @@ -257,7 +253,7 @@ namespace mongo { virtual void help( stringstream &help ) const { help << "a way to check that the server is alive. 
responds immediately even if server is in a db lock."; } virtual LockType locktype() const { return NONE; } virtual bool requiresAuth() { return false; } - virtual bool run(const string& badns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + virtual bool run(const string& badns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { // IMPORTANT: Don't put anything in here that might lock db - including authentication return true; } @@ -270,7 +266,7 @@ namespace mongo { virtual bool slaveOk() const { return true; } virtual bool readOnly() { return true; } virtual LockType locktype() const { return NONE; } - virtual bool run(const string& ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + virtual bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { if ( globalScriptEngine ) { BSONObjBuilder bb( result.subobjStart( "js" ) ); result.append( "utf8" , globalScriptEngine->utf8Ok() ); @@ -292,7 +288,7 @@ namespace mongo { virtual LockType locktype() const { return NONE; } virtual bool slaveOk() const { return true; } virtual bool adminOnly() const { return true; } - virtual bool run(const string& ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + virtual bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { rotateLogs(); return 1; } @@ -306,7 +302,7 @@ namespace mongo { virtual LockType locktype() const { return NONE; } virtual bool slaveOk() const { return true; } virtual bool adminOnly() const { return false; } - virtual bool run(const string& ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + virtual bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { BSONObjBuilder b( result.subobjStart( "commands" ) ); for ( map<string,Command*>::iterator i=_commands->begin(); i!=_commands->end(); ++i ) { Command * c = i->second; @@ -361,7 +357,7 @@ namespace mongo { } virtual LockType locktype() const { return NONE; } CmdForceError() : Command("forceerror") {} - bool run(const string& dbnamne, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool run(const string& dbnamne, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { uassert( 10038 , "forced error", false); return true; } @@ -373,7 +369,7 @@ namespace mongo { virtual bool slaveOk() const { return true; } virtual LockType locktype() const { return NONE; } virtual bool requiresAuth() { return false; } - virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { result << "options" << QueryOption_AllSupported; return true; } @@ -393,7 +389,7 @@ namespace mongo { help << "{ getLog : '*' } OR { getLog : 'global' }"; } - virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { string p = cmdObj.firstElement().String(); if ( p == "*" ) { vector<string> names; diff --git a/db/dbeval.cpp b/db/dbeval.cpp index 3a53200a49f..5fe137fc3a3 100644 --- a/db/dbeval.cpp +++ b/db/dbeval.cpp @@ -121,7 +121,7 @@ namespace mongo { } virtual LockType locktype() const { return NONE; } CmdEval() : Command("eval", false, "$eval") { } - 
bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { AuthenticationInfo *ai = cc().getAuthenticationInfo(); uassert( 12598 , "$eval reads unauthorized", ai->isAuthorizedReads(dbname.c_str()) ); diff --git a/db/dbmessage.h b/db/dbmessage.h index a14d4cf5142..a789bff849c 100644 --- a/db/dbmessage.h +++ b/db/dbmessage.h @@ -122,7 +122,7 @@ namespace mongo { /** the 32 bit field before the ns * track all bit usage here as its cross op - * 0: InsertOption_KeepGoing + * 0: InsertOption_ContinueOnError * 1: fromWriteback */ int& reservedField() { return *reserved; } @@ -233,7 +233,7 @@ namespace mongo { public: enum ReservedOptions { - Reserved_InsertOption_KeepGoing = 1 << 0 , + Reserved_InsertOption_ContinueOnError = 1 << 0 , Reserved_FromWriteback = 1 << 1 }; }; diff --git a/db/dbwebserver.cpp b/db/dbwebserver.cpp index 40950a8ccb3..50a59fa1267 100644 --- a/db/dbwebserver.cpp +++ b/db/dbwebserver.cpp @@ -61,7 +61,7 @@ namespace mongo { class DbWebServer : public MiniWebServer { public: DbWebServer(const string& ip, int port, const AdminAccess* webUsers) - : MiniWebServer(ip, port), _webUsers(webUsers) { + : MiniWebServer("admin web console", ip, port), _webUsers(webUsers) { WebStatusPlugin::initAll(); } @@ -424,7 +424,7 @@ namespace mongo { string errmsg; BSONObjBuilder sub; - if ( ! c->run( "admin.$cmd" , co , errmsg , sub , false ) ) + if ( ! c->run( "admin.$cmd" , co , 0, errmsg , sub , false ) ) buf.append( cmd , errmsg ); else buf.append( cmd , sub.obj() ); @@ -531,7 +531,6 @@ namespace mongo { Client::initThread("websvr"); const int p = cmdLine.port + 1000; DbWebServer mini(cmdLine.bind_ip, p, adminAccessPtr.get()); - log() << "web admin interface listening on port " << p << endl; mini.initAndListen(); cc().shutdown(); } diff --git a/db/driverHelpers.cpp b/db/driverHelpers.cpp index d98a33b25c5..12aa01886c4 100644 --- a/db/driverHelpers.cpp +++ b/db/driverHelpers.cpp @@ -46,7 +46,7 @@ namespace mongo { class ObjectIdTest : public BasicDriverHelper { public: ObjectIdTest() : BasicDriverHelper( "driverOIDTest" ) {} - virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { if ( cmdObj.firstElement().type() != jstOID ) { errmsg = "not oid"; return false; diff --git a/db/dur.cpp b/db/dur.cpp index 6cb69ac5ac2..dfa36f95224 100644 --- a/db/dur.cpp +++ b/db/dur.cpp @@ -62,11 +62,11 @@ #include "dur_journal.h" #include "dur_commitjob.h" #include "dur_recover.h" +#include "dur_stats.h" #include "../util/concurrency/race.h" #include "../util/mongoutils/hash.h" #include "../util/mongoutils/str.h" #include "../util/timer.h" -#include "dur_stats.h" using namespace mongoutils; @@ -74,8 +74,9 @@ namespace mongo { namespace dur { - void WRITETODATAFILES(); - void PREPLOGBUFFER(); + void PREPLOGBUFFER(JSectHeader& outParm); + void WRITETOJOURNAL(JSectHeader h, AlignedBuilder& uncompressed); + void WRITETODATAFILES(const JSectHeader& h, AlignedBuilder& uncompressed); /** declared later in this file only used in this file -- use DurableInterface::commitNow() outside @@ -129,6 +130,7 @@ namespace mongo { "commits" << _commits << "journaledMB" << _journaledBytes / 1000000.0 << "writeToDataFilesMB" << _writeToDataFilesBytes / 1000000.0 << + "compression" << _journaledBytes / 
(_uncompressedBytes+1.0) << "commitsInWriteLock" << _commitsInWriteLock << "earlyCommits" << _earlyCommits << "timeMs" << @@ -143,6 +145,8 @@ namespace mongo { b << "ageOutJournalFiles" << "mutex timeout"; if( r == 0 ) b << "ageOutJournalFiles" << false; + if( cmdLine.journalCommitInterval != 0 ) + b << "journalCommitIntervalMs" << cmdLine.journalCommitInterval; return b.obj(); } @@ -269,6 +273,9 @@ namespace mongo { } bool DurableImpl::commitIfNeeded() { + if ( ! dbMutex.isWriteLocked() ) // we implicitly commit if needed when releasing write lock + return false; + DEV commitJob._nSinceCommitIfNeededCall = 0; if (commitJob.bytes() > UncommittedBytesLimit) { // should this also fire if CmdLine::DurAlwaysCommit? stats.curr->_earlyCommits++; @@ -325,15 +332,6 @@ namespace mongo { } #endif - /** write the buffer we have built to the journal and fsync it. - outside of lock as that could be slow. - */ - static void WRITETOJOURNAL(AlignedBuilder& ab) { - Timer t; - journal(ab); - stats.curr->_writeToJournalMicros += t.micros(); - } - // Functor to be called over all MongoFiles class validateSingleMapMatches { @@ -486,6 +484,7 @@ namespace mongo { stats.curr->_remapPrivateViewMicros += t.micros(); } + // lock order: dbMutex first, then this mutex groupCommitMutex("groupCommit"); bool _groupCommitWithLimitedLocks() { @@ -502,8 +501,8 @@ namespace mongo { commitJob.notifyCommitted(); return true; } - - PREPLOGBUFFER(); + JSectHeader h; + PREPLOGBUFFER(h); RWLockRecursive::Shared lk3(MongoFile::mmmutex); @@ -515,16 +514,15 @@ namespace mongo { lk1.reset(); // ****** now other threads can do writes ****** - - WRITETOJOURNAL(commitJob._ab); + WRITETOJOURNAL(h, commitJob._ab); assert( abLen == commitJob._ab.len() ); // a check that no one touched the builder while we were doing work. if so, our locking is wrong. // data is now in the journal, which is sufficient for acknowledging getLastError. // (ok to crash after that) commitJob.notifyCommitted(); - WRITETODATAFILES(); - assert( abLen == commitJob._ab.len() ); // WRITETODATAFILES uses _ab also + WRITETODATAFILES(h, commitJob._ab); + assert( abLen == commitJob._ab.len() ); // check again wasn't modded commitJob._ab.reset(); // can't : dbMutex._remapPrivateViewRequested = true; @@ -570,18 +568,19 @@ namespace mongo { // (and we are only read locked in the dbMutex, so it could happen) scoped_lock lk(groupCommitMutex); - PREPLOGBUFFER(); + JSectHeader h; + PREPLOGBUFFER(h); // todo : write to the journal outside locks, as this write can be slow. // however, be careful then about remapprivateview as that cannot be done // if new writes are then pending in the private maps. - WRITETOJOURNAL(commitJob._ab); + WRITETOJOURNAL(h, commitJob._ab); // data is now in the journal, which is sufficient for acknowledging getLastError. // (ok to crash after that) commitJob.notifyCommitted(); - WRITETODATAFILES(); + WRITETODATAFILES(h, commitJob._ab); debugValidateAllMapsMatch(); commitJob.reset(); @@ -613,6 +612,7 @@ namespace mongo { } /** locking: in read lock when called + or, for early commits (commitIfNeeded), in write lock @see MongoMMF::close() */ static void groupCommit() { @@ -686,29 +686,41 @@ namespace mongo { } extern int groupCommitIntervalMs; + filesystem::path getJournalDir(); void durThread() { Client::initThread("journal"); + + bool samePartition = true; + try { + const string dbpathDir = boost::filesystem::path(dbpath).native_directory_string(); + samePartition = onSamePartition(getJournalDir().string(), dbpathDir); + } + catch(...) 
{ + } + while( !inShutdown() ) { RACECHECK + + unsigned ms = cmdLine.journalCommitInterval; + if( ms == 0 ) { + // use default + ms = samePartition ? 100 : 30; + } + + unsigned oneThird = (ms / 3) + 1; // +1 so never zero + try { - int millis = groupCommitIntervalMs; - { - stats.rotate(); - { - Timer t; - journalRotate(); // note we do this part outside of mongomutex - millis -= t.millis(); - wassert( millis <= groupCommitIntervalMs ); // race if groupCommitIntervalMs was changing by another thread so wassert - if( millis < 2 ) - millis = 2; - } + stats.rotate(); - // we do this in a couple blocks, which makes it a tiny bit faster (only a little) on throughput, - // but is likely also less spiky on our cpu usage, which is good: - sleepmillis(millis/2); - commitJob.wi()._deferred.invoke(); - sleepmillis(millis/2); + // we do this in a couple blocks (the invoke()), which makes it a tiny bit faster (only a little) on throughput, + // but is likely also less spiky on our cpu usage, which is good. + + // commit sooner if one or more getLastError j:true is pending + for( unsigned i = 1; i <= 2; i++ ) { + sleepmillis(oneThird); + if( commitJob._notify.nWaiting() ) + break; commitJob.wi()._deferred.invoke(); } @@ -772,6 +784,13 @@ namespace mongo { void DurableImpl::syncDataAndTruncateJournal() { dbMutex.assertWriteLocked(); + // a commit from the commit thread won't begin while we are in the write lock, + // but it may already be in progress and the end of that work is done outside + // (dbMutex) locks. This line waits for that to complete if already underway. + { + scoped_lock lk(groupCommitMutex); + } + groupCommit(); MongoFile::flushAll(true); journalCleanup(); diff --git a/db/dur_journal.cpp b/db/dur_journal.cpp index f85dda32b51..0a1bc5ebbad 100644 --- a/db/dur_journal.cpp +++ b/db/dur_journal.cpp @@ -34,6 +34,7 @@ #include "../util/file.h" #include "../util/checksum.h" #include "../util/concurrency/race.h" +#include "../util/compress.h" using namespace mongoutils; @@ -92,6 +93,11 @@ namespace mongo { assert(false); } + JSectFooter::JSectFooter() { + memset(this, 0, sizeof(*this)); + sentinel = JEntry::OpCode_Footer; + } + JSectFooter::JSectFooter(const void* begin, int len) { // needs buffer to compute hash sentinel = JEntry::OpCode_Footer; reserved = 0; @@ -103,6 +109,10 @@ namespace mongo { } bool JSectFooter::checkHash(const void* begin, int len) const { + if( !magicOk() ) { + log() << "journal footer not valid" << endl; + return false; + } Checksum c; c.gen(begin, len); DEV log() << "checkHash len:" << len << " hash:" << toHex(hash, 16) << " current:" << toHex(c.bytes, 16) << endl; @@ -317,13 +327,13 @@ namespace mongo { void preallocateFiles() { if( exists(getJournalDir()/"prealloc.0") || // if enabled previously, keep using - exists(getJournalDir()/"prealloc.1") || - preallocateIsFaster() ) { + exists(getJournalDir()/"prealloc.1") || + ( cmdLine.preallocj && preallocateIsFaster() ) ) { usingPreallocate = true; try { _preallocateFiles(); } - catch(...) { + catch(...) 
{ log() << "warning caught exception in preallocateFiles, continuing" << endl; } } @@ -343,10 +353,12 @@ namespace mongo { { // zero the header File f; - f.open(temppath.string().c_str(), false, true); + f.open(temppath.string().c_str(), false, false); char buf[8192]; memset(buf, 0, 8192); f.write(0, buf, 8192); + f.truncate(DataLimitPerJournalFile); + f.fsync(); } boost::filesystem::rename(temppath, filepath); return; @@ -471,12 +483,6 @@ namespace mongo { /** called during recovery (the error message text below assumes that) */ unsigned long long journalReadLSN() { - if( !debug ) { - // in nondebug build, for now, be conservative until more tests written, and apply the whole journal. - // however we will still write the lsn file to exercise that code, and use in _DEBUG build. - return 0; - } - if( !MemoryMappedFile::exists(lsnPath()) ) { log() << "info no lsn file in journal/ directory" << endl; return 0; @@ -595,15 +601,7 @@ namespace mongo { j._ageOut = a; } - /** check if time to rotate files. assure a file is open. - done separately from the journal() call as we can do this part - outside of lock. - thread: durThread() - */ - void journalRotate() { - j.rotate(); - } - void Journal::rotate() { + void Journal::_rotate() { assert( !dbMutex.atLeastReadLocked() ); RACECHECK @@ -618,6 +616,7 @@ namespace mongo { return; if( _curLogFile ) { + _curLogFile->truncate(); closeCurrentJournalFile(); removeUnneededJournalFiles(); } @@ -636,24 +635,74 @@ namespace mongo { } } - /** write to journal + /** write (append) the buffer we have built to the journal and fsync it. + outside of dbMutex lock as this could be slow. + @param uncompressed - a buffer that will be written to the journal after compression + will not return until on disk */ - void journal(const AlignedBuilder& b) { - j.journal(b); + void WRITETOJOURNAL(JSectHeader h, AlignedBuilder& uncompressed) { + Timer t; + j.journal(h, uncompressed); + stats.curr->_writeToJournalMicros += t.micros(); } - void Journal::journal(const AlignedBuilder& b) { + void Journal::journal(const JSectHeader& h, const AlignedBuilder& uncompressed) { + RACECHECK + static AlignedBuilder b(32*1024*1024); + /* buffer to journal will be + JSectHeader + compressed operations + JSectFooter + */ + const unsigned headTailSize = sizeof(JSectHeader) + sizeof(JSectFooter); + const unsigned max = maxCompressedLength(uncompressed.len()) + headTailSize; + b.reset(max); + + { + dassert( h.sectionLen() == (unsigned) 0xffffffff ); // we will backfill later + b.appendStruct(h); + } + + size_t compressedLength = 0; + rawCompress(uncompressed.buf(), uncompressed.len(), b.cur(), &compressedLength); + assert( compressedLength < 0xffffffff ); + assert( compressedLength < max ); + b.skip(compressedLength); + + // footer + unsigned L = 0xffffffff; + { + // pad to alignment, and set the total section length in the JSectHeader + assert( 0xffffe000 == (~(Alignment-1)) ); + unsigned lenUnpadded = b.len() + sizeof(JSectFooter); + L = (lenUnpadded + Alignment-1) & (~(Alignment-1)); + dassert( L >= lenUnpadded ); + + ((JSectHeader*)b.atOfs(0))->setSectionLen(lenUnpadded); + + JSectFooter f(b.buf(), b.len()); // computes checksum + b.appendStruct(f); + dassert( b.len() == lenUnpadded ); + + b.skip(L - lenUnpadded); + dassert( b.len() % Alignment == 0 ); + } + try { mutex::scoped_lock lk(_curLogFileMutex); // must already be open -- so that _curFileId is correct for previous buffer building assert( _curLogFile ); - stats.curr->_journaledBytes += b.len(); - _written += b.len(); - 
_curLogFile->synchronousAppend((void *) b.buf(), b.len()); + stats.curr->_uncompressedBytes += b.len(); + unsigned w = b.len(); + _written += w; + assert( w <= L ); + stats.curr->_journaledBytes += L; + _curLogFile->synchronousAppend((const void *) b.buf(), L); + _rotate(); } catch(std::exception& e) { - log() << "warning exception in dur::journal " << e.what() << endl; + log() << "error exception in dur::journal " << e.what() << endl; throw; } } diff --git a/db/dur_journal.h b/db/dur_journal.h index e8e3dfd1465..664f63942e0 100644 --- a/db/dur_journal.h +++ b/db/dur_journal.h @@ -28,7 +28,8 @@ namespace mongo { extern bool okToCleanUp; /** at termination after db files closed & fsynced - also after covery + also after recovery + closes and removes journal files @param log report in log that we are cleaning up if we actually do any work */ void journalCleanup(bool log = false); @@ -43,12 +44,6 @@ namespace mongo { */ void journalRotate(); - /** write/append to journal file * - @param buf - a buffer that will be written to the journal. - will not return until on disk - */ - void journal(const AlignedBuilder& buf); - /** flag that something has gone wrong during writing to the journal (not for recovery mode) */ @@ -67,5 +62,7 @@ namespace mongo { // in case disk controller buffers writes const long long ExtraKeepTimeMs = 10000; + const unsigned JournalCommitIntervalDefault = 100; + } } diff --git a/db/dur_journalformat.h b/db/dur_journalformat.h index 72587ccd7b6..10ed8487b71 100644 --- a/db/dur_journalformat.h +++ b/db/dur_journalformat.h @@ -22,6 +22,8 @@ namespace mongo { namespace dur { + const unsigned Alignment = 8192; + #pragma pack(1) /** beginning header for a journal/j._<n> file there is nothing important int this header at this time. except perhaps version #. @@ -34,7 +36,11 @@ namespace mongo { // x4142 is asci--readable if you look at the file with head/less -- thus the starting values were near // that. simply incrementing the version # is safe on a fwd basis. +#if defined(_NOCOMPRESS) enum { CurrentVersion = 0x4148 }; +#else + enum { CurrentVersion = 0x4149 }; +#endif unsigned short _version; // these are just for diagnostic ease (make header more useful as plain text) @@ -55,11 +61,25 @@ namespace mongo { /** "Section" header. A section corresponds to a group commit. len is length of the entire section including header and footer. + header and footer are not compressed, just the stuff in between. */ struct JSectHeader { - unsigned len; // length in bytes of the whole section + private: + unsigned _sectionLen; // unpadded length in bytes of the whole section + public: unsigned long long seqNumber; // sequence number that can be used on recovery to not do too much work unsigned long long fileId; // matches JHeader::fileId + unsigned sectionLen() const { return _sectionLen; } + + // we store the unpadded length so we can use that when we uncompress. to + // get the true total size this must be rounded up to the Alignment. + void setSectionLen(unsigned lenUnpadded) { _sectionLen = lenUnpadded; } + + unsigned sectionLenWithPadding() const { + unsigned x = (sectionLen() + (Alignment-1)) & (~(Alignment-1)); + dassert( x % Alignment == 0 ); + return x; + } }; /** an individual write operation within a group commit section. Either the entire section should @@ -111,6 +131,7 @@ namespace mongo { /** group commit section footer. md5 is a key field. 
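       [editor's sketch, not part of the patch] with compression enabled
       (CurrentVersion 0x4149 above), a section on disk is laid out as
           JSectHeader | snappy-compressed JEntry/DurOp data | JSectFooter | pad to Alignment
       and the writer computes, for a given compressedLen,
           unsigned lenUnpadded = sizeof(JSectHeader) + compressedLen + sizeof(JSectFooter);
           unsigned onDisk      = ( lenUnpadded + Alignment - 1 ) & ( ~(Alignment-1) );   // multiple of 8KB
       setSectionLen() records the unpadded value, so recovery can locate the
       footer and the exact compressed extent before uncompressing.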
*/ struct JSectFooter { + JSectFooter(); JSectFooter(const void* begin, int len); // needs buffer to compute hash unsigned sentinel; unsigned char hash[16]; @@ -123,6 +144,8 @@ namespace mongo { @return true if buffer looks valid */ bool checkHash(const void* begin, int len) const; + + bool magicOk() const { return *((unsigned*)magic) == 0x0a0a0a0a; } }; /** declares "the next entry(s) are for this database / file path prefix" */ diff --git a/db/dur_journalimpl.h b/db/dur_journalimpl.h index e436eae45f1..bf771c5d768 100644 --- a/db/dur_journalimpl.h +++ b/db/dur_journalimpl.h @@ -18,6 +18,7 @@ #pragma once +#include "dur_journalformat.h" #include "../util/logfile.h" namespace mongo { @@ -40,14 +41,14 @@ namespace mongo { */ void rotate(); - /** write to journal + /** append to the journal file */ - void journal(const AlignedBuilder& b); + void journal(const JSectHeader& h, const AlignedBuilder& b); boost::filesystem::path getFilePathFor(int filenumber) const; unsigned long long lastFlushTime() const { return _lastFlushTime; } - void cleanup(bool log); + void cleanup(bool log); // closes and removes journal files unsigned long long curFileId() const { return _curFileId; } @@ -61,6 +62,11 @@ namespace mongo { void open(); private: + /** check if time to rotate files. assure a file is open. + * internally called with every commit + */ + void _rotate(); + void _open(); void closeCurrentJournalFile(); void removeUnneededJournalFiles(); diff --git a/db/dur_preplogbuffer.cpp b/db/dur_preplogbuffer.cpp index 5851e415408..0d8ef3688db 100644 --- a/db/dur_preplogbuffer.cpp +++ b/db/dur_preplogbuffer.cpp @@ -60,7 +60,7 @@ namespace mongo { size_t ofs = 1; MongoMMF *mmf = findMMF_inlock(i->start(), /*out*/ofs); - _IF( !mmf->willNeedRemap() ) { + if( unlikely(!mmf->willNeedRemap()) ) { // tag this mmf as needed a remap of its private view later. 
// usually it will already be dirty/already set, so we do the if above first // to avoid possibility of cpu cache line contention @@ -97,7 +97,7 @@ namespace mongo { #endif bb.appendBuf(i->start(), e.len); - _IF (e.len != (unsigned)i->length()) { + if (unlikely(e.len != (unsigned)i->length())) { log() << "journal info splitting prepBasicWrite at boundary" << endl; // This only happens if we write to the last byte in a file and @@ -120,40 +120,25 @@ namespace mongo { // each time events switch to a different database we journal a JDbContext RelativePath lastDbPath; - set<WriteIntent>::iterator i = commitJob.writes().begin(); - - const WriteIntent *w = &(*i); - while(1) { - i++; - const WriteIntent *next = 0; - IF( i != commitJob.writes().end() ) { - next = &(*i); - PREFETCH(next); - } - prepBasicWrite_inlock(bb, w, lastDbPath); - _IF( next == 0 ) - break; - w = next; - }; + for( set<WriteIntent>::iterator i = commitJob.writes().begin(); i != commitJob.writes().end(); i++ ) { + prepBasicWrite_inlock(bb, &(*i), lastDbPath); + } } - void resetLogBuffer(AlignedBuilder& bb) { + void resetLogBuffer(/*out*/JSectHeader& h, AlignedBuilder& bb) { bb.reset(); - // JSectHeader - JSectHeader h; - h.len = (unsigned) 0xffffffff; // total length, will fill in later + h.setSectionLen(0xffffffff); // total length, will fill in later h.seqNumber = getLastDataFileFlushTime(); h.fileId = j.curFileId(); - - bb.appendStruct(h); } /** we will build an output buffer ourself and then use O_DIRECT we could be in read lock for this caller handles locking + @return partially populated sectheader and _ab set */ - void _PREPLOGBUFFER() { + void _PREPLOGBUFFER(JSectHeader& h) { assert( cmdLine.dur ); { @@ -165,7 +150,7 @@ namespace mongo { } AlignedBuilder& bb = commitJob._ab; - resetLogBuffer(bb); + resetLogBuffer(h, bb); // adds JSectHeader // ops other than basic writes (DurOp's) { @@ -174,34 +159,14 @@ namespace mongo { } } - { - prepBasicWrites(bb); - } - - // pad to alignment, and set the total section length in the JSectHeader - assert( 0xffffe000 == (~(Alignment-1)) ); - unsigned lenWillBe = bb.len() + sizeof(JSectFooter); - unsigned L = (lenWillBe + Alignment-1) & (~(Alignment-1)); - dassert( L >= lenWillBe ); - *((unsigned*)bb.atOfs(0)) = L; - - { - JSectFooter f(bb.buf(), bb.len()); - bb.appendStruct(f); - } - - { - unsigned padding = L - bb.len(); - bb.skip(padding); - dassert( bb.len() % Alignment == 0 ); - } + prepBasicWrites(bb); return; } - void PREPLOGBUFFER() { + void PREPLOGBUFFER(/*out*/ JSectHeader& h) { Timer t; j.assureLogFileOpen(); // so fileId is set - _PREPLOGBUFFER(); + _PREPLOGBUFFER(h); stats.curr->_prepLogBufferMicros += t.micros(); } diff --git a/db/dur_recover.cpp b/db/dur_recover.cpp index 2e1516914f1..1e719c0070d 100644 --- a/db/dur_recover.cpp +++ b/db/dur_recover.cpp @@ -27,6 +27,7 @@ #include "namespace.h" #include "../util/mongoutils/str.h" #include "../util/bufreader.h" +#include "../util/concurrency/race.h" #include "pdfile.h" #include "database.h" #include "db.h" @@ -35,6 +36,7 @@ #include "cmdline.h" #include "curop.h" #include "mongommf.h" +#include "../util/compress.h" #include <sys/stat.h> #include <fcntl.h> @@ -92,59 +94,73 @@ namespace mongo { throws */ class JournalSectionIterator : boost::noncopyable { + auto_ptr<BufReader> _entries; + const JSectHeader _h; + const char *_lastDbName; // pointer into mmaped journal file + const bool _doDurOps; + string _uncompressed; public: - JournalSectionIterator(const void *p, unsigned len, bool doDurOps) - : _br(p, len) - , 
_sectHead(static_cast<const JSectHeader*>(_br.skip(sizeof(JSectHeader)))) - , _lastDbName(NULL) - , _doDurOps(doDurOps) - {} + JournalSectionIterator(const JSectHeader& h, const void *compressed, unsigned compressedLen, bool doDurOpsRecovering) : + _h(h), + _lastDbName(0) + , _doDurOps(doDurOpsRecovering) + { + assert( doDurOpsRecovering ); + bool ok = uncompress((const char *)compressed, compressedLen, &_uncompressed); + if( !ok ) { + // it should always be ok (i think?) as there is a previous check to see that the JSectFooter is ok + log() << "couldn't uncompress journal section" << endl; + msgasserted(15874, "couldn't uncompress journal section"); + } + const char *p = _uncompressed.c_str(); + assert( compressedLen == _h.sectionLen() - sizeof(JSectFooter) - sizeof(JSectHeader) ); + _entries = auto_ptr<BufReader>( new BufReader(p, _uncompressed.size()) ); + } + + // we work with the uncompressed buffer when doing a WRITETODATAFILES (for speed) + JournalSectionIterator(const JSectHeader &h, const void *p, unsigned len) : + _entries( new BufReader((const char *) p, len) ), + _h(h), + _lastDbName(0) + , _doDurOps(false) - bool atEof() const { return _br.atEof(); } + { } - unsigned long long seqNumber() const { return _sectHead->seqNumber; } + bool atEof() const { return _entries->atEof(); } + + unsigned long long seqNumber() const { return _h.seqNumber; } /** get the next entry from the log. this function parses and combines JDbContext and JEntry's. - * @return true if got an entry. false at successful end of section (and no entry returned). * throws on premature end of section. */ - bool next(ParsedJournalEntry& e) { + void next(ParsedJournalEntry& e) { unsigned lenOrOpCode; - _br.read(lenOrOpCode); + _entries->read(lenOrOpCode); if (lenOrOpCode > JEntry::OpCode_Min) { switch( lenOrOpCode ) { case JEntry::OpCode_Footer: { - if (_doDurOps) { - const char* pos = (const char*) _br.pos(); - pos -= sizeof(lenOrOpCode); // rewind to include OpCode - const JSectFooter& footer = *(const JSectFooter*)pos; - int len = pos - (char*)_sectHead; - if (!footer.checkHash(_sectHead, len)) { - massert(13594, "journal checksum doesn't match", false); - } - } - return false; // false return value denotes end of section + assert( false ); } case JEntry::OpCode_FileCreated: case JEntry::OpCode_DropDb: { e.dbName = 0; - boost::shared_ptr<DurOp> op = DurOp::read(lenOrOpCode, _br); + boost::shared_ptr<DurOp> op = DurOp::read(lenOrOpCode, *_entries); if (_doDurOps) { e.op = op; } - return true; + return; } case JEntry::OpCode_DbContext: { - _lastDbName = (const char*) _br.pos(); - const unsigned limit = std::min((unsigned)Namespace::MaxNsLen, _br.remaining()); + _lastDbName = (const char*) _entries->pos(); + const unsigned limit = std::min((unsigned)Namespace::MaxNsLen, _entries->remaining()); const unsigned len = strnlen(_lastDbName, limit); massert(13533, "problem processing journal file during recovery", _lastDbName[len] == '\0'); - _br.skip(len+1); // skip '\0' too - _br.read(lenOrOpCode); + _entries->skip(len+1); // skip '\0' too + _entries->read(lenOrOpCode); // read this for the fall through } // fall through as a basic operation always follows jdbcontext, and we don't have anything to return yet @@ -156,18 +172,13 @@ namespace mongo { // JEntry - a basic write assert( lenOrOpCode && lenOrOpCode < JEntry::OpCode_Min ); - _br.rewind(4); - e.e = (JEntry *) _br.skip(sizeof(JEntry)); + _entries->rewind(4); + e.e = (JEntry *) _entries->skip(sizeof(JEntry)); e.dbName = e.e->isLocalDbContext() ? 
"local" : _lastDbName; assert( e.e->len == lenOrOpCode ); - _br.skip(e.e->len); - return true; + _entries->skip(e.e->len); } - private: - BufReader _br; - const JSectHeader* _sectHead; - const char *_lastDbName; // pointer into mmaped journal file - const bool _doDurOps; + }; static string fileName(const char* dbName, int fileNo) { @@ -289,27 +300,64 @@ namespace mongo { log() << "END section" << endl; } - void RecoveryJob::processSection(const void *p, unsigned len) { + void RecoveryJob::processSection(const JSectHeader *h, const void *p, unsigned len, const JSectFooter *f) { scoped_lock lk(_mx); + RACECHECK + + /** todo: we should really verify the checksum to see that seqNumber is ok? + that is expensive maybe there is some sort of checksum of just the header + within the header itself + */ + if( _recovering && _lastDataSyncedFromLastRun > h->seqNumber + ExtraKeepTimeMs ) { + if( h->seqNumber != _lastSeqMentionedInConsoleLog ) { + static int n; + if( ++n < 10 ) { + log() << "recover skipping application of section seq:" << h->seqNumber << " < lsn:" << _lastDataSyncedFromLastRun << endl; + } + else if( n == 10 ) { + log() << "recover skipping application of section more..." << endl; + } + _lastSeqMentionedInConsoleLog = h->seqNumber; + } + return; + } - vector<ParsedJournalEntry> entries; - JournalSectionIterator i(p, len, _recovering); + auto_ptr<JournalSectionIterator> i; + if( _recovering ) { + i = auto_ptr<JournalSectionIterator>(new JournalSectionIterator(*h, p, len, _recovering)); + } + else { + i = auto_ptr<JournalSectionIterator>(new JournalSectionIterator(*h, /*after header*/p, /*w/out header*/len)); + } - //DEV log() << "recovery processSection seq:" << i.seqNumber() << endl; - if( _recovering && _lastDataSyncedFromLastRun > i.seqNumber() + ExtraKeepTimeMs ) { - if( i.seqNumber() != _lastSeqMentionedInConsoleLog ) { - log() << "recover skipping application of section seq:" << i.seqNumber() << " < lsn:" << _lastDataSyncedFromLastRun << endl; - _lastSeqMentionedInConsoleLog = i.seqNumber(); + // we use a static so that we don't have to reallocate every time through. occasionally we + // go back to a small allocation so that if there were a spiky growth it won't stick forever. + static vector<ParsedJournalEntry> entries; + entries.clear(); +/** TEMP uncomment + RARELY OCCASIONALLY { + if( entries.capacity() > 2048 ) { + entries.shrink_to_fit(); + entries.reserve(2048); } - return; } +*/ // first read all entries to make sure this section is valid ParsedJournalEntry e; - while( i.next(e) ) { + while( !i->atEof() ) { + i->next(e); entries.push_back(e); } + // after the entries check the footer checksum + if( _recovering ) { + assert( ((const char *)h) + sizeof(JSectHeader) == p ); + if( !f->checkHash(h, len + sizeof(JSectHeader)) ) { + msgasserted(13594, "journal checksum doesn't match"); + } + } + // got all the entries for one group commit. 
apply them: applyEntries(entries); } @@ -345,11 +393,16 @@ namespace mongo { if( h.fileId != fileId ) { if( debug || (cmdLine.durOptions & CmdLine::DurDumpJournal) ) { log() << "Ending processFileBuffer at differing fileId want:" << fileId << " got:" << h.fileId << endl; - log() << " sect len:" << h.len << " seqnum:" << h.seqNumber << endl; + log() << " sect len:" << h.sectionLen() << " seqnum:" << h.seqNumber << endl; } return true; } - processSection(br.skip(h.len), h.len); + unsigned slen = h.sectionLen(); + unsigned dataLen = slen - sizeof(JSectHeader) - sizeof(JSectFooter); + const char *hdr = (const char *) br.skip(h.sectionLenWithPadding()); + const char *data = hdr + sizeof(JSectHeader); + const char *footer = data + dataLen; + processSection((const JSectHeader*) hdr, data, dataLen, (const JSectFooter*) footer); // ctrl c check killCurrentOp.checkForInterrupt(false); @@ -367,6 +420,17 @@ namespace mongo { /** apply a specific journal file */ bool RecoveryJob::processFile(path journalfile) { log() << "recover " << journalfile.string() << endl; + + try { + if( boost::filesystem::file_size( journalfile.string() ) == 0 ) { + log() << "recover info " << journalfile.string() << " has zero length" << endl; + return true; + } + } catch(...) { + // if something weird like a permissions problem keep going so the massert down below can happen (presumably) + log() << "recover exception checking filesize" << endl; + } + MemoryMappedFile f; void *p = f.mapWithOptions(journalfile.string().c_str(), MongoFile::READONLY | MongoFile::SEQUENTIAL); massert(13544, str::stream() << "recover error couldn't open " << journalfile.string(), p); @@ -382,13 +446,19 @@ namespace mongo { _lastDataSyncedFromLastRun = journalReadLSN(); log() << "recover lsn: " << _lastDataSyncedFromLastRun << endl; + // todo: we could truncate the journal file at rotation time to the right length, then this abruptEnd + // check can be turned back on. this is relevant when prealloc is being used. for( unsigned i = 0; i != files.size(); ++i ) { - /*bool abruptEnd = */processFile(files[i]); - /*if( abruptEnd && i+1 < files.size() ) { + bool abruptEnd = processFile(files[i]); + if( abruptEnd && i+1 < files.size() ) { +#if 1 // Leaving this as a warning for now. TODO: make this an error post 2.0 + log() << "recover warning: abrupt end to file " << files[i].string() << ", yet it isn't the last journal file" << endl; +#else log() << "recover error: abrupt end to file " << files[i].string() << ", yet it isn't the last journal file" << endl; close(); uasserted(13535, "recover abrupt journal file end"); - }*/ +#endif + } } close(); diff --git a/db/dur_recover.h b/db/dur_recover.h index b5a922b498a..955e730ea05 100644 --- a/db/dur_recover.h +++ b/db/dur_recover.h @@ -2,6 +2,7 @@ #pragma once +#include "dur_journalformat.h" #include "../util/concurrency/mutex.h" #include "../util/file.h" @@ -15,10 +16,14 @@ namespace mongo { */ class RecoveryJob : boost::noncopyable { public: - RecoveryJob() :_lastDataSyncedFromLastRun(0), _mx("recovery"), _recovering(false) { _lastSeqMentionedInConsoleLog = 1; } + RecoveryJob() : _lastDataSyncedFromLastRun(0), + _mx("recovery"), _recovering(false) { _lastSeqMentionedInConsoleLog = 1; } void go(vector<path>& files); ~RecoveryJob(); - void processSection(const void *, unsigned len); + + /** @param data data between header and footer. compressed if recovering. 
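       [editor's sketch, not part of the patch] during recovery a section is
       expanded and validated roughly as follows; nothing is applied unless the
       md5 in the footer matches:
           string body;
           if( !uncompress( (const char *) data, len, &body ) )          // snappy wrapper, util/compress.h
               msgasserted(15874, "couldn't uncompress journal section");
           // ... parse JEntry / DurOp records out of body ...
           if( !f->checkHash(h, len + sizeof(JSectHeader)) )             // md5 over header + compressed data
               msgasserted(13594, "journal checksum doesn't match");
           // only then applyEntries(...)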
*/ + void processSection(const JSectHeader *h, const void *data, unsigned len, const JSectFooter *f); + void close(); // locks and calls _close() static RecoveryJob & get() { return _instance; } diff --git a/db/dur_stats.h b/db/dur_stats.h index d4943c01cb3..50a26d1f215 100644 --- a/db/dur_stats.h +++ b/db/dur_stats.h @@ -20,6 +20,7 @@ namespace mongo { unsigned _commits; unsigned _earlyCommits; // count of early commits from commitIfNeeded() or from getDur().commitNow() unsigned long long _journaledBytes; + unsigned long long _uncompressedBytes; unsigned long long _writeToDataFilesBytes; unsigned long long _prepLogBufferMicros; diff --git a/db/dur_writetodatafiles.cpp b/db/dur_writetodatafiles.cpp index cdccb018d83..6724f0731aa 100644 --- a/db/dur_writetodatafiles.cpp +++ b/db/dur_writetodatafiles.cpp @@ -47,9 +47,9 @@ namespace mongo { @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc&hl=en */ - void WRITETODATAFILES_Impl1() { + void WRITETODATAFILES_Impl1(const JSectHeader& h, AlignedBuilder& uncompressed) { RWLockRecursive::Shared lk(MongoFile::mmmutex); - RecoveryJob::get().processSection(commitJob._ab.buf(), commitJob._ab.len()); + RecoveryJob::get().processSection(&h, uncompressed.buf(), uncompressed.len(), 0); } #if 0 @@ -81,16 +81,14 @@ namespace mongo { #endif // concurrency: in mmmutex, not necessarily in dbMutex - void WRITETODATAFILES() { + void WRITETODATAFILES(const JSectHeader& h, AlignedBuilder& uncompressed) { Timer t; #if defined(_EXPERIMENTAL) WRITETODATAFILES_Impl3(); #else - WRITETODATAFILES_Impl1(); + WRITETODATAFILES_Impl1(h, uncompressed); #endif stats.curr->_writeToDataFilesMicros += t.micros(); - - } } diff --git a/db/durop.h b/db/durop.h index c4574c2e3cb..9ab1bfcbede 100644 --- a/db/durop.h +++ b/db/durop.h @@ -28,8 +28,6 @@ namespace mongo { namespace dur { - const unsigned Alignment = 8192; - /** DurOp - Operations we journal that aren't just basic writes. * * Basic writes are logged as JEntry's, and indicated in ram temporarily as struct dur::WriteIntent. diff --git a/db/geo/2d.cpp b/db/geo/2d.cpp index 21b0eaa6601..9b762b260de 100644 --- a/db/geo/2d.cpp +++ b/db/geo/2d.cpp @@ -138,7 +138,11 @@ namespace mongo { GeoHash b = a; b.move(1, 1); - _error = distance(a, b); + // Epsilon is 1/100th of a bucket size + // TODO: Can we actually find error bounds for the sqrt function? + double epsilon = 0.001 / _scaling; + _error = distance(a, b) + epsilon; + // Error in radians _errorSphere = deg2rad( _error ); } @@ -293,6 +297,14 @@ namespace mongo { } + BSONObj _fromBSONHash( const BSONElement& e ) const { + return _unhash( _tohash( e ) ); + } + + BSONObj _fromBSONHash( const BSONObj& o ) const { + return _unhash( _tohash( o.firstElement() ) ); + } + GeoHash _tohash( const BSONElement& e ) const { if ( e.isABSONObj() ) return _hash( e.embeddedObject() ); @@ -368,6 +380,10 @@ namespace mongo { } double sizeEdge( const GeoHash& a ) const { + + if( ! 
a.constrains() ) + return _max - _min; + double ax,ay,bx,by; GeoHash b = a; b.move( 1 , 1 ); @@ -443,6 +459,10 @@ namespace mongo { Box() {} + BSONArray toBSON() const { + return BSON_ARRAY( BSON_ARRAY( _min._x << _min._y ) << BSON_ARRAY( _max._x << _max._y ) ); + } + string toString() const { StringBuilder buf(64); buf << _min.toString() << " -->> " << _max.toString(); @@ -630,8 +650,8 @@ namespace mongo { } else if( fudge == 0 ){ - if( p._y == p1._y && p._x == p1._x ) return true; - else if( p._y == p2._y && p._x == p2._x ) return true; + if( p._y == p1._y && p._x == p1._x ) return true; + else if( p._y == p2._y && p._x == p2._x ) return true; } // Normal intersection test. @@ -742,293 +762,96 @@ namespace mongo { geo2dplugin.getName(); } - struct GeoUnitTest : public UnitTest { - - int round( double d ) { - return (int)(.5+(d*1000)); - } - -#define GEOHEQ(a,b) if ( a.toString() != b ){ cout << "[" << a.toString() << "] != [" << b << "]" << endl; assert( a == GeoHash(b) ); } - - void run() { - assert( ! GeoHash::isBitSet( 0 , 0 ) ); - assert( ! GeoHash::isBitSet( 0 , 31 ) ); - assert( GeoHash::isBitSet( 1 , 31 ) ); - - IndexSpec i( BSON( "loc" << "2d" ) ); - Geo2dType g( &geo2dplugin , &i ); - { - double x = 73.01212; - double y = 41.352964; - BSONObj in = BSON( "x" << x << "y" << y ); - GeoHash h = g._hash( in ); - BSONObj out = g._unhash( h ); - assert( round(x) == round( out["x"].number() ) ); - assert( round(y) == round( out["y"].number() ) ); - assert( round( in["x"].number() ) == round( out["x"].number() ) ); - assert( round( in["y"].number() ) == round( out["y"].number() ) ); - } - - { - double x = -73.01212; - double y = 41.352964; - BSONObj in = BSON( "x" << x << "y" << y ); - GeoHash h = g._hash( in ); - BSONObj out = g._unhash( h ); - assert( round(x) == round( out["x"].number() ) ); - assert( round(y) == round( out["y"].number() ) ); - assert( round( in["x"].number() ) == round( out["x"].number() ) ); - assert( round( in["y"].number() ) == round( out["y"].number() ) ); - } - - { - GeoHash h( "0000" ); - h.move( 0 , 1 ); - GEOHEQ( h , "0001" ); - h.move( 0 , -1 ); - GEOHEQ( h , "0000" ); - - h.init( "0001" ); - h.move( 0 , 1 ); - GEOHEQ( h , "0100" ); - h.move( 0 , -1 ); - GEOHEQ( h , "0001" ); - - - h.init( "0000" ); - h.move( 1 , 0 ); - GEOHEQ( h , "0010" ); - } - - { - Box b( 5 , 5 , 2 ); - assert( "(5,5) -->> (7,7)" == b.toString() ); - } - - { - GeoHash a = g.hash( 1 , 1 ); - GeoHash b = g.hash( 4 , 5 ); - assert( 5 == (int)(g.distance( a , b ) ) ); - a = g.hash( 50 , 50 ); - b = g.hash( 42 , 44 ); - assert( round(10) == round(g.distance( a , b )) ); - } - - { - GeoHash x("0000"); - assert( 0 == x.getHash() ); - x.init( 0 , 1 , 32 ); - GEOHEQ( x , "0000000000000000000000000000000000000000000000000000000000000001" ) - - assert( GeoHash( "1100").hasPrefix( GeoHash( "11" ) ) ); - assert( ! 
GeoHash( "1000").hasPrefix( GeoHash( "11" ) ) ); - } - - { - GeoHash x("1010"); - GEOHEQ( x , "1010" ); - GeoHash y = x + "01"; - GEOHEQ( y , "101001" ); - } - - { - - GeoHash a = g.hash( 5 , 5 ); - GeoHash b = g.hash( 5 , 7 ); - GeoHash c = g.hash( 100 , 100 ); - /* - cout << "a: " << a << endl; - cout << "b: " << b << endl; - cout << "c: " << c << endl; - - cout << "a: " << a.toStringHex1() << endl; - cout << "b: " << b.toStringHex1() << endl; - cout << "c: " << c.toStringHex1() << endl; - */ - BSONObj oa = a.wrap(); - BSONObj ob = b.wrap(); - BSONObj oc = c.wrap(); - /* - cout << "a: " << oa.hexDump() << endl; - cout << "b: " << ob.hexDump() << endl; - cout << "c: " << oc.hexDump() << endl; - */ - assert( oa.woCompare( ob ) < 0 ); - assert( oa.woCompare( oc ) < 0 ); - - } - - { - GeoHash x( "000000" ); - x.move( -1 , 0 ); - GEOHEQ( x , "101010" ); - x.move( 1 , -1 ); - GEOHEQ( x , "010101" ); - x.move( 0 , 1 ); - GEOHEQ( x , "000000" ); - } - { - GeoHash prefix( "110011000000" ); - GeoHash entry( "1100110000011100000111000001110000011100000111000001000000000000" ); - assert( ! entry.hasPrefix( prefix ) ); - entry = GeoHash("1100110000001100000111000001110000011100000111000001000000000000"); - assert( entry.toString().find( prefix.toString() ) == 0 ); - assert( entry.hasPrefix( GeoHash( "1100" ) ) ); - assert( entry.hasPrefix( prefix ) ); - } - - { - GeoHash a = g.hash( 50 , 50 ); - GeoHash b = g.hash( 48 , 54 ); - assert( round( 4.47214 ) == round( g.distance( a , b ) ) ); - } - - - { - Box b( Point( 29.762283 , -95.364271 ) , Point( 29.764283000000002 , -95.36227099999999 ) ); - assert( b.inside( 29.763 , -95.363 ) ); - assert( ! b.inside( 32.9570255 , -96.1082497 ) ); - assert( ! b.inside( 32.9570255 , -96.1082497 , .01 ) ); - } - - { - GeoHash a( "11001111" ); - assert( GeoHash( "11" ) == a.commonPrefix( GeoHash("11") ) ); - assert( GeoHash( "11" ) == a.commonPrefix( GeoHash("11110000") ) ); - } - - { - int N = 10000; - { - Timer t; - for ( int i=0; i<N; i++ ) { - unsigned x = (unsigned)rand(); - unsigned y = (unsigned)rand(); - GeoHash h( x , y ); - unsigned a,b; - h.unhash_slow( a,b ); - assert( a == x ); - assert( b == y ); - } - //cout << "slow: " << t.millis() << endl; - } - - { - Timer t; - for ( int i=0; i<N; i++ ) { - unsigned x = (unsigned)rand(); - unsigned y = (unsigned)rand(); - GeoHash h( x , y ); - unsigned a,b; - h.unhash_fast( a,b ); - assert( a == x ); - assert( b == y ); - } - //cout << "fast: " << t.millis() << endl; - } - - } - - { - // see http://en.wikipedia.org/wiki/Great-circle_distance#Worked_example - - { - Point BNA (-86.67, 36.12); - Point LAX (-118.40, 33.94); + class GeoHopper; - double dist1 = spheredist_deg(BNA, LAX); - double dist2 = spheredist_deg(LAX, BNA); + class GeoPoint { + public: - // target is 0.45306 - assert( 0.45305 <= dist1 && dist1 <= 0.45307 ); - assert( 0.45305 <= dist2 && dist2 <= 0.45307 ); - } - { - Point BNA (-1.5127, 0.6304); - Point LAX (-2.0665, 0.5924); + GeoPoint() : _distance( -1 ), _exact( false ) + {} - double dist1 = spheredist_rad(BNA, LAX); - double dist2 = spheredist_rad(LAX, BNA); + //// Distance not used //// - // target is 0.45306 - assert( 0.45305 <= dist1 && dist1 <= 0.45307 ); - assert( 0.45305 <= dist2 && dist2 <= 0.45307 ); - } - { - Point JFK (-73.77694444, 40.63861111 ); - Point LAX (-118.40, 33.94); + GeoPoint( const GeoKeyNode& node ) + : _key( node._key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ), _distance( -1 ) , _exact( false ) { + } - double dist = spheredist_deg(JFK, LAX) * 
EARTH_RADIUS_MILES; - assert( dist > 2469 && dist < 2470 ); - } + //// Immediate initialization of distance //// - { - Point BNA (-86.67, 36.12); - Point LAX (-118.40, 33.94); - Point JFK (-73.77694444, 40.63861111 ); - assert( spheredist_deg(BNA, BNA) < 1e-6); - assert( spheredist_deg(LAX, LAX) < 1e-6); - assert( spheredist_deg(JFK, JFK) < 1e-6); + GeoPoint( const GeoKeyNode& node, double distance, bool exact ) + : _key( node._key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ), _distance( distance ), _exact( exact ) { + } - Point zero (0, 0); - Point antizero (0,-180); + GeoPoint( const GeoPoint& pt, double distance, bool exact ) + : _key( pt.key() ) , _loc( pt.loc() ) , _o( pt.obj() ), _distance( distance ), _exact( exact ) { + } - // these were known to cause NaN - assert( spheredist_deg(zero, zero) < 1e-6); - assert( fabs(M_PI-spheredist_deg(zero, antizero)) < 1e-6); - assert( fabs(M_PI-spheredist_deg(antizero, zero)) < 1e-6); - } - } + bool operator<( const GeoPoint& other ) const { + if( _distance != other._distance ) return _distance < other._distance; + if( _exact != other._exact ) return _exact < other._exact; + return _loc < other._loc; } - } geoUnitTest; - class GeoHopper; + double distance() const { + return _distance; + } - class GeoPoint { - public: - GeoPoint() { } + bool isExact() const { + return _exact; + } - //// Distance not used //// + BSONObj key() const { + return _key; + } - GeoPoint( const GeoKeyNode& node ) - : _key( node._key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ) , _exactDistance( -1 ), _exactWithin( false ) { + DiskLoc loc() const { + return _loc; } - - //// Immediate initialization of exact distance //// - GeoPoint( const GeoKeyNode& node , double exactDistance, bool exactWithin ) - : _key( node._key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ), _exactDistance( exactDistance ), _exactWithin( exactWithin ) { + BSONObj obj() const { + return _o; } - bool operator<( const GeoPoint& other ) const { - return _exactDistance < other._exactDistance; + BSONObj pt() const { + return _pt; } - bool isEmpty() const { + bool isEmpty() { return _o.isEmpty(); } string toString() const { - return str::stream() << "Point from " << _o.toString() << " dist : " << _exactDistance << " within ? " << _exactWithin; + return str::stream() << "Point from " << _o << " dist : " << _distance << ( _exact ? " (ex)" : " (app)" ); } BSONObj _key; DiskLoc _loc; BSONObj _o; + BSONObj _pt; - double _exactDistance; - bool _exactWithin; + double _distance; + bool _exact; }; // GeoBrowse subclasses this class GeoAccumulator { public: - GeoAccumulator( const Geo2dType * g , const BSONObj& filter ) - : _g(g) , _lookedAt(0) , _objectsLoaded(0) , _found(0) { + GeoAccumulator( const Geo2dType * g , const BSONObj& filter, bool uniqueDocs, bool needDistance ) + : _g(g) , + _keysChecked(0) , + _lookedAt(0) , + _matchesPerfd(0) , + _objectsLoaded(0) , + _pointsLoaded(0) , + _found(0) , + _uniqueDocs( uniqueDocs ) , + _needDistance( needDistance ) + { if ( ! filter.isEmpty() ) { _matcher.reset( new CoveredIndexMatcher( filter , g->keyPattern() ) ); + GEODEBUG( "Matcher is now " << _matcher->docMatcher().toString() ); } } @@ -1042,6 +865,9 @@ namespace mongo { set< pair<DiskLoc,int> > _seen; public: bool seen(DiskLoc bucket, int pos) { + + _keysChecked++; + pair< set<pair<DiskLoc,int> >::iterator, bool > seenBefore = _seen.insert( make_pair(bucket,pos) ); if ( ! 
seenBefore.second ) { GEODEBUG( "\t\t\t\t already seen : " << bucket.toString() << ' ' << pos ); // node.key.toString() << " @ " << Point( _g, GeoHash( node.key.firstElement() ) ).toString() << " with " << node.recordLoc.obj()["_id"] ); @@ -1050,29 +876,43 @@ namespace mongo { return false; } - void add( const GeoKeyNode& node ) { + enum KeyResult { BAD, BORDER, GOOD }; + + virtual void add( const GeoKeyNode& node ) { - GEODEBUG( "\t\t\t\t checking key " << node.key.toString() ) + GEODEBUG( "\t\t\t\t checking key " << node._key.toString() ) _lookedAt++; - // distance check - double d = 0; - if ( ! checkDistance( node , d ) ) { - GEODEBUG( "\t\t\t\t bad distance : " << node.recordLoc.obj() << "\t" << d ); + //// + // Approximate distance check using key data + //// + double keyD = 0; + Point keyP( _g, GeoHash( node._key.firstElement(), _g->_bits ) ); + KeyResult keyOk = approxKeyCheck( keyP, keyD ); + if ( keyOk == BAD ) { + GEODEBUG( "\t\t\t\t bad distance : " << node.recordLoc.obj() << "\t" << keyD ); return; } - GEODEBUG( "\t\t\t\t good distance : " << node.recordLoc.obj() << "\t" << d ); + GEODEBUG( "\t\t\t\t good distance : " << node.recordLoc.obj() << "\t" << keyD ); + //// + // Check for match using other key (and potentially doc) criteria + //// // Remember match results for each object map<DiskLoc, bool>::iterator match = _matched.find( node.recordLoc ); bool newDoc = match == _matched.end(); if( newDoc ) { + GEODEBUG( "\t\t\t\t matching new doc with " << (_matcher ? _matcher->docMatcher().toString() : "(empty)" ) ); + // matcher MatchDetails details; if ( _matcher.get() ) { bool good = _matcher->matchesWithSingleKeyIndex( node._key , node.recordLoc , &details ); + + _matchesPerfd++; + if ( details._loadedObject ) _objectsLoaded++; @@ -1094,12 +934,50 @@ namespace mongo { return; } - addSpecific( node , d, newDoc ); - _found++; + //// + // Exact check with particular data fields + //// + // Can add multiple points + int diff = addSpecific( node , keyP, keyOk == BORDER, keyD, newDoc ); + if( diff > 0 ) _found += diff; + else _found -= -diff; + + } + + virtual void getPointsFor( const BSONObj& key, const BSONObj& obj, vector< BSONObj >& locsForNode, bool allPoints = false ){ + + // Find all the location objects from the keys + vector< BSONObj > locs; + _g->getKeys( obj, allPoints ? 
locsForNode : locs ); + _pointsLoaded++; + + if( allPoints ) return; + if( locs.size() == 1 ){ + locsForNode.push_back( locs[0] ); + return; + } + + // Find the particular location we want + GeoHash keyHash( key.firstElement(), _g->_bits ); + + // log() << "Hash: " << node.key << " and " << keyHash.getHash() << " unique " << _uniqueDocs << endl; + for( vector< BSONObj >::iterator i = locs.begin(); i != locs.end(); ++i ) { + + // Ignore all locations not hashed to the key's hash, since we may see + // those later + if( _g->_hash( *i ) != keyHash ) continue; + + locsForNode.push_back( *i ); + + } + } - virtual void addSpecific( const GeoKeyNode& node , double d, bool newDoc ) = 0; - virtual bool checkDistance( const GeoKeyNode& node , double& d ) = 0; + virtual int addSpecific( const GeoKeyNode& node, const Point& p , bool inBounds, double d, bool newDoc ) = 0; + virtual KeyResult approxKeyCheck( const Point& p , double& keyD ) = 0; + virtual bool exactDocCheck( const Point& p , double& d ) = 0; + virtual bool expensiveExactCheck(){ return false; } + long long found() const { return _found; @@ -1109,9 +987,16 @@ namespace mongo { map<DiskLoc, bool> _matched; shared_ptr<CoveredIndexMatcher> _matcher; + long long _keysChecked; long long _lookedAt; + long long _matchesPerfd; long long _objectsLoaded; + long long _pointsLoaded; long long _found; + + bool _uniqueDocs; + bool _needDistance; + }; struct BtreeLocation { @@ -1264,8 +1149,8 @@ namespace mongo { DONE } _state; - GeoBrowse( const Geo2dType * g , string type , BSONObj filter = BSONObj() ) - : GeoCursorBase( g ), GeoAccumulator( g , filter ) , + GeoBrowse( const Geo2dType * g , string type , BSONObj filter = BSONObj(), bool uniqueDocs = true, bool needDistance = false ) + : GeoCursorBase( g ), GeoAccumulator( g , filter, uniqueDocs, needDistance ) , _type( type ) , _filter( filter ) , _firstCall(true), _nscanned(), _centerPrefix(0, 0, 0) { // Set up the initial expand state @@ -1350,11 +1235,9 @@ namespace mongo { virtual void fillStack( int maxToCheck, int maxToAdd = -1, bool onlyExpand = false ) { #ifdef GEODEBUGGING - - int s = _state; log() << "Filling stack with maximum of " << maxToCheck << ", state : " << (int) _state << endl; - #endif + if( maxToAdd < 0 ) maxToAdd = maxToCheck; int maxFound = _foundInExp + maxToCheck; assert( maxToCheck > 0 ); @@ -1395,7 +1278,6 @@ namespace mongo { while ( true ) { GEODEBUG( "box prefix [" << _prefix << "]" ); - #ifdef GEODEBUGGING if( _prefix.constrains() ) { log() << "current expand box : " << Box( _g, _prefix ).toString() << endl; @@ -1407,6 +1289,9 @@ namespace mongo { GEODEBUG( "expanding box points... "); + // Record the prefix we're actively exploring... + _expPrefix.reset( new GeoHash( _prefix ) ); + // Find points inside this prefix while ( _min.hasPrefix( _prefix ) && _min.advance( -1 , _foundInExp , this ) && _foundInExp < maxFound && _found < maxAdded ); while ( _max.hasPrefix( _prefix ) && _max.advance( 1 , _foundInExp , this ) && _foundInExp < maxFound && _found < maxAdded ); @@ -1419,7 +1304,7 @@ namespace mongo { #endif - GEODEBUG( "finished expand, found : " << ( maxToCheck - ( maxFound - _found ) ) ); + GEODEBUG( "finished expand, found : " << ( maxToAdd - ( maxAdded - _found ) ) ); if( _foundInExp >= maxFound || _found >= maxAdded ) return; // We've searched this prefix fully, remember @@ -1429,6 +1314,7 @@ namespace mongo { if ( ! 
_prefix.constrains() ) { GEODEBUG( "box exhausted" ); _state = DONE; + notePrefix(); return; } @@ -1453,8 +1339,9 @@ break; } - } + notePrefix(); + } // If we're only expanding the central box, don't examine neighbors if( onlyExpand ) return; @@ -1495,7 +1382,7 @@ GeoHash _neighborPrefix = _centerPrefix; _neighborPrefix.move( i, j ); - GEODEBUG( "moving to " << i << " , " << j ); + GEODEBUG( "moving to " << i << " , " << j << " fringe : " << _fringe.size() ); PREFIXDEBUG( _centerPrefix, _g ); PREFIXDEBUG( _neighborPrefix , _g ); while( _fringe.size() > 0 ) { @@ -1542,7 +1429,7 @@ // be entirely done. Max recurse depth is < 8 * 16. // If we're maxed out on points, return - if( _foundInExp >= maxFound ) { + if( _foundInExp >= maxFound || _found >= maxAdded ) { // Make sure we'll come back to add more points assert( _state == DOING_EXPAND ); return; } @@ -1571,14 +1458,63 @@ // The amount the current box overlaps our search area virtual double intersectsBox( Box& cur ) = 0; - virtual void addSpecific( const GeoKeyNode& node , double d, bool newDoc ) { + virtual int addSpecific( const GeoKeyNode& node , const Point& keyP , bool onBounds , double keyD , bool newDoc ) { - if( ! newDoc ) return; + int found = 0; - if ( _cur.isEmpty() ) - _cur = GeoPoint( node ); - else - _stack.push_back( GeoPoint( node ) ); + // We need to handle every possible point in this method, even those not in the key value, to + // avoid us tracking which hashes we've already seen. + if( ! newDoc ){ + // log() << "Already handled doc!" << endl; + return 0; + } + + if( _uniqueDocs && ! onBounds ) { + // log() << "Added ind to " << _type << endl; + _stack.push_front( GeoPoint( node ) ); + found++; + } + else { + // We now handle every possible point in the document, even those not in the key value, + // since we're iterating through them anyway - prevents us from having to save the hashes + // we've seen per-doc + + // If we're filtering by hash, get the original + bool expensiveExact = expensiveExactCheck(); + + vector< BSONObj > locs; + getPointsFor( node._key, node.recordLoc.obj(), locs, true ); + for( vector< BSONObj >::iterator i = locs.begin(); i != locs.end(); ++i ){ + + double d = -1; + Point p( *i ); + + // We can avoid exact document checks by redoing approx checks, + // if the exact checks are more expensive. + bool needExact = true; + if( expensiveExact ){ + assert( false ); + KeyResult result = approxKeyCheck( p, d ); + if( result == BAD ) continue; + else if( result == GOOD ) needExact = false; + } + + if( !
needExact || exactDocCheck( p, d ) ){ + // log() << "Added mult to " << _type << endl; + _stack.push_front( GeoPoint( node ) ); + found++; + // If returning unique, just exit after first point is added + if( _uniqueDocs ) break; + } + } + } + + if ( _cur.isEmpty() && _stack.size() > 0 ){ + _cur = _stack.front(); + _stack.pop_front(); + } + + return found; } virtual long long nscanned() { @@ -1588,6 +1524,35 @@ namespace mongo { return _nscanned; } + virtual void explainDetails( BSONObjBuilder& b ){ + b << "keysChecked" << _keysChecked; + b << "lookedAt" << _lookedAt; + b << "matchesPerfd" << _matchesPerfd; + b << "objectsLoaded" << _objectsLoaded; + b << "pointsLoaded" << _pointsLoaded; + } + + virtual BSONObj prettyIndexBounds() const { + + vector<GeoHash>::const_iterator i = _expPrefixes.end(); + if( _expPrefixes.size() > 0 && *(--i) != *( _expPrefix.get() ) ) + _expPrefixes.push_back( *( _expPrefix.get() ) ); + + BSONObjBuilder bob; + BSONArrayBuilder bab; + for( i = _expPrefixes.begin(); i != _expPrefixes.end(); ++i ){ + bab << Box( _g, *i ).toBSON(); + } + bob << _g->_geo << bab.arr(); + + return bob.obj(); + + } + + void notePrefix() { + _expPrefixes.push_back( _prefix ); + } + string _type; BSONObj _filter; list<GeoPoint> _stack; @@ -1616,6 +1581,9 @@ namespace mongo { BtreeLocation _min; BtreeLocation _max; + shared_ptr<GeoHash> _expPrefix; + mutable vector<GeoHash> _expPrefixes; + }; @@ -1623,133 +1591,148 @@ namespace mongo { public: typedef multiset<GeoPoint> Holder; - GeoHopper( const Geo2dType * g , unsigned max , const Point& n , const BSONObj& filter = BSONObj() , double maxDistance = numeric_limits<double>::max() , GeoDistType type=GEO_PLAIN ) - : GeoBrowse( g, "search", filter ), _max( max ) , _near( n ), _maxDistance( maxDistance ), _type( type ), _distError( type == GEO_PLAIN ? g->_error : g->_errorSphere ), _farthest(0) + GeoHopper( const Geo2dType * g , unsigned max , const Point& n , const BSONObj& filter = BSONObj() , double maxDistance = numeric_limits<double>::max() , GeoDistType type=GEO_PLAIN, bool uniqueDocs = false, bool needDistance = true ) + : GeoBrowse( g, "search", filter, uniqueDocs, needDistance ), _max( max ) , _near( n ), _maxDistance( maxDistance ), _type( type ), _distError( type == GEO_PLAIN ? g->_error : g->_errorSphere ), _farthest(0) {} - virtual bool checkDistance( const GeoKeyNode& node, double& d ) { + virtual KeyResult approxKeyCheck( const Point& p, double& d ) { // Always check approximate distance, since it lets us avoid doing // checks of the rest of the object if it succeeds - // TODO: Refactor so that we can check exact distance and within if we are going to - // anyway. 
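// [editor's note, not part of the patch] the boolean 'good' in the removed code
// below collapses three cases that the new approxKeyCheck() keeps apart: a key
// distance within 2 * _distError of the current cutoff cannot be decided from
// the lossy geohash alone and is returned as BORDER for exactDocCheck() to
// settle. Worked example, assuming _maxDistance = 1.0 and _distError = 0.01
// (cutoff 1.0, ambiguity band [0.98, 1.02]):
//     d = 0.97  -> GOOD      (truly inside even if the key overestimates)
//     d = 0.985 -> BORDER    (needs the exact per-document check)
//     d = 1.015 -> BORDER    (likewise)
//     d = 1.03  -> BAD       (truly outside even if the key underestimates)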
- d = approxDistance( node ); - assert( d >= 0 ); - // Out of the error range, see how close we are to the furthest points - bool good = d <= _maxDistance + 2 * _distError /* In error range */ - && ( _points.size() < _max /* need more points */ - || d <= farthest() + 2 * _distError /* could be closer than previous points */ ); + switch (_type) { + case GEO_PLAIN: + d = _near.distance( p ); + break; + case GEO_SPHERE: + checkEarthBounds( p ); + d = spheredist_deg( _near, p ); + break; + default: assert( false ); + } + assert( d >= 0 ); GEODEBUG( "\t\t\t\t\t\t\t checkDistance " << _near.toString() - << "\t" << GeoHash( node.key.firstElement() ) << "\t" << d - << " ok: " << good << " farthest: " << farthest() ); + << "\t" << p.toString() << "\t" << d + << " farthest: " << farthest() ); - return good; - } + // If we need more points + double borderDist = ( _points.size() < _max ? _maxDistance : farthest() ); + + if( d >= borderDist - 2 * _distError && d <= borderDist + 2 * _distError ) return BORDER; + else return d < borderDist ? GOOD : BAD; - double approxDistance( const GeoKeyNode& node ) { - return approxDistance( GeoHash( node._key.firstElement() ) ); } - double approxDistance( const GeoHash& h ) { + virtual bool exactDocCheck( const Point& p, double& d ){ - double approxDistance = -1; - Point p( _g, h ); - switch (_type) { + bool within = false; + + // Get the appropriate distance for the type + switch ( _type ) { case GEO_PLAIN: - approxDistance = _near.distance( p ); + d = _near.distance( p ); + within = _near.distanceWithin( p, _maxDistance ); break; case GEO_SPHERE: checkEarthBounds( p ); - approxDistance = spheredist_deg( _near, p ); + d = spheredist_deg( _near, p ); + within = ( d <= _maxDistance ); break; default: assert( false ); } - return approxDistance; + return within; } - double exactDistances( const GeoKeyNode& node ) { - - GEODEBUG( "Finding exact distance for " << node.key.toString() << " and " << node.recordLoc.obj().toString() ); - - // Find all the location objects from the keys - vector< BSONObj > locs; - _g->getKeys( node.recordLoc.obj(), locs ); + // Always in distance units, whether radians or normal + double farthest() const { + return _farthest; + } - double maxDistance = -1; + virtual int addSpecific( const GeoKeyNode& node, const Point& keyP, bool onBounds, double keyD, bool newDoc ) { - // Find the particular location we want - BSONObj loc; - GeoHash keyHash( node._key.firstElement(), _g->_bits ); - for( vector< BSONObj >::iterator i = locs.begin(); i != locs.end(); ++i ) { + // Unique documents - loc = *i; + GeoPoint newPoint( node, keyD, false ); - // Ignore all locations not hashed to the key's hash, since we may see - // those later - if( _g->_hash( loc ) != keyHash ) continue; + int prevSize = _points.size(); - double exactDistance = -1; - bool exactWithin = false; + // STEP 1 : Remove old duplicate points from the set if needed + if( _uniqueDocs ){ - Point p( loc ); + // Lookup old point with same doc + map< DiskLoc , Holder::iterator >::iterator oldPointIt = _seenPts.find( newPoint.loc() ); - // Get the appropriate distance for the type - switch ( _type ) { - case GEO_PLAIN: - exactDistance = _near.distance( p ); - exactWithin = _near.distanceWithin( p, _maxDistance ); - break; - case GEO_SPHERE: - checkEarthBounds( p ); - exactDistance = spheredist_deg( _near, p ); - exactWithin = ( exactDistance <= _maxDistance ); - break; - default: assert( false ); + if( oldPointIt != _seenPts.end() ){ + const GeoPoint& oldPoint = *(oldPointIt->second); + // We 
don't need to care if we've already seen this same approx pt or better, + // or we've already gone to disk once for the point + if( oldPoint < newPoint ){ + GEODEBUG( "\t\tOld point closer than new point" ); + return 0; + } + GEODEBUG( "\t\tErasing old point " << oldPointIt->first.obj() ); + _points.erase( oldPointIt->second ); } - assert( exactDistance >= 0 ); - if( !exactWithin ) continue; + Holder::iterator newIt = _points.insert( newPoint ); + if( _uniqueDocs ) _seenPts[ newPoint.loc() ] = newIt; - GEODEBUG( "Inserting exact point: " << GeoPoint( node , exactDistance, exactWithin ).toString() ); + GEODEBUG( "\t\tInserted new point " << newPoint.toString() << " approx : " << keyD ); - // Add a point for this location - _points.insert( GeoPoint( node , exactDistance, exactWithin ) ); + assert( _max > 0 ); - if( exactDistance > maxDistance ) maxDistance = exactDistance; - } + Holder::iterator lastPtIt = _points.end(); + lastPtIt--; + _farthest = lastPtIt->distance() + 2 * _distError; - return maxDistance; + return _points.size() - prevSize; } - // Always in distance units, whether radians or normal - double farthest() const { - return _farthest; - } + // Removes extra points from end of _points set. + // Check can be a bit costly if we have lots of exact points near borders, + // so we'll do this every once in a while. + void processExtraPoints(){ - bool inErrorBounds( double approxD ) const { - return approxD >= _maxDistance - _distError && approxD <= _maxDistance + _distError; - } + if( _points.size() == 0 ) return; - virtual void addSpecific( const GeoKeyNode& node , double d, bool newDoc ) { + int prevSize = _points.size(); - GEODEBUG( "\t\t" << GeoHash( node.key.firstElement() ) << "\t" << node.recordLoc.obj() << "\t" << d ); + // Erase all points from the set with a position >= _max *and* + // whose distance isn't close to the _max - 1 position distance - double maxDistance = exactDistances( node ); - if( maxDistance >= 0 ){ + int numToErase = _points.size() - _max; + if( numToErase < 0 ) numToErase = 0; - // Recalculate the current furthest point. 
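// [editor's note, not part of the patch] the removed code below kept exact
// distances, so the pruning cutoff could be read straight off the last point;
// the new addSpecific() stores approximate key distances, hence the padded
// cutoff set above:
//     _farthest = lastPtIt->distance() + 2 * _distError;
// without that slack, a point whose key distance slightly overestimates its
// true distance could be pruned even though its exact distance qualifies.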
- int numToErase = _points.size() - _max; - while( numToErase-- > 0 ){ - _points.erase( --_points.end() ); - } + // Get the first point definitely in the _points array + Holder::iterator startErase = _points.end(); + for( int i = 0; i < numToErase + 1; i++ ) startErase--; + _farthest = startErase->distance() + 2 * _distError; - _farthest = boost::next( _points.end(), -1 )->_exactDistance; + GEODEBUG( "\t\tPotentially erasing " << numToErase << " points, " << " size : " << _points.size() << " max : " << _max << " dist : " << startErase->distance() << " farthest dist : " << _farthest << " from error : " << _distError ); + startErase++; + while( numToErase > 0 && startErase->distance() <= _farthest ){ + GEODEBUG( "\t\tNot erasing point " << startErase->toString() ); + numToErase--; + startErase++; + assert( startErase != _points.end() || numToErase == 0 ); } + + if( _uniqueDocs ){ + for( Holder::iterator i = startErase; i != _points.end(); ++i ) + _seenPts.erase( i->loc() ); + } + + _points.erase( startErase, _points.end() ); + + int diff = _points.size() - prevSize; + if( diff > 0 ) _found += diff; + else _found -= -diff; + } unsigned _max; @@ -1760,17 +1743,20 @@ namespace mongo { double _distError; double _farthest; + map< DiskLoc , Holder::iterator > _seenPts; + }; class GeoSearch : public GeoHopper { public: - GeoSearch( const Geo2dType * g , const Point& startPt , int numWanted=100 , BSONObj filter=BSONObj() , double maxDistance = numeric_limits<double>::max() , GeoDistType type=GEO_PLAIN ) - : GeoHopper( g , numWanted , startPt , filter , maxDistance, type ), + GeoSearch( const Geo2dType * g , const Point& startPt , int numWanted=100 , BSONObj filter=BSONObj() , double maxDistance = numeric_limits<double>::max() , GeoDistType type=GEO_PLAIN, bool uniqueDocs = false, bool needDistance = false ) + : GeoHopper( g , numWanted , startPt , filter , maxDistance, type, uniqueDocs, needDistance ), _start( g->hash( startPt._x, startPt._y ) ), - _numWanted( numWanted ), - _type(type) + // TODO: Remove numWanted... + _numWanted( numWanted ), + _type(type) { assert( g->getDetails() ); @@ -1795,6 +1781,8 @@ namespace mongo { void exec() { + if( _numWanted == 0 ) return; + /* * Search algorithm * 1) use geohash prefix to find X items @@ -1805,7 +1793,7 @@ namespace mongo { #ifdef GEODEBUGGING - log() << "start near search for points near " << _near << " (max dist " << _maxDistance << ")" << endl; + log() << "start near search for " << _numWanted << " points near " << _near << " (max dist " << _maxDistance << ")" << endl; #endif @@ -1815,13 +1803,16 @@ namespace mongo { long long f = found(); assert( f <= 0x7fffffff ); fillStack( maxPointsHeuristic, _numWanted - static_cast<int>(f) , true ); + processExtraPoints(); } while( _state != DONE && _state != DONE_NEIGHBOR && found() < _numWanted && (! 
_prefix.constrains() || _g->sizeEdge( _prefix ) <= _scanDistance ) ); // If we couldn't scan or scanned everything, we're done - if( _state == DONE ) return; - + if( _state == DONE ){ + expandEndPoints(); + return; + } } #ifdef GEODEBUGGING @@ -1856,6 +1847,8 @@ namespace mongo { _want = Box( _near._x - farDist , _near._y - farDist , farDist * 2 ); GEODEBUGPRINT( _want.toString() ); + // log() << "Found : " << found() << " wanted : " << _numWanted << " Far distance : " << farDist << " box : " << _want << endl; + // Remember the far distance for further scans _scanDistance = farDist; @@ -1874,15 +1867,195 @@ namespace mongo { // Do regular search in the full region do { fillStack( maxPointsHeuristic ); + processExtraPoints(); } while( _state != DONE ); } - GEODEBUG( "done near search" ) + GEODEBUG( "done near search with " << _points.size() << " points " ); + + expandEndPoints(); } + void addExactPoints( const GeoPoint& pt, Holder& points, bool force ){ + int before, after; + addExactPoints( pt, points, before, after, force ); + } + + void addExactPoints( const GeoPoint& pt, Holder& points, int& before, int& after, bool force ){ + + before = 0; + after = 0; + + GEODEBUG( "Adding exact points for " << pt.toString() ); + + if( pt.isExact() ){ + if( force ) points.insert( pt ); + return; + } + + vector<BSONObj> locs; + getPointsFor( pt.key(), pt.obj(), locs, _uniqueDocs ); + + GeoPoint nearestPt( pt, -1, true ); + + for( vector<BSONObj>::iterator i = locs.begin(); i != locs.end(); i++ ){ + + Point loc( *i ); + + double d; + if( ! exactDocCheck( loc, d ) ) continue; + + if( _uniqueDocs && ( nearestPt.distance() < 0 || d < nearestPt.distance() ) ){ + nearestPt._distance = d; + nearestPt._pt = *i; + continue; + } + else if( ! _uniqueDocs ){ + GeoPoint exactPt( pt, d, true ); + exactPt._pt = *i; + GEODEBUG( "Inserting exact pt " << exactPt.toString() << " for " << pt.toString() << " exact : " << d << " is less? " << ( exactPt < pt ) << " bits : " << _g->_bits ); + points.insert( exactPt ); + exactPt < pt ? before++ : after++; + } + + } + + if( _uniqueDocs && nearestPt.distance() >= 0 ){ + GEODEBUG( "Inserting unique exact pt " << nearestPt.toString() << " for " << pt.toString() << " exact : " << nearestPt.distance() << " is less? 
" << ( nearestPt < pt ) << " bits : " << _g->_bits ); + points.insert( nearestPt ); + if( nearestPt < pt ) before++; + else after++; + } + + } + + // TODO: Refactor this back into holder class, allow to run periodically when we are seeing a lot of pts + void expandEndPoints( bool finish = true ){ + + processExtraPoints(); + + // All points in array *could* be in maxDistance + + // Step 1 : Trim points to max size + // TODO: This check will do little for now, but is skeleton for future work in incremental $near + // searches + if( _max > 0 ){ + + int numToErase = _points.size() - _max; + + if( numToErase > 0 ){ + + Holder tested; + + // Work backward through all points we're not sure belong in the set + Holder::iterator maybePointIt = _points.end(); + maybePointIt--; + double approxMin = maybePointIt->distance() - 2 * _distError; + + GEODEBUG( "\t\tNeed to erase " << numToErase << " max : " << _max << " min dist " << approxMin << " error : " << _distError << " starting from : " << (*maybePointIt).toString() ); + + // Insert all + int erased = 0; + while( _points.size() > 0 && ( maybePointIt->distance() >= approxMin || erased < numToErase ) ){ + + Holder::iterator current = maybePointIt--; + + addExactPoints( *current, tested, true ); + _points.erase( current ); + erased++; + + if( tested.size() ) + approxMin = tested.begin()->distance() - 2 * _distError; + + } + + GEODEBUG( "\t\tEnding search at point " << ( _points.size() == 0 ? "(beginning)" : maybePointIt->toString() ) ); + + int numToAddBack = erased - numToErase; + assert( numToAddBack >= 0 ); + + GEODEBUG( "\t\tNum tested valid : " << tested.size() << " erased : " << erased << " added back : " << numToAddBack ); + +#ifdef GEODEBUGGING + for( Holder::iterator it = tested.begin(); it != tested.end(); it++ ){ + log() << "Tested Point: " << *it << endl; + } +#endif + Holder::iterator testedIt = tested.begin(); + for( int i = 0; i < numToAddBack && testedIt != tested.end(); i++ ){ + _points.insert( *testedIt ); + testedIt++; + } + } + } + +#ifdef GEODEBUGGING + for( Holder::iterator it = _points.begin(); it != _points.end(); it++ ){ + log() << "Point: " << *it << endl; + } +#endif + // We've now trimmed first set of unneeded points + + GEODEBUG( "\t\t Start expanding, num points : " << _points.size() << " max : " << _max ); + + // Step 2: iterate through all points and add as needed + + unsigned expandedPoints = 0; + Holder::iterator it = _points.begin(); + double expandWindowEnd = -1; + while( it != _points.end() ){ + const GeoPoint& currPt = *it; + + // TODO: If one point is exact, maybe not 2 * _distError + + // See if we're in an expand window + bool inWindow = currPt.distance() <= expandWindowEnd; + // If we're not, and we're done with points, break + if( ! inWindow && expandedPoints >= _max ) break; + + bool expandApprox = ! currPt.isExact() && ( ! _uniqueDocs || ( finish && _needDistance ) || inWindow ); + + if( expandApprox ){ + + // Add new point(s) + // These will only be added in a radius of 2 * _distError around the current point, + // so should not affect previously valid points. 
+ int before, after; + addExactPoints( currPt, _points, before, after, false ); + expandedPoints += before; + + if( _max > 0 && expandedPoints < _max ) + expandWindowEnd = currPt.distance() + 2 * _distError; + + // Iterate to the next point + Holder::iterator current = it++; + // Erase the current point + _points.erase( current ); + + } + else{ + expandedPoints++; + it++; + } + } + + GEODEBUG( "\t\tFinished expanding, num points : " << _points.size() << " max : " << _max ); + + // Finish + // TODO: Don't really need to trim? + for( ; expandedPoints > _max; expandedPoints-- ) it--; + _points.erase( it, _points.end() ); + +#ifdef GEODEBUGGING + for( Holder::iterator it = _points.begin(); it != _points.end(); it++ ){ + log() << "Point: " << *it << endl; + } +#endif + } + virtual GeoHash expandStartHash(){ return _start; } @@ -1915,7 +2088,7 @@ namespace mongo { : GeoCursorBase( s->_spec ) , _s( s ) , _cur( s->_points.begin() ) , _end( s->_points.end() ), _nscanned() { if ( _cur != _end ) { - ++_nscanned; + ++_nscanned; } } @@ -1975,8 +2148,8 @@ namespace mongo { class GeoCircleBrowse : public GeoBrowse { public: - GeoCircleBrowse( const Geo2dType * g , const BSONObj& circle , BSONObj filter = BSONObj() , const string& type="$center") - : GeoBrowse( g , "circle" , filter ) { + GeoCircleBrowse( const Geo2dType * g , const BSONObj& circle , BSONObj filter = BSONObj() , const string& type="$center", bool uniqueDocs = true ) + : GeoBrowse( g , "circle" , filter, uniqueDocs ) { uassert( 13060 , "$center needs 2 fields (middle,max distance)" , circle.nFields() == 2 ); @@ -2040,19 +2213,16 @@ namespace mongo { return cur.intersects( _bBox ); } - virtual bool checkDistance( const GeoKeyNode& node, double& d ) { - - GeoHash h( node._key.firstElement(), _g->_bits ); + virtual KeyResult approxKeyCheck( const Point& p, double& d ) { // Inexact hash distance checks. double error = 0; switch (_type) { case GEO_PLAIN: - d = _g->distance( _start , h ); + d = _startPt.distance( p ); error = _g->_error; break; case GEO_SPHERE: { - Point p( _g, h ); checkEarthBounds( p ); d = spheredist_deg( _startPt, p ); error = _g->_errorSphere; @@ -2062,40 +2232,25 @@ namespace mongo { } // If our distance is in the error bounds... - if( d >= _maxDistance - error && d <= _maxDistance + error ) { - - // Do exact check - vector< BSONObj > locs; - _g->getKeys( node.recordLoc.obj(), locs ); - - for( vector< BSONObj >::iterator i = locs.begin(); i != locs.end(); ++i ) { - - GEODEBUG( "Inexact distance : " << d << " vs " << _maxDistance << " from " << ( *i ).toString() << " due to error " << error ); - - Point p( *i ); - // Exact distance checks. - switch (_type) { - case GEO_PLAIN: { - if( _startPt.distanceWithin( p, _maxDistance ) ) return true; - break; - } - case GEO_SPHERE: - // Ignore all locations not hashed to the key's hash, since spherical calcs are - // more expensive. - if( _g->_hash( *i ) != h ) break; - checkEarthBounds( p ); - if( spheredist_deg( _startPt , p ) <= _maxDistance ) return true; - break; - default: assert( false ); - } + if( d >= _maxDistance - error && d <= _maxDistance + error ) return BORDER; + return d > _maxDistance ? 
BAD : GOOD; + } - } + virtual bool exactDocCheck( const Point& p, double& d ){ - return false; + switch (_type) { + case GEO_PLAIN: { + if( _startPt.distanceWithin( p, _maxDistance ) ) return true; + break; + } + case GEO_SPHERE: + checkEarthBounds( p ); + if( spheredist_deg( _startPt , p ) <= _maxDistance ) return true; + break; + default: assert( false ); } - GEODEBUG( "\t " << h << "\t" << d ); - return d <= _maxDistance; + return false; } GeoDistType _type; @@ -2111,8 +2266,8 @@ namespace mongo { class GeoBoxBrowse : public GeoBrowse { public: - GeoBoxBrowse( const Geo2dType * g , const BSONObj& box , BSONObj filter = BSONObj() ) - : GeoBrowse( g , "box" , filter ) { + GeoBoxBrowse( const Geo2dType * g , const BSONObj& box , BSONObj filter = BSONObj(), bool uniqueDocs = true ) + : GeoBrowse( g , "box" , filter, uniqueDocs ) { uassert( 13063 , "$box needs 2 fields (bottomLeft,topRight)" , box.nFields() == 2 ); @@ -2133,7 +2288,7 @@ namespace mongo { _fudge = _g->_error; _wantLen = _fudge + std::max( ( _want._max._x - _want._min._x ) , - ( _want._max._y - _want._min._y ) ); + ( _want._max._y - _want._min._y ) ) / 2; ok(); } @@ -2171,39 +2326,14 @@ namespace mongo { return cur.intersects( _want ); } - virtual bool checkDistance( const GeoKeyNode& node, double& d ) { - - GeoHash h( node._key.firstElement() ); - Point approxPt( _g, h ); - - bool approxInside = _want.inside( approxPt, _fudge ); + virtual KeyResult approxKeyCheck( const Point& p, double& d ) { + if( _want.onBoundary( p, _fudge ) ) return BORDER; + else return _want.inside( p, _fudge ) ? GOOD : BAD; - if( approxInside && _want.onBoundary( approxPt, _fudge ) ) { - - // Do exact check - vector< BSONObj > locs; - _g->getKeys( node.recordLoc.obj(), locs ); - - for( vector< BSONObj >::iterator i = locs.begin(); i != locs.end(); ++i ) { - if( _want.inside( Point( *i ) ) ) { - - GEODEBUG( "found exact point : " << _want.toString() - << " exact point : " << Point( *i ).toString() - << " approx point : " << approxPt.toString() - << " because of error: " << _fudge ); - - return true; - } - } - - return false; - } - - GEODEBUG( "checking point : " << _want.toString() - << " point: " << approxPt.toString() - << " in : " << _want.inside( approxPt, _fudge ) ); + } - return approxInside; + virtual bool exactDocCheck( const Point& p, double& d ){ + return _want.inside( p ); } Box _want; @@ -2218,7 +2348,7 @@ namespace mongo { public: GeoPolygonBrowse( const Geo2dType* g , const BSONObj& polyPoints , - BSONObj filter = BSONObj() ) : GeoBrowse( g , "polygon" , filter ) { + BSONObj filter = BSONObj(), bool uniqueDocs = true ) : GeoBrowse( g , "polygon" , filter, uniqueDocs ) { GEODEBUG( "In Polygon" ) @@ -2233,7 +2363,7 @@ namespace mongo { uassert( 14030, "polygon must be defined by three points or more", _poly.size() >= 3 ); _bounds = _poly.bounds(); - _maxDim = _bounds.maxDim(); + _maxDim = _g->_error + _bounds.maxDim() / 2; ok(); } @@ -2253,51 +2383,17 @@ namespace mongo { return cur.intersects( _bounds ); } - virtual bool checkDistance( const GeoKeyNode& node, double& d ) { - - GeoHash h( node._key.firstElement(), _g->_bits ); - Point p( _g, h ); + virtual KeyResult approxKeyCheck( const Point& p, double& d ) { int in = _poly.contains( p, _g->_error ); - if( in != 0 ) { - - if ( in > 0 ) { - GEODEBUG( "Point: [" << p._x << ", " << p._y << "] approx in polygon" ); - } - else { - GEODEBUG( "Point: [" << p._x << ", " << p._y << "] approx not in polygon" ); - } - - if( in != 0 ) return in > 0; - } - - // Do exact check, since to approximate 
check was inconclusive - vector< BSONObj > locs; - _g->getKeys( node.recordLoc.obj(), locs ); - - for( vector< BSONObj >::iterator i = locs.begin(); i != locs.end(); ++i ) { - - Point p( *i ); - // Ignore all points not hashed to the current value - // This implicitly assumes hashing is less costly than the polygon check, which - // may or may not be true. - if( _g->hash( p ) != h ) continue; + if( in == 0 ) return BORDER; + else return in > 0 ? GOOD : BAD; - // Use the point in polygon algorithm to see if the point - // is contained in the polygon. - bool in = _poly.contains( p ); - if ( in ) { - GEODEBUG( "Point: [" << p._x << ", " << p._y << "] exactly in polygon" ); - } - else { - GEODEBUG( "Point: [" << p._x << ", " << p._y << "] exactly not in polygon" ); - } - if( in ) return in; - - } + } - return false; + virtual bool exactDocCheck( const Point& p, double& d ){ + return _poly.contains( p ); } private: @@ -2324,7 +2420,7 @@ namespace mongo { if ( e.type() == Array ) { // If we get an array query, assume it is a location, and do a $within { $center : [[x, y], 0] } search - shared_ptr<Cursor> c( new GeoCircleBrowse( this , BSON( "0" << e.embeddedObjectUserCheck() << "1" << 0 ), query.filterFieldsUndotted( BSON( _geo << "" ), false ) ) ); + shared_ptr<Cursor> c( new GeoCircleBrowse( this , BSON( "0" << e.embeddedObjectUserCheck() << "1" << 0 ), query.filterFieldsUndotted( BSON( _geo << "" ), false ), "$center", true ) ); return c; } else if ( e.type() == Object ) { @@ -2364,33 +2460,44 @@ namespace mongo { if ( e.isNumber() ) maxDistance = e.numberDouble(); } - shared_ptr<GeoSearch> s( new GeoSearch( this , Point( e ) , numWanted , query , maxDistance, type ) ); + + bool uniqueDocs = false; + if( ! n["$uniqueDocs"].eoo() ) uniqueDocs = n["$uniqueDocs"].trueValue(); + + shared_ptr<GeoSearch> s( new GeoSearch( this , Point( e ) , numWanted , query , maxDistance, type, uniqueDocs ) ); s->exec(); shared_ptr<Cursor> c; c.reset( new GeoSearchCursor( s ) ); return c; } case BSONObj::opWITHIN: { + e = e.embeddedObject().firstElement(); uassert( 13057 , "$within has to take an object or array" , e.isABSONObj() ); + + BSONObj context = e.embeddedObject(); e = e.embeddedObject().firstElement(); string type = e.fieldName(); + + bool uniqueDocs = true; + if( ! context["$uniqueDocs"].eoo() ) uniqueDocs = context["$uniqueDocs"].trueValue(); + if ( startsWith(type, "$center") ) { uassert( 13059 , "$center has to take an object or array" , e.isABSONObj() ); - shared_ptr<Cursor> c( new GeoCircleBrowse( this , e.embeddedObjectUserCheck() , query , type) ); + shared_ptr<Cursor> c( new GeoCircleBrowse( this , e.embeddedObjectUserCheck() , query , type, uniqueDocs ) ); return c; } else if ( type == "$box" ) { uassert( 13065 , "$box has to take an object or array" , e.isABSONObj() ); - shared_ptr<Cursor> c( new GeoBoxBrowse( this , e.embeddedObjectUserCheck() , query ) ); + shared_ptr<Cursor> c( new GeoBoxBrowse( this , e.embeddedObjectUserCheck() , query, uniqueDocs ) ); return c; } else if ( startsWith( type, "$poly" ) ) { uassert( 14029 , "$polygon has to take an object or array" , e.isABSONObj() ); - shared_ptr<Cursor> c( new GeoPolygonBrowse( this , e.embeddedObjectUserCheck() , query ) ); + shared_ptr<Cursor> c( new GeoPolygonBrowse( this , e.embeddedObjectUserCheck() , query, uniqueDocs ) ); return c; } - throw UserException( 13058 , (string)"unknown $with type: " + type ); + throw UserException( 13058 , (string)"unknown $within type: " + type ); } default: // Otherwise... 
assume the object defines a point, and we want to do a zero-radius $within $center @@ -2414,7 +2521,7 @@ namespace mongo { bool slaveOk() const { return true; } void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/Geospatial+Indexing#GeospatialIndexing-geoNearCommand"; } bool slaveOverrideOk() { return true; } - bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { string ns = dbname + "." + cmdObj.firstElement().valuestr(); NamespaceDetails * d = nsdetails( ns.c_str() ); @@ -2450,6 +2557,12 @@ namespace mongo { assert( numWanted >= 0 ); } + bool uniqueDocs = false; + if( ! cmdObj["uniqueDocs"].eoo() ) uniqueDocs = cmdObj["uniqueDocs"].trueValue(); + + bool includeLocs = false; + if( ! cmdObj["includeLocs"].eoo() ) includeLocs = cmdObj["includeLocs"].trueValue(); + uassert(13046, "'near' param missing/invalid", !cmdObj["near"].eoo()); const Point n( cmdObj["near"] ); result.append( "near" , g->_tohash( cmdObj["near"] ).toString() ); @@ -2466,7 +2579,7 @@ namespace mongo { if ( cmdObj["spherical"].trueValue() ) type = GEO_SPHERE; - GeoSearch gs( g , n , numWanted , filter , maxDistance , type ); + GeoSearch gs( g , n , numWanted , filter , maxDistance , type, uniqueDocs, true ); if ( cmdObj["start"].type() == String) { GeoHash start ((string) cmdObj["start"].valuestr()); @@ -2486,11 +2599,12 @@ namespace mongo { for ( GeoHopper::Holder::iterator i=gs._points.begin(); i!=gs._points.end(); i++ ) { const GeoPoint& p = *i; - double dis = distanceMultiplier * p._exactDistance; + double dis = distanceMultiplier * p.distance(); totalDistance += dis; BSONObjBuilder bb( arr.subobjStart( BSONObjBuilder::numStr( x++ ) ) ); bb.append( "dis" , dis ); + if( includeLocs ) bb.append( "loc" , p._pt ); bb.append( "obj" , p._o ); bb.done(); } @@ -2516,7 +2630,7 @@ namespace mongo { virtual LockType locktype() const { return READ; } bool slaveOk() const { return true; } bool slaveOverrideOk() { return true; } - bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { string ns = dbname + "." + cmdObj.firstElement().valuestr(); NamespaceDetails * d = nsdetails( ns.c_str() ); @@ -2571,4 +2685,248 @@ namespace mongo { } geoWalkCmd; + struct GeoUnitTest : public UnitTest { + + int round( double d ) { + return (int)(.5+(d*1000)); + } + +#define GEOHEQ(a,b) if ( a.toString() != b ){ cout << "[" << a.toString() << "] != [" << b << "]" << endl; assert( a == GeoHash(b) ); } + + void run() { + assert( ! GeoHash::isBitSet( 0 , 0 ) ); + assert( ! 
GeoHash::isBitSet( 0 , 31 ) ); + assert( GeoHash::isBitSet( 1 , 31 ) ); + + IndexSpec i( BSON( "loc" << "2d" ) ); + Geo2dType g( &geo2dplugin , &i ); + { + double x = 73.01212; + double y = 41.352964; + BSONObj in = BSON( "x" << x << "y" << y ); + GeoHash h = g._hash( in ); + BSONObj out = g._unhash( h ); + assert( round(x) == round( out["x"].number() ) ); + assert( round(y) == round( out["y"].number() ) ); + assert( round( in["x"].number() ) == round( out["x"].number() ) ); + assert( round( in["y"].number() ) == round( out["y"].number() ) ); + } + + { + double x = -73.01212; + double y = 41.352964; + BSONObj in = BSON( "x" << x << "y" << y ); + GeoHash h = g._hash( in ); + BSONObj out = g._unhash( h ); + assert( round(x) == round( out["x"].number() ) ); + assert( round(y) == round( out["y"].number() ) ); + assert( round( in["x"].number() ) == round( out["x"].number() ) ); + assert( round( in["y"].number() ) == round( out["y"].number() ) ); + } + + { + GeoHash h( "0000" ); + h.move( 0 , 1 ); + GEOHEQ( h , "0001" ); + h.move( 0 , -1 ); + GEOHEQ( h , "0000" ); + + h.init( "0001" ); + h.move( 0 , 1 ); + GEOHEQ( h , "0100" ); + h.move( 0 , -1 ); + GEOHEQ( h , "0001" ); + + + h.init( "0000" ); + h.move( 1 , 0 ); + GEOHEQ( h , "0010" ); + } + + { + Box b( 5 , 5 , 2 ); + assert( "(5,5) -->> (7,7)" == b.toString() ); + } + + { + GeoHash a = g.hash( 1 , 1 ); + GeoHash b = g.hash( 4 , 5 ); + assert( 5 == (int)(g.distance( a , b ) ) ); + a = g.hash( 50 , 50 ); + b = g.hash( 42 , 44 ); + assert( round(10) == round(g.distance( a , b )) ); + } + + { + GeoHash x("0000"); + assert( 0 == x.getHash() ); + x.init( 0 , 1 , 32 ); + GEOHEQ( x , "0000000000000000000000000000000000000000000000000000000000000001" ) + + assert( GeoHash( "1100").hasPrefix( GeoHash( "11" ) ) ); + assert( ! GeoHash( "1000").hasPrefix( GeoHash( "11" ) ) ); + } + + { + GeoHash x("1010"); + GEOHEQ( x , "1010" ); + GeoHash y = x + "01"; + GEOHEQ( y , "101001" ); + } + + { + + GeoHash a = g.hash( 5 , 5 ); + GeoHash b = g.hash( 5 , 7 ); + GeoHash c = g.hash( 100 , 100 ); + /* + cout << "a: " << a << endl; + cout << "b: " << b << endl; + cout << "c: " << c << endl; + + cout << "a: " << a.toStringHex1() << endl; + cout << "b: " << b.toStringHex1() << endl; + cout << "c: " << c.toStringHex1() << endl; + */ + BSONObj oa = a.wrap(); + BSONObj ob = b.wrap(); + BSONObj oc = c.wrap(); + /* + cout << "a: " << oa.hexDump() << endl; + cout << "b: " << ob.hexDump() << endl; + cout << "c: " << oc.hexDump() << endl; + */ + assert( oa.woCompare( ob ) < 0 ); + assert( oa.woCompare( oc ) < 0 ); + + } + + { + GeoHash x( "000000" ); + x.move( -1 , 0 ); + GEOHEQ( x , "101010" ); + x.move( 1 , -1 ); + GEOHEQ( x , "010101" ); + x.move( 0 , 1 ); + GEOHEQ( x , "000000" ); + } + + { + GeoHash prefix( "110011000000" ); + GeoHash entry( "1100110000011100000111000001110000011100000111000001000000000000" ); + assert( ! entry.hasPrefix( prefix ) ); + + entry = GeoHash("1100110000001100000111000001110000011100000111000001000000000000"); + assert( entry.toString().find( prefix.toString() ) == 0 ); + assert( entry.hasPrefix( GeoHash( "1100" ) ) ); + assert( entry.hasPrefix( prefix ) ); + } + + { + GeoHash a = g.hash( 50 , 50 ); + GeoHash b = g.hash( 48 , 54 ); + assert( round( 4.47214 ) == round( g.distance( a , b ) ) ); + } + + + { + Box b( Point( 29.762283 , -95.364271 ) , Point( 29.764283000000002 , -95.36227099999999 ) ); + assert( b.inside( 29.763 , -95.363 ) ); + assert( ! b.inside( 32.9570255 , -96.1082497 ) ); + assert( ! 
b.inside( 32.9570255 , -96.1082497 , .01 ) ); + } + + { + GeoHash a( "11001111" ); + assert( GeoHash( "11" ) == a.commonPrefix( GeoHash("11") ) ); + assert( GeoHash( "11" ) == a.commonPrefix( GeoHash("11110000") ) ); + } + + { + int N = 10000; + { + Timer t; + for ( int i=0; i<N; i++ ) { + unsigned x = (unsigned)rand(); + unsigned y = (unsigned)rand(); + GeoHash h( x , y ); + unsigned a,b; + h.unhash_slow( a,b ); + assert( a == x ); + assert( b == y ); + } + //cout << "slow: " << t.millis() << endl; + } + + { + Timer t; + for ( int i=0; i<N; i++ ) { + unsigned x = (unsigned)rand(); + unsigned y = (unsigned)rand(); + GeoHash h( x , y ); + unsigned a,b; + h.unhash_fast( a,b ); + assert( a == x ); + assert( b == y ); + } + //cout << "fast: " << t.millis() << endl; + } + + } + + { + // see http://en.wikipedia.org/wiki/Great-circle_distance#Worked_example + + { + Point BNA (-86.67, 36.12); + Point LAX (-118.40, 33.94); + + double dist1 = spheredist_deg(BNA, LAX); + double dist2 = spheredist_deg(LAX, BNA); + + // target is 0.45306 + assert( 0.45305 <= dist1 && dist1 <= 0.45307 ); + assert( 0.45305 <= dist2 && dist2 <= 0.45307 ); + } + { + Point BNA (-1.5127, 0.6304); + Point LAX (-2.0665, 0.5924); + + double dist1 = spheredist_rad(BNA, LAX); + double dist2 = spheredist_rad(LAX, BNA); + + // target is 0.45306 + assert( 0.45305 <= dist1 && dist1 <= 0.45307 ); + assert( 0.45305 <= dist2 && dist2 <= 0.45307 ); + } + { + Point JFK (-73.77694444, 40.63861111 ); + Point LAX (-118.40, 33.94); + + double dist = spheredist_deg(JFK, LAX) * EARTH_RADIUS_MILES; + assert( dist > 2469 && dist < 2470 ); + } + + { + Point BNA (-86.67, 36.12); + Point LAX (-118.40, 33.94); + Point JFK (-73.77694444, 40.63861111 ); + assert( spheredist_deg(BNA, BNA) < 1e-6); + assert( spheredist_deg(LAX, LAX) < 1e-6); + assert( spheredist_deg(JFK, JFK) < 1e-6); + + Point zero (0, 0); + Point antizero (0,-180); + + // these were known to cause NaN + assert( spheredist_deg(zero, zero) < 1e-6); + assert( fabs(M_PI-spheredist_deg(zero, antizero)) < 1e-6); + assert( fabs(M_PI-spheredist_deg(antizero, zero)) < 1e-6); + } + } + } + } geoUnitTest; + + } + diff --git a/db/geo/core.h b/db/geo/core.h index 74f4b6e8269..b77997844f2 100644 --- a/db/geo/core.h +++ b/db/geo/core.h @@ -278,14 +278,19 @@ namespace mongo { return *this; } - bool operator==(const GeoHash& h ) { + bool operator==(const GeoHash& h ) const { return _hash == h._hash && _bits == h._bits; } - bool operator!=(const GeoHash& h ) { + bool operator!=(const GeoHash& h ) const { return !( *this == h ); } + bool operator<(const GeoHash& h ) const { + if( _hash != h._hash ) return _hash < h._hash; + return _bits < h._bits; + } + GeoHash& operator+=( const char * s ) { unsigned pos = _bits * 2; _bits += strlen(s) / 2; diff --git a/db/geo/haystack.cpp b/db/geo/haystack.cpp index fd6b2392d6a..a5dd478f625 100644 --- a/db/geo/haystack.cpp +++ b/db/geo/haystack.cpp @@ -264,7 +264,7 @@ namespace mongo { virtual LockType locktype() const { return READ; } bool slaveOk() const { return true; } bool slaveOverrideOk() const { return true; } - bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { string ns = dbname + "." 
+ cmdObj.firstElement().valuestr(); diff --git a/db/index.cpp b/db/index.cpp index 8aebef45e8e..67a0d44e444 100644 --- a/db/index.cpp +++ b/db/index.cpp @@ -27,11 +27,6 @@ namespace mongo { - /** old (<= v1.8) : 0 - 1 is new version - */ - const int DefaultIndexVersionNumber = 1; - template< class V > class IndexInterfaceImpl : public IndexInterface { public: diff --git a/db/index.h b/db/index.h index debe2aa9c26..54b06394435 100644 --- a/db/index.h +++ b/db/index.h @@ -150,14 +150,18 @@ namespace mongo { return io.getStringField("ns"); } - int version() const { - BSONElement e = info.obj()["v"]; + static int versionForIndexObj( const BSONObj &obj ) { + BSONElement e = obj["v"]; if( e.type() == NumberInt ) return e._numberInt(); // should normally be an int. this is for backward compatibility int v = e.numberInt(); uassert(14802, "index v field should be Integer type", v == 0); - return v; + return v; + } + + int version() const { + return versionForIndexObj( info.obj() ); } /** @return true if index has unique constraint */ diff --git a/db/indexkey.cpp b/db/indexkey.cpp index cc2cd43daf5..6d6fcc58cae 100644 --- a/db/indexkey.cpp +++ b/db/indexkey.cpp @@ -22,9 +22,15 @@ #include "btree.h" #include "ops/query.h" #include "background.h" +#include "../util/text.h" namespace mongo { + /** old (<= v1.8) : 0 + 1 is new version + */ + const int DefaultIndexVersionNumber = 1; + map<string,IndexPlugin*> * IndexPlugin::_plugins; IndexType::IndexType( const IndexPlugin * plugin , const IndexSpec * spec ) @@ -100,6 +106,14 @@ namespace mongo { } { + // _undefinedElt + BSONObjBuilder b; + b.appendUndefined( "" ); + _undefinedObj = b.obj(); + _undefinedElt = _undefinedObj.firstElement(); + } + + { // handle plugins string pluginName = IndexPlugin::findPluginName( keyPattern ); if ( pluginName.size() ) { @@ -116,131 +130,289 @@ namespace mongo { _finishedInit = true; } - - void IndexSpec::getKeys( const BSONObj &obj, BSONObjSet &keys ) const { - if ( _indexType.get() ) { //plugin (eg geo) - _indexType->getKeys( obj , keys ); - return; - } - vector<const char*> fieldNames( _fieldNames ); - vector<BSONElement> fixed( _fixed ); - _getKeys( fieldNames , fixed , obj, keys ); - if ( keys.empty() && ! 
_sparse ) - keys.insert( _nullKey ); + void assertParallelArrays( const char *first, const char *second ) { + stringstream ss; + ss << "cannot index parallel arrays [" << first << "] [" << second << "]"; + uasserted( 10088 , ss.str() ); } - - void IndexSpec::_getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSet &keys ) const { - BSONElement arrElt; - unsigned arrIdx = ~0; - int numNotFound = 0; - - for( unsigned i = 0; i < fieldNames.size(); ++i ) { - if ( *fieldNames[ i ] == '\0' ) - continue; - - BSONElement e = obj.getFieldDottedOrArray( fieldNames[ i ] ); - - if ( e.eoo() ) { - e = _nullElt; // no matching field - numNotFound++; + + class KeyGeneratorV0 { + public: + KeyGeneratorV0( const IndexSpec &spec ) : _spec( spec ) {} + + void getKeys( const BSONObj &obj, BSONObjSet &keys ) const { + if ( _spec._indexType.get() ) { //plugin (eg geo) + _spec._indexType->getKeys( obj , keys ); + return; } - - if ( e.type() != Array ) - fieldNames[ i ] = ""; // no matching field or non-array match - - if ( *fieldNames[ i ] == '\0' ) - fixed[ i ] = e; // no need for further object expansion (though array expansion still possible) - - if ( e.type() == Array && arrElt.eoo() ) { // we only expand arrays on a single path -- track the path here - arrIdx = i; - arrElt = e; + vector<const char*> fieldNames( _spec._fieldNames ); + vector<BSONElement> fixed( _spec._fixed ); + _getKeys( fieldNames , fixed , obj, keys ); + if ( keys.empty() && ! _spec._sparse ) + keys.insert( _spec._nullKey ); + } + + private: + void _getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSet &keys ) const { + BSONElement arrElt; + unsigned arrIdx = ~0; + int numNotFound = 0; + + for( unsigned i = 0; i < fieldNames.size(); ++i ) { + if ( *fieldNames[ i ] == '\0' ) + continue; + + BSONElement e = obj.getFieldDottedOrArray( fieldNames[ i ] ); + + if ( e.eoo() ) { + e = _spec._nullElt; // no matching field + numNotFound++; + } + + if ( e.type() != Array ) + fieldNames[ i ] = ""; // no matching field or non-array match + + if ( *fieldNames[ i ] == '\0' ) + fixed[ i ] = e; // no need for further object expansion (though array expansion still possible) + + if ( e.type() == Array && arrElt.eoo() ) { // we only expand arrays on a single path -- track the path here + arrIdx = i; + arrElt = e; + } + + // enforce single array path here + if ( e.type() == Array && e.rawdata() != arrElt.rawdata() ) { + assertParallelArrays( e.fieldName(), arrElt.fieldName() ); + } } - - // enforce single array path here - if ( e.type() == Array && e.rawdata() != arrElt.rawdata() ) { - stringstream ss; - ss << "cannot index parallel arrays [" << e.fieldName() << "] [" << arrElt.fieldName() << "]"; - uasserted( 10088 , ss.str() ); + + bool allFound = true; // have we found elements for all field names in the key spec? + for( vector<const char*>::const_iterator i = fieldNames.begin(); i != fieldNames.end(); ++i ) { + if ( **i != '\0' ) { + allFound = false; + break; + } } - } - - bool allFound = true; // have we found elements for all field names in the key spec? 
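The expansion rule both key generators implement can be hand-checked: for an { a : 1, b : 1 } index over { a : [ 1, 2 ], b : 3 }, the single array field is cross-producted with the fixed fields, yielding keys { "" : 1, "" : 3 } and { "" : 2, "" : 3 }, while a second array path would trip the 10088 parallel-arrays assertion above. A toy, self-contained model of that cross-product (std types only, not the tree's BSON machinery):

    #include <cassert>
    #include <cstdio>
    #include <utility>
    #include <vector>

    int main() {
        // a : [ 1, 2 ] is the one field allowed to be an array; b : 3 is fixed.
        int arr[] = { 1, 2 };
        int fixedB = 3;
        std::vector< std::pair<int,int> > keys;
        // One key per array element, pairing it with the fixed value.
        for ( unsigned i = 0; i < sizeof( arr ) / sizeof( arr[0] ); ++i )
            keys.push_back( std::make_pair( arr[i], fixedB ) );
        assert( keys.size() == 2 );   // { "":1, "":3 } and { "":2, "":3 }
        printf( "generated %u keys\n", (unsigned)keys.size() );
        return 0;
    }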
- for( vector<const char*>::const_iterator i = fieldNames.begin(); i != fieldNames.end(); ++i ) { - if ( **i != '\0' ) { - allFound = false; - break; + + if ( _spec._sparse && numNotFound == _spec._nFields ) { + // we didn't find any fields + // so we're not going to index this document + return; } - } - - if ( _sparse && numNotFound == _nFields ) { - // we didn't find any fields - // so we're not going to index this document - return; - } - - bool insertArrayNull = false; - - if ( allFound ) { - if ( arrElt.eoo() ) { - // no terminal array element to expand - BSONObjBuilder b(_sizeTracker); - for( vector< BSONElement >::iterator i = fixed.begin(); i != fixed.end(); ++i ) - b.appendAs( *i, "" ); - keys.insert( b.obj() ); + + bool insertArrayNull = false; + + if ( allFound ) { + if ( arrElt.eoo() ) { + // no terminal array element to expand + BSONObjBuilder b(_spec._sizeTracker); + for( vector< BSONElement >::iterator i = fixed.begin(); i != fixed.end(); ++i ) + b.appendAs( *i, "" ); + keys.insert( b.obj() ); + } + else { + // terminal array element to expand, so generate all keys + BSONObjIterator i( arrElt.embeddedObject() ); + if ( i.more() ) { + while( i.more() ) { + BSONObjBuilder b(_spec._sizeTracker); + for( unsigned j = 0; j < fixed.size(); ++j ) { + if ( j == arrIdx ) + b.appendAs( i.next(), "" ); + else + b.appendAs( fixed[ j ], "" ); + } + keys.insert( b.obj() ); + } + } + else if ( fixed.size() > 1 ) { + insertArrayNull = true; + } + } } else { - // terminal array element to expand, so generate all keys + // nonterminal array element to expand, so recurse + assert( !arrElt.eoo() ); BSONObjIterator i( arrElt.embeddedObject() ); if ( i.more() ) { while( i.more() ) { - BSONObjBuilder b(_sizeTracker); - for( unsigned j = 0; j < fixed.size(); ++j ) { - if ( j == arrIdx ) - b.appendAs( i.next(), "" ); - else - b.appendAs( fixed[ j ], "" ); + BSONElement e = i.next(); + if ( e.type() == Object ) { + _getKeys( fieldNames, fixed, e.embeddedObject(), keys ); } - keys.insert( b.obj() ); } } - else if ( fixed.size() > 1 ) { + else { insertArrayNull = true; } } - } - else { - // nonterminal array element to expand, so recurse - assert( !arrElt.eoo() ); - BSONObjIterator i( arrElt.embeddedObject() ); - if ( i.more() ) { - while( i.more() ) { - BSONElement e = i.next(); - if ( e.type() == Object ) { - _getKeys( fieldNames, fixed, e.embeddedObject(), keys ); + + if ( insertArrayNull ) { + // x : [] - need to insert undefined + BSONObjBuilder b(_spec._sizeTracker); + for( unsigned j = 0; j < fixed.size(); ++j ) { + if ( j == arrIdx ) { + b.appendUndefined( "" ); + } + else { + BSONElement e = fixed[j]; + if ( e.eoo() ) + b.appendNull( "" ); + else + b.appendAs( e , "" ); } } + keys.insert( b.obj() ); } - else { - insertArrayNull = true; + } + + const IndexSpec &_spec; + }; + + class KeyGeneratorV1 { + public: + KeyGeneratorV1( const IndexSpec &spec ) : _spec( spec ) {} + + void getKeys( const BSONObj &obj, BSONObjSet &keys ) const { + if ( _spec._indexType.get() ) { //plugin (eg geo) + _spec._indexType->getKeys( obj , keys ); + return; + } + vector<const char*> fieldNames( _spec._fieldNames ); + vector<BSONElement> fixed( _spec._fixed ); + _getKeys( fieldNames , fixed , obj, keys ); + if ( keys.empty() && ! _spec._sparse ) + keys.insert( _spec._nullKey ); + } + + private: + /** + * @param arrayNestedArray - set if the returned element is an array nested directly within arr. 
+ */ + BSONElement extractNextElement( const BSONObj &obj, const BSONObj &arr, const char *&field, bool &arrayNestedArray ) const { + string firstField = mongoutils::str::before( field, '.' ); + bool haveObjField = !obj.getField( firstField ).eoo(); + BSONElement arrField = arr.getField( firstField ); + bool haveArrField = !arrField.eoo(); + + // An index component field name cannot exist in both a document array and one of that array's children. + uassert( 15855 , "Parallel references while expanding indexed field in array", !haveObjField || !haveArrField ); + + arrayNestedArray = false; + if ( haveObjField ) { + return obj.getFieldDottedOrArray( field ); + } + else if ( haveArrField ) { + if ( arrField.type() == Array ) { + arrayNestedArray = true; + } + return arr.getFieldDottedOrArray( field ); } + return BSONElement(); } - - if ( insertArrayNull ) { - // x : [] - need to insert undefined - BSONObjBuilder b(_sizeTracker); - for( unsigned j = 0; j < fixed.size(); ++j ) { - if ( j == arrIdx ) { - b.appendUndefined( "" ); + + void _getKeysArrEltFixed( vector<const char*> &fieldNames , vector<BSONElement> &fixed , const BSONElement &arrEntry, BSONObjSet &keys, int numNotFound, const BSONElement &arrObjElt, const set< unsigned > &arrIdxs, bool mayExpandArrayUnembedded ) const { + // set up any terminal array values + for( set<unsigned>::const_iterator j = arrIdxs.begin(); j != arrIdxs.end(); ++j ) { + if ( *fieldNames[ *j ] == '\0' ) { + fixed[ *j ] = mayExpandArrayUnembedded ? arrEntry : arrObjElt; + } + } + // recurse + _getKeys( fieldNames, fixed, ( arrEntry.type() == Object ) ? arrEntry.embeddedObject() : BSONObj(), keys, numNotFound, arrObjElt.embeddedObject() ); + } + + /** + * @param fieldNames - fields to index, may be postfixes in recursive calls + * @param fixed - values that have already been identified for their index fields + * @param obj - object from which keys should be extracted, based on names in fieldNames + * @param keys - set where index keys are written + * @param numNotFound - number of index fields that have already been identified as missing + * @param array - array from which keys should be extracted, based on names in fieldNames + * If obj and array are both nonempty, obj will be one of the elements of array. + */ + void _getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSet &keys, int numNotFound = 0, const BSONObj &array = BSONObj() ) const { + BSONElement arrElt; + set<unsigned> arrIdxs; + bool mayExpandArrayUnembedded = true; + for( unsigned i = 0; i < fieldNames.size(); ++i ) { + if ( *fieldNames[ i ] == '\0' ) { + continue; + } + + bool arrayNestedArray; + // Extract element matching fieldName[ i ] from object xor array. 
+ BSONElement e = extractNextElement( obj, array, fieldNames[ i ], arrayNestedArray ); + + if ( e.eoo() ) { + // if field not present, set to null + fixed[ i ] = _spec._nullElt; + // done expanding this field name + fieldNames[ i ] = ""; + numNotFound++; + } + else if ( e.type() == Array ) { + arrIdxs.insert( i ); + if ( arrElt.eoo() ) { + // we only expand arrays on a single path -- track the path here + arrElt = e; + } + else if ( e.rawdata() != arrElt.rawdata() ) { + // enforce single array path here + assertParallelArrays( e.fieldName(), arrElt.fieldName() ); + } + if ( arrayNestedArray ) { + mayExpandArrayUnembedded = false; + } } else { - BSONElement e = fixed[j]; - if ( e.eoo() ) - b.appendNull( "" ); - else - b.appendAs( e , "" ); + // not an array - no need for further expansion + fixed[ i ] = e; + } + } + + if ( arrElt.eoo() ) { + // No array, so generate a single key. + if ( _spec._sparse && numNotFound == _spec._nFields ) { + return; + } + BSONObjBuilder b(_spec._sizeTracker); + for( vector< BSONElement >::iterator i = fixed.begin(); i != fixed.end(); ++i ) { + b.appendAs( *i, "" ); + } + keys.insert( b.obj() ); + } + else if ( arrElt.embeddedObject().firstElement().eoo() ) { + // Empty array, so set matching fields to undefined. + _getKeysArrEltFixed( fieldNames, fixed, _spec._undefinedElt, keys, numNotFound, arrElt, arrIdxs, true ); + } + else { + // Non empty array that can be expanded, so generate a key for each member. + BSONObj arrObj = arrElt.embeddedObject(); + BSONObjIterator i( arrObj ); + while( i.more() ) { + _getKeysArrEltFixed( fieldNames, fixed, i.next(), keys, numNotFound, arrElt, arrIdxs, mayExpandArrayUnembedded ); } } - keys.insert( b.obj() ); + } + + const IndexSpec &_spec; + }; + + void IndexSpec::getKeys( const BSONObj &obj, BSONObjSet &keys ) const { + switch( indexVersion() ) { + case 0: { + KeyGeneratorV0 g( *this ); + g.getKeys( obj, keys ); + break; + } + case 1: { + KeyGeneratorV1 g( *this ); + g.getKeys( obj, keys ); + break; + } + default: + massert( 15869, "Invalid index version for key generation.", false ); } } @@ -275,6 +447,13 @@ namespace mongo { IndexSuitability IndexType::suitability( const BSONObj& query , const BSONObj& order ) const { return _spec->_suitability( query , order ); } + + int IndexSpec::indexVersion() const { + if ( !info.hasField( "v" ) ) { + return DefaultIndexVersionNumber; + } + return IndexDetails::versionForIndexObj( info ); + } bool IndexType::scanAndOrderRequired( const BSONObj& query , const BSONObj& order ) const { return ! 
order.isEmpty(); diff --git a/db/indexkey.h b/db/indexkey.h index 4a755f8a4e8..c04cd6396f6 100644 --- a/db/indexkey.h +++ b/db/indexkey.h @@ -25,6 +25,8 @@ namespace mongo { + extern const int DefaultIndexVersionNumber; + class Cursor; class IndexSpec; class IndexType; // TODO: this name sucks @@ -161,16 +163,21 @@ namespace mongo { protected: + int indexVersion() const; + IndexSuitability _suitability( const BSONObj& query , const BSONObj& order ) const ; - void _getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSet &keys ) const; - BSONSizeTracker _sizeTracker; vector<const char*> _fieldNames; vector<BSONElement> _fixed; + BSONObj _nullKey; // a full key with all fields null BSONObj _nullObj; // only used for _nullElt BSONElement _nullElt; // jstNull + + BSONObj _undefinedObj; // only used for _undefinedElt + BSONElement _undefinedElt; // undefined + int _nFields; // number of fields in the index bool _sparse; // if the index is sparse shared_ptr<IndexType> _indexType; @@ -179,6 +186,8 @@ namespace mongo { void _init(); friend class IndexType; + friend class KeyGeneratorV0; + friend class KeyGeneratorV1; public: bool _finishedInit; }; diff --git a/db/instance.cpp b/db/instance.cpp index ede433d652b..971cd2e7b38 100644 --- a/db/instance.cpp +++ b/db/instance.cpp @@ -587,7 +587,7 @@ namespace mongo { } NOINLINE_DECL void insertMulti(DbMessage& d, const char *ns, const BSONObj& _js) { - const bool keepGoing = d.reservedField() & InsertOption_KeepGoing; + const bool keepGoing = d.reservedField() & InsertOption_ContinueOnError; int n = 0; BSONObj js(_js); while( 1 ) { diff --git a/db/instance.h b/db/instance.h index 2b86eb44fce..422c77d5ffa 100644 --- a/db/instance.h +++ b/db/instance.h @@ -147,6 +147,8 @@ namespace mongo { virtual ConnectionString::ConnectionType type() const { return ConnectionString::MASTER; } double getSoTimeout() const { return 0; } + + virtual bool lazySupported() const { return true; } private: static HostAndPort _clientHost; }; diff --git a/db/introspect.cpp b/db/introspect.cpp index ca65710b3fc..7e1d19ce2f3 100644 --- a/db/introspect.cpp +++ b/db/introspect.cpp @@ -40,7 +40,7 @@ namespace mongo { profileBufBuilder.reset(); BSONObjBuilder b(profileBufBuilder); b.appendDate("ts", jsTime()); - currentOp.debug().append( b ); + currentOp.debug().append( currentOp , b ); b.append("client", c.clientAddress() ); @@ -49,6 +49,26 @@ namespace mongo { BSONObj p = b.done(); + if (p.objsize() > 100*1024){ + string small = p.toString(/*isArray*/false, /*full*/false); + + warning() << "can't add full line to system.profile: " << small; + + // rebuild with limited info + BSONObjBuilder b(profileBufBuilder); + b.appendDate("ts", jsTime()); + b.append("client", c.clientAddress() ); + if ( c.getAuthenticationInfo() ) + b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) ); + + b.append("err", "profile line too large (max is 100KB)"); + if (small.size() < 100*1024){ // should be much smaller but if not don't break anything + b.append("abbreviated", small); + } + + p = b.done(); + } + // write: not replicated NamespaceDetails *d = db->namespaceIndex.details(ns); if( d ) { diff --git a/db/jsobj.cpp b/db/jsobj.cpp index 53c2329bd35..dcb77447873 100644 --- a/db/jsobj.cpp +++ b/db/jsobj.cpp @@ -45,7 +45,7 @@ BOOST_STATIC_ASSERT( sizeof(mongo::OID) == 12 ); namespace mongo { - BSONElement nullElement; + BSONElement eooElement; GENOIDLabeler GENOID; @@ -508,6 +508,12 @@ namespace mongo { } BSONObj staticNull = fromjson( 
"{'':null}" ); + BSONObj makeUndefined() { + BSONObjBuilder b; + b.appendUndefined( "" ); + return b.obj(); + } + BSONObj staticUndefined = makeUndefined(); /* well ordered compare */ int BSONObj::woSortOrder(const BSONObj& other, const BSONObj& sortKey , bool useDotted ) const { @@ -613,13 +619,13 @@ namespace mongo { } if ( sub.eoo() ) - return nullElement; - else if ( sub.type() == Array || name[0] == '\0') + return eooElement; + else if ( sub.type() == Array || name[0] == '\0' ) return sub; else if ( sub.type() == Object ) return sub.embeddedObject().getFieldDottedOrArray( name ); else - return nullElement; + return eooElement; } /** @@ -919,7 +925,7 @@ namespace mongo { c.appendRegex("x", "goo"); BSONObj p = c.done(); - assert( !o.shallowEqual( p ) ); + assert( !o.binaryEqual( p ) ); assert( o.woCompare( p ) < 0 ); } @@ -1024,7 +1030,7 @@ namespace mongo { BSONObj a = A.done(); BSONObj b = B.done(); BSONObj c = C.done(); - assert( !a.shallowEqual( b ) ); // comments on operator== + assert( !a.binaryEqual( b ) ); // comments on operator== int cmp = a.woCompare(b); assert( cmp == 0 ); cmp = a.woCompare(c); @@ -1167,13 +1173,9 @@ namespace mongo { while (l.more() && r.more()){ if (strcmp(l.next().fieldName(), r.next().fieldName())) { - PRINTFL; return false; } } - PRINT(l.more()); - PRINT(r.more()); - PRINT(l.more() || r.more()); return !(l.more() || r.more()); // false if lhs and rhs have diff nFields() } diff --git a/db/key.cpp b/db/key.cpp index ddc2d593350..648502ebf17 100644 --- a/db/key.cpp +++ b/db/key.cpp @@ -264,15 +264,17 @@ namespace mongo { if( (t & 0x78) == 0 && t != ByteArrayDeprecated ) { int len; const char * d = e.binData(len); - int code = BinDataLengthToCode[len]; - if( code >= 0 ) { - if( t >= 128 ) - t = (t-128) | 0x08; - dassert( (code&t) == 0 ); - b.appendUChar( cbindata|bits ); - b.appendUChar( code | t ); - b.appendBuf(d, len); - break; + if( len <= BinDataLenMax ) { + int code = BinDataLengthToCode[len]; + if( code >= 0 ) { + if( t >= 128 ) + t = (t-128) | 0x08; + dassert( (code&t) == 0 ); + b.appendUChar( cbindata|bits ); + b.appendUChar( code | t ); + b.appendBuf(d, len); + break; + } } } traditional(obj); diff --git a/db/matcher.cpp b/db/matcher.cpp index 23d5a7057bf..2b92d5797c3 100644 --- a/db/matcher.cpp +++ b/db/matcher.cpp @@ -64,8 +64,14 @@ namespace mongo { } ~Where() { - if ( scope.get() ) - scope->execSetup( "_mongo.readOnly = false;" , "make not read only" ); + if ( scope.get() ){ + try { + scope->execSetup( "_mongo.readOnly = false;" , "make not read only" ); + } + catch( DBException& e ){ + warning() << "javascript scope cleanup interrupted" << causedBy( e ) << endl; + } + } if ( jsScope ) { delete jsScope; @@ -148,6 +154,9 @@ namespace mongo { rm._prefix = prefix; } else { + uassert( 15882, "$elemMatch not allowed within $in", + ie.type() != Object || + ie.embeddedObject().firstElement().getGtLtOp() != BSONObj::opELEM_MATCH ); _myset->insert(ie); } } diff --git a/db/modules/mms.cpp b/db/modules/mms.cpp index 28fc225477f..40abb391dfb 100644 --- a/db/modules/mms.cpp +++ b/db/modules/mms.cpp @@ -142,7 +142,7 @@ namespace mongo { string errmsg; BSONObjBuilder sub; - if ( ! c->run( "admin.$cmd" , co , errmsg , sub , false ) ) + if ( ! 
c->run( "admin.$cmd" , co , 0 , errmsg , sub , false ) ) postData.append( cmd , errmsg ); else postData.append( cmd , sub.obj() ); diff --git a/db/mongommf.h b/db/mongommf.h index b347e4ff259..0c4e8e4a19d 100644 --- a/db/mongommf.h +++ b/db/mongommf.h @@ -75,7 +75,7 @@ namespace mongo { fileSuffixNo() is 3 if the suffix is "ns", fileSuffixNo -1 */ - RelativePath relativePath() const { + const RelativePath& relativePath() const { DEV assert( !_p._p.empty() ); return _p; } diff --git a/db/namespace.cpp b/db/namespace.cpp index 927f56b6e7b..2bc7409e56c 100644 --- a/db/namespace.cpp +++ b/db/namespace.cpp @@ -604,6 +604,17 @@ namespace mongo { } } + void NamespaceDetailsTransient::eraseForPrefix(const char *prefix) { + assertInWriteLock(); + vector< string > found; + for( ouriter i = _map.begin(); i != _map.end(); ++i ) + if ( strncmp( i->first.c_str(), prefix, strlen( prefix ) ) == 0 ) + found.push_back( i->first ); + for( vector< string >::iterator i = found.begin(); i != found.end(); ++i ) { + _map.erase(*i); + } + } + void NamespaceDetailsTransient::computeIndexKeys() { _keysComputed = true; _indexKeys.clear(); @@ -657,7 +668,7 @@ namespace mongo { // index details across commands are in cursors and nsd // transient (including query cache) so clear these. ClientCursor::invalidate( from ); - NamespaceDetailsTransient::clearForPrefix( from ); + NamespaceDetailsTransient::eraseForPrefix( from ); NamespaceDetails *details = ni->details( from ); ni->add_ns( to, *details ); diff --git a/db/namespace.h b/db/namespace.h index a1b7c2274bc..3dfb3f33767 100644 --- a/db/namespace.h +++ b/db/namespace.h @@ -454,6 +454,7 @@ namespace mongo { Can be useful as index namespaces share the same start as the regular collection. SLOW - sequential scan of all NamespaceDetailsTransient objects */ static void clearForPrefix(const char *prefix); + static void eraseForPrefix(const char *prefix); /** * @return a cursor interface to the query optimizer. The implementation may diff --git a/db/oplog.cpp b/db/oplog.cpp index 7286fd9053c..dc9db76d9d5 100644 --- a/db/oplog.cpp +++ b/db/oplog.cpp @@ -473,9 +473,9 @@ namespace mongo { return _qp.nsd()->capFirstNewRecord; } - void assertExtentNonempty( const Extent *e ) { + void wassertExtentNonempty( const Extent *e ) { // TODO ensure this requirement is clearly enforced, or fix. - massert( 14834, "empty extent found during finding start scan", !e->firstRecord.isNull() ); + wassert( !e->firstRecord.isNull() ); } DiskLoc FindingStartCursor::prevExtentFirstLoc( const DiskLoc &rec ) { @@ -488,14 +488,14 @@ namespace mongo { e = e->xprev.ext(); } if ( e->myLoc != _qp.nsd()->capExtent ) { - assertExtentNonempty( e ); + wassertExtentNonempty( e ); return e->firstRecord; } } else { if ( !e->xprev.isNull() ) { e = e->xprev.ext(); - assertExtentNonempty( e ); + wassertExtentNonempty( e ); return e->firstRecord; } } @@ -506,20 +506,30 @@ namespace mongo { shared_ptr<Cursor> c = _qp.newCursor( startLoc ); _findingStartCursor.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns()) ); } + + bool FindingStartCursor::firstDocMatchesOrEmpty() const { + shared_ptr<Cursor> c = _qp.newCursor(); + return !c->ok() || _matcher->matchesCurrent( c.get() ); + } void FindingStartCursor::init() { - // Use a ClientCursor here so we can release db mutex while scanning - // oplog (can take quite a while with large oplogs). 
- shared_ptr<Cursor> c = _qp.newReverseCursor(); - _findingStartCursor.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns(), BSONObj()) ); - _findingStartTimer.reset(); - _findingStartMode = Initial; BSONElement tsElt = _qp.originalQuery()[ "ts" ]; massert( 13044, "no ts field in query", !tsElt.eoo() ); BSONObjBuilder b; b.append( tsElt ); BSONObj tsQuery = b.obj(); _matcher.reset(new CoveredIndexMatcher(tsQuery, _qp.indexKey())); + if ( firstDocMatchesOrEmpty() ) { + _c = _qp.newCursor(); + _findingStart = false; + return; + } + // Use a ClientCursor here so we can release db mutex while scanning + // oplog (can take quite a while with large oplogs). + shared_ptr<Cursor> c = _qp.newReverseCursor(); + _findingStartCursor.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns(), BSONObj()) ); + _findingStartTimer.reset(); + _findingStartMode = Initial; } // ------------------------------------- @@ -704,7 +714,7 @@ namespace mongo { virtual void help( stringstream &help ) const { help << "internal (sharding)\n{ applyOps : [ ] , preCondition : [ { ns : ... , q : ... , res : ... } ] }"; } - virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { if ( cmdObj.firstElement().type() != Array ) { errmsg = "ops has to be an array"; diff --git a/db/oplog.h b/db/oplog.h index f87a1c85e04..79fb01b0a4d 100644 --- a/db/oplog.h +++ b/db/oplog.h @@ -118,6 +118,7 @@ namespace mongo { _findingStartCursor.reset( 0 ); } void init(); + bool firstDocMatchesOrEmpty() const; }; void pretouchOperation(const BSONObj& op); diff --git a/db/ops/query.cpp b/db/ops/query.cpp index 120382fa7d8..f13b6e5ea4b 100644 --- a/db/ops/query.cpp +++ b/db/ops/query.cpp @@ -36,6 +36,7 @@ #include "../lasterror.h" #include "../../s/d_logic.h" #include "../repl_block.h" +#include "../../server.h" namespace mongo { @@ -92,21 +93,15 @@ namespace mongo { ClientCursor::Pointer p(cursorid); ClientCursor *cc = p.c(); - int bufSize = 512; - if ( cc ) { - bufSize += sizeof( QueryResult ); - bufSize += MaxBytesToReturnToClientAtOnce; - } + int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce; BufBuilder b( bufSize ); - b.skip(sizeof(QueryResult)); - int resultFlags = ResultFlag_AwaitCapable; int start = 0; int n = 0; - if ( !cc ) { + if ( unlikely(!cc) ) { log() << "getMore: cursorid not found " << ns << " " << cursorid << endl; cursorid = 0; resultFlags = ResultFlag_CursorNotFound; @@ -420,6 +415,8 @@ namespace mongo { *_b << "indexBounds" << c->prettyIndexBounds(); + c->explainDetails( *_b ); + if ( !hint ) { *_b << "allPlans" << _a->arr(); } @@ -899,9 +896,6 @@ namespace mongo { if ( ! (explain || pq.showDiskLoc()) && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) { - //NamespaceDetails* d = nsdetails(ns); - //uassert(14820, "capped collections have no _id index by default, can only query by _id if one added", d == NULL || d->haveIdIndex() ); - bool nsFound = false; bool indexFound = false; diff --git a/db/ops/update.cpp b/db/ops/update.cpp index 3221fe0f277..d70048d2cc2 100644 --- a/db/ops/update.cpp +++ b/db/ops/update.cpp @@ -1060,11 +1060,10 @@ namespace mongo { debug.updateobj = updateobj; - /* idea with these here it to make them loop invariant for multi updates, and thus be a bit faster for that case */ - /* NOTE: when yield() is added herein, these must be refreshed after each call to yield! 
*/ + // idea with these here is to make them loop invariant for multi updates, and thus be a bit faster for that case + // The pointers may be left invalid on a failed or terminal yield recovery. NamespaceDetails *d = nsdetails(ns); // can be null if an upsert... NamespaceDetailsTransient *nsdt = &NamespaceDetailsTransient::get_w(ns); - /* end note */ auto_ptr<ModSet> mods; bool isOperatorUpdate = updateobj.firstElementFieldName()[0] == '$'; @@ -1105,6 +1104,9 @@ namespace mongo { shared_ptr< MultiCursor::CursorOp > opPtr( new UpdateOp( mods.get() && mods->hasDynamicArray() ) ); shared_ptr< MultiCursor > c( new MultiCursor( ns, patternOrig, BSONObj(), opPtr, true ) ); + d = nsdetails(ns); + nsdt = &NamespaceDetailsTransient::get_w(ns); + if( c->ok() ) { set<DiskLoc> seenObjects; MatchDetails details; @@ -1114,20 +1116,28 @@ bool atomic = c->matcher()->docMatcher().atomic(); - // ***************** - if ( cc.get() == 0 ) { - shared_ptr< Cursor > cPtr = c; - cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) ); - } - - if ( ! cc->yieldSometimes( ClientCursor::WillNeed ) ) { - cc.release(); - break; - } - if ( !c->ok() ) { - break; + if ( !atomic ) { + // ***************** + if ( cc.get() == 0 ) { + shared_ptr< Cursor > cPtr = c; + cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) ); + } + + bool didYield; + if ( ! cc->yieldSometimes( ClientCursor::WillNeed, &didYield ) ) { + cc.release(); + break; + } + if ( !c->ok() ) { + break; + } + + if ( didYield ) { + d = nsdetails(ns); + nsdt = &NamespaceDetailsTransient::get_w(ns); + } + // ***************** } - // ***************** // May have already matched in UpdateOp, but do again to get details set correctly if ( ! c->matcher()->matchesCurrent( c.get(), &details ) ) { @@ -1146,6 +1156,8 @@ if ( !c->ok() ) { break; } + d = nsdetails(ns); + nsdt = &NamespaceDetailsTransient::get_w(ns); } continue; } @@ -1276,10 +1288,11 @@ if ( !c->ok() ) { break; } + d = nsdetails(ns); + nsdt = &NamespaceDetailsTransient::get_w(ns); } - if (atomic) - getDur().commitIfNeeded(); + getDur().commitIfNeeded(); continue; } diff --git a/db/pdfile.cpp b/db/pdfile.cpp index 0b7a5b0830d..0569ba6868e 100644 --- a/db/pdfile.cpp +++ b/db/pdfile.cpp @@ -869,6 +869,7 @@ namespace mongo { result.append("ns", name.c_str()); ClientCursor::invalidate(name.c_str()); Top::global.collectionDropped( name ); + NamespaceDetailsTransient::eraseForPrefix( name.c_str() ); dropNS(name); } @@ -967,7 +968,7 @@ namespace mongo { } } - void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn) { + void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn, bool doLog ) { dassert( todelete == dl.rec() ); NamespaceDetails* d = nsdetails(ns); @@ -976,6 +977,14 @@ if ( d->capped && !cappedOK ) { uassert( 10089 , "can't remove from a capped collection" , 0 ); return; } + + BSONObj toDelete; + if ( doLog ) { + BSONElement e = dl.obj()["_id"]; + if ( e.type() ) { + toDelete = e.wrap(); + } + } /* check if any cursors point to us. if so, advance them. */ ClientCursor::aboutToDelete(dl); @@ -984,6 +993,10 @@ _deleteRecord(d, ns, todelete, dl); NamespaceDetailsTransient::get_w( ns ).notifyOfWriteOp(); + + if ( ! 
toDelete.isEmpty() ) { + logOp( "d" , ns , toDelete ); + } } @@ -1181,7 +1194,13 @@ namespace mongo { BSONObjExternalSorter::Data d = i->next(); try { - btBuilder.addKey(d.first, d.second); + if ( !dupsAllowed && dropDups ) { + LastError::Disabled led( lastError.get() ); + btBuilder.addKey(d.first, d.second); + } + else { + btBuilder.addKey(d.first, d.second); + } } catch( AssertionException& e ) { if ( dupsAllowed ) { @@ -1189,8 +1208,9 @@ namespace mongo { throw; } - if( e.interrupted() ) - throw; + if( e.interrupted() ) { + killCurrentOp.checkForInterrupt(); + } if ( ! dropDups ) throw; @@ -1276,7 +1296,7 @@ namespace mongo { log(1) << "\t fastBuildIndex dupsToDrop:" << dupsToDrop.size() << endl; for( list<DiskLoc>::iterator i = dupsToDrop.begin(); i != dupsToDrop.end(); i++ ){ - theDataFileMgr.deleteRecord( ns, i->rec(), *i, false, true ); + theDataFileMgr.deleteRecord( ns, i->rec(), *i, false, true , true ); getDur().commitIfNeeded(); } @@ -1302,18 +1322,27 @@ namespace mongo { while ( cc->ok() ) { BSONObj js = cc->current(); try { - _indexRecord(d, idxNo, js, cc->currLoc(), dupsAllowed); + { + if ( !dupsAllowed && dropDups ) { + LastError::Disabled led( lastError.get() ); + _indexRecord(d, idxNo, js, cc->currLoc(), dupsAllowed); + } + else { + _indexRecord(d, idxNo, js, cc->currLoc(), dupsAllowed); + } + } cc->advance(); } catch( AssertionException& e ) { - if( e.interrupted() ) - throw; + if( e.interrupted() ) { + killCurrentOp.checkForInterrupt(); + } if ( dropDups ) { DiskLoc toDelete = cc->currLoc(); bool ok = cc->advance(); cc->updateLocation(); - theDataFileMgr.deleteRecord( ns, toDelete.rec(), toDelete, false, true ); + theDataFileMgr.deleteRecord( ns, toDelete.rec(), toDelete, false, true , true ); if( ClientCursor::find(id, false) == 0 ) { cc.release(); if( !ok ) { diff --git a/db/pdfile.h b/db/pdfile.h index 0f45e6d337e..64dba68ca41 100644 --- a/db/pdfile.h +++ b/db/pdfile.h @@ -142,7 +142,7 @@ namespace mongo { static Record* getRecord(const DiskLoc& dl); static DeletedRecord* makeDeletedRecord(const DiskLoc& dl, int len); - void deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK = false, bool noWarn = false); + void deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK = false, bool noWarn = false, bool logOp=false); /* does not clean up indexes, etc. : just deletes the record in the pdfile. 
       use deleteRecord() to unindex */
     void _deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl);
diff --git a/db/queryoptimizer.cpp b/db/queryoptimizer.cpp
index 4173eaaa2cd..e49e9b11ecb 100644
--- a/db/queryoptimizer.cpp
+++ b/db/queryoptimizer.cpp
@@ -52,7 +52,7 @@ namespace mongo {
     QueryPlan::QueryPlan( NamespaceDetails *d, int idxNo,
-                          const FieldRangeSetPair &frsp, const FieldRangeSetPair &originalFrsp, const BSONObj &originalQuery, const BSONObj &order, const BSONObj &startKey, const BSONObj &endKey , string special ) :
+                          const FieldRangeSetPair &frsp, const FieldRangeSetPair *originalFrsp, const BSONObj &originalQuery, const BSONObj &order, const BSONObj &startKey, const BSONObj &endKey , string special ) :
         _d(d), _idxNo(idxNo),
         _frs( frsp.frsForIndex( _d, _idxNo ) ),
         _frsMulti( frsp.frsForIndex( _d, -1 ) ),
@@ -166,12 +171,17 @@ doneCheckOrder:
             _optimal = true;
         if ( exactIndexedQueryCount == _frs.nNontrivialRanges() &&
             orderFieldsUnindexed.size() == 0 &&
-            exactIndexedQueryCount == _index->keyPattern().nFields() &&
+            exactIndexedQueryCount == idxKey.nFields() &&
             exactIndexedQueryCount == _originalQuery.nFields() ) {
             _exactKeyMatch = true;
         }
         _frv.reset( new FieldRangeVector( _frs, idxSpec, _direction ) );
-        _originalFrv.reset( new FieldRangeVector( originalFrsp.frsForIndex( _d, _idxNo ), idxSpec, _direction ) );
+        if ( originalFrsp ) {
+            _originalFrv.reset( new FieldRangeVector( originalFrsp->frsForIndex( _d, _idxNo ), idxSpec, _direction ) );
+        }
+        else {
+            _originalFrv = _frv;
+        }
         if ( _startOrEndSpec ) {
             BSONObj newStart, newEnd;
             if ( !startKey.isEmpty() )
@@ -206,8 +211,25 @@ doneCheckOrder:
         }
         if ( willScanTable() ) {
-            if ( _frs.nNontrivialRanges() )
+            if ( _frs.nNontrivialRanges() ) {
                 checkTableScanAllowed( _frs.ns() );
+
+                // if we are doing a table scan on _id
+                // and it's a capped collection
+                // we warn, as it's a common user error
+                // .system. and local collections are exempt
+                if ( _d && _d->capped && _frs.range( "_id" ).nontrivial() ) {
+                    if ( cc().isSyncThread() ||
+                        str::contains( _frs.ns() , ".system." ) ||
+                        str::startsWith( _frs.ns() , "local." ) ) {
+                        // ok
+                    }
+                    else {
+                        warning() << "_id query on capped collection without an _id index, performance will be poor; collection: " << _frs.ns() << endl;
+                        //uassert( 14820, str::stream() << "doing _id query on a capped collection without an index is not allowed: " << _frs.ns() ,
+                    }
+                }
+            }
             return findTableScan( _frs.ns(), _order, startLoc );
         }
@@ -328,7 +350,7 @@ doneCheckOrder:
             massert( 10365 , errmsg, indexDetailsForRange( _frsp->ns(), errmsg, _min, _max, keyPattern ) );
         }
         NamespaceDetails *d = nsdetails(_ns);
-        _plans.push_back( QueryPlanPtr( new QueryPlan( d, d->idxNo(id), *_frsp, *_originalFrsp, _originalQuery, _order, _min, _max ) ) );
+        _plans.push_back( QueryPlanPtr( new QueryPlan( d, d->idxNo(id), *_frsp, _originalFrsp.get(), _originalQuery, _order, _min, _max ) ) );
     }
     // returns an IndexDetails * for a hint, 0 if hint is $natural.
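The constructor change above is the crux of this hunk: originalFrsp becomes a nullable pointer, and passing null now means "the original clause constraints are the same as the effective ones", letting _originalFrv simply share _frv instead of building a second FieldRangeVector. A rough standalone sketch of that share-on-null pattern (hypothetical names; std::shared_ptr standing in for the boost pointers the tree actually uses):

    #include <memory>

    struct Ranges { };  // stand-in for FieldRangeVector

    struct PlanSketch {
        std::shared_ptr<Ranges> frv;          // ranges the plan actually scans
        std::shared_ptr<Ranges> originalFrv;  // ranges before $or-clause elimination

        explicit PlanSketch(const Ranges* originalOrNull) : frv(new Ranges()) {
            if (originalOrNull) {
                // caller supplied distinct original constraints: copy them
                originalFrv.reset(new Ranges(*originalOrNull));
            }
            else {
                // null means "same as effective": share, don't copy
                originalFrv = frv;
            }
        }
    };

The non-$or path of MultiPlanScanner further down takes advantage of exactly this, passing an empty auto_ptr rather than cloning the FieldRangeSetPair it just built.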
@@ -374,7 +396,7 @@ doneCheckOrder: NamespaceDetails *d = nsdetails( ns ); if ( !d || !_frsp->matchPossible() ) { // Table scan plan, when no matches are possible - _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, *_originalFrsp, _originalQuery, _order ) ) ); + _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order ) ) ); return; } @@ -388,7 +410,7 @@ doneCheckOrder: else { massert( 10366 , "natural order cannot be specified with $min/$max", _min.isEmpty() && _max.isEmpty() ); // Table scan plan - _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, *_originalFrsp, _originalQuery, _order ) ) ); + _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order ) ) ); } return; } @@ -398,7 +420,7 @@ doneCheckOrder: BSONObj keyPattern; IndexDetails *idx = indexDetailsForRange( ns, errmsg, _min, _max, keyPattern ); massert( 10367 , errmsg, idx ); - _plans.push_back( QueryPlanPtr( new QueryPlan( d, d->idxNo(*idx), *_frsp, *_originalFrsp, _originalQuery, _order, _min, _max ) ) ); + _plans.push_back( QueryPlanPtr( new QueryPlan( d, d->idxNo(*idx), *_frsp, _originalFrsp.get(), _originalQuery, _order, _min, _max ) ) ); return; } @@ -407,13 +429,13 @@ doneCheckOrder: if ( idx >= 0 ) { _usingPrerecordedPlan = true; _mayRecordPlan = false; - _plans.push_back( QueryPlanPtr( new QueryPlan( d , idx , *_frsp , *_originalFrsp , _originalQuery, _order ) ) ); + _plans.push_back( QueryPlanPtr( new QueryPlan( d , idx , *_frsp , _originalFrsp.get() , _originalQuery, _order ) ) ); return; } } if ( _originalQuery.isEmpty() && _order.isEmpty() ) { - _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, *_originalFrsp, _originalQuery, _order ) ) ); + _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order ) ) ); return; } @@ -428,7 +450,7 @@ doneCheckOrder: if ( spec.getTypeName() == _special && spec.suitability( _originalQuery , _order ) ) { _usingPrerecordedPlan = true; _mayRecordPlan = false; - _plans.push_back( QueryPlanPtr( new QueryPlan( d , j , *_frsp , *_originalFrsp , _originalQuery, _order , + _plans.push_back( QueryPlanPtr( new QueryPlan( d , j , *_frsp , _originalFrsp.get() , _originalQuery, _order , BSONObj() , BSONObj() , _special ) ) ); return; } @@ -445,7 +467,7 @@ doneCheckOrder: _oldNScanned = oldNScanned; if ( !strcmp( bestIndex.firstElementFieldName(), "$natural" ) ) { // Table scan plan - p.reset( new QueryPlan( d, -1, *_frsp, *_originalFrsp, _originalQuery, _order ) ); + p.reset( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order ) ); } NamespaceDetails::IndexIterator i = d->ii(); @@ -453,7 +475,7 @@ doneCheckOrder: int j = i.pos(); IndexDetails& ii = i.next(); if( ii.keyPattern().woCompare(bestIndex) == 0 ) { - p.reset( new QueryPlan( d, j, *_frsp, *_originalFrsp, _originalQuery, _order ) ); + p.reset( new QueryPlan( d, j, *_frsp, _originalFrsp.get(), _originalQuery, _order ) ); } } @@ -480,7 +502,7 @@ doneCheckOrder: if ( !_frsp->matchPossible() || ( _frsp->noNontrivialRanges() && _order.isEmpty() ) || ( !_order.isEmpty() && !strcmp( _order.firstElementFieldName(), "$natural" ) ) ) { // Table scan plan - addPlan( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, *_originalFrsp, _originalQuery, _order ) ), checkFirst ); + addPlan( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order ) ), checkFirst ); return; } @@ -490,10 +512,11 @@ doneCheckOrder: QueryPlanPtr 
optimalPlan;
         for( int i = 0; i < d->nIndexes; ++i ) {
             if ( normalQuery ) {
-                if ( !_frsp->matchPossibleForIndex( d, i, d->idx( i ).keyPattern() ) ) {
+                BSONObj keyPattern = d->idx( i ).keyPattern();
+                if ( !_frsp->matchPossibleForIndex( d, i, keyPattern ) ) {
                     // If no match is possible, only generate a trivial plan that won't
                     // scan any documents.
-                    QueryPlanPtr p( new QueryPlan( d, i, *_frsp, *_originalFrsp, _originalQuery, _order ) );
+                    QueryPlanPtr p( new QueryPlan( d, i, *_frsp, _originalFrsp.get(), _originalQuery, _order ) );
                     addPlan( p, checkFirst );
                     return;
                 }
@@ -502,7 +525,7 @@ doneCheckOrder:
                 }
             }
-            QueryPlanPtr p( new QueryPlan( d, i, *_frsp, *_originalFrsp, _originalQuery, _order ) );
+            QueryPlanPtr p( new QueryPlan( d, i, *_frsp, _originalFrsp.get(), _originalQuery, _order ) );
             if ( p->optimal() ) {
                 if ( !optimalPlan.get() ) {
                     optimalPlan = p;
@@ -520,7 +543,7 @@ doneCheckOrder:
             addPlan( *i, checkFirst );
         // Table scan plan
-        addPlan( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, *_originalFrsp, _originalQuery, _order ) ), checkFirst );
+        addPlan( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order ) ), checkFirst );
     }
     shared_ptr<QueryOp> QueryPlanSet::runOp( QueryOp &op ) {
@@ -538,7 +561,7 @@ doneCheckOrder:
         return r.runUntilFirstCompletes();
     }
-    shared_ptr<QueryOp> QueryPlanSet::nextOp( QueryOp &originalOp ) {
+    shared_ptr<QueryOp> QueryPlanSet::nextOp( QueryOp &originalOp, bool retried ) {
         if ( !_runner ) {
             _runner.reset( new Runner( *this, originalOp ) );
             shared_ptr<QueryOp> op = _runner->init();
@@ -553,10 +576,14 @@ doneCheckOrder:
         if ( !_usingPrerecordedPlan || _bestGuessOnly || _plans.size() > 1 ) {
             return op;
         }
+
+        // Avoid an infinite loop here
+        uassert( 15878, str::stream() << "query plans not successful even with no constraints, potentially due to additional sort", ! retried );
+
         // Retry with all candidate plans.
         QueryUtilIndexed::clearIndexesForPatterns( *_frsp, _order );
         init();
-        return nextOp( originalOp );
+        return nextOp( originalOp, true );
     }
     bool QueryPlanSet::prepareToYield() {
@@ -815,24 +842,29 @@ doneCheckOrder:
         _ns( ns ),
         _or( !query.getField( "$or" ).eoo() ),
         _query( query.getOwned() ),
-        _org( ns, _query ),
         _i(),
         _honorRecordedPlan( honorRecordedPlan ),
         _bestGuessOnly( bestGuessOnly ),
         _hint( ( hint && !hint->eoo() ) ? hint->wrap() : BSONObj() ),
         _mayYield( mayYield ),
         _tableScanned() {
-        if ( !order.isEmpty() || !min.isEmpty() || !max.isEmpty() || !_org.getSpecial().empty() ) {
+        if ( !order.isEmpty() || !min.isEmpty() || !max.isEmpty() ) {
             _or = false;
         }
-        if ( _or && uselessOr( _hint.firstElement() ) ) {
-            _or = false;
+        if ( _or ) {
+            // Only construct an OrRangeGenerator if we may handle $or clauses.
+ _org.reset( new OrRangeGenerator( ns, _query ) ); + if ( !_org->getSpecial().empty() ) { + _or = false; + } + else if ( uselessOr( _hint.firstElement() ) ) { + _or = false; + } } // if _or == false, don't use or clauses for index selection if ( !_or ) { auto_ptr<FieldRangeSetPair> frsp( new FieldRangeSetPair( ns, _query, true ) ); - auto_ptr<FieldRangeSetPair> oldFrsp( new FieldRangeSetPair( *frsp ) ); - _currentQps.reset( new QueryPlanSet( ns, frsp, oldFrsp, _query, order, hint, honorRecordedPlan, min, max, _bestGuessOnly, _mayYield ) ); + _currentQps.reset( new QueryPlanSet( ns, frsp, auto_ptr<FieldRangeSetPair>(), _query, order, hint, honorRecordedPlan, min, max, _bestGuessOnly, _mayYield ) ); } else { BSONElement e = _query.getField( "$or" ); @@ -847,8 +879,8 @@ doneCheckOrder: return _currentQps->runOp( op ); } ++_i; - auto_ptr<FieldRangeSetPair> frsp( _org.topFrsp() ); - auto_ptr<FieldRangeSetPair> originalFrsp( _org.topFrspOriginal() ); + auto_ptr<FieldRangeSetPair> frsp( _org->topFrsp() ); + auto_ptr<FieldRangeSetPair> originalFrsp( _org->topFrspOriginal() ); BSONElement hintElt = _hint.firstElement(); _currentQps.reset( new QueryPlanSet( _ns, frsp, originalFrsp, _query, BSONObj(), &hintElt, _honorRecordedPlan, BSONObj(), BSONObj(), _bestGuessOnly, _mayYield ) ); shared_ptr<QueryOp> ret( _currentQps->runOp( op ) ); @@ -856,7 +888,7 @@ doneCheckOrder: _tableScanned = true; } else { // If the full table was scanned, don't bother popping the last or clause. - _org.popOrClause( ret->qp().nsd(), ret->qp().idxNo(), ret->qp().indexed() ? ret->qp().indexKey() : BSONObj() ); + _org->popOrClause( ret->qp().nsd(), ret->qp().idxNo(), ret->qp().indexed() ? ret->qp().indexKey() : BSONObj() ); } return ret; } @@ -877,7 +909,7 @@ doneCheckOrder: if ( op->qp().willScanTable() ) { _tableScanned = true; } else { - _org.popOrClause( op->qp().nsd(), op->qp().idxNo(), op->qp().indexed() ? op->qp().indexKey() : BSONObj() ); + _org->popOrClause( op->qp().nsd(), op->qp().idxNo(), op->qp().indexed() ? 
op->qp().indexKey() : BSONObj() ); } return op; } @@ -887,8 +919,8 @@ doneCheckOrder: shared_ptr<QueryOp> op; while( mayRunMore() ) { ++_i; - auto_ptr<FieldRangeSetPair> frsp( _org.topFrsp() ); - auto_ptr<FieldRangeSetPair> originalFrsp( _org.topFrspOriginal() ); + auto_ptr<FieldRangeSetPair> frsp( _org->topFrsp() ); + auto_ptr<FieldRangeSetPair> originalFrsp( _org->topFrspOriginal() ); BSONElement hintElt = _hint.firstElement(); _currentQps.reset( new QueryPlanSet( _ns, frsp, originalFrsp, _query, BSONObj(), &hintElt, _honorRecordedPlan, BSONObj(), BSONObj(), _bestGuessOnly, _mayYield ) ); op = nextOpHandleEndOfClause(); @@ -954,9 +986,9 @@ doneCheckOrder: if ( !id ) { return true; } - return QueryUtilIndexed::uselessOr( _org, nsd, nsd->idxNo( *id ) ); + return QueryUtilIndexed::uselessOr( *_org, nsd, nsd->idxNo( *id ) ); } - return QueryUtilIndexed::uselessOr( _org, nsd, -1 ); + return QueryUtilIndexed::uselessOr( *_org, nsd, -1 ); } MultiCursor::MultiCursor( const char *ns, const BSONObj &pattern, const BSONObj &order, shared_ptr<CursorOp> op, bool mayYield ) @@ -1199,12 +1231,13 @@ doneCheckOrder: } bool QueryUtilIndexed::indexUseful( const FieldRangeSetPair &frsp, NamespaceDetails *d, int idxNo, const BSONObj &order ) { - frsp.assertValidIndex( d, idxNo ); - if ( !frsp.matchPossibleForIndex( d, idxNo, d->idx( idxNo ).keyPattern() ) ) { + DEV frsp.assertValidIndex( d, idxNo ); + BSONObj keyPattern = d->idx( idxNo ).keyPattern(); + if ( !frsp.matchPossibleForIndex( d, idxNo, keyPattern ) ) { // No matches are possible in the index so the index may be useful. return true; } - return d->idx( idxNo ).getSpec().suitability( frsp.simplifiedQueryForIndex( d, idxNo, d->idx( idxNo ).keyPattern() ), order ) != USELESS; + return d->idx( idxNo ).getSpec().suitability( frsp.simplifiedQueryForIndex( d, idxNo, keyPattern ), order ) != USELESS; } void QueryUtilIndexed::clearIndexesForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order ) { diff --git a/db/queryoptimizer.h b/db/queryoptimizer.h index e55e791e1ca..ad6b985ab1f 100644 --- a/db/queryoptimizer.h +++ b/db/queryoptimizer.h @@ -35,10 +35,13 @@ namespace mongo { class QueryPlan : boost::noncopyable { public: + /** + * @param originalFrsp - original constraints for this query clause. If null, frsp will be used instead. + */ QueryPlan(NamespaceDetails *d, int idxNo, // -1 = no index const FieldRangeSetPair &frsp, - const FieldRangeSetPair &originalFrsp, + const FieldRangeSetPair *originalFrsp, const BSONObj &originalQuery, const BSONObj &order, const BSONObj &startKey = BSONObj(), @@ -245,6 +248,9 @@ namespace mongo { typedef boost::shared_ptr<QueryPlan> QueryPlanPtr; typedef vector<QueryPlanPtr> PlanSet; + /** + * @param originalFrsp - original constraints for this query clause; if null, frsp will be used. + */ QueryPlanSet( const char *ns, auto_ptr<FieldRangeSetPair> frsp, auto_ptr<FieldRangeSetPair> originalFrsp, @@ -272,7 +278,7 @@ namespace mongo { } /** Initialize or iterate a runner generated from @param originalOp. */ - shared_ptr<QueryOp> nextOp( QueryOp &originalOp ); + shared_ptr<QueryOp> nextOp( QueryOp &originalOp, bool retried = false ); /** Yield the runner member. 
*/ @@ -290,7 +296,7 @@ namespace mongo { //for testing const FieldRangeSetPair &frsp() const { return *_frsp; } - const FieldRangeSetPair &originalFrsp() const { return *_originalFrsp; } + const FieldRangeSetPair *originalFrsp() const { return _originalFrsp.get(); } bool modifiedKeys() const; bool hasMultiKey() const; @@ -420,7 +426,7 @@ namespace mongo { shared_ptr<Cursor> singleCursor() const; /** @return true iff more $or clauses need to be scanned. */ - bool mayRunMore() const { return _or ? ( !_tableScanned && !_org.orFinished() ) : _i == 0; } + bool mayRunMore() const { return _or ? ( !_tableScanned && !_org->orFinished() ) : _i == 0; } /** @return non-$or version of explain output. */ BSONObj oldExplain() const { assertNotOr(); return _currentQps->explain(); } /** @return true iff this is not a $or query and a plan is selected based on previous success of this plan. */ @@ -445,7 +451,7 @@ namespace mongo { const char * _ns; bool _or; BSONObj _query; - OrRangeGenerator _org; + shared_ptr<OrRangeGenerator> _org; // May be null in certain non $or query cases. auto_ptr<QueryPlanSet> _currentQps; int _i; bool _honorRecordedPlan; diff --git a/db/queryutil-inl.h b/db/queryutil-inl.h index 2c3a757b385..d0fc212cef9 100644 --- a/db/queryutil-inl.h +++ b/db/queryutil-inl.h @@ -130,5 +130,24 @@ namespace mongo { } return ret; } + + inline bool FieldRangeSetPair::matchPossibleForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const { + assertValidIndexOrNoIndex( d, idxNo ); + if ( !matchPossible() ) { + return false; + } + if ( idxNo < 0 ) { + // multi key matchPossible() is true, so return true. + return true; + } + return frsForIndex( d, idxNo ).matchPossibleForIndex( keyPattern ); + } + inline void FieldRangeSetPair::assertValidIndexOrNoIndex( const NamespaceDetails *d, int idxNo ) const { + massert( 14049, "FieldRangeSetPair invalid index specified", idxNo >= -1 ); + if ( idxNo >= 0 ) { + assertValidIndex( d, idxNo ); + } + } + } // namespace mongo diff --git a/db/queryutil.cpp b/db/queryutil.cpp index ec9ee693511..717eac816b8 100644 --- a/db/queryutil.cpp +++ b/db/queryutil.cpp @@ -28,6 +28,7 @@ namespace mongo { extern BSONObj staticNull; + extern BSONObj staticUndefined; /** returns a string that when used as a matcher, would match a super set of regex() returns "" for complex regular expressions @@ -79,6 +80,10 @@ namespace mongo { r = r.substr( 0 , r.size() - 1 ); return r; //breaking here fails with /^a?/ } + else if (c == '|') { + // whole match so far is optional. Nothing we can do here. + return string(); + } else if (c == '\\') { c = *(regex++); if (c == 'Q'){ @@ -107,7 +112,7 @@ namespace mongo { ss << c; } } - else if (strchr("^$.[|()+{", c)) { + else if (strchr("^$.[()+{", c)) { // list of "metacharacters" from man pcrepattern r = ss.str(); break; @@ -153,25 +158,33 @@ namespace mongo { FieldRange::FieldRange( const BSONElement &e, bool singleKey, bool isNot, bool optimize ) : _singleKey( singleKey ) { + int op = e.getGtLtOp(); + // NOTE with $not, we could potentially form a complementary set of intervals. 
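The two regex tweaks above ('|' removed from the metacharacter list and given an early bail-out) both serve prefix extraction: an alternation makes everything matched so far optional, so no literal prefix can safely seed an index range. A toy illustration of the idea, assuming a much-simplified scanner rather than the server's real regex handling (literalPrefix is a made-up name):

    #include <cstring>
    #include <string>

    // Return the literal prefix a pattern guarantees, or "" when none is usable.
    std::string literalPrefix(const char* regex) {
        if (*regex++ != '^')
            return "";                            // unanchored: no usable prefix
        std::string prefix;
        for (char c = *regex; c != 0 && c != '$'; c = *++regex) {
            if (c == '|')
                return "";                        // whole match so far is optional
            if (c == '*' || c == '?') {           // previous char became optional:
                if (!prefix.empty())
                    prefix.erase(prefix.size() - 1);
                break;                            // drop it and stop extending
            }
            if (strchr("^.[()+{\\", c))
                break;                            // other metacharacters end the prefix
            prefix += c;
        }
        return prefix;
    }

For example "^abc" yields "abc" while "^a|b" yields "", which is exactly why the '|' case must return an empty string rather than merely stopping.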
- if ( !isNot && !e.eoo() && e.type() != RegEx && e.getGtLtOp() == BSONObj::opIN ) { + if ( !isNot && !e.eoo() && e.type() != RegEx && op == BSONObj::opIN ) { set<BSONElement,element_lt> vals; vector<FieldRange> regexes; uassert( 12580 , "invalid query" , e.isABSONObj() ); BSONObjIterator i( e.embeddedObject() ); while( i.more() ) { BSONElement ie = i.next(); + uassert( 15881, "$elemMatch not allowed within $in", + ie.type() != Object || + ie.embeddedObject().firstElement().getGtLtOp() != BSONObj::opELEM_MATCH ); if ( ie.type() == RegEx ) { regexes.push_back( FieldRange( ie, singleKey, false, optimize ) ); } else { - // A document array may be indexed by its first element, or - // as a full array if it is embedded within another array. + // A document array may be indexed by its first element, by undefined + // if it is empty, or as a full array if it is embedded within another + // array. vals.insert( ie ); if ( ie.type() == Array ) { - if ( !ie.embeddedObject().firstElement().eoo() ) { - vals.insert( ie.embeddedObject().firstElement() ); - } + BSONElement temp = ie.embeddedObject().firstElement(); + if ( temp.eoo() ) { + temp = staticUndefined.firstElement(); + } + vals.insert( temp ); } } } @@ -185,17 +198,21 @@ namespace mongo { return; } - // A document array may be indexed by its first element, or - // as a full array if it is embedded within another array. - if ( e.type() == Array && e.getGtLtOp() == BSONObj::Equality ) { + // A document array may be indexed by its first element, by undefined + // if it is empty, or as a full array if it is embedded within another + // array. + if ( e.type() == Array && op == BSONObj::Equality ) { _intervals.push_back( FieldInterval(e) ); - const BSONElement& temp = e.embeddedObject().firstElement(); - if ( ! temp.eoo() ) { - if ( temp < e ) - _intervals.insert( _intervals.begin() , temp ); - else - _intervals.push_back( FieldInterval(temp) ); + BSONElement temp = e.embeddedObject().firstElement(); + if ( temp.eoo() ) { + temp = staticUndefined.firstElement(); + } + if ( temp < e ) { + _intervals.insert( _intervals.begin() , temp ); + } + else { + _intervals.push_back( FieldInterval(temp) ); } return; @@ -215,8 +232,6 @@ namespace mongo { if ( e.eoo() ) return; - int op = e.getGtLtOp(); - bool existsSpec = false; if ( op == BSONObj::opEXISTS ) { existsSpec = e.trueValue(); @@ -622,6 +637,27 @@ namespace mongo { return o; } + string FieldInterval::toString() const { + StringBuilder buf; + buf << ( _lower._inclusive ? "[" : "(" ); + buf << _lower._bound; + buf << " , "; + buf << _upper._bound; + buf << ( _upper._inclusive ? 
"]" : ")" ); + return buf.str(); + } + + string FieldRange::toString() const { + StringBuilder buf; + buf << "(FieldRange special: " << _special << " singleKey: " << _special << " intervals: "; + for( vector<FieldInterval>::const_iterator i = _intervals.begin(); i != _intervals.end(); ++i ) { + buf << i->toString(); + } + + buf << ")"; + return buf.str(); + } + string FieldRangeSet::getSpecial() const { string s = ""; for ( map<string,FieldRange>::const_iterator i=_ranges.begin(); i!=_ranges.end(); i++ ) { @@ -773,30 +809,32 @@ namespace mongo { } void FieldRangeSet::processQueryField( const BSONElement &e, bool optimize ) { - if ( strcmp( e.fieldName(), "$and" ) == 0 ) { - uassert( 14816 , "$and expression must be a nonempty array" , e.type() == Array && e.embeddedObject().nFields() > 0 ); - BSONObjIterator i( e.embeddedObject() ); - while( i.more() ) { - BSONElement e = i.next(); - uassert( 14817 , "$and elements must be objects" , e.type() == Object ); - BSONObjIterator j( e.embeddedObject() ); - while( j.more() ) { - processQueryField( j.next(), optimize ); - } - } - } + if ( e.fieldName()[ 0 ] == '$' ) { + if ( strcmp( e.fieldName(), "$and" ) == 0 ) { + uassert( 14816 , "$and expression must be a nonempty array" , e.type() == Array && e.embeddedObject().nFields() > 0 ); + BSONObjIterator i( e.embeddedObject() ); + while( i.more() ) { + BSONElement e = i.next(); + uassert( 14817 , "$and elements must be objects" , e.type() == Object ); + BSONObjIterator j( e.embeddedObject() ); + while( j.more() ) { + processQueryField( j.next(), optimize ); + } + } + } - if ( strcmp( e.fieldName(), "$where" ) == 0 ) { - return; - } + if ( strcmp( e.fieldName(), "$where" ) == 0 ) { + return; + } - if ( strcmp( e.fieldName(), "$or" ) == 0 ) { - return; - } + if ( strcmp( e.fieldName(), "$or" ) == 0 ) { + return; + } - if ( strcmp( e.fieldName(), "$nor" ) == 0 ) { - return; - } + if ( strcmp( e.fieldName(), "$nor" ) == 0 ) { + return; + } + } bool equality = ( getGtLtOp( e ) == BSONObj::Equality ); if ( equality && e.type() == Object ) { @@ -1055,32 +1093,11 @@ namespace mongo { return ret; } - const FieldRangeSet &FieldRangeSetPair::frsForIndex( const NamespaceDetails* nsd, int idxNo ) const { - assertValidIndexOrNoIndex( nsd, idxNo ); - if ( idxNo < 0 ) { - // An unindexed cursor cannot have a "single key" constraint. - return _multiKey; - } - return nsd->isMultikey( idxNo ) ? _multiKey : _singleKey; - } - bool FieldRangeSetPair::noNontrivialRanges() const { return _singleKey.matchPossible() && _singleKey.nNontrivialRanges() == 0 && _multiKey.matchPossible() && _multiKey.nNontrivialRanges() == 0; } - bool FieldRangeSetPair::matchPossibleForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const { - assertValidIndexOrNoIndex( d, idxNo ); - if ( !matchPossible() ) { - return false; - } - if ( idxNo < 0 ) { - // multi key matchPossible() is true, so return true. 
- return true; - } - return frsForIndex( d, idxNo ).matchPossibleForIndex( keyPattern ); - } - FieldRangeSetPair &FieldRangeSetPair::operator&=( const FieldRangeSetPair &other ) { _singleKey &= other._singleKey; _multiKey &= other._multiKey; @@ -1093,21 +1110,23 @@ namespace mongo { return *this; } + BSONObj FieldRangeSetPair::simplifiedQueryForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const { + return frsForIndex( d, idxNo ).simplifiedQuery( keyPattern ); + } + void FieldRangeSetPair::assertValidIndex( const NamespaceDetails *d, int idxNo ) const { massert( 14048, "FieldRangeSetPair invalid index specified", idxNo >= 0 && idxNo < d->nIndexes ); } - - void FieldRangeSetPair::assertValidIndexOrNoIndex( const NamespaceDetails *d, int idxNo ) const { - massert( 14049, "FieldRangeSetPair invalid index specified", idxNo >= -1 ); - if ( idxNo >= 0 ) { - assertValidIndex( d, idxNo ); + + const FieldRangeSet &FieldRangeSetPair::frsForIndex( const NamespaceDetails* nsd, int idxNo ) const { + assertValidIndexOrNoIndex( nsd, idxNo ); + if ( idxNo < 0 ) { + // An unindexed cursor cannot have a "single key" constraint. + return _multiKey; } + return nsd->isMultikey( idxNo ) ? _multiKey : _singleKey; } - - BSONObj FieldRangeSetPair::simplifiedQueryForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const { - return frsForIndex( d, idxNo ).simplifiedQuery( keyPattern ); - } - + bool FieldRangeVector::matchesElement( const BSONElement &e, int i, bool forward ) const { bool eq; int l = matchingLowElement( e, i, forward, eq ); diff --git a/db/queryutil.h b/db/queryutil.h index 00d2d264961..104cde28e4a 100644 --- a/db/queryutil.h +++ b/db/queryutil.h @@ -53,6 +53,8 @@ namespace mongo { /** @return true iff the interval is an equality constraint. */ bool equality() const; mutable int _cachedEquality; + + string toString() const; }; /** @@ -103,6 +105,8 @@ namespace mongo { * NOTE the resulting intervals might not be strictValid(). */ void reverse( FieldRange &ret ) const; + + string toString() const; private: BSONObj addObj( const BSONObj &o ); void finishOperation( const vector<FieldInterval> &newIntervals, const FieldRange &other ); diff --git a/db/record.cpp b/db/record.cpp index f5fa972227a..18be9c75fe2 100644 --- a/db/record.cpp +++ b/db/record.cpp @@ -120,14 +120,16 @@ namespace mongo { /** * after this call, we assume the page is in ram + * @param doHalf if this is a known good access, want to put in first half * @return whether we know the page is in ram */ - bool access( size_t region , short offset ) { + bool access( size_t region , short offset , bool doHalf ) { int regionHash = hash(region); scoped_spinlock lk( _lock ); - - RARELY { + + static int rarely_count = 0; + if ( rarely_count++ % 2048 == 0 ) { long long now = Listener::getElapsedTimeMillis(); RARELY if ( now == 0 ) { tlog() << "warning Listener::getElapsedTimeMillis returning 0ms" << endl; @@ -137,8 +139,8 @@ namespace mongo { _rotate(); } } - - for ( int i=0; i<NumSlices; i++ ) { + + for ( int i=0; i<NumSlices / ( doHalf ? 2 : 1 ); i++ ) { int pos = (_curSlice+i)%NumSlices; State s = _slices[pos].get( regionHash , region , offset ); @@ -205,7 +207,7 @@ namespace mongo { const size_t region = page >> 6; const size_t offset = page & 0x3f; - if ( ps::rolling.access( region , offset ) ) + if ( ps::rolling.access( region , offset , false ) ) return true; if ( ! blockSupported ) @@ -214,14 +216,11 @@ namespace mongo { } Record* Record::accessed() { - if ( ! 
MemoryTrackingEnabled ) - return this; - const size_t page = (size_t)data >> 12; const size_t region = page >> 6; const size_t offset = page & 0x3f; - - ps::rolling.access( region , offset ); + + ps::rolling.access( region , offset , true ); return this; } diff --git a/db/repl.cpp b/db/repl.cpp index a4ab6e4f0ea..3d08f2324c0 100644 --- a/db/repl.cpp +++ b/db/repl.cpp @@ -95,7 +95,7 @@ namespace mongo { virtual LockType locktype() const { return WRITE; } void help(stringstream&h) const { h << "resync (from scratch) an out of date replica slave.\nhttp://www.mongodb.org/display/DOCS/Master+Slave"; } CmdResync() : Command("resync") { } - virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { if( cmdLine.usingReplSets() ) { errmsg = "resync command not currently supported with replica sets. See RS102 info in the mongodb documentations"; result.append("info", "http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member"); @@ -232,7 +232,7 @@ namespace mongo { } virtual LockType locktype() const { return NONE; } CmdIsMaster() : Command("isMaster", true, "ismaster") { } - virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { /* currently request to arbiter is (somewhat arbitrarily) an ismaster request that is not authenticated. we allow unauthenticated ismaster but we aren't as verbose informationally if @@ -1407,6 +1407,7 @@ namespace mongo { void newRepl(); void oldRepl(); + void startReplSets(ReplSetCmdline*); void startReplication() { /* if we are going to be a replica set, we aren't doing other forms of replication. 
*/ if( !cmdLine._replSet.empty() ) { @@ -1416,6 +1417,11 @@ namespace mongo { log() << "***" << endl; } newRepl(); + + replSet = true; + ReplSetCmdline *replSetCmdline = new ReplSetCmdline(cmdLine._replSet); + boost::thread t( boost::bind( &startReplSets, replSetCmdline) ); + return; } diff --git a/db/repl/consensus.cpp b/db/repl/consensus.cpp index 3a4dd9b5b3d..07ee2fa80a3 100644 --- a/db/repl/consensus.cpp +++ b/db/repl/consensus.cpp @@ -25,6 +25,7 @@ namespace mongo { public: CmdReplSetFresh() : ReplSetCommand("replSetFresh") { } private: + bool shouldVeto(const BSONObj& cmdObj, string& errmsg) { unsigned id = cmdObj["id"].Int(); const Member* primary = theReplSet->box.getPrimary(); @@ -66,7 +67,7 @@ namespace mongo { return false; } - virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { if( !check(errmsg, result) ) return false; @@ -101,7 +102,7 @@ namespace mongo { public: CmdReplSetElect() : ReplSetCommand("replSetElect") { } private: - virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { if( !check(errmsg, result) ) return false; theReplSet->elect.electCmdReceived(cmdObj, &result); @@ -152,7 +153,7 @@ namespace mongo { LastYea &L = this->ly.ref(lk); time_t now = time(0); if( L.when + LeaseTime >= now && L.who != memberId ) { - log(1) << "replSet not voting yea for " << memberId << + LOG(1) << "replSet not voting yea for " << memberId << " voted for " << L.who << ' ' << now-L.when << " secs ago" << rsLog; throw VoteException(); } @@ -176,7 +177,7 @@ namespace mongo { void Consensus::electCmdReceived(BSONObj cmd, BSONObjBuilder* _b) { BSONObjBuilder& b = *_b; DEV log() << "replSet received elect msg " << cmd.toString() << rsLog; - else log(2) << "replSet received elect msg " << cmd.toString() << rsLog; + else LOG(2) << "replSet received elect msg " << cmd.toString() << rsLog; string set = cmd["set"].String(); unsigned whoid = cmd["whoid"].Int(); int cfgver = cmd["cfgver"].Int(); @@ -309,7 +310,7 @@ namespace mongo { allUp = false; } } - log(1) << "replSet dev we are freshest of up nodes, nok:" << nok << " nTies:" << nTies << rsLog; + LOG(1) << "replSet dev we are freshest of up nodes, nok:" << nok << " nTies:" << nTies << rsLog; assert( ord <= theReplSet->lastOpTimeWritten ); // <= as this may change while we are working... 
return true; } diff --git a/db/repl/heartbeat.cpp b/db/repl/heartbeat.cpp index 6247b4b1d13..7d3f78c73b5 100644 --- a/db/repl/heartbeat.cpp +++ b/db/repl/heartbeat.cpp @@ -39,6 +39,8 @@ namespace mongo { extern bool replSetBlind; extern ReplSettings replSettings; + unsigned int HeartbeatInfo::numPings; + long long HeartbeatInfo::timeDown() const { if( up() ) return 0; if( downSince == 0 ) @@ -51,7 +53,7 @@ namespace mongo { public: virtual bool adminOnly() const { return false; } CmdReplSetHeartbeat() : ReplSetCommand("replSetHeartbeat") { } - virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { if( replSetBlind ) return false; @@ -62,6 +64,10 @@ namespace mongo { return false; } + if (!checkAuth(errmsg, result)) { + return false; + } + /* we want to keep heartbeat connections open when relinquishing primary. tag them here. */ { AbstractMessagingPort *mp = cc().port(); @@ -147,7 +153,7 @@ namespace mongo { string name() const { return "rsHealthPoll"; } void doWork() { if ( !theReplSet ) { - log(2) << "replSet not initialized yet, skipping health poll this round" << rsLog; + LOG(2) << "replSet not initialized yet, skipping health poll this round" << rsLog; return; } @@ -169,7 +175,10 @@ namespace mongo { time_t after = mem.lastHeartbeat = before + (mem.ping / 1000); // weight new ping with old pings - mem.ping = (unsigned int)((old.ping * .8) + (mem.ping * .2)); + // on the first ping, just use the ping value + if (old.ping != 0) { + mem.ping = (unsigned int)((old.ping * .8) + (mem.ping * .2)); + } if ( info["time"].isNumber() ) { long long t = info["time"].numberLong(); @@ -191,6 +200,8 @@ namespace mongo { mem.hbstate = MemberState(state.Int()); } if( ok ) { + HeartbeatInfo::numPings++; + if( mem.upSince == 0 ) { log() << "replSet info member " << h.toString() << " is up" << rsLog; mem.upSince = mem.lastHeartbeat; @@ -262,6 +273,7 @@ namespace mongo { private: void down(HeartbeatInfo& mem, string msg) { mem.health = 0.0; + mem.ping = 0; if( mem.upSince || mem.downSince == 0 ) { mem.upSince = 0; mem.downSince = jsTime(); diff --git a/db/repl/replset_commands.cpp b/db/repl/replset_commands.cpp index 79639acd567..68dab7eb3c1 100644 --- a/db/repl/replset_commands.cpp +++ b/db/repl/replset_commands.cpp @@ -45,14 +45,18 @@ namespace mongo { help << "Just for regression tests.\n"; } CmdReplSetTest() : ReplSetCommand("replSetTest") { } - virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { log() << "replSet replSetTest command received: " << cmdObj.toString() << rsLog; + + if (!checkAuth(errmsg, result)) { + return false; + } + if( cmdObj.hasElement("forceInitialSyncFailure") ) { replSetForceInitialSyncFailure = (unsigned) cmdObj["forceInitialSyncFailure"].Number(); return true; } - // may not need this, but if removed check all tests still work: if( !check(errmsg, result) ) return false; @@ -76,11 +80,11 @@ namespace mongo { help << "internal"; } CmdReplSetGetRBID() : ReplSetCommand("replSetGetRBID") { - // this is ok but micros or combo with some rand() and/or 64 bits might be better -- + // this is ok but micros or combo with some rand() and/or 64 bits might be better -- // imagine a restart and a clock correction simultaneously (very unlikely but possible...) 
rbid = (int) curTimeMillis64(); } - virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { if( !check(errmsg, result) ) return false; result.append("rbid",rbid); @@ -108,7 +112,7 @@ namespace mongo { help << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands"; } CmdReplSetGetStatus() : ReplSetCommand("replSetGetStatus", true) { } - virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { if ( cmdObj["forShell"].trueValue() ) lastError.disableForCommand(); @@ -128,17 +132,21 @@ namespace mongo { help << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands"; } CmdReplSetReconfig() : ReplSetCommand("replSetReconfig"), mutex("rsreconfig") { } - virtual bool run(const string& a, BSONObj& b, string& errmsg, BSONObjBuilder& c, bool d) { + virtual bool run(const string& a, BSONObj& b, int e, string& errmsg, BSONObjBuilder& c, bool d) { try { rwlock_try_write lk(mutex); - return _run(a,b,errmsg,c,d); + return _run(a,b,e,errmsg,c,d); } catch(rwlock_try_write::exception&) { } errmsg = "a replSetReconfig is already in progress"; return false; } private: - bool _run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool _run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + if ( !checkAuth(errmsg, result) ) { + return false; + } + if( cmdObj["replSetReconfig"].type() != Object ) { errmsg = "no configuration specified"; return false; @@ -209,7 +217,7 @@ namespace mongo { } CmdReplSetFreeze() : ReplSetCommand("replSetFreeze") { } - virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { if( !check(errmsg, result) ) return false; int secs = (int) cmdObj.firstElement().numberInt(); @@ -233,7 +241,7 @@ namespace mongo { } CmdReplSetStepDown() : ReplSetCommand("replSetStepDown") { } - virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { if( !check(errmsg, result) ) return false; if( !theReplSet->box.getState().primary() ) { @@ -252,19 +260,19 @@ namespace mongo { long long int diff = lastOp - closest; result.append("closest", closest); result.append("difference", diff); - + if (diff < 0) { // not our problem, but we'll wait until thing settle down errmsg = "someone is ahead of the primary?"; return false; } - + if (diff > 10) { errmsg = "no secondaries within 10 seconds of my optime"; return false; } } - + int secs = (int) cmdObj.firstElement().numberInt(); if( secs == 0 ) secs = 60; diff --git a/db/repl/rs.cpp b/db/repl/rs.cpp index 84b92fe9297..243e087eff1 100644 --- a/db/repl/rs.cpp +++ b/db/repl/rs.cpp @@ -24,9 +24,12 @@ #include "rs.h" #include "connections.h" #include "../repl.h" +#include "../instance.h" -namespace mongo { +using namespace std; +namespace mongo { + using namespace bson; bool replSet = false; @@ -60,18 +63,43 @@ namespace mongo { } void ReplSetImpl::assumePrimary() { - log(2) << "assuming primary" << endl; + LOG(2) << "replSet 
assuming primary" << endl; assert( iAmPotentiallyHot() ); writelock lk("admin."); // so we are synchronized with _logOp() + + // Make sure that new OpTimes are higher than existing ones even with clock skew + DBDirectClient c; + BSONObj lastOp = c.findOne( "local.oplog.rs", Query().sort(reverseNaturalObj), NULL, QueryOption_SlaveOk ); + if ( !lastOp.isEmpty() ) { + OpTime::setLast( lastOp[ "ts" ].date() ); + } + changeState(MemberState::RS_PRIMARY); } void ReplSetImpl::changeState(MemberState s) { box.change(s, _self); } + void ReplSetImpl::setMaintenanceMode(const bool inc) { + lock lk(this); + + if (inc) { + log() << "replSet going into maintenance mode (" << _maintenanceMode << " other tasks)" << rsLog; + + _maintenanceMode++; + changeState(MemberState::RS_RECOVERING); + } + else { + _maintenanceMode--; + // no need to change state, syncTail will try to go live as a secondary soon + + log() << "leaving maintenance mode (" << _maintenanceMode << " other tasks)" << rsLog; + } + } + Member* ReplSetImpl::getMostElectable() { lock lk(this); - - Member *max = 0; + + Member *max = 0; for (set<unsigned>::iterator it = _electableSet.begin(); it != _electableSet.end(); it++) { const Member *temp = findById(*it); @@ -91,7 +119,7 @@ namespace mongo { const bool closeOnRelinquish = true; void ReplSetImpl::relinquish() { - log(2) << "attempting to relinquish" << endl; + LOG(2) << "replSet attempting to relinquish" << endl; if( box.getState().primary() ) { { writelock lk("admin."); // so we are synchronized with _logOp() @@ -239,7 +267,7 @@ namespace mongo { if( myConfig().arbiterOnly ) b.append("arbiterOnly", true); - if( myConfig().priority == 0 ) + if( myConfig().priority == 0 && !myConfig().arbiterOnly) b.append("passive", true); if( myConfig().slaveDelay ) b.append("slaveDelay", myConfig().slaveDelay); @@ -296,8 +324,10 @@ namespace mongo { _currentSyncTarget(0), _hbmsgTime(0), _self(0), + _maintenanceMode(0), mgr( new Manager(this) ), ghost( new GhostSync(this) ) { + _cfg = 0; memset(_hbmsg, 0, sizeof(_hbmsg)); strcpy( _hbmsg , "initial startup" ); @@ -306,7 +336,7 @@ namespace mongo { _seeds = &replSetCmdline.seeds; - log(1) << "replSet beginning startup..." << rsLog; + LOG(1) << "replSet beginning startup..." << rsLog; loadConfig(); @@ -317,7 +347,7 @@ namespace mongo { for( set<HostAndPort>::iterator i = replSetCmdline.seedSet.begin(); i != replSetCmdline.seedSet.end(); i++ ) { if( i->isSelf() ) { if( sss == 1 ) - log(1) << "replSet warning self is listed in the seed list and there are no other seeds listed did you intend that?" << rsLog; + LOG(1) << "replSet warning self is listed in the seed list and there are no other seeds listed did you intend that?" << rsLog; } else log() << "replSet warning command line seed " << i->toString() << " is not present in the current repl set config" << rsLog; @@ -382,7 +412,7 @@ namespace mongo { getLastErrorDefault = new BSONObj( c.getLastErrorDefaults ); } - list<const ReplSetConfig::MemberCfg*> newOnes; + list<ReplSetConfig::MemberCfg*> newOnes; // additive short-cuts the new config setup. If we are just adding a // node/nodes and nothing else is changing, this is additive. 
If it's // not a reconfig, we're not adding anything @@ -391,8 +421,8 @@ namespace mongo { unsigned nfound = 0; int me = 0; for( vector<ReplSetConfig::MemberCfg>::iterator i = c.members.begin(); i != c.members.end(); i++ ) { - const ReplSetConfig::MemberCfg& m = *i; + ReplSetConfig::MemberCfg& m = *i; if( m.h.isSelf() ) { me++; } @@ -443,8 +473,8 @@ namespace mongo { // this is a shortcut for simple changes if( additive ) { log() << "replSet info : additive change to configuration" << rsLog; - for( list<const ReplSetConfig::MemberCfg*>::const_iterator i = newOnes.begin(); i != newOnes.end(); i++ ) { - const ReplSetConfig::MemberCfg* m = *i; + for( list<ReplSetConfig::MemberCfg*>::const_iterator i = newOnes.begin(); i != newOnes.end(); i++ ) { + ReplSetConfig::MemberCfg *m = *i; Member *mi = new Member(m->h, m->_id, m, false); /** we will indicate that new members are up() initially so that we don't relinquish our @@ -456,6 +486,11 @@ namespace mongo { _members.push(mi); startHealthTaskFor(mi); } + + // if we aren't creating new members, we may have to update the + // groups for the current ones + _cfg->updateMembers(_members); + return true; } @@ -479,7 +514,7 @@ namespace mongo { string members = ""; for( vector<ReplSetConfig::MemberCfg>::iterator i = _cfg->members.begin(); i != _cfg->members.end(); i++ ) { - const ReplSetConfig::MemberCfg& m = *i; + ReplSetConfig::MemberCfg& m = *i; Member *mi; members += ( members == "" ? "" : ", " ) + m.h.toString(); if( m.h.isSelf() ) { @@ -594,7 +629,7 @@ namespace mongo { if( ++once == 1 ) log() << "replSet info you may need to run replSetInitiate -- rs.initiate() in the shell -- if that is not already done" << rsLog; if( _seeds->size() == 0 ) - log(1) << "replSet info no seed hosts were specified on the --replSet command line" << rsLog; + LOG(1) << "replSet info no seed hosts were specified on the --replSet command line" << rsLog; } else { startupStatus = EMPTYUNREACHABLE; diff --git a/db/repl/rs.h b/db/repl/rs.h index 7654597a930..14c630d27a2 100644 --- a/db/repl/rs.h +++ b/db/repl/rs.h @@ -58,10 +58,11 @@ namespace mongo { ~Member(); // intentionally unimplemented as should never be called -- see List1<>::Base. 
Member(const Member&); public: - Member(HostAndPort h, unsigned ord, const ReplSetConfig::MemberCfg *c, bool self); + Member(HostAndPort h, unsigned ord, ReplSetConfig::MemberCfg *c, bool self); string fullName() const { return h().toString(); } const ReplSetConfig::MemberCfg& config() const { return _config; } + ReplSetConfig::MemberCfg& configw() { return _config; } const HeartbeatInfo& hbinfo() const { return _hbinfo; } HeartbeatInfo& get_hbinfo() { return _hbinfo; } string lhb() const { return _hbinfo.lastHeartbeatMsg; } @@ -74,7 +75,7 @@ namespace mongo { private: friend class ReplSetImpl; - const ReplSetConfig::MemberCfg _config; + ReplSetConfig::MemberCfg _config; const HostAndPort _h; HeartbeatInfo _hbinfo; }; @@ -242,13 +243,19 @@ namespace mongo { const Member *primary; }; const SP get() { - scoped_lock lk(m); + rwlock lk(m, false); return sp; } - MemberState getState() const { return sp.state; } - const Member* getPrimary() const { return sp.primary; } + MemberState getState() const { + rwlock lk(m, false); + return sp.state; + } + const Member* getPrimary() const { + rwlock lk(m, false); + return sp.primary; + } void change(MemberState s, const Member *self) { - scoped_lock lk(m); + rwlock lk(m, true); if( sp.state != s ) { log() << "replSet " << s.toString() << rsLog; } @@ -262,24 +269,25 @@ namespace mongo { } } void set(MemberState s, const Member *p) { - scoped_lock lk(m); - sp.state = s; sp.primary = p; + rwlock lk(m, true); + sp.state = s; + sp.primary = p; } void setSelfPrimary(const Member *self) { change(MemberState::RS_PRIMARY, self); } void setOtherPrimary(const Member *mem) { - scoped_lock lk(m); + rwlock lk(m, true); assert( !sp.state.primary() ); sp.primary = mem; } void noteRemoteIsPrimary(const Member *remote) { - scoped_lock lk(m); + rwlock lk(m, true); if( !sp.state.secondary() && !sp.state.fatal() ) sp.state = MemberState::RS_RECOVERING; sp.primary = remote; } StateBox() : m("StateBox") { } private: - mongo::mutex m; + RWLock m; SP sp; }; @@ -446,11 +454,20 @@ namespace mongo { List1<Member> _members; // all members of the set EXCEPT _self. ReplSetConfig::MemberCfg _config; // config of _self unsigned _id; // _id of _self + + int _maintenanceMode; // if we should stay in recovering state public: // this is called from within a writelock in logOpRS unsigned selfId() const { return _id; } Manager *mgr; GhostSync *ghost; + /** + * This forces a secondary to go into recovering state and stay there + * until this is called again, passing in "false". Multiple threads can + * call this and it will leave maintenance mode once all of the callers + * have called it again, passing in false. + */ + void setMaintenanceMode(const bool inc); private: Member* head() const { return _members.head(); } public: @@ -553,11 +570,29 @@ namespace mongo { virtual bool logTheOp() { return false; } virtual LockType locktype() const { return NONE; } virtual void help( stringstream &help ) const { help << "internal"; } + + /** + * Some replica set commands call this and then call check(). This is + * intentional, as they might do things before theReplSet is initialized + * that still need to be checked for auth. 
+ */ + bool checkAuth(string& errmsg, BSONObjBuilder& result) { + if( !noauth && adminOnly() ) { + AuthenticationInfo *ai = cc().getAuthenticationInfo(); + if (!ai->isAuthorizedForLock("admin", locktype())) { + errmsg = "replSet command unauthorized"; + return false; + } + } + return true; + } + bool check(string& errmsg, BSONObjBuilder& result) { if( !replSet ) { errmsg = "not running with --replSet"; return false; } + if( theReplSet == 0 ) { result.append("startupStatus", ReplSet::startupStatus); string s; @@ -566,7 +601,8 @@ namespace mongo { result.append("info", "run rs.initiate(...) if not yet done for the set"); return false; } - return true; + + return checkAuth(errmsg, result); } }; @@ -578,7 +614,7 @@ namespace mongo { /** inlines ----------------- */ - inline Member::Member(HostAndPort h, unsigned ord, const ReplSetConfig::MemberCfg *c, bool self) : + inline Member::Member(HostAndPort h, unsigned ord, ReplSetConfig::MemberCfg *c, bool self) : _config(*c), _h(h), _hbinfo(ord) { assert(c); if( self ) diff --git a/db/repl/rs_config.cpp b/db/repl/rs_config.cpp index 4d6c7b59bba..745d60b537c 100644 --- a/db/repl/rs_config.cpp +++ b/db/repl/rs_config.cpp @@ -83,14 +83,24 @@ namespace mongo { if( hidden ) b << "hidden" << hidden; if( !buildIndexes ) b << "buildIndexes" << buildIndexes; if( !tags.empty() ) { - BSONArrayBuilder a; - for( set<string>::const_iterator i = tags.begin(); i != tags.end(); i++ ) - a.append(*i); - b.appendArray("tags", a.done()); + BSONObjBuilder a; + for( map<string,string>::const_iterator i = tags.begin(); i != tags.end(); i++ ) + a.append((*i).first, (*i).second); + b.append("tags", a.done()); } return b.obj(); } + void ReplSetConfig::updateMembers(List1<Member> &dest) { + for (vector<MemberCfg>::iterator source = members.begin(); source < members.end(); source++) { + for( Member *d = dest.head(); d; d = d->next() ) { + if (d->fullName() == (*source).h.toString()) { + d->configw().groupsw() = (*source).groups(); + } + } + } + } + bo ReplSetConfig::asBson() const { bob b; b.append("_id", _id).append("version", version); @@ -307,85 +317,39 @@ namespace mongo { } void ReplSetConfig::_populateTagMap(map<string,TagClause> &tagMap) { - // stage 1: create subgroups for each server corresponding to each of - // its tags. If a server has three tags, we want it to end up in three - // subgroups, e.g.: A is tagged with ["A", "dc.ny", "m"]. At the end of - // this step, tagMap will contain: - // "A" => {"A.A" : A} - // "dc.ny" => {"dc.ny.A" : A} - // "m" => {"m.A" : A} - // If we have more than one server with the same tag, we end up with - // something like "x.y.z" => [{"x.y.z.A" : A},{"x.y.z.B" : B}] (if A - // and B were tagged with "x.y.z"). + // create subgroups for each server corresponding to each of + // its tags. E.g.: + // + // A is tagged with {"server" : "A", "dc" : "ny"} + // B is tagged with {"server" : "B", "dc" : "ny"} + // + // At the end of this step, tagMap will contain: + // + // "server" => {"A" : [A], "B" : [B]} + // "dc" => {"ny" : [A,B]} + for (unsigned i=0; i<members.size(); i++) { MemberCfg member = members[i]; - for (set<string>::iterator tag = member.tags.begin(); tag != member.tags.end(); tag++) { - TagClause& clause = tagMap[*tag]; - clause.name = *tag; + for (map<string,string>::iterator tag = member.tags.begin(); tag != member.tags.end(); tag++) { + string label = (*tag).first; + string value = (*tag).second; - // we also populate the map, to be used by step 2... 
I think - // this is correct, as step 2 condenses the groups anyway - string perServerName = *tag+"."+members[i].h.toString(); + TagClause& clause = tagMap[label]; + clause.name = label; TagSubgroup* subgroup; - if (clause.subgroups.find(perServerName) == clause.subgroups.end()) { - clause.subgroups[perServerName] = subgroup = new TagSubgroup(perServerName); + // search for "ny" in "dc"'s clause + if (clause.subgroups.find(value) == clause.subgroups.end()) { + clause.subgroups[value] = subgroup = new TagSubgroup(value); } else { - subgroup = clause.subgroups[perServerName]; + subgroup = clause.subgroups[value]; } subgroup->m.insert(&members[i]); } } - - // stage 2: generate all parent tags. If we have "x.y.z", this - // generates "x.y" and "x" and creates a map for each clause, e.g., - // "x"'s clause might have a map that looks like: - // "x.y" => {A, B} {C} - // "x.w" => {D} {E, F} - for (map<string,TagClause>::iterator baseClause = tagMap.begin(); baseClause != tagMap.end(); baseClause++) { - string prevPrefix = (*baseClause).first; - const char *dot = strrchr(prevPrefix.c_str(), '.'); - - while (dot) { - // get x.y - string xyTag = string(prevPrefix.c_str(), dot - prevPrefix.c_str()); - log(1) << "generating tag " << xyTag << rsLog; - TagClause& xyClause = tagMap[xyTag]; - xyClause.name = xyTag; - - // get all of x.y.z's subgroups, add them as a single subgroup of x.y - TagSubgroup* condensedSubgroup;; - if (xyClause.subgroups.find(prevPrefix) == xyClause.subgroups.end()) { - // label this subgroup one higher than the current, e.g., - // "x.y.z" if we're creating the "x.y" clause - condensedSubgroup = new TagSubgroup(prevPrefix); - xyClause.subgroups[prevPrefix] = condensedSubgroup; - } - else { - condensedSubgroup = xyClause.subgroups[prevPrefix]; - assert(condensedSubgroup->name == prevPrefix); - } - - TagClause& xyzClause = tagMap[prevPrefix]; - - for (map<string,TagSubgroup*>::iterator xyzSubgroup = xyzClause.subgroups.begin(); - xyzSubgroup != xyzClause.subgroups.end(); xyzSubgroup++) { - for (set<MemberCfg*>::const_iterator xyzMember = (*xyzSubgroup).second->m.begin(); - xyzMember != (*xyzSubgroup).second->m.end(); xyzMember++) { - condensedSubgroup->m.insert(*xyzMember); - // we'll link the member back with the group later, to - // avoid creating extra link-backs - } - } - - // advance: if we were handling "x.y", now do "x" - prevPrefix = xyTag; - dot = strrchr(prevPrefix.c_str(), '.'); - } - } } void ReplSetConfig::parseRules(const BSONObj& modes) { @@ -442,7 +406,7 @@ namespace mongo { for (set<MemberCfg *>::iterator cfg = (*sgs).second->m.begin(); !foundMe && cfg != (*sgs).second->m.end(); cfg++) { - (*cfg)->groupsw(this).insert((*sgs).second); + (*cfg)->groupsw().insert((*sgs).second); } } @@ -463,7 +427,7 @@ namespace mongo { } // if we got here, this is a valid rule - log(1) << "new rule " << rule.fieldName() << ": " << r->toString() << rsLog; + LOG(1) << "replSet new rule " << rule.fieldName() << ": " << r->toString() << rsLog; rules[rule.fieldName()] = r; } } @@ -532,9 +496,10 @@ namespace mongo { if( mobj.hasElement("votes") ) m.votes = (unsigned) mobj["votes"].Number(); if( mobj.hasElement("tags") ) { - vector<BSONElement> v = mobj["tags"].Array(); - for( unsigned i = 0; i < v.size(); i++ ) - m.tags.insert( v[i].String() ); + const BSONObj &t = mobj["tags"].Obj(); + for (BSONObj::iterator c = t.begin(); c.more(); c.next()) { + m.tags[(*c).fieldName()] = (*c).String(); + } } m.check(); } diff --git a/db/repl/rs_config.h b/db/repl/rs_config.h index 
d9c9d97ed4d..4e0d1e862c0 100644 --- a/db/repl/rs_config.h +++ b/db/repl/rs_config.h @@ -25,7 +25,7 @@ #include "health.h" namespace mongo { - + class Member; const string rsConfigNs = "local.system.replset"; class ReplSetConfig { @@ -61,15 +61,14 @@ namespace mongo { int slaveDelay; /* seconds. int rather than unsigned for convenient to/front bson conversion. */ bool hidden; /* if set, don't advertise to drives in isMaster. for non-primaries (priority 0) */ bool buildIndexes; /* if false, do not create any non-_id indexes */ - set<string> tags; /* tagging for data center, rack, etc. */ + map<string,string> tags; /* tagging for data center, rack, etc. */ private: set<TagSubgroup*> _groups; // the subgroups this member belongs to public: const set<TagSubgroup*>& groups() const { return _groups; } - set<TagSubgroup*>& groupsw(ReplSetConfig *c) { - assert(!c->_constructed); + set<TagSubgroup*>& groupsw() { return _groups; } void check() const; /* check validity, assert if not. */ @@ -114,6 +113,11 @@ namespace mongo { void saveConfigLocally(BSONObj comment); // to local db string saveConfigEverywhere(); // returns textual info on what happened + /** + * Update members' groups when the config changes but members stay the same. + */ + void updateMembers(List1<Member> &dest); + BSONObj asBson() const; bool _constructed; diff --git a/db/repl/rs_initialsync.cpp b/db/repl/rs_initialsync.cpp index 814bb1d0bf8..142878ab478 100644 --- a/db/repl/rs_initialsync.cpp +++ b/db/repl/rs_initialsync.cpp @@ -75,7 +75,7 @@ namespace mongo { if( d && d->stats.nrecords == 0 ) return; // already empty, ok. - log(1) << "replSet empty oplog" << rsLog; + LOG(1) << "replSet empty oplog" << rsLog; d->emptyCappedCollection(rsoplog); } @@ -85,6 +85,7 @@ namespace mongo { // find the member with the lowest ping time that has more data than me for (Member *m = _members.head(); m; m = m->next()) { if (m->hbinfo().up() && + HeartbeatInfo::numPings > config().members.size()*2 && (m->state() == MemberState::RS_PRIMARY || (m->state() == MemberState::RS_SECONDARY && m->hbinfo().opTime > lastOpTimeWritten)) && (!closest || m->hbinfo().ping < closest->hbinfo().ping)) { diff --git a/db/repl/rs_initiate.cpp b/db/repl/rs_initiate.cpp index 5dd0ab23d24..0a796e1e445 100644 --- a/db/repl/rs_initiate.cpp +++ b/db/repl/rs_initiate.cpp @@ -150,7 +150,7 @@ namespace mongo { h << "Initiate/christen a replica set."; h << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands"; } - virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { log() << "replSet replSetInitiate admin command received from client" << rsLog; if( !replSet ) { diff --git a/db/repl/rs_member.h b/db/repl/rs_member.h index 8e5a8ad9da3..d60bb5261e9 100644 --- a/db/repl/rs_member.h +++ b/db/repl/rs_member.h @@ -80,7 +80,8 @@ namespace mongo { DiagStr lastHeartbeatMsg; OpTime opTime; int skew; - unsigned int ping; // microseconds + unsigned int ping; // milliseconds + static unsigned int numPings; bool up() const { return health > 0; } diff --git a/db/repl/rs_rollback.cpp b/db/repl/rs_rollback.cpp index 67d6cc26f07..cce5c091074 100644 --- a/db/repl/rs_rollback.cpp +++ b/db/repl/rs_rollback.cpp @@ -574,7 +574,7 @@ namespace mongo { sethbmsg("rollback 6"); // clean up oplog - log(2) << "replSet rollback truncate oplog after " << h.commonPoint.toStringPretty() << rsLog; + LOG(2) << "replSet rollback truncate oplog 
after " << h.commonPoint.toStringPretty() << rsLog; // todo: fatal error if this throws? oplogDetails->cappedTruncateAfter(rsoplog, h.commonPointOurDiskloc, false); diff --git a/db/repl/rs_sync.cpp b/db/repl/rs_sync.cpp index 95bbe2040a6..5fe3075c0f7 100644 --- a/db/repl/rs_sync.cpp +++ b/db/repl/rs_sync.cpp @@ -188,6 +188,16 @@ namespace mongo { */ bool ReplSetImpl::tryToGoLiveAsASecondary(OpTime& /*out*/ minvalid) { bool golive = false; + + { + lock lk( this ); + + if (_maintenanceMode > 0) { + // we're not actually going live + return true; + } + } + { readlock lk("local.replset.minvalid"); BSONObj mv; @@ -211,7 +221,7 @@ namespace mongo { BSONObj remoteOldestOp = r.findOne(rsoplog, Query()); OpTime ts = remoteOldestOp["ts"]._opTime(); DEV log() << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog; - else log(3) << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog; + else LOG(3) << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog; DEV { log() << "replSet lastOpTimeWritten: " << lastOpTimeWritten.toStringLong() << rsLog; log() << "replSet our state: " << state().toString() << rsLog; @@ -251,7 +261,7 @@ namespace mongo { assert(r.conn() == 0); if( !r.connect(hn) ) { - log(2) << "replSet can't connect to " << hn << " to read operations" << rsLog; + LOG(2) << "replSet can't connect to " << hn << " to read operations" << rsLog; r.resetConnection(); return false; } @@ -407,7 +417,7 @@ namespace mongo { } - { + try { writelock lk(""); /* if we have become primary, we dont' want to apply things from elsewhere @@ -421,11 +431,16 @@ namespace mongo { syncApply(o); _logOpObjRS(o); // with repl sets we write the ops to our oplog too } + catch (DBException& e) { + sethbmsg(str::stream() << "syncTail: " << e.toString() << ", syncing: " << o); + sleepsecs(30); + return; + } } } r.tailCheck(); if( !r.haveCursor() ) { - log(1) << "replSet end syncTail pass with " << hn << rsLog; + LOG(1) << "replSet end syncTail pass with " << hn << rsLog; // TODO : reuse our connection to the primary. return; } @@ -475,9 +490,7 @@ namespace mongo { _syncThread(); } catch(DBException& e) { - sethbmsg(str::stream() << "syncThread: " << e.toString() << - ", try 'use local; db.oplog.rs.findOne({ts : {$gt : new Timestamp(" << - lastOpTimeWritten.getSecs() << "000," << lastOpTimeWritten.getInc() << ")}});' on the primary"); + sethbmsg(str::stream() << "syncThread: " << e.toString()); sleepsecs(10); } catch(...) 
diff --git a/db/scanandorder.cpp b/db/scanandorder.cpp
new file mode 100644
index 00000000000..efa9c8d7f13
--- /dev/null
+++ b/db/scanandorder.cpp
@@ -0,0 +1,93 @@
+/* scanandorder.cpp
+   Order results (that aren't already indexed and in order).
+*/
+
+/**
+ *    Copyright (C) 2008 10gen Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the GNU Affero General Public License, version 3,
+ *    as published by the Free Software Foundation.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU Affero General Public License for more details.
+ *
+ *    You should have received a copy of the GNU Affero General Public License
+ *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "scanandorder.h"
+
+namespace mongo {
+
+    const unsigned ScanAndOrder::MaxScanAndOrderBytes = 32 * 1024 * 1024;
+
+    void ScanAndOrder::_add(BSONObj& k, BSONObj o, DiskLoc* loc) {
+        if (!loc) {
+            _best.insert(make_pair(k.getOwned(),o.getOwned()));
+        }
+        else {
+            BSONObjBuilder b;
+            b.appendElements(o);
+            b.append("$diskLoc", loc->toBSONObj());
+            _best.insert(make_pair(k.getOwned(), b.obj().getOwned()));
+        }
+    }
+
+    void ScanAndOrder::_addIfBetter(BSONObj& k, BSONObj o, BestMap::iterator i, DiskLoc* loc) {
+        /* todo : we don't correct _approxSize here. */
+        const BSONObj& worstBestKey = i->first;
+        int c = worstBestKey.woCompare(k, _order._spec.keyPattern);
+        if ( c > 0 ) {
+            // k is better, 'upgrade'
+            _best.erase(i);
+            _add(k, o, loc);
+        }
+    }
+
+
+    void ScanAndOrder::add(BSONObj o, DiskLoc* loc) {
+        assert( o.isValid() );
+        BSONObj k = _order.getKeyFromObject(o);
+        if ( k.isEmpty() ) {
+            return;
+        }
+        if ( (int) _best.size() < _limit ) {
+            _approxSize += k.objsize();
+            _approxSize += o.objsize();
+
+            /* note : adjust when bson return limit adjusts. note this limit should be a bit higher. */
+            uassert( 10128 , "too much data for sort() with no index.  add an index or specify a smaller limit", _approxSize < MaxScanAndOrderBytes );
+
+            _add(k, o, loc);
+            return;
+        }
+        BestMap::iterator i;
+        assert( _best.end() != _best.begin() );
+        i = _best.end();
+        i--;
+        _addIfBetter(k, o, i, loc);
+    }
+
+
+    void ScanAndOrder::fill(BufBuilder& b, Projection *filter, int& nout ) const {
+        int n = 0;
+        int nFilled = 0;
+        for ( BestMap::const_iterator i = _best.begin(); i != _best.end(); i++ ) {
+            n++;
+            if ( n <= _startFrom )
+                continue;
+            const BSONObj& o = i->second;
+            fillQueryResultFromObj(b, filter, o);
+            nFilled++;
+            if ( nFilled >= _limit )
+                break;
+            uassert( 10129 , "too much data for sort() with no index", b.len() < (int)MaxScanAndOrderBytes ); // appserver limit
+        }
+        nout = nFilled;
+    }
+
+} // namespace mongo
diff --git a/db/scanandorder.h b/db/scanandorder.h
index 2957ae60245..33e76f61f67 100644
--- a/db/scanandorder.h
+++ b/db/scanandorder.h
@@ -22,6 +22,7 @@
 
 #include "indexkey.h"
 #include "queryutil.h"
+#include "projection.h"
 
 namespace mongo {
 
@@ -76,30 +77,9 @@ namespace mongo {
     typedef multimap<BSONObj,BSONObj,BSONObjCmp> BestMap;
     class ScanAndOrder {
-        void _add(BSONObj& k, BSONObj o, DiskLoc* loc) {
-            if (!loc) {
-                _best.insert(make_pair(k.getOwned(),o.getOwned()));
-            }
-            else {
-                BSONObjBuilder b;
-                b.appendElements(o);
-                b.append("$diskLoc", loc->toBSONObj());
-                _best.insert(make_pair(k.getOwned(), b.obj().getOwned()));
-            }
-        }
-
-        void _addIfBetter(BSONObj& k, BSONObj o, BestMap::iterator i, DiskLoc* loc) {
-            /* todo : we don't correct _approxSize here. */
-            const BSONObj& worstBestKey = i->first;
-            int c = worstBestKey.woCompare(k, _order._spec.keyPattern);
-            if ( c > 0 ) {
-                // k is better, 'upgrade'
-                _best.erase(i);
-                _add(k, o, loc);
-            }
-        }
-
     public:
+        static const unsigned MaxScanAndOrderBytes;
+
         ScanAndOrder(int startFrom, int limit, BSONObj order, const FieldRangeSet &frs) :
             _best( BSONObjCmp( order ) ),
             _startFrom(startFrom), _order(order, frs) {
@@ -107,60 +87,25 @@ namespace mongo {
             _approxSize = 0;
         }
 
-        int size() const {
-            return _best.size();
-        }
-
-        void add(BSONObj o, DiskLoc* loc) {
-            assert( o.isValid() );
-            BSONObj k = _order.getKeyFromObject(o);
-            if ( k.isEmpty() ) {
-                return;
-            }
-            if ( (int) _best.size() < _limit ) {
-                _approxSize += k.objsize();
-                _approxSize += o.objsize();
-
-                /* note : adjust when bson return limit adjusts. note this limit should be a bit higher. */
-                uassert( 10128 , "too much data for sort() with no index.  add an index or specify a smaller limit", _approxSize < 32 * 1024 * 1024 );
-
-                _add(k, o, loc);
-                return;
-            }
-            BestMap::iterator i;
-            assert( _best.end() != _best.begin() );
-            i = _best.end();
-            i--;
-            _addIfBetter(k, o, i, loc);
-        }
+        int size() const { return _best.size(); }
 
-        void _fill(BufBuilder& b, Projection *filter, int& nout, BestMap::iterator begin, BestMap::iterator end) {
-            int n = 0;
-            int nFilled = 0;
-            for ( BestMap::iterator i = begin; i != end; i++ ) {
-                n++;
-                if ( n <= _startFrom )
-                    continue;
-                BSONObj& o = i->second;
-                fillQueryResultFromObj(b, filter, o);
-                nFilled++;
-                if ( nFilled >= _limit )
-                    break;
-                uassert( 10129 , "too much data for sort() with no index", b.len() < 4000000 ); // appserver limit
-            }
-            nout = nFilled;
-        }
+        void add(BSONObj o, DiskLoc* loc);
 
         /* scanning complete. stick the query result in b for n objects. */
-        void fill(BufBuilder& b, Projection *filter, int& nout) {
-            _fill(b, filter, nout, _best.begin(), _best.end());
-        }
-
+        void fill(BufBuilder& b, Projection *filter, int& nout ) const;
+
+    private:
+
+        void _add(BSONObj& k, BSONObj o, DiskLoc* loc);
+
+        void _addIfBetter(BSONObj& k, BSONObj o, BestMap::iterator i, DiskLoc* loc);
+
         BestMap _best; // key -> full object
         int _startFrom;
         int _limit;   // max to send back.
         KeyType _order;
         unsigned _approxSize;
+
     };
 
 } // namespace mongo
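ScanAndOrder, moved out of the header above, is a bounded top-k structure: it keeps at most _limit documents ordered by sort key and caps memory at MaxScanAndOrderBytes; once full, a new document only enters by evicting the current worst. The core idea in isolation (simplified stand-in types, not the server's code):

    #include <iostream>
    #include <map>
    #include <string>

    typedef std::multimap<int, std::string> BestMap; // sort key -> payload

    void addTopK(BestMap& best, size_t limit, int key, const std::string& doc) {
        if (best.size() < limit) {
            best.insert(std::make_pair(key, doc));
            return;
        }
        BestMap::iterator worst = --best.end();   // largest key is worst for ascending order
        if (key < worst->first) {                 // new doc beats the worst kept one: 'upgrade'
            best.erase(worst);
            best.insert(std::make_pair(key, doc));
        }
    }

    int main() {
        BestMap best;
        int keys[] = { 9, 3, 7, 1, 8, 2 };
        for (int i = 0; i < 6; ++i)
            addTopK(best, 3, keys[i], "doc");
        for (BestMap::iterator i = best.begin(); i != best.end(); ++i)
            std::cout << i->first << '\n';        // prints 1 2 3
        return 0;
    }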
diff --git a/db/security.cpp b/db/security.cpp
index 4a6f32600aa..b57326a8233 100644
--- a/db/security.cpp
+++ b/db/security.cpp
@@ -30,7 +30,7 @@ namespace mongo {
 
     bool AuthenticationInfo::_warned = false;
-
+    /*
     void AuthenticationInfo::print() const {
         cout << "AuthenticationInfo: " << this << '\n';
         for ( MA::const_iterator i=_dbs.begin(); i!=_dbs.end(); i++ ) {
@@ -38,7 +38,7 @@ namespace mongo {
         }
         cout << "END" << endl;
     }
-
+    */
 
     string AuthenticationInfo::getUser( const string& dbname ) const {
         scoped_spinlock lk(_lock);
@@ -78,9 +78,9 @@ namespace mongo {
             pwd = internalSecurity.pwd;
         }
         else {
-            static BSONObj userPattern = fromjson("{\"user\":1}");
+            // static BSONObj userPattern = fromjson("{\"user\":1}");
             string systemUsers = dbname + ".system.users";
-            OCCASIONALLY Helpers::ensureIndex(systemUsers.c_str(), userPattern, false, "user_1");
+            // OCCASIONALLY Helpers::ensureIndex(systemUsers.c_str(), userPattern, false, "user_1");
             {
                 BSONObjBuilder b;
                 b << "user" << user;
@@ -107,7 +107,7 @@ namespace mongo {
         }
     }
 
-    bool CmdLogout::run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+    bool CmdLogout::run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
         AuthenticationInfo *ai = cc().getAuthenticationInfo();
         ai->logout(dbname);
         return true;
diff --git a/db/security.h b/db/security.h
index 2937ef29f80..2937ef29f80 100644..100755
--- a/db/security.h
+++ b/db/security.h
diff --git a/db/security_commands.cpp b/db/security_commands.cpp
index 16face7fc32..2db96802404 100644
--- a/db/security_commands.cpp
+++ b/db/security_commands.cpp
@@ -56,7 +56,7 @@ namespace mongo {
         void help(stringstream& h) const { h << "internal"; }
         virtual LockType locktype() const { return NONE; }
         CmdGetNonce() : Command("getnonce") {}
-        bool run(const string&, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+        bool run(const string&, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
             nonce64 *n = new nonce64(Security::getNonce());
             stringstream ss;
             ss << hex << *n;
@@ -68,7 +68,7 @@ namespace mongo {
 
     CmdLogout cmdLogout;
 
-    bool CmdAuthenticate::run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+    bool CmdAuthenticate::run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
         log() << " authenticate: " << cmdObj << endl;
 
         string user = cmdObj.getStringField("user");
diff --git a/db/security_common.h b/db/security_common.h
index 3af70cc7b97..2f2565f3ce0 100644
--- a/db/security_common.h
+++ b/db/security_common.h
@@ -57,10 +57,10 @@ namespace mongo {
         virtual bool slaveOk() const {
             return true;
         }
-        virtual LockType locktype() const { return WRITE; }
+        virtual LockType locktype() const { return READ; }
         virtual void help(stringstream& ss) const { ss << "internal"; }
         CmdAuthenticate() : Command("authenticate") {}
-        bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl);
+        bool run(const string& dbname , BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl);
     private:
         bool getUserObj(const string& dbname, const string& user, BSONObj& userObj, string& pwd);
         void authenticate(const string& dbname, const string& user, const bool readOnly);
@@ -77,7 +77,7 @@ namespace mongo {
         void help(stringstream& h) const { h << "de-authenticate"; }
         virtual LockType locktype() const { return NONE; }
         CmdLogout() : Command("logout") {}
-        bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl);
+        bool run(const string& dbname , BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl);
     };
 
 } // namespace mongo
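Throughout this patch, Command::run implementations gain an int parameter (options) between cmdObj and errmsg, and CmdAuthenticate's lock type is relaxed from WRITE to READ. A schematic of the signature change, using a toy base class rather than the real Command API:

    #include <string>

    struct CommandSketch {
        // old: run(dbname, cmdObj, errmsg, result, fromRepl)
        // new: run(dbname, cmdObj, options, errmsg, result, fromRepl)
        virtual bool run(const std::string& dbname, int options, std::string& errmsg) = 0;
        virtual ~CommandSketch() {}
    };

    struct LogoutSketch : CommandSketch {
        // most commands, like CmdLogout above, simply ignore the new argument
        bool run(const std::string&, int /*options*/, std::string&) { return true; }
    };

    int main() {
        LogoutSketch cmd;
        std::string errmsg;
        return cmd.run("admin", 0, errmsg) ? 0 : 1;
    }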
diff --git a/db/stats/top.cpp b/db/stats/top.cpp
index 51a270c8c8c..f5b6ee42f1c 100644
--- a/db/stats/top.cpp
+++ b/db/stats/top.cpp
@@ -156,7 +156,7 @@ namespace mongo {
         virtual LockType locktype() const { return READ; }
         virtual void help( stringstream& help ) const { help << "usage by collection, in micros "; }
-        virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+        virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
             {
                 BSONObjBuilder b( result.subobjStart( "totals" ) );
                 b.append( "note" , "all times in microseconds" );
diff --git a/dbtests/basictests.cpp b/dbtests/basictests.cpp
index 299dc4352ad..80bd7d70892 100644
--- a/dbtests/basictests.cpp
+++ b/dbtests/basictests.cpp
@@ -26,6 +26,7 @@
 #include "../util/queue.h"
 #include "../util/paths.h"
 #include "../util/stringutils.h"
+#include "../util/compress.h"
 #include "../db/db.h"
 
 namespace BasicTests {
@@ -411,6 +412,21 @@ namespace BasicTests {
             ASSERT_EQUALS( -1 , lexNumCmp( "a.b.c.d0" , "a.b.c.d00" ) );
             ASSERT_EQUALS( 1 , lexNumCmp( "a.b.c.0.y" , "a.b.c.00.x" ) );
 
+            ASSERT_EQUALS( -1, lexNumCmp( "a", "a-" ) );
+            ASSERT_EQUALS( 1, lexNumCmp( "a-", "a" ) );
+            ASSERT_EQUALS( 0, lexNumCmp( "a-", "a-" ) );
+
+            ASSERT_EQUALS( -1, lexNumCmp( "a", "a-c" ) );
+            ASSERT_EQUALS( 1, lexNumCmp( "a-c", "a" ) );
+            ASSERT_EQUALS( 0, lexNumCmp( "a-c", "a-c" ) );
+
+            ASSERT_EQUALS( 1, lexNumCmp( "a-c.t", "a.t" ) );
+            ASSERT_EQUALS( -1, lexNumCmp( "a.t", "a-c.t" ) );
+            ASSERT_EQUALS( 0, lexNumCmp( "a-c.t", "a-c.t" ) );
+
+            ASSERT_EQUALS( 1, lexNumCmp( "ac.t", "a.t" ) );
+            ASSERT_EQUALS( -1, lexNumCmp( "a.t", "ac.t" ) );
+            ASSERT_EQUALS( 0, lexNumCmp( "ac.t", "ac.t" ) );
         }
     };
 
@@ -596,6 +612,40 @@ namespace BasicTests {
         }
     };
 
+    class CmdLineParseConfigTest {
+    public:
+        void run() {
+            stringstream ss1;
+            istringstream iss1("");
+            CmdLine::parseConfigFile( iss1, ss1 );
+            stringstream ss2;
+            istringstream iss2("password=\'foo bar baz\'");
+            CmdLine::parseConfigFile( iss2, ss2 );
+            stringstream ss3;
+            istringstream iss3("\t this = false \n#that = true\n #another = whocares\n\n other = monkeys ");
+            CmdLine::parseConfigFile( iss3, ss3 );
+
+            ASSERT( ss1.str().compare("\n") == 0 );
+            ASSERT( ss2.str().compare("password=\'foo bar baz\'\n\n") == 0 );
+            ASSERT( ss3.str().compare("\n other = monkeys \n\n") == 0 );
+        }
+    };
+
+    struct CompressionTest1 {
+        void run() {
+            const char * c = "this is a test";
+            std::string s;
+            size_t len = compress(c, strlen(c)+1, &s);
+            assert( len > 0 );
+
+            std::string out;
+            bool ok = uncompress(s.c_str(), s.size(), &out);
+            assert(ok);
+            assert( strcmp(out.c_str(), c) == 0 );
+        }
+    } ctest1;
+
+
     class All : public Suite {
     public:
         All() : Suite( "basic" ) {
@@ -632,6 +682,9 @@ namespace BasicTests {
             add< HostAndPortTests >();
             add< RelativePathTest >();
+            add< CmdLineParseConfigTest >();
+
+            add< CompressionTest1 >();
         }
     } myall;
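CompressionTest1 exercises the new compress/uncompress wrappers in util/compress, which appear to sit on top of the snappy sources bundled under third_party/snappy elsewhere in this change. A round trip against snappy's own API (assuming the bundled headers are on the include path) looks nearly identical:

    #include <snappy.h>
    #include <cassert>
    #include <cstring>
    #include <string>

    int main() {
        const char* c = "this is a test";
        std::string compressed;
        size_t len = snappy::Compress(c, strlen(c) + 1, &compressed);
        assert(len > 0 && len == compressed.size());

        std::string out;
        bool ok = snappy::Uncompress(compressed.data(), compressed.size(), &out);
        assert(ok && memcmp(out.data(), c, strlen(c) + 1) == 0);
        return 0;
    }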
diff --git a/dbtests/cursortests.cpp b/dbtests/cursortests.cpp
index 4d2de164165..cf661864b95 100644
--- a/dbtests/cursortests.cpp
+++ b/dbtests/cursortests.cpp
@@ -33,6 +33,7 @@ namespace CursorTests {
 
         class Base {
         protected:
+            static const char *ns() { return "unittests.cursortests.Base"; }
            FieldRangeVector *vec( int *vals, int len, int direction = 1 ) {
                FieldRangeSet s( "", BSON( "a" << 1 ), true );
                for( int i = 0; i < len; i += 2 ) {
@@ -49,6 +50,7 @@ namespace CursorTests {
                IndexSpec *idxSpec = new IndexSpec( BSON( "a" << 1 ) );
                return new FieldRangeVector( s, *idxSpec, direction );
            }
+            DBDirectClient _c;
        private:
            vector< BSONObj > _objs;
        };
@@ -258,6 +260,29 @@ namespace CursorTests {
            }
            virtual BSONObj idx() const { return BSON( "a" << 1 << "b" << 1 ); }
        };
+
+        class AbortImplicitScan : public Base {
+        public:
+            void run() {
+                dblock lk;
+                IndexSpec idx( BSON( "a" << 1 << "b" << 1 ) );
+                _c.ensureIndex( ns(), idx.keyPattern );
+                for( int i = 0; i < 300; ++i ) {
+                    _c.insert( ns(), BSON( "a" << i << "b" << 5 ) );
+                }
+                FieldRangeSet frs( ns(), BSON( "b" << 3 ), true );
+                boost::shared_ptr<FieldRangeVector> frv( new FieldRangeVector( frs, idx, 1 ) );
+                Client::Context ctx( ns() );
+                scoped_ptr<BtreeCursor> c( BtreeCursor::make( nsdetails( ns() ), 1, nsdetails( ns() )->idx(1), frv, 1 ) );
+                int initialNscanned = c->nscanned();
+                ASSERT( initialNscanned < 200 );
+                ASSERT( c->ok() );
+                c->advance();
+                ASSERT( c->nscanned() > initialNscanned );
+                ASSERT( c->nscanned() < 200 );
+                ASSERT( c->ok() );
+            }
+        };
 
    } // namespace BtreeCursorTests
 
@@ -274,6 +299,7 @@ namespace CursorTests {
            add< BtreeCursorTests::EqIn >();
            add< BtreeCursorTests::RangeEq >();
            add< BtreeCursorTests::RangeIn >();
+            add< BtreeCursorTests::AbortImplicitScan >();
        }
    } myall;
} // namespace CursorTests
diff --git a/dbtests/directclienttests.cpp b/dbtests/directclienttests.cpp
index 5b3bde70889..860eb7e7e5c 100644
--- a/dbtests/directclienttests.cpp
+++ b/dbtests/directclienttests.cpp
@@ -84,7 +84,7 @@ namespace DirectClientTests {
            ASSERT_EQUALS((int)client().count(ns), 1);
 
            client().dropCollection(ns);
-            client().insert(ns, objs, InsertOption_KeepGoing);
+            client().insert(ns, objs, InsertOption_ContinueOnError);
            ASSERT_EQUALS(client().getLastErrorDetailed()["code"].numberInt(), 11000);
            ASSERT_EQUALS((int)client().count(ns), 2);
        }
diff --git a/dbtests/framework.cpp b/dbtests/framework.cpp
index 99fcad51d97..95ed8b33668 100644
--- a/dbtests/framework.cpp
+++ b/dbtests/framework.cpp
@@ -209,6 +209,7 @@ namespace mongo {
 
            hidden_options.add_options()
            ("suites", po::value< vector<string> >(), "test suites to run")
+            ("nopreallocj", "disable journal prealloc")
            ;
 
            positional_options.add("suites", -1);
@@ -247,6 +248,10 @@ namespace mongo {
                cmdLine.dur = true;
            }
 
+            if( params.count("nopreallocj") ) {
+                cmdLine.preallocj = false;
+            }
+
            if (params.count("debug") || params.count("verbose") ) {
                logLevel = 1;
            }
diff --git a/dbtests/jsobjtests.cpp b/dbtests/jsobjtests.cpp
index 9f00d4cabce..034bb97c620 100644
--- a/dbtests/jsobjtests.cpp
+++ b/dbtests/jsobjtests.cpp
@@ -569,6 +569,13 @@ namespace JsobjTests {
            }
 
            {
+                BSONObjBuilder b;
+                b.appendBinData("f", 33, (BinDataType) 1, "123456789012345678901234567890123");
+                BSONObj o = b.obj();
+                keyTest( o, false );
+            }
+
+            {
                for( int i = 1; i <= 3; i++ ) {
                    for( int j = 1; j <= 3; j++ ) {
                        BSONObjBuilder b;
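The rename InsertOption_KeepGoing to InsertOption_ContinueOnError in directclienttests.cpp names the semantics the test asserts: a batch insert records the duplicate-key error (code 11000) but still applies the remaining documents. A toy model of that behavior, with a std::set standing in for the collection:

    #include <iostream>
    #include <set>
    #include <vector>

    int insertBatchContinueOnError(std::set<int>& coll, const std::vector<int>& batch) {
        int firstError = 0;
        for (size_t i = 0; i < batch.size(); ++i) {
            if (!coll.insert(batch[i]).second && firstError == 0)
                firstError = 11000;   // duplicate key; keep applying the rest anyway
        }
        return firstError;            // what getLastError would later report
    }

    int main() {
        std::set<int> coll;
        std::vector<int> batch;
        batch.push_back(1); batch.push_back(1); batch.push_back(2);
        std::cout << insertBatchContinueOnError(coll, batch)
                  << ' ' << coll.size() << '\n';  // 11000 2
        return 0;
    }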
diff --git a/dbtests/namespacetests.cpp b/dbtests/namespacetests.cpp
index 392917dd6d3..bbb8f5e596e 100644
--- a/dbtests/namespacetests.cpp
+++ b/dbtests/namespacetests.cpp
@@ -44,12 +44,13 @@ namespace NamespaceTests {
                ASSERT( theDataFileMgr.findAll( ns() )->eof() );
            }
        protected:
-            void create() {
+            void create( bool sparse = false ) {
                NamespaceDetailsTransient::get_w( ns() ).deletedIndex();
                BSONObjBuilder builder;
                builder.append( "ns", ns() );
                builder.append( "name", "testIndex" );
                builder.append( "key", key() );
+                builder.append( "sparse", sparse );
                BSONObj bobj = builder.done();
                id_.info = theDataFileMgr.insert( ns(), bobj.objdata(), bobj.objsize() );
                // head not needed for current tests
@@ -339,12 +340,13 @@ namespace NamespaceTests {
                    elts.push_back( simpleBC( i ) );
                BSONObjBuilder b;
                b.append( "a", elts );
-
+                BSONObj obj = b.obj();
+
                BSONObjSet keys;
-                id().getKeysFromObject( b.done(), keys );
+                id().getKeysFromObject( obj, keys );
                checkSize( 4, keys );
                BSONObjSet::iterator i = keys.begin();
-                assertEquals( nullObj(), *i++ );
+                assertEquals( nullObj(), *i++ ); // see SERVER-3377
                for ( int j = 1; j < 4; ++i, ++j ) {
                    BSONObjBuilder b;
                    b.append( "", j );
@@ -532,9 +534,49 @@ namespace NamespaceTests {
                id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
                checkSize(1, keys );
+                ASSERT_EQUALS( Undefined, keys.begin()->firstElement().type() );
                keys.clear();
            }
        };
+
+        class DoubleArray : Base {
+        public:
+            void run() {
+                create();
+
+                BSONObjSet keys;
+                id().getKeysFromObject( fromjson( "{a:[1,2]}" ), keys );
+                checkSize(2, keys );
+                BSONObjSet::const_iterator i = keys.begin();
+                ASSERT_EQUALS( BSON( "" << 1 << "" << 1 ), *i );
+                ++i;
+                ASSERT_EQUALS( BSON( "" << 2 << "" << 2 ), *i );
+                keys.clear();
+            }
+
+        protected:
+            BSONObj key() const {
+                return BSON( "a" << 1 << "a" << 1 );
+            }
+        };
+
+        class DoubleEmptyArray : Base {
+        public:
+            void run() {
+                create();
+
+                BSONObjSet keys;
+                id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+                checkSize(1, keys );
+                ASSERT_EQUALS( fromjson( "{'':undefined,'':undefined}" ), *keys.begin() );
+                keys.clear();
+            }
+
+        protected:
+            BSONObj key() const {
+                return BSON( "a" << 1 << "a" << 1 );
+            }
+        };
 
        class MultiEmptyArray : Base {
        public:
@@ -558,7 +600,9 @@ namespace NamespaceTests {
                id().getKeysFromObject( fromjson( "{a:1,b:[]}" ), keys );
                checkSize(1, keys );
                //cout << "YO : " << *(keys.begin()) << endl;
-                ASSERT_EQUALS( NumberInt , keys.begin()->firstElement().type() );
+                BSONObjIterator i( *keys.begin() );
+                ASSERT_EQUALS( NumberInt , i.next().type() );
+                ASSERT_EQUALS( Undefined , i.next().type() );
                keys.clear();
            }
 
@@ -567,8 +611,313 @@ namespace NamespaceTests {
                return aAndB();
            }
        };
+
+        class NestedEmptyArray : Base {
+        public:
+            void run() {
+                create();
+
+                BSONObjSet keys;
+                id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':null}" ), *keys.begin() );
+                keys.clear();
+            }
+        protected:
+            BSONObj key() const { return BSON( "a.b" << 1 ); }
+        };
+
+        class MultiNestedEmptyArray : Base {
+        public:
+            void run() {
+                create();
+
+                BSONObjSet keys;
+                id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':null,'':null}" ), *keys.begin() );
+                keys.clear();
+            }
+        protected:
+            BSONObj key() const { return BSON( "a.b" << 1 << "a.c" << 1 ); }
+        };
+
+        class UnevenNestedEmptyArray : public Base {
+        public:
+            void run() {
+                create();
+
+                BSONObjSet keys;
+                id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':undefined,'':null}" ), *keys.begin() );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[{b:1}]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':{b:1},'':1}" ), *keys.begin() );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[{b:[]}]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':{b:[]},'':undefined}" ), *keys.begin() );
+                keys.clear();
+            }
+        protected:
+            BSONObj key() const { return BSON( "a" << 1 << "a.b" << 1 ); }
+        };
+
+        class ReverseUnevenNestedEmptyArray : public Base {
+        public:
+            void run() {
+                create();
+
+                BSONObjSet keys;
+                id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':null,'':undefined}" ), *keys.begin() );
+                keys.clear();
+            }
+        protected:
+            BSONObj key() const { return BSON( "a.b" << 1 << "a" << 1 ); }
+        };
+
+        class SparseReverseUnevenNestedEmptyArray : public Base {
+        public:
+            void run() {
+                create( true );
+
+                BSONObjSet keys;
+                id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':null,'':undefined}" ), *keys.begin() );
+                keys.clear();
+            }
+        protected:
+            BSONObj key() const { return BSON( "a.b" << 1 << "a" << 1 ); }
+        };
+
+        class SparseEmptyArray : public Base {
+        public:
+            void run() {
+                create( true );
+
+                BSONObjSet keys;
+                id().getKeysFromObject( fromjson( "{a:1}" ), keys );
+                checkSize( 0, keys );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+                checkSize( 0, keys );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[{c:1}]}" ), keys );
+                checkSize( 0, keys );
+                keys.clear();
+            }
+        protected:
+            BSONObj key() const { return BSON( "a.b" << 1 ); }
+        };
+
+        class SparseEmptyArraySecond : public Base {
+        public:
+            void run() {
+                create( true );
+
+                BSONObjSet keys;
+                id().getKeysFromObject( fromjson( "{a:1}" ), keys );
+                checkSize( 0, keys );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+                checkSize( 0, keys );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[{c:1}]}" ), keys );
+                checkSize( 0, keys );
+                keys.clear();
+            }
+        protected:
+            BSONObj key() const { return BSON( "z" << 1 << "a.b" << 1 ); }
+        };
+
+        class NonObjectMissingNestedField : public Base {
+        public:
+            void run() {
+                create();
+
+                BSONObjSet keys;
+                id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':null}" ), *keys.begin() );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[1]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':null}" ), *keys.begin() );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[1,{b:1}]}" ), keys );
+                checkSize( 2, keys );
+                BSONObjSet::const_iterator c = keys.begin();
+                ASSERT_EQUALS( fromjson( "{'':null}" ), *c );
+                ++c;
+                ASSERT_EQUALS( fromjson( "{'':1}" ), *c );
+                keys.clear();
+            }
+        protected:
+            BSONObj key() const { return BSON( "a.b" << 1 ); }
+        };
+
+        class SparseNonObjectMissingNestedField : public Base {
+        public:
+            void run() {
+                create( true );
+
+                BSONObjSet keys;
+                id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+                checkSize( 0, keys );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[1]}" ), keys );
+                checkSize( 0, keys );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[1,{b:1}]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':1}" ), *keys.begin() );
+                keys.clear();
+            }
+        protected:
+            BSONObj key() const { return BSON( "a.b" << 1 ); }
+        };
+
+        class IndexedArrayIndex : public Base {
+        public:
+            void run() {
+                create();
+
+                BSONObjSet keys;
+                id().getKeysFromObject( fromjson( "{a:[1]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( BSON( "" << 1 ), *keys.begin() );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[[1]]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':[1]}" ), *keys.begin() );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[[]]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':undefined}" ), *keys.begin() );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:{'0':1}}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( BSON( "" << 1 ), *keys.begin() );
+                keys.clear();
+
+                ASSERT_EXCEPTION( id().getKeysFromObject( fromjson( "{a:[{'0':1}]}" ), keys ), UserException );
+
+                ASSERT_EXCEPTION( id().getKeysFromObject( fromjson( "{a:[1,{'0':2}]}" ), keys ), UserException );
+            }
+        protected:
+            BSONObj key() const { return BSON( "a.0" << 1 ); }
+        };
+
+        class DoubleIndexedArrayIndex : public Base {
+        public:
+            void run() {
+                create();
+
+                BSONObjSet keys;
+                id().getKeysFromObject( fromjson( "{a:[[1]]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':1}" ), *keys.begin() );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[[]]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':null}" ), *keys.begin() );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':null}" ), *keys.begin() );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[[[]]]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':undefined}" ), *keys.begin() );
+                keys.clear();
+            }
+        protected:
+            BSONObj key() const { return BSON( "a.0.0" << 1 ); }
+        };
+
+        class ObjectWithinArray : public Base {
+        public:
+            void run() {
+                create();
+
+                BSONObjSet keys;
+                id().getKeysFromObject( fromjson( "{a:[{b:1}]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':1}" ), *keys.begin() );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[{b:[1]}]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':1}" ), *keys.begin() );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[{b:[[1]]}]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':[1]}" ), *keys.begin() );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[[{b:1}]]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':1}" ), *keys.begin() );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[[{b:[1]}]]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':1}" ), *keys.begin() );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[[{b:[[1]]}]]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':[1]}" ), *keys.begin() );
+                keys.clear();
+
+                id().getKeysFromObject( fromjson( "{a:[[{b:[]}]]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':undefined}" ), *keys.begin() );
+                keys.clear();
+            }
+        protected:
+            BSONObj key() const { return BSON( "a.0.b" << 1 ); }
+        };
+
+        class ArrayWithinObjectWithinArray : public Base {
+        public:
+            void run() {
+                create();
+
+                BSONObjSet keys;
+                id().getKeysFromObject( fromjson( "{a:[{b:[1]}]}" ), keys );
+                checkSize( 1, keys );
+                ASSERT_EQUALS( fromjson( "{'':1}" ), *keys.begin() );
+                keys.clear();
+            }
+        protected:
+            BSONObj key() const { return BSON( "a.0.b.0" << 1 ); }
+        };
+
+        // also test numeric string field names
 
    } // namespace IndexDetailsTests
 
    namespace NamespaceDetailsTests {
@@ -862,7 +1211,22 @@ namespace NamespaceTests {
            add< IndexDetailsTests::AlternateMissing >();
            add< IndexDetailsTests::MultiComplex >();
            add< IndexDetailsTests::EmptyArray >();
+            add< IndexDetailsTests::DoubleArray >();
+            add< IndexDetailsTests::DoubleEmptyArray >();
            add< IndexDetailsTests::MultiEmptyArray >();
+            add< IndexDetailsTests::NestedEmptyArray >();
+            add< IndexDetailsTests::MultiNestedEmptyArray >();
+            add< IndexDetailsTests::UnevenNestedEmptyArray >();
+            add< IndexDetailsTests::ReverseUnevenNestedEmptyArray >();
+            add< IndexDetailsTests::SparseReverseUnevenNestedEmptyArray >();
+            add< IndexDetailsTests::SparseEmptyArray >();
+            add< IndexDetailsTests::SparseEmptyArraySecond >();
+            add< IndexDetailsTests::NonObjectMissingNestedField >();
+            add< IndexDetailsTests::SparseNonObjectMissingNestedField >();
+            add< IndexDetailsTests::IndexedArrayIndex >();
+            add< IndexDetailsTests::DoubleIndexedArrayIndex >();
+            add< IndexDetailsTests::ObjectWithinArray >();
+            add< IndexDetailsTests::ArrayWithinObjectWithinArray >();
            add< IndexDetailsTests::MissingField >();
            add< IndexDetailsTests::SubobjectMissing >();
            add< IndexDetailsTests::CompoundMissing >();
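The new IndexDetails tests above pin down multikey extraction: an array field normally yields one index key per element, while an empty array yields a single placeholder key (type Undefined) so the document remains reachable through the index. A toy illustration of just that rule (simplified types; the server's real extraction also handles nesting, sparse indexes, and compound keys as the tests show):

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    std::vector<std::string> keysForArrayField(const std::vector<int>& elems) {
        std::vector<std::string> keys;
        if (elems.empty()) {
            keys.push_back("undefined");   // {a:[]} -> one placeholder key
            return keys;
        }
        for (size_t i = 0; i < elems.size(); ++i) {  // {a:[1,2]} -> keys "1","2"
            std::ostringstream os;
            os << elems[i];
            keys.push_back(os.str());
        }
        return keys;
    }

    int main() {
        std::vector<int> empty;
        std::vector<int> two;
        two.push_back(1); two.push_back(2);
        std::cout << keysForArrayField(empty).size() << ' '   // 1
                  << keysForArrayField(two).size() << '\n';   // 2
        return 0;
    }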
{ } + } + + virtual unsigned batchSize() { return 50; } + void say(unsigned long long n, int ms, string s) { unsigned long long rps = n*1000/ms; cout << "stats " << setw(33) << left << s << ' ' << right << setw(9) << rps << ' ' << right << setw(5) << ms << "ms "; @@ -142,124 +203,70 @@ namespace PerfTests { cout << dur::stats.curr->_asCSV(); cout << endl; - /* if you want recording of the timings, place the password for the perf database - in ./../settings.py: - pstatspassword="<pwd>" - */ - const char *fn = "../../settings.py"; - static bool ok = true; - if( ok ) { - DEV { - // no writing to perf db if dev - } - else if( !exists(fn) ) { - static int once; - if( exists("settings.py") ) - fn = "settings.py"; - else if( once++ == 0 ) { - cout << "no ../../settings.py or ./settings.py file found. will not write perf stats to pstats db." << endl; - cout << "it is recommended this be enabled even on dev boxes" << endl; - } - } - else { - try { - if( conn == 0 ) { - MemoryMappedFile f; - const char *p = (const char *) f.mapWithOptions(fn, MongoFile::READONLY); - string pwd; - - { - const char *q = str::after(p, "pstatspassword=\""); - if( *q == 0 ) { - cout << "info perftests.cpp: no pstatspassword= in settings.py" << endl; - ok = false; - } - else { - pwd = str::before(q, '\"'); - } - } + connect(); - if( ok ) { - conn = new DBClientConnection(false, 0, 10); - string err; - if( conn->connect("mongo05.10gen.cust.cbici.net", err) ) { - if( !conn->auth("perf", "perf", pwd, err) ) { - cout << "info: authentication with stats db failed: " << err << endl; - assert(false); - } - } - else { - cout << err << " (to log perfstats)" << endl; - ok = false; + if( conn && !conn->isFailed() ) { + const char *ns = "perf.pstats"; + if( perfHist ) { + static bool needver = true; + try { + // try to report rps from last time */ + Query q; + { + BSONObjBuilder b; + b.append("host",getHostName()).append("test",s).append("dur",cmdLine.dur); + DEV { b.append("info.DEBUG",true); } + else b.appendNull("info.DEBUG"); + if( sizeof(int*) == 4 ) + b.append("info.bits", 32); + else + b.appendNull("info.bits"); + q = Query(b.obj()).sort("when",-1); + } + BSONObj fields = BSON( "rps" << 1 << "info" << 1 ); + vector<BSONObj> v; + conn->findN(v, ns, q, perfHist, 0, &fields); + for( vector<BSONObj>::iterator i = v.begin(); i != v.end(); i++ ) { + BSONObj o = *i; + double lastrps = o["rps"].Number(); + if( lastrps ) { + cout << "stats " << setw(33) << right << "new/old:" << ' ' << setw(9); + cout << fixed << setprecision(2) << rps / lastrps; + if( needver ) { + cout << " " << o.getFieldDotted("info.git").toString(); } + cout << '\n'; } } - if( conn && !conn->isFailed() ) { - const char *ns = "perf.pstats"; - if( perfHist ) { - static bool needver = true; - try { - // try to report rps from last time */ - Query q; - { - BSONObjBuilder b; - b.append("host",getHostName()).append("test",s).append("dur",cmdLine.dur); - DEV b.append("info.DEBUG",true); - else b.appendNull("info.DEBUG"); - if( sizeof(int*) == 4 ) b.append("info.bits", 32); - else b.appendNull("info.bits"); - q = Query(b.obj()).sort("when",-1); - } - //cout << q.toString() << endl; - BSONObj fields = BSON( "rps" << 1 << "info" << 1 ); - vector<BSONObj> v; - conn->findN(v, ns, q, perfHist, 0, &fields); - for( vector<BSONObj>::iterator i = v.begin(); i != v.end(); i++ ) { - BSONObj o = *i; - double lastrps = o["rps"].Number(); - if( lastrps ) { - cout << "stats " << setw(33) << right << "new/old:" << ' ' << setw(9); - cout << fixed << setprecision(2) << rps / lastrps; - if( 
needver ) { - cout << " " << o.getFieldDotted("info.git").toString(); - } - cout << '\n'; - } - } - } catch(...) { } - cout.flush(); - needver = false; - } - { - bob b; - b.append("host", getHostName()); - b.appendTimeT("when", time(0)); - b.append("test", s); - b.append("rps", (int) rps); - b.append("millis", ms); - b.appendBool("dur", cmdLine.dur); - if( showDurStats() && cmdLine.dur ) - b.append("durStats", dur::stats.curr->_asObj()); - { - bob inf; - inf.append("version", versionString); - if( sizeof(int*) == 4 ) inf.append("bits", 32); - DEV inf.append("DEBUG", true); + } catch(...) { } + cout.flush(); + needver = false; + } + { + bob b; + b.append("host", getHostName()); + b.appendTimeT("when", time(0)); + b.append("test", s); + b.append("rps", (int) rps); + b.append("millis", ms); + b.appendBool("dur", cmdLine.dur); + if( showDurStats() && cmdLine.dur ) + b.append("durStats", dur::stats.curr->_asObj()); + { + bob inf; + inf.append("version", versionString); + if( sizeof(int*) == 4 ) inf.append("bits", 32); + DEV inf.append("DEBUG", true); #if defined(_WIN32) - inf.append("os", "win"); + inf.append("os", "win"); #endif - inf.append("git", gitVersion()); - inf.append("boost", BOOST_VERSION); - b.append("info", inf.obj()); - } - BSONObj o = b.obj(); - //cout << "inserting " << o.toString() << endl; - conn->insert(ns, o); - } - } - } - catch(...) { + inf.append("git", gitVersion()); + inf.append("boost", BOOST_VERSION); + b.append("info", inf.obj()); } + BSONObj o = b.obj(); + //cout << "inserting " << o.toString() << endl; + conn->insert(ns, o); } } } @@ -277,9 +284,9 @@ namespace PerfTests { dur::stats._intervalMicros = 0; // no auto rotate dur::stats.curr->reset(); - Timer t; + mongo::Timer t; unsigned long long n = 0; - const unsigned Batch = 50; + const unsigned Batch = batchSize(); if( hlm == 0 ) { // means just do once @@ -314,10 +321,10 @@ namespace PerfTests { post(); { - const char *test2name = timed2(); - if( test2name ) { + string test2name = timed2(); + if( test2name.size() != 0 ) { dur::stats.curr->reset(); - Timer t; + mongo::Timer t; unsigned long long n = 0; while( 1 ) { unsigned i; @@ -335,6 +342,7 @@ namespace PerfTests { }; DBClientConnection *B::conn; + unsigned B::once; unsigned dontOptimizeOutHopefully; @@ -598,6 +606,48 @@ namespace PerfTests { virtual bool showDurStats() { return false; } }; + class Compress : public B { + public: + const unsigned sz; + void *p; + Compress() : sz(1024*1024*100+3) { } + virtual unsigned batchSize() { return 1; } + string name() { return "compress"; } + virtual bool showDurStats() { return false; } + virtual int howLongMillis() { return 4000; } + unsigned long long expectation() { return 1000000; } + void prep() { + p = malloc(sz); + // this isn't a fair test as it is mostly rands but we just want a rough perf check + static int last; + for (unsigned i = 0; i<sz; i++) { + int r = rand(); + if( (r & 0x300) == 0x300 ) + r = last; + ((char*)p)[i] = r; + last = r; + } + } + size_t last; + string res; + void timed() { + mongo::Timer t; + string out; + size_t len = compress((const char *) p, sz, &out); + bool ok = uncompress(out.c_str(), out.size(), &res); + ASSERT(ok); + static unsigned once; + if( once++ == 0 ) + cout << "compress round trip " << sz/(1024.0*1024) / (t.millis()/1000.0) << "MB/sec\n"; + //cout << len / (1024.0/1024) << " compressed" << endl; + (void)len; //fix unused error while above line is commented out + } + void post() { + ASSERT( memcmp(res.c_str(), p, sz) == 0 ); + free(p); + } + }; + // test speed of checksum 
method class ChecksumTest : public B { public: @@ -607,6 +657,7 @@ namespace PerfTests { virtual int howLongMillis() { return 2000; } int expectationTimeMillis() { return 5000; } virtual bool showDurStats() { return false; } + virtual unsigned batchSize() { return 1; } void *p; @@ -684,7 +735,7 @@ namespace PerfTests { void timed() { client().insert( ns(), x ); } - const char * timed2() { + string timed2() { client().findOne(ns(), query); return "findOne_by_id"; } @@ -753,7 +804,7 @@ namespace PerfTests { client().update(ns(), q, y, /*upsert*/true); } - const char * timed2() { + virtual string timed2() { static BSONObj I = BSON( "$inc" << BSON( "y" << 1 ) ); // test some $inc's @@ -762,8 +813,7 @@ namespace PerfTests { BSONObj q = BSON("x" << x); client().update(ns(), q, I); - static string s = name()+"-inc"; - return s.c_str(); + return name()+"-inc"; } unsigned long long expectation() { return 1000; } @@ -778,6 +828,16 @@ namespace PerfTests { this->client().ensureIndex(this->ns(), BSON("y"<<1)); this->client().ensureIndex(this->ns(), BSON("z"<<1)); } + + /* + virtual string timed2() { + string x = T::timed2(); + if ( x.size() == 0 ) + return x; + + return x + "-with-more-indexes"; + } + */ }; void t() { @@ -822,6 +882,8 @@ namespace PerfTests { } else { add< Dummy >(); + add< ChecksumTest >(); + add< Compress >(); add< TLS >(); add< Malloc >(); add< Timer >(); @@ -838,7 +900,6 @@ namespace PerfTests { add< BSONIter >(); add< BSONGetFields1 >(); add< BSONGetFields2 >(); - add< ChecksumTest >(); add< TaskQueueTest >(); add< InsertDup >(); add< Insert1 >(); diff --git a/dbtests/queryoptimizertests.cpp b/dbtests/queryoptimizertests.cpp index bd597572d52..83a2d267c57 100644 --- a/dbtests/queryoptimizertests.cpp +++ b/dbtests/queryoptimizertests.cpp @@ -104,7 +104,7 @@ namespace QueryOptimizerTests { auto_ptr< FieldRangeSetPair > FieldRangeSetPair_GLOBAL; #define FRSP(x) ( FieldRangeSetPair_GLOBAL.reset( new FieldRangeSetPair( ns(), x ) ), *FieldRangeSetPair_GLOBAL ) auto_ptr< FieldRangeSetPair > FieldRangeSetPair_GLOBAL2; -#define FRSP2(x) ( FieldRangeSetPair_GLOBAL2.reset( new FieldRangeSetPair( ns(), x ) ), *FieldRangeSetPair_GLOBAL2 ) +#define FRSP2(x) ( FieldRangeSetPair_GLOBAL2.reset( new FieldRangeSetPair( ns(), x ) ), FieldRangeSetPair_GLOBAL2.get() ) class NoIndex : public Base { public: @@ -886,7 +886,7 @@ namespace QueryOptimizerTests { } BSONObj hint = fromjson( "{$hint:{a:1,b:1}}" ); auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), fromjson( "{a:5,b:{$in:[2,3,6,9,11]}}" ) ) ); - QueryPlan qp( nsd(), 1, *frsp, *frsp, fromjson( "{a:5,b:{$in:[2,3,6,9,11]}}" ), BSONObj() ); + QueryPlan qp( nsd(), 1, *frsp, frsp.get(), fromjson( "{a:5,b:{$in:[2,3,6,9,11]}}" ), BSONObj() ); boost::shared_ptr<Cursor> c = qp.newCursor(); double expected[] = { 2, 3, 6, 9 }; ASSERT( c->ok() ); @@ -908,7 +908,7 @@ namespace QueryOptimizerTests { } BSONObj hint = fromjson( "{$hint:{a:1,b:1}}" ); auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), fromjson( "{a:{$gte:5},b:{$in:[2,3,6,9,11]}}" ) ) ); - QueryPlan qp( nsd(), 1, *frsp, *frsp, fromjson( "{a:{$gte:5},b:{$in:[2,3,6,9,11]}}" ), BSONObj() ); + QueryPlan qp( nsd(), 1, *frsp, frsp.get(), fromjson( "{a:{$gte:5},b:{$in:[2,3,6,9,11]}}" ), BSONObj() ); boost::shared_ptr<Cursor> c = qp.newCursor(); int matches[] = { 2, 3, 6, 9 }; for( int i = 0; i < 4; ++i, c->advance() ) { @@ -1900,18 +1900,19 @@ namespace QueryOptimizerTests { public: void run() { _cli.createCollection( ns(), 1000, true ); - _cli.insert( ns(), BSON( "_id" << 1 
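The Compress benchmark above prints round-trip throughput as size over elapsed time. The arithmetic it performs, pulled out for clarity (same formula, illustrative values):

    #include <cstdio>

    // MB/sec = (bytes / 2^20) / (milliseconds / 1000)
    double mbPerSec(double bytes, int millis) {
        return (bytes / (1024.0 * 1024.0)) / (millis / 1000.0);
    }

    int main() {
        // e.g. the benchmark's ~100MB buffer round-tripped in 2.5s -> 40.0 MB/sec
        std::printf("%.1f MB/sec\n", mbPerSec(100.0 * 1024 * 1024 + 3, 2500));
        return 0;
    }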
diff --git a/dbtests/queryoptimizertests.cpp b/dbtests/queryoptimizertests.cpp
index bd597572d52..83a2d267c57 100644
--- a/dbtests/queryoptimizertests.cpp
+++ b/dbtests/queryoptimizertests.cpp
@@ -104,7 +104,7 @@ namespace QueryOptimizerTests {
    auto_ptr< FieldRangeSetPair > FieldRangeSetPair_GLOBAL;
 #define FRSP(x) ( FieldRangeSetPair_GLOBAL.reset( new FieldRangeSetPair( ns(), x ) ), *FieldRangeSetPair_GLOBAL )
    auto_ptr< FieldRangeSetPair > FieldRangeSetPair_GLOBAL2;
-#define FRSP2(x) ( FieldRangeSetPair_GLOBAL2.reset( new FieldRangeSetPair( ns(), x ) ), *FieldRangeSetPair_GLOBAL2 )
+#define FRSP2(x) ( FieldRangeSetPair_GLOBAL2.reset( new FieldRangeSetPair( ns(), x ) ), FieldRangeSetPair_GLOBAL2.get() )
 
    class NoIndex : public Base {
    public:
@@ -886,7 +886,7 @@ namespace QueryOptimizerTests {
                }
                BSONObj hint = fromjson( "{$hint:{a:1,b:1}}" );
                auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), fromjson( "{a:5,b:{$in:[2,3,6,9,11]}}" ) ) );
-                QueryPlan qp( nsd(), 1, *frsp, *frsp, fromjson( "{a:5,b:{$in:[2,3,6,9,11]}}" ), BSONObj() );
+                QueryPlan qp( nsd(), 1, *frsp, frsp.get(), fromjson( "{a:5,b:{$in:[2,3,6,9,11]}}" ), BSONObj() );
                boost::shared_ptr<Cursor> c = qp.newCursor();
                double expected[] = { 2, 3, 6, 9 };
                ASSERT( c->ok() );
@@ -908,7 +908,7 @@ namespace QueryOptimizerTests {
                }
                BSONObj hint = fromjson( "{$hint:{a:1,b:1}}" );
                auto_ptr< FieldRangeSetPair > frsp( new FieldRangeSetPair( ns(), fromjson( "{a:{$gte:5},b:{$in:[2,3,6,9,11]}}" ) ) );
-                QueryPlan qp( nsd(), 1, *frsp, *frsp, fromjson( "{a:{$gte:5},b:{$in:[2,3,6,9,11]}}" ), BSONObj() );
+                QueryPlan qp( nsd(), 1, *frsp, frsp.get(), fromjson( "{a:{$gte:5},b:{$in:[2,3,6,9,11]}}" ), BSONObj() );
                boost::shared_ptr<Cursor> c = qp.newCursor();
                int matches[] = { 2, 3, 6, 9 };
                for( int i = 0; i < 4; ++i, c->advance() ) {
@@ -1900,18 +1900,19 @@ namespace QueryOptimizerTests {
        public:
            void run() {
                _cli.createCollection( ns(), 1000, true );
-                _cli.insert( ns(), BSON( "_id" << 1 ) );
+                _cli.insert( ns(), BSON( "x" << 1 ) );
 
                {
                    dblock lk;
                    Client::Context ctx( ns() );
-                    setQueryOptimizerCursor( BSON( "_id" << GT << 0 ) );
-                    ASSERT_EQUALS( 1, current().getIntField( "_id" ) );
+                    setQueryOptimizerCursor( BSON( "x" << GT << 0 ) );
+                    ASSERT_EQUALS( 1, current().getIntField( "x" ) );
                    ASSERT( prepareToYield() );
                }
-
-                while( _cli.count( ns(), BSON( "_id" << 1 ) ) > 0 ) {
-                    _cli.insert( ns(), BSONObj() );
+
+                int x = 2;
+                while( _cli.count( ns(), BSON( "x" << 1 ) ) > 0 ) {
+                    _cli.insert( ns(), BSON( "x" << x++ ) );
                }
 
                {
@@ -2088,26 +2089,26 @@ namespace QueryOptimizerTests {
        public:
            void run() {
                _cli.createCollection( ns(), 1000, true );
-                _cli.insert( ns(), BSON( "_id" << 1 << "a" << 1 ) );
-                _cli.ensureIndex( ns(), BSON( "_id" << 1 ) );
+                _cli.insert( ns(), BSON( "a" << 1 << "b" << 1 ) );
+                _cli.ensureIndex( ns(), BSON( "a" << 1 ) );
 
                shared_ptr<Cursor> c;
                {
                    dblock lk;
                    Client::Context ctx( ns() );
-                    c = newQueryOptimizerCursor( ns(), BSON( "_id" << GT << 0 << "a" << GT << 0 ) );
-                    ASSERT_EQUALS( 1, c->current().getIntField( "_id" ) );
+                    c = newQueryOptimizerCursor( ns(), BSON( "a" << GT << 0 << "b" << GT << 0 ) );
+                    ASSERT_EQUALS( 1, c->current().getIntField( "a" ) );
                    ASSERT( !c->getsetdup( c->currLoc() ) );
                    c->advance();
-                    ASSERT_EQUALS( 1, c->current().getIntField( "_id" ) );
+                    ASSERT_EQUALS( 1, c->current().getIntField( "a" ) );
                    ASSERT( c->getsetdup( c->currLoc() ) );
                    ASSERT( c->prepareToYield() );
                }
 
                int i = 1;
-                while( _cli.count( ns(), BSON( "_id" << 1 ) ) > 0 ) {
+                while( _cli.count( ns(), BSON( "a" << 1 ) ) > 0 ) {
                    ++i;
-                    _cli.insert( ns(), BSON( "_id" << i << "a" << i ) );
+                    _cli.insert( ns(), BSON( "a" << i << "b" << i ) );
                }
 
                {
@@ -2116,7 +2117,7 @@ namespace QueryOptimizerTests {
                    c->recoverFromYield();
                    ASSERT( c->ok() );
                    // {$natural:1} plan does not recover, {_id:1} plan does.
-                    ASSERT( 1 < c->current().getIntField( "_id" ) );
+                    ASSERT( 1 < c->current().getIntField( "a" ) );
                }
            }
        };
diff --git a/dbtests/querytests.cpp b/dbtests/querytests.cpp
index a50eadfcd31..694053b10a8 100644
--- a/dbtests/querytests.cpp
+++ b/dbtests/querytests.cpp
@@ -361,6 +361,7 @@ namespace QueryTests {
        void insertA(const char* ns, int a) {
            BSONObjBuilder b;
            b.appendOID("_id", 0, true);
+            b.appendOID("value", 0, true);
            b.append("a", a);
            insert(ns, b.obj());
        }
@@ -374,7 +375,7 @@ namespace QueryTests {
            auto_ptr< DBClientCursor > c1 = client().query( ns, QUERY( "a" << GT << -1 ), 0, 0, 0, QueryOption_CursorTailable );
            OID id;
            id.init("000000000000000000000000");
-            auto_ptr< DBClientCursor > c2 = client().query( ns, QUERY( "_id" << GT << id ), 0, 0, 0, QueryOption_CursorTailable );
+            auto_ptr< DBClientCursor > c2 = client().query( ns, QUERY( "value" << GT << id ), 0, 0, 0, QueryOption_CursorTailable );
            c1->next();
            c1->next();
            ASSERT( !c1->more() );
@@ -399,7 +400,6 @@ namespace QueryTests {
        }
        void run() {
            const char *ns = "unittests.querytests.OplogReplayMode";
-            insert( ns, BSON( "ts" << 3 ) );
            insert( ns, BSON( "ts" << 0 ) );
            insert( ns, BSON( "ts" << 1 ) );
            insert( ns, BSON( "ts" << 2 ) );
@@ -407,6 +407,12 @@ namespace QueryTests {
            ASSERT( c->more() );
            ASSERT_EQUALS( 2, c->next().getIntField( "ts" ) );
            ASSERT( !c->more() );
+
+            insert( ns, BSON( "ts" << 3 ) );
+            c = client().query( ns, QUERY( "ts" << GT << 1 ).hint( BSON( "$natural" << 1 ) ), 0, 0, 0, QueryOption_OplogReplay );
+            ASSERT( c->more() );
+            ASSERT_EQUALS( 2, c->next().getIntField( "ts" ) );
+            ASSERT( c->more() );
        }
    };
 
@@ -1146,7 +1152,35 @@ namespace QueryTests {
    private:
        int _old;
    };
+
+    /**
+     * Check OplogReplay mode where query timestamp is earlier than the earliest
+     * entry in the collection.
+     */
+    class FindingStartStale : public CollectionBase {
+    public:
+        FindingStartStale() : CollectionBase( "findingstart" ) {}
+
+        void run() {
+            unsigned startNumCursors = ClientCursor::numCursors();
+
+            BSONObj info;
+            ASSERT( client().runCommand( "unittests", BSON( "create" << "querytests.findingstart" << "capped" << true << "$nExtents" << 5 << "autoIndexId" << false ), info ) );
+
+            // Check OplogReplay mode with empty collection.
+            auto_ptr< DBClientCursor > c = client().query( ns(), QUERY( "ts" << GTE << 50 ), 0, 0, 0, QueryOption_OplogReplay );
+            ASSERT( !c->more() );
+
+            // Check with some docs in the collection.
+            for( int i = 100; i < 150; client().insert( ns(), BSON( "ts" << i++ ) ) );
+            c = client().query( ns(), QUERY( "ts" << GTE << 50 ), 0, 0, 0, QueryOption_OplogReplay );
+            ASSERT( c->more() );
+            ASSERT_EQUALS( 100, c->next()[ "ts" ].numberInt() );
+
+            // Check that no persistent cursors outlast our queries above.
+            ASSERT_EQUALS( startNumCursors, ClientCursor::numCursors() );
+        }
+    };
 
    class WhatsMyUri : public CollectionBase {
    public:
@@ -1362,6 +1396,7 @@ namespace QueryTests {
        add< HelperTest >();
        add< HelperByIdTest >();
        add< FindingStartPartiallyFull >();
+        add< FindingStartStale >();
        add< WhatsMyUri >();
 
        add< parsedtests::basic1 >();
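FindingStartStale (above) and FindingStartCursorStale (below) both target the same optimization: for an oplog-style query {ts:{$gte:x}} on a capped collection, check the oldest entry first; if even it satisfies the predicate, start scanning from the head instead of walking backward from the tail. A simplified model of that decision:

    #include <deque>
    #include <iostream>

    int findStart(const std::deque<int>& oplogTs, int x) {
        if (oplogTs.empty() || oplogTs.front() >= x)
            return 0;                      // stale/early timestamp: start at the head
        // otherwise scan backward from the newest entry (simplified linear walk)
        int i = (int)oplogTs.size() - 1;
        while (i > 0 && oplogTs[i - 1] >= x)
            --i;
        return i;
    }

    int main() {
        std::deque<int> ts;
        for (int i = 100; i < 150; ++i) ts.push_back(i);
        std::cout << findStart(ts, 50) << '\n';   // 0: query ts earlier than oldest entry
        return 0;
    }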
diff --git a/dbtests/repltests.cpp b/dbtests/repltests.cpp
index ecaacf74874..2bf522555ab 100644
--- a/dbtests/repltests.cpp
+++ b/dbtests/repltests.cpp
@@ -25,6 +25,8 @@
 #include "../db/json.h"
 
 #include "dbtests.h"
+#include "../db/oplog.h"
+#include "../db/queryoptimizer.h"
 
 namespace mongo {
    void createOplog();
@@ -1049,6 +1051,31 @@ namespace ReplTests {
        }
    };
 
+    /**
+     * Check against oldest document in the oplog before scanning backward
+     * from the newest document.
+     */
+    class FindingStartCursorStale : public Base {
+    public:
+        void run() {
+            for( int i = 0; i < 10; ++i ) {
+                client()->insert( ns(), BSON( "_id" << i ) );
+            }
+            dblock lk;
+            Client::Context ctx( cllNS() );
+            NamespaceDetails *nsd = nsdetails( cllNS() );
+            BSONObjBuilder b;
+            b.appendTimestamp( "$gte" );
+            BSONObj query = BSON( "ts" << b.obj() );
+            FieldRangeSetPair frsp( cllNS(), query );
+            BSONObj order = BSON( "$natural" << 1 );
+            QueryPlan qp( nsd, -1, frsp, &frsp, query, order );
+            FindingStartCursor fsc( qp );
+            ASSERT( fsc.done() );
+            ASSERT_EQUALS( 0, fsc.cursor()->current()[ "o" ].Obj()[ "_id" ].Int() );
+        }
+    };
+
    class All : public Suite {
    public:
        All() : Suite( "repl" ) {
@@ -1103,6 +1130,7 @@ namespace ReplTests {
            add< DeleteOpIsIdBased >();
            add< DatabaseIgnorerBasic >();
            add< DatabaseIgnorerUpdate >();
+            add< FindingStartCursorStale >();
        }
    } myall;
 
diff --git a/dbtests/test.sln b/dbtests/test.sln
new file mode 100755
index 00000000000..3a1b741c716
--- /dev/null
+++ b/dbtests/test.sln
@@ -0,0 +1,26 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "test", "test.vcxproj", "{215B2D68-0A70-4D10-8E75-B33010C62A91}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Win32 = Debug|Win32
+ Debug|x64 = Debug|x64
+ Release|Win32 = Release|Win32
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Win32.ActiveCfg = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Win32.Build.0 = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|x64.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|x64.Build.0 = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Win32.ActiveCfg = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Win32.Build.0 = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|x64.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
diff --git a/dbtests/test.vcxproj b/dbtests/test.vcxproj
index 1460e9d45d1..fde77d2d20b 100644
--- a/dbtests/test.vcxproj
+++ b/dbtests/test.vcxproj
@@ -259,8 +259,16 @@ <ClInclude Include="..\db\resource.h" />
<ClInclude Include="..\db\scanandorder.h" />
<ClInclude Include="..\db\security.h" />
+ <ClInclude Include="..\third_party\snappy\config.h" />
+ <ClInclude Include="..\third_party\snappy\snappy-c.h" />
+ <ClInclude Include="..\third_party\snappy\snappy-internal.h" />
+ <ClInclude Include="..\third_party\snappy\snappy-sinksource.h" />
+ <ClInclude Include="..\third_party\snappy\snappy-stubs-internal.h" />
+ <ClInclude Include="..\third_party\snappy\snappy-stubs-public.h" />
+ <ClInclude Include="..\third_party\snappy\snappy.h" />
<ClInclude Include="..\util\builder.h" />
<ClInclude Include="..\util\checksum.h" />
+ <ClInclude Include="..\util\compress.h" />
<ClInclude Include="..\util\concurrency\list.h" />
<ClInclude Include="..\util\concurrency\task.h" />
<ClInclude Include="..\util\concurrency\value.h" />
@@ -325,6 +333,7 @@ <ClCompile Include="..\db\repl\rs_rollback.cpp" />
<ClCompile Include="..\db\repl\rs_sync.cpp" />
<ClCompile Include="..\db\restapi.cpp" />
+ <ClCompile Include="..\db\scanandorder.cpp" />
<ClCompile Include="..\db\security_common.cpp" />
<ClCompile Include="..\pcre-7.4\pcrecpp.cc">
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
@@ -609,9 +618,27 @@ <ClCompile Include="..\s\shard.cpp" />
<ClCompile Include="..\s\shardconnection.cpp" />
<ClCompile Include="..\s\shardkey.cpp" />
+ <ClCompile Include="..\third_party\snappy\snappy-sinksource.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
<ClCompile Include="..\util\alignedbuilder.cpp">
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
</ClCompile>
+ <ClCompile Include="..\util\compress.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
<ClCompile Include="..\util\concurrency\spin_lock.cpp">
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
</ClCompile>
diff --git a/dbtests/test.vcxproj.filters b/dbtests/test.vcxproj.filters
index 1c832cd17ba..35e85fba416 100755
--- a/dbtests/test.vcxproj.filters
+++ b/dbtests/test.vcxproj.filters
@@ -56,6 +56,9 @@ <Filter Include="bson">
<UniqueIdentifier>{e6652333-c77f-420c-af8e-72d55bc095fe}</UniqueIdentifier>
</Filter>
+ <Filter Include="misc and third party\snappy">
+ <UniqueIdentifier>{fbc4416f-ca67-4e63-a1ea-49027de7e080}</UniqueIdentifier>
+ </Filter>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\boostw\boost_1_34_1\boost\config\auto_link.hpp">
@@ -304,6 +307,30 @@ <ClInclude Include="..\server.h">
<Filter>db\h</Filter>
</ClInclude>
+ <ClInclude Include="..\third_party\snappy\config.h">
+ <Filter>misc and third party\snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\snappy\snappy.h">
+ <Filter>misc and third party\snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\snappy\snappy-c.h">
+ <Filter>misc and third party\snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\snappy\snappy-internal.h">
+ <Filter>misc and third party\snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\snappy\snappy-sinksource.h">
+ <Filter>misc and third party\snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\snappy\snappy-stubs-internal.h">
+ <Filter>misc and third party\snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\snappy\snappy-stubs-public.h">
+ <Filter>misc and third party\snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\util\compress.h">
+ <Filter>misc and third party</Filter>
+ </ClInclude>
</ItemGroup>
<ItemGroup>
<Library Include="..\..\js\js64r.lib">
@@ -857,6 +884,18 @@ <ClCompile Include="..\util\concurrency\spin_lock.cpp">
<Filter>util\concurrency</Filter>
</ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy.cc">
+ <Filter>misc and third party\snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="..\util\compress.cpp">
+ <Filter>misc and third party</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy-sinksource.cc">
+ <Filter>misc and third party\snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="..\db\scanandorder.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
</ItemGroup>
<ItemGroup>
<None Include="..\SConstruct">
diff --git a/debian/changelog b/debian/changelog
index abc4a2bce28..d1e37c93b1d 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,15 @@
+mongodb (1.9.2) unstable; urgency=low
+
+  * see http://jira.mongodb.org/browse/SERVER/fixforversion/10261
+
+ -- Richard Kreuter <richard@10gen.com>  Thu, 11 Aug 2011 16:56:28 -0500
+
+mongodb (1.9.1) unstable; urgency=low
+
+  * see http://jira.mongodb.org/browse/SERVER/fixforversion/10261
+
+ -- Richard Kreuter <richard@10gen.com>  Tue, 26 Jul 2011 16:56:28 -0500
+
 mongodb (1.9.0) unstable; urgency=low

   * see http://jira.mongodb.org/browse/SERVER/fixforversion/10232
diff --git a/distsrc/client/SConstruct b/distsrc/client/SConstruct
index c2d309a4e5a..54fc9437d3c 100755
--- a/distsrc/client/SConstruct
+++ b/distsrc/client/SConstruct
@@ -41,7 +41,7 @@ linux = False
 if "darwin" == os.sys.platform:
     addExtraLibs( "/opt/local/" )
     nix = True
-elif "linux2" == os.sys.platform:
+elif "linux2" == os.sys.platform or "linux3" == os.sys.platform:
     nix = True
     linux = True
diff --git a/doxygenConfig b/doxygenConfig
index 577ce0119c5..3d873903fe8 100644
--- a/doxygenConfig
+++ b/doxygenConfig
@@ -3,7 +3,7 @@
 #---------------------------------------------------------------------------
 DOXYFILE_ENCODING      = UTF-8
 PROJECT_NAME           = MongoDB
-PROJECT_NUMBER         = 1.9.1-pre-
+PROJECT_NUMBER         = 2.0.0-rc0-pre-
 OUTPUT_DIRECTORY       = docs/doxygen
 CREATE_SUBDIRS         = NO
 OUTPUT_LANGUAGE        = English
diff --git a/jstests/ageoutjournalfiles.js b/jstests/ageoutjournalfiles.js
new file mode 100644
index 00000000000..f7fe2275480
--- /dev/null
+++ b/jstests/ageoutjournalfiles.js
@@ -0,0 +1,16 @@
+if (db.serverStatus().dur) {
+
+    assert(db.serverStatus().dur.ageOutJournalFiles != false);
+
+    db.adminCommand({ setParameter: 1, ageOutJournalFiles: false });
+
+    assert(db.serverStatus().dur.ageOutJournalFiles == false);
+
+    db.adminCommand({ setParameter: 1, ageOutJournalFiles: true });
+
+    assert(db.serverStatus().dur.ageOutJournalFiles != false);
+
+}
+else {
+    // print("dur is off");
+}
\ No newline at end of file
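A minimal shell sketch of the parameter round trip the new test above exercises; it assumes a mongod started with journaling so that serverStatus().dur exists, and is illustrative only, not part of the commit:

    // toggle the runtime-settable parameter and read it back via serverStatus()
    var dur = db.serverStatus().dur;   // undefined when journaling is off
    if (dur) {
        db.adminCommand({ setParameter: 1, ageOutJournalFiles: false });
        assert.eq(false, db.serverStatus().dur.ageOutJournalFiles);
        db.adminCommand({ setParameter: 1, ageOutJournalFiles: true });
    }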
diff --git a/jstests/array_match3.js b/jstests/array_match3.js
index 06ee926a6a6..c8653430770 100644
--- a/jstests/array_match3.js
+++ b/jstests/array_match3.js
@@ -10,6 +10,4 @@ assert.eq( 2, t.count( {'a.0':5} ) );

 // Test with index.
 t.ensureIndex( {'a.0':1} );
-if ( 0 ) { // SERVER-2902
 assert.eq( 2, t.count( {'a.0':5} ) );
-}
diff --git a/jstests/arrayfind4.js b/jstests/arrayfind4.js
new file mode 100644
index 00000000000..b141425f2e9
--- /dev/null
+++ b/jstests/arrayfind4.js
@@ -0,0 +1,22 @@
+// Test query empty array SERVER-2258
+
+t = db.jstests_arrayfind4;
+t.drop();
+
+t.save( {a:[]} );
+t.ensureIndex( {a:1} );
+
+assert.eq( 1, t.find( {a:[]} ).hint( {$natural:1} ).itcount() );
+assert.eq( 1, t.find( {a:[]} ).hint( {a:1} ).itcount() );
+
+assert.eq( 1, t.find( {a:{$in:[[]]}} ).hint( {$natural:1} ).itcount() );
+assert.eq( 1, t.find( {a:{$in:[[]]}} ).hint( {a:1} ).itcount() );
+
+t.remove();
+t.save( {a:[[]]} );
+
+assert.eq( 1, t.find( {a:[]} ).hint( {$natural:1} ).itcount() );
+assert.eq( 1, t.find( {a:[]} ).hint( {a:1} ).itcount() );
+
+assert.eq( 1, t.find( {a:{$in:[[]]}} ).hint( {$natural:1} ).itcount() );
+assert.eq( 1, t.find( {a:{$in:[[]]}} ).hint( {a:1} ).itcount() );
diff --git a/jstests/arrayfind5.js b/jstests/arrayfind5.js
new file mode 100644
index 00000000000..083dc0622c8
--- /dev/null
+++ b/jstests/arrayfind5.js
@@ -0,0 +1,23 @@
+// Test indexed elemmatch of missing field.
+
+t = db.jstests_arrayfind5;
+t.drop();
+
+function check( nullElemMatch ) {
+    assert.eq( 1, t.find( {'a.b':1} ).itcount() );
+    assert.eq( 1, t.find( {a:{$elemMatch:{b:1}}} ).itcount() );
+    assert.eq( 0, t.find( {'a.b':null} ).itcount() );
+    assert.eq( nullElemMatch ? 1 : 0, t.find( {a:{$elemMatch:{b:null}}} ).itcount() ); // see SERVER-3377
+}
+
+t.save( {a:[{},{b:1}]} );
+check( true );
+t.ensureIndex( {'a.b':1} );
+check( true );
+
+t.drop();
+
+t.save( {a:[5,{b:1}]} );
+check( false );
+t.ensureIndex( {'a.b':1} );
+check( false );
diff --git a/jstests/capped2.js b/jstests/capped2.js
index 1f8bf1d01c6..65bb82f4c07 100644
--- a/jstests/capped2.js
+++ b/jstests/capped2.js
@@ -47,7 +47,7 @@ function checkDecreasing( i ) {

 for( i = 0 ;; ++i ) {
     debug( "capped 2: " + i );
-    tzz.save( val[ i ] );
+    tzz.insert( val[ i ] );
     if ( tzz.count() == 0 ) {
         assert( i > 100, "K" );
         break;
@@ -57,6 +57,6 @@ for( i = 0 ;; ++i ) {

 for( i = 600 ; i >= 0 ; --i ) {
     debug( "capped 2: " + i );
-    tzz.save( val[ i ] );
+    tzz.insert( val[ i ] );
     checkDecreasing( i );
 }
diff --git a/jstests/capped5.js b/jstests/capped5.js
index f56d2278a7e..be6c27d7256 100644
--- a/jstests/capped5.js
+++ b/jstests/capped5.js
@@ -9,7 +9,6 @@ db.createCollection( tn , {capped: true, size: 1024 * 1024 * 1 } );
 t.insert( { _id : 5 , x : 11 , z : 52 } );
 assert.eq( 0 , t.getIndexKeys().length , "A0" )
 assert.eq( 52 , t.findOne( { x : 11 } ).z , "A1" );
-assert.eq( 52 , t.findOne( { _id : 5, x : 11 } ).z , "A2" );

 t.ensureIndex( { _id : 1 } )
 t.ensureIndex( { x : 1 } )
diff --git a/jstests/capped6.js b/jstests/capped6.js
index 65798075208..098f667732f 100644
--- a/jstests/capped6.js
+++ b/jstests/capped6.js
@@ -52,7 +52,7 @@ var max = 0;
  */
 function doTest() {
     for( var i = max; i < oldMax; ++i ) {
-        tzz.save( val[ i ] );
+        tzz.insert( val[ i ] );
     }
     max = oldMax;
     count = tzz.count();
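The capped2/capped5/capped6 changes above switch save() to insert() on capped collections; all of these tests depend on the fixed-size, oldest-first eviction of capped collections. A minimal sketch of that behavior (collection name illustrative only, not part of the commit):

    // documents are evicted oldest-first once the fixed size fills up
    db.createCollection( "cappedDemo", { capped: true, size: 4096 } );
    var t = db.cappedDemo;
    for ( var i = 0; i < 1000; i++ )
        t.insert( { _id: i, pad: new Array( 100 ).toString() } );
    assert( t.count() < 1000 );  // older documents were aged out
    // the most recently inserted document is still present
    assert.eq( 999, t.find().sort( { $natural: -1 } ).limit( 1 ).next()._id );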
diff --git a/jstests/cappeda.js b/jstests/cappeda.js
new file mode 100644
index 00000000000..4a4b14a64e5
--- /dev/null
+++ b/jstests/cappeda.js
@@ -0,0 +1,33 @@
+
+t = db.scan_capped_id;
+t.drop()
+
+x = t.runCommand( "create" , { capped : true , size : 10000 } )
+assert( x.ok )
+
+for ( i=0; i<100; i++ )
+    t.insert( { _id : i , x : 1 } )
+
+function q() {
+    return t.findOne( { _id : 5 } )
+}
+
+function u() {
+    t.update( { _id : 5 } , { $set : { x : 2 } } );
+    var gle = db.getLastError();
+    if ( gle )
+        throw gle;
+}
+
+
+// SERVER-3064
+//assert.throws( q , [] , "A1" );
+//assert.throws( u , [] , "B1" );
+
+t.ensureIndex( { _id : 1 } )
+
+assert.eq( 1 , q().x )
+q()
+u()
+
+assert.eq( 2 , q().x )
diff --git a/jstests/date3.js b/jstests/date3.js
new file mode 100644
index 00000000000..81b385a8616
--- /dev/null
+++ b/jstests/date3.js
@@ -0,0 +1,29 @@
+// Check dates before Unix epoch - SERVER-405
+
+t = db.date3;
+t.drop()
+
+d1 = new Date(-1000)
+dz = new Date(0)
+d2 = new Date(1000)
+
+t.save( {x: 2, d: d2} )
+t.save( {x: 1, d: d1} )
+
+function test () {
+    var list = t.find( {d: {$lt: dz}} )
+    assert.eq ( 1, list.size() )
+    assert.eq ( 1, list[0].x )
+    assert.eq ( d1, list[0].d )
+    var list = t.find( {d: {$gt: dz}} )
+    assert.eq ( 1, list.size() )
+    assert.eq ( 2, list[0].x )
+    var list = t.find().sort( {d:1} )
+    assert.eq ( 2, list.size() )
+    assert.eq ( 1, list[0].x )
+    assert.eq ( 2, list[1].x )
+}
+
+test()
+t.ensureIndex( {d: 1} )
+test()
diff --git a/jstests/dbhash.js b/jstests/dbhash.js
index e9cbc944b5f..7fea4b4d50c 100644
--- a/jstests/dbhash.js
+++ b/jstests/dbhash.js
@@ -14,16 +14,22 @@ db.getCollectionNames().forEach( function( x ) {
     }
 } );

+function dbhash( mydb ) {
+    var ret = mydb.runCommand( "dbhash" );
+    assert.commandWorked( ret, "dbhash failure" );
+    return ret;
+}
+
 function gh( coll , mydb ){
     if ( ! mydb ) mydb = db;
-    var x = mydb.runCommand( "dbhash" ).collections[coll.getName()];
+    var x = dbhash( mydb ).collections[coll.getName()];
     if ( ! x )
         return "";
     return x;
 }

 function dbh( mydb ){
-    return mydb.runCommand( "dbhash" ).md5;
+    return dbhash( mydb ).md5;
 }

 assert.eq( gh( a ) , gh( b ) , "A1" );
diff --git a/jstests/disk/quota.js b/jstests/disk/quota.js
new file mode 100644
index 00000000000..d93e5eaafc0
--- /dev/null
+++ b/jstests/disk/quota.js
@@ -0,0 +1,47 @@
+// Check functioning of --quotaFiles parameter, including with respect to SERVER-3293 ('local' database).
+
+port = allocatePorts( 1 )[ 0 ];
+
+baseName = "jstests_disk_quota";
+dbpath = "/data/db/" + baseName;
+
+m = startMongod( "--port", port, "--dbpath", "/data/db/" + baseName, "--quotaFiles", "1", "--smallfiles" );
+db = m.getDB( baseName );
+
+big = new Array( 10000 ).toString();
+
+// Insert documents until quota is exhausted.
+while( !db.getLastError() ) {
+    db[ baseName ].save( {b:big} );
+}
+printjson( db.getLastError() );
+
+dotTwoDataFile = dbpath + "/" + baseName + ".2";
+files = listFiles( dbpath );
+for( i in files ) {
+    // Since only one data file is allowed, a .0 file is expected and a .1 file may be preallocated (SERVER-3410) but no .2 file is expected.
+    assert.neq( dotTwoDataFile, files[ i ].name );
+}
+
+dotTwoDataFile = dbpath + "/" + "local" + ".2";
+// Check that quota does not apply to local db, and a .2 file can be created.
+l = m.getDB( "local" )[ baseName ];
+for( i = 0; i < 10000; ++i ) {
+    l.save( {b:big} );
+    assert( !db.getLastError() );
+    dotTwoFound = false;
+    if ( i % 100 != 0 ) {
+        continue;
+    }
+    files = listFiles( dbpath );
+    for( f in files ) {
+        if ( files[ f ].name == dotTwoDataFile ) {
+            dotTwoFound = true;
+        }
+    }
+    if ( dotTwoFound ) {
+        break;
+    }
+}
+
+assert( dotTwoFound );
diff --git a/jstests/disk/quota2.js b/jstests/disk/quota2.js
new file mode 100644
index 00000000000..c0d30dfecbf
--- /dev/null
+++ b/jstests/disk/quota2.js
@@ -0,0 +1,38 @@
+// Test for quotaFiles off by one file limit issue - SERVER-3420.
+
+if ( 0 ) { // SERVER-3420
+
+port = allocatePorts( 1 )[ 0 ];
+
+baseName = "jstests_disk_quota2";
+dbpath = "/data/db/" + baseName;
+
+m = startMongod( "--port", port, "--dbpath", "/data/db/" + baseName, "--quotaFiles", "1", "--smallfiles" );
+db = m.getDB( baseName );
+
+big = new Array( 10000 ).toString();
+
+// Insert documents until quota is exhausted.
+while( !db.getLastError() ) {
+    db[ baseName ].save( {b:big} );
+}
+
+db.resetError();
+
+// Trigger allocation of an additional file for a 'special' namespace.
+for( n = 0; !db.getLastError(); ++n ) {
+    db.createCollection( '' + n );
+}
+
+print( n );
+
+// Check that new docs are saved in the .0 file.
+for( i = 0; i < n; ++i ) {
+    c = db[ ''+i ];
+    c.save( {b:big} );
+    if( !db.getLastError() ) {
+        assert.eq( 0, c.find()._addSpecial( "$showDiskLoc", true )[ 0 ].$diskLoc.file );
+    }
+}
+
+}
\ No newline at end of file
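quota.js and quota2.js above infer quota enforcement from the data files left on disk. A condensed sketch of the same check, using the startMongod/listFiles helpers the tests themselves use (port and paths illustrative only, not part of the commit):

    // with --quotaFiles 1, writes fail once the first data file fills,
    // and no <dbname>.2 data file should ever appear for that db
    var m = startMongod( "--port", 31017, "--dbpath", "/data/db/quotaDemo",
                         "--quotaFiles", "1", "--smallfiles" );
    var d = m.getDB( "quotaDemo" );
    while ( !d.getLastError() )
        d.c.save( { pad: new Array( 10000 ).toString() } );
    files = listFiles( "/data/db/quotaDemo" );
    for ( i in files )
        assert( files[ i ].name.indexOf( "quotaDemo.2" ) < 0 );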
diff --git a/jstests/drop2.js b/jstests/drop2.js
index a1d619df1b3..87e646e1ee9 100644
--- a/jstests/drop2.js
+++ b/jstests/drop2.js
@@ -26,7 +26,7 @@ function op( drop ) {
     return null;
 }

-s1 = startParallelShell( "db.jstests_drop2.count( { $where: function() { while( 1 ) { ; } } } )" );
+s1 = startParallelShell( "db.jstests_drop2.count( { $where: function() { while( 1 ) { sleep( 1 ); } } } )" );

 countOp = null;
 assert.soon( function() { countOp = op( false ); return countOp; } );
diff --git a/jstests/dur/diskfull.js b/jstests/dur/diskfull.js
index da45c20afd4..c123ea1541e 100644
--- a/jstests/dur/diskfull.js
+++ b/jstests/dur/diskfull.js
@@ -14,23 +14,23 @@ for ( i in files ) {

 if ( !doIt ) {
     print( "path " + startPath + " missing, skipping diskfull test" );
     doIt = false;
-}
-
-function checkNoJournalFiles(path, pass) {
- var files = listFiles(path);
- if (files.some(function (f) { return f.name.indexOf("prealloc") < 0; })) {
- if (pass == null) {
- // wait a bit longer for mongod to potentially finish if it is still running.
- sleep(10000);
- return checkNoJournalFiles(path, 1);
- }
- print("\n\n\n");
- print("FAIL path:" + path);
- print("unexpected files:");
- printjson(files);
- assert(false, "FAIL a journal/lsn file is present which is unexpected");
- }
-}
+}
+
+function checkNoJournalFiles(path, pass) {
+    var files = listFiles(path);
+    if (files.some(function (f) { return f.name.indexOf("prealloc") < 0; })) {
+        if (pass == null) {
+            // wait a bit longer for mongod to potentially finish if it is still running.
+            sleep(10000);
+            return checkNoJournalFiles(path, 1);
+        }
+        print("\n\n\n");
+        print("FAIL path:" + path);
+        print("unexpected files:");
+        printjson(files);
+        assert(false, "FAIL a journal/lsn file is present which is unexpected");
+    }
+}

 /** Clear dbpath without removing and recreating diskfulltest directory, as resetDbpath does */
 function clear() {
@@ -56,7 +56,9 @@ function work() {
             d.foo.insert( { _id:i, b:big } );
         }

-        d.getLastError();
+        gle = d.getLastError();
+        if ( gle )
+            throw gle;
     } catch ( e ) {
         print( e );
         raise( e );
@@ -86,9 +88,8 @@ function runFirstMongodAndFillDisk() {
     conn = startMongodNoReset("--port", 30001, "--dbpath", startPath, "--dur", "--smallfiles", "--durOptions", 8, "--noprealloc");

     assert.throws( work, null, "no exception thrown when exceeding disk capacity" );

-    waitMongoProgramOnPort( 30001 );
-
-    // the above wait doesn't work on windows
+    stopMongod( 30001 );
+
     sleep(5000);
 }

@@ -104,9 +105,9 @@ function runSecondMongdAndRecover() {
     // stopMongod seems to be asynchronous (hmmm) so we sleep here.
     sleep(5000);

-    // at this point, after clean shutdown, there should be no journal files
- log("check no journal files");
- checkNoJournalFiles(startPath + "/journal/");
+    // at this point, after clean shutdown, there should be no journal files
+    log("check no journal files");
+    checkNoJournalFiles(startPath + "/journal/");

     log();
 }
@@ -133,4 +134,4 @@ if ( doIt ) {

 print(testname + " SUCCESS");
-}
\ No newline at end of file
+}
diff --git a/jstests/evald.js b/jstests/evald.js
index 78cabb68045..7b18f3cc893 100644
--- a/jstests/evald.js
+++ b/jstests/evald.js
@@ -53,10 +53,10 @@ function doIt( ev, wait, where ) {

 }

-doIt( "db.jstests_evald.count( { $where: function() { while( 1 ) { ; } } } )", true, true );
-doIt( "db.jstests_evald.count( { $where: function() { while( 1 ) { ; } } } )", false, true );
-doIt( "while( true ) {;}", false );
-doIt( "while( true ) {;}", true );
+doIt( "db.jstests_evald.count( { $where: function() { while( 1 ) { sleep(1); } } } )", true, true );
+doIt( "db.jstests_evald.count( { $where: function() { while( 1 ) { sleep(1); } } } )", false, true );
+doIt( "while( true ) { sleep(1);}", false );
+doIt( "while( true ) { sleep(1);}", true );

 // the for loops are currently required, as a spawned op masks the parent op - see SERVER-1931
 doIt( "while( 1 ) { for( var i = 0; i < 10000; ++i ) {;} db.jstests_evald.count( {i:10} ); }", true );
@@ -65,4 +65,4 @@ doIt( "while( 1 ) { for( var i = 0; i < 10000; ++i ) {;} db.jstests_evald.count(
 doIt( "while( 1 ) { for( var i = 0; i < 10000; ++i ) {;} db.jstests_evald.count(); }", false );

 doIt( "while( 1 ) { for( var i = 0; i < 10000; ++i ) {;} try { db.jstests_evald.count( {i:10} ); } catch ( e ) { } }", true );
-doIt( "while( 1 ) { try { while( 1 ) { ; } } catch ( e ) { } }", true );
+doIt( "while( 1 ) { try { while( 1 ) { sleep(1); } } catch ( e ) { } }", true );
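The drop2.js and evald.js edits above replace empty busy-wait loops with sleep(1) so the server can interrupt the spawned operations. The kill pattern those tests rely on looks roughly like this (collection name illustrative only, not part of the commit):

    // an interruptible $where loop: sleeping briefly each iteration lets the
    // server act on a kill, where "while( 1 ) { ; }" may spin without yielding
    var join = startParallelShell(
        "db.demo.count( { $where: function() { while( 1 ) { sleep( 1 ); } } } )" );
    db.currentOp().inprog.forEach( function( op ) {
        if ( op.ns == "test.demo" )
            db.killOp( op.opid );
    } );
    join();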
diff --git a/jstests/exists9.js b/jstests/exists9.js
index 09695ac4203..66378d1b424 100644
--- a/jstests/exists9.js
+++ b/jstests/exists9.js
@@ -25,8 +25,7 @@ assert.eq( 1, t.count( {a:{$exists:false}} ) );
 t.ensureIndex( {a:1} );
 assert.eq( 1, t.find( {a:{$exists:true}} ).hint( {a:1} ).itcount() );
 assert.eq( 1, t.find( {a:{$exists:false}} ).hint( {a:1} ).itcount() );
-// The empty array will be scanned, but not returned.
-assert.eq( 2, t.find( {a:{$exists:false}} ).hint( {a:1} ).explain().nscanned );
+assert.eq( 1, t.find( {a:{$exists:false}} ).hint( {a:1} ).explain().nscanned );

 t.drop();

@@ -39,6 +38,4 @@ assert.eq( 1, t.count( {'a.0':{$exists:false}} ) );
 // With index.
 t.ensureIndex( {'a.0':1} );
 assert.eq( 1, t.find( {'a.0':{$exists:true}} ).hint( {'a.0':1} ).itcount() );
-if ( 0 ) { // SERVER-2902
 assert.eq( 1, t.find( {'a.0':{$exists:false}} ).hint( {'a.0':1} ).itcount() );
-}
diff --git a/jstests/geo_mapreduce2.js b/jstests/geo_mapreduce2.js
new file mode 100644
index 00000000000..9c393457c7b
--- /dev/null
+++ b/jstests/geo_mapreduce2.js
@@ -0,0 +1,36 @@
+// Geo mapreduce 2 from SERVER-3478
+
+var coll = db.geoMR2
+coll.drop()
+
+for( var i = 0; i < 300; i++ )
+    coll.insert({ i : i, location : [ 10, 20 ] })
+
+coll.ensureIndex({ location : "2d" })
+
+// map function
+m = function() {
+    emit( null, { count : this.i } )
+}
+
+// reduce function
+r = function( key, values ) {
+
+    var total = 0
+    for ( var i = 0; i < values.length; i++ ) {
+        total += values[i].count
+    }
+
+    return { count : total }
+};
+
+try{ coll.mapReduce( m, r,
+                     { out : coll.getName() + "_mr",
+                       sort : { _id : 1 },
+                       query : { 'location' : { $within : { $centerSphere : [[ 10, 20 ], 0.01 ] } } } })
+
+}
+catch( e ){
+    // This should occur, since we can't in-mem sort for mreduce
+    printjson( e )
+}
diff --git a/jstests/group7.js b/jstests/group7.js
new file mode 100644
index 00000000000..5bf9232577c
--- /dev/null
+++ b/jstests/group7.js
@@ -0,0 +1,43 @@
+// Test yielding group command SERVER-1395
+
+t = db.jstests_group7;
+t.drop();
+
+function checkForYield( docs, updates ) {
+    t.drop();
+    a = 0;
+    for( var i = 0; i < docs; ++i ) {
+        t.save( {a:a} );
+    }
+    db.getLastError();
+
+    // Iteratively update all a values atomically.
+    p = startParallelShell( 'for( a = 0; a < ' + updates + '; ++a ) { db.jstests_group7.update( {$atomic:true}, {$set:{a:a}}, false, true ); db.getLastError(); }' );
+
+    for( var i = 0; i < updates; ++i ) {
+        ret = t.group({key:{a:1},reduce:function(){},initial:{}});
+        // Check if group sees more than one a value, indicating that it yielded.
+        if ( ret.length > 1 ) {
+            p();
+            return true;
+        }
+        printjson( ret );
+    }
+
+    p();
+    return false;
+}
+
+var yielded = false;
+var docs = 1500;
+var updates = 50;
+for( var j = 1; j <= 6; ++j ) {
+    if ( checkForYield( docs, updates ) ) {
+        yielded = true;
+        break;
+    }
+    // Increase docs and updates to encourage yielding.
+    docs *= 2;
+    updates *= 2;
+}
+assert( yielded );
\ No newline at end of file
diff --git a/jstests/in9.js b/jstests/in9.js
index b0d70b6a4fc..34cefb8278a 100644
--- a/jstests/in9.js
+++ b/jstests/in9.js
@@ -31,5 +31,5 @@ function doTest() {
 doTest();

 // SERVER-1943 not fixed yet
-//t.ensureIndex( {key:1} );
-//doTest();
+t.ensureIndex( {key:1} );
+doTest();
diff --git a/jstests/ina.js b/jstests/ina.js
new file mode 100644
index 00000000000..cf614ab994d
--- /dev/null
+++ b/jstests/ina.js
@@ -0,0 +1,15 @@
+// Uassert when $elemMatch is attempted within $in SERVER-3545
+
+t = db.jstests_ina;
+t.drop();
+t.save( {} );
+
+assert.throws( function() { t.find( {a:{$in:[{$elemMatch:{b:1}}]}} ).itcount(); } );
+assert.throws( function() { t.find( {a:{$not:{$in:[{$elemMatch:{b:1}}]}}} ).itcount(); } );
+
+assert.throws( function() { t.find( {a:{$nin:[{$elemMatch:{b:1}}]}} ).itcount(); } );
+assert.throws( function() { t.find( {a:{$not:{$nin:[{$elemMatch:{b:1}}]}}} ).itcount(); } );
+
+// NOTE Above we don't check cases like {b:2,$elemMatch:{b:3,4}} - generally
+// we assume that the first key is $elemMatch if any key is, and validating
+// every key is expensive in some cases.
\ No newline at end of file
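ina.js above asserts that $elemMatch is rejected inside $in/$nin (SERVER-3545); the supported spelling keeps $elemMatch at the top level of the field's predicate. Illustrative only, not part of the commit:

    // supported: match array elements with a top-level $elemMatch
    db.demo.find( { a: { $elemMatch: { b: 1 } } } );
    // uasserts per SERVER-3545: $elemMatch nested inside $in
    // db.demo.find( { a: { $in: [ { $elemMatch: { b: 1 } } ] } } );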
diff --git a/jstests/indexbindata.js b/jstests/indexbindata.js
new file mode 100755
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/jstests/indexbindata.js
diff --git a/jstests/indexr.js b/jstests/indexr.js
index b900e8ccbd5..60ecfb13ed2 100644
--- a/jstests/indexr.js
+++ b/jstests/indexr.js
@@ -28,17 +28,13 @@ t.remove();
 t.save( { a: [ { b: 3, c: 6 }, { b: 1, c: 1 } ] } );
 assert.eq( 1, t.count( { 'a.b':{ $gt:2 }, 'a.c': { $lt:4 } } ) );
-if ( 0 ) { // SERVER-3005
 assert.eq( 1, t.count( { a:{ b:3, c:6 }, 'a.c': { $lt:4 } } ) );
-}
 assert.eq( [[{$minElement:1},{$maxElement:1}]], t.find( { 'a.b':{ $gt:2 }, 'a.c': { $lt:4 } } ).explain().indexBounds['a.c'] );
 assert.eq( [[{$minElement:1},{$maxElement:1}]], t.find( { a:{ b:3, c:6 }, 'a.c': { $lt:4 } } ).explain().indexBounds['a.c'] );

 // Check reverse direction.
 assert.eq( 1, t.find( { 'a.b':{ $gt:2 }, 'a.c': { $lt:4 } } ).sort( {'a.b':-1} ).itcount() );
-if ( 0 ) { // SERVER-3005
 assert.eq( 1, t.find( { a:{ b:3, c:6 }, 'a.c': { $lt:4 } } ).sort( {a:-1} ).itcount() );
-}

 assert.eq( [[{$maxElement:1},{$minElement:1}]], t.find( { 'a.b':{ $gt:2 }, 'a.c': { $lt:4 } } ).sort( {'a.b':-1} ).explain().indexBounds['a.c'] );
 assert.eq( [[{$maxElement:1},{$minElement:1}]], t.find( { a:{ b:3, c:6 }, 'a.c': { $lt:4 } } ).sort( {a:-1} ).explain().indexBounds['a.c'] );
diff --git a/jstests/indexs.js b/jstests/indexs.js
index 3a52584bfd3..609f912affe 100644
--- a/jstests/indexs.js
+++ b/jstests/indexs.js
@@ -17,7 +17,5 @@ t.drop();
 t.ensureIndex( {a:1,'a.b':1} );
 t.save( { a: [ { b: 3 } ] } );
 assert.eq( ib, t.find( { a:{ b:3 } } ).explain().indexBounds );
-if ( 0 ) { // SERVER-3005
 assert.eq( 1, t.find( { a:{ b:3 } } ).explain().nscanned );
 assert.eq( 1, t.count( { a:{ b:3 } } ) );
-}
\ No newline at end of file
diff --git a/jstests/indext.js b/jstests/indext.js
new file mode 100644
index 00000000000..e418dc2e959
--- /dev/null
+++ b/jstests/indext.js
@@ -0,0 +1,21 @@
+// Sparse indexes with arrays SERVER-3216
+
+t = db.jstests_indext;
+t.drop();
+
+t.ensureIndex( {'a.b':1}, {sparse:true} );
+t.save( {a:[]} );
+t.save( {a:1} );
+assert.eq( 0, t.find().hint( {'a.b':1} ).itcount() );
+assert.eq( 0, t.find().hint( {'a.b':1} ).explain().nscanned );
+
+t.ensureIndex( {'a.b':1,'a.c':1}, {sparse:true} );
+t.save( {a:[]} );
+t.save( {a:1} );
+assert.eq( 0, t.find().hint( {'a.b':1,'a.c':1} ).itcount() );
+assert.eq( 0, t.find().hint( {'a.b':1,'a.c':1} ).explain().nscanned );
+
+t.save( {a:[{b:1}]} );
+t.save( {a:1} );
+assert.eq( 1, t.find().hint( {'a.b':1,'a.c':1} ).itcount() );
+assert.eq( 1, t.find().hint( {'a.b':1,'a.c':1} ).explain().nscanned );
diff --git a/jstests/indexu.js b/jstests/indexu.js
new file mode 100644
index 00000000000..c7fa8ed3365
--- /dev/null
+++ b/jstests/indexu.js
@@ -0,0 +1,137 @@
+// Test index key generation with duplicate values addressed by array index and
+// object field. SERVER-2902
+
+t = db.jstests_indexu;
+t.drop();
+
+var dupDoc = {a:[{'0':1}]}; // There are two 'a.0' fields in this doc.
+var dupDoc2 = {a:[{'1':1},'c']};
+var noDupDoc = {a:[{'1':1}]};
+
+// Test that we can't index dupDoc.
+t.save( dupDoc );
+assert( !db.getLastError() );
+t.ensureIndex( {'a.0':1} );
+assert( db.getLastError() );
+
+t.remove();
+t.ensureIndex( {'a.0':1} );
+assert( !db.getLastError() );
+t.save( dupDoc );
+assert( db.getLastError() );
+
+// Test that we can't index dupDoc2.
+t.drop();
+t.save( dupDoc2 );
+assert( !db.getLastError() );
+t.ensureIndex( {'a.1':1} );
+assert( db.getLastError() );
+
+t.remove();
+t.ensureIndex( {'a.1':1} );
+assert( !db.getLastError() );
+t.save( dupDoc2 );
+assert( db.getLastError() );
+
+// Test that we can index dupDoc with a different index.
+t.drop();
+t.ensureIndex( {'a.b':1} );
+t.save( dupDoc );
+assert( !db.getLastError() );
+
+// Test number field starting with hyphen.
+t.drop();
+t.ensureIndex( {'a.-1':1} );
+t.save( {a:[{'-1':1}]} );
+assert( !db.getLastError() );
+
+// Test number field starting with zero.
+t.drop();
+t.ensureIndex( {'a.00':1} );
+t.save( {a:[{'00':1}]} );
+assert( !db.getLastError() );
+
+// Test multiple array indexes
+t.drop();
+t.ensureIndex( {'a.0':1,'a.1':1} );
+t.save( {a:[{'1':1}]} );
+assert( !db.getLastError() );
+t.save( {a:[{'1':1},4]} );
+assert( db.getLastError() );
+
+// Test that we can index noDupDoc.
+t.drop();
+t.save( noDupDoc );
+t.ensureIndex( {'a.0':1} );
+assert( !db.getLastError() );
+t.ensureIndex( {'a.1':1} );
+assert( !db.getLastError() );
+
+t.drop();
+t.ensureIndex( {'a.0':1} );
+t.ensureIndex( {'a.1':1} );
+t.save( noDupDoc );
+assert( !db.getLastError() );
+
+// Test that we can query noDupDoc.
+assert.eq( 1, t.find( {'a.1':1} ).hint( {'a.1':1} ).itcount() );
+assert.eq( 1, t.find( {'a.1':1} ).hint( {$natural:1} ).itcount() );
+assert.eq( 1, t.find( {'a.0':{'1':1}} ).hint( {'a.0':1} ).itcount() );
+assert.eq( 1, t.find( {'a.0':{'1':1}} ).hint( {$natural:1} ).itcount() );
+
+// Check multiple nested array fields.
+t.drop();
+t.save( {a:[[1]]} );
+t.ensureIndex( {'a.0.0':1} );
+assert( !db.getLastError() );
+assert.eq( 1, t.find( {'a.0.0':1} ).hint( {$natural:1} ).itcount() );
+assert.eq( 1, t.find( {'a.0.0':1} ).hint( {'a.0.0':1} ).itcount() );
+
+// Check where there is a duplicate for a partially addressed field but not for a fully addressed field.
+t.drop();
+t.save( {a:[[1],{'0':1}]} );
+t.ensureIndex( {'a.0.0':1} );
+assert( db.getLastError() );
+
+// Check where there is a duplicate for a fully addressed field.
+t.drop();
+t.save( {a:[[1],{'0':[1]}]} );
+assert( !db.getLastError() );
+t.ensureIndex( {'a.0.0':1} );
+assert( db.getLastError() );
+
+// Two ways of addressing parse to an array.
+t.drop();
+t.save( {a:[{'0':1}]} );
+t.ensureIndex( {'a.0.0':1} );
+assert( db.getLastError() );
+
+// Test several key depths - with same arrays being found.
+t.drop();
+t.save( {a:[{'0':[{'0':1}]}]} );
+t.ensureIndex( {'a.0.0.0.0.0.0':1} );
+assert( db.getLastError() );
+t.ensureIndex( {'a.0.0.0.0.0':1} );
+assert( db.getLastError() );
+t.ensureIndex( {'a.0.0.0.0':1} );
+assert( db.getLastError() );
+t.ensureIndex( {'a.0.0.0':1} );
+assert( db.getLastError() );
+t.ensureIndex( {'a.0.0':1} );
+assert( db.getLastError() );
+t.ensureIndex( {'a.0':1} );
+assert( db.getLastError() );
+t.ensureIndex( {'a':1} );
+assert( !db.getLastError() );
+
+// Two prefixes extract docs, but one terminates extraction before array.
+t.drop();
+t.save( {a:[{'0':{'c':[]}}]} );
+t.ensureIndex( {'a.0.c':1} );
+assert( db.getLastError() );
+
+t.drop();
+t.save( {a:[[{'b':1}]]} );
+assert.eq( 1, t.find( {'a.0.b':1} ).itcount() );
+t.ensureIndex( {'a.0.b':1} );
+assert.eq( 1, t.find( {'a.0.b':1} ).itcount() );
diff --git a/jstests/indexv.js b/jstests/indexv.js
new file mode 100644
index 00000000000..a69ff2a4664
--- /dev/null
+++ b/jstests/indexv.js
@@ -0,0 +1,18 @@
+// Check null key generation.
+
+t = db.jstests_indexv;
+t.drop();
+
+t.ensureIndex( {'a.b':1} );
+
+t.save( {a:[{},{b:1}]} );
+var e = t.find( {'a.b':null} ).explain();
+assert.eq( 0, e.n );
+assert.eq( 1, e.nscanned );
+
+t.drop();
+t.ensureIndex( {'a.b.c':1} );
+t.save( {a:[{b:[]},{b:{c:1}}]} );
+var e = t.find( {'a.b.c':null} ).explain();
+assert.eq( 0, e.n );
+assert.eq( 1, e.nscanned );
diff --git a/jstests/indexw.js b/jstests/indexw.js
new file mode 100644
index 00000000000..326443400d1
--- /dev/null
+++ b/jstests/indexw.js
@@ -0,0 +1,14 @@
+// Check that v0 keys are generated for v0 indexes SERVER-3375
+
+t = db.jstests_indexw;
+t.drop();
+
+t.save( {a:[]} );
+assert.eq( 1, t.count( {a:[]} ) );
+t.ensureIndex( {a:1} );
+assert.eq( 1, t.count( {a:[]} ) );
+t.dropIndexes();
+
+// The count result is incorrect - just checking here that v0 key generation is used.
+t.ensureIndex( {a:1}, {v:0} );
+assert.eq( 0, t.count( {a:[]} ) );
diff --git a/jstests/libs/testconfig b/jstests/libs/testconfig
new file mode 100644
index 00000000000..0c1fc871d61
--- /dev/null
+++ b/jstests/libs/testconfig
@@ -0,0 +1,4 @@
+fastsync = true
+#comment line
+#commentedflagwithan = false
+version = false
diff --git a/jstests/ork.js b/jstests/ork.js
new file mode 100644
index 00000000000..d6d40161e69
--- /dev/null
+++ b/jstests/ork.js
@@ -0,0 +1,11 @@
+// SERVER-2585 Test $or clauses within indexed top level $or clauses.
+
+t = db.jstests_ork;
+t.drop();
+
+t.ensureIndex( {a:1} );
+t.save( {a:[1,2],b:5} );
+t.save( {a:[2,4],b:5} );
+
+assert.eq( 2, t.find( {$or:[{a:1,$and:[{$or:[{a:2},{a:3}]},{$or:[{b:5}]}]},{a:2,$or:[{a:3},{a:4}]}]} ).itcount() );
+assert.eq( 1, t.find( {$or:[{a:1,$and:[{$or:[{a:2},{a:3}]},{$or:[{b:6}]}]},{a:2,$or:[{a:3},{a:4}]}]} ).itcount() );
diff --git a/jstests/orl.js b/jstests/orl.js
new file mode 100644
index 00000000000..2726975d5aa
--- /dev/null
+++ b/jstests/orl.js
@@ -0,0 +1,13 @@
+// SERVER-3445 Test using coarse multikey bounds for or range elimination.
+
+t = db.jstests_orl;
+t.drop();
+
+t.ensureIndex( {'a.b':1,'a.c':1} );
+// make the index multikey
+t.save( {a:{b:[1,2]}} );
+
+// SERVER-3445
+if ( 0 ) {
+assert( !t.find( {$or:[{'a.b':2,'a.c':3},{'a.b':2,'a.c':4}]} ).explain().clauses );
+}
\ No newline at end of file
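orl.js above inspects explain().clauses to see whether the $or optimizer kept separate per-clause plans. A sketch of that inspection (collection name illustrative only, not part of the commit):

    // when $or clauses are eliminated, explain() carries no "clauses"
    // array and the query runs as a single plan
    var exp = db.demo.find( { $or: [ { x: 1 }, { x: 2 } ] } ).explain();
    if ( exp.clauses )
        print( "ran " + exp.clauses.length + " $or clauses" );
    else
        print( "collapsed to a single plan: " + exp.cursor );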
diff --git a/jstests/orm.js b/jstests/orm.js
new file mode 100644
index 00000000000..83183f05a59
--- /dev/null
+++ b/jstests/orm.js
@@ -0,0 +1,26 @@
+// Test dropping during a $or yield SERVER-3555
+
+if ( 0 ) { // SERVER-3555
+
+t = db.jstests_orm;
+t.drop();
+
+clauses = [];
+for( i = 0; i < 10; ++i ) {
+    clauses.push( {a:{$lte:(i+1)*5000/10},i:49999} );
+    clauses.push( {b:{$lte:(i+1)*5000/10},i:49999} );
+}
+
+p = startParallelShell( 'for( i = 0; i < 30; ++i ) { sleep( 1000 ); db.jstests_orm.drop() }' );
+for( j = 0; j < 10; ++j ) {
+    for( i = 0; i < 5000; ++i ) {
+        t.save( {a:i,i:i} );
+        t.save( {b:i,i:i} );
+    }
+    t.ensureIndex( {a:1} );
+    t.ensureIndex( {b:1} );
+    t.find( {$or:clauses} ).itcount();
+}
+p();
+
+}
\ No newline at end of file
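orm.js (disabled until SERVER-3555 is fixed) uses the standard jstests pattern of racing a query against a parallel shell that drops the collection. The skeleton of that pattern, with sizes shrunk for illustration (not part of the commit):

    // startParallelShell returns a join function; the spawned shell drops
    // the collection while the main shell keeps inserting and querying
    var join = startParallelShell(
        "for ( var i = 0; i < 5; ++i ) { sleep( 1000 ); db.demo.drop(); }" );
    for ( var j = 0; j < 5; ++j ) {
        for ( var i = 0; i < 1000; ++i )
            db.demo.save( { a: i } );
        db.demo.find( { $or: [ { a: { $lte: 500 } } ] } ).itcount();
    }
    join();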
diff --git a/jstests/profile1.js b/jstests/profile1.js
index eed64f60ae2..9654357127f 100644
--- a/jstests/profile1.js
+++ b/jstests/profile1.js
@@ -1,3 +1,4 @@
+print("profile1.js BEGIN");

 try {

@@ -61,21 +62,50 @@ try {
     after = db.system.profile.count()
     assert.eq( before + 3 , after , "X1" )

+    /* sleep() could be inaccurate on certain platforms. let's check */
+    print("\nsleep 2 time actual:");
+    for (var i = 0; i < 4; i++) {
+        print(db.eval("var x = new Date(); sleep(2); return new Date() - x;"));
+    }
+    print();
+    print("\nsleep 20 times actual:");
+    for (var i = 0; i < 4; i++) {
+        print(db.eval("var x = new Date(); sleep(20); return new Date() - x;"));
+    }
+    print();
+    print("\nsleep 120 times actual:");
+    for (var i = 0; i < 4; i++) {
+        print(db.eval("var x = new Date(); sleep(120); return new Date() - x;"));
+    }
+    print();
+
+    function evalSleepMoreThan(millis,max){
+        var start = new Date();
+        db.eval("sleep("+millis+")");
+        var end = new Date();
+        var actual = end.getTime() - start.getTime();
+        if ( actual > ( millis + 5 ) ) {
+            print( "warning wanted to sleep for: " + millis + " but took: " + actual );
+        }
+        return actual >= max ? 1 : 0;
+    }
+
     db.setProfilingLevel(1,100);
     before = db.system.profile.count();
-    db.eval( "sleep(25)" )
-    db.eval( "sleep(120)" )
+    var delta = 0;
+    delta += evalSleepMoreThan( 15 , 100 );
+    delta += evalSleepMoreThan( 120 , 100 );
     after = db.system.profile.count()
-    assert.eq( before + 1 , after , "X2 : " + getProfileAString() )
+    assert.eq( before + delta , after , "X2 : " + getProfileAString() )

     db.setProfilingLevel(1,20);
     before = db.system.profile.count();
-    db.eval( "sleep(25)" )
-    db.eval( "sleep(120)" )
+    delta = 0;
+    delta += evalSleepMoreThan( 5 , 20 );
+    delta += evalSleepMoreThan( 120 , 20 );
     after = db.system.profile.count()
-    assert.eq( before + 2 , after , "X3 : " + getProfileAString() )
-
-
+    assert.eq( before + delta , after , "X3 : " + getProfileAString() )
+
     db.profile.drop();
     db.setProfilingLevel(2)
     var q = { _id : 5 };
@@ -85,7 +115,9 @@ try {
     assert.eq( q , r.query , "Y1" );
     assert.eq( u , r.updateobj , "Y2" );
     assert.eq( "update" , r.op , "Y3" );
-    assert.eq( "test.profile1" , r.ns , "Y4" );
+    assert.eq("test.profile1", r.ns, "Y4");
+
+    print("profile1.js SUCCESS OK");

 } finally {
     // disable profiling for subsequent tests
diff --git a/jstests/profile2.js b/jstests/profile2.js
new file mode 100644
index 00000000000..929b463ca3d
--- /dev/null
+++ b/jstests/profile2.js
@@ -0,0 +1,19 @@
+print("profile2.js BEGIN");
+
+try {
+
+    assert.commandWorked( db.runCommand( {profile:2} ) );
+
+    huge = 'huge';
+    while (huge.length < 2*1024*1024){
+        huge += huge;
+    }
+
+    db.profile2.count({huge:huge}) // would make a huge entry in db.system.profile
+
+    print("profile2.js SUCCESS OK");
+
+} finally {
+    // disable profiling for subsequent tests
+    assert.commandWorked( db.runCommand( {profile:0} ) );
+}
diff --git a/jstests/profile3.js b/jstests/profile3.js
new file mode 100644
index 00000000000..a6574b76f8a
--- /dev/null
+++ b/jstests/profile3.js
@@ -0,0 +1,26 @@
+
+t = db.profile3;
+t.drop();
+
+try {
+    db.setProfilingLevel(0);
+
+    db.system.profile.drop();
+    assert.eq( 0 , db.system.profile.count() )
+
+    db.setProfilingLevel(2);
+
+    t.insert( { x : 1 } );
+    t.findOne( { x : 1 } );
+    t.find( { x : 1 } ).count();
+
+    db.system.profile.find().forEach( printjson )
+
+    db.setProfilingLevel(0);
+    db.system.profile.drop();
+
+}
+finally {
+    db.setProfilingLevel(0);
+}
+
diff --git a/jstests/regexa.js b/jstests/regexa.js
index e9644627548..b0d47190e77 100644
--- a/jstests/regexa.js
+++ b/jstests/regexa.js
@@ -14,6 +14,6 @@ t.save( {a:'a'} );
 check();

 t.ensureIndex( {a:1} );
-if ( 0 ) { // SERVER-3298
+if ( 1 ) { // SERVER-3298
 check();
-}
\ No newline at end of file
+}
diff --git a/jstests/repl/basic1.js b/jstests/repl/basic1.js
index aaa07dc6cc1..4a6091d9755 100644
--- a/jstests/repl/basic1.js
+++ b/jstests/repl/basic1.js
@@ -160,6 +160,8 @@ assert.eq( 0 , as.system.profile.count() , "P2" )
 assert.eq( 1 , as.foo.findOne().x , "P3" );
 assert.eq( 0 , as.system.profile.count() , "P4" )

+assert( as.getCollectionNames().indexOf( "system.profile" ) < 0 , "P4.5" )
+
 as.setProfilingLevel(2)
 as.foo.findOne();
 assert.eq( 1 , as.system.profile.count() , "P5" )
diff --git a/jstests/repl/drop_dups.js b/jstests/repl/drop_dups.js
new file mode 100644
index 00000000000..1fa9984ea06
--- /dev/null
+++ b/jstests/repl/drop_dups.js
@@ -0,0 +1,63 @@
+
+var rt = new ReplTest( "drop_dups" );
+
+m = rt.start( true );
+s = rt.start( false );
+
+function block(){
+    am.runCommand( { getlasterror : 1 , w : 2 , wtimeout : 3000 } )
+}
+
+am = m.getDB( "foo" );
+as = s.getDB( "foo" );
+
+function run( createInBackground ) {
+
+    collName = "foo" + ( createInBackground ? "B" : "F" );
+
+    am[collName].drop();
+    am.blah.insert( { x : 1 } )
+    block();
+
+    for ( i=0; i<10; i++ ) {
+        am[collName].insert( { _id : i , x : Math.floor( i / 2 ) } )
+    }
+
+    block();
+
+    am.runCommand( { "godinsert" : collName , obj : { _id : 100 , x : 20 } } );
+    am.runCommand( { "godinsert" : collName , obj : { _id : 101 , x : 20 } } );
+
+    as.runCommand( { "godinsert" : collName , obj : { _id : 101 , x : 20 } } );
+    as.runCommand( { "godinsert" : collName , obj : { _id : 100 , x : 20 } } );
+
+    assert.eq( as[collName].count() , am[collName].count() );
+
+    function mymap(z) {
+        return z._id + ":" + z.x + ",";
+    }
+
+
+    if ( am.serverStatus().mem.bits == 64 ) {
+        assert.neq( tojson(am[collName].find().map(mymap)) ,
+                    tojson(as[collName].find().map(mymap)) , "order is not supposed to be same on master and slave but it is" );
+    }
+
+
+    am[collName].ensureIndex( { x : 1 } , { unique : true , dropDups : true , background : createInBackground } );
+    am.blah.insert( { x : 1 } )
+    block();
+
+    assert.eq( 2 , am[collName].getIndexKeys().length , "A1 : " + createInBackground )
+    assert.eq( 2 , as[collName].getIndexKeys().length , "A2 : " + createInBackground )
+
+    assert.eq( am[collName].find().sort( { _id : 1 } ).map(mymap) ,
+               as[collName].find().sort( { _id : 1 } ).map(mymap) , "different things dropped on master and slave" );
+
+
+}
+
+run( false )
+run( true )
+
+rt.stop()
diff --git a/jstests/repl/repl3.js b/jstests/repl/repl3.js
index d3c38486b19..5ace9b69d2f 100644
--- a/jstests/repl/repl3.js
+++ b/jstests/repl/repl3.js
@@ -10,38 +10,42 @@ soonCount = function( count ) {
     } );
 }

-doTest = function( signal ) {
-
-    rt = new ReplTest( "repl3tests" );
-
-    m = rt.start( true );
-    s = rt.start( false );
-
-    am = m.getDB( baseName ).a
-
-    am.save( { _id: new ObjectId() } );
-    soonCount( 1 );
-    rt.stop( false, signal );
-
-    big = new Array( 2000 ).toString();
-    for( i = 0; i < 1000; ++i )
-        am.save( { _id: new ObjectId(), i: i, b: big } );
-
-    s = rt.start( false, { autoresync: null }, true );
-
+doTest = function (signal) {
+
+    print("repl3.js doTest(" + signal + ")")
+
+    rt = new ReplTest("repl3tests");
+
+    m = rt.start(true);
+    s = rt.start(false);
+
+    am = m.getDB(baseName).a
+
+    am.save({ _id: new ObjectId() });
+    soonCount(1);
+    rt.stop(false, signal);
+
+    big = new Array(2000).toString();
+    for (i = 0; i < 1000; ++i)
+        am.save({ _id: new ObjectId(), i: i, b: big });
+
+    s = rt.start(false, { autoresync: null }, true);
+
     // after SyncException, mongod waits 10 secs.
-    sleep( 15000 );
-
+    sleep(15000);
+
     // Need the 2 additional seconds timeout, since commands don't work on an 'allDead' node.
-    soonCount( 1001 );
-    as = s.getDB( baseName ).a
-    assert.eq( 1, as.find( { i: 0 } ).count() );
-    assert.eq( 1, as.find( { i: 999 } ).count() );
-
-    assert.commandFailed( s.getDB( "admin" ).runCommand( { "resync" : 1 } ) );
+    soonCount(1001);
+    as = s.getDB(baseName).a
+    assert.eq(1, as.find({ i: 0 }).count());
+    assert.eq(1, as.find({ i: 999 }).count());
+
+    assert.commandFailed(s.getDB("admin").runCommand({ "resync": 1 }));

     rt.stop();
 }

 doTest( 15 ); // SIGTERM
 doTest( 9 ); // SIGKILL
+
+print("repl3.js OK")
diff --git a/jstests/replsets/auth1.js b/jstests/replsets/auth1.js
index e9765c08153..edc162cca16 100644
--- a/jstests/replsets/auth1.js
+++ b/jstests/replsets/auth1.js
@@ -81,6 +81,10 @@ function doQueryOn(p) {
 doQueryOn(slave);

 master.adminCommand({logout:1});
+
+print("unauthorized:");
+printjson(master.adminCommand({replSetGetStatus : 1}));
+
 doQueryOn(master);
diff --git a/jstests/replsets/downstream.js b/jstests/replsets/downstream.js
new file mode 100755
index 00000000000..795e6671d46
--- /dev/null
+++ b/jstests/replsets/downstream.js
@@ -0,0 +1,36 @@
+// BUG: [SERVER-1768] replica set getlasterror {w: 2} after 2000
+// inserts hangs while secondary servers log "replSet error RS102 too stale to catch up" every once in a while
+
+function newReplicaSet (name, numServers) {
+    var rs = new ReplSetTest({name: name, nodes: numServers})
+    rs.startSet()
+    rs.initiate()
+    rs.awaitReplication()
+    return rs
+}
+
+function go() {
+var N = 2000
+
+// ~1KB string
+var Text = ''
+for (var i = 0; i < 40; i++)
+    Text += 'abcdefghijklmnopqrstuvwxyz'
+
+// Create replica set of 3 servers
+var repset = newReplicaSet('repset', 3)
+var conn = repset.getMaster()
+var db = conn.getDB('test')
+
+// Add data to it
+for (var i = 0; i < N; i++)
+    db['foo'].insert({x: i, text: Text})
+
+// wait to be copied to at least one secondary (BUG hangs here)
+db.getLastError(2)
+
+print('getlasterror_w2.js SUCCESS')
+}
+
+// turn off until fixed
+//go();
diff --git a/jstests/replsets/fastsync.js b/jstests/replsets/fastsync.js
index 5ba978481cd..1c9c2152ebb 100644
--- a/jstests/replsets/fastsync.js
+++ b/jstests/replsets/fastsync.js
@@ -48,7 +48,7 @@ var admin = p.getDB("admin");
 var foo = p.getDB("foo");
 var local = p.getDB("local");

-var config = {_id : basename, members : [{_id : 0, host : hostname+":"+ports[0]}]};
+var config = {_id : basename, members : [{_id : 0, host : hostname+":"+ports[0], priority:2}]};
 printjson(config);
 var result = admin.runCommand({replSetInitiate : config});
 print("result:");
@@ -98,6 +98,7 @@ var startSlave = function(n) {
     config.members.push({_id:n, host:hostname+":"+ports[n]});

     result = admin.runCommand({replSetReconfig : config});
+    printjson(result);
     assert(result.ok, "reconfig worked");
     reconnect(p);

@@ -125,6 +126,10 @@ var startSlave = function(n) {

     assert.eq(status.members[n].state, 2);

+    assert.soon(function() {
+        return admin.runCommand({isMaster : 1}).ismaster;
+    });
+
     admin.foo.insert({x:1});
     assert.soon(function() {
         var last = local.oplog.rs.find().sort({$natural:-1}).limit(1).next();
diff --git a/jstests/replsets/maintenance.js b/jstests/replsets/maintenance.js
new file mode 100644
index 00000000000..5b068cd3d8e
--- /dev/null
+++ b/jstests/replsets/maintenance.js
@@ -0,0 +1,32 @@
+
+
+var replTest = new ReplSetTest( {name: 'unicomplex', nodes: 3} );
+var conns = replTest.startSet();
+replTest.initiate();
+
+// Make sure we have a master
+var master = replTest.getMaster();
+
+for (i=0;i<10000; i++) { master.getDB("bar").foo.insert({x:1,y:i,abc:123,str:"foo bar baz"}); }
+for (i=0;i<1000; i++) { master.getDB("bar").foo.update({y:i},{$push :{foo : "barrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr"}}); }
+
+replTest.awaitReplication();
+
+assert.soon(function() { return conns[2].getDB("admin").isMaster().secondary; });
+
+join = startParallelShell( "db.getSisterDB('bar').runCommand({compact : 'foo'});", replTest.ports[2] );
+
+print("check secondary goes to recovering");
+assert.soon(function() { return !conns[2].getDB("admin").isMaster().secondary; });
+
+print("joining");
+join();
+
+print("check secondary becomes a secondary again");
+var x = 0;
+assert.soon(function() {
+    var im = conns[2].getDB("admin").isMaster();
+    if (x++ % 5 == 0) printjson(im);
+    return im.secondary;
+});
+
diff --git a/jstests/replsets/remove1.js b/jstests/replsets/remove1.js
index 6b9cf5d3c30..f93fe9eb071 100644
--- a/jstests/replsets/remove1.js
+++ b/jstests/replsets/remove1.js
@@ -92,8 +92,12 @@ print("reconfig with minority");
 replTest.stop(1);

 assert.soon(function() {
-    reconnect(master);
-    return master.getDB("admin").runCommand({isMaster : 1}).secondary;
+    try {
+        return master.getDB("admin").runCommand({isMaster : 1}).secondary;
+    }
+    catch(e) {
+        print("trying to get master: "+e);
+    }
 });

 config.version = 4;
diff --git a/jstests/replsets/replset5.js b/jstests/replsets/replset5.js
index a861bd6ff04..67ce2d78bcd 100644
--- a/jstests/replsets/replset5.js
+++ b/jstests/replsets/replset5.js
@@ -23,53 +23,63 @@ doTest = function (signal) {

     master.getDB("barDB").bar.save({ a: 1 });
replTest.awaitReplication();
-    // These writes should be replicated immediately
-    var docNum = 5000;
-    for(var n=0; n<docNum; n++) {
-        master.getDB(testDB).foo.insert({ n: n });
-    }
-
-    // If you want to test failure, just add values for w and wtimeout
-    // to the following command. This will override the default set above and
-    // prevent replication from happening in time for the count tests below.
-    var result = master.getDB("admin").runCommand({getlasterror: 1});
-    printjson(result);
-
+    // These writes should be replicated immediately
+ var docNum = 5000;
+ for (var n = 0; n < docNum; n++) {
+ master.getDB(testDB).foo.insert({ n: n });
+ }
+
+ // should use the configured last error defaults from above, that's what we're testing.
+ //
+ // If you want to test failure, just add values for w and wtimeout (e.g. w=1)
+ // to the following command. This will override the default set above and
+ // prevent replication from happening in time for the count tests below.
+ //
+ var result = master.getDB("admin").runCommand({ getlasterror: 1 });
+ print("replset5.js getlasterror result:");
+ printjson(result);
+
+ if (result.err == "timeout") {
+        print("\nWARNING getLastError timed out and should not have.\nThis machine seems extremely slow. Stopping test without failing it\n")
+ replTest.stopSet(signal);
+        print("\nWARNING getLastError timed out and should not have.\nThis machine seems extremely slow. Stopping test without failing it\n")
+ return;
+ }
var slaves = replTest.liveNodes.slaves;
slaves[0].setSlaveOk();
slaves[1].setSlaveOk();
- print("Testing slave counts");
+ print("replset5.js Testing slave counts");
+
+ var slave0count = slaves[0].getDB(testDB).foo.count();
+ assert(slave0count == docNum, "Slave 0 has " + slave0count + " of " + docNum + " documents!");
+
+ var slave1count = slaves[1].getDB(testDB).foo.count();
+ assert(slave1count == docNum, "Slave 1 has " + slave1count + " of " + docNum + " documents!");
+
+ var master1count = master.getDB(testDB).foo.count();
+ assert(master1count == docNum, "Master has " + master1count + " of " + docNum + " documents!");
+
+ print("replset5.js reconfigure with hidden=1");
+ config = master.getDB("local").system.replset.findOne();
+ config.version++;
+ config.members[2].priority = 0;
+ config.members[2].hidden = 1;
+
+ try {
+ master.adminCommand({ replSetReconfig: config });
+ }
+ catch (e) {
+ print(e);
+ }
+
+ config = master.getDB("local").system.replset.findOne();
+ printjson(config);
+ assert.eq(config.members[2].hidden, true);
- var slave0count = slaves[0].getDB(testDB).foo.count(); - assert( slave0count == docNum, "Slave 0 has " + slave0count + " of " + docNum + " documents!"); - - var slave1count = slaves[1].getDB(testDB).foo.count(); - assert( slave1count == docNum, "Slave 1 has " + slave1count + " of " + docNum + " documents!"); - - var master1count = master.getDB(testDB).foo.count(); - assert( master1count == docNum, "Master has " + master1count + " of " + docNum + " documents!"); - - print("reconfigure with hidden=1"); - config = master.getDB("local").system.replset.findOne(); - config.version++; - config.members[2].priority = 0; - config.members[2].hidden = 1; - - try { - master.adminCommand({replSetReconfig : config}); - } - catch(e) { - print(e); - } - - config = master.getDB("local").system.replset.findOne(); - printjson(config); - assert.eq(config.members[2].hidden, true); - replTest.stopSet(signal);
}

-doTest( 15 );
-print("replset5.js success");
+doTest( 15 );
+print("replset5.js success");
diff --git a/jstests/replsets/replsetarb2.js b/jstests/replsets/replsetarb2.js
index a20c41b79c0..6f712cbc257 100644
--- a/jstests/replsets/replsetarb2.js
+++ b/jstests/replsets/replsetarb2.js
@@ -8,11 +8,11 @@ doTest = function( signal ) {
     print(tojson(nodes));

     var conns = replTest.startSet();
-    var r = replTest.initiate({"_id" : "unicomplex",
+    var r = replTest.initiate({"_id" : "unicomplex",
                   "members" : [
-                       {"_id" : 0, "host" : nodes[0] },
-                       {"_id" : 1, "host" : nodes[1], "arbiterOnly" : true, "votes": 1},
-                       {"_id" : 2, "host" : nodes[2] }]});
+                       {"_id" : 0, "host" : nodes[0] },
+                       {"_id" : 1, "host" : nodes[1], "arbiterOnly" : true, "votes": 1, "priority" : 0},
+                       {"_id" : 2, "host" : nodes[2] }]});

     // Make sure we have a master
     var master = replTest.getMaster();
@@ -24,6 +24,10 @@ doTest = function( signal ) {
         return res.myState == 7;
     }, "Aribiter failed to initialize.");

+    var result = conns[1].getDB("admin").runCommand({isMaster : 1});
+    assert(result.arbiterOnly);
+    assert(!result.passive);
+
     // Wait for initial replication
     master.getDB("foo").foo.insert({a: "foo"});
     replTest.awaitReplication();
diff --git a/jstests/replsets/rollback2.js b/jstests/replsets/rollback2.js
index 46fb548ccdf..7ab3c6bf4ee 100644
--- a/jstests/replsets/rollback2.js
+++ b/jstests/replsets/rollback2.js
@@ -202,9 +202,24 @@ doTest = function (signal) {
     wait(function () { return B.isMaster().ismaster || B.isMaster().secondary; });

     // everyone is up here...
-    assert(A.isMaster().ismaster || A.isMaster().secondary, "A up");
-    assert(B.isMaster().ismaster || B.isMaster().secondary, "B up");
     replTest.awaitReplication();
+
+    // theoretically, a read could slip in between StateBox::change() printing
+    // replSet SECONDARY
+    // and the replset actually becoming secondary
+    // so we're trying to wait for that here
+    print("waiting for secondary");
+    assert.soon(function() {
+        try {
+            var aim = A.isMaster();
+            var bim = B.isMaster();
+            return (aim.ismaster || aim.secondary) &&
+                (bim.ismaster || bim.secondary);
+        }
+        catch(e) {
+            print("checking A and B: "+e);
+        }
+    });

     verify(a);
diff --git a/jstests/replsets/tags.js b/jstests/replsets/tags.js
index 8ef8a0a12cc..4e738862afe 100644
--- a/jstests/replsets/tags.js
+++ b/jstests/replsets/tags.js
@@ -8,16 +8,16 @@ var nodes = replTest.startSet();
 var port = replTest.ports;

 replTest.initiate({_id : name, members :
        [
-        {_id:0, host : host+":"+port[0], tags : ["0", "dc.ny.rk1", "machine"]},
-        {_id:1, host : host+":"+port[1], tags : ["1", "dc.ny.rk1", "machine"]},
-        {_id:2, host : host+":"+port[2], tags : ["2", "dc.ny.rk2", "machine"]},
-        {_id:3, host : host+":"+port[3], tags : ["3", "dc.sf.rk1", "machine"]},
-        {_id:4, host : host+":"+port[4], tags : ["4", "dc.sf.rk2", "machine"]},
+        {_id:0, host : host+":"+port[0], tags : {"server" : "0", "dc" : "ny", "ny" : "1", "rack" : "ny.rk1"}},
+        {_id:1, host : host+":"+port[1], tags : {"server" : "1", "dc" : "ny", "ny" : "2", "rack" : "ny.rk1"}},
+        {_id:2, host : host+":"+port[2], tags : {"server" : "2", "dc" : "ny", "ny" : "3", "rack" : "ny.rk2", "2" : "this"}},
+        {_id:3, host : host+":"+port[3], tags : {"server" : "3", "dc" : "sf", "sf" : "1", "rack" : "sf.rk1"}},
+        {_id:4, host : host+":"+port[4], tags : {"server" : "4", "dc" : "sf", "sf" : "2", "rack" : "sf.rk2"}},
        ],
        settings : {
            getLastErrorModes : {
-               "important" : {"dc" : 2, "machine" : 3},
-               "a machine" : {"machine" : 1}
+               "important" : {"dc" : 2, "server" : 3},
+               "a machine" : {"server" : 1}
            }
        }});
@@ -29,14 +29,14 @@
 printjson(config);

 var modes = config.settings.getLastErrorModes;
 assert.eq(typeof modes, "object");
 assert.eq(modes.important.dc, 2);
-assert.eq(modes.important.machine, 3);
-assert.eq(modes["a machine"]["machine"], 1);
+assert.eq(modes.important.server, 3);
+assert.eq(modes["a machine"]["server"], 1);

 config.version++;
 config.members[1].priority = 1.5;
 config.members[2].priority = 2;
-modes.rack = {"dc.sf" : 1};
-modes.niceRack = {"dc.sf" : 2};
+modes.rack = {"sf" : 1};
+modes.niceRack = {"sf" : 2};
 modes["a machine"]["2"] = 1;
 modes.on2 = {"2" : 1}

@@ -57,10 +57,10 @@ printjson(config);
 modes = config.settings.getLastErrorModes;
 assert.eq(typeof modes, "object");
 assert.eq(modes.important.dc, 2);
-assert.eq(modes.important.machine, 3);
-assert.eq(modes["a machine"]["machine"], 1);
-assert.eq(modes.rack["dc.sf"], 1);
-assert.eq(modes.niceRack["dc.sf"], 2);
+assert.eq(modes.important.server, 3);
+assert.eq(modes["a machine"]["server"], 1);
+assert.eq(modes.rack["sf"], 1);
+assert.eq(modes.niceRack["sf"], 2);

 print("bridging");
 replTest.bridge();
@@ -75,8 +75,11 @@ replTest.partition(3, 4);
 print("done bridging");

 print("test1");
+print("2 should be primary");

 master = replTest.getMaster();

+printjson(master.getDB("admin").runCommand({replSetGetStatus:1}));
+
 var timeout = 20000;

 master.getDB("foo").bar.insert({x:1});
diff --git a/jstests/replsets/tags2.js b/jstests/replsets/tags2.js
new file mode 100644
index 00000000000..16dfcdf4983
--- /dev/null
+++ b/jstests/replsets/tags2.js
@@ -0,0 +1,44 @@
+// Change a getLastErrorMode from 2 to 3 servers
+
+var host = getHostName();
+var replTest = new ReplSetTest( {name: "rstag", nodes: 3, startPort: 31000} );
+var nodes = replTest.startSet();
+var ports = replTest.ports;
+var conf = {_id : "rstag", version: 1, members : [
+           {_id : 0, host : host+":"+ports[0], tags : {"backup" : "A"}},
+           {_id : 1, host : host+":"+ports[1], tags : {"backup" : "B"}},
+           {_id : 2, host : host+":"+ports[2], tags : {"backup" : "C"}} ],
+           settings : {getLastErrorModes : {
+               backedUp : {backup : 2} }} };
+replTest.initiate( conf );
+replTest.awaitReplication();
+
+master = replTest.getMaster();
+var db = master.getDB("test");
+db.foo.insert( {x:1} );
+var result = db.runCommand( {getLastError:1, w:"backedUp", wtimeout:20000} );
+assert.eq (result.err, null);
+
+conf.version = 2;
+conf.settings.getLastErrorModes.backedUp.backup = 3;
+master.getDB("admin").runCommand( {replSetReconfig: conf} );
+replTest.awaitReplication();
+
+master = replTest.getMaster();
+var db = master.getDB("test");
+db.foo.insert( {x:2} );
+var result = db.runCommand( {getLastError:1, w:"backedUp", wtimeout:20000} );
+assert.eq (result.err, null);
+
+conf.version = 3;
+conf.members[0].priority = 3;
+conf.members[2].priority = 0;
+master.getDB("admin").runCommand( {replSetReconfig: conf} );
+
+master = replTest.getMaster();
+var db = master.getDB("test");
+db.foo.insert( {x:3} );
+var result = db.runCommand( {getLastError:1, w:"backedUp", wtimeout:20000} );
+assert.eq (result.err, null);
+
+replTest.stopSet();
diff --git a/jstests/replsets/toostale.js b/jstests/replsets/toostale.js
index a1217a6a547..08b1a9c2c6f 100644
--- a/jstests/replsets/toostale.js
+++ b/jstests/replsets/toostale.js
@@ -111,16 +111,12 @@ replTest.restart(2);

 print("8: check s2.state == 3");
-status = master.getDB("admin").runCommand({replSetGetStatus:1});
-while (status.state == 0) {
-    print("state is 0: ");
-    printjson(status);
-    sleep(1000);
-    status = master.getDB("admin").runCommand({replSetGetStatus:1});
-}
+assert.soon(function() {
+    var status = master.getDB("admin").runCommand({replSetGetStatus:1});
+    printjson(status);
+    return status.members && status.members[2].state == 3;
+});

-printjson(status);
-assert.eq(status.members[2].state, 3, 'recovering');

 print("make sure s2 doesn't become primary");
 replTest.stop(0);
diff --git a/jstests/sharding/addshard4.js b/jstests/sharding/addshard4.js
index 81cc1f89e73..4a44b5537b2 100644
--- a/jstests/sharding/addshard4.js
+++ b/jstests/sharding/addshard4.js
@@ -2,14 +2,14 @@

 s = new ShardingTest( "addshard4", 2 , 0 , 1 , {useHostname : true});

-r = new ReplSetTest({name : "addshard4", nodes : 3, startPort : 34000});
+r = new ReplSetTest({name : "addshard4", nodes : 3, startPort : 31100});
 r.startSet();

 var config = r.getReplSetConfig();
 config.members[2].priority = 0;

 r.initiate(config);

-//Wait for replica set to be fully initialized - could take some time
+//Wait for replica set to be fully initialized - could take some time
 //to pre-allocate files on slow systems
 r.awaitReplication();
@@ -25,14 +25,14 @@ var result = s.adminCommand({"addshard" : shardName});
 printjson(result);
 assert.eq(result, true);

-r = new ReplSetTest({name : "addshard42", nodes : 3, startPort : 36000});
+r = new ReplSetTest({name : "addshard42", nodes : 3, startPort : 31200});
 r.startSet();

 config = r.getReplSetConfig();
 config.members[2].arbiterOnly = true;

 r.initiate(config);

-// Wait for replica set to be fully initialized - could take some time
+// Wait for replica set to be fully initialized - could take some time
 // to pre-allocate files on slow systems
 r.awaitReplication();
diff --git a/jstests/sharding/array_shard_key.js b/jstests/sharding/array_shard_key.js
new file mode 100644
index 00000000000..1ea61e8d3a8
--- /dev/null
+++ b/jstests/sharding/array_shard_key.js
@@ -0,0 +1,127 @@
+// Ensure you can't shard on an array key
+
+var st = new ShardingTest({ name : jsTestName(), shards : 3 })
+
+var mongos = st.s0
+
+var coll = mongos.getCollection( jsTestName() + ".foo" )
+
+st.shardColl( coll, { _id : 1, i : 1 }, { _id : ObjectId(), i : 1 } )
+
+printjson( mongos.getDB("config").chunks.find().toArray() )
+
+st.printShardingStatus()
+
+print( "1: insert some invalid data" )
+
+var value = null
+
+var checkError = function( shouldError ){
+    var error = coll.getDB().getLastError()
+
+    if( error != null ) printjson( error )
+
+    if( error == null && ! shouldError ) return
+    if( error != null && shouldError ) return
+
+    if( error == null ) print( "No error detected!" )
+    else print( "Unexpected error!" )
+
+    assert( false )
+}
+
+// Insert an object with invalid array key
+coll.insert({ i : [ 1, 2 ] })
+checkError( true )
+
+// Insert an object with valid array key
+coll.insert({ i : 1 })
+checkError( false )
+
+// Update the value with valid other field
+value = coll.findOne({ i : 1 })
+coll.update( value, { $set : { j : 2 } } )
+checkError( false )
+
+// Update the value with invalid other fields
+value = coll.findOne({ i : 1 })
+coll.update( value, Object.merge( value, { i : [ 3 ] } ) )
+checkError( true )
+
+// Multi-update the value with invalid other fields
+value = coll.findOne({ i : 1 })
+coll.update( value, Object.merge( value, { i : [ 3, 4 ] } ), false, true)
+checkError( true )
+
+// Single update the value with valid other fields
+value = coll.findOne({ i : 1 })
+coll.update( Object.merge( value, { i : [ 3, 4 ] } ), value )
+checkError( true )
+
+// Multi-update the value with other fields (won't work, but no error)
+value = coll.findOne({ i : 1 })
+coll.update( Object.merge( value, { i : [ 1, 1 ] } ), { $set : { k : 4 } }, false, true)
+checkError( false )
+
+// Query the value with other fields (won't work, but no error)
+value = coll.findOne({ i : 1 })
+coll.find( Object.merge( value, { i : [ 1, 1 ] } ) ).toArray()
+checkError( false )
+
+// Can't remove using multikey, but shouldn't error
+value = coll.findOne({ i : 1 })
+coll.remove( Object.extend( value, { i : [ 1, 2, 3, 4 ] } ) )
+checkError( false )
+
+// Can't remove using multikey, but shouldn't error
+value = coll.findOne({ i : 1 })
+coll.remove( Object.extend( value, { i : [ 1, 2, 3, 4, 5 ] } ) )
+error = coll.getDB().getLastError()
+assert.eq( error, null )
+assert.eq( coll.find().itcount(), 1 )
+
+value = coll.findOne({ i : 1 })
+coll.remove( Object.extend( value, { i : 1 } ) )
+error = coll.getDB().getLastError()
+assert.eq( error, null )
+assert.eq( coll.find().itcount(), 0 )
+
+printjson( "Sharding-then-inserting-multikey tested, now trying inserting-then-sharding-multikey" )
+
+// Insert a bunch of data then shard over key which is an array
+var coll = mongos.getCollection( "" + coll + "2" )
+for( var i = 0; i < 10; i++ ){
+    // TODO : does not check weird cases like [ i, i ]
+    coll.insert({ i : [ i, i + 1 ] })
+    checkError( false )
+}
+
+coll.ensureIndex({ _id : 1, i : 1 })
+
+try {
+    st.shardColl( coll, { _id : 1, i : 1 }, { _id : ObjectId(), i : 1 } )
+}
+catch( e ){
+    print( "Correctly threw error on sharding with multikey index." )
) +} + +st.printShardingStatus() + +// Insert a bunch of data then shard over key which is not an array +var coll = mongos.getCollection( "" + coll + "3" ) +for( var i = 0; i < 10; i++ ){ + // TODO : does not check weird cases like [ i, i ] + coll.insert({ i : i }) + checkError( false ) +} + +coll.ensureIndex({ _id : 1, i : 1 }) + +st.shardColl( coll, { _id : 1, i : 1 }, { _id : ObjectId(), i : 1 } ) + +st.printShardingStatus() + + + +// Finish +st.stop() diff --git a/jstests/sharding/auth.js b/jstests/sharding/auth.js index 559ec2c1fac..8d8d7d79dab 100644 --- a/jstests/sharding/auth.js +++ b/jstests/sharding/auth.js @@ -56,7 +56,7 @@ s.s = s._mongos[0] = s["s0"] = conn; login(adminUser); -d1 = new ReplSetTest({name : "d1", nodes : 3, startPort : 34000}); +d1 = new ReplSetTest({name : "d1", nodes : 3, startPort : 31100}); d1.startSet({keyFile : "jstests/libs/key2"}); d1.initiate(); @@ -102,6 +102,18 @@ s.getDB(testUser.db).addUser(testUser.username, testUser.password); logout(adminUser); +print("query try"); +var e = assert.throws(function() { + conn.getDB("foo").bar.findOne(); +}); +printjson(e); + +print("cmd try"); +e = assert.throws(function() { + conn.getDB("foo").runCommand({listdbs:1}); +}); +printjson(e); + print("insert try 1"); s.getDB("test").foo.insert({x:1}); result = s.getDB("test").runCommand({getLastError : 1}); @@ -118,7 +130,7 @@ assert.eq(result.err, null); logout(testUser); -d2 = new ReplSetTest({name : "d2", nodes : 3, startPort : 36000}); +d2 = new ReplSetTest({name : "d2", nodes : 3, startPort : 31200}); d2.startSet({keyFile : "jstests/libs/key1"}); d2.initiate(); @@ -156,4 +168,10 @@ while (cursor.hasNext()) { assert.eq(count, 501); +// check that dump doesn't get stuck with auth +var x = runMongoProgram( "mongodump", "--host", "127.0.0.1:31000", "-d", testUser.db, "-u", testUser.username, "-p", testUser.password); + +print("result: "+x); + + s.stop(); diff --git a/jstests/sharding/count_slaveok.js b/jstests/sharding/count_slaveok.js new file mode 100644 index 00000000000..075ab41c2ad --- /dev/null +++ b/jstests/sharding/count_slaveok.js @@ -0,0 +1,69 @@ +// Tests count and distinct using slaveOk + +var st = new ShardingTest( testName = "countSlaveOk", + numShards = 1, + verboseLevel = 0, + numMongos = 1, + { rs : true, + rs0 : { nodes : 2 } + }) + +var rst = st._rs[0].test + +// Insert data into replica set +var conn = new Mongo( st.s.host ) +conn.setLogLevel( 3 ) + +var coll = conn.getCollection( "test.countSlaveOk" ) +coll.drop() + +for( var i = 0; i < 300; i++ ){ + coll.insert( { i : i % 10 } ) +} + +var connA = conn +var connB = new Mongo( st.s.host ) +var connC = new Mongo( st.s.host ) + +// Make sure the writes get through, otherwise we can continue to error these one-at-a-time +coll.getDB().getLastError() + +st.printShardingStatus() + +// Wait for client to update itself and replication to finish +rst.awaitReplication() + +var primary = rst.getPrimary() +var sec = rst.getSecondary() + +// Data now inserted... 
stop the master, since only two in set, other will still be secondary
+rst.stop( rst.getMaster(), undefined, true )
+printjson( rst.status() )
+
+// Wait for the mongos to recognize the slave
+ReplSetTest.awaitRSClientHosts( conn, sec, { ok : true, secondary : true } )
+
+// Need to check slaveOk=true first, since slaveOk=false will destroy conn in pool when
+// master is down
+conn.setSlaveOk()
+
+// Should not throw exception, since slaveOk'd
+assert.eq( 30, coll.find({ i : 0 }).count() )
+assert.eq( 10, coll.distinct("i").length )
+
+try {
+
+    conn.setSlaveOk( false )
+    coll.find({ i : 0 }).count()
+
+    print( "Should not reach here!" )
+    printjson( coll.getDB().getLastError() )
+    assert( false )
+
+}
+catch( e ){
+    print( "Non-slaveOk'd connection failed." )
+}
+
+// Finish
+st.stop()
diff --git a/jstests/sharding/drop_sharded_db.js b/jstests/sharding/drop_sharded_db.js
new file mode 100644
index 00000000000..aedde8f5032
--- /dev/null
+++ b/jstests/sharding/drop_sharded_db.js
@@ -0,0 +1,62 @@
+// Tests the dropping of a sharded database SERVER-3471 SERVER-1726
+
+var st = new ShardingTest({ name : jsTestName() })
+
+var mongos = st.s0
+var config = mongos.getDB( "config" )
+
+var dbName = "buy"
+var dbA = mongos.getDB( dbName )
+var dbB = mongos.getDB( dbName + "_201107" )
+var dbC = mongos.getDB( dbName + "_201108" )
+
+print( "1: insert some data and colls into all dbs" )
+
+var numDocs = 3000;
+var numColls = 10;
+for( var i = 0; i < numDocs; i++ ){
+    dbA.getCollection( "data" + (i % numColls) ).insert({ _id : i })
+    dbB.getCollection( "data" + (i % numColls) ).insert({ _id : i })
+    dbC.getCollection( "data" + (i % numColls) ).insert({ _id : i })
+}
+
+print( "2: shard the colls ")
+
+for( var i = 0; i < numColls; i++ ){
+
+    var key = { _id : 1 }
+    st.shardColl( dbA.getCollection( "data" + i ), key )
+    st.shardColl( dbB.getCollection( "data" + i ), key )
+    st.shardColl( dbC.getCollection( "data" + i ), key )
+
+}
+
+print( "3: drop the non-suffixed db ")
+
+dbA.dropDatabase()
+
+
+print( "4: ensure only the non-suffixed db was dropped ")
+
+var dbs = mongos.getDBNames()
+for( var i = 0; i < dbs.length; i++ ){
+    assert.neq( dbs[i], "" + dbA )
+}
+
+assert.eq( 0, config.databases.find({ _id : "" + dbA }).toArray().length )
+assert.eq( 1, config.databases.find({ _id : "" + dbB }).toArray().length )
+assert.eq( 1, config.databases.find({ _id : "" + dbC }).toArray().length )
+
+assert.eq( numColls, config.collections.find({ _id : RegExp( "^" + dbA + "\\..*" ), dropped : true }).toArray().length )
+assert.eq( numColls, config.collections.find({ _id : RegExp( "^" + dbB + "\\..*" ), dropped : false }).toArray().length )
+assert.eq( numColls, config.collections.find({ _id : RegExp( "^" + dbC + "\\..*" ), dropped : false }).toArray().length )
+
+for( var i = 0; i < numColls; i++ ){
+
+    assert.eq( numDocs / numColls, dbB.getCollection( "data" + (i % numColls) ).find().itcount() )
+    assert.eq( numDocs / numColls, dbC.getCollection( "data" + (i % numColls) ).find().itcount() )
+
+}
+
+// Finish
+st.stop()
diff --git a/jstests/sharding/features3.js b/jstests/sharding/features3.js
index 6870bb70208..5277d22ac56
--- a/jstests/sharding/features3.js
+++ b/jstests/sharding/features3.js
@@ -17,54 +17,79 @@ for ( i=0; i<N; i++ )
    db.foo.insert( { _id : i } )
db.getLastError();
x = db.foo.stats();
+assert.eq( "test.foo" , x.ns , "basic1" )
+assert( x.sharded , "basic2" )
assert.eq( N , x.count , "total count" )
assert.eq( N / 2 , x.shards.shard0000.count , "count on shard0000" )
assert.eq( N / 2 , x.shards.shard0001.count , "count on shard0001" )
assert( x.totalIndexSize > 0 )
assert( x.numExtents > 0 )
+db.bar.insert( { x : 1 } )
+x = db.bar.stats();
+assert.eq( 1 , x.count , "XXX1" )
+assert.eq( "test.bar" , x.ns , "XXX2" )
+assert( ! x.sharded , "XXX3: " + tojson(x) )
+
+// Fork shell and start pulling back data
start = new Date()
print( "about to fork shell: " + Date() )
-join = startParallelShell( "db.foo.find( function(){ x = ''; for ( i=0; i<10000; i++ ){ x+=i; } return true; } ).itcount()" )
+
+// TODO: Still potential problem when our sampling of current ops misses when $where is active -
+// solution is to increase sleep time
+parallelCommand = "try { while(true){" +
+                  " db.foo.find( function(){ x = ''; for ( i=0; i<10000; i++ ){ x+=i; } sleep( 1000 ); return true; } ).itcount() " +
+                  "}} catch(e){ print('PShell execution ended:'); printjson( e ) }"
+
+join = startParallelShell( parallelCommand )
print( "after forking shell: " + Date() )
+// Get all current $where operations
function getMine( printInprog ){
+    var inprog = db.currentOp().inprog;
+    if ( printInprog ) printjson( inprog )
+
+    // Find all the where queries
    var mine = []
    for ( var x=0; x<inprog.length; x++ ){
        if ( inprog[x].query && inprog[x].query.$where ){
            mine.push( inprog[x] )
        }
    }
+
    return mine;
}
-state = 0; // 0 = not found, 1 = killed,
-killTime = null;
+var state = 0; // 0 = not found, 1 = killed, 2 = ended
+var killTime = null;
+var i = 0;
-for ( i=0; i<( 100* 1000 ); i++ ){
+assert.soon( function(){
+
+    // Get all the current operations
    mine = getMine( state == 0 && i > 20 );
-    if ( state == 0 ){
-        if ( mine.length == 0 ){
-            sleep(1);
-            continue;
-        }
+    i++;
+
+    // Wait for the queries to start
+    if ( state == 0 && mine.length > 0 ){
+        // Queries started
        state = 1;
+        // Kill all $where
        mine.forEach( function(z){ printjson( db.getSisterDB( "admin" ).killOp( z.opid ) ); } )
        killTime = new Date()
    }
-    else if ( state == 1 ){
-        if ( mine.length == 0 ){
-            state = 2;
-            break;
-        }
-        sleep(1)
-        continue;
+    // Wait for killed queries to end
+    else if ( state == 1 && mine.length == 0 ){
+        // Queries ended
+        state = 2;
+        return true;
    }
-}
+
+}, "Couldn't kill the $where operations.", 2 * 60 * 1000 )
print( "after loop: " + Date() );
assert( killTime , "timed out waiting to kill last mine:" + tojson(mine) )
diff --git a/jstests/sharding/group_slaveok.js b/jstests/sharding/group_slaveok.js
new file mode 100644
index 00000000000..3b7cec4910f
--- /dev/null
+++ b/jstests/sharding/group_slaveok.js
@@ -0,0 +1,68 @@
+// Tests group using slaveOk
+
+var st = new ShardingTest( testName = "groupSlaveOk",
+                           numShards = 1,
+                           verboseLevel = 0,
+                           numMongos = 1,
+                           { rs : true,
+                             rs0 : { nodes : 2 }
+                           })
+
+var rst = st._rs[0].test
+
+// Insert data into replica set
+var conn = new Mongo( st.s.host )
+conn.setLogLevel( 3 )
+
+var coll = conn.getCollection( "test.groupSlaveOk" )
+coll.drop()
+
+for( var i = 0; i < 300; i++ ){
+    coll.insert( { i : i % 10 } )
+}
+
+// Make sure the writes get through, otherwise we can continue to error these one-at-a-time
+coll.getDB().getLastError()
+
+st.printShardingStatus()
+
+// Wait for client to update itself and replication to finish
+rst.awaitReplication()
+
+var primary = rst.getPrimary()
+var sec = rst.getSecondary()
+
+// Data now inserted... 
stop the master, since only two in set, other will still be secondary +rst.stop( rst.getMaster(), undefined, true ) +printjson( rst.status() ) + +// Wait for the mongos to recognize the slave +ReplSetTest.awaitRSClientHosts( conn, sec, { ok : true, secondary : true } ) + +// Need to check slaveOk=true first, since slaveOk=false will destroy conn in pool when +// master is down +conn.setSlaveOk() + +// Should not throw exception, since slaveOk'd +assert.eq( 10, coll.group({ key : { i : true } , + reduce : function( obj, ctx ){ ctx.count += 1 } , + initial : { count : 0 } }).length ) + +try { + + conn.setSlaveOk( false ) + coll.group({ key : { i : true } , + reduce : function( obj, ctx ){ ctx.count += 1 } , + initial : { count : 0 } }) + + print( "Should not reach here!" ) + printjson( coll.getDB().getLastError() ) + assert( false ) + +} +catch( e ){ + print( "Non-slaveOk'd connection failed." ) +} + +// Finish +st.stop() diff --git a/jstests/sharding/parallel.js b/jstests/sharding/parallel.js new file mode 100644 index 00000000000..d35459c3730 --- /dev/null +++ b/jstests/sharding/parallel.js @@ -0,0 +1,38 @@ +numShards = 3 +s = new ShardingTest( "parallel" , numShards , 2 , 2 , { sync : true } ); + +s.adminCommand( { enablesharding : "test" } ); +s.adminCommand( { shardcollection : "test.foo" , key : { _id : 1 } } ); + +db = s.getDB( "test" ); + +N = 10000; + +for ( i=0; i<N; i+=(N/12) ) { + s.adminCommand( { split : "test.foo" , middle : { _id : i } } ) + sh.moveChunk( "test.foo", { _id : i } , "shard000" + Math.floor( Math.random() * numShards ) ) +} + + +for ( i=0; i<N; i++ ) + db.foo.insert( { _id : i } ) +db.getLastError(); + + +doCommand = function( dbname , cmd ) { + x = benchRun( { ops : [ { op : "findOne" , ns : dbname + ".$cmd" , query : cmd } ] , + host : db.getMongo().host , parallel : 2 , seconds : 2 } ) + printjson(x) + x = benchRun( { ops : [ { op : "findOne" , ns : dbname + ".$cmd" , query : cmd } ] , + host : s._mongos[1].host , parallel : 2 , seconds : 2 } ) + printjson(x) +} + +doCommand( "test" , { dbstats : 1 } ) +doCommand( "config" , { dbstats : 1 } ) + +x = s.getDB( "config" ).stats() +assert( x.ok , tojson(x) ) +printjson(x) + +s.stop() diff --git a/jstests/sharding/shard3.js b/jstests/sharding/shard3.js index 5f2c0b5148f..e27316e17b6 100644 --- a/jstests/sharding/shard3.js +++ b/jstests/sharding/shard3.js @@ -62,6 +62,7 @@ function doCounts( name , total , onlyItCounts ){ var total = doCounts( "before wrong save" ) secondary.save( { num : -3 } ); +printjson( secondary.getDB().getLastError() ) doCounts( "after wrong save" , total , true ) e = a.find().explain(); assert.eq( 3 , e.n , "ex1" ) diff --git a/jstests/sharding/sync6.js b/jstests/sharding/sync6.js index 0543837a822..233534bf1aa 100644 --- a/jstests/sharding/sync6.js +++ b/jstests/sharding/sync6.js @@ -17,10 +17,13 @@ commandConn.getDB( "admin" ).runCommand( { setParameter : 1, logLevel : 1 } ) // Have lots of threads, so use larger i // Can't test too many, we get socket exceptions... possibly due to the // javascript console. -for ( var i = 8; i < 12; i++ ) { +for ( var i = 8; i < 9; i++ ) { - // Our force time is 2 seconds - var takeoverMS = 2000; + // Our force time is 4 seconds + // Slower machines can't keep up the LockPinger rate, which can lead to lock failures + // since our locks are only valid if the LockPinger pings faster than the force time. 
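+    // (for example: with the 4 second force time below, a single ping delayed by more
+    // than ~4s can let another contender decide the holder is dead and force the lock)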
+    // Actual lock timeout is 15 minutes, so a few seconds is extremely aggressive
+    var takeoverMS = 4000;

    // Generate valid sleep and skew for this timeout
    var threadSleepWithLock = takeoverMS / 2;
diff --git a/jstests/slowNightly/command_line_parsing.js b/jstests/slowNightly/command_line_parsing.js
index 38c7324ddb9..ba7b1369627
--- a/jstests/slowNightly/command_line_parsing.js
+++ b/jstests/slowNightly/command_line_parsing.js
@@ -7,3 +7,15 @@ var baseName = "jstests_slowNightly_command_line_parsing";
var m = startMongod( "--port", port, "--dbpath", "/data/db/" + baseName, "--notablescan" );
m.getDB( baseName ).getCollection( baseName ).save( {a:1} );
assert.throws( function() { m.getDB( baseName ).getCollection( baseName ).find( {a:1} ).toArray() } );
+
+// test config file
+var m2 = startMongod( "--port", port+2, "--dbpath", "/data/db/" + baseName +"2", "--config", "jstests/libs/testconfig");
+var m2result = {
+    "parsed" : {
+        "config" : "jstests/libs/testconfig",
+        "dbpath" : "/data/db/jstests_slowNightly_command_line_parsing2",
+        "fastsync" : "true",
+        "port" : 31002
+    }
+};
+assert( friendlyEqual(m2result.parsed, m2.getDB("admin").runCommand( "getCmdLineOpts" ).parsed) );
diff --git a/jstests/slowNightly/dur_big_atomic_update.js b/jstests/slowNightly/dur_big_atomic_update.js
index ffb0d838cc2..800b4b831fb
--- a/jstests/slowNightly/dur_big_atomic_update.js
+++ b/jstests/slowNightly/dur_big_atomic_update.js
@@ -23,6 +23,23 @@ err = d.getLastErrorObj();
assert(err.err == null);
assert(err.n == 1024);
+d.dropDatabase();
+
+for (var i=0; i<1024; i++){
+    d.foo.insert({_id:i});
+}
+
+// Do it again but in a db.eval
+d.eval(
+    function(host, big_string) {
+        new Mongo(host).getDB("test").foo.update({}, {$set: {big_string: big_string}}, false, /*multi*/true)
+    }, conn.host, big_string); // Can't pass in connection or DB objects
+
+err = d.getLastErrorObj();
+
+assert(err.err == null);
+assert(err.n == 1024);
+
// free up space
d.dropDatabase();
diff --git a/jstests/slowNightly/replReads.js b/jstests/slowNightly/replReads.js
new file mode 100644
index 00000000000..4fe91305738
--- /dev/null
+++ b/jstests/slowNightly/replReads.js
@@ -0,0 +1,108 @@
+// Test that doing slaveOk reads from secondaries hits all the secondaries evenly
+function testReadLoadBalancing(numReplicas) {
+
+    s = new ShardingTest( "replReads" , 1 /* numShards */, 0 /* verboseLevel */, 1 /* numMongos */, { rs : true , numReplicas : numReplicas, chunksize : 1 } )
+
+    s.adminCommand({enablesharding : "test"})
+    s.config.settings.find().forEach(printjson)
+
+    s.adminCommand({shardcollection : "test.foo", key : {_id : 1}})
+
+    s.getDB("test").foo.insert({a : 123})
+
+    primary = s._rs[0].test.liveNodes.master
+    secondaries = s._rs[0].test.liveNodes.slaves
+
+    function rsStats() {
+        return s.getDB( "admin" ).runCommand( "connPoolStats" )["replicaSets"]["replReads-rs0"];
+    }
+
+    assert.eq( numReplicas , rsStats().hosts.length );
+
+    function isMasterOrSecondary( info ){
+        if ( ! info.ok )
+            return false;
+        if ( info.ismaster )
+            return true;
+        return info.secondary && ! info.hidden;
+    }
+
+    assert.soon(
+        function() {
+            var x = rsStats().hosts;
+            printjson(x)
+            for ( var i=0; i<x.length; i++ )
+                if ( ! isMasterOrSecondary( x[i] ) )
+                    return false;
+            return true;
+        }
+    );
+
+    for (var i = 0; i < secondaries.length; i++) {
+        assert.soon( function(){ return secondaries[i].getDB("test").foo.count() > 0; } )
+        secondaries[i].getDB('test').setProfilingLevel(2)
+    }
+
+    for (var i = 0; i < secondaries.length * 10; i++) {
+        conn = new Mongo(s._mongos[0].host)
+        conn.setSlaveOk()
+        conn.getDB('test').foo.findOne()
+    }
+
+    for (var i = 0; i < secondaries.length; i++) {
+        var profileCollection = secondaries[i].getDB('test').system.profile;
+        assert.eq(10, profileCollection.find().count(), "Wrong number of read queries sent to secondary " + i + " " + tojson( profileCollection.find().toArray() ))
+    }
+
+    db = primary.getDB( "test" );
+
+    printjson(rs.status());
+    c = rs.conf();
+    print( "config before: " + tojson(c) );
+    for ( i=0; i<c.members.length; i++ ) {
+        if ( c.members[i].host == db.runCommand( "ismaster" ).primary )
+            continue;
+        c.members[i].hidden = true;
+        c.members[i].priority = 0;
+        break;
+    }
+    rs.reconfig( c );
+    print( "config after: " + tojson( rs.conf() ) );
+
+    assert.soon(
+        function() {
+            var x = rsStats();
+            printjson(x);
+            var numOk = 0;
+            for ( var i=0; i<x.hosts.length; i++ )
+                if ( x.hosts[i].hidden )
+                    return true;
+            return false;
+        } , "one slave not ok" , 180000 , 5000
+    );
+
+    for (var i = 0; i < secondaries.length * 10; i++) {
+        conn = new Mongo(s._mongos[0].host)
+        conn.setSlaveOk()
+        conn.getDB('test').foo.findOne()
+    }
+
+    var counts = []
+    for (var i = 0; i < secondaries.length; i++) {
+        var profileCollection = secondaries[i].getDB('test').system.profile;
+        counts.push( profileCollection.find().count() );
+    }
+
+    counts = counts.sort(function(a, b){ return a - b }); // sort numerically - JS's default sort() compares as strings
+    assert.eq( 20 , counts[1] - counts[0] , "counts wrong: " + tojson( counts ) );
+
+    s.stop()
+}
+
+//for (var i = 1; i < 10; i++) {
+//    testReadLoadBalancing(i)
+//}
+
+// Is there a way that this can be run multiple times with different values? 
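+// One possible shape (hypothetical sketch, not part of this change): have the test
+// harness pass the replica count in, e.g.
+//     var n = ( typeof TestData != "undefined" && TestData.numReplicas ) || 3;
+//     testReadLoadBalancing( n );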
+testReadLoadBalancing(3) diff --git a/jstests/slowNightly/sharding_migrateBigObject.js b/jstests/slowNightly/sharding_migrateBigObject.js index d8ff740d81d..5ad9ed12a18 100644 --- a/jstests/slowNightly/sharding_migrateBigObject.js +++ b/jstests/slowNightly/sharding_migrateBigObject.js @@ -10,10 +10,8 @@ var admin = mongos.getDB("admin") admin.runCommand({ addshard : "localhost:30001" }) admin.runCommand({ addshard : "localhost:30002" }) - - -var coll = mongos.getDB("test").getCollection("stuff") -coll.drop() +db = mongos.getDB("test"); +var coll = db.getCollection("stuff") var data = "x" var nsq = 16 @@ -28,6 +26,9 @@ for( var i = 0; i < 40; i++ ) { if(i != 0 && i % 10 == 0) printjson( coll.stats() ) coll.save({ data : dataObj }) } +db.getLastError(); + +assert.eq( 40 , coll.count() , "prep1" ); printjson( coll.stats() ) @@ -37,6 +38,8 @@ admin.printShardingStatus() admin.runCommand({ shardcollection : "" + coll, key : { _id : 1 } }) +assert.lt( 5 , mongos.getDB( "config" ).chunks.find( { ns : "test.stuff" } ).count() , "not enough chunks" ); + assert.soon( function(){ res = mongos.getDB( "config" ).chunks.group( { cond : { ns : "test.stuff" } , @@ -45,7 +48,7 @@ assert.soon( initial : { nChunks : 0 } } ); printjson( res ); - return res.length > 1 && Math.abs( res[0].nChunks - res[1].nChunks ) <= 1; + return res.length > 1 && Math.abs( res[0].nChunks - res[1].nChunks ) <= 3; } , "never migrated" , 180000 , 1000 ); diff --git a/jstests/slowNightly/sharding_passthrough.js b/jstests/slowNightly/sharding_passthrough.js index 65d22059381..d81df685bc5 100644 --- a/jstests/slowNightly/sharding_passthrough.js +++ b/jstests/slowNightly/sharding_passthrough.js @@ -62,17 +62,17 @@ files.forEach( * clean (apitest_dbcollection) * logout and getnonce */ - if (/[\/\\](error3|capped.*|splitvector|apitest_db|cursor6|copydb-auth|profile1|dbhash|median|apitest_dbcollection|evalb|evald|eval_nolock|auth1|auth2|dropdb_race|unix_socket\d*)\.js$/.test(x.name)) { + if (/[\/\\](error3|capped.*|splitvector|apitest_db|cursor6|copydb-auth|profile\d*|dbhash|median|apitest_dbcollection|evalb|evald|eval_nolock|auth1|auth2|dropdb_race|unix_socket\d*)\.js$/.test(x.name)) { print(" !!!!!!!!!!!!!!! skipping test that has failed under sharding but might not anymore " + x.name) return; } // These are bugs (some might be fixed now): - if (/[\/\\](apply_ops1|count5|cursor8|or4|shellkillop|update4|profile1)\.js$/.test(x.name)) { + if (/[\/\\](apply_ops1|count5|cursor8|or4|shellkillop|update4|profile\d*)\.js$/.test(x.name)) { print(" !!!!!!!!!!!!!!! 
skipping test that has failed under sharding but might not anymore " + x.name) return; } // These aren't supposed to get run under sharding: - if (/[\/\\](dbadmin|error1|fsync|fsync2|geo.*|indexh|remove5|update4|notablescan|compact.*|check_shard_index|mr_replaceIntoDB)\.js$/.test(x.name)) { + if (/[\/\\](dbadmin|error1|fsync|fsync2|geo.*|indexh|remove5|update4|notablescan|compact.*|check_shard_index|bench_test.*|mr_replaceIntoDB)\.js$/.test(x.name)) { print(" >>>>>>>>>>>>>>> skipping test that would fail under sharding " + x.name) return; } diff --git a/jstests/slowNightly/sharding_rs1.js b/jstests/slowNightly/sharding_rs1.js index 01358e207de..f73e690d42e 100644 --- a/jstests/slowNightly/sharding_rs1.js +++ b/jstests/slowNightly/sharding_rs1.js @@ -59,6 +59,12 @@ assert.soon( function(){ s.config.settings.update( { _id: "balancer" }, { $set : { stopped: true } } , true ); +sleep( 1000 ); + +while ( sh.isBalancerRunning() ){ + sleep( 1000 ); +} + for ( i=0; i<s._rs.length; i++ ){ r = s._rs[i]; r.test.awaitReplication(); diff --git a/jstests/slowNightly/sharding_rs_arb1.js b/jstests/slowNightly/sharding_rs_arb1.js new file mode 100644 index 00000000000..be4c4dcd136 --- /dev/null +++ b/jstests/slowNightly/sharding_rs_arb1.js @@ -0,0 +1,40 @@ +x = 5 +name = "sharding_rs_arb1" +replTest = new ReplSetTest( { name : name , nodes : 3 , startPort : 31000 } ); +nodes = replTest.startSet(); +var port = replTest.ports; +replTest.initiate({_id : name, members : + [ + {_id:0, host : getHostName()+":"+port[0]}, + {_id:1, host : getHostName()+":"+port[1]}, + {_id:2, host : getHostName()+":"+port[2], arbiterOnly : true}, + ], + }); + +replTest.awaitReplication(); + +master = replTest.getMaster(); +db = master.getDB( "test" ); +printjson( rs.status() ); + +var config = startMongodEmpty("--configsvr", "--port", 29999, "--dbpath", "/data/db/" + name + "_config" ); + +var mongos = startMongos("--port", 30000, "--configdb", getHostName() + ":29999") +var admin = mongos.getDB("admin") +var url = name + "/"; +for ( i=0; i<port.length; i++ ) { + if ( i > 0 ) + url += ","; + url += getHostName() + ":" + port[i]; +} +print( url ) +res = admin.runCommand( { addshard : url } ) +printjson( res ) +assert( res.ok , tojson(res) ) + + + +stopMongod( 30000 ) +stopMongod( 29999 ) +replTest.stopSet(); + diff --git a/jstests/slowNightly/sync6_slow.js b/jstests/slowNightly/sync6_slow.js new file mode 100644 index 00000000000..63d6123833c --- /dev/null +++ b/jstests/slowNightly/sync6_slow.js @@ -0,0 +1,82 @@ +// More complete version of sharding/sync6.js +// Test that distributed lock forcing does not result in inconsistencies, using a +// fast timeout. + +// Note that this test will always have random factors, since we can't control the +// thread scheduling. + +test = new SyncCCTest( "sync6", { logpath : "/dev/null" } ) + +// Startup another process to handle our commands to the cluster, mostly so it's +// easier to read. +var commandConn = startMongodTest( 30000 + 4, "syncCommander", false, {})//{ logpath : "/dev/null" } )//{verbose : ""} ) +// { logpath : "/data/db/syncCommander/mongod.log" } ); + +// Up the log level for this test +commandConn.getDB( "admin" ).runCommand( { setParameter : 1, logLevel : 0 } ) + +// Have lots of threads, so use larger i +// Can't test too many, we get socket exceptions... possibly due to the +// javascript console. +// TODO: Figure out our max bounds here - use less threads now to avoid pinger starvation issues. 
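+// (hence the narrow loops below: repeat one conservative configuration four times via t,
+// with i pinned to a single value, instead of sweeping i over many thread counts)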
+for ( var t = 0; t < 4; t++ ) { +for ( var i = 4; i < 5; i++ ) { + + // Our force time is 6 seconds - slightly diff from sync6 to ensure exact time not important + var takeoverMS = 6000; + + // Generate valid sleep and skew for this timeout + var threadSleepWithLock = takeoverMS / 2; + var configServerTimeSkew = [ 0, 0, 0 ] + for ( var h = 0; h < 3; h++ ) { + // Skew by 1/30th the takeover time either way, at max + configServerTimeSkew[h] = ( i + h ) % Math.floor( takeoverMS / 60 ) + // Make skew pos or neg + configServerTimeSkew[h] *= ( ( i + h ) % 2 ) ? -1 : 1; + } + + // Build command + command = { _testDistLockWithSkew : 1 } + + // Basic test parameters + command["lockName"] = "TimeSkewFailNewTest_lock_" + i; + command["host"] = test.url + command["seed"] = i + command["numThreads"] = ( i % 50 ) + 1 + + // Critical values so we're sure of correct operation + command["takeoverMS"] = takeoverMS + command["wait"] = 4 * takeoverMS // so we must force the lock + command["skewHosts"] = configServerTimeSkew + command["threadWait"] = threadSleepWithLock + + // Less critical test params + + // 1/3 of threads will not release the lock + command["hangThreads"] = 3 + // Amount of time to wait before trying lock again + command["threadSleep"] = 1;// ( ( i + 1 ) * 100 ) % (takeoverMS / 4) + // Amount of total clock skew possible between locking threads (processes) + // This can be large now. + command["skewRange"] = ( command["takeoverMS"] * 3 ) * 60 * 1000 + + // Double-check our sleep, host skew, and takeoverMS values again + + // At maximum, our threads must sleep only half the lock timeout time. + assert( command["threadWait"] <= command["takeoverMS"] / 2 ) + for ( var h = 0; h < command["skewHosts"].length; h++ ) { + // At maximum, our config server time skew needs to be less than 1/30th + // the total time skew (1/60th either way). + assert( Math.abs( command["skewHosts"][h] ) <= ( command["takeoverMS"] / 60 ) ) + } + + result = commandConn.getDB( "admin" ).runCommand( command ) + printjson( result ) + printjson( command ) + assert( result.ok, "Skewed threads did not increment correctly." ); + +} +} + +stopMongoProgram( 30004 ) +test.stop(); diff --git a/jstests/slowWeekly/geo_full.js b/jstests/slowWeekly/geo_full.js index ab8715be6a6..9eb1b7a54bf 100644 --- a/jstests/slowWeekly/geo_full.js +++ b/jstests/slowWeekly/geo_full.js @@ -25,17 +25,25 @@ var randEnvironment = function(){ return { max : 180, min : -180, bits : Math.floor( Random.rand() * 32 ) + 1, - earth : true } + earth : true, + bucketSize : 360 / ( 4 * 1024 * 1024 * 1024 ) } } var scales = [ 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000 ] var scale = scales[ Math.floor( Random.rand() * scales.length ) ] var offset = Random.rand() * scale - return { max : Random.rand() * scale + offset, - min : - Random.rand() * scale + offset, - bits : Math.floor( Random.rand() * 32 ) + 1, - earth : false } + var max = Random.rand() * scale + offset + var min = - Random.rand() * scale + offset + var bits = Math.floor( Random.rand() * 32 ) + 1 + var range = max - min + var bucketSize = range / ( 4 * 1024 * 1024 * 1024 ) + + return { max : max, + min : min, + bits : bits, + earth : false, + bucketSize : bucketSize } } @@ -271,6 +279,7 @@ var randYesQuery = function(){ var locArray = function( loc ){ if( loc.x ) return [ loc.x, loc.y ] + if( ! 
loc.length ) return [ loc[0], loc[1] ] return loc } @@ -287,32 +296,54 @@ var locsArray = function( locs ){ } } -var numTests = 30 +var minBoxSize = function( env, box ){ + return env.bucketSize * Math.pow( 2, minBucketScale( env, box ) ) +} + +var minBucketScale = function( env, box ){ + + if( box.length && box[0].length ) + box = [ box[0][0] - box[1][0], box[0][1] - box[1][1] ] + + if( box.length ) + box = Math.max( box[0], box[1] ) + + print( box ) + print( env.bucketSize ) + + return Math.ceil( Math.log( box / env.bucketSize ) / Math.log( 2 ) ) + +} + +// TODO: Add spherical $uniqueDocs tests +var numTests = 100 // Our seed will change every time this is run, but // each individual test will be reproducible given // that seed and test number -var seed = Math.floor( Random.rand() * ( 10 ^ 30) ) +var seed = new Date().getTime() for ( var test = 0; test < numTests; test++ ) { Random.srand( seed + test ); - + //Random.srand( 42240 ) + //Random.srand( 7344 ) var t = db.testAllGeo t.drop() print( "Generating test environment #" + test ) var env = randEnvironment() + //env.bits = 11 var query = randQuery( env ) var data = randDataType() - + //data.numDocs = 100; data.maxLocs = 3; var results = {} var totalPoints = 0 print( "Calculating target results for " + data.numDocs + " docs with max " + data.maxLocs + " locs " ) // Index after a random number of docs added var indexIt = Math.floor( Random.rand() * data.numDocs ) - + for ( var i = 0; i < data.numDocs; i++ ) { if( indexIt == i ){ @@ -346,7 +377,7 @@ for ( var test = 0; test < numTests; test++ ) { randQueryAdditions( doc, indResults ) //printjson( doc ) - + doc._id = i t.insert( doc ) } @@ -362,27 +393,33 @@ for ( var test = 0; test < numTests; test++ ) { // exact print( "Exact query..." ) assert.eq( results.exact.docsIn, t.find( { "locs.loc" : randLocType( query.exact ), "exact.docIn" : randYesQuery() } ).count() ) - + // $center print( "Center query..." ) - assert.eq( results.center.docsIn, t.find( { "locs.loc" : { $within : { $center : [ query.center, query.radius ] } }, "center.docIn" : randYesQuery() } ).count() ) + print( "Min box : " + minBoxSize( env, query.radius ) ) + assert.eq( results.center.docsIn, t.find( { "locs.loc" : { $within : { $center : [ query.center, query.radius ], $uniqueDocs : 1 } }, "center.docIn" : randYesQuery() } ).count() ) + assert.eq( results.center.locsIn, t.find( { "locs.loc" : { $within : { $center : [ query.center, query.radius ], $uniqueDocs : false } }, "center.docIn" : randYesQuery() } ).count() ) if( query.sphereRadius >= 0 ){ print( "Center sphere query...") // $centerSphere assert.eq( results.sphere.docsIn, t.find( { "locs.loc" : { $within : { $centerSphere : [ query.sphereCenter, query.sphereRadius ] } }, "sphere.docIn" : randYesQuery() } ).count() ) + assert.eq( results.sphere.locsIn, t.find( { "locs.loc" : { $within : { $centerSphere : [ query.sphereCenter, query.sphereRadius ], $uniqueDocs : 0.0 } }, "sphere.docIn" : randYesQuery() } ).count() ) } // $box print( "Box query..." ) - assert.eq( results.box.docsIn, t.find( { "locs.loc" : { $within : { $box : query.box } }, "box.docIn" : randYesQuery() } ).count() ) + assert.eq( results.box.docsIn, t.find( { "locs.loc" : { $within : { $box : query.box, $uniqueDocs : true } }, "box.docIn" : randYesQuery() } ).count() ) + assert.eq( results.box.locsIn, t.find( { "locs.loc" : { $within : { $box : query.box, $uniqueDocs : false } }, "box.docIn" : randYesQuery() } ).count() ) // $polygon print( "Polygon query..." 
) assert.eq( results.poly.docsIn, t.find( { "locs.loc" : { $within : { $polygon : query.boxPoly } }, "poly.docIn" : randYesQuery() } ).count() ) + assert.eq( results.poly.locsIn, t.find( { "locs.loc" : { $within : { $polygon : query.boxPoly, $uniqueDocs : 0 } }, "poly.docIn" : randYesQuery() } ).count() ) // $near print( "Near query..." ) assert.eq( results.center.locsIn > 100 ? 100 : results.center.locsIn, t.find( { "locs.loc" : { $near : query.center, $maxDistance : query.radius } } ).count( true ) ) + if( query.sphereRadius >= 0 ){ print( "Near sphere query...") // $centerSphere @@ -391,27 +428,39 @@ for ( var test = 0; test < numTests; test++ ) { // geoNear + // results limited by size of objects if( data.maxLocs < 100 ){ - + + // GeoNear query + print( "GeoNear query..." ) + assert.eq( results.center.locsIn > 100 ? 100 : results.center.locsIn, t.getDB().runCommand({ geoNear : "testAllGeo", near : query.center, maxDistance : query.radius }).results.length ) + // GeoNear query + assert.eq( results.center.docsIn > 100 ? 100 : results.center.docsIn, t.getDB().runCommand({ geoNear : "testAllGeo", near : query.center, maxDistance : query.radius, uniqueDocs : true }).results.length ) + + var num = 2 * results.center.locsIn; if( num > 200 ) num = 200; var output = db.runCommand( { geoNear : "testAllGeo", near : query.center, - maxDistance : query.radius , + maxDistance : query.radius , + includeLocs : true, num : num } ).results - + assert.eq( Math.min( 200, results.center.locsIn ), output.length ) var distance = 0; for ( var i = 0; i < output.length; i++ ) { var retDistance = output[i].dis - + var retLoc = locArray( output[i].loc ) + // print( "Dist from : " + results[i].loc + " to " + startPoint + " is " // + retDistance + " vs " + radius ) var arrLocs = locsArray( output[i].obj.locs ) + + assert.contains( retLoc, arrLocs ) // printjson( arrLocs ) @@ -422,6 +471,7 @@ for ( var test = 0; test < numTests; test++ ) { } assert( distInObj ) + assert.between( retDistance - 0.0001 , Geo.distance( locArray( query.center ), retLoc ), retDistance + 0.0001 ) assert.lte( retDistance, query.radius ) assert.gte( retDistance, distance ) distance = retDistance diff --git a/jstests/slowWeekly/geo_mnypts_plus_fields.js b/jstests/slowWeekly/geo_mnypts_plus_fields.js new file mode 100644 index 00000000000..f67e49ba930 --- /dev/null +++ b/jstests/slowWeekly/geo_mnypts_plus_fields.js @@ -0,0 +1,98 @@ +// Test sanity of geo queries with a lot of points + +var maxFields = 2; + +for( var fields = 1; fields < maxFields; fields++ ){ + + var coll = db.testMnyPts + coll.drop() + + var totalPts = 500 * 1000 + + // Add points in a 100x100 grid + for( var i = 0; i < totalPts; i++ ){ + var ii = i % 10000 + + var doc = { loc : [ ii % 100, Math.floor( ii / 100 ) ] } + + // Add fields with different kinds of data + for( var j = 0; j < fields; j++ ){ + + var field = null + + if( j % 3 == 0 ){ + // Make half the points not searchable + field = "abcdefg" + ( i % 2 == 0 ? 
"h" : "" ) + } + else if( j % 3 == 1 ){ + field = new Date() + } + else{ + field = true + } + + doc[ "field" + j ] = field + } + + coll.insert( doc ) + } + + // Create the query for the additional fields + queryFields = {} + for( var j = 0; j < fields; j++ ){ + + var field = null + + if( j % 3 == 0 ){ + field = "abcdefg" + } + else if( j % 3 == 1 ){ + field = { $lte : new Date() } + } + else{ + field = true + } + + queryFields[ "field" + j ] = field + } + + coll.ensureIndex({ loc : "2d" }) + + // Check that quarter of points in each quadrant + for( var i = 0; i < 4; i++ ){ + var x = i % 2 + var y = Math.floor( i / 2 ) + + var box = [[0, 0], [49, 49]] + box[0][0] += ( x == 1 ? 50 : 0 ) + box[1][0] += ( x == 1 ? 50 : 0 ) + box[0][1] += ( y == 1 ? 50 : 0 ) + box[1][1] += ( y == 1 ? 50 : 0 ) + + // Now only half of each result comes back + assert.eq( totalPts / ( 4 * 2 ), coll.find(Object.extend( { loc : { $within : { $box : box } } }, queryFields ) ).count() ) + assert.eq( totalPts / ( 4 * 2 ), coll.find(Object.extend( { loc : { $within : { $box : box } } }, queryFields ) ).itcount() ) + + } + + // Check that half of points in each half + for( var i = 0; i < 2; i++ ){ + + var box = [[0, 0], [49, 99]] + box[0][0] += ( i == 1 ? 50 : 0 ) + box[1][0] += ( i == 1 ? 50 : 0 ) + + assert.eq( totalPts / ( 2 * 2 ), coll.find(Object.extend( { loc : { $within : { $box : box } } }, queryFields ) ).count() ) + assert.eq( totalPts / ( 2 * 2 ), coll.find(Object.extend( { loc : { $within : { $box : box } } }, queryFields ) ).itcount() ) + + } + + // Check that all but corner set of points in radius + var circle = [[0, 0], (100 - 1) * Math.sqrt( 2 ) - 0.25 ] + + // All [99,x] pts are field0 : "abcdefg" + assert.eq( totalPts / 2 - totalPts / ( 100 * 100 ), coll.find(Object.extend( { loc : { $within : { $center : circle } } }, queryFields ) ).count() ) + assert.eq( totalPts / 2 - totalPts / ( 100 * 100 ), coll.find(Object.extend( { loc : { $within : { $center : circle } } }, queryFields ) ).itcount() ) + +} + diff --git a/jstests/slowWeekly/update_yield1.js b/jstests/slowWeekly/update_yield1.js index 7e95855adb1..5f7183064f3 100644 --- a/jstests/slowWeekly/update_yield1.js +++ b/jstests/slowWeekly/update_yield1.js @@ -54,7 +54,7 @@ while ( ( (new Date()).getTime() - start ) < ( time * 2 ) ){ assert.eq( 1 , x.inprog.length , "nothing in prog" ); } - assert.gt( 2000 , me ); + assert.gt( time / 3 , me ); } join(); diff --git a/jstests/sorta.js b/jstests/sorta.js index f5942d4bddd..7c82778a186 100644 --- a/jstests/sorta.js +++ b/jstests/sorta.js @@ -5,16 +5,17 @@ t.drop(); // Enable _allow_dot to try and bypass v8 field name checking. 
t.insert( {_id:0,a:MinKey}, true ); -t.save( {_id:1,a:null} ); -t.save( {_id:2,a:[]} ); +t.save( {_id:3,a:null} ); +t.save( {_id:1,a:[]} ); t.save( {_id:7,a:[2]} ); -t.save( {_id:3} ); -t.save( {_id:4,a:null} ); -t.save( {_id:5,a:[]} ); +t.save( {_id:4} ); +t.save( {_id:5,a:null} ); +t.save( {_id:2,a:[]} ); t.save( {_id:6,a:1} ); t.insert( {_id:8,a:MaxKey}, true ); function sorted( arr ) { + assert.eq( 9, arr.length ); for( i = 1; i < arr.length; ++i ) { assert.lte( arr[ i-1 ]._id, arr[ i ]._id ); } diff --git a/jstests/tool/csvexport1.js b/jstests/tool/csvexport1.js new file mode 100644 index 00000000000..eb4e6e38431 --- /dev/null +++ b/jstests/tool/csvexport1.js @@ -0,0 +1,45 @@ +// csvexport1.js + +t = new ToolTest( "csvexport1" ) + +c = t.startDB( "foo" ); + +assert.eq( 0 , c.count() , "setup1" ); + +objId = ObjectId() + +c.insert({ a : new NumberInt(1) , b : objId , c: [1, 2, 3], d : {a : "hello", b : "world"} , e: '-'}) +c.insert({ a : -2.0, c : MinKey, d : "Then he said, \"Hello World!\"", e : new NumberLong(3)}) +c.insert({ a : new BinData(0, "1234"), b : ISODate("2009-08-27"), c : new Timestamp(1234, 9876), d : /foo*\"bar\"/i, e : function foo() { print("Hello World!"); }}) + +assert.eq( 3 , c.count() , "setup2" ); + +t.runTool( "export" , "--out" , t.extFile , "-d" , t.baseName , "-c" , "foo" , "--csv", "-f", "a,b,c,d,e") + + +c.drop() + +assert.eq( 0 , c.count() , "after drop" ) + +t.runTool("import", "--file", t.extFile, "-d", t.baseName, "-c", "foo", "--type", "csv", "--headerline"); + +assert.soon ( 3 + " == c.count()", "after import"); + +// Note: Exporting and Importing to/from CSV is not designed to be round-trippable +expected = [] +expected.push({ a : 1, b : "ObjectID(" + objId.toString() + ")", c : "[ 1, 2, 3 ]", d : "{ \"a\" : \"hello\", \"b\" : \"world\" }", e : "-"}) +expected.push({ a : -2.0, b : "", c : "$MinKey", d : "Then he said, \"Hello World!\"", e : 3}) +expected.push({ a : "D76DF8", b : "2009-08-27T00:00:00Z", c : "{ \"t\" : 1000 , \"i\" : 9876 }", d : "/foo*\\\"bar\\\"/i", e : tojson(function foo() { print("Hello World!"); })}) + +actual = [] +actual.push(c.find({a : 1}).toArray()[0]); +actual.push(c.find({a : -2.0}).toArray()[0]); +actual.push(c.find({a : "D76DF8"}).toArray()[0]); + +for (i = 0; i < expected.length; i++) { + delete actual[i]._id + assert.eq( expected[i], actual[i], "CSV export " + i); +} + + +t.stop()
\ No newline at end of file
diff --git a/jstests/tool/csvexport2.js b/jstests/tool/csvexport2.js
new file mode 100644
index 00000000000..3e0dd2c6829
--- /dev/null
+++ b/jstests/tool/csvexport2.js
@@ -0,0 +1,31 @@
+// csvexport2.js
+
+t = new ToolTest( "csvexport2" )
+
+c = t.startDB( "foo" );
+
+// This test is designed to test exporting of a CodeWithScope object.
+// However, due to SERVER-3391, it is not possible to create a CodeWithScope object in the mongo shell,
+// so this test does not work. Once SERVER-3391 is resolved, this test should be uncommented.
+
+//assert.eq( 0 , c.count() , "setup1" );
+
+//c.insert({ a : 1 , b : Code("print(\"Hello \" + x);", {"x" : "World!"})})
+//assert.eq( 1 , c.count() , "setup2" );
+//t.runTool( "export" , "--out" , t.extFile , "-d" , t.baseName , "-c" , "foo" , "--csv", "-f", "a,b")
+
+
+//c.drop()
+
+//assert.eq( 0 , c.count() , "after drop" )
+//t.runTool("import", "--file", t.extFile, "-d", t.baseName, "-c", "foo", "--type", "csv", "--headerline");
+//assert.soon ( 1 + " == c.count()", "after import");
+
+//expected = { a : 1, b : "\"{ \"$code\" : print(\"Hello \" + x); , \"$scope\" : { \"x\" : \"World!\" } }"};
+//actual = c.findOne()
+
+//delete actual._id;
+//assert.eq( expected, actual );
+
+
+t.stop()
\ No newline at end of file diff --git a/jstests/tool/csvimport1.js b/jstests/tool/csvimport1.js new file mode 100644 index 00000000000..3bff1110cbe --- /dev/null +++ b/jstests/tool/csvimport1.js @@ -0,0 +1,40 @@ +// csvimport1.js + +t = new ToolTest( "csvimport1" ) + +c = t.startDB( "foo" ); + +base = [] +base.push({ a : 1, b : "this is some text.\nThis text spans multiple lines, and just for fun\ncontains a comma", "c" : "This has leading and trailing whitespace!" }) +base.push({a : 2, b : "When someone says something you \"put it in quotes\"", "c" : "I like embedded quotes/slashes\\backslashes" }) +base.push({a : 3, b : " This line contains the empty string and has leading and trailing whitespace inside the quotes! ", "c" : "" }) +base.push({a : 4, b : "", "c" : "How are empty entries handled?" }) +base.push({a : 5, b : "\"\"", c : "\"This string is in quotes and contains empty quotes (\"\")\""}) +base.push({ a : "a" , b : "b" , c : "c"}) + +assert.eq( 0 , c.count() , "setup" ); + +t.runTool( "import" , "--file" , "jstests/tool/data/csvimport1.csv" , "-d" , t.baseName , "-c" , "foo" , "--type" , "csv" , "-f" , "a,b,c" ); +assert.soon( base.length + " == c.count()" , "after import 1 " ); + +a = c.find().sort( { a : 1 } ).toArray(); +for (i = 0; i < base.length; i++ ) { + delete a[i]._id + assert.eq( tojson(base[i]), tojson(a[i]), "csv parse " + i) +} + +c.drop() +assert.eq( 0 , c.count() , "after drop" ) + +t.runTool( "import" , "--file" , "jstests/tool/data/csvimport1.csv" , "-d" , t.baseName , "-c" , "foo" , "--type" , "csv" , "--headerline" ) +assert.soon( "c.findOne()" , "no data after sleep" ); +assert.eq( base.length - 1 , c.count() , "after import 2" ); + +x = c.find().sort( { a : 1 } ).toArray(); +for (i = 0; i < base.length - 1; i++ ) { + delete x[i]._id + assert.eq( tojson(base[i]), tojson(x[i]), "csv parse with headerline " + i) +} + + +t.stop() diff --git a/jstests/tool/data/csvimport1.csv b/jstests/tool/data/csvimport1.csv new file mode 100644 index 00000000000..256d40a9184 --- /dev/null +++ b/jstests/tool/data/csvimport1.csv @@ -0,0 +1,8 @@ +a,b,c +1,"this is some text. +This text spans multiple lines, and just for fun +contains a comma", "This has leading and trailing whitespace!" +2, "When someone says something you ""put it in quotes""", I like embedded quotes/slashes\backslashes + 3 , " This line contains the empty string and has leading and trailing whitespace inside the quotes! ", "" + "4" ,, How are empty entries handled? 
+"5","""""", """This string is in quotes and contains empty quotes ("""")""" diff --git a/jstests/tool/dumprestore5.js b/jstests/tool/dumprestore5.js new file mode 100644 index 00000000000..ce28fea2027 --- /dev/null +++ b/jstests/tool/dumprestore5.js @@ -0,0 +1,36 @@ +// dumprestore5.js + +t = new ToolTest( "dumprestore5" ); + +t.startDB( "foo" ); + +db = t.db + +db.addUser('user','password') + +assert.eq(1, db.system.users.count(), "setup") +assert.eq(1, db.system.indexes.count(), "setup2") + +t.runTool( "dump" , "--out" , t.ext ); + +db.dropDatabase() + +assert.eq(0, db.system.users.count(), "didn't drop users") +assert.eq(0, db.system.indexes.count(), "didn't drop indexes") + +t.runTool("restore", "--dir", t.ext) + +assert.soon("db.system.users.findOne()", "no data after restore"); +assert.eq(1, db.system.users.find({user:'user'}).count(), "didn't restore users") +assert.eq(1, db.system.indexes.count(), "didn't restore indexes") + +db.removeUser('user') +db.addUser('user2', 'password2') + +t.runTool("restore", "--dir", t.ext, "--drop") + +assert.soon("1 == db.system.users.find({user:'user'}).count()", "didn't restore users 2") +assert.eq(0, db.system.users.find({user:'user2'}).count(), "didn't drop users") +assert.eq(1, db.system.indexes.count(), "didn't maintain indexes") + +t.stop(); diff --git a/jstests/unique2.js b/jstests/unique2.js index 42cf9fbd0ac..1c2828830f4 100644 --- a/jstests/unique2.js +++ b/jstests/unique2.js @@ -1,3 +1,11 @@ +// Test unique and dropDups index options. + +function checkNprev( np ) { + // getPrevError() is not available sharded. + if ( typeof( myShardingTest ) == 'undefined' ) { + assert.eq( np, db.getPrevError().nPrev ); + } +} t = db.jstests_unique2; @@ -21,7 +29,9 @@ t.ensureIndex({k:1}, {unique:true}); t.insert({k:3}); t.insert({k:[2,3]}); +assert( db.getLastError() ); t.insert({k:[4,3]}); +assert( db.getLastError() ); assert( t.count() == 1 ) ; assert( t.find().sort({k:1}).toArray().length == 1 ) ; @@ -33,9 +43,52 @@ t.insert({k:[2,3]}); t.insert({k:[4,3]}); assert( t.count() == 3 ) ; +// Trigger an error, so we can test n of getPrevError() later. +assert.throws( function() { t.find( {$where:'aaa'} ).itcount(); } ); +assert( db.getLastError() ); +checkNprev( 1 ); + t.ensureIndex({k:1}, {unique:true, dropDups:true}); +// Check error flag was not set SERVER-2054. +assert( !db.getLastError() ); +// Check that offset of previous error is correct. +checkNprev( 2 ); + +// Check the dups were dropped. +assert( t.count() == 1 ) ; +assert( t.find().sort({k:1}).toArray().length == 1 ) ; +assert( t.find().sort({k:1}).count() == 1 ) ; + +// Check that a new conflicting insert will cause an error. +t.insert({k:[2,3]}); +assert( db.getLastError() ); + +t.drop(); +t.insert({k:3}); +t.insert({k:[2,3]}); +t.insert({k:[4,3]}); +assert( t.count() == 3 ) ; + + +// Now try with a background index op. + +// Trigger an error, so we can test n of getPrevError() later. +assert.throws( function() { t.find( {$where:'aaa'} ).itcount(); } ); +assert( db.getLastError() ); +checkNprev( 1 ); + +t.ensureIndex({k:1}, {background:true, unique:true, dropDups:true}); +// Check error flag was not set SERVER-2054. +assert( !db.getLastError() ); +// Check that offset of pervious error is correct. +checkNprev( 2 ); + +// Check the dups were dropped. assert( t.count() == 1 ) ; assert( t.find().sort({k:1}).toArray().length == 1 ) ; assert( t.find().sort({k:1}).count() == 1 ) ; +// Check that a new conflicting insert will cause an error. 
+t.insert({k:[2,3]}); +assert( db.getLastError() ); diff --git a/jstests/uniqueness.js b/jstests/uniqueness.js index f1651b31c65..ce19ad08d82 100644 --- a/jstests/uniqueness.js +++ b/jstests/uniqueness.js @@ -26,8 +26,21 @@ db.jstests_uniqueness2.drop(); db.jstests_uniqueness2.insert({a:3}); db.jstests_uniqueness2.insert({a:3}); assert( db.jstests_uniqueness2.count() == 2 , 6) ; +db.resetError(); db.jstests_uniqueness2.ensureIndex({a:1}, true); assert( db.getLastError() , 7); +assert( db.getLastError().match( /E11000/ ) ); + +// Check for an error message when we index in the background and there are dups +db.jstests_uniqueness2.drop(); +db.jstests_uniqueness2.insert({a:3}); +db.jstests_uniqueness2.insert({a:3}); +assert( db.jstests_uniqueness2.count() == 2 , 6) ; +assert( !db.getLastError() ); +db.resetError(); +db.jstests_uniqueness2.ensureIndex({a:1}, {unique:true,background:true}); +assert( db.getLastError() , 7); +assert( db.getLastError().match( /E11000/ ) ); /* Check that if we update and remove _id, it gets added back by the DB */ diff --git a/jstests/updatef.js b/jstests/updatef.js new file mode 100644 index 00000000000..69425932f19 --- /dev/null +++ b/jstests/updatef.js @@ -0,0 +1,24 @@ +// Test unsafe management of nsdt on update command yield SERVER-3208 + +prefixNS = db.jstests_updatef; +prefixNS.save( {} ); + +t = db.jstests_updatef_actual; +t.drop(); + +t.save( {a:0,b:[]} ); +for( i = 0; i < 1000; ++i ) { + t.save( {a:100} ); +} +t.save( {a:0,b:[]} ); + +db.getLastError(); +// Repeatedly rename jstests_updatef to jstests_updatef_ and back. This will +// invalidate the jstests_updatef_actual NamespaceDetailsTransient object. +s = startParallelShell( "for( i=0; i < 100; ++i ) { db.jstests_updatef.renameCollection( 'jstests_updatef_' ); db.jstests_updatef_.renameCollection( 'jstests_updatef' ); }" ); + +for( i=0; i < 20; ++i ) { + t.update( {a:0}, {$push:{b:i}}, false, true ); +} + +s(); diff --git a/jstests/updateg.js b/jstests/updateg.js new file mode 100644 index 00000000000..f8d452f71b2 --- /dev/null +++ b/jstests/updateg.js @@ -0,0 +1,17 @@ +// SERVER-3370 check modifiers with field name characters comparing less than '.' character. 
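+// (in ASCII '-' is 0x2d and '.' is 0x2e, so "all-copy.t" sorts before "all.t" even
+// though "all" is a prefix of "all-copy"; that ordering edge is what this test exercises)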
+ +t = db.jstests_updateg; + +t.drop(); +t.update({}, { '$inc' : { 'all.t' : 1, 'all-copy.t' : 1 }}, true); +assert.eq( 1, t.count( {all:{t:1},'all-copy':{t:1}} ) ); + +t.drop(); +t.save({ 'all' : {}, 'all-copy' : {}}); +t.update({}, { '$inc' : { 'all.t' : 1, 'all-copy.t' : 1 }}); +assert.eq( 1, t.count( {all:{t:1},'all-copy':{t:1}} ) ); + +t.drop(); +t.save({ 'all11' : {}, 'all2' : {}}); +t.update({}, { '$inc' : { 'all11.t' : 1, 'all2.t' : 1 }}); +assert.eq( 1, t.count( {all11:{t:1},'all2':{t:1}} ) ); @@ -44,7 +44,20 @@ # include <windows.h> #endif +#if defined(__linux__) && defined(MONGO_EXPOSE_MACROS) +// glibc's optimized versions are better than g++ builtins +# define __builtin_strcmp strcmp +# define __builtin_strlen strlen +# define __builtin_memchr memchr +# define __builtin_memcmp memcmp +# define __builtin_memcpy memcpy +# define __builtin_memset memset +# define __builtin_memmove memmove +#endif + + #include <ctime> +#include <cstring> #include <sstream> #include <string> #include <memory> @@ -138,7 +151,11 @@ namespace mongo { void asserted(const char *msg, const char *file, unsigned line); } -#define MONGO_assert(_Expression) (void)( (!!(_Expression)) || (mongo::asserted(#_Expression, __FILE__, __LINE__), 0) ) + + +// TODO: Rework the headers so we don't need this craziness +#include "bson/inline_decls.h" +#define MONGO_assert(_Expression) (void)( MONGO_likely(!!(_Expression)) || (mongo::asserted(#_Expression, __FILE__, __LINE__), 0) ) #include "util/debug_util.h" #include "util/goodies.h" diff --git a/rpm/mongo.spec b/rpm/mongo.spec index 332c6d29c96..5287ec4a39a 100644 --- a/rpm/mongo.spec +++ b/rpm/mongo.spec @@ -1,5 +1,5 @@ Name: mongo -Version: 1.9.0 +Version: 1.9.2 Release: mongodb_1%{?dist} Summary: mongo client shell and tools License: AGPL 3.0 diff --git a/s/balance.cpp b/s/balance.cpp index da25f3362c2..0cb39ad038d 100644 --- a/s/balance.cpp +++ b/s/balance.cpp @@ -155,7 +155,7 @@ namespace mongo { cursor.reset(); if ( collections.empty() ) { - log(1) << "no collections to balance" << endl; + LOG(1) << "no collections to balance" << endl; return; } @@ -170,7 +170,7 @@ namespace mongo { vector<Shard> allShards; Shard::getAllShards( allShards ); if ( allShards.size() < 2) { - log(1) << "can't balance without more active shards" << endl; + LOG(1) << "can't balance without more active shards" << endl; return; } @@ -205,7 +205,7 @@ namespace mongo { cursor.reset(); if (shardToChunksMap.empty()) { - log(1) << "skipping empty collection (" << ns << ")"; + LOG(1) << "skipping empty collection (" << ns << ")"; continue; } @@ -282,7 +282,7 @@ namespace mongo { // now make sure we should even be running if ( ! grid.shouldBalance() ) { - log(1) << "skipping balancing round because balancing is disabled" << endl; + LOG(1) << "skipping balancing round because balancing is disabled" << endl; conn.done(); sleepsecs( 30 ); @@ -297,25 +297,25 @@ namespace mongo { { dist_lock_try lk( &balanceLock , "doing balance round" ); if ( ! 
lk.got() ) { - log(1) << "skipping balancing round because another balancer is active" << endl; + LOG(1) << "skipping balancing round because another balancer is active" << endl; conn.done(); sleepsecs( 30 ); // no need to wake up soon continue; } - log(1) << "*** start balancing round" << endl; + LOG(1) << "*** start balancing round" << endl; vector<CandidateChunkPtr> candidateChunks; _doBalanceRound( conn.conn() , &candidateChunks ); if ( candidateChunks.size() == 0 ) { - log(1) << "no need to move any chunk" << endl; + LOG(1) << "no need to move any chunk" << endl; } else { _balancedLastTime = _moveChunks( &candidateChunks ); } - log(1) << "*** end of balancing round" << endl; + LOG(1) << "*** end of balancing round" << endl; } conn.done(); @@ -326,7 +326,7 @@ namespace mongo { log() << "caught exception while doing balance: " << e.what() << endl; // Just to match the opening statement if in log level 1 - log(1) << "*** End of balancing round" << endl; + LOG(1) << "*** End of balancing round" << endl; sleepsecs( 30 ); // sleep a fair amount b/c of error continue; diff --git a/s/balancer_policy.cpp b/s/balancer_policy.cpp index efb0fb924af..f1b4bf14db1 100644 --- a/s/balancer_policy.cpp +++ b/s/balancer_policy.cpp @@ -96,13 +96,13 @@ namespace mongo { return NULL; } - log(1) << "collection : " << ns << endl; - log(1) << "donor : " << max.second << " chunks on " << max.first << endl; - log(1) << "receiver : " << min.second << " chunks on " << min.first << endl; + LOG(1) << "collection : " << ns << endl; + LOG(1) << "donor : " << max.second << " chunks on " << max.first << endl; + LOG(1) << "receiver : " << min.second << " chunks on " << min.first << endl; if ( ! drainingShards.empty() ) { string drainingStr; joinStringDelim( drainingShards, &drainingStr, ',' ); - log(1) << "draining : " << ! drainingShards.empty() << "(" << drainingShards.size() << ")" << endl; + LOG(1) << "draining : " << ! drainingShards.empty() << "(" << drainingShards.size() << ")" << endl; } // Solving imbalances takes a higher priority than draining shards. Many shards can diff --git a/s/chunk.cpp b/s/chunk.cpp index b1984179864..09dc994d961 100644 --- a/s/chunk.cpp +++ b/s/chunk.cpp @@ -208,7 +208,7 @@ namespace mongo { // no split points means there isn't enough data to split on // 1 split point means we have between half the chunk size to full chunk size // so we shouldn't split - log(1) << "chunk not full enough to trigger auto-split" << endl; + LOG(1) << "chunk not full enough to trigger auto-split" << endl; return BSONObj(); } @@ -350,7 +350,7 @@ namespace mongo { // this was implicit before since we did a splitVector on the same socket ShardConnection::sync(); - log(1) << "about to initiate autosplit: " << *this << " dataWritten: " << _dataWritten << " splitThreshold: " << splitThreshold << endl; + LOG(1) << "about to initiate autosplit: " << *this << " dataWritten: " << _dataWritten << " splitThreshold: " << splitThreshold << endl; _dataWritten = 0; // reset so we check often enough @@ -378,7 +378,7 @@ namespace mongo { Shard newLocation = Shard::pick( getShard() ); if ( getShard() == newLocation ) { // if this is the best shard, then we shouldn't do anything (Shard::pick already logged our shard). 
- log(1) << "recently split chunk: " << range << " already in the best shard: " << getShard() << endl; + LOG(1) << "recently split chunk: " << range << " already in the best shard: " << getShard() << endl; return true; // we did split even if we didn't migrate } @@ -386,7 +386,7 @@ namespace mongo { ChunkPtr toMove = cm->findChunk(min); if ( ! (toMove->getMin() == min && toMove->getMax() == max) ){ - log(1) << "recently split chunk: " << range << " modified before we could migrate " << toMove << endl; + LOG(1) << "recently split chunk: " << range << " modified before we could migrate " << toMove << endl; return true; } @@ -666,8 +666,10 @@ namespace mongo { } if ( c ) { - if ( c->contains( obj ) ) + if ( c->contains( key ) ){ + dassert(c->contains(key)); // doesn't use fast-path in extractKey return c; + } PRINT(foo); PRINT(*c); @@ -791,7 +793,7 @@ namespace mongo { set<Shard> seen; - log(1) << "ChunkManager::drop : " << _ns << endl; + LOG(1) << "ChunkManager::drop : " << _ns << endl; // lock all shards so no one can do a split/migrate for ( ChunkMap::const_iterator i=_chunkMap.begin(); i!=_chunkMap.end(); ++i ) { @@ -799,7 +801,7 @@ namespace mongo { seen.insert( c->getShard() ); } - log(1) << "ChunkManager::drop : " << _ns << "\t all locked" << endl; + LOG(1) << "ChunkManager::drop : " << _ns << "\t all locked" << endl; // delete data from mongod for ( set<Shard>::iterator i=seen.begin(); i!=seen.end(); i++ ) { @@ -808,13 +810,13 @@ namespace mongo { conn.done(); } - log(1) << "ChunkManager::drop : " << _ns << "\t removed shard data" << endl; + LOG(1) << "ChunkManager::drop : " << _ns << "\t removed shard data" << endl; // remove chunk data ScopedDbConnection conn( configServer.modelServer() ); conn->remove( Chunk::chunkMetadataNS , BSON( "ns" << _ns ) ); conn.done(); - log(1) << "ChunkManager::drop : " << _ns << "\t removed chunk data" << endl; + LOG(1) << "ChunkManager::drop : " << _ns << "\t removed chunk data" << endl; for ( set<Shard>::iterator i=seen.begin(); i!=seen.end(); i++ ) { ScopedDbConnection conn( *i ); @@ -830,7 +832,7 @@ namespace mongo { conn.done(); } - log(1) << "ChunkManager::drop : " << _ns << "\t DONE" << endl; + LOG(1) << "ChunkManager::drop : " << _ns << "\t DONE" << endl; configServer.logChange( "dropCollection" , _ns , BSONObj() ); } @@ -841,7 +843,7 @@ namespace mongo { vector<BSONObj> splitPoints; soleChunk->pickSplitVector( splitPoints , Chunk::MaxChunkSize ); if ( splitPoints.empty() ) { - log(1) << "not enough data to warrant chunking " << getns() << endl; + LOG(1) << "not enough data to warrant chunking " << getns() << endl; return; } @@ -983,7 +985,7 @@ namespace mongo { void run() { runShardChunkVersion(); - log(1) << "shardObjTest passed" << endl; + LOG(1) << "shardObjTest passed" << endl; } } shardObjTest; @@ -1008,7 +1010,7 @@ namespace mongo { cmdBuilder.append( "shardHost" , s.getConnString() ); BSONObj cmd = cmdBuilder.obj(); - log(1) << " setShardVersion " << s.getName() << " " << conn.getServerAddress() << " " << ns << " " << cmd << " " << &conn << endl; + LOG(1) << " setShardVersion " << s.getName() << " " << conn.getServerAddress() << " " << ns << " " << cmd << " " << &conn << endl; return conn.runCommand( "admin" , cmd , result ); } diff --git a/s/commands_admin.cpp b/s/commands_admin.cpp index 4cb30f99a3b..4568c4d3897 100644 --- a/s/commands_admin.cpp +++ b/s/commands_admin.cpp @@ -45,6 +45,7 @@ #include "stats.h" #include "writeback_listener.h" #include "client.h" +#include "../util/ramlog.h" namespace mongo { @@ -82,7 +83,7 @@ namespace 
diff --git a/s/commands_admin.cpp b/s/commands_admin.cpp
index 4cb30f99a3b..4568c4d3897 100644
--- a/s/commands_admin.cpp
+++ b/s/commands_admin.cpp
@@ -45,6 +45,7 @@
 #include "stats.h"
 #include "writeback_listener.h"
 #include "client.h"
+#include "../util/ramlog.h"
 namespace mongo {
@@ -82,7 +83,7 @@ namespace mongo {
        virtual void help( stringstream& help ) const {
            help << " shows status/reachability of servers in the cluster";
        }
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            result.append("configserver", configServer.getPrimary().getConnString() );
            result.append("isdbgrid", 1);
            return true;
@@ -95,7 +96,7 @@ namespace mongo {
        virtual void help( stringstream& help ) const {
            help << "flush all router config";
        }
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            grid.flushConfig();
            result.appendBool( "flushed" , true );
            return true;
@@ -112,7 +113,7 @@ namespace mongo {
        virtual bool slaveOk() const { return true; }
        virtual LockType locktype() const { return NONE; }
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
            result.append( "host" , prettyHostName() );
            result.append("version", versionString);
            result.append("process","mongos");
@@ -177,6 +178,20 @@ namespace mongo {
                bb.done();
            }
+            {
+                RamLog* rl = RamLog::get( "warnings" );
+                verify(15879, rl);
+
+                if (rl->lastWrite() >= time(0)-(10*60)){ // only show warnings from last 10 minutes
+                    vector<const char*> lines;
+                    rl->get( lines );
+
+                    BSONArrayBuilder arr( result.subarrayStart( "warnings" ) );
+                    for ( unsigned i=std::max(0,(int)lines.size()-10); i<lines.size(); i++ )
+                        arr.append( lines[i] );
+                    arr.done();
+                }
+            }
            return 1;
        }
@@ -187,7 +202,7 @@ namespace mongo {
    class FsyncCommand : public GridAdminCmd {
    public:
        FsyncCommand() : GridAdminCmd( "fsync" ) {}
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            if ( cmdObj["lock"].trueValue() ) {
                errmsg = "can't do lock through mongos";
                return false;
@@ -228,7 +243,7 @@ namespace mongo {
        virtual void help( stringstream& help ) const {
            help << " example: { moveprimary : 'foo' , to : 'localhost:9999' }";
        }
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            string dbname = cmdObj.firstElement().valuestrsafe();
            if ( dbname.size() == 0 ) {
@@ -323,7 +338,7 @@ namespace mongo {
                << "Enable sharding for a db. (Use 'shardcollection' command afterwards.)\n"
                << " { enablesharding : \"<dbname>\" }\n";
        }
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            string dbname = cmdObj.firstElement().valuestrsafe();
            if ( dbname.size() == 0 ) {
                errmsg = "no db";
@@ -368,7 +383,7 @@ namespace mongo {
                << " { enablesharding : \"<dbname>\" }\n";
        }
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            string ns = cmdObj.firstElement().valuestrsafe();
            if ( ns.size() == 0 ) {
                errmsg = "no ns";
@@ -517,7 +532,7 @@ namespace mongo {
            help << " example: { getShardVersion : 'alleyinsider.foo' } ";
        }
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            string ns = cmdObj.firstElement().valuestrsafe();
            if ( ns.size() == 0 ) {
                errmsg = "need to specify full namespace";
                return false;
            }
@@ -530,7 +545,7 @@ namespace mongo {
                return false;
            }
-            ChunkManagerPtr cm = config->getChunkManager( ns );
+            ChunkManagerPtr cm = config->getChunkManagerIfExists( ns );
            if ( ! cm ) {
                errmsg = "no chunk manager?";
                return false;
            }
@@ -555,7 +570,7 @@ namespace mongo {
                ;
        }
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            if ( ! okForConfigChanges( errmsg ) )
                return false;
@@ -633,7 +648,7 @@ namespace mongo {
        virtual void help( stringstream& help ) const {
            help << "{ movechunk : 'test.foo' , find : { num : 1 } , to : 'localhost:30001' }";
        }
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            if ( ! okForConfigChanges( errmsg ) )
                return false;
@@ -710,7 +725,7 @@ namespace mongo {
        virtual void help( stringstream& help ) const { help << "list all shards of the system"; }
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            ScopedDbConnection conn( configServer.getPrimary() );
            vector<BSONObj> all;
@@ -734,7 +749,7 @@ namespace mongo {
        virtual void help( stringstream& help ) const { help << "add a new shard to the system"; }
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            errmsg.clear();
            // get replica set component hosts
@@ -795,7 +810,7 @@ namespace mongo {
        virtual void help( stringstream& help ) const { help << "remove a shard from the system."; }
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            string target = cmdObj.firstElement().valuestrsafe();
            Shard s = Shard::make( target );
            if ( ! grid.knowAboutShard( s.getConnString() ) ) {
@@ -878,11 +893,12 @@ namespace mongo {
    class IsDbGridCmd : public Command {
    public:
        virtual LockType locktype() const { return NONE; }
+        virtual bool requiresAuth() { return false; }
        virtual bool slaveOk() const { return true; }
        IsDbGridCmd() : Command("isdbgrid") { }
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            result.append("isdbgrid", 1);
            result.append("hostname", getHostNameCached());
            return true;
@@ -900,7 +916,7 @@ namespace mongo {
            help << "test if this is master half of a replica pair";
        }
        CmdIsMaster() : Command("isMaster" , false , "ismaster") { }
-        virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            result.appendBool("ismaster", true );
            result.append("msg", "isdbgrid");
            result.appendNumber("maxBsonObjectSize", BSONObjMaxUserSize);
@@ -924,7 +940,7 @@ namespace mongo {
        virtual void help( stringstream &help ) const {
            help << "{whatsmyuri:1}";
        }
-        virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            result << "you" << ClientInfo::get()->getRemote();
            return true;
        }
@@ -942,7 +958,7 @@ namespace mongo {
            help << "get previous error (since last reseterror command)";
        }
        CmdShardingGetPrevError() : Command( "getPrevError" , false , "getpreverror") { }
-        virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            errmsg += "getpreverror not supported for sharded environments";
            return false;
        }
@@ -960,7 +976,7 @@ namespace mongo {
        }
        CmdShardingGetLastError() : Command("getLastError" , false , "getlasterror") { }
-        virtual bool run(const string& dbName, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        virtual bool run(const string& dbName, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            LastError *le = lastError.disableForCommand();
            {
                assert( le );
@@ -987,7 +1003,7 @@ namespace mongo {
            return true;
        }
-        bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+        bool run(const string& dbName , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
            LastError *le = lastError.get();
            if ( le )
                le->reset();
@@ -1018,7 +1034,7 @@ namespace mongo {
        virtual LockType locktype() const { return NONE; }
        virtual void help( stringstream& help ) const { help << "list databases on cluster"; }
-        bool run(const string& , BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+        bool run(const string& , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
            vector<Shard> shards;
            Shard::getAllShards( shards );
@@ -1115,7 +1131,7 @@ namespace mongo {
        virtual LockType locktype() const { return NONE; }
        virtual void help( stringstream& help ) const { help << "Not supported sharded"; }
-        bool run(const string& , BSONObj& jsobj, string& errmsg, BSONObjBuilder& /*result*/, bool /*fromRepl*/) {
+        bool run(const string& , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& /*result*/, bool /*fromRepl*/) {
            errmsg = "closeAllDatabases isn't supported through mongos";
            return false;
        }
@@ -1131,7 +1147,7 @@ namespace mongo {
        virtual LockType locktype() const { return NONE; }
        virtual void help( stringstream& help ) const { help << "Not supported through mongos"; }
-        bool run(const string& , BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+        bool run(const string& , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
            if ( jsobj["forShell"].trueValue() )
                lastError.disableForCommand();
@@ -1148,7 +1164,7 @@ namespace mongo {
            << "either (1) ran from localhost or (2) authenticated.";
    }
-    bool CmdShutdown::run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+    bool CmdShutdown::run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
        return shutdownHelper();
    }
diff --git a/s/commands_public.cpp b/s/commands_public.cpp
index 713b9489fc2..ef7110c7646 100644
--- a/s/commands_public.cpp
+++ b/s/commands_public.cpp
@@ -53,22 +53,34 @@ namespace mongo {
            return false;
        }
+        // Override if passthrough should also send query options
+        // Safer as off by default, can slowly enable as we add more tests
+        virtual bool passOptions() const { return false; }
+
        // all grid commands are designed not to lock
        virtual LockType locktype() const { return NONE; }
    protected:
+
        bool passthrough( DBConfigPtr conf, const BSONObj& cmdObj , BSONObjBuilder& result ) {
-            return _passthrough(conf->getName(), conf, cmdObj, result);
+            return _passthrough(conf->getName(), conf, cmdObj, 0, result);
        }
        bool adminPassthrough( DBConfigPtr conf, const BSONObj& cmdObj , BSONObjBuilder& result ) {
-            return _passthrough("admin", conf, cmdObj, result);
+            return _passthrough("admin", conf, cmdObj, 0, result);
+        }
+
+        bool passthrough( DBConfigPtr conf, const BSONObj& cmdObj , int options, BSONObjBuilder& result ) {
+            return _passthrough(conf->getName(), conf, cmdObj, options, result);
+        }
+        bool adminPassthrough( DBConfigPtr conf, const BSONObj& cmdObj , int options, BSONObjBuilder& result ) {
+            return _passthrough("admin", conf, cmdObj, options, result);
        }
    private:
-        bool _passthrough(const string& db, DBConfigPtr conf, const BSONObj& cmdObj , BSONObjBuilder& result ) {
+        bool _passthrough(const string& db, DBConfigPtr conf, const BSONObj& cmdObj , int options , BSONObjBuilder& result ) {
            ShardConnection conn( conf->getPrimary() , "" );
            BSONObj res;
-            bool ok = conn->runCommand( db , cmdObj , res );
+            bool ok = conn->runCommand( db , cmdObj , res , passOptions() ? options : 0 );
            if ( ! ok && res["code"].numberInt() == StaleConfigInContextCode ) {
                conn.done();
                throw StaleConfigException("foo","command failed because of stale config");
@@ -99,13 +111,14 @@ namespace mongo {
        virtual void aggregateResults(const vector<BSONObj>& results, BSONObjBuilder& output) {}
        // don't override
-        virtual bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& output, bool) {
+        virtual bool run(const string& dbName , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& output, bool) {
+            LOG(1) << "RunOnAllShardsCommand db: " << dbName << " cmd:" << cmdObj << endl;
            set<Shard> shards;
            getShards(dbName, cmdObj, shards);
            list< shared_ptr<Future::CommandResult> > futures;
            for ( set<Shard>::const_iterator i=shards.begin(), end=shards.end() ; i != end ; i++ ) {
-                futures.push_back( Future::spawnCommand( i->getConnString() , dbName , cmdObj ) );
+                futures.push_back( Future::spawnCommand( i->getConnString() , dbName , cmdObj, 0 ) );
            }
            vector<BSONObj> results;
@@ -159,13 +172,13 @@ namespace mongo {
        virtual string getFullNS( const string& dbName , const BSONObj& cmdObj ) = 0;
-        virtual bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        virtual bool run(const string& dbName , BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool) {
            string fullns = getFullNS( dbName , cmdObj );
            DBConfigPtr conf = grid.getDBConfig( dbName , false );
            if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) {
-                return passthrough( conf , cmdObj , result );
+                return passthrough( conf , cmdObj , options, result );
            }
            errmsg = "can't do command: " + name + " on sharded collection";
            return false;
@@ -184,6 +197,16 @@ namespace mongo {
        ReIndexCmd() : AllShardsCollectionCommand("reIndex") {}
    } reIndexCmd;
+    class ProfileCmd : public PublicGridCommand {
+    public:
+        ProfileCmd() : PublicGridCommand("profile") {}
+        virtual bool run(const string& dbName , BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool) {
+            errmsg = "profile currently not supported via mongos";
+            return false;
+        }
+    } profileCmd;
+
+
    class ValidateCmd : public AllShardsCollectionCommand {
    public:
        ValidateCmd() : AllShardsCollectionCommand("validate") {}
@@ -255,7 +278,7 @@ namespace mongo {
    class DropCmd : public PublicGridCommand {
    public:
        DropCmd() : PublicGridCommand( "drop" ) {}
-        bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& dbName , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            string collection = cmdObj.firstElement().valuestrsafe();
            string fullns = dbName + "." + collection;
@@ -280,7 +303,7 @@ namespace mongo {
    class DropDBCmd : public PublicGridCommand {
    public:
        DropDBCmd() : PublicGridCommand( "dropDatabase" ) {}
-        bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& dbName , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            BSONElement e = cmdObj.firstElement();
@@ -309,7 +332,7 @@ namespace mongo {
    class RenameCollectionCmd : public PublicGridCommand {
    public:
        RenameCollectionCmd() : PublicGridCommand( "renameCollection" ) {}
-        bool run(const string& dbName, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& dbName, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            string fullnsFrom = cmdObj.firstElement().valuestrsafe();
            string dbNameFrom = nsToDatabase( fullnsFrom.c_str() );
            DBConfigPtr confFrom = grid.getDBConfig( dbNameFrom , false );
@@ -334,7 +357,7 @@ namespace mongo {
    class CopyDBCmd : public PublicGridCommand {
    public:
        CopyDBCmd() : PublicGridCommand( "copydb" ) {}
-        bool run(const string& dbName, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& dbName, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            string todb = cmdObj.getStringField("todb");
            uassert(13402, "need a todb argument", !todb.empty());
@@ -370,7 +393,8 @@ namespace mongo {
    class CountCmd : public PublicGridCommand {
    public:
        CountCmd() : PublicGridCommand("count") { }
-        bool run(const string& dbName, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool l) {
+        virtual bool passOptions() const { return true; }
+        bool run(const string& dbName, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool) {
            string collection = cmdObj.firstElement().valuestrsafe();
            string fullns = dbName + "." + collection;
@@ -379,12 +403,11 @@ namespace mongo {
                filter = cmdObj["query"].Obj();
            DBConfigPtr conf = grid.getDBConfig( dbName , false );
-
            if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) {
                ShardConnection conn( conf->getPrimary() , fullns );
                BSONObj temp;
-                bool ok = conn->runCommand( dbName , cmdObj , temp );
+                bool ok = conn->runCommand( dbName , cmdObj , temp, options );
                conn.done();
                if ( ok ) {
@@ -399,7 +422,7 @@ namespace mongo {
                }
                // this collection got sharded
-                ChunkManagerPtr cm = conf->getChunkManager( fullns , true );
+                ChunkManagerPtr cm = conf->getChunkManagerIfExists( fullns , true );
                if ( ! cm ) {
                    errmsg = "should be sharded now";
                    result.append( "root" , temp );
@@ -410,11 +433,11 @@ namespace mongo {
            long long total = 0;
            map<string,long long> shardCounts;
-            ChunkManagerPtr cm = conf->getChunkManager( fullns );
+            ChunkManagerPtr cm = conf->getChunkManagerIfExists( fullns );
            while ( true ) {
                if ( ! cm ) {
                    // probably unsharded now
-                    return run( dbName , cmdObj , errmsg , result , l );
+                    return run( dbName , cmdObj , options , errmsg , result, false );
                }
                set<Shard> shards;
@@ -428,14 +451,14 @@ namespace mongo {
                    if ( conn.setVersion() ) {
                        total = 0;
                        shardCounts.clear();
-                        cm = conf->getChunkManager( fullns );
+                        cm = conf->getChunkManagerIfExists( fullns );
                        conn.done();
                        hadToBreak = true;
                        break;
                    }
                    BSONObj temp;
-                    bool ok = conn->runCommand( dbName , BSON( "count" << collection << "query" << filter ) , temp );
+                    bool ok = conn->runCommand( dbName , BSON( "count" << collection << "query" << filter ) , temp, options );
                    conn.done();
                    if ( ok ) {
@@ -449,7 +472,7 @@ namespace mongo {
                        // my version is old
                        total = 0;
                        shardCounts.clear();
-                        cm = conf->getChunkManager( fullns , true );
+                        cm = conf->getChunkManagerIfExists( fullns , true );
                        hadToBreak = true;
                        break;
                    }
@@ -476,14 +499,13 @@ namespace mongo {
    class CollectionStats : public PublicGridCommand {
    public:
        CollectionStats() : PublicGridCommand("collStats", "collstats") { }
-        bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& dbName , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            string collection = cmdObj.firstElement().valuestrsafe();
            string fullns = dbName + "." + collection;
            DBConfigPtr conf = grid.getDBConfig( dbName , false );
            if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) {
-
                result.append( "ns" , fullns );
                result.appendBool("sharded", false);
                result.append( "primary" , conf->getPrimary().getName() );
                return passthrough( conf , cmdObj , result);
@@ -602,7 +624,7 @@ namespace mongo {
    class FindAndModifyCmd : public PublicGridCommand {
    public:
        FindAndModifyCmd() : PublicGridCommand("findAndModify", "findandmodify") { }
-        bool run(const string& dbName, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& dbName, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            string collection = cmdObj.firstElement().valuestrsafe();
            string fullns = dbName + "." + collection;
@@ -639,7 +661,7 @@ namespace mongo {
    class DataSizeCmd : public PublicGridCommand {
    public:
        DataSizeCmd() : PublicGridCommand("dataSize", "datasize") { }
-        bool run(const string& dbName, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& dbName, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            string fullns = cmdObj.firstElement().String();
            DBConfigPtr conf = grid.getDBConfig( dbName , false );
@@ -703,7 +725,7 @@ namespace mongo {
    class GroupCmd : public NotAllowedOnShardedCollectionCmd {
    public:
        GroupCmd() : NotAllowedOnShardedCollectionCmd("group") {}
-
+        virtual bool passOptions() const { return true; }
        virtual string getFullNS( const string& dbName , const BSONObj& cmdObj ) {
            return dbName + "." + cmdObj.firstElement().embeddedObjectUserCheck()["ns"].valuestrsafe();
        }
@@ -716,14 +738,15 @@ namespace mongo {
        virtual void help( stringstream &help ) const {
            help << "{ distinct : 'collection name' , key : 'a.b' , query : {} }";
        }
-        bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        virtual bool passOptions() const { return true; }
+        bool run(const string& dbName , BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool) {
            string collection = cmdObj.firstElement().valuestrsafe();
            string fullns = dbName + "." + collection;
            DBConfigPtr conf = grid.getDBConfig( dbName , false );
            if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) {
-                return passthrough( conf , cmdObj , result );
+                return passthrough( conf , cmdObj , options, result );
            }
            ChunkManagerPtr cm = conf->getChunkManager( fullns );
@@ -739,7 +762,7 @@ namespace mongo {
            for ( set<Shard>::iterator i=shards.begin(), end=shards.end() ; i != end; ++i ) {
                ShardConnection conn( *i , fullns );
                BSONObj res;
-                bool ok = conn->runCommand( conf->getName() , cmdObj , res );
+                bool ok = conn->runCommand( conf->getName() , cmdObj , res, options );
                conn.done();
                if ( ! ok ) {
@@ -774,7 +797,7 @@ namespace mongo {
        virtual void help( stringstream &help ) const {
            help << " example: { filemd5 : ObjectId(aaaaaaa) , root : \"fs\" }";
        }
-        bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& dbName , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            string fullns = dbName;
            fullns += ".";
            {
@@ -811,15 +834,15 @@ namespace mongo {
    public:
        Geo2dFindNearCmd() : PublicGridCommand( "geoNear" ) {}
        void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/Geospatial+Indexing#GeospatialIndexing-geoNearCommand"; }
-
-        bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        virtual bool passOptions() const { return true; }
+        bool run(const string& dbName , BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool) {
            string collection = cmdObj.firstElement().valuestrsafe();
            string fullns = dbName + "." + collection;
            DBConfigPtr conf = grid.getDBConfig( dbName , false );
            if ( ! conf || ! conf->isShardingEnabled() || ! conf->isSharded( fullns ) ) {
-                return passthrough( conf , cmdObj , result );
+                return passthrough( conf , cmdObj , options, result );
            }
            ChunkManagerPtr cm = conf->getChunkManager( fullns );
@@ -836,7 +859,7 @@ namespace mongo {
            list< shared_ptr<Future::CommandResult> > futures;
            BSONArrayBuilder shardArray;
            for ( set<Shard>::const_iterator i=shards.begin(), end=shards.end() ; i != end ; i++ ) {
-                futures.push_back( Future::spawnCommand( i->getConnString() , dbName , cmdObj ) );
+                futures.push_back( Future::spawnCommand( i->getConnString() , dbName , cmdObj, options ) );
                shardArray.append(i->getName());
            }
@@ -946,7 +969,7 @@ namespace mongo {
            return b.obj();
        }
-        bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& dbName , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            Timer t;
            string collection = cmdObj.firstElement().valuestrsafe();
@@ -1009,7 +1032,7 @@ namespace mongo {
            for ( set<Shard>::iterator i=shards.begin(), end=shards.end() ; i != end ; i++ ) {
                shared_ptr<ShardConnection> temp( new ShardConnection( i->getConnString() , fullns ) );
                assert( temp->get() );
-                futures.push_back( Future::spawnCommand( i->getConnString() , dbName , shardedCommand , temp->get() ) );
+                futures.push_back( Future::spawnCommand( i->getConnString() , dbName , shardedCommand , 0 , temp->get() ) );
                shardConns.push_back( temp );
            }
@@ -1096,7 +1119,7 @@ namespace mongo {
            mr_shard::Config config( dbName , cmdObj );
            mr_shard::State state(config);
-            log(1) << "mr sharded output ns: " << config.ns << endl;
+            LOG(1) << "mr sharded output ns: " << config.ns << endl;
            if (config.outType == mr_shard::Config::INMEMORY) {
                errmsg = "This Map Reduce mode is not supported with sharded output";
@@ -1200,7 +1223,7 @@ namespace mongo {
            BSONObj finalCmdObj = finalCmd.obj();
            for ( set<Shard>::iterator i=shards.begin(), end=shards.end() ; i != end ; i++ ) {
                shared_ptr<ShardConnection> temp( new ShardConnection( i->getConnString() , outns ) );
-                futures.push_back( Future::spawnCommand( i->getConnString() , dbName , finalCmdObj , temp->get() ) );
+                futures.push_back( Future::spawnCommand( i->getConnString() , dbName , finalCmdObj , 0 , temp->get() ) );
                shardConns.push_back( temp );
            }
@@ -1268,7 +1291,7 @@ namespace mongo {
    class ApplyOpsCmd : public PublicGridCommand {
    public:
        ApplyOpsCmd() : PublicGridCommand( "applyOps" ) {}
-        virtual bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        virtual bool run(const string& dbName , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            errmsg = "applyOps not allowed through mongos";
            return false;
        }
@@ -1277,7 +1300,7 @@ namespace mongo {
    class CompactCmd : public PublicGridCommand {
    public:
        CompactCmd() : PublicGridCommand( "compact" ) {}
-        virtual bool run(const string& dbName , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        virtual bool run(const string& dbName , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            errmsg = "compact not allowed through mongos";
            return false;
        }
@@ -1285,7 +1308,7 @@ namespace mongo {
    }
-    bool Command::runAgainstRegistered(const char *ns, BSONObj& jsobj, BSONObjBuilder& anObjBuilder) {
+    bool Command::runAgainstRegistered(const char *ns, BSONObj& jsobj, BSONObjBuilder& anObjBuilder, int queryOptions) {
        const char *p = strchr(ns, '.');
        if ( !p ) return false;
        if ( strcmp(p, ".$cmd") != 0 ) return false;
@@ -1326,7 +1349,7 @@ namespace mongo {
            anObjBuilder.append( "help" , help.str() );
        }
        else {
-            ok = c->run( nsToDatabase( ns ) , jsobj, errmsg, anObjBuilder, false);
+            ok = c->run( nsToDatabase( ns ) , jsobj, queryOptions, errmsg, anObjBuilder, false );
        }
        BSONObj tmp = anObjBuilder.asTempObj();
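The commands_public.cpp diff above is dominated by one mechanical change: Command::run() and runAgainstRegistered() gain an int options / queryOptions parameter, and passthrough commands forward those wire-protocol flags to the shards only when the new passOptions() override says so, keeping the behavior opt-in per command. A self-contained sketch of the gating pattern; the types below are stand-ins for the real BSONObj/ShardConnection machinery and the names are illustrative, not from the tree:

    #include <iostream>
    #include <string>

    struct StubConnection {
        bool runCommand( const std::string& db , const std::string& cmd , int options ) {
            std::cout << "run " << cmd << " on " << db << " options=" << options << "\n";
            return true;
        }
    };

    class GridCommandSketch {
    public:
        virtual ~GridCommandSketch() {}
        // off by default; commands opt in one at a time as they get tested
        virtual bool passOptions() const { return false; }
        bool passthrough( StubConnection& conn , const std::string& db ,
                          const std::string& cmd , int options ) {
            // the caller's query options reach the shard only on opt-in
            return conn.runCommand( db , cmd , passOptions() ? options : 0 );
        }
    };

    class CountSketch : public GridCommandSketch {
        virtual bool passOptions() const { return true; } // like CountCmd above
    };

    int main() {
        StubConnection conn;
        GridCommandSketch dflt;
        CountSketch count;
        dflt.passthrough( conn , "test" , "collStats" , 4 );  // sends options=0
        count.passthrough( conn , "test" , "count" , 4 );     // sends options=4
        return 0;
    }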
diff --git a/s/config.cpp b/s/config.cpp
index b65443cb0bd..f9e548235d3 100644
--- a/s/config.cpp
+++ b/s/config.cpp
@@ -185,6 +185,16 @@ namespace mongo {
        return true;
    }
+    ChunkManagerPtr DBConfig::getChunkManagerIfExists( const string& ns, bool shouldReload ){
+        try{
+            return getChunkManager( ns, shouldReload );
+        }
+        catch( AssertionException& e ){
+            warning() << "chunk manager not found for " << ns << causedBy( e ) << endl;
+            return ChunkManagerPtr();
+        }
+    }
+
    ChunkManagerPtr DBConfig::getChunkManager( const string& ns , bool shouldReload ) {
        BSONObj key;
        bool unique;
@@ -201,8 +211,8 @@ namespace mongo {
            _reload();
            ci = _collections[ns];
        }
-        massert( 10181 , (string)"not sharded:" + ns , ci.isSharded() || ci.wasDropped() );
-        assert( ci.wasDropped() || ! ci.key().isEmpty() );
+        massert( 10181 , (string)"not sharded:" + ns , ci.isSharded() );
+        assert( ! ci.key().isEmpty() );
        if ( ! shouldReload || earlyReload )
            return ci.getCM();
@@ -226,6 +236,7 @@ namespace mongo {
        if ( v == oldVersion ) {
            scoped_lock lk( _lock );
            CollectionInfo& ci = _collections[ns];
+            massert( 15885 , str::stream() << "not sharded after reloading from chunks : " << ns , ci.isSharded() );
            return ci.getCM();
        }
    }
@@ -244,7 +255,7 @@ namespace mongo {
        scoped_lock lk( _lock );
        CollectionInfo& ci = _collections[ns];
-        massert( 14822 , (string)"state changed in the middle: " + ns , ci.isSharded() || ci.wasDropped() );
+        massert( 14822 , (string)"state changed in the middle: " + ns , ci.isSharded() );
        if ( temp->getVersion() > ci.getCM()->getVersion() ) {
            // we only want to reset if we're newer
@@ -252,6 +263,7 @@ namespace mongo {
            ci.resetCM( temp.release() );
        }
+        massert( 15883 , str::stream() << "not sharded after chunk manager reset : " << ns , ci.isSharded() );
        return ci.getCM();
    }
@@ -268,7 +280,7 @@ namespace mongo {
    }
    void DBConfig::unserialize(const BSONObj& from) {
-        log(1) << "DBConfig unserialize: " << _name << " " << from << endl;
+        LOG(1) << "DBConfig unserialize: " << _name << " " << from << endl;
        assert( _name == from["_id"].String() );
        _shardingEnabled = from.getBoolField("partitioned");
@@ -300,13 +312,14 @@ namespace mongo {
        unserialize( o );
        BSONObjBuilder b;
-        b.appendRegex( "_id" , (string)"^" + _name + "." );
+        b.appendRegex( "_id" , (string)"^" + _name + "\\." );
        auto_ptr<DBClientCursor> cursor = conn->query( ShardNS::collection ,b.obj() );
        assert( cursor.get() );
        while ( cursor->more() ) {
            BSONObj o = cursor->next();
-            _collections[o["_id"].String()] = CollectionInfo( o );
+            if( o["dropped"].trueValue() ) _collections.erase( o["_id"].String() );
+            else _collections[o["_id"].String()] = CollectionInfo( o );
        }
        conn.done();
@@ -369,7 +382,7 @@ namespace mongo {
        // 1
        if ( ! configServer.allUp( errmsg ) ) {
-            log(1) << "\t DBConfig::dropDatabase not all up" << endl;
+            LOG(1) << "\t DBConfig::dropDatabase not all up" << endl;
            return 0;
        }
@@ -392,7 +405,7 @@ namespace mongo {
            log() << "error removing from config server even after checking!" << endl;
            return 0;
        }
-        log(1) << "\t removed entry from config server for: " << _name << endl;
+        LOG(1) << "\t removed entry from config server for: " << _name << endl;
        set<Shard> allServers;
@@ -428,7 +441,7 @@ namespace mongo {
            conn.done();
        }
-        log(1) << "\t dropped primary db for: " << _name << endl;
+        LOG(1) << "\t dropped primary db for: " << _name << endl;
        configServer.logChange( "dropDatabase" , _name , BSONObj() );
        return true;
@@ -440,6 +453,7 @@ namespace mongo {
        while ( true ) {
            Collections::iterator i = _collections.begin();
            for ( ; i != _collections.end(); ++i ) {
+                // log() << "coll : " << i->first << " and " << i->second.isSharded() << endl;
                if ( i->second.isSharded() )
                    break;
            }
@@ -453,7 +467,7 @@ namespace mongo {
            }
            seen.insert( i->first );
-            log(1) << "\t dropping sharded collection: " << i->first << endl;
+            LOG(1) << "\t dropping sharded collection: " << i->first << endl;
            i->second.getCM()->getAllShards( allServers );
            i->second.getCM()->drop( i->second.getCM() );
@@ -461,7 +475,7 @@ namespace mongo {
            num++;
            uassert( 10184 , "_dropShardedCollections too many collections - bailing" , num < 100000 );
-            log(2) << "\t\t dropped " << num << " so far" << endl;
+            LOG(2) << "\t\t dropped " << num << " so far" << endl;
        }
        return true;
@@ -528,7 +542,7 @@ namespace mongo {
        string fullString;
        joinStringDelim( configHosts, &fullString, ',' );
        _primary.setAddress( ConnectionString( fullString , ConnectionString::SYNC ) );
-        log(1) << " config string : " << fullString << endl;
+        LOG(1) << " config string : " << fullString << endl;
        return true;
    }
@@ -609,7 +623,7 @@ namespace mongo {
        if ( checkConsistency ) {
            string errmsg;
            if ( ! checkConfigServersConsistent( errmsg ) ) {
-                log( LL_ERROR ) << "config servers not in sync! " << errmsg << endl;
+                log( LL_ERROR ) << "config servers not in sync! " << errmsg << warnings;
                return false;
            }
        }
@@ -672,7 +686,7 @@ namespace mongo {
            string name = o["_id"].valuestrsafe();
            got.insert( name );
            if ( name == "chunksize" ) {
-                log(1) << "MaxChunkSize: " << o["value"] << endl;
+                LOG(1) << "MaxChunkSize: " << o["value"] << endl;
                Chunk::MaxChunkSize = o["value"].numberInt() * 1024 * 1024;
            }
            else if ( name == "balancer" ) {
@@ -746,7 +760,7 @@ namespace mongo {
            conn->createCollection( "config.changelog" , 1024 * 1024 * 10 , true );
        }
        catch ( UserException& e ) {
-            log(1) << "couldn't create changelog (like race condition): " << e << endl;
+            LOG(1) << "couldn't create changelog (like race condition): " << e << endl;
            // don't care
        }
        createdCapped = true;
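The config.cpp change above is the pivot for several call-site edits in this commit: getChunkManagerIfExists() converts the "not sharded" massert inside getChunkManager() into a warning plus an empty ChunkManagerPtr, so a command racing against a drop can recover instead of aborting. The callers converted elsewhere in the diff (getShardVersion, count, the insert retry paths) all take the same shape; a hedged sketch of that caller-side pattern, with conf and fullns as in those call sites, not a new API:

    ChunkManagerPtr cm = conf->getChunkManagerIfExists( fullns );
    if ( ! cm ) {
        // collection was dropped or is no longer sharded;
        // fall back to the unsharded/primary-shard path instead of throwing
    }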
diff --git a/s/config.h b/s/config.h
index 6c8f8934aed..90c06cb0223 100644
--- a/s/config.h
+++ b/s/config.h
@@ -143,6 +143,7 @@ namespace mongo {
        bool isSharded( const string& ns );
        ChunkManagerPtr getChunkManager( const string& ns , bool reload = false );
+        ChunkManagerPtr getChunkManagerIfExists( const string& ns , bool reload = false );
        /**
         * @return the correct for shard for the ns
diff --git a/s/cursors.cpp b/s/cursors.cpp
index c65cdb9f97b..e8aeffb1cb4 100644
--- a/s/cursors.cpp
+++ b/s/cursors.cpp
@@ -112,7 +112,7 @@ namespace mongo {
        }
        bool hasMore = sendMore && _cursor->more();
-        log(6) << "\t hasMore:" << hasMore << " wouldSendMoreIfHad: " << sendMore << " id:" << getId() << " totalSent: " << _totalSent << endl;
+        LOG(6) << "\t hasMore:" << hasMore << " wouldSendMoreIfHad: " << sendMore << " id:" << getId() << " totalSent: " << _totalSent << endl;
        replyToQuery( 0 , r.p() , r.m() , b.buf() , b.len() , num , _totalSent , hasMore ? getId() : 0 );
        _totalSent += num;
@@ -131,13 +131,15 @@ namespace mongo {
    CursorCache::~CursorCache() {
        // TODO: delete old cursors?
-        int logLevel = 1;
+        bool print = logLevel > 0;
        if ( _cursors.size() || _refs.size() )
-            logLevel = 0;
-        log( logLevel ) << " CursorCache at shutdown - "
-                        << " sharded: " << _cursors.size()
-                        << " passthrough: " << _refs.size()
-                        << endl;
+            print = true;
+
+        if ( print )
+            cout << " CursorCache at shutdown - "
+                 << " sharded: " << _cursors.size()
+                 << " passthrough: " << _refs.size()
+                 << endl;
    }
    ShardedClientCursorPtr CursorCache::get( long long id ) const {
@@ -300,7 +302,7 @@ namespace mongo {
            help << " example: { cursorInfo : 1 }";
        }
        virtual LockType locktype() const { return NONE; }
-        bool run(const string&, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+        bool run(const string&, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
            cursorCache.appendInfo( result );
            if ( jsobj["setTimeout"].isNumber() )
                CursorCache::TIMEOUT = jsobj["setTimeout"].numberLong();
diff --git a/s/d_logic.cpp b/s/d_logic.cpp
index 5216b2e52ca..9d4fd74dd62 100644
--- a/s/d_logic.cpp
+++ b/s/d_logic.cpp
@@ -60,7 +60,7 @@ namespace mongo {
            return false;
        }
-        log(1) << "connection meta data too old - will retry ns:(" << ns << ") op:(" << opToString(op) << ") " << errmsg << endl;
+        LOG(1) << "connection meta data too old - will retry ns:(" << ns << ") op:(" << opToString(op) << ") " << errmsg << endl;
        if ( doesOpGetAResponse( op ) ) {
            assert( dbresponse );
@@ -97,8 +97,8 @@ namespace mongo {
        const OID& clientID = ShardedConnectionInfo::get(false)->getID();
        massert( 10422 , "write with bad shard config and no server id!" , clientID.isSet() );
-        log(1) << "got write with an old config - writing back ns: " << ns << endl;
-        if ( logLevel ) log(1) << m.toString() << endl;
+        LOG(1) << "got write with an old config - writing back ns: " << ns << endl;
+        if ( logLevel ) LOG(1) << m.toString() << endl;
        BSONObjBuilder b;
        b.appendBool( "writeBack" , true );
@@ -109,7 +109,7 @@ namespace mongo {
        b.appendTimestamp( "version" , shardingState.getVersion( ns ) );
        b.appendTimestamp( "yourVersion" , ShardedConnectionInfo::get( true )->getVersion( ns ) );
        b.appendBinData( "msg" , m.header()->len , bdtCustom , (char*)(m.singleData()) );
-        log(2) << "writing back msg with len: " << m.header()->len << " op: " << m.operation() << endl;
+        LOG(2) << "writing back msg with len: " << m.header()->len << " op: " << m.operation() << endl;
        writeBackManager.queueWriteBack( clientID.str() , b.obj() );
        return true;
diff --git a/s/d_migrate.cpp b/s/d_migrate.cpp
index 740a3148771..e24a02d3538 100644
--- a/s/d_migrate.cpp
+++ b/s/d_migrate.cpp
@@ -156,13 +156,28 @@ namespace mongo {
        string toString() const { return str::stream() << ns << " from " << min << " -> " << max; }
-
+
        void doRemove() {
            ShardForceVersionOkModeBlock sf;
-            writelock lk(ns);
-            RemoveSaver rs("moveChunk",ns,"post-cleanup");
-            long long num = Helpers::removeRange( ns , min , max , true , false , cmdLine.moveParanoia ? &rs : 0 );
-            log() << "moveChunk deleted: " << num << migrateLog;
+            {
+                writelock lk(ns);
+                RemoveSaver rs("moveChunk",ns,"post-cleanup");
+                long long numDeleted = Helpers::removeRange( ns , min , max , true , false , cmdLine.moveParanoia ? &rs : 0 );
+                log() << "moveChunk deleted: " << numDeleted << migrateLog;
+            }
+
+            ReplTime lastOpApplied = cc().getLastOp();
+
+            Timer t;
+            for ( int i=0; i<3600; i++ ) {
+                if ( opReplicatedEnough( lastOpApplied , ( getSlaveCount() / 2 ) + 1 ) ) {
+                    LOG(t.seconds() < 30 ? 1 : 0) << "moveChunk repl sync took " << t.seconds() << " seconds" << migrateLog;
+                    return;
+                }
+                sleepsecs(1);
+            }
+
+            warning() << "moveChunk repl sync timed out after " << t.seconds() << " seconds" << migrateLog;
        }
    };
@@ -646,7 +661,7 @@ namespace mongo {
    public:
        TransferModsCommand() : ChunkCommandHelper( "_transferMods" ) {}
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            return migrateFromStatus.transferMods( errmsg, result );
        }
    } transferModsCommand;
@@ -656,7 +671,7 @@ namespace mongo {
    public:
        InitialCloneCommand() : ChunkCommandHelper( "_migrateClone" ) {}
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            return migrateFromStatus.clone( errmsg, result );
        }
    } initialCloneCommand;
@@ -680,7 +695,7 @@ namespace mongo {
        virtual LockType locktype() const { return NONE; }
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            // 1. parse options
            // 2. make sure my view is complete and lock
            // 3. start migrate
@@ -1064,7 +1079,7 @@ namespace mongo {
            preCond.done();
            BSONObj cmd = cmdBuilder.obj();
-            log(7) << "moveChunk update: " << cmd << migrateLog;
+            LOG(7) << "moveChunk update: " << cmd << migrateLog;
            bool ok = false;
            BSONObj cmdResult;
@@ -1177,7 +1192,7 @@ namespace mongo {
    class MigrateStatus {
    public:
-
+
        MigrateStatus() : m_active("MigrateStatus") { active = false; }
        void prepare() {
@@ -1345,9 +1360,19 @@ namespace mongo {
                timing.done(4);
            }
+            {
+                // pause to wait for replication
+                // this will prevent us from going into critical section until we're ready
+                Timer t;
+                while ( t.minutes() < 600 ) {
+                    if ( flushPendingWrites( lastOpApplied ) )
+                        break;
+                    sleepsecs(1);
+                }
+            }
+
            {
                // 5. wait for commit
-                Timer timeWaitingForCommit;
                state = STEADY;
                while ( state == STEADY || state == COMMIT_START ) {
@@ -1371,17 +1396,16 @@ namespace mongo {
                    if ( state == COMMIT_START ) {
                        if ( flushPendingWrites( lastOpApplied ) )
                            break;
-
-                        if ( timeWaitingForCommit.seconds() > 86400 ) {
-                            state = FAIL;
-                            errmsg = "timed out waiting for commit";
-                            return;
-                        }
                    }
                    sleepmillis( 10 );
                }
+                if ( state == FAIL ) {
+                    errmsg = "timed out waiting for commit";
+                    return;
+                }
+
                timing.done(5);
            }
@@ -1516,12 +1540,14 @@ namespace mongo {
                return false;
            state = COMMIT_START;
-            // we wait 5 minutes for the commit to succeed before giving up
-            for ( int i=0; i<5*60*1000; i++ ) {
+            Timer t;
+            // we wait for the commit to succeed before giving up
+            while ( t.minutes() <= 5 ) {
                sleepmillis(1);
                if ( state == DONE )
                    return true;
            }
+            state = FAIL;
            log() << "startCommit never finished!" << migrateLog;
            return false;
        }
@@ -1571,7 +1597,7 @@ namespace mongo {
        virtual LockType locktype() const { return WRITE; } // this is so don't have to do locking internally
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            if ( migrateStatus.getActive() ) {
                errmsg = "migrate already in progress";
@@ -1608,7 +1634,7 @@ namespace mongo {
    public:
        RecvChunkStatusCommand() : ChunkCommandHelper( "_recvChunkStatus" ) {}
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            migrateStatus.status( result );
            return 1;
        }
@@ -1619,7 +1645,7 @@ namespace mongo {
    public:
        RecvChunkCommitCommand() : ChunkCommandHelper( "_recvChunkCommit" ) {}
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            bool ok = migrateStatus.startCommit();
            migrateStatus.status( result );
            return ok;
@@ -1631,7 +1657,7 @@ namespace mongo {
    public:
        RecvChunkAbortCommand() : ChunkCommandHelper( "_recvChunkAbort" ) {}
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            migrateStatus.abort();
            migrateStatus.status( result );
            return true;
@@ -1653,7 +1679,7 @@ namespace mongo {
            assert( ! isInRange( BSON( "x" << 5 ) , min , max ) );
            assert( ! isInRange( BSON( "x" << 6 ) , min , max ) );
-            log(1) << "isInRangeTest passed" << migrateLog;
+            LOG(1) << "isInRangeTest passed" << migrateLog;
        }
    } isInRangeTest;
}
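The d_migrate.cpp hunks above introduce two replication waits in chunk migration: the donor's doRemove() now blocks until the post-migration range deletion has been applied by a majority of slaves (bounded at roughly an hour), and the recipient pauses before entering the critical section until its pending writes have flushed. Note also the LOG(t.seconds() < 30 ? 1 : 0) idiom, which logs at debug level 1 when the sync was quick but promotes the message to the default level when it was slow. The donor-side loop, distilled with explanatory comments and using the same helpers the diff itself uses (cc().getLastOp(), opReplicatedEnough(), getSlaveCount()); this is a restatement, not new behavior:

    ReplTime lastOpApplied = cc().getLastOp();  // last op written on this connection
    Timer t;
    for ( int i = 0; i < 3600; i++ ) {          // bound the wait to ~1 hour
        // "enough" means a majority of slaves, i.e. floor(n/2) + 1, applied it
        if ( opReplicatedEnough( lastOpApplied , ( getSlaveCount() / 2 ) + 1 ) )
            break;
        sleepsecs( 1 );
    }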
diff --git a/s/d_split.cpp b/s/d_split.cpp
index 64fc4cb42e4..cef6188a2bb 100644
--- a/s/d_split.cpp
+++ b/s/d_split.cpp
@@ -57,7 +57,7 @@ namespace mongo {
                "example: { medianKey:\"blog.posts\", keyPattern:{x:1}, min:{x:10}, max:{x:55} }\n"
                "NOTE: This command may take a while to run";
        }
-        bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+        bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
            const char *ns = jsobj.getStringField( "medianKey" );
            BSONObj min = jsobj.getObjectField( "min" );
            BSONObj max = jsobj.getObjectField( "max" );
@@ -136,7 +136,7 @@ namespace mongo {
            help << "Internal command.\n";
        }
-        bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+        bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
            const char* ns = jsobj.getStringField( "checkShardingIndex" );
            BSONObj keyPattern = jsobj.getObjectField( "keyPattern" );
@@ -177,6 +177,11 @@ namespace mongo {
                return false;
            }
+            if( d->isMultikey( d->idxNo( *idx ) ) ) {
+                errmsg = "index is multikey, cannot use for sharding";
+                return false;
+            }
+
            BtreeCursor * bc = BtreeCursor::make( d , d->idxNo(*idx) , *idx , min , max , false , 1 );
            shared_ptr<Cursor> c( bc );
            auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) );
@@ -248,7 +253,7 @@ namespace mongo {
                "NOTE: This command may take a while to run";
        }
-        bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+        bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
            //
            // 1.a We'll parse the parameters in two steps. First, make sure that we can use the split index to get
@@ -524,7 +529,7 @@ namespace mongo {
        virtual bool adminOnly() const { return true; }
        virtual LockType locktype() const { return NONE; }
-        bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+        bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
            //
            // 1. check whether parameters passed to splitChunk are sound
@@ -686,7 +691,7 @@ namespace mongo {
            BSONObjBuilder logDetail;
            origChunk.appendShortVersion( "before" , logDetail );
-            log(1) << "before split on " << origChunk << endl;
+            LOG(1) << "before split on " << origChunk << endl;
            vector<ChunkInfo> newChunks;
            ShardChunkVersion myVersion = maxVersion;
diff --git a/s/d_state.cpp b/s/d_state.cpp
index 409820047b0..f43865b222e 100644
--- a/s/d_state.cpp
+++ b/s/d_state.cpp
@@ -288,7 +288,7 @@ namespace mongo {
    ShardedConnectionInfo* ShardedConnectionInfo::get( bool create ) {
        ShardedConnectionInfo* info = _tl.get();
        if ( ! info && create ) {
-            log(1) << "entering shard mode for connection" << endl;
+            LOG(1) << "entering shard mode for connection" << endl;
            info = new ShardedConnectionInfo();
            _tl.reset( info );
        }
@@ -316,7 +316,7 @@ namespace mongo {
    void ShardedConnectionInfo::addHook() {
        static bool done = false;
        if (!done) {
-            log(1) << "adding sharding hook" << endl;
+            LOG(1) << "adding sharding hook" << endl;
            pool.addHook(new ShardingConnectionHook(false));
            done = true;
        }
@@ -380,7 +380,7 @@ namespace mongo {
        virtual bool slaveOk() const { return true; }
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            ShardedConnectionInfo::reset();
            return true;
        }
@@ -452,7 +452,7 @@ namespace mongo {
            return true;
        }
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            // Steps
            // 1. check basic config
@@ -613,7 +613,7 @@ namespace mongo {
        virtual LockType locktype() const { return NONE; }
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            string ns = cmdObj["getShardVersion"].valuestrsafe();
            if ( ns.size() == 0 ) {
                errmsg = "need to specify full namespace";
@@ -642,7 +642,7 @@ namespace mongo {
        virtual LockType locktype() const { return WRITE; } // TODO: figure out how to make this not need to lock
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            shardingState.appendInfo( result );
            return true;
        }
diff --git a/s/d_writeback.cpp b/s/d_writeback.cpp
index 6839fc4b1ca..01c0c14ac0a 100644
--- a/s/d_writeback.cpp
+++ b/s/d_writeback.cpp
@@ -129,7 +129,7 @@ namespace mongo {
        void help(stringstream& h) const { h<<"internal"; }
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            BSONElement e = cmdObj.firstElement();
            if ( e.type() != jstOID ) {
@@ -144,7 +144,7 @@ namespace mongo {
            // we want to do return at least at every 5 minutes so sockets don't timeout
            BSONObj z;
            if ( writeBackManager.getWritebackQueue(id.str())->queue.blockingPop( z, 5 * 60 /* 5 minutes */ ) ) {
-                log(1) << "WriteBackCommand got : " << z << endl;
+                LOG(1) << "WriteBackCommand got : " << z << endl;
                result.append( "data" , z );
            }
            else {
@@ -168,7 +168,7 @@ namespace mongo {
                << "This is an internal command";
        }
-        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
            writeBackManager.appendStats( result );
            return true;
        }
diff --git a/s/grid.cpp b/s/grid.cpp
index 6141e061be6..3756e131a6a 100644
--- a/s/grid.cpp
+++ b/s/grid.cpp
@@ -125,6 +125,8 @@ namespace mongo {
            name = &nameInternal;
        }
+        ReplicaSetMonitorPtr rsMonitor;
+
        // Check whether the host (or set) exists and run several sanity checks on this request.
        // There are two set of sanity checks: making sure adding this particular shard is consistent
        // with the replica set state (if it exists) and making sure this shards databases can be
@@ -140,7 +142,7 @@ namespace mongo {
                errMsg = "can't use sync cluster as a shard. for replica set, have to use <setname>/<server1>,<server2>,...";
                return false;
            }
-
+
            BSONObj resIsMongos;
            bool ok = newShardConn->runCommand( "admin" , BSON( "isdbgrid" << 1 ) , resIsMongos );
@@ -264,6 +266,9 @@ namespace mongo {
                }
            }
+            if ( newShardConn->type() == ConnectionString::SET )
+                rsMonitor = ReplicaSetMonitor::get( setName );
+
            newShardConn.done();
        }
        catch ( DBException& e ) {
@@ -295,7 +300,7 @@ namespace mongo {
        // build the ConfigDB shard document
        BSONObjBuilder b;
        b.append( "_id" , *name );
-        b.append( "host" , servers.toString() );
+        b.append( "host" , rsMonitor ? rsMonitor->getServerAddress() : servers.toString() );
        if ( maxSize > 0 ) {
            b.append( ShardFields::maxSize.name() , maxSize );
        }
@@ -508,7 +513,7 @@ namespace mongo {
            assert( Grid::_inBalancingWindow( w8 , now ) );
            assert( Grid::_inBalancingWindow( w9 , now ) );
-            log(1) << "BalancingWindowObjTest passed" << endl;
+            LOG(1) << "BalancingWindowObjTest passed" << endl;
        }
    } BalancingWindowObjTest;
diff --git a/s/request.cpp b/s/request.cpp
index cda75f63a17..36488cb5617 100644
--- a/s/request.cpp
+++ b/s/request.cpp
@@ -43,7 +43,12 @@ namespace mongo {
        _clientInfo = ClientInfo::get();
        _clientInfo->newRequest( p );
+    }
+
+    void Request::checkAuth() const {
+        char cl[256];
+        nsToDatabase(getns(), cl);
+        uassert(15845, "unauthorized", _clientInfo->getAuthenticationInfo()->isAuthorized(cl));
    }
    void Request::init() {
@@ -60,17 +65,21 @@ namespace mongo {
        uassert( 13644 , "can't use 'local' database through mongos" , ! str::startsWith( getns() , "local." ) );
-        _config = grid.getDBConfig( getns() );
+        const string nsStr (getns()); // use in functions taking string rather than char*
+
+        _config = grid.getDBConfig( nsStr );
        if ( reload ) {
-            if ( _config->isSharded( getns() ) )
-                _config->getChunkManager( getns() , true );
+            if ( _config->isSharded( nsStr ) )
+                _config->getChunkManager( nsStr , true );
            else
                _config->reload();
        }
-        if ( _config->isSharded( getns() ) ) {
-            _chunkManager = _config->getChunkManager( getns() , reload );
-            uassert( 10193 , (string)"no shard info for: " + getns() , _chunkManager );
+        if ( _config->isSharded( nsStr ) ) {
+            _chunkManager = _config->getChunkManager( nsStr , reload );
+            // TODO: All of these uasserts are no longer necessary, getChunkManager() throws when
+            // not returning the right value.
+            uassert( 10193 , (string)"no shard info for: " + nsStr , _chunkManager );
        }
        else {
            _chunkManager.reset();
@@ -104,7 +113,7 @@ namespace mongo {
        }
-        log(3) << "Request::process ns: " << getns() << " msg id:" << (int)(_m.header()->id) << " attempt: " << attempt << endl;
+        LOG(3) << "Request::process ns: " << getns() << " msg id:" << (int)(_m.header()->id) << " attempt: " << attempt << endl;
        Strategy * s = SINGLE;
        _counter = &opsNonSharded;
@@ -138,10 +147,7 @@ namespace mongo {
            s->getMore( *this );
        }
        else {
-            char cl[256];
-            nsToDatabase(getns(), cl);
-            uassert(15845, "unauthorized", _clientInfo->getAuthenticationInfo()->isAuthorized(cl));
-
+            checkAuth();
            s->writeOp( op, *this );
        }
diff --git a/s/request.h b/s/request.h
index 6645ed9a092..86a484e378b 100644
--- a/s/request.h
+++ b/s/request.h
@@ -70,6 +70,8 @@ namespace mongo {
            return _clientInfo;
        }
+        void checkAuth() const;
+
        // ---- remote location info -----
diff --git a/s/s_only.cpp b/s/s_only.cpp
index 4afa9008f71..6449b34ad81 100644
--- a/s/s_only.cpp
+++ b/s/s_only.cpp
@@ -91,7 +91,7 @@ namespace mongo {
        }
        string errmsg;
-        int ok = c->run( dbname , cmdObj , errmsg , result , fromRepl );
+        int ok = c->run( dbname , cmdObj , queryOptions, errmsg , result , fromRepl );
        if ( ! ok )
            result.append( "errmsg" , errmsg );
        return ok;
diff --git a/s/security.cpp b/s/security.cpp
index e27e68f4dcf..6cb9da624be 100644
--- a/s/security.cpp
+++ b/s/security.cpp
@@ -94,7 +94,7 @@ namespace mongo {
        return false;
    }
-    bool CmdLogout::run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+    bool CmdLogout::run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
        AuthenticationInfo *ai = ClientInfo::get()->getAuthenticationInfo();
        ai->logout(dbname);
        return true;
diff --git a/s/server.cpp b/s/server.cpp
index 1ca2e4c9d7e..a6ffab96f29 100644
--- a/s/server.cpp
+++ b/s/server.cpp
@@ -26,6 +26,7 @@
 #include "../util/ramlog.h"
 #include "../util/signal_handlers.h"
 #include "../util/admin_access.h"
+#include "../util/concurrency/task.h"
 #include "../db/dbwebserver.h"
 #include "../scripting/engine.h"
@@ -94,7 +95,7 @@ namespace mongo {
                r.process();
            }
            catch ( AssertionException & e ) {
-                log( e.isUserAssertion() ? 1 : 0 ) << "AssertionException in process: " << e.what() << endl;
+                log( e.isUserAssertion() ? 1 : 0 ) << "AssertionException while processing op type : " << m.operation() << " to : " << r.getns() << causedBy(e) << endl;
                le->raiseError( e.getCode() , e.what() );
@@ -158,9 +159,6 @@ namespace mongo {
        cursorCache.startTimeoutThread();
        PeriodicTask::theRunner->go();
-        log() << "waiting for connections on port " << cmdLine.port << endl;
-        //DbGridListener l(port);
-        //l.listen();
        ShardedMessageHandler handler;
        MessageServer * server = createServer( opts , &handler );
        server->setAsTimeTracker();
@@ -321,6 +319,16 @@ int _main(int argc, char* argv[]) {
        return 8;
    }
+    {
+        class CheckConfigServers : public task::Task {
+            virtual string name() const { return "CheckConfigServers"; }
+            virtual void doWork() { configServer.ok(true); }
+        };
+        static CheckConfigServers checkConfigServers;
+
+        task::repeat(&checkConfigServers, 60*1000);
+    }
+
    int configError = configServer.checkConfigVersion( params.count( "upgrade" ) );
    if ( configError ) {
        if ( configError > 0 ) {
diff --git a/s/shard.cpp b/s/shard.cpp
index dfd707857da..75326e047fc 100644
--- a/s/shard.cpp
+++ b/s/shard.cpp
@@ -235,7 +235,7 @@ namespace mongo {
        virtual bool slaveOk() const { return true; }
        virtual bool adminOnly() const { return true; }
-        virtual bool run(const string&, mongo::BSONObj&, std::string& errmsg , mongo::BSONObjBuilder& result, bool) {
+        virtual bool run(const string&, mongo::BSONObj&, int, std::string& errmsg , mongo::BSONObjBuilder& result, bool) {
            return staticShardInfo.getShardMap( result , errmsg );
        }
    } cmdGetShardMap;
@@ -346,7 +346,7 @@ namespace mongo {
                best = t;
        }
-        log(1) << "best shard for new allocation is " << best << endl;
+        LOG(1) << "best shard for new allocation is " << best << endl;
        return best.shard();
    }
@@ -360,7 +360,7 @@ namespace mongo {
    void ShardingConnectionHook::onCreate( DBClientBase * conn ) {
        if( !noauth ) {
            string err;
-            log(2) << "calling onCreate auth for " << conn->toString() << endl;
+            LOG(2) << "calling onCreate auth for " << conn->toString() << endl;
            uassert( 15847, "can't authenticate to shard server", conn->auth("local", internalSecurity.user, internalSecurity.pwd, err, false));
        }
diff --git a/s/shard_version.cpp b/s/shard_version.cpp
index 01447749ac9..4f84b0ae61e 100644
--- a/s/shard_version.cpp
+++ b/s/shard_version.cpp
@@ -96,7 +96,7 @@ namespace mongo {
        ChunkManagerPtr manager;
        const bool isSharded = conf->isSharded( ns );
        if ( isSharded ) {
-            manager = conf->getChunkManager( ns , authoritative );
+            manager = conf->getChunkManagerIfExists( ns , authoritative );
            // It's possible the chunk manager was reset since we checked whether sharded was true,
            // so must check this here.
            if( manager ) officialSequenceNumber = manager->getSequenceNumber();
@@ -139,8 +139,14 @@ namespace mongo {
        }
        if ( result["reloadConfig"].trueValue() ) {
-            // reload config
-            conf->getChunkManager( ns , true );
+            if( result["version"].timestampTime() == 0 ){
+                // reload db
+                conf->reload();
+            }
+            else {
+                // reload config
+                conf->getChunkManager( ns , true );
+            }
        }
        const int maxNumTries = 7;
diff --git a/s/shardkey.cpp b/s/shardkey.cpp
index 9602b8566e5..d6c8eda1ae1 100644
--- a/s/shardkey.cpp
+++ b/s/shardkey.cpp
@@ -55,7 +55,8 @@ namespace mongo {
         */
        for(set<string>::const_iterator it = patternfields.begin(); it != patternfields.end(); ++it) {
-            if(obj.getFieldDotted(it->c_str()).eoo())
+            BSONElement e = obj.getFieldDotted(it->c_str());
+            if(e.eoo() || e.type() == Array)
                return false;
        }
        return true;
@@ -83,7 +84,7 @@ namespace mongo {
        vector<const char*> keysToMove;
        keysToMove.push_back("_id");
        BSONForEach(e, pattern) {
-            if (strchr(e.fieldName(), '.') == NULL)
+            if (strchr(e.fieldName(), '.') == NULL && strcmp(e.fieldName(), "_id") != 0)
                keysToMove.push_back(e.fieldName());
        }
@@ -185,8 +186,8 @@ namespace mongo {
            ShardKeyPattern k( fromjson("{a:1,'sub.b':-1,'sub.c':1}") );
            BSONObj x = fromjson("{a:1,'sub.b':2,'sub.c':3}");
-            assert( k.extractKey( fromjson("{a:1,sub:{b:2,c:3}}") ).shallowEqual(x) );
-            assert( k.extractKey( fromjson("{sub:{b:2,c:3},a:1}") ).shallowEqual(x) );
+            assert( k.extractKey( fromjson("{a:1,sub:{b:2,c:3}}") ).binaryEqual(x) );
+            assert( k.extractKey( fromjson("{sub:{b:2,c:3},a:1}") ).binaryEqual(x) );
        }
        void moveToFrontTest() {
            ShardKeyPattern sk (BSON("a" << 1 << "b" << 1));
            BSONObj ret;
            ret = sk.moveToFront(BSON("z" << 1 << "_id" << 1 << "y" << 1 << "a" << 1 << "x" << 1 << "b" << 1 << "w" << 1));
-            assert(ret.shallowEqual(BSON("_id" << 1 << "a" << 1 << "b" << 1 << "z" << 1 << "y" << 1 << "x" << 1 << "w" << 1)));
+            assert(ret.binaryEqual(BSON("_id" << 1 << "a" << 1 << "b" << 1 << "z" << 1 << "y" << 1 << "x" << 1 << "w" << 1)));
            ret = sk.moveToFront(BSON("_id" << 1 << "a" << 1 << "b" << 1 << "z" << 1 << "y" << 1 << "x" << 1 << "w" << 1));
-            assert(ret.shallowEqual(BSON("_id" << 1 << "a" << 1 << "b" << 1 << "z" << 1 << "y" << 1 << "x" << 1 << "w" << 1)));
+            assert(ret.binaryEqual(BSON("_id" << 1 << "a" << 1 << "b" << 1 << "z" << 1 << "y" << 1 << "x" << 1 << "w" << 1)));
            ret = sk.moveToFront(BSON("z" << 1 << "y" << 1 << "a" << 1 << "b" << 1 << "Z" << 1 << "Y" << 1));
-            assert(ret.shallowEqual(BSON("a" << 1 << "b" << 1 << "z" << 1 << "y" << 1 << "Z" << 1 << "Y" << 1)));
+            assert(ret.binaryEqual(BSON("a" << 1 << "b" << 1 << "z" << 1 << "y" << 1 << "Z" << 1 << "Y" << 1)));
        }
@@ -263,7 +264,7 @@ namespace mongo {
                moveToFrontBenchmark(100);
            }
-            log(1) << "shardKeyTest passed" << endl;
+            LOG(1) << "shardKeyTest passed" << endl;
        }
    } shardKeyTest;
diff --git a/s/shardkey.h b/s/shardkey.h
index 96301ffe093..976cff09591 100644
--- a/s/shardkey.h
+++ b/s/shardkey.h
@@ -102,7 +102,21 @@ namespace mongo {
    };
    inline BSONObj ShardKeyPattern::extractKey(const BSONObj& from) const {
-        BSONObj k = from.extractFields(pattern);
+        BSONObj k = from;
+        bool needExtraction = false;
+
+        BSONObjIterator a(from);
+        BSONObjIterator b(pattern);
+        while (a.more() && b.more()){
+            if (strcmp(a.next().fieldName(), b.next().fieldName()) != 0){
+                needExtraction = true;
+                break;
+            }
+        }
+
+        if (needExtraction || a.more() != b.more())
+            k = from.extractFields(pattern);
+
        uassert(13334, "Shard Key must be less than 512 bytes", k.objsize() < 512);
        return k;
    }
cursor ) { - log(6) << "\t invalid cursor :(" << endl; + LOG(6) << "\t invalid cursor :(" << endl; replyToQuery( ResultFlag_CursorNotFound , r.p() , r.m() , 0 , 0 , 0 ); return; } @@ -121,7 +123,7 @@ namespace mongo { void _insert( Request& r , DbMessage& d, ChunkManagerPtr manager ) { const int flags = d.reservedField(); - bool keepGoing = flags & InsertOption_KeepGoing; // modified before assertion if should abort + bool keepGoing = flags & InsertOption_ContinueOnError; // modified before assertion if should abort while ( d.moreJSObjs() ) { try { @@ -139,8 +141,8 @@ namespace mongo { } if ( bad ) { - log() << "tried to insert object without shard key: " << r.getns() << " " << o << endl; - uasserted( 8011 , "tried to insert object without shard key" ); + log() << "tried to insert object with no valid shard key: " << r.getns() << " " << o << endl; + uasserted( 8011 , "tried to insert object with no valid shard key" ); } } @@ -154,7 +156,7 @@ namespace mongo { for ( int i=0; i<maxTries; i++ ) { try { ChunkPtr c = manager->findChunk( o ); - log(4) << " server:" << c->getShard().toString() << " " << o << endl; + LOG(4) << " server:" << c->getShard().toString() << " " << o << endl; insert( c->getShard() , r.getns() , o , flags); r.gotInsert(); @@ -167,20 +169,20 @@ namespace mongo { int logLevel = i < ( maxTries / 2 ); LOG( logLevel ) << "retrying insert because of StaleConfigException: " << e << " object: " << o << endl; r.reset(); - - unsigned long long old = manager->getSequenceNumber(); - manager = r.getChunkManager(); - - LOG( logLevel ) << " sequence number - old: " << old << " new: " << manager->getSequenceNumber() << endl; - if (!manager) { + manager = r.getChunkManager(); + if( ! manager ) { keepGoing = false; uasserted(14804, "collection no longer sharded"); } + + unsigned long long old = manager->getSequenceNumber(); + + LOG( logLevel ) << " sequence number - old: " << old << " new: " << manager->getSequenceNumber() << endl; } sleepmillis( i * 20 ); } - + assert( inShutdown() || gotThrough ); // not caught below } catch (const UserException&){ if (!keepGoing || !d.moreJSObjs()){ @@ -208,8 +210,8 @@ namespace mongo { } if ( bad ) { - log() << "tried to insert object without shard key: " << nsChunkLookup << " " << o << endl; - uasserted( 14842 , "tried to insert object without shard key" ); + log() << "tried to insert object with no valid shard key: " << nsChunkLookup << " " << o << endl; + uasserted( 14842 , "tried to insert object with no valid shard key" ); } } @@ -222,7 +224,7 @@ namespace mongo { for ( int i=0; i<maxTries; i++ ) { try { ChunkPtr c = manager->findChunk( o ); - log(4) << " server:" << c->getShard().toString() << " " << o << endl; + LOG(4) << " server:" << c->getShard().toString() << " " << o << endl; insert( c->getShard() , ns , o , flags, safe); break; } @@ -231,7 +233,7 @@ namespace mongo { int logLevel = i < ( maxTries / 2 ); LOG( logLevel ) << "retrying insert because of StaleConfigException: " << e << " object: " << o << endl; unsigned long long old = manager->getSequenceNumber(); - manager = conf->getChunkManager(ns); + manager = conf->getChunkManagerIfExists(ns); LOG( logLevel ) << " sequence number - old: " << old << " new: " << manager->getSequenceNumber() << endl; @@ -256,7 +258,7 @@ namespace mongo { bool multi = flags & UpdateOption_Multi; if (upsert) { - uassert(8012, "can't upsert something without shard key", + uassert(8012, "can't upsert something without valid shard key", (manager->hasShardKey(toupdate) || (toupdate.firstElementFieldName()[0] == '$' && manager->hasShardKey(query))));
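The _insert path above retries a bounded number of times when mongos's routing information goes stale: it reloads the chunk manager (getChunkManagerIfExists returns null once the collection is no longer sharded) and sleeps with a linearly growing delay between attempts. Below is a minimal, self-contained shell-JavaScript sketch of that retry shape, for illustration only; sendInsert and refreshRouting are hypothetical stand-ins, not mongos APIs.

var failuresLeft = 2;                                  // simulate two stale-config responses
function sendInsert( doc ){ if ( failuresLeft-- > 0 ) throw "stale config"; }
function refreshRouting(){ return { sequenceNumber : failuresLeft + 1 }; } // stand-in for getChunkManagerIfExists

function insertWithRetry( doc ){
    var maxTries = 7;                                  // same bound as the code above
    for ( var i = 0; i < maxTries; i++ ){
        try {
            sendInsert( doc );                         // throws while the routing info is stale
            return true;
        }
        catch ( e ){
            var manager = refreshRouting();            // reload routing info before retrying
            if ( ! manager )
                return false;                          // collection no longer sharded
        }
        sleep( i * 20 );                               // linear backoff, like sleepmillis( i * 20 )
    }
    return false;                                      // gave up after maxTries attempts
}

assert( insertWithRetry( { _id : 1 } ) );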
@@ -271,7 +273,8 @@ namespace mongo { if ( multi ) { } else if ( strcmp( query.firstElementFieldName() , "_id" ) || query.nFields() != 1 ) { - throw UserException( 8013 , "can't do non-multi update with query that doesn't have the shard key" ); + log() << "Query " << query << endl; + throw UserException( 8013 , "can't do non-multi update with query that doesn't have a valid shard key" ); } else { save = true; @@ -304,7 +307,7 @@ namespace mongo { } else { uasserted(12376, - str::stream() << "shard key must be in update object for collection: " << manager->getns() ); + str::stream() << "valid shard key must be in update object for collection: " << manager->getns() ); } } @@ -349,7 +352,7 @@ namespace mongo { bool multi = flags & UpdateOption_Multi; if (upsert) { - uassert(14854, "can't upsert something without shard key", + uassert(14854, "can't upsert something without valid shard key", (manager->hasShardKey(toupdate) || (toupdate.firstElementFieldName()[0] == '$' && manager->hasShardKey(query)))); @@ -364,7 +367,7 @@ namespace mongo { if ( multi ) { } else if ( strcmp( query.firstElementFieldName() , "_id" ) || query.nFields() != 1 ) { - throw UserException( 14850 , "can't do non-multi update with query that doesn't have the shard key" ); + throw UserException( 14850 , "can't do non-multi update with query that doesn't have a valid shard key" ); } else { save = true; @@ -397,7 +400,7 @@ namespace mongo { } else { uasserted(14857, - str::stream() << "shard key must be in update object for collection: " << manager->getns() ); + str::stream() << "valid shard key must be in update object for collection: " << manager->getns() ); } } @@ -447,7 +450,7 @@ namespace mongo { while ( true ) { try { manager->getShardsForQuery( shards , pattern ); - log(2) << "delete : " << pattern << " \t " << shards.size() << " justOne: " << justOne << endl; + LOG(2) << "delete : " << pattern << " \t " << shards.size() << " justOne: " << justOne << endl; if ( shards.size() == 1 ) { doWrite( dbDelete , r , *shards.begin() ); return; @@ -479,7 +482,7 @@ namespace mongo { virtual void writeOp( int op , Request& r ) { const char *ns = r.getns(); - log(3) << "write: " << ns << endl; + LOG(3) << "write: " << ns << endl; DbMessage& d = r.d(); ChunkManagerPtr info = r.getChunkManager(); diff --git a/s/strategy_single.cpp b/s/strategy_single.cpp index b3eef9dafa4..012be5fb3dd 100644 --- a/s/strategy_single.cpp +++ b/s/strategy_single.cpp @@ -36,7 +36,7 @@ namespace mongo { virtual void queryOp( Request& r ) { QueryMessage q( r.d() ); - log(3) << "single query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn << endl; + LOG(3) << "single query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn << " options : " << q.queryOptions << endl; if ( r.isCommand() ) { @@ -55,7 +55,7 @@ namespace mongo { : str::equals("query", e.fieldName()))) cmdObj = e.embeddedObject(); } - bool ok = Command::runAgainstRegistered(q.ns, cmdObj, builder); + bool ok = Command::runAgainstRegistered(q.ns, cmdObj, builder, q.queryOptions); if ( ok ) { BSONObj x = builder.done(); replyToQuery(0, r.p(), r.m(), x); @@ -161,12 +161,12 @@ namespace mongo { if ( r.isShardingEnabled() && strstr( ns , ".system.indexes" ) == strchr( ns , '.' ) && strchr( ns , '.' 
) ) { - log(1) << " .system.indexes write for: " << ns << endl; + LOG(1) << " .system.indexes write for: " << ns << endl; handleIndexWrite( op , r ); return; } - log(3) << "single write: " << ns << endl; + LOG(3) << "single write: " << ns << endl; doWrite( op , r , r.primaryShard() ); r.gotInsert(); // Won't handle multi-insert correctly. Not worth parsing the request. } diff --git a/s/writeback_listener.cpp b/s/writeback_listener.cpp index 81f75988a6e..5f320d3921f 100644 --- a/s/writeback_listener.cpp +++ b/s/writeback_listener.cpp @@ -117,7 +117,7 @@ namespace mongo { while ( ! inShutdown() ) { if ( ! Shard::isAShardNode( _addr ) ) { - log(1) << _addr << " is not a shard node" << endl; + LOG(1) << _addr << " is not a shard node" << endl; sleepsecs( 60 ); continue; } @@ -216,7 +216,10 @@ namespace mongo { if ( gle["code"].numberInt() == 9517 ) { log() << "writeback failed because of stale config, retrying attempts: " << attempts << endl; - db->getChunkManager( ns , true ); + if( ! db->getChunkManagerIfExists( ns , true ) ){ + uassert( 15884, str::stream() << "Could not reload chunk manager after " << attempts << " attempts.", attempts <= 4 ); + sleepsecs( attempts - 1 ); + } continue; } diff --git a/scripting/bench.cpp b/scripting/bench.cpp index 1ac7f04a55e..9ada7d6495c 100644 --- a/scripting/bench.cpp +++ b/scripting/bench.cpp @@ -142,7 +142,7 @@ namespace mongo { conn->remove( ns , fixQuery( e["query"].Obj() ) ); } else if ( op == "update" ) { - conn->update( ns , fixQuery( e["query"].Obj() ) , e["update"].Obj() ); + conn->update( ns , fixQuery( e["query"].Obj() ) , e["update"].Obj() , e["upsert"].trueValue() ); } else { log() << "don't understand op: " << op << endl; diff --git a/server.h b/server.h @@ -1,25 +1,25 @@ -/** @file server.h
-
- This file contains includes commonly needed in the server files (mongod, mongos, test). It is NOT included in the C++ client.
-
- Over time we should move more here, and more out of pch.h. And get rid of pch.h at some point.
-*/
-
-// todo is there a boost thing for this already?
-
-#pragma once
-
-#include "bson/inline_decls.h"
-
-/* Note: do not clutter code with these -- ONLY use in hot spots / significant loops. */
-
-// branch prediction. indicate we expect to enter the if statement body
-#define IF MONGOIF
-
-// branch prediction. indicate we expect to not enter the if statement body
-#define _IF MONGO_IF
-
-// prefetch data from memory
-#define PREFETCH MONGOPREFETCH
-
-using namespace bson;
+/** @file server.h + + This file contains includes commonly needed in the server files (mongod, mongos, test). It is NOT included in the C++ client. + + Over time we should move more here, and more out of pch.h. And get rid of pch.h at some point. +*/ + +// todo is there a boost thing for this already? + +#pragma once + +#include "bson/inline_decls.h" + +/* Note: do not clutter code with these -- ONLY use in hot spots / significant loops. */ + +// branch prediction. indicate we expect to be true +#define likely MONGO_likely + +// branch prediction. indicate we expect to be false +#define unlikely MONGO_unlikely + +// prefetch data from memory +#define PREFETCH MONGOPREFETCH + +using namespace bson; diff --git a/shell/collection.js b/shell/collection.js index cf8f5ce19c1..862a0a11440 100644 --- a/shell/collection.js +++ b/shell/collection.js @@ -120,7 +120,7 @@ DBCollection.prototype._validateObject = function( o ){ throw "can't save a DBQuery object"; } -DBCollection._allowedFields = { $id : 1 , $ref : 1 , $db : 1 }; +DBCollection._allowedFields = { $id : 1 , $ref : 1 , $db : 1 , $MinKey : 1, $MaxKey : 1 }; DBCollection.prototype._validateForStorage = function( o ){ this._validateObject( o ); diff --git a/shell/dbshell.cpp b/shell/dbshell.cpp index 8db622732dc..f3122c797d5 100644 --- a/shell/dbshell.cpp +++ b/shell/dbshell.cpp @@ -403,6 +403,8 @@ string finishCode( string code ) { return ""; if ( ! line ) return ""; + if ( code.find("\n\n") != string::npos ) // cancel multiline if two blank lines are entered + return ";"; while (startsWith(line, "... ")) line += 4; @@ -504,6 +506,9 @@ int _main(int argc, char* argv[]) { ("version", "show version information") ("verbose", "increase verbosity") ("ipv6", "enable IPv6 support (disabled by default)") +#ifdef MONGO_SSL + ("ssl", "use ssl for all connections") +#endif ; hidden_options.add_options() @@ -572,6 +577,11 @@ int _main(int argc, char* argv[]) { if (params.count("quiet")) { mongo::cmdLine.quiet = true; } +#ifdef MONGO_SSL + if (params.count("ssl")) { + mongo::cmdLine.sslOnNormalPorts = true; + } +#endif if (params.count("nokillop")) { mongo::shellUtils::_nokillop = true; } @@ -579,6 +589,8 @@ int _main(int argc, char* argv[]) { autoKillOp = true; } + + /* This is a bit confusing, here are the rules: * * if nodb is set then all positional parameters are files diff --git a/shell/mongo.js b/shell/mongo.js index e129784bf66..25357691c51 100644 --- a/shell/mongo.js +++ b/shell/mongo.js @@ -24,8 +24,9 @@ if ( typeof mongoInject == "function" ){ mongoInject( Mongo.prototype ); } -Mongo.prototype.setSlaveOk = function() { - this.slaveOk = true; +Mongo.prototype.setSlaveOk = function( value ) { + if( value == undefined ) value = true + this.slaveOk = value } Mongo.prototype.getDB = function( name ){ @@ -43,6 +44,10 @@ Mongo.prototype.adminCommand = function( cmd ){ return this.getDB( "admin" ).runCommand( cmd ); } +Mongo.prototype.setLogLevel = function( logLevel ){ + return this.adminCommand({ setParameter : 1, logLevel : logLevel }) +} + Mongo.prototype.getDBNames = function(){ return this.getDBs().databases.map( function(z){ diff --git a/shell/mongo_vstudio.cpp b/shell/mongo_vstudio.cpp index ea0b2cd4b20..2fbb6d908b5 100644 --- a/shell/mongo_vstudio.cpp +++ b/shell/mongo_vstudio.cpp @@ -89,6 +89,26 @@ const StringData _jscode_raw_utils = "doassert( \"[\" + a + \"] != [\" + b + \"] are equal : \" + msg );\n" "}\n" "\n" +"assert.contains = function( o, arr, msg ){\n" +"var wasIn = false\n" +"\n" +"if( ! 
arr.length ){\n" +"for( i in arr ){\n" +"wasIn = arr[i] == o || ( ( arr[i] != null && o != null ) && friendlyEqual( arr[i] , o ) )\n" +"\n" +"if( wasIn ) break\n" +"}\n" +"}\n" +"else {\n" +"for( var i = 0; i < arr.length; i++ ){\n" +"wasIn = arr[i] == o || ( ( arr[i] != null && o != null ) && friendlyEqual( arr[i] , o ) )\n" +"if( wasIn ) break\n" +"}\n" +"}\n" +"\n" +"if( ! wasIn ) doassert( tojson( o ) + \" was not in \" + tojson( arr ) + \" : \" + msg )\n" +"}\n" +"\n" "assert.repeat = function( f, msg, timeout, interval ) {\n" "if ( assert._debug && msg ) print( \"in assert for: \" + msg );\n" "\n" @@ -216,6 +236,18 @@ const StringData _jscode_raw_utils = "doassert( a + \" is not greater than or eq \" + b + \" : \" + msg );\n" "}\n" "\n" +"assert.between = function( a, b, c, msg, inclusive ){\n" +"if ( assert._debug && msg ) print( \"in assert for: \" + msg );\n" +"\n" +"if( ( inclusive == undefined || inclusive == true ) &&\n" +"a <= b && b <= c ) return;\n" +"else if( a < b && b < c ) return;\n" +"\n" +"doassert( b + \" is not between \" + a + \" and \" + c + \" : \" + msg );\n" +"}\n" +"\n" +"assert.betweenIn = function( a, b, c, msg ){ assert.between( a, b, c, msg, true ) }\n" +"assert.betweenEx = function( a, b, c, msg ){ assert.between( a, b, c, msg, false ) }\n" "\n" "assert.close = function( a , b , msg , places ){\n" "if (places === undefined) {\n" @@ -243,6 +275,11 @@ const StringData _jscode_raw_utils = "return dst;\n" "}\n" "\n" +"Object.merge = function( dst, src, deep ){\n" +"var clone = Object.extend( {}, dst, deep )\n" +"return Object.extend( clone, src, deep )\n" +"}\n" +"\n" "argumentsToArray = function( a ){\n" "var arr = [];\n" "for ( var i=0; i<a.length; i++ )\n" @@ -943,6 +980,35 @@ const StringData _jscode_raw_utils = "print( tojsononeline( x ) );\n" "}\n" "\n" +"if ( typeof TestData == \"undefined\" ){\n" +"TestData = undefined\n" +"}\n" +"\n" +"jsTestName = function(){\n" +"if( TestData ) return TestData.testName\n" +"return \"__unknown_name__\"\n" +"}\n" +"\n" +"jsTestFile = function(){\n" +"if( TestData ) return TestData.testFile\n" +"return \"__unknown_file__\"\n" +"}\n" +"\n" +"jsTestPath = function(){\n" +"if( TestData ) return TestData.testPath\n" +"return \"__unknown_path__\"\n" +"}\n" +"\n" +"jsTestOptions = function(){\n" +"if( TestData ) return { noJournal : TestData.noJournal,\n" +"noJournalPrealloc : TestData.noJournalPrealloc }\n" +"return {}\n" +"}\n" +"\n" +"testLog = function(x){\n" +"print( jsTestFile() + \" - \" + x )\n" +"}\n" +"\n" "shellPrintHelper = function (x) {\n" "\n" "if (typeof (x) == \"undefined\") {\n" @@ -1481,6 +1547,41 @@ const StringData _jscode_raw_utils = "return \"error: couldn't find \"+hn+\" in \"+tojson(c.members);\n" "};\n" "\n" +"rs.debug = {};\n" +"\n" +"rs.debug.nullLastOpWritten = function(primary, secondary) {\n" +"var p = connect(primary+\"/local\");\n" +"var s = connect(secondary+\"/local\");\n" +"s.getMongo().setSlaveOk();\n" +"\n" +"var secondToLast = s.oplog.rs.find().sort({$natural : -1}).limit(1).next();\n" +"var last = p.runCommand({findAndModify : \"oplog.rs\",\n" +"query : {ts : {$gt : secondToLast.ts}},\n" +"sort : {$natural : 1},\n" +"update : {$set : {op : \"n\"}}});\n" +"\n" +"if (!last.value.o || !last.value.o._id) {\n" +"print(\"couldn't find an _id?\");\n" +"}\n" +"else {\n" +"last.value.o = {_id : last.value.o._id};\n" +"}\n" +"\n" +"print(\"nulling out this op:\");\n" +"printjson(last);\n" +"};\n" +"\n" +"rs.debug.getLastOpWritten = function(server) {\n" +"var s = 
db.getSisterDB(\"local\");\n" +"if (server) {\n" +"s = connect(server+\"/local\");\n" +"}\n" +"s.getMongo().setSlaveOk();\n" +"\n" +"return s.oplog.rs.find().sort({$natural : -1}).limit(1).next();\n" +"};\n" +"\n" +"\n" "help = shellHelper.help = function (x) {\n" "if (x == \"mr\") {\n" "print(\"\\nSee also http://www.mongodb.org/display/DOCS/MapReduce\");\n" @@ -1634,7 +1735,8 @@ const StringData _jscode_raw_utils_sh = "print( \"\\tsh.moveChunk(fullName,find,to) move the chunk where 'find' is to 'to' (name of shard)\");\n" "\n" "print( \"\\tsh.setBalancerState( <bool on or not> ) turns the balancer on or off true=on, false=off\" );\n" -"print( \"\\tsh.getBalancerState() return true if on, off if not\" );\n" +"print( \"\\tsh.getBalancerState() return true if on, off if not\" );\n" +"print( \"\\tsh.isBalancerRunning() return true if the balancer is running on any mongos\" );\n" "\n" "print( \"\\tsh.status() prints a general overview of the cluster\" )\n" "}\n" @@ -1691,6 +1793,11 @@ const StringData _jscode_raw_utils_sh = "return true;\n" "return ! x.stopped;\n" "}\n" +"\n" +"sh.isBalancerRunning = function() {\n" +"var x = db.getSisterDB( \"config\" ).locks.findOne( { _id : \"balancer\" } );\n" +"return x.state > 0;\n" +"}\n" ; extern const JSFile utils_sh; const JSFile utils_sh = { "shell/utils_sh.js" , _jscode_raw_utils_sh }; @@ -2552,8 +2659,9 @@ const StringData _jscode_raw_mongo = "mongoInject( Mongo.prototype );\n" "}\n" "\n" -"Mongo.prototype.setSlaveOk = function() {\n" -"this.slaveOk = true;\n" +"Mongo.prototype.setSlaveOk = function( value ) {\n" +"if( value == undefined ) value = true\n" +"this.slaveOk = value\n" "}\n" "\n" "Mongo.prototype.getDB = function( name ){\n" @@ -2571,6 +2679,10 @@ const StringData _jscode_raw_mongo = "return this.getDB( \"admin\" ).runCommand( cmd );\n" "}\n" "\n" +"Mongo.prototype.setLogLevel = function( logLevel ){\n" +"return this.adminCommand({ setParameter : 1, logLevel : logLevel })\n" +"}\n" +"\n" "Mongo.prototype.getDBNames = function(){\n" "return this.getDBs().databases.map(\n" "function(z){\n" @@ -3162,7 +3274,7 @@ const StringData _jscode_raw_collection = "throw \"can't save a DBQuery object\";\n" "}\n" "\n" -"DBCollection._allowedFields = { $id : 1 , $ref : 1 , $db : 1 };\n" +"DBCollection._allowedFields = { $id : 1 , $ref : 1 , $db : 1 , $MinKey : 1, $MaxKey : 1 };\n" "\n" "DBCollection.prototype._validateForStorage = function( o ){\n" "this._validateObject( o );\n" diff --git a/shell/servers.js b/shell/servers.js index a4e568a0f0f..e551559a79c 100755 --- a/shell/servers.js +++ b/shell/servers.js @@ -21,6 +21,25 @@ _parsePort = function() { return port; } +connectionURLTheSame = function( a , b ){ + if ( a == b ) + return true; + + if ( ! a || ! b ) + return false; + + a = a.split( "/" )[0] + b = b.split( "/" )[0] + + return a == b; +} + +assert( connectionURLTheSame( "foo" , "foo" ) ) +assert( ! connectionURLTheSame( "foo" , "bar" ) ) + +assert( connectionURLTheSame( "foo/a,b" , "foo/b,a" ) ) +assert( ! 
connectionURLTheSame( "foo/a,b" , "bar/a,b" ) ) + createMongoArgs = function( binaryName , args ){ var fullArgs = [ binaryName ]; @@ -79,6 +98,9 @@ startMongodTest = function (port, dirname, restart, extraOptions ) { oplogSize: "40", nohttpinterface: "" }; + + if( jsTestOptions().noJournal ) options["nojournal"] = "" + if( jsTestOptions().noJournalPrealloc ) options["nopreallocj"] = "" if ( extraOptions ) Object.extend( options , extraOptions ); @@ -158,6 +180,17 @@ myPort = function() { * * useHostname to use the hostname (instead of localhost) */ ShardingTest = function( testName , numShards , verboseLevel , numMongos , otherParams ){ + + // Check if testName is an object, if so, pull params from there + if( testName && ! testName.charAt ){ + var params = testName + testName = params.name || "test" + numShards = params.shards || 2 + verboseLevel = params.verbose || 0 + numMongos = params.mongos || 1 + otherParams = params.other || {} + } + this._testName = testName; if ( ! otherParams ) @@ -170,8 +203,7 @@ ShardingTest = function( testName , numShards , verboseLevel , numMongos , other var localhost = otherParams.useHostname ? getHostName() : "localhost"; this._alldbpaths = [] - - + if ( otherParams.rs ){ localhost = getHostName(); // start replica sets @@ -179,15 +211,18 @@ ShardingTest = function( testName , numShards , verboseLevel , numMongos , other for ( var i=0; i<numShards; i++){ var setName = testName + "-rs" + i; - var rsDefaults = { oplogSize : 40 } + var rsDefaults = { oplogSize : 40, nodes : 3 } var rsParams = otherParams["rs" + i] for( var param in rsParams ){ rsDefaults[param] = rsParams[param] } + + var numReplicas = rsDefaults.nodes || otherParams.numReplicas || 3 + delete rsDefaults.nodes - var rs = new ReplSetTest( { name : setName , nodes : 3 , startPort : 31100 + ( i * 100 ) } ); - this._rs[i] = { setName : setName , test : rs , nodes : rs.startSet( rsParams ) , url : rs.getURL() }; + var rs = new ReplSetTest( { name : setName , nodes : numReplicas , startPort : 31100 + ( i * 100 ) } ); + this._rs[i] = { setName : setName , test : rs , nodes : rs.startSet( rsDefaults ) , url : rs.getURL() }; rs.initiate(); } @@ -322,10 +357,9 @@ ShardingTest.prototype.getServer = function( dbname ){ for ( var i=0; i<this._connections.length; i++ ){ var c = this._connections[i]; - if ( name == c.name ) + if ( connectionURLTheSame( name , c.name ) || + connectionURLTheSame( rsName , c.name ) ) return c; - if ( rsName && c.name.startsWith( rsName ) ) - return c; } throw "can't find server for: " + dbname + " name:" + name; @@ -683,7 +717,7 @@ ShardingTest.prototype.getShards = function( coll, query ){ for( var i = 0; i < shards.length; i++ ){ for( var j = 0; j < this._connections.length; j++ ){ - if( this._connections[j].name == shards[i] ){ + if ( connectionURLTheSame( this._connections[j].name , shards[i] ) ){ shards[i] = this._connections[j] break; } @@ -730,22 +764,36 @@ ShardingTest.prototype.shardGo = function( collName , key , split , move , dbNam if( collName.getDB ) c = "" + collName + var isEmpty = this.s.getCollection( c ).count() == 0 + if( ! this.isSharded( dbName ) ) this.s.adminCommand( { enableSharding : dbName } ) - this.s.adminCommand( { shardcollection : c , key : key } ); - this.s.adminCommand( { split : c , middle : split } ); + var result = this.s.adminCommand( { shardcollection : c , key : key } ) + if( ! result.ok ){ + printjson( result ) + assert( false ) + } + result = this.s.adminCommand( { split : c , middle : split } ); + if( ! 
result.ok ){ + printjson( result ) + assert( false ) + } + var result = null for( var i = 0; i < 5; i++ ){ result = this.s.adminCommand( { movechunk : c , find : move , to : this.getOther( this.getServer( dbName ) ).name } ); if( result.ok ) break; sleep( 5 * 1000 ); } + printjson( result ) assert( result.ok ) }; +ShardingTest.prototype.shardColl = ShardingTest.prototype.shardGo + ShardingTest.prototype.setBalancer = function( balancer ){ if( balancer || balancer == undefined ){ this.config.settings.update( { _id: "balancer" }, { $set : { stopped: false } } , true ) @@ -902,6 +950,8 @@ ReplTest.prototype.getOptions = function( master , extra , putBinaryFirst, norep a.push( "--dbpath" ); a.push( this.getPath( master ) ); + if( jsTestOptions().noJournal ) a.push( "--nojournal" ) + if( jsTestOptions().noJournalPrealloc ) a.push( "--nopreallocj" ) if ( !norepl ) { if ( master ){ @@ -1210,6 +1260,9 @@ ReplSetTest.prototype.getOptions = function( n , extra , putBinaryFirst ){ a.push( "--dbpath" ); a.push( this.getPath( ( n.host ? this.getNodeId( n ) : n ) ) ); + if( jsTestOptions().noJournal ) a.push( "--nojournal" ) + if( jsTestOptions().noJournalPrealloc ) a.push( "--nopreallocj" ) + for ( var k in extra ){ var v = extra[k]; a.push( "--" + k ); @@ -1271,6 +1324,50 @@ ReplSetTest.prototype.callIsMaster = function() { return master || false; } +ReplSetTest.awaitRSClientHosts = function( conn, host, hostOk, rs ) { + + if( host.length ){ + for( var i = 0; i < host.length; i++ ) ReplSetTest.awaitRSClientHosts( conn, host[i], hostOk, rs ) + return + } + + if( hostOk == undefined ) hostOk = { ok : true } + if( host.host ) host = host.host + if( rs && rs.getMaster ) rs = rs.name + + print( "Awaiting " + host + " to be " + tojson( hostOk ) + " for " + conn + " (rs: " + rs + ")" ) + + var tests = 0 + assert.soon( function() { + var rsClientHosts = conn.getDB( "admin" ).runCommand( "connPoolStats" )[ "replicaSets" ] + if( tests++ % 10 == 0 ) + printjson( rsClientHosts ) + + for ( rsName in rsClientHosts ){ + if( rs && rs != rsName ) continue + for ( var i = 0; i < rsClientHosts[rsName].hosts.length; i++ ){ + var clientHost = rsClientHosts[rsName].hosts[ i ]; + if( clientHost.addr != host ) continue + + // Check that *all* host properties are set correctly + var propOk = true + for( var prop in hostOk ){ + if( clientHost[prop] != hostOk[prop] ){ + propOk = false + break + } + } + + if( propOk ) return true; + + } + } + return false; + }, "timed out waiting for replica set client to recognize hosts", + 3 * 20 * 1000 /* ReplicaSetMonitorWatcher updates every 20s */ ) + +} + ReplSetTest.prototype.awaitSecondaryNodes = function( timeout ) { var master = this.getMaster(); var slaves = this.liveNodes.slaves; @@ -1296,6 +1393,7 @@ ReplSetTest.prototype.getMaster = function( timeout ) { return master; } +ReplSetTest.prototype.getPrimary = ReplSetTest.prototype.getMaster ReplSetTest.prototype.getSecondaries = function( timeout ){ var master = this.getMaster( timeout ) @@ -1308,6 +1406,16 @@ ReplSetTest.prototype.getSecondaries = function( timeout ){ return secs } +ReplSetTest.prototype.getSecondary = function( timeout ){ + return this.getSecondaries( timeout )[0]; +} + +ReplSetTest.prototype.status = function( timeout ){ + var master = this.callIsMaster() + if( ! 
master ) master = this.liveNodes.slaves[0] + return master.getDB("admin").runCommand({replSetGetStatus: 1}) +} + // Add a node to the test set ReplSetTest.prototype.add = function( config ) { if(this.ports.length == 0) { @@ -1379,62 +1487,76 @@ ReplSetTest.prototype.reInitiate = function() { this.initiate( config , 'replSetReconfig' ); } +ReplSetTest.prototype.getLastOpTimeWritten = function() { + this.getMaster(); + this.attempt({context : this, desc : "awaiting oplog query"}, + function() { + try { + this.latest = this.liveNodes.master.getDB("local")['oplog.rs'].find({}).sort({'$natural': -1}).limit(1).next()['ts']; + } + catch(e) { + print("ReplSetTest caught exception " + e); + return false; + } + return true; + }); +}; + ReplSetTest.prototype.awaitReplication = function(timeout) { - this.getMaster(); - timeout = timeout || 30000; + timeout = timeout || 30000; - this.attempt({context : this, desc : "awaiting oplog query"}, - function() { - try { - latest = this.liveNodes.master.getDB("local")['oplog.rs'].find({}).sort({'$natural': -1}).limit(1).next()['ts']; - } - catch(e) { - print("ReplSetTest caught exception " + e); - return false; - } - return true; - }); - - print("ReplSetTest " + latest); - - this.attempt({context: this, timeout: timeout, desc: "awaiting replication"}, - function() { - var synced = true; - for(var i=0; i<this.liveNodes.slaves.length; i++) { - var slave = this.liveNodes.slaves[i]; - - // Continue if we're connected to an arbiter - if(res = slave.getDB("admin").runCommand({replSetGetStatus: 1})) { - if(res.myState == 7) { - continue; - } - } - - slave.getDB("admin").getMongo().setSlaveOk(); - var log = slave.getDB("local")['oplog.rs']; - if(log.find({}).sort({'$natural': -1}).limit(1).hasNext()) { - var entry = log.find({}).sort({'$natural': -1}).limit(1).next(); - printjson( entry ); - var ts = entry['ts']; - print("ReplSetTest await TS for " + slave + " is " + ts.t+":"+ts.i + " and latest is " + latest.t+":"+latest.i); - - if (latest.t < ts.t || (latest.t == ts.t && latest.i < ts.i)) { - latest = this.liveNodes.master.getDB("local")['oplog.rs'].find({}).sort({'$natural': -1}).limit(1).next()['ts']; - } - - print("ReplSetTest await oplog size for " + slave + " is " + log.count()); - synced = (synced && friendlyEqual(latest,ts)) - } - else { - synced = false; - } - } - - if(synced) { - print("ReplSetTest await synced=" + synced); - } - return synced; - }); + this.getLastOpTimeWritten(); + + print("ReplSetTest " + this.latest); + + this.attempt({context: this, timeout: timeout, desc: "awaiting replication"}, + function() { + try { + var synced = true; + for(var i=0; i<this.liveNodes.slaves.length; i++) { + var slave = this.liveNodes.slaves[i]; + + // Continue if we're connected to an arbiter + if(res = slave.getDB("admin").runCommand({replSetGetStatus: 1})) { + if(res.myState == 7) { + continue; + } + } + + slave.getDB("admin").getMongo().setSlaveOk(); + var log = slave.getDB("local")['oplog.rs']; + if(log.find({}).sort({'$natural': -1}).limit(1).hasNext()) { + var entry = log.find({}).sort({'$natural': -1}).limit(1).next(); + printjson( entry ); + var ts = entry['ts']; + print("ReplSetTest await TS for " + slave + " is " + ts.t+":"+ts.i + " and latest is " + this.latest.t+":"+this.latest.i); + + if (this.latest.t < ts.t || (this.latest.t == ts.t && this.latest.i < ts.i)) { + this.latest = this.liveNodes.master.getDB("local")['oplog.rs'].find({}).sort({'$natural': -1}).limit(1).next()['ts']; + } + + print("ReplSetTest await oplog size for " + slave + " is " + 
log.count()); + synced = (synced && friendlyEqual(this.latest,ts)) + } + else { + synced = false; + } + } + + if(synced) { + print("ReplSetTest await synced=" + synced); + } + return synced; + } + catch (e) { + print("ReplSetTest.awaitReplication: caught exception "+e); + + // we might have a new master now + this.getLastOpTimeWritten(); + + return false; + } + }); } ReplSetTest.prototype.getHashes = function( db ){ @@ -1704,10 +1826,10 @@ ReplSetTest.prototype.waitForIndicator = function( node, states, ind, timeout ){ var lastTime = null var currTime = new Date().getTime() var status = undefined - + this.attempt({context: this, timeout: timeout, desc: "waiting for state indicator " + ind + " for " + timeout + "ms" }, function() { - status = this.getMaster().getDB("admin").runCommand({ replSetGetStatus : 1 }) + status = this.status() if( lastTime == null || ( currTime = new Date().getTime() ) - (1000 * 5) > lastTime ){ if( lastTime == null ) print( "ReplSetTest waitForIndicator Initial status ( timeout : " + timeout + " ) :" ) diff --git a/shell/utils.js b/shell/utils.js index 9a239dbc4a9..a903691fbd4 100644 --- a/shell/utils.js +++ b/shell/utils.js @@ -84,6 +84,26 @@ assert.neq = function( a , b , msg ){ doassert( "[" + a + "] != [" + b + "] are equal : " + msg ); } +assert.contains = function( o, arr, msg ){ + var wasIn = false + + if( ! arr.length ){ + for( i in arr ){ + wasIn = arr[i] == o || ( ( arr[i] != null && o != null ) && friendlyEqual( arr[i] , o ) ) + + if( wasIn ) break + } + } + else { + for( var i = 0; i < arr.length; i++ ){ + wasIn = arr[i] == o || ( ( arr[i] != null && o != null ) && friendlyEqual( arr[i] , o ) ) + if( wasIn ) break + } + } + + if( ! wasIn ) doassert( tojson( o ) + " was not in " + tojson( arr ) + " : " + msg ) +} + assert.repeat = function( f, msg, timeout, interval ) { if ( assert._debug && msg ) print( "in assert for: " + msg ); @@ -211,6 +231,18 @@ assert.gte = function( a , b , msg ){ doassert( a + " is not greater than or eq " + b + " : " + msg ); } +assert.between = function( a, b, c, msg, inclusive ){ + if ( assert._debug && msg ) print( "in assert for: " + msg ); + + if( ( inclusive == undefined || inclusive == true ) && + a <= b && b <= c ) return; + else if( a < b && b < c ) return; + + doassert( b + " is not between " + a + " and " + c + " : " + msg ); +} + +assert.betweenIn = function( a, b, c, msg ){ assert.between( a, b, c, msg, true ) } +assert.betweenEx = function( a, b, c, msg ){ assert.between( a, b, c, msg, false ) } assert.close = function( a , b , msg , places ){ if (places === undefined) { @@ -238,6 +270,11 @@ Object.extend = function( dst , src , deep ){ return dst; } +Object.merge = function( dst, src, deep ){ + var clone = Object.extend( {}, dst, deep ) + return Object.extend( clone, src, deep ) +} + argumentsToArray = function( a ){ var arr = []; for ( var i=0; i<a.length; i++ ) @@ -938,6 +975,35 @@ printjsononeline = function(x){ print( tojsononeline( x ) ); } +if ( typeof TestData == "undefined" ){ + TestData = undefined +} + +jsTestName = function(){ + if( TestData ) return TestData.testName + return "__unknown_name__" +} + +jsTestFile = function(){ + if( TestData ) return TestData.testFile + return "__unknown_file__" +} + +jsTestPath = function(){ + if( TestData ) return TestData.testPath + return "__unknown_path__" +} + +jsTestOptions = function(){ + if( TestData ) return { noJournal : TestData.noJournal, + noJournalPrealloc : TestData.noJournalPrealloc } + return {} +} + +testLog = function(x){ + print( 
jsTestFile() + " - " + x ) +} + shellPrintHelper = function (x) { if (typeof (x) == "undefined") { @@ -1476,6 +1542,41 @@ rs.remove = function (hn) { return "error: couldn't find "+hn+" in "+tojson(c.members); }; +rs.debug = {}; + +rs.debug.nullLastOpWritten = function(primary, secondary) { + var p = connect(primary+"/local"); + var s = connect(secondary+"/local"); + s.getMongo().setSlaveOk(); + + var secondToLast = s.oplog.rs.find().sort({$natural : -1}).limit(1).next(); + var last = p.runCommand({findAndModify : "oplog.rs", + query : {ts : {$gt : secondToLast.ts}}, + sort : {$natural : 1}, + update : {$set : {op : "n"}}}); + + if (!last.value.o || !last.value.o._id) { + print("couldn't find an _id?"); + } + else { + last.value.o = {_id : last.value.o._id}; + } + + print("nulling out this op:"); + printjson(last); +}; + +rs.debug.getLastOpWritten = function(server) { + var s = db.getSisterDB("local"); + if (server) { + s = connect(server+"/local"); + } + s.getMongo().setSlaveOk(); + + return s.oplog.rs.find().sort({$natural : -1}).limit(1).next(); +}; + + help = shellHelper.help = function (x) { if (x == "mr") { print("\nSee also http://www.mongodb.org/display/DOCS/MapReduce"); diff --git a/shell/utils_sh.js b/shell/utils_sh.js index 2f4a5a3f85b..5bd449bc61d 100644 --- a/shell/utils_sh.js +++ b/shell/utils_sh.js @@ -33,7 +33,8 @@ sh.help = function() { print( "\tsh.moveChunk(fullName,find,to) move the chunk where 'find' is to 'to' (name of shard)"); print( "\tsh.setBalancerState( <bool on or not> ) turns the balancer on or off true=on, false=off" ); - print( "\tsh.getBalancerState() return true if on, off if not" ); + print( "\tsh.getBalancerState() return true if on, off if not" ); + print( "\tsh.isBalancerRunning() return true if the balancer is running on any mongos" ); print( "\tsh.status() prints a general overview of the cluster" ) } @@ -90,3 +91,8 @@ sh.getBalancerState = function() { return true; return ! x.stopped; } + +sh.isBalancerRunning = function() { + var x = db.getSisterDB( "config" ).locks.findOne( { _id : "balancer" } ); + return x.state > 0; +} diff --git a/speed.js b/speed.js new file mode 100755 index 00000000000..c5aa3a36964 --- /dev/null +++ b/speed.js @@ -0,0 +1,13 @@ +t = db.fooo;
+t.drop();
+x = { str:'aaaabbbbcc' }
+s = new Date();
+for( var i = 0; i < 100000; i++ ) {
+ x.i = i;
+ t.insert(x);
+}
+print( (new Date())-s );
+t.ensureIndex({i:1});
+t.ensureIndex({str:1});
+print( (new Date())-s );
+
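speed.js above times fire-and-forget inserts, so the first number it prints mostly measures client-side work; the server may still be applying writes when the time is taken. The following variant is a sketch, not part of this commit: it uses the shell's standard getLastError helper to wait for server acknowledgement before timing.

t = db.speed_ack;
t.drop();
s = new Date();
for ( var i = 0; i < 100000; i++ ) {
    t.insert( { str : 'aaaabbbbcc' , i : i } );
}
db.getLastError();             // block until the server has applied all of the inserts above
print( (new Date()) - s );     // acknowledged insert time, comparable to speed.js's first print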
diff --git a/third_party/linenoise/linenoise.cpp b/third_party/linenoise/linenoise.cpp index 81f76194512..dca8dbb5a4f 100644 --- a/third_party/linenoise/linenoise.cpp +++ b/third_party/linenoise/linenoise.cpp @@ -549,7 +549,10 @@ static int linenoisePrompt(int fd, char *buf, size_t buflen, const char *prompt) /* Only autocomplete when the callback is set. It returns < 0 when * there was an error reading from fd. Otherwise it will return the * character that should be handled next. */ - if (c == 9 && completionCallback != NULL) { + if (c == 9 && completionCallback != NULL) { /* tab */ + /* ignore tabs used for indentation */ + if (pos == 0) continue; + c = completeLine(fd,prompt,buf,buflen,&len,&pos,cols); /* Return on errors */ if (c < 0) return len; diff --git a/third_party/snappy/COPYING b/third_party/snappy/COPYING new file mode 100755 index 00000000000..8d6bd9fed4e --- /dev/null +++ b/third_party/snappy/COPYING @@ -0,0 +1,28 @@ +Copyright 2011, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/third_party/snappy/README b/third_party/snappy/README new file mode 100755 index 00000000000..df8f0e178e2 --- /dev/null +++ b/third_party/snappy/README @@ -0,0 +1,135 @@ +Snappy, a fast compressor/decompressor. + + +Introduction +============ + +Snappy is a compression/decompression library. It does not aim for maximum +compression, or compatibility with any other compression library; instead, +it aims for very high speeds and reasonable compression. For instance, +compared to the fastest mode of zlib, Snappy is an order of magnitude faster +for most inputs, but the resulting compressed files are anywhere from 20% to +100% bigger. (For more information, see "Performance", below.) + +Snappy has the following properties: + + * Fast: Compression speeds at 250 MB/sec and beyond, with no assembler code. + See "Performance" below. + * Stable: Over the last few years, Snappy has compressed and decompressed + petabytes of data in Google's production environment. The Snappy bitstream + format is stable and will not change between versions. 
+ * Robust: The Snappy decompressor is designed not to crash in the face of + corrupted or malicious input. + * Free and open source software: Snappy is licensed under a BSD-type license. + For more information, see the included COPYING file. + +Snappy has previously been called "Zippy" in some Google presentations +and the like. + + +Performance +=========== + +Snappy is intended to be fast. On a single core of a Core i7 processor +in 64-bit mode, it compresses at about 250 MB/sec or more and decompresses at +about 500 MB/sec or more. (These numbers are for the slowest inputs in our +benchmark suite; others are much faster.) In our tests, Snappy usually +is faster than algorithms in the same class (e.g. LZO, LZF, FastLZ, QuickLZ, +etc.) while achieving comparable compression ratios. + +Typical compression ratios (based on the benchmark suite) are about 1.5-1.7x +for plain text, about 2-4x for HTML, and of course 1.0x for JPEGs, PNGs and +other already-compressed data. Similar numbers for zlib in its fastest mode +are 2.6-2.8x, 3-7x and 1.0x, respectively. More sophisticated algorithms are +capable of achieving yet higher compression rates, although usually at the +expense of speed. Of course, compression ratio will vary significantly with +the input. + +Although Snappy should be fairly portable, it is primarily optimized +for 64-bit x86-compatible processors, and may run slower in other environments. +In particular: + + - Snappy uses 64-bit operations in several places to process more data at + once than would otherwise be possible. + - Snappy assumes unaligned 32- and 64-bit loads and stores are cheap. + On some platforms, these must be emulated with single-byte loads + and stores, which is much slower. + - Snappy assumes little-endian throughout, and needs to byte-swap data in + several places if running on a big-endian platform. + +Experience has shown that even heavily tuned code can be improved. +Performance optimizations, whether for 64-bit x86 or other platforms, +are of course most welcome; see "Contact", below. + + +Usage +===== + +Note that Snappy, both the implementation and the main interface, +is written in C++. However, several third-party bindings to other languages +are available; see the Google Code page at http://code.google.com/p/snappy/ +for more information. Also, if you want to use Snappy from C code, you can +use the included C bindings in snappy-c.h. + +To use Snappy from your own C++ program, include the file "snappy.h" from +your calling file, and link against the compiled library. + +There are many ways to call Snappy, but the simplest possible is + + snappy::Compress(input, &output); + +and similarly + + snappy::Uncompress(input, &output); + +where "input" and "output" are both instances of std::string. + +There are other interfaces that are more flexible in various ways, including +support for custom (non-array) input sources. See the header file for more +information. + + +Tests and benchmarks +==================== + +When you compile Snappy, snappy_unittest is compiled in addition to the +library itself. You do not need it to use the compressor from your own library, +but it contains several useful components for Snappy development. + +First of all, it contains unit tests, verifying correctness on your machine in +various scenarios. If you want to change or optimize Snappy, please run the +tests to verify you have not broken anything. 
Note that if you have the +Google Test library installed, unit test behavior (especially failures) will be +significantly more user-friendly. You can find Google Test at + + http://code.google.com/p/googletest/ + +You probably also want the gflags library for handling of command-line flags; +you can find it at + + http://code.google.com/p/google-gflags/ + +In addition to the unit tests, snappy contains microbenchmarks used to +tune compression and decompression performance. These are automatically run +before the unit tests, but you can disable them using the flag +--run_microbenchmarks=false if you have gflags installed (otherwise you will +need to edit the source). + +Finally, snappy can benchmark Snappy against a few other compression libraries +(zlib, LZO, LZF, FastLZ and QuickLZ), if they were detected at configure time. +To benchmark using a given file, give the compression algorithm you want to test +Snappy against (e.g. --zlib) and then a list of one or more file names on the +command line. The testdata/ directory contains the files used by the +microbenchmark, which should provide a reasonably balanced starting point for +benchmarking. (Note that baddata[1-3].snappy are not intended as benchmarks; they +are used to verify correctness in the presence of corrupted data in the unit +test.) + + +Contact +======= + +Snappy is distributed through Google Code. For the latest version, a bug tracker, +and other information, see + + http://code.google.com/p/snappy/ diff --git a/third_party/snappy/config.h b/third_party/snappy/config.h new file mode 100755 index 00000000000..bfc3b30087f --- /dev/null +++ b/third_party/snappy/config.h @@ -0,0 +1,124 @@ +/* config.h.in. Generated from configure.ac by autoheader. */ + +/* Define if building universal (internal helper macro) */ +//#undef AC_APPLE_UNIVERSAL_BUILD + +#if defined(_WIN32) +// signed/unsigned mismatch +#pragma warning( disable : 4018 ) +#endif + +/* Define to 1 if the compiler supports __builtin_ctz and friends. */ +#if defined(__GNUC__) +#define HAVE_BUILTIN_CTZ 1 +#endif + +/* Define to 1 if the compiler supports __builtin_expect. */ +#if defined(__GNUC__) +#define HAVE_BUILTIN_EXPECT 1 +#endif + +/* Define to 1 if you have the <dlfcn.h> header file. */ +#if !defined(_WIN32) +#define HAVE_DLFCN_H 1 +#endif + +/* Use the gflags package for command-line parsing. */ +#undef HAVE_GFLAGS + +/* Defined when Google Test is available. */ +#undef HAVE_GTEST + +/* Define to 1 if you have the <inttypes.h> header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the `fastlz' library (-lfastlz). */ +#undef HAVE_LIBFASTLZ + +/* Define to 1 if you have the `lzf' library (-llzf). */ +#undef HAVE_LIBLZF + +/* Define to 1 if you have the `lzo2' library (-llzo2). */ +#undef HAVE_LIBLZO2 + +/* Define to 1 if you have the `quicklz' library (-lquicklz). */ +#undef HAVE_LIBQUICKLZ + +/* Define to 1 if you have the `z' library (-lz). */ +#undef HAVE_LIBZ + +/* Define to 1 if you have the <memory.h> header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the <stddef.h> header file. */ +#define HAVE_STDDEF_H 1 + +/* Define to 1 if you have the <stdint.h> header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the <stdlib.h> header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the <strings.h> header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the <string.h> header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the <sys/mman.h> header file. 
*/ +#if !defined(_WIN32) +#define HAVE_SYS_MMAN_H 1 +#endif + +/* Define to 1 if you have the <sys/resource.h> header file. */ +#define HAVE_SYS_RESOURCE_H 1 + +/* Define to 1 if you have the <sys/stat.h> header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the <sys/types.h> header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the <unistd.h> header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to 1 if you have the <windows.h> header file. */ +#if defined(_WIN32) +#define HAVE_WINDOWS_H 1 +#endif + +/* Define to the sub-directory in which libtool stores uninstalled libraries. + */ +#define LT_OBJDIR "libs/" + +/* Name of package */ +#define PACKAGE "snappy" + +#define PACKAGE_BUGREPORT "" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "snappy" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "snappy 1.0.3" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "snappy" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "1.0.3" + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Version number of package */ +#define VERSION "1.0.3" + +/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most + significant byte first (like Motorola and SPARC, unlike Intel). */ +#if defined(__BIG_ENDIAN__) +#define WORDS_BIGENDIAN 1 +#endif diff --git a/third_party/snappy/snappy-internal.h b/third_party/snappy/snappy-internal.h new file mode 100755 index 00000000000..a32eda59fb2 --- /dev/null +++ b/third_party/snappy/snappy-internal.h @@ -0,0 +1,150 @@ +// Copyright 2008 Google Inc. All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Internals shared between the Snappy implementation and its unittest. 
+ +#ifndef UTIL_SNAPPY_SNAPPY_INTERNAL_H_ +#define UTIL_SNAPPY_SNAPPY_INTERNAL_H_ + +#include "snappy-stubs-internal.h" + +namespace snappy { +namespace internal { + +class WorkingMemory { + public: + WorkingMemory() : large_table_(NULL) { } + ~WorkingMemory() { delete[] large_table_; } + + // Allocates and clears a hash table using memory in "*this", + // stores the number of buckets in "*table_size" and returns a pointer to + // the base of the hash table. + uint16* GetHashTable(size_t input_size, int* table_size); + + private: + uint16 small_table_[1<<10]; // 2KB + uint16* large_table_; // Allocated only when needed + + DISALLOW_COPY_AND_ASSIGN(WorkingMemory); +}; + +// Flat array compression that does not emit the "uncompressed length" +// prefix. Compresses "input" string to the "*op" buffer. +// +// REQUIRES: "input_length <= kBlockSize" +// REQUIRES: "op" points to an array of memory that is at least +// "MaxCompressedLength(input_length)" in size. +// REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero. +// REQUIRES: "table_size" is a power of two +// +// Returns an "end" pointer into "op" buffer. +// "end - op" is the compressed size of "input". +char* CompressFragment(const char* input, + size_t input_length, + char* op, + uint16* table, + const int table_size); + +// Return the largest n such that +// +// s1[0,n-1] == s2[0,n-1] +// and n <= (s2_limit - s2). +// +// Does not read *s2_limit or beyond. +// Does not read *(s1 + (s2_limit - s2)) or beyond. +// Requires that s2_limit >= s2. +// +// Separate implementation for x86_64, for speed. Uses the fact that +// x86_64 is little endian. +#if defined(ARCH_K8) +static inline int FindMatchLength(const char* s1, + const char* s2, + const char* s2_limit) { + DCHECK_GE(s2_limit, s2); + int matched = 0; + + // Find out how long the match is. We loop over the data 64 bits at a + // time until we find a 64-bit block that doesn't match; then we find + // the first non-matching bit and use that to calculate the total + // length of the match. + while (PREDICT_TRUE(s2 <= s2_limit - 8)) { + if (PREDICT_FALSE(UNALIGNED_LOAD64(s2) == UNALIGNED_LOAD64(s1 + matched))) { + s2 += 8; + matched += 8; + } else { + // On current (mid-2008) Opteron models there is a 3% more + // efficient code sequence to find the first non-matching byte. + // However, what follows is ~10% better on Intel Core 2 and newer, + // and we expect AMD's bsf instruction to improve. + uint64 x = UNALIGNED_LOAD64(s2) ^ UNALIGNED_LOAD64(s1 + matched); + int matching_bits = Bits::FindLSBSetNonZero64(x); + matched += matching_bits >> 3; + return matched; + } + } + while (PREDICT_TRUE(s2 < s2_limit)) { + if (PREDICT_TRUE(s1[matched] == *s2)) { + ++s2; + ++matched; + } else { + return matched; + } + } + return matched; +} +#else +static inline int FindMatchLength(const char* s1, + const char* s2, + const char* s2_limit) { + // Implementation based on the x86-64 version, above. 
+ DCHECK_GE(s2_limit, s2); + int matched = 0; + + while (s2 <= s2_limit - 4 && + UNALIGNED_LOAD32(s2) == UNALIGNED_LOAD32(s1 + matched)) { + s2 += 4; + matched += 4; + } + if (LittleEndian::IsLittleEndian() && s2 <= s2_limit - 4) { + uint32 x = UNALIGNED_LOAD32(s2) ^ UNALIGNED_LOAD32(s1 + matched); + int matching_bits = Bits::FindLSBSetNonZero(x); + matched += matching_bits >> 3; + } else { + while ((s2 < s2_limit) && (s1[matched] == *s2)) { + ++s2; + ++matched; + } + } + return matched; +} +#endif + +} // end namespace internal +} // end namespace snappy + +#endif // UTIL_SNAPPY_SNAPPY_INTERNAL_H_ diff --git a/third_party/snappy/snappy-sinksource.cc b/third_party/snappy/snappy-sinksource.cc new file mode 100755 index 00000000000..1017895f962 --- /dev/null +++ b/third_party/snappy/snappy-sinksource.cc @@ -0,0 +1,72 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include <string.h> + +#include "snappy-sinksource.h" + +namespace snappy { + +Source::~Source() { } + +Sink::~Sink() { } + +char* Sink::GetAppendBuffer(size_t length, char* scratch) { + return scratch; +} + +ByteArraySource::~ByteArraySource() { } + +size_t ByteArraySource::Available() const { return left_; } + +const char* ByteArraySource::Peek(size_t* len) { + *len = left_; + return ptr_; +} + +void ByteArraySource::Skip(size_t n) { + left_ -= n; + ptr_ += n; +} + +UncheckedByteArraySink::~UncheckedByteArraySink() { } + +void UncheckedByteArraySink::Append(const char* data, size_t n) { + // Do no copying if the caller filled in the result of GetAppendBuffer() + if (data != dest_) { + memcpy(dest_, data, n); + } + dest_ += n; +} + +char* UncheckedByteArraySink::GetAppendBuffer(size_t len, char* scratch) { + return dest_; +} + + +} diff --git a/third_party/snappy/snappy-sinksource.h b/third_party/snappy/snappy-sinksource.h new file mode 100755 index 00000000000..430baeabb0e --- /dev/null +++ b/third_party/snappy/snappy-sinksource.h @@ -0,0 +1,136 @@ +// Copyright 2011 Google Inc. All Rights Reserved. 
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef UTIL_SNAPPY_SNAPPY_SINKSOURCE_H_
+#define UTIL_SNAPPY_SNAPPY_SINKSOURCE_H_
+
+#include <stddef.h>
+
+
+namespace snappy {
+
+// A Sink is an interface that consumes a sequence of bytes.
+class Sink {
+ public:
+  Sink() { }
+  virtual ~Sink();
+
+  // Append "bytes[0,n-1]" to this.
+  virtual void Append(const char* bytes, size_t n) = 0;
+
+  // Returns a writable buffer of the specified length for appending.
+  // May return a pointer to the caller-owned scratch buffer which
+  // must have at least the indicated length.  The returned buffer is
+  // only valid until the next operation on this Sink.
+  //
+  // After writing at most "length" bytes, call Append() with the
+  // pointer returned from this function and the number of bytes
+  // written.  Many Append() implementations will avoid copying
+  // bytes if this function returned an internal buffer.
+  //
+  // If a non-scratch buffer is returned, the caller may only pass a
+  // prefix of it to Append().  That is, it is not correct to pass an
+  // interior pointer of the returned array to Append().
+  //
+  // The default implementation always returns the scratch buffer.
+  virtual char* GetAppendBuffer(size_t length, char* scratch);
+
+ private:
+  // No copying
+  Sink(const Sink&);
+  void operator=(const Sink&);
+};
+
+// A Source is an interface that yields a sequence of bytes.
+class Source {
+ public:
+  Source() { }
+  virtual ~Source();
+
+  // Return the number of bytes left to read from the source.
+  virtual size_t Available() const = 0;
+
+  // Peek at the next flat region of the source.  Does not reposition
+  // the source.  The returned region is empty iff Available()==0.
+  //
+  // Returns a pointer to the beginning of the region and stores its
+  // length in *len.
+  //
+  // The returned region is valid until the next call to Skip() or
+  // until this object is destroyed, whichever occurs first.
+  //
+  // The returned region may be larger than Available() (for example
+  // if this ByteSource is a view on a substring of a larger source).
+ // The caller is responsible for ensuring that it only reads the + // Available() bytes. + virtual const char* Peek(size_t* len) = 0; + + // Skip the next n bytes. Invalidates any buffer returned by + // a previous call to Peek(). + // REQUIRES: Available() >= n + virtual void Skip(size_t n) = 0; + + private: + // No copying + Source(const Source&); + void operator=(const Source&); +}; + +// A Source implementation that yields the contents of a flat array +class ByteArraySource : public Source { + public: + ByteArraySource(const char* p, size_t n) : ptr_(p), left_(n) { } + virtual ~ByteArraySource(); + virtual size_t Available() const; + virtual const char* Peek(size_t* len); + virtual void Skip(size_t n); + private: + const char* ptr_; + size_t left_; +}; + +// A Sink implementation that writes to a flat array without any bound checks. +class UncheckedByteArraySink : public Sink { + public: + explicit UncheckedByteArraySink(char* dest) : dest_(dest) { } + virtual ~UncheckedByteArraySink(); + virtual void Append(const char* data, size_t n); + virtual char* GetAppendBuffer(size_t len, char* scratch); + + // Return the current output pointer so that a caller can see how + // many bytes were produced. + // Note: this is not a Sink method. + char* CurrentDestination() const { return dest_; } + private: + char* dest_; +}; + + +} + +#endif // UTIL_SNAPPY_SNAPPY_SINKSOURCE_H_ diff --git a/third_party/snappy/snappy-stubs-internal.cc b/third_party/snappy/snappy-stubs-internal.cc new file mode 100755 index 00000000000..6ed334371f1 --- /dev/null +++ b/third_party/snappy/snappy-stubs-internal.cc @@ -0,0 +1,42 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
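+//
+// This file provides the out-of-line pieces of the stubs; at present that
+// is just Varint::Append32(), defined below.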
+ +#include <algorithm> +#include <string> + +#include "snappy-stubs-internal.h" + +namespace snappy { + +void Varint::Append32(string* s, uint32 value) { + char buf[Varint::kMax32]; + const char* p = Varint::Encode32(buf, value); + s->append(buf, p - buf); +} + +} // namespace snappy diff --git a/third_party/snappy/snappy-stubs-internal.h b/third_party/snappy/snappy-stubs-internal.h new file mode 100755 index 00000000000..355a06bc568 --- /dev/null +++ b/third_party/snappy/snappy-stubs-internal.h @@ -0,0 +1,478 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Various stubs for the open-source version of Snappy. + +#ifndef UTIL_SNAPPY_OPENSOURCE_SNAPPY_STUBS_INTERNAL_H_ +#define UTIL_SNAPPY_OPENSOURCE_SNAPPY_STUBS_INTERNAL_H_ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <iostream> +#include <string> + +#include <assert.h> +#include <stdlib.h> +#include <string.h> + +#ifdef HAVE_SYS_MMAN +#include <sys/mman.h> +#endif + +#include "snappy-stubs-public.h" + +#if defined(__x86_64__) + +// Enable 64-bit optimized versions of some routines. +#define ARCH_K8 1 + +#endif + +// Needed by OS X, among others. +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +// Pull in std::min, std::ostream, and the likes. This is safe because this +// header file is never used from any public header files. +using namespace std; + +// The size of an array, if known at compile-time. +// Will give unexpected results if used on a pointer. +// We undefine it first, since some compilers already have a definition. +#ifdef ARRAYSIZE +#undef ARRAYSIZE +#endif +#define ARRAYSIZE(a) (sizeof(a) / sizeof(*(a))) + +// Static prediction hints. 
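+// (Illustrative note: these wrap gcc's __builtin_expect.  Typical usage is
+// "if (PREDICT_FALSE(error)) { ... }" to move the error path off the hot
+// path; without __builtin_expect they reduce to the bare expression.)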
+#ifdef HAVE_BUILTIN_EXPECT +#define PREDICT_FALSE(x) (__builtin_expect(x, 0)) +#define PREDICT_TRUE(x) (__builtin_expect(!!(x), 1)) +#else +#define PREDICT_FALSE(x) x +#define PREDICT_TRUE(x) x +#endif + +// This is only used for recomputing the tag byte table used during +// decompression; for simplicity we just remove it from the open-source +// version (anyone who wants to regenerate it can just do the call +// themselves within main()). +#define DEFINE_bool(flag_name, default_value, description) \ + bool FLAGS_ ## flag_name = default_value; +#define DECLARE_bool(flag_name) \ + extern bool FLAGS_ ## flag_name; +#define REGISTER_MODULE_INITIALIZER(name, code) + +namespace snappy { + +static const uint32 kuint32max = static_cast<uint32>(0xFFFFFFFF); +static const int64 kint64max = static_cast<int64>(0x7FFFFFFFFFFFFFFFLL); + +// Logging. + +#define LOG(level) LogMessage() +#define VLOG(level) true ? (void)0 : \ + snappy::LogMessageVoidify() & snappy::LogMessage() + +class LogMessage { + public: + LogMessage() { } + ~LogMessage() { + cerr << endl; + } + + LogMessage& operator<<(const std::string& msg) { + cerr << msg; + return *this; + } + LogMessage& operator<<(int x) { + cerr << x; + return *this; + } +}; + +// Asserts, both versions activated in debug mode only, +// and ones that are always active. + +#define CRASH_UNLESS(condition) \ + PREDICT_TRUE(condition) ? (void)0 : \ + snappy::LogMessageVoidify() & snappy::LogMessageCrash() + +class LogMessageCrash : public LogMessage { + public: + LogMessageCrash() { } + ~LogMessageCrash() { + cerr << endl; + abort(); + } +}; + +// This class is used to explicitly ignore values in the conditional +// logging macros. This avoids compiler warnings like "value computed +// is not used" and "statement has no effect". + +class LogMessageVoidify { + public: + LogMessageVoidify() { } + // This has to be an operator with a precedence lower than << but + // higher than ?: + void operator&(const LogMessage&) { } +}; + +#define CHECK(cond) CRASH_UNLESS(cond) +#define CHECK_LE(a, b) CRASH_UNLESS((a) <= (b)) +#define CHECK_GE(a, b) CRASH_UNLESS((a) >= (b)) +#define CHECK_EQ(a, b) CRASH_UNLESS((a) == (b)) +#define CHECK_NE(a, b) CRASH_UNLESS((a) != (b)) +#define CHECK_LT(a, b) CRASH_UNLESS((a) < (b)) +#define CHECK_GT(a, b) CRASH_UNLESS((a) > (b)) + +#ifdef NDEBUG + +#define DCHECK(cond) CRASH_UNLESS(true) +#define DCHECK_LE(a, b) CRASH_UNLESS(true) +#define DCHECK_GE(a, b) CRASH_UNLESS(true) +#define DCHECK_EQ(a, b) CRASH_UNLESS(true) +#define DCHECK_NE(a, b) CRASH_UNLESS(true) +#define DCHECK_LT(a, b) CRASH_UNLESS(true) +#define DCHECK_GT(a, b) CRASH_UNLESS(true) + +#else + +#define DCHECK(cond) CHECK(cond) +#define DCHECK_LE(a, b) CHECK_LE(a, b) +#define DCHECK_GE(a, b) CHECK_GE(a, b) +#define DCHECK_EQ(a, b) CHECK_EQ(a, b) +#define DCHECK_NE(a, b) CHECK_NE(a, b) +#define DCHECK_LT(a, b) CHECK_LT(a, b) +#define DCHECK_GT(a, b) CHECK_GT(a, b) + +#endif + +// Potentially unaligned loads and stores. 
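+// (Illustrative note: the cast-based macros in the first branch assume the
+// CPU tolerates unaligned access, as x86 does; the memcpy-based versions in
+// the second branch are the strictly portable fallback, and most compilers
+// turn the fixed-size memcpy into a single load or store anyway.)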
+ +#if 1 +//#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(_WIN32) + +#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p)) +#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p)) +#define UNALIGNED_LOAD64(_p) (*reinterpret_cast<const uint64 *>(_p)) + +#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val)) +#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val)) +#define UNALIGNED_STORE64(_p, _val) (*reinterpret_cast<uint64 *>(_p) = (_val)) + +#else + +// These functions are provided for architectures that don't support +// unaligned loads and stores. + +inline uint16 UNALIGNED_LOAD16(const void *p) { + uint16 t; + memcpy(&t, p, sizeof t); + return t; +} + +inline uint32 UNALIGNED_LOAD32(const void *p) { + uint32 t; + memcpy(&t, p, sizeof t); + return t; +} + +inline uint64 UNALIGNED_LOAD64(const void *p) { + uint64 t; + memcpy(&t, p, sizeof t); + return t; +} + +inline void UNALIGNED_STORE16(void *p, uint16 v) { + memcpy(p, &v, sizeof v); +} + +inline void UNALIGNED_STORE32(void *p, uint32 v) { + memcpy(p, &v, sizeof v); +} + +inline void UNALIGNED_STORE64(void *p, uint64 v) { + memcpy(p, &v, sizeof v); +} + +#endif + +// The following guarantees declaration of the byte swap functions. +#ifdef WORDS_BIGENDIAN + +#ifdef _MSC_VER +#include <stdlib.h> +#define bswap_16(x) _byteswap_ushort(x) +#define bswap_32(x) _byteswap_ulong(x) +#define bswap_64(x) _byteswap_uint64(x) + +#elif defined(__APPLE__) +// Mac OS X / Darwin features +#include <libkern/OSByteOrder.h> +#define bswap_16(x) OSSwapInt16(x) +#define bswap_32(x) OSSwapInt32(x) +#define bswap_64(x) OSSwapInt64(x) + +#else +#include <byteswap.h> +#endif + +#endif // WORDS_BIGENDIAN + +// Convert to little-endian storage, opposite of network format. +// Convert x from host to little endian: x = LittleEndian.FromHost(x); +// convert x from little endian to host: x = LittleEndian.ToHost(x); +// +// Store values into unaligned memory converting to little endian order: +// LittleEndian.Store16(p, x); +// +// Load unaligned values stored in little endian converting to host order: +// x = LittleEndian.Load16(p); +class LittleEndian { + public: + // Conversion functions. +#ifdef WORDS_BIGENDIAN + + static uint16 FromHost16(uint16 x) { return bswap_16(x); } + static uint16 ToHost16(uint16 x) { return bswap_16(x); } + + static uint32 FromHost32(uint32 x) { return bswap_32(x); } + static uint32 ToHost32(uint32 x) { return bswap_32(x); } + + static bool IsLittleEndian() { return false; } + +#else // !defined(WORDS_BIGENDIAN) + + static uint16 FromHost16(uint16 x) { return x; } + static uint16 ToHost16(uint16 x) { return x; } + + static uint32 FromHost32(uint32 x) { return x; } + static uint32 ToHost32(uint32 x) { return x; } + + static bool IsLittleEndian() { return true; } + +#endif // !defined(WORDS_BIGENDIAN) + + // Functions to do unaligned loads and stores in little-endian order. + static uint16 Load16(const void *p) { + return ToHost16(UNALIGNED_LOAD16(p)); + } + + static void Store16(void *p, uint16 v) { + UNALIGNED_STORE16(p, FromHost16(v)); + } + + static uint32 Load32(const void *p) { + return ToHost32(UNALIGNED_LOAD32(p)); + } + + static void Store32(void *p, uint32 v) { + UNALIGNED_STORE32(p, FromHost32(v)); + } +}; + +// Some bit-manipulation functions. +class Bits { + public: + // Return floor(log2(n)) for positive integer n. Returns -1 iff n == 0. 
+  static int Log2Floor(uint32 n);
+
+  // Return the index of the least-significant set bit, 0-indexed.  Returns an
+  // undefined value if n == 0.  FindLSBSetNonZero() is similar to ffs() except
+  // that it's 0-indexed.
+  static int FindLSBSetNonZero(uint32 n);
+  static int FindLSBSetNonZero64(uint64 n);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(Bits);
+};
+
+#ifdef HAVE_BUILTIN_CTZ
+
+inline int Bits::Log2Floor(uint32 n) {
+  return n == 0 ? -1 : 31 ^ __builtin_clz(n);
+}
+
+inline int Bits::FindLSBSetNonZero(uint32 n) {
+  return __builtin_ctz(n);
+}
+
+inline int Bits::FindLSBSetNonZero64(uint64 n) {
+  return __builtin_ctzll(n);
+}
+
+#else  // Portable versions.
+
+inline int Bits::Log2Floor(uint32 n) {
+  if (n == 0)
+    return -1;
+  int log = 0;
+  uint32 value = n;
+  for (int i = 4; i >= 0; --i) {
+    int shift = (1 << i);
+    uint32 x = value >> shift;
+    if (x != 0) {
+      value = x;
+      log += shift;
+    }
+  }
+  assert(value == 1);
+  return log;
+}
+
+inline int Bits::FindLSBSetNonZero(uint32 n) {
+  int rc = 31;
+  for (int i = 4, shift = 1 << 4; i >= 0; --i) {
+    const uint32 x = n << shift;
+    if (x != 0) {
+      n = x;
+      rc -= shift;
+    }
+    shift >>= 1;
+  }
+  return rc;
+}
+
+// FindLSBSetNonZero64() is defined in terms of FindLSBSetNonZero().
+inline int Bits::FindLSBSetNonZero64(uint64 n) {
+  const uint32 bottombits = static_cast<uint32>(n);
+  if (bottombits == 0) {
+    // Bottom bits are zero, so scan in top bits
+    return 32 + FindLSBSetNonZero(static_cast<uint32>(n >> 32));
+  } else {
+    return FindLSBSetNonZero(bottombits);
+  }
+}
+
+#endif  // End portable versions.
+
+// Variable-length integer encoding.
+class Varint {
+ public:
+  // Maximum length of varint encoding of uint32.
+  static const int kMax32 = 5;
+
+  // Attempts to parse a varint32 from a prefix of the bytes in [ptr,limit-1].
+  // Never reads a character at or beyond limit.  If a valid/terminated
+  // varint32 was found in the range, stores it in *OUTPUT and returns a
+  // pointer just past the last byte of the varint32.  Else returns NULL.
+  // On success, "result <= limit".
+  static const char* Parse32WithLimit(const char* ptr, const char* limit,
+                                      uint32* OUTPUT);
+
+  // REQUIRES   "ptr" points to a buffer of length sufficient to hold "v".
+  // EFFECTS    Encodes "v" into "ptr" and returns a pointer to the
+  //            byte just past the last encoded byte.
+  static char* Encode32(char* ptr, uint32 v);
+
+  // EFFECTS    Appends the varint representation of "value" to "*s".
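+  //            For example (illustrative): Append32(&s, 300) appends the
+  //            two bytes 0xAC 0x02, since 300 = 0b100101100 encodes as the
+  //            low seven bits with the continuation bit set (0xAC) followed
+  //            by the remaining bits (0x02).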
+ static void Append32(string* s, uint32 value); +}; + +inline const char* Varint::Parse32WithLimit(const char* p, + const char* l, + uint32* OUTPUT) { + const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p); + const unsigned char* limit = reinterpret_cast<const unsigned char*>(l); + uint32 b, result; + if (ptr >= limit) return NULL; + b = *(ptr++); result = b & 127; if (b < 128) goto done; + if (ptr >= limit) return NULL; + b = *(ptr++); result |= (b & 127) << 7; if (b < 128) goto done; + if (ptr >= limit) return NULL; + b = *(ptr++); result |= (b & 127) << 14; if (b < 128) goto done; + if (ptr >= limit) return NULL; + b = *(ptr++); result |= (b & 127) << 21; if (b < 128) goto done; + if (ptr >= limit) return NULL; + b = *(ptr++); result |= (b & 127) << 28; if (b < 16) goto done; + return NULL; // Value is too long to be a varint32 + done: + *OUTPUT = result; + return reinterpret_cast<const char*>(ptr); +} + +inline char* Varint::Encode32(char* sptr, uint32 v) { + // Operate on characters as unsigneds + unsigned char* ptr = reinterpret_cast<unsigned char*>(sptr); + static const int B = 128; + if (v < (1<<7)) { + *(ptr++) = v; + } else if (v < (1<<14)) { + *(ptr++) = v | B; + *(ptr++) = v>>7; + } else if (v < (1<<21)) { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = v>>14; + } else if (v < (1<<28)) { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = (v>>14) | B; + *(ptr++) = v>>21; + } else { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = (v>>14) | B; + *(ptr++) = (v>>21) | B; + *(ptr++) = v>>28; + } + return reinterpret_cast<char*>(ptr); +} + +// If you know the internal layout of the std::string in use, you can +// replace this function with one that resizes the string without +// filling the new space with zeros (if applicable) -- +// it will be non-portable but faster. +inline void STLStringResizeUninitialized(string* s, size_t new_size) { + s->resize(new_size); +} + +// Return a mutable char* pointing to a string's internal buffer, +// which may not be null-terminated. Writing through this pointer will +// modify the string. +// +// string_as_array(&str)[i] is valid for 0 <= i < str.size() until the +// next call to a string method that invalidates iterators. +// +// As of 2006-04, there is no standard-blessed way of getting a +// mutable reference to a string's internal buffer. However, issue 530 +// (http://www.open-std.org/JTC1/SC22/WG21/docs/lwg-defects.html#530) +// proposes this as the method. It will officially be part of the standard +// for C++0x. This should already work on all current implementations. +inline char* string_as_array(string* str) { + return str->empty() ? NULL : &*str->begin(); +} + +} // namespace snappy + +#endif // UTIL_SNAPPY_OPENSOURCE_SNAPPY_STUBS_INTERNAL_H_ diff --git a/third_party/snappy/snappy-stubs-public.h b/third_party/snappy/snappy-stubs-public.h new file mode 100755 index 00000000000..074d4638866 --- /dev/null +++ b/third_party/snappy/snappy-stubs-public.h @@ -0,0 +1,85 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// Author: sesse@google.com (Steinar H. Gunderson) +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Various type stubs for the open-source version of Snappy. +// +// This file cannot include config.h, as it is included from snappy.h, +// which is a public header. Instead, snappy-stubs-public.h is generated by +// from snappy-stubs-public.h.in at configure time. + +#ifndef UTIL_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_ +#define UTIL_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_ + +#if !defined(_WIN32) +#include <stdint.h> +#endif + +#if 1 +#include <stddef.h> +#endif + +#define SNAPPY_MAJOR 1 +#define SNAPPY_MINOR 0 +#define SNAPPY_PATCHLEVEL 3 +#define SNAPPY_VERSION \ + ((SNAPPY_MAJOR << 16) | (SNAPPY_MINOR << 8) | SNAPPY_PATCHLEVEL) + +#include <string> + +namespace snappy { + +#if !defined(_WIN32) +typedef int8_t int8; +typedef uint8_t uint8; +typedef int16_t int16; +typedef uint16_t uint16; +typedef int32_t int32; +typedef uint32_t uint32; +typedef int64_t int64; +typedef uint64_t uint64; +#else +typedef signed char int8; +typedef unsigned char uint8; +typedef short int16; +typedef unsigned short uint16; +typedef int int32; +typedef unsigned int uint32; +typedef long long int64; +typedef unsigned long long uint64; +#endif + +typedef std::string string; + +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&); \ + void operator=(const TypeName&) + +} // namespace snappy + +#endif // UTIL_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_ diff --git a/third_party/snappy/snappy.cc b/third_party/snappy/snappy.cc new file mode 100755 index 00000000000..fdc67e886c6 --- /dev/null +++ b/third_party/snappy/snappy.cc @@ -0,0 +1,1026 @@ +// Copyright 2005 Google Inc. All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "snappy.h" +#include "snappy-internal.h" +#include "snappy-sinksource.h" + +#include <stdio.h> + +#include <algorithm> +#include <string> +#include <vector> + + +namespace snappy { + +// Any hash function will produce a valid compressed bitstream, but a good +// hash function reduces the number of collisions and thus yields better +// compression for compressible input, and more speed for incompressible +// input. Of course, it doesn't hurt if the hash function is reasonably fast +// either, as it gets called a lot. +static inline uint32 HashBytes(uint32 bytes, int shift) { + uint32 kMul = 0x1e35a7bd; + return (bytes * kMul) >> shift; +} +static inline uint32 Hash(const char* p, int shift) { + return HashBytes(UNALIGNED_LOAD32(p), shift); +} + +size_t MaxCompressedLength(size_t source_len) { + // Compressed data can be defined as: + // compressed := item* literal* + // item := literal* copy + // + // The trailing literal sequence has a space blowup of at most 62/60 + // since a literal of length 60 needs one tag byte + one extra byte + // for length information. + // + // Item blowup is trickier to measure. Suppose the "copy" op copies + // 4 bytes of data. Because of a special check in the encoding code, + // we produce a 4-byte copy only if the offset is < 65536. Therefore + // the copy op takes 3 bytes to encode, and this type of item leads + // to at most the 62/60 blowup for representing literals. + // + // Suppose the "copy" op copies 5 bytes of data. If the offset is big + // enough, it will take 5 bytes to encode the copy op. Therefore the + // worst case here is a one-byte literal followed by a five-byte copy. + // I.e., 6 bytes of input turn into 7 bytes of "compressed" data. + // + // This last factor dominates the blowup, so the final estimate is: + return 32 + source_len + source_len/6; +} + +enum { + LITERAL = 0, + COPY_1_BYTE_OFFSET = 1, // 3 bit length + 3 bits of offset in opcode + COPY_2_BYTE_OFFSET = 2, + COPY_4_BYTE_OFFSET = 3 +}; + +// Copy "len" bytes from "src" to "op", one byte at a time. Used for +// handling COPY operations where the input and output regions may +// overlap. For example, suppose: +// src == "ab" +// op == src + 2 +// len == 20 +// After IncrementalCopy(src, op, len), the result will have +// eleven copies of "ab" +// ababababababababababab +// Note that this does not match the semantics of either memcpy() +// or memmove(). +static inline void IncrementalCopy(const char* src, char* op, int len) { + DCHECK_GT(len, 0); + do { + *op++ = *src++; + } while (--len > 0); +} + +// Equivalent to IncrementalCopy except that it can write up to ten extra +// bytes after the end of the copy, and that it is faster. 
+// +// The main part of this loop is a simple copy of eight bytes at a time until +// we've copied (at least) the requested amount of bytes. However, if op and +// src are less than eight bytes apart (indicating a repeating pattern of +// length < 8), we first need to expand the pattern in order to get the correct +// results. For instance, if the buffer looks like this, with the eight-byte +// <src> and <op> patterns marked as intervals: +// +// abxxxxxxxxxxxx +// [------] src +// [------] op +// +// a single eight-byte copy from <src> to <op> will repeat the pattern once, +// after which we can move <op> two bytes without moving <src>: +// +// ababxxxxxxxxxx +// [------] src +// [------] op +// +// and repeat the exercise until the two no longer overlap. +// +// This allows us to do very well in the special case of one single byte +// repeated many times, without taking a big hit for more general cases. +// +// The worst case of extra writing past the end of the match occurs when +// op - src == 1 and len == 1; the last copy will read from byte positions +// [0..7] and write to [4..11], whereas it was only supposed to write to +// position 1. Thus, ten excess bytes. + +namespace { + +const int kMaxIncrementCopyOverflow = 10; + +} // namespace + +static inline void IncrementalCopyFastPath(const char* src, char* op, int len) { + while (op - src < 8) { + UNALIGNED_STORE64(op, UNALIGNED_LOAD64(src)); + len -= op - src; + op += op - src; + } + while (len > 0) { + UNALIGNED_STORE64(op, UNALIGNED_LOAD64(src)); + src += 8; + op += 8; + len -= 8; + } +} + +static inline char* EmitLiteral(char* op, + const char* literal, + int len, + bool allow_fast_path) { + int n = len - 1; // Zero-length literals are disallowed + if (n < 60) { + // Fits in tag byte + *op++ = LITERAL | (n << 2); + + // The vast majority of copies are below 16 bytes, for which a + // call to memcpy is overkill. This fast path can sometimes + // copy up to 15 bytes too much, but that is okay in the + // main loop, since we have a bit to go on for both sides: + // + // - The input will always have kInputMarginBytes = 15 extra + // available bytes, as long as we're in the main loop, and + // if not, allow_fast_path = false. + // - The output will always have 32 spare bytes (see + // MaxCompressedLength). 
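+    // For example (illustrative): a 5-byte literal has n == 4, so the tag
+    // byte written above is LITERAL | (4 << 2) == 0x10, and the five literal
+    // bytes follow it in the output.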
+    if (allow_fast_path && len <= 16) {
+      UNALIGNED_STORE64(op, UNALIGNED_LOAD64(literal));
+      UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(literal + 8));
+      return op + len;
+    }
+  } else {
+    // Encode in upcoming bytes
+    char* base = op;
+    int count = 0;
+    op++;
+    while (n > 0) {
+      *op++ = n & 0xff;
+      n >>= 8;
+      count++;
+    }
+    assert(count >= 1);
+    assert(count <= 4);
+    *base = LITERAL | ((59+count) << 2);
+  }
+  memcpy(op, literal, len);
+  return op + len;
+}
+
+static inline char* EmitCopyLessThan64(char* op, int offset, int len) {
+  DCHECK_LE(len, 64);
+  DCHECK_GE(len, 4);
+  DCHECK_LT(offset, 65536);
+
+  if ((len < 12) && (offset < 2048)) {
+    int len_minus_4 = len - 4;
+    assert(len_minus_4 < 8);        // Must fit in 3 bits
+    *op++ = COPY_1_BYTE_OFFSET | ((len_minus_4) << 2) | ((offset >> 8) << 5);
+    *op++ = offset & 0xff;
+  } else {
+    *op++ = COPY_2_BYTE_OFFSET | ((len-1) << 2);
+    LittleEndian::Store16(op, offset);
+    op += 2;
+  }
+  return op;
+}
+
+static inline char* EmitCopy(char* op, int offset, int len) {
+  // Emit 64 byte copies but make sure to keep at least four bytes reserved
+  while (len >= 68) {
+    op = EmitCopyLessThan64(op, offset, 64);
+    len -= 64;
+  }
+
+  // Emit an extra 60 byte copy if we have too much data to fit in one copy
+  if (len > 64) {
+    op = EmitCopyLessThan64(op, offset, 60);
+    len -= 60;
+  }
+
+  // Emit remainder
+  op = EmitCopyLessThan64(op, offset, len);
+  return op;
+}
+
+
+bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
+  uint32 v = 0;
+  const char* limit = start + n;
+  if (Varint::Parse32WithLimit(start, limit, &v) != NULL) {
+    *result = v;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+namespace internal {
+uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
+  // Use smaller hash table when input.size() is smaller, since we
+  // fill the table, incurring O(hash table size) overhead for
+  // compression, and if the input is short, we won't need that
+  // many hash table entries anyway.
+  assert(kMaxHashTableSize >= 256);
+  int htsize = 256;
+  while (htsize < kMaxHashTableSize && htsize < input_size) {
+    htsize <<= 1;
+  }
+  CHECK_EQ(0, htsize & (htsize - 1)) << ": must be power of two";
+  CHECK_LE(htsize, kMaxHashTableSize) << ": hash table too large";
+
+  uint16* table;
+  if (htsize <= ARRAYSIZE(small_table_)) {
+    table = small_table_;
+  } else {
+    if (large_table_ == NULL) {
+      large_table_ = new uint16[kMaxHashTableSize];
+    }
+    table = large_table_;
+  }
+
+  *table_size = htsize;
+  memset(table, 0, htsize * sizeof(*table));
+  return table;
+}
+}  // end namespace internal
+
+#if defined(_WIN32)
+// signed/unsigned mismatch
+# pragma warning( disable : 4244 )
+#endif
+
+// For 0 <= offset <= 4, GetUint32AtOffset(UNALIGNED_LOAD64(p), offset) will
+// equal UNALIGNED_LOAD32(p + offset).  Motivation: On x86-64 hardware we have
+// empirically found that overlapping loads such as
+//  UNALIGNED_LOAD32(p) ... UNALIGNED_LOAD32(p+1) ... UNALIGNED_LOAD32(p+2)
+// are slower than UNALIGNED_LOAD64(p) followed by shifts and casts to uint32.
+static inline uint32 GetUint32AtOffset(uint64 v, int offset) {
+  DCHECK(0 <= offset && offset <= 4) << offset;
+  return v >> (LittleEndian::IsLittleEndian() ? 8 * offset : 32 - 8 * offset);
+}
+
+// Flat array compression that does not emit the "uncompressed length"
+// prefix. Compresses "input" string to the "*op" buffer.
+//
+// REQUIRES: "input" is at most "kBlockSize" bytes long.
+// REQUIRES: "op" points to an array of memory that is at least
+// "MaxCompressedLength(input.size())" in size.
+// REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero. +// REQUIRES: "table_size" is a power of two +// +// Returns an "end" pointer into "op" buffer. +// "end - op" is the compressed size of "input". +namespace internal { +char* CompressFragment(const char* const input, + const size_t input_size, + char* op, + uint16* table, + const int table_size) { + // "ip" is the input pointer, and "op" is the output pointer. + const char* ip = input; + CHECK_LE(input_size, kBlockSize); + CHECK_EQ(table_size & (table_size - 1), 0) << ": table must be power of two"; + const int shift = 32 - Bits::Log2Floor(table_size); + DCHECK_EQ(kuint32max >> shift, table_size - 1); + const char* ip_end = input + input_size; + const char* base_ip = ip; + // Bytes in [next_emit, ip) will be emitted as literal bytes. Or + // [next_emit, ip_end) after the main loop. + const char* next_emit = ip; + + const int kInputMarginBytes = 15; + if (PREDICT_TRUE(input_size >= kInputMarginBytes)) { + const char* ip_limit = input + input_size - kInputMarginBytes; + + for (uint32 next_hash = Hash(++ip, shift); ; ) { + DCHECK_LT(next_emit, ip); + // The body of this loop calls EmitLiteral once and then EmitCopy one or + // more times. (The exception is that when we're close to exhausting + // the input we goto emit_remainder.) + // + // In the first iteration of this loop we're just starting, so + // there's nothing to copy, so calling EmitLiteral once is + // necessary. And we only start a new iteration when the + // current iteration has determined that a call to EmitLiteral will + // precede the next call to EmitCopy (if any). + // + // Step 1: Scan forward in the input looking for a 4-byte-long match. + // If we get close to exhausting the input then goto emit_remainder. + // + // Heuristic match skipping: If 32 bytes are scanned with no matches + // found, start looking only at every other byte. If 32 more bytes are + // scanned, look at every third byte, etc.. When a match is found, + // immediately go back to looking at every byte. This is a small loss + // (~5% performance, ~0.1% density) for compressible data due to more + // bookkeeping, but for non-compressible data (such as JPEG) it's a huge + // win since the compressor quickly "realizes" the data is incompressible + // and doesn't bother looking for matches everywhere. + // + // The "skip" variable keeps track of how many bytes there are since the + // last match; dividing it by 32 (ie. right-shifting by five) gives the + // number of bytes to move ahead for each iteration. + uint32 skip = 32; + + const char* next_ip = ip; + const char* candidate; + do { + ip = next_ip; + uint32 hash = next_hash; + DCHECK_EQ(hash, Hash(ip, shift)); + uint32 bytes_between_hash_lookups = skip++ >> 5; + next_ip = ip + bytes_between_hash_lookups; + if (PREDICT_FALSE(next_ip > ip_limit)) { + goto emit_remainder; + } + next_hash = Hash(next_ip, shift); + candidate = base_ip + table[hash]; + DCHECK_GE(candidate, base_ip); + DCHECK_LT(candidate, ip); + + table[hash] = ip - base_ip; + } while (PREDICT_TRUE(UNALIGNED_LOAD32(ip) != + UNALIGNED_LOAD32(candidate))); + + // Step 2: A 4-byte match has been found. We'll later see if more + // than 4 bytes match. But, prior to the match, input + // bytes [next_emit, ip) are unmatched. Emit them as "literal bytes." + DCHECK_LE(next_emit + 16, ip_end); + op = EmitLiteral(op, next_emit, ip - next_emit, true); + + // Step 3: Call EmitCopy, and then see if another EmitCopy could + // be our next move. 
Repeat until we find no match for the + // input immediately after what was consumed by the last EmitCopy call. + // + // If we exit this loop normally then we need to call EmitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can exit + // this loop via goto if we get close to exhausting the input. + uint64 input_bytes = 0; + uint32 candidate_bytes = 0; + + do { + // We have a 4-byte match at ip, and no need to emit any + // "literal bytes" prior to ip. + const char* base = ip; + int matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end); + ip += matched; + int offset = base - candidate; + DCHECK_EQ(0, memcmp(base, candidate, matched)); + op = EmitCopy(op, offset, matched); + // We could immediately start working at ip now, but to improve + // compression we first update table[Hash(ip - 1, ...)]. + const char* insert_tail = ip - 1; + next_emit = ip; + if (PREDICT_FALSE(ip >= ip_limit)) { + goto emit_remainder; + } + input_bytes = UNALIGNED_LOAD64(insert_tail); + uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift); + table[prev_hash] = ip - base_ip - 1; + uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift); + candidate = base_ip + table[cur_hash]; + candidate_bytes = UNALIGNED_LOAD32(candidate); + table[cur_hash] = ip - base_ip; + } while (GetUint32AtOffset(input_bytes, 1) == candidate_bytes); + + next_hash = HashBytes(GetUint32AtOffset(input_bytes, 2), shift); + ++ip; + } + } + + emit_remainder: + // Emit the remaining bytes as a literal + if (next_emit < ip_end) { + op = EmitLiteral(op, next_emit, ip_end - next_emit, false); + } + + return op; +} +} // end namespace internal + +// Signature of output types needed by decompression code. +// The decompression code is templatized on a type that obeys this +// signature so that we do not pay virtual function call overhead in +// the middle of a tight decompression loop. +// +// class DecompressionWriter { +// public: +// // Called before decompression +// void SetExpectedLength(size_t length); +// +// // Called after decompression +// bool CheckLength() const; +// +// // Called repeatedly during decompression +// bool Append(const char* ip, uint32 length, bool allow_fast_path); +// bool AppendFromSelf(uint32 offset, uint32 length); +// }; +// +// "allow_fast_path" is a parameter that says if there is at least 16 +// readable bytes in "ip". It is currently only used by SnappyArrayWriter. + +// ----------------------------------------------------------------------- +// Lookup table for decompression code. Generated by ComputeTable() below. 
+// ----------------------------------------------------------------------- + +// Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits +static const uint32 wordmask[] = { + 0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu +}; + +// Data stored per entry in lookup table: +// Range Bits-used Description +// ------------------------------------ +// 1..64 0..7 Literal/copy length encoded in opcode byte +// 0..7 8..10 Copy offset encoded in opcode byte / 256 +// 0..4 11..13 Extra bytes after opcode +// +// We use eight bits for the length even though 7 would have sufficed +// because of efficiency reasons: +// (1) Extracting a byte is faster than a bit-field +// (2) It properly aligns copy offset so we do not need a <<8 +static const uint16 char_table[256] = { + 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002, + 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004, + 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006, + 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008, + 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a, + 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c, + 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e, + 0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010, + 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012, + 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014, + 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016, + 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018, + 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a, + 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c, + 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e, + 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020, + 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022, + 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024, + 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026, + 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028, + 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a, + 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c, + 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e, + 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030, + 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032, + 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034, + 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036, + 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038, + 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a, + 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c, + 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e, + 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040 +}; + +// In debug mode, allow optional computation of the table at startup. +// Also, check that the decompression table is correct. 
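+// For example (illustrative): tag byte 0x01 encodes COPY_1_BYTE_OFFSET with
+// length 4 and zero high offset bits, so char_table[0x01] == 0x0804 ==
+// MakeEntry(1, 4, 0): one extra byte after the opcode, length 4, and
+// copy_offset/256 == 0.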
+#ifndef NDEBUG
+DEFINE_bool(snappy_dump_decompression_table, false,
+            "If true, we print the decompression table at startup.");
+
+static uint16 MakeEntry(unsigned int extra,
+                        unsigned int len,
+                        unsigned int copy_offset) {
+  // Check that all of the fields fit within the allocated space
+  DCHECK_EQ(extra,       extra & 0x7);          // At most 3 bits
+  DCHECK_EQ(copy_offset, copy_offset & 0x7);    // At most 3 bits
+  DCHECK_EQ(len,         len & 0x7f);           // At most 7 bits
+  return len | (copy_offset << 8) | (extra << 11);
+}
+
+static void ComputeTable() {
+  uint16 dst[256];
+
+  // Place invalid entries in all places to detect missing initialization
+  int assigned = 0;
+  for (int i = 0; i < 256; i++) {
+    dst[i] = 0xffff;
+  }
+
+  // Small LITERAL entries.  We store (len-1) in the top 6 bits.
+  for (unsigned int len = 1; len <= 60; len++) {
+    dst[LITERAL | ((len-1) << 2)] = MakeEntry(0, len, 0);
+    assigned++;
+  }
+
+  // Large LITERAL entries.  We use 60..63 in the high 6 bits to
+  // encode the number of bytes of length info that follow the opcode.
+  for (unsigned int extra_bytes = 1; extra_bytes <= 4; extra_bytes++) {
+    // We set the length field in the lookup table to 1 because extra
+    // bytes encode len-1.
+    dst[LITERAL | ((extra_bytes+59) << 2)] = MakeEntry(extra_bytes, 1, 0);
+    assigned++;
+  }
+
+  // COPY_1_BYTE_OFFSET.
+  //
+  // The tag byte in the compressed data stores len-4 in 3 bits, and
+  // offset/256 in 5 bits.  offset%256 is stored in the next byte.
+  //
+  // This format is used for length in range [4..11] and offset in
+  // range [0..2047]
+  for (unsigned int len = 4; len < 12; len++) {
+    for (unsigned int offset = 0; offset < 2048; offset += 256) {
+      dst[COPY_1_BYTE_OFFSET | ((len-4)<<2) | ((offset>>8)<<5)] =
+        MakeEntry(1, len, offset>>8);
+      assigned++;
+    }
+  }
+
+  // COPY_2_BYTE_OFFSET.
+  // Tag contains len-1 in top 6 bits, and offset in next two bytes.
+  for (unsigned int len = 1; len <= 64; len++) {
+    dst[COPY_2_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(2, len, 0);
+    assigned++;
+  }
+
+  // COPY_4_BYTE_OFFSET.
+  // Tag contains len-1 in top 6 bits, and offset in next four bytes.
+  for (unsigned int len = 1; len <= 64; len++) {
+    dst[COPY_4_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(4, len, 0);
+    assigned++;
+  }
+
+  // Check that each entry was initialized exactly once.
+  CHECK_EQ(assigned, 256);
+  for (int i = 0; i < 256; i++) {
+    CHECK_NE(dst[i], 0xffff);
+  }
+
+  if (FLAGS_snappy_dump_decompression_table) {
+    printf("static const uint16 char_table[256] = {\n  ");
+    for (int i = 0; i < 256; i++) {
+      printf("0x%04x%s",
+             dst[i],
+             ((i == 255) ? "\n" : (((i%8) == 7) ? ",\n  " : ", ")));
+    }
+    printf("};\n");
+  }
+
+  // Check that computed table matched recorded table
+  for (int i = 0; i < 256; i++) {
+    CHECK_EQ(dst[i], char_table[i]);
+  }
+}
+REGISTER_MODULE_INITIALIZER(snappy, ComputeTable());
+#endif /* !NDEBUG */
+
+// Helper class for decompression
+class SnappyDecompressor {
+ private:
+  Source*       reader_;        // Underlying source of bytes to decompress
+  const char*   ip_;            // Points to next buffered byte
+  const char*   ip_limit_;      // Points just past buffered bytes
+  uint32        peeked_;        // Bytes peeked from reader (need to skip)
+  bool          eof_;           // Hit end of input without an error?
+  char          scratch_[5];    // Temporary buffer for RefillTag() boundaries
+
+  // Ensure that all of the tag metadata for the next tag is available
+  // in [ip_..ip_limit_-1].  Also ensures that [ip,ip+4] is readable even
+  // if (ip_limit_ - ip_ < 5).
+  //
+  // Returns true on success, false on error or end of input.
+  bool RefillTag();
+
+ public:
+  explicit SnappyDecompressor(Source* reader)
+      : reader_(reader),
+        ip_(NULL),
+        ip_limit_(NULL),
+        peeked_(0),
+        eof_(false) {
+  }
+
+  ~SnappyDecompressor() {
+    // Advance past any bytes we peeked at from the reader
+    reader_->Skip(peeked_);
+  }
+
+  // Returns true iff we have hit the end of the input without an error.
+  bool eof() const {
+    return eof_;
+  }
+
+  // Read the uncompressed length stored at the start of the compressed data.
+  // On success, stores the length in *result and returns true.
+  // On failure, returns false.
+  bool ReadUncompressedLength(uint32* result) {
+    DCHECK(ip_ == NULL);       // Must not have read anything yet
+    // Length is encoded in 1..5 bytes
+    *result = 0;
+    uint32 shift = 0;
+    while (true) {
+      if (shift >= 32) return false;
+      size_t n;
+      const char* ip = reader_->Peek(&n);
+      if (n == 0) return false;
+      const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
+      reader_->Skip(1);
+      *result |= static_cast<uint32>(c & 0x7f) << shift;
+      if (c < 128) {
+        break;
+      }
+      shift += 7;
+    }
+    return true;
+  }
+
+  // Process all items found in the input; stops at the end of the input
+  // or on an error reported by the Writer.
+  template <class Writer>
+  void DecompressAllTags(Writer* writer) {
+    const char* ip = ip_;
+    for ( ;; ) {
+      if (ip_limit_ - ip < 5) {
+        ip_ = ip;
+        if (!RefillTag()) return;
+        ip = ip_;
+      }
+
+      const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
+      const uint32 entry = char_table[c];
+      const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
+      ip += entry >> 11;
+      const uint32 length = entry & 0xff;
+
+      if ((c & 0x3) == LITERAL) {
+        uint32 literal_length = length + trailer;
+        uint32 avail = ip_limit_ - ip;
+        while (avail < literal_length) {
+          bool allow_fast_path = (avail >= 16);
+          if (!writer->Append(ip, avail, allow_fast_path)) return;
+          literal_length -= avail;
+          reader_->Skip(peeked_);
+          size_t n;
+          ip = reader_->Peek(&n);
+          avail = n;
+          peeked_ = avail;
+          if (avail == 0) return;  // Premature end of input
+          ip_limit_ = ip + avail;
+        }
+        bool allow_fast_path = (avail >= 16);
+        if (!writer->Append(ip, literal_length, allow_fast_path)) {
+          return;
+        }
+        ip += literal_length;
+      } else {
+        // copy_offset/256 is encoded in bits 8..10.  By just fetching
+        // those bits, we get copy_offset (since the bit-field starts at
+        // bit 8).
+        const uint32 copy_offset = entry & 0x700;
+        if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
+          return;
+        }
+      }
+    }
+  }
+};
+
+bool SnappyDecompressor::RefillTag() {
+  const char* ip = ip_;
+  if (ip == ip_limit_) {
+    // Fetch a new fragment from the reader
+    reader_->Skip(peeked_);   // All peeked bytes are used up
+    size_t n;
+    ip = reader_->Peek(&n);
+    peeked_ = n;
+    if (n == 0) {
+      eof_ = true;
+      return false;
+    }
+    ip_limit_ = ip + n;
+  }
+
+  // Read the tag character
+  DCHECK_LT(ip, ip_limit_);
+  const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
+  const uint32 entry = char_table[c];
+  const uint32 needed = (entry >> 11) + 1;  // +1 byte for 'c'
+  DCHECK_LE(needed, sizeof(scratch_));
+
+  // Read more bytes from reader if needed
+  uint32 nbuf = ip_limit_ - ip;
+  if (nbuf < needed) {
+    // Stitch together bytes from ip and reader to form the word
+    // contents.  We store the needed bytes in "scratch_".  They
+    // will be consumed immediately by the caller since we do not
+    // read more than we need.
+ memmove(scratch_, ip, nbuf); + reader_->Skip(peeked_); // All peeked bytes are used up + peeked_ = 0; + while (nbuf < needed) { + size_t length; + const char* src = reader_->Peek(&length); + if (length == 0) return false; + uint32 to_add = min<uint32>(needed - nbuf, length); + memcpy(scratch_ + nbuf, src, to_add); + nbuf += to_add; + reader_->Skip(to_add); + } + DCHECK_EQ(nbuf, needed); + ip_ = scratch_; + ip_limit_ = scratch_ + needed; + } else if (nbuf < 5) { + // Have enough bytes, but move into scratch_ so that we do not + // read past end of input + memmove(scratch_, ip, nbuf); + reader_->Skip(peeked_); // All peeked bytes are used up + peeked_ = 0; + ip_ = scratch_; + ip_limit_ = scratch_ + nbuf; + } else { + // Pass pointer to buffer returned by reader_. + ip_ = ip; + } + return true; +} + +template <typename Writer> +static bool InternalUncompress(Source* r, + Writer* writer, + uint32 max_len) { + // Read the uncompressed length from the front of the compressed input + SnappyDecompressor decompressor(r); + uint32 uncompressed_len = 0; + if (!decompressor.ReadUncompressedLength(&uncompressed_len)) return false; + // Protect against possible DoS attack + if (static_cast<uint64>(uncompressed_len) > max_len) { + return false; + } + + writer->SetExpectedLength(uncompressed_len); + + // Process the entire input + decompressor.DecompressAllTags(writer); + return (decompressor.eof() && writer->CheckLength()); +} + +bool GetUncompressedLength(Source* source, uint32* result) { + SnappyDecompressor decompressor(source); + return decompressor.ReadUncompressedLength(result); +} + +size_t Compress(Source* reader, Sink* writer) { + size_t written = 0; + int N = reader->Available(); + char ulength[Varint::kMax32]; + char* p = Varint::Encode32(ulength, N); + writer->Append(ulength, p-ulength); + written += (p - ulength); + + internal::WorkingMemory wmem; + char* scratch = NULL; + char* scratch_output = NULL; + + while (N > 0) { + // Get next block to compress (without copying if possible) + size_t fragment_size; + const char* fragment = reader->Peek(&fragment_size); + DCHECK_NE(fragment_size, 0) << ": premature end of input"; + const int num_to_read = min(N, kBlockSize); + size_t bytes_read = fragment_size; + + int pending_advance = 0; + if (bytes_read >= num_to_read) { + // Buffer returned by reader is large enough + pending_advance = num_to_read; + fragment_size = num_to_read; + } else { + // Read into scratch buffer + if (scratch == NULL) { + // If this is the last iteration, we want to allocate N bytes + // of space, otherwise the max possible kBlockSize space. + // num_to_read contains exactly the correct value + scratch = new char[num_to_read]; + } + memcpy(scratch, fragment, bytes_read); + reader->Skip(bytes_read); + + while (bytes_read < num_to_read) { + fragment = reader->Peek(&fragment_size); + size_t n = min<size_t>(fragment_size, num_to_read - bytes_read); + memcpy(scratch + bytes_read, fragment, n); + bytes_read += n; + reader->Skip(n); + } + DCHECK_EQ(bytes_read, num_to_read); + fragment = scratch; + fragment_size = num_to_read; + } + DCHECK_EQ(fragment_size, num_to_read); + + // Get encoding table for compression + int table_size; + uint16* table = wmem.GetHashTable(num_to_read, &table_size); + + // Compress input_fragment and append to dest + const int max_output = MaxCompressedLength(num_to_read); + + // Need a scratch buffer for the output, in case the byte sink doesn't + // have room for us directly. 
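+    // (Illustrative note: UncheckedByteArraySink::GetAppendBuffer() returns
+    // its destination buffer directly, so in the RawCompress() path below the
+    // fragment is compressed straight into the output and Append() makes no
+    // copy.)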
+ if (scratch_output == NULL) { + scratch_output = new char[max_output]; + } else { + // Since we encode kBlockSize regions followed by a region + // which is <= kBlockSize in length, a previously allocated + // scratch_output[] region is big enough for this iteration. + } + char* dest = writer->GetAppendBuffer(max_output, scratch_output); + char* end = internal::CompressFragment(fragment, fragment_size, + dest, table, table_size); + writer->Append(dest, end - dest); + written += (end - dest); + + N -= num_to_read; + reader->Skip(pending_advance); + } + + delete[] scratch; + delete[] scratch_output; + + return written; +} + +// ----------------------------------------------------------------------- +// Flat array interfaces +// ----------------------------------------------------------------------- + +// A type that writes to a flat array. +// Note that this is not a "ByteSink", but a type that matches the +// Writer template argument to SnappyDecompressor::DecompressAllTags(). +class SnappyArrayWriter { + private: + char* base_; + char* op_; + char* op_limit_; + + public: + inline explicit SnappyArrayWriter(char* dst) + : base_(dst), + op_(dst) { + } + + inline void SetExpectedLength(size_t len) { + op_limit_ = op_ + len; + } + + inline bool CheckLength() const { + return op_ == op_limit_; + } + + inline bool Append(const char* ip, uint32 len, bool allow_fast_path) { + char* op = op_; + const int space_left = op_limit_ - op; + if (allow_fast_path && len <= 16 && space_left >= 16) { + // Fast path, used for the majority (about 90%) of dynamic invocations. + UNALIGNED_STORE64(op, UNALIGNED_LOAD64(ip)); + UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(ip + 8)); + } else { + if (space_left < len) { + return false; + } + memcpy(op, ip, len); + } + op_ = op + len; + return true; + } + + inline bool AppendFromSelf(uint32 offset, uint32 len) { + char* op = op_; + const int space_left = op_limit_ - op; + + if (op - base_ <= offset - 1u) { // -1u catches offset==0 + return false; + } + if (len <= 16 && offset >= 8 && space_left >= 16) { + // Fast path, used for the majority (70-80%) of dynamic invocations. 
+ UNALIGNED_STORE64(op, UNALIGNED_LOAD64(op - offset)); + UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(op - offset + 8)); + } else { + if (space_left >= len + kMaxIncrementCopyOverflow) { + IncrementalCopyFastPath(op - offset, op, len); + } else { + if (space_left < len) { + return false; + } + IncrementalCopy(op - offset, op, len); + } + } + + op_ = op + len; + return true; + } +}; + +bool RawUncompress(const char* compressed, size_t n, char* uncompressed) { + ByteArraySource reader(compressed, n); + return RawUncompress(&reader, uncompressed); +} + +bool RawUncompress(Source* compressed, char* uncompressed) { + SnappyArrayWriter output(uncompressed); + return InternalUncompress(compressed, &output, kuint32max); +} + +bool Uncompress(const char* compressed, size_t n, string* uncompressed) { + size_t ulength; + if (!GetUncompressedLength(compressed, n, &ulength)) { + return false; + } + // Protect against possible DoS attack + if ((static_cast<uint64>(ulength) + uncompressed->size()) > + uncompressed->max_size()) { + return false; + } + STLStringResizeUninitialized(uncompressed, ulength); + return RawUncompress(compressed, n, string_as_array(uncompressed)); +} + + +// A Writer that drops everything on the floor and just does validation +class SnappyDecompressionValidator { + private: + size_t expected_; + size_t produced_; + + public: + inline SnappyDecompressionValidator() : produced_(0) { } + inline void SetExpectedLength(size_t len) { + expected_ = len; + } + inline bool CheckLength() const { + return expected_ == produced_; + } + inline bool Append(const char* ip, uint32 len, bool allow_fast_path) { + produced_ += len; + return produced_ <= expected_; + } + inline bool AppendFromSelf(uint32 offset, uint32 len) { + if (produced_ <= offset - 1u) return false; // -1u catches offset==0 + produced_ += len; + return produced_ <= expected_; + } +}; + +bool IsValidCompressedBuffer(const char* compressed, size_t n) { + ByteArraySource reader(compressed, n); + SnappyDecompressionValidator writer; + return InternalUncompress(&reader, &writer, kuint32max); +} + +void RawCompress(const char* input, + size_t input_length, + char* compressed, + size_t* compressed_length) { + ByteArraySource reader(input, input_length); + UncheckedByteArraySink writer(compressed); + Compress(&reader, &writer); + + // Compute how many bytes were added + *compressed_length = (writer.CurrentDestination() - compressed); +} + +size_t Compress(const char* input, size_t input_length, string* compressed) { + // Pre-grow the buffer to the max length of the compressed output + compressed->resize(MaxCompressedLength(input_length)); + + size_t compressed_length; + RawCompress(input, input_length, string_as_array(compressed), + &compressed_length); + compressed->resize(compressed_length); + return compressed_length; +} + + +} // end namespace snappy + diff --git a/third_party/snappy/snappy.h b/third_party/snappy/snappy.h new file mode 100755 index 00000000000..8d6ef2294f5 --- /dev/null +++ b/third_party/snappy/snappy.h @@ -0,0 +1,155 @@ +// Copyright 2005 and onwards Google Inc. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// A light-weight compression algorithm. It is designed for speed of +// compression and decompression, rather than for the utmost in space +// savings. +// +// For getting better compression ratios when you are compressing data +// with long repeated sequences or compressing data that is similar to +// other data, while still compressing fast, you might look at first +// using BMDiff and then compressing the output of BMDiff with +// Snappy. + +#ifndef UTIL_SNAPPY_SNAPPY_H__ +#define UTIL_SNAPPY_SNAPPY_H__ + +#include <stddef.h> +#include <string> + +#include "snappy-stubs-public.h" + +namespace snappy { + class Source; + class Sink; + + // ------------------------------------------------------------------------ + // Generic compression/decompression routines. + // ------------------------------------------------------------------------ + + // Compress the bytes read from "*source" and append to "*sink". Return the + // number of bytes written. + size_t Compress(Source* source, Sink* sink); + + bool GetUncompressedLength(Source* source, uint32* result); + + // ------------------------------------------------------------------------ + // Higher-level string based routines (should be sufficient for most users) + // ------------------------------------------------------------------------ + + // Sets "*output" to the compressed version of "input[0,input_length-1]". + // Original contents of *output are lost. + // + // REQUIRES: "input[]" is not an alias of "*output". + size_t Compress(const char* input, size_t input_length, string* output); + + // Decompresses "compressed[0,compressed_length-1]" to "*uncompressed". + // Original contents of "*uncompressed" are lost. + // + // REQUIRES: "compressed[]" is not an alias of "*uncompressed". + // + // returns false if the message is corrupted and could not be decompressed + bool Uncompress(const char* compressed, size_t compressed_length, + string* uncompressed); + + + // ------------------------------------------------------------------------ + // Lower-level character array based routines. May be useful for + // efficiency reasons in certain circumstances. 
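  // Example round trip through the string-based routines declared above
  // (a sketch; a real caller should check the boolean result):
  //
  //     std::string data = loadRecord();   // hypothetical input
  //     std::string packed, restored;
  //     snappy::Compress(data.data(), data.size(), &packed);
  //     bool ok = snappy::Uncompress(packed.data(), packed.size(), &restored);
  //     // on success, ok is true and restored == data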
+  // ------------------------------------------------------------------------
+
+  // REQUIRES: "compressed" must point to an area of memory that is at
+  // least "MaxCompressedLength(input_length)" bytes in length.
+  //
+  // Takes the data stored in "input[0..input_length-1]" and stores
+  // it in the array pointed to by "compressed".
+  //
+  // "*compressed_length" is set to the length of the compressed output.
+  //
+  // Example:
+  //    char* output = new char[snappy::MaxCompressedLength(input_length)];
+  //    size_t output_length;
+  //    RawCompress(input, input_length, output, &output_length);
+  //    ... Process(output, output_length) ...
+  //    delete [] output;
+  void RawCompress(const char* input,
+                   size_t input_length,
+                   char* compressed,
+                   size_t* compressed_length);
+
+  // Given data in "compressed[0..compressed_length-1]" generated by
+  // calling the snappy::Compress routine, this routine
+  // stores the uncompressed data to
+  //   uncompressed[0..GetUncompressedLength(compressed)-1]
+  // returns false if the message is corrupted and could not be decompressed
+  bool RawUncompress(const char* compressed, size_t compressed_length,
+                     char* uncompressed);
+
+  // Given data from the byte source 'compressed' generated by calling
+  // the snappy::Compress routine, this routine stores the uncompressed
+  // data to
+  //   uncompressed[0..GetUncompressedLength(compressed,compressed_length)-1]
+  // returns false if the message is corrupted and could not be decompressed
+  bool RawUncompress(Source* compressed, char* uncompressed);
+
+  // Returns the maximal size of the compressed representation of
+  // input data that is "source_bytes" bytes in length.
+  size_t MaxCompressedLength(size_t source_bytes);
+
+  // REQUIRES: "compressed[]" was produced by RawCompress() or Compress()
+  // Returns true and stores the length of the uncompressed data in
+  // *result normally. Returns false on parsing error.
+  // This operation takes O(1) time.
+  bool GetUncompressedLength(const char* compressed, size_t compressed_length,
+                             size_t* result);
+
+  // Returns true iff the contents of "compressed[]" can be uncompressed
+  // successfully. Does not return the uncompressed data. Takes
+  // time proportional to compressed_length, but is usually at least
+  // a factor of four faster than actual decompression.
+  bool IsValidCompressedBuffer(const char* compressed,
+                               size_t compressed_length);
+
+  // *** DO NOT CHANGE THE VALUE OF kBlockSize ***
+  //
+  // The compression code chops up the input into blocks of at most
+  // the following size. This ensures that back-references in the
+  // output never cross kBlockSize block boundaries. This can be
+  // helpful in implementing blocked decompression. However the
+  // decompression code should not rely on this guarantee since older
+  // compression code may not obey it.
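  // (Sketch.) Given the constants below, a blocked decompressor can size its
  // per-block state up front:
  //
  //     size_t blocks = (input_len + kBlockSize - 1) / kBlockSize;       // ceil
  //     size_t worstCasePerBlock = snappy::MaxCompressedLength(kBlockSize);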
+  static const int kBlockLog = 15;
+  static const int kBlockSize = 1 << kBlockLog;
+
+  static const int kMaxHashTableBits = 14;
+  static const int kMaxHashTableSize = 1 << kMaxHashTableBits;
+
+} // end namespace snappy
+
+
+#endif // UTIL_SNAPPY_SNAPPY_H__
diff --git a/tools/bridge.cpp b/tools/bridge.cpp
index f7518a17ad4..341a1dae687 100644
--- a/tools/bridge.cpp
+++ b/tools/bridge.cpp
@@ -88,7 +88,7 @@ set<MessagingPort*> ports;
 class MyListener : public Listener {
 public:
-    MyListener( int port ) : Listener( "", port ) {}
+    MyListener( int port ) : Listener( "bridge" , "", port ) {}
     virtual void accepted(MessagingPort *mp) {
         ports.insert( mp );
         Forwarder f( *mp );
diff --git a/tools/export.cpp b/tools/export.cpp
index fb32a9e58ff..c3a5420438d 100644
--- a/tools/export.cpp
+++ b/tools/export.cpp
@@ -45,6 +45,73 @@ public:
         _usesstdout = false;
     }
 
+    // Turn every double quote character into two double quote characters.
+    // If hasSurroundingQuotes is true, the first and last characters of the
+    // string are left unescaped; if it is false, a double quote character is
+    // added around the whole string.
+    string csvEscape(string str, bool hasSurroundingQuotes = false) {
+        size_t index = hasSurroundingQuotes ? 1 : 0;
+        while (((index = str.find('"', index)) != string::npos)
+               && (index < (hasSurroundingQuotes ? str.size() - 1 : str.size()))) {
+            str.replace(index, 1, "\"\"");
+            index += 2;
+        }
+        return hasSurroundingQuotes ? str : "\"" + str + "\"";
+    }
+
+    // Gets the string representation of a BSON object that can be correctly written to a CSV file
+    string csvString (const BSONElement& object) {
+        const char* binData; // Only used with BinData type
+
+        switch (object.type()) {
+        case MinKey:
+            return "$MinKey";
+        case MaxKey:
+            return "$MaxKey";
+        case NumberInt:
+        case NumberDouble:
+        case NumberLong:
+        case Bool:
+            return object.toString(false);
+        case String:
+        case Symbol:
+            return csvEscape(object.toString(false), true);
+        case Object:
+            return csvEscape(object.jsonString(Strict, false));
+        case Array:
+            return csvEscape(object.jsonString(Strict, false));
+        case BinData:
+            int len;
+            binData = object.binDataClean(len);
+            return toHex(binData, len);
+        case jstOID:
+            return "ObjectID(" + object.OID().toString() + ")"; // OID strings are always 24 hex characters
+        case Date:
+            return timeToISOString(object.Date() / 1000);
+        case Timestamp:
+            return csvEscape(object.jsonString(Strict, false));
+        case RegEx:
+            return csvEscape("/" + string(object.regex()) + "/" + string(object.regexFlags()));
+        case Code:
+            return csvEscape(object.toString(false));
+        case CodeWScope:
+            if (string(object.codeWScopeScopeData()) == "") {
+                return csvEscape(object.toString(false));
+            } else {
+                return csvEscape(object.jsonString(Strict, false));
+            }
+        case EOO:
+        case Undefined:
+        case DBRef:
+        case jstNULL:
+            cerr << "Invalid BSON object type for CSV output: " << object.type() << endl;
+            return "";
+        }
+        // Can never get here
+        assert(false);
+        return "";
+    }
+
     int run() {
         string ns;
         const bool csv = hasParam( "csv" );
@@ -137,7 +204,7 @@ public:
                     out << ",";
                 const BSONElement & e = obj.getFieldDotted(i->c_str());
                 if ( !
e.eoo() ) { - out << e.jsonString( Strict , false ); + out << csvString(e); } } out << endl; diff --git a/tools/import.cpp b/tools/import.cpp index c7a18b940ec..16980b05fbb 100644 --- a/tools/import.cpp +++ b/tools/import.cpp @@ -27,6 +27,7 @@ #include <iostream> #include <boost/program_options.hpp> +#include <boost/algorithm/string.hpp> using namespace mongo; @@ -44,100 +45,215 @@ class Import : public Tool { bool _doimport; bool _jsonArray; vector<string> _upsertFields; + static const int BUF_SIZE = 1024 * 1024 * 4; + + string trimWhitespace(const string& str) { + if (str.size() == 0) { + return str; + } + size_t begin = 0; + size_t end = str.size() - 1; + while (begin < str.size() && isspace(str[begin])) { ++begin; } // Finds index of first non-whitespace character + while (end > 0 && isspace(str[end])) { --end; } // Finds index of last non-whitespace character + return str.substr(begin, end - begin + 1); + } + + void csvTokenizeRow(const string& row, vector<string>& tokens) { + bool inQuotes = false; + bool prevWasQuote = false; + bool tokenQuoted = false; + string curtoken = ""; + for (string::const_iterator it = row.begin(); it != row.end(); ++it) { + char element = *it; + if (element == '"') { + if (!inQuotes) { + inQuotes = true; + tokenQuoted = true; + curtoken = ""; + } else { + if (prevWasQuote) { + curtoken += "\""; + prevWasQuote = false; + } else { + prevWasQuote = true; + } + } + } else { + if (inQuotes && prevWasQuote) { + inQuotes = false; + prevWasQuote = false; + tokens.push_back(curtoken); + } + + if (element == ',' && !inQuotes) { + if (!tokenQuoted) { // If token was quoted, it's already been added + tokens.push_back(trimWhitespace(curtoken)); + } + curtoken = ""; + tokenQuoted = false; + } else { + curtoken += element; + } + } + } + if (!tokenQuoted || (inQuotes && prevWasQuote)) { + tokens.push_back(trimWhitespace(curtoken)); + } + } void _append( BSONObjBuilder& b , const string& fieldName , const string& data ) { - if ( b.appendAsNumber( fieldName , data ) ) + if ( _ignoreBlanks && data.size() == 0 ) return; - if ( _ignoreBlanks && data.size() == 0 ) + if ( b.appendAsNumber( fieldName , data ) ) return; // TODO: other types? - b.append( fieldName , data ); + b.append ( fieldName , data ); + } + + /* + * Reads one line from in into buf. + * Returns the number of bytes that should be skipped - the caller should + * increment buf by this amount. + */ + int getLine(istream* in, char* buf) { + if (_jsonArray) { + in->read(buf, BUF_SIZE); + uassert(13295, "JSONArray file too large", (in->rdstate() & ios_base::eofbit)); + buf[ in->gcount() ] = '\0'; + } + else { + in->getline( buf , BUF_SIZE ); + log(1) << "got line:" << buf << endl; + } + uassert( 10263 , "unknown error reading file" , + (!(in->rdstate() & ios_base::badbit)) && + (!(in->rdstate() & ios_base::failbit) || (in->rdstate() & ios_base::eofbit)) ); + + int numBytesSkipped = 0; + if (strncmp("\xEF\xBB\xBF", buf, 3) == 0) { // UTF-8 BOM (notepad is stupid) + buf += 3; + numBytesSkipped += 3; + } + + uassert(13289, "Invalid UTF8 character detected", isValidUTF8(buf)); + return numBytesSkipped; } - BSONObj parseLine( char * line ) { - uassert(13289, "Invalid UTF8 character detected", isValidUTF8(line)); + /* + * Parses a BSON object out of a JSON array. + * Returns number of bytes processed on success and -1 on failure. 
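     * Example (a sketch): for buf = "[ {\"a\" : 1} , {\"b\" : 2} ]" the first
     * call skips ahead to the first '{', hands it to fromjson(), and returns
     * the total bytes consumed so the caller can advance to the next object.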
+ */ + int parseJSONArray(char* buf, BSONObj& o) { + int len = 0; + while (buf[0] != '{' && buf[0] != '\0') { + len++; + buf++; + } + if (buf[0] == '\0') + return -1; + + int jslen; + o = fromjson(buf, &jslen); + len += jslen; - if ( _type == JSON ) { + return len; + } + + /* + * Parses one object from the input file. This usually corresponds to one line in the input + * file, unless the file is a CSV and contains a newline within a quoted string entry. + * Returns a true if a BSONObj was successfully created and false if not. + */ + bool parseRow(istream* in, BSONObj& o, int& numBytesRead) { + boost::scoped_array<char> buffer(new char[BUF_SIZE+2]); + char* line = buffer.get(); + + numBytesRead = getLine(in, line); + line += numBytesRead; + + if (line[0] == '\0') { + return false; + } + numBytesRead += strlen( line ); + + if (_type == JSON) { + // Strip out trailing whitespace char * end = ( line + strlen( line ) ) - 1; - while ( isspace(*end) ) { + while ( end >= line && isspace(*end) ) { *end = 0; end--; } - return fromjson( line ); + o = fromjson( line ); + return true; } - BSONObjBuilder b; + vector<string> tokens; + if (_type == CSV) { + string row; + bool inside_quotes = false; + size_t last_quote = 0; + while (true) { + string lineStr(line); + // Deal with line breaks in quoted strings + last_quote = lineStr.find_first_of('"'); + while (last_quote != string::npos) { + inside_quotes = !inside_quotes; + last_quote = lineStr.find_first_of('"', last_quote+1); + } - unsigned int pos=0; - while ( line[0] ) { - string name; - if ( pos < _fields.size() ) { - name = _fields[pos]; + row.append(lineStr); + + if (inside_quotes) { + row.append("\n"); + int num = getLine(in, line); + line += num; + numBytesRead += num; + + uassert (15854, "CSV file ends while inside quoted field", line[0] != '\0'); + numBytesRead += strlen( line ); + } else { + break; + } } - else { - stringstream ss; - ss << "field" << pos; - name = ss.str(); + // now 'row' is string corresponding to one row of the CSV file + // (which may span multiple lines) and represents one BSONObj + csvTokenizeRow(row, tokens); + } + else { // _type == TSV + while (line[0] != '\t' && isspace(line[0])) { // Strip leading whitespace, but not tabs + line++; } - pos++; - - bool done = false; - string data; - char * end; - if ( _type == CSV && line[0] == '"' ) { - line++; //skip first '"' - - while (true) { - end = strchr( line , '"' ); - if (!end) { - data += line; - done = true; - break; - } - else if (end[1] == '"') { - // two '"'s get appended as one - data.append(line, end-line+1); //include '"' - line = end+2; //skip both '"'s - } - else if (end[-1] == '\\') { - // "\\\"" gets appended as '"' - data.append(line, end-line-1); //exclude '\\' - data.append("\""); - line = end+1; //skip the '"' - } - else { - data.append(line, end-line); - line = end+2; //skip '"' and ',' - break; - } - } + + boost::split(tokens, line, boost::is_any_of(_sep)); + } + + // Now that the row is tokenized, create a BSONObj out of it. + BSONObjBuilder b; + unsigned int pos=0; + for (vector<string>::iterator it = tokens.begin(); it != tokens.end(); ++it) { + string token = *it; + if ( _headerLine ) { + _fields.push_back(token); } else { - end = strstr( line , _sep ); - if ( ! 
end ) { - done = true; - data = string( line ); + string name; + if ( pos < _fields.size() ) { + name = _fields[pos]; } else { - data = string( line , end - line ); - line = end+1; + stringstream ss; + ss << "field" << pos; + name = ss.str(); } - } + pos++; - if ( _headerLine ) { - while ( isspace( data[0] ) ) - data = data.substr( 1 ); - _fields.push_back( data ); + _append( b , name , token ); } - else - _append( b , name , data ); - - if ( done ) - break; } - return b.obj(); + o = b.obj(); + return true; } public: @@ -255,68 +371,37 @@ public: _jsonArray = true; } - int errors = 0; - - int num = 0; - time_t start = time(0); - log(1) << "filesize: " << fileSize << endl; ProgressMeter pm( fileSize ); - const int BUF_SIZE = 1024 * 1024 * 4; - boost::scoped_array<char> line(new char[BUF_SIZE+2]); - char * buf = line.get(); - while ( _jsonArray || in->rdstate() == 0 ) { - if (_jsonArray) { - if (buf == line.get()) { //first pass - in->read(buf, BUF_SIZE); - uassert(13295, "JSONArray file too large", (in->rdstate() & ios_base::eofbit)); - buf[ in->gcount() ] = '\0'; - } - } - else { - buf = line.get(); - in->getline( buf , BUF_SIZE ); - log(1) << "got line:" << buf << endl; - } - uassert( 10263 , "unknown error reading file" , - (!(in->rdstate() & ios_base::badbit)) && - (!(in->rdstate() & ios_base::failbit) || (in->rdstate() & ios_base::eofbit)) ); - - int len = 0; - if (strncmp("\xEF\xBB\xBF", buf, 3) == 0) { // UTF-8 BOM (notepad is stupid) - buf += 3; - len += 3; - } - - if (_jsonArray) { - while (buf[0] != '{' && buf[0] != '\0') { - len++; - buf++; - } - if (buf[0] == '\0') - break; - } - else { - while ((_type != TSV || buf[0] != '\t') && isspace( buf[0] )) { - len++; - buf++; - } - if (buf[0] == '\0') - continue; - len += strlen( buf ); - } + int num = 0; + int errors = 0; + int len = 0; + // buffer and line are only used when parsing a jsonArray + boost::scoped_array<char> buffer(new char[BUF_SIZE+2]); + char* line = buffer.get(); + while ( _jsonArray || in->rdstate() == 0 ) { try { BSONObj o; if (_jsonArray) { - int jslen; - o = fromjson(buf, &jslen); - len += jslen; - buf += jslen; + int bytesProcessed = 0; + if (line == buffer.get()) { // Only read on first pass - the whole array must be on one line. 
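// (Illustrative.) i.e. the file is expected to be a single JSON array such as
//     [{"a" : 1}, {"a" : 2}, {"a" : 3}]
// read in one gulp of at most BUF_SIZE bytes; parseJSONArray() then walks it
// one '{'-delimited object at a time.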
+ bytesProcessed = getLine(in, line); + line += bytesProcessed; + len += bytesProcessed; + } + if ((bytesProcessed = parseJSONArray(line, o)) < 0) { + len += bytesProcessed; + break; + } + len += bytesProcessed; + line += len; } else { - o = parseLine( buf ); + if (!parseRow(in, o, len)) { + continue; + } } if ( _headerLine ) { @@ -348,7 +433,7 @@ public: } catch ( std::exception& e ) { cout << "exception:" << e.what() << endl; - cout << buf << endl; + cout << line << endl; errors++; if (hasParam("stopOnError") || _jsonArray) diff --git a/tools/restore.cpp b/tools/restore.cpp index 3ff6a742d99..9adf90bd209 100644 --- a/tools/restore.cpp +++ b/tools/restore.cpp @@ -25,6 +25,7 @@ #include <boost/program_options.hpp> #include <fcntl.h> +#include <set> using namespace mongo; @@ -40,6 +41,7 @@ public: bool _drop; string _curns; string _curdb; + set<string> _users; // For restoring users with --drop Restore() : BSONTool( "restore" ) , _drop(false) { add_options() @@ -208,13 +210,31 @@ public: out() << "\t going into namespace [" << ns << "]" << endl; if ( _drop ) { - out() << "\t dropping" << endl; - conn().dropCollection( ns ); + if (root.leaf() != "system.users.bson" ) { + out() << "\t dropping" << endl; + conn().dropCollection( ns ); + } else { + // Create map of the users currently in the DB + BSONObj fields = BSON("user" << 1); + scoped_ptr<DBClientCursor> cursor(conn().query(ns, Query(), 0, 0, &fields)); + while (cursor->more()) { + BSONObj user = cursor->next(); + _users.insert(user["user"].String()); + } + } } _curns = ns.c_str(); _curdb = NamespaceString(_curns).db; processFile( root ); + if (_drop && root.leaf() == "system.users.bson") { + // Delete any users that used to exist but weren't in the dump file + for (set<string>::iterator it = _users.begin(); it != _users.end(); ++it) { + BSONObj userMatch = BSON("user" << *it); + conn().remove(ns, Query(userMatch)); + } + _users.clear(); + } } virtual void gotObject( const BSONObj& obj ) { @@ -260,7 +280,13 @@ public: ::abort(); } } - else { + else if (_drop && endsWith(_curns.c_str(), ".system.users") && _users.count(obj["user"].String())) { + // Since system collections can't be dropped, we have to manually + // replace the contents of the system.users collection + BSONObj userMatch = BSON("user" << obj["user"].String()); + conn().update(_curns, Query(userMatch), obj); + _users.erase(obj["user"].String()); + } else { conn().insert( _curns , obj ); } } diff --git a/tools/tool.cpp b/tools/tool.cpp index 98e18a9226a..d938e752041 100644 --- a/tools/tool.cpp +++ b/tools/tool.cpp @@ -380,8 +380,15 @@ namespace mongo { if ( ! dbname.size() ) dbname = _db; - if ( ! ( _username.size() || _password.size() ) ) + if ( ! ( _username.size() || _password.size() ) ) { + // Make sure that we don't need authentication to connect to this db + // findOne throws an AssertionException if it's not authenticated. 
+ if (_coll.size() > 0) { + // BSONTools don't have a collection + conn().findOne(getNS(), Query("{}")); + } return; + } string errmsg; if ( _conn->auth( dbname , _username , _password , errmsg ) ) @@ -396,7 +403,7 @@ namespace mongo { } BSONTool::BSONTool( const char * name, DBAccess access , bool objcheck ) - : Tool( name , access , "" , "" ) , _objcheck( objcheck ) { + : Tool( name , access , "" , "" , false ) , _objcheck( objcheck ) { add_options() ("objcheck" , "validate object before inserting" ) @@ -489,9 +496,9 @@ namespace mongo { fclose( file ); uassert( 10265 , "counts don't match" , m.done() == fileLength ); - out() << "\t " << m.hits() << " objects found" << endl; + (_usesstdout ? cout : cerr ) << m.hits() << " objects found" << endl; if ( _matcher.get() ) - out() << "\t " << processed << " objects processed" << endl; + (_usesstdout ? cout : cerr ) << processed << " objects processed" << endl; return processed; } diff --git a/util/alignedbuilder.cpp b/util/alignedbuilder.cpp index 732ef99c764..b2e0461b733 100644 --- a/util/alignedbuilder.cpp +++ b/util/alignedbuilder.cpp @@ -32,9 +32,30 @@ namespace mongo { /** reset for a re-use. shrinks if > 128MB */ void AlignedBuilder::reset() { _len = 0; - const unsigned sizeCap = 128*1024*1024; - if (_p._size > sizeCap) - _realloc(sizeCap, _len); + RARELY { + const unsigned sizeCap = 128*1024*1024; + if (_p._size > sizeCap) + _realloc(sizeCap, _len); + } + } + + /** reset with a hint as to the upcoming needed size specified */ + void AlignedBuilder::reset(unsigned sz) { + _len = 0; + unsigned Q = 32 * 1024 * 1024 - 1; + unsigned want = (sz+Q) & (~Q); + if( _p._size == want ) { + return; + } + if( _p._size > want ) { + if( _p._size <= 64 * 1024 * 1024 ) + return; + bool downsize = false; + RARELY { downsize = true; } + if( !downsize ) + return; + } + _realloc(want, _len); } void AlignedBuilder::mallocSelfAligned(unsigned sz) { @@ -52,10 +73,16 @@ namespace mongo { /* "slow"/infrequent portion of 'grow()' */ void NOINLINE_DECL AlignedBuilder::growReallocate(unsigned oldLen) { + dassert( _len > _p._size ); unsigned a = _p._size; assert( a ); while( 1 ) { - a *= 2; + if( a < 128 * 1024 * 1024 ) + a *= 2; + else if( sizeof(int*) == 4 ) + a += 32 * 1024 * 1024; + else + a += 64 * 1024 * 1024; DEV if( a > 256*1024*1024 ) { log() << "dur AlignedBuilder too big, aborting in _DEBUG build" << endl; abort(); diff --git a/util/alignedbuilder.h b/util/alignedbuilder.h index 8760bfb9a44..1d246a9d78e 100644 --- a/util/alignedbuilder.h +++ b/util/alignedbuilder.h @@ -28,6 +28,9 @@ namespace mongo { AlignedBuilder(unsigned init_size); ~AlignedBuilder() { kill(); } + /** reset with a hint as to the upcoming needed size specified */ + void reset(unsigned sz); + /** reset for a re-use. 
shrinks if > 128MB */ void reset(); @@ -43,8 +46,12 @@ namespace mongo { return l; } + /** if buffer grows pointer no longer valid */ char* atOfs(unsigned ofs) { return _p._data + ofs; } + /** if buffer grows pointer no longer valid */ + char* cur() { return _p._data + _len; } + void appendChar(char j) { *((char*)grow(sizeof(char))) = j; } @@ -94,7 +101,7 @@ namespace mongo { inline char* grow(unsigned by) { unsigned oldlen = _len; _len += by; - if ( _len > _p._size ) { + if (MONGO_unlikely( _len > _p._size )) { growReallocate(oldlen); } return _p._data + oldlen; diff --git a/util/array.h b/util/array.h index bf705a4d988..12822252fd7 100644 --- a/util/array.h +++ b/util/array.h @@ -18,6 +18,12 @@ namespace mongo { + /* + * simple array class that does no allocations + * same api as vector + * fixed buffer, so once capacity is exceeded, will assert + * meant to be-reused with clear() + */ template<typename T> class FastArray { public: @@ -44,6 +50,7 @@ namespace mongo { } void push_back( const T& t ) { + assert( _size < _capacity ); _data[_size++] = t; } diff --git a/util/assert_util.cpp b/util/assert_util.cpp index 52947bc02b8..da039c09a58 100644 --- a/util/assert_util.cpp +++ b/util/assert_util.cpp @@ -66,11 +66,23 @@ namespace mongo { /* "warning" assert -- safe to continue, so we don't throw exception. */ NOINLINE_DECL void wasserted(const char *msg, const char *file, unsigned line) { - problem() << "warning Assertion failure " << msg << ' ' << file << ' ' << dec << line << endl; + static bool rateLimited; + static time_t lastWhen; + static unsigned lastLine; + if( lastLine == line && time(0)-lastWhen < 5 ) { + if( rateLimited++ == 0 ) { + log() << "rate limiting wassert" << endl; + } + return; + } + lastWhen = time(0); + lastLine = line; + + problem() << "warning assertion failure " << msg << ' ' << file << ' ' << dec << line << endl; sayDbContext(); raiseError(0,msg && *msg ? msg : "wassertion failure"); assertionCount.condrollover( ++assertionCount.warning ); -#if defined(_DEBUG) || defined(_DURABLEDEFAULTON) +#if defined(_DEBUG) || defined(_DURABLEDEFAULTON) || defined(_DURABLEDEFAULTOFF) // this is so we notice in buildbot log() << "\n\n***aborting after wassert() failure in a debug/test build\n\n" << endl; abort(); @@ -86,7 +98,7 @@ namespace mongo { temp << "assertion " << file << ":" << line; AssertionException e(temp.str(),0); breakpoint(); -#if defined(_DEBUG) || defined(_DURABLEDEFAULTON) +#if defined(_DEBUG) || defined(_DURABLEDEFAULTON) || defined(_DURABLEDEFAULTOFF) // this is so we notice in buildbot log() << "\n\n***aborting after assert() failure in a debug/test build\n\n" << endl; abort(); @@ -103,7 +115,7 @@ namespace mongo { temp << msgid; AssertionException e(temp.str(),0); breakpoint(); -#if defined(_DEBUG) || defined(_DURABLEDEFAULTON) +#if defined(_DEBUG) || defined(_DURABLEDEFAULTON) || defined(_DURABLEDEFAULTOFF) // this is so we notice in buildbot log() << "\n\n***aborting after verify() failure in a debug/test build\n\n" << endl; abort(); diff --git a/util/assert_util.h b/util/assert_util.h index 244fb2287e1..b4c68b7de34 100644 --- a/util/assert_util.h +++ b/util/assert_util.h @@ -175,15 +175,15 @@ namespace mongo { #undef assert #endif -#define MONGO_assert(_Expression) (void)( (!!(_Expression)) || (mongo::asserted(#_Expression, __FILE__, __LINE__), 0) ) +#define MONGO_assert(_Expression) (void)( MONGO_likely(!!(_Expression)) || (mongo::asserted(#_Expression, __FILE__, __LINE__), 0) ) #define assert MONGO_assert /* "user assert". 
if asserts, user did something wrong, not our code */ -#define MONGO_uassert(msgid, msg, expr) (void)( (!!(expr)) || (mongo::uasserted(msgid, msg), 0) ) +#define MONGO_uassert(msgid, msg, expr) (void)( MONGO_likely(!!(expr)) || (mongo::uasserted(msgid, msg), 0) ) #define uassert MONGO_uassert /* warning only - keeps going */ -#define MONGO_wassert(_Expression) (void)( (!!(_Expression)) || (mongo::wasserted(#_Expression, __FILE__, __LINE__), 0) ) +#define MONGO_wassert(_Expression) (void)( MONGO_likely(!!(_Expression)) || (mongo::wasserted(#_Expression, __FILE__, __LINE__), 0) ) #define wassert MONGO_wassert /* display a message, no context, and throw assertionexception @@ -191,7 +191,7 @@ namespace mongo { easy way to throw an exception and log something without our stack trace display happening. */ -#define MONGO_massert(msgid, msg, expr) (void)( (!!(expr)) || (mongo::msgasserted(msgid, msg), 0) ) +#define MONGO_massert(msgid, msg, expr) (void)( MONGO_likely(!!(expr)) || (mongo::msgasserted(msgid, msg), 0) ) #define massert MONGO_massert /* dassert is 'debug assert' -- might want to turn off for production as these diff --git a/util/bufreader.h b/util/bufreader.h index a0dcefa8d83..53f0ba744e2 100644 --- a/util/bufreader.h +++ b/util/bufreader.h @@ -28,6 +28,7 @@ namespace mongo { public: class eof : public std::exception { public: + eof() { } virtual const char * what() { return "BufReader eof"; } }; @@ -88,6 +89,7 @@ namespace mongo { } const void* pos() { return _pos; } + const void* start() { return _start; } private: const void *_start; diff --git a/util/compress.cpp b/util/compress.cpp new file mode 100644 index 00000000000..bcde488b88b --- /dev/null +++ b/util/compress.cpp @@ -0,0 +1,31 @@ +// @file compress.cpp + +#include "../third_party/snappy/snappy.h" +#include "compress.h" +#include <string> +#include <string.h> +#include <assert.h> + +namespace mongo { + + void rawCompress(const char* input, + size_t input_length, + char* compressed, + size_t* compressed_length) + { + snappy::RawCompress(input, input_length, compressed, compressed_length); + } + + size_t maxCompressedLength(size_t source_len) { + return snappy::MaxCompressedLength(source_len); + } + + size_t compress(const char* input, size_t input_length, std::string* output) { + return snappy::Compress(input, input_length, output); + } + + bool uncompress(const char* compressed, size_t compressed_length, std::string* uncompressed) { + return snappy::Uncompress(compressed, compressed_length, uncompressed); + } + +} diff --git a/util/compress.h b/util/compress.h new file mode 100644 index 00000000000..5bc5a3392bb --- /dev/null +++ b/util/compress.h @@ -0,0 +1,21 @@ +// @file compress.h + +#pragma once + +#include <string> + +namespace mongo { + + size_t compress(const char* input, size_t input_length, std::string* output); + + bool uncompress(const char* compressed, size_t compressed_length, std::string* uncompressed); + + size_t maxCompressedLength(size_t source_len); + void rawCompress(const char* input, + size_t input_length, + char* compressed, + size_t* compressed_length); + +} + + diff --git a/util/concurrency/mutex.h b/util/concurrency/mutex.h index 44c2ebee0ea..6ca76570cbf 100644 --- a/util/concurrency/mutex.h +++ b/util/concurrency/mutex.h @@ -24,6 +24,8 @@ namespace mongo { + void printStackTrace( ostream &o ); + class mutex; inline boost::xtime incxtimemillis( long long s ) { @@ -86,6 +88,16 @@ namespace mongo { class scoped_lock : boost::noncopyable { public: #if defined(_DEBUG) + struct PostStaticCheck { + 
PostStaticCheck() { + if ( StaticObserver::_destroyingStatics ) { + cout << "trying to lock a mongo::mutex during static shutdown" << endl; + printStackTrace( cout ); + } + } + }; + + PostStaticCheck _check; mongo::mutex * const _mut; #endif scoped_lock( mongo::mutex &m ) : diff --git a/util/concurrency/race.h b/util/concurrency/race.h index 924d6d2fc5a..6be13363a6f 100644 --- a/util/concurrency/race.h +++ b/util/concurrency/race.h @@ -7,6 +7,12 @@ namespace mongo { namespace race { +#ifdef _WIN32 + typedef unsigned threadId_t; +#else + typedef pthread_t threadId_t; +#endif + #if defined(_DEBUG) class Block { diff --git a/util/concurrency/rwlock.h b/util/concurrency/rwlock.h index d14774b4ece..c281e54ecf0 100644 --- a/util/concurrency/rwlock.h +++ b/util/concurrency/rwlock.h @@ -38,20 +38,22 @@ namespace mongo { DEV mutexDebugger.leaving(_name); RWLockBase::unlock(); } + + void lock_shared() { RWLockBase::lock_shared(); } + void unlock_shared() { RWLockBase::unlock_shared(); } + void lockAsUpgradable() { RWLockBase::lockAsUpgradable(); } void unlockFromUpgradable() { // upgradable -> unlocked RWLockBase::unlockFromUpgradable(); } void upgrade() { // upgradable -> exclusive lock RWLockBase::upgrade(); - DEV mutexDebugger.entering(_name); } - void lock_shared() { RWLockBase::lock_shared(); } - void unlock_shared() { RWLockBase::unlock_shared(); } + bool lock_shared_try( int millis ) { return RWLockBase::lock_shared_try(millis); } + bool lock_try( int millis = 0 ) { if( RWLockBase::lock_try(millis) ) { - DEV mutexDebugger.entering(_name); return true; } return false; diff --git a/util/concurrency/synchronization.cpp b/util/concurrency/synchronization.cpp index 0ddc417eff1..ce2547c25eb 100644 --- a/util/concurrency/synchronization.cpp +++ b/util/concurrency/synchronization.cpp @@ -43,6 +43,7 @@ namespace mongo { NotifyAll::NotifyAll() : _mutex("NotifyAll") { _lastDone = 0; _lastReturned = 0; + _nWaiting = 0; } NotifyAll::When NotifyAll::now() { @@ -52,6 +53,7 @@ namespace mongo { void NotifyAll::waitFor(When e) { scoped_lock lock( _mutex ); + ++_nWaiting; while( _lastDone < e ) { _condition.wait( lock.boost() ); } @@ -59,6 +61,7 @@ namespace mongo { void NotifyAll::awaitBeyondNow() { scoped_lock lock( _mutex ); + ++_nWaiting; When e = ++_lastReturned; while( _lastDone <= e ) { _condition.wait( lock.boost() ); @@ -68,6 +71,7 @@ namespace mongo { void NotifyAll::notifyAll(When e) { scoped_lock lock( _mutex ); _lastDone = e; + _nWaiting = 0; _condition.notify_all(); } diff --git a/util/concurrency/synchronization.h b/util/concurrency/synchronization.h index 2467292616f..a0e89f7246b 100644 --- a/util/concurrency/synchronization.h +++ b/util/concurrency/synchronization.h @@ -65,16 +65,21 @@ namespace mongo { */ void waitFor(When); + /** a bit faster than waitFor( now() ) */ void awaitBeyondNow(); /** may be called multiple times. notifies all waiters */ void notifyAll(When); + /** indicates how many threads are waiting for a notify. */ + unsigned nWaiting() const { return _nWaiting; } + private: mongo::mutex _mutex; boost::condition _condition; When _lastDone; When _lastReturned; + unsigned _nWaiting; }; } // namespace mongo diff --git a/util/file.h b/util/file.h index 826a905b90e..368e6927b43 100644 --- a/util/file.h +++ b/util/file.h @@ -47,6 +47,9 @@ namespace mongo { fileofs len() { return 0; } void fsync() { assert(false); } + // shrink file to size bytes. No-op if file already smaller. 
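// e.g. (a sketch; the path and sizes are hypothetical) trimming a
// preallocated file back to its used length:
//     File f;
//     f.open("journal/j._0");
//     if (f.is_open() && f.len() > usedBytes)
//         f.truncate(usedBytes);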
+ void truncate(fileofs size); + /** @return -1 if error or unavailable */ static boost::intmax_t freeSpace(const string &path) { assert(false); return -1; } }; @@ -57,10 +60,11 @@ namespace mongo { class File : public FileInterface { HANDLE fd; bool _bad; + string _name; void err(BOOL b=false) { /* false = error happened */ if( !b && !_bad ) { _bad = true; - log() << "File I/O error " << GetLastError() << '\n'; + log() << "File " << _name << "I/O error " << GetLastError() << '\n'; } } public: @@ -73,6 +77,7 @@ namespace mongo { fd = INVALID_HANDLE_VALUE; } void open(const char *filename, bool readOnly=false , bool direct=false) { + _name = filename; fd = CreateFile( toNativeString(filename).c_str(), ( readOnly ? 0 : GENERIC_WRITE ) | GENERIC_READ, FILE_SHARE_WRITE|FILE_SHARE_READ, @@ -123,6 +128,20 @@ namespace mongo { return li.QuadPart; } void fsync() { FlushFileBuffers(fd); } + + void truncate(fileofs size) { + if (len() <= size) + return; + + LARGE_INTEGER li; + li.QuadPart = size; + if (SetFilePointerEx(fd, li, NULL, FILE_BEGIN) == 0){ + err(false); + return; //couldn't seek + } + + err(SetEndOfFile(fd)); + } }; #else @@ -194,6 +213,13 @@ namespace mongo { assert( !statvfs( path.c_str() , &info ) ); return boost::intmax_t( info.f_bavail ) * info.f_frsize; } + + void truncate(fileofs size) { + if (len() <= size) + return; + + err(ftruncate(fd, size) == 0); + } }; diff --git a/util/file_allocator.cpp b/util/file_allocator.cpp index bf01d90865f..b0572f971bd 100644 --- a/util/file_allocator.cpp +++ b/util/file_allocator.cpp @@ -287,8 +287,8 @@ namespace mongo { if ( fd > 0 ) close( fd ); log() << "error failed to allocate new file: " << name - << " size: " << size << ' ' << errnoWithDescription() << endl; - log() << " will try again in 10 seconds" << endl; + << " size: " << size << ' ' << errnoWithDescription() << warnings; + log() << " will try again in 10 seconds" << endl; // not going to warning logs try { if ( tmp.size() ) BOOST_CHECK_EXCEPTION( boost::filesystem::remove( tmp ) ); diff --git a/util/goodies.h b/util/goodies.h index 51a80f6783c..65bfbaba982 100644 --- a/util/goodies.h +++ b/util/goodies.h @@ -109,6 +109,8 @@ namespace mongo { // PRINTFL; prints file:line #define MONGO_PRINTFL cout << __FILE__ ":" << __LINE__ << endl #define PRINTFL MONGO_PRINTFL +#define MONGO_FLOG log() << __FILE__ ":" << __LINE__ << endl +#define FLOG MONGO_FLOG #undef assert #define assert MONGO_assert diff --git a/util/log.h b/util/log.h index b49d960c41d..d5c7e55aae0 100644 --- a/util/log.h +++ b/util/log.h @@ -298,6 +298,9 @@ namespace mongo { } public: static Logstream& get() { + if ( StaticObserver::_destroyingStatics ) { + cout << "Logstream::get called in uninitialized state" << endl; + } Logstream *p = tsp.get(); if( p == 0 ) tsp.reset( p = new Logstream() ); @@ -342,7 +345,7 @@ namespace mongo { return Logstream::get().prolog(); } -#define MONGO_LOG(level) MONGO_IF ( logLevel >= (level) ) log( level ) +#define MONGO_LOG(level) if ( MONGO_unlikely(logLevel >= (level)) ) log( level ) #define LOG MONGO_LOG inline Nullstream& log( LogLevel l ) { @@ -517,4 +520,6 @@ namespace mongo { } }; + extern Tee* const warnings; // Things put here go in serverStatus + } // namespace mongo diff --git a/util/logfile.cpp b/util/logfile.cpp index 37e14b47678..609edb8fe2d 100644 --- a/util/logfile.cpp +++ b/util/logfile.cpp @@ -77,9 +77,18 @@ namespace mongo { CloseHandle(_fd); } + void LogFile::truncate() { + verify(15870, _fd != INVALID_HANDLE_VALUE); + + if (!SetEndOfFile(_fd)){ + msgasserted(15871, 
"Couldn't truncate file: " + errnoWithDescription()); + } + } + void LogFile::synchronousAppend(const void *_buf, size_t _len) { const size_t BlockSize = 8 * 1024 * 1024; assert(_fd); + assert(_len % 4096 == 0); const char *buf = (const char *) _buf; size_t left = _len; while( left ) { @@ -88,7 +97,7 @@ namespace mongo { if( !WriteFile(_fd, buf, toWrite, &written, NULL) ) { DWORD e = GetLastError(); if( e == 87 ) - msgasserted(13519, "error 87 appending to file - misaligned direct write?"); + msgasserted(13519, "error 87 appending to file - invalid parameter"); else uasserted(13517, str::stream() << "error appending to file " << _name << ' ' << _len << ' ' << toWrite << ' ' << errnoWithDescription(e)); } @@ -150,8 +159,20 @@ namespace mongo { _fd = -1; } + void LogFile::truncate() { + verify(15872, _fd >= 0); + + BOOST_STATIC_ASSERT(sizeof(off_t) == 8); // we don't want overflow here + const off_t pos = lseek(_fd, 0, SEEK_CUR); // doesn't actually seek + if (ftruncate(_fd, pos) != 0){ + msgasserted(15873, "Couldn't truncate file: " + errnoWithDescription()); + } + } + void LogFile::synchronousAppend(const void *b, size_t len) { - off_t pos = lseek(_fd, 0, SEEK_CUR); // doesn't actually seek +#ifdef POSIX_FADV_DONTNEED + const off_t pos = lseek(_fd, 0, SEEK_CUR); // doesn't actually seek +#endif const char *buf = (char *) b; assert(_fd); diff --git a/util/logfile.h b/util/logfile.h index e4bbc467cb5..f6d1c94bf22 100644 --- a/util/logfile.h +++ b/util/logfile.h @@ -38,6 +38,8 @@ namespace mongo { const string _name; + void truncate(); // Removes extra data after current position + private: #if defined(_WIN32) typedef HANDLE fd_type; diff --git a/util/net/httpclient.cpp b/util/net/httpclient.cpp index de45023c2aa..16eaa0ae80a 100644 --- a/util/net/httpclient.cpp +++ b/util/net/httpclient.cpp @@ -38,8 +38,15 @@ namespace mongo { } int HttpClient::_go( const char * command , string url , const char * body , Result * result ) { - uassert( 10271 , "invalid url" , url.find( "http://" ) == 0 ); - url = url.substr( 7 ); + bool ssl = false; + if ( url.find( "https://" ) == 0 ) { + ssl = true; + url = url.substr( 8 ); + } + else { + uassert( 10271 , "invalid url" , url.find( "http://" ) == 0 ); + url = url.substr( 7 ); + } string host , path; if ( url.find( "/" ) == string::npos ) { @@ -56,7 +63,7 @@ namespace mongo { HD( "path [" << path << "]" ); string server = host; - int port = 80; + int port = ssl ? 443 : 80; string::size_type idx = host.find( ":" ); if ( idx != string::npos ) { @@ -92,6 +99,15 @@ namespace mongo { Socket sock; if ( ! 
sock.connect( addr ) ) return -1; + + if ( ssl ) { +#ifdef MONGO_SSL + _checkSSLManager(); + sock.secure( _sslManager.get() ); +#else + uasserted( 15862 , "no ssl support" ); +#endif + } { const char * out = req.c_str(); @@ -152,5 +168,10 @@ namespace mongo { _body = entire; } +#ifdef MONGO_SSL + void HttpClient::_checkSSLManager() { + _sslManager.reset( new SSLManager( true ) ); + } +#endif } diff --git a/util/net/httpclient.h b/util/net/httpclient.h index dadcc72f226..c3f8c824adc 100644 --- a/util/net/httpclient.h +++ b/util/net/httpclient.h @@ -18,10 +18,11 @@ #pragma once #include "../../pch.h" +#include "sock.h" namespace mongo { - class HttpClient { + class HttpClient : boost::noncopyable { public: typedef map<string,string> Headers; @@ -68,6 +69,11 @@ namespace mongo { private: int _go( const char * command , string url , const char * body , Result * result ); +#ifdef MONGO_SSL + void _checkSSLManager(); + + scoped_ptr<SSLManager> _sslManager; +#endif }; } diff --git a/util/net/listen.cpp b/util/net/listen.cpp index 53139ccc385..16ddde880b1 100644 --- a/util/net/listen.cpp +++ b/util/net/listen.cpp @@ -95,20 +95,42 @@ namespace mongo { return out; } + + Listener::Listener(const string& name, const string &ip, int port, bool logConnect ) + : _port(port), _name(name), _ip(ip), _logConnect(logConnect), _elapsedTime(0) { +#ifdef MONGO_SSL + _ssl = 0; + _sslPort = 0; + + if ( cmdLine.sslOnNormalPorts && cmdLine.sslServerManager ) { + secure( cmdLine.sslServerManager ); + } +#endif + } + + Listener::~Listener() { + if ( _timeTracker == this ) + _timeTracker = 0; + } - void Listener::initAndListen() { - checkTicketNumbers(); - vector<SockAddr> mine = ipToAddrs(_ip.c_str(), _port, (!cmdLine.noUnixSocket && useUnixSockets())); - vector<int> socks; - SOCKET maxfd = 0; // needed for select() +#ifdef MONGO_SSL + void Listener::secure( SSLManager* manager ) { + _ssl = manager; + } - for (vector<SockAddr>::iterator it=mine.begin(), end=mine.end(); it != end; ++it) { - SockAddr& me = *it; + void Listener::addSecurePort( SSLManager* manager , int additionalPort ) { + _ssl = manager; + _sslPort = additionalPort; + } + +#endif + + bool Listener::_setupSockets( const vector<SockAddr>& mine , vector<int>& socks ) { + for (vector<SockAddr>::const_iterator it=mine.begin(), end=mine.end(); it != end; ++it) { + const SockAddr& me = *it; SOCKET sock = ::socket(me.getType(), SOCK_STREAM, 0); - if ( sock == INVALID_SOCKET ) { - log() << "ERROR: listen(): invalid socket? " << errnoWithDescription() << endl; - } + massert( 15863 , str::stream() << "listen(): invalid socket? 
" << errnoWithDescription() , sock >= 0 ); if (me.getType() == AF_UNIX) { #if !defined(_WIN32) @@ -138,42 +160,90 @@ namespace mongo { if ( ::bind(sock, me.raw(), me.addressSize) != 0 ) { int x = errno; - log() << "listen(): bind() failed " << errnoWithDescription(x) << " for socket: " << me.toString() << endl; + error() << "listen(): bind() failed " << errnoWithDescription(x) << " for socket: " << me.toString() << endl; if ( x == EADDRINUSE ) - log() << " addr already in use" << endl; + error() << " addr already in use" << endl; closesocket(sock); - return; + return false; } #if !defined(_WIN32) if (me.getType() == AF_UNIX) { if (chmod(me.getAddr().c_str(), 0777) == -1) { - log() << "couldn't chmod socket file " << me << errnoWithDescription() << endl; + error() << "couldn't chmod socket file " << me << errnoWithDescription() << endl; } - ListeningSockets::get()->addPath( me.getAddr() ); } #endif - + if ( ::listen(sock, 128) != 0 ) { - log() << "listen(): listen() failed " << errnoWithDescription() << endl; + error() << "listen(): listen() failed " << errnoWithDescription() << endl; closesocket(sock); - return; + return false; } ListeningSockets::get()->add( sock ); socks.push_back(sock); - if (sock > maxfd) - maxfd = sock; } + + return true; + } + + void Listener::initAndListen() { + checkTicketNumbers(); + vector<int> socks; + set<int> sslSocks; + + { // normal sockets + vector<SockAddr> mine = ipToAddrs(_ip.c_str(), _port, (!cmdLine.noUnixSocket && useUnixSockets())); + if ( ! _setupSockets( mine , socks ) ) + return; + } + +#ifdef MONGO_SSL + if ( _ssl && _sslPort > 0 ) { + unsigned prev = socks.size(); + + vector<SockAddr> mine = ipToAddrs(_ip.c_str(), _sslPort, false ); + if ( ! _setupSockets( mine , socks ) ) + return; + + for ( unsigned i=prev; i<socks.size(); i++ ) { + sslSocks.insert( socks[i] ); + } + + } +#endif + + SOCKET maxfd = 0; // needed for select() + for ( unsigned i=0; i<socks.size(); i++ ) { + if ( socks[i] > maxfd ) + maxfd = socks[i]; + } + +#ifdef MONGO_SSL + if ( _ssl == 0 ) { + _logListen( _port , false ); + } + else if ( _sslPort == 0 ) { + _logListen( _port , true ); + } + else { + // both + _logListen( _port , false ); + _logListen( _sslPort , true ); + } +#else + _logListen( _port , false ); +#endif static long connNumber = 0; struct timeval maxSelectTime; while ( ! inShutdown() ) { fd_set fds[1]; FD_ZERO(fds); - + for (vector<int>::iterator it=socks.begin(), end=socks.end(); it != end; ++it) { FD_SET(*it, fds); } @@ -233,13 +303,25 @@ namespace mongo { disableNagle(s); if ( _logConnect && ! cmdLine.quiet ) log() << "connection accepted from " << from.toString() << " #" << ++connNumber << endl; - accepted(s, from); + + Socket newSock = Socket(s, from); +#ifdef MONGO_SSL + if ( _ssl && ( _sslPort == 0 || sslSocks.count(*it) ) ) { + newSock.secureAccepted( _ssl ); + } +#endif + accepted( newSock ); } } } - void Listener::accepted(int sock, const SockAddr& from) { - accepted( new MessagingPort(sock, from) ); + void Listener::_logListen( int port , bool ssl ) { + log() << _name << ( _name.size() ? " " : "" ) << "waiting for connections on port " << port << ( ssl ? 
" ssl" : "" ) << endl; + } + + + void Listener::accepted(Socket socket) { + accepted( new MessagingPort(socket) ); } void Listener::accepted(MessagingPort *mp) { diff --git a/util/net/listen.h b/util/net/listen.h index e8b4189c0f5..415db1e3fb6 100644 --- a/util/net/listen.h +++ b/util/net/listen.h @@ -25,15 +25,25 @@ namespace mongo { class Listener : boost::noncopyable { public: - Listener(const string &ip, int p, bool logConnect=true ) : _port(p), _ip(ip), _logConnect(logConnect), _elapsedTime(0) { } - virtual ~Listener() { - if ( _timeTracker == this ) - _timeTracker = 0; - } + + Listener(const string& name, const string &ip, int port, bool logConnect=true ); + + virtual ~Listener(); + +#ifdef MONGO_SSL + /** + * make this an ssl socket + * ownership of SSLManager remains with the caller + */ + void secure( SSLManager* manager ); + + void addSecurePort( SSLManager* manager , int additionalPort ); +#endif + void initAndListen(); // never returns unless error (start a thread) /* spawn a thread, etc., then return */ - virtual void accepted(int sock, const SockAddr& from); + virtual void accepted(Socket socket); virtual void accepted(MessagingPort *mp); const int _port; @@ -60,12 +70,25 @@ namespace mongo { } private: + string _name; string _ip; bool _logConnect; long long _elapsedTime; + +#ifdef MONGO_SSL + SSLManager* _ssl; + int _sslPort; +#endif + + /** + * @return true iff everything went ok + */ + bool _setupSockets( const vector<SockAddr>& mine , vector<int>& socks ); + + void _logListen( int port , bool ssl ); static const Listener* _timeTracker; - + virtual bool useUnixSockets() const { return false; } }; diff --git a/util/net/message_port.cpp b/util/net/message_port.cpp index 8c50c8d52a3..9abfaf7c975 100644 --- a/util/net/message_port.cpp +++ b/util/net/message_port.cpp @@ -137,6 +137,10 @@ namespace mongo { piggyBackData = 0; } + MessagingPort::MessagingPort( Socket& sock ) + : Socket( sock ) , piggyBackData( 0 ) { + } + void MessagingPort::shutdown() { close(); } diff --git a/util/net/message_port.h b/util/net/message_port.h index 6bbcc46f71c..22ecafecfbc 100644 --- a/util/net/message_port.h +++ b/util/net/message_port.h @@ -56,6 +56,8 @@ namespace mongo { // no data sent, then we detect that the other side is down MessagingPort(double so_timeout = 0, int logLevel = 0 ); + MessagingPort(Socket& socket); + virtual ~MessagingPort(); void shutdown(); diff --git a/util/net/message_server_port.cpp b/util/net/message_server_port.cpp index e9712d490ba..ca0b13dae07 100644 --- a/util/net/message_server_port.cpp +++ b/util/net/message_server_port.cpp @@ -28,6 +28,10 @@ #include "../../db/lasterror.h" #include "../../db/stats/counters.h" +#ifdef __linux__ // TODO: consider making this ifndef _WIN32 +# include <sys/resource.h> +#endif + namespace mongo { namespace pms { @@ -43,6 +47,8 @@ namespace mongo { inPort->setLogLevel(1); scoped_ptr<MessagingPort> p( inPort ); + p->postFork(); + string otherSide; Message m; @@ -98,7 +104,7 @@ namespace mongo { class PortMessageServer : public MessageServer , public Listener { public: PortMessageServer( const MessageServer::Options& opts, MessageHandler * handler ) : - Listener( opts.ipList, opts.port ) { + Listener( "" , opts.ipList, opts.port ) { uassert( 10275 , "multiple PortMessageServer not supported" , ! 
pms::handler ); pms::handler = handler; @@ -125,10 +131,18 @@ namespace mongo { pthread_attr_init(&attrs); pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); - static const size_t STACK_SIZE = 1024*1024; - pthread_attr_setstacksize(&attrs, (DEBUG_BUILD - ? (STACK_SIZE / 2) - : STACK_SIZE)); + static const size_t STACK_SIZE = 1024*1024; // if we change this we need to update the warning + + struct rlimit limits; + verify(15887, getrlimit(RLIMIT_STACK, &limits) == 0); + if (limits.rlim_cur > STACK_SIZE) { + pthread_attr_setstacksize(&attrs, (DEBUG_BUILD + ? (STACK_SIZE / 2) + : STACK_SIZE)); + } else if (limits.rlim_cur < 1024*1024) { + warning() << "Stack size set to " << (limits.rlim_cur/1024) << "KB. We suggest 1MB" << endl; + } + pthread_t thread; int failed = pthread_create(&thread, &attrs, (void*(*)(void*)) &pms::threadRun, p); diff --git a/util/net/miniwebserver.cpp b/util/net/miniwebserver.cpp index 269a60bc85c..01a3418a909 100644 --- a/util/net/miniwebserver.cpp +++ b/util/net/miniwebserver.cpp @@ -23,8 +23,8 @@ namespace mongo { - MiniWebServer::MiniWebServer(const string &ip, int port) - : Listener(ip, port, false) + MiniWebServer::MiniWebServer(const string& name, const string &ip, int port) + : Listener(name, ip, port, false) {} string MiniWebServer::parseURL( const char * buf ) { @@ -108,17 +108,18 @@ namespace mongo { return false; } - void MiniWebServer::accepted(int s, const SockAddr &from) { - setSockTimeouts(s, 8); + void MiniWebServer::accepted(Socket sock) { + sock.postFork(); + sock.setTimeout(8); char buf[4096]; int len = 0; while ( 1 ) { int left = sizeof(buf) - 1 - len; if( left == 0 ) break; - int x = ::recv(s, buf + len, left, 0); + int x = sock.unsafe_recv( buf + len , left ); if ( x <= 0 ) { - closesocket(s); + sock.close(); return; } len += x; @@ -134,7 +135,7 @@ namespace mongo { vector<string> headers; try { - doRequest(buf, parseURL( buf ), responseMsg, responseCode, headers, from); + doRequest(buf, parseURL( buf ), responseMsg, responseCode, headers, sock.remoteAddr() ); } catch ( std::exception& e ) { responseCode = 500; @@ -165,8 +166,8 @@ namespace mongo { ss << responseMsg; string response = ss.str(); - ::send(s, response.c_str(), response.size(), 0); - closesocket(s); + sock.send( response.c_str(), response.size() , "http response" ); + sock.close(); } string MiniWebServer::getHeader( const char * req , string wanted ) { diff --git a/util/net/miniwebserver.h b/util/net/miniwebserver.h index 01c810b551e..1fb6b3f2e65 100644 --- a/util/net/miniwebserver.h +++ b/util/net/miniwebserver.h @@ -27,7 +27,7 @@ namespace mongo { class MiniWebServer : public Listener { public: - MiniWebServer(const string &ip, int _port); + MiniWebServer(const string& name, const string &ip, int _port); virtual ~MiniWebServer() {} virtual void doRequest( @@ -53,7 +53,7 @@ namespace mongo { static string urlDecode(string s) {return urlDecode(s.c_str());} private: - void accepted(int s, const SockAddr &from); + void accepted(Socket socket); static bool fullReceive( const char *buf ); }; diff --git a/util/net/sock.cpp b/util/net/sock.cpp index f9e4a85d832..69c42f2729d 100644 --- a/util/net/sock.cpp +++ b/util/net/sock.cpp @@ -34,21 +34,37 @@ # endif #endif +#ifdef MONGO_SSL +#include <openssl/err.h> +#include <openssl/ssl.h> +#endif + + namespace mongo { static bool ipv6 = false; void enableIPv6(bool state) { ipv6 = state; } bool IPv6Enabled() { return ipv6; } - // --- some global helpers ----- + void setSockTimeouts(int sock, double secs) { + struct timeval tv; + tv.tv_sec 
= (int)secs; + tv.tv_usec = (int)((long long)(secs*1000*1000) % (1000*1000)); + bool report = logLevel > 3; // solaris doesn't provide these + DEV report = true; + bool ok = setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, (char *) &tv, sizeof(tv) ) == 0; + if( report && !ok ) log() << "unabled to set SO_RCVTIMEO" << endl; + ok = setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *) &tv, sizeof(tv) ) == 0; + DEV if( report && !ok ) log() << "unabled to set SO_RCVTIMEO" << endl; + } #if defined(_WIN32) void disableNagle(int sock) { int x = 1; if ( setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, (char *) &x, sizeof(x)) ) - out() << "ERROR: disableNagle failed" << endl; + error() << "disableNagle failed" << endl; if ( setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *) &x, sizeof(x)) ) - out() << "ERROR: SO_KEEPALIVE failed" << endl; + error() << "SO_KEEPALIVE failed" << endl; } #else @@ -62,11 +78,35 @@ namespace mongo { #endif if ( setsockopt(sock, level, TCP_NODELAY, (char *) &x, sizeof(x)) ) - log() << "ERROR: disableNagle failed: " << errnoWithDescription() << endl; + error() << "disableNagle failed: " << errnoWithDescription() << endl; #ifdef SO_KEEPALIVE if ( setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *) &x, sizeof(x)) ) - log() << "ERROR: SO_KEEPALIVE failed: " << errnoWithDescription() << endl; + error() << "SO_KEEPALIVE failed: " << errnoWithDescription() << endl; + +# ifdef __linux__ + socklen_t len = sizeof(x); + if ( getsockopt(sock, level, TCP_KEEPIDLE, (char *) &x, &len) ) + error() << "can't get TCP_KEEPIDLE: " << errnoWithDescription() << endl; + + if (x > 300) { + x = 300; + if ( setsockopt(sock, level, TCP_KEEPIDLE, (char *) &x, sizeof(x)) ) { + error() << "can't set TCP_KEEPIDLE: " << errnoWithDescription() << endl; + } + } + + len = sizeof(x); // just in case it changed + if ( getsockopt(sock, level, TCP_KEEPINTVL, (char *) &x, &len) ) + error() << "can't get TCP_KEEPINTVL: " << errnoWithDescription() << endl; + + if (x > 300) { + x = 300; + if ( setsockopt(sock, level, TCP_KEEPINTVL, (char *) &x, sizeof(x)) ) { + error() << "can't set TCP_KEEPINTVL: " << errnoWithDescription() << endl; + } + } +# endif #endif } @@ -299,29 +339,119 @@ namespace mongo { } + // ------------ SSLManager ----------------- + +#ifdef MONGO_SSL + SSLManager::SSLManager( bool client ) { + _client = client; + SSL_library_init(); + SSL_load_error_strings(); + ERR_load_crypto_strings(); + + _context = SSL_CTX_new( client ? 
+        massert( 15864 , mongoutils::str::stream() << "can't create SSL Context: " << ERR_error_string(ERR_get_error(), NULL) , _context );
+
+        SSL_CTX_set_options( _context, SSL_OP_ALL);
+    }
+
+    void SSLManager::setupPubPriv( const string& privateKeyFile , const string& publicKeyFile ) {
+        massert( 15865 ,
+                 mongoutils::str::stream() << "Can't read SSL certificate from file "
+                 << publicKeyFile << ":" << ERR_error_string(ERR_get_error(), NULL) ,
+                 SSL_CTX_use_certificate_file(_context, publicKeyFile.c_str(), SSL_FILETYPE_PEM) );
+
+        massert( 15866 ,
+                 mongoutils::str::stream() << "Can't read SSL private key from file "
+                 << privateKeyFile << " : " << ERR_error_string(ERR_get_error(), NULL) ,
+                 SSL_CTX_use_PrivateKey_file(_context, privateKeyFile.c_str(), SSL_FILETYPE_PEM) );
+    }
+
+    int SSLManager::password_cb(char *buf,int num, int rwflag,void *userdata){
+        SSLManager* sm = (SSLManager*)userdata;
+        string pass = sm->_password;
+        int n = (int)pass.size();
+        if ( n > num )
+            n = num; // never write past OpenSSL's num-byte buffer
+        memcpy(buf,pass.c_str(),n);
+        return n;
+    }
+
+    void SSLManager::setupPEM( const string& keyFile , const string& password ) {
+        _password = password;
+
+        massert( 15867 , "Can't read certificate file" , SSL_CTX_use_certificate_chain_file( _context , keyFile.c_str() ) );
+
+        SSL_CTX_set_default_passwd_cb_userdata( _context , this );
+        SSL_CTX_set_default_passwd_cb( _context, &SSLManager::password_cb );
+
+        massert( 15868 , "Can't read key file" , SSL_CTX_use_PrivateKey_file( _context , keyFile.c_str() , SSL_FILETYPE_PEM ) );
+    }
+
+    SSL * SSLManager::secure( int fd ) {
+        SSL * ssl = SSL_new( _context );
+        massert( 15861 , "can't create SSL" , ssl );
+        SSL_set_fd( ssl , fd );
+        return ssl;
+    }
+
+#endif

    // ------------ Socket -----------------

    Socket::Socket(int fd , const SockAddr& remote) :
        _fd(fd), _remote(remote), _timeout(0) {
        _logLevel = 0;
-        _bytesOut = 0;
-        _bytesIn = 0;
+        _init();
    }

    Socket::Socket( double timeout, int ll ) {
        _logLevel = ll;
        _fd = -1;
        _timeout = timeout;
+        _init();
+    }
+
+    void Socket::_init() {
        _bytesOut = 0;
        _bytesIn = 0;
+#ifdef MONGO_SSL
+        _sslAccepted = 0;
+#endif
    }

    void Socket::close() {
+#ifdef MONGO_SSL
+        _ssl.reset();
+#endif
        if ( _fd >= 0 ) {
            closesocket( _fd );
            _fd = -1;
        }
    }
+
+#ifdef MONGO_SSL
+    void Socket::secure( SSLManager * ssl ) {
+        assert( ssl );
+        assert( _fd >= 0 );
+        _ssl.reset( ssl->secure( _fd ) );
+        SSL_connect( _ssl.get() );
+    }
+
+    void Socket::secureAccepted( SSLManager * ssl ) {
+        _sslAccepted = ssl;
+    }
+#endif
+
+    void Socket::postFork() {
+#ifdef MONGO_SSL
+        if ( _sslAccepted ) {
+            assert( _fd >= 0 );
+            _ssl.reset( _sslAccepted->secure( _fd ) );
+            SSL_accept( _ssl.get() );
+            _sslAccepted = 0;
+        }
+#endif
+    }

    class ConnectBG : public BackgroundJob {
    public:
@@ -347,7 +477,7 @@ namespace mongo {
        }

        if ( _timeout > 0 ) {
-            setSockTimeouts( _fd, _timeout );
+            setTimeout( _timeout );
        }

        ConnectBG bg(_fd, remote);
@@ -377,12 +507,29 @@ namespace mongo {
        return true;
    }

+    int Socket::_send( const char * data , int len ) {
+#ifdef MONGO_SSL
+        if ( _ssl ) {
+            return SSL_write( _ssl.get() , data , len );
+        }
+#endif
+        return ::send( _fd , data , len , portSendFlags );
+    }

    // sends all data or throws an exception
    void Socket::send( const char * data , int len, const char *context ) {
        while( len > 0 ) {
-            int ret = ::send( _fd , data , len , portSendFlags );
+            int ret = _send( data , len );
            if ( ret == -1 ) {
+
+#ifdef MONGO_SSL
+                if ( _ssl ) {
+                    log() << "SSL Error ret: " << ret << " err: " << SSL_get_error( _ssl.get() , ret )
+                          << " " << ERR_error_string(ERR_get_error(), NULL)
+                          << endl;
+                }
+#endif
+
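/* SSL_get_error() (logged above) classifies the failed SSL_write() into an
   SSL_ERROR_* code - e.g. SSL_ERROR_WANT_READ/SSL_ERROR_WANT_WRITE for
   retryable conditions, or SSL_ERROR_SYSCALL for transport-level failures,
   in which case errno applies - while ERR_error_string() renders OpenSSL's
   queued error message, if any. */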
#if defined(_WIN32)
                if ( WSAGetLastError() == WSAETIMEDOUT && _timeout != 0 ) {
#else
@@ -408,15 +555,27 @@ namespace mongo {
        }
    }

-    // sends all data or throws an exception
-    void Socket::send( const vector< pair< char *, int > > &data, const char *context ) {
-#if defined(_WIN32)
-        // TODO use scatter/gather api
+    void Socket::_send( const vector< pair< char *, int > > &data, const char *context ) {
        for( vector< pair< char *, int > >::const_iterator i = data.begin(); i != data.end(); ++i ) {
            char * data = i->first;
            int len = i->second;
            send( data, len, context );
        }
+    }
+
+    // sends all data or throws an exception
+    void Socket::send( const vector< pair< char *, int > > &data, const char *context ) {
+
+#ifdef MONGO_SSL
+        if ( _ssl ) {
+            _send( data , context );
+            return;
+        }
+#endif
+
+#if defined(_WIN32)
+        // TODO use scatter/gather api
+        _send( data , context );
#else
        vector< struct iovec > d( data.size() );
        int i = 0;
@@ -479,23 +638,26 @@ namespace mongo {
            log(3) << "Socket recv() conn closed? " << remoteString() << endl;
            throw SocketException( SocketException::CLOSED , remoteString() );
        }
-        else { /* ret < 0 */
+        else { /* ret < 0 */
+#if defined(_WIN32)
+            int e = WSAGetLastError();
+#else
            int e = errno;
-
-#if defined(EINTR) && !defined(_WIN32)
+# if defined(EINTR)
            if( e == EINTR ) {
                if( ++retries == 1 ) {
                    log() << "EINTR retry" << endl;
                    continue;
                }
            }
+# endif
#endif
            if ( ( e == EAGAIN
#if defined(_WIN32)
                   || e == WSAETIMEDOUT
#endif
-                 ) && _timeout > 0 ) {
+                 ) && _timeout > 0 )
+            {
                // this is a timeout
                log(_logLevel) << "Socket recv() timeout " << remoteString() <<endl;
                throw SocketException( SocketException::RECV_TIMEOUT, remoteString() );
@@ -508,11 +670,33 @@ namespace mongo {
    }

    int Socket::unsafe_recv( char *buf, int max ) {
-        int x = ::recv( _fd , buf , max , portRecvFlags );
+        int x = _recv( buf , max );
        _bytesIn += x;
        return x;
    }
+
+    int Socket::_recv( char *buf, int max ) {
+#ifdef MONGO_SSL
+        if ( _ssl ){
+            return SSL_read( _ssl.get() , buf , max );
+        }
+#endif
+        return ::recv( _fd , buf , max , portRecvFlags );
+    }
+
+    void Socket::setTimeout( double secs ) {
+        struct timeval tv;
+        tv.tv_sec = (int)secs;
+        tv.tv_usec = (int)((long long)(secs*1000*1000) % (1000*1000));
+        bool report = logLevel > 3; // solaris doesn't provide these
+        DEV report = true;
+        bool ok = setsockopt(_fd, SOL_SOCKET, SO_RCVTIMEO, (char *) &tv, sizeof(tv) ) == 0;
+        if( report && !ok ) log() << "unable to set SO_RCVTIMEO" << endl;
+        ok = setsockopt(_fd, SOL_SOCKET, SO_SNDTIMEO, (char *) &tv, sizeof(tv) ) == 0;
+        DEV if( report && !ok ) log() << "unable to set SO_SNDTIMEO" << endl;
+    }
+
#if defined(_WIN32)
    struct WinsockInit {
        WinsockInit() {
diff --git a/util/net/sock.h b/util/net/sock.h
index 9c6f0251ad6..1cd51333525 100644
--- a/util/net/sock.h
+++ b/util/net/sock.h
@@ -39,6 +39,10 @@
#endif // _WIN32

+#ifdef MONGO_SSL
+#include <openssl/ssl.h>
+#endif
+
namespace mongo {

    const int SOCK_FAMILY_UNKNOWN_ERROR=13078;
@@ -68,24 +72,13 @@ namespace mongo {
        return mongoutils::str::stream() << cmdLine.socket << "/mongodb-" << port << ".sock";
    }

-    inline void setSockTimeouts(int sock, double secs) {
-        struct timeval tv;
-        tv.tv_sec = (int)secs;
-        tv.tv_usec = (int)((long long)(secs*1000*1000) % (1000*1000));
-        bool report = logLevel > 3; // solaris doesn't provide these
-        DEV report = true;
-        bool ok = setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, (char *) &tv, sizeof(tv) ) == 0;
-        if( report && !ok ) log() << "unabled to set SO_RCVTIMEO" << endl;
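/* Background for the timeout plumbing above: once SO_RCVTIMEO/SO_SNDTIMEO
   expire, recv()/send() fail with EAGAIN/EWOULDBLOCK on POSIX systems
   (WSAETIMEDOUT on Winsock), which is why Socket::recv() treats EAGAIN with
   _timeout > 0 as a RECV_TIMEOUT rather than as a fatal error. */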
-        ok = setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *) &tv, sizeof(tv) ) == 0;
-        DEV if( report && !ok ) log() << "unabled to set SO_RCVTIMEO" << endl;
-    }
-
    // If an ip address is passed in, just return that. If a hostname is passed
    // in, look up its ip and return that. Returns "" on failure.
    string hostbyname(const char *hostname);

    void enableIPv6(bool state=true);
    bool IPv6Enabled();
+    void setSockTimeouts(int sock, double secs);

    /**
     * wrapped around os representation of network address
@@ -157,6 +150,29 @@ namespace mongo {
        string _extra;
    };

+#ifdef MONGO_SSL
+    class SSLManager : boost::noncopyable {
+    public:
+        SSLManager( bool client );
+
+        void setupPEM( const string& keyFile , const string& password );
+        void setupPubPriv( const string& privateKeyFile , const string& publicKeyFile );
+
+        /**
+         * creates a new SSL object (from this manager's context) bound to the given
+         * file descriptor; the caller owns the returned SSL and must free it
+         */
+        SSL * secure( int fd );
+
+        static int password_cb( char *buf,int num, int rwflag,void *userdata );
+
+    private:
+        bool _client;
+        SSL_CTX* _context;
+        string _password;
+    };
+#endif
+
    /**
     * thin wrapped around file descriptor and system calls
     * todo: ssl
@@ -165,9 +181,12 @@ namespace mongo {
    public:
        Socket(int sock, const SockAddr& farEnd);

-        // in some cases the timeout will actually be 2x this value - eg we do a partial send,
-        // then the timeout fires, then we try to send again, then the timeout fires again with
-        // no data sent, then we detect that the other side is down
+        /** In some cases the timeout will actually be 2x this value - eg we do a partial send,
+            then the timeout fires, then we try to send again, then the timeout fires again with
+            no data sent, then we detect that the other side is down.
+
+            Generally you don't want a timeout; if you set one, be prepared to handle errors.
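+            Concretely, with so_timeout = 10 a partial send can block for ten seconds and
+            return a short count, then the retried send can block for another ten, so
+            roughly twenty seconds may pass before the failure surfaces.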
+ */ Socket(double so_timeout = 0, int logLevel = 0 ); bool connect(SockAddr& farEnd); @@ -190,8 +209,32 @@ namespace mongo { void clearCounters() { _bytesIn = 0; _bytesOut = 0; } long long getBytesIn() const { return _bytesIn; } long long getBytesOut() const { return _bytesOut; } + + void setTimeout( double secs ); + +#ifdef MONGO_SSL + /** secures inline */ + void secure( SSLManager * ssl ); + void secureAccepted( SSLManager * ssl ); +#endif + + /** + * call this after a fork for server sockets + */ + void postFork(); + private: + void _init(); + /** raw send, same semantics as ::send */ + int _send( const char * data , int len ); + + /** sends dumbly, just each buffer at a time */ + void _send( const vector< pair< char *, int > > &data, const char *context ); + + /** raw recv, same semantics as ::recv */ + int _recv( char * buf , int max ); + int _fd; SockAddr _remote; double _timeout; @@ -199,6 +242,11 @@ namespace mongo { long long _bytesIn; long long _bytesOut; +#ifdef MONGO_SSL + shared_ptr<SSL> _ssl; + SSLManager * _sslAccepted; +#endif + protected: int _logLevel; // passed to log() when logging errors diff --git a/util/paths.h b/util/paths.h index 4ae591fb49b..2297a9a2f90 100644 --- a/util/paths.h +++ b/util/paths.h @@ -23,9 +23,9 @@ #include <sys/stat.h> #include <fcntl.h> -using namespace mongoutils; - namespace mongo { + + using namespace mongoutils; extern string dbpath; diff --git a/util/processinfo_darwin.cpp b/util/processinfo_darwin.cpp index c1190aec438..9f73cbffd4f 100644 --- a/util/processinfo_darwin.cpp +++ b/util/processinfo_darwin.cpp @@ -19,15 +19,14 @@ #include "processinfo.h" #include "log.h" - +#include <mach/vm_statistics.h> #include <mach/task_info.h> - #include <mach/mach_init.h> #include <mach/mach_host.h> #include <mach/mach_traps.h> #include <mach/task.h> #include <mach/vm_map.h> -#include <mach/shared_memory_server.h> +#include <mach/shared_region.h> #include <iostream> #include <sys/types.h> diff --git a/util/ramlog.cpp b/util/ramlog.cpp index f8cfa0a7052..69ffc175ee9 100644 --- a/util/ramlog.cpp +++ b/util/ramlog.cpp @@ -25,7 +25,7 @@ namespace mongo { using namespace mongoutils; - RamLog::RamLog( string name ) : _name(name) { + RamLog::RamLog( string name ) : _name(name), _lastWrite(0) { h = 0; n = 0; for( int i = 0; i < N; i++ ) lines[i][C-1] = 0; @@ -48,6 +48,8 @@ namespace mongo { } void RamLog::write(LogLevel ll, const string& str) { + _lastWrite = time(0); + char *p = lines[(h+n)%N]; unsigned sz = str.size(); @@ -183,4 +185,6 @@ namespace mongo { mongo::mutex* RamLog::_namedLock; RamLog::RM* RamLog::_named = 0; + + Tee* const warnings = new RamLog("warnings"); // Things put here go in serverStatus } diff --git a/util/ramlog.h b/util/ramlog.h index 8539a436388..d3d5c8fbb4e 100644 --- a/util/ramlog.h +++ b/util/ramlog.h @@ -34,6 +34,8 @@ namespace mongo { static RamLog* get( string name ); static void getNames( vector<string>& names ); + time_t lastWrite() { return _lastWrite; } // 0 if no writes + protected: static int repeats(const vector<const char *>& v, int i); static string clean(const vector<const char *>& v, int i, string line=""); @@ -57,6 +59,7 @@ namespace mongo { typedef map<string,RamLog*> RM; static mongo::mutex* _namedLock; static RM* _named; + time_t _lastWrite; }; } diff --git a/util/stringutils.h b/util/stringutils.h index bab9f608f7e..93598aa520b 100644 --- a/util/stringutils.h +++ b/util/stringutils.h @@ -40,7 +40,11 @@ namespace mongo { return string(copy); } - // for convenience, '{' is greater than anything and stops number 
parsing + /** + * Non numeric characters are compared lexicographically; numeric substrings + * are compared numerically; dots separate ordered comparable subunits. + * For convenience, character 255 is greater than anything else. + */ inline int lexNumCmp( const char *s1, const char *s2 ) { //cout << "START : " << s1 << "\t" << s2 << endl; @@ -48,6 +52,18 @@ namespace mongo { while( *s1 && *s2 ) { + bool d1 = ( *s1 == '.' ); + bool d2 = ( *s2 == '.' ); + if ( d1 && !d2 ) + return -1; + if ( d2 && !d1 ) + return 1; + if ( d1 && d2 ) { + ++s1; ++s2; + startWord = true; + continue; + } + bool p1 = ( *s1 == (char)255 ); bool p2 = ( *s2 == (char)255 ); //cout << "\t\t " << p1 << "\t" << p2 << endl; @@ -64,7 +80,6 @@ namespace mongo { if ( startWord ) { while ( *s1 == '0' ) s1++; while ( *s2 == '0' ) s2++; - startWord = false; } char * e1 = (char*)s1; @@ -94,6 +109,7 @@ namespace mongo { // otherwise, the numbers are equal s1 = e1; s2 = e2; + startWord = false; continue; } @@ -109,11 +125,8 @@ namespace mongo { if ( *s2 > *s1 ) return -1; - if ( *s1 == '.' ) - startWord = true; - else - startWord = false; s1++; s2++; + startWord = false; } if ( *s1 ) diff --git a/util/time_support.h b/util/time_support.h index ce2cdbc0e15..ca17807ec96 100644 --- a/util/time_support.h +++ b/util/time_support.h @@ -52,6 +52,16 @@ namespace mongo { return buf; } + inline string timeToISOString(time_t time) { + struct tm t; + time_t_to_Struct( time, &t ); + + const char* fmt = "%Y-%m-%dT%H:%M:%SZ"; + char buf[32]; + assert(strftime(buf, sizeof(buf), fmt, &t) == 20); + return buf; + } + inline boost::gregorian::date currentDate() { boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); return now.date(); diff --git a/util/timer.h b/util/timer.h index 9db907185dd..cbfe859ef5c 100644 --- a/util/timer.h +++ b/util/timer.h @@ -30,6 +30,8 @@ namespace mongo { Timer( unsigned long long startMicros ) { old = startMicros; } int seconds() const { return (int)(micros() / 1000000); } int millis() const { return (int)(micros() / 1000); } + int minutes() const { return seconds() / 60; } + /** gets time interval and resets at the same time. this way we can call curTimeMicros once instead of twice if one wanted millis() and then reset(). diff --git a/util/version.cpp b/util/version.cpp index f9c1471c88e..809f4cde3eb 100644 --- a/util/version.cpp +++ b/util/version.cpp @@ -26,6 +26,8 @@ #include "stringutils.h" #include "../db/jsobj.h" #include "file.h" +#include "ramlog.h" +#include "../db/cmdline.h" namespace mongo { @@ -36,7 +38,7 @@ namespace mongo { * 1.2.3-rc4-pre- * If you really need to do something else you'll need to fix _versionArray() */ - const char versionString[] = "1.9.1-pre-"; + const char versionString[] = "2.0.0-rc0-pre-"; // See unit test for example outputs static BSONArray _versionArray(const char* version){ @@ -114,35 +116,39 @@ namespace mongo { log() << "build info: " << sysInfo() << endl; } + + static Tee * startupWarningsLog = new RamLog("startupWarnings"); //intentionally leaked + // - // 32 bit systems warning + // system warnings // void show_warnings() { - // each message adds a leading but not a trailing newline + // each message adds a leading and a trailing newline bool warned = false; { const char * foo = strchr( versionString , '.' ) + 1; int bar = atoi( foo ); if ( ( 2 * ( bar / 2 ) ) != bar ) { - cout << "\n** NOTE: This is a development version (" << versionString << ") of MongoDB."; - cout << "\n** Not recommended for production." 
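/* The log() << ... << startupWarningsLog pattern used in this hunk tees each
   warning into the "startupWarnings" RamLog as well as the normal log, so the
   messages can still be read back from memory later (via
   RamLog::get("startupWarnings")) long after the startup output has scrolled by. */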
<< endl; + log() << startupWarningsLog; + log() << "** NOTE: This is a development version (" << versionString << ") of MongoDB." << startupWarningsLog; + log() << "** Not recommended for production." << startupWarningsLog; warned = true; } } if ( sizeof(int*) == 4 ) { - cout << endl; - cout << "** NOTE: when using MongoDB 32 bit, you are limited to about 2 gigabytes of data" << endl; - cout << "** see http://blog.mongodb.org/post/137788967/32-bit-limitations" << endl; - cout << "** with --journal, the limit is lower" << endl; + log() << startupWarningsLog; + log() << "** NOTE: when using MongoDB 32 bit, you are limited to about 2 gigabytes of data" << startupWarningsLog; + log() << "** see http://blog.mongodb.org/post/137788967/32-bit-limitations" << startupWarningsLog; + log() << "** with --journal, the limit is lower" << startupWarningsLog; warned = true; } #ifdef __linux__ if (boost::filesystem::exists("/proc/vz") && !boost::filesystem::exists("/proc/bc")) { - cout << endl; - cout << "** WARNING: You are running in OpenVZ. This is known to be broken!!!" << endl; + log() << startupWarningsLog; + log() << "** WARNING: You are running in OpenVZ. This is known to be broken!!!" << startupWarningsLog; warned = true; } @@ -172,22 +178,49 @@ namespace mongo { const char* space = strchr(line, ' '); if ( ! space ) { - cout << "** WARNING: cannot parse numa_maps" << endl; + log() << startupWarningsLog; + log() << "** WARNING: cannot parse numa_maps" << startupWarningsLog; warned = true; } else if ( ! startsWith(space+1, "interleave") ) { - cout << endl; - cout << "** WARNING: You are running on a NUMA machine." << endl; - cout << "** We suggest launching mongod like this to avoid performance problems:" << endl; - cout << "** numactl --interleave=all mongod [other options]" << endl; + log() << startupWarningsLog; + log() << "** WARNING: You are running on a NUMA machine." << startupWarningsLog; + log() << "** We suggest launching mongod like this to avoid performance problems:" << startupWarningsLog; + log() << "** numactl --interleave=all mongod [other options]" << startupWarningsLog; warned = true; } } } + + if (cmdLine.dur){ + fstream f ("/proc/sys/vm/overcommit_memory", ios_base::in); + unsigned val; + f >> val; + + if (val == 2) { + log() << startupWarningsLog; + log() << "** WARNING: /proc/sys/vm/overcommit_memory is " << val << startupWarningsLog; + log() << "** Journaling works best with it set to 0 or 1" << startupWarningsLog; + } + } + + if (boost::filesystem::exists("/proc/sys/vm/zone_reclaim_mode")){ + fstream f ("/proc/sys/vm/zone_reclaim_mode", ios_base::in); + unsigned val; + f >> val; + + if (val != 0) { + log() << startupWarningsLog; + log() << "** WARNING: /proc/sys/vm/zone_reclaim_mode is " << val << startupWarningsLog; + log() << "** We suggest setting it to 0" << startupWarningsLog; + log() << "** http://www.kernel.org/doc/Documentation/sysctl/vm.txt" << startupWarningsLog; + } + } #endif - if (warned) - cout << endl; + if (warned) { + log() << startupWarningsLog; + } } int versionCmp(StringData rhs, StringData lhs) { |
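/* For reference, the /proc checks above reduce to a small helper along these
   lines (readProcUnsigned is a hypothetical name, not something this change
   introduces):

       static unsigned readProcUnsigned(const char* path) { // hypothetical helper
           unsigned val = 0;                 // stays 0 if the read fails
           fstream f(path, ios_base::in);
           f >> val;
           return val;
       }

       // readProcUnsigned("/proc/sys/vm/overcommit_memory") : 0 or 1 preferred when journaling
       // readProcUnsigned("/proc/sys/vm/zone_reclaim_mode") : 0 preferred
*/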