author     Alberto Lerner <alerner@10gen.com>    2010-09-27 16:53:20 -0400
committer  Alberto Lerner <alerner@10gen.com>    2010-09-27 16:53:20 -0400
commit     f1afbf0f4e52d0b4fc487d41b1c8bc8743d89092 (patch)
tree       fa56f688e4306c407dff47b7c4f038d5f1a9e93a
parent     4b56d14a15baf9853df4c74a5fcfa0c32845842b (diff)
parent     6309d60b49739fdd22c70b8d255399b3281812be (diff)
download   mongo-f1afbf0f4e52d0b4fc487d41b1c8bc8743d89092.tar.gz
Merge branch 'master' of github.com:mongodb/mongo
-rw-r--r--  db/btree.cpp                   51
-rw-r--r--  db/btree.h                      5
-rw-r--r--  db/cap.cpp                     46
-rw-r--r--  db/clientcursor.h              19
-rw-r--r--  db/dbcommands.cpp              26
-rw-r--r--  db/dbcommands_admin.cpp         4
-rw-r--r--  db/diskloc.h                    2
-rw-r--r--  db/dur.cpp                     32
-rw-r--r--  db/dur.h                       68
-rw-r--r--  db/namespace.cpp               53
-rw-r--r--  db/namespace.h                 21
-rw-r--r--  db/pdfile.cpp                 102
-rw-r--r--  db/pdfile.h                    20
-rw-r--r--  db/query.cpp                    2
-rw-r--r--  db/rec.h                        9
-rw-r--r--  db/repl/manager.cpp             7
-rw-r--r--  db/repl/rs.h                    2
-rw-r--r--  db/repl/rs_initialsync.cpp      2
-rw-r--r--  db/repl/rs_rollback.cpp         2
-rwxr-xr-x  dbtests/mmaptests.cpp           3
-rw-r--r--  dbtests/namespacetests.cpp     10
-rw-r--r--  dbtests/test.vcxproj            3
-rwxr-xr-x  dbtests/test.vcxproj.filters    3
-rw-r--r--  jstests/evald.js                2
-rw-r--r--  jstests/geo_update2.js         40
-rw-r--r--  s/d_split.cpp                   4
-rw-r--r--  util/hashtab.h                 28
-rw-r--r--  util/message.h                  8
-rw-r--r--  util/mmap.h                    16
-rw-r--r--  util/mmap_win.cpp             123
30 files changed, 478 insertions(+), 235 deletions(-)
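
Most of the churn below is a mechanical rename: the nrecords and datasize counters of
NamespaceDetails move into a nested Stats struct so that a single dur::writing(&d->stats)
call can declare a write covering both. A standalone C++ sketch (not part of the commit)
of the layout assumption the new comment in db/namespace.h calls out:

    #include <cassert>
    #include <cstddef>

    // stand-in for the nested struct added to db/namespace.h
    struct Stats {
        long long datasize;   // "datasize and nrecords MUST Be adjacent code assumes!"
        long long nrecords;
    };

    int main() {
        // one declared-write region of sizeof(Stats) bytes covers every stats
        // update, but only because the two fields are contiguous in memory
        assert( offsetof(Stats, nrecords) == sizeof(long long) );
        assert( sizeof(Stats) == 2 * sizeof(long long) );
        return 0;
    }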
diff --git a/db/btree.cpp b/db/btree.cpp
index 1528951a047..43b155aa16d 100644
--- a/db/btree.cpp
+++ b/db/btree.cpp
@@ -31,6 +31,12 @@ namespace mongo {
 
 #define VERIFYTHISLOC dassert( thisLoc.btree() == this );
 
+    BtreeBucket* DiskLoc::btreemod() const {
+        assert( _a != -1 );
+        BtreeBucket *b = (BtreeBucket*) btreeStore->get(*this, BucketSize);
+        return dur::writing(b);
+    }
+
     KeyNode::KeyNode(const BucketBasics& bb, const _KeyNode &k) :
         prevChildBucket(k.prevChildBucket),
         recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs())
@@ -50,8 +56,8 @@ namespace mongo {
     /* BucketBasics --------------------------------------------------- */
 
     inline void BucketBasics::modified(const DiskLoc& thisLoc) {
-        VERIFYTHISLOC
-        btreeStore->modified(thisLoc);
+//        VERIFYTHISLOC
+//        btreeStore->modified(thisLoc);
     }
 
     int BucketBasics::Size() const {
@@ -222,6 +228,7 @@ namespace mongo {
        the keynodes grow from the front.
     */
     inline int BucketBasics::_alloc(int bytes) {
+        dur::assertWriting(this);
         topSize += bytes;
         emptySize -= bytes;
         int ofs = totalDataSize() - topSize;
@@ -284,7 +291,6 @@ namespace mongo {
 
     /* insert a key in a bucket with no complexity -- no splits required */
     bool BucketBasics::basicInsert(const DiskLoc& thisLoc, int &keypos, const DiskLoc& recordLoc, const BSONObj& key, const Ordering &order) {
-        modified(thisLoc);
         assert( keypos >= 0 && keypos <= n );
         int bytesNeeded = key.objsize() + sizeof(_KeyNode);
         if ( bytesNeeded > emptySize ) {
@@ -294,13 +300,16 @@ namespace mongo {
         }
         for ( int j = n; j > keypos; j-- ) // make room
             k(j) = k(j-1);
-        n++;
-        emptySize -= sizeof(_KeyNode);
-        _KeyNode& kn = k(keypos);
+
+        BucketBasics *b = this;//dur::writing(this);
+
+        b->n++;
+        b->emptySize -= sizeof(_KeyNode);
+        _KeyNode& kn = b->k(keypos);
         kn.prevChildBucket.Null();
         kn.recordLoc = recordLoc;
-        kn.setKeyDataOfs((short) _alloc(key.objsize()) );
-        char *p = dataAt(kn.keyDataOfs());
+        kn.setKeyDataOfs((short) b->_alloc(key.objsize()) );
+        char *p = b->dataAt(kn.keyDataOfs());
         memcpy(p, key.objdata(), key.objsize());
         return true;
     }
@@ -633,7 +642,7 @@ found:
         bool found;
         DiskLoc loc = locate(id, thisLoc, key, Ordering::make(id.keyPattern()), pos, found, recordLoc, 1);
         if ( found ) {
-            loc.btree()->delKeyAtPos(loc, id, pos);
+            loc.btreemod()->delKeyAtPos(loc, id, pos);
             return true;
         }
         return false;
@@ -661,16 +670,10 @@ found:
             fix(thisLoc, k(i).prevChildBucket);
     }
 
-    /* insert a key in this bucket, splitting if necessary.
-       keypos - where to insert the key i3n range 0..n.  0=make leftmost, n=make rightmost.
-       NOTE this function may free some data, and as a result the value passed for keypos may
-       be invalid after calling insertHere()
-    */
-    void BtreeBucket::insertHere(DiskLoc thisLoc, int keypos,
+    void BtreeBucket::_insertHere(DiskLoc thisLoc, int keypos,
                                 DiskLoc recordLoc, const BSONObj& key, const Ordering& order,
                                 DiskLoc lchild, DiskLoc rchild, IndexDetails& idx)
     {
-        modified(thisLoc);
         if ( insert_debug )
             out() << "    " << thisLoc.toString() << ".insertHere " << key.toString() << '/' << recordLoc.toString() << ' '
                   << lchild.toString() << ' ' << rchild.toString() << " keypos:" << keypos << endl;
@@ -806,6 +809,20 @@ found:
             out() << "     split end " << hex << thisLoc.getOfs() << dec << endl;
     }
 
+    /* insert a key in this bucket, splitting if necessary.
+       keypos - where to insert the key i3n range 0..n.  0=make leftmost, n=make rightmost.
+       NOTE this function may free some data, and as a result the value passed for keypos may
+       be invalid after calling insertHere()
+    */
+    void BtreeBucket::insertHere(DiskLoc thisLoc, int keypos,
+                                DiskLoc recordLoc, const BSONObj& key, const Ordering& order,
+                                DiskLoc lchild, DiskLoc rchild, IndexDetails& idx)
+    {
+        modified(thisLoc);
+        BtreeBucket *b = dur::writing(this);
+        b->_insertHere(thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx);
+    }
+
     /* start a new index off, empty */
     DiskLoc BtreeBucket::addBucket(IndexDetails& id) {
         DiskLoc loc = btreeStore->insert(id.indexNamespace().c_str(), 0, BucketSize, true);
@@ -1247,7 +1264,7 @@ namespace mongo {
         while( 1 ) {
             if( loc.btree()->tempNext().isNull() ) {
                 // only 1 bucket at this level. we are done.
-                idx.head = loc;
+                dur::writingDiskLoc(idx.head) = loc;
                 break;
             }
             levels++;
diff --git a/db/btree.h b/db/btree.h
index bb31081b00d..63b1b1aa148 100644
--- a/db/btree.h
+++ b/db/btree.h
@@ -253,9 +253,14 @@ namespace mongo {
             return keyOfs >= n ? BSONObj() : keyNode(keyOfs).key;
         }
         static BtreeBucket* allocTemp(); /* caller must release with free() */
+
+        void _insertHere(DiskLoc thisLoc, int keypos,
+                        DiskLoc recordLoc, const BSONObj& key, const Ordering &order,
+                        DiskLoc lchild, DiskLoc rchild, IndexDetails&);
         void insertHere(DiskLoc thisLoc, int keypos,
                         DiskLoc recordLoc, const BSONObj& key, const Ordering &order,
                         DiskLoc lchild, DiskLoc rchild, IndexDetails&);
+
         int _insert(DiskLoc thisLoc, DiskLoc recordLoc,
                     const BSONObj& key, const Ordering &order, bool dupsAllowed,
                     DiskLoc lChild, DiskLoc rChild, IndexDetails&);
diff --git a/db/cap.cpp b/db/cap.cpp
index e80f27eb873..c36e57c1b58 100644
--- a/db/cap.cpp
+++ b/db/cap.cpp
@@ -63,7 +63,8 @@ namespace mongo {
         DiskLoc i = cappedFirstDeletedInCurExtent();
         for (; !i.isNull() && inCapExtent( i ); i = i.drec()->nextDeleted )
             drecs.push_back( i );
-        cappedFirstDeletedInCurExtent() = i;
+
+        dur::writingDiskLoc( cappedFirstDeletedInCurExtent() ) = i;
 
         // This is the O(n^2) part.
         drecs.sort();
@@ -81,7 +82,7 @@ namespace mongo {
             DiskLoc b = *j;
             while ( a.a() == b.a() && a.getOfs() + a.drec()->lengthWithHeaders == b.getOfs() ) {
                 // a & b are adjacent.  merge.
-                a.drec()->lengthWithHeaders += b.drec()->lengthWithHeaders;
+                dur::writingInt( a.drec()->lengthWithHeaders ) += b.drec()->lengthWithHeaders;
                 j++;
                 if ( j == drecs.end() ) {
                     DEBUGGING out() << "temp: compact adddelrec2\n";
@@ -107,8 +108,8 @@ namespace mongo {
         // migrate old NamespaceDetails format
         assert( capped );
         if ( capExtent.a() == 0 && capExtent.getOfs() == 0 ) {
-            capFirstNewRecord = DiskLoc();
-            capFirstNewRecord.setInvalid();
+            //capFirstNewRecord = DiskLoc();
+            capFirstNewRecord.writing().setInvalid();
             // put all the DeletedRecords in cappedListOfAllDeletedRecords()
             for ( int i = 1; i < Buckets; ++i ) {
                 DiskLoc first = deletedList[ i ];
@@ -116,14 +117,14 @@ namespace mongo {
                     continue;
                 DiskLoc last = first;
                 for (; !last.drec()->nextDeleted.isNull(); last = last.drec()->nextDeleted );
-                last.drec()->nextDeleted = cappedListOfAllDeletedRecords();
-                cappedListOfAllDeletedRecords() = first;
-                deletedList[ i ] = DiskLoc();
+                last.drec()->nextDeleted.writing() = cappedListOfAllDeletedRecords();
+                cappedListOfAllDeletedRecords().writing() = first;
+                deletedList[i].writing() = DiskLoc();
             }
             // NOTE cappedLastDelRecLastExtent() set to DiskLoc() in above
 
             // Last, in case we're killed before getting here
-            capExtent = firstExtent;
+            capExtent.writing() = firstExtent;
         }
     }
 
@@ -145,20 +146,20 @@ namespace mongo {
         // We want cappedLastDelRecLastExtent() to be the last DeletedRecord of the prev cap extent
         // (or DiskLoc() if new capExtent == firstExtent)
         if ( capExtent == lastExtent )
-            cappedLastDelRecLastExtent() = DiskLoc();
+            dur::writingDiskLoc( cappedLastDelRecLastExtent() ) = DiskLoc();
         else {
             DiskLoc i = cappedFirstDeletedInCurExtent();
             for (; !i.isNull() && nextIsInCapExtent( i ); i = i.drec()->nextDeleted );
-            cappedLastDelRecLastExtent() = i;
+            dur::writingDiskLoc( cappedLastDelRecLastExtent() ) = i;
         }
 
-        capExtent = theCapExtent()->xnext.isNull() ? firstExtent : theCapExtent()->xnext;
+        dur::writingDiskLoc( capExtent ) = theCapExtent()->xnext.isNull() ? firstExtent : theCapExtent()->xnext;
 
         /* this isn't true if a collection has been renamed...that is ok just used for diagnostics */
         //dassert( theCapExtent()->ns == ns );
 
         theCapExtent()->assertOk();
-        capFirstNewRecord = DiskLoc();
+        dur::writingDiskLoc( capFirstNewRecord ) = DiskLoc();
     }
 
     DiskLoc NamespaceDetails::__capAlloc( int len ) {
@@ -177,10 +178,10 @@ namespace mongo {
         /* unlink ourself from the deleted list */
         if ( !ret.isNull() ) {
             if ( prev.isNull() )
-                cappedListOfAllDeletedRecords() = ret.drec()->nextDeleted;
+                cappedListOfAllDeletedRecords().writing() = ret.drec()->nextDeleted;
             else
-                prev.drec()->nextDeleted = ret.drec()->nextDeleted;
-            ret.drec()->nextDeleted.setInvalid(); // defensive.
+                prev.drec()->nextDeleted.writing() = ret.drec()->nextDeleted;
+            ret.drec()->nextDeleted.writing().setInvalid(); // defensive.
             assert( ret.drec()->extentOfs < ret.getOfs() );
         }
 
@@ -190,7 +191,7 @@ namespace mongo {
     DiskLoc NamespaceDetails::cappedAlloc(const char *ns, int len) {
         // signal done allocating new extents.
         if ( !cappedLastDelRecLastExtent().isValid() )
-            cappedLastDelRecLastExtent() = DiskLoc();
+            dur::writingDiskLoc( cappedLastDelRecLastExtent() ) = DiskLoc();
 
         assert( len < 400000000 );
         int passes = 0;
@@ -209,7 +210,7 @@ namespace mongo {
         theCapExtent()->assertOk();
         DiskLoc firstEmptyExtent;
         while ( 1 ) {
-            if ( nrecords < max ) {
+            if ( stats.nrecords < max ) {
                 loc = __capAlloc( len );
                 if ( !loc.isNull() )
                     break;
@@ -218,8 +219,9 @@ namespace mongo {
             // If on first iteration through extents, don't delete anything.
             if ( !capFirstNewRecord.isValid() ) {
                 advanceCapExtent( ns );
+
                 if ( capExtent != firstExtent )
-                    capFirstNewRecord.setInvalid();
+                    capFirstNewRecord.writing().setInvalid();
                 // else signal done with first iteration through extents.
                 continue;
             }
@@ -248,14 +250,14 @@ namespace mongo {
             compact();
             if( ++passes > maxPasses ) {
                 log() << "passes ns:" << ns << " len:" << len << " maxPasses: " << maxPasses << '\n';
-                log() << "passes max:" << max << " nrecords:" << nrecords << " datasize: " << datasize << endl;
+                log() << "passes max:" << max << " nrecords:" << stats.nrecords << " datasize: " << stats.datasize << endl;
                 massert( 10345 ,  "passes >= maxPasses in capped collection alloc", false );
             }
         }
 
         // Remember first record allocated on this iteration through capExtent.
         if ( capFirstNewRecord.isValid() && capFirstNewRecord.isNull() )
-            capFirstNewRecord = loc;
+            dur::writingDiskLoc(capFirstNewRecord) = loc;
 
         return loc;
     }
@@ -301,7 +303,7 @@ namespace mongo {
             }
         }
 
-        uassert( 13415, "emptying the collection is not allowed", nrecords > 1 );
+        uassert( 13415, "emptying the collection is not allowed", stats.nrecords > 1 );
 
         if ( !capLooped() ) {
             theDataFileMgr.deleteRecord(ns, curr.rec(), curr, true);
@@ -363,7 +365,7 @@ namespace mongo {
         // preserve firstExtent/lastExtent
         capExtent = firstExtent;
-        datasize = nrecords = 0;
+        stats.datasize = stats.nrecords = 0;
         // lastExtentSize preserve
         // nIndexes preserve 0
         // capped preserve true
diff --git a/db/clientcursor.h b/db/clientcursor.h
index 729b5a5b412..08e3311d1a7 100644
--- a/db/clientcursor.h
+++ b/db/clientcursor.h
@@ -292,33 +292,22 @@ namespace mongo {
         void storeOpForSlave( DiskLoc last );
         void updateSlaveLocation( CurOp& curop );
 
-        unsigned idleTime(){
-            return _idleAgeMillis;
-        }
+        unsigned idleTime() const { return _idleAgeMillis; }
 
         static void idleTimeReport(unsigned millis);
     private:
         // cursors normally timeout after an inactivy period to prevent excess memory use
         // setting this prevents timeout of the cursor in question.
-        void noTimeout() {
-            _pinValue++;
-        }
+        void noTimeout() { _pinValue++; }
 
-        multimap<DiskLoc, ClientCursor*>& byLoc() {
-            return _db->ccByLoc;
-        }
+        multimap<DiskLoc, ClientCursor*>& byLoc() { return _db->ccByLoc; }
     public:
-        void setDoingDeletes( bool doingDeletes ){
-            _doingDeletes = doingDeletes;
-        }
+        void setDoingDeletes( bool doingDeletes ) {_doingDeletes = doingDeletes; }
 
         static void appendStats( BSONObjBuilder& result );
-
         static unsigned numCursors() { return clientCursorsById.size(); }
-
         static void informAboutToDeleteBucket(const DiskLoc& b);
         static void aboutToDelete(const DiskLoc& dl);
-
         static void find( const string& ns , set<CursorId>& all );
     };
diff --git a/db/dbcommands.cpp b/db/dbcommands.cpp
index b9486032c09..28f0ebb6705 100644
--- a/db/dbcommands.cpp
+++ b/db/dbcommands.cpp
@@ -954,7 +954,7 @@ namespace mongo {
             Client::Context ctx( ns );
 
             NamespaceDetails *d = nsdetails(ns.c_str());
-            if ( ! d || d->nrecords == 0 ){
+            if ( ! d || d->stats.nrecords == 0 ){
                 result.appendNumber( "size" , 0 );
                 result.appendNumber( "numObjects" , 0 );
                 result.append( "millis" , timer.millis() );
@@ -966,8 +966,8 @@ namespace mongo {
             shared_ptr<Cursor> c;
             if ( min.isEmpty() && max.isEmpty() ) {
                 if ( estimate ){
-                    result.appendNumber( "size" , d->datasize );
-                    result.appendNumber( "numObjects" , d->nrecords );
+                    result.appendNumber( "size" , d->stats.datasize );
+                    result.appendNumber( "numObjects" , d->stats.nrecords );
                     result.append( "millis" , timer.millis() );
                     return 1;
                 }
@@ -985,7 +985,7 @@ namespace mongo {
                 c.reset( new BtreeCursor( d, d->idxNo(*idx), *idx, min, max, false, 1 ) );
             }
 
-            long long avgObjSize = d->datasize / d->nrecords;
+            long long avgObjSize = d->stats.datasize / d->stats.nrecords;
 
             long long maxSize = jsobj["maxSize"].numberLong();
             long long maxObjects = jsobj["maxObjects"].numberLong();
@@ -1043,9 +1043,9 @@ namespace mongo {
                     log() << "error: have index [" << collNS << "] but no NamespaceDetails" << endl;
                     continue;
                 }
-                totalSize += mine->datasize;
+                totalSize += mine->stats.datasize;
                 if ( details )
-                    details->appendNumber( d.indexName() , mine->datasize / scale );
+                    details->appendNumber( d.indexName() , mine->stats.datasize / scale );
             }
             return totalSize;
         }
@@ -1085,10 +1085,10 @@ namespace mongo {
                 return false;
             }
 
-            long long size = nsd->datasize / scale;
-            result.appendNumber( "count" , nsd->nrecords );
+            long long size = nsd->stats.datasize / scale;
+            result.appendNumber( "count" , nsd->stats.nrecords );
             result.appendNumber( "size" , size );
-            result.append      ( "avgObjSize" , double(size) / double(nsd->nrecords) );
+            result.append      ( "avgObjSize" , double(size) / double(nsd->stats.nrecords) );
             int numExtents;
             result.appendNumber( "storageSize" , nsd->storageSize( &numExtents ) / scale );
             result.append( "numExtents" , numExtents );
@@ -1143,8 +1143,8 @@ namespace mongo {
                 }
 
                 ncollections += 1;
-                objects += nsd->nrecords;
-                size += nsd->datasize;
+                objects += nsd->stats.nrecords;
+                size += nsd->stats.datasize;
 
                 int temp;
                 storageSize += nsd->storageSize( &temp );
@@ -1191,7 +1191,7 @@ namespace mongo {
             string toNs = dbname + "." + to;
             NamespaceDetails *nsd = nsdetails( fromNs.c_str() );
             massert( 10301 ,  "source collection " + fromNs + " does not exist", nsd );
-            long long excessSize = nsd->datasize - size * 2; // datasize and extentSize can't be compared exactly, so add some padding to 'size'
+            long long excessSize = nsd->stats.datasize - size * 2; // datasize and extentSize can't be compared exactly, so add some padding to 'size'
             DiskLoc extent = nsd->firstExtent;
             for( ; excessSize > extent.ext()->length && extent != nsd->lastExtent; extent = extent.ext()->xnext ) {
                 excessSize -= extent.ext()->length;
@@ -1485,6 +1485,8 @@ namespace mongo {
 
                 if (!cc->yieldSometimes())
                     break;
+
+                RARELY killCurrentOp.checkForInterrupt();
             }
 
             BSONArrayBuilder b( result.subarrayStart( "values" ) );
diff --git a/db/dbcommands_admin.cpp b/db/dbcommands_admin.cpp
index b7ad9602c5f..a37bb4f6ce1 100644
--- a/db/dbcommands_admin.cpp
+++ b/db/dbcommands_admin.cpp
@@ -128,7 +128,7 @@ namespace mongo {
                     ss << " extent asserted ";
                 }
 
-                ss << " datasize?:" << d->datasize << " nrecords?:" << d->nrecords << " lastExtentSize:" << d->lastExtentSize << '\n';
+                ss << " datasize?:" << d->stats.datasize << " nrecords?:" << d->stats.nrecords << " lastExtentSize:" << d->lastExtentSize << '\n';
                 ss << " padding:" << d->paddingFactor << '\n';
 
                 try {
@@ -175,7 +175,7 @@ namespace mongo {
                         else ss << " (OK)";
                         ss << '\n';
                     }
-                    ss << "  " << n << " objects found, nobj:" << d->nrecords << '\n';
+                    ss << "  " << n << " objects found, nobj:" << d->stats.nrecords << '\n';
                     ss << "  " << len << " bytes data w/headers\n";
                     ss << "  " << nlen << " bytes data wout/headers\n";
                 }
diff --git a/db/diskloc.h b/db/diskloc.h
index 1f06fc58778..0f675c25d90 100644
--- a/db/diskloc.h
+++ b/db/diskloc.h
@@ -127,6 +127,8 @@ namespace mongo {
             return compare(b) < 0;
         }
 
+        DiskLoc& writing(); // see dur.h
+
         /* Get the "thing" associated with this disk location.
            it is assumed the object is what you say it is -- you must assure that
            (think of this as an unchecked type cast)
diff --git a/db/dur.cpp b/db/dur.cpp
new file mode 100644
index 00000000000..3d63b5003f4
--- /dev/null
+++ b/db/dur.cpp
@@ -0,0 +1,32 @@
+// @file dur.cpp
+
+#include "pch.h"
+#include "dur.h"
+#include "../util/mmap.h"
+
+namespace mongo {
+
+    namespace dur {
+
+#if defined(_DEBUG) && defined(_DURABLE)
+
+        void* writingPtr(void *x, size_t len) {
+            cout << "TEMP writing " << x << ' ' << len << endl;
+            return MemoryMappedFile::getWriteViewFor(x);
+        }
+
+        void assertReading(void *p) {
+            assert( MemoryMappedFile::getWriteViewFor(p) !=
+                    p );
+        }
+        void assertWriting(void *p) {
+            // todo:
+            //assert( MemoryMappedFile::getWriteViewFor(p) ==
+            //        p );
+        }
+
+#endif
+
+    }
+
+}
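
The _DEBUG/_DURABLE build above maps the data files read-only and routes declared writes
through a second, writable view, so a forgotten writing...() call faults immediately. A
minimal POSIX sketch of that dual-view trick (assumed file name, not part of this commit;
mmap_win.cpp below implements the Windows equivalent with MapViewOfFile):

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/mman.h>
    #include <cassert>
    #include <cstddef>

    int main() {
        // both views are backed by the same file, like the two MapViewOfFile
        // calls made against one file mapping in mmap_win.cpp
        int fd = open("datafile.tmp", O_RDWR | O_CREAT | O_TRUNC, 0600);
        assert( fd >= 0 );
        const size_t len = 4096;
        int rc = ftruncate(fd, (off_t) len);
        assert( rc == 0 );

        char *ro = (char*) mmap(0, len, PROT_READ, MAP_SHARED, fd, 0);              // normal access
        char *w  = (char*) mmap(0, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); // declared writes
        assert( ro != (char*) MAP_FAILED && w != (char*) MAP_FAILED );

        w[10] = 'x';              // the moral equivalent of writing through writingPtr()
        assert( ro[10] == 'x' );  // both views see the same page
        // ro[10] = 'y';          // would SIGSEGV: an undeclared write, which is the point

        munmap(ro, len); munmap(w, len); close(fd); unlink("datafile.tmp");
        return 0;
    }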
diff --git a/db/dur.h b/db/dur.h
new file mode 100644
index 00000000000..c139bedb59a
--- /dev/null
+++ b/db/dur.h
@@ -0,0 +1,68 @@
+// @file dur.h durability support
+
+#pragma once
+
+#include "diskloc.h"
+
+namespace mongo {
+
+    namespace dur {
+
+        /** call writing...() to declare "i'm about to write to x and it should be logged for redo."
+
+            failure to call writing...() is checked in _DEBUG mode by using a read only mapped view
+            (i.e., you'll segfault if you don't...)
+        */
+
+
+#if !defined(_DURABLE)
+
+        inline void* writingPtr(void *x, size_t len) { return x; }
+        inline DiskLoc& writingDiskLoc(DiskLoc& d) { return d; }
+        inline int& writingInt(int& d) { return d; }
+        template <typename T> inline T* writing(T *x) { return x; }
+        inline void assertReading(void *p) { }
+        inline void assertWriting(void *p) { }
+
+#else
+
+        void* writingPtr(void *x, size_t len);
+
+        inline DiskLoc& writingDiskLoc(DiskLoc& d) {
+#if defined(_DEBUG)
+            return *((DiskLoc*) writingPtr(&d, sizeof(d)));
+#else
+            return d;
+#endif
+        }
+
+        inline int& writingInt(int& d) {
+#if defined(_DEBUG)
+            return *((int*) writingPtr(&d, sizeof(d)));
+#else
+            return d;
+#endif
+        }
+
+        template <typename T>
+        inline
+        T* writing(T *x) {
+#if defined(_DEBUG)
+            return (T*) writingPtr(x, sizeof(T));
+#else
+            return x;
+#endif
+        }
+
+        void assertReading(void *p);
+        void assertWriting(void *p);
+
+#endif
+
+    }
+
+    inline DiskLoc& DiskLoc::writing() {
+        return dur::writingDiskLoc(*this);
+    }
+
+}
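
What a caller of the header above looks like under the new contract: declare the write,
then write through the pointer or reference you get back. The helper function and its
arguments here are invented for illustration; only the dur:: calls and DiskLoc::writing()
come from this commit:

    #include "db/dur.h"

    namespace mongo {
        // hypothetical example, not code from this commit
        void fixHead(DiskLoc& head, const DiskLoc& newHead, int& counter) {
            // under _DURABLE the default view is read-only, so a plain
            // "head = newHead;" would fault; declare the write first
            dur::writingDiskLoc(head) = newHead;   // or: head.writing() = newHead;
            dur::writingInt(counter)++;            // same contract for a plain int
        }
    }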
diff --git a/db/namespace.cpp b/db/namespace.cpp
index 682300a7dd8..0e183680fea 100644
--- a/db/namespace.cpp
+++ b/db/namespace.cpp
@@ -47,7 +47,7 @@ namespace mongo {
     NamespaceDetails::NamespaceDetails( const DiskLoc &loc, bool _capped ) {
         /* be sure to initialize new fields here -- doesn't default to zeroes the way we use it */
         firstExtent = lastExtent = capExtent = loc;
-        datasize = nrecords = 0;
+        stats.datasize = stats.nrecords = 0;
         lastExtentSize = 0;
         nIndexes = 0;
         capped = _capped;
@@ -181,41 +181,43 @@ namespace mongo {
     }
 
     void NamespaceDetails::addDeletedRec(DeletedRecord *d, DiskLoc dloc) {
+        dur::assertReading(this);
         BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) <= sizeof(NamespaceDetails) );
+
+        dassert( dloc.drec() == d );
+        //DeletedRecord *dold = d;
+        d = dur::writing(d);
         {
             // defensive code: try to make us notice if we reference a deleted record
             (unsigned&) (((Record *) d)->data) = 0xeeeeeeee;
         }
-        dassert( dloc.drec() == d );
-        DEBUGGING out() << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs << endl;
+        DEBUGGING log() << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs << endl;
         if ( capped ) {
             if ( !cappedLastDelRecLastExtent().isValid() ) {
                 // Initial extent allocation.  Insert at end.
                 d->nextDeleted = DiskLoc();
                 if ( cappedListOfAllDeletedRecords().isNull() )
-                    cappedListOfAllDeletedRecords() = dloc;
+                    dur::writingDiskLoc( cappedListOfAllDeletedRecords() ) = dloc;
                 else {
                     DiskLoc i = cappedListOfAllDeletedRecords();
-                    for (; !i.drec()->nextDeleted.isNull(); i = i.drec()->nextDeleted );
-                    i.drec()->nextDeleted = dloc;
+                    for (; !i.drec()->nextDeleted.isNull(); i = i.drec()->nextDeleted )
+                        ;
+                    i.drec()->nextDeleted.writing() = dloc;
                 }
             }
             else {
                 d->nextDeleted = cappedFirstDeletedInCurExtent();
-                cappedFirstDeletedInCurExtent() = dloc;
+                dur::writingDiskLoc( cappedFirstDeletedInCurExtent() ) = dloc;
                 // always compact() after this so order doesn't matter
             }
         }
         else {
             int b = bucket(d->lengthWithHeaders);
             DiskLoc& list = deletedList[b];
             DiskLoc oldHead = list;
-            list = dloc;
+            dur::writingDiskLoc(list) = dloc;
             d->nextDeleted = oldHead;
         }
     }
 
-    /*
-       lenToAlloc is WITH header
-    */
+    // lenToAlloc is WITH header
     DiskLoc NamespaceDetails::alloc(const char *ns, int lenToAlloc, DiskLoc& extentLoc) {
         lenToAlloc = (lenToAlloc + 3) & 0xfffffffc;
         DiskLoc loc = _alloc(ns, lenToAlloc);
@@ -223,6 +225,7 @@ namespace mongo {
             return loc;
 
         DeletedRecord *r = loc.drec();
+        r = dur::writing(r);
 
         /* note we want to grab from the front so our next pointers on disk tend
            to go in a forward direction which is important for performance. */
@@ -247,9 +250,10 @@ namespace mongo {
         DiskLoc newDelLoc = loc;
         newDelLoc.inc(lenToAlloc);
         DeletedRecord *newDel = DataFileMgr::makeDeletedRecord(newDelLoc, left);
-        newDel->extentOfs = r->extentOfs;
-        newDel->lengthWithHeaders = left;
-        newDel->nextDeleted.Null();
+        DeletedRecord *newDelW = dur::writing(newDel);
+        newDelW->extentOfs = r->extentOfs;
+        newDelW->lengthWithHeaders = left;
+        newDelW->nextDeleted.Null();
 
         addDeletedRec(newDel, newDelLoc);
 
@@ -323,8 +327,8 @@ namespace mongo {
 
         /* unlink ourself from the deleted list */
         {
-            DeletedRecord *bmr = bestmatch.drec();
-            *bestprev = bmr->nextDeleted;
+            DeletedRecord *bmr = dur::writing(bestmatch.drec());
+            *dur::writing(bestprev) = bmr->nextDeleted;
             bmr->nextDeleted.setInvalid(); // defensive.
             assert(bmr->extentOfs < bestmatch.getOfs());
         }
@@ -394,6 +398,21 @@ namespace mongo {
         return cappedAlloc(ns,len);
     }
 
+    void NamespaceIndex::kill_ns(const char *ns) {
+        if ( !ht )
+            return;
+        Namespace n(ns);
+        ht->kill(n);
+
+        for( int i = 0; i<=1; i++ ) {
+            try {
+                Namespace extra(n.extraName(i).c_str());
+                ht->kill(extra);
+            }
+            catch(DBException&) { }
+        }
+    }
+
     /* extra space for indexes when more than 10 */
     NamespaceDetails::Extra* NamespaceIndex::newExtra(const char *ns, int i, NamespaceDetails *d) {
         assert( i >= 0 && i <= 1 );
@@ -440,7 +459,7 @@ namespace mongo {
             id = &idx(nIndexes,false);
         }
 
-        nIndexes++;
+        (*dur::writing(&nIndexes))++;
         if ( resetTransient )
             NamespaceDetailsTransient::get_w(thisns).addedIndex();
         return *id;
diff --git a/db/namespace.h b/db/namespace.h
index 4f6cde9ac8e..7479a21da24 100644
--- a/db/namespace.h
+++ b/db/namespace.h
@@ -125,8 +125,10 @@ namespace mongo {
         */
         DiskLoc deletedList[Buckets]; // ofs 168 (8 byte aligned)
-        long long datasize;
-        long long nrecords;
+        struct Stats {
+            long long datasize; //datasize and nrecords MUST Be adjacent code assumes!
+            long long nrecords;
+        } stats;
         int lastExtentSize;
         int nIndexes;
     private:
@@ -503,20 +505,7 @@ namespace mongo {
             return d;
         }
 
-        void kill_ns(const char *ns) {
-            if ( !ht )
-                return;
-            Namespace n(ns);
-            ht->kill(n);
-
-            for( int i = 0; i<=1; i++ ) {
-                try {
-                    Namespace extra(n.extraName(i).c_str());
-                    ht->kill(extra);
-                }
-                catch(DBException&) { }
-            }
-        }
+        void kill_ns(const char *ns);
 
         bool find(const char *ns, DiskLoc& loc) {
             NamespaceDetails *l = details(ns);
diff --git a/db/pdfile.cpp b/db/pdfile.cpp
index cedcc2aafc6..8e75ce41925 100644
--- a/db/pdfile.cpp
+++ b/db/pdfile.cpp
@@ -396,24 +396,25 @@ namespace mongo {
     }
 
     void addNewExtentToNamespace(const char *ns, Extent *e, DiskLoc eloc, DiskLoc emptyLoc, bool capped) {
-        DiskLoc oldExtentLoc;
         NamespaceIndex *ni = nsindex(ns);
         NamespaceDetails *details = ni->details(ns);
         if ( details ) {
             assert( !details->lastExtent.isNull() );
             assert( !details->firstExtent.isNull() );
-            e->xprev = details->lastExtent;
-            details->lastExtent.ext()->xnext = eloc;
+            dur::writingDiskLoc(e->xprev) = details->lastExtent;
+            dur::writingDiskLoc(details->lastExtent.ext()->xnext) = eloc;
             assert( !eloc.isNull() );
-            details->lastExtent = eloc;
+            dur::writingDiskLoc(details->lastExtent) = eloc;
         }
         else {
             ni->add_ns(ns, eloc, capped);
             details = ni->details(ns);
         }
 
-        details->lastExtentSize = e->length;
-        DEBUGGING out() << "temp: newextent adddelrec " << ns << endl;
+        {
+            NamespaceDetails *dw = dur::writing(details);
+            dw->lastExtentSize = e->length;
+        }
         details->addDeletedRec(emptyLoc.drec(), emptyLoc);
     }
@@ -434,11 +435,13 @@ namespace mongo {
             return cc().database()->addAFile( 0, true )->createExtent(ns, approxSize, newCapped, loops+1);
         }
         int offset = header->unused.getOfs();
-        header->unused.set( fileNo, offset + ExtentSize );
-        header->unusedLength -= ExtentSize;
+
+        DataFileHeader *h = dur::writing(header);
+        h->unused.set( fileNo, offset + ExtentSize );
+        h->unusedLength -= ExtentSize;
         loc.set(fileNo, offset);
         Extent *e = _getExtent(loc);
-        DiskLoc emptyLoc = e->init(ns, ExtentSize, fileNo, offset);
+        DiskLoc emptyLoc = dur::writing(e)->init(ns, ExtentSize, fileNo, offset);
 
         addNewExtentToNamespace(ns, e, loc, emptyLoc, newCapped);
@@ -553,9 +556,7 @@ namespace mongo {
         emptyLoc.inc( (int) (_extentData-(char*)this) );
 
         int l = _length - (_extentData - (char *) this);
-        //DeletedRecord *empty1 = (DeletedRecord *) extentData;
-        DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, l);
-        //assert( empty == empty1 );
+        DeletedRecord *empty = dur::writing( DataFileMgr::makeDeletedRecord(emptyLoc, l) );
         empty->lengthWithHeaders = l;
         empty->extentOfs = myLoc.getOfs();
         return emptyLoc;
@@ -750,12 +751,11 @@ namespace mongo {
         else {
             DiskLoc a = freeExtents->firstExtent;
             assert( a.ext()->xprev.isNull() );
-            a.ext()->xprev = d->lastExtent;
-            d->lastExtent.ext()->xnext = a;
-            freeExtents->firstExtent = d->firstExtent;
-
-            d->firstExtent.setInvalid();
-            d->lastExtent.setInvalid();
+            dur::writingDiskLoc( a.ext()->xprev ) = d->lastExtent;
+            dur::writingDiskLoc( d->lastExtent.ext()->xnext ) = a;
+            dur::writingDiskLoc( freeExtents->firstExtent ) = d->firstExtent;
+            dur::writingDiskLoc( d->firstExtent ).setInvalid();
+            dur::writingDiskLoc( d->lastExtent ).setInvalid();
         }
     }
@@ -843,14 +843,14 @@ namespace mongo {
         /* remove ourself from the record next/prev chain */
         {
             if ( todelete->prevOfs != DiskLoc::NullOfs )
-                todelete->getPrev(dl).rec()->nextOfs = todelete->nextOfs;
+                dur::writingInt( todelete->getPrev(dl).rec()->nextOfs ) = todelete->nextOfs;
             if ( todelete->nextOfs != DiskLoc::NullOfs )
-                todelete->getNext(dl).rec()->prevOfs = todelete->prevOfs;
+                dur::writingInt( todelete->getNext(dl).rec()->prevOfs ) = todelete->prevOfs;
         }
 
         /* remove ourself from extent pointers */
         {
-            Extent *e = todelete->myExtent(dl);
+            Extent *e = dur::writing( todelete->myExtent(dl) );
             if ( e->firstRecord == dl ) {
                 if ( todelete->nextOfs == DiskLoc::NullOfs )
                     e->firstRecord.Null();
@@ -867,18 +867,26 @@ namespace mongo {
 
         /* add to the free list */
         {
-            d->nrecords--;
-            d->datasize -= todelete->netLength();
-            /* temp: if in system.indexes, don't reuse, and zero out: we want to be
-               careful until validated more, as IndexDetails has pointers
-               to this disk location.  so an incorrectly done remove would cause
-               a lot of problems.
-            */
+            {
+                NamespaceDetails::Stats *s = dur::writing(&d->stats);
+                s->datasize -= todelete->netLength();
+                s->nrecords--;
+            }
+
             if ( strstr(ns, ".system.indexes") ) {
-                memset(todelete, 0, todelete->lengthWithHeaders);
+                /* temp: if in system.indexes, don't reuse, and zero out: we want to be
+                   careful until validated more, as IndexDetails has pointers
+                   to this disk location.  so an incorrectly done remove would cause
+                   a lot of problems.
+                */
+                memset(dur::writingPtr(todelete, todelete->lengthWithHeaders), 0, todelete->lengthWithHeaders);
             }
             else {
-                DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse.
+                DEV {
+                    unsigned long long *p = (unsigned long long *) todelete->data;
+                    *dur::writing(p) = 0;
+                    //DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse.
+                }
                 d->addDeletedRec((DeletedRecord*)todelete, dl);
             }
         }
@@ -1082,7 +1090,7 @@ namespace mongo {
         bool dropDups = idx.dropDups() || inDBRepair;
         BSONObj order = idx.keyPattern();
 
-        idx.head.Null();
+        dur::writingDiskLoc(idx.head).Null();
 
         if ( logLevel > 1 ) printMemInfo( "before index start" );
@@ -1090,9 +1098,9 @@ namespace mongo {
         unsigned long long n = 0;
         shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
         BSONObjExternalSorter sorter(order);
-        sorter.hintNumObjects( d->nrecords );
+        sorter.hintNumObjects( d->stats.nrecords );
         unsigned long long nkeys = 0;
-        ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->nrecords , 10 ) );
+        ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->stats.nrecords , 10 ) );
         while ( c->ok() ) {
             BSONObj o = c->current();
             DiskLoc loc = c->currLoc();
@@ -1180,7 +1188,7 @@ namespace mongo {
         bool dupsAllowed = !idx.unique();
         bool dropDups = idx.dropDups();
 
-        ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->nrecords );
+        ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->stats.nrecords );
 
         unsigned long long n = 0;
         auto_ptr<ClientCursor> cc;
@@ -1333,7 +1341,7 @@ namespace mongo {
         if ( d == 0 || (d->flags & NamespaceDetails::Flag_HaveIdIndex) )
             return;
 
-        d->flags |= NamespaceDetails::Flag_HaveIdIndex;
+        *dur::writing(&d->flags) |= NamespaceDetails::Flag_HaveIdIndex;
 
         {
             NamespaceDetails::IndexIterator i = d->ii();
@@ -1532,6 +1540,7 @@ namespace mongo {
 
         Record *r = loc.rec();
         assert( r->lengthWithHeaders >= lenWHdr );
+        r = (Record*) dur::writingPtr(r, lenWHdr);
         if( addID ) {
             /* a little effort was made here to avoid a double copy when we add an ID */
             ((int&)*r->data) = *((int*) obuf) + newId->size();
@@ -1542,22 +1551,25 @@ namespace mongo {
             if( obuf )
                 memcpy(r->data, obuf, len);
         }
-        Extent *e = r->myExtent(loc);
+        Extent *e = dur::writing(r->myExtent(loc));
         if ( e->lastRecord.isNull() ) {
             e->firstRecord = e->lastRecord = loc;
             r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
         }
         else {
             Record *oldlast = e->lastRecord.rec();
             r->prevOfs = e->lastRecord.getOfs();
             r->nextOfs = DiskLoc::NullOfs;
-            oldlast->nextOfs = loc.getOfs();
+            dur::writing(oldlast)->nextOfs = loc.getOfs();
             e->lastRecord = loc;
         }
 
-        d->nrecords++;
-        d->datasize += r->netLength();
+        /* durability todo : this could be a bit annoying / slow to record constantly */
+        {
+            NamespaceDetails::Stats *s = dur::writing(&d->stats);
+            s->datasize += r->netLength();
+            s->nrecords++;
+        }
 
         // we don't bother clearing those stats for the god tables - also god is true when adidng a btree bucket
         if ( !god )
@@ -1578,7 +1590,7 @@ namespace mongo {
             int idxNo = tableToIndex->nIndexes;
             IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str(), !background); // clear transient info caches so they refresh; increments nIndexes
-            idx.info = loc;
+            dur::writingDiskLoc(idx.info) = loc;
             try {
                 buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo, background);
             } catch( DBException& e ) {
@@ -1669,8 +1681,12 @@ namespace mongo {
             e->lastRecord = loc;
         }
 
-        d->nrecords++;
-        d->datasize += r->netLength();
+        /* todo: don't update for oplog? seems wasteful. */
+        {
+            NamespaceDetails::Stats *s = dur::writing(&d->stats);
+            s->datasize += r->netLength();
+            s->nrecords++;
+        }
 
         return r;
     }
diff --git a/db/pdfile.h b/db/pdfile.h
index edfc4422b39..1d4001658b8 100644
--- a/db/pdfile.h
+++ b/db/pdfile.h
@@ -69,9 +69,7 @@ namespace mongo {
         */
         Extent* createExtent(const char *ns, int approxSize, bool capped = false, int loops = 0);
 
-        DataFileHeader *getHeader() {
-            return header;
-        }
+        DataFileHeader *getHeader() { return header; }
 
         /* return max size an extent may be */
         static int maxSize();
@@ -292,10 +290,7 @@ namespace mongo {
             return ( version == VERSION ) && ( versionMinor == VERSION_MINOR );
         }
 
-        bool uninitialized() const {
-            if ( version == 0 ) return true;
-            return false;
-        }
+        bool uninitialized() const { return version == 0; }
 
         /*Record* __getRecord(DiskLoc dl) {
             int ofs = dl.getOfs();
@@ -307,12 +302,13 @@ namespace mongo {
             if ( uninitialized() ) {
                 assert(filelength > 32768 );
                 assert( HeaderSize == 8192 );
-                fileLength = filelength;
-                version = VERSION;
-                versionMinor = VERSION_MINOR;
-                unused.set( fileno, HeaderSize );
+                DataFileHeader *h = dur::writing(this);
+                h->fileLength = filelength;
+                h->version = VERSION;
+                h->versionMinor = VERSION_MINOR;
+                h->unused.set( fileno, HeaderSize );
                 assert( (data-(char*)this) == HeaderSize );
-                unusedLength = fileLength - HeaderSize - 16;
+                h->unusedLength = fileLength - HeaderSize - 16;
                 //memcpy(data+unusedLength, " \nthe end\n", 16);
             }
         }
diff --git a/db/query.cpp b/db/query.cpp
index 8b5d24ba1f9..8f63bf44d7b 100644
--- a/db/query.cpp
+++ b/db/query.cpp
@@ -525,7 +525,7 @@ namespace mongo {
 
         // count of all objects
         if ( query.isEmpty() ){
-            return applySkipLimit( d->nrecords , cmd );
+            return applySkipLimit( d->stats.nrecords , cmd );
         }
         MultiPlanScanner mps( ns, query, BSONObj(), 0, true, BSONObj(), BSONObj(), false, true );
         CountOp original( ns , cmd );
diff --git a/db/rec.h b/db/rec.h
--- a/db/rec.h
+++ b/db/rec.h
@@ -51,7 +51,7 @@ public:
         theDataFileMgr._deleteRecord(nsdetails_notinline(ns), ns, d.rec(), d);
     }
 
-    VIRT void modified(DiskLoc d) { }
+//goingaway    VIRT void modified(DiskLoc d) { }
 
     VIRT void drop(const char *ns) {
         dropNS(ns);
@@ -127,11 +127,4 @@ inline BtreeBucket* DiskLoc::btree() const {
     return (BtreeBucket*) btreeStore->get(*this, BucketSize);
 }
 
-inline BtreeBucket* DiskLoc::btreemod() const {
-    assert( _a != -1 );
-    BtreeBucket *b = (BtreeBucket*) btreeStore->get(*this, BucketSize);
-    btreeStore->modified(*this);
-    return b;
-}
-
 }
diff --git a/db/repl/manager.cpp b/db/repl/manager.cpp
index 328f6d279f9..c1a7c858d18 100644
--- a/db/repl/manager.cpp
+++ b/db/repl/manager.cpp
@@ -55,9 +55,12 @@ namespace mongo {
     }
 
     Manager::~Manager() {
-        log() << "ERROR: ~Manager should never be called" << rsLog;
+        /* we don't destroy the replset object we sit in; however, the destructor could have thrown on init.
+           the log message below is just a reminder to come back one day and review this code more, and to
+           make it cleaner.
+        */
+        log() << "info: ~Manager called" << rsLog;
         rs->mgr = 0;
-        //assert(false);
     }
 
     void Manager::starting() {
diff --git a/db/repl/rs.h b/db/repl/rs.h
index 164d179d7a3..19f8e5e0ff3 100644
--- a/db/repl/rs.h
+++ b/db/repl/rs.h
@@ -75,7 +75,7 @@ namespace mongo {
         virtual void starting();
     public:
         Manager(ReplSetImpl *rs);
-        ~Manager();
+        virtual ~Manager();
         void msgReceivedNewConfig(BSONObj);
         void msgCheckNewState();
     };
diff --git a/db/repl/rs_initialsync.cpp b/db/repl/rs_initialsync.cpp
index 3851c66827d..a0579ac967e 100644
--- a/db/repl/rs_initialsync.cpp
+++ b/db/repl/rs_initialsync.cpp
@@ -74,7 +74,7 @@ namespace mongo {
         NamespaceDetails *d = nsdetails(rsoplog);
 
         // temp
-        if( d && d->nrecords == 0 )
+        if( d && d->stats.nrecords == 0 )
             return; // already empty, ok.
 
         log(1) << "replSet empty oplog" << rsLog;
diff --git a/db/repl/rs_rollback.cpp b/db/repl/rs_rollback.cpp
index aeb9b8b25e7..0fcaaece31f 100644
--- a/db/repl/rs_rollback.cpp
+++ b/db/repl/rs_rollback.cpp
@@ -524,7 +524,7 @@ namespace mongo {
                 }
             }
             // did we just empty the collection?  if so let's check if it even exists on the source.
-            if( nsd->nrecords == 0 ) {
+            if( nsd->stats.nrecords == 0 ) {
                 try {
                     string sys = cc().database()->name + ".system.namespaces";
                     bo o = them->findOne(sys, QUERY("name"<<d.ns));
diff --git a/dbtests/mmaptests.cpp b/dbtests/mmaptests.cpp
index dd60b2f9d53..f272b63dae4 100755
--- a/dbtests/mmaptests.cpp
+++ b/dbtests/mmaptests.cpp
@@ -1,5 +1,4 @@
-// socktests.cpp : sock.{h,cpp} unit tests.
-//
+// @file mmaptests.cpp
 
 /**
  *    Copyright (C) 2008 10gen Inc.
diff --git a/dbtests/namespacetests.cpp b/dbtests/namespacetests.cpp
index ca051fe15f9..5588bf7c2cd 100644
--- a/dbtests/namespacetests.cpp
+++ b/dbtests/namespacetests.cpp
@@ -604,7 +604,7 @@ namespace NamespaceTests {
                     ++count;
                 }
             }
-            ASSERT_EQUALS( count, nsd()->nrecords );
+            ASSERT_EQUALS( count, nsd()->stats.nrecords );
             return count;
         }
         int nExtents() const {
@@ -620,7 +620,7 @@ namespace NamespaceTests {
             return ns_;
         }
         NamespaceDetails *nsd() const {
-            return nsdetails( ns() );
+            return dur::writing( nsdetails( ns() ) );
         }
         static BSONObj bigObj() {
             string as( 187, 'a' );
@@ -737,9 +737,9 @@ namespace NamespaceTests {
                 }
 
                 DiskLoc d = l[6];
-                long long n = nsd->nrecords;
+                long long n = nsd->stats.nrecords;
                 nsd->cappedTruncateAfter(ns(), d, false);
-                ASSERT_EQUALS( nsd->nrecords , n-1 );
+                ASSERT_EQUALS( nsd->stats.nrecords , n-1 );
 
                 {
                     ForwardCappedCursor c(nsd);
@@ -770,7 +770,7 @@ namespace NamespaceTests {
             void run() {
                 create();
                 nsd()->deletedList[ 2 ] = nsd()->cappedListOfAllDeletedRecords().drec()->nextDeleted.drec()->nextDeleted;
-                nsd()->cappedListOfAllDeletedRecords().drec()->nextDeleted.drec()->nextDeleted = DiskLoc();
+                nsd()->cappedListOfAllDeletedRecords().drec()->nextDeleted.drec()->nextDeleted.writing() = DiskLoc();
                 nsd()->cappedLastDelRecLastExtent().Null();
                 NamespaceDetails *d = nsd();
                 zero( &d->capExtent );
diff --git a/dbtests/test.vcxproj b/dbtests/test.vcxproj
index 8e0173c11d4..8b7e5b95fed 100644
--- a/dbtests/test.vcxproj
+++ b/dbtests/test.vcxproj
@@ -97,7 +97,7 @@
     <ClCompile>
<Optimization>Disabled</Optimization>
<AdditionalIncludeDirectories>..\..\js\src;..\pcre-7.4;C:\boost;\boost;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
- <PreprocessorDefinitions>_NOTDURABLE;_UNICODE;UNICODE;SUPPORT_UCP;SUPPORT_UTF8;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;PCRE_STATIC;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <PreprocessorDefinitions>_DURABLE;_UNICODE;UNICODE;SUPPORT_UCP;SUPPORT_UTF8;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;PCRE_STATIC;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<MinimalRebuild>No</MinimalRebuild>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
@@ -263,6 +263,7 @@
     <ClCompile Include="..\client\model.cpp" />
<ClCompile Include="..\client\parallel.cpp" />
<ClCompile Include="..\db\cap.cpp" />
+ <ClCompile Include="..\db\dur.cpp" />
<ClCompile Include="..\db\geo\2d.cpp" />
<ClCompile Include="..\db\geo\haystack.cpp" />
<ClCompile Include="..\db\repl\consensus.cpp" />
diff --git a/dbtests/test.vcxproj.filters b/dbtests/test.vcxproj.filters
index 3302cbf79e6..16b8368ab7c 100755
--- a/dbtests/test.vcxproj.filters
+++ b/dbtests/test.vcxproj.filters
@@ -704,6 +704,9 @@
     <ClCompile Include="..\scripting\bench.cpp">
<Filter>scripting</Filter>
</ClCompile>
+ <ClCompile Include="..\db\dur.cpp">
+ <Filter>db\cpp</Filter>
+ </ClCompile>
</ItemGroup>
<ItemGroup>
<None Include="..\SConstruct">
diff --git a/jstests/evald.js b/jstests/evald.js
index 1131a21f414..88566a19585 100644
--- a/jstests/evald.js
+++ b/jstests/evald.js
@@ -2,7 +2,7 @@ t = db.jstests_evald;
 t.drop();
 
 function debug( x ) {
-//    printjson( x );
+    printjson( x );
 }
 
 for( i = 0; i < 10; ++i ) {
diff --git a/jstests/geo_update2.js b/jstests/geo_update2.js
new file mode 100644
index 00000000000..0a6bb2f0b00
--- /dev/null
+++ b/jstests/geo_update2.js
@@ -0,0 +1,40 @@
+
+t = db.geo_update1
+t.drop()
+
+for(var x = 0; x < 10; x++ ) {
+    for(var y = 0; y < 10; y++ ) {
+        t.insert({"loc": [x, y] , x : x , y : y });
+    }
+}
+
+t.ensureIndex( { loc : "2d" } )
+
+function p(){
+    print( "--------------" );
+    for ( var y=0; y<10; y++ ){
+        var c = t.find( { y : y } ).sort( { x : 1 } )
+        var s = "";
+        while ( c.hasNext() )
+            s += c.next().z + " ";
+        print( s )
+    }
+    print( "--------------" );
+}
+
+p()
+
+/* SERVER-1821
+t.update({"loc" : {"$within" : {"$center" : [[5,5], 2]}}}, {'$inc' : { 'z' : 1}}, false, true);
+assert.isnull( db.getLastError() , "B1" )
+p()
+
+t.update({}, {'$inc' : { 'z' : 1}}, false, true);
+assert.isnull( db.getLastError() , "B2" )
+p()
+
+
+t.update({"loc" : {"$within" : {"$center" : [[5,5], 2]}}}, {'$inc' : { 'z' : 1}}, false, true);
+assert.isnull( db.getLastError() , "B3" )
+p()
+*/
diff --git a/s/d_split.cpp b/s/d_split.cpp
index 208cf665899..5c569da391b 100644
--- a/s/d_split.cpp
+++ b/s/d_split.cpp
@@ -196,8 +196,8 @@ namespace mongo {
                 return false;
             }
 
-            const long long recCount = d->nrecords;
-            const long long dataSize = d->datasize;
+            const long long recCount = d->stats.nrecords;
+            const long long dataSize = d->stats.datasize;
 
             // If there's not enough data for more than one chunk, no point continuing.
             if ( dataSize < maxChunkSize || recCount == 0 ) {
diff --git a/util/hashtab.h b/util/hashtab.h
index 16c5483eda6..6604864e65a 100644
--- a/util/hashtab.h
+++ b/util/hashtab.h
@@ -24,6 +24,7 @@
 
 #include "../pch.h"
 #include <map>
+#include "../db/dur.h"
 
 namespace mongo {
 
@@ -127,35 +128,28 @@ namespace mongo {
             bool found;
             int i = _find(k, found);
             if ( i >= 0 && found ) {
-                Node& n = nodes(i);
-                n.k.kill();
-                n.setUnused();
+                Node* n = &nodes(i);
+                n = dur::writing(n);
+                n->k.kill();
+                n->setUnused();
             }
         }
-/*
-        void drop(const Key& k) {
-            bool found;
-            int i = _find(k, found);
-            if ( i >= 0 && found ) {
-                nodes[i].setUnused();
-            }
-        }
-*/
+
+        /** returns false if too full */
         bool put(const Key& k, const Type& value) {
             bool found;
             int i = _find(k, found);
             if ( i < 0 )
                 return false;
-            Node& n = nodes(i);
+            Node* n = dur::writing( &nodes(i) );
             if ( !found ) {
-                n.k = k;
-                n.hash = k.hash();
+                n->k = k;
+                n->hash = k.hash();
            }
            else {
-                assert( n.hash == k.hash() );
+                assert( n->hash == k.hash() );
            }
-            n.value = value;
+            n->value = value;
            return true;
        }
diff --git a/util/message.h b/util/message.h
index 9651141ad6c..84c4e24ee45 100644
--- a/util/message.h
+++ b/util/message.h
@@ -225,11 +225,15 @@ struct OP_GETMORE : public MSGHEADER {
         int len; /* len of the msg, including this field */
         MSGID id; /* request/reply id's match... */
         MSGID responseTo; /* id of the message we are responding to */
-        int _operation;
+        short _operation;
+        char _flags;
+        char _version;
         int operation() const {
             return _operation;
         }
         void setOperation(int o) {
+            _flags = 0;
+            _version = 0;
             _operation = o;
         }
         char _data[4];
@@ -241,7 +245,7 @@ struct OP_GETMORE : public MSGHEADER {
         bool valid(){
             if ( len <= 0 || len > ( 1024 * 1024 * 10 ) )
                 return false;
-            if ( _operation < 0 || _operation > 100000 )
+            if ( _operation < 0 || _operation > 30000 )
                 return false;
             return true;
         }
diff --git a/util/mmap.h b/util/mmap.h
index 826c8b07b82..eca6db811fb 100644
--- a/util/mmap.h
+++ b/util/mmap.h
@@ -136,10 +136,12 @@ namespace mongo {
         };
 
         MemoryMappedFile();
+
         ~MemoryMappedFile() {
-            destroyed();
+            destroyed(); // cleans up from the master list of mmaps
             close();
         }
+
         void close();
 
         void* testGetCopyOnWriteView();
@@ -171,15 +173,15 @@ namespace mongo {
         void flush(bool sync);
         virtual Flushable * prepareFlush();
 
-        /*void* viewOfs() {
-            return view;
-        }*/
-
         long shortLength() const { return (long) len; }
         unsigned long long length() const { return len; }
         string filename() const { return _filename; }
 
+#if defined(_DURABLE) && defined(_DEBUG)
+        static void* getWriteViewFor(void *ptr);
+#endif
+
     private:
         static void updateLength( const char *filename, unsigned long long &length );
 
@@ -192,7 +194,9 @@ namespace mongo {
 #ifdef _WIN32
         boost::shared_ptr<mutex> _flushMutex;
 #endif
-
+#if defined(_DURABLE)
+        void *writeView;
+#endif
     protected:
         // only posix mmap implementations will support this
         virtual void _lock();
diff --git a/util/mmap_win.cpp b/util/mmap_win.cpp
index a21e7505eb8..5901804da67 100644
--- a/util/mmap_win.cpp
+++ b/util/mmap_win.cpp
@@ -22,8 +22,11 @@
 
 namespace mongo {
 
+    static map<void *, MemoryMappedFile*> viewToWriteable;
+    static mutex viewToWriteableMutex("viewToWriteableMutex");
+
     MemoryMappedFile::MemoryMappedFile()
-        : _flushMutex(new mutex("flushMutex"))
+        : _flushMutex(new mutex("flushMutex")), _filename("??")
     {
         fd = 0;
         maphandle = 0;
@@ -33,8 +36,17 @@ namespace mongo {
     }
 
     void MemoryMappedFile::close() {
-        if ( view )
+        //log() << "dur mmap close " << filename() << endl;
+        if ( view ) {
+            {
+                mutex::scoped_lock lk(viewToWriteableMutex);
+                viewToWriteable.erase(view);
+            }
             UnmapViewOfFile(view);
+#if defined(_DURABLE)
+            UnmapViewOfFile(writeView);
+#endif
+        }
         view = 0;
         if ( maphandle )
             CloseHandle(maphandle);
@@ -61,6 +73,9 @@ namespace mongo {
     }
 
     void* MemoryMappedFile::map(const char *filenameIn, unsigned long long &length, int options) {
+#if defined(_DURABLE)
+        options |= READONLY;
+#endif
        _filename = filenameIn;
        /* big hack here: Babble uses db names with colons.  doesn't seem to work on windows.  temporary perhaps. */
        char filename[256];
@@ -80,37 +95,38 @@ namespace mongo {
 
        updateLength( filename, length );
 
-       DWORD createOptions = FILE_ATTRIBUTE_NORMAL;
-       if ( options & SEQUENTIAL )
-           createOptions |= FILE_FLAG_SEQUENTIAL_SCAN;
-       DWORD rw = GENERIC_READ | GENERIC_WRITE;
-       //if ( options & READONLY )
-       //    rw = GENERIC_READ;
-
-       fd = CreateFile(
-                toNativeString(filename).c_str(),
-                rw, // desired access
-                FILE_SHARE_READ, // share mode
-                NULL, // security
-                OPEN_ALWAYS, // create disposition
-                createOptions , // flags
-                NULL); // hTempl
-       if ( fd == INVALID_HANDLE_VALUE ) {
-           log() << "Create/OpenFile failed " << filename << ' ' << GetLastError() << endl;
-           return 0;
+       {
+           DWORD createOptions = FILE_ATTRIBUTE_NORMAL;
+           if ( options & SEQUENTIAL )
+               createOptions |= FILE_FLAG_SEQUENTIAL_SCAN;
+           DWORD rw = GENERIC_READ | GENERIC_WRITE;
+           fd = CreateFile(
+                    toNativeString(filename).c_str(),
+                    rw, // desired access
+                    FILE_SHARE_READ, // share mode
+                    NULL, // security
+                    OPEN_ALWAYS, // create disposition
+                    createOptions , // flags
+                    NULL); // hTempl
+           if ( fd == INVALID_HANDLE_VALUE ) {
+               log() << "Create/OpenFile failed " << filename << ' ' << GetLastError() << endl;
+               return 0;
+           }
        }
 
        mapped += length;
 
-       DWORD flProtect = PAGE_READWRITE; //(options & READONLY)?PAGE_READONLY:PAGE_READWRITE;
-       maphandle = CreateFileMapping(fd, NULL, flProtect,
-                                     length >> 32 /*maxsizehigh*/,
-                                     (unsigned) length /*maxsizelow*/,
-                                     NULL/*lpName*/);
-       if ( maphandle == NULL ) {
-           DWORD e = GetLastError(); // log() call was killing lasterror before we get to that point in the stream
-           log() << "CreateFileMapping failed " << filename << ' ' << errnoWithDescription(e) << endl;
-           return 0;
+       {
+           DWORD flProtect = PAGE_READWRITE; //(options & READONLY)?PAGE_READONLY:PAGE_READWRITE;
+           maphandle = CreateFileMapping(fd, NULL, flProtect,
+                                         length >> 32 /*maxsizehigh*/,
+                                         (unsigned) length /*maxsizelow*/,
+                                         NULL/*lpName*/);
+           if ( maphandle == NULL ) {
+               DWORD e = GetLastError(); // log() call was killing lasterror before we get to that point in the stream
+               log() << "CreateFileMapping failed " << filename << ' ' << errnoWithDescription(e) << endl;
+               return 0;
+           }
        }
 
        {
@@ -122,9 +138,58 @@ namespace mongo {
            log() << "MapViewOfFile failed " << filename << " " << errnoWithDescription(e) << endl;
        }
        len = length;
+
+#if defined(_DURABLE)
+       {
+           if( !( options & READONLY ) ) {
+               log() << "dur: not readonly view which is wrong : " << filename << endl;
+           }
+           void *p = MapViewOfFile(maphandle, FILE_MAP_ALL_ACCESS, /*f ofs hi*/0, /*f ofs lo*/ 0, /*dwNumberOfBytesToMap 0 means to eof*/0);
+           assert( p );
+           writeView = p;
+           {
+               mutex::scoped_lock lk(viewToWriteableMutex);
+               viewToWriteable[view] = this;
+           }
+           log() << filenameIn << endl;
+           log() << " ro: " << view << " - " << (void*) (((char *)view)+length) << endl;
+           log() << " w : " << writeView << " - " << (void*) (((char *)writeView)+length) << endl;
+       }
+#endif
+
        return view;
    }
 
+#if defined(_DURABLE) && defined(_DEBUG)
+    void* MemoryMappedFile::getWriteViewFor(void *p) {
+        mutex::scoped_lock lk(viewToWriteableMutex);
+        std::map< void*, MemoryMappedFile* >::iterator i =
+            viewToWriteable.upper_bound(((char *)p)+1);
+        i--;
+        assert( i != viewToWriteable.end() );
+        MemoryMappedFile *mmf = i->second;
+        assert( mmf );
+
+        size_t ofs = ((char *)p) - ((char*)mmf->view);
+
+        if( ofs >= mmf->len ) {
+            log() << "getWriteViewFor error? " << p << endl;
+            for( std::map<void*,MemoryMappedFile*>::iterator i = viewToWriteable.begin(); i != viewToWriteable.end(); i++ ) {
+                char *wl = (char *) i->second->writeView;
+                char *wh = wl + i->second->length();
+                if( p >= wl && p < wh ) {
+                    log() << "dur ERROR p " << p << " is already in the writable view of " << i->second->filename() << endl;
+                    //wassert(false);
+                    // could do this:
+                    return p;
+                }
+            }
+            assert( ofs < mmf->len ); // did you call writing() with a pointer that isn't into a datafile?
+        }
+        return ((char *)mmf->writeView) + ofs;
+    }
+#endif
+
    class WindowsFlushable : public MemoryMappedFile::Flushable {
    public:
        WindowsFlushable( void * view , HANDLE fd , string filename , boost::shared_ptr<mutex> flushMutex )