-rw-r--r--   SConstruct                    |   2
-rw-r--r--   client/dbclient_rs.cpp        |   4
-rw-r--r--   db/bufreader.h                |   6
-rw-r--r--   db/db.vcxproj                 |   2
-rwxr-xr-x   db/db.vcxproj.filters         |   6
-rw-r--r--   db/dur.cpp                    | 204
-rw-r--r--   db/dur.h                      |  17
-rw-r--r--   db/dur_commitjob.cpp          |   2
-rw-r--r--   db/dur_commitjob.h            |  68
-rw-r--r--   db/dur_journal.cpp            |   3
-rw-r--r--   db/dur_journalformat.h        |  37
-rw-r--r--   db/dur_preplogbuffer.cpp      | 175
-rw-r--r--   db/dur_recover.cpp            | 187
-rw-r--r--   db/dur_stats.h                |   9
-rw-r--r--   db/dur_writetodatafiles.cpp   |  17
-rw-r--r--   db/oplog.cpp                  |  45
-rwxr-xr-x   jstests/dur/oplog.js          | 118
17 files changed, 670 insertions, 232 deletions
diff --git a/SConstruct b/SConstruct
index 5f9e087acba..01bb0123024 100644
--- a/SConstruct
+++ b/SConstruct
@@ -327,7 +327,7 @@ coreServerFiles = [ "util/message_server_port.cpp" ,
if has_option( "asio" ):
coreServerFiles += [ "util/message_server_asio.cpp" ]
-serverOnlyFiles = Split( "util/logfile.cpp util/alignedbuilder.cpp db/mongommf.cpp db/dur.cpp db/durop.cpp db/dur_writetodatafiles.cpp db/dur_commitjob.cpp db/dur_recover.cpp db/dur_journal.cpp db/query.cpp db/update.cpp db/introspect.cpp db/btree.cpp db/clientcursor.cpp db/tests.cpp db/repl.cpp db/repl/rs.cpp db/repl/consensus.cpp db/repl/rs_initiate.cpp db/repl/replset_commands.cpp db/repl/manager.cpp db/repl/health.cpp db/repl/heartbeat.cpp db/repl/rs_config.cpp db/repl/rs_rollback.cpp db/repl/rs_sync.cpp db/repl/rs_initialsync.cpp db/oplog.cpp db/repl_block.cpp db/btreecursor.cpp db/cloner.cpp db/namespace.cpp db/cap.cpp db/matcher_covered.cpp db/dbeval.cpp db/restapi.cpp db/dbhelpers.cpp db/instance.cpp db/client.cpp db/database.cpp db/pdfile.cpp db/cursor.cpp db/security_commands.cpp db/security.cpp db/queryoptimizer.cpp db/extsort.cpp db/cmdline.cpp" )
+serverOnlyFiles = Split( "util/logfile.cpp util/alignedbuilder.cpp db/mongommf.cpp db/dur.cpp db/durop.cpp db/dur_writetodatafiles.cpp db/dur_preplogbuffer.cpp db/dur_commitjob.cpp db/dur_recover.cpp db/dur_journal.cpp db/query.cpp db/update.cpp db/introspect.cpp db/btree.cpp db/clientcursor.cpp db/tests.cpp db/repl.cpp db/repl/rs.cpp db/repl/consensus.cpp db/repl/rs_initiate.cpp db/repl/replset_commands.cpp db/repl/manager.cpp db/repl/health.cpp db/repl/heartbeat.cpp db/repl/rs_config.cpp db/repl/rs_rollback.cpp db/repl/rs_sync.cpp db/repl/rs_initialsync.cpp db/oplog.cpp db/repl_block.cpp db/btreecursor.cpp db/cloner.cpp db/namespace.cpp db/cap.cpp db/matcher_covered.cpp db/dbeval.cpp db/restapi.cpp db/dbhelpers.cpp db/instance.cpp db/client.cpp db/database.cpp db/pdfile.cpp db/cursor.cpp db/security_commands.cpp db/security.cpp db/queryoptimizer.cpp db/extsort.cpp db/cmdline.cpp" )
serverOnlyFiles += [ "db/index.cpp" ] + Glob( "db/geo/*.cpp" )
diff --git a/client/dbclient_rs.cpp b/client/dbclient_rs.cpp
index 7280d5a0a60..00e15e825e0 100644
--- a/client/dbclient_rs.cpp
+++ b/client/dbclient_rs.cpp
@@ -360,7 +360,7 @@ namespace mongo {
try {
return checkSlave()->query(ns,query,nToReturn,nToSkip,fieldsToReturn,queryOptions,batchSize);
}
- catch ( DBException & e ){
+ catch ( DBException & ){
LOG(1) << "can't query replica set slave: " << _slaveHost << endl;
}
}
@@ -378,7 +378,7 @@ namespace mongo {
try {
return checkSlave()->findOne(ns,query,fieldsToReturn,queryOptions);
}
- catch ( DBException & e ){
+ catch ( DBException & ){
LOG(1) << "can't query replica set slave: " << _slaveHost << endl;
}
}
diff --git a/db/bufreader.h b/db/bufreader.h
index 8e351f29334..bcc4f4271e3 100644
--- a/db/bufreader.h
+++ b/db/bufreader.h
@@ -60,6 +60,12 @@ namespace mongo {
/** return remaining bytes */
unsigned remaining() const { return (char*)_end -(char*)_pos; }
+ /** back up by nbytes */
+ void rewind(unsigned nbytes) {
+ _pos = ((char *) _pos) - nbytes;
+ assert( _pos >= _start );
+ }
+
/** return current position pointer, and advance by len */
const void* skip(unsigned len) {
char *nxt = ((char *) _pos) + len;
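The new rewind() pairs with the parsing change in dur_recover.cpp further down: the reader peeks at the leading 4-byte length/opcode word to classify a record, then backs up so the whole fixed-size header can be consumed with a single skip(). A minimal sketch of that pattern, assuming a BufReader br positioned at a basic-write record and the JEntry layout from dur_journalformat.h:

    unsigned lenOrOpCode;
    br.read(lenOrOpCode);                          // advances past the first 4 bytes
    br.rewind(4);                                  // back up so the header parse sees them again
    const JEntry *e = (const JEntry *) br.skip(sizeof(JEntry));
    br.skip(e->len);                               // the payload bytes follow the header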
diff --git a/db/db.vcxproj b/db/db.vcxproj
index c4f9e1edc70..7d1baae52d7 100644
--- a/db/db.vcxproj
+++ b/db/db.vcxproj
@@ -198,6 +198,7 @@
<ItemGroup>
<ClCompile Include="..\bson\oid.cpp" />
<ClCompile Include="..\client\dbclientcursor.cpp" />
+ <ClCompile Include="..\client\dbclient_rs.cpp" />
<ClCompile Include="..\client\distlock.cpp" />
<ClCompile Include="..\client\model.cpp" />
<ClCompile Include="..\pcre-7.4\pcrecpp.cc">
@@ -483,6 +484,7 @@
<ClCompile Include="durop.cpp" />
<ClCompile Include="dur_commitjob.cpp" />
<ClCompile Include="dur_journal.cpp" />
+ <ClCompile Include="dur_preplogbuffer.cpp" />
<ClCompile Include="dur_recover.cpp" />
<ClCompile Include="dur_writetodatafiles.cpp" />
<ClCompile Include="geo\2d.cpp" />
diff --git a/db/db.vcxproj.filters b/db/db.vcxproj.filters
index f77d11b4125..fd2f74753f8 100755
--- a/db/db.vcxproj.filters
+++ b/db/db.vcxproj.filters
@@ -451,6 +451,12 @@
<ClCompile Include="dur_writetodatafiles.cpp">
<Filter>_storage engine</Filter>
</ClCompile>
+ <ClCompile Include="dur_preplogbuffer.cpp">
+ <Filter>_storage engine</Filter>
+ </ClCompile>
+ <ClCompile Include="..\client\dbclient_rs.cpp">
+ <Filter>client</Filter>
+ </ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="repl\rs_config.h">
diff --git a/db/dur.cpp b/db/dur.cpp
index d0d128ea910..b57015b7759 100644
--- a/db/dur.cpp
+++ b/db/dur.cpp
@@ -50,6 +50,7 @@
#include "../util/mongoutils/str.h"
#include "../util/timer.h"
#include "dur_stats.h"
+#include "pdfile.h" // Record::HeaderSize
using namespace mongoutils;
@@ -57,7 +58,18 @@ namespace mongo {
namespace dur {
+#if defined(_DEBUG)
+ const bool DebugCheckLastDeclaredWrite = false;
+#else
+ const bool DebugCheckLastDeclaredWrite = false;
+#endif
+
void WRITETODATAFILES();
+ void PREPLOGBUFFER();
+
+ static void groupCommit(); // later in this file
+
+ CommitJob commitJob;
Stats stats;
@@ -65,6 +77,8 @@ namespace mongo {
memset(this, 0, sizeof(*this));
}
+ DurableInterface* DurableInterface::_impl = new NonDurableImpl();
+
void NonDurableImpl::startup() {
if( haveJournalFiles() ) {
log() << "Error: journal files are present in journal directory, yet starting without --dur enabled." << endl;
@@ -74,13 +88,49 @@ namespace mongo {
}
}
-#if defined(_DEBUG)
- const bool DebugCheckLastDeclaredWrite = false;
-#else
- const bool DebugCheckLastDeclaredWrite = false;
-#endif
+ /** base declare write intent function that all the helpers call. */
+ void DurableImpl::declareWriteIntent(void *p, unsigned len) {
+ WriteIntent w;
+ w.p = p;
+ w.len = len;
+ commitJob.note(w);
+ }
- DurableInterface* DurableInterface::_impl = new NonDurableImpl();
+ bool DurableImpl::objAppend(void *dst, const void *src, unsigned len) {
+ {
+ char *srcRecord = (char*) src;
+ srcRecord -= Record::HeaderSize;
+ WriteIntent w;
+ w.p = srcRecord;
+ w.len = len+Record::HeaderSize;
+ if( !commitJob.alreadyNoted(w) ) {
+
+ if( debug ) {
+ WriteIntent w;
+ w.p = (void*)src;
+ w.len = len;
+ if( commitJob.alreadyNoted(w) ) {
+ log() << "dur info it appears an optimization is possible that is not yet done" << endl;
+ }
+ }
+
+ return false;
+ }
+ }
+
+ BasicWriteOp b;
+ b.setObjAppend(dst, (void*)src, len);
+ commitJob.basicWrites().push_back(b);
+ if( testIntent )
+ dst = MongoMMF::switchToPrivateView(dst);
+ memcpy(dst, src, len);
+ char *p = static_cast<char*>(dst);
+ p[-3] = (char) Object; // { ..., o: <copiedobj>, ..., EOO}
+ p[-2] = 'o';
+ p[-1] = 0;
+ p[len] = EOO;
+ return true;
+ }
void enableDurability() { // TODO: merge with startup() ?
assert(typeid(*DurableInterface::_impl) == typeid(NonDurableImpl));
@@ -88,11 +138,6 @@ namespace mongo {
DurableInterface::_impl = new DurableImpl();
}
- // later in this file
- static void groupCommit();
-
- CommitJob commitJob;
-
bool DurableImpl::commitNow() {
groupCommit();
return true;
@@ -124,12 +169,6 @@ namespace mongo {
groupCommit();
}
- /** declare write intent. when already in the write view if testIntent is true. */
- void DurableImpl::declareWriteIntent(void *p, unsigned len) {
- WriteIntent w(p, len);
- commitJob.note(w);
- }
-
void* DurableImpl::writingPtr(void *x, unsigned len) {
void *p = x;
if( testIntent )
@@ -171,15 +210,15 @@ namespace mongo {
++n;
assert(debug && cmdLine.dur);
- vector<WriteIntent>& w = commitJob.writes();
+ vector<BasicWriteOp>& w = commitJob.basicWrites();
if( w.size() == 0 )
return;
- const WriteIntent &i = w[w.size()-1];
+ const BasicWriteOp &i = w[w.size()-1];
size_t ofs;
- MongoMMF *mmf = privateViews.find(i.p, ofs);
+ MongoMMF *mmf = privateViews.find(i.src, ofs);
if( mmf == 0 )
return;
- size_t past = ofs + i.len;
+ size_t past = ofs + i.len();
if( mmf->length() < past + 8 )
return; // too close to end of view
char *priv = (char *) mmf->getView();
@@ -188,15 +227,15 @@ namespace mongo {
unsigned long long *b = (unsigned long long *) (writ+past);
if( *a != *b ) {
for( unsigned z = 0; z < w.size() - 1; z++ ) {
- const WriteIntent& wi = w[z];
- char *r1 = (char*) wi.p;
- char *r2 = r1 + wi.len;
+ const BasicWriteOp& wi = w[z];
+ char *r1 = (char*) wi.src;
+ char *r2 = r1 + wi.len();
if( r1 <= (((char*)a)+8) && r2 > (char*)a ) {
//log() << "it's ok " << wi.p << ' ' << wi.len << endl;
return;
}
}
- log() << "dur data after write area " << i.p << " does not agree" << endl;
+ log() << "dur data after write area " << i.src << " does not agree" << endl;
log() << " was: " << ((void*)b) << " " << hexdump((char*)b, 8) << endl;
log() << " now: " << ((void*)a) << " " << hexdump((char*)a, 8) << endl;
log() << " n: " << n << endl;
@@ -205,91 +244,6 @@ namespace mongo {
}
#endif
- RelativePath local = RelativePath::fromRelativePath("local");
-
- /** we will build an output buffer ourself and then use O_DIRECT
- we could be in read lock for this
- caller handles locking
- */
- static void PREPLOGBUFFER() {
- assert( cmdLine.dur );
- AlignedBuilder& bb = commitJob._ab;
- bb.reset();
-
- unsigned lenOfs; // we will need to backfill the length when prep is wrapping up
- // JSectHeader
- {
- bb.appendStr("\nHH\n", false);
- lenOfs = bb.skip(4);
- }
-
- // ops other than basic writes (DurOp's)
- {
- for( vector< shared_ptr<DurOp> >::iterator i = commitJob.ops().begin(); i != commitJob.ops().end(); ++i ) {
- (*i)->serialize(bb);
- }
- }
-
- // write intents. note there is no particular order to these : if we have
- // two writes to the same location during the group commit interval, it is likely
- // (although not assured) that it is journalled here once.
- {
- scoped_lock lk(privateViews._mutex());
- RelativePath lastFilePath;
- for( vector<WriteIntent>::iterator i = commitJob.writes().begin(); i != commitJob.writes().end(); i++ ) {
- size_t ofs;
- MongoMMF *mmf = privateViews._find(i->p, ofs);
- if( mmf == 0 ) {
- string s = str::stream() << "view pointer cannot be resolved " << (size_t) i->p;
- journalingFailure(s.c_str()); // asserts
- return;
- }
-
- if( !mmf->willNeedRemap() ) {
- // tag this mmf as needed a remap of its private view later.
- // usually it will already be dirty/already set, so we do the if above first
- // to avoid possibility of cpu cache line contention
- mmf->willNeedRemap() = true;
- }
- i->w_ptr = ((char*)mmf->view_write()) + ofs;
- JEntry e;
- e.len = i->len;
- assert( ofs <= 0x80000000 );
- e.ofs = (unsigned) ofs;
- e.setFileNo( mmf->fileSuffixNo() );
- if( mmf->relativePath() == local ) {
- e.setLocalDbContextBit();
- }
- else if( mmf->relativePath() != lastFilePath ) {
- lastFilePath = mmf->relativePath();
- //assert( !str::startsWith(lastFilePath, dbpath) ); // dbpath should be stripped this is a relative path
- JDbContext c;
- bb.appendStruct(c);
- bb.appendStr(lastFilePath.toString());
- }
- bb.appendStruct(e);
- bb.appendBuf(i->p, i->len);
- }
- }
-
- {
- JSectFooter f(bb.buf(), bb.len());
- bb.appendStruct(f);
- }
-
- {
- assert( 0xffffe000 == (~(Alignment-1)) );
- unsigned L = (bb.len() + Alignment-1) & (~(Alignment-1)); // fill to alignment
- dassert( L >= (unsigned) bb.len() );
- *((unsigned*)bb.atOfs(lenOfs)) = L;
- unsigned padding = L - bb.len();
- bb.skip(padding);
- dassert( bb.len() % Alignment == 0 );
- }
-
- return;
- }
-
/** write the buffer we have built to the journal and fsync it.
outside of lock as that could be slow.
*/
@@ -326,9 +280,23 @@ namespace mongo {
unsigned low = 0xffffffff;
unsigned high = 0;
+ log() << "DurParanoid mismatch in " << mmf->filename() << endl;
+ int logged = 0;
+ unsigned lastMismatch = 0xffffffff;
for( unsigned i = 0; i < mmf->length(); i++ ) {
if( p[i] != w[i] ) {
- log() << i << '\t' << (int) p[i] << '\t' << (int) w[i] << endl;
+ if( lastMismatch != 0xffffffff && lastMismatch+1 != i )
+ log() << endl; // separate blocks of mismatches
+ lastMismatch= i;
+ if( ++logged < 60 ) {
+ stringstream ss;
+ ss << "mismatch ofs:" << hex << i << "\tfilemap:" << setw(2) << (unsigned) w[i] << "\tprivmap:" << setw(2) << (unsigned) p[i];
+ if( p[i] > 32 && p[i] <= 126 )
+ ss << '\t' << p[i];
+ log() << ss.str() << endl;
+ }
+ if( logged == 60 )
+ log() << "..." << endl;
if( i < low ) low = i;
if( i > high ) high = i;
}
@@ -337,9 +305,9 @@ namespace mongo {
std::stringstream ss;
ss << "dur error warning views mismatch " << mmf->filename() << ' ' << (hex) << low << ".." << high << " len:" << high-low+1;
log() << ss.str() << endl;
- log() << "priv loc: " << (void*)(p+low) << endl;
- vector<WriteIntent>& w = commitJob.writes();
- (void)w; // mark as unused. Useful for inspection in debugger
+ log() << "priv loc: " << (void*)(p+low) << ' ' << stats.curr._objCopies << endl;
+ vector<BasicWriteOp>& b = commitJob.basicWrites();
+ (void)b; // mark as unused. Useful for inspection in debugger
massert(13599, "Written data does not match in-memory view. Missing WriteIntent?", false);
}
@@ -412,6 +380,9 @@ namespace mongo {
stats.curr._commits++;
dbMutex.assertAtLeastReadLocked();
+ if( dbMutex.isWriteLocked() ) {
+ stats.curr._commitsInWriteLock++;
+ }
if( !commitJob.hasWritten() )
return;
@@ -514,6 +485,7 @@ namespace mongo {
void unlinkThread();
void recover();
+
void releasingWriteLock() {
try {
#if defined(_DEBUG)
@@ -548,7 +520,5 @@ namespace mongo {
}
} // namespace dur
-
-
+
} // namespace mongo
-
diff --git a/db/dur.h b/db/dur.h
index 2f4ca6ca2b2..c27e8614637 100644
--- a/db/dur.h
+++ b/db/dur.h
@@ -28,6 +28,21 @@ namespace mongo {
/** Declare a database is about to be dropped */
virtual void droppingDb(string db) = 0;
+ /** this function appends a nested bsonobj, with the fieldname "o", to a partially built
+ bson object. it then terminates the outer bson object with EOO. The caller is responsible
+ for setting the outer bson object's total size correctly.
+
+ It also journals the operation. An optimization occurs in the journaling:
+ if the src was just journaled, a new copy is not written; instead a small
+ JObjAppend record is added to the journal.
+
+ If the src was not recently journaled, that is ok: in that case the function returns false
+ and you must fall back to the traditional write path.
+
+ This is used to make writing to the replication oplog efficient.
+ */
+ virtual bool objAppend(void *dst, const void *src, unsigned len) = 0;
+
/** Declarations of write intent.
Use these methods to declare "i'm about to write to x and it should be logged for redo."
@@ -143,6 +158,7 @@ namespace mongo {
void declareWriteIntent(void *, unsigned) { }
void createdFile(string filename, unsigned long long len) { }
void droppingDb(string db) { }
+ bool objAppend(void *dst, const void *src, unsigned len) { return false; }
bool awaitCommit() { return false; }
bool commitNow() { return false; }
#if defined(_DEBUG)
@@ -157,6 +173,7 @@ namespace mongo {
void declareWriteIntent(void *, unsigned);
void createdFile(string filename, unsigned long long len);
void droppingDb(string db);
+ bool objAppend(void *dst, const void *src, unsigned len);
bool awaitCommit();
bool commitNow();
#if defined(_DEBUG)
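From the caller's side, the objAppend() contract is: try the fast path, and if it returns false fall back to the ordinary writingPtr() path and build the { o: ... } framing yourself. A condensed sketch of the pattern the oplog change below uses (objInsertPoint is where the nested object's bytes land inside the new record; the 32-byte threshold comes from that change; pointer arithmetic is simplified here):

    bool fast = obj.objsize() >= 32 &&               // tiny objects aren't worth the extra journal entry
                getDur().objAppend(objInsertPoint, obj.objdata(), obj.objsize());
    if( !fast ) {
        // traditional path: declare the region and write the framing bytes by hand
        char *p = (char *) getDur().writingPtr(objInsertPoint - 3, obj.objsize() + 4);
        *p++ = (char) Object;        // field type byte
        *p++ = 'o';                  // field name
        *p++ = 0;                    //   ...null terminated
        memcpy(p, obj.objdata(), obj.objsize());
        p[obj.objsize()] = EOO;
    }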
diff --git a/db/dur_commitjob.cpp b/db/dur_commitjob.cpp
index 049ee41c3fa..ba74c66272d 100644
--- a/db/dur_commitjob.cpp
+++ b/db/dur_commitjob.cpp
@@ -24,7 +24,7 @@ namespace mongo {
void Writes::clear() {
_alreadyNoted.clear();
- _writes.clear();
+ _basicWrites.clear();
_ops.clear();
}
diff --git a/db/dur_commitjob.h b/db/dur_commitjob.h
index e265f7121b4..ceca4a0da07 100644
--- a/db/dur_commitjob.h
+++ b/db/dur_commitjob.h
@@ -29,13 +29,31 @@
namespace mongo {
namespace dur {
- /** declaration of an intent to write to a region of a memory mapped view */
- struct WriteIntent /* copyable */ {
- WriteIntent() : w_ptr(0), p(0) { }
- WriteIntent(void *a, unsigned b) : w_ptr(0), p(a), len(b) { }
- void *w_ptr; // p is mapped from private to equivalent location in the writable mmap
- void *p; // intent to write at p
- unsigned len; // up to this len
+ /** "I intend to write at p for up to len bytes" */
+ struct WriteIntent {
+ void *p;
+ unsigned len;
+ };
+
+ /** declaration of an intent to write to a region of a memory mapped view
+ this could be either a JEntry or a JObjAppend in the journal
+ */
+ struct BasicWriteOp /* copyable */ {
+ void *dst;
+ void *src;
+ unsigned len() const { return _len & 0x7fffffff; }
+ bool isObjAppend() const { return _len & 0x80000000; }
+ void set(void *Src, unsigned Len) {
+ dst = 0;
+ src = Src;
+ _len = Len;
+ }
+ void setObjAppend(void *Dst, void *Src, unsigned Len) {
+ dst = Dst; src = Src;
+ _len = Len | 0x80000000;
+ }
+ private:
+ unsigned _len;
};
/** try to remember things we have already marked for journalling. false negatives are ok if infrequent -
@@ -64,6 +82,15 @@ namespace mongo {
nd = w;
return false; // a new set
}
+
+ /**
+ @return true if already indicated.
+ */
+ bool check(const WriteIntent& w) {
+ unsigned x = mongoutils::hashPointer(w.p);
+ WriteIntent& nd = nodes[x % N];
+ return nd.p == w.p && nd.len >= w.len;
+ }
private:
enum { N = Prime }; // this should be small the idea is that it fits in the cpu cache easily
WriteIntent nodes[N];
@@ -73,8 +100,9 @@ namespace mongo {
class Writes : boost::noncopyable {
public:
Already<127> _alreadyNoted;
- vector<WriteIntent> _writes;
+ vector<BasicWriteOp> _basicWrites;
vector< shared_ptr<DurOp> > _ops; // all the ops other than basic writes
+
/** reset the Writes structure (empties all the above) */
void clear();
};
@@ -97,7 +125,10 @@ namespace mongo {
/** note an operation other than a "basic write" */
void noteOp(shared_ptr<DurOp> p);
- vector<WriteIntent>& writes() { return _wi._writes; }
+ /** @return true if was already noted. false negatives possible (to be fast). */
+ bool alreadyNoted(WriteIntent& w) { return _wi._alreadyNoted.check(w); }
+
+ vector<BasicWriteOp>& basicWrites() { return _wi._basicWrites; }
vector< shared_ptr<DurOp> >& ops() { return _wi._ops; }
/** this method is safe to call outside of locks. when haswritten is false we don't do any group commit and avoid even
@@ -117,13 +148,14 @@ namespace mongo {
_notify.wait();
}
+ /** we check how much has been written, and if it is getting to be a lot, we commit sooner. */
size_t bytes() const { return _bytes; }
private:
bool _hasWritten;
Writes _wi;
- NotifyAll _notify;
size_t _bytes;
+ NotifyAll _notify; // for getlasterror fsync:true acknowledgements
};
extern CommitJob commitJob;
@@ -142,7 +174,10 @@ namespace mongo {
dassert( cmdLine.dur );
if( !_wi._alreadyNoted.checkAndSet(w) ) {
if( !_hasWritten ) {
+ // you can't be writing if one of these is pending, so this is a verification.
assert( !dbMutex._remapPrivateViewRequested );
+
+ // we don't bother doing a group commit when nothing is written, so we have a var to track that
_hasWritten = true;
}
@@ -164,6 +199,10 @@ namespace mongo {
else {
log() << "DEBUG note write intent " << w.p << ' ' << w.len << " NOT FOUND IN privateViews" << endl;
}
+ /*if( w.len > 48 ) {
+ log() << "big TEMP" << endl;
+ log() << hexdump((char*) w.p, 48) << endl;
+ }*/
}
else if( n == 10000 ) {
log() << "DEBUG stopping write intent logging, too much to log" << endl;
@@ -172,10 +211,11 @@ namespace mongo {
#endif
// remember intent. we will journal it in a bit
- _wi._writes.push_back(w);
- _bytes += w.len;
- wassert( _wi._writes.size() < 2000000 );
- //assert( _wi._writes.size() < 20000000 );
+ BasicWriteOp b;
+ b.set(w.p, w.len);
+ _wi._basicWrites.push_back(b);
+ wassert( _wi._basicWrites.size() < 2000000 );
+ assert( _wi._basicWrites.size() < 20000000 );
}
}
}
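BasicWriteOp packs the objAppend flag into the high bit of _len so a single vector can carry both kinds of operation: set() stores the plain length, setObjAppend() ORs in 0x80000000, len() masks the flag back off, and isObjAppend() tests the bit. A worked illustration of the encoding (dst and src stand for any pair of mapped addresses):

    BasicWriteOp b;
    b.setObjAppend(dst, src, 100);   // _len = 100 | 0x80000000 = 0x80000064
    b.len();                         // 100   (0x80000064 & 0x7fffffff)
    b.isObjAppend();                 // true  (high bit set)

The encoding caps any single write intent at 2^31 - 1 bytes.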
diff --git a/db/dur_journal.cpp b/db/dur_journal.cpp
index f8ac1ff41d9..d7af952b0a4 100644
--- a/db/dur_journal.cpp
+++ b/db/dur_journal.cpp
@@ -32,8 +32,9 @@
#include "../util/mongoutils/str.h"
#include "../util/concurrency/mvar.h"
+using namespace mongoutils;
+
namespace mongo {
- using namespace mongoutils;
class AlignedBuilder;
diff --git a/db/dur_journalformat.h b/db/dur_journalformat.h
index 6f6d6d2f5d3..7c8473e830f 100644
--- a/db/dur_journalformat.h
+++ b/db/dur_journalformat.h
@@ -70,27 +70,58 @@ namespace mongo {
OpCode_DbContext = 0xfffffffe,
OpCode_FileCreated = 0xfffffffd,
OpCode_DropDb = 0xfffffffc,
+ OpCode_ObjAppend = 0xfffffffb,
OpCode_Min = 0xfffff000
};
-
union {
- unsigned len;
+ unsigned len; // length in bytes of the data of the JEntry. does not include the JEntry header
OpCodes opcode;
};
+
unsigned ofs; // offset in file
- enum {
+ // sentinel and masks for _fileNo
+ enum {
DotNsSuffix = 0x7fffffff, // ".ns" file
LocalDbBit = 0x80000000 // assuming "local" db instead of using the JDbContext
};
int _fileNo; // high bit is set to indicate it should be the <dbpath>/local database
// char data[] follows
+ const char * srcData() const {
+ const int *i = &_fileNo;
+ return (const char *) (i+1);
+ }
+
int getFileNo() const { return _fileNo & (~LocalDbBit); }
void setFileNo(int f) { _fileNo = f; }
+ bool isNsSuffix() const { return getFileNo() == DotNsSuffix; }
+
void setLocalDbContextBit() { _fileNo |= LocalDbBit; }
bool isLocalDbContext() const { return _fileNo & LocalDbBit; }
void clearLocalDbContextBit() { _fileNo = getFileNo(); }
+
+ static string suffix(int fileno) {
+ if( fileno == DotNsSuffix ) return "ns";
+ stringstream ss;
+ ss << fileno;
+ return ss.str();
+ }
+ };
+
+ /** append an object to a partial BSON buffer under construction.
+ adds
+ o : <data>, EOO
+ len is the length of the data
+ */
+ struct JObjAppend {
+ JObjAppend() : opcode(JEntry::OpCode_ObjAppend) { }
+ unsigned opcode;
+ int dstFileNo; // destination in the local database.
+ unsigned dstOfs;
+ int srcFileNo; // source in the database of current context (JDbContext)
+ unsigned srcOfs;
+ unsigned len;
};
/** group commit section footer. md5 is a key field. */
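The size difference between the two record kinds is the whole point of OpCode_ObjAppend. A basic write is a 12-byte JEntry header followed by the data itself, while a JObjAppend is six 4-byte fields with no payload at all (field sizes read off the structs above; no padding assumed):

    JEntry record:      4 (len) + 4 (ofs) + 4 (_fileNo) + len payload bytes   = 12 + len
    JObjAppend record:  4 (opcode) + 4 (dstFileNo) + 4 (dstOfs)
                        + 4 (srcFileNo) + 4 (srcOfs) + 4 (len)                = 24, payload not repeated

So journaling the oplog copy of, say, a 1KB document costs 24 bytes instead of roughly 1036.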
diff --git a/db/dur_preplogbuffer.cpp b/db/dur_preplogbuffer.cpp
new file mode 100644
index 00000000000..dac7d62cf8f
--- /dev/null
+++ b/db/dur_preplogbuffer.cpp
@@ -0,0 +1,175 @@
+// @file dur_preplogbuffer.cpp
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ PREPLOGBUFFER
+ we will build an output buffer ourself and then use O_DIRECT
+ we could be in read lock for this
+ for very large objects write directly to redo log in situ?
+ @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc
+*/
+
+#include "pch.h"
+#include "cmdline.h"
+#include "dur.h"
+#include "dur_journal.h"
+#include "dur_commitjob.h"
+#include "../util/mongoutils/hash.h"
+#include "../util/mongoutils/str.h"
+#include "dur_stats.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+ namespace dur {
+
+ RelativePath local = RelativePath::fromRelativePath("local");
+
+ MongoMMF* findMMF(void *ptr, size_t &ofs) {
+ MongoMMF *f = privateViews._find(ptr, ofs);
+ if( f == 0 ) {
+ string s = str::stream() << "view pointer cannot be resolved " << (size_t) ptr;
+ journalingFailure(s.c_str()); // asserts
+ }
+ return f;
+ }
+
+ /** we will build an output buffer ourself and then use O_DIRECT
+ we could be in read lock for this
+ caller handles locking
+ */
+ void PREPLOGBUFFER() {
+ assert( cmdLine.dur );
+ AlignedBuilder& bb = commitJob._ab;
+ bb.reset();
+
+ unsigned lenOfs; // we will need to backfill the length when prep is wrapping up
+
+ // JSectHeader
+ {
+ bb.appendStr("\nHH\n", false);
+ lenOfs = bb.skip(4);
+ }
+
+ // ops other than basic writes (DurOp's)
+ {
+ for( vector< shared_ptr<DurOp> >::iterator i = commitJob.ops().begin(); i != commitJob.ops().end(); ++i ) {
+ (*i)->serialize(bb);
+ }
+ }
+
+ // basic write ops / write intents. note there is no particular order to these : if we have
+ // two writes to the same location during the group commit interval, it is likely
+ // (although not assured) that it is journaled here once.
+ //
+ // "objAppend" operations are herein too
+ //
+ {
+ scoped_lock lk(privateViews._mutex());
+
+ // each time events switch to a different database we journal a JDbContext
+ RelativePath lastDbPath;
+
+ for( vector<BasicWriteOp>::iterator i = commitJob.basicWrites().begin(); i != commitJob.basicWrites().end(); i++ ) {
+ size_t ofs = 1;
+ MongoMMF *mmf = findMMF(i->src, /*out*/ofs);
+
+ if( i->dst ) {
+ // objAppend operation
+
+ if( mmf->relativePath() != lastDbPath ) {
+ lastDbPath = mmf->relativePath();
+ dassert( lastDbPath != local ); // objAppend is not used for the local database
+ JDbContext c;
+ bb.appendStruct(c);
+ bb.appendStr(lastDbPath.toString());
+ }
+
+ size_t dstofs;
+ MongoMMF *dstmmf = findMMF(i->dst, dstofs);
+ if( !dstmmf->willNeedRemap() ) {
+ dstmmf->willNeedRemap() = true;
+ }
+
+ // since we have already looked up the mmf, we go ahead and remember the write view location
+ // so we don't have to find the MongoMMF again later in WRITETODATAFILES()
+ i->dst = ((char*)dstmmf->view_write()) + dstofs;
+
+ JObjAppend d;
+ d.dstFileNo = dstmmf->fileSuffixNo();
+ d.dstOfs = (unsigned) dstofs;
+ d.srcFileNo = mmf->fileSuffixNo();
+ d.srcOfs = ofs;
+ d.len = i->len();
+ bb.appendStruct(d);
+ ++stats.curr._objCopies;
+ }
+ else {
+ if( !mmf->willNeedRemap() ) {
+ // tag this mmf as needing a remap of its private view later.
+ // usually it will already be dirty/already set, so we do the if above first
+ // to avoid possibility of cpu cache line contention
+ mmf->willNeedRemap() = true;
+ }
+
+ // since we have already looked up the mmf, we go ahead and remember the write view location
+ // so we don't have to find the MongoMMF again later in WRITETODATAFILES()
+ dassert( i->dst == 0 );
+ i->dst = ((char*)mmf->view_write()) + ofs;
+
+ JEntry e;
+ e.len = i->len();
+ assert( ofs <= 0x80000000 );
+ e.ofs = (unsigned) ofs;
+ e.setFileNo( mmf->fileSuffixNo() );
+ if( mmf->relativePath() == local ) {
+ e.setLocalDbContextBit();
+ }
+ else if( mmf->relativePath() != lastDbPath ) {
+ lastDbPath = mmf->relativePath();
+ JDbContext c;
+ bb.appendStruct(c);
+ bb.appendStr(lastDbPath.toString());
+ }
+ bb.appendStruct(e);
+ bb.appendBuf(i->src, i->len());
+ }
+ }
+ }
+
+ {
+ JSectFooter f(bb.buf(), bb.len());
+ bb.appendStruct(f);
+ }
+
+ {
+ // pad to alignment, and set the total section length in the JSectHeader
+ assert( 0xffffe000 == (~(Alignment-1)) );
+ unsigned L = (bb.len() + Alignment-1) & (~(Alignment-1));
+ dassert( L >= (unsigned) bb.len() );
+ *((unsigned*)bb.atOfs(lenOfs)) = L;
+ unsigned padding = L - bb.len();
+ bb.skip(padding);
+ dassert( bb.len() % Alignment == 0 );
+ }
+
+ return;
+ }
+
+ }
+}
\ No newline at end of file
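The padding block at the end of PREPLOGBUFFER is easier to read with numbers plugged in. The assert pins Alignment at 0x2000 (8192 bytes), since ~(0x2000 - 1) == 0xffffe000. For a section whose raw length is, say, 5000 bytes:

    L       = (5000 + 8191) & ~8191  = 8192    // round up to the next 8KB boundary
    padding = 8192 - 5000            = 3192    // bytes reserved via bb.skip()

and it is the padded length, 8192, not the raw 5000, that gets backfilled into the JSectHeader length slot at lenOfs.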
diff --git a/db/dur_recover.cpp b/db/dur_recover.cpp
index 586fa6f7a39..9d55974a2d1 100644
--- a/db/dur_recover.cpp
+++ b/db/dur_recover.cpp
@@ -37,23 +37,19 @@ namespace mongo {
namespace dur {
- /** todo clean up : this is messy from refactoring. */
- struct FullyQualifiedJournalEntry {
- bool isBasicWrite() const { return dbName != 0; }
+ struct ParsedJournalEntry /*copyable*/ {
+ ParsedJournalEntry() : e(0), d(0) { }
- // relative path of database for the operation.
- // might be a pointer into mmaped Journal file
- const char *dbName;
+ // relative path of database for the operation.
+ // might be a pointer into mmaped Journal file
+ const char *dbName;
+ // these are pointers into the memory mapped journal file
+ const JEntry *e; // local db sentinel is already parsed out here into dbName
+ const JObjAppend *d;
- // local db sentinel is already parsed out here into dbName
- JEntry e;
-
- // pointer into mmaped Journal file
- const char *srcData;
-
- // if not a simple JEntry, the operation
- shared_ptr<DurOp> op;
+ // if not one of the two simple entry types above, this is the operation:
+ shared_ptr<DurOp> op;
};
void removeJournalFiles();
@@ -105,7 +101,9 @@ namespace mongo {
* @return true if got an entry. false at successful end of section (and no entry returned).
* throws on premature end of section.
*/
- bool next(FullyQualifiedJournalEntry& e) {
+ bool next(ParsedJournalEntry& e) {
+ assert( e.e == 0 );
+
if( !_sectHead ) {
_sectHead = static_cast<const JSectHeader*>(_br.pos());
_br.skip(sizeof(JSectHeader));
@@ -113,36 +111,36 @@ namespace mongo {
unsigned lenOrOpCode;
_br.read(lenOrOpCode);
- if( lenOrOpCode >= JEntry::OpCode_Min ) {
- // not a "basic write"
- if( lenOrOpCode == JEntry::OpCode_Footer ) {
+ switch( lenOrOpCode ) {
+
+ case JEntry::OpCode_Footer:
+ {
const char* pos = (const char*) _br.pos();
pos -= sizeof(lenOrOpCode); // rewind to include OpCode
const JSectFooter& footer = *(const JSectFooter*)pos;
-
int len = pos - (char*)_sectHead;
if (!footer.checkHash(_sectHead, len)){
massert(13594, str::stream() << "Journal checksum doesn't match. recorded: "
- << toHex(footer.hash, sizeof(footer.hash))
- << " actual: " << md5simpledigest(_sectHead, len)
- , false);
+ << toHex(footer.hash, sizeof(footer.hash))
+ << " actual: " << md5simpledigest(_sectHead, len)
+ , false);
}
-
_br.skip(sizeof(JSectFooter) - 4);
_br.align(Alignment);
-
_sectHead = NULL;
- return false;
+ return false; // false return value denotes end of section
}
- if( lenOrOpCode != JEntry::OpCode_DbContext ) {
+ case JEntry::OpCode_FileCreated:
+ case JEntry::OpCode_DropDb:
+ {
e.dbName = 0;
e.op = DurOp::read(lenOrOpCode, _br);
return true;
}
- // JDbContext
+ case JEntry::OpCode_DbContext:
{
_lastDbName = (const char*) _br.pos();
const unsigned limit = std::min((unsigned)Namespace::MaxNsLen, _br.remaining());
@@ -151,19 +149,28 @@ namespace mongo {
_br.skip(len+1); // skip '\0' too
_br.read(lenOrOpCode);
}
+ // fall through as a basic operation always follows jdbcontext, and we don't have anything to return yet
+
+ case JEntry::OpCode_ObjAppend:
+ default:
+ // fall through
+ ;
}
- // JEntry - a basic write
- assert( lenOrOpCode && lenOrOpCode < JEntry::OpCode_Min );
- e.dbName = _lastDbName;
- e.e.len = lenOrOpCode;
- _br.read(e.e.ofs);
- _br.read(e.e._fileNo);
- if( e.e.isLocalDbContext() ) {
- e.dbName = "local";
- e.e.clearLocalDbContextBit();
+ {
+ assert( lenOrOpCode && lenOrOpCode <= JEntry::OpCode_ObjAppend );
+ _br.rewind(4);
+ if( lenOrOpCode == JEntry::OpCode_ObjAppend ) {
+ e.d = (JObjAppend *) _br.skip(sizeof(JObjAppend));
+ e.dbName = _lastDbName;
+ }
+ else {
+ e.e = (JEntry *) _br.skip(sizeof(JEntry));
+ e.dbName = e.e->isLocalDbContext() ? "local" : _lastDbName;
+ dassert( e.e->len == lenOrOpCode );
+ _br.skip(e.e->len);
+ }
}
- e.srcData = (const char *) _br.skip(lenOrOpCode);
return true;
}
private:
@@ -179,7 +186,8 @@ namespace mongo {
void go(vector<path>& files);
~RecoveryJob();
private:
- void applyEntries(const vector<FullyQualifiedJournalEntry> &entries);
+ void applyEntry(const ParsedJournalEntry& entry, bool apply, bool dump);
+ void applyEntries(const vector<ParsedJournalEntry> &entries);
bool processBuffer(void *, unsigned len);
bool processFile(path journalfile);
void close();
@@ -255,67 +263,98 @@ namespace mongo {
_fileToPtr.clear();
}
- void RecoveryJob::applyEntries(const vector<FullyQualifiedJournalEntry> &entries) {
+ void RecoveryJob::applyEntry(const ParsedJournalEntry& entry, bool apply, bool dump) {
+ if( entry.e ) {
+ if( dump ) {
+ stringstream ss;
+ ss << " BASICWRITE " << setw(20) << entry.dbName << '.';
+ if( entry.e->isNsSuffix() )
+ ss << "ns";
+ else
+ ss << setw(2) << entry.e->getFileNo();
+ ss << ' ' << setw(6) << entry.e->len << ' ' << /*hex << setw(8) << (size_t) fqe.srcData << dec <<*/
+ " " << hexdump(entry.e->srcData(), entry.e->len);
+ log() << ss.str() << endl;
+ }
+ if( apply ) {
+ void *p = ptr(entry.dbName, entry.e->getFileNo(), entry.e->ofs);
+ memcpy(p, entry.e->srcData(), entry.e->len);
+ }
+ }
+ else if( entry.d ) {
+ // OpCode_ObjAppend (struct JObjAppend)
+ if( dump ) {
+ stringstream ss;
+ ss << " JObjAppend dst: local." << JEntry::suffix(entry.d->dstFileNo) << " ofs:" << entry.d->dstOfs;
+ ss << " src:" << entry.dbName << '.' << JEntry::suffix(entry.d->srcFileNo) << " ofs:" << entry.d->srcOfs;
+ ss << " len:" << entry.d->len;
+ log() << ss.str() << endl;
+ }
+ if( apply ) {
+ void *dst = ptr("local", entry.d->dstFileNo, entry.d->dstOfs);
+ void *src = ptr(entry.dbName, entry.d->srcFileNo, entry.d->srcOfs);
+ memcpy(dst, src, entry.d->len);
+ char *p = (char *) dst;
+ p[-3] = (char) Object; // { ..., o: <copiedobj>, ..., EOO}
+ p[-2] = 'o';
+ p[-1] = 0;
+ p[entry.d->len] = EOO;
+ }
+ }
+ else {
+ // a DurOp subclass operation
+ if( dump ) {
+ log() << " OP " << entry.op->toString() << endl;
+ }
+ if( apply ) {
+ if( entry.op->needFilesClosed() ) {
+ close();
+ }
+ entry.op->replay();
+ }
+ }
+ }
+
+ void RecoveryJob::applyEntries(const vector<ParsedJournalEntry> &entries) {
bool apply = (cmdLine.durOptions & CmdLine::DurScanOnly) == 0;
bool dump = cmdLine.durOptions & CmdLine::DurDumpJournal;
if( dump )
log() << "BEGIN section" << endl;
- for( vector<FullyQualifiedJournalEntry>::const_iterator i = entries.begin(); i != entries.end(); ++i ) {
- const FullyQualifiedJournalEntry& fqe = *i;
- if( fqe.isBasicWrite() ) {
- if( dump ) {
- stringstream ss;
- ss << " BASICWRITE " << setw(20) << fqe.dbName << '.';
- if( fqe.e._fileNo == JEntry::DotNsSuffix )
- ss << "ns";
- else
- ss << setw(2) << fqe.e._fileNo;
- ss << ' ' << setw(6) << fqe.e.len << ' ' << hex << setw(8) << (size_t) fqe.srcData << dec << " " << hexdump(fqe.srcData, fqe.e.len);
- log() << ss.str() << endl;
- }
- if( apply ) {
- void *p = ptr(fqe.dbName, fqe.e._fileNo, fqe.e.ofs);
- memcpy(p, fqe.srcData, fqe.e.len);
- }
- } else {
- if( dump ) {
- log() << " OP " << fqe.op->toString() << endl;
- }
- if( apply ) {
- if( fqe.op->needFilesClosed() ) {
- close();
- }
- fqe.op->replay();
- }
- }
+ for( vector<ParsedJournalEntry>::const_iterator i = entries.begin(); i != entries.end(); ++i ) {
+ applyEntry(*i, apply, dump);
}
if( dump )
log() << "END section" << endl;
}
- /** @param p start of the memory mapped file
+ /** apply a specific journal file, that is already mmap'd
+ @param p start of the memory mapped file
@return true if this is detected to be the last file (ends abruptly)
*/
bool RecoveryJob::processBuffer(void *p, unsigned len) {
JournalIterator i(p, len);
- vector<FullyQualifiedJournalEntry> entries;
+ vector<ParsedJournalEntry> entries;
try {
while( 1 ) {
entries.clear();
- FullyQualifiedJournalEntry e;
- while( i.next(e) )
+ while( 1 ) {
+ ParsedJournalEntry e;
+ if( !i.next(e) )
+ break;
entries.push_back(e);
+ }
// got all the entries for one group commit. apply them:
applyEntries(entries);
- // now do the next section (i.e. group commit)
if( i.atEof() )
break;
+
+ // loop back and do the new group commit section
}
}
catch( BufReader::eof& ) {
@@ -327,6 +366,7 @@ namespace mongo {
return false; // non-abrupt end
}
+ /** apply a specific journal file */
bool RecoveryJob::processFile(path journalfile) {
log() << "recover " << journalfile.string() << endl;
MemoryMappedFile f;
@@ -335,6 +375,7 @@ namespace mongo {
return processBuffer(p, (unsigned) f.length());
}
+ /** @param files all the j._0 style files we need to apply for recovery */
void RecoveryJob::go(vector<path>& files) {
log() << "recover begin" << endl;
@@ -392,7 +433,7 @@ namespace mongo {
BufReader r((void*) "abcdabcdabcd", 12);
char x;
BufReaderY y;
- r.read(x); cout << x; // a
+ r.read(x); //cout << x; // a
assert( x == 'a' );
r.read(y);
r.read(x);
diff --git a/db/dur_stats.h b/db/dur_stats.h
index f9fe3f7eb7b..cd76efb01a9 100644
--- a/db/dur_stats.h
+++ b/db/dur_stats.h
@@ -10,9 +10,16 @@ namespace mongo {
Stats();
struct S {
unsigned _commits;
- unsigned _dittos;
+ unsigned _objCopies;
unsigned long long _journaledBytes;
unsigned long long _writeToDataFilesBytes;
+
+ // it is undesirable to be in write lock for the group commit (it can be done in a read lock), so it is
+ // good to have visibility when this happens. it can happen for a couple of reasons:
+ // - read lock starvation
+ // - file being closed
+ // - data being written faster than the normal group commit interval
+ unsigned _commitsInWriteLock;
} curr;
};
extern Stats stats;
diff --git a/db/dur_writetodatafiles.cpp b/db/dur_writetodatafiles.cpp
index 21b995c5732..fad947c6470 100644
--- a/db/dur_writetodatafiles.cpp
+++ b/db/dur_writetodatafiles.cpp
@@ -46,11 +46,18 @@ namespace mongo {
*/
void WRITETODATAFILES() {
/* we go backwards as what is at the end is most likely in the cpu cache. it won't be much, but we'll take it. */
- for( int i = commitJob.writes().size() - 1; i >= 0; i-- ) {
- const WriteIntent& intent = commitJob.writes()[i];
- char *dst = (char *) (intent.w_ptr);
- memcpy(dst, intent.p, intent.len);
- stats.curr._writeToDataFilesBytes += intent.len;
+ for( int i = commitJob.basicWrites().size() - 1; i >= 0; i-- ) {
+ const BasicWriteOp& b = commitJob.basicWrites()[i];
+ stats.curr._writeToDataFilesBytes += b.len();
+ dassert(b.dst);
+ memcpy(b.dst, b.src, b.len());
+ if( b.isObjAppend() ) {
+ char *p = static_cast<char*>(b.dst);
+ p[-3] = (char) Object; // { ..., o: <copiedobj>, ..., EOO}
+ p[-2] = 'o';
+ p[-1] = 0;
+ p[b.len()] = EOO;
+ }
}
debugValidateMapsMatch();
diff --git a/db/oplog.cpp b/db/oplog.cpp
index 59dd8f01ade..b7dc8e5949d 100644
--- a/db/oplog.cpp
+++ b/db/oplog.cpp
@@ -217,9 +217,10 @@ namespace mongo {
b.appendBool("b", *bb);
if ( o2 )
b.append("o2", *o2);
- BSONObj partial = b.done();
- int posz = partial.objsize();
- int len = posz + obj.objsize() + 1 + 2 /*o:*/;
+ BSONObj partial = b.done(); // partial is everything except the o:... part.
+
+ int po_sz = partial.objsize();
+ int len = po_sz + obj.objsize() + 1 + 2 /*o:*/;
Record *r;
if( logNS == 0 ) {
@@ -236,21 +237,37 @@ namespace mongo {
} else {
Client::Context ctx( logNS, dbpath, 0, false );
assert( nsdetails( logNS ) );
+ // first we allocate the space, then we fill it below.
r = theDataFileMgr.fast_oplog_insert( nsdetails( logNS ), logNS, len);
}
{
- const int size2 = obj.objsize() + 1 + 2;
- char *p = (char *) getDur().writingPtr(r->data, size2+posz);
- memcpy(p, partial.objdata(), posz);
- *((unsigned *)p) += size2;
- p += posz - 1;
- *p++ = (char) Object;
- *p++ = 'o'; // { o : ... }
- *p++ = 0;
- memcpy(p, obj.objdata(), obj.objsize());
- p += obj.objsize();
- *p = EOO;
+ const int size1 = po_sz-1; // less the EOO char
+ const int objOfs = size1+3; // 3 = byte BSONOBJTYPE + byte 'o' + byte \0
+ char *objInsertPoint = r->data + objOfs;
+
+ // don't bother if obj is tiny as there is some header overhead for the additional journal entry
+ // and also some computational administrative overhead.
+ bool objAppendWorked = obj.objsize() >= 32 &&
+ getDur().objAppend(objInsertPoint, obj.objdata(), obj.objsize());
+
+ void *p = (char *) getDur().writingPtr(r->data, objAppendWorked ? size1 : objOfs+obj.objsize()+1);
+
+ memcpy(p, partial.objdata(), size1);
+
+ // adjust overall bson object size for the o: field
+ *(static_cast<unsigned*>(p)) += obj.objsize() + 1/*fieldtype byte*/ + 2/*"o" fieldname*/;
+
+ if( !objAppendWorked ) {
+ char *b = static_cast<char *>(p);
+ b += size1;
+ *b++ = (char) Object;
+ *b++ = 'o'; // { o : ... }
+ *b++ = 0; // null terminate "o" fieldname
+ memcpy(b, obj.objdata(), obj.objsize());
+ b += obj.objsize();
+ *b = EOO;
+ }
}
context.getClient()->setLastOp( ts.asDate() );
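Either way the finished oplog record ends up with the same byte layout; the only difference is who writes the last few framing bytes (this code on the slow path, objAppend and journal replay on the fast path). With size1 = po_sz - 1 (the partial object minus its trailing EOO), the layout works out to:

    r->data[0 .. size1-1]                    partial object bytes, EOO dropped
    r->data[size1]                           0x03   (BSON type Object)
    r->data[size1+1]                         'o'    (field name)
    r->data[size1+2]                         0x00   (field name terminator)
    r->data[size1+3 .. size1+2+objsize]      the nested object (objInsertPoint = r->data + size1 + 3)
    r->data[size1+3+objsize]                 EOO

which totals po_sz + obj.objsize() + 3, matching len = po_sz + obj.objsize() + 1 + 2 computed above; the outer object's embedded length word is bumped by obj.objsize() + 3 to agree.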
diff --git a/jstests/dur/oplog.js b/jstests/dur/oplog.js
new file mode 100755
index 00000000000..0887826263f
--- /dev/null
+++ b/jstests/dur/oplog.js
@@ -0,0 +1,118 @@
+/*
+ test durability
+*/
+
+var debugging = false;
+if (""+typeof(db) != "undefined")
+ debugging = true;
+var testname = "oplog";
+var step = 1;
+var conn = null;
+
+function log(str) {
+ if(str)
+ print(testname+" step " + step++ + " " + str);
+ else
+ print(testname+" step " + step++);
+}
+
+// if you do inserts here, you will want to set _id. otherwise they won't match on different
+// runs so we can't do a binary diff of the resulting files to check they are consistent.
+function work() {
+ log("work");
+ var d = conn.getDB("test");
+ d.foo.insert({ _id: 3, x: 22 });
+ d.foo.insert({ _id: 4, x: 22 });
+ d.a.insert({ _id: 3, x: 22, y: [1, 2, 3] });
+ d.a.insert({ _id: 4, x: 22, y: [1, 2, 3] });
+ d.a.update({ _id: 4 }, { $inc: { x: 1} });
+ // OpCode_ObjAppend fires on larger operations so make one that isn't tiny
+ var big = "axxxxxxxxxxxxxxb";
+ big = big + big;
+ big = big + big;
+ big = big + big;
+ big = big + big;
+ big = big + big;
+ d.foo.insert({ _id: 5, q: "aaaaa", b: big, z: 3 });
+
+ // assure writes applied in case we kill -9 on return from this function
+ d.getLastError();
+
+ log("endwork");
+}
+
+function verify() {
+ log("verify");
+ var d = conn.getDB("local");
+ assert( d.oplog.$main.find({"o.z":3}).count() == 1, "oplog doesn't match" );
+}
+
+if( debugging ) {
+ // mongod already running in debugger
+ print("DOING DEBUG MODE BEHAVIOR AS 'db' IS DEFINED -- RUN mongo --nodb FOR REGULAR TEST BEHAVIOR");
+ conn = db.getMongo();
+ work();
+ sleep(30000);
+ quit();
+}
+
+log();
+
+// directories
+var path1 = "/data/db/" + testname+"nodur";
+var path2 = "/data/db/" + testname+"dur";
+
+// non-durable version
+log();
+conn = startMongodEmpty("--port", 30000, "--dbpath", path1, "--nodur", "--smallfiles", "--master");
+work();
+stopMongod(30000);
+
+// durable version
+log();
+conn = startMongodEmpty("--port", 30001, "--dbpath", path2, "--dur", "--smallfiles", "--durOptions", /*DurParanoid*/8, "--master");
+work();
+
+// wait for group commit. use getLastError(...) later when that is enhanced.
+sleep(400);
+
+// kill the process hard
+stopMongod(30001, /*signal*/9);
+
+// journal file should be present, and non-empty as we killed hard
+
+// restart and recover
+log();
+conn = startMongodNoReset("--port", 30002, "--dbpath", path2, "--dur", "--smallfiles", "--durOptions", 8, "--master");
+verify();
+
+log("stop");
+stopMongod(30002);
+
+// stopMongod seems to be asynchronous (hmmm) so we sleep here.
+sleep(5000);
+
+// at this point, after clean shutdown, there should be no journal files
+log("check no journal files");
+assert(ls(path2 + "/journal") == null);
+
+log("check data matches ns");
+var diff = run("diff", path1 + "/test.ns", path2 + "/test.ns");
+if (diff != "") {
+ print("\n\n\nDIFFERS\n");
+ print(diff);
+}
+assert(diff == "", "error test.ns files differ");
+
+log("check data matches .0");
+diff = run("diff", path1 + "/test.0", path2 + "/test.0");
+if (diff != "") {
+ print("\n\n\nDIFFERS\n");
+ print(diff);
+}
+assert(diff == "", "error test.0 files differ");
+
+log("check data matches done");
+
+print(testname + " SUCCESS");
+