-rw-r--r--   SConstruct                    |   2
-rw-r--r--   client/dbclient_rs.cpp        |   4
-rw-r--r--   db/bufreader.h                |   6
-rw-r--r--   db/db.vcxproj                 |   2
-rwxr-xr-x   db/db.vcxproj.filters         |   6
-rw-r--r--   db/dur.cpp                    | 204
-rw-r--r--   db/dur.h                      |  17
-rw-r--r--   db/dur_commitjob.cpp          |   2
-rw-r--r--   db/dur_commitjob.h            |  68
-rw-r--r--   db/dur_journal.cpp            |   3
-rw-r--r--   db/dur_journalformat.h        |  37
-rw-r--r--   db/dur_preplogbuffer.cpp      | 175
-rw-r--r--   db/dur_recover.cpp            | 187
-rw-r--r--   db/dur_stats.h                |   9
-rw-r--r--   db/dur_writetodatafiles.cpp   |  17
-rw-r--r--   db/oplog.cpp                  |  45
-rwxr-xr-x   jstests/dur/oplog.js          | 118
17 files changed, 670 insertions, 232 deletions
diff --git a/SConstruct b/SConstruct
index 5f9e087acba..01bb0123024 100644
--- a/SConstruct
+++ b/SConstruct
@@ -327,7 +327,7 @@ coreServerFiles = [ "util/message_server_port.cpp" ,
if has_option( "asio" ):
coreServerFiles += [ "util/message_server_asio.cpp" ]
-serverOnlyFiles = Split( "util/logfile.cpp util/alignedbuilder.cpp db/mongommf.cpp db/dur.cpp db/durop.cpp db/dur_writetodatafiles.cpp db/dur_commitjob.cpp db/dur_recover.cpp db/dur_journal.cpp db/query.cpp db/update.cpp db/introspect.cpp db/btree.cpp db/clientcursor.cpp db/tests.cpp db/repl.cpp db/repl/rs.cpp db/repl/consensus.cpp db/repl/rs_initiate.cpp db/repl/replset_commands.cpp db/repl/manager.cpp db/repl/health.cpp db/repl/heartbeat.cpp db/repl/rs_config.cpp db/repl/rs_rollback.cpp db/repl/rs_sync.cpp db/repl/rs_initialsync.cpp db/oplog.cpp db/repl_block.cpp db/btreecursor.cpp db/cloner.cpp db/namespace.cpp db/cap.cpp db/matcher_covered.cpp db/dbeval.cpp db/restapi.cpp db/dbhelpers.cpp db/instance.cpp db/client.cpp db/database.cpp db/pdfile.cpp db/cursor.cpp db/security_commands.cpp db/security.cpp db/queryoptimizer.cpp db/extsort.cpp db/cmdline.cpp" )
+serverOnlyFiles = Split( "util/logfile.cpp util/alignedbuilder.cpp db/mongommf.cpp db/dur.cpp db/durop.cpp db/dur_writetodatafiles.cpp db/dur_preplogbuffer.cpp db/dur_commitjob.cpp db/dur_recover.cpp db/dur_journal.cpp db/query.cpp db/update.cpp db/introspect.cpp db/btree.cpp db/clientcursor.cpp db/tests.cpp db/repl.cpp db/repl/rs.cpp db/repl/consensus.cpp db/repl/rs_initiate.cpp db/repl/replset_commands.cpp db/repl/manager.cpp db/repl/health.cpp db/repl/heartbeat.cpp db/repl/rs_config.cpp db/repl/rs_rollback.cpp db/repl/rs_sync.cpp db/repl/rs_initialsync.cpp db/oplog.cpp db/repl_block.cpp db/btreecursor.cpp db/cloner.cpp db/namespace.cpp db/cap.cpp db/matcher_covered.cpp db/dbeval.cpp db/restapi.cpp db/dbhelpers.cpp db/instance.cpp db/client.cpp db/database.cpp db/pdfile.cpp db/cursor.cpp db/security_commands.cpp db/security.cpp db/queryoptimizer.cpp db/extsort.cpp db/cmdline.cpp" )
serverOnlyFiles += [ "db/index.cpp" ] + Glob( "db/geo/*.cpp" )
diff --git a/client/dbclient_rs.cpp b/client/dbclient_rs.cpp
index 7280d5a0a60..00e15e825e0 100644
--- a/client/dbclient_rs.cpp
+++ b/client/dbclient_rs.cpp
@@ -360,7 +360,7 @@ namespace mongo {
try {
return checkSlave()->query(ns,query,nToReturn,nToSkip,fieldsToReturn,queryOptions,batchSize);
}
- catch ( DBException & e ){
+ catch ( DBException & ){
LOG(1) << "can't query replica set slave: " << _slaveHost << endl;
}
}
@@ -378,7 +378,7 @@ namespace mongo {
try {
return checkSlave()->findOne(ns,query,fieldsToReturn,queryOptions);
}
- catch ( DBException & e ){
+ catch ( DBException & ){
LOG(1) << "can't query replica set slave: " << _slaveHost << endl;
}
}
diff --git a/db/bufreader.h b/db/bufreader.h
index 8e351f29334..bcc4f4271e3 100644
--- a/db/bufreader.h
+++ b/db/bufreader.h
@@ -60,6 +60,12 @@ namespace mongo {
/** return remaining bytes */
unsigned remaining() const { return (char*)_end -(char*)_pos; }
+ /** back up by nbytes */
+ void rewind(unsigned nbytes) {
+ _pos = ((char *) _pos) - nbytes;
+ assert( _pos >= _start );
+ }
+
/** return current position pointer, and advance by len */
const void* skip(unsigned len) {
char *nxt = ((char *) _pos) + len;
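The new rewind() pairs with the parsing change in dur_recover.cpp further down: the reader peeks at the leading 4-byte length/opcode word to classify a record, then backs up so the whole fixed-size header can be consumed with a single skip(). A minimal sketch of that pattern, assuming a BufReader br positioned at a basic-write record and the JEntry layout from dur_journalformat.h:

    unsigned lenOrOpCode;
    br.read(lenOrOpCode);                          // advances past the first 4 bytes
    br.rewind(4);                                  // back up so the header parse sees them again
    const JEntry *e = (const JEntry *) br.skip(sizeof(JEntry));
    br.skip(e->len);                               // the payload bytes follow the header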
diff --git a/db/db.vcxproj b/db/db.vcxproj
index c4f9e1edc70..7d1baae52d7 100644
--- a/db/db.vcxproj
+++ b/db/db.vcxproj
@@ -198,6 +198,7 @@
<ItemGroup>
<ClCompile Include="..\bson\oid.cpp" />
<ClCompile Include="..\client\dbclientcursor.cpp" />
+ <ClCompile Include="..\client\dbclient_rs.cpp" />
<ClCompile Include="..\client\distlock.cpp" />
<ClCompile Include="..\client\model.cpp" />
<ClCompile Include="..\pcre-7.4\pcrecpp.cc">
@@ -483,6 +484,7 @@
<ClCompile Include="durop.cpp" />
<ClCompile Include="dur_commitjob.cpp" />
<ClCompile Include="dur_journal.cpp" />
+ <ClCompile Include="dur_preplogbuffer.cpp" />
<ClCompile Include="dur_recover.cpp" />
<ClCompile Include="dur_writetodatafiles.cpp" />
<ClCompile Include="geo\2d.cpp" />
diff --git a/db/db.vcxproj.filters b/db/db.vcxproj.filters
index f77d11b4125..fd2f74753f8 100755
--- a/db/db.vcxproj.filters
+++ b/db/db.vcxproj.filters
@@ -451,6 +451,12 @@
<ClCompile Include="dur_writetodatafiles.cpp">
<Filter>_storage engine</Filter>
</ClCompile>
+ <ClCompile Include="dur_preplogbuffer.cpp">
+ <Filter>_storage engine</Filter>
+ </ClCompile>
+ <ClCompile Include="..\client\dbclient_rs.cpp">
+ <Filter>client</Filter>
+ </ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="repl\rs_config.h">
diff --git a/db/dur.cpp b/db/dur.cpp
index d0d128ea910..b57015b7759 100644
--- a/db/dur.cpp
+++ b/db/dur.cpp
@@ -50,6 +50,7 @@
#include "../util/mongoutils/str.h"
#include "../util/timer.h"
#include "dur_stats.h"
+#include "pdfile.h" // Record::HeaderSize
using namespace mongoutils;
@@ -57,7 +58,18 @@ namespace mongo {
namespace dur {
+#if defined(_DEBUG)
+ const bool DebugCheckLastDeclaredWrite = false;
+#else
+ const bool DebugCheckLastDeclaredWrite = false;
+#endif
+
void WRITETODATAFILES();
+ void PREPLOGBUFFER();
+
+ static void groupCommit(); // later in this file
+
+ CommitJob commitJob;
Stats stats;
@@ -65,6 +77,8 @@ namespace mongo {
memset(this, 0, sizeof(*this));
}
+ DurableInterface* DurableInterface::_impl = new NonDurableImpl();
+
void NonDurableImpl::startup() {
if( haveJournalFiles() ) {
log() << "Error: journal files are present in journal directory, yet starting without --dur enabled." << endl;
@@ -74,13 +88,49 @@ namespace mongo {
}
}
-#if defined(_DEBUG)
- const bool DebugCheckLastDeclaredWrite = false;
-#else
- const bool DebugCheckLastDeclaredWrite = false;
-#endif
+ /** base declare write intent function that all the helpers call. */
+ void DurableImpl::declareWriteIntent(void *p, unsigned len) {
+ WriteIntent w;
+ w.p = p;
+ w.len = len;
+ commitJob.note(w);
+ }
- DurableInterface* DurableInterface::_impl = new NonDurableImpl();
+ bool DurableImpl::objAppend(void *dst, const void *src, unsigned len) {
+ {
+ char *srcRecord = (char*) src;
+ srcRecord -= Record::HeaderSize;
+ WriteIntent w;
+ w.p = srcRecord;
+ w.len = len+Record::HeaderSize;
+ if( !commitJob.alreadyNoted(w) ) {
+
+ if( debug ) {
+ WriteIntent w;
+ w.p = (void*)src;
+ w.len = len;
+ if( commitJob.alreadyNoted(w) ) {
+ log() << "dur info it appears an optimization is possible that is not yet done" << endl;
+ }
+ }
+
+ return false;
+ }
+ }
+
+ BasicWriteOp b;
+ b.setObjAppend(dst, (void*)src, len);
+ commitJob.basicWrites().push_back(b);
+ if( testIntent )
+ dst = MongoMMF::switchToPrivateView(dst);
+ memcpy(dst, src, len);
+ char *p = static_cast<char*>(dst);
+ p[-3] = (char) Object; // { ..., o: <copiedobj>, ..., EOO}
+ p[-2] = 'o';
+ p[-1] = 0;
+ p[len] = EOO;
+ return true;
+ }
void enableDurability() { // TODO: merge with startup() ?
assert(typeid(*DurableInterface::_impl) == typeid(NonDurableImpl));
@@ -88,11 +138,6 @@ namespace mongo {
DurableInterface::_impl = new DurableImpl();
}
- // later in this file
- static void groupCommit();
-
- CommitJob commitJob;
-
bool DurableImpl::commitNow() {
groupCommit();
return true;
@@ -124,12 +169,6 @@ namespace mongo {
groupCommit();
}
- /** declare write intent. when already in the write view if testIntent is true. */
- void DurableImpl::declareWriteIntent(void *p, unsigned len) {
- WriteIntent w(p, len);
- commitJob.note(w);
- }
-
void* DurableImpl::writingPtr(void *x, unsigned len) {
void *p = x;
if( testIntent )
@@ -171,15 +210,15 @@ namespace mongo {
++n;
assert(debug && cmdLine.dur);
- vector<WriteIntent>& w = commitJob.writes();
+ vector<BasicWriteOp>& w = commitJob.basicWrites();
if( w.size() == 0 )
return;
- const WriteIntent &i = w[w.size()-1];
+ const BasicWriteOp &i = w[w.size()-1];
size_t ofs;
- MongoMMF *mmf = privateViews.find(i.p, ofs);
+ MongoMMF *mmf = privateViews.find(i.src, ofs);
if( mmf == 0 )
return;
- size_t past = ofs + i.len;
+ size_t past = ofs + i.len();
if( mmf->length() < past + 8 )
return; // too close to end of view
char *priv = (char *) mmf->getView();
@@ -188,15 +227,15 @@ namespace mongo {
unsigned long long *b = (unsigned long long *) (writ+past);
if( *a != *b ) {
for( unsigned z = 0; z < w.size() - 1; z++ ) {
- const WriteIntent& wi = w[z];
- char *r1 = (char*) wi.p;
- char *r2 = r1 + wi.len;
+ const BasicWriteOp& wi = w[z];
+ char *r1 = (char*) wi.src;
+ char *r2 = r1 + wi.len();
if( r1 <= (((char*)a)+8) && r2 > (char*)a ) {
//log() << "it's ok " << wi.p << ' ' << wi.len << endl;
return;
}
}
- log() << "dur data after write area " << i.p << " does not agree" << endl;
+ log() << "dur data after write area " << i.src << " does not agree" << endl;
log() << " was: " << ((void*)b) << " " << hexdump((char*)b, 8) << endl;
log() << " now: " << ((void*)a) << " " << hexdump((char*)a, 8) << endl;
log() << " n: " << n << endl;
@@ -205,91 +244,6 @@ namespace mongo {
}
#endif
- RelativePath local = RelativePath::fromRelativePath("local");
-
- /** we will build an output buffer ourself and then use O_DIRECT
- we could be in read lock for this
- caller handles locking
- */
- static void PREPLOGBUFFER() {
- assert( cmdLine.dur );
- AlignedBuilder& bb = commitJob._ab;
- bb.reset();
-
- unsigned lenOfs; // we will need to backfill the length when prep is wrapping up
- // JSectHeader
- {
- bb.appendStr("\nHH\n", false);
- lenOfs = bb.skip(4);
- }
-
- // ops other than basic writes (DurOp's)
- {
- for( vector< shared_ptr<DurOp> >::iterator i = commitJob.ops().begin(); i != commitJob.ops().end(); ++i ) {
- (*i)->serialize(bb);
- }
- }
-
- // write intents. note there is no particular order to these : if we have
- // two writes to the same location during the group commit interval, it is likely
- // (although not assured) that it is journalled here once.
- {
- scoped_lock lk(privateViews._mutex());
- RelativePath lastFilePath;
- for( vector<WriteIntent>::iterator i = commitJob.writes().begin(); i != commitJob.writes().end(); i++ ) {
- size_t ofs;
- MongoMMF *mmf = privateViews._find(i->p, ofs);
- if( mmf == 0 ) {
- string s = str::stream() << "view pointer cannot be resolved " << (size_t) i->p;
- journalingFailure(s.c_str()); // asserts
- return;
- }
-
- if( !mmf->willNeedRemap() ) {
- // tag this mmf as needed a remap of its private view later.
- // usually it will already be dirty/already set, so we do the if above first
- // to avoid possibility of cpu cache line contention
- mmf->willNeedRemap() = true;
- }
- i->w_ptr = ((char*)mmf->view_write()) + ofs;
- JEntry e;
- e.len = i->len;
- assert( ofs <= 0x80000000 );
- e.ofs = (unsigned) ofs;
- e.setFileNo( mmf->fileSuffixNo() );
- if( mmf->relativePath() == local ) {
- e.setLocalDbContextBit();
- }
- else if( mmf->relativePath() != lastFilePath ) {
- lastFilePath = mmf->relativePath();
- //assert( !str::startsWith(lastFilePath, dbpath) ); // dbpath should be stripped this is a relative path
- JDbContext c;
- bb.appendStruct(c);
- bb.appendStr(lastFilePath.toString());
- }
- bb.appendStruct(e);
- bb.appendBuf(i->p, i->len);
- }
- }
-
- {
- JSectFooter f(bb.buf(), bb.len());
- bb.appendStruct(f);
- }
-
- {
- assert( 0xffffe000 == (~(Alignment-1)) );
- unsigned L = (bb.len() + Alignment-1) & (~(Alignment-1)); // fill to alignment
- dassert( L >= (unsigned) bb.len() );
- *((unsigned*)bb.atOfs(lenOfs)) = L;
- unsigned padding = L - bb.len();
- bb.skip(padding);
- dassert( bb.len() % Alignment == 0 );
- }
-
- return;
- }
-
/** write the buffer we have built to the journal and fsync it.
outside of lock as that could be slow.
*/
@@ -326,9 +280,23 @@ namespace mongo {
unsigned low = 0xffffffff;
unsigned high = 0;
+ log() << "DurParanoid mismatch in " << mmf->filename() << endl;
+ int logged = 0;
+ unsigned lastMismatch = 0xffffffff;
for( unsigned i = 0; i < mmf->length(); i++ ) {
if( p[i] != w[i] ) {
- log() << i << '\t' << (int) p[i] << '\t' << (int) w[i] << endl;
+ if( lastMismatch != 0xffffffff && lastMismatch+1 != i )
+ log() << endl; // separate blocks of mismatches
+ lastMismatch= i;
+ if( ++logged < 60 ) {
+ stringstream ss;
+ ss << "mismatch ofs:" << hex << i << "\tfilemap:" << setw(2) << (unsigned) w[i] << "\tprivmap:" << setw(2) << (unsigned) p[i];
+ if( p[i] > 32 && p[i] <= 126 )
+ ss << '\t' << p[i];
+ log() << ss.str() << endl;
+ }
+ if( logged == 60 )
+ log() << "..." << endl;
if( i < low ) low = i;
if( i > high ) high = i;
}
@@ -337,9 +305,9 @@ namespace mongo {
std::stringstream ss;
ss << "dur error warning views mismatch " << mmf->filename() << ' ' << (hex) << low << ".." << high << " len:" << high-low+1;
log() << ss.str() << endl;
- log() << "priv loc: " << (void*)(p+low) << endl;
- vector<WriteIntent>& w = commitJob.writes();
- (void)w; // mark as unused. Useful for inspection in debugger
+ log() << "priv loc: " << (void*)(p+low) << ' ' << stats.curr._objCopies << endl;
+ vector<BasicWriteOp>& b = commitJob.basicWrites();
+ (void)b; // mark as unused. Useful for inspection in debugger
massert(13599, "Written data does not match in-memory view. Missing WriteIntent?", false);
}
@@ -412,6 +380,9 @@ namespace mongo {
stats.curr._commits++;
dbMutex.assertAtLeastReadLocked();
+ if( dbMutex.isWriteLocked() ) {
+ stats.curr._commitsInWriteLock++;
+ }
if( !commitJob.hasWritten() )
return;
@@ -514,6 +485,7 @@ namespace mongo {
void unlinkThread();
void recover();
+
void releasingWriteLock() {
try {
#if defined(_DEBUG)
@@ -548,7 +520,5 @@ namespace mongo {
}
} // namespace dur
-
-
+
} // namespace mongo
-
diff --git a/db/dur.h b/db/dur.h
index 2f4ca6ca2b2..c27e8614637 100644
--- a/db/dur.h
+++ b/db/dur.h
@@ -28,6 +28,21 @@ namespace mongo {
/** Declare a database is about to be dropped */
virtual void droppingDb(string db) = 0;
+ /** this function appends a nested bsonobj, with the fieldname "o", to a partially built
+ bson object. it then terminates the outer bson object with EOO. The caller is responsible
+ for setting the outer bson object's total size correctly.
+
+ It also journals the operation. An optimization occurs in the journaling:
+ if the src was just journaled, a new copy is not written; instead a small
+ JObjAppend record is added to the journal.
+
+ If the src was not recently journaled, that is ok: in that case the function returns false
+ and you must fall back to the traditional write path.
+
+ This is used to make writing to the replication oplog efficient.
+ */
+ virtual bool objAppend(void *dst, const void *src, unsigned len) = 0;
+
/** Declarations of write intent.
Use these methods to declare "i'm about to write to x and it should be logged for redo."
@@ -143,6 +158,7 @@ namespace mongo {
void declareWriteIntent(void *, unsigned) { }
void createdFile(string filename, unsigned long long len) { }
void droppingDb(string db) { }
+ bool objAppend(void *dst, const void *src, unsigned len) { return false; }
bool awaitCommit() { return false; }
bool commitNow() { return false; }
#if defined(_DEBUG)
@@ -157,6 +173,7 @@ namespace mongo {
void declareWriteIntent(void *, unsigned);
void createdFile(string filename, unsigned long long len);
void droppingDb(string db);
+ bool objAppend(void *dst, const void *src, unsigned len);
bool awaitCommit();
bool commitNow();
#if defined(_DEBUG)
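From the caller's side, the objAppend() contract is: try the fast path, and if it returns false fall back to the ordinary writingPtr() path and build the { o: ... } framing yourself. A condensed sketch of the pattern the oplog change below uses (objInsertPoint is where the nested object's bytes land inside the new record; the 32-byte threshold comes from that change; pointer arithmetic is simplified here):

    bool fast = obj.objsize() >= 32 &&               // tiny objects aren't worth the extra journal entry
                getDur().objAppend(objInsertPoint, obj.objdata(), obj.objsize());
    if( !fast ) {
        // traditional path: declare the region and write the framing bytes by hand
        char *p = (char *) getDur().writingPtr(objInsertPoint - 3, obj.objsize() + 4);
        *p++ = (char) Object;        // field type byte
        *p++ = 'o';                  // field name
        *p++ = 0;                    //   ...null terminated
        memcpy(p, obj.objdata(), obj.objsize());
        p[obj.objsize()] = EOO;
    }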
diff --git a/db/dur_commitjob.cpp b/db/dur_commitjob.cpp
index 049ee41c3fa..ba74c66272d 100644
--- a/db/dur_commitjob.cpp
+++ b/db/dur_commitjob.cpp
@@ -24,7 +24,7 @@ namespace mongo {
void Writes::clear() {
_alreadyNoted.clear();
- _writes.clear();
+ _basicWrites.clear();
_ops.clear();
}
diff --git a/db/dur_commitjob.h b/db/dur_commitjob.h
index e265f7121b4..ceca4a0da07 100644
--- a/db/dur_commitjob.h
+++ b/db/dur_commitjob.h
@@ -29,13 +29,31 @@
namespace mongo {
namespace dur {
- /** declaration of an intent to write to a region of a memory mapped view */
- struct WriteIntent /* copyable */ {
- WriteIntent() : w_ptr(0), p(0) { }
- WriteIntent(void *a, unsigned b) : w_ptr(0), p(a), len(b) { }
- void *w_ptr; // p is mapped from private to equivalent location in the writable mmap
- void *p; // intent to write at p
- unsigned len; // up to this len
+ /** "I intend to write at p for up to len bytes" */
+ struct WriteIntent {
+ void *p;
+ unsigned len;
+ };
+
+ /** declaration of an intent to write to a region of a memory mapped view
+ this could be either a JEntry or a JObjAppend in the journal
+ */
+ struct BasicWriteOp /* copyable */ {
+ void *dst;
+ void *src;
+ unsigned len() const { return _len & 0x7fffffff; }
+ bool isObjAppend() const { return _len & 0x80000000; }
+ void set(void *Src, unsigned Len) {
+ dst = 0;
+ src = Src;
+ _len = Len;
+ }
+ void setObjAppend(void *Dst, void *Src, unsigned Len) {
+ dst = Dst; src = Src;
+ _len = Len | 0x80000000;
+ }
+ private:
+ unsigned _len;
};
/** try to remember things we have already marked for journalling. false negatives are ok if infrequent -
@@ -64,6 +82,15 @@ namespace mongo {
nd = w;
return false; // a new set
}
+
+ /**
+ @return true if already indicated.
+ */
+ bool check(const WriteIntent& w) {
+ unsigned x = mongoutils::hashPointer(w.p);
+ WriteIntent& nd = nodes[x % N];
+ return nd.p == w.p && nd.len >= w.len;
+ }
private:
enum { N = Prime }; // this should be small the idea is that it fits in the cpu cache easily
WriteIntent nodes[N];
@@ -73,8 +100,9 @@ namespace mongo {
class Writes : boost::noncopyable {
public:
Already<127> _alreadyNoted;
- vector<WriteIntent> _writes;
+ vector<BasicWriteOp> _basicWrites;
vector< shared_ptr<DurOp> > _ops; // all the ops other than basic writes
+
/** reset the Writes structure (empties all the above) */
void clear();
};
@@ -97,7 +125,10 @@ namespace mongo {
/** note an operation other than a "basic write" */
void noteOp(shared_ptr<DurOp> p);
- vector<WriteIntent>& writes() { return _wi._writes; }
+ /** @return true if was already noted. false negatives possible (to be fast). */
+ bool alreadyNoted(WriteIntent& w) { return _wi._alreadyNoted.check(w); }
+
+ vector<BasicWriteOp>& basicWrites() { return _wi._basicWrites; }
vector< shared_ptr<DurOp> >& ops() { return _wi._ops; }
/** this method is safe to call outside of locks. when haswritten is false we don't do any group commit and avoid even
@@ -117,13 +148,14 @@ namespace mongo {
_notify.wait();
}
+ /** we check how much has been written, and if it is getting to be a lot, we commit sooner. */
size_t bytes() const { return _bytes; }
private:
bool _hasWritten;
Writes _wi;
- NotifyAll _notify;
size_t _bytes;
+ NotifyAll _notify; // for getlasterror fsync:true acknowledgements
};
extern CommitJob commitJob;
@@ -142,7 +174,10 @@ namespace mongo {
dassert( cmdLine.dur );
if( !_wi._alreadyNoted.checkAndSet(w) ) {
if( !_hasWritten ) {
+ // you can't be writing if one of these is pending, so this is a verification.
assert( !dbMutex._remapPrivateViewRequested );
+
+ // we don't bother doing a group commit when nothing is written, so we have a var to track that
_hasWritten = true;
}
@@ -164,6 +199,10 @@ namespace mongo {
else {
log() << "DEBUG note write intent " << w.p << ' ' << w.len << " NOT FOUND IN privateViews" << endl;
}
+ /*if( w.len > 48 ) {
+ log() << "big TEMP" << endl;
+ log() << hexdump((char*) w.p, 48) << endl;
+ }*/
}
else if( n == 10000 ) {
log() << "DEBUG stopping write intent logging, too much to log" << endl;
@@ -172,10 +211,11 @@ namespace mongo {
#endif
// remember intent. we will journal it in a bit
- _wi._writes.push_back(w);
- _bytes += w.len;
- wassert( _wi._writes.size() < 2000000 );
- //assert( _wi._writes.size() < 20000000 );
+ BasicWriteOp b;
+ b.set(w.p, w.len);
+ _wi._basicWrites.push_back(b);
+ wassert( _wi._basicWrites.size() < 2000000 );
+ assert( _wi._basicWrites.size() < 20000000 );
}
}
}
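BasicWriteOp packs the objAppend flag into the high bit of _len so a single vector can carry both kinds of operation: set() stores the plain length, setObjAppend() ORs in 0x80000000, len() masks the flag back off, and isObjAppend() tests the bit. A worked illustration of the encoding (dst and src stand for any pair of mapped addresses):

    BasicWriteOp b;
    b.setObjAppend(dst, src, 100);   // _len = 100 | 0x80000000 = 0x80000064
    b.len();                         // 100   (0x80000064 & 0x7fffffff)
    b.isObjAppend();                 // true  (high bit set)

The encoding caps any single write intent at 2^31 - 1 bytes.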
diff --git a/db/dur_journal.cpp b/db/dur_journal.cpp
index f8ac1ff41d9..d7af952b0a4 100644
--- a/db/dur_journal.cpp
+++ b/db/dur_journal.cpp
@@ -32,8 +32,9 @@
#include "../util/mongoutils/str.h"
#include "../util/concurrency/mvar.h"
+using namespace mongoutils;
+
namespace mongo {
- using namespace mongoutils;
class AlignedBuilder;
diff --git a/db/dur_journalformat.h b/db/dur_journalformat.h
index 6f6d6d2f5d3..7c8473e830f 100644
--- a/db/dur_journalformat.h
+++ b/db/dur_journalformat.h
@@ -70,27 +70,58 @@ namespace mongo {
OpCode_DbContext = 0xfffffffe,
OpCode_FileCreated = 0xfffffffd,
OpCode_DropDb = 0xfffffffc,
+ OpCode_ObjAppend = 0xfffffffb,
OpCode_Min = 0xfffff000
};
-
union {
- unsigned len;
+ unsigned len; // length in bytes of the data of the JEntry. does not include the JEntry header
OpCodes opcode;
};
+
unsigned ofs; // offset in file
- enum {
+ // sentinel and masks for _fileNo
+ enum {
DotNsSuffix = 0x7fffffff, // ".ns" file
LocalDbBit = 0x80000000 // assuming "local" db instead of using the JDbContext
};
int _fileNo; // high bit is set to indicate it should be the <dbpath>/local database
// char data[] follows
+ const char * srcData() const {
+ const int *i = &_fileNo;
+ return (const char *) (i+1);
+ }
+
int getFileNo() const { return _fileNo & (~LocalDbBit); }
void setFileNo(int f) { _fileNo = f; }
+ bool isNsSuffix() const { return getFileNo() == DotNsSuffix; }
+
void setLocalDbContextBit() { _fileNo |= LocalDbBit; }
bool isLocalDbContext() const { return _fileNo & LocalDbBit; }
void clearLocalDbContextBit() { _fileNo = getFileNo(); }
+
+ static string suffix(int fileno) {
+ if( fileno == DotNsSuffix ) return "ns";
+ stringstream ss;
+ ss << fileno;
+ return ss.str();
+ }
+ };
+
+ /** append an object to a partial BSON buffer under construction.
+ adds
+ o : <data>, EOO
+ len is the length of the data
+ */
+ struct JObjAppend {
+ JObjAppend() : opcode(JEntry::OpCode_ObjAppend) { }
+ unsigned opcode;
+ int dstFileNo; // destination in the local database.
+ unsigned dstOfs;
+ int srcFileNo; // source in the database of current context (JDbContext)
+ unsigned srcOfs;
+ unsigned len;
};
/** group commit section footer. md5 is a key field. */
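The size difference between the two record kinds is the whole point of OpCode_ObjAppend. A basic write is a 12-byte JEntry header followed by the data itself, while a JObjAppend is six 4-byte fields with no payload at all (field sizes read off the structs above; no padding assumed):

    JEntry record:      4 (len) + 4 (ofs) + 4 (_fileNo) + len payload bytes   = 12 + len
    JObjAppend record:  4 (opcode) + 4 (dstFileNo) + 4 (dstOfs)
                        + 4 (srcFileNo) + 4 (srcOfs) + 4 (len)                = 24, payload not repeated

So journaling the oplog copy of, say, a 1KB document costs 24 bytes instead of roughly 1036.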
diff --git a/db/dur_preplogbuffer.cpp b/db/dur_preplogbuffer.cpp
new file mode 100644
index 00000000000..dac7d62cf8f
--- /dev/null
+++ b/db/dur_preplogbuffer.cpp
@@ -0,0 +1,175 @@
+// @file dur_preplogbuffer.cpp
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ PREPLOGBUFFER
+ we will build an output buffer ourself and then use O_DIRECT
+ we could be in read lock for this
+ for very large objects write directly to redo log in situ?
+ @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc
+*/
+
+#include "pch.h"
+#include "cmdline.h"
+#include "dur.h"
+#include "dur_journal.h"
+#include "dur_commitjob.h"
+#include "../util/mongoutils/hash.h"
+#include "../util/mongoutils/str.h"
+#include "dur_stats.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+ namespace dur {
+
+ RelativePath local = RelativePath::fromRelativePath("local");
+
+ MongoMMF* findMMF(void *ptr, size_t &ofs) {
+ MongoMMF *f = privateViews._find(ptr, ofs);
+ if( f == 0 ) {
+ string s = str::stream() << "view pointer cannot be resolved " << (size_t) ptr;
+ journalingFailure(s.c_str()); // asserts
+ }
+ return f;
+ }
+
+ /** we will build an output buffer ourself and then use O_DIRECT
+ we could be in read lock for this
+ caller handles locking
+ */
+ void PREPLOGBUFFER() {
+ assert( cmdLine.dur );
+ AlignedBuilder& bb = commitJob._ab;
+ bb.reset();
+
+ unsigned lenOfs; // we will need to backfill the length when prep is wrapping up
+
+ // JSectHeader
+ {
+ bb.appendStr("\nHH\n", false);
+ lenOfs = bb.skip(4);
+ }
+
+ // ops other than basic writes (DurOp's)
+ {
+ for( vector< shared_ptr<DurOp> >::iterator i = commitJob.ops().begin(); i != commitJob.ops().end(); ++i ) {
+ (*i)->serialize(bb);
+ }
+ }
+
+ // basic write ops / write intents. note there is no particular order to these : if we have
+ // two writes to the same location during the group commit interval, it is likely
+ // (although not assured) that it is journaled here once.
+ //
+ // "objAppend" operations are herein too
+ //
+ {
+ scoped_lock lk(privateViews._mutex());
+
+ // each time events switch to a different database we journal a JDbContext
+ RelativePath lastDbPath;
+
+ for( vector<BasicWriteOp>::iterator i = commitJob.basicWrites().begin(); i != commitJob.basicWrites().end(); i++ ) {
+ size_t ofs = 1;
+ MongoMMF *mmf = findMMF(i->src, /*out*/ofs);
+
+ if( i->dst ) {
+ // objAppend operation
+
+ if( mmf->relativePath() != lastDbPath ) {
+ lastDbPath = mmf->relativePath();
+ dassert( lastDbPath != local ); // objAppend is not used for the local database
+ JDbContext c;
+ bb.appendStruct(c);
+ bb.appendStr(lastDbPath.toString());
+ }
+
+ size_t dstofs;
+ MongoMMF *dstmmf = findMMF(i->dst, dstofs);
+ if( !dstmmf->willNeedRemap() ) {
+ dstmmf->willNeedRemap() = true;
+ }
+
+ // since we have already looked up the mmf, we go ahead and remember the write view location
+ // so we don't have to find the MongoMMF again later in WRITETODATAFILES()
+ i->dst = ((char*)dstmmf->view_write()) + dstofs;
+
+ JObjAppend d;
+ d.dstFileNo = dstmmf->fileSuffixNo();
+ d.dstOfs = (unsigned) dstofs;
+ d.srcFileNo = mmf->fileSuffixNo();
+ d.srcOfs = ofs;
+ d.len = i->len();
+ bb.appendStruct(d);
+ ++stats.curr._objCopies;
+ }
+ else {
+ if( !mmf->willNeedRemap() ) {
+ // tag this mmf as needing a remap of its private view later.
+ // usually it will already be dirty/already set, so we do the if above first
+ // to avoid possibility of cpu cache line contention
+ mmf->willNeedRemap() = true;
+ }
+
+ // since we have already looked up the mmf, we go ahead and remember the write view location
+ // so we don't have to find the MongoMMF again later in WRITETODATAFILES()
+ dassert( i->dst == 0 );
+ i->dst = ((char*)mmf->view_write()) + ofs;
+
+ JEntry e;
+ e.len = i->len();
+ assert( ofs <= 0x80000000 );
+ e.ofs = (unsigned) ofs;
+ e.setFileNo( mmf->fileSuffixNo() );
+ if( mmf->relativePath() == local ) {
+ e.setLocalDbContextBit();
+ }
+ else if( mmf->relativePath() != lastDbPath ) {
+ lastDbPath = mmf->relativePath();
+ JDbContext c;
+ bb.appendStruct(c);
+ bb.appendStr(lastDbPath.toString());
+ }
+ bb.appendStruct(e);
+ bb.appendBuf(i->src, i->len());
+ }
+ }
+ }
+
+ {
+ JSectFooter f(bb.buf(), bb.len());
+ bb.appendStruct(f);
+ }
+
+ {
+ // pad to alignment, and set the total section length in the JSectHeader
+ assert( 0xffffe000 == (~(Alignment-1)) );
+ unsigned L = (bb.len() + Alignment-1) & (~(Alignment-1));
+ dassert( L >= (unsigned) bb.len() );
+ *((unsigned*)bb.atOfs(lenOfs)) = L;
+ unsigned padding = L - bb.len();
+ bb.skip(padding);
+ dassert( bb.len() % Alignment == 0 );
+ }
+
+ return;
+ }
+
+ }
+}
\ No newline at end of file
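The padding block at the end of PREPLOGBUFFER is easier to read with numbers plugged in. The assert pins Alignment at 0x2000 (8192 bytes), since ~(0x2000 - 1) == 0xffffe000. For a section whose raw length is, say, 5000 bytes:

    L       = (5000 + 8191) & ~8191  = 8192    // round up to the next 8KB boundary
    padding = 8192 - 5000            = 3192    // bytes reserved via bb.skip()

and it is the padded length, 8192, not the raw 5000, that gets backfilled into the JSectHeader length slot at lenOfs.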
diff --git a/db/dur_recover.cpp b/db/dur_recover.cpp
index 586fa6f7a39..9d55974a2d1 100644
--- a/db/dur_recover.cpp
+++ b/db/dur_recover.cpp
@@ -37,23 +37,19 @@ namespace mongo {
namespace dur {
- /** todo clean up : this is messy from refactoring. */
- struct FullyQualifiedJournalEntry {
- bool isBasicWrite() const { return dbName != 0; }
+ struct ParsedJournalEntry /*copyable*/ {
+ ParsedJournalEntry() : e(0), d(0) { }
- // relative path of database for the operation.
- // might be a pointer into mmaped Journal file
- const char *dbName;
+ // relative path of database for the operation.
+ // might be a pointer into mmaped Journal file
+ const char *dbName;
+ // these are pointers into the memory mapped journal file
+ const JEntry *e; // local db sentinel is already parsed out here into dbName
+ const JObjAppend *d;
- // local db sentinel is already parsed out here into dbName
- JEntry e;
-
- // pointer into mmaped Journal file
- const char *srcData;
-
- // if not a simple JEntry, the operation
- shared_ptr<DurOp> op;
+ // if not one of the two simple entry types above, this is the operation:
+ shared_ptr<DurOp> op;
};
void removeJournalFiles();
@@ -105,7 +101,9 @@ namespace mongo {
* @return true if got an entry. false at successful end of section (and no entry returned).
* throws on premature end of section.
*/
- bool next(FullyQualifiedJournalEntry& e) {
+ bool next(ParsedJournalEntry& e) {
+ assert( e.e == 0 );
+
if( !_sectHead ) {
_sectHead = static_cast<const JSectHeader*>(_br.pos());
_br.skip(sizeof(JSectHeader));
@@ -113,36 +111,36 @@ namespace mongo {
unsigned lenOrOpCode;
_br.read(lenOrOpCode);
- if( lenOrOpCode >= JEntry::OpCode_Min ) {
- // not a "basic write"
- if( lenOrOpCode == JEntry::OpCode_Footer ) {
+ switch( lenOrOpCode ) {
+
+ case JEntry::OpCode_Footer:
+ {
const char* pos = (const char*) _br.pos();
pos -= sizeof(lenOrOpCode); // rewind to include OpCode
const JSectFooter& footer = *(const JSectFooter*)pos;
-
int len = pos - (char*)_sectHead;
if (!footer.checkHash(_sectHead, len)){
massert(13594, str::stream() << "Journal checksum doesn't match. recorded: "
- << toHex(footer.hash, sizeof(footer.hash))
- << " actual: " << md5simpledigest(_sectHead, len)
- , false);
+ << toHex(footer.hash, sizeof(footer.hash))
+ << " actual: " << md5simpledigest(_sectHead, len)
+ , false);
}
-
_br.skip(sizeof(JSectFooter) - 4);
_br.align(Alignment);
-
_sectHead = NULL;
- return false;
+ return false; // false return value denotes end of section
}
- if( lenOrOpCode != JEntry::OpCode_DbContext ) {
+ case JEntry::OpCode_FileCreated:
+ case JEntry::OpCode_DropDb:
+ {
e.dbName = 0;
e.op = DurOp::read(lenOrOpCode, _br);
return true;
}
- // JDbContext
+ case JEntry::OpCode_DbContext:
{
_lastDbName = (const char*) _br.pos();
const unsigned limit = std::min((unsigned)Namespace::MaxNsLen, _br.remaining());
@@ -151,19 +149,28 @@ namespace mongo {
_br.skip(len+1); // skip '\0' too
_br.read(lenOrOpCode);
}
+ // fall through as a basic operation always follows jdbcontext, and we don't have anything to return yet
+
+ case JEntry::OpCode_ObjAppend:
+ default:
+ // fall through
+ ;
}
- // JEntry - a basic write
- assert( lenOrOpCode && lenOrOpCode < JEntry::OpCode_Min );
- e.dbName = _lastDbName;
- e.e.len = lenOrOpCode;
- _br.read(e.e.ofs);
- _br.read(e.e._fileNo);
- if( e.e.isLocalDbContext() ) {
- e.dbName = "local";
- e.e.clearLocalDbContextBit();
+ {
+ assert( lenOrOpCode && lenOrOpCode <= JEntry::OpCode_ObjAppend );
+ _br.rewind(4);
+ if( lenOrOpCode == JEntry::OpCode_ObjAppend ) {
+ e.d = (JObjAppend *) _br.skip(sizeof(JObjAppend));
+ e.dbName = _lastDbName;
+ }
+ else {
+ e.e = (JEntry *) _br.skip(sizeof(JEntry));
+ e.dbName = e.e->isLocalDbContext() ? "local" : _lastDbName;
+ dassert( e.e->len == lenOrOpCode );
+ _br.skip(e.e->len);
+ }
}
- e.srcData = (const char *) _br.skip(lenOrOpCode);
return true;
}
private:
@@ -179,7 +186,8 @@ namespace mongo {
void go(vector<path>& files);
~RecoveryJob();
private:
- void applyEntries(const vector<FullyQualifiedJournalEntry> &entries);
+ void applyEntry(const ParsedJournalEntry& entry, bool apply, bool dump);
+ void applyEntries(const vector<ParsedJournalEntry> &entries);
bool processBuffer(void *, unsigned len);
bool processFile(path journalfile);
void close();
@@ -255,67 +263,98 @@ namespace mongo {
_fileToPtr.clear();
}
- void RecoveryJob::applyEntries(const vector<FullyQualifiedJournalEntry> &entries) {
+ void RecoveryJob::applyEntry(const ParsedJournalEntry& entry, bool apply, bool dump) {
+ if( entry.e ) {
+ if( dump ) {
+ stringstream ss;
+ ss << " BASICWRITE " << setw(20) << entry.dbName << '.';
+ if( entry.e->isNsSuffix() )
+ ss << "ns";
+ else
+ ss << setw(2) << entry.e->getFileNo();
+ ss << ' ' << setw(6) << entry.e->len << ' ' << /*hex << setw(8) << (size_t) fqe.srcData << dec <<*/
+ " " << hexdump(entry.e->srcData(), entry.e->len);
+ log() << ss.str() << endl;
+ }
+ if( apply ) {
+ void *p = ptr(entry.dbName, entry.e->getFileNo(), entry.e->ofs);
+ memcpy(p, entry.e->srcData(), entry.e->len);
+ }
+ }
+ else if( entry.d ) {
+ // OpCode_ObjAppend (struct JObjAppend)
+ if( dump ) {
+ stringstream ss;
+ ss << " JObjAppend dst: local." << JEntry::suffix(entry.d->dstFileNo) << " ofs:" << entry.d->dstOfs;
+ ss << " src:" << entry.dbName << '.' << JEntry::suffix(entry.d->srcFileNo) << " ofs:" << entry.d->srcOfs;
+ ss << " len:" << entry.d->len;
+ log() << ss.str() << endl;
+ }
+ if( apply ) {
+ void *dst = ptr("local", entry.d->dstFileNo, entry.d->dstOfs);
+ void *src = ptr(entry.dbName, entry.d->srcFileNo, entry.d->srcOfs);
+ memcpy(dst, src, entry.d->len);
+ char *p = (char *) dst;
+ p[-3] = (char) Object; // { ..., o: <copiedobj>, ..., EOO}
+ p[-2] = 'o';
+ p[-1] = 0;
+ p[entry.d->len] = EOO;
+ }
+ }
+ else {
+ // a DurOp subclass operation
+ if( dump ) {
+ log() << " OP " << entry.op->toString() << endl;
+ }
+ if( apply ) {
+ if( entry.op->needFilesClosed() ) {
+ close();
+ }
+ entry.op->replay();
+ }
+ }
+ }
+
+ void RecoveryJob::applyEntries(const vector<ParsedJournalEntry> &entries) {
bool apply = (cmdLine.durOptions & CmdLine::DurScanOnly) == 0;
bool dump = cmdLine.durOptions & CmdLine::DurDumpJournal;
if( dump )
log() << "BEGIN section" << endl;
- for( vector<FullyQualifiedJournalEntry>::const_iterator i = entries.begin(); i != entries.end(); ++i ) {
- const FullyQualifiedJournalEntry& fqe = *i;
- if( fqe.isBasicWrite() ) {
- if( dump ) {
- stringstream ss;
- ss << " BASICWRITE " << setw(20) << fqe.dbName << '.';
- if( fqe.e._fileNo == JEntry::DotNsSuffix )
- ss << "ns";
- else
- ss << setw(2) << fqe.e._fileNo;
- ss << ' ' << setw(6) << fqe.e.len << ' ' << hex << setw(8) << (size_t) fqe.srcData << dec << " " << hexdump(fqe.srcData, fqe.e.len);
- log() << ss.str() << endl;
- }
- if( apply ) {
- void *p = ptr(fqe.dbName, fqe.e._fileNo, fqe.e.ofs);
- memcpy(p, fqe.srcData, fqe.e.len);
- }
- } else {
- if( dump ) {
- log() << " OP " << fqe.op->toString() << endl;
- }
- if( apply ) {
- if( fqe.op->needFilesClosed() ) {
- close();
- }
- fqe.op->replay();
- }
- }
+ for( vector<ParsedJournalEntry>::const_iterator i = entries.begin(); i != entries.end(); ++i ) {
+ applyEntry(*i, apply, dump);
}
if( dump )
log() << "END section" << endl;
}
- /** @param p start of the memory mapped file
+ /** apply a specific journal file, that is already mmap'd
+ @param p start of the memory mapped file
@return true if this is detected to be the last file (ends abruptly)
*/
bool RecoveryJob::processBuffer(void *p, unsigned len) {
JournalIterator i(p, len);
- vector<FullyQualifiedJournalEntry> entries;
+ vector<ParsedJournalEntry> entries;
try {
while( 1 ) {
entries.clear();
- FullyQualifiedJournalEntry e;
- while( i.next(e) )
+ while( 1 ) {
+ ParsedJournalEntry e;
+ if( !i.next(e) )
+ break;
entries.push_back(e);
+ }
// got all the entries for one group commit. apply them:
applyEntries(entries);
- // now do the next section (i.e. group commit)
if( i.atEof() )
break;
+
+ // loop back and do the new group commit section
}
}
catch( BufReader::eof& ) {
@@ -327,6 +366,7 @@ namespace mongo {
return false; // non-abrupt end
}
+ /** apply a specific journal file */
bool RecoveryJob::processFile(path journalfile) {
log() << "recover " << journalfile.string() << endl;
MemoryMappedFile f;
@@ -335,6 +375,7 @@ namespace mongo {
return processBuffer(p, (unsigned) f.length());
}
+ /** @param files all the j._0 style files we need to apply for recovery */
void RecoveryJob::go(vector<path>& files) {
log() << "recover begin" << endl;
@@ -392,7 +433,7 @@ namespace mongo {
BufReader r((void*) "abcdabcdabcd", 12);
char x;
BufReaderY y;
- r.read(x); cout << x; // a
+ r.read(x); //cout << x; // a
assert( x == 'a' );
r.read(y);
r.read(x);
diff --git a/db/dur_stats.h b/db/dur_stats.h
index f9fe3f7eb7b..cd76efb01a9 100644
--- a/db/dur_stats.h
+++ b/db/dur_stats.h
@@ -10,9 +10,16 @@ namespace mongo {
Stats();
struct S {
unsigned _commits;
- unsigned _dittos;
+ unsigned _objCopies;
unsigned long long _journaledBytes;
unsigned long long _writeToDataFilesBytes;
+
+ // it is undesirable to be in write lock for the group commit (it can be done in a read lock), so it is
+ // good to have visibility when this happens. it can happen for a couple of reasons:
+ // - read lock starvation
+ // - file being closed
+ // - data being written faster than the normal group commit interval
+ unsigned _commitsInWriteLock;
} curr;
};
extern Stats stats;
diff --git a/db/dur_writetodatafiles.cpp b/db/dur_writetodatafiles.cpp
index 21b995c5732..fad947c6470 100644
--- a/db/dur_writetodatafiles.cpp
+++ b/db/dur_writetodatafiles.cpp
@@ -46,11 +46,18 @@ namespace mongo {
*/
void WRITETODATAFILES() {
/* we go backwards as what is at the end is most likely in the cpu cache. it won't be much, but we'll take it. */
- for( int i = commitJob.writes().size() - 1; i >= 0; i-- ) {
- const WriteIntent& intent = commitJob.writes()[i];
- char *dst = (char *) (intent.w_ptr);
- memcpy(dst, intent.p, intent.len);
- stats.curr._writeToDataFilesBytes += intent.len;
+ for( int i = commitJob.basicWrites().size() - 1; i >= 0; i-- ) {
+ const BasicWriteOp& b = commitJob.basicWrites()[i];
+ stats.curr._writeToDataFilesBytes += b.len();
+ dassert(b.dst);
+ memcpy(b.dst, b.src, b.len());
+ if( b.isObjAppend() ) {
+ char *p = static_cast<char*>(b.dst);
+ p[-3] = (char) Object; // { ..., o: <copiedobj>, ..., EOO}
+ p[-2] = 'o';
+ p[-1] = 0;
+ p[b.len()] = EOO;
+ }
}
debugValidateMapsMatch();
diff --git a/db/oplog.cpp b/db/oplog.cpp
index 59dd8f01ade..b7dc8e5949d 100644
--- a/db/oplog.cpp
+++ b/db/oplog.cpp
@@ -217,9 +217,10 @@ namespace mongo {
b.appendBool("b", *bb);
if ( o2 )
b.append("o2", *o2);
- BSONObj partial = b.done();
- int posz = partial.objsize();
- int len = posz + obj.objsize() + 1 + 2 /*o:*/;
+ BSONObj partial = b.done(); // partial is everything except the o:... part.
+
+ int po_sz = partial.objsize();
+ int len = po_sz + obj.objsize() + 1 + 2 /*o:*/;
Record *r;
if( logNS == 0 ) {
@@ -236,21 +237,37 @@ namespace mongo {
} else {
Client::Context ctx( logNS, dbpath, 0, false );
assert( nsdetails( logNS ) );
+ // first we allocate the space, then we fill it below.
r = theDataFileMgr.fast_oplog_insert( nsdetails( logNS ), logNS, len);
}
{
- const int size2 = obj.objsize() + 1 + 2;
- char *p = (char *) getDur().writingPtr(r->data, size2+posz);
- memcpy(p, partial.objdata(), posz);
- *((unsigned *)p) += size2;
- p += posz - 1;
- *p++ = (char) Object;
- *p++ = 'o'; // { o : ... }
- *p++ = 0;
- memcpy(p, obj.objdata(), obj.objsize());
- p += obj.objsize();
- *p = EOO;
+ const int size1 = po_sz-1; // less the EOO char
+ const int objOfs = size1+3; // 3 = byte BSONOBJTYPE + byte 'o' + byte \0
+ char *objInsertPoint = r->data + objOfs;
+
+ // don't bother if obj is tiny as there is some header overhead for the additional journal entry
+ // and also some computational administrative overhead.
+ bool objAppendWorked = obj.objsize() >= 32 &&
+ getDur().objAppend(objInsertPoint, obj.objdata(), obj.objsize());
+
+ void *p = (char *) getDur().writingPtr(r->data, objAppendWorked ? size1 : objOfs+obj.objsize()+1);
+
+ memcpy(p, partial.objdata(), size1);
+
+ // adjust overall bson object size for the o: field
+ *(static_cast<unsigned*>(p)) += obj.objsize() + 1/*fieldtype byte*/ + 2/*"o" fieldname*/;
+
+ if( !objAppendWorked ) {
+ char *b = static_cast<char *>(p);
+ b += size1;
+ *b++ = (char) Object;
+ *b++ = 'o'; // { o : ... }
+ *b++ = 0; // null terminate "o" fieldname
+ memcpy(b, obj.objdata(), obj.objsize());
+ b += obj.objsize();
+ *b = EOO;
+ }
}
context.getClient()->setLastOp( ts.asDate() );
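Either way the finished oplog record ends up with the same byte layout; the only difference is who writes the last few framing bytes (this code on the slow path, objAppend and journal replay on the fast path). With size1 = po_sz - 1 (the partial object minus its trailing EOO), the layout works out to:

    r->data[0 .. size1-1]                    partial object bytes, EOO dropped
    r->data[size1]                           0x03   (BSON type Object)
    r->data[size1+1]                         'o'    (field name)
    r->data[size1+2]                         0x00   (field name terminator)
    r->data[size1+3 .. size1+2+objsize]      the nested object (objInsertPoint = r->data + size1 + 3)
    r->data[size1+3+objsize]                 EOO

which totals po_sz + obj.objsize() + 3, matching len = po_sz + obj.objsize() + 1 + 2 computed above; the outer object's embedded length word is bumped by obj.objsize() + 3 to agree.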
diff --git a/jstests/dur/oplog.js b/jstests/dur/oplog.js
new file mode 100755
index 00000000000..0887826263f
--- /dev/null
+++ b/jstests/dur/oplog.js
@@ -0,0 +1,118 @@
+/*
+ test durability
+*/
+
+var debugging = false;
+if (""+typeof(db) != "undefined")
+ debugging = true;
+var testname = "oplog";
+var step = 1;
+var conn = null;
+
+function log(str) {
+ if(str)
+ print(testname+" step " + step++ + " " + str);
+ else
+ print(testname+" step " + step++);
+}
+
+// if you do inserts here, you will want to set _id. otherwise they won't match on different
+// runs so we can't do a binary diff of the resulting files to check they are consistent.
+function work() {
+ log("work");
+ var d = conn.getDB("test");
+ d.foo.insert({ _id: 3, x: 22 });
+ d.foo.insert({ _id: 4, x: 22 });
+ d.a.insert({ _id: 3, x: 22, y: [1, 2, 3] });
+ d.a.insert({ _id: 4, x: 22, y: [1, 2, 3] });
+ d.a.update({ _id: 4 }, { $inc: { x: 1} });
+ // OpCode_ObjAppend fires on larger operations so make one that isn't tiny
+ var big = "axxxxxxxxxxxxxxb";
+ big = big + big;
+ big = big + big;
+ big = big + big;
+ big = big + big;
+ big = big + big;
+ d.foo.insert({ _id: 5, q: "aaaaa", b: big, z: 3 });
+
+ // assure writes applied in case we kill -9 on return from this function
+ d.getLastError();
+
+ log("endwork");
+}
+
+function verify() {
+ log("verify");
+ var d = conn.getDB("local");
+ assert( d.oplog.$main.find({"o.z":3}).count() == 1, "oplog doesn't match" );
+}
+
+if( debugging ) {
+ // mongod already running in debugger
+ print("DOING DEBUG MODE BEHAVIOR AS 'db' IS DEFINED -- RUN mongo --nodb FOR REGULAR TEST BEHAVIOR");
+ conn = db.getMongo();
+ work();
+ sleep(30000);
+ quit();
+}
+
+log();
+
+// directories
+var path1 = "/data/db/" + testname+"nodur";
+var path2 = "/data/db/" + testname+"dur";
+
+// non-durable version
+log();
+conn = startMongodEmpty("--port", 30000, "--dbpath", path1, "--nodur", "--smallfiles", "--master");
+work();
+stopMongod(30000);
+
+// durable version
+log();
+conn = startMongodEmpty("--port", 30001, "--dbpath", path2, "--dur", "--smallfiles", "--durOptions", /*DurParanoid*/8, "--master");
+work();
+
+// wait for group commit. use getLastError(...) later when that is enhanced.
+sleep(400);
+
+// kill the process hard
+stopMongod(30001, /*signal*/9);
+
+// journal file should be present, and non-empty as we killed hard
+
+// restart and recover
+log();
+conn = startMongodNoReset("--port", 30002, "--dbpath", path2, "--dur", "--smallfiles", "--durOptions", 8, "--master");
+verify();
+
+log("stop");
+stopMongod(30002);
+
+// stopMongod seems to be asynchronous (hmmm) so we sleep here.
+sleep(5000);
+
+// at this point, after clean shutdown, there should be no journal files
+log("check no journal files");
+assert(ls(path2 + "/journal") == null);
+
+log("check data matches ns");
+var diff = run("diff", path1 + "/test.ns", path2 + "/test.ns");
+if (diff != "") {
+ print("\n\n\nDIFFERS\n");
+ print(diff);
+}
+assert(diff == "", "error test.ns files differ");
+
+log("check data matches .0");
+diff = run("diff", path1 + "/test.0", path2 + "/test.0");
+if (diff != "") {
+ print("\n\n\nDIFFERS\n");
+ print(diff);
+}
+assert(diff == "", "error test.0 files differ");
+
+log("check data matches done");
+
+print(testname + " SUCCESS");
+