summaryrefslogtreecommitdiff
path: root/src/mongo/db/dur.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/mongo/db/dur.cpp')
-rw-r--r--src/mongo/db/dur.cpp840
1 files changed, 840 insertions, 0 deletions
diff --git a/src/mongo/db/dur.cpp b/src/mongo/db/dur.cpp
new file mode 100644
index 00000000000..822fa5232c0
--- /dev/null
+++ b/src/mongo/db/dur.cpp
@@ -0,0 +1,840 @@
+// @file dur.cpp durability in the storage engine (crash-safeness / journaling)
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ phases:
+
+ PREPLOGBUFFER
+ we will build an output buffer ourselves and then use O_DIRECT
+ we could be in read lock for this
+ for very large objects write directly to redo log in situ?
+ WRITETOJOURNAL
+ we could be unlocked (the main db lock that is...) for this, with sufficient care, but there is some complexity
+ have to handle falling behind which would use too much ram (going back into a read lock would suffice to stop that).
+ for now (1.7.5/1.8.0) we are in read lock which is not ideal.
+ WRITETODATAFILES
+ apply the writes back to the non-private MMF after they are for certain in redo log
+ REMAPPRIVATEVIEW
+ we could in a write lock quickly flip readers back to the main view, then stay in read lock and do our real
+ remapping. with many files (e.g., 1000), remapping could be time consuming (several ms), so we don't want
+ to be too frequent.
+ there could be a slow down immediately after remapping as fresh copy-on-writes for commonly written pages will
+ be required. so doing these remaps fractionally is helpful.
+
+ mutexes:
+
+ READLOCK dbMutex
+ LOCK groupCommitMutex
+ PREPLOGBUFFER()
+ READLOCK mmmutex
+ commitJob.reset()
+ UNLOCK dbMutex // now other threads can write
+ WRITETOJOURNAL()
+ WRITETODATAFILES()
+ UNLOCK mmmutex
+ UNLOCK groupCommitMutex
+
+ on the next write lock acquisition for dbMutex: // see MongoMutex::_acquiredWriteLock()
+ REMAPPRIVATEVIEW()
+
+ @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc
+*/
+
+#include "pch.h"
+#include "cmdline.h"
+#include "client.h"
+#include "dur.h"
+#include "dur_journal.h"
+#include "dur_commitjob.h"
+#include "dur_recover.h"
+#include "dur_stats.h"
+#include "../util/concurrency/race.h"
+#include "../util/mongoutils/hash.h"
+#include "../util/mongoutils/str.h"
+#include "../util/timer.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ namespace dur {
+
+ void PREPLOGBUFFER(JSectHeader& outParm);
+ void WRITETOJOURNAL(JSectHeader h, AlignedBuilder& uncompressed);
+ void WRITETODATAFILES(const JSectHeader& h, AlignedBuilder& uncompressed);
+
+ /** declared later in this file
+ only used in this file -- use DurableInterface::commitNow() outside
+ */
+ static void groupCommit();
+
+ CommitJob& commitJob = *(new CommitJob()); // don't destroy
+
+ Stats stats;
+
+ /** Clear every counter in this bucket by zeroing the object's bytes. */
+ void Stats::S::reset() {
+     memset(this, 0, sizeof(S));
+ }
+
+ /** Start with both buckets zeroed, bucket _a current, and a 3 second rotation interval. */
+ Stats::Stats() {
+     _a.reset();
+     _b.reset();
+     curr = &_a;
+     _intervalMicros = 3000000;
+     // rotate() subtracts _lastRotate from the current time, so it needs a defined value.
+     // zero matches what the zero-initialized global 'stats' already sees: the first
+     // rotate() call swaps buckets right away.
+     _lastRotate = 0;
+ }
+
+ /** @return whichever of the two buckets is not 'curr'. */
+ Stats::S * Stats::other() {
+     if( curr == &_a )
+         return &_b;
+     return &_a;
+ }
+ string _CSVHeader();
+
+ /** Column titles matching, in order, the fields written by _asCSV(). */
+ string Stats::S::_CSVHeader() {
+     const char *titles = "cmts jrnMB\twrDFMB\tcIWLk\tearly\tprpLgB wrToJ\twrToDF\trmpPrVw";
+     return titles;
+ }
+
+ /** One tab separated row of this bucket's counters.
+     Byte counts are shown in (decimal) megabytes and timings in milliseconds.
+ */
+ string Stats::S::_asCSV() {
+     stringstream row;
+     row << setprecision(2);
+     row << _commits << '\t';
+     // fixed notation applies from here on, i.e. to the floating point megabyte values
+     row << fixed;
+     row << _journaledBytes / 1000000.0 << '\t';
+     row << _writeToDataFilesBytes / 1000000.0 << '\t';
+     row << _commitsInWriteLock << '\t';
+     row << _earlyCommits << '\t';
+     row << (unsigned) (_prepLogBufferMicros/1000) << '\t';
+     row << (unsigned) (_writeToJournalMicros/1000) << '\t';
+     row << (unsigned) (_writeToDataFilesMicros/1000) << '\t';
+     row << (unsigned) (_remapPrivateViewMicros/1000);
+     return row.str();
+ }
+
+ //int getAgeOutJournalFiles();
+ BSONObj Stats::S::_asObj() { // report this bucket's counters as a BSON document
+ BSONObjBuilder b;
+ b <<
+ "commits" << _commits <<
+ "journaledMB" << _journaledBytes / 1000000.0 << // bytes -> (decimal) megabytes
+ "writeToDataFilesMB" << _writeToDataFilesBytes / 1000000.0 <<
+ "compression" << _journaledBytes / (_uncompressedBytes+1.0) << // the +1.0 keeps the divisor nonzero
+ "commitsInWriteLock" << _commitsInWriteLock <<
+ "earlyCommits" << _earlyCommits <<
+ "timeMs" << // nested document of timings
+ BSON( "dt" << _dtMillis <<
+ "prepLogBuffer" << (unsigned) (_prepLogBufferMicros/1000) << // microseconds -> milliseconds
+ "writeToJournal" << (unsigned) (_writeToJournalMicros/1000) <<
+ "writeToDataFiles" << (unsigned) (_writeToDataFilesMicros/1000) <<
+ "remapPrivateView" << (unsigned) (_remapPrivateViewMicros/1000)
+ );
+ /*int r = getAgeOutJournalFiles();
+ if( r == -1 )
+ b << "ageOutJournalFiles" << "mutex timeout";
+ if( r == 0 )
+ b << "ageOutJournalFiles" << false;*/
+ if( cmdLine.journalCommitInterval != 0 ) // only reported when an explicit interval is configured
+ b << "journalCommitIntervalMs" << cmdLine.journalCommitInterval;
+ return b.obj();
+ }
+
+ /** Report the bucket that is not currently being accumulated into. */
+ BSONObj Stats::asObj() {
+     S *notCurrent = other();
+     return notCurrent->_asObj();
+ }
+
+ /** Swap buckets once at least _intervalMicros has passed since the last swap.
+     The outgoing bucket records how long it was current (_dtMillis); the incoming one is cleared.
+     An interval of zero disables rotation.
+ */
+ void Stats::rotate() {
+     const unsigned long long now = curTimeMicros64();
+     const unsigned long long elapsed = now - _lastRotate;
+     if( !_intervalMicros || elapsed < _intervalMicros )
+         return; // not time yet (or rotation disabled)
+
+     curr->_dtMillis = (unsigned) (elapsed/1000);
+     _lastRotate = now;
+     curr = other();
+     curr->reset();
+ }
+
+ /** Non-durable variant: a plain copy of len bytes from src to dst. */
+ void NonDurableImpl::setNoJournal(void *dst, void *src, unsigned len) {
+     memcpy( dst, src, len );
+ }
+
+ void DurableImpl::setNoJournal(void *dst, void *src, unsigned len) { // copy src to dst without declaring a write intent
+ // we are at least read locked, so we need not worry about REMAPPRIVATEVIEW herein.
+ DEV d.dbMutex.assertAtLeastReadLocked();
+
+ MemoryMappedFile::makeWritable(dst, len);
+
+ // we enter the RecoveryJob mutex here, so that if WRITETODATAFILES is happening we do not
+ // conflict with it
+ scoped_lock lk1( RecoveryJob::get()._mx );
+
+ // we stay in this mutex for everything to work with DurParanoid/validateSingleMapMatches
+ //
+ // either of these mutexes also makes setNoJournal threadsafe, which is good as we call it from a read
+ // (not a write) lock in class SlaveTracking
+ //
+ scoped_lock lk( privateViews._mutex() );
+
+ size_t ofs; // presumably filled in by find_inlock with dst's offset inside the file
+ MongoMMF *f = privateViews.find_inlock(dst, ofs);
+ assert(f);
+ void *w = (((char *)f->view_write())+ofs); // the same offset, but in the file's writable view
+ // first write it to the writable (file) view
+ memcpy(w, src, len);
+ if( memcmp(w, dst, len) ) { // the two views disagree after updating only the writable one
+ // if we get here, a copy-on-write had previously occurred. so write it to the private view too
+ // to keep them in sync. we do this as we do not want to cause a copy on write unnecessarily.
+ memcpy(dst, src, len);
+ }
+ }
+
+ /** All of the write intent helpers end up here: the range of len bytes at p is noted with the commit job. */
+ void DurableImpl::declareWriteIntent(void *p, unsigned len) {
+     CommitJob& job = commitJob;
+     job.note(p, len);
+ }
+
+ static DurableImpl* durableImpl = new DurableImpl(); // heap allocated; no delete appears in this file
+ static NonDurableImpl* nonDurableImpl = new NonDurableImpl();
+ DurableInterface* DurableInterface::_impl = nonDurableImpl; // non-durable until enableDurability() is called
+
+ /** Make the journaling implementation the active one. Asserts that the non-durable one is active on entry. */
+ void DurableInterface::enableDurability() {
+     assert( nonDurableImpl == _impl );
+     _impl = durableImpl;
+ }
+
+ /** Make the non-durable implementation the active one.
+     Asserts that the durable one is active on entry, and fails with error 13616 if the
+     commit job still has writes pending.
+ */
+ void DurableInterface::disableDurability() {
+     assert( durableImpl == _impl );
+     const bool writesPending = commitJob.hasWritten();
+     massert(13616, "can't disable durability with pending writes", !writesPending);
+     _impl = nonDurableImpl;
+ }
+
+ /** Run a group commit immediately, counting it as an early commit in the stats.
+     @return always true
+ */
+ bool DurableImpl::commitNow() {
+     ++stats.curr->_earlyCommits;
+     groupCommit();
+     return true;
+ }
+
+ /** Block in commitJob._notify.awaitBeyondNow() until it returns.
+     @return always true
+ */
+ bool DurableImpl::awaitCommit() {
+     commitJob._notify.awaitBeyondNow();
+     return true;
+ }
+
+ /** Note with the commit job that a file was created.
+     Ordinarily a write is applied only once it has been journaled. File creation is the
+     reverse: the file already exists, and the journal entry lets the creation be replayed
+     if a crash kept it from taking effect.
+     @param filename the file that was created
+     @param len its length
+ */
+ void DurableImpl::createdFile(string filename, unsigned long long len) {
+     shared_ptr<DurOp> created( new FileCreatedOp(filename, len) );
+     commitJob.noteOp(created);
+ }
+
+ /** Declare intent to write len bytes at x.
+     @return x, unchanged
+ */
+ void* DurableImpl::writingPtr(void *x, unsigned len) {
+     declareWriteIntent(x, len);
+     return x;
+ }
+
+ /** Declare intent to write into part of a buffer.
+     @param buf start of the buffer
+     @param ofs offset within buf at which the write will begin
+     @param len number of bytes, starting at ofs, that will be written
+     @return buf itself (not buf+ofs)
+ */
+ void* DurableImpl::writingAtOffset(void *buf, unsigned ofs, unsigned len) {
+     char *base = (char *) buf;
+     char *target = base + ofs;
+     declareWriteIntent(target, len);
+     return base;
+ }
+
+ /** Declare intent to write several ranges of one buffer.
+     @param buf start of the buffer
+     @param ranges (offset from buf, length) pairs, one per range
+     @return buf
+ */
+ void* DurableImpl::writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges ) {
+     typedef vector< pair< long long, unsigned > >::const_iterator RangeIter;
+     char *base = (char *) buf;
+     RangeIter it = ranges.begin();
+     const RangeIter stop = ranges.end();
+     while( it != stop ) {
+         declareWriteIntent( base + it->first, it->second );
+         ++it;
+     }
+     return base;
+ }
+
+ /** @return true when the commit job holds more than UncommittedBytesLimit bytes.
+     (In DEV builds this also zeroes the commitIfNeeded call counter.)
+ */
+ bool DurableImpl::aCommitIsNeeded() const {
+     DEV commitJob._nSinceCommitIfNeededCall = 0;
+     const bool overLimit = commitJob.bytes() > UncommittedBytesLimit;
+     return overLimit;
+ }
+
+ /** Group commit early when the caller holds the write lock and more than
+     UncommittedBytesLimit bytes are waiting in the commit job.
+     @return true if a commit was run, false otherwise (including when not write locked)
+ */
+ bool DurableImpl::commitIfNeeded() {
+     if( d.dbMutex.isWriteLocked() ) {
+         DEV commitJob._nSinceCommitIfNeededCall = 0;
+         // should this also fire if CmdLine::DurAlwaysCommit?
+         const bool overLimit = commitJob.bytes() > UncommittedBytesLimit;
+         if( overLimit ) {
+             stats.curr->_earlyCommits++;
+             groupCommit();
+             return true;
+         }
+     }
+     return false;
+ }
+
+ /** Used in _DEBUG builds to check that we didn't overwrite the last intent
+ that was declared. called just before writelock release. we check a few
+ bytes after the declared region to see if they changed.
+
+ @see MongoMutex::_releasedWriteLock
+
+ SLOW
+ */
+#if 0
+ /** (currently compiled out by the surrounding #if 0)
+     Compare the 8 bytes that follow the most recently declared write intent in the private
+     view with the same 8 bytes in the writable view. If they differ, and no declared intent
+     covers those bytes, log the discrepancy: it suggests a write ran past its declared region.
+ */
+ void DurableImpl::debugCheckLastDeclaredWrite() {
+     static int n; // number of calls so far; included in the log output
+     ++n;
+
+     assert(debug && cmdLine.dur);
+     if (commitJob.writes().empty())
+         return;
+     const WriteIntent &i = commitJob.lastWrite();
+     size_t ofs;
+     MongoMMF *mmf = privateViews.find(i.start(), ofs);
+     if( mmf == 0 )
+         return;
+     size_t past = ofs + i.length();
+     if( mmf->length() < past + 8 )
+         return; // too close to end of view
+     char *priv = (char *) mmf->getView();
+     char *writ = (char *) mmf->view_write();
+     unsigned long long *a = (unsigned long long *) (priv+past);
+     unsigned long long *b = (unsigned long long *) (writ+past);
+     if( *a != *b ) {
+         // a difference is fine if some declared intent overlaps the 8 bytes we looked at.
+         // the scan has to run from begin() to end(); it previously used begin() for both
+         // bounds, so it never examined any intent.
+         for( set<WriteIntent>::iterator it(commitJob.writes().begin()), end(commitJob.writes().end()); it != end; ++it ) {
+             const WriteIntent& wi = *it;
+             char *r1 = (char*) wi.start();
+             char *r2 = (char*) wi.end();
+             if( r1 <= (((char*)a)+8) && r2 > (char*)a ) {
+                 //log() << "it's ok " << wi.p << ' ' << wi.len << endl;
+                 return;
+             }
+         }
+         log() << "journal data after write area " << i.start() << " does not agree" << endl;
+         log() << " was: " << ((void*)b) << " " << hexdump((char*)b, 8) << endl;
+         log() << " now: " << ((void*)a) << " " << hexdump((char*)a, 8) << endl;
+         log() << " n: " << n << endl;
+         log() << endl;
+     }
+ }
+#endif
+
+ // Functor to be called over all MongoFiles: for each MongoMMF it compares the private view with the writable view
+
+ class validateSingleMapMatches {
+ public:
+ validateSingleMapMatches(unsigned long long& bytes) :_bytes(bytes) {} // bytes: caller's running total of bytes checked
+ void operator () (MongoFile *mf) {
+ if( mf->isMongoMMF() ) { // files of any other kind are skipped
+ MongoMMF *mmf = (MongoMMF*) mf;
+ const unsigned char *p = (const unsigned char *) mmf->getView(); // private view
+ const unsigned char *w = (const unsigned char *) mmf->view_write(); // writable (file) view
+
+ if (!p || !w) return; // File not fully opened yet
+
+ _bytes += mmf->length();
+
+ assert( mmf->length() == (unsigned) mmf->length() ); // the length must fit in an unsigned, as the code below assumes
+ {
+ scoped_lock lk( privateViews._mutex() ); // see setNoJournal
+ if (memcmp(p, w, (unsigned) mmf->length()) == 0)
+ return; // next file
+ }
+
+ unsigned low = 0xffffffff; // lowest mismatching offset seen (0xffffffff == none yet)
+ unsigned high = 0; // highest mismatching offset seen
+ log() << "DurParanoid mismatch in " << mmf->filename() << endl;
+ int logged = 0; // number of mismatching bytes seen so far; only the first 59 are printed
+ unsigned lastMismatch = 0xffffffff;
+ for( unsigned i = 0; i < mmf->length(); i++ ) {
+ if( p[i] != w[i] ) {
+ if( lastMismatch != 0xffffffff && lastMismatch+1 != i )
+ log() << endl; // separate blocks of mismatches
+ lastMismatch= i;
+ if( ++logged < 60 ) {
+ if( logged == 1 )
+ log() << "ofs % 628 = 0x" << hex << (i%628) << endl; // for .ns files to find offset in record
+ stringstream ss;
+ ss << "mismatch ofs:" << hex << i << "\tfilemap:" << setw(2) << (unsigned) w[i] << "\tprivmap:" << setw(2) << (unsigned) p[i];
+ if( p[i] > 32 && p[i] <= 126 ) // also show the private view's byte as a character when it is printable ASCII
+ ss << '\t' << p[i];
+ log() << ss.str() << endl;
+ }
+ if( logged == 60 )
+ log() << "..." << endl;
+ if( i < low ) low = i;
+ if( i > high ) high = i;
+ }
+ }
+ if( low != 0xffffffff ) { // at least one mismatch was found by the byte scan
+ std::stringstream ss;
+ ss << "journal error warning views mismatch " << mmf->filename() << ' ' << (hex) << low << ".." << high << " len:" << high-low+1;
+ log() << ss.str() << endl;
+ log() << "priv loc: " << (void*)(p+low) << ' ' << endl;
+ set<WriteIntent>& b = commitJob.writes();
+ (void)b; // mark as unused. Useful for inspection in debugger
+
+ // should we abort() here so this isn't unnoticed in some circumstances?
+ massert(13599, "Written data does not match in-memory view. Missing WriteIntent?", false);
+ }
+ }
+ }
+ private:
+ unsigned long long& _bytes;
+ };
+
+ /** (SLOW) diagnostic: run validateSingleMapMatches over every MongoFile to check that the
+     private and non-private views agree. Does nothing unless the DurParanoid option is set.
+ */
+ void debugValidateAllMapsMatch() {
+     const bool paranoid = ( cmdLine.durOptions & CmdLine::DurParanoid ) != 0;
+     if( !paranoid )
+         return;
+
+     unsigned long long totalBytes = 0;
+     Timer elapsed;
+     MongoFile::forEach(validateSingleMapMatches(totalBytes));
+     OCCASIONALLY log() << "DurParanoid map check " << elapsed.millis() << "ms for " << (totalBytes / (1024*1024)) << "MB" << endl;
+ }
+
+ extern size_t privateMapBytes;
+
+ /** Remap a portion of the private views. Requires the db write lock and no uncommitted writes.
+     Each call handles a fraction of the open files, chosen from the time elapsed since the
+     previous call (aiming at all files about every 2 seconds) and from how many private
+     bytes have accumulated. The static 'startAt' remembers where the next call resumes.
+ */
+ static void _REMAPPRIVATEVIEW() {
+     // todo: Consider using ProcessInfo herein and watching for getResidentSize to drop. that could be a way
+     // to assure very good behavior here.
+
+     static unsigned startAt;             // position in the file set at which to begin
+     static unsigned long long lastRemap; // time (micros) of the previous call
+
+     LOG(4) << "journal REMAPPRIVATEVIEW" << endl;
+
+     d.dbMutex.assertWriteLocked();
+     d.dbMutex._remapPrivateViewRequested = false;
+     assert( !commitJob.hasWritten() );
+
+     // we want to remap all private views about every 2 seconds. there could be ~1000 views so
+     // we do a little each pass; beyond the remap time, more significantly, there will be copy on write
+     // faults after remapping, so doing a little bit at a time will avoid big load spikes on
+     // remapping.
+     unsigned long long now = curTimeMicros64();
+     double fraction = (now-lastRemap)/2000000.0;
+     if( cmdLine.durOptions & CmdLine::DurAlwaysRemap )
+         fraction = 1;
+     lastRemap = now;
+
+     LockMongoFilesShared lk;
+     set<MongoFile*>& files = MongoFile::getAllFiles();
+     unsigned sz = files.size();
+     if( sz == 0 )
+         return;
+
+     {
+         // be careful not to use too much memory if the write rate is
+         // extremely high
+         double f = privateMapBytes / ((double)UncommittedBytesLimit);
+         if( f > fraction ) {
+             fraction = f;
+         }
+         privateMapBytes = 0;
+     }
+
+     // number of files to visit this pass, clamped to [1, sz]
+     unsigned ntodo = (unsigned) (sz * fraction);
+     if( ntodo < 1 ) ntodo = 1;
+     if( ntodo > sz ) ntodo = sz;
+
+     const set<MongoFile*>::iterator b = files.begin();
+     const set<MongoFile*>::iterator e = files.end();
+     set<MongoFile*>::iterator i = b;
+     // skip to our starting position
+     for( unsigned x = 0; x < startAt; x++ ) {
+         i++;
+         if( i == e ) i = b;
+     }
+     unsigned startedAt = startAt;
+     startAt = (startAt + ntodo) % sz; // mark where to start next time
+
+     Timer t;
+     for( unsigned x = 0; x < ntodo; x++ ) {
+         dassert( i != e );
+         if( (*i)->isMongoMMF() ) {
+             MongoMMF *mmf = (MongoMMF*) *i;
+             assert(mmf);
+             if( mmf->willNeedRemap() ) {
+                 mmf->willNeedRemap() = false;
+                 mmf->remapThePrivateView();
+             }
+         }
+         // move on whether or not this entry was a MongoMMF. advancing only inside the branch
+         // above would leave the iterator stuck on a non-MMF file for the rest of the pass.
+         i++;
+         if( i == e ) i = b;
+     }
+     LOG(2) << "journal REMAPPRIVATEVIEW done startedAt: " << startedAt << " n:" << ntodo << ' ' << t.millis() << "ms" << endl;
+ }
+
+ /** Periodic remapping keeps the private views from growing very large.
+     Call within write lock; see the commentary at the top of the file.
+     The time spent is added to the current stats bucket.
+ */
+ void REMAPPRIVATEVIEW() {
+     Timer stopwatch;
+     _REMAPPRIVATEVIEW();
+     stats.curr->_remapPrivateViewMicros += stopwatch.micros();
+ }
+
+ // lock order: dbMutex first, then this
+ mutex groupCommitMutex("groupCommit");
+
+ bool _groupCommitWithLimitedLocks() { // group commit that releases the write-excluding lock before the journal and data file writes
+
+ int p = 0; // step counter, used only to number the LOG(4) trace lines below
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ scoped_ptr<ExcludeAllWrites> lk1( new ExcludeAllWrites() ); // heap allocated so it can be released early via lk1.reset()
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ scoped_lock lk2(groupCommitMutex);
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ commitJob.beginCommit();
+
+ if( !commitJob.hasWritten() ) {
+ // getlasterror request could have come after the data was already committed
+ commitJob.notifyCommitted();
+ return true;
+ }
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ JSectHeader h;
+ PREPLOGBUFFER(h); // need to be in readlock (writes excluded) for this
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ LockMongoFilesShared lk3;
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ unsigned abLen = commitJob._ab.len(); // remembered so the asserts below can detect the builder being modified
+ commitJob.reset(); // must be reset before allowing anyone to write
+ DEV assert( !commitJob.hasWritten() );
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ // release the readlock -- allowing others to now write while we are writing to the journal (etc.)
+ lk1.reset();
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ // ****** now other threads can do writes ******
+
+ WRITETOJOURNAL(h, commitJob._ab);
+ assert( abLen == commitJob._ab.len() ); // a check that no one touched the builder while we were doing work. if so, our locking is wrong.
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ // data is now in the journal, which is sufficient for acknowledging getLastError.
+ // (ok to crash after that)
+ commitJob.notifyCommitted();
+
+ LOG(4) << "groupcommitll " << p++ << " WRITETODATAFILES()" << endl;
+
+ WRITETODATAFILES(h, commitJob._ab);
+ assert( abLen == commitJob._ab.len() ); // check again wasn't modded
+ commitJob._ab.reset();
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ // can't : d.dbMutex._remapPrivateViewRequested = true;
+
+ return true; // every path visible in this function returns true
+ }
+
+ /** Run _groupCommitWithLimitedLocks(), treating any exception listed below as fatal: it is logged and mongoAbort() is called.
+     @return true if committed; false if lock acquisition timed out (we only try for a read lock herein and only wait for a certain duration).
+     NOTE(review): in the code visible in this file the helper always returns true, and the final
+     'return false' is reached only if mongoAbort() returns -- confirm whether a timeout path still exists.
+ */
+ bool groupCommitWithLimitedLocks() {
+ try {
+ return _groupCommitWithLimitedLocks();
+ }
+ catch(DBException& e ) {
+ log() << "dbexception in groupCommitLL causing immediate shutdown: " << e.toString() << endl;
+ mongoAbort("dur1");
+ }
+ catch(std::ios_base::failure& e) {
+ log() << "ios_base exception in groupCommitLL causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("dur2");
+ }
+ catch(std::bad_alloc& e) {
+ log() << "bad_alloc exception in groupCommitLL causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("dur3");
+ }
+ catch(std::exception& e) { // any other standard exception
+ log() << "exception in dur::groupCommitLL causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("dur4");
+ }
+ return false;
+ }
+
+ static void _groupCommit() { // full group commit: journal, apply to data files, then remap (now or on the next write lock)
+
+ LOG(4) << "_groupCommit " << endl;
+
+ // we need to be at least read locked on the dbMutex so that we know the write intent data
+ // structures are not changing while we work
+ d.dbMutex.assertAtLeastReadLocked();
+
+ commitJob.beginCommit();
+
+ if( !commitJob.hasWritten() ) {
+ // getlasterror request could have come after the data was already committed
+ commitJob.notifyCommitted();
+ return;
+ }
+
+ // we need to make sure two group commits aren't running at the same time
+ // (and we are only read locked in the dbMutex, so it could happen)
+ scoped_lock lk(groupCommitMutex);
+
+ JSectHeader h;
+ PREPLOGBUFFER(h);
+
+ // todo : write to the journal outside locks, as this write can be slow.
+ // however, be careful then about remapprivateview as that cannot be done
+ // if new writes are then pending in the private maps.
+ WRITETOJOURNAL(h, commitJob._ab);
+
+ // data is now in the journal, which is sufficient for acknowledging getLastError.
+ // (ok to crash after that)
+ commitJob.notifyCommitted();
+
+ WRITETODATAFILES(h, commitJob._ab);
+ debugValidateAllMapsMatch(); // no-op unless the DurParanoid option is set
+
+ commitJob.reset();
+ commitJob._ab.reset();
+
+ // REMAPPRIVATEVIEW
+ //
+ // remapping private views must occur after WRITETODATAFILES otherwise
+ // we wouldn't see newly written data on reads.
+ //
+ DEV assert( !commitJob.hasWritten() );
+ if( !d.dbMutex.isWriteLocked() ) {
+ // this needs done in a write lock (as there is a short window during remapping when each view
+ // might not exist) thus we do it on the next acquisition of that instead of here (there is no
+ // rush if you aren't writing anyway -- but it must happen, if it is done, before any uncommitted
+ // writes occur). If desired, perhaps this can be eliminated on posix as it may be that the remap
+ // is race-free there.
+ //
+ d.dbMutex._remapPrivateViewRequested = true;
+ }
+ else {
+ stats.curr->_commitsInWriteLock++;
+ // however, if we are already write locked, we must do it now -- up the call tree someone
+ // may do a write without a new lock acquisition. this can happen when MongoMMF::close() calls
+ // this method when a file (and its views) is about to go away.
+ //
+ REMAPPRIVATEVIEW();
+ }
+ }
+
+ /** Run _groupCommit(); any exception caught below is logged and followed by mongoAbort().
+ locking: in read lock when called
+ or, for early commits (commitIfNeeded), in write lock
+ @see MongoMMF::close()
+ */
+ static void groupCommit() {
+ try {
+ _groupCommit();
+ }
+ catch(DBException& e ) {
+ log() << "dbexception in groupCommit causing immediate shutdown: " << e.toString() << endl;
+ mongoAbort("gc1");
+ }
+ catch(std::ios_base::failure& e) {
+ log() << "ios_base exception in groupCommit causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("gc2");
+ }
+ catch(std::bad_alloc& e) {
+ log() << "bad_alloc exception in groupCommit causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("gc3");
+ }
+ catch(std::exception& e) { // any other standard exception
+ log() << "exception in dur::groupCommit causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("gc4");
+ }
+ LOG(4) << "groupCommit end" << endl;
+ }
+
+ static void go() { // one commit pass for the journal thread (called from durThread)
+ const int N = 10; // every Nth pass skips the limited-locks path (see the comment below)
+ static int n; // pass counter; only incremented while privateMapBytes is under the limit (short-circuit &&)
+ if( privateMapBytes < UncommittedBytesLimit && ++n % N && (cmdLine.durOptions&CmdLine::DurAlwaysRemap)==0 ) {
+ // limited locks version doesn't do any remapprivateview at all, so only try this if privateMapBytes
+ // is in an acceptable range. also every Nth commit, we do everything so we can do some remapping;
+ // remapping a lot all at once could cause jitter from a large amount of copy-on-writes all at once.
+ if( groupCommitWithLimitedLocks() )
+ return;
+ }
+ else {
+ readlocktry lk("", 1000); // presumably a 1000 ms limit on acquiring the read lock -- confirm against readlocktry
+ if( lk.got() ) {
+ groupCommit();
+ return;
+ }
+ }
+
+ // starvation on read locks could occur. so if read lock acquisition is slow, try to get a
+ // write lock instead. otherwise journaling could be delayed too long (too much data will
+ // not accumulate though, as commitIfNeeded logic will have executed in the meantime if there
+ // has been writes)
+ writelock lk;
+ groupCommit();
+ }
+
+ /** Called when a MongoMMF is closing: group commit now, before the file's views disappear.
+     Does nothing when journaling is off. Without at least a read lock we can only be in
+     shutdown (asserted), and all that is done is to warn if writes are still pending.
+ */
+ void closingFileNotification() {
+     if( !cmdLine.dur )
+         return;
+
+     if( d.dbMutex.atLeastReadLocked() ) {
+         groupCommit();
+         return;
+     }
+
+     assert( inShutdown() );
+     if( !commitJob.hasWritten() )
+         return;
+     log() << "journal warning files are closing outside locks with writes pending" << endl;
+ }
+
+ extern int groupCommitIntervalMs;
+ boost::filesystem::path getJournalDir();
+
+ void durThread() { // body of the journal thread started by startup(): commit in a loop until shutdown
+ Client::initThread("journal");
+
+ bool samePartition = true;
+ try {
+ const string dbpathDir = boost::filesystem::path(dbpath).native_directory_string();
+ samePartition = onSamePartition(getJournalDir().string(), dbpathDir);
+ }
+ catch(...) { // best effort: on any failure samePartition keeps its default of true
+ }
+
+ while( !inShutdown() ) {
+ RACECHECK
+
+ unsigned ms = cmdLine.journalCommitInterval; // re-read on every pass
+ if( ms == 0 ) {
+ // use default
+ ms = samePartition ? 100 : 30;
+ }
+
+ unsigned oneThird = (ms / 3) + 1; // +1 so never zero
+
+ try {
+ stats.rotate();
+
+ // we do this in a couple blocks (the invoke()), which makes it a tiny bit faster (only a little) on throughput,
+ // but is likely also less spiky on our cpu usage, which is good.
+
+ // commit sooner if one or more getLastError j:true is pending
+ sleepmillis(oneThird);
+ for( unsigned i = 1; i <= 2; i++ ) { // up to two more sleeps of a third each, skipped as soon as someone is waiting
+ if( commitJob._notify.nWaiting() )
+ break;
+ commitJob.wi()._deferred.invoke();
+ sleepmillis(oneThird);
+ }
+
+ go();
+ }
+ catch(std::exception& e) {
+ log() << "exception in durThread causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("exception in durThread");
+ }
+ }
+ cc().shutdown();
+ }
+
+ void recover();
+
+ unsigned notesThisLock = 0;
+
+ /** Runs the commitIfNeeded check implicitly on each write unlock: group commit when more than
+     UncommittedBytesLimit bytes are pending, or always when the DurAlwaysCommit option is set.
+ */
+ void releasingWriteLock() {
+     DEV notesThisLock = 0;
+     DEV commitJob._nSinceCommitIfNeededCall = 0; // implicit commit if needed
+
+     bool mustCommit = commitJob.bytes() > UncommittedBytesLimit;
+     if( !mustCommit )
+         mustCommit = ( cmdLine.durOptions & CmdLine::DurAlwaysCommit ) != 0;
+     if( !mustCommit )
+         return;
+
+     stats.curr->_earlyCommits++;
+     groupCommit();
+ }
+
+ void preallocateFiles();
+
+ /** at startup, recover, and then start the journal threads.
+     does nothing when cmdLine.dur is off. an exception from recover() is logged and rethrown.
+ */
+ void startup() {
+ if( !cmdLine.dur )
+ return;
+
+#if defined(_DURABLEDEFAULTON)
+ DEV {
+ if( time(0) & 1 ) { // depends on the low bit of the current time, so it varies from run to run
+ cmdLine.durOptions |= CmdLine::DurAlwaysCommit;
+ log() << "_DEBUG _DURABLEDEFAULTON : forcing DurAlwaysCommit mode for this run" << endl;
+ }
+ if( time(0) & 2 ) {
+ cmdLine.durOptions |= CmdLine::DurAlwaysRemap;
+ log() << "_DEBUG _DURABLEDEFAULTON : forcing DurAlwaysRemap mode for this run" << endl;
+ }
+ }
+#endif
+
+ DurableInterface::enableDurability();
+
+ journalMakeDir();
+ try {
+ recover();
+ }
+ catch(...) {
+ log() << "exception during recovery" << endl;
+ throw;
+ }
+
+ preallocateFiles();
+
+ boost::thread t(durThread); // start the journal thread; the local handle goes out of scope immediately
+ }
+
+ void DurableImpl::syncDataAndTruncateJournal() { // commit, flush all files, then clean up the journal; requires the write lock
+ d.dbMutex.assertWriteLocked();
+
+ // a commit from the commit thread won't begin while we are in the write lock,
+ // but it may already be in progress and the end of that work is done outside
+ // (dbMutex) locks. This line waits for that to complete if already underway.
+ {
+ scoped_lock lk(groupCommitMutex); // acquired and released at once; its only purpose is the wait described above
+ }
+
+ groupCommit();
+ MongoFile::flushAll(true);
+ journalCleanup();
+
+ assert(!haveJournalFiles()); // Double check post-conditions
+ }
+
+ } // namespace dur
+
+} // namespace mongo