diff options
Diffstat (limited to 'src/mongo/db/pdfile.cpp')
-rw-r--r-- | src/mongo/db/pdfile.cpp | 2425 |
1 files changed, 2425 insertions, 0 deletions
diff --git a/src/mongo/db/pdfile.cpp b/src/mongo/db/pdfile.cpp new file mode 100644 index 00000000000..069eeadec37 --- /dev/null +++ b/src/mongo/db/pdfile.cpp @@ -0,0 +1,2425 @@ +// pdfile.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* +todo: +_ table scans must be sequential, not next/prev pointers +_ coalesce deleted +_ disallow system* manipulations from the database. +*/ + +#include "pch.h" +#include "pdfile.h" +#include "db.h" +#include "../util/mmap.h" +#include "../util/hashtab.h" +#include "../util/file_allocator.h" +#include "../util/processinfo.h" +#include "../util/file.h" +#include "btree.h" +#include "btreebuilder.h" +#include <algorithm> +#include <list> +#include "repl.h" +#include "dbhelpers.h" +#include "namespace-inl.h" +#include "queryutil.h" +#include "extsort.h" +#include "curop-inl.h" +#include "background.h" +#include "compact.h" +#include "ops/delete.h" +#include "instance.h" +#include "replutil.h" + +namespace mongo { + + BOOST_STATIC_ASSERT( sizeof(Extent)-4 == 48+128 ); + BOOST_STATIC_ASSERT( sizeof(DataFileHeader)-4 == 8192 ); + + void printMemInfo( const char * where ) { + cout << "mem info: "; + if ( where ) + cout << where << " "; + ProcessInfo pi; + if ( ! pi.supported() ) { + cout << " not supported" << endl; + return; + } + + cout << "vsize: " << pi.getVirtualMemorySize() << " resident: " << pi.getResidentSize() << " mapped: " << ( MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ) ) << endl; + } + + bool isValidNS( const StringData& ns ) { + // TODO: should check for invalid characters + + const char * x = strchr( ns.data() , '.' ); + if ( ! x ) + return false; + + x++; + return *x > 0; + } + + bool inDBRepair = false; + struct doingRepair { + doingRepair() { + assert( ! inDBRepair ); + inDBRepair = true; + } + ~doingRepair() { + inDBRepair = false; + } + }; + + map<string, unsigned> BackgroundOperation::dbsInProg; + set<string> BackgroundOperation::nsInProg; + + bool BackgroundOperation::inProgForDb(const char *db) { + assertInWriteLock(); + return dbsInProg[db] != 0; + } + + bool BackgroundOperation::inProgForNs(const char *ns) { + assertInWriteLock(); + return nsInProg.count(ns) != 0; + } + + void BackgroundOperation::assertNoBgOpInProgForDb(const char *db) { + uassert(12586, "cannot perform operation: a background operation is currently running for this database", + !inProgForDb(db)); + } + + void BackgroundOperation::assertNoBgOpInProgForNs(const char *ns) { + uassert(12587, "cannot perform operation: a background operation is currently running for this collection", + !inProgForNs(ns)); + } + + BackgroundOperation::BackgroundOperation(const char *ns) : _ns(ns) { + assertInWriteLock(); + dbsInProg[_ns.db]++; + assert( nsInProg.count(_ns.ns()) == 0 ); + nsInProg.insert(_ns.ns()); + } + + BackgroundOperation::~BackgroundOperation() { + wassert( d.dbMutex.isWriteLocked() ); + dbsInProg[_ns.db]--; + nsInProg.erase(_ns.ns()); + } + + void BackgroundOperation::dump(stringstream& ss) { + if( nsInProg.size() ) { + ss << "\n<b>Background Jobs in Progress</b>\n"; + for( set<string>::iterator i = nsInProg.begin(); i != nsInProg.end(); i++ ) + ss << " " << *i << '\n'; + } + for( map<string,unsigned>::iterator i = dbsInProg.begin(); i != dbsInProg.end(); i++ ) { + if( i->second ) + ss << "database " << i->first << ": " << i->second << '\n'; + } + } + + /* ----------------------------------------- */ + + string dbpath = "/data/db/"; + const char FREELIST_NS[] = ".$freelist"; + bool directoryperdb = false; + string repairpath; + string pidfilepath; + + DataFileMgr theDataFileMgr; + DatabaseHolder _dbHolder; + int MAGIC = 0x1000; + + DatabaseHolder& dbHolderUnchecked() { + return _dbHolder; + } + + void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0); + void ensureIdIndexForNewNs(const char *ns) { + if ( ( strstr( ns, ".system." ) == 0 || legalClientSystemNS( ns , false ) ) && + strstr( ns, FREELIST_NS ) == 0 ) { + log( 1 ) << "adding _id index for collection " << ns << endl; + ensureHaveIdIndex( ns ); + } + } + + string getDbContext() { + stringstream ss; + Client * c = currentClient.get(); + if ( c ) { + Client::Context * cx = c->getContext(); + if ( cx ) { + Database *database = cx->db(); + if ( database ) { + ss << database->name << ' '; + ss << cx->ns() << ' '; + } + } + } + return ss.str(); + } + + /*---------------------------------------------------------------------*/ + + // inheritable class to implement an operation that may be applied to all + // files in a database using _applyOpToDataFiles() + class FileOp { + public: + virtual ~FileOp() {} + // Return true if file exists and operation successful + virtual bool apply( const boost::filesystem::path &p ) = 0; + virtual const char * op() const = 0; + }; + + void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator = false, const string& path = dbpath ); + + void _deleteDataFiles(const char *database) { + if ( directoryperdb ) { + FileAllocator::get()->waitUntilFinished(); + MONGO_BOOST_CHECK_EXCEPTION_WITH_MSG( boost::filesystem::remove_all( boost::filesystem::path( dbpath ) / database ), "delete data files with a directoryperdb" ); + return; + } + class : public FileOp { + virtual bool apply( const boost::filesystem::path &p ) { + return boost::filesystem::remove( p ); + } + virtual const char * op() const { + return "remove"; + } + } deleter; + _applyOpToDataFiles( database, deleter, true ); + } + + int Extent::initialSize(int len) { + long long sz = len * 16; + if ( len < 1000 ) sz = len * 64; + if ( sz > 1000000000 ) + sz = 1000000000; + int z = ((int)sz) & 0xffffff00; + assert( z > len ); + return z; + } + + bool _userCreateNS(const char *ns, const BSONObj& options, string& err, bool *deferIdIndex) { + if ( nsdetails(ns) ) { + err = "collection already exists"; + return false; + } + + log(1) << "create collection " << ns << ' ' << options << endl; + + /* todo: do this only when we have allocated space successfully? or we could insert with a { ok: 0 } field + and then go back and set to ok : 1 after we are done. + */ + bool isFreeList = strstr(ns, FREELIST_NS) != 0; + if( !isFreeList ) + addNewNamespaceToCatalog(ns, options.isEmpty() ? 0 : &options); + + long long size = Extent::initialSize(128); + { + BSONElement e = options.getField("size"); + if ( e.isNumber() ) { + size = e.numberLong(); + size += 256; + size &= 0xffffffffffffff00LL; + } + } + + uassert( 10083 , "create collection invalid size spec", size > 0 ); + + bool newCapped = false; + int mx = 0; + if( options["capped"].trueValue() ) { + newCapped = true; + BSONElement e = options.getField("max"); + if ( e.isNumber() ) { + mx = e.numberInt(); + } + } + + // $nExtents just for debug/testing. + BSONElement e = options.getField( "$nExtents" ); + Database *database = cc().database(); + if ( e.type() == Array ) { + // We create one extent per array entry, with size specified + // by the array value. + BSONObjIterator i( e.embeddedObject() ); + while( i.more() ) { + BSONElement e = i.next(); + int size = int( e.number() ); + assert( size <= 0x7fffffff ); + // $nExtents is just for testing - always allocate new extents + // rather than reuse existing extents so we have some predictibility + // in the extent size used by our tests + database->suitableFile( ns, (int) size, false, false )->createExtent( ns, (int) size, newCapped ); + } + } + else if ( int( e.number() ) > 0 ) { + // We create '$nExtents' extents, each of size 'size'. + int nExtents = int( e.number() ); + assert( size <= 0x7fffffff ); + for ( int i = 0; i < nExtents; ++i ) { + assert( size <= 0x7fffffff ); + // $nExtents is just for testing - always allocate new extents + // rather than reuse existing extents so we have some predictibility + // in the extent size used by our tests + database->suitableFile( ns, (int) size, false, false )->createExtent( ns, (int) size, newCapped ); + } + } + else { + // This is the non test case, where we don't have a $nExtents spec. + while ( size > 0 ) { + int max = MongoDataFile::maxSize() - DataFileHeader::HeaderSize; + int desiredExtentSize = (int) (size > max ? max : size); + if ( desiredExtentSize < Extent::minSize() ) { + desiredExtentSize = Extent::minSize(); + } + desiredExtentSize &= 0xffffff00; + Extent *e = database->allocExtent( ns, desiredExtentSize, newCapped, true ); + size -= e->length; + } + } + + NamespaceDetails *d = nsdetails(ns); + assert(d); + + bool ensure = false; + if ( options.getField( "autoIndexId" ).type() ) { + if ( options["autoIndexId"].trueValue() ) { + ensure = true; + } + } + else { + if ( !newCapped ) { + ensure=true; + } + } + if( ensure ) { + if( deferIdIndex ) + *deferIdIndex = true; + else + ensureIdIndexForNewNs( ns ); + } + + if ( mx > 0 ) + getDur().writingInt( d->max ) = mx; + + return true; + } + + /** { ..., capped: true, size: ..., max: ... } + @param deferIdIndex - if not not, defers id index creation. sets the bool value to true if we wanted to create the id index. + @return true if successful + */ + bool userCreateNS(const char *ns, BSONObj options, string& err, bool logForReplication, bool *deferIdIndex) { + const char *coll = strchr( ns, '.' ) + 1; + massert( 10356 , str::stream() << "invalid ns: " << ns , NamespaceString::validCollectionName(ns)); + char cl[ 256 ]; + nsToDatabase( ns, cl ); + bool ok = _userCreateNS(ns, options, err, deferIdIndex); + if ( logForReplication && ok ) { + if ( options.getField( "create" ).eoo() ) { + BSONObjBuilder b; + b << "create" << coll; + b.appendElements( options ); + options = b.obj(); + } + string logNs = string( cl ) + ".$cmd"; + logOp("c", logNs.c_str(), options); + } + return ok; + } + + /*---------------------------------------------------------------------*/ + + int MongoDataFile::maxSize() { + if ( sizeof( int* ) == 4 ) { + return 512 * 1024 * 1024; + } + else if ( cmdLine.smallfiles ) { + return 0x7ff00000 >> 2; + } + else { + return 0x7ff00000; + } + } + + NOINLINE_DECL void MongoDataFile::badOfs2(int ofs) const { + stringstream ss; + ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database"; + uasserted(13441, ss.str()); + } + + NOINLINE_DECL void MongoDataFile::badOfs(int ofs) const { + stringstream ss; + ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database"; + uasserted(13440, ss.str()); + } + + int MongoDataFile::defaultSize( const char *filename ) const { + int size; + if ( fileNo <= 4 ) + size = (64*1024*1024) << fileNo; + else + size = 0x7ff00000; + if ( cmdLine.smallfiles ) { + size = size >> 2; + } + return size; + } + + static void check(void *_mb) { + if( sizeof(char *) == 4 ) + uassert( 10084 , "can't map file memory - mongo requires 64 bit build for larger datasets", _mb != 0); + else + uassert( 10085 , "can't map file memory", _mb != 0); + } + + /** @return true if found and opened. if uninitialized (prealloc only) does not open. */ + bool MongoDataFile::openExisting( const char *filename ) { + assert( _mb == 0 ); + if( !exists(filename) ) + return false; + if( !mmf.open(filename,false) ) { + dlog(2) << "info couldn't open " << filename << " probably end of datafile list" << endl; + return false; + } + _mb = mmf.getView(); assert(_mb); + unsigned long long sz = mmf.length(); + assert( sz <= 0x7fffffff ); + assert( sz % 4096 == 0 ); + if( sz < 64*1024*1024 && !cmdLine.smallfiles ) { + if( sz >= 16*1024*1024 && sz % (1024*1024) == 0 ) { + log() << "info openExisting file size " << sz << " but cmdLine.smallfiles=false" << endl; + } + else { + log() << "openExisting size " << sz << " less then minimum file size expectation " << filename << endl; + assert(false); + } + } + check(_mb); + if( header()->uninitialized() ) + return false; + return true; + } + + void MongoDataFile::open( const char *filename, int minSize, bool preallocateOnly ) { + long size = defaultSize( filename ); + while ( size < minSize ) { + if ( size < maxSize() / 2 ) + size *= 2; + else { + size = maxSize(); + break; + } + } + if ( size > maxSize() ) + size = maxSize(); + + assert( size >= 64*1024*1024 || cmdLine.smallfiles ); + assert( size % 4096 == 0 ); + + if ( preallocateOnly ) { + if ( cmdLine.prealloc ) { + FileAllocator::get()->requestAllocation( filename, size ); + } + return; + } + + { + assert( _mb == 0 ); + unsigned long long sz = size; + if( mmf.create(filename, sz, false) ) + _mb = mmf.getView(); + assert( sz <= 0x7fffffff ); + size = (int) sz; + } + check(_mb); + header()->init(fileNo, size, filename); + } + + void MongoDataFile::flush( bool sync ) { + mmf.flush( sync ); + } + + void addNewExtentToNamespace(const char *ns, Extent *e, DiskLoc eloc, DiskLoc emptyLoc, bool capped) { + NamespaceIndex *ni = nsindex(ns); + NamespaceDetails *details = ni->details(ns); + if ( details ) { + assert( !details->lastExtent.isNull() ); + assert( !details->firstExtent.isNull() ); + getDur().writingDiskLoc(e->xprev) = details->lastExtent; + getDur().writingDiskLoc(details->lastExtent.ext()->xnext) = eloc; + assert( !eloc.isNull() ); + getDur().writingDiskLoc(details->lastExtent) = eloc; + } + else { + ni->add_ns(ns, eloc, capped); + details = ni->details(ns); + } + + { + NamespaceDetails *dw = details->writingWithoutExtra(); + dw->lastExtentSize = e->length; + } + details->addDeletedRec(emptyLoc.drec(), emptyLoc); + } + + Extent* MongoDataFile::createExtent(const char *ns, int approxSize, bool newCapped, int loops) { + { + // make sizes align with VM page size + int newSize = (approxSize + 0xfff) & 0xfffff000; + assert( newSize >= 0 ); + if( newSize < Extent::maxSize() ) + approxSize = newSize; + } + massert( 10357 , "shutdown in progress", ! inShutdown() ); + massert( 10358 , "bad new extent size", approxSize >= Extent::minSize() && approxSize <= Extent::maxSize() ); + massert( 10359 , "header==0 on new extent: 32 bit mmap space exceeded?", header() ); // null if file open failed + int ExtentSize = min(header()->unusedLength, approxSize); + DiskLoc loc; + if ( ExtentSize < Extent::minSize() ) { + /* note there could be a lot of looping here is db just started and + no files are open yet. we might want to do something about that. */ + if ( loops > 8 ) { + assert( loops < 10000 ); + out() << "warning: loops=" << loops << " fileno:" << fileNo << ' ' << ns << '\n'; + } + log() << "newExtent: " << ns << " file " << fileNo << " full, adding a new file\n"; + return cc().database()->addAFile( 0, true )->createExtent(ns, approxSize, newCapped, loops+1); + } + int offset = header()->unused.getOfs(); + + DataFileHeader *h = header(); + h->unused.writing().set( fileNo, offset + ExtentSize ); + getDur().writingInt(h->unusedLength) = h->unusedLength - ExtentSize; + loc.set(fileNo, offset); + Extent *e = _getExtent(loc); + DiskLoc emptyLoc = getDur().writing(e)->init(ns, ExtentSize, fileNo, offset, newCapped); + + addNewExtentToNamespace(ns, e, loc, emptyLoc, newCapped); + + DEV tlog(1) << "new extent " << ns << " size: 0x" << hex << ExtentSize << " loc: 0x" << hex << offset + << " emptyLoc:" << hex << emptyLoc.getOfs() << dec << endl; + return e; + } + + Extent* DataFileMgr::allocFromFreeList(const char *ns, int approxSize, bool capped) { + string s = cc().database()->name + FREELIST_NS; + NamespaceDetails *f = nsdetails(s.c_str()); + if( f ) { + int low, high; + if( capped ) { + // be strict about the size + low = approxSize; + if( low > 2048 ) low -= 256; + high = (int) (approxSize * 1.05) + 256; + } + else { + low = (int) (approxSize * 0.8); + high = (int) (approxSize * 1.4); + } + if( high <= 0 ) { + // overflowed + high = max(approxSize, Extent::maxSize()); + } + int n = 0; + Extent *best = 0; + int bestDiff = 0x7fffffff; + { + Timer t; + DiskLoc L = f->firstExtent; + while( !L.isNull() ) { + Extent * e = L.ext(); + if( e->length >= low && e->length <= high ) { + int diff = abs(e->length - approxSize); + if( diff < bestDiff ) { + bestDiff = diff; + best = e; + if( ((double) diff) / approxSize < 0.1 ) { + // close enough + break; + } + if( t.seconds() >= 2 ) { + // have spent lots of time in write lock, and we are in [low,high], so close enough + // could come into play if extent freelist is very long + break; + } + } + else { + OCCASIONALLY { + if( high < 64 * 1024 && t.seconds() >= 2 ) { + // be less picky if it is taking a long time + high = 64 * 1024; + } + } + } + } + L = e->xnext; + ++n; + } + if( t.seconds() >= 10 ) { + log() << "warning: slow scan in allocFromFreeList (in write lock)" << endl; + } + } + + if( n > 128 ) log( n < 512 ) << "warning: newExtent " << n << " scanned\n"; + + if( best ) { + Extent *e = best; + // remove from the free list + if( !e->xprev.isNull() ) + e->xprev.ext()->xnext.writing() = e->xnext; + if( !e->xnext.isNull() ) + e->xnext.ext()->xprev.writing() = e->xprev; + if( f->firstExtent == e->myLoc ) + f->firstExtent.writing() = e->xnext; + if( f->lastExtent == e->myLoc ) + f->lastExtent.writing() = e->xprev; + + // use it + OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n"; + DiskLoc emptyLoc = e->reuse(ns, capped); + addNewExtentToNamespace(ns, e, e->myLoc, emptyLoc, capped); + return e; + } + } + + return 0; + // return createExtent(ns, approxSize, capped); + } + + /*---------------------------------------------------------------------*/ + + void Extent::markEmpty() { + xnext.Null(); + xprev.Null(); + firstRecord.Null(); + lastRecord.Null(); + } + + DiskLoc Extent::reuse(const char *nsname, bool capped) { + return getDur().writing(this)->_reuse(nsname, capped); + } + + void getEmptyLoc(const char *ns, const DiskLoc extentLoc, int extentLength, bool capped, /*out*/DiskLoc& emptyLoc, /*out*/int& delRecLength) { + emptyLoc = extentLoc; + emptyLoc.inc( Extent::HeaderSize() ); + delRecLength = extentLength - Extent::HeaderSize(); + if( delRecLength >= 32*1024 && str::contains(ns, '$') && !capped ) { + // probably an index. so skip forward to keep its records page aligned + int& ofs = emptyLoc.GETOFS(); + int newOfs = (ofs + 0xfff) & ~0xfff; + delRecLength -= (newOfs-ofs); + dassert( delRecLength > 0 ); + ofs = newOfs; + } + } + + DiskLoc Extent::_reuse(const char *nsname, bool capped) { + LOG(3) << "reset extent was:" << nsDiagnostic.toString() << " now:" << nsname << '\n'; + massert( 10360 , "Extent::reset bad magic value", magic == 0x41424344 ); + nsDiagnostic = nsname; + markEmpty(); + + DiskLoc emptyLoc; + int delRecLength; + getEmptyLoc(nsname, myLoc, length, capped, emptyLoc, delRecLength); + + // todo: some dup code here and below in Extent::init + DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength); + empty = getDur().writing(empty); + empty->lengthWithHeaders = delRecLength; + empty->extentOfs = myLoc.getOfs(); + empty->nextDeleted.Null(); + + return emptyLoc; + } + + /* assumes already zeroed -- insufficient for block 'reuse' perhaps */ + DiskLoc Extent::init(const char *nsname, int _length, int _fileNo, int _offset, bool capped) { + magic = 0x41424344; + myLoc.set(_fileNo, _offset); + xnext.Null(); + xprev.Null(); + nsDiagnostic = nsname; + length = _length; + firstRecord.Null(); + lastRecord.Null(); + + DiskLoc emptyLoc; + int delRecLength; + getEmptyLoc(nsname, myLoc, _length, capped, emptyLoc, delRecLength); + + DeletedRecord *empty = getDur().writing( DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength) ); + empty->lengthWithHeaders = delRecLength; + empty->extentOfs = myLoc.getOfs(); + + return emptyLoc; + } + + /* + Record* Extent::newRecord(int len) { + if( firstEmptyRegion.isNull() )8 + return 0; + + assert(len > 0); + int newRecSize = len + Record::HeaderSize; + DiskLoc newRecordLoc = firstEmptyRegion; + Record *r = getRecord(newRecordLoc); + int left = r->netLength() - len; + if( left < 0 ) { + // + firstEmptyRegion.Null(); + return 0; + } + + DiskLoc nextEmpty = r->next.getNextEmpty(firstEmptyRegion); + r->lengthWithHeaders = newRecSize; + r->next.markAsFirstOrLastInExtent(this); // we're now last in the extent + if( !lastRecord.isNull() ) { + assert(getRecord(lastRecord)->next.lastInExtent()); // it was the last one + getRecord(lastRecord)->next.set(newRecordLoc); // until now + r->prev.set(lastRecord); + } + else { + r->prev.markAsFirstOrLastInExtent(this); // we are the first in the extent + assert( firstRecord.isNull() ); + firstRecord = newRecordLoc; + } + lastRecord = newRecordLoc; + + if( left < Record::HeaderSize + 32 ) { + firstEmptyRegion.Null(); + } + else { + firstEmptyRegion.inc(newRecSize); + Record *empty = getRecord(firstEmptyRegion); + empty->next.set(nextEmpty); // not for empty records, unless in-use records, next and prev can be null. + empty->prev.Null(); + empty->lengthWithHeaders = left; + } + + return r; + } + */ + + int Extent::maxSize() { + int maxExtentSize = 0x7ff00000; + if ( cmdLine.smallfiles ) { + maxExtentSize >>= 2; + } + return maxExtentSize; + } + + /*---------------------------------------------------------------------*/ + + shared_ptr<Cursor> DataFileMgr::findAll(const char *ns, const DiskLoc &startLoc) { + NamespaceDetails * d = nsdetails( ns ); + if ( ! d ) + return shared_ptr<Cursor>(new BasicCursor(DiskLoc())); + + DiskLoc loc = d->firstExtent; + Extent *e = getExtent(loc); + + DEBUGGING { + out() << "listing extents for " << ns << endl; + DiskLoc tmp = loc; + set<DiskLoc> extents; + + while ( 1 ) { + Extent *f = getExtent(tmp); + out() << "extent: " << tmp.toString() << endl; + extents.insert(tmp); + tmp = f->xnext; + if ( tmp.isNull() ) + break; + f = f->getNextExtent(); + } + + out() << endl; + d->dumpDeleted(&extents); + } + + if ( d->capped ) + return shared_ptr<Cursor>( new ForwardCappedCursor( d , startLoc ) ); + + if ( !startLoc.isNull() ) + return shared_ptr<Cursor>(new BasicCursor( startLoc )); + + while ( e->firstRecord.isNull() && !e->xnext.isNull() ) { + /* todo: if extent is empty, free it for reuse elsewhere. + that is a bit complicated have to clean up the freelists. + */ + RARELY out() << "info DFM::findAll(): extent " << loc.toString() << " was empty, skipping ahead. ns:" << ns << endl; + // find a nonempty extent + // it might be nice to free the whole extent here! but have to clean up free recs then. + e = e->getNextExtent(); + } + return shared_ptr<Cursor>(new BasicCursor( e->firstRecord )); + } + + /* get a table scan cursor, but can be forward or reverse direction. + order.$natural - if set, > 0 means forward (asc), < 0 backward (desc). + */ + shared_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc) { + BSONElement el = order.getField("$natural"); // e.g., { $natural : -1 } + + if ( el.number() >= 0 ) + return DataFileMgr::findAll(ns, startLoc); + + // "reverse natural order" + NamespaceDetails *d = nsdetails(ns); + + if ( !d ) + return shared_ptr<Cursor>(new BasicCursor(DiskLoc())); + + if ( !d->capped ) { + if ( !startLoc.isNull() ) + return shared_ptr<Cursor>(new ReverseCursor( startLoc )); + Extent *e = d->lastExtent.ext(); + while ( e->lastRecord.isNull() && !e->xprev.isNull() ) { + OCCASIONALLY out() << " findTableScan: extent empty, skipping ahead" << endl; + e = e->getPrevExtent(); + } + return shared_ptr<Cursor>(new ReverseCursor( e->lastRecord )); + } + else { + return shared_ptr<Cursor>( new ReverseCappedCursor( d, startLoc ) ); + } + } + + void printFreeList() { + string s = cc().database()->name + FREELIST_NS; + log() << "dump freelist " << s << endl; + NamespaceDetails *freeExtents = nsdetails(s.c_str()); + if( freeExtents == 0 ) { + log() << " freeExtents==0" << endl; + return; + } + DiskLoc a = freeExtents->firstExtent; + while( !a.isNull() ) { + Extent *e = a.ext(); + log() << " extent " << a.toString() << " len:" << e->length << " prev:" << e->xprev.toString() << endl; + a = e->xnext; + } + + log() << "end freelist" << endl; + } + + /** free a list of extents that are no longer in use. this is a double linked list of extents + (could be just one in the list) + */ + void freeExtents(DiskLoc firstExt, DiskLoc lastExt) { + { + assert( !firstExt.isNull() && !lastExt.isNull() ); + Extent *f = firstExt.ext(); + Extent *l = lastExt.ext(); + assert( f->xprev.isNull() ); + assert( l->xnext.isNull() ); + assert( f==l || !f->xnext.isNull() ); + assert( f==l || !l->xprev.isNull() ); + } + + string s = cc().database()->name + FREELIST_NS; + NamespaceDetails *freeExtents = nsdetails(s.c_str()); + if( freeExtents == 0 ) { + string err; + _userCreateNS(s.c_str(), BSONObj(), err, 0); // todo: this actually allocates an extent, which is bad! + freeExtents = nsdetails(s.c_str()); + massert( 10361 , "can't create .$freelist", freeExtents); + } + if( freeExtents->firstExtent.isNull() ) { + freeExtents->firstExtent.writing() = firstExt; + freeExtents->lastExtent.writing() = lastExt; + } + else { + DiskLoc a = freeExtents->firstExtent; + assert( a.ext()->xprev.isNull() ); + getDur().writingDiskLoc( a.ext()->xprev ) = lastExt; + getDur().writingDiskLoc( lastExt.ext()->xnext ) = a; + getDur().writingDiskLoc( freeExtents->firstExtent ) = firstExt; + } + + //printFreeList(); + } + + /* drop a collection/namespace */ + void dropNS(const string& nsToDrop) { + NamespaceDetails* d = nsdetails(nsToDrop.c_str()); + uassert( 10086 , (string)"ns not found: " + nsToDrop , d ); + + BackgroundOperation::assertNoBgOpInProgForNs(nsToDrop.c_str()); + + NamespaceString s(nsToDrop); + assert( s.db == cc().database()->name ); + if( s.isSystem() ) { + if( s.coll == "system.profile" ) + uassert( 10087 , "turn off profiling before dropping system.profile collection", cc().database()->profile == 0 ); + else + uasserted( 12502, "can't drop system ns" ); + } + + { + // remove from the system catalog + BSONObj cond = BSON( "name" << nsToDrop ); // { name: "colltodropname" } + string system_namespaces = cc().database()->name + ".system.namespaces"; + /*int n = */ deleteObjects(system_namespaces.c_str(), cond, false, false, true); + // no check of return code as this ns won't exist for some of the new storage engines + } + + // free extents + if( !d->firstExtent.isNull() ) { + freeExtents(d->firstExtent, d->lastExtent); + getDur().writingDiskLoc( d->firstExtent ).setInvalid(); + getDur().writingDiskLoc( d->lastExtent ).setInvalid(); + } + + // remove from the catalog hashtable + cc().database()->namespaceIndex.kill_ns(nsToDrop.c_str()); + } + + void dropCollection( const string &name, string &errmsg, BSONObjBuilder &result ) { + log(1) << "dropCollection: " << name << endl; + NamespaceDetails *d = nsdetails(name.c_str()); + if( d == 0 ) + return; + + BackgroundOperation::assertNoBgOpInProgForNs(name.c_str()); + + if ( d->nIndexes != 0 ) { + try { + assert( dropIndexes(d, name.c_str(), "*", errmsg, result, true) ); + } + catch( DBException& e ) { + stringstream ss; + ss << "drop: dropIndexes for collection failed - consider trying repair "; + ss << " cause: " << e.what(); + uasserted(12503,ss.str()); + } + assert( d->nIndexes == 0 ); + } + log(1) << "\t dropIndexes done" << endl; + result.append("ns", name.c_str()); + ClientCursor::invalidate(name.c_str()); + Top::global.collectionDropped( name ); + NamespaceDetailsTransient::eraseForPrefix( name.c_str() ); + dropNS(name); + } + + /* unindex all keys in index for this record. */ + static void _unindexRecord(IndexDetails& id, BSONObj& obj, const DiskLoc& dl, bool logMissing = true) { + BSONObjSet keys; + id.getKeysFromObject(obj, keys); + IndexInterface& ii = id.idxInterface(); + for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) { + BSONObj j = *i; + + bool ok = false; + try { + ok = ii.unindex(id.head, id, j, dl); + } + catch (AssertionException& e) { + problem() << "Assertion failure: _unindex failed " << id.indexNamespace() << endl; + out() << "Assertion failure: _unindex failed: " << e.what() << '\n'; + out() << " obj:" << obj.toString() << '\n'; + out() << " key:" << j.toString() << '\n'; + out() << " dl:" << dl.toString() << endl; + sayDbContext(); + } + + if ( !ok && logMissing ) { + log() << "unindex failed (key too big?) " << id.indexNamespace() << " key: " << j << " " << obj["_id"] << endl; + } + } + } +//zzz + /* unindex all keys in all indexes for this record. */ + static void unindexRecord(NamespaceDetails *d, Record *todelete, const DiskLoc& dl, bool noWarn = false) { + BSONObj obj(todelete); + int n = d->nIndexes; + for ( int i = 0; i < n; i++ ) + _unindexRecord(d->idx(i), obj, dl, !noWarn); + if( d->indexBuildInProgress ) { // background index + // always pass nowarn here, as this one may be missing for valid reasons as we are concurrently building it + _unindexRecord(d->idx(n), obj, dl, false); + } + } + + /* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc. + caller must check if capped + */ + void DataFileMgr::_deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl) { + /* remove ourself from the record next/prev chain */ + { + if ( todelete->prevOfs != DiskLoc::NullOfs ) + getDur().writingInt( todelete->getPrev(dl).rec()->nextOfs ) = todelete->nextOfs; + if ( todelete->nextOfs != DiskLoc::NullOfs ) + getDur().writingInt( todelete->getNext(dl).rec()->prevOfs ) = todelete->prevOfs; + } + + /* remove ourself from extent pointers */ + { + Extent *e = getDur().writing( todelete->myExtent(dl) ); + if ( e->firstRecord == dl ) { + if ( todelete->nextOfs == DiskLoc::NullOfs ) + e->firstRecord.Null(); + else + e->firstRecord.set(dl.a(), todelete->nextOfs); + } + if ( e->lastRecord == dl ) { + if ( todelete->prevOfs == DiskLoc::NullOfs ) + e->lastRecord.Null(); + else + e->lastRecord.set(dl.a(), todelete->prevOfs); + } + } + + /* add to the free list */ + { + { + NamespaceDetails::Stats *s = getDur().writing(&d->stats); + s->datasize -= todelete->netLength(); + s->nrecords--; + } + + if ( strstr(ns, ".system.indexes") ) { + /* temp: if in system.indexes, don't reuse, and zero out: we want to be + careful until validated more, as IndexDetails has pointers + to this disk location. so an incorrectly done remove would cause + a lot of problems. + */ + memset(getDur().writingPtr(todelete, todelete->lengthWithHeaders), 0, todelete->lengthWithHeaders); + } + else { + DEV { + unsigned long long *p = (unsigned long long *) todelete->data; + *getDur().writing(p) = 0; + //DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse. + } + d->addDeletedRec((DeletedRecord*)todelete, dl); + } + } + } + + void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn, bool doLog ) { + dassert( todelete == dl.rec() ); + + NamespaceDetails* d = nsdetails(ns); + if ( d->capped && !cappedOK ) { + out() << "failing remove on a capped ns " << ns << endl; + uassert( 10089 , "can't remove from a capped collection" , 0 ); + return; + } + + BSONObj toDelete; + if ( doLog ) { + BSONElement e = dl.obj()["_id"]; + if ( e.type() ) { + toDelete = e.wrap(); + } + } + + /* check if any cursors point to us. if so, advance them. */ + ClientCursor::aboutToDelete(dl); + + unindexRecord(d, todelete, dl, noWarn); + + _deleteRecord(d, ns, todelete, dl); + NamespaceDetailsTransient::get( ns ).notifyOfWriteOp(); + + if ( ! toDelete.isEmpty() ) { + logOp( "d" , ns , toDelete ); + } + } + + + /** Note: if the object shrinks a lot, we don't free up space, we leave extra at end of the record. + */ + const DiskLoc DataFileMgr::updateRecord( + const char *ns, + NamespaceDetails *d, + NamespaceDetailsTransient *nsdt, + Record *toupdate, const DiskLoc& dl, + const char *_buf, int _len, OpDebug& debug, bool god) { + + dassert( toupdate == dl.rec() ); + + BSONObj objOld(toupdate); + BSONObj objNew(_buf); + DEV assert( objNew.objsize() == _len ); + DEV assert( objNew.objdata() == _buf ); + + if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) { + /* add back the old _id value if the update removes it. Note this implementation is slow + (copies entire object multiple times), but this shouldn't happen often, so going for simple + code, not speed. + */ + BSONObjBuilder b; + BSONElement e; + assert( objOld.getObjectID(e) ); + b.append(e); // put _id first, for best performance + b.appendElements(objNew); + objNew = b.obj(); + } + + /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further + below. that is suboptimal, but it's pretty complicated to do it the other way without rollbacks... + */ + vector<IndexChanges> changes; + bool changedId = false; + getIndexChanges(changes, *d, objNew, objOld, changedId); + uassert( 13596 , str::stream() << "cannot change _id of a document old:" << objOld << " new:" << objNew , ! changedId ); + dupCheck(changes, *d, dl); + + if ( toupdate->netLength() < objNew.objsize() ) { + // doesn't fit. reallocate ----------------------------------------------------- + uassert( 10003 , "failing update: objects in a capped ns cannot grow", !(d && d->capped)); + d->paddingTooSmall(); + debug.moved = true; + deleteRecord(ns, toupdate, dl); + return insert(ns, objNew.objdata(), objNew.objsize(), god); + } + + nsdt->notifyOfWriteOp(); + d->paddingFits(); + + /* have any index keys changed? */ + { + int keyUpdates = 0; + int z = d->nIndexesBeingBuilt(); + for ( int x = 0; x < z; x++ ) { + IndexDetails& idx = d->idx(x); + IndexInterface& ii = idx.idxInterface(); + for ( unsigned i = 0; i < changes[x].removed.size(); i++ ) { + try { + bool found = ii.unindex(idx.head, idx, *changes[x].removed[i], dl); + if ( ! found ) { + RARELY warning() << "ns: " << ns << " couldn't unindex key: " << *changes[x].removed[i] + << " for doc: " << objOld["_id"] << endl; + } + } + catch (AssertionException&) { + debug.extra << " exception update unindex "; + problem() << " caught assertion update unindex " << idx.indexNamespace() << endl; + } + } + assert( !dl.isNull() ); + BSONObj idxKey = idx.info.obj().getObjectField("key"); + Ordering ordering = Ordering::make(idxKey); + keyUpdates += changes[x].added.size(); + for ( unsigned i = 0; i < changes[x].added.size(); i++ ) { + try { + /* we did the dupCheck() above. so we don't have to worry about it here. */ + ii.bt_insert( + idx.head, + dl, *changes[x].added[i], ordering, /*dupsAllowed*/true, idx); + } + catch (AssertionException& e) { + debug.extra << " exception update index "; + problem() << " caught assertion update index " << idx.indexNamespace() << " " << e << " " << objNew["_id"] << endl; + } + } + } + + debug.keyUpdates = keyUpdates; + } + + // update in place + int sz = objNew.objsize(); + memcpy(getDur().writingPtr(toupdate->data, sz), objNew.objdata(), sz); + return dl; + } + + int Extent::followupSize(int len, int lastExtentLen) { + assert( len < Extent::maxSize() ); + int x = initialSize(len); + // changed from 1.20 to 1.35 in v2.1.x to get to larger extent size faster + int y = (int) (lastExtentLen < 4000000 ? lastExtentLen * 4.0 : lastExtentLen * 1.35); + int sz = y > x ? y : x; + + if ( sz < lastExtentLen ) { + // this means there was an int overflow + // so we should turn it into maxSize + sz = Extent::maxSize(); + } + else if ( sz > Extent::maxSize() ) { + sz = Extent::maxSize(); + } + + sz = ((int)sz) & 0xffffff00; + assert( sz > len ); + + return sz; + } + + /* step one of adding keys to index idxNo for a new record + @return true means done. false means multikey involved and more work to do + */ + static void _addKeysToIndexStepOneOfTwo(BSONObjSet & /*out*/keys, NamespaceDetails *d, int idxNo, BSONObj& obj, DiskLoc recordLoc, IndexDetails& idx) { + idx.getKeysFromObject(obj, keys); + if( keys.empty() ) + return; + bool dupsAllowed = !idx.unique(); + BSONObj order = idx.keyPattern(); + IndexInterface& ii = idx.idxInterface(); + Ordering ordering = Ordering::make(order); + + assert( !recordLoc.isNull() ); + + try { + // we can't do the two step method with multi keys as insertion of one key changes the indexes + // structure. however we can do the first key of the set so we go ahead and do that FWIW + ii.phasedQueueItemToInsert(idxNo, idx.head, recordLoc, *keys.begin(), ordering, idx, dupsAllowed); + } + catch (AssertionException& e) { + if( e.getCode() == 10287 && idxNo == d->nIndexes ) { + DEV log() << "info: caught key already in index on bg indexing (ok)" << endl; + } + else { + throw; + } + } + } + + namespace dur { + extern unsigned notesThisLock; + } + + void upgradeToWritable(bool shouldBeUnlocked) { + // todo upgrade! + DEV { + // verify we haven't written yet (usually) + + // test binary does special things so this would assert there so don't check there + if( shouldBeUnlocked && !cmdLine.binaryName.empty() && cmdLine.binaryName != "test" ) { + static unsigned long long zeroes; + static unsigned long long tot; + tot++; + if( dur::notesThisLock == 0 ) + zeroes++; + if( tot > 1000 ) { + static int n; + DEV if( n++ == 0 ) + log() << "warning upgradeToWritable: already in writable too often" << endl; + } + } + } + } + + /** add index keys for a newly inserted record + done in two steps/phases to defer write lock portion + */ + static void indexRecordUsingTwoSteps(NamespaceDetails *d, BSONObj obj, DiskLoc loc, bool shouldBeUnlocked) { + vector<int> multi; + vector<BSONObjSet> multiKeys; + + IndexInterface::phasedBegin(); + + int n = d->nIndexesBeingBuilt(); + { + BSONObjSet keys; + for ( int i = 0; i < n; i++ ) { + IndexDetails& idx = d->idx(i); + // this call throws on unique constraint violation. we haven't done any writes yet so that is fine. + _addKeysToIndexStepOneOfTwo(/*out*/keys, d, i, obj, loc, idx); + if( keys.size() > 1 ) { + multi.push_back(i); + multiKeys.push_back(BSONObjSet()); + multiKeys[multiKeys.size()-1].swap(keys); + } + keys.clear(); + } + } + + // update lock to writable here. TODO + + upgradeToWritable(shouldBeUnlocked); + + IndexInterface::phasedFinish(); // step 2 + + // now finish adding multikeys + for( unsigned j = 0; j < multi.size(); j++ ) { + unsigned i = multi[j]; + BSONObjSet& keys = multiKeys[j]; + IndexDetails& idx = d->idx(i); + IndexInterface& ii = idx.idxInterface(); + Ordering ordering = Ordering::make(idx.keyPattern()); + d->setIndexIsMultikey(i); + for( BSONObjSet::iterator k = ++keys.begin()/*skip 1*/; k != keys.end(); k++ ) { + try { + ii.bt_insert(idx.head, loc, *k, ordering, !idx.unique(), idx); + } catch (AssertionException& e) { + if( e.getCode() == 10287 && (int) i == d->nIndexes ) { + DEV log() << "info: caught key already in index on bg indexing (ok)" << endl; + } + else { + /* roll back previously added index entries + note must do self index as it is multikey and could require some cleanup itself + */ + for( int j = 0; j < n; j++ ) { + try { + _unindexRecord(d->idx(j), obj, loc, false); + } + catch(...) { + log(3) << "unindex fails on rollback after unique key constraint prevented insert\n"; + } + } + throw; + } + } + } + } + } + + /* add keys to index idxNo for a new record */ + static void addKeysToIndex(NamespaceDetails *d, int idxNo, BSONObj& obj, DiskLoc recordLoc, bool dupsAllowed) { + IndexDetails& idx = d->idx(idxNo); + BSONObjSet keys; + idx.getKeysFromObject(obj, keys); + if( keys.empty() ) + return; + BSONObj order = idx.keyPattern(); + IndexInterface& ii = idx.idxInterface(); + Ordering ordering = Ordering::make(order); + int n = 0; + for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) { + if( ++n == 2 ) { + d->setIndexIsMultikey(idxNo); + } + assert( !recordLoc.isNull() ); + try { + ii.bt_insert(idx.head, recordLoc, *i, ordering, dupsAllowed, idx); + } + catch (AssertionException& e) { + if( e.getCode() == 10287 && idxNo == d->nIndexes ) { + DEV log() << "info: caught key already in index on bg indexing (ok)" << endl; + continue; + } + if( !dupsAllowed ) { + // dup key exception, presumably. + throw; + } + problem() << " caught assertion addKeysToIndex " << idx.indexNamespace() << " " << obj["_id"] << endl; + } + } + } + +#if 0 + void testSorting() { + BSONObjBuilder b; + b.appendNull(""); + BSONObj x = b.obj(); + + BSONObjExternalSorter sorter(*IndexDetails::iis[1]); + + sorter.add(x, DiskLoc(3,7)); + sorter.add(x, DiskLoc(4,7)); + sorter.add(x, DiskLoc(2,7)); + sorter.add(x, DiskLoc(1,7)); + sorter.add(x, DiskLoc(3,77)); + + sorter.sort(); + + auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator(); + while( i->more() ) { + BSONObjExternalSorter::Data d = i->next(); + /*cout << d.second.toString() << endl; + cout << d.first.objsize() << endl; + cout<<"SORTER next:" << d.first.toString() << endl;*/ + } + } +#endif + + SortPhaseOne *precalced = 0; + + template< class V > + void buildBottomUpPhases2And3(bool dupsAllowed, IndexDetails& idx, BSONObjExternalSorter& sorter, + bool dropDups, list<DiskLoc> &dupsToDrop, CurOp * op, SortPhaseOne *phase1, ProgressMeterHolder &pm, + Timer& t + ) + { + BtreeBuilder<V> btBuilder(dupsAllowed, idx); + BSONObj keyLast; + auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator(); + assert( pm == op->setMessage( "index: (2/3) btree bottom up" , phase1->nkeys , 10 ) ); + while( i->more() ) { + RARELY killCurrentOp.checkForInterrupt(); + BSONObjExternalSorter::Data d = i->next(); + + try { + if ( !dupsAllowed && dropDups ) { + LastError::Disabled led( lastError.get() ); + btBuilder.addKey(d.first, d.second); + } + else { + btBuilder.addKey(d.first, d.second); + } + } + catch( AssertionException& e ) { + if ( dupsAllowed ) { + // unknow exception?? + throw; + } + + if( e.interrupted() ) { + killCurrentOp.checkForInterrupt(); + } + + if ( ! dropDups ) + throw; + + /* we could queue these on disk, but normally there are very few dups, so instead we + keep in ram and have a limit. + */ + dupsToDrop.push_back(d.second); + uassert( 10092 , "too may dups on index build with dropDups=true", dupsToDrop.size() < 1000000 ); + } + pm.hit(); + } + pm.finished(); + op->setMessage( "index: (3/3) btree-middle" ); + log(t.seconds() > 10 ? 0 : 1 ) << "\t done building bottom layer, going to commit" << endl; + btBuilder.commit(); + if ( btBuilder.getn() != phase1->nkeys && ! dropDups ) { + warning() << "not all entries were added to the index, probably some keys were too large" << endl; + } + } + + // throws DBException + unsigned long long fastBuildIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) { + CurOp * op = cc().curop(); + + Timer t; + + tlog(1) << "fastBuildIndex " << ns << " idxNo:" << idxNo << ' ' << idx.info.obj().toString() << endl; + + bool dupsAllowed = !idx.unique(); + bool dropDups = idx.dropDups() || inDBRepair; + BSONObj order = idx.keyPattern(); + + getDur().writingDiskLoc(idx.head).Null(); + + if ( logLevel > 1 ) printMemInfo( "before index start" ); + + /* get and sort all the keys ----- */ + ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->stats.nrecords , 10 ) ); + SortPhaseOne _ours; + SortPhaseOne *phase1 = precalced; + if( phase1 == 0 ) { + phase1 = &_ours; + SortPhaseOne& p1 = *phase1; + shared_ptr<Cursor> c = theDataFileMgr.findAll(ns); + p1.sorter.reset( new BSONObjExternalSorter(idx.idxInterface(), order) ); + p1.sorter->hintNumObjects( d->stats.nrecords ); + const IndexSpec& spec = idx.getSpec(); + while ( c->ok() ) { + BSONObj o = c->current(); + DiskLoc loc = c->currLoc(); + p1.addKeys(spec, o, loc); + c->advance(); + pm.hit(); + if ( logLevel > 1 && p1.n % 10000 == 0 ) { + printMemInfo( "\t iterating objects" ); + } + }; + } + pm.finished(); + + BSONObjExternalSorter& sorter = *(phase1->sorter); + + if( phase1->multi ) + d->setIndexIsMultikey(idxNo); + + if ( logLevel > 1 ) printMemInfo( "before final sort" ); + phase1->sorter->sort(); + if ( logLevel > 1 ) printMemInfo( "after final sort" ); + + log(t.seconds() > 5 ? 0 : 1) << "\t external sort used : " << sorter.numFiles() << " files " << " in " << t.seconds() << " secs" << endl; + + list<DiskLoc> dupsToDrop; + + /* build index --- */ + if( idx.version() == 0 ) + buildBottomUpPhases2And3<V0>(dupsAllowed, idx, sorter, dropDups, dupsToDrop, op, phase1, pm, t); + else if( idx.version() == 1 ) + buildBottomUpPhases2And3<V1>(dupsAllowed, idx, sorter, dropDups, dupsToDrop, op, phase1, pm, t); + else + assert(false); + + log(1) << "\t fastBuildIndex dupsToDrop:" << dupsToDrop.size() << endl; + + for( list<DiskLoc>::iterator i = dupsToDrop.begin(); i != dupsToDrop.end(); i++ ){ + theDataFileMgr.deleteRecord( ns, i->rec(), *i, false /* cappedOk */ , true /* noWarn */ , isMaster( ns ) /* logOp */ ); + getDur().commitIfNeeded(); + } + + return phase1->n; + } + + class BackgroundIndexBuildJob : public BackgroundOperation { + + unsigned long long addExistingToIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) { + bool dupsAllowed = !idx.unique(); + bool dropDups = idx.dropDups(); + + ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->stats.nrecords ); + + unsigned long long n = 0; + auto_ptr<ClientCursor> cc; + { + shared_ptr<Cursor> c = theDataFileMgr.findAll(ns); + cc.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, ns) ); + } + CursorId id = cc->cursorid(); + + while ( cc->ok() ) { + BSONObj js = cc->current(); + try { + { + if ( !dupsAllowed && dropDups ) { + LastError::Disabled led( lastError.get() ); + addKeysToIndex(d, idxNo, js, cc->currLoc(), dupsAllowed); + } + else { + addKeysToIndex(d, idxNo, js, cc->currLoc(), dupsAllowed); + } + } + cc->advance(); + } + catch( AssertionException& e ) { + if( e.interrupted() ) { + killCurrentOp.checkForInterrupt(); + } + + if ( dropDups ) { + DiskLoc toDelete = cc->currLoc(); + bool ok = cc->advance(); + cc->updateLocation(); + theDataFileMgr.deleteRecord( ns, toDelete.rec(), toDelete, false, true , true ); + if( ClientCursor::find(id, false) == 0 ) { + cc.release(); + if( !ok ) { + /* we were already at the end. normal. */ + } + else { + uasserted(12585, "cursor gone during bg index; dropDups"); + } + break; + } + } + else { + log() << "background addExistingToIndex exception " << e.what() << endl; + throw; + } + } + n++; + progress.hit(); + + getDur().commitIfNeeded(); + + if ( cc->yieldSometimes( ClientCursor::WillNeed ) ) { + progress.setTotalWhileRunning( d->stats.nrecords ); + } + else { + cc.release(); + uasserted(12584, "cursor gone during bg index"); + break; + } + } + progress.finished(); + return n; + } + + /* we do set a flag in the namespace for quick checking, but this is our authoritative info - + that way on a crash/restart, we don't think we are still building one. */ + set<NamespaceDetails*> bgJobsInProgress; + + void prep(const char *ns, NamespaceDetails *d) { + assertInWriteLock(); + uassert( 13130 , "can't start bg index b/c in recursive lock (db.eval?)" , mongo::d.dbMutex.getState() == 1 ); + bgJobsInProgress.insert(d); + } + void done(const char *ns, NamespaceDetails *d) { + NamespaceDetailsTransient::get(ns).addedIndex(); // clear query optimizer cache + assertInWriteLock(); + } + + public: + BackgroundIndexBuildJob(const char *ns) : BackgroundOperation(ns) { } + + unsigned long long go(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) { + unsigned long long n = 0; + + prep(ns.c_str(), d); + assert( idxNo == d->nIndexes ); + try { + idx.head.writing() = idx.idxInterface().addBucket(idx); + n = addExistingToIndex(ns.c_str(), d, idx, idxNo); + } + catch(...) { + if( cc().database() && nsdetails(ns.c_str()) == d ) { + assert( idxNo == d->nIndexes ); + done(ns.c_str(), d); + } + else { + log() << "ERROR: db gone during bg index?" << endl; + } + throw; + } + assert( idxNo == d->nIndexes ); + done(ns.c_str(), d); + return n; + } + }; + + /** + * For the lifetime of this object, an index build is indicated on the specified + * namespace and the newest index is marked as absent. This simplifies + * the cleanup required on recovery. + */ + class RecoverableIndexState { + public: + RecoverableIndexState( NamespaceDetails *d ) : _d( d ) { + indexBuildInProgress() = 1; + nIndexes()--; + } + ~RecoverableIndexState() { + DESTRUCTOR_GUARD ( + nIndexes()++; + indexBuildInProgress() = 0; + ) + } + private: + int &nIndexes() { return getDur().writingInt( _d->nIndexes ); } + int &indexBuildInProgress() { return getDur().writingInt( _d->indexBuildInProgress ); } + NamespaceDetails *_d; + }; + + // throws DBException + static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) { + tlog() << "build index " << ns << ' ' << idx.keyPattern() << ( background ? " background" : "" ) << endl; + Timer t; + unsigned long long n; + + assert( !BackgroundOperation::inProgForNs(ns.c_str()) ); // should have been checked earlier, better not be... + assert( d->indexBuildInProgress == 0 ); + assertInWriteLock(); + RecoverableIndexState recoverable( d ); + + // Build index spec here in case the collection is empty and the index details are invalid + idx.getSpec(); + + if( inDBRepair || !background ) { + n = fastBuildIndex(ns.c_str(), d, idx, idxNo); + assert( !idx.head.isNull() ); + } + else { + BackgroundIndexBuildJob j(ns.c_str()); + n = j.go(ns, d, idx, idxNo); + } + tlog() << "build index done " << n << " records " << t.millis() / 1000.0 << " secs" << endl; + } + + /* add keys to indexes for a new record */ +#if 0 + static void oldIndexRecord__notused(NamespaceDetails *d, BSONObj obj, DiskLoc loc) { + int n = d->nIndexesBeingBuilt(); + for ( int i = 0; i < n; i++ ) { + try { + bool unique = d->idx(i).unique(); + addKeysToIndex(d, i, obj, loc, /*dupsAllowed*/!unique); + } + catch( DBException& ) { + /* try to roll back previously added index entries + note <= i (not < i) is important here as the index we were just attempted + may be multikey and require some cleanup. + */ + for( int j = 0; j <= i; j++ ) { + try { + _unindexRecord(d->idx(j), obj, loc, false); + } + catch(...) { + log(3) << "unindex fails on rollback after unique failure\n"; + } + } + throw; + } + } + } +#endif + + extern BSONObj id_obj; // { _id : 1 } + + void ensureHaveIdIndex(const char *ns) { + NamespaceDetails *d = nsdetails(ns); + if ( d == 0 || (d->flags & NamespaceDetails::Flag_HaveIdIndex) ) + return; + + *getDur().writing(&d->flags) |= NamespaceDetails::Flag_HaveIdIndex; + + { + NamespaceDetails::IndexIterator i = d->ii(); + while( i.more() ) { + if( i.next().isIdIndex() ) + return; + } + } + + string system_indexes = cc().database()->name + ".system.indexes"; + + BSONObjBuilder b; + b.append("name", "_id_"); + b.append("ns", ns); + b.append("key", id_obj); + BSONObj o = b.done(); + + /* edge case: note the insert could fail if we have hit maxindexes already */ + theDataFileMgr.insert(system_indexes.c_str(), o.objdata(), o.objsize(), true); + } + +#pragma pack(1) + struct IDToInsert_ { + char type; + char _id[4]; + OID oid; + IDToInsert_() { + type = (char) jstOID; + strcpy(_id, "_id"); + assert( sizeof(IDToInsert_) == 17 ); + } + } idToInsert_; + struct IDToInsert : public BSONElement { + IDToInsert() : BSONElement( ( char * )( &idToInsert_ ) ) {} + } idToInsert; +#pragma pack() + + void DataFileMgr::insertAndLog( const char *ns, const BSONObj &o, bool god ) { + BSONObj tmp = o; + insertWithObjMod( ns, tmp, god ); + logOp( "i", ns, tmp ); + } + + /** @param o the object to insert. can be modified to add _id and thus be an in/out param + */ + DiskLoc DataFileMgr::insertWithObjMod(const char *ns, BSONObj &o, bool god) { + bool addedID = false; + DiskLoc loc = insert( ns, o.objdata(), o.objsize(), god, true, &addedID ); + if( addedID && !loc.isNull() ) + o = BSONObj( loc.rec() ); + return loc; + } + + bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject ); + + // We are now doing two btree scans for all unique indexes (one here, and one when we've + // written the record to the collection. This could be made more efficient inserting + // dummy data here, keeping pointers to the btree nodes holding the dummy data and then + // updating the dummy data with the DiskLoc of the real record. + void checkNoIndexConflicts( NamespaceDetails *d, const BSONObj &obj ) { + for ( int idxNo = 0; idxNo < d->nIndexes; idxNo++ ) { + if( d->idx(idxNo).unique() ) { + IndexDetails& idx = d->idx(idxNo); + BSONObjSet keys; + idx.getKeysFromObject(obj, keys); + BSONObj order = idx.keyPattern(); + IndexInterface& ii = idx.idxInterface(); + for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) { + // WARNING: findSingle may not be compound index safe. this may need to change. see notes in + // findSingle code. + uassert( 12582, "duplicate key insert for unique index of capped collection", + ii.findSingle(idx, idx.head, *i ).isNull() ); + } + } + } + } + + /** add a record to the end of the linked list chain within this extent. + require: you must have already declared write intent for the record header. + */ + void addRecordToRecListInExtent(Record *r, DiskLoc loc) { + dassert( loc.rec() == r ); + Extent *e = r->myExtent(loc); + if ( e->lastRecord.isNull() ) { + Extent::FL *fl = getDur().writing(e->fl()); + fl->firstRecord = fl->lastRecord = loc; + r->prevOfs = r->nextOfs = DiskLoc::NullOfs; + } + else { + Record *oldlast = e->lastRecord.rec(); + r->prevOfs = e->lastRecord.getOfs(); + r->nextOfs = DiskLoc::NullOfs; + getDur().writingInt(oldlast->nextOfs) = loc.getOfs(); + getDur().writingDiskLoc(e->lastRecord) = loc; + } + } + + NOINLINE_DECL DiskLoc outOfSpace(const char *ns, NamespaceDetails *d, int lenWHdr, bool god, DiskLoc extentLoc) { + DiskLoc loc; + if ( d->capped == 0 ) { // size capped doesn't grow + log(1) << "allocating new extent for " << ns << " padding:" << d->paddingFactor << " lenWHdr: " << lenWHdr << endl; + cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false, !god); + loc = d->alloc(ns, lenWHdr, extentLoc); + if ( loc.isNull() ) { + log() << "warning: alloc() failed after allocating new extent. lenWHdr: " << lenWHdr << " last extent size:" << d->lastExtentSize << "; trying again\n"; + for ( int z=0; z<10 && lenWHdr > d->lastExtentSize; z++ ) { + log() << "try #" << z << endl; + cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false, !god); + loc = d->alloc(ns, lenWHdr, extentLoc); + if ( ! loc.isNull() ) + break; + } + } + } + return loc; + } + + /** used by insert and also compact + * @return null loc if out of space + */ + DiskLoc allocateSpaceForANewRecord(const char *ns, NamespaceDetails *d, int lenWHdr, bool god) { + DiskLoc extentLoc; + DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc); + if ( loc.isNull() ) { + loc = outOfSpace(ns, d, lenWHdr, god, extentLoc); + } + return loc; + } + + bool NOINLINE_DECL insert_checkSys(const char *sys, const char *ns, bool& wouldAddIndex, const void *obuf, bool god) { + uassert( 10095 , "attempt to insert in reserved database name 'system'", sys != ns); + if ( strstr(ns, ".system.") ) { + // later:check for dba-type permissions here if have that at some point separate + if ( strstr(ns, ".system.indexes" ) ) + wouldAddIndex = true; + else if ( legalClientSystemNS( ns , true ) ) { + if ( obuf && strstr( ns , ".system.users" ) ) { + BSONObj t( reinterpret_cast<const char *>( obuf ) ); + uassert( 14051 , "system.user entry needs 'user' field to be a string" , t["user"].type() == String ); + uassert( 14052 , "system.user entry needs 'pwd' field to be a string" , t["pwd"].type() == String ); + uassert( 14053 , "system.user entry needs 'user' field to be non-empty" , t["user"].String().size() ); + uassert( 14054 , "system.user entry needs 'pwd' field to be non-empty" , t["pwd"].String().size() ); + } + } + else if ( !god ) { + // todo this should probably uasseert rather than doing this: + log() << "ERROR: attempt to insert in system namespace " << ns << endl; + return false; + } + } + return true; + } + + NOINLINE_DECL NamespaceDetails* insert_newNamespace(const char *ns, int len, bool god) { + addNewNamespaceToCatalog(ns); + /* todo: shouldn't be in the namespace catalog until after the allocations here work. + also if this is an addIndex, those checks should happen before this! + */ + // This may create first file in the database. + int ies = Extent::initialSize(len); + if( str::contains(ns, '$') && len + Record::HeaderSize >= BtreeData_V1::BucketSize - 256 && len + Record::HeaderSize <= BtreeData_V1::BucketSize + 256 ) { + // probably an index. so we pick a value here for the first extent instead of using initialExtentSize() which is more + // for user collections. TODO: we could look at the # of records in the parent collection to be smarter here. + ies = (32+4) * 1024; + } + cc().database()->allocExtent(ns, ies, false, false); + NamespaceDetails *d = nsdetails(ns); + if ( !god ) + ensureIdIndexForNewNs(ns); + return d; + } + + void NOINLINE_DECL insert_makeIndex(NamespaceDetails *tableToIndex, const string& tabletoidxns, const DiskLoc& loc) { + uassert( 13143 , "can't create index on system.indexes" , tabletoidxns.find( ".system.indexes" ) == string::npos ); + + BSONObj info = loc.obj(); + bool background = info["background"].trueValue(); + // if this is not readable, let's move things along + if (background && ((!theReplSet && cc().isSyncThread()) || (theReplSet && !theReplSet->isSecondary()))) { + log() << "info: indexing in foreground on this replica; was a background index build on the primary" << endl; + background = false; + } + + int idxNo = tableToIndex->nIndexes; + IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str(), !background); // clear transient info caches so they refresh; increments nIndexes + getDur().writingDiskLoc(idx.info) = loc; + try { + buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo, background); + } + catch( DBException& e ) { + // save our error msg string as an exception or dropIndexes will overwrite our message + LastError *le = lastError.get(); + int savecode = 0; + string saveerrmsg; + if ( le ) { + savecode = le->code; + saveerrmsg = le->msg; + } + else { + savecode = e.getCode(); + saveerrmsg = e.what(); + } + + // roll back this index + string name = idx.indexName(); + BSONObjBuilder b; + string errmsg; + bool ok = dropIndexes(tableToIndex, tabletoidxns.c_str(), name.c_str(), errmsg, b, true); + if( !ok ) { + log() << "failed to drop index after a unique key error building it: " << errmsg << ' ' << tabletoidxns << ' ' << name << endl; + } + + assert( le && !saveerrmsg.empty() ); + raiseError(savecode,saveerrmsg.c_str()); + throw; + } + } + + /* if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc + after the call -- that will prevent a double buffer copy in some cases (btree.cpp). + + @param mayAddIndex almost always true, except for invocation from rename namespace command. + @param addedID if not null, set to true if adding _id element. you must assure false before calling + if using. + */ + + DiskLoc DataFileMgr::insert(const char *ns, const void *obuf, int len, bool god, bool mayAddIndex, bool *addedID) { + bool wouldAddIndex = false; + massert( 10093 , "cannot insert into reserved $ collection", god || NamespaceString::normal( ns ) ); + uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) ); + { + const char *sys = strstr(ns, "system."); + if ( sys && !insert_checkSys(sys, ns, wouldAddIndex, obuf, god) ) + return DiskLoc(); + } + bool addIndex = wouldAddIndex && mayAddIndex; + + NamespaceDetails *d = nsdetails(ns); + if ( d == 0 ) { + d = insert_newNamespace(ns, len, god); + } + + NamespaceDetails *tableToIndex = 0; + + string tabletoidxns; + BSONObj fixedIndexObject; + if ( addIndex ) { + assert( obuf ); + BSONObj io((const char *) obuf); + if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex, fixedIndexObject ) ) { + // prepare creates _id itself, or this indicates to fail the build silently (such + // as if index already exists) + return DiskLoc(); + } + if ( ! fixedIndexObject.isEmpty() ) { + obuf = fixedIndexObject.objdata(); + len = fixedIndexObject.objsize(); + } + } + + int addID = 0; // 0 if not adding _id; if adding, the length of that new element + if( !god ) { + /* Check if we have an _id field. If we don't, we'll add it. + Note that btree buckets which we insert aren't BSONObj's, but in that case god==true. + */ + BSONObj io((const char *) obuf); + BSONElement idField = io.getField( "_id" ); + uassert( 10099 , "_id cannot be an array", idField.type() != Array ); + // we don't add _id for capped collections as they don't have an _id index + if( idField.eoo() && !wouldAddIndex && strstr(ns, ".local.") == 0 && d->haveIdIndex() ) { + if( addedID ) + *addedID = true; + addID = len; + idToInsert_.oid.init(); + len += idToInsert.size(); + } + + BSONElementManipulator::lookForTimestamps( io ); + } + + int lenWHdr = len + Record::HeaderSize; + lenWHdr = (int) (lenWHdr * d->paddingFactor); + if ( lenWHdr == 0 ) { + // old datafiles, backward compatible here. + assert( d->paddingFactor == 0 ); + *getDur().writing(&d->paddingFactor) = 1.0; + lenWHdr = len + Record::HeaderSize; + } + + // If the collection is capped, check if the new object will violate a unique index + // constraint before allocating space. + if ( d->nIndexes && d->capped && !god ) { + checkNoIndexConflicts( d, BSONObj( reinterpret_cast<const char *>( obuf ) ) ); + } + + bool earlyIndex = true; + DiskLoc loc; + if( addID || tableToIndex || d->capped ) { + // if need id, we don't do the early indexing. this is not the common case so that is sort of ok + earlyIndex = false; + loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god); + } + else { + loc = d->allocWillBeAt(ns, lenWHdr); + if( loc.isNull() ) { + // need to get a new extent so we have to do the true alloc now (not common case) + earlyIndex = false; + loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god); + } + } + if ( loc.isNull() ) { + log() << "insert: couldn't alloc space for object ns:" << ns << " capped:" << d->capped << endl; + assert(d->capped); + return DiskLoc(); + } + + if( earlyIndex ) { + // add record to indexes using two step method so we can do the reading outside a write lock + if ( d->nIndexes ) { + assert( obuf ); + BSONObj obj((const char *) obuf); + try { + indexRecordUsingTwoSteps(d, obj, loc, true); + } + catch( AssertionException& ) { + // should be a dup key error on _id index + dassert( !tableToIndex && !d->capped ); + // no need to delete/rollback the record as it was not added yet + throw; + } + } + // really allocate now + DiskLoc real = allocateSpaceForANewRecord(ns, d, lenWHdr, god); + assert( real == loc ); + } + + Record *r = loc.rec(); + { + assert( r->lengthWithHeaders >= lenWHdr ); + r = (Record*) getDur().writingPtr(r, lenWHdr); + if( addID ) { + /* a little effort was made here to avoid a double copy when we add an ID */ + ((int&)*r->data) = *((int*) obuf) + idToInsert.size(); + memcpy(r->data+4, idToInsert.rawdata(), idToInsert.size()); + memcpy(r->data+4+idToInsert.size(), ((char *)obuf)+4, addID-4); + } + else { + if( obuf ) // obuf can be null from internal callers + memcpy(r->data, obuf, len); + } + } + + addRecordToRecListInExtent(r, loc); + + /* durability todo : this could be a bit annoying / slow to record constantly */ + { + NamespaceDetails::Stats *s = getDur().writing(&d->stats); + s->datasize += r->netLength(); + s->nrecords++; + } + + // we don't bother resetting query optimizer stats for the god tables - also god is true when adding a btree bucket + if ( !god ) + NamespaceDetailsTransient::get( ns ).notifyOfWriteOp(); + + if ( tableToIndex ) { + insert_makeIndex(tableToIndex, tabletoidxns, loc); + } + + /* add this record to our indexes */ + if ( !earlyIndex && d->nIndexes ) { + try { + BSONObj obj(r->data); + // not sure which of these is better -- either can be used. oldIndexRecord may be faster, + // but twosteps handles dup key errors more efficiently. + //oldIndexRecord(d, obj, loc); + indexRecordUsingTwoSteps(d, obj, loc, false); + + } + catch( AssertionException& e ) { + // should be a dup key error on _id index + if( tableToIndex || d->capped ) { + massert( 12583, "unexpected index insertion failure on capped collection", !d->capped ); + string s = e.toString(); + s += " : on addIndex/capped - collection and its index will not match"; + uassert_nothrow(s.c_str()); + error() << s << endl; + } + else { + // normal case -- we can roll back + _deleteRecord(d, ns, r, loc); + throw; + } + } + } + + d->paddingFits(); + + return loc; + } + + /* special version of insert for transaction logging -- streamlined a bit. + assumes ns is capped and no indexes + */ + Record* DataFileMgr::fast_oplog_insert(NamespaceDetails *d, const char *ns, int len) { + assert( d ); + RARELY assert( d == nsdetails(ns) ); + DEV assert( d == nsdetails(ns) ); + + DiskLoc extentLoc; + int lenWHdr = len + Record::HeaderSize; + DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc); + assert( !loc.isNull() ); + + Record *r = loc.rec(); + assert( r->lengthWithHeaders >= lenWHdr ); + + Extent *e = r->myExtent(loc); + if ( e->lastRecord.isNull() ) { + Extent::FL *fl = getDur().writing( e->fl() ); + fl->firstRecord = fl->lastRecord = loc; + + Record::NP *np = getDur().writing(r->np()); + np->nextOfs = np->prevOfs = DiskLoc::NullOfs; + } + else { + Record *oldlast = e->lastRecord.rec(); + Record::NP *np = getDur().writing(r->np()); + np->prevOfs = e->lastRecord.getOfs(); + np->nextOfs = DiskLoc::NullOfs; + getDur().writingInt( oldlast->nextOfs ) = loc.getOfs(); + e->lastRecord.writing() = loc; + } + + /* todo: don't update for oplog? seems wasteful. */ + { + NamespaceDetails::Stats *s = getDur().writing(&d->stats); + s->datasize += r->netLength(); + s->nrecords++; + } + + return r; + } + +} // namespace mongo + +#include "clientcursor.h" + +namespace mongo { + + void dropAllDatabasesExceptLocal() { + writelock lk(""); + + vector<string> n; + getDatabaseNames(n); + if( n.size() == 0 ) return; + log() << "dropAllDatabasesExceptLocal " << n.size() << endl; + for( vector<string>::iterator i = n.begin(); i != n.end(); i++ ) { + if( *i != "local" ) { + Client::Context ctx(*i); + dropDatabase(*i); + } + } + } + + void dropDatabase(string db) { + log(1) << "dropDatabase " << db << endl; + Database *d = cc().database(); + assert( d ); + assert( d->name == db ); + + BackgroundOperation::assertNoBgOpInProgForDb(d->name.c_str()); + + mongo::d.dbMutex.assertWriteLocked(); + + // Not sure we need this here, so removed. If we do, we need to move it down + // within other calls both (1) as they could be called from elsewhere and + // (2) to keep the lock order right - groupcommitmutex must be locked before + // mmmutex (if both are locked). + // + // RWLockRecursive::Exclusive lk(MongoFile::mmmutex); + + getDur().syncDataAndTruncateJournal(); + + Database::closeDatabase( d->name.c_str(), d->path ); + d = 0; // d is now deleted + + _deleteDataFiles( db.c_str() ); + } + + typedef boost::filesystem::path Path; + + void boostRenameWrapper( const Path &from, const Path &to ) { + try { + boost::filesystem::rename( from, to ); + } + catch ( const boost::filesystem::filesystem_error & ) { + // boost rename doesn't work across partitions + boost::filesystem::copy_file( from, to); + boost::filesystem::remove( from ); + } + } + + // back up original database files to 'temp' dir + void _renameForBackup( const char *database, const Path &reservedPath ) { + Path newPath( reservedPath ); + if ( directoryperdb ) + newPath /= database; + class Renamer : public FileOp { + public: + Renamer( const Path &newPath ) : newPath_( newPath ) {} + private: + const boost::filesystem::path &newPath_; + virtual bool apply( const Path &p ) { + if ( !boost::filesystem::exists( p ) ) + return false; + boostRenameWrapper( p, newPath_ / ( p.leaf() + ".bak" ) ); + return true; + } + virtual const char * op() const { + return "renaming"; + } + } renamer( newPath ); + _applyOpToDataFiles( database, renamer, true ); + } + + // move temp files to standard data dir + void _replaceWithRecovered( const char *database, const char *reservedPathString ) { + Path newPath( dbpath ); + if ( directoryperdb ) + newPath /= database; + class Replacer : public FileOp { + public: + Replacer( const Path &newPath ) : newPath_( newPath ) {} + private: + const boost::filesystem::path &newPath_; + virtual bool apply( const Path &p ) { + if ( !boost::filesystem::exists( p ) ) + return false; + boostRenameWrapper( p, newPath_ / p.leaf() ); + return true; + } + virtual const char * op() const { + return "renaming"; + } + } replacer( newPath ); + _applyOpToDataFiles( database, replacer, true, reservedPathString ); + } + + // generate a directory name for storing temp data files + Path uniqueReservedPath( const char *prefix ) { + Path repairPath = Path( repairpath ); + Path reservedPath; + int i = 0; + bool exists = false; + do { + stringstream ss; + ss << prefix << "_repairDatabase_" << i++; + reservedPath = repairPath / ss.str(); + BOOST_CHECK_EXCEPTION( exists = boost::filesystem::exists( reservedPath ) ); + } + while ( exists ); + return reservedPath; + } + + boost::intmax_t dbSize( const char *database ) { + class SizeAccumulator : public FileOp { + public: + SizeAccumulator() : totalSize_( 0 ) {} + boost::intmax_t size() const { + return totalSize_; + } + private: + virtual bool apply( const boost::filesystem::path &p ) { + if ( !boost::filesystem::exists( p ) ) + return false; + totalSize_ += boost::filesystem::file_size( p ); + return true; + } + virtual const char *op() const { + return "checking size"; + } + boost::intmax_t totalSize_; + }; + SizeAccumulator sa; + _applyOpToDataFiles( database, sa ); + return sa.size(); + } + + bool repairDatabase( string dbNameS , string &errmsg, + bool preserveClonedFilesOnFailure, bool backupOriginalFiles ) { + doingRepair dr; + dbNameS = nsToDatabase( dbNameS ); + const char * dbName = dbNameS.c_str(); + + stringstream ss; + ss << "localhost:" << cmdLine.port; + string localhost = ss.str(); + + problem() << "repairDatabase " << dbName << endl; + assert( cc().database()->name == dbName ); + assert( cc().database()->path == dbpath ); + + BackgroundOperation::assertNoBgOpInProgForDb(dbName); + + getDur().syncDataAndTruncateJournal(); // Must be done before and after repair + + boost::intmax_t totalSize = dbSize( dbName ); + boost::intmax_t freeSize = File::freeSpace(repairpath); + if ( freeSize > -1 && freeSize < totalSize ) { + stringstream ss; + ss << "Cannot repair database " << dbName << " having size: " << totalSize + << " (bytes) because free disk space is: " << freeSize << " (bytes)"; + errmsg = ss.str(); + problem() << errmsg << endl; + return false; + } + + Path reservedPath = + uniqueReservedPath( ( preserveClonedFilesOnFailure || backupOriginalFiles ) ? + "backup" : "_tmp" ); + BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( reservedPath ) ); + string reservedPathString = reservedPath.native_directory_string(); + + bool res; + { + // clone to temp location, which effectively does repair + Client::Context ctx( dbName, reservedPathString ); + assert( ctx.justCreated() ); + + res = cloneFrom(localhost.c_str(), errmsg, dbName, + /*logForReplication=*/false, /*slaveOk*/false, /*replauth*/false, + /*snapshot*/false, /*mayYield*/false, /*mayBeInterrupted*/true); + Database::closeDatabase( dbName, reservedPathString.c_str() ); + } + + if ( !res ) { + errmsg = str::stream() << "clone failed for " << dbName << " with error: " << errmsg; + problem() << errmsg << endl; + + if ( !preserveClonedFilesOnFailure ) + BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) ); + + getDur().syncDataAndTruncateJournal(); // Must be done before and after repair + + return false; + } + + MongoFile::flushAll(true); + + Client::Context ctx( dbName ); + Database::closeDatabase( dbName, dbpath ); + + if ( backupOriginalFiles ) { + _renameForBackup( dbName, reservedPath ); + } + else { + _deleteDataFiles( dbName ); + BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( Path( dbpath ) / dbName ) ); + } + + _replaceWithRecovered( dbName, reservedPathString.c_str() ); + + if ( !backupOriginalFiles ) + BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) ); + + getDur().syncDataAndTruncateJournal(); // Must be done before and after repair + + return true; + } + + void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator, const string& path ) { + if ( afterAllocator ) + FileAllocator::get()->waitUntilFinished(); + string c = database; + c += '.'; + boost::filesystem::path p(path); + if ( directoryperdb ) + p /= database; + boost::filesystem::path q; + q = p / (c+"ns"); + bool ok = false; + BOOST_CHECK_EXCEPTION( ok = fo.apply( q ) ); + if ( ok ) + log(2) << fo.op() << " file " << q.string() << endl; + int i = 0; + int extra = 10; // should not be necessary, this is defensive in case there are missing files + while ( 1 ) { + assert( i <= DiskLoc::MaxFiles ); + stringstream ss; + ss << c << i; + q = p / ss.str(); + BOOST_CHECK_EXCEPTION( ok = fo.apply(q) ); + if ( ok ) { + if ( extra != 10 ) { + log(1) << fo.op() << " file " << q.string() << endl; + log() << " _applyOpToDataFiles() warning: extra == " << extra << endl; + } + } + else if ( --extra <= 0 ) + break; + i++; + } + } + + NamespaceDetails* nsdetails_notinline(const char *ns) { return nsdetails(ns); } + + bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ) { + log() << "DatabaseHolder::closeAll path:" << path << endl; + d.dbMutex.assertWriteLocked(); + + map<string,Database*>& m = _paths[path]; + _size -= m.size(); + + set< string > dbs; + for ( map<string,Database*>::iterator i = m.begin(); i != m.end(); i++ ) { + wassert( i->second->path == path ); + dbs.insert( i->first ); + } + + currentClient.get()->getContext()->_clear(); + + BSONObjBuilder bb( result.subarrayStart( "dbs" ) ); + int n = 0; + int nNotClosed = 0; + for( set< string >::iterator i = dbs.begin(); i != dbs.end(); ++i ) { + string name = *i; + log(2) << "DatabaseHolder::closeAll path:" << path << " name:" << name << endl; + Client::Context ctx( name , path ); + if( !force && BackgroundOperation::inProgForDb(name.c_str()) ) { + log() << "WARNING: can't close database " << name << " because a bg job is in progress - try killOp command" << endl; + nNotClosed++; + } + else { + Database::closeDatabase( name.c_str() , path ); + bb.append( bb.numStr( n++ ) , name ); + } + } + bb.done(); + if( nNotClosed ) + result.append("nNotClosed", nNotClosed); + else { + ClientCursor::assertNoCursors(); + } + + return true; + } + +} // namespace mongo |