// pdfile.cpp

/**
 *    Copyright (C) 2008 10gen Inc.
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU Affero General Public License, version 3,
 *    as published by the Free Software Foundation.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *    GNU Affero General Public License for more details.
 *
 *    You should have received a copy of the GNU Affero General Public License
 *    along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

/*
todo:
_ table scans must be sequential, not next/prev pointers
_ coalesce deleted
_ disallow system* manipulations from the database.
*/

#include "pch.h"
#include "pdfile.h"
#include "db.h"
#include "../util/mmap.h"
#include "../util/hashtab.h"
#include "../util/file_allocator.h"
#include "../util/processinfo.h"
#include "../util/file.h"
#include "btree.h"
#include "btreebuilder.h"
#include <algorithm>
#include <list>
#include "repl.h"
#include "dbhelpers.h"
#include "namespace-inl.h"
#include "queryutil.h"
#include "extsort.h"
#include "curop-inl.h"
#include "background.h"
#include "compact.h"
#include "ops/delete.h"
#include "instance.h"
#include "replutil.h"

namespace mongo {

    BOOST_STATIC_ASSERT( sizeof(Extent)-4 == 48+128 );
    BOOST_STATIC_ASSERT( sizeof(DataFileHeader)-4 == 8192 );

    bool isValidNS( const StringData& ns ) {
        // TODO: should check for invalid characters
        const char * x = strchr( ns.data() , '.' );
        if ( ! x )
            return false;
        x++;
        return *x > 0;
    }

    bool inDBRepair = false;
    struct doingRepair {
        doingRepair() {
            assert( ! inDBRepair );
            inDBRepair = true;
        }
        ~doingRepair() {
            inDBRepair = false;
        }
    };

    map<string, unsigned> BackgroundOperation::dbsInProg;
    set<string> BackgroundOperation::nsInProg;

    bool BackgroundOperation::inProgForDb(const char *db) {
        assertInWriteLock();
        return dbsInProg[db] != 0;
    }

    bool BackgroundOperation::inProgForNs(const char *ns) {
        assertInWriteLock();
        return nsInProg.count(ns) != 0;
    }

    void BackgroundOperation::assertNoBgOpInProgForDb(const char *db) {
        uassert(12586, "cannot perform operation: a background operation is currently running for this database",
                !inProgForDb(db));
    }

    void BackgroundOperation::assertNoBgOpInProgForNs(const char *ns) {
        uassert(12587, "cannot perform operation: a background operation is currently running for this collection",
                !inProgForNs(ns));
    }

    BackgroundOperation::BackgroundOperation(const char *ns) : _ns(ns) {
        assertInWriteLock();
        dbsInProg[_ns.db]++;
        assert( nsInProg.count(_ns.ns()) == 0 );
        nsInProg.insert(_ns.ns());
    }

    BackgroundOperation::~BackgroundOperation() {
        wassert( dbMutex.isWriteLocked() );
        dbsInProg[_ns.db]--;
        nsInProg.erase(_ns.ns());
    }

    void BackgroundOperation::dump(stringstream& ss) {
        if( nsInProg.size() ) {
            ss << "\nBackground Jobs in Progress\n";
            for( set<string>::iterator i = nsInProg.begin(); i != nsInProg.end(); i++ )
                ss << " " << *i << '\n';
        }
        for( map<string, unsigned>::iterator i = dbsInProg.begin(); i != dbsInProg.end(); i++ ) {
            if( i->second )
                ss << "database " << i->first << ": " << i->second << '\n';
        }
    }

    /* ----------------------------------------- */

    string dbpath = "/data/db/";
    bool directoryperdb = false;
    string repairpath;
    string pidfilepath;

    DataFileMgr theDataFileMgr;
    DatabaseHolder dbHolder;
    int MAGIC = 0x1000;

    void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0);
    void ensureIdIndexForNewNs(const char *ns) {
        if ( ( strstr( ns, ".system."
) == 0 || legalClientSystemNS( ns , false ) ) && strstr( ns, ".$freelist" ) == 0 ) { log( 1 ) << "adding _id index for collection " << ns << endl; ensureHaveIdIndex( ns ); } } string getDbContext() { stringstream ss; Client * c = currentClient.get(); if ( c ) { Client::Context * cx = c->getContext(); if ( cx ) { Database *database = cx->db(); if ( database ) { ss << database->name << ' '; ss << cx->ns() << ' '; } } } return ss.str(); } /*---------------------------------------------------------------------*/ // inheritable class to implement an operation that may be applied to all // files in a database using _applyOpToDataFiles() class FileOp { public: virtual ~FileOp() {} // Return true if file exists and operation successful virtual bool apply( const boost::filesystem::path &p ) = 0; virtual const char * op() const = 0; }; void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator = false, const string& path = dbpath ); void _deleteDataFiles(const char *database) { if ( directoryperdb ) { FileAllocator::get()->waitUntilFinished(); MONGO_BOOST_CHECK_EXCEPTION_WITH_MSG( boost::filesystem::remove_all( boost::filesystem::path( dbpath ) / database ), "delete data files with a directoryperdb" ); return; } class : public FileOp { virtual bool apply( const boost::filesystem::path &p ) { return boost::filesystem::remove( p ); } virtual const char * op() const { return "remove"; } } deleter; _applyOpToDataFiles( database, deleter, true ); } int Extent::initialSize(int len) { long long sz = len * 16; if ( len < 1000 ) sz = len * 64; if ( sz > 1000000000 ) sz = 1000000000; int z = ((int)sz) & 0xffffff00; assert( z > len ); return z; } bool _userCreateNS(const char *ns, const BSONObj& options, string& err, bool *deferIdIndex) { if ( nsdetails(ns) ) { err = "collection already exists"; return false; } log(1) << "create collection " << ns << ' ' << options << endl; /* todo: do this only when we have allocated space successfully? or we could insert with a { ok: 0 } field and then go back and set to ok : 1 after we are done. */ bool isFreeList = strstr(ns, ".$freelist") != 0; if( !isFreeList ) addNewNamespaceToCatalog(ns, options.isEmpty() ? 0 : &options); long long size = Extent::initialSize(128); { BSONElement e = options.getField("size"); if ( e.isNumber() ) { size = e.numberLong(); size += 256; size &= 0xffffffffffffff00LL; } } uassert( 10083 , "create collection invalid size spec", size > 0 ); bool newCapped = false; int mx = 0; if( options["capped"].trueValue() ) { newCapped = true; BSONElement e = options.getField("max"); if ( e.isNumber() ) { mx = e.numberInt(); } } // $nExtents just for debug/testing. BSONElement e = options.getField( "$nExtents" ); Database *database = cc().database(); if ( e.type() == Array ) { // We create one extent per array entry, with size specified // by the array value. BSONObjIterator i( e.embeddedObject() ); while( i.more() ) { BSONElement e = i.next(); int size = int( e.number() ); assert( size <= 0x7fffffff ); // $nExtents is just for testing - always allocate new extents // rather than reuse existing extents so we have some predictibility // in the extent size used by our tests database->suitableFile( ns, (int) size, false, false )->createExtent( ns, (int) size, newCapped ); } } else if ( int( e.number() ) > 0 ) { // We create '$nExtents' extents, each of size 'size'. 
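            // note: if a "size" option was given, 'size' has already been normalized above
            // (bumped by up to 256 bytes and masked down to a 256-byte multiple, e.g. a
            // requested size of 5000 becomes 5120), so each extent created below gets that
            // normalized byte count.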
int nExtents = int( e.number() ); assert( size <= 0x7fffffff ); for ( int i = 0; i < nExtents; ++i ) { assert( size <= 0x7fffffff ); // $nExtents is just for testing - always allocate new extents // rather than reuse existing extents so we have some predictibility // in the extent size used by our tests database->suitableFile( ns, (int) size, false, false )->createExtent( ns, (int) size, newCapped ); } } else { // This is the non test case, where we don't have a $nExtents spec. while ( size > 0 ) { int max = MongoDataFile::maxSize() - DataFileHeader::HeaderSize; int desiredExtentSize = (int) (size > max ? max : size); if ( desiredExtentSize < Extent::minSize() ) { desiredExtentSize = Extent::minSize(); } desiredExtentSize &= 0xffffff00; Extent *e = database->allocExtent( ns, desiredExtentSize, newCapped, true ); size -= e->length; } } NamespaceDetails *d = nsdetails(ns); assert(d); bool ensure = false; if ( options.getField( "autoIndexId" ).type() ) { if ( options["autoIndexId"].trueValue() ) { ensure = true; } } else { if ( !newCapped ) { ensure=true; } } if( ensure ) { if( deferIdIndex ) *deferIdIndex = true; else ensureIdIndexForNewNs( ns ); } if ( mx > 0 ) getDur().writingInt( d->max ) = mx; return true; } /** { ..., capped: true, size: ..., max: ... } @param deferIdIndex - if not not, defers id index creation. sets the bool value to true if we wanted to create the id index. @return true if successful */ bool userCreateNS(const char *ns, BSONObj options, string& err, bool logForReplication, bool *deferIdIndex) { const char *coll = strchr( ns, '.' ) + 1; massert( 10356 , str::stream() << "invalid ns: " << ns , coll && *coll ); char cl[ 256 ]; nsToDatabase( ns, cl ); bool ok = _userCreateNS(ns, options, err, deferIdIndex); if ( logForReplication && ok ) { if ( options.getField( "create" ).eoo() ) { BSONObjBuilder b; b << "create" << coll; b.appendElements( options ); options = b.obj(); } string logNs = string( cl ) + ".$cmd"; logOp("c", logNs.c_str(), options); } return ok; } /*---------------------------------------------------------------------*/ int MongoDataFile::maxSize() { if ( sizeof( int* ) == 4 ) { return 512 * 1024 * 1024; } else if ( cmdLine.smallfiles ) { return 0x7ff00000 >> 2; } else { return 0x7ff00000; } } NOINLINE_DECL void MongoDataFile::badOfs2(int ofs) const { stringstream ss; ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database"; uasserted(13441, ss.str()); } NOINLINE_DECL void MongoDataFile::badOfs(int ofs) const { stringstream ss; ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database"; uasserted(13440, ss.str()); } int MongoDataFile::defaultSize( const char *filename ) const { int size; if ( fileNo <= 4 ) size = (64*1024*1024) << fileNo; else size = 0x7ff00000; if ( cmdLine.smallfiles ) { size = size >> 2; } return size; } void MongoDataFile::open( const char *filename, int minSize, bool preallocateOnly ) { long size = defaultSize( filename ); while ( size < minSize ) { if ( size < maxSize() / 2 ) size *= 2; else { size = maxSize(); break; } } if ( size > maxSize() ) size = maxSize(); assert( size >= 64*1024*1024 || cmdLine.smallfiles ); assert( size % 4096 == 0 ); if ( preallocateOnly ) { if ( cmdLine.prealloc ) { FileAllocator::get()->requestAllocation( filename, size ); } return; } { assert( _mb == 0 ); unsigned long long sz = size; if( mmf.create(filename, sz, false) ) _mb = mmf.getView(); assert( sz <= 0x7fffffff ); size = (int) sz; } //header = (DataFileHeader 
*) _p; if( sizeof(char *) == 4 ) uassert( 10084 , "can't map file memory - mongo requires 64 bit build for larger datasets", _mb != 0); else uassert( 10085 , "can't map file memory", _mb != 0); header()->init(fileNo, size, filename); } void MongoDataFile::flush( bool sync ) { mmf.flush( sync ); } void addNewExtentToNamespace(const char *ns, Extent *e, DiskLoc eloc, DiskLoc emptyLoc, bool capped) { NamespaceIndex *ni = nsindex(ns); NamespaceDetails *details = ni->details(ns); if ( details ) { assert( !details->lastExtent.isNull() ); assert( !details->firstExtent.isNull() ); getDur().writingDiskLoc(e->xprev) = details->lastExtent; getDur().writingDiskLoc(details->lastExtent.ext()->xnext) = eloc; assert( !eloc.isNull() ); getDur().writingDiskLoc(details->lastExtent) = eloc; } else { ni->add_ns(ns, eloc, capped); details = ni->details(ns); } { NamespaceDetails *dw = details->writingWithoutExtra(); dw->lastExtentSize = e->length; } details->addDeletedRec(emptyLoc.drec(), emptyLoc); } Extent* MongoDataFile::createExtent(const char *ns, int approxSize, bool newCapped, int loops) { assert( approxSize <= Extent::maxSize() ); { // make sizes align with VM page size int newSize = (approxSize + 0xfff) & 0xfffff000; assert( newSize >= 0 ); if( newSize < Extent::maxSize() ) approxSize = newSize; } massert( 10357 , "shutdown in progress", ! inShutdown() ); massert( 10358 , "bad new extent size", approxSize >= Extent::minSize() && approxSize <= Extent::maxSize() ); massert( 10359 , "header==0 on new extent: 32 bit mmap space exceeded?", header() ); // null if file open failed int ExtentSize = min(header()->unusedLength, approxSize); DiskLoc loc; if ( ExtentSize < Extent::minSize() ) { /* note there could be a lot of looping here is db just started and no files are open yet. we might want to do something about that. 
*/ if ( loops > 8 ) { assert( loops < 10000 ); out() << "warning: loops=" << loops << " fileno:" << fileNo << ' ' << ns << '\n'; } log() << "newExtent: " << ns << " file " << fileNo << " full, adding a new file\n"; return cc().database()->addAFile( 0, true )->createExtent(ns, approxSize, newCapped, loops+1); } int offset = header()->unused.getOfs(); DataFileHeader *h = header(); h->unused.writing().set( fileNo, offset + ExtentSize ); getDur().writingInt(h->unusedLength) = h->unusedLength - ExtentSize; loc.set(fileNo, offset); Extent *e = _getExtent(loc); DiskLoc emptyLoc = getDur().writing(e)->init(ns, ExtentSize, fileNo, offset, newCapped); addNewExtentToNamespace(ns, e, loc, emptyLoc, newCapped); DEV tlog(1) << "new extent " << ns << " size: 0x" << hex << ExtentSize << " loc: 0x" << hex << offset << " emptyLoc:" << hex << emptyLoc.getOfs() << dec << endl; return e; } Extent* DataFileMgr::allocFromFreeList(const char *ns, int approxSize, bool capped) { string s = cc().database()->name + ".$freelist"; NamespaceDetails *f = nsdetails(s.c_str()); if( f ) { int low, high; if( capped ) { // be strict about the size low = approxSize; if( low > 2048 ) low -= 256; high = (int) (approxSize * 1.05) + 256; } else { low = (int) (approxSize * 0.8); high = (int) (approxSize * 1.4); } if( high <= 0 ) { // overflowed high = max(approxSize, Extent::maxSize()); } if ( high <= Extent::minSize() ) { // the minimum extent size is 4097 high = Extent::minSize() + 1; } int n = 0; Extent *best = 0; int bestDiff = 0x7fffffff; { Timer t; DiskLoc L = f->firstExtent; while( !L.isNull() ) { Extent * e = L.ext(); if( e->length >= low && e->length <= high ) { int diff = abs(e->length - approxSize); if( diff < bestDiff ) { bestDiff = diff; best = e; if( ((double) diff) / approxSize < 0.1 ) { // close enough break; } if( t.seconds() >= 2 ) { // have spent lots of time in write lock, and we are in [low,high], so close enough // could come into play if extent freelist is very long break; } } else { OCCASIONALLY { if( high < 64 * 1024 && t.seconds() >= 2 ) { // be less picky if it is taking a long time high = 64 * 1024; } } } } L = e->xnext; ++n; } if( t.seconds() >= 10 ) { log() << "warning: slow scan in allocFromFreeList (in write lock)" << endl; } } if( n > 128 ) log( n < 512 ) << "warning: newExtent " << n << " scanned\n"; if( best ) { Extent *e = best; // remove from the free list if( !e->xprev.isNull() ) e->xprev.ext()->xnext.writing() = e->xnext; if( !e->xnext.isNull() ) e->xnext.ext()->xprev.writing() = e->xprev; if( f->firstExtent == e->myLoc ) f->firstExtent.writing() = e->xnext; if( f->lastExtent == e->myLoc ) f->lastExtent.writing() = e->xprev; // use it OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n"; DiskLoc emptyLoc = e->reuse(ns, capped); addNewExtentToNamespace(ns, e, e->myLoc, emptyLoc, capped); return e; } } return 0; // return createExtent(ns, approxSize, capped); } /*---------------------------------------------------------------------*/ void Extent::markEmpty() { xnext.Null(); xprev.Null(); firstRecord.Null(); lastRecord.Null(); } DiskLoc Extent::reuse(const char *nsname, bool capped) { return getDur().writing(this)->_reuse(nsname, capped); } void getEmptyLoc(const char *ns, const DiskLoc extentLoc, int extentLength, bool capped, /*out*/DiskLoc& emptyLoc, /*out*/int& delRecLength) { emptyLoc = extentLoc; emptyLoc.inc( Extent::HeaderSize() ); delRecLength = extentLength - Extent::HeaderSize(); if( delRecLength >= 32*1024 && str::contains(ns, '$') && !capped ) { // probably 
an index. so skip forward to keep its records page aligned int& ofs = emptyLoc.GETOFS(); int newOfs = (ofs + 0xfff) & ~0xfff; delRecLength -= (newOfs-ofs); dassert( delRecLength > 0 ); ofs = newOfs; } } DiskLoc Extent::_reuse(const char *nsname, bool capped) { LOG(3) << "reset extent was:" << nsDiagnostic.toString() << " now:" << nsname << '\n'; massert( 10360 , "Extent::reset bad magic value", magic == 0x41424344 ); nsDiagnostic = nsname; markEmpty(); DiskLoc emptyLoc; int delRecLength; getEmptyLoc(nsname, myLoc, length, capped, emptyLoc, delRecLength); // todo: some dup code here and below in Extent::init DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength); empty = getDur().writing(empty); empty->lengthWithHeaders = delRecLength; empty->extentOfs = myLoc.getOfs(); empty->nextDeleted.Null(); return emptyLoc; } /* assumes already zeroed -- insufficient for block 'reuse' perhaps */ DiskLoc Extent::init(const char *nsname, int _length, int _fileNo, int _offset, bool capped) { magic = 0x41424344; myLoc.set(_fileNo, _offset); xnext.Null(); xprev.Null(); nsDiagnostic = nsname; length = _length; firstRecord.Null(); lastRecord.Null(); DiskLoc emptyLoc; int delRecLength; getEmptyLoc(nsname, myLoc, _length, capped, emptyLoc, delRecLength); DeletedRecord *empty = getDur().writing( DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength) ); empty->lengthWithHeaders = delRecLength; empty->extentOfs = myLoc.getOfs(); return emptyLoc; } /* Record* Extent::newRecord(int len) { if( firstEmptyRegion.isNull() )8 return 0; assert(len > 0); int newRecSize = len + Record::HeaderSize; DiskLoc newRecordLoc = firstEmptyRegion; Record *r = getRecord(newRecordLoc); int left = r->netLength() - len; if( left < 0 ) { // firstEmptyRegion.Null(); return 0; } DiskLoc nextEmpty = r->next.getNextEmpty(firstEmptyRegion); r->lengthWithHeaders = newRecSize; r->next.markAsFirstOrLastInExtent(this); // we're now last in the extent if( !lastRecord.isNull() ) { assert(getRecord(lastRecord)->next.lastInExtent()); // it was the last one getRecord(lastRecord)->next.set(newRecordLoc); // until now r->prev.set(lastRecord); } else { r->prev.markAsFirstOrLastInExtent(this); // we are the first in the extent assert( firstRecord.isNull() ); firstRecord = newRecordLoc; } lastRecord = newRecordLoc; if( left < Record::HeaderSize + 32 ) { firstEmptyRegion.Null(); } else { firstEmptyRegion.inc(newRecSize); Record *empty = getRecord(firstEmptyRegion); empty->next.set(nextEmpty); // not for empty records, unless in-use records, next and prev can be null. empty->prev.Null(); empty->lengthWithHeaders = left; } return r; } */ int Extent::maxSize() { int maxExtentSize = 0x7ff00000; if ( cmdLine.smallfiles ) { maxExtentSize >>= 2; } return maxExtentSize; } /*---------------------------------------------------------------------*/ shared_ptr DataFileMgr::findAll(const char *ns, const DiskLoc &startLoc) { NamespaceDetails * d = nsdetails( ns ); if ( ! 
d )
            return shared_ptr<Cursor>(new BasicCursor(DiskLoc()));

        DiskLoc loc = d->firstExtent;
        Extent *e = getExtent(loc);

        DEBUGGING {
            out() << "listing extents for " << ns << endl;
            DiskLoc tmp = loc;
            set<DiskLoc> extents;
            while ( 1 ) {
                Extent *f = getExtent(tmp);
                out() << "extent: " << tmp.toString() << endl;
                extents.insert(tmp);
                tmp = f->xnext;
                if ( tmp.isNull() )
                    break;
                f = f->getNextExtent();
            }
            out() << endl;
            d->dumpDeleted(&extents);
        }

        if ( d->capped )
            return shared_ptr<Cursor>( new ForwardCappedCursor( d , startLoc ) );

        if ( !startLoc.isNull() )
            return shared_ptr<Cursor>(new BasicCursor( startLoc ));

        while ( e->firstRecord.isNull() && !e->xnext.isNull() ) {
            /* todo: if extent is empty, free it for reuse elsewhere.
               that is a bit complicated have to clean up the freelists.
            */
            RARELY out() << "info DFM::findAll(): extent " << loc.toString() << " was empty, skipping ahead. ns:" << ns << endl;
            // find a nonempty extent
            // it might be nice to free the whole extent here! but have to clean up free recs then.
            e = e->getNextExtent();
        }
        return shared_ptr<Cursor>(new BasicCursor( e->firstRecord ));
    }

    /* get a table scan cursor, but can be forward or reverse direction.
       order.$natural - if set, > 0 means forward (asc), < 0 backward (desc).
    */
    shared_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc) {
        BSONElement el = order.getField("$natural"); // e.g., { $natural : -1 }

        if ( el.number() >= 0 )
            return DataFileMgr::findAll(ns, startLoc);

        // "reverse natural order"
        NamespaceDetails *d = nsdetails(ns);

        if ( !d )
            return shared_ptr<Cursor>(new BasicCursor(DiskLoc()));

        if ( !d->capped ) {
            if ( !startLoc.isNull() )
                return shared_ptr<Cursor>(new ReverseCursor( startLoc ));
            Extent *e = d->lastExtent.ext();
            while ( e->lastRecord.isNull() && !e->xprev.isNull() ) {
                OCCASIONALLY out() << " findTableScan: extent empty, skipping ahead" << endl;
                e = e->getPrevExtent();
            }
            return shared_ptr<Cursor>(new ReverseCursor( e->lastRecord ));
        }
        else {
            return shared_ptr<Cursor>( new ReverseCappedCursor( d, startLoc ) );
        }
    }

    void printFreeList() {
        string s = cc().database()->name + ".$freelist";
        log() << "dump freelist " << s << endl;
        NamespaceDetails *freeExtents = nsdetails(s.c_str());
        if( freeExtents == 0 ) {
            log() << " freeExtents==0" << endl;
            return;
        }
        DiskLoc a = freeExtents->firstExtent;
        while( !a.isNull() ) {
            Extent *e = a.ext();
            log() << " extent " << a.toString() << " len:" << e->length << " prev:" << e->xprev.toString() << endl;
            a = e->xnext;
        }
        log() << "end freelist" << endl;
    }

    /** free a list of extents that are no longer in use. this is a double linked list of extents
        (could be just one in the list)
    */
    void freeExtents(DiskLoc firstExt, DiskLoc lastExt) {
        {
            assert( !firstExt.isNull() && !lastExt.isNull() );
            Extent *f = firstExt.ext();
            Extent *l = lastExt.ext();
            assert( f->xprev.isNull() );
            assert( l->xnext.isNull() );
            assert( f==l || !f->xnext.isNull() );
            assert( f==l || !l->xprev.isNull() );
        }

        string s = cc().database()->name + ".$freelist";
        NamespaceDetails *freeExtents = nsdetails(s.c_str());
        if( freeExtents == 0 ) {
            string err;
            _userCreateNS(s.c_str(), BSONObj(), err, 0); // todo: this actually allocates an extent, which is bad!
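            // the <dbname>.$freelist namespace is created lazily here; its extent chain
            // (firstExtent .. lastExtent) holds extents that allocFromFreeList() above can
            // hand back out instead of growing the data files.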
freeExtents = nsdetails(s.c_str()); massert( 10361 , "can't create .$freelist", freeExtents); } if( freeExtents->firstExtent.isNull() ) { freeExtents->firstExtent.writing() = firstExt; freeExtents->lastExtent.writing() = lastExt; } else { DiskLoc a = freeExtents->firstExtent; assert( a.ext()->xprev.isNull() ); getDur().writingDiskLoc( a.ext()->xprev ) = lastExt; getDur().writingDiskLoc( lastExt.ext()->xnext ) = a; getDur().writingDiskLoc( freeExtents->firstExtent ) = firstExt; } //printFreeList(); } /* drop a collection/namespace */ void dropNS(const string& nsToDrop) { NamespaceDetails* d = nsdetails(nsToDrop.c_str()); uassert( 10086 , (string)"ns not found: " + nsToDrop , d ); BackgroundOperation::assertNoBgOpInProgForNs(nsToDrop.c_str()); NamespaceString s(nsToDrop); assert( s.db == cc().database()->name ); if( s.isSystem() ) { if( s.coll == "system.profile" ) uassert( 10087 , "turn off profiling before dropping system.profile collection", cc().database()->profile == 0 ); else uasserted( 12502, "can't drop system ns" ); } { // remove from the system catalog BSONObj cond = BSON( "name" << nsToDrop ); // { name: "colltodropname" } string system_namespaces = cc().database()->name + ".system.namespaces"; /*int n = */ deleteObjects(system_namespaces.c_str(), cond, false, false, true); // no check of return code as this ns won't exist for some of the new storage engines } // free extents if( !d->firstExtent.isNull() ) { freeExtents(d->firstExtent, d->lastExtent); getDur().writingDiskLoc( d->firstExtent ).setInvalid(); getDur().writingDiskLoc( d->lastExtent ).setInvalid(); } // remove from the catalog hashtable cc().database()->namespaceIndex.kill_ns(nsToDrop.c_str()); } void dropCollection( const string &name, string &errmsg, BSONObjBuilder &result ) { log(1) << "dropCollection: " << name << endl; NamespaceDetails *d = nsdetails(name.c_str()); if( d == 0 ) return; BackgroundOperation::assertNoBgOpInProgForNs(name.c_str()); if ( d->nIndexes != 0 ) { try { assert( dropIndexes(d, name.c_str(), "*", errmsg, result, true) ); } catch( DBException& e ) { stringstream ss; ss << "drop: dropIndexes for collection failed - consider trying repair "; ss << " cause: " << e.what(); uasserted(12503,ss.str()); } assert( d->nIndexes == 0 ); } log(1) << "\t dropIndexes done" << endl; result.append("ns", name.c_str()); ClientCursor::invalidate(name.c_str()); Top::global.collectionDropped( name ); NamespaceDetailsTransient::eraseForPrefix( name.c_str() ); dropNS(name); } /* unindex all keys in index for this record. */ static void _unindexRecord(IndexDetails& id, BSONObj& obj, const DiskLoc& dl, bool logMissing = true) { BSONObjSet keys; id.getKeysFromObject(obj, keys); IndexInterface& ii = id.idxInterface(); for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) { BSONObj j = *i; bool ok = false; try { ok = ii.unindex(id.head, id, j, dl); } catch (AssertionException& e) { problem() << "Assertion failure: _unindex failed " << id.indexNamespace() << endl; out() << "Assertion failure: _unindex failed: " << e.what() << '\n'; out() << " obj:" << obj.toString() << '\n'; out() << " key:" << j.toString() << '\n'; out() << " dl:" << dl.toString() << endl; sayDbContext(); } if ( !ok && logMissing ) { log() << "unindex failed (key too big?) " << id.indexNamespace() << " key: " << j << " " << obj["_id"] << endl; } } } /* unindex all keys in all indexes for this record. 
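       if a background index build is in progress, the extra in-progress index (slot nIndexes)
       is unindexed as well, with missing-key warnings suppressed since it may not contain
       this record yet.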
*/ static void unindexRecord(NamespaceDetails *d, Record *todelete, const DiskLoc& dl, bool noWarn = false) { BSONObj obj(todelete); int n = d->nIndexes; for ( int i = 0; i < n; i++ ) _unindexRecord(d->idx(i), obj, dl, !noWarn); if( d->indexBuildInProgress ) { // background index // always pass nowarn here, as this one may be missing for valid reasons as we are concurrently building it _unindexRecord(d->idx(n), obj, dl, false); } } /* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc. caller must check if capped */ void DataFileMgr::_deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl) { /* remove ourself from the record next/prev chain */ { if ( todelete->prevOfs != DiskLoc::NullOfs ) getDur().writingInt( todelete->getPrev(dl).rec()->nextOfs ) = todelete->nextOfs; if ( todelete->nextOfs != DiskLoc::NullOfs ) getDur().writingInt( todelete->getNext(dl).rec()->prevOfs ) = todelete->prevOfs; } /* remove ourself from extent pointers */ { Extent *e = getDur().writing( todelete->myExtent(dl) ); if ( e->firstRecord == dl ) { if ( todelete->nextOfs == DiskLoc::NullOfs ) e->firstRecord.Null(); else e->firstRecord.set(dl.a(), todelete->nextOfs); } if ( e->lastRecord == dl ) { if ( todelete->prevOfs == DiskLoc::NullOfs ) e->lastRecord.Null(); else e->lastRecord.set(dl.a(), todelete->prevOfs); } } /* add to the free list */ { { NamespaceDetails::Stats *s = getDur().writing(&d->stats); s->datasize -= todelete->netLength(); s->nrecords--; } if ( strstr(ns, ".system.indexes") ) { /* temp: if in system.indexes, don't reuse, and zero out: we want to be careful until validated more, as IndexDetails has pointers to this disk location. so an incorrectly done remove would cause a lot of problems. */ memset(getDur().writingPtr(todelete, todelete->lengthWithHeaders), 0, todelete->lengthWithHeaders); } else { DEV { unsigned long long *p = (unsigned long long *) todelete->data; *getDur().writing(p) = 0; //DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse. } d->addDeletedRec((DeletedRecord*)todelete, dl); } } } void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn, bool doLog ) { dassert( todelete == dl.rec() ); NamespaceDetails* d = nsdetails(ns); if ( d->capped && !cappedOK ) { out() << "failing remove on a capped ns " << ns << endl; uassert( 10089 , "can't remove from a capped collection" , 0 ); return; } BSONObj toDelete; if ( doLog ) { BSONElement e = dl.obj()["_id"]; if ( e.type() ) { toDelete = e.wrap(); } } /* check if any cursors point to us. if so, advance them. */ ClientCursor::aboutToDelete(dl); unindexRecord(d, todelete, dl, noWarn); _deleteRecord(d, ns, todelete, dl); NamespaceDetailsTransient::get_w( ns ).notifyOfWriteOp(); if ( ! toDelete.isEmpty() ) { logOp( "d" , ns , toDelete ); } } /** Note: if the object shrinks a lot, we don't free up space, we leave extra at end of the record. */ const DiskLoc DataFileMgr::updateRecord( const char *ns, NamespaceDetails *d, NamespaceDetailsTransient *nsdt, Record *toupdate, const DiskLoc& dl, const char *_buf, int _len, OpDebug& debug, bool god) { dassert( toupdate == dl.rec() ); BSONObj objOld(toupdate); BSONObj objNew(_buf); DEV assert( objNew.objsize() == _len ); DEV assert( objNew.objdata() == _buf ); if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) { /* add back the old _id value if the update removes it. 
Note this implementation is slow (copies entire object multiple times), but this shouldn't happen often, so going for simple code, not speed. */ BSONObjBuilder b; BSONElement e; assert( objOld.getObjectID(e) ); b.append(e); // put _id first, for best performance b.appendElements(objNew); objNew = b.obj(); } /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further below. that is suboptimal, but it's pretty complicated to do it the other way without rollbacks... */ vector changes; bool changedId = false; getIndexChanges(changes, *d, objNew, objOld, changedId); uassert( 13596 , str::stream() << "cannot change _id of a document old:" << objOld << " new:" << objNew , ! changedId ); dupCheck(changes, *d, dl); if ( toupdate->netLength() < objNew.objsize() ) { // doesn't fit. reallocate ----------------------------------------------------- uassert( 10003 , "failing update: objects in a capped ns cannot grow", !(d && d->capped)); d->paddingTooSmall(); debug.moved = true; deleteRecord(ns, toupdate, dl); return insert(ns, objNew.objdata(), objNew.objsize(), god); } nsdt->notifyOfWriteOp(); d->paddingFits(); /* have any index keys changed? */ { unsigned keyUpdates = 0; int z = d->nIndexesBeingBuilt(); for ( int x = 0; x < z; x++ ) { IndexDetails& idx = d->idx(x); IndexInterface& ii = idx.idxInterface(); for ( unsigned i = 0; i < changes[x].removed.size(); i++ ) { try { bool found = ii.unindex(idx.head, idx, *changes[x].removed[i], dl); if ( ! found ) { RARELY warning() << "ns: " << ns << " couldn't unindex key: " << *changes[x].removed[i] << " for doc: " << objOld["_id"] << endl; } } catch (AssertionException&) { debug.extra << " exception update unindex "; problem() << " caught assertion update unindex " << idx.indexNamespace() << endl; } } assert( !dl.isNull() ); BSONObj idxKey = idx.info.obj().getObjectField("key"); Ordering ordering = Ordering::make(idxKey); keyUpdates += changes[x].added.size(); for ( unsigned i = 0; i < changes[x].added.size(); i++ ) { try { /* we did the dupCheck() above. so we don't have to worry about it here. */ ii.bt_insert( idx.head, dl, *changes[x].added[i], ordering, /*dupsAllowed*/true, idx); } catch (AssertionException& e) { debug.extra << " exception update index "; problem() << " caught assertion update index " << idx.indexNamespace() << " " << e << " " << objNew["_id"] << endl; } } } debug.keyUpdates = keyUpdates; } // update in place int sz = objNew.objsize(); memcpy(getDur().writingPtr(toupdate->data, sz), objNew.objdata(), sz); return dl; } int Extent::followupSize(int len, int lastExtentLen) { assert( len < Extent::maxSize() ); int x = initialSize(len); int y = (int) (lastExtentLen < 4000000 ? lastExtentLen * 4.0 : lastExtentLen * 1.2); int sz = y > x ? 
y : x;
        if ( sz < lastExtentLen ) {
            // this means there was an int overflow
            // so we should turn it into maxSize
            sz = Extent::maxSize();
        }
        else if ( sz > Extent::maxSize() ) {
            sz = Extent::maxSize();
        }

        sz = ((int)sz) & 0xffffff00;
        assert( sz > len );

        return sz;
    }

    /* add keys to index idxNo for a new record */
    static inline void _indexRecord(NamespaceDetails *d, int idxNo, BSONObj& obj, DiskLoc recordLoc, bool dupsAllowed) {
        IndexDetails& idx = d->idx(idxNo);
        BSONObjSet keys;
        idx.getKeysFromObject(obj, keys);
        if( keys.empty() )
            return;
        BSONObj order = idx.keyPattern();
        IndexInterface& ii = idx.idxInterface();
        Ordering ordering = Ordering::make(order);
        int n = 0;
        for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) {
            if( ++n == 2 ) {
                d->setIndexIsMultikey(idxNo);
            }
            assert( !recordLoc.isNull() );
            try {
                ii.bt_insert(idx.head, recordLoc, *i, ordering, dupsAllowed, idx);
            }
            catch (AssertionException& e) {
                if( e.getCode() == 10287 && idxNo == d->nIndexes ) {
                    DEV log() << "info: caught key already in index on bg indexing (ok)" << endl;
                    continue;
                }
                if( !dupsAllowed ) {
                    // dup key exception, presumably.
                    throw;
                }
                problem() << " caught assertion _indexRecord " << idx.indexNamespace() << " " << obj["_id"] << endl;
            }
        }
    }

#if 0
    void testSorting() {
        BSONObjBuilder b;
        b.appendNull("");
        BSONObj x = b.obj();

        BSONObjExternalSorter sorter(*IndexDetails::iis[1]);

        sorter.add(x, DiskLoc(3,7));
        sorter.add(x, DiskLoc(4,7));
        sorter.add(x, DiskLoc(2,7));
        sorter.add(x, DiskLoc(1,7));
        sorter.add(x, DiskLoc(3,77));

        sorter.sort();

        auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
        while( i->more() ) {
            BSONObjExternalSorter::Data d = i->next();
            /*cout << d.second.toString() << endl;
            cout << d.first.objsize() << endl;
            cout<<"SORTER next:" << d.first.toString() << endl;*/
        }
    }
#endif

    SortPhaseOne *precalced = 0;

    template< class V >
    void buildBottomUpPhases2And3(bool dupsAllowed, IndexDetails& idx, BSONObjExternalSorter& sorter,
                                  bool dropDups, list<DiskLoc> &dupsToDrop, CurOp * op, SortPhaseOne *phase1,
                                  ProgressMeterHolder &pm, Timer& t ) {
        BtreeBuilder<V> btBuilder(dupsAllowed, idx);
        BSONObj keyLast;
        auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
        assert( pm == op->setMessage( "index: (2/3) btree bottom up" , phase1->nkeys , 10 ) );
        while( i->more() ) {
            RARELY killCurrentOp.checkForInterrupt();
            BSONObjExternalSorter::Data d = i->next();

            try {
                if ( !dupsAllowed && dropDups ) {
                    LastError::Disabled led( lastError.get() );
                    btBuilder.addKey(d.first, d.second);
                }
                else {
                    btBuilder.addKey(d.first, d.second);
                }
            }
            catch( AssertionException& e ) {
                if ( dupsAllowed ) {
                    // unknown exception??
                    throw;
                }

                if( e.interrupted() ) {
                    killCurrentOp.checkForInterrupt();
                }

                if ( ! dropDups )
                    throw;

                /* we could queue these on disk, but normally there are very few dups, so instead we
                   keep in ram and have a limit.
                */
                dupsToDrop.push_back(d.second);
                uassert( 10092 , "too many dups on index build with dropDups=true", dupsToDrop.size() < 1000000 );
            }
            pm.hit();
        }
        pm.finished();
        op->setMessage( "index: (3/3) btree-middle" );
        log(t.seconds() > 10 ? 0 : 1 ) << "\t done building bottom layer, going to commit" << endl;
        btBuilder.commit();
        if ( btBuilder.getn() != phase1->nkeys && ! dropDups ) {
            warning() << "not all entries were added to the index, probably some keys were too large" << endl;
        }
    }
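    // fastBuildIndex (below) performs the foreground ("fast") build in three phases, matching
    // the progress messages it reports:
    //   (1/3) external sort   - scan the collection and feed every index key into a BSONObjExternalSorter
    //   (2/3) btree bottom up - stream the sorted keys into a BtreeBuilder (buildBottomUpPhases2And3 above)
    //   (3/3) btree-middle    - BtreeBuilder::commit() fills in the upper levels of the tree
    // Duplicate keys on a unique index either fail the build or, with dropDups, the offending
    // documents are collected in dupsToDrop and deleted once the btree is committed.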
    // throws DBException
    unsigned long long fastBuildIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
        CurOp * op = cc().curop();

        Timer t;

        tlog(1) << "fastBuildIndex " << ns << " idxNo:" << idxNo << ' ' << idx.info.obj().toString() << endl;

        bool dupsAllowed = !idx.unique();
        bool dropDups = idx.dropDups() || inDBRepair;
        BSONObj order = idx.keyPattern();

        getDur().writingDiskLoc(idx.head).Null();

        if ( logLevel > 1 ) printMemInfo( "before index start" );

        /* get and sort all the keys ----- */
        ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->stats.nrecords , 10 ) );
        SortPhaseOne _ours;
        SortPhaseOne *phase1 = precalced;
        if( phase1 == 0 ) {
            phase1 = &_ours;
            SortPhaseOne& p1 = *phase1;
            shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
            p1.sorter.reset( new BSONObjExternalSorter(idx.idxInterface(), order) );
            p1.sorter->hintNumObjects( d->stats.nrecords );
            const IndexSpec& spec = idx.getSpec();
            while ( c->ok() ) {
                BSONObj o = c->current();
                DiskLoc loc = c->currLoc();
                p1.addKeys(spec, o, loc);
                c->advance();
                pm.hit();
                if ( logLevel > 1 && p1.n % 10000 == 0 ) {
                    printMemInfo( "\t iterating objects" );
                }
            };
        }
        pm.finished();

        BSONObjExternalSorter& sorter = *(phase1->sorter);

        if( phase1->multi )
            d->setIndexIsMultikey(idxNo);

        if ( logLevel > 1 ) printMemInfo( "before final sort" );
        phase1->sorter->sort();
        if ( logLevel > 1 ) printMemInfo( "after final sort" );

        log(t.seconds() > 5 ? 0 : 1) << "\t external sort used : " << sorter.numFiles() << " files " << " in " << t.seconds() << " secs" << endl;

        list<DiskLoc> dupsToDrop;

        /* build index --- */
        if( idx.version() == 0 )
            buildBottomUpPhases2And3<V0>(dupsAllowed, idx, sorter, dropDups, dupsToDrop, op, phase1, pm, t);
        else if( idx.version() == 1 )
            buildBottomUpPhases2And3<V1>(dupsAllowed, idx, sorter, dropDups, dupsToDrop, op, phase1, pm, t);
        else
            assert(false);

        log(1) << "\t fastBuildIndex dupsToDrop:" << dupsToDrop.size() << endl;

        for( list<DiskLoc>::iterator i = dupsToDrop.begin(); i != dupsToDrop.end(); i++ ){
            theDataFileMgr.deleteRecord( ns, i->rec(), *i, false /* cappedOk */ , true /* noWarn */ , isMaster( ns ) /* logOp */ );
            getDur().commitIfNeeded();
        }

        return phase1->n;
    }

    class BackgroundIndexBuildJob : public BackgroundOperation {

        unsigned long long addExistingToIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
            bool dupsAllowed = !idx.unique();
            bool dropDups = idx.dropDups();

            ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->stats.nrecords );

            unsigned long long n = 0;
            auto_ptr<ClientCursor> cc;
            {
                shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
                cc.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, ns) );
            }
            CursorId id = cc->cursorid();

            while ( cc->ok() ) {
                BSONObj js = cc->current();
                try {
                    {
                        if ( !dupsAllowed && dropDups ) {
                            LastError::Disabled led( lastError.get() );
                            _indexRecord(d, idxNo, js, cc->currLoc(), dupsAllowed);
                        }
                        else {
                            _indexRecord(d, idxNo, js, cc->currLoc(), dupsAllowed);
                        }
                    }
                    cc->advance();
                }
                catch( AssertionException& e ) {
                    if( e.interrupted() ) {
                        killCurrentOp.checkForInterrupt();
                    }

                    if ( dropDups ) {
                        DiskLoc toDelete = cc->currLoc();
                        bool ok = cc->advance();
                        cc->updateLocation();
                        theDataFileMgr.deleteRecord( ns, toDelete.rec(), toDelete, false, true , true );
                        if( ClientCursor::find(id, false) == 0 ) {
                            cc.release();
                            if( !ok ) {
                                /* we were already at the end. normal.
*/ } else { uasserted(12585, "cursor gone during bg index; dropDups"); } break; } } else { log() << "background addExistingToIndex exception " << e.what() << endl; throw; } } n++; progress.hit(); getDur().commitIfNeeded(); if ( cc->yieldSometimes( ClientCursor::WillNeed ) ) { progress.setTotalWhileRunning( d->stats.nrecords ); } else { cc.release(); uasserted(12584, "cursor gone during bg index"); break; } } progress.finished(); return n; } /* we do set a flag in the namespace for quick checking, but this is our authoritative info - that way on a crash/restart, we don't think we are still building one. */ set bgJobsInProgress; void prep(const char *ns, NamespaceDetails *d) { assertInWriteLock(); uassert( 13130 , "can't start bg index b/c in recursive lock (db.eval?)" , dbMutex.getState() == 1 ); bgJobsInProgress.insert(d); } void done(const char *ns, NamespaceDetails *d) { NamespaceDetailsTransient::get_w(ns).addedIndex(); // clear query optimizer cache assertInWriteLock(); } public: BackgroundIndexBuildJob(const char *ns) : BackgroundOperation(ns) { } unsigned long long go(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) { unsigned long long n = 0; prep(ns.c_str(), d); assert( idxNo == d->nIndexes ); try { idx.head.writing() = idx.idxInterface().addBucket(idx); n = addExistingToIndex(ns.c_str(), d, idx, idxNo); } catch(...) { if( cc().database() && nsdetails(ns.c_str()) == d ) { assert( idxNo == d->nIndexes ); done(ns.c_str(), d); } else { log() << "ERROR: db gone during bg index?" << endl; } throw; } assert( idxNo == d->nIndexes ); done(ns.c_str(), d); return n; } }; /** * For the lifetime of this object, an index build is indicated on the specified * namespace and the newest index is marked as absent. This simplifies * the cleanup required on recovery. */ class RecoverableIndexState { public: RecoverableIndexState( NamespaceDetails *d ) : _d( d ) { indexBuildInProgress() = 1; nIndexes()--; } ~RecoverableIndexState() { DESTRUCTOR_GUARD ( nIndexes()++; indexBuildInProgress() = 0; ) } private: int &nIndexes() { return getDur().writingInt( _d->nIndexes ); } int &indexBuildInProgress() { return getDur().writingInt( _d->indexBuildInProgress ); } NamespaceDetails *_d; }; // throws DBException static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) { tlog() << "build index " << ns << ' ' << idx.keyPattern() << ( background ? " background" : "" ) << endl; Timer t; unsigned long long n; assert( !BackgroundOperation::inProgForNs(ns.c_str()) ); // should have been checked earlier, better not be... assert( d->indexBuildInProgress == 0 ); assertInWriteLock(); RecoverableIndexState recoverable( d ); // Build index spec here in case the collection is empty and the index details are invalid idx.getSpec(); if( inDBRepair || !background ) { n = fastBuildIndex(ns.c_str(), d, idx, idxNo); assert( !idx.head.isNull() ); } else { BackgroundIndexBuildJob j(ns.c_str()); n = j.go(ns, d, idx, idxNo); } tlog() << "build index done " << n << " records " << t.millis() / 1000.0 << " secs" << endl; } /* add keys to indexes for a new record */ static void indexRecord(NamespaceDetails *d, BSONObj obj, DiskLoc loc) { int n = d->nIndexesBeingBuilt(); for ( int i = 0; i < n; i++ ) { try { bool unique = d->idx(i).unique(); _indexRecord(d, i, obj, loc, /*dupsAllowed*/!unique); } catch( DBException& ) { /* try to roll back previously added index entries note <= i (not < i) is important here as the index we were just attempted may be multikey and require some cleanup. 
*/ for( int j = 0; j <= i; j++ ) { try { _unindexRecord(d->idx(j), obj, loc, false); } catch(...) { log(3) << "unindex fails on rollback after unique failure\n"; } } throw; } } } extern BSONObj id_obj; // { _id : 1 } void ensureHaveIdIndex(const char *ns) { NamespaceDetails *d = nsdetails(ns); if ( d == 0 || (d->flags & NamespaceDetails::Flag_HaveIdIndex) ) return; *getDur().writing(&d->flags) |= NamespaceDetails::Flag_HaveIdIndex; { NamespaceDetails::IndexIterator i = d->ii(); while( i.more() ) { if( i.next().isIdIndex() ) return; } } string system_indexes = cc().database()->name + ".system.indexes"; BSONObjBuilder b; b.append("name", "_id_"); b.append("ns", ns); b.append("key", id_obj); BSONObj o = b.done(); /* edge case: note the insert could fail if we have hit maxindexes already */ theDataFileMgr.insert(system_indexes.c_str(), o.objdata(), o.objsize(), true); } #pragma pack(1) struct IDToInsert_ { char type; char _id[4]; OID oid; IDToInsert_() { type = (char) jstOID; strcpy(_id, "_id"); assert( sizeof(IDToInsert_) == 17 ); } } idToInsert_; struct IDToInsert : public BSONElement { IDToInsert() : BSONElement( ( char * )( &idToInsert_ ) ) {} } idToInsert; #pragma pack() void DataFileMgr::insertAndLog( const char *ns, const BSONObj &o, bool god ) { BSONObj tmp = o; insertWithObjMod( ns, tmp, god ); logOp( "i", ns, tmp ); } /** @param o the object to insert. can be modified to add _id and thus be an in/out param */ DiskLoc DataFileMgr::insertWithObjMod(const char *ns, BSONObj &o, bool god) { bool addedID = false; DiskLoc loc = insert( ns, o.objdata(), o.objsize(), god, true, &addedID ); if( addedID && !loc.isNull() ) o = BSONObj( loc.rec() ); return loc; } bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject ); // We are now doing two btree scans for all unique indexes (one here, and one when we've // written the record to the collection. This could be made more efficient inserting // dummy data here, keeping pointers to the btree nodes holding the dummy data and then // updating the dummy data with the DiskLoc of the real record. void checkNoIndexConflicts( NamespaceDetails *d, const BSONObj &obj ) { for ( int idxNo = 0; idxNo < d->nIndexes; idxNo++ ) { if( d->idx(idxNo).unique() ) { IndexDetails& idx = d->idx(idxNo); BSONObjSet keys; idx.getKeysFromObject(obj, keys); BSONObj order = idx.keyPattern(); IndexInterface& ii = idx.idxInterface(); for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) { // WARNING: findSingle may not be compound index safe. this may need to change. see notes in // findSingle code. uassert( 12582, "duplicate key insert for unique index of capped collection", ii.findSingle(idx, idx.head, *i ).isNull() ); } } } } /** add a record to the end of the linked list chain within this extent. require: you must have already declared write intent for the record header. 
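        records within an extent form a doubly linked list via Record::prevOfs / Record::nextOfs,
        with Extent::firstRecord / Extent::lastRecord as the list heads and DiskLoc::NullOfs
        marking the ends.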
*/ void addRecordToRecListInExtent(Record *r, DiskLoc loc) { dassert( loc.rec() == r ); Extent *e = r->myExtent(loc); if ( e->lastRecord.isNull() ) { Extent::FL *fl = getDur().writing(e->fl()); fl->firstRecord = fl->lastRecord = loc; r->prevOfs = r->nextOfs = DiskLoc::NullOfs; } else { Record *oldlast = e->lastRecord.rec(); r->prevOfs = e->lastRecord.getOfs(); r->nextOfs = DiskLoc::NullOfs; getDur().writingInt(oldlast->nextOfs) = loc.getOfs(); getDur().writingDiskLoc(e->lastRecord) = loc; } } NOINLINE_DECL DiskLoc outOfSpace(const char *ns, NamespaceDetails *d, int lenWHdr, bool god, DiskLoc extentLoc) { DiskLoc loc; if ( d->capped == 0 ) { // size capped doesn't grow log(1) << "allocating new extent for " << ns << " padding:" << d->paddingFactor << " lenWHdr: " << lenWHdr << endl; cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false, !god); loc = d->alloc(ns, lenWHdr, extentLoc); if ( loc.isNull() ) { log() << "warning: alloc() failed after allocating new extent. lenWHdr: " << lenWHdr << " last extent size:" << d->lastExtentSize << "; trying again\n"; for ( int z=0; z<10 && lenWHdr > d->lastExtentSize; z++ ) { log() << "try #" << z << endl; cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false, !god); loc = d->alloc(ns, lenWHdr, extentLoc); if ( ! loc.isNull() ) break; } } } return loc; } /** used by insert and also compact * @return null loc if out of space */ DiskLoc allocateSpaceForANewRecord(const char *ns, NamespaceDetails *d, int lenWHdr, bool god) { DiskLoc extentLoc; DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc); if ( loc.isNull() ) { loc = outOfSpace(ns, d, lenWHdr, god, extentLoc); } return loc; } bool NOINLINE_DECL insert_checkSys(const char *sys, const char *ns, bool& wouldAddIndex, const void *obuf, bool god) { uassert( 10095 , "attempt to insert in reserved database name 'system'", sys != ns); if ( strstr(ns, ".system.") ) { // later:check for dba-type permissions here if have that at some point separate if ( strstr(ns, ".system.indexes" ) ) wouldAddIndex = true; else if ( legalClientSystemNS( ns , true ) ) { if ( obuf && strstr( ns , ".system.users" ) ) { BSONObj t( reinterpret_cast( obuf ) ); uassert( 14051 , "system.user entry needs 'user' field to be a string" , t["user"].type() == String ); uassert( 14052 , "system.user entry needs 'pwd' field to be a string" , t["pwd"].type() == String ); uassert( 14053 , "system.user entry needs 'user' field to be non-empty" , t["user"].String().size() ); uassert( 14054 , "system.user entry needs 'pwd' field to be non-empty" , t["pwd"].String().size() ); } } else if ( !god ) { // todo this should probably uasseert rather than doing this: log() << "ERROR: attempt to insert in system namespace " << ns << endl; return false; } } return true; } NOINLINE_DECL NamespaceDetails* insert_newNamespace(const char *ns, int len, bool god) { addNewNamespaceToCatalog(ns); /* todo: shouldn't be in the namespace catalog until after the allocations here work. also if this is an addIndex, those checks should happen before this! */ // This may create first file in the database. int ies = Extent::initialSize(len); if( str::contains(ns, '$') && len + Record::HeaderSize >= BtreeData_V1::BucketSize - 256 && len + Record::HeaderSize <= BtreeData_V1::BucketSize + 256 ) { // probably an index. so we pick a value here for the first extent instead of using initialExtentSize() which is more // for user collections. 
TODO: we could look at the # of records in the parent collection to be smarter here. ies = (32+4) * 1024; } cc().database()->allocExtent(ns, ies, false, false); NamespaceDetails *d = nsdetails(ns); if ( !god ) ensureIdIndexForNewNs(ns); return d; } void NOINLINE_DECL insert_makeIndex(NamespaceDetails *tableToIndex, const string& tabletoidxns, const DiskLoc& loc) { uassert( 13143 , "can't create index on system.indexes" , tabletoidxns.find( ".system.indexes" ) == string::npos ); BSONObj info = loc.obj(); bool background = info["background"].trueValue(); if( background && cc().isSyncThread() ) { /* don't do background indexing on slaves. there are nuances. this could be added later but requires more code. */ log() << "info: indexing in foreground on this replica; was a background index build on the primary" << endl; background = false; } int idxNo = tableToIndex->nIndexes; IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str(), !background); // clear transient info caches so they refresh; increments nIndexes getDur().writingDiskLoc(idx.info) = loc; try { buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo, background); } catch( DBException& e ) { // save our error msg string as an exception or dropIndexes will overwrite our message LastError *le = lastError.get(); int savecode = 0; string saveerrmsg; if ( le ) { savecode = le->code; saveerrmsg = le->msg; } else { savecode = e.getCode(); saveerrmsg = e.what(); } // roll back this index string name = idx.indexName(); BSONObjBuilder b; string errmsg; bool ok = dropIndexes(tableToIndex, tabletoidxns.c_str(), name.c_str(), errmsg, b, true); if( !ok ) { log() << "failed to drop index after a unique key error building it: " << errmsg << ' ' << tabletoidxns << ' ' << name << endl; } assert( le && !saveerrmsg.empty() ); raiseError(savecode,saveerrmsg.c_str()); throw; } } /* if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc after the call -- that will prevent a double buffer copy in some cases (btree.cpp). @param mayAddIndex almost always true, except for invocation from rename namespace command. @param addedID if not null, set to true if adding _id element. you must assure false before calling if using. */ DiskLoc DataFileMgr::insert(const char *ns, const void *obuf, int len, bool god, bool mayAddIndex, bool *addedID) { bool wouldAddIndex = false; massert( 10093 , "cannot insert into reserved $ collection", god || NamespaceString::normal( ns ) ); uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) ); { const char *sys = strstr(ns, "system."); if ( sys && !insert_checkSys(sys, ns, wouldAddIndex, obuf, god) ) return DiskLoc(); } bool addIndex = wouldAddIndex && mayAddIndex; NamespaceDetails *d = nsdetails(ns); if ( d == 0 ) { d = insert_newNamespace(ns, len, god); } d->paddingFits(); NamespaceDetails *tableToIndex = 0; string tabletoidxns; BSONObj fixedIndexObject; if ( addIndex ) { assert( obuf ); BSONObj io((const char *) obuf); if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex, fixedIndexObject ) ) { // prepare creates _id itself, or this indicates to fail the build silently (such // as if index already exists) return DiskLoc(); } if ( ! fixedIndexObject.isEmpty() ) { obuf = fixedIndexObject.objdata(); len = fixedIndexObject.objsize(); } } int addID = 0; // 0 if not adding _id; if adding, the length of that new element if( !god ) { /* Check if we have an _id field. If we don't, we'll add it. Note that btree buckets which we insert aren't BSONObj's, but in that case god==true. 
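           When we do add it, the generated ObjectId is spliced in as the first element of the
           stored object (see IDToInsert above); collections without an _id index are left
           unchanged.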
*/ BSONObj io((const char *) obuf); BSONElement idField = io.getField( "_id" ); uassert( 10099 , "_id cannot be an array", idField.type() != Array ); // we don't add _id for capped collections as they don't have an _id index if( idField.eoo() && !wouldAddIndex && strstr(ns, ".local.") == 0 && d->haveIdIndex() ) { if( addedID ) *addedID = true; addID = len; idToInsert_.oid.init(); len += idToInsert.size(); } BSONElementManipulator::lookForTimestamps( io ); } int lenWHdr = len + Record::HeaderSize; lenWHdr = (int) (lenWHdr * d->paddingFactor); if ( lenWHdr == 0 ) { // old datafiles, backward compatible here. assert( d->paddingFactor == 0 ); *getDur().writing(&d->paddingFactor) = 1.0; lenWHdr = len + Record::HeaderSize; } // If the collection is capped, check if the new object will violate a unique index // constraint before allocating space. if ( d->nIndexes && d->capped && !god ) { checkNoIndexConflicts( d, BSONObj( reinterpret_cast( obuf ) ) ); } DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god); if ( loc.isNull() ) { log() << "insert: couldn't alloc space for object ns:" << ns << " capped:" << d->capped << endl; assert(d->capped); return DiskLoc(); } Record *r = loc.rec(); { assert( r->lengthWithHeaders >= lenWHdr ); r = (Record*) getDur().writingPtr(r, lenWHdr); if( addID ) { /* a little effort was made here to avoid a double copy when we add an ID */ ((int&)*r->data) = *((int*) obuf) + idToInsert.size(); memcpy(r->data+4, idToInsert.rawdata(), idToInsert.size()); memcpy(r->data+4+idToInsert.size(), ((char *)obuf)+4, addID-4); } else { if( obuf ) // obuf can be null from internal callers memcpy(r->data, obuf, len); } } addRecordToRecListInExtent(r, loc); /* durability todo : this could be a bit annoying / slow to record constantly */ { NamespaceDetails::Stats *s = getDur().writing(&d->stats); s->datasize += r->netLength(); s->nrecords++; } // we don't bother resetting query optimizer stats for the god tables - also god is true when adding a btree bucket if ( !god ) NamespaceDetailsTransient::get_w( ns ).notifyOfWriteOp(); if ( tableToIndex ) { insert_makeIndex(tableToIndex, tabletoidxns, loc); } /* add this record to our indexes */ if ( d->nIndexes ) { try { BSONObj obj(r->data); indexRecord(d, obj, loc); } catch( AssertionException& e ) { // should be a dup key error on _id index if( tableToIndex || d->capped ) { massert( 12583, "unexpected index insertion failure on capped collection", !d->capped ); string s = e.toString(); s += " : on addIndex/capped - collection and its index will not match"; uassert_nothrow(s.c_str()); error() << s << endl; } else { // normal case -- we can roll back _deleteRecord(d, ns, r, loc); throw; } } } return loc; } /* special version of insert for transaction logging -- streamlined a bit. 
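       it allocates space, links the record at the tail of the extent's record chain and updates
       the collection stats, but skips the _id handling and index maintenance of the general
       insert path.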
       assumes ns is capped and no indexes
    */
    Record* DataFileMgr::fast_oplog_insert(NamespaceDetails *d, const char *ns, int len) {
        assert( d );
        RARELY assert( d == nsdetails(ns) );
        DEV assert( d == nsdetails(ns) );

        DiskLoc extentLoc;
        int lenWHdr = len + Record::HeaderSize;
        DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc);
        assert( !loc.isNull() );

        Record *r = loc.rec();
        assert( r->lengthWithHeaders >= lenWHdr );

        Extent *e = r->myExtent(loc);
        if ( e->lastRecord.isNull() ) {
            Extent::FL *fl = getDur().writing( e->fl() );
            fl->firstRecord = fl->lastRecord = loc;

            Record::NP *np = getDur().writing(r->np());
            np->nextOfs = np->prevOfs = DiskLoc::NullOfs;
        }
        else {
            Record *oldlast = e->lastRecord.rec();
            Record::NP *np = getDur().writing(r->np());
            np->prevOfs = e->lastRecord.getOfs();
            np->nextOfs = DiskLoc::NullOfs;
            getDur().writingInt( oldlast->nextOfs ) = loc.getOfs();
            e->lastRecord.writing() = loc;
        }

        /* todo: don't update for oplog? seems wasteful. */
        {
            NamespaceDetails::Stats *s = getDur().writing(&d->stats);
            s->datasize += r->netLength();
            s->nrecords++;
        }

        return r;
    }

} // namespace mongo

#include "clientcursor.h"

namespace mongo {

    void dropAllDatabasesExceptLocal() {
        writelock lk("");

        vector<string> n;
        getDatabaseNames(n);
        if( n.size() == 0 ) return;
        log() << "dropAllDatabasesExceptLocal " << n.size() << endl;
        for( vector<string>::iterator i = n.begin(); i != n.end(); i++ ) {
            if( *i != "local" ) {
                Client::Context ctx(*i);
                dropDatabase(*i);
            }
        }
    }

    void dropDatabase(string db) {
        log(1) << "dropDatabase " << db << endl;
        Database *d = cc().database();
        assert( d );
        assert( d->name == db );

        BackgroundOperation::assertNoBgOpInProgForDb(d->name.c_str());

        dbMutex.assertWriteLocked();

        // Not sure we need this here, so removed. If we do, we need to move it down
        // within other calls both (1) as they could be called from elsewhere and
        // (2) to keep the lock order right - groupcommitmutex must be locked before
        // mmmutex (if both are locked).
// // RWLockRecursive::Exclusive lk(MongoFile::mmmutex); getDur().syncDataAndTruncateJournal(); Database::closeDatabase( d->name.c_str(), d->path ); d = 0; // d is now deleted _deleteDataFiles( db.c_str() ); } typedef boost::filesystem::path Path; void boostRenameWrapper( const Path &from, const Path &to ) { try { boost::filesystem::rename( from, to ); } catch ( const boost::filesystem::filesystem_error & ) { // boost rename doesn't work across partitions boost::filesystem::copy_file( from, to); boost::filesystem::remove( from ); } } // back up original database files to 'temp' dir void _renameForBackup( const char *database, const Path &reservedPath ) { Path newPath( reservedPath ); if ( directoryperdb ) newPath /= database; class Renamer : public FileOp { public: Renamer( const Path &newPath ) : newPath_( newPath ) {} private: const boost::filesystem::path &newPath_; virtual bool apply( const Path &p ) { if ( !boost::filesystem::exists( p ) ) return false; boostRenameWrapper( p, newPath_ / ( p.leaf() + ".bak" ) ); return true; } virtual const char * op() const { return "renaming"; } } renamer( newPath ); _applyOpToDataFiles( database, renamer, true ); } // move temp files to standard data dir void _replaceWithRecovered( const char *database, const char *reservedPathString ) { Path newPath( dbpath ); if ( directoryperdb ) newPath /= database; class Replacer : public FileOp { public: Replacer( const Path &newPath ) : newPath_( newPath ) {} private: const boost::filesystem::path &newPath_; virtual bool apply( const Path &p ) { if ( !boost::filesystem::exists( p ) ) return false; boostRenameWrapper( p, newPath_ / p.leaf() ); return true; } virtual const char * op() const { return "renaming"; } } replacer( newPath ); _applyOpToDataFiles( database, replacer, true, reservedPathString ); } // generate a directory name for storing temp data files Path uniqueReservedPath( const char *prefix ) { Path repairPath = Path( repairpath ); Path reservedPath; int i = 0; bool exists = false; do { stringstream ss; ss << prefix << "_repairDatabase_" << i++; reservedPath = repairPath / ss.str(); BOOST_CHECK_EXCEPTION( exists = boost::filesystem::exists( reservedPath ) ); } while ( exists ); return reservedPath; } boost::intmax_t dbSize( const char *database ) { class SizeAccumulator : public FileOp { public: SizeAccumulator() : totalSize_( 0 ) {} boost::intmax_t size() const { return totalSize_; } private: virtual bool apply( const boost::filesystem::path &p ) { if ( !boost::filesystem::exists( p ) ) return false; totalSize_ += boost::filesystem::file_size( p ); return true; } virtual const char *op() const { return "checking size"; } boost::intmax_t totalSize_; }; SizeAccumulator sa; _applyOpToDataFiles( database, sa ); return sa.size(); } bool repairDatabase( string dbNameS , string &errmsg, bool preserveClonedFilesOnFailure, bool backupOriginalFiles ) { doingRepair dr; dbNameS = nsToDatabase( dbNameS ); const char * dbName = dbNameS.c_str(); stringstream ss; ss << "localhost:" << cmdLine.port; string localhost = ss.str(); problem() << "repairDatabase " << dbName << endl; assert( cc().database()->name == dbName ); assert( cc().database()->path == dbpath ); BackgroundOperation::assertNoBgOpInProgForDb(dbName); getDur().syncDataAndTruncateJournal(); // Must be done before and after repair boost::intmax_t totalSize = dbSize( dbName ); boost::intmax_t freeSize = File::freeSpace(repairpath); if ( freeSize > -1 && freeSize < totalSize ) { stringstream ss; ss << "Cannot repair database " << dbName << " having size: 
" << totalSize << " (bytes) because free disk space is: " << freeSize << " (bytes)"; errmsg = ss.str(); problem() << errmsg << endl; return false; } Path reservedPath = uniqueReservedPath( ( preserveClonedFilesOnFailure || backupOriginalFiles ) ? "backup" : "$tmp" ); BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( reservedPath ) ); string reservedPathString = reservedPath.native_directory_string(); bool res; { // clone to temp location, which effectively does repair Client::Context ctx( dbName, reservedPathString ); assert( ctx.justCreated() ); res = cloneFrom(localhost.c_str(), errmsg, dbName, /*logForReplication=*/false, /*slaveok*/false, /*replauth*/false, /*snapshot*/false, /*mayYield*/false, /*mayBeInterrupted*/true); Database::closeDatabase( dbName, reservedPathString.c_str() ); } if ( !res ) { errmsg = str::stream() << "clone failed for " << dbName << " with error: " << errmsg; problem() << errmsg << endl; if ( !preserveClonedFilesOnFailure ) BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) ); getDur().syncDataAndTruncateJournal(); // Must be done before and after repair return false; } MongoFile::flushAll(true); Client::Context ctx( dbName ); Database::closeDatabase( dbName, dbpath ); if ( backupOriginalFiles ) { _renameForBackup( dbName, reservedPath ); } else { _deleteDataFiles( dbName ); BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( Path( dbpath ) / dbName ) ); } _replaceWithRecovered( dbName, reservedPathString.c_str() ); if ( !backupOriginalFiles ) BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) ); getDur().syncDataAndTruncateJournal(); // Must be done before and after repair return true; } void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator, const string& path ) { if ( afterAllocator ) FileAllocator::get()->waitUntilFinished(); string c = database; c += '.'; boost::filesystem::path p(path); if ( directoryperdb ) p /= database; boost::filesystem::path q; q = p / (c+"ns"); bool ok = false; BOOST_CHECK_EXCEPTION( ok = fo.apply( q ) ); if ( ok ) log(2) << fo.op() << " file " << q.string() << endl; int i = 0; int extra = 10; // should not be necessary, this is defensive in case there are missing files while ( 1 ) { assert( i <= DiskLoc::MaxFiles ); stringstream ss; ss << c << i; q = p / ss.str(); BOOST_CHECK_EXCEPTION( ok = fo.apply(q) ); if ( ok ) { if ( extra != 10 ) { log(1) << fo.op() << " file " << q.string() << endl; log() << " _applyOpToDataFiles() warning: extra == " << extra << endl; } } else if ( --extra <= 0 ) break; i++; } } NamespaceDetails* nsdetails_notinline(const char *ns) { return nsdetails(ns); } bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ) { log() << "DatabaseHolder::closeAll path:" << path << endl; dbMutex.assertWriteLocked(); map& m = _paths[path]; _size -= m.size(); set< string > dbs; for ( map::iterator i = m.begin(); i != m.end(); i++ ) { wassert( i->second->path == path ); dbs.insert( i->first ); } currentClient.get()->getContext()->clear(); BSONObjBuilder bb( result.subarrayStart( "dbs" ) ); int n = 0; int nNotClosed = 0; for( set< string >::iterator i = dbs.begin(); i != dbs.end(); ++i ) { string name = *i; log(2) << "DatabaseHolder::closeAll path:" << path << " name:" << name << endl; Client::Context ctx( name , path ); if( !force && BackgroundOperation::inProgForDb(name.c_str()) ) { log() << "WARNING: can't close database " << name << " because a bg job is in progress - try killOp command" << endl; nNotClosed++; } 
else { Database::closeDatabase( name.c_str() , path ); bb.append( bb.numStr( n++ ) , name ); } } bb.done(); if( nNotClosed ) result.append("nNotClosed", nNotClosed); else { ClientCursor::assertNoCursors(); } return true; } } // namespace mongo