/**
 *    Copyright (C) 2008 10gen Inc.
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU Affero General Public License, version 3,
 *    as published by the Free Software Foundation.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU Affero General Public License for more details.
 *
 *    You should have received a copy of the GNU Affero General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/* pdfile.h

   Files:
     database.ns - namespace index
     database.1  - data files
     database.2
     ...
*/

#pragma once

#include "../pch.h"
#include "../util/mmap.h"
#include "diskloc.h"
#include "jsobjmanipulator.h"
#include "namespace-inl.h"
#include "client.h"
#include "mongommf.h"

namespace mongo {

    class DataFileHeader;
    class Extent;
    class Record;
    class Cursor;
    class OpDebug;

    void dropDatabase(string db);
    bool repairDatabase(string db, string &errmsg, bool preserveClonedFilesOnFailure = false, bool backupOriginalFiles = false);

    /* low level - only drops this ns */
    void dropNS(const string& dropNs);

    /* deletes this ns, indexes and cursors */
    void dropCollection( const string &name, string &errmsg, BSONObjBuilder &result );

    bool userCreateNS(const char *ns, BSONObj j, string& err, bool logForReplication, bool *deferIdIndex = 0);

    shared_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc=DiskLoc());

    bool isValidNS( const StringData& ns );

    /*---------------------------------------------------------------------*/

    class MongoDataFile {
        friend class DataFileMgr;
        friend class BasicCursor;
    public:
        MongoDataFile(int fn) : _mb(0), fileNo(fn) { }

        /** @return true if found and opened. if uninitialized (prealloc only) does not open. */
        bool openExisting( const char *filename );

        /** creates if DNE */
        void open(const char *filename, int requestedDataSize = 0, bool preallocateOnly = false);

        /* allocate a new extent from this datafile.
           @param capped - true if capped collection
           @param loops is our recursion check variable - you want to pass in zero
        */
        Extent* createExtent(const char *ns, int approxSize, bool capped = false, int loops = 0);

        DataFileHeader *getHeader() { return header(); }

        unsigned long long length() const { return mmf.length(); }

        /* return max size an extent may be */
        static int maxSize();

        /** fsync */
        void flush( bool sync );

        /** only use for debugging */
        Extent* debug_getExtent(DiskLoc loc) { return _getExtent( loc ); }

    private:
        void badOfs(int) const;
        void badOfs2(int) const;
        int defaultSize( const char *filename ) const;

        Extent* getExtent(DiskLoc loc) const;
        Extent* _getExtent(DiskLoc loc) const;
        Record* recordAt(DiskLoc dl);
        Record* makeRecord(DiskLoc dl, int size);
        void grow(DiskLoc dl, int size);

        char* p() const { return (char *) _mb; }
        DataFileHeader* header() { return (DataFileHeader*) _mb; }

        MongoMMF mmf;
        void *_mb; // the memory mapped view
        int fileNo;
    };
    class DataFileMgr {
        friend class BasicCursor;
    public:
        void init(const string& path );

        /* see if we can find an extent of the right size in the freelist. */
        static Extent* allocFromFreeList(const char *ns, int approxSize, bool capped = false);

        /** @return DiskLoc where item ends up */
        // changedId should be initialized to false
        const DiskLoc updateRecord(
            const char *ns,
            NamespaceDetails *d,
            NamespaceDetailsTransient *nsdt,
            Record *toupdate, const DiskLoc& dl,
            const char *buf, int len, OpDebug& debug, bool god=false);

        // The object o may be updated if modified on insert.
        void insertAndLog( const char *ns, const BSONObj &o, bool god = false );

        /** insert will add an _id to the object if not present.  if you would like to see the
            final object after such an addition, use this method.
            @param o both an in and out param
        */
        DiskLoc insertWithObjMod(const char *ns, BSONObj & /*out*/o, bool god = false);

        /** @param obj in value only for this version. */
        void insertNoReturnVal(const char *ns, BSONObj o, bool god = false);

        DiskLoc insert(const char *ns, const void *buf, int len, bool god = false, bool mayAddIndex = true, bool *addedID = 0);
        static shared_ptr<Cursor> findAll(const char *ns, const DiskLoc &startLoc = DiskLoc());

        /* special version of insert for transaction logging -- streamlined a bit.
           assumes ns is capped and no indexes
           no _id field check
        */
        Record* fast_oplog_insert(NamespaceDetails *d, const char *ns, int len);

        static Extent* getExtent(const DiskLoc& dl);
        static Record* getRecord(const DiskLoc& dl);
        static DeletedRecord* makeDeletedRecord(const DiskLoc& dl, int len);

        void deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK = false, bool noWarn = false, bool logOp=false);

        /* does not clean up indexes, etc. : just deletes the record in the pdfile. use deleteRecord() to unindex */
        void _deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl);

    private:
        vector<MongoDataFile *> files;
    };

    extern DataFileMgr theDataFileMgr;

#pragma pack(1)

    class DeletedRecord {
    public:
        int lengthWithHeaders;
        int extentOfs;
        DiskLoc nextDeleted;
        DiskLoc myExtentLoc(const DiskLoc& myLoc) const {
            return DiskLoc(myLoc.a(), extentOfs);
        }
        Extent* myExtent(const DiskLoc& myLoc) {
            return DataFileMgr::getExtent(DiskLoc(myLoc.a(), extentOfs));
        }
    };

    /* Record is a record in a datafile.  DeletedRecord is similar but for deleted space.

    (11:03:20 AM) dm10gen: regarding extentOfs...
    (11:03:42 AM) dm10gen: an extent is a contiguous disk area, which contains many Records and DeletedRecords
    (11:03:56 AM) dm10gen: a DiskLoc has two pieces, the fileno and ofs. (64 bit total)
    (11:04:16 AM) dm10gen: to keep the headers small, instead of storing a 64 bit ptr to the full extent address, we keep just the offset
    (11:04:29 AM) dm10gen: we can do this as we know the record's address, and it has the same fileNo
    (11:04:33 AM) dm10gen: see class DiskLoc for more info
    (11:04:43 AM) dm10gen: so that is how Record::myExtent() works
    (11:04:53 AM) dm10gen: on an alloc(), when we build a new Record, we must populate its extentOfs then
    */
    class Record {
    public:
        enum HeaderSizeValue { HeaderSize = 16 };
        int lengthWithHeaders;
        int extentOfs;
        int nextOfs;
        int prevOfs;

        /** be careful when referencing this that your write intent was correct */
        char data[4];

        int netLength() { return lengthWithHeaders - HeaderSize; }
        //void setNewLength(int netlen) { lengthWithHeaders = netlen + HeaderSize; }

        /* use this when a record is deleted. basically a union with next/prev fields */
        DeletedRecord& asDeleted() { return *((DeletedRecord*) this); }

        Extent* myExtent(const DiskLoc& myLoc) {
            return DataFileMgr::getExtent(DiskLoc(myLoc.a(), extentOfs));
        }

        /* get the next record in the namespace, traversing extents as necessary */
        DiskLoc getNext(const DiskLoc& myLoc);
        DiskLoc getPrev(const DiskLoc& myLoc);

        DiskLoc nextInExtent(const DiskLoc& myLoc) {
            if ( nextOfs == DiskLoc::NullOfs )
                return DiskLoc();
            assert( nextOfs );
            return DiskLoc(myLoc.a(), nextOfs);
        }

        struct NP {
            int nextOfs;
            int prevOfs;
        };
        NP* np() { return (NP*) &nextOfs; }

        // ---------------------
        // memory cache
        // ---------------------

        /**
         * touches the data so that it is in physical memory
         * @param entireRecord if false, only the header and first byte is touched
         *                     if true, the entire record is touched
         */
        void touch( bool entireRecord = false );

        /**
         * @return if this record is likely in physical memory
         *         it's not guaranteed because it's possible it gets swapped out in a very unlucky window
         */
        bool likelyInPhysicalMemory();

        /**
         * tell the cache this Record was accessed
         * @return this, for simple chaining
         */
        Record* accessed();

        static bool MemoryTrackingEnabled;
    };
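    /* Illustrative sketch only -- exampleRecordExtentLoc is a hypothetical helper, not part
       of the original interface. It spells out the extentOfs scheme described in the comment
       above: a record header stores only its extent's offset, and the full extent DiskLoc is
       reassembled by pairing that offset with the record's own file number (which is exactly
       what Record::myExtent() does internally).
    */
    inline DiskLoc exampleRecordExtentLoc(const DiskLoc& recLoc, const Record* r) {
        // same file as the record; only the offset is stored in the record header
        return DiskLoc(recLoc.a(), r->extentOfs);
    }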
    /* extents are datafile regions where all the records within the region
       belong to the same namespace.

    (11:12:35 AM) dm10gen: when the extent is allocated, all its empty space is stuck into one big DeletedRecord
    (11:12:55 AM) dm10gen: and that is placed on the free list
    */
    class Extent {
    public:
        unsigned magic;
        DiskLoc myLoc;
        DiskLoc xnext, xprev; /* next/prev extent for this namespace */

        /* which namespace this extent is for.  this is just for troubleshooting really
           and won't even be correct if the collection were renamed!
        */
        Namespace nsDiagnostic;

        int length;   /* size of the extent, including these fields */
        DiskLoc firstRecord;
        DiskLoc lastRecord;
        char _extentData[4];

        static int HeaderSize() { return sizeof(Extent)-4; }

        bool validates() {
            return !(firstRecord.isNull() ^ lastRecord.isNull()) &&
                   length >= 0 && !myLoc.isNull();
        }

        BSONObj dump() {
            return BSON( "loc" << myLoc.toString() << "xnext" << xnext.toString() << "xprev" << xprev.toString()
                         << "nsdiag" << nsDiagnostic.toString()
                         << "size" << length << "firstRecord" << firstRecord.toString() << "lastRecord" << lastRecord.toString());
        }

        void dump(iostream& s) {
            s << " loc:" << myLoc.toString() << " xnext:" << xnext.toString() << " xprev:" << xprev.toString() << '\n';
            s << " nsdiag:" << nsDiagnostic.toString() << '\n';
            s << " size:" << length << " firstRecord:" << firstRecord.toString() << " lastRecord:" << lastRecord.toString() << '\n';
        }

        /* assumes already zeroed -- insufficient for block 'reuse' perhaps
           Returns a DeletedRecord location which is the data in the extent ready for use.
           Caller will need to add that to the freelist structure in NamespaceDetails.
        */
        DiskLoc init(const char *nsname, int _length, int _fileNo, int _offset, bool capped);

        /* like init(), but for a reuse case */
        DiskLoc reuse(const char *nsname, bool newUseIsAsCapped);

        bool isOk() const { return magic == 0x41424344; }
        void assertOk() const { assert(isOk()); }

        Record* newRecord(int len);

        Record* getRecord(DiskLoc dl) {
            assert( !dl.isNull() );
            assert( dl.sameFile(myLoc) );
            int x = dl.getOfs() - myLoc.getOfs();
            assert( x > 0 );
            return (Record *) (((char *) this) + x);
        }

        Extent* getNextExtent() { return xnext.isNull() ? 0 : DataFileMgr::getExtent(xnext); }
        Extent* getPrevExtent() { return xprev.isNull() ? 0 : DataFileMgr::getExtent(xprev); }

        static int maxSize();
        static int minSize() { return 0x100; }

        /**
         * @param len length of record we need
         * @param lastExtentLen size of last extent, which is a factor in next extent size
         */
        static int followupSize(int len, int lastExtentLen);

        /** get a suggested size for the first extent in a namespace
         *  @param len length of record we need to insert
         */
        static int initialSize(int len);

        struct FL {
            DiskLoc firstRecord;
            DiskLoc lastRecord;
        };
        /** often we want to update just the firstRecord and lastRecord fields.
            this helper is for that -- for use with getDur().writing() method
        */
        FL* fl() { return (FL*) &firstRecord; }

        /** caller must declare write intent first */
        void markEmpty();

    private:
        DiskLoc _reuse(const char *nsname, bool newUseIsAsCapped); // recycle an extent and reuse it for a different ns
    };
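    /* Illustrative sketch only -- exampleCountRecordsInExtent is a hypothetical helper, not
       part of the original interface. It shows how the extent layout described above is
       walked: firstRecord points at the first Record in the extent, and each record's
       nextOfs links to the next one within the same extent until the chain ends with a
       null DiskLoc.
    */
    inline int exampleCountRecordsInExtent(Extent* e) {
        int n = 0;
        for( DiskLoc loc = e->firstRecord; !loc.isNull(); ) {
            Record *r = e->getRecord(loc);     // offset arithmetic relative to the extent start
            n++;
            loc = r->nextInExtent(loc);        // a null DiskLoc terminates the chain
        }
        return n;
    }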
    /* a datafile - i.e. the "dbname.<#>" files :

          ----------------------
          DataFileHeader
          ----------------------
          Extent (for a particular namespace)
            Record
            ...
            Record (some chained for unused space)
          ----------------------
          more Extents...
          ----------------------
    */
    class DataFileHeader {
    public:
        int version;
        int versionMinor;
        int fileLength;
        DiskLoc unused; /* unused is the portion of the file that doesn't belong to any allocated extents. -1 = no more */
        int unusedLength;
        char reserved[8192 - 4*4 - 8];

        char data[4]; // first extent starts here

        enum { HeaderSize = 8192 };

        bool isCurrentVersion() const { return ( version == PDFILE_VERSION ) && ( versionMinor == PDFILE_VERSION_MINOR ); }

        bool uninitialized() const { return version == 0; }

        void init(int fileno, int filelength, const char* filename) {
            if ( uninitialized() ) {
                DEV log() << "datafileheader::init initializing " << filename << " n:" << fileno << endl;
                if( !(filelength > 32768 ) ) {
                    massert(13640, str::stream() << "DataFileHeader looks corrupt at file open filelength:" << filelength << " fileno:" << fileno, false);
                }
                {
                    if( !dbMutex.isWriteLocked() ) {
                        log() << "*** TEMP NOT INITIALIZING FILE " << filename << ", not in a write lock." << endl;
                        log() << "temp bypass until more elaborate change - case that is manifesting is benign anyway" << endl;
                        return;
                        /**
                        log() << "ERROR can't create outside a write lock" << endl;
                        printStackTrace();
                        ::abort();
                        **/
                    }
                }
                getDur().createdFile(filename, filelength);
                assert( HeaderSize == 8192 );
                DataFileHeader *h = getDur().writing(this);
                h->fileLength = filelength;
                h->version = PDFILE_VERSION;
                h->versionMinor = PDFILE_VERSION_MINOR;
                h->unused.set( fileno, HeaderSize );
                assert( (data-(char*)this) == HeaderSize );
                h->unusedLength = fileLength - HeaderSize - 16;
            }
        }

        bool isEmpty() const {
            return uninitialized() || ( unusedLength == fileLength - HeaderSize - 16 );
        }
    };

#pragma pack()
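    /* Illustrative sketch only -- exampleCountExtentsInChain is a hypothetical helper, not
       part of the original interface. Following the file layout pictured above, a
       namespace's extents form a doubly linked list threaded through xnext/xprev, so the
       whole chain can be visited starting from the namespace's first extent.
    */
    inline int exampleCountExtentsInChain(Extent* first) {
        int n = 0;
        for( Extent *e = first; e; e = e->getNextExtent() )  // getNextExtent() returns 0 at the end of the chain
            n++;
        return n;
    }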
    inline Extent* MongoDataFile::_getExtent(DiskLoc loc) const {
        loc.assertOk();
        Extent *e = (Extent *) (p()+loc.getOfs());
        return e;
    }

    inline Extent* MongoDataFile::getExtent(DiskLoc loc) const {
        Extent *e = _getExtent(loc);
        e->assertOk();
        return e;
    }

} // namespace mongo

#include "cursor.h"

namespace mongo {

    inline Record* MongoDataFile::recordAt(DiskLoc dl) {
        int ofs = dl.getOfs();
        if( ofs < DataFileHeader::HeaderSize ) badOfs(ofs); // will uassert - external call to keep out of the normal code path
        return (Record*) (p()+ofs);
    }

    inline Record* MongoDataFile::makeRecord(DiskLoc dl, int size) {
        int ofs = dl.getOfs();
        if( ofs < DataFileHeader::HeaderSize ) badOfs(ofs); // will uassert - external call to keep out of the normal code path
        return (Record*) (p()+ofs);
    }

    inline DiskLoc Record::getNext(const DiskLoc& myLoc) {
        if ( nextOfs != DiskLoc::NullOfs ) {
            /* defensive */
            if ( nextOfs >= 0 && nextOfs < 10 ) {
                sayDbContext("Assertion failure - Record::getNext() referencing a deleted record?");
                return DiskLoc();
            }
            return DiskLoc(myLoc.a(), nextOfs);
        }
        Extent *e = myExtent(myLoc);
        while ( 1 ) {
            if ( e->xnext.isNull() )
                return DiskLoc(); // end of table.
            e = e->xnext.ext();
            if ( !e->firstRecord.isNull() )
                break;
            // entire extent could be empty, keep looking
        }
        return e->firstRecord;
    }

    inline DiskLoc Record::getPrev(const DiskLoc& myLoc) {
        if ( prevOfs != DiskLoc::NullOfs )
            return DiskLoc(myLoc.a(), prevOfs);
        Extent *e = myExtent(myLoc);
        if ( e->xprev.isNull() )
            return DiskLoc();
        return e->xprev.ext()->lastRecord;
    }

    inline Record* DiskLoc::rec() const {
        return DataFileMgr::getRecord(*this);
    }

    inline BSONObj DiskLoc::obj() const {
        return BSONObj(rec()->accessed());
    }

    inline DeletedRecord* DiskLoc::drec() const {
        assert( _a != -1 );
        return (DeletedRecord*) rec();
    }

    inline Extent* DiskLoc::ext() const {
        return DataFileMgr::getExtent(*this);
    }

    template< class V >
    inline const BtreeBucket<V> * DiskLoc::btree() const {
        assert( _a != -1 );
        return (const BtreeBucket<V> *) rec()->data;
    }

} // namespace mongo

#include "database.h"

namespace mongo {

    boost::intmax_t dbSize( const char *database );

    inline NamespaceIndex* nsindex(const char *ns) {
        Database *database = cc().database();
        assert( database );
        DEV {
            char buf[256];
            nsToDatabase(ns, buf);
            if ( database->name != buf ) {
                out() << "ERROR: attempt to write to wrong database\n";
                out() << " ns:" << ns << '\n';
                out() << " database->name:" << database->name << endl;
                assert( database->name == buf );
            }
        }
        return &database->namespaceIndex;
    }

    inline NamespaceDetails* nsdetails(const char *ns) {
        // if this faults, did you set the current db first?  (Client::Context + dblock)
        return nsindex(ns)->details(ns);
    }
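    /* Illustrative sketch only -- exampleNamespaceExists is a hypothetical helper, not part
       of the original interface. As the comment above notes, nsdetails() reads the memory
       mapped namespace index of the *current* database, so the caller must have established
       the database context (Client::Context) and be holding the database lock before
       calling it; otherwise the access can fault.
    */
    inline bool exampleNamespaceExists(const char *ns) {
        // assumes the caller already set up Client::Context and the db lock for ns
        return nsdetails(ns) != 0;
    }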
    inline Extent* DataFileMgr::getExtent(const DiskLoc& dl) {
        assert( dl.a() != -1 );
        return cc().database()->getFile(dl.a())->getExtent(dl);
    }

    inline Record* DataFileMgr::getRecord(const DiskLoc& dl) {
        assert( dl.a() != -1 );
        return cc().database()->getFile(dl.a())->recordAt(dl);
    }

    BOOST_STATIC_ASSERT( 16 == sizeof(DeletedRecord) );

    inline DeletedRecord* DataFileMgr::makeDeletedRecord(const DiskLoc& dl, int len) {
        assert( dl.a() != -1 );
        return (DeletedRecord*) cc().database()->getFile(dl.a())->makeRecord(dl, sizeof(DeletedRecord));
    }

    void ensureHaveIdIndex(const char *ns);

    bool dropIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool maydeleteIdIndex );

    inline BSONObj::BSONObj(const Record *r) {
        init(r->data);
    }

} // namespace mongo