diff options
Diffstat (limited to 'src/mongo/db/pdfile.h')
-rw-r--r-- | src/mongo/db/pdfile.h | 546 |
1 files changed, 546 insertions, 0 deletions
diff --git a/src/mongo/db/pdfile.h b/src/mongo/db/pdfile.h new file mode 100644 index 00000000000..cd6062b1a48 --- /dev/null +++ b/src/mongo/db/pdfile.h @@ -0,0 +1,546 @@ +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* pdfile.h + + Files: + database.ns - namespace index + database.1 - data files + database.2 + ... +*/ + +#pragma once + +#include "../pch.h" +#include "../util/mmap.h" +#include "diskloc.h" +#include "jsobjmanipulator.h" +#include "namespace-inl.h" +#include "client.h" +#include "mongommf.h" + +namespace mongo { + + class DataFileHeader; + class Extent; + class Record; + class Cursor; + class OpDebug; + + void dropDatabase(string db); + bool repairDatabase(string db, string &errmsg, bool preserveClonedFilesOnFailure = false, bool backupOriginalFiles = false); + + /* low level - only drops this ns */ + void dropNS(const string& dropNs); + + /* deletes this ns, indexes and cursors */ + void dropCollection( const string &name, string &errmsg, BSONObjBuilder &result ); + bool userCreateNS(const char *ns, BSONObj j, string& err, bool logForReplication, bool *deferIdIndex = 0); + shared_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc=DiskLoc()); + + bool isValidNS( const StringData& ns ); + + /*---------------------------------------------------------------------*/ + + class MongoDataFile { + friend class DataFileMgr; + friend class BasicCursor; + public: + MongoDataFile(int fn) : _mb(0), fileNo(fn) { } + + /** @return true if found and opened. if uninitialized (prealloc only) does not open. */ + bool openExisting( const char *filename ); + + /** creates if DNE */ + void open(const char *filename, int requestedDataSize = 0, bool preallocateOnly = false); + + /* allocate a new extent from this datafile. + @param capped - true if capped collection + @param loops is our recursion check variable - you want to pass in zero + */ + Extent* createExtent(const char *ns, int approxSize, bool capped = false, int loops = 0); + + DataFileHeader *getHeader() { return header(); } + + unsigned long long length() const { return mmf.length(); } + + /* return max size an extent may be */ + static int maxSize(); + + /** fsync */ + void flush( bool sync ); + + /** only use fore debugging */ + Extent* debug_getExtent(DiskLoc loc) { return _getExtent( loc ); } + private: + void badOfs(int) const; + void badOfs2(int) const; + int defaultSize( const char *filename ) const; + + Extent* getExtent(DiskLoc loc) const; + Extent* _getExtent(DiskLoc loc) const; + Record* recordAt(DiskLoc dl); + Record* makeRecord(DiskLoc dl, int size); + void grow(DiskLoc dl, int size); + + char* p() const { return (char *) _mb; } + DataFileHeader* header() { return (DataFileHeader*) _mb; } + + MongoMMF mmf; + void *_mb; // the memory mapped view + int fileNo; + }; + + class DataFileMgr { + friend class BasicCursor; + public: + void init(const string& path ); + + /* see if we can find an extent of the right size in the freelist. */ + static Extent* allocFromFreeList(const char *ns, int approxSize, bool capped = false); + + /** @return DiskLoc where item ends up */ + // changedId should be initialized to false + const DiskLoc updateRecord( + const char *ns, + NamespaceDetails *d, + NamespaceDetailsTransient *nsdt, + Record *toupdate, const DiskLoc& dl, + const char *buf, int len, OpDebug& debug, bool god=false); + + // The object o may be updated if modified on insert. + void insertAndLog( const char *ns, const BSONObj &o, bool god = false ); + + /** insert will add an _id to the object if not present. if you would like to see the final object + after such an addition, use this method. + @param o both and in and out param + */ + DiskLoc insertWithObjMod(const char *ns, BSONObj & /*out*/o, bool god = false); + + /** @param obj in value only for this version. */ + void insertNoReturnVal(const char *ns, BSONObj o, bool god = false); + + DiskLoc insert(const char *ns, const void *buf, int len, bool god = false, bool mayAddIndex = true, bool *addedID = 0); + static shared_ptr<Cursor> findAll(const char *ns, const DiskLoc &startLoc = DiskLoc()); + + /* special version of insert for transaction logging -- streamlined a bit. + assumes ns is capped and no indexes + no _id field check + */ + Record* fast_oplog_insert(NamespaceDetails *d, const char *ns, int len); + + static Extent* getExtent(const DiskLoc& dl); + static Record* getRecord(const DiskLoc& dl); + static DeletedRecord* makeDeletedRecord(const DiskLoc& dl, int len); + + void deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK = false, bool noWarn = false, bool logOp=false); + + /* does not clean up indexes, etc. : just deletes the record in the pdfile. use deleteRecord() to unindex */ + void _deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl); + + private: + vector<MongoDataFile *> files; + }; + + extern DataFileMgr theDataFileMgr; + +#pragma pack(1) + + class DeletedRecord { + public: + int lengthWithHeaders; + int extentOfs; + DiskLoc nextDeleted; + DiskLoc myExtentLoc(const DiskLoc& myLoc) const { + return DiskLoc(myLoc.a(), extentOfs); + } + Extent* myExtent(const DiskLoc& myLoc) { + return DataFileMgr::getExtent(DiskLoc(myLoc.a(), extentOfs)); + } + }; + + /* Record is a record in a datafile. DeletedRecord is similar but for deleted space. + + *11:03:20 AM) dm10gen: regarding extentOfs... + (11:03:42 AM) dm10gen: an extent is a continugous disk area, which contains many Records and DeleteRecords + (11:03:56 AM) dm10gen: a DiskLoc has two pieces, the fileno and ofs. (64 bit total) + (11:04:16 AM) dm10gen: to keep the headesr small, instead of storing a 64 bit ptr to the full extent address, we keep just the offset + (11:04:29 AM) dm10gen: we can do this as we know the record's address, and it has the same fileNo + (11:04:33 AM) dm10gen: see class DiskLoc for more info + (11:04:43 AM) dm10gen: so that is how Record::myExtent() works + (11:04:53 AM) dm10gen: on an alloc(), when we build a new Record, we must populate its extentOfs then + */ + class Record { + public: + enum HeaderSizeValue { HeaderSize = 16 }; + int lengthWithHeaders; + int extentOfs; + int nextOfs; + int prevOfs; + + /** be careful when referencing this that your write intent was correct */ + char data[4]; + + int netLength() { + return lengthWithHeaders - HeaderSize; + } + //void setNewLength(int netlen) { lengthWithHeaders = netlen + HeaderSize; } + + /* use this when a record is deleted. basically a union with next/prev fields */ + DeletedRecord& asDeleted() { return *((DeletedRecord*) this); } + + Extent* myExtent(const DiskLoc& myLoc) { return DataFileMgr::getExtent(DiskLoc(myLoc.a(), extentOfs)); } + + /* get the next record in the namespace, traversing extents as necessary */ + DiskLoc getNext(const DiskLoc& myLoc); + DiskLoc getPrev(const DiskLoc& myLoc); + + DiskLoc nextInExtent(const DiskLoc& myLoc) { + if ( nextOfs == DiskLoc::NullOfs ) + return DiskLoc(); + assert( nextOfs ); + return DiskLoc(myLoc.a(), nextOfs); + } + + struct NP { + int nextOfs; + int prevOfs; + }; + NP* np() { return (NP*) &nextOfs; } + + // --------------------- + // memory cache + // --------------------- + + /** + * touches the data so that is in physical memory + * @param entireRecrd if false, only the header and first byte is touched + * if true, the entire record is touched + * */ + void touch( bool entireRecrd = false ); + + /** + * @return if this record is likely in physical memory + * its not guaranteed because its possible it gets swapped out in a very unlucky windows + */ + bool likelyInPhysicalMemory(); + + /** + * tell the cache this Record was accessed + * @return this, for simple chaining + */ + Record* accessed(); + + static bool MemoryTrackingEnabled; + }; + + /* extents are datafile regions where all the records within the region + belong to the same namespace. + + (11:12:35 AM) dm10gen: when the extent is allocated, all its empty space is stuck into one big DeletedRecord + (11:12:55 AM) dm10gen: and that is placed on the free list + */ + class Extent { + public: + unsigned magic; + DiskLoc myLoc; + DiskLoc xnext, xprev; /* next/prev extent for this namespace */ + + /* which namespace this extent is for. this is just for troubleshooting really + and won't even be correct if the collection were renamed! + */ + Namespace nsDiagnostic; + + int length; /* size of the extent, including these fields */ + DiskLoc firstRecord; + DiskLoc lastRecord; + char _extentData[4]; + + static int HeaderSize() { return sizeof(Extent)-4; } + + bool validates() { + return !(firstRecord.isNull() ^ lastRecord.isNull()) && + length >= 0 && !myLoc.isNull(); + } + + BSONObj dump() { + return BSON( "loc" << myLoc.toString() << "xnext" << xnext.toString() << "xprev" << xprev.toString() + << "nsdiag" << nsDiagnostic.toString() + << "size" << length << "firstRecord" << firstRecord.toString() << "lastRecord" << lastRecord.toString()); + } + + void dump(iostream& s) { + s << " loc:" << myLoc.toString() << " xnext:" << xnext.toString() << " xprev:" << xprev.toString() << '\n'; + s << " nsdiag:" << nsDiagnostic.toString() << '\n'; + s << " size:" << length << " firstRecord:" << firstRecord.toString() << " lastRecord:" << lastRecord.toString() << '\n'; + } + + /* assumes already zeroed -- insufficient for block 'reuse' perhaps + Returns a DeletedRecord location which is the data in the extent ready for us. + Caller will need to add that to the freelist structure in namespacedetail. + */ + DiskLoc init(const char *nsname, int _length, int _fileNo, int _offset, bool capped); + + /* like init(), but for a reuse case */ + DiskLoc reuse(const char *nsname, bool newUseIsAsCapped); + + bool isOk() const { return magic == 0x41424344; } + void assertOk() const { assert(isOk()); } + + Record* newRecord(int len); + + Record* getRecord(DiskLoc dl) { + assert( !dl.isNull() ); + assert( dl.sameFile(myLoc) ); + int x = dl.getOfs() - myLoc.getOfs(); + assert( x > 0 ); + return (Record *) (((char *) this) + x); + } + + Extent* getNextExtent() { return xnext.isNull() ? 0 : DataFileMgr::getExtent(xnext); } + Extent* getPrevExtent() { return xprev.isNull() ? 0 : DataFileMgr::getExtent(xprev); } + + static int maxSize(); + static int minSize() { return 0x100; } + /** + * @param len lengt of record we need + * @param lastRecord size of last extent which is a factor in next extent size + */ + static int followupSize(int len, int lastExtentLen); + + /** get a suggested size for the first extent in a namespace + * @param len length of record we need to insert + */ + static int initialSize(int len); + + struct FL { + DiskLoc firstRecord; + DiskLoc lastRecord; + }; + /** often we want to update just the firstRecord and lastRecord fields. + this helper is for that -- for use with getDur().writing() method + */ + FL* fl() { return (FL*) &firstRecord; } + + /** caller must declare write intent first */ + void markEmpty(); + private: + DiskLoc _reuse(const char *nsname, bool newUseIsAsCapped); // recycle an extent and reuse it for a different ns + }; + + /* a datafile - i.e. the "dbname.<#>" files : + + ---------------------- + DataFileHeader + ---------------------- + Extent (for a particular namespace) + Record + ... + Record (some chained for unused space) + ---------------------- + more Extents... + ---------------------- + */ + class DataFileHeader { + public: + int version; + int versionMinor; + int fileLength; + DiskLoc unused; /* unused is the portion of the file that doesn't belong to any allocated extents. -1 = no more */ + int unusedLength; + char reserved[8192 - 4*4 - 8]; + + char data[4]; // first extent starts here + + enum { HeaderSize = 8192 }; + + bool isCurrentVersion() const { return ( version == PDFILE_VERSION ) && ( versionMinor == PDFILE_VERSION_MINOR ); } + + bool uninitialized() const { return version == 0; } + + void init(int fileno, int filelength, const char* filename) { + if ( uninitialized() ) { + DEV log() << "datafileheader::init initializing " << filename << " n:" << fileno << endl; + if( !(filelength > 32768 ) ) { + massert(13640, str::stream() << "DataFileHeader looks corrupt at file open filelength:" << filelength << " fileno:" << fileno, false); + } + + { + if( !d.dbMutex.isWriteLocked() ) { + log() << "*** TEMP NOT INITIALIZING FILE " << filename << ", not in a write lock." << endl; + log() << "temp bypass until more elaborate change - case that is manifesting is benign anyway" << endl; + return; +/** + log() << "ERROR can't create outside a write lock" << endl; + printStackTrace(); + ::abort(); +**/ + } + } + + getDur().createdFile(filename, filelength); + assert( HeaderSize == 8192 ); + DataFileHeader *h = getDur().writing(this); + h->fileLength = filelength; + h->version = PDFILE_VERSION; + h->versionMinor = PDFILE_VERSION_MINOR; + h->unused.set( fileno, HeaderSize ); + assert( (data-(char*)this) == HeaderSize ); + h->unusedLength = fileLength - HeaderSize - 16; + } + } + + bool isEmpty() const { + return uninitialized() || ( unusedLength == fileLength - HeaderSize - 16 ); + } + }; + +#pragma pack() + + inline Extent* MongoDataFile::_getExtent(DiskLoc loc) const { + loc.assertOk(); + Extent *e = (Extent *) (p()+loc.getOfs()); + return e; + } + + inline Extent* MongoDataFile::getExtent(DiskLoc loc) const { + Extent *e = _getExtent(loc); + e->assertOk(); + return e; + } + +} // namespace mongo + +#include "cursor.h" + +namespace mongo { + + inline Record* MongoDataFile::recordAt(DiskLoc dl) { + int ofs = dl.getOfs(); + if( ofs < DataFileHeader::HeaderSize ) badOfs(ofs); // will uassert - external call to keep out of the normal code path + return (Record*) (p()+ofs); + } + + inline Record* MongoDataFile::makeRecord(DiskLoc dl, int size) { + int ofs = dl.getOfs(); + if( ofs < DataFileHeader::HeaderSize ) badOfs(ofs); // will uassert - external call to keep out of the normal code path + return (Record*) (p()+ofs); + } + + inline DiskLoc Record::getNext(const DiskLoc& myLoc) { + if ( nextOfs != DiskLoc::NullOfs ) { + /* defensive */ + if ( nextOfs >= 0 && nextOfs < 10 ) { + sayDbContext("Assertion failure - Record::getNext() referencing a deleted record?"); + return DiskLoc(); + } + + return DiskLoc(myLoc.a(), nextOfs); + } + Extent *e = myExtent(myLoc); + while ( 1 ) { + if ( e->xnext.isNull() ) + return DiskLoc(); // end of table. + e = e->xnext.ext(); + if ( !e->firstRecord.isNull() ) + break; + // entire extent could be empty, keep looking + } + return e->firstRecord; + } + inline DiskLoc Record::getPrev(const DiskLoc& myLoc) { + if ( prevOfs != DiskLoc::NullOfs ) + return DiskLoc(myLoc.a(), prevOfs); + Extent *e = myExtent(myLoc); + if ( e->xprev.isNull() ) + return DiskLoc(); + return e->xprev.ext()->lastRecord; + } + + inline BSONObj DiskLoc::obj() const { + return BSONObj(rec()->accessed()); + } + inline DeletedRecord* DiskLoc::drec() const { + assert( _a != -1 ); + return (DeletedRecord*) rec(); + } + inline Extent* DiskLoc::ext() const { + return DataFileMgr::getExtent(*this); + } + + template< class V > + inline + const BtreeBucket<V> * DiskLoc::btree() const { + assert( _a != -1 ); + return (const BtreeBucket<V> *) rec()->data; + } + +} // namespace mongo + +#include "database.h" + +namespace mongo { + + boost::intmax_t dbSize( const char *database ); + + inline NamespaceIndex* nsindex(const char *ns) { + Database *database = cc().database(); + assert( database ); + DEV { + char buf[256]; + nsToDatabase(ns, buf); + if ( database->name != buf ) { + out() << "ERROR: attempt to write to wrong database\n"; + out() << " ns:" << ns << '\n'; + out() << " database->name:" << database->name << endl; + assert( database->name == buf ); + } + } + return &database->namespaceIndex; + } + + inline NamespaceDetails* nsdetails(const char *ns) { + // if this faults, did you set the current db first? (Client::Context + dblock) + return nsindex(ns)->details(ns); + } + + inline Extent* DataFileMgr::getExtent(const DiskLoc& dl) { + assert( dl.a() != -1 ); + return cc().database()->getFile(dl.a())->getExtent(dl); + } + + inline Record* DataFileMgr::getRecord(const DiskLoc& dl) { + assert( dl.a() != -1 ); + return cc().database()->getFile(dl.a())->recordAt(dl); + } + + BOOST_STATIC_ASSERT( 16 == sizeof(DeletedRecord) ); + + inline DeletedRecord* DataFileMgr::makeDeletedRecord(const DiskLoc& dl, int len) { + assert( dl.a() != -1 ); + return (DeletedRecord*) cc().database()->getFile(dl.a())->makeRecord(dl, sizeof(DeletedRecord)); + } + + void ensureHaveIdIndex(const char *ns); + + bool dropIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool maydeleteIdIndex ); + + inline BSONObj::BSONObj(const Record *r) { + init(r->data); + } + +} // namespace mongo |