// namespace_details.h
/**
* Copyright (C) 2008 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "mongo/pch.h"
#include "mongo/db/d_concurrency.h"
#include "mongo/db/diskloc.h"
#include "mongo/db/index.h"
#include "mongo/db/jsobj.h"
#include "mongo/db/mongommf.h"
#include "mongo/db/namespace.h"
#include "mongo/db/namespacestring.h"
#include "mongo/db/queryoptimizercursor.h"
#include "mongo/db/querypattern.h"
#include "mongo/platform/unordered_map.h"
#include "mongo/util/hashtab.h"
namespace mongo {
class Database;

/** @return true if a client can modify this namespace even though it is under ".system."
    For example .system.users is ok for regular clients to update.
    @param write used when .system.js
*/
bool legalClientSystemNS( const string& ns , bool write );

/* deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various sizes
   so you can look for a deleterecord about the right size.
*/
const int Buckets = 19;
const int MaxBucket = 18;

// Upper bound (exclusive) of each deleted-record bucket; defined elsewhere.
extern int bucketSizes[];
#pragma pack(1)
/* NamespaceDetails : this is the "header" for a collection that has all its details.
It's in the .ns file and this is a memory mapped region (thus the pack pragma above).
*/
class NamespaceDetails {
public:
enum { NIndexesMax = 64, NIndexesExtra = 30, NIndexesBase = 10 };
/*-------- data fields, as present on disk : */
DiskLoc firstExtent;
DiskLoc lastExtent;
/* NOTE: capped collections v1 override the meaning of deletedList.
deletedList[0] points to a list of free records (DeletedRecord's) for all extents in
the capped namespace.
deletedList[1] points to the last record in the prev extent. When the "current extent"
changes, this value is updated. !deletedList[1].isValid() when this value is not
yet computed.
*/
DiskLoc deletedList[Buckets];
// ofs 168 (8 byte aligned)
struct Stats {
// datasize and nrecords MUST Be adjacent code assumes!
long long datasize; // this includes padding, but not record headers
long long nrecords;
} stats;
int lastExtentSize;
int nIndexes;
private:
// ofs 192
IndexDetails _indexes[NIndexesBase];
// ofs 352 (16 byte aligned)
int _isCapped; // there is wasted space here if I'm right (ERH)
int _maxDocsInCapped; // max # of objects for a capped table. TODO: should this be 64 bit?
double _paddingFactor; // 1.0 = no padding.
// ofs 386 (16)
int _systemFlags; // things that the system sets/cares about
public:
DiskLoc capExtent;
DiskLoc capFirstNewRecord;
unsigned short dataFileVersion; // NamespaceDetails version. So we can do backward compatibility in the future. See filever.h
unsigned short indexFileVersion;
unsigned long long multiKeyIndexBits;
private:
// ofs 400 (16)
unsigned long long reservedA;
long long extraOffset; // where the $extra info is located (bytes relative to this)
public:
int indexBuildInProgress; // 1 if in prog
private:
int _userFlags;
char reserved[72];
/*-------- end data 496 bytes */
public:
explicit NamespaceDetails( const DiskLoc &loc, bool _capped );
class Extra {
long long _next;
public:
IndexDetails details[NIndexesExtra];
private:
unsigned reserved2;
unsigned reserved3;
Extra(const Extra&) { verify(false); }
Extra& operator=(const Extra& r) { verify(false); return *this; }
public:
Extra() { }
long ofsFrom(NamespaceDetails *d) {
return ((char *) this) - ((char *) d);
}
void init() { memset(this, 0, sizeof(Extra)); }
Extra* next(NamespaceDetails *d) {
if( _next == 0 ) return 0;
return (Extra*) (((char *) d) + _next);
}
void setNext(long ofs) { *getDur().writing(&_next) = ofs; }
void copy(NamespaceDetails *d, const Extra& e) {
memcpy(this, &e, sizeof(Extra));
_next = 0;
}
};
Extra* extra() {
if( extraOffset == 0 ) return 0;
return (Extra *) (((char *) this) + extraOffset);
}
/* add extra space for indexes when more than 10 */
Extra* allocExtra(const char *ns, int nindexessofar);
void copyingFrom(const char *thisns, NamespaceDetails *src); // must be called when renaming a NS to fix up extra
/* called when loaded from disk */
void onLoad(const Namespace& k);
/* dump info on this namespace. for debugging. */
void dump(const Namespace& k);
/* dump info on all extents for this namespace. for debugging. */
void dumpExtents();
private:
Extent *theCapExtent() const { return capExtent.ext(); }
void advanceCapExtent( const char *ns );
DiskLoc __capAlloc(int len);
DiskLoc cappedAlloc(const char *ns, int len);
DiskLoc &cappedFirstDeletedInCurExtent();
bool nextIsInCapExtent( const DiskLoc &dl ) const;
public:
bool isCapped() const { return _isCapped; }
long long maxCappedDocs() const { verify( isCapped() ); return _maxDocsInCapped; }
void setMaxCappedDocs( long long max );
DiskLoc& cappedListOfAllDeletedRecords() { return deletedList[0]; }
DiskLoc& cappedLastDelRecLastExtent() { return deletedList[1]; }
void cappedDumpDelInfo();
bool capLooped() const { return _isCapped && capFirstNewRecord.isValid(); }
bool inCapExtent( const DiskLoc &dl ) const;
void cappedCheckMigrate();
/**
* Truncate documents newer than the document at 'end' from the capped
* collection. The collection cannot be completely emptied using this
* function. An assertion will be thrown if that is attempted.
* @param inclusive - Truncate 'end' as well iff true
*/
void cappedTruncateAfter(const char *ns, DiskLoc end, bool inclusive);
/** Remove all documents from the capped collection */
void emptyCappedCollection(const char *ns);
/* when a background index build is in progress, we don't count the index in nIndexes until
complete, yet need to still use it in _indexRecord() - thus we use this function for that.
*/
int nIndexesBeingBuilt() const { return nIndexes + indexBuildInProgress; }
/* NOTE: be careful with flags. are we manipulating them in read locks? if so,
this isn't thread safe. TODO
*/
enum SystemFlags {
Flag_HaveIdIndex = 1 << 0 // set when we have _id index (ONLY if ensureIdIndex was called -- 0 if that has never been called)
};
enum UserFlags {
Flag_UsePowerOf2Sizes = 1 << 0
};
IndexDetails& idx(int idxNo, bool missingExpected = false );
/** get the IndexDetails for the index currently being built in the background. (there is at most one) */
IndexDetails& inProgIdx() {
DEV verify(indexBuildInProgress);
return idx(nIndexes);
}
class IndexIterator {
public:
int pos() { return i; } // note this is the next one to come
bool more() { return i < n; }
IndexDetails& next() { return d->idx(i++); }
private:
friend class NamespaceDetails;
int i, n;
NamespaceDetails *d;
IndexIterator(NamespaceDetails *_d);
};
IndexIterator ii() { return IndexIterator(this); }
/* hackish - find our index # in the indexes array */
int idxNo(const IndexDetails& idx);
/* multikey indexes are indexes where there are more than one key in the index
for a single document. see multikey in wiki.
for these, we have to do some dedup work on queries.
*/
bool isMultikey(int i) const { return (multiKeyIndexBits & (((unsigned long long) 1) << i)) != 0; }
void setIndexIsMultikey(const char *thisns, int i);
/* add a new index. does not add to system.indexes etc. - just to NamespaceDetails.
caller must populate returned object.
*/
IndexDetails& addIndex(const char *thisns, bool resetTransient=true);
void aboutToDeleteAnIndex() {
clearSystemFlag( Flag_HaveIdIndex );
}
/* returns index of the first index in which the field is present. -1 if not present. */
int fieldIsIndexed(const char *fieldName);
/**
* @return the actual size to create
* will be >= oldRecordSize
* based on padding and any other flags
*/
int getRecordAllocationSize( int minRecordSize );
double paddingFactor() const { return _paddingFactor; }
void setPaddingFactor( double paddingFactor ) {
*getDur().writing(&_paddingFactor) = paddingFactor;
}
/* called to indicate that an update fit in place.
fits also called on an insert -- idea there is that if you had some mix and then went to
pure inserts it would adapt and PF would trend to 1.0. note update calls insert on a move
so there is a double count there that must be adjusted for below.
todo: greater sophistication could be helpful and added later. for example the absolute
size of documents might be considered -- in some cases smaller ones are more likely
to grow than larger ones in the same collection? (not always)
*/
void paddingFits() {
MONGO_SOMETIMES(sometimes, 4) { // do this on a sampled basis to journal less
double x = _paddingFactor - 0.001;
if ( x >= 1.0 ) {
setPaddingFactor( x );
}
}
}
void paddingTooSmall() {
MONGO_SOMETIMES(sometimes, 4) { // do this on a sampled basis to journal less
/* the more indexes we have, the higher the cost of a move. so we take that into
account herein. note on a move that insert() calls paddingFits(), thus
here for example with no inserts and nIndexes = 1 we have
.001*4-.001 or a 3:1 ratio to non moves -> 75% nonmoves. insert heavy
can pushes this down considerably. further tweaking will be a good idea but
this should be an adequate starting point.
*/
double N = min(nIndexes,7) + 3;
double x = _paddingFactor + (0.001 * N);
if ( x <= 2.0 ) {
setPaddingFactor( x );
}
}
}
// @return offset in indexes[]
int findIndexByName(const char *name);
// @return offset in indexes[]
int findIndexByKeyPattern(const BSONObj& keyPattern);
void findIndexByType( const string& name , vector& matches ) {
IndexIterator i = ii();
while ( i.more() ) {
if ( i.next().getSpec().getTypeName() == name )
matches.push_back( i.pos() - 1 );
}
}
/* Returns the index entry for the first index whose prefix contains
* 'keyPattern'. If 'requireSingleKey' is true, skip indices that contain
* array attributes. Otherwise, returns NULL.
*/
const IndexDetails* findIndexByPrefix( const BSONObj &keyPattern ,
bool requireSingleKey );
const int systemFlags() const { return _systemFlags; }
bool isSystemFlagSet( int flag ) const { return _systemFlags & flag; }
void setSystemFlag( int flag );
void clearSystemFlag( int flag );
const int userFlags() const { return _userFlags; }
bool isUserFlagSet( int flag ) const { return _userFlags & flag; }
/**
* these methods only modify NamespaceDetails and do not
* sync changes back to system.namespaces
* a typical call might
if ( nsd->setUserFlag( 4 ) ) {
nsd->syncUserFlags();
}
* these methods all return true iff only something was modified
*/
bool setUserFlag( int flag );
bool clearUserFlag( int flag );
bool replaceUserFlags( int flags );
void syncUserFlags( const string& ns );
/* @return -1 = not found
generally id is first index, so not that expensive an operation (assuming present).
*/
int findIdIndex() {
IndexIterator i = ii();
while( i.more() ) {
if( i.next().isIdIndex() )
return i.pos()-1;
}
return -1;
}
bool haveIdIndex() {
return isSystemFlagSet( NamespaceDetails::Flag_HaveIdIndex ) || findIdIndex() >= 0;
}
/* return which "deleted bucket" for this size object */
static int bucket(int n) {
for ( int i = 0; i < Buckets; i++ )
if ( bucketSizes[i] > n )
return i;
return Buckets-1;
}
/* predetermine location of the next alloc without actually doing it.
if cannot predetermine returns null (so still call alloc() then)
*/
DiskLoc allocWillBeAt(const char *ns, int lenToAlloc);
/* allocate a new record. lenToAlloc includes headers. */
DiskLoc alloc(const char *ns, int lenToAlloc, DiskLoc& extentLoc);
/* add a given record to the deleted chains for this NS */
void addDeletedRec(DeletedRecord *d, DiskLoc dloc);
void dumpDeleted(set *extents = 0);
// Start from firstExtent by default.
DiskLoc firstRecord( const DiskLoc &startExtent = DiskLoc() ) const;
// Start from lastExtent by default.
DiskLoc lastRecord( const DiskLoc &startExtent = DiskLoc() ) const;
long long storageSize( int * numExtents = 0 , BSONArrayBuilder * extentInfo = 0 ) const;
int averageObjectSize() {
if ( stats.nrecords == 0 )
return 5;
return (int) (stats.datasize / stats.nrecords);
}
NamespaceDetails *writingWithoutExtra() {
return ( NamespaceDetails* ) getDur().writingPtr( this, sizeof( NamespaceDetails ) );
}
/** Make all linked Extra objects writeable as well */
NamespaceDetails *writingWithExtra();
private:
DiskLoc _alloc(const char *ns, int len);
void maybeComplain( const char *ns, int len ) const;
DiskLoc __stdAlloc(int len, bool willBeAt);
void compact(); // combine adjacent deleted records
friend class NamespaceIndex;
struct ExtraOld {
// note we could use this field for more chaining later, so don't waste it:
unsigned long long reserved1;
IndexDetails details[NIndexesExtra];
unsigned reserved2;
unsigned reserved3;
};
/** Update cappedLastDelRecLastExtent() after capExtent changed in cappedTruncateAfter() */
void cappedTruncateLastDelUpdate();
BOOST_STATIC_ASSERT( NIndexesMax <= NIndexesBase + NIndexesExtra*2 );
BOOST_STATIC_ASSERT( NIndexesMax <= 64 ); // multiKey bits
BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::ExtraOld) == 496 );
BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) == 496 );
}; // NamespaceDetails
#pragma pack()
class ParsedQuery;
class QueryPlanSummary;
/* NamespaceDetailsTransient
these are things we know / compute about a namespace that are transient -- things
we don't actually store in the .ns file. so mainly caching of frequently used
information.
CAUTION: Are you maintaining this properly on a collection drop()? A dropdatabase()? Be careful.
The current field "allIndexKeys" may have too many keys in it on such an occurrence;
as currently used that does not cause anything terrible to happen.
todo: cleanup code, need abstractions and separation
*/
// todo: multiple db's with the same name (repairDatabase) is not handled herein. that may be
// the way to go, if not used by repair, but need some sort of enforcement / asserts.
class NamespaceDetailsTransient : boost::noncopyable {
    BOOST_STATIC_ASSERT( sizeof(NamespaceDetails) == 496 );

    //Database *database;
    const string _ns;
    void reset();

    // < db -> < fullns -> NDT > >
    typedef unordered_map< string, shared_ptr<NamespaceDetailsTransient> > CMap;
    typedef unordered_map< string, CMap*, NamespaceDBHash, NamespaceDBEquals > DMap;
    static DMap _nsdMap;

    NamespaceDetailsTransient(Database*,const string& ns);
public:
    ~NamespaceDetailsTransient();
    void addedIndex() { reset(); }
    void deletedIndex() { reset(); }

    /**
     * reset stats for a given collection
     */
    static void resetCollection(const string& ns );

    /**
     * remove entry for a collection
     */
    static void eraseCollection(const string& ns);

    /**
     * remove all entries for db
     */
    static void eraseDB(const string& db);

    /**
     * @return a cursor interface to the query optimizer.  The implementation may utilize a
     * single query plan or interleave results from multiple query plans before settling on a
     * single query plan.  Note that the schema of currKey() documents, indexKeyPattern(), the
     * matcher(), and the isMultiKey() nature of the cursor may change over the course of
     * iteration.
     *
     * @param query - Query used to select indexes and populate matchers; not copied if unowned
     * (see bsonobj.h).
     *
     * @param order - Required ordering spec for documents produced by this cursor, empty object
     * default indicates no order requirement.  If no index exists that satisfies the required
     * sort order, an empty shared_ptr is returned unless parsedQuery is also provided.  This is
     * not copied if unowned.
     *
     * @param planPolicy - A policy for selecting query plans - see queryoptimizercursor.h
     *
     * @param simpleEqualityMatch - Set to true for certain simple queries - see
     * queryoptimizer.cpp.
     *
     * @param parsedQuery - Additional query parameters, as from a client query request.
     *
     * @param requireOrder - If false, the resulting cursor may return results in an order
     * inconsistent with the @param order spec.  See queryoptimizercursor.h for information on
     * handling these results properly.
     *
     * @param singlePlanSummary - Query plan summary information that may be provided when a
     * cursor running a single plan is returned.
     *
     * The returned cursor may @throw inside of advance() or recoverFromYield() in certain error
     * cases, for example if a capped overrun occurred during a yield.  This indicates that the
     * cursor was unable to perform a complete scan.
     *
     * This is a work in progress.  Partial list of features not yet implemented through this
     * interface:
     *
     * - covered indexes
     * - in memory sorting
     */
    static shared_ptr<Cursor> getCursor( const char *ns, const BSONObj &query,
                                         const BSONObj &order = BSONObj(),
                                         const QueryPlanSelectionPolicy &planPolicy =
                                             QueryPlanSelectionPolicy::any(),
                                         bool *simpleEqualityMatch = 0,
                                         const shared_ptr<ParsedQuery> &parsedQuery =
                                             shared_ptr<ParsedQuery>(),
                                         bool requireOrder = true,
                                         QueryPlanSummary *singlePlanSummary = 0 );

    /**
     * @return a single cursor that may work well for the given query.  A $or style query will
     * produce a single cursor, not a MultiCursor.
     * It is possible no cursor is returned if the sort is not supported by an index.  Clients are responsible
     * for checking this if they are not sure an index for a sort exists, and defaulting to a non-sort if
     * no suitable indices exist.
     */
    static shared_ptr<Cursor> bestGuessCursor( const char *ns, const BSONObj &query, const BSONObj &sort );

    /* indexKeys() cache ---------------------------------------------------- */
    /* assumed to be in write lock for this */
private:
    bool _keysComputed;
    set<string> _indexKeys;
    void computeIndexKeys();
public:
    /* get set of index keys for this namespace.  handy to quickly check if a given
       field is indexed (Note it might be a secondary component of a compound index.)
    */
    set<string>& indexKeys() {
        DEV Lock::assertWriteLocked(_ns);
        if ( !_keysComputed )
            computeIndexKeys();
        return _indexKeys;
    }

    /* IndexSpec caching */
private:
    map<const IndexDetails*,IndexSpec> _indexSpecs;
    static SimpleMutex _isMutex;
public:
    const IndexSpec& getIndexSpec( const IndexDetails * details ) {
        IndexSpec& spec = _indexSpecs[details];
        if ( ! spec._finishedInit ) {
            SimpleMutex::scoped_lock lk(_isMutex);
            if ( ! spec._finishedInit ) {
                spec.reset( details );
                verify( spec._finishedInit );
            }
        }
        return spec;
    }

    /* query cache (for query optimizer) ------------------------------------- */
private:
    int _qcWriteCount;
    map<QueryPattern,CachedQueryPlan> _qcCache;
    static NamespaceDetailsTransient& make_inlock(const string& ns);
    static CMap& get_cmap_inlock(const string& ns);
public:
    static SimpleMutex _qcMutex;

    /* you must be in the qcMutex when calling this.
       A NamespaceDetailsTransient object will not go out of scope on you if you are
       d.dbMutex.atLeastReadLocked(), so you don't have to stay locked.
       Creates a NamespaceDetailsTransient before returning if one DNE. 
       todo: avoid creating too many on erroneous ns queries.
    */
    static NamespaceDetailsTransient& get_inlock(const string& ns);

    static NamespaceDetailsTransient& get(const char *ns) {
        // todo : _qcMutex will create bottlenecks in our parallelism
        SimpleMutex::scoped_lock lk(_qcMutex);
        return get_inlock(ns);
    }

    void clearQueryCache() {
        _qcCache.clear();
        _qcWriteCount = 0;
    }
    /* you must notify the cache if you are doing writes, as query plan utility will change */
    void notifyOfWriteOp() {
        if ( _qcCache.empty() )
            return;
        if ( ++_qcWriteCount >= 100 )
            clearQueryCache();
    }
    CachedQueryPlan cachedQueryPlanForPattern( const QueryPattern &pattern ) {
        return _qcCache[ pattern ];
    }
    void registerCachedQueryPlanForPattern( const QueryPattern &pattern,
                                           const CachedQueryPlan &cachedQueryPlan ) {
        _qcCache[ pattern ] = cachedQueryPlan;
    }

}; /* NamespaceDetailsTransient */
inline NamespaceDetailsTransient& NamespaceDetailsTransient::get_inlock(const string& ns) {
    // Look for an existing transient entry for this namespace in the per-db map.
    CMap& cmap = get_cmap_inlock(ns);
    CMap::iterator found = cmap.find( ns );
    // A map entry may exist yet hold a null pointer (left behind by clearForPrefix);
    // treat that the same as "absent" and build a fresh entry.
    if ( found == cmap.end() || !found->second.get() )
        return make_inlock(ns);
    return *found->second;
}
/* NamespaceIndex is the ".ns" file you see in the data directory. It is the "system catalog"
if you will: at least the core parts. (Additional info in system.* collections.)
*/
class NamespaceIndex {
public:
NamespaceIndex(const string &dir, const string &database) :
ht( 0 ), dir_( dir ), database_( database ) {}
/* returns true if new db will be created if we init lazily */
bool exists() const;
void init() {
if( !ht )
_init();
}
void add_ns(const char *ns, DiskLoc& loc, bool capped);
void add_ns( const char *ns, const NamespaceDetails &details );
NamespaceDetails* details(const char *ns) {
if ( !ht )
return 0;
Namespace n(ns);
NamespaceDetails *d = ht->get(n);
if ( d && d->isCapped() )
d->cappedCheckMigrate();
return d;
}
void kill_ns(const char *ns);
bool find(const char *ns, DiskLoc& loc) {
NamespaceDetails *l = details(ns);
if ( l ) {
loc = l->firstExtent;
return true;
}
return false;
}
bool allocated() const { return ht != 0; }
void getNamespaces( list& tofill , bool onlyCollections = true ) const;
NamespaceDetails::Extra* newExtra(const char *ns, int n, NamespaceDetails *d);
boost::filesystem::path path() const;
unsigned long long fileLength() const { return f.length(); }
private:
void _init();
void maybeMkdir() const;
MongoMMF f;
HashTable *ht;
string dir_;
string database_;
};
extern string dbpath; // --dbpath parm
extern bool directoryperdb;

// Rename a namespace within current 'client' db.
// (Arguments should include db name)
void renameNamespace( const char *from, const char *to, bool stayTemp);
} // namespace mongo