/** * Copyright (C) 2008 10gen Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License, version 3, * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . * * As a special exception, the copyright holders give permission to link the * code of portions of this program with the OpenSSL library under certain * conditions as described in each individual source file and distribute * linked combinations including the program with the OpenSSL library. You * must comply with the GNU Affero General Public License in all respects for * all of the code used other than as permitted herein. If you modify file(s) * with this exception, you may extend this exception to your version of the * file(s), but you are not obligated to do so. If you do not wish to do so, * delete this exception statement from your version. If you delete this * exception statement from all source files in the program, then also delete * it in the license file. */ #pragma once #include "mongo/pch.h" #include "mongo/db/d_concurrency.h" #include "mongo/db/diskloc.h" #include "mongo/db/storage/index_details.h" #include "mongo/db/index_names.h" #include "mongo/db/index_set.h" #include "mongo/db/jsobj.h" #include "mongo/db/storage/durable_mapped_file.h" #include "mongo/db/namespace_string.h" #include "mongo/db/catalog/ondisk/namespace.h" #include "mongo/db/catalog/ondisk/namespace_index.h" #include "mongo/platform/unordered_map.h" namespace mongo { class Database; class IndexCatalog; /** @return true if a client can modify this namespace even though it is under ".system." For example .system.users is ok for regular clients to update. @param write used when .system.js */ bool legalClientSystemNS( const StringData& ns , bool write ); /* deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various sizes so you can look for a deleterecord about the right size. */ const int Buckets = 19; const int MaxBucket = 18; extern int bucketSizes[]; #pragma pack(1) /* NamespaceDetails : this is the "header" for a collection that has all its details. It's in the .ns file and this is a memory mapped region (thus the pack pragma above). */ class NamespaceDetails { public: enum { NIndexesMax = 64, NIndexesExtra = 30, NIndexesBase = 10 }; private: /*-------- data fields, as present on disk : */ DiskLoc _firstExtent; DiskLoc _lastExtent; /* NOTE: capped collections v1 override the meaning of deletedList. deletedList[0] points to a list of free records (DeletedRecord's) for all extents in the capped namespace. deletedList[1] points to the last record in the prev extent. When the "current extent" changes, this value is updated. !deletedList[1].isValid() when this value is not yet computed. */ DiskLoc _deletedList[Buckets]; // ofs 168 (8 byte aligned) struct Stats { // datasize and nrecords MUST Be adjacent code assumes! long long datasize; // this includes padding, but not record headers long long nrecords; } _stats; int _lastExtentSize; int _nIndexes; // ofs 192 IndexDetails _indexes[NIndexesBase]; // ofs 352 (16 byte aligned) int _isCapped; // there is wasted space here if I'm right (ERH) int _maxDocsInCapped; // max # of objects for a capped table, -1 for inf. double _paddingFactor; // 1.0 = no padding. // ofs 368 (16) int _systemFlags; // things that the system sets/cares about DiskLoc _capExtent; // the "current" extent we're writing too for a capped collection DiskLoc _capFirstNewRecord; unsigned short _dataFileVersion; // NamespaceDetails version. So we can do backward compatibility in the future. See filever.h unsigned short _indexFileVersion; unsigned long long _multiKeyIndexBits; // ofs 400 (16) unsigned long long _reservedA; long long _extraOffset; // where the $extra info is located (bytes relative to this) int _indexBuildsInProgress; // Number of indexes currently being built int _userFlags; char _reserved[72]; /*-------- end data 496 bytes */ public: explicit NamespaceDetails( const DiskLoc &loc, bool _capped ); class Extra { long long _next; public: IndexDetails details[NIndexesExtra]; private: unsigned reserved2; unsigned reserved3; Extra(const Extra&) { verify(false); } Extra& operator=(const Extra& r) { verify(false); return *this; } public: Extra() { } long ofsFrom(NamespaceDetails *d) { return ((char *) this) - ((char *) d); } void init() { memset(this, 0, sizeof(Extra)); } Extra* next(NamespaceDetails *d) { if( _next == 0 ) return 0; return (Extra*) (((char *) d) + _next); } void setNext(long ofs) { *getDur().writing(&_next) = ofs; } void copy(NamespaceDetails *d, const Extra& e) { memcpy(this, &e, sizeof(Extra)); _next = 0; } }; Extra* extra() { if( _extraOffset == 0 ) return 0; return (Extra *) (((char *) this) + _extraOffset); } /* add extra space for indexes when more than 10 */ Extra* allocExtra(const char *ns, int nindexessofar); void copyingFrom(const char *thisns, NamespaceDetails *src); // must be called when renaming a NS to fix up extra /* dump info on this namespace. for debugging. */ void dump(const Namespace& k); /* dump info on all extents for this namespace. for debugging. */ void dumpExtents(); public: const DiskLoc& capExtent() const { return _capExtent; } const DiskLoc capFirstNewRecord() const { return _capFirstNewRecord; } DiskLoc& capExtent() { return _capExtent; } DiskLoc& capFirstNewRecord() { return _capFirstNewRecord; } private: Extent *theCapExtent() const { return _capExtent.ext(); } void advanceCapExtent( const StringData& ns ); DiskLoc __capAlloc(int len); DiskLoc cappedAlloc(const StringData& ns, int len); DiskLoc &cappedFirstDeletedInCurExtent(); bool nextIsInCapExtent( const DiskLoc &dl ) const; public: const DiskLoc& firstExtent() const { return _firstExtent; } const DiskLoc& lastExtent() const { return _lastExtent; } DiskLoc& firstExtent() { return _firstExtent; } DiskLoc& lastExtent() { return _lastExtent; } void setFirstExtent( DiskLoc newFirstExtent ); void setLastExtent( DiskLoc newLastExtent ); void setFirstExtentInvalid(); void setLastExtentInvalid(); long long dataSize() const { return _stats.datasize; } long long numRecords() const { return _stats.nrecords; } void incrementStats( long long dataSizeIncrement, long long numRecordsIncrement ); void setStats( long long dataSizeIncrement, long long numRecordsIncrement ); bool isCapped() const { return _isCapped; } long long maxCappedDocs() const; void setMaxCappedDocs( long long max ); int lastExtentSize() const { return _lastExtentSize; } void setLastExtentSize( int newMax ); const DiskLoc& deletedListEntry( int bucket ) const { return _deletedList[bucket]; } DiskLoc& deletedListEntry( int bucket ) { return _deletedList[bucket]; } void orphanDeletedList(); /** * @param max in and out, will be adjusted * @return if the value is valid at all */ static bool validMaxCappedDocs( long long* max ); DiskLoc& cappedListOfAllDeletedRecords() { return _deletedList[0]; } DiskLoc& cappedLastDelRecLastExtent() { return _deletedList[1]; } void cappedDumpDelInfo(); bool capLooped() const { return _isCapped && _capFirstNewRecord.isValid(); } bool inCapExtent( const DiskLoc &dl ) const; void cappedCheckMigrate(); /** * Truncate documents newer than the document at 'end' from the capped * collection. The collection cannot be completely emptied using this * function. An assertion will be thrown if that is attempted. * @param inclusive - Truncate 'end' as well iff true */ void cappedTruncateAfter(const char *ns, DiskLoc end, bool inclusive); /** Remove all documents from the capped collection */ void emptyCappedCollection(const char *ns); /* when a background index build is in progress, we don't count the index in nIndexes until complete, yet need to still use it in _indexRecord() - thus we use this function for that. */ int getTotalIndexCount() const { return _nIndexes + _indexBuildsInProgress; } int getCompletedIndexCount() const { return _nIndexes; } int getIndexBuildsInProgress() const { return _indexBuildsInProgress; } /* NOTE: be careful with flags. are we manipulating them in read locks? if so, this isn't thread safe. TODO */ enum SystemFlags { Flag_HaveIdIndex = 1 << 0 // set when we have _id index (ONLY if ensureIdIndex was called -- 0 if that has never been called) }; enum UserFlags { Flag_UsePowerOf2Sizes = 1 << 0 }; IndexDetails& idx(int idxNo, bool missingExpected = false ); class IndexIterator { public: int pos() { return i; } // note this is the next one to come bool more() { return i < n; } IndexDetails& next() { return d->idx(i++); } private: friend class NamespaceDetails; int i, n; NamespaceDetails *d; IndexIterator(NamespaceDetails *_d, bool includeBackgroundInProgress); }; IndexIterator ii( bool includeBackgroundInProgress = false ) { return IndexIterator(this, includeBackgroundInProgress); } /* hackish - find our index # in the indexes array */ int idxNo(const IndexDetails& idx); /* multikey indexes are indexes where there are more than one key in the index for a single document. see multikey in docs. for these, we have to do some dedup work on queries. */ bool isMultikey(int i) const { return (_multiKeyIndexBits & (((unsigned long long) 1) << i)) != 0; } /** * @return - if any state was changed */ bool setIndexIsMultikey(int i, bool multikey = true); /** * This fetches the IndexDetails for the next empty index slot. The caller must populate * returned object. This handles allocating extra index space, if necessary. */ IndexDetails& getNextIndexDetails(const char* thisns); /* returns index of the first index in which the field is present. -1 if not present. */ int fieldIsIndexed(const char *fieldName); /** * @return the actual size to create * will be >= oldRecordSize * based on padding and any other flags */ int getRecordAllocationSize( int minRecordSize ); double paddingFactor() const { return _paddingFactor; } void setPaddingFactor( double paddingFactor ) { *getDur().writing(&_paddingFactor) = paddingFactor; } /* called to indicate that an update fit in place. fits also called on an insert -- idea there is that if you had some mix and then went to pure inserts it would adapt and PF would trend to 1.0. note update calls insert on a move so there is a double count there that must be adjusted for below. todo: greater sophistication could be helpful and added later. for example the absolute size of documents might be considered -- in some cases smaller ones are more likely to grow than larger ones in the same collection? (not always) */ void paddingFits() { MONGO_SOMETIMES(sometimes, 4) { // do this on a sampled basis to journal less double x = _paddingFactor - 0.001; if ( x >= 1.0 ) { setPaddingFactor( x ); } } } void paddingTooSmall() { MONGO_SOMETIMES(sometimes, 4) { // do this on a sampled basis to journal less /* the more indexes we have, the higher the cost of a move. so we take that into account herein. note on a move that insert() calls paddingFits(), thus here for example with no inserts and nIndexes = 1 we have .001*4-.001 or a 3:1 ratio to non moves -> 75% nonmoves. insert heavy can pushes this down considerably. further tweaking will be a good idea but this should be an adequate starting point. */ double N = min(_nIndexes,7) + 3; double x = _paddingFactor + (0.001 * N); if ( x <= 2.0 ) { setPaddingFactor( x ); } } } // @return offset in indexes[] int findIndexByName(const StringData& name, bool includeBackgroundInProgress = false); // @return offset in indexes[] int findIndexByKeyPattern(const BSONObj& keyPattern, bool includeBackgroundInProgress = false); void findIndexByType( const string& name , vector& matches ) { IndexIterator i = ii(); while ( i.more() ) { if ( IndexNames::findPluginName(i.next().keyPattern()) == name ) matches.push_back( i.pos() - 1 ); } } /* Returns the index entry for the first index whose prefix contains * 'keyPattern'. If 'requireSingleKey' is true, skip indices that contain * array attributes. Otherwise, returns NULL. */ const IndexDetails* findIndexByPrefix( const BSONObj &keyPattern , bool requireSingleKey ); /* Updates the expireAfterSeconds field of the given index to the value in newExpireSecs. * The specified index must already contain an expireAfterSeconds field, and the value in * that field and newExpireSecs must both be numeric. */ void updateTTLIndex( int idxNo , const BSONElement& newExpireSecs ); const int systemFlags() const { return _systemFlags; } bool isSystemFlagSet( int flag ) const { return _systemFlags & flag; } void setSystemFlag( int flag ); void clearSystemFlag( int flag ); const int userFlags() const { return _userFlags; } bool isUserFlagSet( int flag ) const { return _userFlags & flag; } /** * these methods only modify NamespaceDetails and do not * sync changes back to system.namespaces * a typical call might if ( nsd->setUserFlag( 4 ) ) { nsd->syncUserFlags(); } * these methods all return true iff only something was modified */ bool setUserFlag( int flag ); bool clearUserFlag( int flag ); bool replaceUserFlags( int flags ); void syncUserFlags( const string& ns ); /* @return -1 = not found generally id is first index, so not that expensive an operation (assuming present). */ int findIdIndex() { IndexIterator i = ii(); while( i.more() ) { if( i.next().isIdIndex() ) return i.pos()-1; } return -1; } bool haveIdIndex() { return isSystemFlagSet( NamespaceDetails::Flag_HaveIdIndex ) || findIdIndex() >= 0; } /* return which "deleted bucket" for this size object */ static int bucket(int size) { for ( int i = 0; i < Buckets; i++ ) { if ( bucketSizes[i] > size ) { // Return the first bucket sized _larger_ than the requested size. return i; } } return MaxBucket; } /* @return the size for an allocated record quantized to 1/16th of the BucketSize. @param allocSize requested size to allocate The returned size will be greater than or equal to 'allocSize'. */ static int quantizeAllocationSpace(int allocSize); /** * Quantize 'allocSize' to the nearest bucketSize (or nearest 1mb boundary for large sizes). */ static int quantizePowerOf2AllocationSpace(int allocSize); /* predetermine location of the next alloc without actually doing it. if cannot predetermine returns null (so still call alloc() then) */ DiskLoc allocWillBeAt(const char *ns, int lenToAlloc); /** allocate space for a new record from deleted lists. @param lenToAlloc is WITH header @return null diskloc if no room - allocate a new extent then */ DiskLoc alloc(const StringData& ns, int lenToAlloc); /* add a given record to the deleted chains for this NS */ void addDeletedRec(DeletedRecord *d, DiskLoc dloc); void dumpDeleted(set *extents = 0); // Start from firstExtent by default. DiskLoc firstRecord( const DiskLoc &startExtent = DiskLoc() ) const; // Start from lastExtent by default. DiskLoc lastRecord( const DiskLoc &startExtent = DiskLoc() ) const; NamespaceDetails *writingWithoutExtra() { return ( NamespaceDetails* ) getDur().writingPtr( this, sizeof( NamespaceDetails ) ); } /** Make all linked Extra objects writeable as well */ NamespaceDetails *writingWithExtra(); private: void _removeIndexFromMe( int idx ); /** * swaps all meta data for 2 indexes * a and b are 2 index ids, whose contents will be swapped * must have a lock on the entire collection to do this */ void swapIndex( int a, int b ); DiskLoc _alloc(const StringData& ns, int len); void maybeComplain( const StringData& ns, int len ) const; DiskLoc __stdAlloc(int len, bool willBeAt); void compact(); // combine adjacent deleted records friend class NamespaceIndex; friend class IndexCatalog; struct ExtraOld { // note we could use this field for more chaining later, so don't waste it: unsigned long long reserved1; IndexDetails details[NIndexesExtra]; unsigned reserved2; unsigned reserved3; }; /** Update cappedLastDelRecLastExtent() after capExtent changed in cappedTruncateAfter() */ void cappedTruncateLastDelUpdate(); BOOST_STATIC_ASSERT( NIndexesMax <= NIndexesBase + NIndexesExtra*2 ); BOOST_STATIC_ASSERT( NIndexesMax <= 64 ); // multiKey bits BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::ExtraOld) == 496 ); BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) == 496 ); }; // NamespaceDetails #pragma pack() } // namespace mongo