// namespacedetails.cpp /** * Copyright (C) 2008 10gen Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License, version 3, * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . */ #include "stdafx.h" #include "pdfile.h" #include "db.h" #include "../util/mmap.h" #include "../util/hashtab.h" #include "btree.h" #include #include #include "query.h" /* deleted lists -- linked lists of deleted records -- tehy are placed in 'buckets' of various sizes so you can look for a deleterecord about the right size. */ int bucketSizes[] = { 32, 64, 128, 256, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000, 0x10000, 0x20000, 0x40000, 0x80000, 0x100000, 0x200000, 0x400000, 0x800000 }; //NamespaceIndexMgr namespaceIndexMgr; /* returns true if we created (did not exist) during init() */ bool NamespaceIndex::init(const char *dir, const char *database) { boost::filesystem::path path( dir ); path /= string( database ) + ".ns"; bool created = !boost::filesystem::exists(path); /* if someone manually deleted the datafiles for a database, we need to be sure to clear any cached info for the database in local.*. */ if ( string("local") != database ) { DBInfo i(database); i.dbDropped(); } const int LEN = 16 * 1024 * 1024; string pathString = path.string(); void *p = f.map(pathString.c_str(), LEN); if ( p == 0 ) { problem() << "couldn't open namespace.idx " << pathString << " terminating" << endl; exit(-3); } ht = new HashTable(p, LEN, "namespace index"); return created; } void NamespaceDetails::addDeletedRec(DeletedRecord *d, DiskLoc dloc) { { // defensive code: try to make us notice if we reference a deleted record (unsigned&) (((Record *) d)->data) = 0xeeeeeeee; } dassert( dloc.drec() == d ); DEBUGGING cout << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs << endl; int b = bucket(d->lengthWithHeaders); DiskLoc& list = deletedList[b]; DiskLoc oldHead = list; list = dloc; d->nextDeleted = oldHead; } /* lenToAlloc is WITH header */ DiskLoc NamespaceDetails::alloc(const char *ns, int lenToAlloc, DiskLoc& extentLoc) { lenToAlloc = (lenToAlloc + 3) & 0xfffffffc; DiskLoc loc = _alloc(ns, lenToAlloc); if ( loc.isNull() ) return loc; DeletedRecord *r = loc.drec(); /* note we want to grab from the front so our next pointers on disk tend to go in a forward direction which is important for performance. */ int regionlen = r->lengthWithHeaders; extentLoc.set(loc.a(), r->extentOfs); assert( r->extentOfs < loc.getOfs() ); DEBUGGING cout << "TEMP: alloc() returns " << loc.toString() << ' ' << ns << " lentoalloc:" << lenToAlloc << " ext:" << extentLoc.toString() << endl; int left = regionlen - lenToAlloc; if ( left < 24 || (left < (lenToAlloc >> 3) && capped == 0) ) { // you get the whole thing. return loc; } /* split off some for further use. */ r->lengthWithHeaders = lenToAlloc; DiskLoc newDelLoc = loc; newDelLoc.inc(lenToAlloc); DeletedRecord *newDel = newDelLoc.drec(); newDel->extentOfs = r->extentOfs; newDel->lengthWithHeaders = left; newDel->nextDeleted.Null(); addDeletedRec(newDel, newDelLoc); return loc; } /* for non-capped collections. returned item is out of the deleted list upon return */ DiskLoc NamespaceDetails::__stdAlloc(int len) { DiskLoc *prev; DiskLoc *bestprev = 0; DiskLoc bestmatch; int bestmatchlen = 0x7fffffff; int b = bucket(len); DiskLoc cur = deletedList[b]; prev = &deletedList[b]; int extra = 5; // look for a better fit, a little. int chain = 0; while ( 1 ) { { int a = cur.a(); if ( a < -1 || a >= 100000 ) { problem() << "~~ Assertion - cur out of range in _alloc() " << cur.toString() << " b:" << b << " chain:" << chain << '\n'; sayDbContext(); if ( cur == *prev ) prev->Null(); cur.Null(); } } if ( cur.isNull() ) { // move to next bucket. if we were doing "extra", just break if ( bestmatchlen < 0x7fffffff ) break; b++; if ( b > MaxBucket ) { // out of space. alloc a new extent. return DiskLoc(); } cur = deletedList[b]; prev = &deletedList[b]; continue; } DeletedRecord *r = cur.drec(); if ( r->lengthWithHeaders >= len && r->lengthWithHeaders < bestmatchlen ) { bestmatchlen = r->lengthWithHeaders; bestmatch = cur; bestprev = prev; } if ( bestmatchlen < 0x7fffffff && --extra <= 0 ) break; if ( ++chain > 30 && b < MaxBucket ) { // too slow, force move to next bucket to grab a big chunk //b++; chain = 0; cur.Null(); } else { if ( r->nextDeleted.getOfs() == 0 ) { problem() << "~~ Assertion - bad nextDeleted " << r->nextDeleted.toString() << " b:" << b << " chain:" << chain << ", fixing.\n"; r->nextDeleted.Null(); } cur = r->nextDeleted; prev = &r->nextDeleted; } } /* unlink ourself from the deleted list */ { DeletedRecord *bmr = bestmatch.drec(); *bestprev = bmr->nextDeleted; bmr->nextDeleted.setInvalid(); // defensive. assert(bmr->extentOfs < bestmatch.getOfs()); } return bestmatch; } void NamespaceDetails::dumpDeleted(set *extents) { // cout << "DUMP deleted chains" << endl; for ( int i = 0; i < Buckets; i++ ) { // cout << " bucket " << i << endl; DiskLoc dl = deletedList[i]; while ( !dl.isNull() ) { DeletedRecord *r = dl.drec(); DiskLoc extLoc(dl.a(), r->extentOfs); if ( extents == 0 || extents->count(extLoc) <= 0 ) { cout << " bucket " << i << endl; cout << " " << dl.toString() << " ext:" << extLoc.toString(); if ( extents && extents->count(extLoc) <= 0 ) cout << '?'; cout << " len:" << r->lengthWithHeaders << endl; } dl = r->nextDeleted; } } // cout << endl; } /* combine adjacent deleted records this is O(n^2) but we call it for capped tables where typically n==1 or 2! (or 3...there will be a little unused sliver at the end of the extent.) */ void NamespaceDetails::compact() { assert(capped); list drecs; for ( int i = 0; i < Buckets; i++ ) { DiskLoc dl = deletedList[i]; deletedList[i].Null(); while ( !dl.isNull() ) { DeletedRecord *r = dl.drec(); drecs.push_back(dl); dl = r->nextDeleted; } } drecs.sort(); list::iterator j = drecs.begin(); assert( j != drecs.end() ); DiskLoc a = *j; while ( 1 ) { j++; if ( j == drecs.end() ) { DEBUGGING cout << "TEMP: compact adddelrec\n"; addDeletedRec(a.drec(), a); break; } DiskLoc b = *j; while ( a.a() == b.a() && a.getOfs() + a.drec()->lengthWithHeaders == b.getOfs() ) { // a & b are adjacent. merge. a.drec()->lengthWithHeaders += b.drec()->lengthWithHeaders; j++; if ( j == drecs.end() ) { DEBUGGING cout << "temp: compact adddelrec2\n"; addDeletedRec(a.drec(), a); return; } b = *j; } DEBUGGING cout << "temp: compact adddelrec3\n"; addDeletedRec(a.drec(), a); a = b; } } /* alloc with capped table handling. */ int n_complaints_cap = 0; DiskLoc NamespaceDetails::_alloc(const char *ns, int len) { if ( !capped ) return __stdAlloc(len); // capped. assert( len < 400000000 ); int passes = 0; DiskLoc loc; // delete records until we have room and the max # objects limit achieved. Extent *theExtent = firstExtent.ext(); // only one extent if capped. dassert( theExtent->ns == ns ); theExtent->assertOk(); while ( 1 ) { if ( nrecords < max ) { loc = __stdAlloc(len); if ( !loc.isNull() ) break; } DiskLoc fr = theExtent->firstRecord; if ( fr.isNull() ) { if ( ++n_complaints_cap < 8 ) { cout << "couldn't make room for new record in capped ns " << ns << '\n' << " len: " << len << " extentsize:" << lastExtentSize << '\n'; cout << " magic: " << hex << theExtent->magic << " extent->ns: " << theExtent->ns.buf << '\n'; cout << " fr: " << theExtent->firstRecord.toString() << " lr: " << theExtent->lastRecord.toString() << " extent->len: " << theExtent->length << '\n'; assert( len * 5 > lastExtentSize ); // assume it is unusually large record; if not, something is broken } return DiskLoc(); } theDataFileMgr.deleteRecord(ns, fr.rec(), fr, true); compact(); assert( ++passes < 5000 ); } return loc; } /* you MUST call when adding an index. see pdfile.cpp */ void NamespaceDetails::addingIndex(const char *thisns, IndexDetails& details) { assert( nsdetails(thisns) == this ); assert( &details == &indexes[nIndexes] ); nIndexes++; NamespaceDetailsTransient::get(thisns).addedIndex(); } /* returns index of the first index in which the field is present. -1 if not present. (aug08 - this method not currently used) */ int NamespaceDetails::fieldIsIndexed(const char *fieldName) { for ( int i = 0; i < nIndexes; i++ ) { IndexDetails& idx = indexes[i]; BSONObj idxKey = idx.info.obj().getObjectField("key"); // e.g., { ts : -1 } if ( !idxKey.findElement(fieldName).eoo() ) return i; } return -1; } /* ------------------------------------------------------------------------- */ map NamespaceDetailsTransient::map; typedef map::iterator ouriter; NamespaceDetailsTransient& NamespaceDetailsTransient::get(const char *ns) { NamespaceDetailsTransient*& t = map[ns]; if ( t == 0 ) t = new NamespaceDetailsTransient(ns); return *t; } void NamespaceDetailsTransient::computeIndexKeys() { NamespaceDetails *d = nsdetails(ns.c_str()); for ( int i = 0; i < d->nIndexes; i++ ) { // set fields; d->indexes[i].keyPattern().getFieldNames(allIndexKeys); // allIndexKeys.insert(fields.begin(),fields.end()); } } /* ------------------------------------------------------------------------- */ /* add a new namespace to the system catalog (.system.namespaces). options: { capped : ..., size : ... } */ void addNewNamespaceToCatalog(const char *ns, BSONObj *options = 0) { if( verbose ) log() << "New namespace: " << ns << '\n'; if ( strstr(ns, "system.namespaces") ) { // system.namespaces holds all the others, so it is not explicitly listed in the catalog. // TODO: fix above should not be strstr! return; } { BSONObjBuilder b; b.append("name", ns); if ( options ) b.append("options", *options); BSONObj j = b.done(); char database[256]; nsToClient(ns, database); string s = database; s += ".system.namespaces"; theDataFileMgr.insert(s.c_str(), j.objdata(), j.objsize(), true); } }