diff options
Diffstat (limited to 'db/pdfile.cpp')
-rw-r--r-- | db/pdfile.cpp | 1938 |
1 files changed, 969 insertions, 969 deletions
diff --git a/db/pdfile.cpp b/db/pdfile.cpp index c063d91b1c1..ddf74adf615 100644 --- a/db/pdfile.cpp +++ b/db/pdfile.cpp @@ -1,969 +1,969 @@ -// pdfile.cpp
-
-/*
-todo:
-_ manage deleted records. bucket?
-_ use deleted on inserts!
-_ quantize allocations
-_ table scans must be sequential, not next/prev pointers
-_ regex support
-*/
-
-#include "stdafx.h"
-#include "pdfile.h"
-#include "db.h"
-#include "../util/mmap.h"
-#include "../util/hashtab.h"
-#include "objwrappers.h"
-#include "btree.h"
-#include <algorithm>
-#include <list>
-
-const char *dbpath = "/data/db/";
-
-DataFileMgr theDataFileMgr;
-map<string,Client*> clients;
-Client *client;
-const char *curNs = "";
-int MAGIC = 0x1000;
-int curOp = -2;
-int callDepth = 0;
-
-extern int otherTraceLevel;
-
-void sayDbContext(const char *errmsg) {
- if( errmsg ) {
- cout << errmsg << '\n';
- problem() << errmsg << endl;
- }
- cout << " client: " << (client ? client->name.c_str() : "null");
- cout << " op:" << curOp << ' ' << callDepth << endl;
- if( client )
- cout << " ns: " << curNs << endl;
-}
-
-JSObj::JSObj(Record *r) {
- _objdata = r->data;
- _objsize = *((int*) _objdata);
- if( _objsize > r->netLength() ) {
- cout << "About to assert fail _objsize <= r->netLength()" << endl;
- cout << " _objsize: " << _objsize << endl;
- cout << " netLength(): " << r->netLength() << endl;
- cout << " extentOfs: " << r->extentOfs << endl;
- cout << " nextOfs: " << r->nextOfs << endl;
- cout << " prevOfs: " << r->prevOfs << endl;
- assert( _objsize <= r->netLength() );
- }
- iFree = false;
-}
-
-/*---------------------------------------------------------------------*/
-
-int bucketSizes[] = {
- 32, 64, 128, 256, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000,
- 0x8000, 0x10000, 0x20000, 0x40000, 0x80000, 0x100000, 0x200000,
- 0x400000, 0x800000
-};
-
-//NamespaceIndexMgr namespaceIndexMgr;
-
-void NamespaceDetails::addDeletedRec(DeletedRecord *d, DiskLoc dloc) {
- DEBUGGING cout << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs << endl;
- int b = bucket(d->lengthWithHeaders);
- DiskLoc& list = deletedList[b];
- DiskLoc oldHead = list;
- list = dloc;
- d->nextDeleted = oldHead;
-}
-
-/* lenToAlloc is WITH header
-*/
-DiskLoc NamespaceDetails::alloc(const char *ns, int lenToAlloc, DiskLoc& extentLoc) {
- lenToAlloc = (lenToAlloc + 3) & 0xfffffffc;
- DiskLoc loc = _alloc(ns, lenToAlloc);
- if( loc.isNull() )
- return loc;
-
- DeletedRecord *r = loc.drec();
-
- /* note we want to grab from the front so our next pointers on disk tend
- to go in a forward direction which is important for performance. */
- int regionlen = r->lengthWithHeaders;
- extentLoc.set(loc.a(), r->extentOfs);
-
- DEBUGGING cout << "TEMP: alloc() returns " << loc.toString() << ' ' << ns << " lentoalloc:" << lenToAlloc << " ext:" << extentLoc.toString() << endl;
-
- int left = regionlen - lenToAlloc;
- if( left < 24 || (left < (lenToAlloc >> 3) && capped == 0) ) {
- // you get the whole thing.
- return loc;
- }
-
- /* split off some for further use. */
- r->lengthWithHeaders = lenToAlloc;
- DiskLoc newDelLoc = loc;
- newDelLoc.inc(lenToAlloc);
- DeletedRecord *newDel = newDelLoc.drec();
- newDel->extentOfs = r->extentOfs;
- newDel->lengthWithHeaders = left;
- newDel->nextDeleted.Null();
-
- addDeletedRec(newDel, newDelLoc);
-
- return loc;
-}
-
-/* for non-capped collections.
- returned item is out of the deleted list upon return
-*/
-DiskLoc NamespaceDetails::__stdAlloc(int len) {
- DiskLoc *prev;
- DiskLoc *bestprev = 0;
- DiskLoc bestmatch;
- int bestmatchlen = 0x7fffffff;
- int b = bucket(len);
- DiskLoc cur = deletedList[b]; prev = &deletedList[b];
- int extra = 5; // look for a better fit, a little.
- int chain = 0;
- while( 1 ) {
- {
- int a = cur.a();
- if( a < -1 || a >= 100000 ) {
- problem() << "Assertion failure - a() out of range in _alloc() " << a << endl;
- cout << "Assertion failure - a() out of range in _alloc() " << a << '\n';
- sayDbContext();
- if( cur == *prev )
- prev->Null();
- cur.Null();
- }
- }
- if( cur.isNull() ) {
- // move to next bucket. if we were doing "extra", just break
- if( bestmatchlen < 0x7fffffff )
- break;
- b++;
- if( b > MaxBucket ) {
- // out of space. alloc a new extent.
- return DiskLoc();
- }
- cur = deletedList[b]; prev = &deletedList[b];
- continue;
- }
- DeletedRecord *r = cur.drec();
- if( r->lengthWithHeaders >= len &&
- r->lengthWithHeaders < bestmatchlen ) {
- bestmatchlen = r->lengthWithHeaders;
- bestmatch = cur;
- bestprev = prev;
- }
- if( bestmatchlen < 0x7fffffff && --extra <= 0 )
- break;
- if( ++chain > 30 && b < MaxBucket ) {
- // too slow, force move to next bucket to grab a big chunk
- b++;
- chain = 0;
- cur.Null();
- }
- else {
- cur = r->nextDeleted; prev = &r->nextDeleted;
- }
- }
-
- /* unlink ourself from the deleted list */
- *bestprev = bestmatch.drec()->nextDeleted;
-
- return bestmatch;
-}
-
-void NamespaceDetails::dumpDeleted(set<DiskLoc> *extents) {
-// cout << "DUMP deleted chains" << endl;
- for( int i = 0; i < Buckets; i++ ) {
-// cout << " bucket " << i << endl;
- DiskLoc dl = deletedList[i];
- while( !dl.isNull() ) {
- DeletedRecord *r = dl.drec();
- DiskLoc extLoc(dl.a(), r->extentOfs);
- if( extents == 0 || extents->count(extLoc) <= 0 ) {
- cout << " bucket " << i << endl;
- cout << " " << dl.toString() << " ext:" << extLoc.toString();
- if( extents && extents->count(extLoc) <= 0 )
- cout << '?';
- cout << " len:" << r->lengthWithHeaders << endl;
- }
- dl = r->nextDeleted;
- }
- }
-// cout << endl;
-}
-
-/* combine adjacent deleted records
-
- this is O(n^2) but we call it for capped tables where typically n==1 or 2!
- (or 3...there will be a little unused sliver at the end of the extent.)
-*/
-void NamespaceDetails::compact() {
- assert(capped);
- list<DiskLoc> drecs;
-
- for( int i = 0; i < Buckets; i++ ) {
- DiskLoc dl = deletedList[i];
- deletedList[i].Null();
- while( !dl.isNull() ) {
- DeletedRecord *r = dl.drec();
- drecs.push_back(dl);
- dl = r->nextDeleted;
- }
- }
-
- drecs.sort();
-
- list<DiskLoc>::iterator j = drecs.begin();
- assert( j != drecs.end() );
- DiskLoc a = *j;
- while( 1 ) {
- j++;
- if( j == drecs.end() ) {
- DEBUGGING cout << "TEMP: compact adddelrec\n";
- addDeletedRec(a.drec(), a);
- break;
- }
- DiskLoc b = *j;
- while( a.a() == b.a() && a.getOfs() + a.drec()->lengthWithHeaders == b.getOfs() ) {
- // a & b are adjacent. merge.
- a.drec()->lengthWithHeaders += b.drec()->lengthWithHeaders;
- j++;
- if( j == drecs.end() ) {
- DEBUGGING cout << "temp: compact adddelrec2\n";
- addDeletedRec(a.drec(), a);
- return;
- }
- b = *j;
- }
- DEBUGGING cout << "temp: compact adddelrec3\n";
- addDeletedRec(a.drec(), a);
- a = b;
- }
-}
-
-DiskLoc NamespaceDetails::_alloc(const char *ns, int len) {
- if( !capped )
- return __stdAlloc(len);
-
- assert( len < 400000000 );
- int passes = 0;
- DiskLoc loc;
-
- // delete records until we have room and the max # objects limit achieved.
- while( 1 ) {
- if( nrecords < max ) {
- loc = __stdAlloc(len);
- if( !loc.isNull() )
- break;
- }
-
- DiskLoc fr = firstExtent.ext()->firstRecord;
- if( fr.isNull() ) {
- cout << "couldn't make room for new record in capped ns\n";
- cout << " ns:" << ns;
- cout << "\n len: " << len << endl;
- assert(false);
- return DiskLoc();
- }
-
- theDataFileMgr.deleteRecord(ns, fr.rec(), fr, true);
- compact();
- assert( ++passes < 5000 );
- }
-
- return loc;
-}
-
-/*
-class NamespaceCursor : public Cursor {
-public:
- virtual bool ok() { return i >= 0; }
- virtual Record* _current() { assert(false); return 0; }
- virtual DiskLoc currLoc() { assert(false); return DiskLoc(); }
-
- virtual JSObj current() {
- NamespaceDetails &d = namespaceIndex.ht->nodes[i].value;
- JSObjBuilder b;
- b.append("name", namespaceIndex.ht->nodes[i].k.buf);
- return b.done();
- }
-
- virtual bool advance() {
- while( 1 ) {
- i++;
- if( i >= namespaceIndex.ht->n )
- break;
- if( namespaceIndex.ht->nodes[i].inUse() )
- return true;
- }
- i = -1000000;
- return false;
- }
-
- NamespaceCursor() {
- i = -1;
- advance();
- }
-private:
- int i;
-};
-
-auto_ptr<Cursor> makeNamespaceCursor() {
- return auto_ptr<Cursor>(new NamespaceCursor());
-}*/
-
-void newNamespace(const char *ns) {
- cout << "New namespace: " << ns << endl;
- if( strstr(ns, "system.namespaces") == 0 ) {
- JSObjBuilder b;
- b.append("name", ns);
- JSObj j = b.done();
- char client[256];
- nsToClient(ns, client);
- string s = client;
- s += ".system.namespaces";
- theDataFileMgr.insert(s.c_str(), j.objdata(), j.objsize(), true);
- }
-}
-
-int initialExtentSize(int len) {
- long long sz = len * 16;
- if( len < 1000 ) sz = len * 64;
- if( sz > 1000000000 )
- sz = 1000000000;
- int z = ((int)sz) & 0xffffff00;
- assert( z > len );
- cout << "initialExtentSize(" << len << ") returns " << z << endl;
- return z;
-}
-
-// { ..., capped: true, size: ..., max: ... }
-bool userCreateNS(const char *ns, JSObj& j) {
- if( nsdetails(ns) )
- return false;
-
- cout << j.toString() << endl;
-
- newNamespace(ns);
-
- int ies = initialExtentSize(128);
- Element e = j.findElement("size");
- if( e.type() == Number ) {
- ies = (int) e.number();
- ies += 256;
- ies &= 0xffffff00;
- if( ies > 1024 * 1024 * 1024 + 256 ) return false;
- }
-
- client->newestFile()->newExtent(ns, ies);
- NamespaceDetails *d = nsdetails(ns);
- assert(d);
-
- e = j.findElement("capped");
- if( e.type() == Bool && e.boolean() ) {
- d->capped = 1;
- e = j.findElement("max");
- if( e.type() == Number ) {
- int mx = (int) e.number();
- if( mx > 0 )
- d->max = mx;
- }
- }
-
- return true;
-}
-
-/*---------------------------------------------------------------------*/
-
-void PhysicalDataFile::open(int fn, const char *filename) {
- int length;
-
- if( fn <= 4 ) {
- length = (64*1024*1024) << fn;
- if( strstr(filename, "alleyinsider") && length < 1024 * 1024 * 1024 )
- length = 1024 * 1024 * 1024;
- } else
- length = 0x7ff00000;
-
- assert( length >= 64*1024*1024 && length % 4096 == 0 );
-
- assert(fn == fileNo);
- header = (PDFHeader *) mmf.map(filename, length);
- assert(header);
- header->init(fileNo, length);
-}
-
-/* prev - previous extent for this namespace. null=this is the first one. */
-Extent* PhysicalDataFile::newExtent(const char *ns, int approxSize, int loops) {
- assert( approxSize >= 0 && approxSize <= 0x7ff00000 );
-
- int ExtentSize = approxSize <= header->unusedLength ? approxSize : header->unusedLength;
- DiskLoc loc;
- if( ExtentSize <= 0 ) {
- if( loops > 8 ) {
- assert(false);
- return 0;
- }
- cout << "INFO: newExtent(): file full, adding a new file " << ns << endl;
- return client->addAFile()->newExtent(ns, approxSize, loops+1);
- }
- int offset = header->unused.getOfs();
- header->unused.setOfs( fileNo, offset + ExtentSize );
- header->unusedLength -= ExtentSize;
- loc.setOfs(fileNo, offset);
- Extent *e = _getExtent(loc);
- DiskLoc emptyLoc = e->init(ns, ExtentSize, fileNo, offset);
-
- DiskLoc oldExtentLoc;
- NamespaceIndex *ni = nsindex(ns);
- NamespaceDetails *details = ni->details(ns);
- if( details ) {
- assert( !details->firstExtent.isNull() );
- e->xprev = details->lastExtent;
- details->lastExtent.ext()->xnext = loc;
- details->lastExtent = loc;
- }
- else {
- ni->add(ns, loc);
- details = ni->details(ns);
- }
-
- details->lastExtentSize = approxSize;
- DEBUGGING cout << "temp: newextent adddelrec " << ns << endl;
- details->addDeletedRec(emptyLoc.drec(), emptyLoc);
-
- cout << "new extent size: 0x" << hex << ExtentSize << " loc: 0x" << hex << offset << dec;
- cout << " emptyLoc:" << hex << emptyLoc.getOfs() << dec;
- cout << ' ' << ns << endl;
- return e;
-}
-
-/*---------------------------------------------------------------------*/
-
-/* assumes already zeroed -- insufficient for block 'reuse' perhaps */
-DiskLoc Extent::init(const char *nsname, int _length, int _fileNo, int _offset) {
- magic = 0x41424344;
- myLoc.setOfs(_fileNo, _offset);
- xnext.Null(); xprev.Null();
- ns = nsname;
- length = _length;
- firstRecord.Null(); lastRecord.Null();
-
- DiskLoc emptyLoc = myLoc;
- emptyLoc.inc( (extentData-(char*)this) );
-
- DeletedRecord *empty1 = (DeletedRecord *) extentData;
- DeletedRecord *empty = (DeletedRecord *) getRecord(emptyLoc);
- assert( empty == empty1 );
- empty->lengthWithHeaders = _length - (extentData - (char *) this);
- empty->extentOfs = myLoc.getOfs();
- return emptyLoc;
-}
-
-/*
-Record* Extent::newRecord(int len) {
- if( firstEmptyRegion.isNull() )
- return 0;
-
- assert(len > 0);
- int newRecSize = len + Record::HeaderSize;
- DiskLoc newRecordLoc = firstEmptyRegion;
- Record *r = getRecord(newRecordLoc);
- int left = r->netLength() - len;
- if( left < 0 ) {
- //
- firstEmptyRegion.Null();
- return 0;
- }
-
- DiskLoc nextEmpty = r->next.getNextEmpty(firstEmptyRegion);
- r->lengthWithHeaders = newRecSize;
- r->next.markAsFirstOrLastInExtent(this); // we're now last in the extent
- if( !lastRecord.isNull() ) {
- assert(getRecord(lastRecord)->next.lastInExtent()); // it was the last one
- getRecord(lastRecord)->next.set(newRecordLoc); // until now
- r->prev.set(lastRecord);
- }
- else {
- r->prev.markAsFirstOrLastInExtent(this); // we are the first in the extent
- assert( firstRecord.isNull() );
- firstRecord = newRecordLoc;
- }
- lastRecord = newRecordLoc;
-
- if( left < Record::HeaderSize + 32 ) {
- firstEmptyRegion.Null();
- }
- else {
- firstEmptyRegion.inc(newRecSize);
- Record *empty = getRecord(firstEmptyRegion);
- empty->next.set(nextEmpty); // not for empty records, unless in-use records, next and prev can be null.
- empty->prev.Null();
- empty->lengthWithHeaders = left;
- }
-
- return r;
-}
-*/
-
-/*---------------------------------------------------------------------*/
-
-auto_ptr<Cursor> DataFileMgr::findAll(const char *ns) {
- DiskLoc loc;
- bool found = nsindex(ns)->find(ns, loc);
- if( !found ) {
- // cout << "info: findAll() namespace does not exist: " << ns << endl;
- return auto_ptr<Cursor>(new BasicCursor(DiskLoc()));
- }
-
- Extent *e = getExtent(loc);
-
- DEBUGGING {
- cout << "temp: listing extents for " << ns << endl;
- DiskLoc tmp = loc;
- set<DiskLoc> extents;
-
- while( 1 ) {
- Extent *f = getExtent(tmp);
- cout << "extent: " << tmp.toString() << endl;
- extents.insert(tmp);
- tmp = f->xnext;
- if( tmp.isNull() )
- break;
- f = f->getNextExtent();
- }
-
- cout << endl;
- nsdetails(ns)->dumpDeleted(&extents);
- }
-
- while( e->firstRecord.isNull() && !e->xnext.isNull() ) {
- OCCASIONALLY cout << "info DFM::findAll(): extent " << loc.toString() << " was empty, skipping ahead" << endl;
- // find a nonempty extent
- // it might be nice to free the whole extent here! but have to clean up free recs then.
- e = e->getNextExtent();
- }
- return auto_ptr<Cursor>(new BasicCursor( e->firstRecord ));
-}
-
-/* get a table scan cursor, but can be forward or reverse direction */
-auto_ptr<Cursor> findTableScan(const char *ns, JSObj& order) {
- Element el = order.findElement("$natural");
- if( el.type() != Number || el.number() >= 0 )
- return DataFileMgr::findAll(ns);
-
- // "reverse natural order" // "reverse natural order"
- NamespaceDetails *d = nsdetails(ns);
- if( !d )
- return auto_ptr<Cursor>(new BasicCursor(DiskLoc()));
- Extent *e = d->lastExtent.ext();
- while( e->lastRecord.isNull() && !e->xprev.isNull() ) {
- OCCASIONALLY cout << " findTableScan: extent empty, skipping ahead" << endl;
- e = e->getPrevExtent();
- }
- return auto_ptr<Cursor>(new ReverseCursor( e->lastRecord ));
-}
-
-void aboutToDelete(const DiskLoc& dl);
-
-/* pull out the relevant key objects from obj, so we
- can index them. Note that the set is multiple elements
- only when it's a "multikey" array.
- keys will be left empty if key not found in the object.
-*/
-void IndexDetails::getKeysFromObject(JSObj& obj, set<JSObj>& keys) {
- JSObj keyPattern = info.obj().getObjectField("key");
- JSObjBuilder b;
- JSObj key = obj.extractFields(keyPattern, b);
- if( key.isEmpty() )
- return;
- Element f = key.firstElement();
- if( f.type() != Array ) {
- b.decouple();
- key.iWillFree();
- keys.insert(key);
- return;
- }
- JSObj arr = f.embeddedObject();
-// cout << arr.toString() << endl;
- JSElemIter i(arr);
- while( i.more() ) {
- Element e = i.next();
- if( e.eoo() ) break;
- JSObjBuilder b;
-
- b.appendAs(e, f.fieldName());
- JSObj o = b.doneAndDecouple();
- assert( o.objdata() );
- keys.insert(o);
- }
-}
-
-int nUnindexes = 0;
-
-void _unindexRecord(const char *ns, IndexDetails& id, JSObj& obj, const DiskLoc& dl) {
- set<JSObj> keys;
- id.getKeysFromObject(obj, keys);
- for( set<JSObj>::iterator i=keys.begin(); i != keys.end(); i++ ) {
- JSObj j = *i;
-// cout << "UNINDEX: j:" << j.toString() << " head:" << id.head.toString() << dl.toString() << endl;
- if( otherTraceLevel >= 5 ) {
- cout << "_unindexRecord() " << obj.toString();
- cout << "\n unindex:" << j.toString() << endl;
- }
- nUnindexes++;
- bool ok = false;
- try {
- ok = id.head.btree()->unindex(id.head, id, j, dl);
- }
- catch(AssertionException) {
- cout << " caught assertion _unindexRecord " << id.indexNamespace() << '\n';
- problem() << "Assertion failure: _unindex failed " << id.indexNamespace() << endl;
- cout << "Assertion failure: _unindex failed" << '\n';
- cout << " obj:" << obj.toString() << '\n';
- cout << " key:" << j.toString() << '\n';
- cout << " dl:" << dl.toString() << endl;
- sayDbContext();
- }
-
- if( !ok ) {
- cout << "unindex failed (key too big?) " << id.indexNamespace() << '\n';
- }
- }
-}
-
-void unindexRecord(const char *ns, NamespaceDetails *d, Record *todelete, const DiskLoc& dl) {
- if( d->nIndexes == 0 ) return;
- JSObj obj(todelete);
- for( int i = 0; i < d->nIndexes; i++ ) {
- _unindexRecord(ns, d->indexes[i], obj, dl);
- }
-}
-
-void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK)
-{
- int tempextofs = todelete->extentOfs;
-
- NamespaceDetails* d = nsdetails(ns);
- if( d->capped && !cappedOK ) {
- cout << "failing remove on a capped ns " << ns << endl;
- return;
- }
-
- /* check if any cursors point to us. if so, advance them. */
- aboutToDelete(dl);
-
- unindexRecord(ns, d, todelete, dl);
-
- /* remove ourself from the record next/prev chain */
- {
- if( todelete->prevOfs != DiskLoc::NullOfs )
- todelete->getPrev(dl).rec()->nextOfs = todelete->nextOfs;
- if( todelete->nextOfs != DiskLoc::NullOfs )
- todelete->getNext(dl).rec()->prevOfs = todelete->prevOfs;
- }
-
- /* remove ourself from extent pointers */
- {
- Extent *e = todelete->myExtent(dl);
- if( e->firstRecord == dl ) {
- if( todelete->nextOfs == DiskLoc::NullOfs )
- e->firstRecord.Null();
- else
- e->firstRecord.setOfs(dl.a(), todelete->nextOfs);
- }
- if( e->lastRecord == dl ) {
- if( todelete->prevOfs == DiskLoc::NullOfs )
- e->lastRecord.Null();
- else
- e->lastRecord.setOfs(dl.a(), todelete->prevOfs);
- }
- }
-
- /* add to the free list */
- {
- d->nrecords--;
- d->datasize -= todelete->netLength();
-/// DEBUGGING << "temp: dddelrec deleterecord " << ns << endl;
-// if( todelete->extentOfs == 0xaca500 ) {
-// cout << "break\n";
-// }
-/*
-TEMP: add deleted rec 0:aca5b0 aca500
-temp: adddelrec deleterecord admin.blog.posts
-TEMP: add deleted rec 0:b9e750 b6a500
-temp: adddelrec deleterecord admin.blog.posts
-*/
-
- d->addDeletedRec((DeletedRecord*)todelete, dl);
- }
-}
-
-void setDifference(set<JSObj>& l, set<JSObj>& r, vector<JSObj*> &diff) {
- set<JSObj>::iterator i = l.begin();
- set<JSObj>::iterator j = r.begin();
- while( 1 ) {
- if( i == l.end() )
- break;
- while( j != r.end() && *j < *i )
- j++;
- if( j == r.end() || !i->woEqual(*j) ) {
- const JSObj *jo = &*i;
- diff.push_back( (JSObj *) jo );
- }
- i++;
- }
-}
-
-/** Note: as written so far, if the object shrinks a lot, we don't free up space. */
-void DataFileMgr::update(
- const char *ns,
- Record *toupdate, const DiskLoc& dl,
- const char *buf, int len, stringstream& ss)
-{
- NamespaceDetails *d = nsdetails(ns);
-
- if( toupdate->netLength() < len ) {
- if( d && d->capped ) {
- ss << " failing a growing update on a capped ns " << ns << endl;
- return;
- }
-
- // doesn't fit.
- if( client->profile )
- ss << " moved ";
- deleteRecord(ns, toupdate, dl);
- insert(ns, buf, len);
- return;
- }
-
- /* has any index keys changed? */
- {
- NamespaceDetails *d = nsdetails(ns);
- if( d->nIndexes ) {
- JSObj newObj(buf);
- JSObj oldObj = dl.obj();
- for( int i = 0; i < d->nIndexes; i++ ) {
- IndexDetails& idx = d->indexes[i];
- JSObj idxKey = idx.info.obj().getObjectField("key");
-
- set<JSObj> oldkeys;
- set<JSObj> newkeys;
- idx.getKeysFromObject(oldObj, oldkeys);
- idx.getKeysFromObject(newObj, newkeys);
- vector<JSObj*> removed;
- setDifference(oldkeys, newkeys, removed);
- string idxns = idx.indexNamespace();
- for( unsigned i = 0; i < removed.size(); i++ ) {
- try {
- idx.head.btree()->unindex(idx.head, idx, *removed[i], dl);
- }
- catch(AssertionException) {
- ss << " exception update unindex ";
- cout << " caught assertion update unindex " << idxns.c_str() << '\n';
- problem() << " caught assertion update unindex " << idxns.c_str() << endl;
- }
- }
- vector<JSObj*> added;
- setDifference(newkeys, oldkeys, added);
- assert( !dl.isNull() );
- for( unsigned i = 0; i < added.size(); i++ ) {
- try {
- idx.head.btree()->insert(
- idx.head,
- dl, *added[i], false, idx, true);
- }
- catch(AssertionException) {
- ss << " exception update index ";
- cout << " caught assertion update index " << idxns.c_str() << '\n';
- problem() << " caught assertion update index " << idxns.c_str() << endl;
- }
- }
- if( client->profile )
- ss << "<br>" << added.size() << " key updates ";
-
- }
- }
- }
-
- // update in place
- memcpy(toupdate->data, buf, len);
-}
-
-int followupExtentSize(int len, int lastExtentLen) {
- int x = initialExtentSize(len);
- int y = (int) (lastExtentLen < 4000000 ? lastExtentLen * 4.0 : lastExtentLen * 1.2);
- int sz = y > x ? y : x;
- sz = ((int)sz) & 0xffffff00;
- assert( sz > len );
- return sz;
-}
-
-int deb=0;
-
-/* add keys to indexes for a new record */
-void _indexRecord(IndexDetails& idx, JSObj& obj, DiskLoc newRecordLoc) {
-
- set<JSObj> keys;
- idx.getKeysFromObject(obj, keys);
- for( set<JSObj>::iterator i=keys.begin(); i != keys.end(); i++ ) {
- assert( !newRecordLoc.isNull() );
- try {
-// DEBUGGING << "temp index: " << newRecordLoc.toString() << obj.toString() << endl;
- idx.head.btree()->insert(idx.head, newRecordLoc,
- (JSObj&) *i, false, idx, true);
- }
- catch(AssertionException) {
- cout << " caught assertion _indexRecord " << idx.indexNamespace() << '\n';
- problem() << " caught assertion _indexRecord " << idx.indexNamespace() << endl;
- }
- }
-}
-
-/* note there are faster ways to build an index in bulk, that can be
- done eventually */
-void addExistingToIndex(const char *ns, IndexDetails& idx) {
- cout << "Adding all existing records for " << ns << " to new index" << endl;
- int n = 0;
- auto_ptr<Cursor> c = theDataFileMgr.findAll(ns);
- while( c->ok() ) {
- JSObj js = c->current();
- _indexRecord(idx, js, c->currLoc());
- c->advance();
- n++;
- };
- cout << " indexing complete for " << n << " records" << endl;
-}
-
-/* add keys to indexes for a new record */
-void indexRecord(NamespaceDetails *d, const void *buf, int len, DiskLoc newRecordLoc) {
- JSObj obj((const char *)buf);
- for( int i = 0; i < d->nIndexes; i++ ) {
- _indexRecord(d->indexes[i], obj, newRecordLoc);
- }
-}
-
-DiskLoc DataFileMgr::insert(const char *ns, const void *buf, int len, bool god) {
- bool addIndex = false;
- const char *sys = strstr(ns, "system.");
- if( sys ) {
- if( sys == ns ) {
- cout << "ERROR: attempt to insert for invalid client 'system': " << ns << endl;
- return DiskLoc();
- }
- if( strstr(ns, ".system.") ) {
- if( strstr(ns, ".system.indexes") )
- addIndex = true;
- else if( !god ) {
- cout << "ERROR: attempt to insert in system namespace " << ns << endl;
- return DiskLoc();
- }
- }
- }
-
- NamespaceDetails *d = nsdetails(ns);
- if( d == 0 ) {
- newNamespace(ns);
- client->newestFile()->newExtent(ns, initialExtentSize(len));
- d = nsdetails(ns);
- }
-
- NamespaceDetails *tableToIndex = 0;
-
- const char *tabletoidxns = 0;
- if( addIndex ) {
- JSObj io((const char *) buf);
- const char *name = io.getStringField("name"); // name of the index
- tabletoidxns = io.getStringField("ns"); // table it indexes
- JSObj key = io.getObjectField("key");
- if( name == 0 || *name == 0 || tabletoidxns == 0 || key.isEmpty() || key.objsize() > 2048 ) {
- cout << "user warning: bad add index attempt name:" << (name?name:"") << " ns:" <<
- (tabletoidxns?tabletoidxns:"") << endl;
- return DiskLoc();
- }
- tableToIndex = nsdetails(tabletoidxns);
- if( tableToIndex == 0 ) {
- cout << "user warning: bad add index attempt, no such table(ns):" << tabletoidxns << endl;
- return DiskLoc();
- }
- if( tableToIndex->nIndexes >= MaxIndexes ) {
- cout << "user warning: bad add index attempt, too many indexes for:" << tabletoidxns << endl;
- return DiskLoc();
- }
- if( tableToIndex->findIndexByName(name) >= 0 ) {
- //cout << "INFO: index:" << name << " already exists for:" << tabletoidxns << endl;
- return DiskLoc();
- }
- //indexFullNS = tabletoidxns;
- //indexFullNS += ".$";
- //indexFullNS += name; // client.table.$index -- note this doesn't contain jsobjs, it contains BtreeBuckets.
- }
-
- DiskLoc extentLoc;
- int lenWHdr = len + Record::HeaderSize;
- DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc);
- if( loc.isNull() ) {
- // out of space
- if( d->capped == 0 ) { // size capped doesn't grow
- cout << "allocating new extent for " << ns << endl;
- client->newestFile()->newExtent(ns, followupExtentSize(len, d->lastExtentSize));
- loc = d->alloc(ns, lenWHdr, extentLoc);
- }
- if( loc.isNull() ) {
- cout << "out of space in datafile. capped:" << d->capped << endl;
- assert(d->capped);
- return DiskLoc();
- }
- }
-
- Record *r = loc.rec();
- assert( r->lengthWithHeaders >= lenWHdr );
- memcpy(r->data, buf, len);
- Extent *e = r->myExtent(loc);
- if( e->lastRecord.isNull() ) {
- e->firstRecord = e->lastRecord = loc;
- r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
- }
- else {
- Record *oldlast = e->lastRecord.rec();
- r->prevOfs = e->lastRecord.getOfs();
- r->nextOfs = DiskLoc::NullOfs;
- oldlast->nextOfs = loc.getOfs();
- e->lastRecord = loc;
- }
-
- d->nrecords++;
- d->datasize += r->netLength();
-
- if( tableToIndex ) {
- IndexDetails& idxinfo = tableToIndex->indexes[tableToIndex->nIndexes];
- idxinfo.info = loc;
- idxinfo.head = BtreeBucket::addHead(idxinfo);
- tableToIndex->nIndexes++;
- /* todo: index existing records here */
- addExistingToIndex(tabletoidxns, idxinfo);
- }
-
- /* add this record to our indexes */
- if( d->nIndexes )
- indexRecord(d, buf, len, loc);
-
-// cout << " inserted at loc:" << hex << loc.getOfs() << " lenwhdr:" << hex << lenWHdr << dec << ' ' << ns << endl;
- return loc;
-}
-
-void DataFileMgr::init(const char *dir) {
-/* string path = dir;
- path += "temp.dat";
- temp.open(path.c_str(), 64 * 1024 * 1024);
-*/
-}
-
-void pdfileInit() {
-// namespaceIndex.init(dbpath);
- theDataFileMgr.init(dbpath);
-}
+// pdfile.cpp + +/* +todo: +_ manage deleted records. bucket? +_ use deleted on inserts! +_ quantize allocations +_ table scans must be sequential, not next/prev pointers +_ regex support +*/ + +#include "stdafx.h" +#include "pdfile.h" +#include "db.h" +#include "../util/mmap.h" +#include "../util/hashtab.h" +#include "objwrappers.h" +#include "btree.h" +#include <algorithm> +#include <list> + +const char *dbpath = "/data/db/"; + +DataFileMgr theDataFileMgr; +map<string,Client*> clients; +Client *client; +const char *curNs = ""; +int MAGIC = 0x1000; +int curOp = -2; +int callDepth = 0; + +extern int otherTraceLevel; + +void sayDbContext(const char *errmsg) { + if( errmsg ) { + cout << errmsg << '\n'; + problem() << errmsg << endl; + } + cout << " client: " << (client ? client->name.c_str() : "null"); + cout << " op:" << curOp << ' ' << callDepth << endl; + if( client ) + cout << " ns: " << curNs << endl; +} + +JSObj::JSObj(Record *r) { + _objdata = r->data; + _objsize = *((int*) _objdata); + if( _objsize > r->netLength() ) { + cout << "About to assert fail _objsize <= r->netLength()" << endl; + cout << " _objsize: " << _objsize << endl; + cout << " netLength(): " << r->netLength() << endl; + cout << " extentOfs: " << r->extentOfs << endl; + cout << " nextOfs: " << r->nextOfs << endl; + cout << " prevOfs: " << r->prevOfs << endl; + assert( _objsize <= r->netLength() ); + } + iFree = false; +} + +/*---------------------------------------------------------------------*/ + +int bucketSizes[] = { + 32, 64, 128, 256, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, + 0x8000, 0x10000, 0x20000, 0x40000, 0x80000, 0x100000, 0x200000, + 0x400000, 0x800000 +}; + +//NamespaceIndexMgr namespaceIndexMgr; + +void NamespaceDetails::addDeletedRec(DeletedRecord *d, DiskLoc dloc) { + DEBUGGING cout << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs << endl; + int b = bucket(d->lengthWithHeaders); + DiskLoc& list = deletedList[b]; + DiskLoc oldHead = list; + list = dloc; + d->nextDeleted = oldHead; +} + +/* lenToAlloc is WITH header +*/ +DiskLoc NamespaceDetails::alloc(const char *ns, int lenToAlloc, DiskLoc& extentLoc) { + lenToAlloc = (lenToAlloc + 3) & 0xfffffffc; + DiskLoc loc = _alloc(ns, lenToAlloc); + if( loc.isNull() ) + return loc; + + DeletedRecord *r = loc.drec(); + + /* note we want to grab from the front so our next pointers on disk tend + to go in a forward direction which is important for performance. */ + int regionlen = r->lengthWithHeaders; + extentLoc.set(loc.a(), r->extentOfs); + + DEBUGGING cout << "TEMP: alloc() returns " << loc.toString() << ' ' << ns << " lentoalloc:" << lenToAlloc << " ext:" << extentLoc.toString() << endl; + + int left = regionlen - lenToAlloc; + if( left < 24 || (left < (lenToAlloc >> 3) && capped == 0) ) { + // you get the whole thing. + return loc; + } + + /* split off some for further use. */ + r->lengthWithHeaders = lenToAlloc; + DiskLoc newDelLoc = loc; + newDelLoc.inc(lenToAlloc); + DeletedRecord *newDel = newDelLoc.drec(); + newDel->extentOfs = r->extentOfs; + newDel->lengthWithHeaders = left; + newDel->nextDeleted.Null(); + + addDeletedRec(newDel, newDelLoc); + + return loc; +} + +/* for non-capped collections. + returned item is out of the deleted list upon return +*/ +DiskLoc NamespaceDetails::__stdAlloc(int len) { + DiskLoc *prev; + DiskLoc *bestprev = 0; + DiskLoc bestmatch; + int bestmatchlen = 0x7fffffff; + int b = bucket(len); + DiskLoc cur = deletedList[b]; prev = &deletedList[b]; + int extra = 5; // look for a better fit, a little. + int chain = 0; + while( 1 ) { + { + int a = cur.a(); + if( a < -1 || a >= 100000 ) { + problem() << "Assertion failure - a() out of range in _alloc() " << a << endl; + cout << "Assertion failure - a() out of range in _alloc() " << a << '\n'; + sayDbContext(); + if( cur == *prev ) + prev->Null(); + cur.Null(); + } + } + if( cur.isNull() ) { + // move to next bucket. if we were doing "extra", just break + if( bestmatchlen < 0x7fffffff ) + break; + b++; + if( b > MaxBucket ) { + // out of space. alloc a new extent. + return DiskLoc(); + } + cur = deletedList[b]; prev = &deletedList[b]; + continue; + } + DeletedRecord *r = cur.drec(); + if( r->lengthWithHeaders >= len && + r->lengthWithHeaders < bestmatchlen ) { + bestmatchlen = r->lengthWithHeaders; + bestmatch = cur; + bestprev = prev; + } + if( bestmatchlen < 0x7fffffff && --extra <= 0 ) + break; + if( ++chain > 30 && b < MaxBucket ) { + // too slow, force move to next bucket to grab a big chunk + b++; + chain = 0; + cur.Null(); + } + else { + cur = r->nextDeleted; prev = &r->nextDeleted; + } + } + + /* unlink ourself from the deleted list */ + *bestprev = bestmatch.drec()->nextDeleted; + + return bestmatch; +} + +void NamespaceDetails::dumpDeleted(set<DiskLoc> *extents) { +// cout << "DUMP deleted chains" << endl; + for( int i = 0; i < Buckets; i++ ) { +// cout << " bucket " << i << endl; + DiskLoc dl = deletedList[i]; + while( !dl.isNull() ) { + DeletedRecord *r = dl.drec(); + DiskLoc extLoc(dl.a(), r->extentOfs); + if( extents == 0 || extents->count(extLoc) <= 0 ) { + cout << " bucket " << i << endl; + cout << " " << dl.toString() << " ext:" << extLoc.toString(); + if( extents && extents->count(extLoc) <= 0 ) + cout << '?'; + cout << " len:" << r->lengthWithHeaders << endl; + } + dl = r->nextDeleted; + } + } +// cout << endl; +} + +/* combine adjacent deleted records + + this is O(n^2) but we call it for capped tables where typically n==1 or 2! + (or 3...there will be a little unused sliver at the end of the extent.) +*/ +void NamespaceDetails::compact() { + assert(capped); + list<DiskLoc> drecs; + + for( int i = 0; i < Buckets; i++ ) { + DiskLoc dl = deletedList[i]; + deletedList[i].Null(); + while( !dl.isNull() ) { + DeletedRecord *r = dl.drec(); + drecs.push_back(dl); + dl = r->nextDeleted; + } + } + + drecs.sort(); + + list<DiskLoc>::iterator j = drecs.begin(); + assert( j != drecs.end() ); + DiskLoc a = *j; + while( 1 ) { + j++; + if( j == drecs.end() ) { + DEBUGGING cout << "TEMP: compact adddelrec\n"; + addDeletedRec(a.drec(), a); + break; + } + DiskLoc b = *j; + while( a.a() == b.a() && a.getOfs() + a.drec()->lengthWithHeaders == b.getOfs() ) { + // a & b are adjacent. merge. + a.drec()->lengthWithHeaders += b.drec()->lengthWithHeaders; + j++; + if( j == drecs.end() ) { + DEBUGGING cout << "temp: compact adddelrec2\n"; + addDeletedRec(a.drec(), a); + return; + } + b = *j; + } + DEBUGGING cout << "temp: compact adddelrec3\n"; + addDeletedRec(a.drec(), a); + a = b; + } +} + +DiskLoc NamespaceDetails::_alloc(const char *ns, int len) { + if( !capped ) + return __stdAlloc(len); + + assert( len < 400000000 ); + int passes = 0; + DiskLoc loc; + + // delete records until we have room and the max # objects limit achieved. + while( 1 ) { + if( nrecords < max ) { + loc = __stdAlloc(len); + if( !loc.isNull() ) + break; + } + + DiskLoc fr = firstExtent.ext()->firstRecord; + if( fr.isNull() ) { + cout << "couldn't make room for new record in capped ns\n"; + cout << " ns:" << ns; + cout << "\n len: " << len << endl; + assert(false); + return DiskLoc(); + } + + theDataFileMgr.deleteRecord(ns, fr.rec(), fr, true); + compact(); + assert( ++passes < 5000 ); + } + + return loc; +} + +/* +class NamespaceCursor : public Cursor { +public: + virtual bool ok() { return i >= 0; } + virtual Record* _current() { assert(false); return 0; } + virtual DiskLoc currLoc() { assert(false); return DiskLoc(); } + + virtual JSObj current() { + NamespaceDetails &d = namespaceIndex.ht->nodes[i].value; + JSObjBuilder b; + b.append("name", namespaceIndex.ht->nodes[i].k.buf); + return b.done(); + } + + virtual bool advance() { + while( 1 ) { + i++; + if( i >= namespaceIndex.ht->n ) + break; + if( namespaceIndex.ht->nodes[i].inUse() ) + return true; + } + i = -1000000; + return false; + } + + NamespaceCursor() { + i = -1; + advance(); + } +private: + int i; +}; + +auto_ptr<Cursor> makeNamespaceCursor() { + return auto_ptr<Cursor>(new NamespaceCursor()); +}*/ + +void newNamespace(const char *ns) { + cout << "New namespace: " << ns << endl; + if( strstr(ns, "system.namespaces") == 0 ) { + JSObjBuilder b; + b.append("name", ns); + JSObj j = b.done(); + char client[256]; + nsToClient(ns, client); + string s = client; + s += ".system.namespaces"; + theDataFileMgr.insert(s.c_str(), j.objdata(), j.objsize(), true); + } +} + +int initialExtentSize(int len) { + long long sz = len * 16; + if( len < 1000 ) sz = len * 64; + if( sz > 1000000000 ) + sz = 1000000000; + int z = ((int)sz) & 0xffffff00; + assert( z > len ); + cout << "initialExtentSize(" << len << ") returns " << z << endl; + return z; +} + +// { ..., capped: true, size: ..., max: ... } +bool userCreateNS(const char *ns, JSObj& j) { + if( nsdetails(ns) ) + return false; + + cout << j.toString() << endl; + + newNamespace(ns); + + int ies = initialExtentSize(128); + Element e = j.findElement("size"); + if( e.type() == Number ) { + ies = (int) e.number(); + ies += 256; + ies &= 0xffffff00; + if( ies > 1024 * 1024 * 1024 + 256 ) return false; + } + + client->newestFile()->newExtent(ns, ies); + NamespaceDetails *d = nsdetails(ns); + assert(d); + + e = j.findElement("capped"); + if( e.type() == Bool && e.boolean() ) { + d->capped = 1; + e = j.findElement("max"); + if( e.type() == Number ) { + int mx = (int) e.number(); + if( mx > 0 ) + d->max = mx; + } + } + + return true; +} + +/*---------------------------------------------------------------------*/ + +void PhysicalDataFile::open(int fn, const char *filename) { + int length; + + if( fn <= 4 ) { + length = (64*1024*1024) << fn; + if( strstr(filename, "alleyinsider") && length < 1024 * 1024 * 1024 ) + length = 1024 * 1024 * 1024; + } else + length = 0x7ff00000; + + assert( length >= 64*1024*1024 && length % 4096 == 0 ); + + assert(fn == fileNo); + header = (PDFHeader *) mmf.map(filename, length); + assert(header); + header->init(fileNo, length); +} + +/* prev - previous extent for this namespace. null=this is the first one. */ +Extent* PhysicalDataFile::newExtent(const char *ns, int approxSize, int loops) { + assert( approxSize >= 0 && approxSize <= 0x7ff00000 ); + + int ExtentSize = approxSize <= header->unusedLength ? approxSize : header->unusedLength; + DiskLoc loc; + if( ExtentSize <= 0 ) { + if( loops > 8 ) { + assert(false); + return 0; + } + cout << "INFO: newExtent(): file full, adding a new file " << ns << endl; + return client->addAFile()->newExtent(ns, approxSize, loops+1); + } + int offset = header->unused.getOfs(); + header->unused.setOfs( fileNo, offset + ExtentSize ); + header->unusedLength -= ExtentSize; + loc.setOfs(fileNo, offset); + Extent *e = _getExtent(loc); + DiskLoc emptyLoc = e->init(ns, ExtentSize, fileNo, offset); + + DiskLoc oldExtentLoc; + NamespaceIndex *ni = nsindex(ns); + NamespaceDetails *details = ni->details(ns); + if( details ) { + assert( !details->firstExtent.isNull() ); + e->xprev = details->lastExtent; + details->lastExtent.ext()->xnext = loc; + details->lastExtent = loc; + } + else { + ni->add(ns, loc); + details = ni->details(ns); + } + + details->lastExtentSize = approxSize; + DEBUGGING cout << "temp: newextent adddelrec " << ns << endl; + details->addDeletedRec(emptyLoc.drec(), emptyLoc); + + cout << "new extent size: 0x" << hex << ExtentSize << " loc: 0x" << hex << offset << dec; + cout << " emptyLoc:" << hex << emptyLoc.getOfs() << dec; + cout << ' ' << ns << endl; + return e; +} + +/*---------------------------------------------------------------------*/ + +/* assumes already zeroed -- insufficient for block 'reuse' perhaps */ +DiskLoc Extent::init(const char *nsname, int _length, int _fileNo, int _offset) { + magic = 0x41424344; + myLoc.setOfs(_fileNo, _offset); + xnext.Null(); xprev.Null(); + ns = nsname; + length = _length; + firstRecord.Null(); lastRecord.Null(); + + DiskLoc emptyLoc = myLoc; + emptyLoc.inc( (extentData-(char*)this) ); + + DeletedRecord *empty1 = (DeletedRecord *) extentData; + DeletedRecord *empty = (DeletedRecord *) getRecord(emptyLoc); + assert( empty == empty1 ); + empty->lengthWithHeaders = _length - (extentData - (char *) this); + empty->extentOfs = myLoc.getOfs(); + return emptyLoc; +} + +/* +Record* Extent::newRecord(int len) { + if( firstEmptyRegion.isNull() ) + return 0; + + assert(len > 0); + int newRecSize = len + Record::HeaderSize; + DiskLoc newRecordLoc = firstEmptyRegion; + Record *r = getRecord(newRecordLoc); + int left = r->netLength() - len; + if( left < 0 ) { + // + firstEmptyRegion.Null(); + return 0; + } + + DiskLoc nextEmpty = r->next.getNextEmpty(firstEmptyRegion); + r->lengthWithHeaders = newRecSize; + r->next.markAsFirstOrLastInExtent(this); // we're now last in the extent + if( !lastRecord.isNull() ) { + assert(getRecord(lastRecord)->next.lastInExtent()); // it was the last one + getRecord(lastRecord)->next.set(newRecordLoc); // until now + r->prev.set(lastRecord); + } + else { + r->prev.markAsFirstOrLastInExtent(this); // we are the first in the extent + assert( firstRecord.isNull() ); + firstRecord = newRecordLoc; + } + lastRecord = newRecordLoc; + + if( left < Record::HeaderSize + 32 ) { + firstEmptyRegion.Null(); + } + else { + firstEmptyRegion.inc(newRecSize); + Record *empty = getRecord(firstEmptyRegion); + empty->next.set(nextEmpty); // not for empty records, unless in-use records, next and prev can be null. + empty->prev.Null(); + empty->lengthWithHeaders = left; + } + + return r; +} +*/ + +/*---------------------------------------------------------------------*/ + +auto_ptr<Cursor> DataFileMgr::findAll(const char *ns) { + DiskLoc loc; + bool found = nsindex(ns)->find(ns, loc); + if( !found ) { + // cout << "info: findAll() namespace does not exist: " << ns << endl; + return auto_ptr<Cursor>(new BasicCursor(DiskLoc())); + } + + Extent *e = getExtent(loc); + + DEBUGGING { + cout << "temp: listing extents for " << ns << endl; + DiskLoc tmp = loc; + set<DiskLoc> extents; + + while( 1 ) { + Extent *f = getExtent(tmp); + cout << "extent: " << tmp.toString() << endl; + extents.insert(tmp); + tmp = f->xnext; + if( tmp.isNull() ) + break; + f = f->getNextExtent(); + } + + cout << endl; + nsdetails(ns)->dumpDeleted(&extents); + } + + while( e->firstRecord.isNull() && !e->xnext.isNull() ) { + OCCASIONALLY cout << "info DFM::findAll(): extent " << loc.toString() << " was empty, skipping ahead" << endl; + // find a nonempty extent + // it might be nice to free the whole extent here! but have to clean up free recs then. + e = e->getNextExtent(); + } + return auto_ptr<Cursor>(new BasicCursor( e->firstRecord )); +} + +/* get a table scan cursor, but can be forward or reverse direction */ +auto_ptr<Cursor> findTableScan(const char *ns, JSObj& order) { + Element el = order.findElement("$natural"); + if( el.type() != Number || el.number() >= 0 ) + return DataFileMgr::findAll(ns); + + // "reverse natural order" // "reverse natural order" + NamespaceDetails *d = nsdetails(ns); + if( !d ) + return auto_ptr<Cursor>(new BasicCursor(DiskLoc())); + Extent *e = d->lastExtent.ext(); + while( e->lastRecord.isNull() && !e->xprev.isNull() ) { + OCCASIONALLY cout << " findTableScan: extent empty, skipping ahead" << endl; + e = e->getPrevExtent(); + } + return auto_ptr<Cursor>(new ReverseCursor( e->lastRecord )); +} + +void aboutToDelete(const DiskLoc& dl); + +/* pull out the relevant key objects from obj, so we + can index them. Note that the set is multiple elements + only when it's a "multikey" array. + keys will be left empty if key not found in the object. +*/ +void IndexDetails::getKeysFromObject(JSObj& obj, set<JSObj>& keys) { + JSObj keyPattern = info.obj().getObjectField("key"); + JSObjBuilder b; + JSObj key = obj.extractFields(keyPattern, b); + if( key.isEmpty() ) + return; + Element f = key.firstElement(); + if( f.type() != Array ) { + b.decouple(); + key.iWillFree(); + keys.insert(key); + return; + } + JSObj arr = f.embeddedObject(); +// cout << arr.toString() << endl; + JSElemIter i(arr); + while( i.more() ) { + Element e = i.next(); + if( e.eoo() ) break; + JSObjBuilder b; + + b.appendAs(e, f.fieldName()); + JSObj o = b.doneAndDecouple(); + assert( o.objdata() ); + keys.insert(o); + } +} + +int nUnindexes = 0; + +void _unindexRecord(const char *ns, IndexDetails& id, JSObj& obj, const DiskLoc& dl) { + set<JSObj> keys; + id.getKeysFromObject(obj, keys); + for( set<JSObj>::iterator i=keys.begin(); i != keys.end(); i++ ) { + JSObj j = *i; +// cout << "UNINDEX: j:" << j.toString() << " head:" << id.head.toString() << dl.toString() << endl; + if( otherTraceLevel >= 5 ) { + cout << "_unindexRecord() " << obj.toString(); + cout << "\n unindex:" << j.toString() << endl; + } + nUnindexes++; + bool ok = false; + try { + ok = id.head.btree()->unindex(id.head, id, j, dl); + } + catch(AssertionException) { + cout << " caught assertion _unindexRecord " << id.indexNamespace() << '\n'; + problem() << "Assertion failure: _unindex failed " << id.indexNamespace() << endl; + cout << "Assertion failure: _unindex failed" << '\n'; + cout << " obj:" << obj.toString() << '\n'; + cout << " key:" << j.toString() << '\n'; + cout << " dl:" << dl.toString() << endl; + sayDbContext(); + } + + if( !ok ) { + cout << "unindex failed (key too big?) " << id.indexNamespace() << '\n'; + } + } +} + +void unindexRecord(const char *ns, NamespaceDetails *d, Record *todelete, const DiskLoc& dl) { + if( d->nIndexes == 0 ) return; + JSObj obj(todelete); + for( int i = 0; i < d->nIndexes; i++ ) { + _unindexRecord(ns, d->indexes[i], obj, dl); + } +} + +void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK) +{ + int tempextofs = todelete->extentOfs; + + NamespaceDetails* d = nsdetails(ns); + if( d->capped && !cappedOK ) { + cout << "failing remove on a capped ns " << ns << endl; + return; + } + + /* check if any cursors point to us. if so, advance them. */ + aboutToDelete(dl); + + unindexRecord(ns, d, todelete, dl); + + /* remove ourself from the record next/prev chain */ + { + if( todelete->prevOfs != DiskLoc::NullOfs ) + todelete->getPrev(dl).rec()->nextOfs = todelete->nextOfs; + if( todelete->nextOfs != DiskLoc::NullOfs ) + todelete->getNext(dl).rec()->prevOfs = todelete->prevOfs; + } + + /* remove ourself from extent pointers */ + { + Extent *e = todelete->myExtent(dl); + if( e->firstRecord == dl ) { + if( todelete->nextOfs == DiskLoc::NullOfs ) + e->firstRecord.Null(); + else + e->firstRecord.setOfs(dl.a(), todelete->nextOfs); + } + if( e->lastRecord == dl ) { + if( todelete->prevOfs == DiskLoc::NullOfs ) + e->lastRecord.Null(); + else + e->lastRecord.setOfs(dl.a(), todelete->prevOfs); + } + } + + /* add to the free list */ + { + d->nrecords--; + d->datasize -= todelete->netLength(); +/// DEBUGGING << "temp: dddelrec deleterecord " << ns << endl; +// if( todelete->extentOfs == 0xaca500 ) { +// cout << "break\n"; +// } +/* +TEMP: add deleted rec 0:aca5b0 aca500 +temp: adddelrec deleterecord admin.blog.posts +TEMP: add deleted rec 0:b9e750 b6a500 +temp: adddelrec deleterecord admin.blog.posts +*/ + + d->addDeletedRec((DeletedRecord*)todelete, dl); + } +} + +void setDifference(set<JSObj>& l, set<JSObj>& r, vector<JSObj*> &diff) { + set<JSObj>::iterator i = l.begin(); + set<JSObj>::iterator j = r.begin(); + while( 1 ) { + if( i == l.end() ) + break; + while( j != r.end() && *j < *i ) + j++; + if( j == r.end() || !i->woEqual(*j) ) { + const JSObj *jo = &*i; + diff.push_back( (JSObj *) jo ); + } + i++; + } +} + +/** Note: as written so far, if the object shrinks a lot, we don't free up space. */ +void DataFileMgr::update( + const char *ns, + Record *toupdate, const DiskLoc& dl, + const char *buf, int len, stringstream& ss) +{ + NamespaceDetails *d = nsdetails(ns); + + if( toupdate->netLength() < len ) { + if( d && d->capped ) { + ss << " failing a growing update on a capped ns " << ns << endl; + return; + } + + // doesn't fit. + if( client->profile ) + ss << " moved "; + deleteRecord(ns, toupdate, dl); + insert(ns, buf, len); + return; + } + + /* has any index keys changed? */ + { + NamespaceDetails *d = nsdetails(ns); + if( d->nIndexes ) { + JSObj newObj(buf); + JSObj oldObj = dl.obj(); + for( int i = 0; i < d->nIndexes; i++ ) { + IndexDetails& idx = d->indexes[i]; + JSObj idxKey = idx.info.obj().getObjectField("key"); + + set<JSObj> oldkeys; + set<JSObj> newkeys; + idx.getKeysFromObject(oldObj, oldkeys); + idx.getKeysFromObject(newObj, newkeys); + vector<JSObj*> removed; + setDifference(oldkeys, newkeys, removed); + string idxns = idx.indexNamespace(); + for( unsigned i = 0; i < removed.size(); i++ ) { + try { + idx.head.btree()->unindex(idx.head, idx, *removed[i], dl); + } + catch(AssertionException) { + ss << " exception update unindex "; + cout << " caught assertion update unindex " << idxns.c_str() << '\n'; + problem() << " caught assertion update unindex " << idxns.c_str() << endl; + } + } + vector<JSObj*> added; + setDifference(newkeys, oldkeys, added); + assert( !dl.isNull() ); + for( unsigned i = 0; i < added.size(); i++ ) { + try { + idx.head.btree()->insert( + idx.head, + dl, *added[i], false, idx, true); + } + catch(AssertionException) { + ss << " exception update index "; + cout << " caught assertion update index " << idxns.c_str() << '\n'; + problem() << " caught assertion update index " << idxns.c_str() << endl; + } + } + if( client->profile ) + ss << "<br>" << added.size() << " key updates "; + + } + } + } + + // update in place + memcpy(toupdate->data, buf, len); +} + +int followupExtentSize(int len, int lastExtentLen) { + int x = initialExtentSize(len); + int y = (int) (lastExtentLen < 4000000 ? lastExtentLen * 4.0 : lastExtentLen * 1.2); + int sz = y > x ? y : x; + sz = ((int)sz) & 0xffffff00; + assert( sz > len ); + return sz; +} + +int deb=0; + +/* add keys to indexes for a new record */ +void _indexRecord(IndexDetails& idx, JSObj& obj, DiskLoc newRecordLoc) { + + set<JSObj> keys; + idx.getKeysFromObject(obj, keys); + for( set<JSObj>::iterator i=keys.begin(); i != keys.end(); i++ ) { + assert( !newRecordLoc.isNull() ); + try { +// DEBUGGING << "temp index: " << newRecordLoc.toString() << obj.toString() << endl; + idx.head.btree()->insert(idx.head, newRecordLoc, + (JSObj&) *i, false, idx, true); + } + catch(AssertionException) { + cout << " caught assertion _indexRecord " << idx.indexNamespace() << '\n'; + problem() << " caught assertion _indexRecord " << idx.indexNamespace() << endl; + } + } +} + +/* note there are faster ways to build an index in bulk, that can be + done eventually */ +void addExistingToIndex(const char *ns, IndexDetails& idx) { + cout << "Adding all existing records for " << ns << " to new index" << endl; + int n = 0; + auto_ptr<Cursor> c = theDataFileMgr.findAll(ns); + while( c->ok() ) { + JSObj js = c->current(); + _indexRecord(idx, js, c->currLoc()); + c->advance(); + n++; + }; + cout << " indexing complete for " << n << " records" << endl; +} + +/* add keys to indexes for a new record */ +void indexRecord(NamespaceDetails *d, const void *buf, int len, DiskLoc newRecordLoc) { + JSObj obj((const char *)buf); + for( int i = 0; i < d->nIndexes; i++ ) { + _indexRecord(d->indexes[i], obj, newRecordLoc); + } +} + +DiskLoc DataFileMgr::insert(const char *ns, const void *buf, int len, bool god) { + bool addIndex = false; + const char *sys = strstr(ns, "system."); + if( sys ) { + if( sys == ns ) { + cout << "ERROR: attempt to insert for invalid client 'system': " << ns << endl; + return DiskLoc(); + } + if( strstr(ns, ".system.") ) { + if( strstr(ns, ".system.indexes") ) + addIndex = true; + else if( !god ) { + cout << "ERROR: attempt to insert in system namespace " << ns << endl; + return DiskLoc(); + } + } + } + + NamespaceDetails *d = nsdetails(ns); + if( d == 0 ) { + newNamespace(ns); + client->newestFile()->newExtent(ns, initialExtentSize(len)); + d = nsdetails(ns); + } + + NamespaceDetails *tableToIndex = 0; + + const char *tabletoidxns = 0; + if( addIndex ) { + JSObj io((const char *) buf); + const char *name = io.getStringField("name"); // name of the index + tabletoidxns = io.getStringField("ns"); // table it indexes + JSObj key = io.getObjectField("key"); + if( name == 0 || *name == 0 || tabletoidxns == 0 || key.isEmpty() || key.objsize() > 2048 ) { + cout << "user warning: bad add index attempt name:" << (name?name:"") << " ns:" << + (tabletoidxns?tabletoidxns:"") << endl; + return DiskLoc(); + } + tableToIndex = nsdetails(tabletoidxns); + if( tableToIndex == 0 ) { + cout << "user warning: bad add index attempt, no such table(ns):" << tabletoidxns << endl; + return DiskLoc(); + } + if( tableToIndex->nIndexes >= MaxIndexes ) { + cout << "user warning: bad add index attempt, too many indexes for:" << tabletoidxns << endl; + return DiskLoc(); + } + if( tableToIndex->findIndexByName(name) >= 0 ) { + //cout << "INFO: index:" << name << " already exists for:" << tabletoidxns << endl; + return DiskLoc(); + } + //indexFullNS = tabletoidxns; + //indexFullNS += ".$"; + //indexFullNS += name; // client.table.$index -- note this doesn't contain jsobjs, it contains BtreeBuckets. + } + + DiskLoc extentLoc; + int lenWHdr = len + Record::HeaderSize; + DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc); + if( loc.isNull() ) { + // out of space + if( d->capped == 0 ) { // size capped doesn't grow + cout << "allocating new extent for " << ns << endl; + client->newestFile()->newExtent(ns, followupExtentSize(len, d->lastExtentSize)); + loc = d->alloc(ns, lenWHdr, extentLoc); + } + if( loc.isNull() ) { + cout << "out of space in datafile. capped:" << d->capped << endl; + assert(d->capped); + return DiskLoc(); + } + } + + Record *r = loc.rec(); + assert( r->lengthWithHeaders >= lenWHdr ); + memcpy(r->data, buf, len); + Extent *e = r->myExtent(loc); + if( e->lastRecord.isNull() ) { + e->firstRecord = e->lastRecord = loc; + r->prevOfs = r->nextOfs = DiskLoc::NullOfs; + } + else { + Record *oldlast = e->lastRecord.rec(); + r->prevOfs = e->lastRecord.getOfs(); + r->nextOfs = DiskLoc::NullOfs; + oldlast->nextOfs = loc.getOfs(); + e->lastRecord = loc; + } + + d->nrecords++; + d->datasize += r->netLength(); + + if( tableToIndex ) { + IndexDetails& idxinfo = tableToIndex->indexes[tableToIndex->nIndexes]; + idxinfo.info = loc; + idxinfo.head = BtreeBucket::addHead(idxinfo); + tableToIndex->nIndexes++; + /* todo: index existing records here */ + addExistingToIndex(tabletoidxns, idxinfo); + } + + /* add this record to our indexes */ + if( d->nIndexes ) + indexRecord(d, buf, len, loc); + +// cout << " inserted at loc:" << hex << loc.getOfs() << " lenwhdr:" << hex << lenWHdr << dec << ' ' << ns << endl; + return loc; +} + +void DataFileMgr::init(const char *dir) { +/* string path = dir; + path += "temp.dat"; + temp.open(path.c_str(), 64 * 1024 * 1024); +*/ +} + +void pdfileInit() { +// namespaceIndex.init(dbpath); + theDataFileMgr.init(dbpath); +} |