Diffstat (limited to 'src/mongo/db/record.cpp')
-rw-r--r--  src/mongo/db/record.cpp  267
1 file changed, 267 insertions, 0 deletions
diff --git a/src/mongo/db/record.cpp b/src/mongo/db/record.cpp
new file mode 100644
index 00000000000..17987002efc
--- /dev/null
+++ b/src/mongo/db/record.cpp
@@ -0,0 +1,267 @@
+// record.cpp
+
+#include "pch.h"
+#include "pdfile.h"
+#include "../util/processinfo.h"
+#include "../util/net/listen.h"
+#include "pagefault.h"
+
+namespace mongo {
+
+ namespace ps {
+
+ enum State {
+ In , Out, Unk
+ };
+
+        enum Constants {
+            SliceSize = 65536 ,     // buckets per slice
+            MaxChain = 20 ,         // max probe length; intentionally very low
+            NumSlices = 10 ,        // number of time slices kept
+            RotateTimeSecs = 90     // seconds between time-based rotations
+        };
+
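+        // map a region id to a bucket index in [0, SliceSize);
+        // on 64-bit builds the upper 32 bits of the region id are mixed in as well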
+ int hash( size_t region ) {
+ return
+ abs( ( ( 7 + (int)(region & 0xFFFF) )
+ * ( 11 + (int)( ( region >> 16 ) & 0xFFFF ) )
+#if defined(_WIN64) || defined(__amd64__)
+ * ( 13 + (int)( ( region >> 32 ) & 0xFFFF ) )
+ * ( 17 + (int)( ( region >> 48 ) & 0xFFFF ) )
+#endif
+ ) % SliceSize );
+ }
+
+
+        /**
+         * simple hash map for region -> status
+         * each Slice constitutes a single slice of time
+         * it does chaining, but very short chains
+         */
+ class Slice {
+
+ struct Entry {
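+                // region is the key (0 marks an empty bucket);
+                // value is a 64-bit bitmap with one bit per page in the region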
+ size_t region;
+ unsigned long long value;
+ };
+
+ public:
+
+ Slice() {
+ reset();
+ }
+
+ void reset() {
+ memset( _data , 0 , SliceSize * sizeof(Entry) );
+ }
+
+ State get( int regionHash , size_t region , short offset ) {
+ DEV assert( hash( region ) == regionHash );
+
+ Entry * e = _get( regionHash , region , false );
+ if ( ! e )
+ return Unk;
+
+ return ( e->value & ( ((unsigned long long)1) << offset ) ) ? In : Out;
+ }
+
+ /**
+ * @return true if added, false if full
+ */
+ bool in( int regionHash , size_t region , short offset ) {
+ DEV assert( hash( region ) == regionHash );
+
+ Entry * e = _get( regionHash , region , true );
+ if ( ! e )
+ return false;
+
+ e->value |= ((unsigned long long)1) << offset;
+ return true;
+ }
+
+ private:
+
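+            // linear probe from the hashed bucket, walking at most MaxChain slots;
+            // returns 0 if the region isn't found (or, when adding, if the chain is full)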
+ Entry* _get( int start , size_t region , bool add ) {
+ for ( int i=0; i<MaxChain; i++ ) {
+
+ int bucket = ( start + i ) % SliceSize;
+
+ if ( _data[bucket].region == 0 ) {
+ if ( ! add )
+ return 0;
+
+ _data[bucket].region = region;
+ return &_data[bucket];
+ }
+
+ if ( _data[bucket].region == region ) {
+ return &_data[bucket];
+ }
+ }
+ return 0;
+ }
+
+ Entry _data[SliceSize];
+ };
+
+
+        /**
+         * this contains many slices of time
+         * the idea is that you record mem status in the current time slice
+         * and then after a certain period of time it rolls off, so we check again
+         */
+ class Rolling {
+
+ public:
+ Rolling()
+ : _lock( "ps::Rolling" ){
+ _curSlice = 0;
+ _lastRotate = Listener::getElapsedTimeMillis();
+ }
+
+
+            /**
+             * after this call, we assume the page is in ram
+             * @param doHalf if this is a known good access, we only check half of the slices
+             * @return whether we know the page is in ram
+             */
+ bool access( size_t region , short offset , bool doHalf ) {
+ int regionHash = hash(region);
+
+ SimpleMutex::scoped_lock lk( _lock );
+
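+                // only look at the clock every 2048 calls to keep this hot path cheap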
+ static int rarely_count = 0;
+ if ( rarely_count++ % 2048 == 0 ) {
+ long long now = Listener::getElapsedTimeMillis();
+ RARELY if ( now == 0 ) {
+ tlog() << "warning Listener::getElapsedTimeMillis returning 0ms" << endl;
+ }
+
+ if ( now - _lastRotate > ( 1000 * RotateTimeSecs ) ) {
+ _rotate();
+ }
+ }
+
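+                // see if the page is already recorded in a live slice (only half of them when doHalf is set)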
+ for ( int i=0; i<NumSlices / ( doHalf ? 2 : 1 ); i++ ) {
+ int pos = (_curSlice+i)%NumSlices;
+ State s = _slices[pos].get( regionHash , region , offset );
+
+ if ( s == In )
+ return true;
+
+ if ( s == Out ) {
+ _slices[pos].in( regionHash , region , offset );
+ return false;
+ }
+ }
+
+ // we weren't in any slice
+ // so add to cur
+ if ( ! _slices[_curSlice].in( regionHash , region , offset ) ) {
+ _rotate();
+ _slices[_curSlice].in( regionHash , region , offset );
+ }
+ return false;
+ }
+
+ private:
+
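+            // advance to the next slice, wiping whatever it recorded last time around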
+ void _rotate() {
+ _curSlice = ( _curSlice + 1 ) % NumSlices;
+ _slices[_curSlice].reset();
+ _lastRotate = Listener::getElapsedTimeMillis();
+ }
+
+ int _curSlice;
+ long long _lastRotate;
+ Slice _slices[NumSlices];
+
+ SimpleMutex _lock;
+ } rolling;
+
+ }
+
+ bool Record::MemoryTrackingEnabled = true;
+
+ volatile int __record_touch_dummy = 1; // this is used to make sure the compiler doesn't get too smart on us
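+    // fault the record's pages in by reading one byte out of every 2KB
+    // (currently only the first chunk is read - see the TODO below)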
+ void Record::touch( bool entireRecrd ) {
+ if ( lengthWithHeaders > HeaderSize ) { // this also makes sure lengthWithHeaders is in memory
+ char * addr = data;
+ char * end = data + netLength();
+ for ( ; addr <= end ; addr += 2048 ) {
+ __record_touch_dummy += addr[0];
+
+ break; // TODO: remove this, pending SERVER-3711
+
+                // note if this is a touch of a DeletedRecord, we don't want to touch more than the first part.  we may simply
+                // be updating the linked list and a DeletedRecord could be gigantic.  a similar but less extreme circumstance
+                // exists for any record if we are just updating its header, say on a remove(); some sort of hints might be
+                // useful.
+
+ if ( ! entireRecrd )
+ break;
+ }
+ }
+ }
+
+    const bool blockSupported = ProcessInfo::blockCheckSupported(); // can this platform report whether a page is resident?
+
+ bool Record::likelyInPhysicalMemory() {
+ if ( ! MemoryTrackingEnabled )
+ return true;
+
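+        // data points into a memory-mapped file: >>12 yields the 4KB page number,
+        // >>6 groups 64 pages into a region, and the low 6 bits select the page's bit
+        // in that region's 64-bit bitmap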
+ const size_t page = (size_t)data >> 12;
+ const size_t region = page >> 6;
+ const size_t offset = page & 0x3f;
+
+ if ( ps::rolling.access( region , offset , false ) )
+ return true;
+
+ if ( ! blockSupported ) {
+            // we can't fall back to the system call,
+            // so we assume things aren't in memory.
+            // possibly we yield too much - but better than not yielding through a fault
+ return false;
+ }
+
+ return ProcessInfo::blockInMemory( data );
+ }
+
+
+ Record* Record::accessed() {
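+        // same page/region/offset decomposition as likelyInPhysicalMemory(),
+        // but recorded as a known good access (doHalf = true)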
+ const size_t page = (size_t)data >> 12;
+ const size_t region = page >> 6;
+ const size_t offset = page & 0x3f;
+ ps::rolling.access( region , offset , true );
+ return this;
+ }
+
+ Record* DiskLoc::rec() const {
+ Record *r = DataFileMgr::getRecord(*this);
+#if defined(_PAGEFAULTEXCEPTION)
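+        // experimental: if the record is probably not in physical memory and the client is in a
+        // page-fault-retryable section (and hasn't written anything yet this pass), throw
+        // PageFaultException instead of touching the page here; _laps counts how many times the
+        // operation has already come back around.  DEV builds also force a fault ~1% of the time.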
+ DEV ONCE {
+ log() << "_DEBUG info _PAGEFAULTEXCEPTION is ON -- experimental at this time" << endl;
+ }
+ bool fault = !r->likelyInPhysicalMemory();
+ DEV if( rand() % 100 == 0 )
+ fault = true;
+ if( fault &&
+ !cc()._hasWrittenThisPass &&
+ cc()._pageFaultRetryableSection )
+ {
+ if( cc()._pageFaultRetryableSection->_laps > 100 ) {
+ log() << "info pagefaultexception _laps > 100" << endl;
+ }
+ else {
+ throw PageFaultException(r);
+ }
+ }
+#else
+ DEV ONCE {
+ log() << "_DEBUG info _PAGEFAULTEXCEPTION is off" << endl;
+ }
+#endif
+ return r;
+ }
+
+}