Diffstat (limited to 'src/mongo/db/storage/mmap_v1')
-rw-r--r--  src/mongo/db/storage/mmap_v1/aligned_builder.cpp | 207
-rw-r--r--  src/mongo/db/storage/mmap_v1/aligned_builder.h | 211
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp | 545
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/btree_interface.h | 14
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/btree_interface_test.cpp | 54
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp | 4066
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/btree_logic.h | 823
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/btree_logic_test.cpp | 3986
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/btree_ondisk.cpp | 32
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/btree_ondisk.h | 519
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/btree_test_help.cpp | 343
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/btree_test_help.h | 196
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/key.cpp | 1040
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/key.h | 213
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/hashtab.cpp | 82
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/hashtab.h | 158
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/index_details.cpp | 9
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/index_details.h | 51
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace-inl.h | 69
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace.cpp | 17
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace.h | 116
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace_details.cpp | 344
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace_details.h | 346
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.cpp | 553
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h | 101
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.cpp | 258
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h | 91
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace_index.cpp | 305
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace_index.h | 70
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace_test.cpp | 49
-rw-r--r--  src/mongo/db/storage/mmap_v1/compress.cpp | 36
-rw-r--r--  src/mongo/db/storage/mmap_v1/compress.h | 19
-rw-r--r--  src/mongo/db/storage/mmap_v1/data_file.cpp | 323
-rw-r--r--  src/mongo/db/storage/mmap_v1/data_file.h | 273
-rw-r--r--  src/mongo/db/storage/mmap_v1/data_file_sync.cpp | 141
-rw-r--r--  src/mongo/db/storage/mmap_v1/data_file_sync.h | 42
-rw-r--r--  src/mongo/db/storage/mmap_v1/diskloc.h | 271
-rw-r--r--  src/mongo/db/storage/mmap_v1/dur.cpp | 1272
-rw-r--r--  src/mongo/db/storage/mmap_v1/dur.h | 188
-rw-r--r--  src/mongo/db/storage/mmap_v1/dur_commitjob.cpp | 107
-rw-r--r--  src/mongo/db/storage/mmap_v1/dur_commitjob.h | 304
-rw-r--r--  src/mongo/db/storage/mmap_v1/dur_journal.cpp | 1263
-rw-r--r--  src/mongo/db/storage/mmap_v1/dur_journal.h | 85
-rw-r--r--  src/mongo/db/storage/mmap_v1/dur_journal_writer.cpp | 419
-rw-r--r--  src/mongo/db/storage/mmap_v1/dur_journal_writer.h | 264
-rw-r--r--  src/mongo/db/storage/mmap_v1/dur_journalformat.h | 304
-rw-r--r--  src/mongo/db/storage/mmap_v1/dur_journalimpl.h | 136
-rw-r--r--  src/mongo/db/storage/mmap_v1/dur_preplogbuffer.cpp | 255
-rw-r--r--  src/mongo/db/storage/mmap_v1/dur_recover.cpp | 1019
-rw-r--r--  src/mongo/db/storage/mmap_v1/dur_recover.h | 88
-rw-r--r--  src/mongo/db/storage/mmap_v1/dur_recovery_unit.cpp | 432
-rw-r--r--  src/mongo/db/storage/mmap_v1/dur_recovery_unit.h | 236
-rw-r--r--  src/mongo/db/storage/mmap_v1/dur_stats.h | 85
-rw-r--r--  src/mongo/db/storage/mmap_v1/durable_mapped_file.cpp | 408
-rw-r--r--  src/mongo/db/storage/mmap_v1/durable_mapped_file.h | 361
-rw-r--r--  src/mongo/db/storage/mmap_v1/durop.cpp | 233
-rw-r--r--  src/mongo/db/storage/mmap_v1/durop.h | 165
-rw-r--r--  src/mongo/db/storage/mmap_v1/extent.cpp | 124
-rw-r--r--  src/mongo/db/storage/mmap_v1/extent.h | 63
-rw-r--r--  src/mongo/db/storage/mmap_v1/extent_manager.cpp | 96
-rw-r--r--  src/mongo/db/storage/mmap_v1/extent_manager.h | 260
-rw-r--r--  src/mongo/db/storage/mmap_v1/file_allocator.cpp | 644
-rw-r--r--  src/mongo/db/storage/mmap_v1/file_allocator.h | 100
-rw-r--r--  src/mongo/db/storage/mmap_v1/heap_record_store_btree.cpp | 208
-rw-r--r--  src/mongo/db/storage/mmap_v1/heap_record_store_btree.h | 326
-rw-r--r--  src/mongo/db/storage/mmap_v1/journal_latency_test_cmd.cpp | 176
-rw-r--r--  src/mongo/db/storage/mmap_v1/logfile.cpp | 294
-rw-r--r--  src/mongo/db/storage/mmap_v1/logfile.h | 55
-rw-r--r--  src/mongo/db/storage/mmap_v1/mmap.cpp | 363
-rw-r--r--  src/mongo/db/storage/mmap_v1/mmap.h | 392
-rw-r--r--  src/mongo/db/storage/mmap_v1/mmap_posix.cpp | 394
-rw-r--r--  src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp | 1274
-rw-r--r--  src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h | 261
-rw-r--r--  src/mongo/db/storage/mmap_v1/mmap_v1_engine.cpp | 480
-rw-r--r--  src/mongo/db/storage/mmap_v1/mmap_v1_engine.h | 98
-rw-r--r--  src/mongo/db/storage/mmap_v1/mmap_v1_extent_manager.cpp | 980
-rw-r--r--  src/mongo/db/storage/mmap_v1/mmap_v1_extent_manager.h | 322
-rw-r--r--  src/mongo/db/storage/mmap_v1/mmap_v1_init.cpp | 66
-rw-r--r--  src/mongo/db/storage/mmap_v1/mmap_v1_init_test.cpp | 158
-rw-r--r--  src/mongo/db/storage/mmap_v1/mmap_v1_options.cpp | 110
-rw-r--r--  src/mongo/db/storage/mmap_v1/mmap_v1_options.h | 97
-rw-r--r--  src/mongo/db/storage/mmap_v1/mmap_v1_record_store_test.cpp | 77
-rw-r--r--  src/mongo/db/storage/mmap_v1/mmap_windows.cpp | 762
-rw-r--r--  src/mongo/db/storage/mmap_v1/record.h | 211
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_access_tracker.cpp | 497
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_access_tracker.h | 199
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_access_tracker_test.cpp | 198
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_base.cpp | 1517
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_base.h | 471
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_capped.cpp | 1068
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_capped.h | 179
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.cpp | 290
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h | 100
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_capped_test.cpp | 1274
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.cpp | 234
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h | 109
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_simple.cpp | 720
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_simple.h | 127
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.cpp | 156
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h | 60
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_simple_test.cpp | 786
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_test_help.cpp | 1016
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_test_help.h | 243
-rw-r--r--  src/mongo/db/storage/mmap_v1/repair_database.cpp | 700
104 files changed, 20860 insertions, 21072 deletions
diff --git a/src/mongo/db/storage/mmap_v1/aligned_builder.cpp b/src/mongo/db/storage/mmap_v1/aligned_builder.cpp
index 96b16e59f4a..8742f25e285 100644
--- a/src/mongo/db/storage/mmap_v1/aligned_builder.cpp
+++ b/src/mongo/db/storage/mmap_v1/aligned_builder.cpp
@@ -37,135 +37,136 @@
namespace mongo {
- using std::endl;
+using std::endl;
- AlignedBuilder::AlignedBuilder(unsigned initSize) {
- _len = 0;
- _malloc(initSize);
- uassert(13584, "out of memory AlignedBuilder", _p._allocationAddress);
- }
+AlignedBuilder::AlignedBuilder(unsigned initSize) {
+ _len = 0;
+ _malloc(initSize);
+ uassert(13584, "out of memory AlignedBuilder", _p._allocationAddress);
+}
- BOOST_STATIC_ASSERT(sizeof(void*) == sizeof(size_t));
+BOOST_STATIC_ASSERT(sizeof(void*) == sizeof(size_t));
- /** reset for a re-use. shrinks if > 128MB */
- void AlignedBuilder::reset() {
- _len = 0;
- RARELY {
- const unsigned sizeCap = 128*1024*1024;
- if (_p._size > sizeCap)
- _realloc(sizeCap, _len);
- }
+/** reset for a re-use. shrinks if > 128MB */
+void AlignedBuilder::reset() {
+ _len = 0;
+ RARELY {
+ const unsigned sizeCap = 128 * 1024 * 1024;
+ if (_p._size > sizeCap)
+ _realloc(sizeCap, _len);
}
+}
- /** reset with a hint as to the upcoming needed size specified */
- void AlignedBuilder::reset(unsigned sz) {
- _len = 0;
- unsigned Q = 32 * 1024 * 1024 - 1;
- unsigned want = (sz+Q) & (~Q);
- if( _p._size == want ) {
+/** reset with a hint as to the upcoming needed size specified */
+void AlignedBuilder::reset(unsigned sz) {
+ _len = 0;
+ unsigned Q = 32 * 1024 * 1024 - 1;
+ unsigned want = (sz + Q) & (~Q);
+ if (_p._size == want) {
+ return;
+ }
+ if (_p._size > want) {
+ if (_p._size <= 64 * 1024 * 1024)
return;
- }
- if( _p._size > want ) {
- if( _p._size <= 64 * 1024 * 1024 )
- return;
- bool downsize = false;
- RARELY { downsize = true; }
- if( !downsize )
- return;
+ bool downsize = false;
+ RARELY {
+ downsize = true;
}
- _realloc(want, _len);
- }
-
- void AlignedBuilder::mallocSelfAligned(unsigned sz) {
- verify( sz == _p._size );
- void *p = malloc(sz + Alignment - 1);
- _p._allocationAddress = p;
- size_t s = (size_t) p;
- size_t sold = s;
- s += Alignment - 1;
- s = (s/Alignment)*Alignment;
- verify( s >= sold ); // beginning
- verify( (s + sz) <= (sold + sz + Alignment - 1) ); //end
- _p._data = (char *) s;
+ if (!downsize)
+ return;
}
+ _realloc(want, _len);
+}
- /* "slow"/infrequent portion of 'grow()' */
- void NOINLINE_DECL AlignedBuilder::growReallocate(unsigned oldLen) {
- const unsigned MB = 1024*1024;
- const unsigned kMaxSize = (sizeof(int*) == 4) ? 512*MB : 2000*MB;
- const unsigned kWarnSize = (sizeof(int*) == 4) ? 256*MB : 512*MB;
+void AlignedBuilder::mallocSelfAligned(unsigned sz) {
+ verify(sz == _p._size);
+ void* p = malloc(sz + Alignment - 1);
+ _p._allocationAddress = p;
+ size_t s = (size_t)p;
+ size_t sold = s;
+ s += Alignment - 1;
+ s = (s / Alignment) * Alignment;
+ verify(s >= sold); // beginning
+ verify((s + sz) <= (sold + sz + Alignment - 1)); // end
+ _p._data = (char*)s;
+}
- const unsigned oldSize = _p._size;
+/* "slow"/infrequent portion of 'grow()' */
+void NOINLINE_DECL AlignedBuilder::growReallocate(unsigned oldLen) {
+ const unsigned MB = 1024 * 1024;
+ const unsigned kMaxSize = (sizeof(int*) == 4) ? 512 * MB : 2000 * MB;
+ const unsigned kWarnSize = (sizeof(int*) == 4) ? 256 * MB : 512 * MB;
- // Warn for unexpectedly large buffer
- wassert(_len <= kWarnSize);
+ const unsigned oldSize = _p._size;
- // Check validity of requested size
- invariant(_len > oldSize);
- if (_len > kMaxSize) {
- log() << "error writing journal: too much uncommitted data (" << _len << " bytes)";
- log() << "shutting down immediately to avoid corruption";
- fassert(28614, _len <= kMaxSize);
- }
+ // Warn for unexpectedly large buffer
+ wassert(_len <= kWarnSize);
- // Use smaller maximum for debug builds, as we should never be close to the maximum
- dassert(_len <= 256*MB);
+ // Check validity of requested size
+ invariant(_len > oldSize);
+ if (_len > kMaxSize) {
+ log() << "error writing journal: too much uncommitted data (" << _len << " bytes)";
+ log() << "shutting down immediately to avoid corruption";
+ fassert(28614, _len <= kMaxSize);
+ }
- // Compute newSize by doubling the existing maximum size until the maximum is reached
- invariant(oldSize > 0);
- uint64_t newSize = oldSize; // use 64 bits to defend against accidental overflow
- while (newSize < _len) {
- newSize *= 2;
- }
+ // Use smaller maximum for debug builds, as we should never be close to the maximum
+ dassert(_len <= 256 * MB);
- if (newSize > kMaxSize) {
- newSize = kMaxSize;
- }
+ // Compute newSize by doubling the existing maximum size until the maximum is reached
+ invariant(oldSize > 0);
+ uint64_t newSize = oldSize; // use 64 bits to defend against accidental overflow
+ while (newSize < _len) {
+ newSize *= 2;
+ }
- _realloc(newSize, oldLen);
+ if (newSize > kMaxSize) {
+ newSize = kMaxSize;
}
- void AlignedBuilder::_malloc(unsigned sz) {
- _p._size = sz;
+ _realloc(newSize, oldLen);
+}
+
+void AlignedBuilder::_malloc(unsigned sz) {
+ _p._size = sz;
#if defined(_WIN32)
- void *p = VirtualAlloc(0, sz, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
- _p._allocationAddress = p;
- _p._data = (char *) p;
+ void* p = VirtualAlloc(0, sz, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
+ _p._allocationAddress = p;
+ _p._data = (char*)p;
#elif defined(__linux__)
- // in theory #ifdef _POSIX_VERSION should work, but it doesn't on OS X 10.4, and needs to be tested on solaris.
- // so for now, linux only for this.
- void *p = 0;
- int res = posix_memalign(&p, Alignment, sz);
- massert(13524, "out of memory AlignedBuilder", res == 0);
- _p._allocationAddress = p;
- _p._data = (char *) p;
+ // in theory #ifdef _POSIX_VERSION should work, but it doesn't on OS X 10.4, and needs to be tested on solaris.
+ // so for now, linux only for this.
+ void* p = 0;
+ int res = posix_memalign(&p, Alignment, sz);
+ massert(13524, "out of memory AlignedBuilder", res == 0);
+ _p._allocationAddress = p;
+ _p._data = (char*)p;
#else
- mallocSelfAligned(sz);
- verify( ((size_t) _p._data) % Alignment == 0 );
+ mallocSelfAligned(sz);
+ verify(((size_t)_p._data) % Alignment == 0);
#endif
- }
+}
- void AlignedBuilder::_realloc(unsigned newSize, unsigned oldLen) {
- // posix_memalign alignment is not maintained on reallocs, so we can't use realloc().
- AllocationInfo old = _p;
- _malloc(newSize);
- verify( oldLen <= _len );
- memcpy(_p._data, old._data, oldLen);
- _free(old._allocationAddress);
- }
+void AlignedBuilder::_realloc(unsigned newSize, unsigned oldLen) {
+ // posix_memalign alignment is not maintained on reallocs, so we can't use realloc().
+ AllocationInfo old = _p;
+ _malloc(newSize);
+ verify(oldLen <= _len);
+ memcpy(_p._data, old._data, oldLen);
+ _free(old._allocationAddress);
+}
- void AlignedBuilder::_free(void *p) {
+void AlignedBuilder::_free(void* p) {
#if defined(_WIN32)
- VirtualFree(p, 0, MEM_RELEASE);
+ VirtualFree(p, 0, MEM_RELEASE);
#else
- free(p);
+ free(p);
#endif
- }
-
- void AlignedBuilder::kill() {
- _free(_p._allocationAddress);
- _p._allocationAddress = 0;
- _p._data = 0;
- }
+}
+void AlignedBuilder::kill() {
+ _free(_p._allocationAddress);
+ _p._allocationAddress = 0;
+ _p._data = 0;
+}
}
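
[Editor's note] The mallocSelfAligned() fallback in the hunk above implements alignment by hand: over-allocate by Alignment - 1 bytes, round the returned address up to the next multiple of the alignment, and keep the original pointer for free(). A minimal standalone sketch of that rounding trick, under the assumption that the alignment is a power of two; AlignedBlock and the function signature here are illustrative stand-ins, not MongoDB's types:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>

// Stand-in for AlignedBuilder's AllocationInfo: keep both the raw allocation
// (needed for free()) and the aligned pointer callers actually write through.
struct AlignedBlock {
    void* allocationAddress;
    char* data;
};

AlignedBlock mallocSelfAligned(std::size_t sz, std::size_t alignment) {
    // Over-allocate so an aligned address is guaranteed to fit in the block.
    void* p = std::malloc(sz + alignment - 1);
    assert(p != nullptr);
    std::uintptr_t s = reinterpret_cast<std::uintptr_t>(p);
    // Round up to the next multiple of 'alignment' (a power of two), exactly
    // as the s += Alignment - 1; s = (s / Alignment) * Alignment; lines above do.
    s = (s + alignment - 1) / alignment * alignment;
    return AlignedBlock{p, reinterpret_cast<char*>(s)};
}

int main() {
    const std::size_t kAlignment = 8192;  // AlignedBuilder::Alignment in the diff
    AlignedBlock b = mallocSelfAligned(64 * 1024, kAlignment);
    assert(reinterpret_cast<std::uintptr_t>(b.data) % kAlignment == 0);
    std::free(b.allocationAddress);
    return 0;
}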
diff --git a/src/mongo/db/storage/mmap_v1/aligned_builder.h b/src/mongo/db/storage/mmap_v1/aligned_builder.h
index fb184424b66..f43cbee7d5d 100644
--- a/src/mongo/db/storage/mmap_v1/aligned_builder.h
+++ b/src/mongo/db/storage/mmap_v1/aligned_builder.h
@@ -33,104 +33,117 @@
namespace mongo {
- /** a page-aligned BufBuilder. */
- class AlignedBuilder {
- public:
- AlignedBuilder(unsigned init_size);
- ~AlignedBuilder() { kill(); }
-
- /** reset with a hint as to the upcoming needed size specified */
- void reset(unsigned sz);
-
- /** reset for a re-use. shrinks if > 128MB */
- void reset();
-
- /** note this may be deallocated (realloced) if you keep writing or reset(). */
- const char* buf() const { return _p._data; }
-
- /** leave room for some stuff later
- @return offset in the buffer that was our current position
- */
- size_t skip(unsigned n) {
- unsigned l = len();
- grow(n);
- return l;
+/** a page-aligned BufBuilder. */
+class AlignedBuilder {
+public:
+ AlignedBuilder(unsigned init_size);
+ ~AlignedBuilder() {
+ kill();
+ }
+
+ /** reset with a hint as to the upcoming needed size specified */
+ void reset(unsigned sz);
+
+ /** reset for a re-use. shrinks if > 128MB */
+ void reset();
+
+ /** note this may be deallocated (realloced) if you keep writing or reset(). */
+ const char* buf() const {
+ return _p._data;
+ }
+
+ /** leave room for some stuff later
+ @return offset in the buffer that was our current position
+ */
+ size_t skip(unsigned n) {
+ unsigned l = len();
+ grow(n);
+ return l;
+ }
+
+ /** if buffer grows pointer no longer valid */
+ char* atOfs(unsigned ofs) {
+ return _p._data + ofs;
+ }
+
+ /** if buffer grows pointer no longer valid */
+ char* cur() {
+ return _p._data + _len;
+ }
+
+ void appendChar(char j) {
+ *((char*)grow(sizeof(char))) = j;
+ }
+ void appendNum(char j) {
+ *((char*)grow(sizeof(char))) = j;
+ }
+ void appendNum(short j) {
+ *((short*)grow(sizeof(short))) = j;
+ }
+ void appendNum(int j) {
+ *((int*)grow(sizeof(int))) = j;
+ }
+ void appendNum(unsigned j) {
+ *((unsigned*)grow(sizeof(unsigned))) = j;
+ }
+ void appendNum(bool j) {
+ *((bool*)grow(sizeof(bool))) = j;
+ }
+ void appendNum(double j) {
+ *((double*)grow(sizeof(double))) = j;
+ }
+ void appendNum(long long j) {
+ *((long long*)grow(sizeof(long long))) = j;
+ }
+ void appendNum(unsigned long long j) {
+ *((unsigned long long*)grow(sizeof(unsigned long long))) = j;
+ }
+
+ void appendBuf(const void* src, size_t len) {
+ memcpy(grow((unsigned)len), src, len);
+ }
+
+ template <class T>
+ void appendStruct(const T& s) {
+ appendBuf(&s, sizeof(T));
+ }
+
+ void appendStr(StringData str, bool includeEOO = true) {
+ const unsigned len = str.size() + (includeEOO ? 1 : 0);
+ verify(len < (unsigned)BSONObjMaxUserSize);
+ str.copyTo(grow(len), includeEOO);
+ }
+
+ /** @return the in-use length */
+ unsigned len() const {
+ return _len;
+ }
+
+private:
+ static const unsigned Alignment = 8192;
+
+ /** returns the pre-grow write position */
+ inline char* grow(unsigned by) {
+ unsigned oldlen = _len;
+ _len += by;
+ if (MONGO_unlikely(_len > _p._size)) {
+ growReallocate(oldlen);
}
-
- /** if buffer grows pointer no longer valid */
- char* atOfs(unsigned ofs) { return _p._data + ofs; }
-
- /** if buffer grows pointer no longer valid */
- char* cur() { return _p._data + _len; }
-
- void appendChar(char j) {
- *((char*)grow(sizeof(char))) = j;
- }
- void appendNum(char j) {
- *((char*)grow(sizeof(char))) = j;
- }
- void appendNum(short j) {
- *((short*)grow(sizeof(short))) = j;
- }
- void appendNum(int j) {
- *((int*)grow(sizeof(int))) = j;
- }
- void appendNum(unsigned j) {
- *((unsigned*)grow(sizeof(unsigned))) = j;
- }
- void appendNum(bool j) {
- *((bool*)grow(sizeof(bool))) = j;
- }
- void appendNum(double j) {
- *((double*)grow(sizeof(double))) = j;
- }
- void appendNum(long long j) {
- *((long long*)grow(sizeof(long long))) = j;
- }
- void appendNum(unsigned long long j) {
- *((unsigned long long*)grow(sizeof(unsigned long long))) = j;
- }
-
- void appendBuf(const void *src, size_t len) { memcpy(grow((unsigned) len), src, len); }
-
- template<class T>
- void appendStruct(const T& s) { appendBuf(&s, sizeof(T)); }
-
- void appendStr(StringData str , bool includeEOO = true ) {
- const unsigned len = str.size() + ( includeEOO ? 1 : 0 );
- verify( len < (unsigned) BSONObjMaxUserSize );
- str.copyTo( grow(len), includeEOO );
- }
-
- /** @return the in-use length */
- unsigned len() const { return _len; }
-
- private:
- static const unsigned Alignment = 8192;
-
- /** returns the pre-grow write position */
- inline char* grow(unsigned by) {
- unsigned oldlen = _len;
- _len += by;
- if (MONGO_unlikely( _len > _p._size )) {
- growReallocate(oldlen);
- }
- return _p._data + oldlen;
- }
-
- void growReallocate(unsigned oldLenInUse);
- void kill();
- void mallocSelfAligned(unsigned sz);
- void _malloc(unsigned sz);
- void _realloc(unsigned newSize, unsigned oldLenInUse);
- void _free(void*);
-
- struct AllocationInfo {
- char *_data;
- void *_allocationAddress;
- unsigned _size;
- } _p;
- unsigned _len; // bytes in use
- };
-
+ return _p._data + oldlen;
+ }
+
+ void growReallocate(unsigned oldLenInUse);
+ void kill();
+ void mallocSelfAligned(unsigned sz);
+ void _malloc(unsigned sz);
+ void _realloc(unsigned newSize, unsigned oldLenInUse);
+ void _free(void*);
+
+ struct AllocationInfo {
+ char* _data;
+ void* _allocationAddress;
+ unsigned _size;
+ } _p;
+ unsigned _len; // bytes in use
+};
}
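
[Editor's note] The header above is built around one inlined primitive: grow(by) bumps _len, reallocates (doubling, per growReallocate() in the .cpp above) only on overflow, and returns the pre-grow write position that every appendNum() overload writes through. A minimal sketch of that append-by-grow pattern, using std::vector in place of the page-aligned buffer; MiniBuilder is a hypothetical name, not MongoDB's class, and the 8192-byte alignment is deliberately not reproduced here:

#include <cstddef>
#include <cstring>
#include <vector>

class MiniBuilder {
public:
    // Raw byte-copy append of a fixed-size value, as appendNum() does above.
    template <class T>
    void appendNum(T v) {
        std::memcpy(grow(sizeof(T)), &v, sizeof(T));
    }

    // Reserve n bytes and return the offset where they start (cf. skip()).
    std::size_t skip(std::size_t n) {
        std::size_t l = len();
        grow(n);
        return l;
    }

    // Pointer into the buffer; invalidated by further growth, as documented
    // for atOfs()/cur() above.
    char* atOfs(std::size_t ofs) { return _buf.data() + ofs; }

    std::size_t len() const { return _buf.size(); }
    const char* buf() const { return _buf.data(); }

private:
    // Returns the pre-grow write position; vector's amortized doubling plays
    // the role of growReallocate().
    char* grow(std::size_t by) {
        std::size_t oldLen = _buf.size();
        _buf.resize(oldLen + by);
        return _buf.data() + oldLen;
    }

    std::vector<char> _buf;
};

int main() {
    MiniBuilder b;
    std::size_t headerOfs = b.skip(4);  // leave room, fill in later
    b.appendNum(3.14);
    std::memcpy(b.atOfs(headerOfs), "hdr", 4);
    return b.len() == 4 + sizeof(double) ? 0 : 1;
}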
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp
index 422a6441e9a..ce1aa117fef 100644
--- a/src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp
@@ -39,340 +39,335 @@
namespace mongo {
namespace {
- using std::unique_ptr;
- using std::string;
- using std::vector;
+using std::unique_ptr;
+using std::string;
+using std::vector;
+
+template <class OnDiskFormat>
+class BtreeBuilderInterfaceImpl final : public SortedDataBuilderInterface {
+public:
+ BtreeBuilderInterfaceImpl(OperationContext* trans,
+ typename BtreeLogic<OnDiskFormat>::Builder* builder)
+ : _builder(builder), _trans(trans) {}
+
+ Status addKey(const BSONObj& key, const RecordId& loc) {
+ return _builder->addKey(key, DiskLoc::fromRecordId(loc));
+ }
- template <class OnDiskFormat>
- class BtreeBuilderInterfaceImpl final : public SortedDataBuilderInterface {
- public:
- BtreeBuilderInterfaceImpl(OperationContext* trans,
- typename BtreeLogic<OnDiskFormat>::Builder* builder)
- : _builder(builder), _trans(trans) { }
+private:
+ std::unique_ptr<typename BtreeLogic<OnDiskFormat>::Builder> _builder;
+
+ // Not owned here.
+ OperationContext* _trans;
+};
+
+template <class OnDiskFormat>
+class BtreeInterfaceImpl final : public SortedDataInterface {
+public:
+ BtreeInterfaceImpl(HeadManager* headManager,
+ RecordStore* recordStore,
+ SavedCursorRegistry* cursorRegistry,
+ const Ordering& ordering,
+ const string& indexName) {
+ _btree.reset(new BtreeLogic<OnDiskFormat>(
+ headManager, recordStore, cursorRegistry, ordering, indexName));
+ }
- Status addKey(const BSONObj& key, const RecordId& loc) {
- return _builder->addKey(key, DiskLoc::fromRecordId(loc));
- }
+ virtual ~BtreeInterfaceImpl() {}
- private:
- std::unique_ptr<typename BtreeLogic<OnDiskFormat>::Builder> _builder;
+ virtual SortedDataBuilderInterface* getBulkBuilder(OperationContext* txn, bool dupsAllowed) {
+ return new BtreeBuilderInterfaceImpl<OnDiskFormat>(txn,
+ _btree->newBuilder(txn, dupsAllowed));
+ }
- // Not owned here.
- OperationContext* _trans;
- };
+ virtual Status insert(OperationContext* txn,
+ const BSONObj& key,
+ const RecordId& loc,
+ bool dupsAllowed) {
+ return _btree->insert(txn, key, DiskLoc::fromRecordId(loc), dupsAllowed);
+ }
- template <class OnDiskFormat>
- class BtreeInterfaceImpl final : public SortedDataInterface {
- public:
- BtreeInterfaceImpl(HeadManager* headManager,
- RecordStore* recordStore,
- SavedCursorRegistry* cursorRegistry,
- const Ordering& ordering,
- const string& indexName) {
- _btree.reset(new BtreeLogic<OnDiskFormat>(headManager,
- recordStore,
- cursorRegistry,
- ordering,
- indexName));
- }
+ virtual void unindex(OperationContext* txn,
+ const BSONObj& key,
+ const RecordId& loc,
+ bool dupsAllowed) {
+ _btree->unindex(txn, key, DiskLoc::fromRecordId(loc));
+ }
- virtual ~BtreeInterfaceImpl() { }
+ virtual void fullValidate(OperationContext* txn,
+ bool full,
+ long long* numKeysOut,
+ BSONObjBuilder* output) const {
+ *numKeysOut = _btree->fullValidate(txn, NULL, false, false, 0);
+ }
- virtual SortedDataBuilderInterface* getBulkBuilder(OperationContext* txn,
- bool dupsAllowed) {
+ virtual bool appendCustomStats(OperationContext* txn,
+ BSONObjBuilder* output,
+ double scale) const {
+ return false;
+ }
- return new BtreeBuilderInterfaceImpl<OnDiskFormat>(
- txn, _btree->newBuilder(txn, dupsAllowed));
- }
+ virtual long long getSpaceUsedBytes(OperationContext* txn) const {
+ return _btree->getRecordStore()->dataSize(txn);
+ }
- virtual Status insert(OperationContext* txn,
- const BSONObj& key,
- const RecordId& loc,
- bool dupsAllowed) {
+ virtual Status dupKeyCheck(OperationContext* txn, const BSONObj& key, const RecordId& loc) {
+ return _btree->dupKeyCheck(txn, key, DiskLoc::fromRecordId(loc));
+ }
- return _btree->insert(txn, key, DiskLoc::fromRecordId(loc), dupsAllowed);
- }
+ virtual bool isEmpty(OperationContext* txn) {
+ return _btree->isEmpty(txn);
+ }
- virtual void unindex(OperationContext* txn,
- const BSONObj& key,
- const RecordId& loc,
- bool dupsAllowed) {
+ virtual Status touch(OperationContext* txn) const {
+ return _btree->touch(txn);
+ }
- _btree->unindex(txn, key, DiskLoc::fromRecordId(loc));
- }
+ class Cursor final : public SortedDataInterface::Cursor {
+ public:
+ Cursor(OperationContext* txn, const BtreeLogic<OnDiskFormat>* btree, bool forward)
+ : _txn(txn), _btree(btree), _direction(forward ? 1 : -1), _ofs(0) {}
+
+ boost::optional<IndexKeyEntry> next(RequestedInfo parts) override {
+ if (isEOF())
+ return {};
+ if (_lastMoveWasRestore) {
+ // Return current position rather than advancing.
+ _lastMoveWasRestore = false;
+ } else {
+ _btree->advance(_txn, &_bucket, &_ofs, _direction);
+ }
- virtual void fullValidate(OperationContext* txn, bool full, long long *numKeysOut,
- BSONObjBuilder* output) const {
- *numKeysOut = _btree->fullValidate(txn, NULL, false, false, 0);
+ if (atEndPoint())
+ markEOF();
+ return curr(parts);
}
- virtual bool appendCustomStats(OperationContext* txn, BSONObjBuilder* output, double scale)
- const {
- return false;
- }
+ void setEndPosition(const BSONObj& key, bool inclusive) override {
+ if (key.isEmpty()) {
+ // This means scan to end of index.
+ _endState = {};
+ return;
+ }
- virtual long long getSpaceUsedBytes( OperationContext* txn ) const {
- return _btree->getRecordStore()->dataSize( txn );
+ _endState = {{key, inclusive}};
+ seekEndCursor(); // Completes initialization of _endState.
}
- virtual Status dupKeyCheck(OperationContext* txn,
- const BSONObj& key,
- const RecordId& loc) {
- return _btree->dupKeyCheck(txn, key, DiskLoc::fromRecordId(loc));
- }
+ boost::optional<IndexKeyEntry> seek(const BSONObj& key,
+ bool inclusive,
+ RequestedInfo parts) override {
+ locate(key, inclusive == forward() ? RecordId::min() : RecordId::max());
+ _lastMoveWasRestore = false;
- virtual bool isEmpty(OperationContext* txn) {
- return _btree->isEmpty(txn);
+ if (isEOF())
+ return {};
+ dassert(inclusive ? compareKeys(getKey(), key) >= 0 : compareKeys(getKey(), key) > 0);
+ return curr(parts);
}
- virtual Status touch(OperationContext* txn) const{
- return _btree->touch(txn);
- }
- class Cursor final : public SortedDataInterface::Cursor {
- public:
- Cursor(OperationContext* txn,
- const BtreeLogic<OnDiskFormat>* btree,
- bool forward)
- : _txn(txn),
- _btree(btree),
- _direction(forward ? 1 : -1),
- _ofs(0)
- {}
-
- boost::optional<IndexKeyEntry> next(RequestedInfo parts) override {
- if (isEOF()) return {};
- if (_lastMoveWasRestore) {
- // Return current position rather than advancing.
- _lastMoveWasRestore = false;
- }
- else {
- _btree->advance(_txn, &_bucket, &_ofs, _direction);
- }
+ boost::optional<IndexKeyEntry> seek(const IndexSeekPoint& seekPoint,
+ RequestedInfo parts) override {
+ bool canUseAdvanceTo = false;
+ if (!isEOF()) {
+ int cmp = _btree->customBSONCmp(getKey(), seekPoint, _direction);
- if (atEndPoint()) markEOF();
- return curr(parts);
+ // advanceTo requires that we are positioned "earlier" in the index than the
+ // seek point, in scan order.
+ canUseAdvanceTo = forward() ? cmp < 0 : cmp > 0;
}
- void setEndPosition(const BSONObj& key, bool inclusive) override {
- if (key.isEmpty()) {
- // This means scan to end of index.
- _endState = {};
- return;
- }
- _endState = {{key, inclusive}};
- seekEndCursor(); // Completes initialization of _endState.
+ if (canUseAdvanceTo) {
+ // This takes advantage of current location.
+ _btree->advanceTo(_txn, &_bucket, &_ofs, seekPoint, _direction);
+ } else {
+ // Start at root.
+ _bucket = _btree->getHead(_txn);
+ _ofs = 0;
+ _btree->customLocate(_txn, &_bucket, &_ofs, seekPoint, _direction);
}
- boost::optional<IndexKeyEntry> seek(const BSONObj& key, bool inclusive,
- RequestedInfo parts) override {
- locate(key, inclusive == forward() ? RecordId::min() : RecordId::max());
- _lastMoveWasRestore = false;
-
- if (isEOF()) return {};
- dassert(inclusive ? compareKeys(getKey(), key) >= 0
- : compareKeys(getKey(), key) > 0);
- return curr(parts);
- }
+ _lastMoveWasRestore = false;
+ if (atOrPastEndPointAfterSeeking())
+ markEOF();
+ return curr(parts);
+ }
- boost::optional<IndexKeyEntry> seek(const IndexSeekPoint& seekPoint,
- RequestedInfo parts) override {
- bool canUseAdvanceTo = false;
- if (!isEOF()) {
- int cmp = _btree->customBSONCmp(getKey(), seekPoint, _direction);
-
- // advanceTo requires that we are positioned "earlier" in the index than the
- // seek point, in scan order.
- canUseAdvanceTo = forward() ? cmp < 0 : cmp > 0;
- }
+ void savePositioned() override {
+ _txn = nullptr;
+ if (!_lastMoveWasRestore)
+ _savedEOF = isEOF();
- if (canUseAdvanceTo) {
- // This takes advantage of current location.
- _btree->advanceTo(_txn, &_bucket, &_ofs, seekPoint, _direction);
- }
- else {
- // Start at root.
- _bucket = _btree->getHead(_txn);
- _ofs = 0;
- _btree->customLocate(_txn, &_bucket, &_ofs, seekPoint, _direction);
+ if (!isEOF()) {
+ _saved.bucket = _bucket;
+ _btree->savedCursors()->registerCursor(&_saved);
+ // Don't want to change saved position if we only moved during restore.
+ if (!_lastMoveWasRestore) {
+ _saved.key = getKey().getOwned();
+ _saved.loc = getDiskLoc();
}
+ }
+ // Doing nothing with end cursor since it will do full reseek on restore.
+ }
- _lastMoveWasRestore = false;
+ void saveUnpositioned() override {
+ _txn = nullptr;
+ // Don't leak our registration if savePositioned() was previously called.
+ if (!_saved.bucket.isNull())
+ _btree->savedCursors()->unregisterCursor(&_saved);
- if (atOrPastEndPointAfterSeeking()) markEOF();
- return curr(parts);
- }
+ _saved.bucket = DiskLoc();
+ _savedEOF = true;
+ }
- void savePositioned() override {
- _txn = nullptr;
+ void restore(OperationContext* txn) override {
+ // guard against accidental double restore
+ invariant(!_txn);
+ _txn = txn;
- if (!_lastMoveWasRestore) _savedEOF = isEOF();
+ // Always do a full seek on restore. We cannot use our last position since index
+ // entries may have been inserted closer to our endpoint and we would need to move
+ // over them.
+ seekEndCursor();
- if (!isEOF()) {
- _saved.bucket = _bucket;
- _btree->savedCursors()->registerCursor(&_saved);
- // Don't want to change saved position if we only moved during restore.
- if (!_lastMoveWasRestore) {
- _saved.key = getKey().getOwned();
- _saved.loc = getDiskLoc();
- }
- }
- // Doing nothing with end cursor since it will do full reseek on restore.
+ if (_savedEOF) {
+ markEOF();
+ return;
}
- void saveUnpositioned() override {
- _txn = nullptr;
- // Don't leak our registration if savePositioned() was previously called.
- if (!_saved.bucket.isNull()) _btree->savedCursors()->unregisterCursor(&_saved);
-
- _saved.bucket = DiskLoc();
- _savedEOF = true;
+ if (_btree->savedCursors()->unregisterCursor(&_saved)) {
+ // We can use the fast restore mechanism.
+ _btree->restorePosition(_txn, _saved.key, _saved.loc, _direction, &_bucket, &_ofs);
+ } else {
+ // Need to find our position from the root.
+ locate(_saved.key, _saved.loc.toRecordId());
}
- void restore(OperationContext* txn) override {
- // guard against accidental double restore
- invariant(!_txn);
- _txn = txn;
+ _lastMoveWasRestore = isEOF() // We weren't EOF but now are.
+ || getDiskLoc() != _saved.loc || compareKeys(getKey(), _saved.key) != 0;
+ }
- // Always do a full seek on restore. We cannot use our last position since index
- // entries may have been inserted closer to our endpoint and we would need to move
- // over them.
- seekEndCursor();
+ private:
+ bool isEOF() const {
+ return _bucket.isNull();
+ }
+ void markEOF() {
+ _bucket = DiskLoc();
+ }
- if (_savedEOF) {
- markEOF();
- return;
- }
+ boost::optional<IndexKeyEntry> curr(RequestedInfo parts) {
+ if (isEOF())
+ return {};
+ return {{(parts & kWantKey) ? getKey() : BSONObj(),
+ (parts & kWantLoc) ? getDiskLoc().toRecordId() : RecordId()}};
+ }
- if (_btree->savedCursors()->unregisterCursor(&_saved)) {
- // We can use the fast restore mechanism.
- _btree->restorePosition(_txn, _saved.key, _saved.loc, _direction,
- &_bucket, &_ofs);
- }
- else {
- // Need to find our position from the root.
- locate(_saved.key, _saved.loc.toRecordId());
- }
+ bool atEndPoint() const {
+ return _endState && _bucket == _endState->bucket && (isEOF() || _ofs == _endState->ofs);
+ }
- _lastMoveWasRestore = isEOF() // We weren't EOF but now are.
- || getDiskLoc() != _saved.loc
- || compareKeys(getKey(), _saved.key) != 0;
- }
+ bool atOrPastEndPointAfterSeeking() const {
+ if (!_endState)
+ return false;
+ if (isEOF())
+ return true;
- private:
- bool isEOF() const { return _bucket.isNull(); }
- void markEOF() { _bucket = DiskLoc(); }
+ int cmp = compareKeys(getKey(), _endState->key);
+ return _endState->inclusive ? cmp > 0 : cmp >= 0;
+ }
- boost::optional<IndexKeyEntry> curr(RequestedInfo parts) {
- if (isEOF()) return {};
- return {{(parts & kWantKey) ? getKey() : BSONObj(),
- (parts & kWantLoc) ? getDiskLoc().toRecordId() : RecordId()}};
- }
+ void locate(const BSONObj& key, const RecordId& loc) {
+ _btree->locate(_txn, key, DiskLoc::fromRecordId(loc), _direction, &_ofs, &_bucket);
+ if (atOrPastEndPointAfterSeeking())
+ markEOF();
+ }
- bool atEndPoint() const {
- return _endState
- && _bucket == _endState->bucket
- && (isEOF() || _ofs == _endState->ofs);
- }
+ // Returns comparison relative to direction of scan. If rhs would be seen later, returns
+ // a positive value.
+ int compareKeys(const BSONObj& lhs, const BSONObj& rhs) const {
+ int cmp = lhs.woCompare(rhs, _btree->ordering(), /*considerFieldName*/ false);
+ return forward() ? cmp : -cmp;
+ }
- bool atOrPastEndPointAfterSeeking() const {
- if (!_endState) return false;
- if (isEOF()) return true;
-
- int cmp = compareKeys(getKey(), _endState->key);
- return _endState->inclusive ? cmp > 0 : cmp >= 0;
- }
+ BSONObj getKey() const {
+ return _btree->getKey(_txn, _bucket, _ofs);
+ }
+ DiskLoc getDiskLoc() const {
+ return _btree->getDiskLoc(_txn, _bucket, _ofs);
+ }
- void locate(const BSONObj& key, const RecordId& loc) {
- _btree->locate(_txn, key, DiskLoc::fromRecordId(loc), _direction, &_ofs, &_bucket);
- if (atOrPastEndPointAfterSeeking()) markEOF();
- }
+ void seekEndCursor() {
+ if (!_endState)
+ return;
+ _btree->locate(_txn,
+ _endState->key,
+ forward() == _endState->inclusive ? DiskLoc::max() : DiskLoc::min(),
+ _direction,
+ &_endState->ofs,
+ &_endState->bucket); // pure out params.
+ }
- // Returns comparison relative to direction of scan. If rhs would be seen later, returns
- // a positive value.
- int compareKeys(const BSONObj& lhs, const BSONObj& rhs) const {
- int cmp = lhs.woCompare(rhs, _btree->ordering(), /*considerFieldName*/false);
- return forward() ? cmp : -cmp;
- }
+ bool forward() const {
+ return _direction == 1;
+ }
- BSONObj getKey() const { return _btree->getKey(_txn, _bucket, _ofs); }
- DiskLoc getDiskLoc() const { return _btree->getDiskLoc(_txn, _bucket, _ofs); }
+ OperationContext* _txn; // not owned
+ const BtreeLogic<OnDiskFormat>* const _btree;
+ const int _direction;
- void seekEndCursor() {
- if (!_endState) return;
- _btree->locate(_txn,
- _endState->key,
- forward() == _endState->inclusive ? DiskLoc::max() : DiskLoc::min(),
- _direction,
- &_endState->ofs, &_endState->bucket); // pure out params.
- }
+ DiskLoc _bucket;
+ int _ofs;
- bool forward() const { return _direction == 1; }
-
- OperationContext* _txn; // not owned
- const BtreeLogic<OnDiskFormat>* const _btree;
- const int _direction;
-
- DiskLoc _bucket;
- int _ofs;
-
- struct EndState {
- BSONObj key;
- bool inclusive;
- DiskLoc bucket;
- int ofs;
- };
- boost::optional<EndState> _endState;
-
- // Used by next to decide to return current position rather than moving. Should be reset
- // to false by any operation that moves the cursor, other than subsequent save/restore
- // pairs.
- bool _lastMoveWasRestore = false;
-
- // Only used by save/restore() if _bucket is non-Null.
- bool _savedEOF = false;
- SavedCursorRegistry::SavedCursor _saved;
+ struct EndState {
+ BSONObj key;
+ bool inclusive;
+ DiskLoc bucket;
+ int ofs;
};
+ boost::optional<EndState> _endState;
- virtual std::unique_ptr<SortedDataInterface::Cursor> newCursor(
- OperationContext* txn,
- bool isForward = true) const {
- return stdx::make_unique<Cursor>(txn, _btree.get(), isForward);
- }
+ // Used by next to decide to return current position rather than moving. Should be reset
+ // to false by any operation that moves the cursor, other than subsequent save/restore
+ // pairs.
+ bool _lastMoveWasRestore = false;
- virtual Status initAsEmpty(OperationContext* txn) {
- return _btree->initAsEmpty(txn);
- }
-
- private:
- unique_ptr<BtreeLogic<OnDiskFormat> > _btree;
+ // Only used by save/restore() if _bucket is non-Null.
+ bool _savedEOF = false;
+ SavedCursorRegistry::SavedCursor _saved;
};
-} // namespace
-
- SortedDataInterface* getMMAPV1Interface(HeadManager* headManager,
- RecordStore* recordStore,
- SavedCursorRegistry* cursorRegistry,
- const Ordering& ordering,
- const string& indexName,
- int version) {
- if (0 == version) {
- return new BtreeInterfaceImpl<BtreeLayoutV0>(headManager,
- recordStore,
- cursorRegistry,
- ordering,
- indexName);
- }
- else {
- invariant(1 == version);
- return new BtreeInterfaceImpl<BtreeLayoutV1>(headManager,
- recordStore,
- cursorRegistry,
- ordering,
- indexName);
- }
+
+ virtual std::unique_ptr<SortedDataInterface::Cursor> newCursor(OperationContext* txn,
+ bool isForward = true) const {
+ return stdx::make_unique<Cursor>(txn, _btree.get(), isForward);
+ }
+
+ virtual Status initAsEmpty(OperationContext* txn) {
+ return _btree->initAsEmpty(txn);
+ }
+
+private:
+ unique_ptr<BtreeLogic<OnDiskFormat>> _btree;
+};
+} // namespace
+
+SortedDataInterface* getMMAPV1Interface(HeadManager* headManager,
+ RecordStore* recordStore,
+ SavedCursorRegistry* cursorRegistry,
+ const Ordering& ordering,
+ const string& indexName,
+ int version) {
+ if (0 == version) {
+ return new BtreeInterfaceImpl<BtreeLayoutV0>(
+ headManager, recordStore, cursorRegistry, ordering, indexName);
+ } else {
+ invariant(1 == version);
+ return new BtreeInterfaceImpl<BtreeLayoutV1>(
+ headManager, recordStore, cursorRegistry, ordering, indexName);
}
+}
} // namespace mongo
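
[Editor's note] The Cursor above normalizes everything to scan order: compareKeys() negates the raw key comparison for reverse scans, so a positive result always means "later in the scan", and the end-state check then reads identically for both directions. A standalone sketch of that pattern with plain ints standing in for BSON keys; MiniCursor is hypothetical, not the class in the diff:

#include <optional>

struct MiniCursor {
    struct EndState {
        int key;
        bool inclusive;
    };

    int direction;  // +1 forward, -1 reverse, as in the Cursor above
    std::optional<EndState> endState;

    // Comparison relative to scan direction: positive means lhs would be
    // seen later in the scan than rhs.
    int compareKeys(int lhs, int rhs) const {
        int cmp = (lhs > rhs) - (lhs < rhs);
        return direction == 1 ? cmp : -cmp;
    }

    // Mirrors atOrPastEndPointAfterSeeking(): one branchless-in-direction
    // test serves forward and reverse scans alike.
    bool atOrPastEndPoint(int currentKey) const {
        if (!endState)
            return false;  // unbounded scan
        int cmp = compareKeys(currentKey, endState->key);
        return endState->inclusive ? cmp > 0 : cmp >= 0;
    }
};

int main() {
    // Reverse scan with an exclusive lower endpoint of 3: reaching 3 stops
    // the scan, while 4 (seen earlier in reverse order) does not.
    MiniCursor reverse{-1, MiniCursor::EndState{3, /*inclusive=*/false}};
    bool ok = reverse.atOrPastEndPoint(3) && !reverse.atOrPastEndPoint(4);
    return ok ? 0 : 1;
}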
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_interface.h b/src/mongo/db/storage/mmap_v1/btree/btree_interface.h
index cb2cdd21125..b5814c8a1f5 100644
--- a/src/mongo/db/storage/mmap_v1/btree/btree_interface.h
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_interface.h
@@ -39,12 +39,12 @@
#pragma once
namespace mongo {
- class SavedCursorRegistry;
+class SavedCursorRegistry;
- SortedDataInterface* getMMAPV1Interface(HeadManager* headManager,
- RecordStore* recordStore,
- SavedCursorRegistry* cursorRegistry,
- const Ordering& ordering,
- const std::string& indexName,
- int version);
+SortedDataInterface* getMMAPV1Interface(HeadManager* headManager,
+ RecordStore* recordStore,
+ SavedCursorRegistry* cursorRegistry,
+ const Ordering& ordering,
+ const std::string& indexName,
+ int version);
} // namespace mongo
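
[Editor's note] getMMAPV1Interface() is a version-dispatch factory: the index version recorded in the catalog picks the on-disk layout type at the template boundary (BtreeLayoutV0 vs. BtreeLayoutV1 in the .cpp above), after which callers only see the virtual SortedDataInterface. A standalone sketch of the shape of that dispatch, with stand-in types rather than the MongoDB classes:

#include <cassert>
#include <memory>

struct SortedData {
    virtual ~SortedData() = default;
    virtual int layoutVersion() const = 0;
};

// The layout is a compile-time choice; the factory below erases it.
template <int Version>
struct BtreeImpl : SortedData {
    int layoutVersion() const override { return Version; }
};

std::unique_ptr<SortedData> makeInterface(int version) {
    if (0 == version)
        return std::make_unique<BtreeImpl<0>>();
    assert(1 == version);  // mirrors invariant(1 == version) in the real factory
    return std::make_unique<BtreeImpl<1>>();
}

int main() {
    assert(makeInterface(0)->layoutVersion() == 0);
    assert(makeInterface(1)->layoutVersion() == 1);
    return 0;
}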
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_interface_test.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_interface_test.cpp
index 23f649bfcaa..1272ea4d080 100644
--- a/src/mongo/db/storage/mmap_v1/btree/btree_interface_test.cpp
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_interface_test.cpp
@@ -35,40 +35,32 @@
namespace mongo {
- using std::unique_ptr;
+using std::unique_ptr;
- class MyHarnessHelper final : public HarnessHelper {
- public:
- MyHarnessHelper()
- : _recordStore("a.b"),
- _order(Ordering::make(BSONObj())) {
- }
+class MyHarnessHelper final : public HarnessHelper {
+public:
+ MyHarnessHelper() : _recordStore("a.b"), _order(Ordering::make(BSONObj())) {}
- std::unique_ptr<SortedDataInterface> newSortedDataInterface(bool unique) final {
- std::unique_ptr<SortedDataInterface> sorted(getMMAPV1Interface(&_headManager,
- &_recordStore,
- &_cursorRegistry,
- _order,
- "a_1",
- 1));
- OperationContextNoop op;
- massertStatusOK(sorted->initAsEmpty(&op));
- return sorted;
- }
-
- std::unique_ptr<RecoveryUnit> newRecoveryUnit() final {
- return stdx::make_unique<HeapRecordStoreBtreeRecoveryUnit>();
- }
-
- private:
- TestHeadManager _headManager;
- HeapRecordStoreBtree _recordStore;
- SavedCursorRegistry _cursorRegistry;
- Ordering _order;
- };
+ std::unique_ptr<SortedDataInterface> newSortedDataInterface(bool unique) final {
+ std::unique_ptr<SortedDataInterface> sorted(
+ getMMAPV1Interface(&_headManager, &_recordStore, &_cursorRegistry, _order, "a_1", 1));
+ OperationContextNoop op;
+ massertStatusOK(sorted->initAsEmpty(&op));
+ return sorted;
+ }
- std::unique_ptr<HarnessHelper> newHarnessHelper() {
- return stdx::make_unique<MyHarnessHelper>();
+ std::unique_ptr<RecoveryUnit> newRecoveryUnit() final {
+ return stdx::make_unique<HeapRecordStoreBtreeRecoveryUnit>();
}
+private:
+ TestHeadManager _headManager;
+ HeapRecordStoreBtree _recordStore;
+ SavedCursorRegistry _cursorRegistry;
+ Ordering _order;
+};
+
+std::unique_ptr<HarnessHelper> newHarnessHelper() {
+ return stdx::make_unique<MyHarnessHelper>();
+}
}
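
[Editor's note] The btree_logic.cpp diff that follows opens with a comment describing the Builder's bulk-load algorithm: keys arrive pre-sorted and are appended to the rightmost leaf; when a bucket fills, its highest key is promoted into the parent (splitting upward as needed, growing a new root if necessary) and a fresh right sibling receives the subsequent keys. A standalone sketch of that bottom-up build, with plain ints for keys and simplified nodes; bucket ownership and the on-disk nextChild/prevChild encoding are deliberately elided:

#include <cassert>
#include <cstddef>
#include <vector>

constexpr std::size_t kMaxKeys = 4;  // stand-in for the bucket-full condition

struct Node {
    std::vector<int> keys;
    std::vector<Node*> children;  // keys.size() + 1 entries in internal nodes
    Node* parent = nullptr;
};

struct Builder {
    Node* root = new Node();  // leaked in this sketch; lifetime management elided
    Node* rightLeaf = root;

    // Keys must arrive in non-decreasing order, as addKey() in the diff requires.
    void addKey(int key) {
        if (rightLeaf->keys.size() == kMaxKeys)
            rightLeaf = newBucket(rightLeaf);
        rightLeaf->keys.push_back(key);
    }

    // Promote leftSib's highest key into its parent (growing the tree upward
    // when the root splits) and return a fresh right sibling, mirroring
    // Builder::newBucket() in the diff below.
    Node* newBucket(Node* leftSib) {
        if (leftSib->parent == nullptr) {
            Node* newRoot = new Node();  // making a new root
            newRoot->children.push_back(leftSib);
            leftSib->parent = newRoot;
            root = newRoot;
        }
        Node* parent = leftSib->parent;
        assert(parent->children.back() == leftSib);  // always the rightmost child

        int promoted = leftSib->keys.back();  // popBack() analogue
        leftSib->keys.pop_back();

        if (parent->keys.size() == kMaxKeys) {
            // Parent is full: split it the same way, then re-home leftSib
            // under the parent's new right sibling.
            parent->children.pop_back();
            parent = newBucket(parent);
            parent->children.push_back(leftSib);
            leftSib->parent = parent;
        }
        parent->keys.push_back(promoted);  // leftSib now sits left of 'promoted'

        Node* sib = new Node();  // all later keys are >= everything to its left
        sib->parent = parent;
        parent->children.push_back(sib);
        return sib;
    }
};

int main() {
    Builder b;
    for (int k = 0; k < 100; ++k)
        b.addKey(k);
    assert(b.root->keys.size() >= 1 && b.rightLeaf->keys.back() == 99);
    return 0;
}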
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp
index 1afe24331cf..11e31b3fce7 100644
--- a/src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp
@@ -42,2383 +42,2299 @@
namespace mongo {
- using std::unique_ptr;
- using std::dec;
- using std::endl;
- using std::hex;
- using std::make_pair;
- using std::pair;
- using std::string;
- using std::stringstream;
- using std::vector;
-
- // BtreeLogic::Builder algorithm
- //
- // Phase 1:
- // Handled by caller. Extracts keys from raw documents and puts them in external sorter
- //
- // Phase 2 (the addKeys phase):
- // Add all keys to buckets. When a bucket gets full, pop the highest key (setting the
- // nextChild pointer of the bucket to the prevChild of the popped key), add the popped key to
- // a parent bucket, and create a new right sibling bucket to add the new key to. If the parent
- // bucket is full, this same operation is performed on the parent and all full ancestors. If
- // we get to the root and it is full, a new root is created above the current root. When
- // creating a new right sibling, it is set as its parent's nextChild as all keys in the right
- // sibling will be higher than all keys currently in the parent.
-
- //
- // Public Builder logic
- //
-
- template <class BtreeLayout>
- typename BtreeLogic<BtreeLayout>::Builder*
- BtreeLogic<BtreeLayout>::newBuilder(OperationContext* txn, bool dupsAllowed) {
- return new Builder(this, txn, dupsAllowed);
- }
-
- template <class BtreeLayout>
- BtreeLogic<BtreeLayout>::Builder::Builder(BtreeLogic* logic,
- OperationContext* txn,
- bool dupsAllowed)
- : _logic(logic),
- _dupsAllowed(dupsAllowed),
- _txn(txn) {
-
- // The normal bulk building path calls initAsEmpty, so we already have an empty root bucket.
- // This isn't the case in some unit tests that use the Builder directly rather than going
- // through an IndexAccessMethod.
- _rightLeafLoc = DiskLoc::fromRecordId(_logic->_headManager->getHead(txn));
- if (_rightLeafLoc.isNull()) {
- _rightLeafLoc = _logic->_addBucket(txn);
- _logic->_headManager->setHead(_txn, _rightLeafLoc.toRecordId());
- }
-
- // must be empty when starting
- invariant(_getBucket(_rightLeafLoc)->n == 0);
- }
-
- template <class BtreeLayout>
- class BtreeLogic<BtreeLayout>::Builder::SetRightLeafLocChange : public RecoveryUnit::Change {
- public:
- SetRightLeafLocChange(Builder* builder, DiskLoc oldLoc)
- : _builder(builder)
- , _oldLoc(oldLoc)
- {}
-
- virtual void commit() {}
- virtual void rollback() { _builder->_rightLeafLoc = _oldLoc; }
-
- Builder* _builder;
- const DiskLoc _oldLoc;
- };
-
- template <class BtreeLayout>
- Status BtreeLogic<BtreeLayout>::Builder::addKey(const BSONObj& keyObj, const DiskLoc& loc) {
- unique_ptr<KeyDataOwnedType> key(new KeyDataOwnedType(keyObj));
-
- if (key->dataSize() > BtreeLayout::KeyMax) {
- string msg = str::stream() << "Btree::insert: key too large to index, failing "
- << _logic->_indexName
- << ' ' << key->dataSize() << ' ' << key->toString();
- log() << msg << endl;
- return Status(ErrorCodes::KeyTooLong, msg);
- }
+using std::unique_ptr;
+using std::dec;
+using std::endl;
+using std::hex;
+using std::make_pair;
+using std::pair;
+using std::string;
+using std::stringstream;
+using std::vector;
+
+// BtreeLogic::Builder algorithm
+//
+// Phase 1:
+// Handled by caller. Extracts keys from raw documents and puts them in external sorter
+//
+// Phase 2 (the addKeys phase):
+// Add all keys to buckets. When a bucket gets full, pop the highest key (setting the
+// nextChild pointer of the bucket to the prevChild of the popped key), add the popped key to
+// a parent bucket, and create a new right sibling bucket to add the new key to. If the parent
+// bucket is full, this same operation is performed on the parent and all full ancestors. If
+// we get to the root and it is full, a new root is created above the current root. When
+// creating a new right sibling, it is set as its parent's nextChild as all keys in the right
+// sibling will be higher than all keys currently in the parent.
+
+//
+// Public Builder logic
+//
+
+template <class BtreeLayout>
+typename BtreeLogic<BtreeLayout>::Builder* BtreeLogic<BtreeLayout>::newBuilder(
+ OperationContext* txn, bool dupsAllowed) {
+ return new Builder(this, txn, dupsAllowed);
+}
+
+template <class BtreeLayout>
+BtreeLogic<BtreeLayout>::Builder::Builder(BtreeLogic* logic,
+ OperationContext* txn,
+ bool dupsAllowed)
+ : _logic(logic), _dupsAllowed(dupsAllowed), _txn(txn) {
+ // The normal bulk building path calls initAsEmpty, so we already have an empty root bucket.
+ // This isn't the case in some unit tests that use the Builder directly rather than going
+ // through an IndexAccessMethod.
+ _rightLeafLoc = DiskLoc::fromRecordId(_logic->_headManager->getHead(txn));
+ if (_rightLeafLoc.isNull()) {
+ _rightLeafLoc = _logic->_addBucket(txn);
+ _logic->_headManager->setHead(_txn, _rightLeafLoc.toRecordId());
+ }
+
+ // must be empty when starting
+ invariant(_getBucket(_rightLeafLoc)->n == 0);
+}
+
+template <class BtreeLayout>
+class BtreeLogic<BtreeLayout>::Builder::SetRightLeafLocChange : public RecoveryUnit::Change {
+public:
+ SetRightLeafLocChange(Builder* builder, DiskLoc oldLoc) : _builder(builder), _oldLoc(oldLoc) {}
+
+ virtual void commit() {}
+ virtual void rollback() {
+ _builder->_rightLeafLoc = _oldLoc;
+ }
+
+ Builder* _builder;
+ const DiskLoc _oldLoc;
+};
+
+template <class BtreeLayout>
+Status BtreeLogic<BtreeLayout>::Builder::addKey(const BSONObj& keyObj, const DiskLoc& loc) {
+ unique_ptr<KeyDataOwnedType> key(new KeyDataOwnedType(keyObj));
+
+ if (key->dataSize() > BtreeLayout::KeyMax) {
+ string msg = str::stream() << "Btree::insert: key too large to index, failing "
+ << _logic->_indexName << ' ' << key->dataSize() << ' '
+ << key->toString();
+ log() << msg << endl;
+ return Status(ErrorCodes::KeyTooLong, msg);
+ }
+
+ // If we have a previous key to compare to...
+ if (_keyLast.get()) {
+ int cmp = _keyLast->woCompare(*key, _logic->_ordering);
+
+ // This shouldn't happen ever. We expect keys in sorted order.
+ if (cmp > 0) {
+ return Status(ErrorCodes::InternalError, "Bad key order in btree builder");
+ }
+
+ // This could easily happen..
+ if (!_dupsAllowed && (cmp == 0)) {
+ return Status(ErrorCodes::DuplicateKey, _logic->dupKeyError(*_keyLast));
+ }
+ }
+
+ BucketType* rightLeaf = _getModifiableBucket(_rightLeafLoc);
+ if (!_logic->pushBack(rightLeaf, loc, *key, DiskLoc())) {
+ // bucket was full, so split and try with the new node.
+ _txn->recoveryUnit()->registerChange(new SetRightLeafLocChange(this, _rightLeafLoc));
+ _rightLeafLoc = newBucket(rightLeaf, _rightLeafLoc);
+ rightLeaf = _getModifiableBucket(_rightLeafLoc);
+ invariant(_logic->pushBack(rightLeaf, loc, *key, DiskLoc()));
+ }
+
+ _keyLast = std::move(key);
+ return Status::OK();
+}
+
+//
+// Private Builder logic
+//
+
+template <class BtreeLayout>
+DiskLoc BtreeLogic<BtreeLayout>::Builder::newBucket(BucketType* leftSib, DiskLoc leftSibLoc) {
+ invariant(leftSib->n >= 2); // Guaranteed by sufficiently small KeyMax.
+
+ if (leftSib->parent.isNull()) {
+ // Making a new root
+ invariant(leftSibLoc.toRecordId() == _logic->_headManager->getHead(_txn));
+ const DiskLoc newRootLoc = _logic->_addBucket(_txn);
+ leftSib->parent = newRootLoc;
+ _logic->_headManager->setHead(_txn, newRootLoc.toRecordId());
+
+ // Set the newRoot's nextChild to point to leftSib for the invariant below.
+ BucketType* newRoot = _getBucket(newRootLoc);
+ *_txn->recoveryUnit()->writing(&newRoot->nextChild) = leftSibLoc;
+ }
+
+ DiskLoc parentLoc = leftSib->parent;
+ BucketType* parent = _getModifiableBucket(parentLoc);
+
+ // For the pushBack below to be correct, leftSib must be the right-most child of parent.
+ invariant(parent->nextChild == leftSibLoc);
+
+ // Pull right-most key out of leftSib and move to parent, splitting parent if necessary.
+ // Note that popBack() handles setting leftSib's nextChild to the former prevChildNode of
+ // the popped key.
+ KeyDataType key;
+ DiskLoc val;
+ _logic->popBack(leftSib, &val, &key);
+ if (!_logic->pushBack(parent, val, key, leftSibLoc)) {
+ // parent is full, so split it.
+ parentLoc = newBucket(parent, parentLoc);
+ parent = _getModifiableBucket(parentLoc);
+ invariant(_logic->pushBack(parent, val, key, leftSibLoc));
+ leftSib->parent = parentLoc;
+ }
+
+ // Create a new bucket to the right of leftSib and set its parent pointer and the downward
+ // nextChild pointer from the parent.
+ DiskLoc newBucketLoc = _logic->_addBucket(_txn);
+ BucketType* newBucket = _getBucket(newBucketLoc);
+ *_txn->recoveryUnit()->writing(&newBucket->parent) = parentLoc;
+ *_txn->recoveryUnit()->writing(&parent->nextChild) = newBucketLoc;
+ return newBucketLoc;
+}
+
+template <class BtreeLayout>
+typename BtreeLogic<BtreeLayout>::BucketType*
+BtreeLogic<BtreeLayout>::Builder::_getModifiableBucket(DiskLoc loc) {
+ return _logic->btreemod(_txn, _logic->getBucket(_txn, loc));
+}
+
+template <class BtreeLayout>
+typename BtreeLogic<BtreeLayout>::BucketType* BtreeLogic<BtreeLayout>::Builder::_getBucket(
+ DiskLoc loc) {
+ return _logic->getBucket(_txn, loc);
+}
+
+//
+// BtreeLogic logic
+//
+
+// static
+template <class BtreeLayout>
+typename BtreeLogic<BtreeLayout>::FullKey BtreeLogic<BtreeLayout>::getFullKey(
+ const BucketType* bucket, int i) {
+ if (i >= bucket->n) {
+ int code = 13000;
+ massert(code,
+ (string) "invalid keyNode: " + BSON("i" << i << "n" << bucket->n).jsonString(),
+ i < bucket->n);
+ }
+ return FullKey(bucket, i);
+}
+
+// static
+template <class BtreeLayout>
+typename BtreeLogic<BtreeLayout>::KeyHeaderType& BtreeLogic<BtreeLayout>::getKeyHeader(
+ BucketType* bucket, int i) {
+ return ((KeyHeaderType*)bucket->data)[i];
+}
+
+// static
+template <class BtreeLayout>
+const typename BtreeLogic<BtreeLayout>::KeyHeaderType& BtreeLogic<BtreeLayout>::getKeyHeader(
+ const BucketType* bucket, int i) {
+ return ((const KeyHeaderType*)bucket->data)[i];
+}
+
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::markUnused(BucketType* bucket, int keyPos) {
+ invariant(keyPos >= 0 && keyPos < bucket->n);
+ getKeyHeader(bucket, keyPos).setUnused();
+}
+
+template <class BtreeLayout>
+char* BtreeLogic<BtreeLayout>::dataAt(BucketType* bucket, short ofs) {
+ return bucket->data + ofs;
+}
+
+template <class BtreeLayout>
+typename BtreeLogic<BtreeLayout>::BucketType* BtreeLogic<BtreeLayout>::btreemod(
+ OperationContext* txn, BucketType* bucket) {
+ txn->recoveryUnit()->writingPtr(bucket, BtreeLayout::BucketSize);
+ return bucket;
+}
+
+template <class BtreeLayout>
+int BtreeLogic<BtreeLayout>::totalDataSize(BucketType* bucket) {
+ return (int)(BtreeLayout::BucketSize - (bucket->data - (char*)bucket));
+}
+
+// We define this value as the maximum number of bytes such that, if we have
+// fewer than this many bytes, we must be able to either merge with or receive
+// keys from any neighboring node. If our utilization goes below this value we
+// know we can bring up the utilization with a simple operation. Ignoring the
+// 90/10 split policy which is sometimes employed and our 'unused' nodes, this
+// is a lower bound on bucket utilization for non root buckets.
+//
+// Note that the exact value here depends on the implementation of
+// _rebalancedSeparatorPos(). The conditions for lowWaterMark - 1 are as
+// follows: We know we cannot merge with the neighbor, so the total data size
+// for us, the neighbor, and the separator must be at least
+// BucketType::bodySize() + 1. We must be able to accept one key of any
+// allowed size, so our size plus storage for that additional key must be
+// <= BucketType::bodySize() / 2. This way, with the extra key we'll have a
+// new bucket data size < half the total data size and by the implementation
+// of _rebalancedSeparatorPos() the key must be added.
+template <class BtreeLayout>
+int BtreeLogic<BtreeLayout>::lowWaterMark() {
+ return BtreeLayout::BucketBodySize / 2 - BtreeLayout::KeyMax - sizeof(KeyHeaderType) + 1;
+}
+
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::init(BucketType* bucket) {
+ BtreeLayout::initBucket(bucket);
+ bucket->parent.Null();
+ bucket->nextChild.Null();
+ bucket->flags = Packed;
+ bucket->n = 0;
+ bucket->emptySize = totalDataSize(bucket);
+ bucket->topSize = 0;
+}
+
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::_unalloc(BucketType* bucket, int bytes) {
+ bucket->topSize -= bytes;
+ bucket->emptySize += bytes;
+}
- // If we have a previous key to compare to...
- if (_keyLast.get()) {
- int cmp = _keyLast->woCompare(*key, _logic->_ordering);
-
- // This shouldn't happen ever. We expect keys in sorted order.
- if (cmp > 0) {
- return Status(ErrorCodes::InternalError, "Bad key order in btree builder");
- }
+/**
+ * We allocate space from the end of the buffer for data. The keynodes grow from the front.
+ */
+template <class BtreeLayout>
+int BtreeLogic<BtreeLayout>::_alloc(BucketType* bucket, int bytes) {
+ invariant(bucket->emptySize >= bytes);
+ bucket->topSize += bytes;
+ bucket->emptySize -= bytes;
+ int ofs = totalDataSize(bucket) - bucket->topSize;
+ invariant(ofs > 0);
+ return ofs;
+}
+
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::setNotPacked(BucketType* bucket) {
+ bucket->flags &= ~Packed;
+}
+
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::setPacked(BucketType* bucket) {
+ bucket->flags |= Packed;
+}
+
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::_delKeyAtPos(BucketType* bucket, int keypos, bool mayEmpty) {
+ invariant(keypos >= 0 && keypos <= bucket->n);
+ invariant(childLocForPos(bucket, keypos).isNull());
+ invariant((mayEmpty && bucket->n > 0) || bucket->n > 1 || bucket->nextChild.isNull());
+
+ bucket->emptySize += sizeof(KeyHeaderType);
+ bucket->n--;
+
+ for (int j = keypos; j < bucket->n; j++) {
+ getKeyHeader(bucket, j) = getKeyHeader(bucket, j + 1);
+ }
+
+ setNotPacked(bucket);
+}
- // This could easily happen..
- if (!_dupsAllowed && (cmp == 0)) {
- return Status(ErrorCodes::DuplicateKey, _logic->dupKeyError(*_keyLast));
- }
- }
-
- BucketType* rightLeaf = _getModifiableBucket(_rightLeafLoc);
- if (!_logic->pushBack(rightLeaf, loc, *key, DiskLoc())) {
- // bucket was full, so split and try with the new node.
- _txn->recoveryUnit()->registerChange(new SetRightLeafLocChange(this, _rightLeafLoc));
- _rightLeafLoc = newBucket(rightLeaf, _rightLeafLoc);
- rightLeaf = _getModifiableBucket(_rightLeafLoc);
- invariant(_logic->pushBack(rightLeaf, loc, *key, DiskLoc()));
- }
+/**
+ * Pull rightmost key from the bucket and set its prevChild pointer to be the nextChild for the
+ * whole bucket. It is assumed that caller already has the old value of the nextChild
+ * pointer and is about to add a pointer to it elsewhere in the tree.
+ *
+ * This is only used by BtreeLogic::Builder. Think very hard (and change this comment) before
+ * using it anywhere else.
+ *
+ * WARNING: The keyDataOut that is filled out by this function points to newly unalloced memory
+ * inside of this bucket. It only remains valid until the next write to this bucket.
+ */
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::popBack(BucketType* bucket,
+ DiskLoc* recordLocOut,
+ KeyDataType* keyDataOut) {
+ massert(17435, "n==0 in btree popBack()", bucket->n > 0);
+
+ invariant(getKeyHeader(bucket, bucket->n - 1).isUsed());
+
+ FullKey kn = getFullKey(bucket, bucket->n - 1);
+ *recordLocOut = kn.recordLoc;
+ keyDataOut->assign(kn.data);
+ int keysize = kn.data.dataSize();
+
+ // The left/prev child of the node we are popping now goes in to the nextChild slot as all
+ // of its keys are greater than all remaining keys in this node.
+ bucket->nextChild = kn.prevChildBucket;
+ bucket->n--;
+
+ // This is risky because the keyDataOut we filled out above will now point to this newly
+ // unalloced memory.
+ bucket->emptySize += sizeof(KeyHeaderType);
+ _unalloc(bucket, keysize);
+}
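
The aliasing hazard called out in the popBack() comment can be reproduced in a few lines of plain C++. This toy arena (illustrative only; 'alloc' here is a local lambda, not the member function above) shows why the popped key's bytes are valid only until the next allocation from the same bucket:

#include <cassert>
#include <cstring>

int main() {
    char data[64];
    int topSize = 0;

    // Allocate 'bytes' from the end of 'data', like _alloc() above.
    auto alloc = [&](int bytes) { topSize += bytes; return (int)sizeof(data) - topSize; };

    int ofs = alloc(8);
    std::memcpy(data + ofs, "old-key", 8);  // 7 chars + NUL
    const char* popped = data + ofs;        // like keyDataOut: aliases the bucket
    topSize -= 8;                           // like _unalloc(): space is free again

    int ofs2 = alloc(8);                    // the next write reuses that region
    std::memcpy(data + ofs2, "new-key", 8);

    assert(ofs2 == ofs);
    assert(std::strcmp(popped, "new-key") == 0);  // 'popped' now sees the new bytes
    return 0;
}
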
- _keyLast = std::move(key);
- return Status::OK();
+/**
+ * Add a key. It must be greater than all existing keys. Be careful to set the next pointer
+ * correctly.
+ */
+template <class BtreeLayout>
+bool BtreeLogic<BtreeLayout>::pushBack(BucketType* bucket,
+ const DiskLoc recordLoc,
+ const KeyDataType& key,
+ const DiskLoc prevChild) {
+ int bytesNeeded = key.dataSize() + sizeof(KeyHeaderType);
+ if (bytesNeeded > bucket->emptySize) {
+ return false;
}
+ invariant(bytesNeeded <= bucket->emptySize);
- //
- // Private Builder logic
- //
-
- template <class BtreeLayout>
- DiskLoc BtreeLogic<BtreeLayout>::Builder::newBucket(BucketType* leftSib,
- DiskLoc leftSibLoc) {
- invariant(leftSib->n >= 2); // Guaranteed by sufficiently small KeyMax.
-
- if (leftSib->parent.isNull()) {
- // Making a new root
- invariant(leftSibLoc.toRecordId() == _logic->_headManager->getHead(_txn));
- const DiskLoc newRootLoc = _logic->_addBucket(_txn);
- leftSib->parent = newRootLoc;
- _logic->_headManager->setHead(_txn, newRootLoc.toRecordId());
-
- // Set the newRoot's nextChild to point to leftSib for the invariant below.
- BucketType* newRoot = _getBucket(newRootLoc);
- *_txn->recoveryUnit()->writing(&newRoot->nextChild) = leftSibLoc;
- }
-
- DiskLoc parentLoc = leftSib->parent;
- BucketType* parent = _getModifiableBucket(parentLoc);
-
- // For the pushBack below to be correct, leftSib must be the right-most child of parent.
- invariant(parent->nextChild == leftSibLoc);
-
- // Pull right-most key out of leftSib and move to parent, splitting parent if necessary.
- // Note that popBack() handles setting leftSib's nextChild to the former prevChildNode of
- // the popped key.
- KeyDataType key;
- DiskLoc val;
- _logic->popBack(leftSib, &val, &key);
- if (!_logic->pushBack(parent, val, key, leftSibLoc)) {
- // parent is full, so split it.
- parentLoc = newBucket(parent, parentLoc);
- parent = _getModifiableBucket(parentLoc);
- invariant(_logic->pushBack(parent, val, key, leftSibLoc));
- leftSib->parent = parentLoc;
+ if (bucket->n) {
+ const FullKey klast = getFullKey(bucket, bucket->n - 1);
+ if (klast.data.woCompare(key, _ordering) > 0) {
+ log() << "btree bucket corrupt? "
+ "consider reindexing or running validate command" << endl;
+ log() << " klast: " << klast.data.toString() << endl;
+ log() << " key: " << key.toString() << endl;
+ invariant(false);
}
-
- // Create a new bucket to the right of leftSib and set its parent pointer and the downward
- // nextChild pointer from the parent.
- DiskLoc newBucketLoc = _logic->_addBucket(_txn);
- BucketType* newBucket = _getBucket(newBucketLoc);
- *_txn->recoveryUnit()->writing(&newBucket->parent) = parentLoc;
- *_txn->recoveryUnit()->writing(&parent->nextChild) = newBucketLoc;
- return newBucketLoc;
}
- template <class BtreeLayout>
- typename BtreeLogic<BtreeLayout>::BucketType*
- BtreeLogic<BtreeLayout>::Builder::_getModifiableBucket(DiskLoc loc) {
- return _logic->btreemod(_txn, _logic->getBucket(_txn, loc));
- }
+ bucket->emptySize -= sizeof(KeyHeaderType);
+ KeyHeaderType& kn = getKeyHeader(bucket, bucket->n++);
+ kn.prevChildBucket = prevChild;
+ kn.recordLoc = recordLoc;
+ kn.setKeyDataOfs((short)_alloc(bucket, key.dataSize()));
+ short ofs = kn.keyDataOfs();
+ char* p = dataAt(bucket, ofs);
+ memcpy(p, key.data(), key.dataSize());
+ return true;
+}
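
A hedged usage sketch of the append contract: pushBack() only accepts keys that sort at or after everything already present, and a bulk loader appends until it returns false, which signals a split. toyPushBack below is a hypothetical model of just the capacity bookkeeping, not the real key copy:

#include <cassert>
#include <vector>

struct ToyBucket {
    int emptySize = 100;    // free bytes; made-up capacity
    std::vector<int> keys;  // stands in for headers + key data
};

// Mirrors pushBack()'s contract: refuse when the entry would not fit, and
// require keys to arrive in ascending order.
bool toyPushBack(ToyBucket& b, int key, int bytesNeeded) {
    if (bytesNeeded > b.emptySize)
        return false;
    assert(b.keys.empty() || b.keys.back() <= key);
    b.emptySize -= bytesNeeded;
    b.keys.push_back(key);
    return true;
}

int main() {
    ToyBucket b;
    int nextKey = 0;
    while (toyPushBack(b, nextKey, 30))  // pretend each entry costs 30 bytes
        ++nextKey;
    // Three entries fit in 100 bytes; the fourth would force a split.
    assert(b.keys.size() == 3 && b.emptySize == 10);
    return 0;
}
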
- template <class BtreeLayout>
- typename BtreeLogic<BtreeLayout>::BucketType*
- BtreeLogic<BtreeLayout>::Builder::_getBucket(DiskLoc loc) {
- return _logic->getBucket(_txn, loc);
- }
+/**
+ * Durability note:
+ *
+ * We do separate intent declarations herein. Arguably one could just declare the whole bucket
+ * given we do group commits. This is something we could investigate later as to what is
+ * faster.
+ **/
- //
- // BtreeLogic logic
- //
+/**
+ * Insert a key in a bucket with no complexity -- no splits required.
+ * Returns false if a split is required.
+ */
+template <class BtreeLayout>
+bool BtreeLogic<BtreeLayout>::basicInsert(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int& keypos,
+ const KeyDataType& key,
+ const DiskLoc recordLoc) {
+ invariant(bucket->n < 1024);
+ invariant(keypos >= 0 && keypos <= bucket->n);
- // static
- template <class BtreeLayout>
- typename BtreeLogic<BtreeLayout>::FullKey
- BtreeLogic<BtreeLayout>::getFullKey(const BucketType* bucket, int i) {
- if (i >= bucket->n) {
- int code = 13000;
- massert(code,
- (string)"invalid keyNode: " + BSON( "i" << i << "n" << bucket->n ).jsonString(),
- i < bucket->n );
+ int bytesNeeded = key.dataSize() + sizeof(KeyHeaderType);
+ if (bytesNeeded > bucket->emptySize) {
+ _pack(txn, bucket, bucketLoc, keypos);
+ if (bytesNeeded > bucket->emptySize) {
+ return false;
}
- return FullKey(bucket, i);
}
- // static
- template <class BtreeLayout>
- typename BtreeLogic<BtreeLayout>::KeyHeaderType&
- BtreeLogic<BtreeLayout>::getKeyHeader(BucketType* bucket, int i) {
- return ((KeyHeaderType*)bucket->data)[i];
- }
+ invariant(getBucket(txn, bucketLoc) == bucket);
- // static
- template <class BtreeLayout>
- const typename BtreeLogic<BtreeLayout>::KeyHeaderType&
- BtreeLogic<BtreeLayout>::getKeyHeader(const BucketType* bucket, int i) {
- return ((const KeyHeaderType*)bucket->data)[i];
- }
+ {
+ // declare that we will write to [k(keypos),k(n)]
+ char* start = reinterpret_cast<char*>(&getKeyHeader(bucket, keypos));
+ char* end = reinterpret_cast<char*>(&getKeyHeader(bucket, bucket->n + 1));
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::markUnused(BucketType* bucket, int keyPos) {
- invariant(keyPos >= 0 && keyPos < bucket->n);
- getKeyHeader(bucket, keyPos).setUnused();
+ txn->recoveryUnit()->writingPtr(start, end - start);
}
- template <class BtreeLayout>
- char* BtreeLogic<BtreeLayout>::dataAt(BucketType* bucket, short ofs) {
- return bucket->data + ofs;
+ // e.g. for n==3, keypos==2
+ // 1 4 9 -> 1 4 _ 9
+ for (int j = bucket->n; j > keypos; j--) {
+ getKeyHeader(bucket, j) = getKeyHeader(bucket, j - 1);
}
- template <class BtreeLayout>
- typename BtreeLogic<BtreeLayout>::BucketType*
- BtreeLogic<BtreeLayout>::btreemod(OperationContext* txn, BucketType* bucket) {
- txn->recoveryUnit()->writingPtr(bucket, BtreeLayout::BucketSize);
- return bucket;
- }
-
- template <class BtreeLayout>
- int BtreeLogic<BtreeLayout>::totalDataSize(BucketType* bucket) {
- return (int) (BtreeLayout::BucketSize - (bucket->data - (char*)bucket));
- }
-
- // We define this value as the maximum number of bytes such that, if we have
- // fewer than this many bytes, we must be able to either merge with or receive
- // keys from any neighboring node. If our utilization goes below this value we
- // know we can bring up the utilization with a simple operation. Ignoring the
- // 90/10 split policy which is sometimes employed and our 'unused' nodes, this
- // is a lower bound on bucket utilization for non root buckets.
- //
- // Note that the exact value here depends on the implementation of
- // _rebalancedSeparatorPos(). The conditions for lowWaterMark - 1 are as
- // follows: We know we cannot merge with the neighbor, so the total data size
- // for us, the neighbor, and the separator must be at least
- // BucketType::bodySize() + 1. We must be able to accept one key of any
- // allowed size, so our size plus storage for that additional key must be
- // <= BucketType::bodySize() / 2. This way, with the extra key we'll have a
- // new bucket data size < half the total data size and by the implementation
- // of _rebalancedSeparatorPos() the key must be added.
- template <class BtreeLayout>
- int BtreeLogic<BtreeLayout>::lowWaterMark() {
- return BtreeLayout::BucketBodySize / 2 - BtreeLayout::KeyMax - sizeof(KeyHeaderType) + 1;
- }
-
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::init(BucketType* bucket) {
- BtreeLayout::initBucket(bucket);
- bucket->parent.Null();
- bucket->nextChild.Null();
- bucket->flags = Packed;
- bucket->n = 0;
- bucket->emptySize = totalDataSize(bucket);
- bucket->topSize = 0;
- }
-
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::_unalloc(BucketType* bucket, int bytes) {
- bucket->topSize -= bytes;
- bucket->emptySize += bytes;
- }
+ size_t writeLen = sizeof(bucket->emptySize) + sizeof(bucket->topSize) + sizeof(bucket->n);
+ txn->recoveryUnit()->writingPtr(&bucket->emptySize, writeLen);
+ bucket->emptySize -= sizeof(KeyHeaderType);
+ bucket->n++;
- /**
- * We allocate space from the end of the buffer for data. The keynodes grow from the front.
- */
- template <class BtreeLayout>
- int BtreeLogic<BtreeLayout>::_alloc(BucketType* bucket, int bytes) {
- invariant(bucket->emptySize >= bytes);
- bucket->topSize += bytes;
- bucket->emptySize -= bytes;
- int ofs = totalDataSize(bucket) - bucket->topSize;
- invariant(ofs > 0);
- return ofs;
- }
+ // This _KeyNode was marked for writing above.
+ KeyHeaderType& kn = getKeyHeader(bucket, keypos);
+ kn.prevChildBucket.Null();
+ kn.recordLoc = recordLoc;
+ kn.setKeyDataOfs((short)_alloc(bucket, key.dataSize()));
+ char* p = dataAt(bucket, kn.keyDataOfs());
+ txn->recoveryUnit()->writingPtr(p, key.dataSize());
+ memcpy(p, key.data(), key.dataSize());
+ return true;
+}
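
The header-shifting step above ("1 4 9 -> 1 4 _ 9") can be seen in isolation in this toy snippet, where plain ints stand in for KeyHeaderType:

#include <cassert>

int main() {
    int hdr[8] = {1, 4, 9};  // toy headers; ints stand in for KeyHeaderType
    int n = 3, keypos = 2;

    for (int j = n; j > keypos; j--)  // shift right to open a hole at keypos
        hdr[j] = hdr[j - 1];
    hdr[keypos] = 7;                  // the new key lands in the hole
    n++;

    assert(n == 4);
    assert(hdr[0] == 1 && hdr[1] == 4 && hdr[2] == 7 && hdr[3] == 9);
    return 0;
}
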
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::setNotPacked(BucketType* bucket) {
- bucket->flags &= ~Packed;
- }
+/**
+ * With this implementation, refPos == 0 disregards effect of refPos. index > 0 prevents
+ * creation of an empty bucket.
+ */
+template <class BtreeLayout>
+bool BtreeLogic<BtreeLayout>::mayDropKey(BucketType* bucket, int index, int refPos) {
+ return index > 0 && (index != refPos) && getKeyHeader(bucket, index).isUnused() &&
+ getKeyHeader(bucket, index).prevChildBucket.isNull();
+}
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::setPacked(BucketType* bucket) {
- bucket->flags |= Packed;
+template <class BtreeLayout>
+int BtreeLogic<BtreeLayout>::_packedDataSize(BucketType* bucket, int refPos) {
+ if (bucket->flags & Packed) {
+ return BtreeLayout::BucketSize - bucket->emptySize - BucketType::HeaderSize;
}
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::_delKeyAtPos(BucketType* bucket, int keypos, bool mayEmpty) {
- invariant(keypos >= 0 && keypos <= bucket->n);
- invariant(childLocForPos(bucket, keypos).isNull());
- invariant((mayEmpty && bucket->n > 0) || bucket->n > 1 || bucket->nextChild.isNull());
-
- bucket->emptySize += sizeof(KeyHeaderType);
- bucket->n--;
-
- for (int j = keypos; j < bucket->n; j++) {
- getKeyHeader(bucket, j) = getKeyHeader(bucket, j + 1);
+ int size = 0;
+ for (int j = 0; j < bucket->n; ++j) {
+ if (mayDropKey(bucket, j, refPos)) {
+ continue;
}
-
- setNotPacked(bucket);
- }
-
- /**
- * Pull rightmost key from the bucket and set its prevChild pointer to be the nextChild for the
- * whole bucket. It is assumed that caller already has the old value of the nextChild
- * pointer and is about to add a pointer to it elsewhere in the tree.
- *
- * This is only used by BtreeLogic::Builder. Think very hard (and change this comment) before
- * using it anywhere else.
- *
- * WARNING: The keyDataOut that is filled out by this function points to newly unalloced memory
- * inside of this bucket. It only remains valid until the next write to this bucket.
- */
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::popBack(BucketType* bucket,
- DiskLoc* recordLocOut,
- KeyDataType* keyDataOut) {
-
- massert(17435, "n==0 in btree popBack()", bucket->n > 0 );
-
- invariant(getKeyHeader(bucket, bucket->n - 1).isUsed());
-
- FullKey kn = getFullKey(bucket, bucket->n - 1);
- *recordLocOut = kn.recordLoc;
- keyDataOut->assign(kn.data);
- int keysize = kn.data.dataSize();
-
- // The left/prev child of the node we are popping now goes in to the nextChild slot as all
- // of its keys are greater than all remaining keys in this node.
- bucket->nextChild = kn.prevChildBucket;
- bucket->n--;
-
- // This is risky because the keyDataOut we filled out above will now point to this newly
- // unalloced memory.
- bucket->emptySize += sizeof(KeyHeaderType);
- _unalloc(bucket, keysize);
- }
-
- /**
- * Add a key. Must be > all existing. Be careful to set next ptr right.
- */
- template <class BtreeLayout>
- bool BtreeLogic<BtreeLayout>::pushBack(BucketType* bucket,
- const DiskLoc recordLoc,
- const KeyDataType& key,
- const DiskLoc prevChild) {
-
- int bytesNeeded = key.dataSize() + sizeof(KeyHeaderType);
- if (bytesNeeded > bucket->emptySize) {
- return false;
- }
- invariant(bytesNeeded <= bucket->emptySize);
-
- if (bucket->n) {
- const FullKey klast = getFullKey(bucket, bucket->n - 1);
- if (klast.data.woCompare(key, _ordering) > 0) {
- log() << "btree bucket corrupt? "
- "consider reindexing or running validate command" << endl;
- log() << " klast: " << klast.data.toString() << endl;
- log() << " key: " << key.toString() << endl;
- invariant(false);
- }
- }
-
- bucket->emptySize -= sizeof(KeyHeaderType);
- KeyHeaderType& kn = getKeyHeader(bucket, bucket->n++);
- kn.prevChildBucket = prevChild;
- kn.recordLoc = recordLoc;
- kn.setKeyDataOfs((short)_alloc(bucket, key.dataSize()));
- short ofs = kn.keyDataOfs();
- char *p = dataAt(bucket, ofs);
- memcpy(p, key.data(), key.dataSize());
- return true;
+ size += getFullKey(bucket, j).data.dataSize() + sizeof(KeyHeaderType);
}
- /**
- * Durability note:
- *
- * We do separate intent declarations herein. Arguably one could just declare the whole bucket
- * given we do group commits. This is something we could investigate later as to what is
- * faster.
- **/
-
- /**
- * Insert a key in a bucket with no complexity -- no splits required
- * Returns false if a split is required.
- */
- template <class BtreeLayout>
- bool BtreeLogic<BtreeLayout>::basicInsert(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int& keypos,
- const KeyDataType& key,
- const DiskLoc recordLoc) {
- invariant(bucket->n < 1024);
- invariant(keypos >= 0 && keypos <= bucket->n);
+ return size;
+}
- int bytesNeeded = key.dataSize() + sizeof(KeyHeaderType);
- if (bytesNeeded > bucket->emptySize) {
- _pack(txn, bucket, bucketLoc, keypos);
- if (bytesNeeded > bucket->emptySize) {
- return false;
- }
- }
-
- invariant(getBucket(txn, bucketLoc) == bucket);
-
- {
- // declare that we will write to [k(keypos),k(n)]
- char* start = reinterpret_cast<char*>(&getKeyHeader(bucket, keypos));
- char* end = reinterpret_cast<char*>(&getKeyHeader(bucket, bucket->n + 1));
-
- // Declare that we will write to [k(keypos),k(n)]
- txn->recoveryUnit()->writingPtr(start, end - start);
- }
-
- // e.g. for n==3, keypos==2
- // 1 4 9 -> 1 4 _ 9
- for (int j = bucket->n; j > keypos; j--) {
- getKeyHeader(bucket, j) = getKeyHeader(bucket, j - 1);
- }
-
- size_t writeLen = sizeof(bucket->emptySize) + sizeof(bucket->topSize) + sizeof(bucket->n);
- txn->recoveryUnit()->writingPtr(&bucket->emptySize, writeLen);
- bucket->emptySize -= sizeof(KeyHeaderType);
- bucket->n++;
-
- // This _KeyNode was marked for writing above.
- KeyHeaderType& kn = getKeyHeader(bucket, keypos);
- kn.prevChildBucket.Null();
- kn.recordLoc = recordLoc;
- kn.setKeyDataOfs((short) _alloc(bucket, key.dataSize()));
- char *p = dataAt(bucket, kn.keyDataOfs());
- txn->recoveryUnit()->writingPtr(p, key.dataSize());
- memcpy(p, key.data(), key.dataSize());
- return true;
- }
-
- /**
- * With this implementation, refPos == 0 disregards effect of refPos. index > 0 prevents
- * creation of an empty bucket.
- */
- template <class BtreeLayout>
- bool BtreeLogic<BtreeLayout>::mayDropKey(BucketType* bucket, int index, int refPos) {
- return index > 0
- && (index != refPos)
- && getKeyHeader(bucket, index).isUnused()
- && getKeyHeader(bucket, index).prevChildBucket.isNull();
- }
-
- template <class BtreeLayout>
- int BtreeLogic<BtreeLayout>::_packedDataSize(BucketType* bucket, int refPos) {
- if (bucket->flags & Packed) {
- return BtreeLayout::BucketSize - bucket->emptySize - BucketType::HeaderSize;
- }
-
- int size = 0;
- for (int j = 0; j < bucket->n; ++j) {
- if (mayDropKey(bucket, j, refPos)) {
- continue;
- }
- size += getFullKey(bucket, j).data.dataSize() + sizeof(KeyHeaderType);
- }
+/**
+ * When we delete things, we just leave empty space until the node is full and then we repack
+ * it.
+ */
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::_pack(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc thisLoc,
+ int& refPos) {
+ invariant(getBucket(txn, thisLoc) == bucket);
- return size;
+ if (bucket->flags & Packed) {
+ return;
}
- /**
- * When we delete things, we just leave empty space until the node is full and then we repack
- * it.
- */
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::_pack(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc thisLoc,
- int &refPos) {
-
- invariant(getBucket(txn, thisLoc) == bucket);
+ _packReadyForMod(btreemod(txn, bucket), refPos);
+}
- if (bucket->flags & Packed) {
- return;
- }
-
- _packReadyForMod(btreemod(txn, bucket), refPos);
+/**
+ * Version when write intent already declared.
+ */
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::_packReadyForMod(BucketType* bucket, int& refPos) {
+ if (bucket->flags & Packed) {
+ return;
}
- /**
- * Version when write intent already declared.
- */
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::_packReadyForMod(BucketType* bucket, int &refPos) {
- if (bucket->flags & Packed) {
- return;
- }
-
- int tdz = totalDataSize(bucket);
- char temp[BtreeLayout::BucketSize];
- int ofs = tdz;
- bucket->topSize = 0;
-
- int i = 0;
- for (int j = 0; j < bucket->n; j++) {
- if (mayDropKey(bucket, j, refPos)) {
- // key is unused and has no children - drop it
- continue;
- }
-
- if (i != j) {
- if (refPos == j) {
- // i < j so j will never be refPos again
- refPos = i;
- }
- getKeyHeader(bucket, i) = getKeyHeader(bucket, j);
- }
+ int tdz = totalDataSize(bucket);
+ char temp[BtreeLayout::BucketSize];
+ int ofs = tdz;
+ bucket->topSize = 0;
- short ofsold = getKeyHeader(bucket, i).keyDataOfs();
- int sz = getFullKey(bucket, i).data.dataSize();
- ofs -= sz;
- bucket->topSize += sz;
- memcpy(temp + ofs, dataAt(bucket, ofsold), sz);
- getKeyHeader(bucket, i).setKeyDataOfsSavingUse(ofs);
- ++i;
+ int i = 0;
+ for (int j = 0; j < bucket->n; j++) {
+ if (mayDropKey(bucket, j, refPos)) {
+ // key is unused and has no children - drop it
+ continue;
}
- if (refPos == bucket->n) {
- refPos = i;
- }
-
- bucket->n = i;
- int dataUsed = tdz - ofs;
- memcpy(bucket->data + ofs, temp + ofs, dataUsed);
-
- bucket->emptySize = tdz - dataUsed - bucket->n * sizeof(KeyHeaderType);
- int foo = bucket->emptySize;
- invariant( foo >= 0 );
- setPacked(bucket);
- assertValid(_indexName, bucket, _ordering);
- }
-
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::truncateTo(BucketType* bucket,
- int N,
- int &refPos) {
- bucket->n = N;
- setNotPacked(bucket);
- _packReadyForMod(bucket, refPos);
- }
-
- /**
- * In the standard btree algorithm, we would split based on the
- * existing keys _and_ the new key. But that's more work to
- * implement, so we split the existing keys and then add the new key.
- *
- * There are several published heuristic algorithms for doing splits, but basically what you
- * want are (1) even balancing between the two sides and (2) a small split key so the parent can
- * have a larger branching factor.
- *
- * We just have a simple algorithm right now: if a key includes the halfway point (or 10% way
- * point) in terms of bytes, split on that key; otherwise split on the key immediately to the
- * left of the halfway point (or 10% point).
- *
- * This function is expected to be called on a packed bucket.
- */
- template <class BtreeLayout>
- int BtreeLogic<BtreeLayout>::splitPos(BucketType* bucket, int keypos) {
- invariant(bucket->n > 2);
- int split = 0;
- int rightSize = 0;
-
- // When splitting a btree node, if the new key is greater than all the other keys, we should
- // not do an even split, but a 90/10 split. see SERVER-983. TODO I think we only want to
- // do the 90% split on the rhs node of the tree.
- int rightSizeLimit = (bucket->topSize + sizeof(KeyHeaderType) * bucket->n)
- / (keypos == bucket->n ? 10 : 2);
-
- for (int i = bucket->n - 1; i > -1; --i) {
- rightSize += getFullKey(bucket, i).data.dataSize() + sizeof(KeyHeaderType);
- if (rightSize > rightSizeLimit) {
- split = i;
- break;
+ if (i != j) {
+ if (refPos == j) {
+ // i < j so j will never be refPos again
+ refPos = i;
}
+ getKeyHeader(bucket, i) = getKeyHeader(bucket, j);
}
- // safeguards - we must not create an empty bucket
- if (split < 1) {
- split = 1;
- }
- else if (split > bucket->n - 2) {
- split = bucket->n - 2;
- }
-
- return split;
+ short ofsold = getKeyHeader(bucket, i).keyDataOfs();
+ int sz = getFullKey(bucket, i).data.dataSize();
+ ofs -= sz;
+ bucket->topSize += sz;
+ memcpy(temp + ofs, dataAt(bucket, ofsold), sz);
+ getKeyHeader(bucket, i).setKeyDataOfsSavingUse(ofs);
+ ++i;
}
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::reserveKeysFront(BucketType* bucket, int nAdd) {
- invariant(bucket->emptySize >= int(sizeof(KeyHeaderType) * nAdd));
- bucket->emptySize -= sizeof(KeyHeaderType) * nAdd;
- for (int i = bucket->n - 1; i > -1; --i) {
- getKeyHeader(bucket, i + nAdd) = getKeyHeader(bucket, i);
- }
- bucket->n += nAdd;
- }
-
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::setKey(BucketType* bucket,
- int i,
- const DiskLoc recordLoc,
- const KeyDataType& key,
- const DiskLoc prevChildBucket) {
- KeyHeaderType &kn = getKeyHeader(bucket, i);
- kn.recordLoc = recordLoc;
- kn.prevChildBucket = prevChildBucket;
- short ofs = (short) _alloc(bucket, key.dataSize());
- kn.setKeyDataOfs(ofs);
- char *p = dataAt(bucket, ofs);
- memcpy(p, key.data(), key.dataSize());
- }
-
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::dropFront(BucketType* bucket,
- int nDrop,
- int &refpos) {
- for (int i = nDrop; i < bucket->n; ++i) {
- getKeyHeader(bucket, i - nDrop) = getKeyHeader(bucket, i);
- }
- bucket->n -= nDrop;
- setNotPacked(bucket);
- _packReadyForMod(bucket, refpos );
+ if (refPos == bucket->n) {
+ refPos = i;
}
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::customLocate(OperationContext* txn,
- DiskLoc* locInOut,
- int* keyOfsInOut,
- const IndexSeekPoint& seekPoint,
- int direction) const {
- pair<DiskLoc, int> unused;
+ bucket->n = i;
+ int dataUsed = tdz - ofs;
+ memcpy(bucket->data + ofs, temp + ofs, dataUsed);
- customLocate(txn, locInOut, keyOfsInOut, seekPoint, direction, unused);
- skipUnusedKeys(txn, locInOut, keyOfsInOut, direction);
- }
+ bucket->emptySize = tdz - dataUsed - bucket->n * sizeof(KeyHeaderType);
+ invariant(bucket->emptySize >= 0);
+ setPacked(bucket);
+ assertValid(_indexName, bucket, _ordering);
+}
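
A self-contained model of the compaction idea in _packReadyForMod(): surviving keys are re-copied tightly against the end of the buffer via a temp buffer, dropped keys are simply not copied, and the packed tail goes back in one memcpy. All names and sizes below are illustrative; in the real code the source offsets come from the key headers rather than a vector:

#include <cassert>
#include <cstring>
#include <string>
#include <vector>

int main() {
    const int kBodySize = 64;  // made-up bucket body size
    char data[kBodySize];
    char temp[kBodySize];

    // Three keys conceptually in the bucket; the middle one is unused/droppable.
    std::vector<std::string> keys = {"aa", "dropme", "zz"};
    std::vector<bool> drop = {false, true, false};

    int ofs = kBodySize;  // pack tightly against the end, as _packReadyForMod() does
    int topSize = 0;
    for (size_t j = 0; j < keys.size(); ++j) {
        if (drop[j])
            continue;  // the mayDropKey() analogue: just don't copy it
        int sz = (int)keys[j].size();
        ofs -= sz;
        topSize += sz;
        std::memcpy(temp + ofs, keys[j].data(), sz);
    }
    std::memcpy(data + ofs, temp + ofs, kBodySize - ofs);  // copy the packed tail back

    assert(topSize == 4);  // only "aa" and "zz" survived
    assert(std::memcmp(data + kBodySize - 4, "zzaa", 4) == 0);
    return 0;
}
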
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::advance(OperationContext* txn,
- DiskLoc* bucketLocInOut,
- int* posInOut,
- int direction) const {
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::truncateTo(BucketType* bucket, int N, int& refPos) {
+ bucket->n = N;
+ setNotPacked(bucket);
+ _packReadyForMod(bucket, refPos);
+}
- *bucketLocInOut = advance(txn, *bucketLocInOut, posInOut, direction);
- skipUnusedKeys(txn, bucketLocInOut, posInOut, direction);
+/**
+ * In the standard btree algorithm, we would split based on the
+ * existing keys _and_ the new key. But that's more work to
+ * implement, so we split the existing keys and then add the new key.
+ *
+ * There are several published heuristic algorithms for doing splits, but basically what you
+ * want are (1) even balancing between the two sides and (2) a small split key so the parent can
+ * have a larger branching factor.
+ *
+ * We just have a simple algorithm right now: if a key includes the halfway point (or 10% way
+ * point) in terms of bytes, split on that key; otherwise split on the key immediately to the
+ * left of the halfway point (or 10% point).
+ *
+ * This function is expected to be called on a packed bucket.
+ */
+template <class BtreeLayout>
+int BtreeLogic<BtreeLayout>::splitPos(BucketType* bucket, int keypos) {
+ invariant(bucket->n > 2);
+ int split = 0;
+ int rightSize = 0;
+
+ // When splitting a btree node, if the new key is greater than all the other keys, we should
+ // not do an even split, but a 90/10 split. see SERVER-983. TODO I think we only want to
+ // do the 90% split on the rhs node of the tree.
+ int rightSizeLimit =
+ (bucket->topSize + sizeof(KeyHeaderType) * bucket->n) / (keypos == bucket->n ? 10 : 2);
+
+ for (int i = bucket->n - 1; i > -1; --i) {
+ rightSize += getFullKey(bucket, i).data.dataSize() + sizeof(KeyHeaderType);
+ if (rightSize > rightSizeLimit) {
+ split = i;
+ break;
+ }
+ }
+
+ // safeguards - we must not create an empty bucket
+ if (split < 1) {
+ split = 1;
+ } else if (split > bucket->n - 2) {
+ split = bucket->n - 2;
+ }
+
+ return split;
+}
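
To make the threshold arithmetic concrete, this toy reproduction of the splitPos() loop (uniform, made-up key sizes; 'splitFor' is a local helper, not real API) shows the rightmost-insert case splitting near the right edge and the middle-insert case splitting near the midpoint:

#include <cassert>

int main() {
    const int kHeaderSize = 16;  // stands in for sizeof(KeyHeaderType)
    const int kKeySize = 64;     // pretend every key's data is 64 bytes
    const int n = 100;
    const int topSize = n * kKeySize;

    // Reproduces splitPos()'s limit computation and right-to-left scan.
    auto splitFor = [&](int keypos) {
        int rightSizeLimit = (topSize + kHeaderSize * n) / (keypos == n ? 10 : 2);
        int rightSize = 0, split = 0;
        for (int i = n - 1; i > -1; --i) {
            rightSize += kKeySize + kHeaderSize;
            if (rightSize > rightSizeLimit) {
                split = i;
                break;
            }
        }
        if (split < 1)
            split = 1;
        else if (split > n - 2)
            split = n - 2;
        return split;
    };

    assert(splitFor(n) == 89);      // rightmost insert: ~90/10 split (SERVER-983)
    assert(splitFor(n / 2) == 49);  // middle insert: ~50/50 split
    return 0;
}
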
+
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::reserveKeysFront(BucketType* bucket, int nAdd) {
+ invariant(bucket->emptySize >= int(sizeof(KeyHeaderType) * nAdd));
+ bucket->emptySize -= sizeof(KeyHeaderType) * nAdd;
+ for (int i = bucket->n - 1; i > -1; --i) {
+ getKeyHeader(bucket, i + nAdd) = getKeyHeader(bucket, i);
+ }
+ bucket->n += nAdd;
+}
+
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::setKey(BucketType* bucket,
+ int i,
+ const DiskLoc recordLoc,
+ const KeyDataType& key,
+ const DiskLoc prevChildBucket) {
+ KeyHeaderType& kn = getKeyHeader(bucket, i);
+ kn.recordLoc = recordLoc;
+ kn.prevChildBucket = prevChildBucket;
+ short ofs = (short)_alloc(bucket, key.dataSize());
+ kn.setKeyDataOfs(ofs);
+ char* p = dataAt(bucket, ofs);
+ memcpy(p, key.data(), key.dataSize());
+}
+
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::dropFront(BucketType* bucket, int nDrop, int& refpos) {
+ for (int i = nDrop; i < bucket->n; ++i) {
+ getKeyHeader(bucket, i - nDrop) = getKeyHeader(bucket, i);
+ }
+ bucket->n -= nDrop;
+ setNotPacked(bucket);
+ _packReadyForMod(bucket, refpos);
+}
+
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::customLocate(OperationContext* txn,
+ DiskLoc* locInOut,
+ int* keyOfsInOut,
+ const IndexSeekPoint& seekPoint,
+ int direction) const {
+ pair<DiskLoc, int> unused;
+
+ customLocate(txn, locInOut, keyOfsInOut, seekPoint, direction, unused);
+ skipUnusedKeys(txn, locInOut, keyOfsInOut, direction);
+}
+
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::advance(OperationContext* txn,
+ DiskLoc* bucketLocInOut,
+ int* posInOut,
+ int direction) const {
+ *bucketLocInOut = advance(txn, *bucketLocInOut, posInOut, direction);
+ skipUnusedKeys(txn, bucketLocInOut, posInOut, direction);
+}
+
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::skipUnusedKeys(OperationContext* txn,
+ DiskLoc* loc,
+ int* pos,
+ int direction) const {
+ while (!loc->isNull() && !keyIsUsed(txn, *loc, *pos)) {
+ *loc = advance(txn, *loc, pos, direction);
}
+}
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::skipUnusedKeys(OperationContext* txn,
- DiskLoc* loc,
- int* pos,
- int direction) const {
- while (!loc->isNull() && !keyIsUsed(txn, *loc, *pos)) {
- *loc = advance(txn, *loc, pos, direction);
- }
- }
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::advanceTo(OperationContext* txn,
+ DiskLoc* thisLocInOut,
+ int* keyOfsInOut,
+ const IndexSeekPoint& seekPoint,
+ int direction) const {
+ advanceToImpl(txn, thisLocInOut, keyOfsInOut, seekPoint, direction);
+ skipUnusedKeys(txn, thisLocInOut, keyOfsInOut, direction);
+}
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::advanceTo(OperationContext* txn,
+/**
+ * Find the smallest/biggest value greater-equal/less-equal than the specified seek point.
+ *
+ * The starting thisLoc + keyOfs entry will be strictly less than/strictly greater than
+ * the seek point.
+ *
+ * All the direction checks below allowed me to refactor the code, but possibly separate
+ * forward and reverse implementations would be more efficient.
+ */
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::advanceToImpl(OperationContext* txn,
DiskLoc* thisLocInOut,
int* keyOfsInOut,
const IndexSeekPoint& seekPoint,
int direction) const {
+ BucketType* bucket = getBucket(txn, *thisLocInOut);
- advanceToImpl(txn, thisLocInOut, keyOfsInOut, seekPoint, direction);
- skipUnusedKeys(txn, thisLocInOut, keyOfsInOut, direction);
- }
-
- /**
- * find smallest/biggest value greater-equal/less-equal than specified
- *
- * starting thisLoc + keyOfs will be strictly less than/strictly greater than
- * keyBegin/keyBeginLen/keyEnd
- *
- * All the direction checks below allowed me to refactor the code, but possibly separate forward
- * and reverse implementations would be more efficient
- */
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::advanceToImpl(OperationContext* txn,
- DiskLoc* thisLocInOut,
- int* keyOfsInOut,
- const IndexSeekPoint& seekPoint,
- int direction) const {
-
- BucketType* bucket = getBucket(txn, *thisLocInOut);
-
- int l, h;
- bool dontGoUp;
-
- if (direction > 0) {
- l = *keyOfsInOut;
- h = bucket->n - 1;
- int cmpResult = customBSONCmp(getFullKey(bucket, h).data.toBson(),
- seekPoint,
- direction);
- dontGoUp = (cmpResult >= 0);
- }
- else {
- l = 0;
- h = *keyOfsInOut;
- int cmpResult = customBSONCmp(getFullKey(bucket, l).data.toBson(),
- seekPoint,
- direction);
- dontGoUp = (cmpResult <= 0);
- }
+ int l, h;
+ bool dontGoUp;
- pair<DiskLoc, int> bestParent;
-
- if (dontGoUp) {
- // this comparison result assures h > l
- if (!customFind(txn,
- l,
- h,
- seekPoint,
- direction,
- thisLocInOut,
- keyOfsInOut,
- bestParent)) {
- return;
- }
+ if (direction > 0) {
+ l = *keyOfsInOut;
+ h = bucket->n - 1;
+ int cmpResult = customBSONCmp(getFullKey(bucket, h).data.toBson(), seekPoint, direction);
+ dontGoUp = (cmpResult >= 0);
+ } else {
+ l = 0;
+ h = *keyOfsInOut;
+ int cmpResult = customBSONCmp(getFullKey(bucket, l).data.toBson(), seekPoint, direction);
+ dontGoUp = (cmpResult <= 0);
+ }
+
+ pair<DiskLoc, int> bestParent;
+
+ if (dontGoUp) {
+ // this comparison result assures h > l
+ if (!customFind(txn, l, h, seekPoint, direction, thisLocInOut, keyOfsInOut, bestParent)) {
+ return;
}
- else {
- // go up parents until rightmost/leftmost node is >=/<= target or at top
- while (!bucket->parent.isNull()) {
- *thisLocInOut = bucket->parent;
- bucket = getBucket(txn,
- *thisLocInOut);
-
- if (direction > 0) {
- if (customBSONCmp(getFullKey(bucket, bucket->n - 1).data.toBson(),
- seekPoint,
- direction) >= 0 ) {
- break;
- }
+ } else {
+ // go up parents until rightmost/leftmost node is >=/<= target or at top
+ while (!bucket->parent.isNull()) {
+ *thisLocInOut = bucket->parent;
+ bucket = getBucket(txn, *thisLocInOut);
+
+ if (direction > 0) {
+ if (customBSONCmp(getFullKey(bucket, bucket->n - 1).data.toBson(),
+ seekPoint,
+ direction) >= 0) {
+ break;
}
- else {
- if (customBSONCmp(getFullKey(bucket, 0).data.toBson(),
- seekPoint,
- direction) <= 0) {
- break;
- }
+ } else {
+ if (customBSONCmp(getFullKey(bucket, 0).data.toBson(), seekPoint, direction) <= 0) {
+ break;
}
}
}
-
- customLocate(txn, thisLocInOut, keyOfsInOut, seekPoint, direction, bestParent);
}
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::customLocate(OperationContext* txn,
- DiskLoc* locInOut,
- int* keyOfsInOut,
- const IndexSeekPoint& seekPoint,
- int direction,
- pair<DiskLoc, int>& bestParent) const {
+ customLocate(txn, thisLocInOut, keyOfsInOut, seekPoint, direction, bestParent);
+}
- BucketType* bucket = getBucket(txn, *locInOut);
-
- if (0 == bucket->n) {
- *locInOut = DiskLoc();
- return;
- }
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::customLocate(OperationContext* txn,
+ DiskLoc* locInOut,
+ int* keyOfsInOut,
+ const IndexSeekPoint& seekPoint,
+ int direction,
+ pair<DiskLoc, int>& bestParent) const {
+ BucketType* bucket = getBucket(txn, *locInOut);
- // go down until find smallest/biggest >=/<= target
- for (;;) {
- int l = 0;
- int h = bucket->n - 1;
+ if (0 == bucket->n) {
+ *locInOut = DiskLoc();
+ return;
+ }
- // +direction: 0, -direction: h
- int z = (direction > 0) ? 0 : h;
+ // go down until find smallest/biggest >=/<= target
+ for (;;) {
+ int l = 0;
+ int h = bucket->n - 1;
- // leftmost/rightmost key may possibly be >=/<= search key
- int res = customBSONCmp(getFullKey(bucket, z).data.toBson(), seekPoint, direction);
- if (direction * res >= 0) {
- DiskLoc next;
- *keyOfsInOut = z;
+ // +direction: 0, -direction: h
+ int z = (direction > 0) ? 0 : h;
- if (direction > 0) {
- dassert(z == 0);
- next = getKeyHeader(bucket, 0).prevChildBucket;
- }
- else {
- next = bucket->nextChild;
- }
+ // leftmost/rightmost key may possibly be >=/<= search key
+ int res = customBSONCmp(getFullKey(bucket, z).data.toBson(), seekPoint, direction);
+ if (direction * res >= 0) {
+ DiskLoc next;
+ *keyOfsInOut = z;
- if (!next.isNull()) {
- bestParent = pair<DiskLoc, int>(*locInOut, *keyOfsInOut);
- *locInOut = next;
- bucket = getBucket(txn, *locInOut);
- continue;
- }
- else {
- return;
- }
+ if (direction > 0) {
+ dassert(z == 0);
+ next = getKeyHeader(bucket, 0).prevChildBucket;
+ } else {
+ next = bucket->nextChild;
}
- res = customBSONCmp(getFullKey(bucket, h - z).data.toBson(), seekPoint, direction);
- if (direction * res < 0) {
- DiskLoc next;
- if (direction > 0) {
- next = bucket->nextChild;
- }
- else {
- next = getKeyHeader(bucket, 0).prevChildBucket;
- }
+ if (!next.isNull()) {
+ bestParent = pair<DiskLoc, int>(*locInOut, *keyOfsInOut);
+ *locInOut = next;
+ bucket = getBucket(txn, *locInOut);
+ continue;
+ } else {
+ return;
+ }
+ }
- if (next.isNull()) {
- // if bestParent is null, we've hit the end and locInOut gets set to DiskLoc()
- *locInOut = bestParent.first;
- *keyOfsInOut = bestParent.second;
- return;
- }
- else {
- *locInOut = next;
- bucket = getBucket(txn, *locInOut);
- continue;
- }
+ res = customBSONCmp(getFullKey(bucket, h - z).data.toBson(), seekPoint, direction);
+ if (direction * res < 0) {
+ DiskLoc next;
+ if (direction > 0) {
+ next = bucket->nextChild;
+ } else {
+ next = getKeyHeader(bucket, 0).prevChildBucket;
}
- if (!customFind(txn,
- l,
- h,
- seekPoint,
- direction,
- locInOut,
- keyOfsInOut,
- bestParent)) {
+ if (next.isNull()) {
+ // if bestParent is null, we've hit the end and locInOut gets set to DiskLoc()
+ *locInOut = bestParent.first;
+ *keyOfsInOut = bestParent.second;
return;
+ } else {
+ *locInOut = next;
+ bucket = getBucket(txn, *locInOut);
+ continue;
}
-
- bucket = getBucket(txn, *locInOut);
}
- }
-
- template <class BtreeLayout>
- bool BtreeLogic<BtreeLayout>::customFind(OperationContext* txn,
- int low,
- int high,
- const IndexSeekPoint& seekPoint,
- int direction,
- DiskLoc* thisLocInOut,
- int* keyOfsInOut,
- pair<DiskLoc, int>& bestParent) const {
- const BucketType* bucket = getBucket(txn, *thisLocInOut);
+ if (!customFind(txn, l, h, seekPoint, direction, locInOut, keyOfsInOut, bestParent)) {
+ return;
+ }
- for (;;) {
- if (low + 1 == high) {
- *keyOfsInOut = (direction > 0) ? high : low;
- DiskLoc next = getKeyHeader(bucket, high).prevChildBucket;
- if (!next.isNull()) {
- bestParent = make_pair(*thisLocInOut, *keyOfsInOut);
- *thisLocInOut = next;
- return true;
- }
- else {
- return false;
- }
+ bucket = getBucket(txn, *locInOut);
+ }
+}
+
+template <class BtreeLayout>
+bool BtreeLogic<BtreeLayout>::customFind(OperationContext* txn,
+ int low,
+ int high,
+ const IndexSeekPoint& seekPoint,
+ int direction,
+ DiskLoc* thisLocInOut,
+ int* keyOfsInOut,
+ pair<DiskLoc, int>& bestParent) const {
+ const BucketType* bucket = getBucket(txn, *thisLocInOut);
+
+ for (;;) {
+ if (low + 1 == high) {
+ *keyOfsInOut = (direction > 0) ? high : low;
+ DiskLoc next = getKeyHeader(bucket, high).prevChildBucket;
+ if (!next.isNull()) {
+ bestParent = make_pair(*thisLocInOut, *keyOfsInOut);
+ *thisLocInOut = next;
+ return true;
+ } else {
+ return false;
}
+ }
- int middle = low + (high - low) / 2;
+ int middle = low + (high - low) / 2;
- int cmp = customBSONCmp(getFullKey(bucket, middle).data.toBson(), seekPoint, direction);
- if (cmp < 0) {
+ int cmp = customBSONCmp(getFullKey(bucket, middle).data.toBson(), seekPoint, direction);
+ if (cmp < 0) {
+ low = middle;
+ } else if (cmp > 0) {
+ high = middle;
+ } else {
+ if (direction < 0) {
low = middle;
- }
- else if (cmp > 0) {
+ } else {
high = middle;
}
- else {
- if (direction < 0) {
- low = middle;
- }
- else {
- high = middle;
- }
- }
}
}
+}
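
customFind() differs from an exact-match search in that it narrows to a child edge rather than a key: for a forward scan, low stays strictly below the target and high at-or-above it, and the loop stops when the two are adjacent. A model on plain ints ('findDescendSlot' is a hypothetical name):

#include <cassert>
#include <vector>

// Returns the first slot whose key is >= target, assuming
// keys[low] < target <= keys[high] on entry (the caller's precondition).
int findDescendSlot(const std::vector<int>& keys, int target) {
    int low = 0, high = (int)keys.size() - 1;
    while (low + 1 != high) {
        int middle = low + (high - low) / 2;
        if (keys[middle] < target)
            low = middle;   // target is still to the right
        else
            high = middle;  // target is at or to the left
    }
    return high;
}

int main() {
    std::vector<int> keys = {2, 4, 6, 8, 10};
    assert(findDescendSlot(keys, 7) == 3);  // descend the edge before key 8
    assert(findDescendSlot(keys, 4) == 1);  // an equal key lands on its own slot
    return 0;
}
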
- /**
- * NOTE: Currently the Ordering implementation assumes a compound index will not have more keys
- * than an unsigned variable has bits. The same assumption is used in the implementation below
- * with respect to the 'mask' variable.
- *
- * 'l' is a regular bsonobj
- *
- * 'rBegin' is composed partly of an existing bsonobj, and the remaining keys are taken from a
- * vector of elements that frequently changes
- *
- * see https://jira.mongodb.org/browse/SERVER-371
- */
- // static
- template <class BtreeLayout>
- int BtreeLogic<BtreeLayout>::customBSONCmp(const BSONObj& left,
- const IndexSeekPoint& right,
- int direction) const {
- // XXX: make this readable
- dassert(right.keySuffix.size() == right.suffixInclusive.size());
-
- BSONObjIterator ll( left );
- BSONObjIterator rr( right.keyPrefix );
- unsigned mask = 1;
- size_t i = 0;
- for( ; i < size_t(right.prefixLen); ++i, mask <<= 1 ) {
- BSONElement lll = ll.next();
- BSONElement rrr = rr.next();
-
- int x = lll.woCompare( rrr, false );
- if ( _ordering.descending( mask ) )
- x = -x;
- if ( x != 0 )
- return x;
- }
- if (right.prefixExclusive) {
+/**
+ * NOTE: Currently the Ordering implementation assumes a compound index will not have more keys
+ * than an unsigned variable has bits. The same assumption is used in the implementation below
+ * with respect to the 'mask' variable.
+ *
+ * 'left' is a regular bsonobj
+ *
+ * 'right' is composed partly of an existing bsonobj, and the remaining keys are taken from a
+ * vector of elements that frequently changes.
+ *
+ * see https://jira.mongodb.org/browse/SERVER-371
+ */
+// static
+template <class BtreeLayout>
+int BtreeLogic<BtreeLayout>::customBSONCmp(const BSONObj& left,
+ const IndexSeekPoint& right,
+ int direction) const {
+ // XXX: make this readable
+ dassert(right.keySuffix.size() == right.suffixInclusive.size());
+
+ BSONObjIterator ll(left);
+ BSONObjIterator rr(right.keyPrefix);
+ unsigned mask = 1;
+ size_t i = 0;
+ for (; i < size_t(right.prefixLen); ++i, mask <<= 1) {
+ BSONElement lll = ll.next();
+ BSONElement rrr = rr.next();
+
+ int x = lll.woCompare(rrr, false);
+ if (_ordering.descending(mask))
+ x = -x;
+ if (x != 0)
+ return x;
+ }
+ if (right.prefixExclusive) {
+ return -direction;
+ }
+ for (; i < right.keySuffix.size(); ++i, mask <<= 1) {
+ if (!ll.more())
+ return -direction;
+
+ BSONElement lll = ll.next();
+ BSONElement rrr = *right.keySuffix[i];
+ int x = lll.woCompare(rrr, false);
+ if (_ordering.descending(mask))
+ x = -x;
+ if (x != 0)
+ return x;
+ if (!right.suffixInclusive[i]) {
return -direction;
}
- for( ; i < right.keySuffix.size(); ++i, mask <<= 1 ) {
- if (!ll.more())
- return -direction;
-
- BSONElement lll = ll.next();
- BSONElement rrr = *right.keySuffix[i];
- int x = lll.woCompare( rrr, false );
- if ( _ordering.descending( mask ) )
- x = -x;
- if ( x != 0 )
- return x;
- if ( !right.suffixInclusive[i] ) {
- return -direction;
- }
- }
- return ll.more() ? direction : 0;
}
+ return ll.more() ? direction : 0;
+}
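
The inclusive/exclusive suffix handling is the subtle part of customBSONCmp(). This plain-int model (no BSON; 'cmpSeek' and all its parameters are illustrative) reproduces the contract that equality on an exclusive bound returns -direction so the scan seeks past it:

#include <cassert>
#include <vector>

// Compare 'left' against a seek point made of a fixed prefix plus per-field
// suffix bounds; a bit set in descendingMask flips that field's order.
int cmpSeek(const std::vector<int>& left,
            const std::vector<int>& prefix,
            const std::vector<int>& suffix,
            const std::vector<bool>& suffixInclusive,
            unsigned descendingMask,
            int direction) {
    unsigned mask = 1;
    size_t i = 0;
    for (; i < prefix.size(); ++i, mask <<= 1) {
        int x = (left[i] < prefix[i]) ? -1 : (left[i] > prefix[i]) ? 1 : 0;
        if (descendingMask & mask)
            x = -x;
        if (x != 0)
            return x;
    }
    for (size_t k = 0; k < suffix.size(); ++k, ++i, mask <<= 1) {
        int x = (left[i] < suffix[k]) ? -1 : (left[i] > suffix[k]) ? 1 : 0;
        if (descendingMask & mask)
            x = -x;
        if (x != 0)
            return x;
        if (!suffixInclusive[k])
            return -direction;  // equal on an exclusive bound: seek past it
    }
    return 0;
}

int main() {
    // Seek point {a: 5, b: 3} scanning forward (direction = 1):
    // with b exclusive, the key {5, 3} compares "less", so the scan moves on...
    assert(cmpSeek({5, 3}, {5}, {3}, {false}, 0, 1) == -1);
    // ...while with b inclusive, the same key is an exact match.
    assert(cmpSeek({5, 3}, {5}, {3}, {true}, 0, 1) == 0);
    return 0;
}
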
- template <class BtreeLayout>
- bool BtreeLogic<BtreeLayout>::exists(OperationContext* txn, const KeyDataType& key) const {
- int position = 0;
+template <class BtreeLayout>
+bool BtreeLogic<BtreeLayout>::exists(OperationContext* txn, const KeyDataType& key) const {
+ int position = 0;
- // Find the DiskLoc
- bool found;
+ // Find the DiskLoc
+ bool found;
- DiskLoc bucket = _locate(txn, getRootLoc(txn), key, &position, &found, DiskLoc::min(), 1);
+ DiskLoc bucket = _locate(txn, getRootLoc(txn), key, &position, &found, DiskLoc::min(), 1);
- while (!bucket.isNull()) {
- FullKey fullKey = getFullKey(getBucket(txn, bucket), position);
- if (fullKey.header.isUsed()) {
- return fullKey.data.woEqual(key);
- }
- bucket = advance(txn, bucket, &position, 1);
+ while (!bucket.isNull()) {
+ FullKey fullKey = getFullKey(getBucket(txn, bucket), position);
+ if (fullKey.header.isUsed()) {
+ return fullKey.data.woEqual(key);
}
-
- return false;
+ bucket = advance(txn, bucket, &position, 1);
}
- template <class BtreeLayout>
- Status BtreeLogic<BtreeLayout>::dupKeyCheck(OperationContext* txn,
- const BSONObj& key,
- const DiskLoc& loc) const {
- KeyDataOwnedType theKey(key);
- if (!wouldCreateDup(txn, theKey, loc)) {
- return Status::OK();
- }
+ return false;
+}
- return Status(ErrorCodes::DuplicateKey, dupKeyError(theKey));
+template <class BtreeLayout>
+Status BtreeLogic<BtreeLayout>::dupKeyCheck(OperationContext* txn,
+ const BSONObj& key,
+ const DiskLoc& loc) const {
+ KeyDataOwnedType theKey(key);
+ if (!wouldCreateDup(txn, theKey, loc)) {
+ return Status::OK();
}
- template <class BtreeLayout>
- bool BtreeLogic<BtreeLayout>::wouldCreateDup(OperationContext* txn,
- const KeyDataType& key,
- const DiskLoc self) const {
- int position;
- bool found;
-
- DiskLoc posLoc = _locate(txn, getRootLoc(txn), key, &position, &found, DiskLoc::min(), 1);
+ return Status(ErrorCodes::DuplicateKey, dupKeyError(theKey));
+}
- while (!posLoc.isNull()) {
- FullKey fullKey = getFullKey(getBucket(txn, posLoc), position);
- if (fullKey.header.isUsed()) {
- // TODO: we may not need fullKey.data until we know fullKey.header.isUsed() here
- // and elsewhere.
- if (fullKey.data.woEqual(key)) {
- return fullKey.recordLoc != self;
- }
- break;
+template <class BtreeLayout>
+bool BtreeLogic<BtreeLayout>::wouldCreateDup(OperationContext* txn,
+ const KeyDataType& key,
+ const DiskLoc self) const {
+ int position;
+ bool found;
+
+ DiskLoc posLoc = _locate(txn, getRootLoc(txn), key, &position, &found, DiskLoc::min(), 1);
+
+ while (!posLoc.isNull()) {
+ FullKey fullKey = getFullKey(getBucket(txn, posLoc), position);
+ if (fullKey.header.isUsed()) {
+ // TODO: we may not need fullKey.data until we know fullKey.header.isUsed() here
+ // and elsewhere.
+ if (fullKey.data.woEqual(key)) {
+ return fullKey.recordLoc != self;
}
-
- posLoc = advance(txn, posLoc, &position, 1);
+ break;
}
- return false;
+
+ posLoc = advance(txn, posLoc, &position, 1);
}
+ return false;
+}
- template <class BtreeLayout>
- string BtreeLogic<BtreeLayout>::dupKeyError(const KeyDataType& key) const {
- stringstream ss;
- ss << "E11000 duplicate key error ";
- ss << "index: " << _indexName << " ";
- ss << "dup key: " << key.toString();
- return ss.str();
- }
-
- /**
- * Find a key within this btree bucket.
- *
- * When duplicate keys are allowed, we use the DiskLoc of the record as if it were part of the
- * key. That assures that even when there are many duplicates (e.g., 1 million) for a key, our
- * performance is still good.
- *
- * assertIfDup: if the key exists (ignoring the recordLoc), uassert
- *
- * pos: for existing keys k0...kn-1.
- * returns # it goes BEFORE. so key[pos-1] < key < key[pos]
- * returns n if it goes after the last existing key.
- * note result might be an Unused location!
- */
- template <class BtreeLayout>
- Status BtreeLogic<BtreeLayout>::_find(OperationContext* txn,
- BucketType* bucket,
- const KeyDataType& key,
- const DiskLoc& recordLoc,
- bool errorIfDup,
- int* keyPositionOut,
- bool* foundOut) const {
-
- // XXX: fix the ctor for DiskLoc56bit so we can just convert w/o assignment operator
- LocType genericRecordLoc;
- genericRecordLoc = recordLoc;
-
- bool dupsCheckedYet = false;
-
- int low = 0;
- int high = bucket->n - 1;
- int middle = (low + high) / 2;
-
- while (low <= high) {
- FullKey fullKey = getFullKey(bucket, middle);
- int cmp = key.woCompare(fullKey.data, _ordering);
-
- // The key data is the same.
- if (0 == cmp) {
- // Found the key in this bucket. If we're checking for dups...
- if (errorIfDup) {
- if (fullKey.header.isUnused()) {
- // It's ok that the key is there if it is unused. We need to check that
- // there aren't other entries for the key then. as it is very rare that
- // we get here, we don't put any coding effort in here to make this
- // particularly fast
- if (!dupsCheckedYet) {
- // This is expensive and we only want to do it once(? -- when would
- // it happen twice).
- dupsCheckedYet = true;
- if (exists(txn, key)) {
- if (wouldCreateDup(txn, key, genericRecordLoc)) {
- return Status(ErrorCodes::DuplicateKey, dupKeyError(key), 11000);
- }
- else {
- return Status(ErrorCodes::DuplicateKeyValue,
- "key/value already in index");
- }
+template <class BtreeLayout>
+string BtreeLogic<BtreeLayout>::dupKeyError(const KeyDataType& key) const {
+ stringstream ss;
+ ss << "E11000 duplicate key error ";
+ ss << "index: " << _indexName << " ";
+ ss << "dup key: " << key.toString();
+ return ss.str();
+}
+
+/**
+ * Find a key within this btree bucket.
+ *
+ * When duplicate keys are allowed, we use the DiskLoc of the record as if it were part of the
+ * key. That assures that even when there are many duplicates (e.g., 1 million) for a key, our
+ * performance is still good.
+ *
+ * errorIfDup: if the key exists (ignoring the recordLoc), return a DuplicateKey Status.
+ *
+ * keyPositionOut: for existing keys k0...kn-1, set to the index the new key goes BEFORE,
+ * so key[pos-1] < key < key[pos]; set to n if it goes after the last existing key.
+ * Note the result might be an Unused location!
+ */
+template <class BtreeLayout>
+Status BtreeLogic<BtreeLayout>::_find(OperationContext* txn,
+ BucketType* bucket,
+ const KeyDataType& key,
+ const DiskLoc& recordLoc,
+ bool errorIfDup,
+ int* keyPositionOut,
+ bool* foundOut) const {
+ // XXX: fix the ctor for DiskLoc56bit so we can just convert w/o assignment operator
+ LocType genericRecordLoc;
+ genericRecordLoc = recordLoc;
+
+ bool dupsCheckedYet = false;
+
+ int low = 0;
+ int high = bucket->n - 1;
+ int middle = (low + high) / 2;
+
+ while (low <= high) {
+ FullKey fullKey = getFullKey(bucket, middle);
+ int cmp = key.woCompare(fullKey.data, _ordering);
+
+ // The key data is the same.
+ if (0 == cmp) {
+ // Found the key in this bucket. If we're checking for dups...
+ if (errorIfDup) {
+ if (fullKey.header.isUnused()) {
+ // It's ok that the key is there if it is unused. We then need to check
+ // that there aren't other entries for the same key. As it is very rare
+ // that we get here, we don't put any coding effort into making this
+ // particularly fast.
+ if (!dupsCheckedYet) {
+ // This is expensive and we only want to do it once (it is unclear
+ // whether it can ever happen twice).
+ dupsCheckedYet = true;
+ if (exists(txn, key)) {
+ if (wouldCreateDup(txn, key, genericRecordLoc)) {
+ return Status(ErrorCodes::DuplicateKey, dupKeyError(key), 11000);
+ } else {
+ return Status(ErrorCodes::DuplicateKeyValue,
+ "key/value already in index");
}
}
}
- else {
- if (fullKey.recordLoc == recordLoc) {
- return Status(ErrorCodes::DuplicateKeyValue,
- "key/value already in index");
- }
- else {
- return Status(ErrorCodes::DuplicateKey, dupKeyError(key), 11000);
- }
+ } else {
+ if (fullKey.recordLoc == recordLoc) {
+ return Status(ErrorCodes::DuplicateKeyValue, "key/value already in index");
+ } else {
+ return Status(ErrorCodes::DuplicateKey, dupKeyError(key), 11000);
}
}
+ }
- // If we're here dup keys are allowed, or the key is a dup but unused.
- LocType recordLocCopy = fullKey.recordLoc;
-
- // We clear this bit so we can test equality without the used bit messing us up.
- // XXX: document this
- // XXX: kill this GETOFS stuff
- recordLocCopy.GETOFS() &= ~1;
+ // If we're here dup keys are allowed, or the key is a dup but unused.
+ LocType recordLocCopy = fullKey.recordLoc;
- // Set 'cmp' to the comparison w/the DiskLoc and fall through below.
- cmp = recordLoc.compare(recordLocCopy);
- }
+ // We clear this bit so we can test equality without the used bit messing us up.
+ // XXX: document this
+ // XXX: kill this GETOFS stuff
+ recordLocCopy.GETOFS() &= ~1;
- if (cmp < 0) {
- high = middle - 1;
- }
- else if (cmp > 0) {
- low = middle + 1;
- }
- else {
- // Found it!
- *keyPositionOut = middle;
- *foundOut = true;
- return Status::OK();
- }
+ // Set 'cmp' to the comparison w/the DiskLoc and fall through below.
+ cmp = recordLoc.compare(recordLocCopy);
+ }
- middle = (low + high) / 2;
+ if (cmp < 0) {
+ high = middle - 1;
+ } else if (cmp > 0) {
+ low = middle + 1;
+ } else {
+ // Found it!
+ *keyPositionOut = middle;
+ *foundOut = true;
+ return Status::OK();
}
- // Not found.
- *keyPositionOut = low;
+ middle = (low + high) / 2;
+ }
+
+ // Not found.
+ *keyPositionOut = low;
- // Some debugging checks.
- if (low != bucket->n) {
- wassert(key.woCompare(getFullKey(bucket, low).data, _ordering) <= 0);
+ // Some debugging checks.
+ if (low != bucket->n) {
+ wassert(key.woCompare(getFullKey(bucket, low).data, _ordering) <= 0);
- if (low > 0) {
- if (getFullKey(bucket, low - 1).data.woCompare(key, _ordering) > 0) {
- DEV {
- log() << key.toString() << endl;
- log() << getFullKey(bucket, low - 1).data.toString() << endl;
- }
- wassert(false);
+ if (low > 0) {
+ if (getFullKey(bucket, low - 1).data.woCompare(key, _ordering) > 0) {
+ DEV {
+ log() << key.toString() << endl;
+ log() << getFullKey(bucket, low - 1).data.toString() << endl;
}
+ wassert(false);
}
}
-
- *foundOut = false;
- return Status::OK();
}
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::delBucket(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc) {
- invariant(bucketLoc != getRootLoc(txn));
-
- _cursorRegistry->invalidateCursorsForBucket(bucketLoc);
-
- BucketType* p = getBucket(txn, bucket->parent);
- int parentIdx = indexInParent(txn, bucket, bucketLoc);
- *txn->recoveryUnit()->writing(&childLocForPos(p, parentIdx)) = DiskLoc();
- deallocBucket(txn, bucket, bucketLoc);
- }
+ *foundOut = false;
+ return Status::OK();
+}
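
A compact model of the binary search in _find(): when key bytes tie, the record location acts as a secondary sort key, which is what keeps lookups O(log n) even with millions of duplicates. 'findInBucket' and its int keys/locations are illustrative stand-ins for the real KeyDataType/DiskLoc:

#include <cassert>
#include <utility>
#include <vector>

// Returns {position, found}; 'position' is the slot the probe goes BEFORE.
std::pair<int, bool> findInBucket(const std::vector<std::pair<int, int>>& keys,
                                  int key,
                                  int recordLoc) {
    int low = 0, high = (int)keys.size() - 1;
    while (low <= high) {
        int middle = (low + high) / 2;
        int cmp = (key < keys[middle].first) ? -1 : (key > keys[middle].first) ? 1 : 0;
        if (cmp == 0) {
            // Same key bytes: the record location breaks the tie, keeping
            // duplicates totally ordered inside the bucket.
            cmp = (recordLoc < keys[middle].second)
                ? -1
                : (recordLoc > keys[middle].second) ? 1 : 0;
        }
        if (cmp < 0)
            high = middle - 1;
        else if (cmp > 0)
            low = middle + 1;
        else
            return {middle, true};
    }
    return {low, false};
}

int main() {
    // Three duplicates of key 7, distinguished only by their record locations.
    std::vector<std::pair<int, int>> bucket = {{5, 0}, {7, 10}, {7, 20}, {7, 30}, {9, 0}};
    assert(findInBucket(bucket, 7, 20) == std::make_pair(2, true));
    assert(findInBucket(bucket, 7, 25) == std::make_pair(3, false));  // goes before {7, 30}
    return 0;
}
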
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::deallocBucket(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc) {
- bucket->n = BtreeLayout::INVALID_N_SENTINEL;
- bucket->parent.Null();
- _recordStore->deleteRecord(txn, bucketLoc.toRecordId());
- }
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::delBucket(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc) {
+ invariant(bucketLoc != getRootLoc(txn));
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::restorePosition(OperationContext* txn,
- const BSONObj& savedKey,
- const DiskLoc& savedLoc,
- int direction,
- DiskLoc* bucketLocInOut,
- int* keyOffsetInOut) const {
+ _cursorRegistry->invalidateCursorsForBucket(bucketLoc);
- // The caller has to ensure validity of the saved cursor using the SavedCursorRegistry
- BucketType* bucket = getBucket(txn, *bucketLocInOut);
- invariant(bucket);
- invariant(BtreeLayout::INVALID_N_SENTINEL != bucket->n);
+ BucketType* p = getBucket(txn, bucket->parent);
+ int parentIdx = indexInParent(txn, bucket, bucketLoc);
+ *txn->recoveryUnit()->writing(&childLocForPos(p, parentIdx)) = DiskLoc();
+ deallocBucket(txn, bucket, bucketLoc);
+}
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::deallocBucket(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc) {
+ bucket->n = BtreeLayout::INVALID_N_SENTINEL;
+ bucket->parent.Null();
+ _recordStore->deleteRecord(txn, bucketLoc.toRecordId());
+}
+
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::restorePosition(OperationContext* txn,
+ const BSONObj& savedKey,
+ const DiskLoc& savedLoc,
+ int direction,
+ DiskLoc* bucketLocInOut,
+ int* keyOffsetInOut) const {
+ // The caller has to ensure validity of the saved cursor using the SavedCursorRegistry
+ BucketType* bucket = getBucket(txn, *bucketLocInOut);
+ invariant(bucket);
+ invariant(BtreeLayout::INVALID_N_SENTINEL != bucket->n);
+
+ if (_keyIsAt(savedKey, savedLoc, bucket, *keyOffsetInOut)) {
+ skipUnusedKeys(txn, bucketLocInOut, keyOffsetInOut, direction);
+ return;
+ }
+
+ if (*keyOffsetInOut > 0) {
+ (*keyOffsetInOut)--;
if (_keyIsAt(savedKey, savedLoc, bucket, *keyOffsetInOut)) {
skipUnusedKeys(txn, bucketLocInOut, keyOffsetInOut, direction);
return;
}
+ }
- if (*keyOffsetInOut > 0) {
- (*keyOffsetInOut)--;
- if (_keyIsAt(savedKey, savedLoc, bucket, *keyOffsetInOut)) {
- skipUnusedKeys(txn, bucketLocInOut, keyOffsetInOut, direction);
- return;
- }
- }
+ locate(txn, savedKey, savedLoc, direction, keyOffsetInOut, bucketLocInOut);
+}
- locate(txn, savedKey, savedLoc, direction, keyOffsetInOut, bucketLocInOut);
+template <class BtreeLayout>
+bool BtreeLogic<BtreeLayout>::_keyIsAt(const BSONObj& savedKey,
+ const DiskLoc& savedLoc,
+ BucketType* bucket,
+ int keyPos) const {
+ if (keyPos >= bucket->n) {
+ return false;
}
- template <class BtreeLayout>
- bool BtreeLogic<BtreeLayout>::_keyIsAt(const BSONObj& savedKey,
- const DiskLoc& savedLoc,
- BucketType* bucket,
- int keyPos) const {
- if (keyPos >= bucket->n) {
- return false;
- }
-
- FullKey key = getFullKey(bucket, keyPos);
- if (!key.data.toBson().binaryEqual(savedKey)) {
- return false;
- }
- return key.header.recordLoc == savedLoc;
+ FullKey key = getFullKey(bucket, keyPos);
+ if (!key.data.toBson().binaryEqual(savedKey)) {
+ return false;
}
+ return key.header.recordLoc == savedLoc;
+}
- /**
- * May delete the bucket 'bucket' rendering 'bucketLoc' invalid.
- */
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::delKeyAtPos(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int p) {
- invariant(bucket->n > 0);
- DiskLoc left = childLocForPos(bucket, p);
- if (bucket->n == 1) {
- if (left.isNull() && bucket->nextChild.isNull()) {
- _delKeyAtPos(bucket, p);
- if (isHead(bucket)) {
- // we don't delete the top bucket ever
- }
- else {
- if (!mayBalanceWithNeighbors(txn, bucket, bucketLoc)) {
- // An empty bucket is only allowed as a txnient state. If
- // there are no neighbors to balance with, we delete ourself.
- // This condition is only expected in legacy btrees.
- delBucket(txn, bucket, bucketLoc);
- }
+/**
+ * May delete the bucket 'bucket' rendering 'bucketLoc' invalid.
+ */
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::delKeyAtPos(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int p) {
+ invariant(bucket->n > 0);
+ DiskLoc left = childLocForPos(bucket, p);
+ if (bucket->n == 1) {
+ if (left.isNull() && bucket->nextChild.isNull()) {
+ _delKeyAtPos(bucket, p);
+ if (isHead(bucket)) {
+ // we don't delete the top bucket ever
+ } else {
+ if (!mayBalanceWithNeighbors(txn, bucket, bucketLoc)) {
+ // An empty bucket is only allowed as a transient state. If
+ // there are no neighbors to balance with, we delete ourselves.
+ // This condition is only expected in legacy btrees.
+ delBucket(txn, bucket, bucketLoc);
}
- return;
}
- deleteInternalKey(txn, bucket, bucketLoc, p);
return;
}
-
- if (left.isNull()) {
- _delKeyAtPos(bucket, p);
- mayBalanceWithNeighbors(txn, bucket, bucketLoc);
- }
- else {
- deleteInternalKey(txn, bucket, bucketLoc, p);
- }
+ deleteInternalKey(txn, bucket, bucketLoc, p);
+ return;
}
- /**
- * This function replaces the specified key (k) by either the prev or next key in the btree
- * (k'). We require that k have either a left or right child. If k has a left child, we set k'
- * to the prev key of k, which must be a leaf present in the left child. If k does not have a
- * left child, we set k' to the next key of k, which must be a leaf present in the right child.
- * When we replace k with k', we copy k' over k (which may cause a split) and then remove k'
- * from its original location. Because k' is stored in a descendent of k, replacing k by k'
- * will not modify the storage location of the original k', and we can easily remove k' from its
- * original location.
- *
- * This function is only needed in cases where k has a left or right child; in other cases a
- * simpler key removal implementation is possible.
- *
- * NOTE on noncompliant BtreeBuilder btrees: It is possible (though likely rare) for btrees
- * created by BtreeBuilder to have k' that is not a leaf, see SERVER-2732. These cases are
- * handled in the same manner as described in the "legacy btree structures" note below.
- *
- * NOTE on legacy btree structures: In legacy btrees, k' can be a nonleaf. In such a case we
- * 'delete' k by marking it as an unused node rather than replacing it with k'. Also, k' may be
- * a leaf but marked as an unused node. In such a case we replace k by k', preserving the key's
- * unused marking. This function is only expected to mark a key as unused when handling a
- * legacy btree.
- */
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::deleteInternalKey(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int keypos) {
- DiskLoc lchild = childLocForPos(bucket, keypos);
- DiskLoc rchild = childLocForPos(bucket, keypos + 1);
- invariant(!lchild.isNull() || !rchild.isNull());
- int advanceDirection = lchild.isNull() ? 1 : -1;
- int advanceKeyOfs = keypos;
- DiskLoc advanceLoc = advance(txn, bucketLoc, &advanceKeyOfs, advanceDirection);
- // advanceLoc must be a descentant of thisLoc, because thisLoc has a
- // child in the proper direction and all descendants of thisLoc must be
- // nonempty because they are not the root.
- BucketType* advanceBucket = getBucket(txn, advanceLoc);
-
- if (!childLocForPos(advanceBucket, advanceKeyOfs).isNull()
- || !childLocForPos(advanceBucket, advanceKeyOfs + 1).isNull()) {
-
- markUnused(bucket, keypos);
- return;
- }
-
- FullKey kn = getFullKey(advanceBucket, advanceKeyOfs);
- // Because advanceLoc is a descendant of thisLoc, updating thisLoc will
- // not affect packing or keys of advanceLoc and kn will be stable
- // during the following setInternalKey()
- setInternalKey(txn, bucket, bucketLoc, keypos, kn.recordLoc, kn.data,
- childLocForPos(bucket, keypos),
- childLocForPos(bucket, keypos + 1));
- delKeyAtPos(txn, btreemod(txn, advanceBucket), advanceLoc, advanceKeyOfs);
+ if (left.isNull()) {
+ _delKeyAtPos(bucket, p);
+ mayBalanceWithNeighbors(txn, bucket, bucketLoc);
+ } else {
+ deleteInternalKey(txn, bucket, bucketLoc, p);
}
+}
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::replaceWithNextChild(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc) {
-
- invariant(bucket->n == 0 && !bucket->nextChild.isNull() );
- if (bucket->parent.isNull()) {
- invariant(getRootLoc(txn) == bucketLoc);
- _headManager->setHead(txn, bucket->nextChild.toRecordId());
- }
- else {
- BucketType* parentBucket = getBucket(txn, bucket->parent);
- int bucketIndexInParent = indexInParent(txn, bucket, bucketLoc);
- *txn->recoveryUnit()->writing(&childLocForPos(parentBucket, bucketIndexInParent)) =
- bucket->nextChild;
- }
-
- *txn->recoveryUnit()->writing(&getBucket(txn, bucket->nextChild)->parent) = bucket->parent;
- _cursorRegistry->invalidateCursorsForBucket(bucketLoc);
- deallocBucket(txn, bucket, bucketLoc);
- }
-
- template <class BtreeLayout>
- bool BtreeLogic<BtreeLayout>::canMergeChildren(OperationContext* txn,
+/**
+ * This function replaces the specified key (k) by either the prev or next key in the btree
+ * (k'). We require that k have either a left or right child. If k has a left child, we set k'
+ * to the prev key of k, which must be a leaf present in the left child. If k does not have a
+ * left child, we set k' to the next key of k, which must be a leaf present in the right child.
+ * When we replace k with k', we copy k' over k (which may cause a split) and then remove k'
+ * from its original location. Because k' is stored in a descendant of k, replacing k by k'
+ * will not modify the storage location of the original k', and we can easily remove k' from its
+ * original location.
+ *
+ * This function is only needed in cases where k has a left or right child; in other cases a
+ * simpler key removal implementation is possible.
+ *
+ * NOTE on noncompliant BtreeBuilder btrees: It is possible (though likely rare) for btrees
+ * created by BtreeBuilder to have k' that is not a leaf, see SERVER-2732. These cases are
+ * handled in the same manner as described in the "legacy btree structures" note below.
+ *
+ * NOTE on legacy btree structures: In legacy btrees, k' can be a nonleaf. In such a case we
+ * 'delete' k by marking it as an unused node rather than replacing it with k'. Also, k' may be
+ * a leaf but marked as an unused node. In such a case we replace k by k', preserving the key's
+ * unused marking. This function is only expected to mark a key as unused when handling a
+ * legacy btree.
+ */
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::deleteInternalKey(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int keypos) {
+ DiskLoc lchild = childLocForPos(bucket, keypos);
+ DiskLoc rchild = childLocForPos(bucket, keypos + 1);
+ invariant(!lchild.isNull() || !rchild.isNull());
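+    // Advance toward the in-order predecessor when a left child exists, otherwise the successor.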
+ int advanceDirection = lchild.isNull() ? 1 : -1;
+ int advanceKeyOfs = keypos;
+ DiskLoc advanceLoc = advance(txn, bucketLoc, &advanceKeyOfs, advanceDirection);
+    // advanceLoc must be a descendant of bucketLoc, because bucketLoc has a
+    // child in the proper direction and all descendants of bucketLoc must be
+    // nonempty because they are not the root.
+ BucketType* advanceBucket = getBucket(txn, advanceLoc);
+
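+    // If k' itself has children here, it is not a leaf (the legacy/BtreeBuilder case noted
+    // above), so mark k unused instead of replacing it.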
+ if (!childLocForPos(advanceBucket, advanceKeyOfs).isNull() ||
+ !childLocForPos(advanceBucket, advanceKeyOfs + 1).isNull()) {
+ markUnused(bucket, keypos);
+ return;
+ }
+
+ FullKey kn = getFullKey(advanceBucket, advanceKeyOfs);
+    // Because advanceLoc is a descendant of bucketLoc, updating bucketLoc will
+    // not affect packing or keys of advanceLoc, and kn will be stable
+    // during the following setInternalKey()
+ setInternalKey(txn,
+ bucket,
+ bucketLoc,
+ keypos,
+ kn.recordLoc,
+ kn.data,
+ childLocForPos(bucket, keypos),
+ childLocForPos(bucket, keypos + 1));
+ delKeyAtPos(txn, btreemod(txn, advanceBucket), advanceLoc, advanceKeyOfs);
+}
+
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::replaceWithNextChild(OperationContext* txn,
BucketType* bucket,
- const DiskLoc bucketLoc,
- const int leftIndex) {
- invariant(leftIndex >= 0 && leftIndex < bucket->n);
+ const DiskLoc bucketLoc) {
+ invariant(bucket->n == 0 && !bucket->nextChild.isNull());
+ if (bucket->parent.isNull()) {
+ invariant(getRootLoc(txn) == bucketLoc);
+ _headManager->setHead(txn, bucket->nextChild.toRecordId());
+ } else {
+ BucketType* parentBucket = getBucket(txn, bucket->parent);
+ int bucketIndexInParent = indexInParent(txn, bucket, bucketLoc);
+ *txn->recoveryUnit()->writing(&childLocForPos(parentBucket, bucketIndexInParent)) =
+ bucket->nextChild;
+ }
+
+ *txn->recoveryUnit()->writing(&getBucket(txn, bucket->nextChild)->parent) = bucket->parent;
+ _cursorRegistry->invalidateCursorsForBucket(bucketLoc);
+ deallocBucket(txn, bucket, bucketLoc);
+}
+
+template <class BtreeLayout>
+bool BtreeLogic<BtreeLayout>::canMergeChildren(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ const int leftIndex) {
+ invariant(leftIndex >= 0 && leftIndex < bucket->n);
- DiskLoc leftNodeLoc = childLocForPos(bucket, leftIndex);
- DiskLoc rightNodeLoc = childLocForPos(bucket, leftIndex + 1);
+ DiskLoc leftNodeLoc = childLocForPos(bucket, leftIndex);
+ DiskLoc rightNodeLoc = childLocForPos(bucket, leftIndex + 1);
- if (leftNodeLoc.isNull() || rightNodeLoc.isNull()) {
- return false;
- }
+ if (leftNodeLoc.isNull() || rightNodeLoc.isNull()) {
+ return false;
+ }
- int pos = 0;
+ int pos = 0;
- BucketType* leftBucket = getBucket(txn, leftNodeLoc);
- BucketType* rightBucket = getBucket(txn, rightNodeLoc);
+ BucketType* leftBucket = getBucket(txn, leftNodeLoc);
+ BucketType* rightBucket = getBucket(txn, rightNodeLoc);
- int sum = BucketType::HeaderSize
- + _packedDataSize(leftBucket, pos)
- + _packedDataSize(rightBucket, pos)
- + getFullKey(bucket, leftIndex).data.dataSize()
- + sizeof(KeyHeaderType);
+ int sum = BucketType::HeaderSize + _packedDataSize(leftBucket, pos) +
+ _packedDataSize(rightBucket, pos) + getFullKey(bucket, leftIndex).data.dataSize() +
+ sizeof(KeyHeaderType);
- return sum <= BtreeLayout::BucketSize;
- }
+ return sum <= BtreeLayout::BucketSize;
+}
- /**
- * This implementation must respect the meaning and value of lowWaterMark. Also see comments in
- * splitPos().
- */
- template <class BtreeLayout>
- int BtreeLogic<BtreeLayout>::_rebalancedSeparatorPos(OperationContext* txn,
- BucketType* bucket,
- int leftIndex) {
- int split = -1;
- int rightSize = 0;
+/**
+ * This implementation must respect the meaning and value of lowWaterMark. Also see comments in
+ * splitPos().
+ */
+template <class BtreeLayout>
+int BtreeLogic<BtreeLayout>::_rebalancedSeparatorPos(OperationContext* txn,
+ BucketType* bucket,
+ int leftIndex) {
+ int split = -1;
+ int rightSize = 0;
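+    // 'split' indexes the virtual key sequence: l's keys occupy 0..l->n-1, the separator
+    // sits at l->n, and r's key i sits at l->n + 1 + i.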
- const BucketType* l = childForPos(txn, bucket, leftIndex);
- const BucketType* r = childForPos(txn, bucket, leftIndex + 1);
+ const BucketType* l = childForPos(txn, bucket, leftIndex);
+ const BucketType* r = childForPos(txn, bucket, leftIndex + 1);
- int KNS = sizeof(KeyHeaderType);
- int rightSizeLimit = ( l->topSize
- + l->n * KNS
- + getFullKey(bucket, leftIndex).data.dataSize()
- + KNS
- + r->topSize
- + r->n * KNS ) / 2;
+ int KNS = sizeof(KeyHeaderType);
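+    // Aim to leave the right side with roughly half of the combined key bytes of both
+    // children plus the separator.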
+ int rightSizeLimit = (l->topSize + l->n * KNS + getFullKey(bucket, leftIndex).data.dataSize() +
+ KNS + r->topSize + r->n * KNS) /
+ 2;
- // This constraint should be ensured by only calling this function
- // if we go below the low water mark.
- invariant(rightSizeLimit < BtreeLayout::BucketBodySize);
+ // This constraint should be ensured by only calling this function
+ // if we go below the low water mark.
+ invariant(rightSizeLimit < BtreeLayout::BucketBodySize);
- for (int i = r->n - 1; i > -1; --i) {
- rightSize += getFullKey(r, i).data.dataSize() + KNS;
- if (rightSize > rightSizeLimit) {
- split = l->n + 1 + i;
- break;
- }
+ for (int i = r->n - 1; i > -1; --i) {
+ rightSize += getFullKey(r, i).data.dataSize() + KNS;
+ if (rightSize > rightSizeLimit) {
+ split = l->n + 1 + i;
+ break;
}
+ }
- if (split == -1) {
- rightSize += getFullKey(bucket, leftIndex).data.dataSize() + KNS;
- if (rightSize > rightSizeLimit) {
- split = l->n;
- }
+ if (split == -1) {
+ rightSize += getFullKey(bucket, leftIndex).data.dataSize() + KNS;
+ if (rightSize > rightSizeLimit) {
+ split = l->n;
}
+ }
- if (split == -1) {
- for (int i = l->n - 1; i > -1; --i) {
- rightSize += getFullKey(l, i).data.dataSize() + KNS;
- if (rightSize > rightSizeLimit) {
- split = i;
- break;
- }
+ if (split == -1) {
+ for (int i = l->n - 1; i > -1; --i) {
+ rightSize += getFullKey(l, i).data.dataSize() + KNS;
+ if (rightSize > rightSizeLimit) {
+ split = i;
+ break;
}
}
+ }
- // safeguards - we must not create an empty bucket
- if (split < 1) {
- split = 1;
- }
- else if (split > l->n + 1 + r->n - 2) {
- split = l->n + 1 + r->n - 2;
- }
-
- return split;
+ // safeguards - we must not create an empty bucket
+ if (split < 1) {
+ split = 1;
+ } else if (split > l->n + 1 + r->n - 2) {
+ split = l->n + 1 + r->n - 2;
}
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::doMergeChildren(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int leftIndex) {
+ return split;
+}
- DiskLoc leftNodeLoc = childLocForPos(bucket, leftIndex);
- DiskLoc rightNodeLoc = childLocForPos(bucket, leftIndex + 1);
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::doMergeChildren(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int leftIndex) {
+ DiskLoc leftNodeLoc = childLocForPos(bucket, leftIndex);
+ DiskLoc rightNodeLoc = childLocForPos(bucket, leftIndex + 1);
- BucketType* l = btreemod(txn, getBucket(txn, leftNodeLoc));
- BucketType* r = btreemod(txn, getBucket(txn, rightNodeLoc));
+ BucketType* l = btreemod(txn, getBucket(txn, leftNodeLoc));
+ BucketType* r = btreemod(txn, getBucket(txn, rightNodeLoc));
- int pos = 0;
- _packReadyForMod(l, pos);
- _packReadyForMod(r, pos);
+ int pos = 0;
+ _packReadyForMod(l, pos);
+ _packReadyForMod(r, pos);
- // We know the additional keys below will fit in l because canMergeChildren() must be true.
- int oldLNum = l->n;
- // left child's right child becomes old parent key's left child
- FullKey knLeft = getFullKey(bucket, leftIndex);
- invariant(pushBack(l, knLeft.recordLoc, knLeft.data, l->nextChild));
+ // We know the additional keys below will fit in l because canMergeChildren() must be true.
+ int oldLNum = l->n;
+ // left child's right child becomes old parent key's left child
+ FullKey knLeft = getFullKey(bucket, leftIndex);
+ invariant(pushBack(l, knLeft.recordLoc, knLeft.data, l->nextChild));
- for (int i = 0; i < r->n; ++i) {
- FullKey kn = getFullKey(r, i);
- invariant(pushBack(l, kn.recordLoc, kn.data, kn.prevChildBucket));
- }
+ for (int i = 0; i < r->n; ++i) {
+ FullKey kn = getFullKey(r, i);
+ invariant(pushBack(l, kn.recordLoc, kn.data, kn.prevChildBucket));
+ }
- l->nextChild = r->nextChild;
- fixParentPtrs(txn, l, leftNodeLoc, oldLNum);
- delBucket(txn, r, rightNodeLoc);
+ l->nextChild = r->nextChild;
+ fixParentPtrs(txn, l, leftNodeLoc, oldLNum);
+ delBucket(txn, r, rightNodeLoc);
- childLocForPos(bucket, leftIndex + 1) = leftNodeLoc;
- childLocForPos(bucket, leftIndex) = DiskLoc();
- _delKeyAtPos(bucket, leftIndex, true);
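+    // The merged left bucket takes over the right child's slot in the parent; the separator
+    // key and the vacated left slot are then removed.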
+ childLocForPos(bucket, leftIndex + 1) = leftNodeLoc;
+ childLocForPos(bucket, leftIndex) = DiskLoc();
+ _delKeyAtPos(bucket, leftIndex, true);
- if (bucket->n == 0) {
- // Will trash bucket and bucketLoc.
- //
- // TODO To ensure all leaves are of equal height, we should ensure this is only called
- // on the root.
- replaceWithNextChild(txn, bucket, bucketLoc);
- }
- else {
- mayBalanceWithNeighbors(txn, bucket, bucketLoc);
- }
+ if (bucket->n == 0) {
+ // Will trash bucket and bucketLoc.
+ //
+ // TODO To ensure all leaves are of equal height, we should ensure this is only called
+ // on the root.
+ replaceWithNextChild(txn, bucket, bucketLoc);
+ } else {
+ mayBalanceWithNeighbors(txn, bucket, bucketLoc);
}
+}
- template <class BtreeLayout>
- int BtreeLogic<BtreeLayout>::indexInParent(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc) const {
- invariant(!bucket->parent.isNull());
- const BucketType* p = getBucket(txn, bucket->parent);
- if (p->nextChild == bucketLoc) {
- return p->n;
- }
+template <class BtreeLayout>
+int BtreeLogic<BtreeLayout>::indexInParent(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc) const {
+ invariant(!bucket->parent.isNull());
+ const BucketType* p = getBucket(txn, bucket->parent);
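+    // The rightmost child hangs off nextChild and is reported as position n.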
+ if (p->nextChild == bucketLoc) {
+ return p->n;
+ }
- for (int i = 0; i < p->n; ++i) {
- if (getKeyHeader(p, i).prevChildBucket == bucketLoc) {
- return i;
- }
+ for (int i = 0; i < p->n; ++i) {
+ if (getKeyHeader(p, i).prevChildBucket == bucketLoc) {
+ return i;
}
-
- log() << "ERROR: can't find ref to child bucket.\n";
- log() << "child: " << bucketLoc << "\n";
- //dump();
- log() << "Parent: " << bucket->parent << "\n";
- //p->dump();
- invariant(false);
- return -1; // just to compile
}
- template <class BtreeLayout>
- bool BtreeLogic<BtreeLayout>::tryBalanceChildren(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int leftIndex) {
-
- // If we can merge, then we must merge rather than balance to preserve bucket utilization
- // constraints.
- if (canMergeChildren(txn, bucket, bucketLoc, leftIndex)) {
- return false;
- }
+ log() << "ERROR: can't find ref to child bucket.\n";
+ log() << "child: " << bucketLoc << "\n";
+ // dump();
+ log() << "Parent: " << bucket->parent << "\n";
+ // p->dump();
+ invariant(false);
+ return -1; // just to compile
+}
- doBalanceChildren(txn, btreemod(txn, bucket), bucketLoc, leftIndex);
- return true;
+template <class BtreeLayout>
+bool BtreeLogic<BtreeLayout>::tryBalanceChildren(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int leftIndex) {
+ // If we can merge, then we must merge rather than balance to preserve bucket utilization
+ // constraints.
+ if (canMergeChildren(txn, bucket, bucketLoc, leftIndex)) {
+ return false;
}
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::doBalanceLeftToRight(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int leftIndex,
- int split,
- BucketType* l,
- const DiskLoc lchild,
- BucketType* r,
- const DiskLoc rchild) {
-
- // TODO maybe do some audits the same way pushBack() does? As a precondition, rchild + the
- // old separator are <= half a body size, and lchild is at most completely full. Based on
- // the value of split, rchild will get <= half of the total bytes which is at most 75% of a
- // full body. So rchild will have room for the following keys:
- int rAdd = l->n - split;
- reserveKeysFront(r, rAdd);
-
- for (int i = split + 1, j = 0; i < l->n; ++i, ++j) {
- FullKey kn = getFullKey(l, i);
- setKey(r, j, kn.recordLoc, kn.data, kn.prevChildBucket);
- }
+ doBalanceChildren(txn, btreemod(txn, bucket), bucketLoc, leftIndex);
+ return true;
+}
- FullKey leftIndexKN = getFullKey(bucket, leftIndex);
- setKey(r, rAdd - 1, leftIndexKN.recordLoc, leftIndexKN.data, l->nextChild);
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::doBalanceLeftToRight(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int leftIndex,
+ int split,
+ BucketType* l,
+ const DiskLoc lchild,
+ BucketType* r,
+ const DiskLoc rchild) {
+ // TODO maybe do some audits the same way pushBack() does? As a precondition, rchild + the
+ // old separator are <= half a body size, and lchild is at most completely full. Based on
+ // the value of split, rchild will get <= half of the total bytes which is at most 75% of a
+ // full body. So rchild will have room for the following keys:
+ int rAdd = l->n - split;
+ reserveKeysFront(r, rAdd);
+
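+    // Move l's keys above 'split' into the reserved front slots of r; the old separator from
+    // the parent becomes r's key at position rAdd - 1.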
+ for (int i = split + 1, j = 0; i < l->n; ++i, ++j) {
+ FullKey kn = getFullKey(l, i);
+ setKey(r, j, kn.recordLoc, kn.data, kn.prevChildBucket);
+ }
+
+ FullKey leftIndexKN = getFullKey(bucket, leftIndex);
+ setKey(r, rAdd - 1, leftIndexKN.recordLoc, leftIndexKN.data, l->nextChild);
+
+ fixParentPtrs(txn, r, rchild, 0, rAdd - 1);
+
+ FullKey kn = getFullKey(l, split);
+ l->nextChild = kn.prevChildBucket;
+
+    // Because lchild is a descendant of bucketLoc, updating bucketLoc will not affect packing
+    // or keys of lchild, and kn will be stable during the following setInternalKey()
+ setInternalKey(txn, bucket, bucketLoc, leftIndex, kn.recordLoc, kn.data, lchild, rchild);
+
+ // lchild and rchild cannot be merged, so there must be >0 (actually more) keys to the left
+ // of split.
+ int zeropos = 0;
+ truncateTo(l, split, zeropos);
+}
+
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::doBalanceRightToLeft(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int leftIndex,
+ int split,
+ BucketType* l,
+ const DiskLoc lchild,
+ BucketType* r,
+ const DiskLoc rchild) {
+ // As a precondition, lchild + the old separator are <= half a body size,
+ // and rchild is at most completely full. Based on the value of split,
+ // lchild will get less than half of the total bytes which is at most 75%
+ // of a full body. So lchild will have room for the following keys:
+ int lN = l->n;
+
+ {
+ // left child's right child becomes old parent key's left child
+ FullKey kn = getFullKey(bucket, leftIndex);
+ invariant(pushBack(l, kn.recordLoc, kn.data, l->nextChild));
+ }
- fixParentPtrs(txn, r, rchild, 0, rAdd - 1);
+ for (int i = 0; i < split - lN - 1; ++i) {
+ FullKey kn = getFullKey(r, i);
+ invariant(pushBack(l, kn.recordLoc, kn.data, kn.prevChildBucket));
+ }
- FullKey kn = getFullKey(l, split);
+ {
+ FullKey kn = getFullKey(r, split - lN - 1);
l->nextChild = kn.prevChildBucket;
-
- // Because lchild is a descendant of thisLoc, updating thisLoc will not affect packing or
- // keys of lchild and kn will be stable during the following setInternalKey()
+        // Child lN was lchild's old nextChild, so we don't need to fix that one's parent pointer.
+ fixParentPtrs(txn, l, lchild, lN + 1, l->n);
+        // Because rchild is a descendant of bucketLoc, updating bucketLoc will
+        // not affect packing or keys of rchild, and kn will be stable
+        // during the following setInternalKey()
setInternalKey(txn, bucket, bucketLoc, leftIndex, kn.recordLoc, kn.data, lchild, rchild);
-
- // lchild and rchild cannot be merged, so there must be >0 (actually more) keys to the left
- // of split.
- int zeropos = 0;
- truncateTo(l, split, zeropos);
- }
-
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::doBalanceRightToLeft(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int leftIndex,
- int split,
- BucketType* l,
- const DiskLoc lchild,
- BucketType* r,
- const DiskLoc rchild) {
- // As a precondition, lchild + the old separator are <= half a body size,
- // and rchild is at most completely full. Based on the value of split,
- // lchild will get less than half of the total bytes which is at most 75%
- // of a full body. So lchild will have room for the following keys:
- int lN = l->n;
-
- {
- // left child's right child becomes old parent key's left child
- FullKey kn = getFullKey(bucket, leftIndex);
- invariant(pushBack(l, kn.recordLoc, kn.data, l->nextChild));
- }
-
- for (int i = 0; i < split - lN - 1; ++i) {
- FullKey kn = getFullKey(r, i);
- invariant(pushBack(l, kn.recordLoc, kn.data, kn.prevChildBucket));
- }
-
- {
- FullKey kn = getFullKey(r, split - lN - 1);
- l->nextChild = kn.prevChildBucket;
- // Child lN was lchild's old nextChild, and don't need to fix that one.
- fixParentPtrs(txn, l, lchild, lN + 1, l->n);
- // Because rchild is a descendant of thisLoc, updating thisLoc will
- // not affect packing or keys of rchild and kn will be stable
- // during the following setInternalKey()
- setInternalKey(txn, bucket, bucketLoc, leftIndex, kn.recordLoc, kn.data, lchild, rchild);
- }
-
- // lchild and rchild cannot be merged, so there must be >0 (actually more)
- // keys to the right of split.
- int zeropos = 0;
- dropFront(r, split - lN, zeropos);
}
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::doBalanceChildren(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int leftIndex) {
-
- DiskLoc lchild = childLocForPos(bucket, leftIndex);
- DiskLoc rchild = childLocForPos(bucket, leftIndex + 1);
-
- int zeropos = 0;
- BucketType* l = btreemod(txn, getBucket(txn, lchild));
- _packReadyForMod(l, zeropos);
+ // lchild and rchild cannot be merged, so there must be >0 (actually more)
+ // keys to the right of split.
+ int zeropos = 0;
+ dropFront(r, split - lN, zeropos);
+}
- BucketType* r = btreemod(txn, getBucket(txn, rchild));
- _packReadyForMod(r, zeropos);
-
- int split = _rebalancedSeparatorPos(txn, bucket, leftIndex);
-
- // By definition, if we are below the low water mark and cannot merge
- // then we must actively balance.
- invariant(split != l->n);
- if (split < l->n) {
- doBalanceLeftToRight(txn, bucket, bucketLoc, leftIndex, split, l, lchild, r, rchild);
- }
- else {
- doBalanceRightToLeft(txn, bucket, bucketLoc, leftIndex, split, l, lchild, r, rchild);
- }
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::doBalanceChildren(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int leftIndex) {
+ DiskLoc lchild = childLocForPos(bucket, leftIndex);
+ DiskLoc rchild = childLocForPos(bucket, leftIndex + 1);
+
+ int zeropos = 0;
+ BucketType* l = btreemod(txn, getBucket(txn, lchild));
+ _packReadyForMod(l, zeropos);
+
+ BucketType* r = btreemod(txn, getBucket(txn, rchild));
+ _packReadyForMod(r, zeropos);
+
+ int split = _rebalancedSeparatorPos(txn, bucket, leftIndex);
+
+ // By definition, if we are below the low water mark and cannot merge
+ // then we must actively balance.
+ invariant(split != l->n);
+ if (split < l->n) {
+ doBalanceLeftToRight(txn, bucket, bucketLoc, leftIndex, split, l, lchild, r, rchild);
+ } else {
+ doBalanceRightToLeft(txn, bucket, bucketLoc, leftIndex, split, l, lchild, r, rchild);
+ }
+}
+
+template <class BtreeLayout>
+bool BtreeLogic<BtreeLayout>::mayBalanceWithNeighbors(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc) {
+ if (bucket->parent.isNull()) {
+ return false;
}
- template <class BtreeLayout>
- bool BtreeLogic<BtreeLayout>::mayBalanceWithNeighbors(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc) {
- if (bucket->parent.isNull()) {
- return false;
- }
-
- if (_packedDataSize(bucket, 0) >= lowWaterMark()) {
- return false;
- }
-
- BucketType* p = getBucket(txn, bucket->parent);
- int parentIdx = indexInParent(txn, bucket, bucketLoc);
-
- // TODO will missing neighbor case be possible long term? Should we try to merge/balance
- // somehow in that case if so?
- bool mayBalanceRight = (parentIdx < p->n) && !childLocForPos(p, parentIdx + 1).isNull();
- bool mayBalanceLeft = ( parentIdx > 0 ) && !childLocForPos(p, parentIdx - 1).isNull();
-
- // Balance if possible on one side - we merge only if absolutely necessary to preserve btree
- // bucket utilization constraints since that's a more heavy duty operation (especially if we
- // must re-split later).
- if (mayBalanceRight && tryBalanceChildren(txn, p, bucket->parent, parentIdx)) {
- return true;
- }
-
- if (mayBalanceLeft && tryBalanceChildren(txn, p, bucket->parent, parentIdx - 1)) {
- return true;
- }
-
- BucketType* pm = btreemod(txn, getBucket(txn, bucket->parent));
- if (mayBalanceRight) {
- doMergeChildren(txn, pm, bucket->parent, parentIdx);
- return true;
- }
- else if (mayBalanceLeft) {
- doMergeChildren(txn, pm, bucket->parent, parentIdx - 1);
- return true;
- }
-
+ if (_packedDataSize(bucket, 0) >= lowWaterMark()) {
return false;
}
- template <class BtreeLayout>
- bool BtreeLogic<BtreeLayout>::unindex(OperationContext* txn,
- const BSONObj& key,
- const DiskLoc& recordLoc) {
- int pos;
- bool found = false;
- KeyDataOwnedType ownedKey(key);
+ BucketType* p = getBucket(txn, bucket->parent);
+ int parentIdx = indexInParent(txn, bucket, bucketLoc);
- DiskLoc loc = _locate(txn, getRootLoc(txn), ownedKey, &pos, &found, recordLoc, 1);
- if (found) {
- BucketType* bucket = btreemod(txn, getBucket(txn, loc));
- delKeyAtPos(txn, bucket, loc, pos);
- assertValid(_indexName, getRoot(txn), _ordering);
- }
- return found;
- }
+ // TODO will missing neighbor case be possible long term? Should we try to merge/balance
+ // somehow in that case if so?
+ bool mayBalanceRight = (parentIdx < p->n) && !childLocForPos(p, parentIdx + 1).isNull();
+ bool mayBalanceLeft = (parentIdx > 0) && !childLocForPos(p, parentIdx - 1).isNull();
- template <class BtreeLayout>
- bool BtreeLogic<BtreeLayout>::isEmpty(OperationContext* txn) const {
- return getRoot(txn)->n == 0;
+ // Balance if possible on one side - we merge only if absolutely necessary to preserve btree
+ // bucket utilization constraints since that's a more heavy duty operation (especially if we
+ // must re-split later).
+ if (mayBalanceRight && tryBalanceChildren(txn, p, bucket->parent, parentIdx)) {
+ return true;
}
- /**
- * This can cause a lot of additional page writes when we assign buckets to different parents.
- * Maybe get rid of parent ptrs?
- */
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::fixParentPtrs(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int firstIndex,
- int lastIndex) {
-
- invariant(getBucket(txn, bucketLoc) == bucket);
-
- if (lastIndex == -1) {
- lastIndex = bucket->n;
- }
-
- for (int i = firstIndex; i <= lastIndex; i++) {
- const DiskLoc childLoc = childLocForPos(bucket, i);
- if (!childLoc.isNull()) {
- *txn->recoveryUnit()->writing(&getBucket(txn, childLoc)->parent) = bucketLoc;
- }
- }
+ if (mayBalanceLeft && tryBalanceChildren(txn, p, bucket->parent, parentIdx - 1)) {
+ return true;
}
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::setInternalKey(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int keypos,
- const DiskLoc recordLoc,
- const KeyDataType& key,
- const DiskLoc lchild,
- const DiskLoc rchild) {
- childLocForPos(bucket, keypos).Null();
- // This may leave the bucket empty (n == 0) which is ok only as a txnient state. In the
- // instant case, the implementation of insertHere behaves correctly when n == 0 and as a
- // side effect increments n.
- _delKeyAtPos(bucket, keypos, true);
-
- // Ensure we do not orphan neighbor's old child.
- invariant(childLocForPos(bucket, keypos ) == rchild);
-
- // Just set temporarily - required to pass validation in insertHere()
- childLocForPos(bucket, keypos) = lchild;
-
- insertHere(txn, bucketLoc, keypos, key, recordLoc, lchild, rchild);
- }
-
- /**
- * insert a key in this bucket, splitting if necessary.
- *
- * @keypos - where to insert the key in range 0..n. 0=make leftmost, n=make rightmost. NOTE
- * this function may free some data, and as a result the value passed for keypos may be invalid
- * after calling insertHere()
- *
- * Some of the write intent signaling below relies on the implementation of the optimized write
- * intent code in basicInsert().
- */
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::insertHere(OperationContext* txn,
- const DiskLoc bucketLoc,
- int pos,
- const KeyDataType& key,
- const DiskLoc recordLoc,
- const DiskLoc leftChildLoc,
- const DiskLoc rightChildLoc) {
+ BucketType* pm = btreemod(txn, getBucket(txn, bucket->parent));
+ if (mayBalanceRight) {
+ doMergeChildren(txn, pm, bucket->parent, parentIdx);
+ return true;
+ } else if (mayBalanceLeft) {
+ doMergeChildren(txn, pm, bucket->parent, parentIdx - 1);
+ return true;
+ }
- BucketType* bucket = getBucket(txn, bucketLoc);
+ return false;
+}
- if (!basicInsert(txn, bucket, bucketLoc, pos, key, recordLoc)) {
- // If basicInsert() fails, the bucket will be packed as required by split().
- split(txn, btreemod(txn, bucket), bucketLoc, pos, recordLoc, key, leftChildLoc, rightChildLoc);
- return;
- }
+template <class BtreeLayout>
+bool BtreeLogic<BtreeLayout>::unindex(OperationContext* txn,
+ const BSONObj& key,
+ const DiskLoc& recordLoc) {
+ int pos;
+ bool found = false;
+ KeyDataOwnedType ownedKey(key);
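+    // Locate the exact (key, recordLoc) pair; deletion proceeds only when it is found.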
- KeyHeaderType* kn = &getKeyHeader(bucket, pos);
- if (pos + 1 == bucket->n) {
- // It's the last key.
- if (bucket->nextChild != leftChildLoc) {
- // XXX log more
- invariant(false);
- }
- kn->prevChildBucket = bucket->nextChild;
- invariant(kn->prevChildBucket == leftChildLoc);
- *txn->recoveryUnit()->writing(&bucket->nextChild) = rightChildLoc;
- if (!rightChildLoc.isNull()) {
- *txn->recoveryUnit()->writing(&getBucket(txn, rightChildLoc)->parent) = bucketLoc;
- }
- }
- else {
- kn->prevChildBucket = leftChildLoc;
- if (getKeyHeader(bucket, pos + 1).prevChildBucket != leftChildLoc) {
- // XXX: log more
- invariant(false);
- }
- const LocType *pc = &getKeyHeader(bucket, pos + 1).prevChildBucket;
- // Intent declared in basicInsert()
- *const_cast<LocType*>(pc) = rightChildLoc;
- if (!rightChildLoc.isNull()) {
- *txn->recoveryUnit()->writing(&getBucket(txn, rightChildLoc)->parent) = bucketLoc;
- }
- }
+ DiskLoc loc = _locate(txn, getRootLoc(txn), ownedKey, &pos, &found, recordLoc, 1);
+ if (found) {
+ BucketType* bucket = btreemod(txn, getBucket(txn, loc));
+ delKeyAtPos(txn, bucket, loc, pos);
+ assertValid(_indexName, getRoot(txn), _ordering);
}
+ return found;
+}
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::split(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int keypos,
- const DiskLoc recordLoc,
- const KeyDataType& key,
- const DiskLoc lchild,
- const DiskLoc rchild) {
-
- int split = splitPos(bucket, keypos);
- DiskLoc rLoc = _addBucket(txn);
- BucketType* r = btreemod(txn, getBucket(txn, rLoc));
-
- for (int i = split + 1; i < bucket->n; i++) {
- FullKey kn = getFullKey(bucket, i);
- invariant(pushBack(r, kn.recordLoc, kn.data, kn.prevChildBucket));
- }
- r->nextChild = bucket->nextChild;
- assertValid(_indexName, r, _ordering);
-
- r = NULL;
- fixParentPtrs(txn, getBucket(txn, rLoc), rLoc);
-
- FullKey splitkey = getFullKey(bucket, split);
- // splitkey key gets promoted, its children will be thisLoc (l) and rLoc (r)
- bucket->nextChild = splitkey.prevChildBucket;
-
- // Because thisLoc is a descendant of parent, updating parent will not affect packing or
- // keys of thisLoc and splitkey will be stable during the following:
-
- if (bucket->parent.isNull()) {
- // promote splitkey to a parent this->node make a new parent if we were the root
- DiskLoc L = _addBucket(txn);
- BucketType* p = btreemod(txn, getBucket(txn, L));
- invariant(pushBack(p, splitkey.recordLoc, splitkey.data, bucketLoc));
- p->nextChild = rLoc;
- assertValid(_indexName, p, _ordering);
- bucket->parent = L;
- _headManager->setHead(txn, L.toRecordId());
- *txn->recoveryUnit()->writing(&getBucket(txn, rLoc)->parent) = bucket->parent;
- }
- else {
- // set this before calling _insert - if it splits it will do fixParent() logic and
- // change the value.
- *txn->recoveryUnit()->writing(&getBucket(txn, rLoc)->parent) = bucket->parent;
- _insert(txn,
- getBucket(txn, bucket->parent),
- bucket->parent,
- splitkey.data,
- splitkey.recordLoc,
- true, // dupsallowed
- bucketLoc,
- rLoc);
- }
+template <class BtreeLayout>
+bool BtreeLogic<BtreeLayout>::isEmpty(OperationContext* txn) const {
+ return getRoot(txn)->n == 0;
+}
- int newpos = keypos;
- // note this may trash splitkey.key. thus we had to promote it before finishing up here.
- truncateTo(bucket, split, newpos);
+/**
+ * This can cause a lot of additional page writes when we assign buckets to different parents.
+ * Maybe get rid of parent ptrs?
+ */
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::fixParentPtrs(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int firstIndex,
+ int lastIndex) {
+ invariant(getBucket(txn, bucketLoc) == bucket);
- // add our this->new key, there is room this->now
- if (keypos <= split) {
- insertHere(txn, bucketLoc, newpos, key, recordLoc, lchild, rchild);
- }
- else {
- int kp = keypos - split - 1;
- invariant(kp >= 0);
- insertHere(txn, rLoc, kp, key, recordLoc, lchild, rchild);
- }
+ if (lastIndex == -1) {
+ lastIndex = bucket->n;
}
- class DummyDocWriter : public DocWriter {
- public:
- DummyDocWriter(size_t sz) : _sz(sz) { }
- virtual void writeDocument(char* buf) const { /* no-op */ }
- virtual size_t documentSize() const { return _sz; }
- private:
- size_t _sz;
- };
-
- template <class BtreeLayout>
- Status BtreeLogic<BtreeLayout>::initAsEmpty(OperationContext* txn) {
- if (!_headManager->getHead(txn).isNull()) {
- return Status(ErrorCodes::InternalError, "index already initialized");
+ for (int i = firstIndex; i <= lastIndex; i++) {
+ const DiskLoc childLoc = childLocForPos(bucket, i);
+ if (!childLoc.isNull()) {
+ *txn->recoveryUnit()->writing(&getBucket(txn, childLoc)->parent) = bucketLoc;
}
-
- _headManager->setHead(txn, _addBucket(txn).toRecordId());
- return Status::OK();
}
+}
- template <class BtreeLayout>
- DiskLoc BtreeLogic<BtreeLayout>::_addBucket(OperationContext* txn) {
- DummyDocWriter docWriter(BtreeLayout::BucketSize);
- StatusWith<RecordId> loc = _recordStore->insertRecord(txn, &docWriter, false);
- // XXX: remove this(?) or turn into massert or sanely bubble it back up.
- uassertStatusOK(loc.getStatus());
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::setInternalKey(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int keypos,
+ const DiskLoc recordLoc,
+ const KeyDataType& key,
+ const DiskLoc lchild,
+ const DiskLoc rchild) {
+ childLocForPos(bucket, keypos).Null();
+    // This may leave the bucket empty (n == 0), which is ok only as a transient state. In
+    // this case, the implementation of insertHere behaves correctly when n == 0 and as a
+ // side effect increments n.
+ _delKeyAtPos(bucket, keypos, true);
- // this is a new bucket, not referenced by anyone, probably don't need this lock
- BucketType* b = btreemod(txn, getBucket(txn, loc.getValue()));
- init(b);
- return DiskLoc::fromRecordId(loc.getValue());
- }
+ // Ensure we do not orphan neighbor's old child.
+ invariant(childLocForPos(bucket, keypos) == rchild);
- // static
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::dumpBucket(const BucketType* bucket, int indentLength) {
- log() << "BUCKET n:" << bucket->n << ", parent:" << hex << bucket->parent.getOfs() << dec;
+ // Just set temporarily - required to pass validation in insertHere()
+ childLocForPos(bucket, keypos) = lchild;
- const string indent = string(indentLength, ' ');
+ insertHere(txn, bucketLoc, keypos, key, recordLoc, lchild, rchild);
+}
- for (int i = 0; i < bucket->n; i++) {
- log() << '\n' << indent;
- FullKey k = getFullKey(bucket, i);
- string ks = k.data.toString();
- log() << " " << hex << k.prevChildBucket.getOfs() << "<-- prevChildBucket for " << i << '\n';
- log() << indent << " " << i << ' ' << ks.substr(0, 30)
- << " Loc:" << k.recordLoc.toString() << dec;
- if (getKeyHeader(bucket, i).isUnused()) {
- log() << " UNUSED";
- }
+/**
+ * insert a key in this bucket, splitting if necessary.
+ *
+ * @keypos - where to insert the key in range 0..n. 0=make leftmost, n=make rightmost. NOTE
+ * this function may free some data, and as a result the value passed for keypos may be invalid
+ * after calling insertHere()
+ *
+ * Some of the write intent signaling below relies on the implementation of the optimized write
+ * intent code in basicInsert().
+ */
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::insertHere(OperationContext* txn,
+ const DiskLoc bucketLoc,
+ int pos,
+ const KeyDataType& key,
+ const DiskLoc recordLoc,
+ const DiskLoc leftChildLoc,
+ const DiskLoc rightChildLoc) {
+ BucketType* bucket = getBucket(txn, bucketLoc);
+
+ if (!basicInsert(txn, bucket, bucketLoc, pos, key, recordLoc)) {
+ // If basicInsert() fails, the bucket will be packed as required by split().
+ split(txn,
+ btreemod(txn, bucket),
+ bucketLoc,
+ pos,
+ recordLoc,
+ key,
+ leftChildLoc,
+ rightChildLoc);
+ return;
+ }
+
+ KeyHeaderType* kn = &getKeyHeader(bucket, pos);
+ if (pos + 1 == bucket->n) {
+ // It's the last key.
+ if (bucket->nextChild != leftChildLoc) {
+ // XXX log more
+ invariant(false);
}
-
- log() << "\n" << indent << " " << hex << bucket->nextChild.getOfs() << dec << endl;
- }
-
- template <class BtreeLayout>
- DiskLoc BtreeLogic<BtreeLayout>::getDiskLoc(OperationContext* txn,
- const DiskLoc& bucketLoc,
- const int keyOffset) const {
- invariant(!bucketLoc.isNull());
- BucketType* bucket = getBucket(txn, bucketLoc);
- return getKeyHeader(bucket, keyOffset).recordLoc;
- }
-
- template <class BtreeLayout>
- BSONObj BtreeLogic<BtreeLayout>::getKey(OperationContext* txn,
- const DiskLoc& bucketLoc,
- const int keyOffset) const {
- invariant(!bucketLoc.isNull());
- BucketType* bucket = getBucket(txn, bucketLoc);
- int n = bucket->n;
- invariant(n != BtreeLayout::INVALID_N_SENTINEL);
- invariant(n >= 0);
- invariant(n < 10000);
- invariant(n != 0xffff);
-
- invariant(keyOffset >= 0);
- invariant(keyOffset < n);
-
- // XXX: should we really return an empty obj if keyOffset>=n?
- if (keyOffset >= n) {
- return BSONObj();
+ kn->prevChildBucket = bucket->nextChild;
+ invariant(kn->prevChildBucket == leftChildLoc);
+ *txn->recoveryUnit()->writing(&bucket->nextChild) = rightChildLoc;
+ if (!rightChildLoc.isNull()) {
+ *txn->recoveryUnit()->writing(&getBucket(txn, rightChildLoc)->parent) = bucketLoc;
}
- else {
- return getFullKey(bucket, keyOffset).data.toBson();
+ } else {
+ kn->prevChildBucket = leftChildLoc;
+ if (getKeyHeader(bucket, pos + 1).prevChildBucket != leftChildLoc) {
+ // XXX: log more
+ invariant(false);
}
- }
+ const LocType* pc = &getKeyHeader(bucket, pos + 1).prevChildBucket;
+ // Intent declared in basicInsert()
+ *const_cast<LocType*>(pc) = rightChildLoc;
+ if (!rightChildLoc.isNull()) {
+ *txn->recoveryUnit()->writing(&getBucket(txn, rightChildLoc)->parent) = bucketLoc;
+ }
+ }
+}
+
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::split(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int keypos,
+ const DiskLoc recordLoc,
+ const KeyDataType& key,
+ const DiskLoc lchild,
+ const DiskLoc rchild) {
+ int split = splitPos(bucket, keypos);
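+    // Keys above 'split' move to a newly allocated right sibling; the key at 'split' itself
+    // is promoted into the parent.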
+ DiskLoc rLoc = _addBucket(txn);
+ BucketType* r = btreemod(txn, getBucket(txn, rLoc));
+
+ for (int i = split + 1; i < bucket->n; i++) {
+ FullKey kn = getFullKey(bucket, i);
+ invariant(pushBack(r, kn.recordLoc, kn.data, kn.prevChildBucket));
+ }
+ r->nextChild = bucket->nextChild;
+ assertValid(_indexName, r, _ordering);
+
+ r = NULL;
+ fixParentPtrs(txn, getBucket(txn, rLoc), rLoc);
+
+ FullKey splitkey = getFullKey(bucket, split);
+    // splitkey gets promoted; its children will be bucketLoc (l) and rLoc (r)
+ bucket->nextChild = splitkey.prevChildBucket;
+
+    // Because bucketLoc is a descendant of parent, updating parent will not affect packing or
+    // keys of bucketLoc, and splitkey will be stable during the following:
+
+ if (bucket->parent.isNull()) {
+        // promote splitkey to a parent node; make a new parent if we were the root
+ DiskLoc L = _addBucket(txn);
+ BucketType* p = btreemod(txn, getBucket(txn, L));
+ invariant(pushBack(p, splitkey.recordLoc, splitkey.data, bucketLoc));
+ p->nextChild = rLoc;
+ assertValid(_indexName, p, _ordering);
+ bucket->parent = L;
+ _headManager->setHead(txn, L.toRecordId());
+ *txn->recoveryUnit()->writing(&getBucket(txn, rLoc)->parent) = bucket->parent;
+ } else {
+        // set this before calling _insert - if it splits it will do fixParentPtrs() logic and
+ // change the value.
+ *txn->recoveryUnit()->writing(&getBucket(txn, rLoc)->parent) = bucket->parent;
+ _insert(txn,
+ getBucket(txn, bucket->parent),
+ bucket->parent,
+ splitkey.data,
+ splitkey.recordLoc,
+ true, // dupsallowed
+ bucketLoc,
+ rLoc);
+ }
+
+ int newpos = keypos;
+    // note this may trash splitkey.data, which is why we promoted it before finishing up here.
+ truncateTo(bucket, split, newpos);
+
+    // add our new key; there is room now
+ if (keypos <= split) {
+ insertHere(txn, bucketLoc, newpos, key, recordLoc, lchild, rchild);
+ } else {
+ int kp = keypos - split - 1;
+ invariant(kp >= 0);
+ insertHere(txn, rLoc, kp, key, recordLoc, lchild, rchild);
+ }
+}
+
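+// Writes no payload; _addBucket() uses it to reserve BucketSize bytes in the record store.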
+class DummyDocWriter : public DocWriter {
+public:
+ DummyDocWriter(size_t sz) : _sz(sz) {}
+ virtual void writeDocument(char* buf) const { /* no-op */
+ }
+ virtual size_t documentSize() const {
+ return _sz;
+ }
+
+private:
+ size_t _sz;
+};
+
+template <class BtreeLayout>
+Status BtreeLogic<BtreeLayout>::initAsEmpty(OperationContext* txn) {
+ if (!_headManager->getHead(txn).isNull()) {
+ return Status(ErrorCodes::InternalError, "index already initialized");
+ }
+
+ _headManager->setHead(txn, _addBucket(txn).toRecordId());
+ return Status::OK();
+}
+
+template <class BtreeLayout>
+DiskLoc BtreeLogic<BtreeLayout>::_addBucket(OperationContext* txn) {
+ DummyDocWriter docWriter(BtreeLayout::BucketSize);
+ StatusWith<RecordId> loc = _recordStore->insertRecord(txn, &docWriter, false);
+ // XXX: remove this(?) or turn into massert or sanely bubble it back up.
+ uassertStatusOK(loc.getStatus());
+
+ // this is a new bucket, not referenced by anyone, probably don't need this lock
+ BucketType* b = btreemod(txn, getBucket(txn, loc.getValue()));
+ init(b);
+ return DiskLoc::fromRecordId(loc.getValue());
+}
+
+// static
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::dumpBucket(const BucketType* bucket, int indentLength) {
+ log() << "BUCKET n:" << bucket->n << ", parent:" << hex << bucket->parent.getOfs() << dec;
+
+ const string indent = string(indentLength, ' ');
+
+ for (int i = 0; i < bucket->n; i++) {
+ log() << '\n' << indent;
+ FullKey k = getFullKey(bucket, i);
+ string ks = k.data.toString();
+ log() << " " << hex << k.prevChildBucket.getOfs() << "<-- prevChildBucket for " << i
+ << '\n';
+ log() << indent << " " << i << ' ' << ks.substr(0, 30)
+ << " Loc:" << k.recordLoc.toString() << dec;
+ if (getKeyHeader(bucket, i).isUnused()) {
+ log() << " UNUSED";
+ }
+ }
+
+ log() << "\n" << indent << " " << hex << bucket->nextChild.getOfs() << dec << endl;
+}
+
+template <class BtreeLayout>
+DiskLoc BtreeLogic<BtreeLayout>::getDiskLoc(OperationContext* txn,
+ const DiskLoc& bucketLoc,
+ const int keyOffset) const {
+ invariant(!bucketLoc.isNull());
+ BucketType* bucket = getBucket(txn, bucketLoc);
+ return getKeyHeader(bucket, keyOffset).recordLoc;
+}
+
+template <class BtreeLayout>
+BSONObj BtreeLogic<BtreeLayout>::getKey(OperationContext* txn,
+ const DiskLoc& bucketLoc,
+ const int keyOffset) const {
+ invariant(!bucketLoc.isNull());
+ BucketType* bucket = getBucket(txn, bucketLoc);
+ int n = bucket->n;
+ invariant(n != BtreeLayout::INVALID_N_SENTINEL);
+ invariant(n >= 0);
+ invariant(n < 10000);
+ invariant(n != 0xffff);
+
+ invariant(keyOffset >= 0);
+ invariant(keyOffset < n);
+
+ // XXX: should we really return an empty obj if keyOffset>=n?
+ if (keyOffset >= n) {
+ return BSONObj();
+ } else {
+ return getFullKey(bucket, keyOffset).data.toBson();
+ }
+}
+
+template <class BtreeLayout>
+Status BtreeLogic<BtreeLayout>::touch(OperationContext* txn) const {
+ return _recordStore->touch(txn, NULL);
+}
+
+template <class BtreeLayout>
+long long BtreeLogic<BtreeLayout>::fullValidate(OperationContext* txn,
+ long long* unusedCount,
+ bool strict,
+ bool dumpBuckets,
+ unsigned depth) const {
+ return _fullValidate(txn, getRootLoc(txn), unusedCount, strict, dumpBuckets, depth);
+}
+
+template <class BtreeLayout>
+long long BtreeLogic<BtreeLayout>::_fullValidate(OperationContext* txn,
+ const DiskLoc bucketLoc,
+ long long* unusedCount,
+ bool strict,
+ bool dumpBuckets,
+ unsigned depth) const {
+ BucketType* bucket = getBucket(txn, bucketLoc);
+ assertValid(_indexName, bucket, _ordering, true);
- template <class BtreeLayout>
- Status BtreeLogic<BtreeLayout>::touch(OperationContext* txn) const {
- return _recordStore->touch( txn, NULL );
+ if (dumpBuckets) {
+ log() << bucketLoc.toString() << ' ';
+ dumpBucket(bucket, depth);
}
- template <class BtreeLayout>
- long long BtreeLogic<BtreeLayout>::fullValidate(OperationContext* txn,
- long long *unusedCount,
- bool strict,
- bool dumpBuckets,
- unsigned depth) const {
- return _fullValidate(txn, getRootLoc(txn), unusedCount, strict, dumpBuckets, depth);
- }
+ long long keyCount = 0;
- template <class BtreeLayout>
- long long BtreeLogic<BtreeLayout>::_fullValidate(OperationContext* txn,
- const DiskLoc bucketLoc,
- long long *unusedCount,
- bool strict,
- bool dumpBuckets,
- unsigned depth) const {
- BucketType* bucket = getBucket(txn, bucketLoc);
- assertValid(_indexName, bucket, _ordering, true);
+ for (int i = 0; i < bucket->n; i++) {
+ KeyHeaderType& kn = getKeyHeader(bucket, i);
- if (dumpBuckets) {
- log() << bucketLoc.toString() << ' ';
- dumpBucket(bucket, depth);
+ if (kn.isUsed()) {
+ keyCount++;
+ } else if (NULL != unusedCount) {
+ ++(*unusedCount);
}
- long long keyCount = 0;
-
- for (int i = 0; i < bucket->n; i++) {
- KeyHeaderType& kn = getKeyHeader(bucket, i);
-
- if (kn.isUsed()) {
- keyCount++;
- }
- else if (NULL != unusedCount) {
- ++(*unusedCount);
- }
-
- if (!kn.prevChildBucket.isNull()) {
- DiskLoc left = kn.prevChildBucket;
- BucketType* b = getBucket(txn, left);
-
- if (strict) {
- invariant(b->parent == bucketLoc);
- }
- else {
- wassert(b->parent == bucketLoc);
- }
-
- keyCount += _fullValidate(txn, left, unusedCount, strict, dumpBuckets, depth + 1);
- }
- }
+ if (!kn.prevChildBucket.isNull()) {
+ DiskLoc left = kn.prevChildBucket;
+ BucketType* b = getBucket(txn, left);
- if (!bucket->nextChild.isNull()) {
- BucketType* b = getBucket(txn, bucket->nextChild);
if (strict) {
invariant(b->parent == bucketLoc);
- }
- else {
+ } else {
wassert(b->parent == bucketLoc);
}
- keyCount += _fullValidate(txn, bucket->nextChild, unusedCount, strict, dumpBuckets, depth + 1);
+ keyCount += _fullValidate(txn, left, unusedCount, strict, dumpBuckets, depth + 1);
+ }
+ }
+
+ if (!bucket->nextChild.isNull()) {
+ BucketType* b = getBucket(txn, bucket->nextChild);
+ if (strict) {
+ invariant(b->parent == bucketLoc);
+ } else {
+ wassert(b->parent == bucketLoc);
}
- return keyCount;
+ keyCount +=
+ _fullValidate(txn, bucket->nextChild, unusedCount, strict, dumpBuckets, depth + 1);
}
- // XXX: remove this(?) used to not dump every key in assertValid.
- int nDumped = 0;
+ return keyCount;
+}
- // static
- template <class BtreeLayout>
- void BtreeLogic<BtreeLayout>::assertValid(const std::string& ns,
- BucketType* bucket,
- const Ordering& ordering,
- bool force) {
- if (!force) {
- return;
- }
+// XXX: remove this(?) Used to avoid dumping every key in assertValid.
+int nDumped = 0;
- // this is very slow so don't do often
- {
- static int _k;
- if (++_k % 128) {
- return;
- }
+// static
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::assertValid(const std::string& ns,
+ BucketType* bucket,
+ const Ordering& ordering,
+ bool force) {
+ if (!force) {
+ return;
+ }
+
+ // this is very slow so don't do often
+ {
+ static int _k;
+ if (++_k % 128) {
+ return;
}
+ }
- DEV {
- // slow:
- for (int i = 0; i < bucket->n - 1; i++) {
- FullKey firstKey = getFullKey(bucket, i);
- FullKey secondKey = getFullKey(bucket, i + 1);
- int z = firstKey.data.woCompare(secondKey.data, ordering);
- if (z > 0) {
- log() << "ERROR: btree key order corrupt. Keys:" << endl;
- if (++nDumped < 5) {
- for (int j = 0; j < bucket->n; j++) {
- log() << " " << getFullKey(bucket, j).data.toString() << endl;
- }
- dumpBucket(bucket);
+ DEV {
+ // slow:
+ for (int i = 0; i < bucket->n - 1; i++) {
+ FullKey firstKey = getFullKey(bucket, i);
+ FullKey secondKey = getFullKey(bucket, i + 1);
+ int z = firstKey.data.woCompare(secondKey.data, ordering);
+ if (z > 0) {
+ log() << "ERROR: btree key order corrupt. Keys:" << endl;
+ if (++nDumped < 5) {
+ for (int j = 0; j < bucket->n; j++) {
+ log() << " " << getFullKey(bucket, j).data.toString() << endl;
}
- wassert(false);
- break;
+ dumpBucket(bucket);
}
- else if (z == 0) {
- if (!(firstKey.header.recordLoc < secondKey.header.recordLoc)) {
- log() << "ERROR: btree key order corrupt (recordlocs wrong):" << endl;
- log() << " k(" << i << ")" << firstKey.data.toString()
- << " RL:" << firstKey.header.recordLoc.toString() << endl;
- log() << " k(" << i + 1 << ")" << secondKey.data.toString()
- << " RL:" << secondKey.header.recordLoc.toString() << endl;
- wassert(firstKey.header.recordLoc < secondKey.header.recordLoc);
- }
+ wassert(false);
+ break;
+ } else if (z == 0) {
+ if (!(firstKey.header.recordLoc < secondKey.header.recordLoc)) {
+ log() << "ERROR: btree key order corrupt (recordlocs wrong):" << endl;
+ log() << " k(" << i << ")" << firstKey.data.toString()
+ << " RL:" << firstKey.header.recordLoc.toString() << endl;
+ log() << " k(" << i + 1 << ")" << secondKey.data.toString()
+ << " RL:" << secondKey.header.recordLoc.toString() << endl;
+ wassert(firstKey.header.recordLoc < secondKey.header.recordLoc);
}
}
}
- else {
- //faster:
- if (bucket->n > 1) {
- FullKey k1 = getFullKey(bucket, 0);
- FullKey k2 = getFullKey(bucket, bucket->n - 1);
- int z = k1.data.woCompare(k2.data, ordering);
- //wassert( z <= 0 );
- if (z > 0) {
- log() << "Btree keys out of order in collection " << ns;
- ONCE {
- dumpBucket(bucket);
- }
- invariant(false);
+ }
+ else {
+ // faster:
+ if (bucket->n > 1) {
+ FullKey k1 = getFullKey(bucket, 0);
+ FullKey k2 = getFullKey(bucket, bucket->n - 1);
+ int z = k1.data.woCompare(k2.data, ordering);
+ // wassert( z <= 0 );
+ if (z > 0) {
+ log() << "Btree keys out of order in collection " << ns;
+ ONCE {
+ dumpBucket(bucket);
}
+ invariant(false);
}
}
}
+}
- template <class BtreeLayout>
- Status BtreeLogic<BtreeLayout>::insert(OperationContext* txn,
- const BSONObj& rawKey,
- const DiskLoc& value,
- bool dupsAllowed) {
- KeyDataOwnedType key(rawKey);
+template <class BtreeLayout>
+Status BtreeLogic<BtreeLayout>::insert(OperationContext* txn,
+ const BSONObj& rawKey,
+ const DiskLoc& value,
+ bool dupsAllowed) {
+ KeyDataOwnedType key(rawKey);
- if (key.dataSize() > BtreeLayout::KeyMax) {
- string msg = str::stream() << "Btree::insert: key too large to index, failing "
- << _indexName << ' '
- << key.dataSize() << ' ' << key.toString();
- return Status(ErrorCodes::KeyTooLong, msg);
- }
-
- Status status = _insert(txn,
- getRoot(txn),
- getRootLoc(txn),
- key,
- value,
- dupsAllowed,
- DiskLoc(),
- DiskLoc());
-
- assertValid(_indexName, getRoot(txn), _ordering);
- return status;
+ if (key.dataSize() > BtreeLayout::KeyMax) {
+ string msg = str::stream() << "Btree::insert: key too large to index, failing "
+ << _indexName << ' ' << key.dataSize() << ' ' << key.toString();
+ return Status(ErrorCodes::KeyTooLong, msg);
}
- template <class BtreeLayout>
- Status BtreeLogic<BtreeLayout>::_insert(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- const KeyDataType& key,
- const DiskLoc recordLoc,
- bool dupsAllowed,
- const DiskLoc leftChild,
- const DiskLoc rightChild) {
- invariant( key.dataSize() > 0 );
-
- int pos;
- bool found;
- Status findStatus = _find(txn, bucket, key, recordLoc, !dupsAllowed, &pos, &found);
- if (!findStatus.isOK()) {
- return findStatus;
- }
+ Status status =
+ _insert(txn, getRoot(txn), getRootLoc(txn), key, value, dupsAllowed, DiskLoc(), DiskLoc());
- if (found) {
- KeyHeaderType& header = getKeyHeader(bucket, pos);
- if (header.isUnused()) {
- LOG(4) << "btree _insert: reusing unused key" << endl;
- massert(17433, "_insert: reuse key but lchild is not null", leftChild.isNull());
- massert(17434, "_insert: reuse key but rchild is not null", rightChild.isNull());
- txn->recoveryUnit()->writing(&header)->setUsed();
- return Status::OK();
- }
- // The logic in _find() prohibits finding and returning a position if the 'used' bit
- // in the header is set and dups are disallowed.
- invariant(dupsAllowed);
- return Status(ErrorCodes::DuplicateKeyValue, "key/value already in index");
- }
+ assertValid(_indexName, getRoot(txn), _ordering);
+ return status;
+}
- DiskLoc childLoc = childLocForPos(bucket, pos);
-
- // In current usage, rightChild is NULL for a new key and is not NULL when we are
- // promoting a split key. These are the only two cases where _insert() is called
- // currently.
- if (childLoc.isNull() || !rightChild.isNull()) {
- insertHere(txn, bucketLoc, pos, key, recordLoc, leftChild, rightChild);
+template <class BtreeLayout>
+Status BtreeLogic<BtreeLayout>::_insert(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ const KeyDataType& key,
+ const DiskLoc recordLoc,
+ bool dupsAllowed,
+ const DiskLoc leftChild,
+ const DiskLoc rightChild) {
+ invariant(key.dataSize() > 0);
+
+ int pos;
+ bool found;
+ Status findStatus = _find(txn, bucket, key, recordLoc, !dupsAllowed, &pos, &found);
+ if (!findStatus.isOK()) {
+ return findStatus;
+ }
+
+ if (found) {
+ KeyHeaderType& header = getKeyHeader(bucket, pos);
+ if (header.isUnused()) {
+ LOG(4) << "btree _insert: reusing unused key" << endl;
+ massert(17433, "_insert: reuse key but lchild is not null", leftChild.isNull());
+ massert(17434, "_insert: reuse key but rchild is not null", rightChild.isNull());
+ txn->recoveryUnit()->writing(&header)->setUsed();
return Status::OK();
}
- else {
- return _insert(txn,
- getBucket(txn, childLoc),
- childLoc,
- key,
- recordLoc,
- dupsAllowed,
- DiskLoc(),
- DiskLoc());
- }
+ // The logic in _find() prohibits finding and returning a position if the 'used' bit
+ // in the header is set and dups are disallowed.
+ invariant(dupsAllowed);
+ return Status(ErrorCodes::DuplicateKeyValue, "key/value already in index");
}
- template <class BtreeLayout>
- DiskLoc BtreeLogic<BtreeLayout>::advance(OperationContext* txn,
- const DiskLoc& bucketLoc,
- int* posInOut,
- int direction) const {
- BucketType* bucket = getBucket(txn, bucketLoc);
-
- if (*posInOut < 0 || *posInOut >= bucket->n ) {
- log() << "ASSERT failure advancing btree bucket" << endl;
- log() << " thisLoc: " << bucketLoc.toString() << endl;
- log() << " keyOfs: " << *posInOut << " n:" << bucket->n << " direction: " << direction << endl;
- // log() << bucketSummary() << endl;
- invariant(false);
- }
+ DiskLoc childLoc = childLocForPos(bucket, pos);
- // XXX document
- int adj = direction < 0 ? 1 : 0;
- int ko = *posInOut + direction;
-
- // Look down if we need to.
- DiskLoc nextDownLoc = childLocForPos(bucket, ko + adj);
- BucketType* nextDown = getBucket(txn, nextDownLoc);
- if (NULL != nextDown) {
- for (;;) {
- if (direction > 0) {
- *posInOut = 0;
- }
- else {
- *posInOut = nextDown->n - 1;
- }
- DiskLoc newNextDownLoc = childLocForPos(nextDown, *posInOut + adj);
- BucketType* newNextDownBucket = getBucket(txn, newNextDownLoc);
- if (NULL == newNextDownBucket) {
- break;
- }
- nextDownLoc = newNextDownLoc;
- nextDown = newNextDownBucket;
- }
- return nextDownLoc;
- }
+ // In current usage, rightChild is NULL for a new key and is not NULL when we are
+ // promoting a split key. These are the only two cases where _insert() is called
+ // currently.
+ if (childLoc.isNull() || !rightChild.isNull()) {
+ insertHere(txn, bucketLoc, pos, key, recordLoc, leftChild, rightChild);
+ return Status::OK();
+ } else {
+ return _insert(txn,
+ getBucket(txn, childLoc),
+ childLoc,
+ key,
+ recordLoc,
+ dupsAllowed,
+ DiskLoc(),
+ DiskLoc());
+ }
+}
+
+template <class BtreeLayout>
+DiskLoc BtreeLogic<BtreeLayout>::advance(OperationContext* txn,
+ const DiskLoc& bucketLoc,
+ int* posInOut,
+ int direction) const {
+ BucketType* bucket = getBucket(txn, bucketLoc);
+
+ if (*posInOut < 0 || *posInOut >= bucket->n) {
+ log() << "ASSERT failure advancing btree bucket" << endl;
+ log() << " thisLoc: " << bucketLoc.toString() << endl;
+ log() << " keyOfs: " << *posInOut << " n:" << bucket->n << " direction: " << direction
+ << endl;
+ // log() << bucketSummary() << endl;
+ invariant(false);
+ }
- // Looking down isn't the right choice, move forward.
- if (ko < bucket->n && ko >= 0) {
- *posInOut = ko;
- return bucketLoc;
- }
+ // 'adj' compensates for scan direction when choosing the child to descend
+ // into: the subtree between the current key and the adjacent key lives at
+ // slot 'ko' for a forward scan and at slot 'ko' + 1 for a backward scan.
+ int adj = direction < 0 ? 1 : 0;
+ int ko = *posInOut + direction;
- // Hit the end of the bucket, move up and over.
- DiskLoc childLoc = bucketLoc;
- DiskLoc ancestor = getBucket(txn, bucketLoc)->parent;
+ // Look down if we need to.
+ DiskLoc nextDownLoc = childLocForPos(bucket, ko + adj);
+ BucketType* nextDown = getBucket(txn, nextDownLoc);
+ if (NULL != nextDown) {
for (;;) {
- if (ancestor.isNull()) {
- break;
+ if (direction > 0) {
+ *posInOut = 0;
+ } else {
+ *posInOut = nextDown->n - 1;
}
- BucketType* an = getBucket(txn, ancestor);
- for (int i = 0; i < an->n; i++) {
- if (childLocForPos(an, i + adj) == childLoc) {
- *posInOut = i;
- return ancestor;
- }
+ DiskLoc newNextDownLoc = childLocForPos(nextDown, *posInOut + adj);
+ BucketType* newNextDownBucket = getBucket(txn, newNextDownLoc);
+ if (NULL == newNextDownBucket) {
+ break;
}
- invariant(direction < 0 || an->nextChild == childLoc);
- // parent exhausted also, keep going up
- childLoc = ancestor;
- ancestor = an->parent;
+ nextDownLoc = newNextDownLoc;
+ nextDown = newNextDownBucket;
}
+ return nextDownLoc;
+ }
- return DiskLoc();
+ // Looking down isn't the right choice, move forward.
+ if (ko < bucket->n && ko >= 0) {
+ *posInOut = ko;
+ return bucketLoc;
}
- template <class BtreeLayout>
- bool BtreeLogic<BtreeLayout>::keyIsUsed(OperationContext* txn,
- const DiskLoc& loc,
- const int& pos) const {
- return getKeyHeader(getBucket(txn, loc), pos).isUsed();
+ // Hit the end of the bucket, move up and over.
+ DiskLoc childLoc = bucketLoc;
+ DiskLoc ancestor = getBucket(txn, bucketLoc)->parent;
+ for (;;) {
+ if (ancestor.isNull()) {
+ break;
+ }
+ BucketType* an = getBucket(txn, ancestor);
+ for (int i = 0; i < an->n; i++) {
+ if (childLocForPos(an, i + adj) == childLoc) {
+ *posInOut = i;
+ return ancestor;
+ }
+ }
+ invariant(direction < 0 || an->nextChild == childLoc);
+ // The parent is exhausted as well; keep climbing.
+ childLoc = ancestor;
+ ancestor = an->parent;
}
- template <class BtreeLayout>
- bool BtreeLogic<BtreeLayout>::locate(OperationContext* txn,
- const BSONObj& key,
- const DiskLoc& recordLoc,
- const int direction,
- int* posOut,
- DiskLoc* bucketLocOut) const {
- // Clear out any data.
- *posOut = 0;
- *bucketLocOut = DiskLoc();
+ return DiskLoc();
+}
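advance() is the classic in-order successor/predecessor walk adapted to multi-key buckets: descend into the subtree adjacent to the current key if one exists, otherwise step to the neighboring key within the bucket, otherwise climb until the current subtree hangs off an ancestor on the correct side. On a parent-linked binary search tree the same walk reduces to two cases (there is no within-node step); a sketch with illustrative types:

    struct TNode {
        int key;
        TNode* left = nullptr;
        TNode* right = nullptr;
        TNode* parent = nullptr;
    };

    // In-order successor: the same shape as advance() with direction == 1.
    TNode* successor(TNode* n) {
        if (n->right) {  // "Look down if we need to."
            TNode* cur = n->right;
            while (cur->left) {
                cur = cur->left;
            }
            return cur;
        }
        TNode* child = n;  // "Hit the end of the bucket, move up and over."
        TNode* ancestor = n->parent;
        while (ancestor && ancestor->right == child) {
            child = ancestor;
            ancestor = ancestor->parent;
        }
        return ancestor;  // nullptr when 'n' held the maximum key
    }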
- bool found = false;
- KeyDataOwnedType owned(key);
+template <class BtreeLayout>
+bool BtreeLogic<BtreeLayout>::keyIsUsed(OperationContext* txn,
+ const DiskLoc& loc,
+ const int& pos) const {
+ return getKeyHeader(getBucket(txn, loc), pos).isUsed();
+}
- *bucketLocOut = _locate(txn, getRootLoc(txn), owned, posOut, &found, recordLoc, direction);
+template <class BtreeLayout>
+bool BtreeLogic<BtreeLayout>::locate(OperationContext* txn,
+ const BSONObj& key,
+ const DiskLoc& recordLoc,
+ const int direction,
+ int* posOut,
+ DiskLoc* bucketLocOut) const {
+ // Clear out any data.
+ *posOut = 0;
+ *bucketLocOut = DiskLoc();
- if (!found) {
- return false;
- }
+ bool found = false;
+ KeyDataOwnedType owned(key);
- skipUnusedKeys(txn, bucketLocOut, posOut, direction);
+ *bucketLocOut = _locate(txn, getRootLoc(txn), owned, posOut, &found, recordLoc, direction);
- return found;
+ if (!found) {
+ return false;
}
- /**
- * Recursively walk down the btree, looking for a match of key and recordLoc.
- * Caller should have acquired lock on bucketLoc.
- */
- template <class BtreeLayout>
- DiskLoc BtreeLogic<BtreeLayout>::_locate(OperationContext* txn,
- const DiskLoc& bucketLoc,
- const KeyDataType& key,
- int* posOut,
- bool* foundOut,
- const DiskLoc& recordLoc,
- const int direction) const {
- int position;
- BucketType* bucket = getBucket(txn, bucketLoc);
- // XXX: owned to not owned conversion(?)
- _find(txn, bucket, key, recordLoc, false, &position, foundOut);
-
- // Look in our current bucket.
- if (*foundOut) {
- *posOut = position;
- return bucketLoc;
- }
+ skipUnusedKeys(txn, bucketLocOut, posOut, direction);
- // Not in our current bucket. 'position' tells us where there may be a child.
- DiskLoc childLoc = childLocForPos(bucket, position);
+ return found;
+}
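locate() behaves like a direction-aware lower bound: on a miss it still parks the cursor on the neighboring key in the scan direction, then skips unused slots. A rough mental model in terms of std::lower_bound over a sorted vector (illustrative only, not the actual btree code path):

    #include <algorithm>
    #include <vector>

    // Returns the index of 'key', or of its neighbor in the scan direction,
    // or -1 when the scan would fall off the end -- mirroring locate()'s
    // "before or after the searched key" contract.
    int locateModel(const std::vector<int>& sorted, int key, int direction) {
        auto it = std::lower_bound(sorted.begin(), sorted.end(), key);
        if (it != sorted.end() && *it == key) {
            return static_cast<int>(it - sorted.begin());  // exact hit
        }
        if (direction > 0) {  // forward: first key greater than 'key'
            return it == sorted.end() ? -1 : static_cast<int>(it - sorted.begin());
        }
        // backward: last key less than 'key'
        return it == sorted.begin() ? -1 : static_cast<int>(it - sorted.begin()) - 1;
    }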
- if (!childLoc.isNull()) {
- DiskLoc inChild = _locate(txn, childLoc, key, posOut, foundOut, recordLoc, direction);
- if (!inChild.isNull()) {
- return inChild;
- }
+/**
+ * Recursively walk down the btree, looking for a match of key and recordLoc.
+ * Caller should have acquired lock on bucketLoc.
+ */
+template <class BtreeLayout>
+DiskLoc BtreeLogic<BtreeLayout>::_locate(OperationContext* txn,
+ const DiskLoc& bucketLoc,
+ const KeyDataType& key,
+ int* posOut,
+ bool* foundOut,
+ const DiskLoc& recordLoc,
+ const int direction) const {
+ int position;
+ BucketType* bucket = getBucket(txn, bucketLoc);
+ // XXX: owned to not owned conversion(?)
+ _find(txn, bucket, key, recordLoc, false, &position, foundOut);
+
+ // Look in our current bucket.
+ if (*foundOut) {
+ *posOut = position;
+ return bucketLoc;
+ }
+
+ // Not in our current bucket. 'position' tells us where there may be a child.
+ DiskLoc childLoc = childLocForPos(bucket, position);
+
+ if (!childLoc.isNull()) {
+ DiskLoc inChild = _locate(txn, childLoc, key, posOut, foundOut, recordLoc, direction);
+ if (!inChild.isNull()) {
+ return inChild;
}
+ }
- *posOut = position;
+ *posOut = position;
- if (direction < 0) {
- // The key *would* go to our left.
- (*posOut)--;
- if (-1 == *posOut) {
- // But there's no space for that in our bucket.
- return DiskLoc();
- }
- else {
- return bucketLoc;
- }
+ if (direction < 0) {
+ // The key *would* go to our left.
+ (*posOut)--;
+ if (-1 == *posOut) {
+ // But there's no space for that in our bucket.
+ return DiskLoc();
+ } else {
+ return bucketLoc;
}
- else {
- // The key would go to our right...
- if (bucket->n == *posOut) {
- return DiskLoc();
- }
- else {
- // But only if there is space.
- return bucketLoc;
- }
+ } else {
+ // The key would go to our right...
+ if (bucket->n == *posOut) {
+ return DiskLoc();
+ } else {
+ // But only if there is space.
+ return bucketLoc;
}
}
+}
- // TODO relcoate
- template <class BtreeLayout>
- bool BtreeLogic<BtreeLayout>::isHead(BucketType* bucket) {
- return bucket->parent.isNull();
- }
+// TODO relocate
+template <class BtreeLayout>
+bool BtreeLogic<BtreeLayout>::isHead(BucketType* bucket) {
+ return bucket->parent.isNull();
+}
- template <class BtreeLayout>
- typename BtreeLogic<BtreeLayout>::BucketType*
- BtreeLogic<BtreeLayout>::getBucket(OperationContext* txn, const RecordId id) const {
- if (id.isNull()) {
- return NULL;
- }
+template <class BtreeLayout>
+typename BtreeLogic<BtreeLayout>::BucketType* BtreeLogic<BtreeLayout>::getBucket(
+ OperationContext* txn, const RecordId id) const {
+ if (id.isNull()) {
+ return NULL;
+ }
- RecordData recordData = _recordStore->dataFor(txn, id);
+ RecordData recordData = _recordStore->dataFor(txn, id);
- // we need to be working on the raw bytes, not a transient copy
- invariant(!recordData.isOwned());
+ // we need to be working on the raw bytes, not a transient copy
+ invariant(!recordData.isOwned());
- return reinterpret_cast<BucketType*>(const_cast<char*>(recordData.data()));
- }
+ return reinterpret_cast<BucketType*>(const_cast<char*>(recordData.data()));
+}
- template <class BtreeLayout>
- typename BtreeLogic<BtreeLayout>::BucketType*
- BtreeLogic<BtreeLayout>::getRoot(OperationContext* txn) const {
- return getBucket(txn, _headManager->getHead(txn));
- }
+template <class BtreeLayout>
+typename BtreeLogic<BtreeLayout>::BucketType* BtreeLogic<BtreeLayout>::getRoot(
+ OperationContext* txn) const {
+ return getBucket(txn, _headManager->getHead(txn));
+}
- template <class BtreeLayout>
- DiskLoc
- BtreeLogic<BtreeLayout>::getRootLoc(OperationContext* txn) const {
- return DiskLoc::fromRecordId(_headManager->getHead(txn));
- }
+template <class BtreeLayout>
+DiskLoc BtreeLogic<BtreeLayout>::getRootLoc(OperationContext* txn) const {
+ return DiskLoc::fromRecordId(_headManager->getHead(txn));
+}
- template <class BtreeLayout>
- typename BtreeLogic<BtreeLayout>::BucketType*
- BtreeLogic<BtreeLayout>::childForPos(OperationContext* txn, BucketType* bucket, int pos) const {
- DiskLoc loc = childLocForPos(bucket, pos);
- return getBucket(txn, loc);
- }
+template <class BtreeLayout>
+typename BtreeLogic<BtreeLayout>::BucketType* BtreeLogic<BtreeLayout>::childForPos(
+ OperationContext* txn, BucketType* bucket, int pos) const {
+ DiskLoc loc = childLocForPos(bucket, pos);
+ return getBucket(txn, loc);
+}
- template <class BtreeLayout>
- typename BtreeLogic<BtreeLayout>::LocType&
- BtreeLogic<BtreeLayout>::childLocForPos(BucketType* bucket, int pos) {
- if (bucket->n == pos) {
- return bucket->nextChild;
- }
- else {
- return getKeyHeader(bucket, pos).prevChildBucket;
- }
+template <class BtreeLayout>
+typename BtreeLogic<BtreeLayout>::LocType& BtreeLogic<BtreeLayout>::childLocForPos(
+ BucketType* bucket, int pos) {
+ if (bucket->n == pos) {
+ return bucket->nextChild;
+ } else {
+ return getKeyHeader(bucket, pos).prevChildBucket;
}
+}
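childLocForPos() encodes the bucket layout: a bucket with n keys exposes n + 1 descent slots, where slot i lives in key i's header (prevChildBucket) and the extra right-most slot is the bucket-level nextChild. A toy model of that addressing, with hypothetical types:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    struct ToyKey {
        int key;
        int prevChild;  // subtree of entries ordered before this key
    };

    struct ToyBucket {
        std::vector<ToyKey> keys;
        int nextChild;  // subtree of entries ordered after the last key
    };

    // Same shape as childLocForPos(): n keys yield n + 1 child slots.
    int childForPos(const ToyBucket& b, std::size_t pos) {
        assert(pos <= b.keys.size());
        return pos == b.keys.size() ? b.nextChild : b.keys[pos].prevChild;
    }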
- //
- // And, template stuff.
- //
+//
+// And, template stuff.
+//
- // V0 format.
- template struct FixedWidthKey<DiskLoc>;
- template class BtreeLogic<BtreeLayoutV0>;
+// V0 format.
+template struct FixedWidthKey<DiskLoc>;
+template class BtreeLogic<BtreeLayoutV0>;
- // V1 format.
- template struct FixedWidthKey<DiskLoc56Bit>;
- template class BtreeLogic<BtreeLayoutV1>;
+// V1 format.
+template struct FixedWidthKey<DiskLoc56Bit>;
+template class BtreeLogic<BtreeLayoutV1>;
} // namespace mongo
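The explicit instantiations above are what let btree_logic.cpp keep every template definition out of the header: each supported layout is stamped out exactly once in this translation unit. The same pattern in miniature (Widget is a stand-in, not MongoDB code):

    // widget.h -- declares the template; member definitions stay out of line.
    template <class T>
    struct Widget {
        T value;
        T doubled() const;
    };

    // widget.cpp -- the out-of-line definition plus one explicit instantiation
    // per supported type, so no other translation unit needs the definition.
    template <class T>
    T Widget<T>::doubled() const {
        return value + value;
    }

    template struct Widget<int>;     // plays the role of BtreeLogic<BtreeLayoutV0>
    template struct Widget<double>;  // plays the role of BtreeLogic<BtreeLayoutV1>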
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_logic.h b/src/mongo/db/storage/mmap_v1/btree/btree_logic.h
index 48a307f3b4d..3c742170bcd 100644
--- a/src/mongo/db/storage/mmap_v1/btree/btree_logic.h
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_logic.h
@@ -41,539 +41,522 @@
namespace mongo {
- class RecordStore;
- class SavedCursorRegistry;
+class RecordStore;
+class SavedCursorRegistry;
- // Used for unit-testing only
- template <class BtreeLayout> class BtreeLogicTestBase;
- template <class BtreeLayout> class ArtificialTreeBuilder;
-
- /**
- * This is the logic for manipulating the Btree. It is (mostly) independent of the on-disk
- * format.
- */
- template <class BtreeLayout>
- class BtreeLogic {
- public:
- // AKA _keyNode
- typedef typename BtreeLayout::FixedWidthKeyType KeyHeaderType;
-
- // AKA Key
- typedef typename BtreeLayout::KeyType KeyDataType;
+// Used for unit-testing only
+template <class BtreeLayout>
+class BtreeLogicTestBase;
+template <class BtreeLayout>
+class ArtificialTreeBuilder;
- // AKA KeyOwned
- typedef typename BtreeLayout::KeyOwnedType KeyDataOwnedType;
+/**
+ * This is the logic for manipulating the Btree. It is (mostly) independent of the on-disk
+ * format.
+ */
+template <class BtreeLayout>
+class BtreeLogic {
+public:
+ // AKA _keyNode
+ typedef typename BtreeLayout::FixedWidthKeyType KeyHeaderType;
- // AKA Loc
- typedef typename BtreeLayout::LocType LocType;
+ // AKA Key
+ typedef typename BtreeLayout::KeyType KeyDataType;
- // AKA BucketBasics or BtreeBucket, either one.
- typedef typename BtreeLayout::BucketType BucketType;
+ // AKA KeyOwned
+ typedef typename BtreeLayout::KeyOwnedType KeyDataOwnedType;
- /**
- * 'head' manages the catalog information.
- * 'store' allocates and frees buckets.
- * 'ordering' is meta-information we store in the catalog.
- * 'indexName' is a string identifying the index that we use to print errors with.
- */
- BtreeLogic(HeadManager* head,
- RecordStore* store,
- SavedCursorRegistry* cursors,
- const Ordering& ordering,
- const std::string& indexName)
- : _headManager(head),
- _recordStore(store),
- _cursorRegistry(cursors),
- _ordering(ordering),
- _indexName(indexName) {
- }
+ // AKA Loc
+ typedef typename BtreeLayout::LocType LocType;
- //
- // Public-facing
- //
+ // AKA BucketBasics or BtreeBucket, either one.
+ typedef typename BtreeLayout::BucketType BucketType;
- class Builder {
- public:
- typedef typename BtreeLayout::KeyOwnedType KeyDataOwnedType;
- typedef typename BtreeLayout::KeyType KeyDataType;
+ /**
+ * 'head' manages the catalog information.
+ * 'store' allocates and frees buckets.
+ * 'ordering' is meta-information we store in the catalog.
+ * 'indexName' is a string identifying the index, used when printing error messages.
+ */
+ BtreeLogic(HeadManager* head,
+ RecordStore* store,
+ SavedCursorRegistry* cursors,
+ const Ordering& ordering,
+ const std::string& indexName)
+ : _headManager(head),
+ _recordStore(store),
+ _cursorRegistry(cursors),
+ _ordering(ordering),
+ _indexName(indexName) {}
+
+ //
+ // Public-facing
+ //
+
+ class Builder {
+ public:
+ typedef typename BtreeLayout::KeyOwnedType KeyDataOwnedType;
+ typedef typename BtreeLayout::KeyType KeyDataType;
- Status addKey(const BSONObj& key, const DiskLoc& loc);
+ Status addKey(const BSONObj& key, const DiskLoc& loc);
- private:
- friend class BtreeLogic;
+ private:
+ friend class BtreeLogic;
- class SetRightLeafLocChange;
+ class SetRightLeafLocChange;
- Builder(BtreeLogic* logic, OperationContext* txn, bool dupsAllowed);
+ Builder(BtreeLogic* logic, OperationContext* txn, bool dupsAllowed);
- /**
- * Creates and returns a new empty bucket to the right of leftSib, maintaining the
- * internal consistency of the tree. leftSib must be the right-most child of its parent
- * or it must be the root.
- */
- DiskLoc newBucket(BucketType* leftSib, DiskLoc leftSibLoc);
+ /**
+ * Creates and returns a new empty bucket to the right of leftSib, maintaining the
+ * internal consistency of the tree. leftSib must be the right-most child of its parent
+ * or it must be the root.
+ */
+ DiskLoc newBucket(BucketType* leftSib, DiskLoc leftSibLoc);
- BucketType* _getModifiableBucket(DiskLoc loc);
- BucketType* _getBucket(DiskLoc loc);
+ BucketType* _getModifiableBucket(DiskLoc loc);
+ BucketType* _getBucket(DiskLoc loc);
- // Not owned.
- BtreeLogic* _logic;
+ // Not owned.
+ BtreeLogic* _logic;
- DiskLoc _rightLeafLoc; // DiskLoc of right-most (highest) leaf bucket.
- bool _dupsAllowed;
- std::unique_ptr<KeyDataOwnedType> _keyLast;
+ DiskLoc _rightLeafLoc; // DiskLoc of right-most (highest) leaf bucket.
+ bool _dupsAllowed;
+ std::unique_ptr<KeyDataOwnedType> _keyLast;
- // Not owned.
- OperationContext* _txn;
- };
+ // Not owned.
+ OperationContext* _txn;
+ };
- /**
- * Caller owns the returned pointer.
- * 'this' must outlive the returned pointer.
- */
- Builder* newBuilder(OperationContext* txn, bool dupsAllowed);
+ /**
+ * Caller owns the returned pointer.
+ * 'this' must outlive the returned pointer.
+ */
+ Builder* newBuilder(OperationContext* txn, bool dupsAllowed);
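Since newBuilder() hands the caller a raw owning pointer while requiring 'this' to outlive it, a caller would typically wrap the result immediately. A generic sketch of that factory contract (hypothetical types, not the real API):

    #include <memory>

    struct Product {
        int v = 0;
    };

    struct Factory {
        Product* newProduct() {
            return new Product();  // caller owns the result
        }
    };

    int main() {
        Factory factory;  // must outlive everything it hands out
        std::unique_ptr<Product> p(factory.newProduct());
        return p->v;
    }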
- Status dupKeyCheck(OperationContext* txn,
- const BSONObj& key,
- const DiskLoc& loc) const;
+ Status dupKeyCheck(OperationContext* txn, const BSONObj& key, const DiskLoc& loc) const;
- Status insert(OperationContext* txn,
- const BSONObj& rawKey,
- const DiskLoc& value,
- bool dupsAllowed);
+ Status insert(OperationContext* txn,
+ const BSONObj& rawKey,
+ const DiskLoc& value,
+ bool dupsAllowed);
- /**
- * Navigates down the tree and locates the bucket and position containing a record with
- * the specified <key, recordLoc> combination.
- *
- * @return true if the exact <key, recordLoc> was found. Otherwise, false and the
- * bucketLocOut would contain the bucket containing key which is before or after the
- * searched one (dependent on the direction).
- */
- bool locate(OperationContext* txn,
- const BSONObj& key,
- const DiskLoc& recordLoc,
- const int direction,
- int* posOut,
- DiskLoc* bucketLocOut) const;
+ /**
+ * Navigates down the tree and locates the bucket and position containing a record with
+ * the specified <key, recordLoc> combination.
+ *
+ * @return true if the exact <key, recordLoc> was found. Otherwise returns false, and
+ * 'bucketLocOut' contains the bucket holding the key immediately before or after the
+ * searched one (depending on the direction).
+ */
+ bool locate(OperationContext* txn,
+ const BSONObj& key,
+ const DiskLoc& recordLoc,
+ const int direction,
+ int* posOut,
+ DiskLoc* bucketLocOut) const;
- void advance(OperationContext* txn,
- DiskLoc* bucketLocInOut,
- int* posInOut,
- int direction) const;
+ void advance(OperationContext* txn,
+ DiskLoc* bucketLocInOut,
+ int* posInOut,
+ int direction) const;
- bool exists(OperationContext* txn, const KeyDataType& key) const;
+ bool exists(OperationContext* txn, const KeyDataType& key) const;
- bool unindex(OperationContext* txn,
- const BSONObj& key,
- const DiskLoc& recordLoc);
+ bool unindex(OperationContext* txn, const BSONObj& key, const DiskLoc& recordLoc);
- bool isEmpty(OperationContext* txn) const;
+ bool isEmpty(OperationContext* txn) const;
- long long fullValidate(OperationContext*,
- long long *unusedCount,
- bool strict,
- bool dumpBuckets,
- unsigned depth) const;
+ long long fullValidate(OperationContext*,
+ long long* unusedCount,
+ bool strict,
+ bool dumpBuckets,
+ unsigned depth) const;
- DiskLoc getDiskLoc(OperationContext* txn,
- const DiskLoc& bucketLoc,
- const int keyOffset) const;
+ DiskLoc getDiskLoc(OperationContext* txn, const DiskLoc& bucketLoc, const int keyOffset) const;
- BSONObj getKey(OperationContext* txn,
- const DiskLoc& bucketLoc,
- const int keyOffset) const;
+ BSONObj getKey(OperationContext* txn, const DiskLoc& bucketLoc, const int keyOffset) const;
- DiskLoc getHead(OperationContext* txn) const {
- return DiskLoc::fromRecordId(_headManager->getHead(txn));
- }
+ DiskLoc getHead(OperationContext* txn) const {
+ return DiskLoc::fromRecordId(_headManager->getHead(txn));
+ }
- Status touch(OperationContext* txn) const;
+ Status touch(OperationContext* txn) const;
- //
- // Composite key navigation methods
- //
+ //
+ // Composite key navigation methods
+ //
- void customLocate(OperationContext* txn,
- DiskLoc* locInOut,
- int* keyOfsInOut,
- const IndexSeekPoint& seekPoint,
- int direction) const;
+ void customLocate(OperationContext* txn,
+ DiskLoc* locInOut,
+ int* keyOfsInOut,
+ const IndexSeekPoint& seekPoint,
+ int direction) const;
- void advanceTo(OperationContext*,
- DiskLoc* thisLocInOut,
- int* keyOfsInOut,
- const IndexSeekPoint& seekPoint,
- int direction) const;
+ void advanceTo(OperationContext*,
+ DiskLoc* thisLocInOut,
+ int* keyOfsInOut,
+ const IndexSeekPoint& seekPoint,
+ int direction) const;
- void restorePosition(OperationContext* txn,
- const BSONObj& savedKey,
- const DiskLoc& savedLoc,
- int direction,
- DiskLoc* bucketInOut,
- int* keyOffsetInOut) const;
+ void restorePosition(OperationContext* txn,
+ const BSONObj& savedKey,
+ const DiskLoc& savedLoc,
+ int direction,
+ DiskLoc* bucketInOut,
+ int* keyOffsetInOut) const;
- //
- // Creation and deletion
- //
+ //
+ // Creation and deletion
+ //
- /**
- * Returns OK if the index was uninitialized before, error status otherwise.
- */
- Status initAsEmpty(OperationContext* txn);
+ /**
+ * Returns OK if the index was uninitialized before, error status otherwise.
+ */
+ Status initAsEmpty(OperationContext* txn);
- //
- // Size constants
- //
+ //
+ // Size constants
+ //
- const RecordStore* getRecordStore() const { return _recordStore; }
+ const RecordStore* getRecordStore() const {
+ return _recordStore;
+ }
- SavedCursorRegistry* savedCursors() const { return _cursorRegistry; }
+ SavedCursorRegistry* savedCursors() const {
+ return _cursorRegistry;
+ }
- static int lowWaterMark();
-
- Ordering ordering() const { return _ordering; }
+ static int lowWaterMark();
- int customBSONCmp(const BSONObj& inIndex_left,
- const IndexSeekPoint& seekPoint_right,
- int direction) const;
+ Ordering ordering() const {
+ return _ordering;
+ }
- private:
- friend class BtreeLogic::Builder;
+ int customBSONCmp(const BSONObj& inIndex_left,
+ const IndexSeekPoint& seekPoint_right,
+ int direction) const;
- // Used for unit-testing only
- friend class BtreeLogicTestBase<BtreeLayout>;
- friend class ArtificialTreeBuilder<BtreeLayout>;
+private:
+ friend class BtreeLogic::Builder;
- /**
- * This is an in memory wrapper for the variable length data associated with a
- * KeyHeaderType. It points to on-disk data but is not itself on-disk data.
- *
- * This object and its BSONObj 'key' will become invalid if the KeyHeaderType data that owns
- * this it is moved within the btree. In general, a KeyWrapper should not be expected to be
- * valid after a write.
- */
- struct FullKey {
- FullKey(const BucketType* bucket, int i)
- : header(getKeyHeader(bucket, i)),
- prevChildBucket(header.prevChildBucket),
- recordLoc(header.recordLoc),
- data(bucket->data + header.keyDataOfs()) { }
+ // Used for unit-testing only
+ friend class BtreeLogicTestBase<BtreeLayout>;
+ friend class ArtificialTreeBuilder<BtreeLayout>;
- // This is actually a reference to something on-disk.
- const KeyHeaderType& header;
+ /**
+ * This is an in-memory wrapper for the variable-length data associated with a
+ * KeyHeaderType. It points to on-disk data but is not itself on-disk data.
+ *
+ * This object and its BSONObj 'key' will become invalid if the KeyHeaderType data that
+ * owns it is moved within the btree. In general, a KeyWrapper should not be expected to
+ * remain valid after a write.
+ */
+ struct FullKey {
+ FullKey(const BucketType* bucket, int i)
+ : header(getKeyHeader(bucket, i)),
+ prevChildBucket(header.prevChildBucket),
+ recordLoc(header.recordLoc),
+ data(bucket->data + header.keyDataOfs()) {}
+
+ // This is actually a reference to something on-disk.
+ const KeyHeaderType& header;
+
+ // These are actually in 'header'.
+ const LocType& prevChildBucket;
+ const LocType& recordLoc;
+
+ // This is *not* memory-mapped but its members point to something on-disk.
+ KeyDataType data;
+ };
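FullKey is a non-owning view over memory-mapped bytes, which is why the comment warns that it dies with any write that moves the underlying key. The hazard is the same as holding a view into a container that may reallocate; an illustrative model:

    #include <string>
    #include <string_view>
    #include <vector>

    int main() {
        std::vector<std::string> bucket = {"alpha", "beta"};
        std::string_view view = bucket[0];  // non-owning, like FullKey
        bucket.push_back("gamma");          // may reallocate: 'view' now dangles
        // Using 'view' here would be undefined behavior -- re-fetch instead,
        // just as a FullKey must be re-read after a write moves its bytes.
        view = bucket[0];
        return view.empty() ? 1 : 0;
    }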
- // These are actually in 'header'.
- const LocType& prevChildBucket;
- const LocType& recordLoc;
+ //
+ // Functions that depend on the templated type info but nothing in 'this'.
+ //
- // This is *not* memory-mapped but its members point to something on-disk.
- KeyDataType data;
- };
+ static LocType& childLocForPos(BucketType* bucket, int pos);
- //
- // Functions that depend on the templated type info but nothing in 'this'.
- //
+ static FullKey getFullKey(const BucketType* bucket, int i);
- static LocType& childLocForPos(BucketType* bucket, int pos);
+ static KeyHeaderType& getKeyHeader(BucketType* bucket, int i);
- static FullKey getFullKey(const BucketType* bucket, int i);
+ static const KeyHeaderType& getKeyHeader(const BucketType* bucket, int i);
- static KeyHeaderType& getKeyHeader(BucketType* bucket, int i);
+ static char* dataAt(BucketType* bucket, short ofs);
- static const KeyHeaderType& getKeyHeader(const BucketType* bucket, int i);
+ static void markUnused(BucketType* bucket, int keypos);
- static char* dataAt(BucketType* bucket, short ofs);
+ static int totalDataSize(BucketType* bucket);
- static void markUnused(BucketType* bucket, int keypos);
+ static void init(BucketType* bucket);
- static int totalDataSize(BucketType* bucket);
+ static int _alloc(BucketType* bucket, int bytes);
- static void init(BucketType* bucket);
+ static void _unalloc(BucketType* bucket, int bytes);
- static int _alloc(BucketType* bucket, int bytes);
+ static void _delKeyAtPos(BucketType* bucket, int keypos, bool mayEmpty = false);
- static void _unalloc(BucketType* bucket, int bytes);
+ static void popBack(BucketType* bucket, DiskLoc* recordLocOut, KeyDataType* keyDataOut);
- static void _delKeyAtPos(BucketType* bucket, int keypos, bool mayEmpty = false);
+ static bool mayDropKey(BucketType* bucket, int index, int refPos);
- static void popBack(BucketType* bucket, DiskLoc* recordLocOut, KeyDataType *keyDataOut);
+ static int _packedDataSize(BucketType* bucket, int refPos);
- static bool mayDropKey(BucketType* bucket, int index, int refPos);
+ static void setPacked(BucketType* bucket);
- static int _packedDataSize(BucketType* bucket, int refPos);
+ static void setNotPacked(BucketType* bucket);
- static void setPacked(BucketType* bucket);
+ static BucketType* btreemod(OperationContext* txn, BucketType* bucket);
- static void setNotPacked(BucketType* bucket);
+ static int splitPos(BucketType* bucket, int keypos);
- static BucketType* btreemod(OperationContext* txn, BucketType* bucket);
+ static void reserveKeysFront(BucketType* bucket, int nAdd);
- static int splitPos(BucketType* bucket, int keypos);
+ static void setKey(BucketType* bucket,
+ int i,
+ const DiskLoc recordLoc,
+ const KeyDataType& key,
+ const DiskLoc prevChildBucket);
- static void reserveKeysFront(BucketType* bucket, int nAdd);
+ static bool isHead(BucketType* bucket);
- static void setKey(BucketType* bucket,
- int i,
- const DiskLoc recordLoc,
- const KeyDataType &key,
- const DiskLoc prevChildBucket);
+ static void dumpBucket(const BucketType* bucket, int indentLength = 0);
- static bool isHead(BucketType* bucket);
+ static void assertValid(const std::string& ns,
+ BucketType* bucket,
+ const Ordering& ordering,
+ bool force = false);
- static void dumpBucket(const BucketType* bucket, int indentLength = 0);
+ //
+ // 'this'-specific helpers (require the record store, catalog information, ordering, or
+ // type information).
+ //
- static void assertValid(const std::string& ns,
- BucketType* bucket,
- const Ordering& ordering,
- bool force = false);
+ bool basicInsert(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int& keypos,
+ const KeyDataType& key,
+ const DiskLoc recordLoc);
+
+ void dropFront(BucketType* bucket, int nDrop, int& refpos);
+
+ void _pack(OperationContext* txn, BucketType* bucket, const DiskLoc thisLoc, int& refPos);
+
+ void customLocate(OperationContext* txn,
+ DiskLoc* locInOut,
+ int* keyOfsInOut,
+ const IndexSeekPoint& seekPoint,
+ int direction,
+ std::pair<DiskLoc, int>& bestParent) const;
+
+ Status _find(OperationContext* txn,
+ BucketType* bucket,
+ const KeyDataType& key,
+ const DiskLoc& recordLoc,
+ bool errorIfDup,
+ int* keyPositionOut,
+ bool* foundOut) const;
+
+ bool customFind(OperationContext* txn,
+ int low,
+ int high,
+ const IndexSeekPoint& seekPoint,
+ int direction,
+ DiskLoc* thisLocInOut,
+ int* keyOfsInOut,
+ std::pair<DiskLoc, int>& bestParent) const;
+
+ void advanceToImpl(OperationContext* txn,
+ DiskLoc* thisLocInOut,
+ int* keyOfsInOut,
+ const IndexSeekPoint& seekPoint,
+ int direction) const;
- //
- // 'this'-specific helpers (require record store, catalog information, or ordering, or type
- // information).
- //
+ bool wouldCreateDup(OperationContext* txn, const KeyDataType& key, const DiskLoc self) const;
- bool basicInsert(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int& keypos,
- const KeyDataType& key,
- const DiskLoc recordLoc);
+ bool keyIsUsed(OperationContext* txn, const DiskLoc& loc, const int& pos) const;
- void dropFront(BucketType* bucket, int nDrop, int& refpos);
+ void skipUnusedKeys(OperationContext* txn, DiskLoc* loc, int* pos, int direction) const;
- void _pack(OperationContext* txn, BucketType* bucket, const DiskLoc thisLoc, int &refPos);
+ DiskLoc advance(OperationContext* txn,
+ const DiskLoc& bucketLoc,
+ int* posInOut,
+ int direction) const;
- void customLocate(OperationContext* txn,
- DiskLoc* locInOut,
- int* keyOfsInOut,
- const IndexSeekPoint& seekPoint,
- int direction,
- std::pair<DiskLoc, int>& bestParent) const;
+ DiskLoc _locate(OperationContext* txn,
+ const DiskLoc& bucketLoc,
+ const KeyDataType& key,
+ int* posOut,
+ bool* foundOut,
+ const DiskLoc& recordLoc,
+ const int direction) const;
- Status _find(OperationContext* txn,
- BucketType* bucket,
- const KeyDataType& key,
- const DiskLoc& recordLoc,
- bool errorIfDup,
- int* keyPositionOut,
- bool* foundOut) const;
-
- bool customFind(OperationContext* txn,
- int low,
- int high,
- const IndexSeekPoint& seekPoint,
- int direction,
- DiskLoc* thisLocInOut,
- int* keyOfsInOut,
- std::pair<DiskLoc, int>& bestParent) const;
-
- void advanceToImpl(OperationContext* txn,
- DiskLoc* thisLocInOut,
- int* keyOfsInOut,
- const IndexSeekPoint& seekPoint,
- int direction) const;
-
- bool wouldCreateDup(OperationContext* txn,
- const KeyDataType& key,
- const DiskLoc self) const;
-
- bool keyIsUsed(OperationContext* txn, const DiskLoc& loc, const int& pos) const;
-
- void skipUnusedKeys(OperationContext* txn,
- DiskLoc* loc,
- int* pos,
- int direction) const;
-
- DiskLoc advance(OperationContext* txn,
- const DiskLoc& bucketLoc,
- int* posInOut,
- int direction) const;
-
- DiskLoc _locate(OperationContext* txn,
- const DiskLoc& bucketLoc,
- const KeyDataType& key,
- int* posOut,
- bool* foundOut,
- const DiskLoc& recordLoc,
- const int direction) const;
+ long long _fullValidate(OperationContext* txn,
+ const DiskLoc bucketLoc,
+ long long* unusedCount,
+ bool strict,
+ bool dumpBuckets,
+ unsigned depth) const;
- long long _fullValidate(OperationContext* txn,
- const DiskLoc bucketLoc,
- long long *unusedCount,
- bool strict,
- bool dumpBuckets,
- unsigned depth) const ;
+ DiskLoc _addBucket(OperationContext* txn);
- DiskLoc _addBucket(OperationContext* txn);
+ bool canMergeChildren(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ const int leftIndex);
- bool canMergeChildren(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- const int leftIndex);
+ // has to look in children of 'bucket' and requires record store
+ int _rebalancedSeparatorPos(OperationContext* txn, BucketType* bucket, int leftIndex);
- // has to look in children of 'bucket' and requires record store
- int _rebalancedSeparatorPos(OperationContext* txn,
- BucketType* bucket,
- int leftIndex);
+ void _packReadyForMod(BucketType* bucket, int& refPos);
- void _packReadyForMod(BucketType* bucket, int &refPos);
+ void truncateTo(BucketType* bucket, int N, int& refPos);
- void truncateTo(BucketType* bucket, int N, int &refPos);
+ void split(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int keypos,
+ const DiskLoc recordLoc,
+ const KeyDataType& key,
+ const DiskLoc lchild,
+ const DiskLoc rchild);
- void split(OperationContext* txn,
+ Status _insert(OperationContext* txn,
BucketType* bucket,
const DiskLoc bucketLoc,
- int keypos,
- const DiskLoc recordLoc,
const KeyDataType& key,
- const DiskLoc lchild,
- const DiskLoc rchild);
-
- Status _insert(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- const KeyDataType& key,
- const DiskLoc recordLoc,
- bool dupsAllowed,
- const DiskLoc leftChild,
- const DiskLoc rightChild);
-
- // TODO take a BucketType*?
- void insertHere(OperationContext* txn,
+ const DiskLoc recordLoc,
+ bool dupsAllowed,
+ const DiskLoc leftChild,
+ const DiskLoc rightChild);
+
+ // TODO take a BucketType*?
+ void insertHere(OperationContext* txn,
+ const DiskLoc bucketLoc,
+ int pos,
+ const KeyDataType& key,
+ const DiskLoc recordLoc,
+ const DiskLoc leftChild,
+ const DiskLoc rightChild);
+
+ std::string dupKeyError(const KeyDataType& key) const;
+
+ void setInternalKey(OperationContext* txn,
+ BucketType* bucket,
const DiskLoc bucketLoc,
- int pos,
- const KeyDataType& key,
+ int keypos,
const DiskLoc recordLoc,
- const DiskLoc leftChild,
- const DiskLoc rightChild);
+ const KeyDataType& key,
+ const DiskLoc lchild,
+ const DiskLoc rchild);
- std::string dupKeyError(const KeyDataType& key) const;
+ void fixParentPtrs(OperationContext* trans,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int firstIndex = 0,
+ int lastIndex = -1);
- void setInternalKey(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int keypos,
- const DiskLoc recordLoc,
- const KeyDataType& key,
- const DiskLoc lchild,
- const DiskLoc rchild);
+ bool mayBalanceWithNeighbors(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc);
- void fixParentPtrs(OperationContext* trans,
+ void doBalanceChildren(OperationContext* txn,
BucketType* bucket,
const DiskLoc bucketLoc,
- int firstIndex = 0,
- int lastIndex = -1);
-
- bool mayBalanceWithNeighbors(OperationContext* txn, BucketType* bucket, const DiskLoc bucketLoc);
-
- void doBalanceChildren(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int leftIndex);
-
- void doBalanceLeftToRight(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc thisLoc,
- int leftIndex,
- int split,
- BucketType* l,
- const DiskLoc lchild,
- BucketType* r,
- const DiskLoc rchild);
-
- void doBalanceRightToLeft(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc thisLoc,
- int leftIndex,
- int split,
- BucketType* l,
- const DiskLoc lchild,
- BucketType* r,
- const DiskLoc rchild);
-
- bool tryBalanceChildren(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int leftIndex);
-
- int indexInParent(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc) const;
-
- void doMergeChildren(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int leftIndex);
+ int leftIndex);
- void replaceWithNextChild(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc);
+ void doBalanceLeftToRight(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc thisLoc,
+ int leftIndex,
+ int split,
+ BucketType* l,
+ const DiskLoc lchild,
+ BucketType* r,
+ const DiskLoc rchild);
+
+ void doBalanceRightToLeft(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc thisLoc,
+ int leftIndex,
+ int split,
+ BucketType* l,
+ const DiskLoc lchild,
+ BucketType* r,
+ const DiskLoc rchild);
+
+ bool tryBalanceChildren(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int leftIndex);
- void deleteInternalKey(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int keypos);
+ int indexInParent(OperationContext* txn, BucketType* bucket, const DiskLoc bucketLoc) const;
- void delKeyAtPos(OperationContext* txn,
+ void doMergeChildren(OperationContext* txn,
BucketType* bucket,
const DiskLoc bucketLoc,
- int p);
+ int leftIndex);
- void delBucket(OperationContext* txn,
- BucketType* bucket,
- const DiskLoc bucketLoc);
+ void replaceWithNextChild(OperationContext* txn, BucketType* bucket, const DiskLoc bucketLoc);
- void deallocBucket(OperationContext* txn,
+ void deleteInternalKey(OperationContext* txn,
BucketType* bucket,
- const DiskLoc bucketLoc);
+ const DiskLoc bucketLoc,
+ int keypos);
- bool _keyIsAt(const BSONObj& savedKey,
- const DiskLoc& savedLoc,
- BucketType* bucket,
- int keyPos) const;
+ void delKeyAtPos(OperationContext* txn, BucketType* bucket, const DiskLoc bucketLoc, int p);
- /**
- * Tries to push key into bucket. Return false if it can't because key doesn't fit.
- *
- * bucket must be declared as writable by the caller.
- * The new key/recordLoc pair must be higher than any others in bucket.
- *
- * TODO needs 'this' for _ordering for sanity check
- */
- bool pushBack(BucketType* bucket,
- const DiskLoc recordLoc,
- const KeyDataType& key,
- const DiskLoc prevChild);
+ void delBucket(OperationContext* txn, BucketType* bucket, const DiskLoc bucketLoc);
+ void deallocBucket(OperationContext* txn, BucketType* bucket, const DiskLoc bucketLoc);
- BucketType* childForPos(OperationContext* txn, BucketType* bucket, int pos) const;
+ bool _keyIsAt(const BSONObj& savedKey,
+ const DiskLoc& savedLoc,
+ BucketType* bucket,
+ int keyPos) const;
- BucketType* getBucket(OperationContext* txn, const DiskLoc dl) const {
- return getBucket(txn, dl.toRecordId());
- }
- BucketType* getBucket(OperationContext* txn, const RecordId dl) const;
+ /**
+ * Tries to push 'key' into 'bucket'. Returns false if the key does not fit.
+ *
+ * bucket must be declared as writable by the caller.
+ * The new key/recordLoc pair must be higher than any others in bucket.
+ *
+ * TODO needs 'this' for _ordering for sanity check
+ */
+ bool pushBack(BucketType* bucket,
+ const DiskLoc recordLoc,
+ const KeyDataType& key,
+ const DiskLoc prevChild);
- BucketType* getRoot(OperationContext* txn) const;
- DiskLoc getRootLoc(OperationContext* txn) const;
+ BucketType* childForPos(OperationContext* txn, BucketType* bucket, int pos) const;
- //
- // Data
- //
+ BucketType* getBucket(OperationContext* txn, const DiskLoc dl) const {
+ return getBucket(txn, dl.toRecordId());
+ }
+ BucketType* getBucket(OperationContext* txn, const RecordId dl) const;
- // Not owned here.
- HeadManager* _headManager;
+ BucketType* getRoot(OperationContext* txn) const;
- // Not owned here.
- RecordStore* _recordStore;
+ DiskLoc getRootLoc(OperationContext* txn) const;
- // Not owned Here.
- SavedCursorRegistry* _cursorRegistry;
+ //
+ // Data
+ //
- Ordering _ordering;
+ // Not owned here.
+ HeadManager* _headManager;
- std::string _indexName;
- };
+ // Not owned here.
+ RecordStore* _recordStore;
+
+ // Not owned here.
+ SavedCursorRegistry* _cursorRegistry;
+
+ Ordering _ordering;
+
+ std::string _indexName;
+};
} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_logic_test.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_logic_test.cpp
index 1c0bd1c1505..b4e42196c99 100644
--- a/src/mongo/db/storage/mmap_v1/btree/btree_logic_test.cpp
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_logic_test.cpp
@@ -43,2070 +43,2244 @@
namespace mongo {
- using std::string;
+using std::string;
+
+/**
+ * This class is made a friend of BtreeLogic so that we can grant the tests access to
+ * whatever private methods they need.
+ */
+template <class BtreeLayoutType>
+class BtreeLogicTestBase {
+public:
+ typedef typename BtreeLayoutType::BucketType BucketType;
+ typedef typename BtreeLayoutType::FixedWidthKeyType FixedWidthKeyType;
+
+ typedef typename BtreeLogic<BtreeLayoutType>::FullKey FullKey;
+ typedef typename BtreeLogic<BtreeLayoutType>::KeyDataOwnedType KeyDataOwnedType;
+
+ BtreeLogicTestBase() : _helper(BSON("TheKey" << 1)) {}
+
+ virtual ~BtreeLogicTestBase() {}
+
+protected:
+ void checkValidNumKeys(int nKeys) {
+ OperationContextNoop txn;
+ ASSERT_EQUALS(nKeys, _helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ }
+
+ Status insert(const BSONObj& key, const DiskLoc dl, bool dupsAllowed = true) {
+ OperationContextNoop txn;
+ return _helper.btree.insert(&txn, key, dl, dupsAllowed);
+ }
+
+ bool unindex(const BSONObj& key) {
+ OperationContextNoop txn;
+ return _helper.btree.unindex(&txn, key, _helper.dummyDiskLoc);
+ }
+
+ void locate(const BSONObj& key,
+ int expectedPos,
+ bool expectedFound,
+ const RecordId& expectedLocation,
+ int direction) {
+ return locate(
+ key, expectedPos, expectedFound, DiskLoc::fromRecordId(expectedLocation), direction);
+ }
+ void locate(const BSONObj& key,
+ int expectedPos,
+ bool expectedFound,
+ const DiskLoc& expectedLocation,
+ int direction) {
+ int pos;
+ DiskLoc loc;
+ OperationContextNoop txn;
+ ASSERT_EQUALS(expectedFound,
+ _helper.btree.locate(&txn, key, _helper.dummyDiskLoc, direction, &pos, &loc));
+ ASSERT_EQUALS(expectedLocation, loc);
+ ASSERT_EQUALS(expectedPos, pos);
+ }
+
+ const BucketType* child(const BucketType* bucket, int i) const {
+ verify(i <= bucket->n);
+
+ DiskLoc diskLoc;
+ if (i == bucket->n) {
+ diskLoc = bucket->nextChild;
+ } else {
+ FullKey fullKey = BtreeLogic<BtreeLayoutType>::getFullKey(bucket, i);
+ diskLoc = fullKey.prevChildBucket;
+ }
+
+ verify(!diskLoc.isNull());
+
+ return _helper.btree.getBucket(NULL, diskLoc);
+ }
+
+ BucketType* head() const {
+ OperationContextNoop txn;
+ return _helper.btree.getBucket(&txn, _helper.headManager.getHead(&txn));
+ }
+
+ void forcePackBucket(const RecordId bucketLoc) {
+ BucketType* bucket = _helper.btree.getBucket(NULL, bucketLoc);
+
+ bucket->topSize += bucket->emptySize;
+ bucket->emptySize = 0;
+ BtreeLogic<BtreeLayoutType>::setNotPacked(bucket);
+ }
+
+ void truncateBucket(BucketType* bucket, int N, int& refPos) {
+ _helper.btree.truncateTo(bucket, N, refPos);
+ }
+
+ int bucketPackedDataSize(BucketType* bucket, int refPos) {
+ return _helper.btree._packedDataSize(bucket, refPos);
+ }
+
+ int bucketRebalancedSeparatorPos(const RecordId bucketLoc, int leftIndex) {
+ BucketType* bucket = _helper.btree.getBucket(NULL, bucketLoc);
+ OperationContextNoop txn;
+ return _helper.btree._rebalancedSeparatorPos(&txn, bucket, leftIndex);
+ }
+
+ FullKey getKey(const RecordId bucketLoc, int pos) const {
+ const BucketType* bucket = _helper.btree.getBucket(NULL, bucketLoc);
+ return BtreeLogic<BtreeLayoutType>::getFullKey(bucket, pos);
+ }
+
+ void markKeyUnused(const DiskLoc bucketLoc, int keyPos) {
+ BucketType* bucket = _helper.btree.getBucket(NULL, bucketLoc);
+ invariant(keyPos >= 0 && keyPos < bucket->n);
+
+ _helper.btree.getKeyHeader(bucket, keyPos).setUnused();
+ }
+
+ DiskLoc newBucket() {
+ OperationContextNoop txn;
+ return _helper.btree._addBucket(&txn);
+ }
/**
- * This class is made friend of BtreeLogic so we can add whatever private method accesses we
- * need to it, to be used by the tests.
+ * Sets the nextChild pointer for the bucket at the specified location.
*/
- template<class BtreeLayoutType>
- class BtreeLogicTestBase {
- public:
- typedef typename BtreeLayoutType::BucketType BucketType;
- typedef typename BtreeLayoutType::FixedWidthKeyType FixedWidthKeyType;
+ void setBucketNextChild(const DiskLoc bucketLoc, const DiskLoc nextChild) {
+ OperationContextNoop txn;
- typedef typename BtreeLogic<BtreeLayoutType>::FullKey FullKey;
- typedef typename BtreeLogic<BtreeLayoutType>::KeyDataOwnedType KeyDataOwnedType;
+ BucketType* bucket = _helper.btree.getBucket(&txn, bucketLoc);
+ bucket->nextChild = nextChild;
- BtreeLogicTestBase() : _helper(BSON("TheKey" << 1)) {
+ _helper.btree.fixParentPtrs(&txn, bucket, bucketLoc);
+ }
- }
+protected:
+ BtreeLogicTestHelper<BtreeLayoutType> _helper;
+};
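The fixture above works because btree_logic.h declares BtreeLogicTestBase (and ArtificialTreeBuilder) as friends, so tests can reach private state without widening the public API. The friend-fixture pattern in miniature, with hypothetical names:

    #include <cassert>

    template <class T>
    class FixtureBase;  // forward-declared so the class under test can befriend it

    template <class T>
    class Counter {
    public:
        void bump() {
            ++_n;
        }

    private:
        friend class FixtureBase<T>;  // test-only access, as with BtreeLogicTestBase
        int _n = 0;
    };

    template <class T>
    class FixtureBase {
    public:
        int raw(const Counter<T>& c) const {
            return c._n;  // reads private state on the tests' behalf
        }
    };

    int main() {
        Counter<int> c;
        c.bump();
        assert(FixtureBase<int>().raw(c) == 1);
    }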
- virtual ~BtreeLogicTestBase() {
+//
+// TESTS
+//
- }
+template <class OnDiskFormat>
+class SimpleCreate : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ this->checkValidNumKeys(0);
+ }
+};
+
+template <class OnDiskFormat>
+class SimpleInsertDelete : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ BSONObj key = simpleKey('z');
+ this->insert(key, this->_helper.dummyDiskLoc);
+
+ this->checkValidNumKeys(1);
+ this->locate(key, 0, true, this->_helper.headManager.getHead(&txn), 1);
+
+ this->unindex(key);
+
+ this->checkValidNumKeys(0);
+ this->locate(key, 0, false, DiskLoc(), 1);
+ }
+};
+
+template <class OnDiskFormat>
+class SplitUnevenBucketBase : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ for (int i = 0; i < 10; ++i) {
+ BSONObj shortKey = simpleKey(shortToken(i), 1);
+ this->insert(shortKey, this->_helper.dummyDiskLoc);
+
+ BSONObj longKey = simpleKey(longToken(i), 800);
+ this->insert(longKey, this->_helper.dummyDiskLoc);
+ }
+
+ this->checkValidNumKeys(20);
+ ASSERT_EQUALS(1, this->head()->n);
+ checkSplit();
+ }
+
+protected:
+ virtual char shortToken(int i) const = 0;
+ virtual char longToken(int i) const = 0;
+ virtual void checkSplit() = 0;
+
+ static char leftToken(int i) {
+ return 'a' + i;
+ }
+
+ static char rightToken(int i) {
+ return 'z' - i;
+ }
+};
+
+template <class OnDiskFormat>
+class SplitRightHeavyBucket : public SplitUnevenBucketBase<OnDiskFormat> {
+private:
+ virtual char shortToken(int i) const {
+ return this->leftToken(i);
+ }
+ virtual char longToken(int i) const {
+ return this->rightToken(i);
+ }
+ virtual void checkSplit() {
+ ASSERT_EQUALS(15, this->child(this->head(), 0)->n);
+ ASSERT_EQUALS(4, this->child(this->head(), 1)->n);
+ }
+};
+
+template <class OnDiskFormat>
+class SplitLeftHeavyBucket : public SplitUnevenBucketBase<OnDiskFormat> {
+private:
+ virtual char shortToken(int i) const {
+ return this->rightToken(i);
+ }
+ virtual char longToken(int i) const {
+ return this->leftToken(i);
+ }
+ virtual void checkSplit() {
+ ASSERT_EQUALS(4, this->child(this->head(), 0)->n);
+ ASSERT_EQUALS(15, this->child(this->head(), 1)->n);
+ }
+};
+
+template <class OnDiskFormat>
+class MissingLocate : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ for (int i = 0; i < 3; ++i) {
+ BSONObj k = simpleKey('b' + 2 * i);
+ this->insert(k, this->_helper.dummyDiskLoc);
+ }
+
+ locateExtended(1, 'a', 'b', this->_helper.headManager.getHead(&txn));
+ locateExtended(1, 'c', 'd', this->_helper.headManager.getHead(&txn));
+ locateExtended(1, 'e', 'f', this->_helper.headManager.getHead(&txn));
+ locateExtended(1, 'g', 'g' + 1, RecordId()); // of course, 'h' isn't in the index.
+
+ // old behavior
+ // locateExtended( -1, 'a', 'b', dl() );
+ // locateExtended( -1, 'c', 'd', dl() );
+ // locateExtended( -1, 'e', 'f', dl() );
+ // locateExtended( -1, 'g', 'f', dl() );
+
+ locateExtended(-1, 'a', 'a' - 1, RecordId()); // of course, 'a' - 1 isn't in the index
+ locateExtended(-1, 'c', 'b', this->_helper.headManager.getHead(&txn));
+ locateExtended(-1, 'e', 'd', this->_helper.headManager.getHead(&txn));
+ locateExtended(-1, 'g', 'f', this->_helper.headManager.getHead(&txn));
+ }
+
+private:
+ void locateExtended(int direction, char token, char expectedMatch, RecordId expectedLocation) {
+ const BSONObj k = simpleKey(token);
+ int expectedPos = (expectedMatch - 'b') / 2;
+
+ this->locate(k, expectedPos, false, expectedLocation, direction);
+ }
+};
+
+template <class OnDiskFormat>
+class MissingLocateMultiBucket : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ this->insert(simpleKey('A', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('B', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('C', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('D', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('E', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('F', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('G', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('H', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('J', 800), this->_helper.dummyDiskLoc);
+
+ // This causes split
+ this->insert(simpleKey('I', 800), this->_helper.dummyDiskLoc);
+
+ int pos;
+ DiskLoc loc;
+
+ // 'E' is the split point and should be in the head; the rest should be split ~50/50.
+ const BSONObj splitPoint = simpleKey('E', 800);
+ this->_helper.btree.locate(&txn, splitPoint, this->_helper.dummyDiskLoc, 1, &pos, &loc);
+ ASSERT_EQUALS(this->_helper.headManager.getHead(&txn), loc.toRecordId());
+ ASSERT_EQUALS(0, pos);
+
+ // Find the one before 'E'
+ int largePos;
+ DiskLoc largeLoc;
+ this->_helper.btree.locate(
+ &txn, splitPoint, this->_helper.dummyDiskLoc, 1, &largePos, &largeLoc);
+ this->_helper.btree.advance(&txn, &largeLoc, &largePos, -1);
+
+ // Find the one after 'E'
+ int smallPos;
+ DiskLoc smallLoc;
+ this->_helper.btree.locate(
+ &txn, splitPoint, this->_helper.dummyDiskLoc, 1, &smallPos, &smallLoc);
+ this->_helper.btree.advance(&txn, &smallLoc, &smallPos, 1);
+
+ ASSERT_NOT_EQUALS(smallLoc, largeLoc);
+ ASSERT_NOT_EQUALS(smallLoc, loc);
+ ASSERT_NOT_EQUALS(largeLoc, loc);
+ }
+};
- protected:
- void checkValidNumKeys(int nKeys) {
- OperationContextNoop txn;
- ASSERT_EQUALS(nKeys, _helper.btree.fullValidate(&txn, NULL, true, false, 0));
- }
+/**
+ * Validates that adding keys incrementally produces buckets that are 90%/10% full.
+ */
+template <class OnDiskFormat>
+class SERVER983 : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ this->insert(simpleKey('A', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('B', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('C', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('D', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('E', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('F', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('G', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('H', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('I', 800), this->_helper.dummyDiskLoc);
+
+ // This will cause split
+ this->insert(simpleKey('J', 800), this->_helper.dummyDiskLoc);
+
+ int pos;
+ DiskLoc loc;
+
+ // 'H' is the maximum 'large' interval key; 90% of keys should be < 'H' and 10% larger.
+ const BSONObj splitPoint = simpleKey('H', 800);
+ this->_helper.btree.locate(&txn, splitPoint, this->_helper.dummyDiskLoc, 1, &pos, &loc);
+ ASSERT_EQUALS(this->_helper.headManager.getHead(&txn), loc.toRecordId());
+ ASSERT_EQUALS(0, pos);
+
+ // Find the one before 'H'
+ int largePos;
+ DiskLoc largeLoc;
+ this->_helper.btree.locate(
+ &txn, splitPoint, this->_helper.dummyDiskLoc, 1, &largePos, &largeLoc);
+ this->_helper.btree.advance(&txn, &largeLoc, &largePos, -1);
+
+ // Find the one after 'H'
+ int smallPos;
+ DiskLoc smallLoc;
+ this->_helper.btree.locate(
+ &txn, splitPoint, this->_helper.dummyDiskLoc, 1, &smallPos, &smallLoc);
+ this->_helper.btree.advance(&txn, &smallLoc, &smallPos, 1);
+
+ ASSERT_NOT_EQUALS(smallLoc, largeLoc);
+ ASSERT_NOT_EQUALS(smallLoc, loc);
+ ASSERT_NOT_EQUALS(largeLoc, loc);
+ }
+};
+
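SERVER983 pins down the append-optimized split policy: when keys arrive in ascending order the split point sits at roughly 90% of the bucket rather than 50%, so the left bucket stays nearly full. A toy split-point chooser with that shape (the real splitPos() also accounts for key sizes and for inserts at the left edge):

    // Toy policy only: split near the insertion end for ascending appends,
    // evenly otherwise.
    int chooseSplitPos(int nKeys, int insertPos) {
        if (insertPos == nKeys) {   // appending at the right edge
            return nKeys * 9 / 10;  // keep ~90% of the keys in the left bucket
        }
        return nKeys / 2;  // random inserts: split evenly
    }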
+template <class OnDiskFormat>
+class DontReuseUnused : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ for (int i = 0; i < 10; ++i) {
+ const BSONObj k = simpleKey('b' + 2 * i, 800);
+ this->insert(k, this->_helper.dummyDiskLoc);
+ }
+
+ const BSONObj root = simpleKey('p', 800);
+ this->unindex(root);
+
+ this->insert(root, this->_helper.dummyDiskLoc);
+ this->locate(root, 0, true, this->head()->nextChild, 1);
+ }
+};
+
+template <class OnDiskFormat>
+class MergeBucketsTestBase : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ for (int i = 0; i < 10; ++i) {
+ const BSONObj k = simpleKey('b' + 2 * i, 800);
+ this->insert(k, this->_helper.dummyDiskLoc);
+ }
+
+ // numRecords() - 1, because this->_helper.dummyDiskLoc is actually in the record store too
+ ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL) - 1);
+
+ long long expectedCount = 10 - unindexKeys();
+ ASSERT_EQUALS(1, this->_helper.recordStore.numRecords(NULL) - 1);
+
+ long long unusedCount = 0;
+ ASSERT_EQUALS(expectedCount,
+ this->_helper.btree.fullValidate(&txn, &unusedCount, true, false, 0));
+ ASSERT_EQUALS(0, unusedCount);
+ }
+
+protected:
+ virtual int unindexKeys() = 0;
+};
+
+template <class OnDiskFormat>
+class MergeBucketsLeft : public MergeBucketsTestBase<OnDiskFormat> {
+ virtual int unindexKeys() {
+ BSONObj k = simpleKey('b', 800);
+ this->unindex(k);
+
+ k = simpleKey('b' + 2, 800);
+ this->unindex(k);
+
+ k = simpleKey('b' + 4, 800);
+ this->unindex(k);
+
+ k = simpleKey('b' + 6, 800);
+ this->unindex(k);
+
+ return 4;
+ }
+};
+
+template <class OnDiskFormat>
+class MergeBucketsRight : public MergeBucketsTestBase<OnDiskFormat> {
+ virtual int unindexKeys() {
+ const BSONObj k = simpleKey('b' + 2 * 9, 800);
+ this->unindex(k);
+ return 1;
+ }
+};
+
+template <class OnDiskFormat>
+class MergeBucketsDontReplaceHead : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ for (int i = 0; i < 18; ++i) {
+ const BSONObj k = simpleKey('a' + i, 800);
+ this->insert(k, this->_helper.dummyDiskLoc);
+ }
+
+ // numRecords(NULL) - 1, because this->_helper.dummyDiskLoc is actually in the record store too
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL) - 1);
+
+ const BSONObj k = simpleKey('a' + 17, 800);
+ this->unindex(k);
+ ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL) - 1);
+
+ long long unusedCount = 0;
+ ASSERT_EQUALS(17, this->_helper.btree.fullValidate(&txn, &unusedCount, true, false, 0));
+ ASSERT_EQUALS(0, unusedCount);
+ }
+};
- Status insert(const BSONObj &key, const DiskLoc dl, bool dupsAllowed = true) {
- OperationContextNoop txn;
- return _helper.btree.insert(&txn, key, dl, dupsAllowed);
- }
+template <class OnDiskFormat>
+class MergeBucketsDelInternal : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- bool unindex(const BSONObj &key) {
- OperationContextNoop txn;
- return _helper.btree.unindex(&txn, key, _helper.dummyDiskLoc);
- }
+ builder.makeTree("{d:{b:{a:null},bb:null,_:{c:null}},_:{f:{e:null},_:{g:null}}}");
+ ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- void locate(const BSONObj &key,
- int expectedPos,
- bool expectedFound,
- const RecordId &expectedLocation,
- int direction) {
- return locate(key, expectedPos, expectedFound, DiskLoc::fromRecordId(expectedLocation),
- direction);
- }
- void locate(const BSONObj &key,
- int expectedPos,
- bool expectedFound,
- const DiskLoc &expectedLocation,
- int direction) {
- int pos;
- DiskLoc loc;
- OperationContextNoop txn;
- ASSERT_EQUALS(expectedFound,
- _helper.btree.locate(&txn, key, _helper.dummyDiskLoc, direction, &pos, &loc));
- ASSERT_EQUALS(expectedLocation, loc);
- ASSERT_EQUALS(expectedPos, pos);
- }
+ // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
- const BucketType* child(const BucketType* bucket, int i) const {
- verify(i <= bucket->n);
+ const BSONObj k = BSON(""
+ << "bb");
+ verify(this->unindex(k));
- DiskLoc diskLoc;
- if (i == bucket->n) {
- diskLoc = bucket->nextChild;
- }
- else {
- FullKey fullKey = BtreeLogic<BtreeLayoutType>::getFullKey(bucket, i);
- diskLoc = fullKey.prevChildBucket;
- }
+ ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- verify(!diskLoc.isNull());
+ // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(6, this->_helper.recordStore.numRecords(NULL));
- return _helper.btree.getBucket(NULL, diskLoc);
- }
+ builder.checkStructure("{b:{a:null},d:{c:null},f:{e:null},_:{g:null}}");
+ }
+};
- BucketType* head() const {
- OperationContextNoop txn;
- return _helper.btree.getBucket(&txn, _helper.headManager.getHead(&txn));
- }
+template <class OnDiskFormat>
+class MergeBucketsRightNull : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- void forcePackBucket(const RecordId bucketLoc) {
- BucketType* bucket = _helper.btree.getBucket(NULL, bucketLoc);
+ builder.makeTree("{d:{b:{a:null},bb:null,cc:{c:null}},_:{f:{e:null},h:{g:null}}}");
+ ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- bucket->topSize += bucket->emptySize;
- bucket->emptySize = 0;
- BtreeLogic<BtreeLayoutType>::setNotPacked(bucket);
- }
+ // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
- void truncateBucket(BucketType* bucket, int N, int &refPos) {
- _helper.btree.truncateTo(bucket, N, refPos);
- }
+ const BSONObj k = BSON(""
+ << "bb");
+ verify(this->unindex(k));
- int bucketPackedDataSize(BucketType* bucket, int refPos) {
- return _helper.btree._packedDataSize(bucket, refPos);
- }
+ ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- int bucketRebalancedSeparatorPos(const RecordId bucketLoc, int leftIndex) {
- BucketType* bucket = _helper.btree.getBucket(NULL, bucketLoc);
- OperationContextNoop txn;
- return _helper.btree._rebalancedSeparatorPos(&txn, bucket, leftIndex);
- }
+ // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(6, this->_helper.recordStore.numRecords(NULL));
- FullKey getKey(const RecordId bucketLoc, int pos) const {
- const BucketType* bucket = _helper.btree.getBucket(NULL, bucketLoc);
- return BtreeLogic<BtreeLayoutType>::getFullKey(bucket, pos);
- }
+ builder.checkStructure("{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}");
+ }
+};
- void markKeyUnused(const DiskLoc bucketLoc, int keyPos) {
- BucketType* bucket = _helper.btree.getBucket(NULL, bucketLoc);
- invariant(keyPos >= 0 && keyPos < bucket->n);
+// This comment was here during porting; not sure what it means:
+//
+// "Not yet handling this case"
+template <class OnDiskFormat>
+class DontMergeSingleBucket : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- _helper.btree.getKeyHeader(bucket, keyPos).setUnused();
- }
+ builder.makeTree("{d:{b:{a:null},c:null}}");
- DiskLoc newBucket() {
- OperationContextNoop txn;
- return _helper.btree._addBucket(&txn);
- }
+ ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- /**
- * Sets the nextChild pointer for the bucket at the specified location.
- */
- void setBucketNextChild(const DiskLoc bucketLoc, const DiskLoc nextChild) {
- OperationContextNoop txn;
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
- BucketType* bucket = _helper.btree.getBucket(&txn, bucketLoc);
- bucket->nextChild = nextChild;
+ const BSONObj k = BSON(""
+ << "c");
+ verify(this->unindex(k));
- _helper.btree.fixParentPtrs(&txn, bucket, bucketLoc);
- }
+ ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- protected:
- BtreeLogicTestHelper<BtreeLayoutType> _helper;
- };
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
- //
- // TESTS
- //
+ builder.checkStructure("{d:{b:{a:null}}}");
+ }
+};
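+
+// That is, removing "c" from {d:{b:{a:null},c:null}} leaves {d:{b:{a:null}}}
+// with the bucket count unchanged: the lone child under d is not merged upward,
+// which is presumably the case the ported comment refers to.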
- template<class OnDiskFormat>
- class SimpleCreate : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- this->_helper.btree.initAsEmpty(&txn);
+template <class OnDiskFormat>
+class ParentMergeNonRightToLeft : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- this->checkValidNumKeys(0);
- }
- };
+ builder.makeTree("{d:{b:{a:null},bb:null,cc:{c:null}},i:{f:{e:null},h:{g:null}}}");
- template<class OnDiskFormat>
- class SimpleInsertDelete : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- this->_helper.btree.initAsEmpty(&txn);
+ ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- BSONObj key = simpleKey('z');
- this->insert(key, this->_helper.dummyDiskLoc);
+ // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
- this->checkValidNumKeys(1);
- this->locate(key, 0, true, this->_helper.headManager.getHead(&txn), 1);
+ const BSONObj k = BSON(""
+ << "bb");
+ verify(this->unindex(k));
- this->unindex(key);
+ ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- this->checkValidNumKeys(0);
- this->locate(key, 0, false, DiskLoc(), 1);
- }
- };
+ // Child does not currently replace parent in this case. Also, the tree
+ // has 6 buckets + 1 for the this->_helper.dummyDiskLoc.
+ ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
- template<class OnDiskFormat>
- class SplitUnevenBucketBase : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- this->_helper.btree.initAsEmpty(&txn);
+ builder.checkStructure("{i:{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}}");
+ }
+};
- for (int i = 0; i < 10; ++i) {
- BSONObj shortKey = simpleKey(shortToken(i), 1);
- this->insert(shortKey, this->_helper.dummyDiskLoc);
+template <class OnDiskFormat>
+class ParentMergeNonRightToRight : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- BSONObj longKey = simpleKey(longToken(i), 800);
- this->insert(longKey, this->_helper.dummyDiskLoc);
- }
+ builder.makeTree("{d:{b:{a:null},cc:{c:null}},i:{f:{e:null},ff:null,h:{g:null}}}");
- this->checkValidNumKeys(20);
- ASSERT_EQUALS(1, this->head()->n);
- checkSplit();
- }
+ ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- protected:
- virtual char shortToken(int i) const = 0;
- virtual char longToken(int i) const = 0;
- virtual void checkSplit() = 0;
+ // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
- static char leftToken(int i) {
- return 'a' + i;
- }
+ const BSONObj k = BSON(""
+ << "ff");
+ verify(this->unindex(k));
- static char rightToken(int i) {
- return 'z' - i;
- }
- };
+ ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- template<class OnDiskFormat>
- class SplitRightHeavyBucket : public SplitUnevenBucketBase<OnDiskFormat> {
- private:
- virtual char shortToken(int i) const {
- return this->leftToken(i);
- }
- virtual char longToken(int i) const {
- return this->rightToken(i);
- }
- virtual void checkSplit() {
- ASSERT_EQUALS(15, this->child(this->head(), 0)->n);
- ASSERT_EQUALS(4, this->child(this->head(), 1)->n);
- }
- };
+ // Child does not currently replace parent in this case. Also, the tree
+ // has 6 buckets + 1 for the this->_helper.dummyDiskLoc.
+ ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
- template<class OnDiskFormat>
- class SplitLeftHeavyBucket : public SplitUnevenBucketBase<OnDiskFormat> {
- private:
- virtual char shortToken(int i) const {
- return this->rightToken(i);
- }
- virtual char longToken(int i) const {
- return this->leftToken(i);
- }
- virtual void checkSplit() {
- ASSERT_EQUALS(4, this->child(this->head(), 0)->n);
- ASSERT_EQUALS(15, this->child(this->head(), 1)->n);
- }
- };
+ builder.checkStructure("{i:{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}}");
+ }
+};
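+
+// Note that ParentMergeNonRightToLeft and ParentMergeNonRightToRight converge
+// on the same final structure: the two children merge under the surviving
+// parent key "i", and, as the comments above note, the child does not replace
+// the parent.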
- template<class OnDiskFormat>
- class MissingLocate : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- this->_helper.btree.initAsEmpty(&txn);
+template <class OnDiskFormat>
+class CantMergeRightNoMerge : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- for (int i = 0; i < 3; ++i) {
- BSONObj k = simpleKey('b' + 2 * i);
- this->insert(k, this->_helper.dummyDiskLoc);
- }
-
- locateExtended(1, 'a', 'b', this->_helper.headManager.getHead(&txn));
- locateExtended(1, 'c', 'd', this->_helper.headManager.getHead(&txn));
- locateExtended(1, 'e', 'f', this->_helper.headManager.getHead(&txn));
- locateExtended(1, 'g', 'g' + 1, RecordId()); // of course, 'h' isn't in the index.
-
- // old behavior
- // locateExtended( -1, 'a', 'b', dl() );
- // locateExtended( -1, 'c', 'd', dl() );
- // locateExtended( -1, 'e', 'f', dl() );
- // locateExtended( -1, 'g', 'f', dl() );
-
- locateExtended(-1, 'a', 'a' - 1, RecordId()); // of course, 'a' - 1 isn't in the index
- locateExtended(-1, 'c', 'b', this->_helper.headManager.getHead(&txn));
- locateExtended(-1, 'e', 'd', this->_helper.headManager.getHead(&txn));
- locateExtended(-1, 'g', 'f', this->_helper.headManager.getHead(&txn));
- }
+ builder.makeTree(
+ "{d:{b:{a:null},bb:null,cc:{c:null}},"
+ "dd:null,"
+ "_:{f:{e:null},h:{g:null}}}");
- private:
- void locateExtended(
- int direction, char token, char expectedMatch, RecordId expectedLocation) {
- const BSONObj k = simpleKey(token);
- int expectedPos = (expectedMatch - 'b') / 2;
+ ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- this->locate(k, expectedPos, false, expectedLocation, direction);
- }
- };
+ // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
- template<class OnDiskFormat>
- class MissingLocateMultiBucket : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- this->_helper.btree.initAsEmpty(&txn);
-
- this->insert(simpleKey('A', 800), this->_helper.dummyDiskLoc);
- this->insert(simpleKey('B', 800), this->_helper.dummyDiskLoc);
- this->insert(simpleKey('C', 800), this->_helper.dummyDiskLoc);
- this->insert(simpleKey('D', 800), this->_helper.dummyDiskLoc);
- this->insert(simpleKey('E', 800), this->_helper.dummyDiskLoc);
- this->insert(simpleKey('F', 800), this->_helper.dummyDiskLoc);
- this->insert(simpleKey('G', 800), this->_helper.dummyDiskLoc);
- this->insert(simpleKey('H', 800), this->_helper.dummyDiskLoc);
- this->insert(simpleKey('J', 800), this->_helper.dummyDiskLoc);
-
-            // This causes a split
- this->insert(simpleKey('I', 800), this->_helper.dummyDiskLoc);
-
- int pos;
- DiskLoc loc;
-
-            // 'E' is the split point and should be in the head; the rest should be ~50/50
- const BSONObj splitPoint = simpleKey('E', 800);
- this->_helper.btree.locate(&txn, splitPoint, this->_helper.dummyDiskLoc, 1, &pos, &loc);
- ASSERT_EQUALS(this->_helper.headManager.getHead(&txn), loc.toRecordId());
- ASSERT_EQUALS(0, pos);
-
- // Find the one before 'E'
- int largePos;
- DiskLoc largeLoc;
- this->_helper.btree.locate(&txn, splitPoint, this->_helper.dummyDiskLoc, 1, &largePos, &largeLoc);
- this->_helper.btree.advance(&txn, &largeLoc, &largePos, -1);
-
- // Find the one after 'E'
- int smallPos;
- DiskLoc smallLoc;
- this->_helper.btree.locate(&txn, splitPoint, this->_helper.dummyDiskLoc, 1, &smallPos, &smallLoc);
- this->_helper.btree.advance(&txn, &smallLoc, &smallPos, 1);
-
- ASSERT_NOT_EQUALS(smallLoc, largeLoc);
- ASSERT_NOT_EQUALS(smallLoc, loc);
- ASSERT_NOT_EQUALS(largeLoc, loc);
- }
- };
+ const BSONObj k = BSON(""
+ << "bb");
+ verify(this->unindex(k));
- /**
-     * Validates that adding keys incrementally produces buckets that are 90%/10% full.
- */
- template<class OnDiskFormat>
- class SERVER983 : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- this->_helper.btree.initAsEmpty(&txn);
-
- this->insert(simpleKey('A', 800), this->_helper.dummyDiskLoc);
- this->insert(simpleKey('B', 800), this->_helper.dummyDiskLoc);
- this->insert(simpleKey('C', 800), this->_helper.dummyDiskLoc);
- this->insert(simpleKey('D', 800), this->_helper.dummyDiskLoc);
- this->insert(simpleKey('E', 800), this->_helper.dummyDiskLoc);
- this->insert(simpleKey('F', 800), this->_helper.dummyDiskLoc);
- this->insert(simpleKey('G', 800), this->_helper.dummyDiskLoc);
- this->insert(simpleKey('H', 800), this->_helper.dummyDiskLoc);
- this->insert(simpleKey('I', 800), this->_helper.dummyDiskLoc);
-
-            // This will cause a split
- this->insert(simpleKey('J', 800), this->_helper.dummyDiskLoc);
-
- int pos;
- DiskLoc loc;
-
-            // 'H' is the maximum 'large' interval key; 90% should be < 'H' and 10% larger
- const BSONObj splitPoint = simpleKey('H', 800);
- this->_helper.btree.locate(&txn, splitPoint, this->_helper.dummyDiskLoc, 1, &pos, &loc);
- ASSERT_EQUALS(this->_helper.headManager.getHead(&txn), loc.toRecordId());
- ASSERT_EQUALS(0, pos);
-
- // Find the one before 'H'
- int largePos;
- DiskLoc largeLoc;
- this->_helper.btree.locate(&txn,
- splitPoint, this->_helper.dummyDiskLoc, 1, &largePos, &largeLoc);
- this->_helper.btree.advance(&txn, &largeLoc, &largePos, -1);
-
- // Find the one after 'H'
- int smallPos;
- DiskLoc smallLoc;
- this->_helper.btree.locate(&txn,
- splitPoint, this->_helper.dummyDiskLoc, 1, &smallPos, &smallLoc);
- this->_helper.btree.advance(&txn, &smallLoc, &smallPos, 1);
-
- ASSERT_NOT_EQUALS(smallLoc, largeLoc);
- ASSERT_NOT_EQUALS(smallLoc, loc);
- ASSERT_NOT_EQUALS(largeLoc, loc);
- }
- };
+ ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- template<class OnDiskFormat>
- class DontReuseUnused : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- this->_helper.btree.initAsEmpty(&txn);
+ // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
- for (int i = 0; i < 10; ++i) {
- const BSONObj k = simpleKey('b' + 2 * i, 800);
- this->insert(k, this->_helper.dummyDiskLoc);
- }
+ builder.checkStructure(
+ "{d:{b:{a:null},cc:{c:null}},"
+ "dd:null,"
+ "_:{f:{e:null},h:{g:null}}}");
+ }
+};
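+
+// Deleting "bb" here drops the key count from 11 to 10, but the extra top-level
+// separator "dd" means no neighboring bucket can absorb the underfull bucket's
+// contents, so the bucket count stays at 7 (+1 for the dummy record).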
- const BSONObj root = simpleKey('p', 800);
- this->unindex(root);
+template <class OnDiskFormat>
+class CantMergeLeftNoMerge : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- this->insert(root, this->_helper.dummyDiskLoc);
- this->locate(root, 0, true, this->head()->nextChild, 1);
- }
- };
+ builder.makeTree("{c:{b:{a:null}},d:null,_:{f:{e:null},g:null}}");
- template<class OnDiskFormat>
- class MergeBucketsTestBase : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- this->_helper.btree.initAsEmpty(&txn);
+ ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- for (int i = 0; i < 10; ++i) {
- const BSONObj k = simpleKey('b' + 2 * i, 800);
- this->insert(k, this->_helper.dummyDiskLoc);
- }
-
- // numRecords() - 1, because this->_helper.dummyDiskLoc is actually in the record store too
- ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL) - 1);
+ // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(6, this->_helper.recordStore.numRecords(NULL));
- long long expectedCount = 10 - unindexKeys();
- ASSERT_EQUALS(1, this->_helper.recordStore.numRecords(NULL) - 1);
+ const BSONObj k = BSON(""
+ << "g");
+ verify(this->unindex(k));
- long long unusedCount = 0;
- ASSERT_EQUALS(expectedCount, this->_helper.btree.fullValidate(&txn, &unusedCount, true, false, 0));
- ASSERT_EQUALS(0, unusedCount);
- }
+ ASSERT_EQUALS(6, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- protected:
- virtual int unindexKeys() = 0;
- };
+ // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(6, this->_helper.recordStore.numRecords(NULL));
- template<class OnDiskFormat>
- class MergeBucketsLeft : public MergeBucketsTestBase<OnDiskFormat> {
- virtual int unindexKeys() {
- BSONObj k = simpleKey('b', 800);
- this->unindex(k);
+ builder.checkStructure("{c:{b:{a:null}},d:null,_:{f:{e:null}}}");
+ }
+};
- k = simpleKey('b' + 2, 800);
- this->unindex(k);
+template <class OnDiskFormat>
+class MergeOption : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- k = simpleKey('b' + 4, 800);
- this->unindex(k);
+ builder.makeTree("{c:{b:{a:null}},f:{e:{d:null},ee:null},_:{h:{g:null}}}");
- k = simpleKey('b' + 6, 800);
- this->unindex(k);
+ ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- return 4;
- }
- };
+ // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
- template<class OnDiskFormat>
- class MergeBucketsRight : public MergeBucketsTestBase<OnDiskFormat> {
- virtual int unindexKeys() {
- const BSONObj k = simpleKey('b' + 2 * 9, 800);
- this->unindex(k);
- return 1;
- }
- };
+ const BSONObj k = BSON(""
+ << "ee");
+ verify(this->unindex(k));
- template<class OnDiskFormat>
- class MergeBucketsDontReplaceHead : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- this->_helper.btree.initAsEmpty(&txn);
+ ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- for (int i = 0; i < 18; ++i) {
- const BSONObj k = simpleKey('a' + i, 800);
- this->insert(k, this->_helper.dummyDiskLoc);
- }
+ // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
-            // numRecords(NULL) - 1, because this->_helper.dummyDiskLoc is actually in the record store too
- ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL) - 1);
+ builder.checkStructure("{c:{b:{a:null}},_:{e:{d:null},f:null,h:{g:null}}}");
+ }
+};
- const BSONObj k = simpleKey('a' + 17, 800);
- this->unindex(k);
- ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL) - 1);
+template <class OnDiskFormat>
+class ForceMergeLeft : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- long long unusedCount = 0;
- ASSERT_EQUALS(17, this->_helper.btree.fullValidate(&txn, &unusedCount, true, false, 0));
- ASSERT_EQUALS(0, unusedCount);
- }
- };
+ builder.makeTree("{c:{b:{a:null}},f:{e:{d:null},ee:null},ff:null,_:{h:{g:null}}}");
- template<class OnDiskFormat>
- class MergeBucketsDelInternal : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+ ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- builder.makeTree("{d:{b:{a:null},bb:null,_:{c:null}},_:{f:{e:null},_:{g:null}}}");
- ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
- // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
+ const BSONObj k = BSON(""
+ << "ee");
+ verify(this->unindex(k));
- const BSONObj k = BSON("" << "bb");
- verify(this->unindex(k));
+ ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
- // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(6, this->_helper.recordStore.numRecords(NULL));
+ builder.checkStructure("{f:{b:{a:null},c:null,e:{d:null}},ff:null,_:{h:{g:null}}}");
+ }
+};
- builder.checkStructure("{b:{a:null},d:{c:null},f:{e:null},_:{g:null}}");
- }
- };
+template <class OnDiskFormat>
+class ForceMergeRight : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- template<class OnDiskFormat>
- class MergeBucketsRightNull : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+ builder.makeTree("{c:{b:{a:null}},cc:null,f:{e:{d:null},ee:null},_:{h:{g:null}}}");
- builder.makeTree("{d:{b:{a:null},bb:null,cc:{c:null}},_:{f:{e:null},h:{g:null}}}");
- ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
+ // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
- const BSONObj k = BSON("" << "bb");
- verify(this->unindex(k));
+ const BSONObj k = BSON(""
+ << "ee");
+ verify(this->unindex(k));
- ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(6, this->_helper.recordStore.numRecords(NULL));
+ // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
- builder.checkStructure("{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}");
- }
- };
+ builder.checkStructure("{c:{b:{a:null}},cc:null,_:{e:{d:null},f:null,h:{g:null}}}");
+ }
+};
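+
+// ForceMergeLeft and ForceMergeRight are mirror images: the extra top-level key
+// ("ff" or "cc") pins one side, so after "ee" is deleted the underfull bucket
+// merges with its left or right sibling respectively, dropping from 7 to 6
+// buckets either way.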
- // This comment was here during porting, not sure what it means:
- //
- // "Not yet handling this case"
- template<class OnDiskFormat>
- class DontMergeSingleBucket : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+template <class OnDiskFormat>
+class RecursiveMerge : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- builder.makeTree("{d:{b:{a:null},c:null}}");
+ builder.makeTree("{h:{e:{b:{a:null},c:null,d:null},g:{f:null}},j:{i:null}}");
- ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
+ // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
- const BSONObj k = BSON("" << "c");
- verify(this->unindex(k));
+ const BSONObj k = BSON(""
+ << "c");
+ verify(this->unindex(k));
- ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
- builder.checkStructure("{d:{b:{a:null}}}");
- }
- };
+ // Height is not currently reduced in this case
+ builder.checkStructure("{j:{g:{b:{a:null},d:null,e:null,f:null},h:null,i:null}}");
+ }
+};
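+
+// Worked count for RecursiveMerge: the spec holds keys a through j (10 keys in
+// 6 buckets). Removing "c" leaves 9 keys, and the resulting underflow cascades
+// through two merges, taking the store from 6 + 1 records to 4 + 1, though (as
+// noted above) the height is not reduced.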
- template<class OnDiskFormat>
- class ParentMergeNonRightToLeft : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+template <class OnDiskFormat>
+class RecursiveMergeRightBucket : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- builder.makeTree("{d:{b:{a:null},bb:null,cc:{c:null}},i:{f:{e:null},h:{g:null}}}");
+ builder.makeTree("{h:{e:{b:{a:null},c:null,d:null},g:{f:null}},_:{i:null}}");
- ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
+ // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
- const BSONObj k = BSON("" << "bb");
- verify(this->unindex(k));
+ const BSONObj k = BSON(""
+ << "c");
+ verify(this->unindex(k));
- ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- // Child does not currently replace parent in this case. Also, the tree
- // has 6 buckets + 1 for the this->_helper.dummyDiskLoc.
- ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
- builder.checkStructure("{i:{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}}");
- }
- };
+ builder.checkStructure("{g:{b:{a:null},d:null,e:null,f:null},h:null,i:null}");
+ }
+};
- template<class OnDiskFormat>
- class ParentMergeNonRightToRight : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+template <class OnDiskFormat>
+class RecursiveMergeDoubleRightBucket : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- builder.makeTree("{d:{b:{a:null},cc:{c:null}},i:{f:{e:null},ff:null,h:{g:null}}}");
+ builder.makeTree("{h:{e:{b:{a:null},c:null,d:null},_:{f:null}},_:{i:null}}");
- ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
+ // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
- const BSONObj k = BSON("" << "ff");
- verify(this->unindex(k));
+ const BSONObj k = BSON(""
+ << "c");
+ verify(this->unindex(k));
- ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- // Child does not currently replace parent in this case. Also, the tree
- // has 6 buckets + 1 for the this->_helper.dummyDiskLoc.
- ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
- builder.checkStructure("{i:{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}}");
- }
- };
+ // no recursion currently in this case
+ builder.checkStructure("{h:{b:{a:null},d:null,e:null,f:null},_:{i:null}}");
+ }
+};
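+
+// Compare with RecursiveMergeRightBucket above, where the merge does cascade
+// (6 buckets down to 3): in this double-right layout it stops after one merge
+// (6 buckets down to 4), which is what the "no recursion currently in this
+// case" comment records.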
- template<class OnDiskFormat>
- class CantMergeRightNoMerge : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+template <class OnDiskFormat>
+class MergeSizeTestBase : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ MergeSizeTestBase() : _count(0) {}
- builder.makeTree("{d:{b:{a:null},bb:null,cc:{c:null}},"
- "dd:null,"
- "_:{f:{e:null},h:{g:null}}}");
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
- ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
+ const BSONObj& topKey = biggestKey('m');
- const BSONObj k = BSON("" << "bb");
- verify(this->unindex(k));
+ DiskLoc leftChild = this->newBucket();
+ builder.push(
+ DiskLoc::fromRecordId(this->_helper.headManager.getHead(&txn)), topKey, leftChild);
+ _count++;
- ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ DiskLoc rightChild = this->newBucket();
+ this->setBucketNextChild(DiskLoc::fromRecordId(this->_helper.headManager.getHead(&txn)),
+ rightChild);
- // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
+ _count += builder.fillBucketToExactSize(leftChild, leftSize(), 'a');
+ _count += builder.fillBucketToExactSize(rightChild, rightSize(), 'n');
- builder.checkStructure("{d:{b:{a:null},cc:{c:null}},"
- "dd:null,"
- "_:{f:{e:null},h:{g:null}}}");
+ ASSERT(leftAdditional() <= 2);
+ if (leftAdditional() >= 2) {
+ builder.push(leftChild, bigKey('k'), DiskLoc());
}
- };
-
- template<class OnDiskFormat>
- class CantMergeLeftNoMerge : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
-
- builder.makeTree("{c:{b:{a:null}},d:null,_:{f:{e:null},g:null}}");
-
- ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(6, this->_helper.recordStore.numRecords(NULL));
-
- const BSONObj k = BSON("" << "g");
- verify(this->unindex(k));
-
- ASSERT_EQUALS(6, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(6, this->_helper.recordStore.numRecords(NULL));
-
- builder.checkStructure("{c:{b:{a:null}},d:null,_:{f:{e:null}}}");
+ if (leftAdditional() >= 1) {
+ builder.push(leftChild, bigKey('l'), DiskLoc());
}
- };
-
- template<class OnDiskFormat>
- class MergeOption : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
-
- builder.makeTree("{c:{b:{a:null}},f:{e:{d:null},ee:null},_:{h:{g:null}}}");
-
- ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
-
- const BSONObj k = BSON("" << "ee");
- verify(this->unindex(k));
-
- ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
-
- builder.checkStructure("{c:{b:{a:null}},_:{e:{d:null},f:null,h:{g:null}}}");
+ ASSERT(rightAdditional() <= 2);
+ if (rightAdditional() >= 2) {
+ builder.push(rightChild, bigKey('y'), DiskLoc());
}
- };
-
- template<class OnDiskFormat>
- class ForceMergeLeft : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
-
- builder.makeTree("{c:{b:{a:null}},f:{e:{d:null},ee:null},ff:null,_:{h:{g:null}}}");
-
- ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
-
- const BSONObj k = BSON("" << "ee");
- verify(this->unindex(k));
-
- ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
-
- builder.checkStructure("{f:{b:{a:null},c:null,e:{d:null}},ff:null,_:{h:{g:null}}}");
- }
- };
-
- template<class OnDiskFormat>
- class ForceMergeRight : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
-
- builder.makeTree("{c:{b:{a:null}},cc:null,f:{e:{d:null},ee:null},_:{h:{g:null}}}");
-
- ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
-
- const BSONObj k = BSON("" << "ee");
- verify(this->unindex(k));
-
- ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
-
- builder.checkStructure("{c:{b:{a:null}},cc:null,_:{e:{d:null},f:null,h:{g:null}}}");
- }
- };
-
- template<class OnDiskFormat>
- class RecursiveMerge : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
-
- builder.makeTree("{h:{e:{b:{a:null},c:null,d:null},g:{f:null}},j:{i:null}}");
-
- ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
-
- const BSONObj k = BSON("" << "c");
- verify(this->unindex(k));
-
- ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
-
- // Height is not currently reduced in this case
- builder.checkStructure("{j:{g:{b:{a:null},d:null,e:null,f:null},h:null,i:null}}");
+ if (rightAdditional() >= 1) {
+ builder.push(rightChild, bigKey('z'), DiskLoc());
}
- };
-
- template<class OnDiskFormat>
- class RecursiveMergeRightBucket : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
-
- builder.makeTree("{h:{e:{b:{a:null},c:null,d:null},g:{f:null}},_:{i:null}}");
-
- ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
-
- const BSONObj k = BSON("" << "c");
- verify(this->unindex(k));
-
- ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
-
- builder.checkStructure("{g:{b:{a:null},d:null,e:null,f:null},h:null,i:null}");
- }
- };
-
- template<class OnDiskFormat>
- class RecursiveMergeDoubleRightBucket : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
-
- builder.makeTree("{h:{e:{b:{a:null},c:null,d:null},_:{f:null}},_:{i:null}}");
-
- ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
-
- const BSONObj k = BSON("" << "c");
- verify(this->unindex(k));
-
- ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
-
- // no recursion currently in this case
- builder.checkStructure("{h:{b:{a:null},d:null,e:null,f:null},_:{i:null}}");
- }
- };
-
- template<class OnDiskFormat>
- class MergeSizeTestBase : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- MergeSizeTestBase() : _count(0) {
-
- }
-
- void run() {
- OperationContextNoop txn;
- this->_helper.btree.initAsEmpty(&txn);
-
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
-
- const BSONObj& topKey = biggestKey('m');
-
- DiskLoc leftChild = this->newBucket();
- builder.push(DiskLoc::fromRecordId(this->_helper.headManager.getHead(&txn)), topKey,
- leftChild);
- _count++;
-
- DiskLoc rightChild = this->newBucket();
- this->setBucketNextChild(DiskLoc::fromRecordId(this->_helper.headManager.getHead(&txn)),
- rightChild);
-
- _count += builder.fillBucketToExactSize(leftChild, leftSize(), 'a');
- _count += builder.fillBucketToExactSize(rightChild, rightSize(), 'n');
-
- ASSERT(leftAdditional() <= 2);
- if (leftAdditional() >= 2) {
- builder.push(leftChild, bigKey('k'), DiskLoc());
- }
- if (leftAdditional() >= 1) {
- builder.push(leftChild, bigKey('l'), DiskLoc());
- }
-
- ASSERT(rightAdditional() <= 2);
- if (rightAdditional() >= 2) {
- builder.push(rightChild, bigKey('y'), DiskLoc());
- }
- if (rightAdditional() >= 1) {
- builder.push(rightChild, bigKey('z'), DiskLoc());
- }
-
- _count += leftAdditional() + rightAdditional();
-
- initCheck();
-
- const char *keys = delKeys();
- for (const char *i = keys; *i; ++i) {
- long long unused = 0;
- ASSERT_EQUALS(_count, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
- ASSERT_EQUALS(0, unused);
- // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
+ _count += leftAdditional() + rightAdditional();
- const BSONObj k = bigKey(*i);
- this->unindex(k);
-
- --_count;
- }
+ initCheck();
+ const char* keys = delKeys();
+ for (const char* i = keys; *i; ++i) {
long long unused = 0;
ASSERT_EQUALS(_count, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
ASSERT_EQUALS(0, unused);
- validate();
-
- if (!merge()) {
- // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
- }
- else {
- // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL));
- }
- }
-
- protected:
- virtual int leftAdditional() const { return 2; }
- virtual int rightAdditional() const { return 2; }
- virtual void initCheck() {}
- virtual void validate() {}
- virtual int leftSize() const = 0;
- virtual int rightSize() const = 0;
- virtual const char * delKeys() const { return "klyz"; }
- virtual bool merge() const { return true; }
-
- static BSONObj bigKey(char a) {
- return simpleKey(a, 801);
- }
-
- static BSONObj biggestKey(char a) {
- int size = OnDiskFormat::KeyMax - bigSize() + 801;
- return simpleKey(a, size);
- }
-
- static int bigSize() {
- return typename BtreeLogicTestBase<OnDiskFormat>::KeyDataOwnedType(bigKey('a')).dataSize();
- }
-
- static int biggestSize() {
- return typename BtreeLogicTestBase<OnDiskFormat>::KeyDataOwnedType(biggestKey('a')).dataSize();
- }
-
- int _count;
- };
-
- template<class OnDiskFormat>
- class MergeSizeJustRightRight : public MergeSizeTestBase<OnDiskFormat> {
- protected:
- virtual int rightSize() const {
- return BtreeLogic<OnDiskFormat>::lowWaterMark() - 1;
- }
-
- virtual int leftSize() const {
- return OnDiskFormat::BucketBodySize -
- MergeSizeTestBase<OnDiskFormat>::biggestSize() -
- sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) -
- (BtreeLogic<OnDiskFormat>::lowWaterMark() - 1);
- }
- };
-
- template<class OnDiskFormat>
- class MergeSizeJustRightLeft : public MergeSizeTestBase<OnDiskFormat> {
- protected:
- virtual int leftSize() const {
- return BtreeLogic<OnDiskFormat>::lowWaterMark() - 1;
- }
-
- virtual int rightSize() const {
- return OnDiskFormat::BucketBodySize -
- MergeSizeTestBase<OnDiskFormat>::biggestSize() -
- sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) -
- (BtreeLogic<OnDiskFormat>::lowWaterMark() - 1);
- }
-
- virtual const char * delKeys() const { return "yzkl"; }
- };
-
- template<class OnDiskFormat>
- class MergeSizeRight : public MergeSizeJustRightRight<OnDiskFormat> {
- virtual int rightSize() const { return MergeSizeJustRightRight<OnDiskFormat>::rightSize() - 1; }
- virtual int leftSize() const { return MergeSizeJustRightRight<OnDiskFormat>::leftSize() + 1; }
- };
-
- template<class OnDiskFormat>
- class MergeSizeLeft : public MergeSizeJustRightLeft<OnDiskFormat> {
- virtual int rightSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() + 1; }
- virtual int leftSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::leftSize() - 1; }
- };
-
- template<class OnDiskFormat>
- class NoMergeBelowMarkRight : public MergeSizeJustRightRight<OnDiskFormat> {
- virtual int rightSize() const { return MergeSizeJustRightRight<OnDiskFormat>::rightSize() + 1; }
- virtual int leftSize() const { return MergeSizeJustRightRight<OnDiskFormat>::leftSize() - 1; }
- virtual bool merge() const { return false; }
- };
-
- template<class OnDiskFormat>
- class NoMergeBelowMarkLeft : public MergeSizeJustRightLeft<OnDiskFormat> {
- virtual int rightSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() - 1; }
- virtual int leftSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::leftSize() + 1; }
- virtual bool merge() const { return false; }
- };
-
- template<class OnDiskFormat>
- class MergeSizeRightTooBig : public MergeSizeJustRightLeft<OnDiskFormat> {
- virtual int rightSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() + 1; }
- virtual bool merge() const { return false; }
- };
-
- template<class OnDiskFormat>
- class MergeSizeLeftTooBig : public MergeSizeJustRightRight<OnDiskFormat> {
- virtual int leftSize() const { return MergeSizeJustRightRight<OnDiskFormat>::leftSize() + 1; }
- virtual bool merge() const { return false; }
- };
-
- template<class OnDiskFormat>
- class MergeRightEmpty : public MergeSizeTestBase<OnDiskFormat> {
- protected:
- virtual int rightAdditional() const { return 1; }
- virtual int leftAdditional() const { return 1; }
- virtual const char * delKeys() const { return "lz"; }
- virtual int rightSize() const { return 0; }
- virtual int leftSize() const {
- return OnDiskFormat::BucketBodySize -
- MergeSizeTestBase<OnDiskFormat>::biggestSize() -
- sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType);
- }
- };
-
- template<class OnDiskFormat>
- class MergeMinRightEmpty : public MergeSizeTestBase<OnDiskFormat> {
- protected:
- virtual int rightAdditional() const { return 1; }
- virtual int leftAdditional() const { return 0; }
- virtual const char * delKeys() const { return "z"; }
- virtual int rightSize() const { return 0; }
- virtual int leftSize() const {
- return MergeSizeTestBase<OnDiskFormat>::bigSize() +
- sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType);
- }
- };
-
- template<class OnDiskFormat>
- class MergeLeftEmpty : public MergeSizeTestBase<OnDiskFormat> {
- protected:
- virtual int rightAdditional() const { return 1; }
- virtual int leftAdditional() const { return 1; }
- virtual const char * delKeys() const { return "zl"; }
- virtual int leftSize() const { return 0; }
- virtual int rightSize() const {
- return OnDiskFormat::BucketBodySize -
- MergeSizeTestBase<OnDiskFormat>::biggestSize() -
- sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType);
- }
- };
-
- template<class OnDiskFormat>
- class MergeMinLeftEmpty : public MergeSizeTestBase<OnDiskFormat> {
- protected:
- virtual int leftAdditional() const { return 1; }
- virtual int rightAdditional() const { return 0; }
- virtual const char * delKeys() const { return "l"; }
- virtual int leftSize() const { return 0; }
- virtual int rightSize() const {
- return MergeSizeTestBase<OnDiskFormat>::bigSize() +
- sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType);
- }
- };
-
- template<class OnDiskFormat>
- class BalanceRightEmpty : public MergeRightEmpty<OnDiskFormat> {
- protected:
- virtual int leftSize() const {
- return OnDiskFormat::BucketBodySize -
- MergeSizeTestBase<OnDiskFormat>::biggestSize() -
- sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) + 1;
- }
-
- virtual bool merge() const { return false; }
-
- virtual void initCheck() {
- OperationContextNoop txn;
- _oldTop = this->getKey(this->_helper.headManager.getHead(&txn), 0).data.toBson();
- }
-
- virtual void validate() {
- OperationContextNoop txn;
- ASSERT_NOT_EQUALS(_oldTop,
- this->getKey(this->_helper.headManager.getHead(&txn), 0).data.toBson());
- }
-
- private:
- BSONObj _oldTop;
- };
-
- template<class OnDiskFormat>
- class BalanceLeftEmpty : public MergeLeftEmpty<OnDiskFormat> {
- protected:
- virtual int rightSize() const {
- return OnDiskFormat::BucketBodySize -
- MergeSizeTestBase<OnDiskFormat>::biggestSize() -
- sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) + 1;
- }
-
- virtual bool merge() const { return false; }
-
- virtual void initCheck() {
- OperationContextNoop txn;
- _oldTop = this->getKey(this->_helper.headManager.getHead(&txn), 0).data.toBson();
- }
-
- virtual void validate() {
- OperationContextNoop txn;
- ASSERT_TRUE(_oldTop != this->getKey(this->_helper.headManager.getHead(&txn), 0).data.toBson());
- }
-
- private:
- BSONObj _oldTop;
- };
-
- template<class OnDiskFormat>
- class BalanceOneLeftToRight : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
-
- builder.makeTree("{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},"
- "b:{$20:null,$30:null,$40:null,$50:null,a:null},"
- "_:{c:null}}");
-
- ASSERT_EQUALS(14, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
-
- const BSONObj k = BSON("" << bigNumString(0x40, 800));
- ASSERT(this->unindex(k));
-
- ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
-
- builder.checkStructure("{$6:{$1:null,$2:null,$3:null,$4:null,$5:null},"
- "b:{$10:null,$20:null,$30:null,$50:null,a:null},"
- "_:{c:null}}");
- }
- };
-
- template<class OnDiskFormat>
- class BalanceOneRightToLeft : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
-
- builder.makeTree("{$10:{$1:null,$2:null,$3:null,$4:null},"
- "b:{$20:null,$30:null,$40:null,$50:null,$60:null,$70:null},"
- "_:{c:null}}");
-
- ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
-
- const BSONObj k = BSON("" << bigNumString(0x3, 800));
- ASSERT(this->unindex(k));
-
- ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
-
- builder.checkStructure("{$20:{$1:null,$2:null,$4:null,$10:null},"
- "b:{$30:null,$40:null,$50:null,$60:null,$70:null},"
- "_:{c:null}}");
- }
- };
-
- template<class OnDiskFormat>
- class BalanceThreeLeftToRight : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
-
- builder.makeTree("{$20:{$1:{$0:null},$3:{$2:null},$5:{$4:null},$7:{$6:null},"
- "$9:{$8:null},$11:{$10:null},$13:{$12:null},_:{$14:null}},"
- "b:{$30:null,$40:{$35:null},$50:{$45:null}},"
- "_:{c:null}}");
-
- ASSERT_EQUALS(23, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 14 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(15, this->_helper.recordStore.numRecords(NULL));
-
- const BSONObj k = BSON("" << bigNumString(0x30, 800));
- ASSERT(this->unindex(k));
-
- ASSERT_EQUALS(22, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 14 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(15, this->_helper.recordStore.numRecords(NULL));
-
- builder.checkStructure("{$9:{$1:{$0:null},$3:{$2:null},"
- "$5:{$4:null},$7:{$6:null},_:{$8:null}},"
- "b:{$11:{$10:null},$13:{$12:null},$20:{$14:null},"
- "$40:{$35:null},$50:{$45:null}},"
- "_:{c:null}}");
- }
- };
-
- template<class OnDiskFormat>
- class BalanceThreeRightToLeft : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
-
- builder.makeTree("{$20:{$1:{$0:null},$3:{$2:null},$5:null,_:{$14:null}},"
- "b:{$30:{$25:null},$40:{$35:null},$50:{$45:null},$60:{$55:null},"
- "$70:{$65:null},$80:{$75:null},"
- "$90:{$85:null},$100:{$95:null}},"
- "_:{c:null}}");
-
- ASSERT_EQUALS(25, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 15 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(16, this->_helper.recordStore.numRecords(NULL));
-
- const BSONObj k = BSON("" << bigNumString(0x5, 800));
- ASSERT(this->unindex(k));
-
- ASSERT_EQUALS(24, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 15 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(16, this->_helper.recordStore.numRecords(NULL));
-
- builder.checkStructure("{$50:{$1:{$0:null},$3:{$2:null},$20:{$14:null},"
- "$30:{$25:null},$40:{$35:null},_:{$45:null}},"
- "b:{$60:{$55:null},$70:{$65:null},$80:{$75:null},"
- "$90:{$85:null},$100:{$95:null}},"
- "_:{c:null}}");
- }
- };
-
- template<class OnDiskFormat>
- class BalanceSingleParentKey : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
-
- builder.makeTree("{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},"
- "_:{$20:null,$30:null,$40:null,$50:null,a:null}}");
-
- ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
// The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
- const BSONObj k = BSON("" << bigNumString(0x40, 800));
- ASSERT(this->unindex(k));
-
- ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
-
- builder.checkStructure("{$6:{$1:null,$2:null,$3:null,$4:null,$5:null},"
- "_:{$10:null,$20:null,$30:null,$50:null,a:null}}");
- }
- };
-
- template<class OnDiskFormat>
- class PackEmptyBucket : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
-
- builder.makeTree("{a:null}");
-
- const BSONObj k = BSON("" << "a");
- ASSERT(this->unindex(k));
-
- this->forcePackBucket(this->_helper.headManager.getHead(&txn));
-
- typename BtreeLogicTestBase<OnDiskFormat>::BucketType* headBucket = this->head();
-
- ASSERT_EQUALS(0, headBucket->n);
- ASSERT_FALSE(headBucket->flags & Packed);
-
- int unused = 0;
- this->truncateBucket(headBucket, 0, unused);
-
- ASSERT_EQUALS(0, headBucket->n);
- ASSERT_EQUALS(0, headBucket->topSize);
- ASSERT_EQUALS((int)OnDiskFormat::BucketBodySize, headBucket->emptySize);
- ASSERT_TRUE(headBucket->flags & Packed);
- }
- };
-
- template<class OnDiskFormat>
- class PackedDataSizeEmptyBucket : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
-
- builder.makeTree("{a:null}");
-
- const BSONObj k = BSON("" << "a");
- ASSERT(this->unindex(k));
-
- this->forcePackBucket(this->_helper.headManager.getHead(&txn));
-
- typename BtreeLogicTestBase<OnDiskFormat>::BucketType* headBucket = this->head();
+ const BSONObj k = bigKey(*i);
+ this->unindex(k);
- ASSERT_EQUALS(0, headBucket->n);
- ASSERT_FALSE(headBucket->flags & Packed);
- ASSERT_EQUALS(0, this->bucketPackedDataSize(headBucket, 0));
- ASSERT_FALSE(headBucket->flags & Packed);
+ --_count;
}
- };
- template<class OnDiskFormat>
- class BalanceSingleParentKeyPackParent : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+ long long unused = 0;
+ ASSERT_EQUALS(_count, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+ ASSERT_EQUALS(0, unused);
- builder.makeTree("{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},"
- "_:{$20:null,$30:null,$40:null,$50:null,a:null}}");
-
- ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ validate();
+ if (!merge()) {
// The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
-
- // force parent pack
- this->forcePackBucket(this->_helper.headManager.getHead(&txn));
-
- const BSONObj k = BSON("" << bigNumString(0x40, 800));
- ASSERT(this->unindex(k));
-
- ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
-
- builder.checkStructure("{$6:{$1:null,$2:null,$3:null,$4:null,$5:null},"
- "_:{$10:null,$20:null,$30:null,$50:null,a:null}}");
- }
- };
-
- template<class OnDiskFormat>
- class BalanceSplitParent : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
-
- builder.makeTree(
- "{$10$10:{$1:null,$2:null,$3:null,$4:null},"
- "$100:{$20:null,$30:null,$40:null,$50:null,$60:null,$70:null,$80:null},"
- "$200:null,$300:null,$400:null,$500:null,$600:null,"
- "$700:null,$800:null,$900:null,_:{c:null}}");
-
- ASSERT_EQUALS(22, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
-
- const BSONObj k = BSON("" << bigNumString(0x3, 800));
- ASSERT(this->unindex(k));
-
- ASSERT_EQUALS(21, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
-
- // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
-
- builder.checkStructure("{$500:{ $30:{$1:null,$2:null,$4:null,$10$10:null,$20:null},"
- "$100:{$40:null,$50:null,$60:null,$70:null,$80:null},"
- "$200:null,$300:null,$400:null},"
- "_:{$600:null,$700:null,$800:null,$900:null,_:{c:null}}}");
- }
- };
-
- template<class OnDiskFormat>
- class RebalancedSeparatorBase : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
-
- builder.makeTree(treeSpec());
- modTree();
-
- ASSERT_EQUALS(expectedSeparator(),
- this->bucketRebalancedSeparatorPos(
- this->_helper.headManager.getHead(&txn), 0));
- }
-
- virtual string treeSpec() const = 0;
- virtual int expectedSeparator() const = 0;
- virtual void modTree() {}
- };
-
- template<class OnDiskFormat>
- class EvenRebalanceLeft : public RebalancedSeparatorBase<OnDiskFormat> {
- virtual string treeSpec() const { return "{$7:{$1:null,$2$31f:null,$3:null,"
- "$4$31f:null,$5:null,$6:null},"
- "_:{$8:null,$9:null,$10$31e:null}}"; }
- virtual int expectedSeparator() const { return 4; }
- };
-
- template<class OnDiskFormat>
- class EvenRebalanceLeftCusp : public RebalancedSeparatorBase<OnDiskFormat> {
- virtual string treeSpec() const {
- return "{$6:{$1:null,$2$31f:null,$3:null,$4$31f:null,$5:null},"
- "_:{$7:null,$8:null,$9$31e:null,$10:null}}";
- }
- virtual int expectedSeparator() const { return 4; }
- };
-
- template<class OnDiskFormat>
- class EvenRebalanceRight : public RebalancedSeparatorBase<OnDiskFormat> {
- virtual string treeSpec() const { return "{$3:{$1:null,$2$31f:null},_:{$4$31f:null,$5:null,$6:null,$7:null,$8$31e:null,$9:null,$10:null}}"; }
- virtual int expectedSeparator() const { return 4; }
- };
-
- template<class OnDiskFormat>
- class EvenRebalanceRightCusp : public RebalancedSeparatorBase<OnDiskFormat> {
- virtual string treeSpec() const { return "{$4$31f:{$1:null,$2$31f:null,$3:null},_:{$5:null,$6:null,$7$31e:null,$8:null,$9:null,$10:null}}"; }
- virtual int expectedSeparator() const { return 4; }
- };
-
- template<class OnDiskFormat>
- class EvenRebalanceCenter : public RebalancedSeparatorBase<OnDiskFormat> {
- virtual string treeSpec() const { return "{$5:{$1:null,$2$31f:null,$3:null,$4$31f:null},_:{$6:null,$7$31e:null,$8:null,$9:null,$10:null}}"; }
- virtual int expectedSeparator() const { return 4; }
- };
-
- template<class OnDiskFormat>
- class OddRebalanceLeft : public RebalancedSeparatorBase<OnDiskFormat> {
- virtual string treeSpec() const { return "{$6$31f:{$1:null,$2:null,$3:null,$4:null,$5:null},_:{$7:null,$8:null,$9:null,$10:null}}"; }
- virtual int expectedSeparator() const { return 4; }
- };
-
- template<class OnDiskFormat>
- class OddRebalanceRight : public RebalancedSeparatorBase<OnDiskFormat> {
- virtual string treeSpec() const { return "{$4:{$1:null,$2:null,$3:null},_:{$5:null,$6:null,$7:null,$8$31f:null,$9:null,$10:null}}"; }
- virtual int expectedSeparator() const { return 4; }
- };
-
- template<class OnDiskFormat>
- class OddRebalanceCenter : public RebalancedSeparatorBase<OnDiskFormat> {
- virtual string treeSpec() const { return "{$5:{$1:null,$2:null,$3:null,$4:null},_:{$6:null,$7:null,$8:null,$9:null,$10$31f:null}}"; }
- virtual int expectedSeparator() const { return 4; }
- };
-
- template<class OnDiskFormat>
- class RebalanceEmptyRight : public RebalancedSeparatorBase<OnDiskFormat> {
- virtual string treeSpec() const { return "{$a:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null,$7:null,$8:null,$9:null},_:{$b:null}}"; }
- virtual void modTree() {
- BSONObj k = BSON("" << bigNumString(0xb, 800));
- ASSERT(this->unindex(k));
- }
- virtual int expectedSeparator() const { return 4; }
- };
-
- template<class OnDiskFormat>
- class RebalanceEmptyLeft : public RebalancedSeparatorBase<OnDiskFormat> {
- virtual string treeSpec() const { return "{$a:{$1:null},_:{$11:null,$12:null,$13:null,$14:null,$15:null,$16:null,$17:null,$18:null,$19:null}}"; }
- virtual void modTree() {
- BSONObj k = BSON("" << bigNumString(0x1, 800));
- ASSERT(this->unindex(k));
- }
- virtual int expectedSeparator() const { return 4; }
- };
-
- template<class OnDiskFormat>
- class NoMoveAtLowWaterMarkRight : public MergeSizeJustRightRight<OnDiskFormat> {
- virtual int rightSize() const { return MergeSizeJustRightRight<OnDiskFormat>::rightSize() + 1; }
-
- virtual void initCheck() {
- OperationContextNoop txn;
- _oldTop = this->getKey(this->_helper.headManager.getHead(&txn), 0).data.toBson();
- }
-
- virtual void validate() {
- OperationContextNoop txn;
- ASSERT_EQUALS(_oldTop, this->getKey(this->_helper.headManager.getHead(&txn), 0).data.toBson());
- }
-
- virtual bool merge() const { return false; }
-
- protected:
- BSONObj _oldTop;
- };
-
- template<class OnDiskFormat>
- class MoveBelowLowWaterMarkRight : public NoMoveAtLowWaterMarkRight<OnDiskFormat> {
- virtual int rightSize() const { return MergeSizeJustRightRight<OnDiskFormat>::rightSize(); }
- virtual int leftSize() const { return MergeSizeJustRightRight<OnDiskFormat>::leftSize() + 1; }
-
- virtual void validate() {
- OperationContextNoop txn;
- // Different top means we rebalanced
- ASSERT_NOT_EQUALS(this->_oldTop,
- this->getKey(this->_helper.headManager.getHead(&txn), 0).data.toBson());
- }
- };
-
- template<class OnDiskFormat>
- class NoMoveAtLowWaterMarkLeft : public MergeSizeJustRightLeft<OnDiskFormat> {
- virtual int leftSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::leftSize() + 1; }
- virtual void initCheck() {
- OperationContextNoop txn;
- this->_oldTop = this->getKey(this->_helper.headManager.getHead(&txn), 0).data.toBson();
+ } else {
+ // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL));
}
-
- virtual void validate() {
- OperationContextNoop txn;
- ASSERT_EQUALS(this->_oldTop,
+ }
+
+protected:
+ virtual int leftAdditional() const {
+ return 2;
+ }
+ virtual int rightAdditional() const {
+ return 2;
+ }
+ virtual void initCheck() {}
+ virtual void validate() {}
+ virtual int leftSize() const = 0;
+ virtual int rightSize() const = 0;
+ virtual const char* delKeys() const {
+ return "klyz";
+ }
+ virtual bool merge() const {
+ return true;
+ }
+
+ static BSONObj bigKey(char a) {
+ return simpleKey(a, 801);
+ }
+
+ static BSONObj biggestKey(char a) {
+ int size = OnDiskFormat::KeyMax - bigSize() + 801;
+ return simpleKey(a, size);
+ }
+
+ static int bigSize() {
+ return typename BtreeLogicTestBase<OnDiskFormat>::KeyDataOwnedType(bigKey('a')).dataSize();
+ }
+
+ static int biggestSize() {
+ return
+ typename BtreeLogicTestBase<OnDiskFormat>::KeyDataOwnedType(biggestKey('a')).dataSize();
+ }
+
+ int _count;
+};
+
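+// Sizes the right sibling one byte below BtreeLogic::lowWaterMark() and fills the
+// left sibling with the remaining bucket capacity: the largest pair that can still
+// be merged into a single bucket, so the final delete is expected to merge.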
+template <class OnDiskFormat>
+class MergeSizeJustRightRight : public MergeSizeTestBase<OnDiskFormat> {
+protected:
+ virtual int rightSize() const {
+ return BtreeLogic<OnDiskFormat>::lowWaterMark() - 1;
+ }
+
+ virtual int leftSize() const {
+ return OnDiskFormat::BucketBodySize - MergeSizeTestBase<OnDiskFormat>::biggestSize() -
+ sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) -
+ (BtreeLogic<OnDiskFormat>::lowWaterMark() - 1);
+ }
+};
+
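+// Mirror image of MergeSizeJustRightRight: here the left sibling sits one byte
+// below the low-water mark, and the delete order "yzkl" makes the final delete
+// drive the underflow check from the left side.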
+template <class OnDiskFormat>
+class MergeSizeJustRightLeft : public MergeSizeTestBase<OnDiskFormat> {
+protected:
+ virtual int leftSize() const {
+ return BtreeLogic<OnDiskFormat>::lowWaterMark() - 1;
+ }
+
+ virtual int rightSize() const {
+ return OnDiskFormat::BucketBodySize - MergeSizeTestBase<OnDiskFormat>::biggestSize() -
+ sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) -
+ (BtreeLogic<OnDiskFormat>::lowWaterMark() - 1);
+ }
+
+ virtual const char* delKeys() const {
+ return "yzkl";
+ }
+};
+
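+// Shifts one byte from the right sibling to the left; the combined size is
+// unchanged, so the pair should still merge (MergeSizeLeft mirrors this).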
+template <class OnDiskFormat>
+class MergeSizeRight : public MergeSizeJustRightRight<OnDiskFormat> {
+ virtual int rightSize() const {
+ return MergeSizeJustRightRight<OnDiskFormat>::rightSize() - 1;
+ }
+ virtual int leftSize() const {
+ return MergeSizeJustRightRight<OnDiskFormat>::leftSize() + 1;
+ }
+};
+
+template <class OnDiskFormat>
+class MergeSizeLeft : public MergeSizeJustRightLeft<OnDiskFormat> {
+ virtual int rightSize() const {
+ return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() + 1;
+ }
+ virtual int leftSize() const {
+ return MergeSizeJustRightLeft<OnDiskFormat>::leftSize() - 1;
+ }
+};
+
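+// The right sibling lands exactly at the low-water mark after the deletes; no
+// bucket falls below the mark, so no merge is even attempted.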
+template <class OnDiskFormat>
+class NoMergeBelowMarkRight : public MergeSizeJustRightRight<OnDiskFormat> {
+ virtual int rightSize() const {
+ return MergeSizeJustRightRight<OnDiskFormat>::rightSize() + 1;
+ }
+ virtual int leftSize() const {
+ return MergeSizeJustRightRight<OnDiskFormat>::leftSize() - 1;
+ }
+ virtual bool merge() const {
+ return false;
+ }
+};
+
+template <class OnDiskFormat>
+class NoMergeBelowMarkLeft : public MergeSizeJustRightLeft<OnDiskFormat> {
+ virtual int rightSize() const {
+ return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() - 1;
+ }
+ virtual int leftSize() const {
+ return MergeSizeJustRightLeft<OnDiskFormat>::leftSize() + 1;
+ }
+ virtual bool merge() const {
+ return false;
+ }
+};
+
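+// The left sibling is below the mark, but growing the right sibling by one byte
+// makes the combined contents too large for a single bucket, so the merge is
+// skipped (MergeSizeLeftTooBig below mirrors this on the other side).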
+template <class OnDiskFormat>
+class MergeSizeRightTooBig : public MergeSizeJustRightLeft<OnDiskFormat> {
+ virtual int rightSize() const {
+ return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() + 1;
+ }
+ virtual bool merge() const {
+ return false;
+ }
+};
+
+template <class OnDiskFormat>
+class MergeSizeLeftTooBig : public MergeSizeJustRightRight<OnDiskFormat> {
+ virtual int leftSize() const {
+ return MergeSizeJustRightRight<OnDiskFormat>::leftSize() + 1;
+ }
+ virtual bool merge() const {
+ return false;
+ }
+};
+
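+// Deleting "lz" leaves the right sibling completely empty; an empty bucket is
+// always eligible to merge into its (nearly full) left neighbour.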
+template <class OnDiskFormat>
+class MergeRightEmpty : public MergeSizeTestBase<OnDiskFormat> {
+protected:
+ virtual int rightAdditional() const {
+ return 1;
+ }
+ virtual int leftAdditional() const {
+ return 1;
+ }
+ virtual const char* delKeys() const {
+ return "lz";
+ }
+ virtual int rightSize() const {
+ return 0;
+ }
+ virtual int leftSize() const {
+ return OnDiskFormat::BucketBodySize - MergeSizeTestBase<OnDiskFormat>::biggestSize() -
+ sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType);
+ }
+};
+
+template <class OnDiskFormat>
+class MergeMinRightEmpty : public MergeSizeTestBase<OnDiskFormat> {
+protected:
+ virtual int rightAdditional() const {
+ return 1;
+ }
+ virtual int leftAdditional() const {
+ return 0;
+ }
+ virtual const char* delKeys() const {
+ return "z";
+ }
+ virtual int rightSize() const {
+ return 0;
+ }
+ virtual int leftSize() const {
+ return MergeSizeTestBase<OnDiskFormat>::bigSize() +
+ sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType);
+ }
+};
+
+template <class OnDiskFormat>
+class MergeLeftEmpty : public MergeSizeTestBase<OnDiskFormat> {
+protected:
+ virtual int rightAdditional() const {
+ return 1;
+ }
+ virtual int leftAdditional() const {
+ return 1;
+ }
+ virtual const char* delKeys() const {
+ return "zl";
+ }
+ virtual int leftSize() const {
+ return 0;
+ }
+ virtual int rightSize() const {
+ return OnDiskFormat::BucketBodySize - MergeSizeTestBase<OnDiskFormat>::biggestSize() -
+ sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType);
+ }
+};
+
+template <class OnDiskFormat>
+class MergeMinLeftEmpty : public MergeSizeTestBase<OnDiskFormat> {
+protected:
+ virtual int leftAdditional() const {
+ return 1;
+ }
+ virtual int rightAdditional() const {
+ return 0;
+ }
+ virtual const char* delKeys() const {
+ return "l";
+ }
+ virtual int leftSize() const {
+ return 0;
+ }
+ virtual int rightSize() const {
+ return MergeSizeTestBase<OnDiskFormat>::bigSize() +
+ sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType);
+ }
+};
+
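+// One byte more in the left sibling than MergeRightEmpty allows: the pair no
+// longer fits in one bucket, so the emptied right sibling is refilled by
+// balancing instead, which must change the parent's separator (top) key.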
+template <class OnDiskFormat>
+class BalanceRightEmpty : public MergeRightEmpty<OnDiskFormat> {
+protected:
+ virtual int leftSize() const {
+ return OnDiskFormat::BucketBodySize - MergeSizeTestBase<OnDiskFormat>::biggestSize() -
+ sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) + 1;
+ }
+
+ virtual bool merge() const {
+ return false;
+ }
+
+ virtual void initCheck() {
+ OperationContextNoop txn;
+ _oldTop = this->getKey(this->_helper.headManager.getHead(&txn), 0).data.toBson();
+ }
+
+ virtual void validate() {
+ OperationContextNoop txn;
+ ASSERT_NOT_EQUALS(_oldTop,
this->getKey(this->_helper.headManager.getHead(&txn), 0).data.toBson());
- }
- virtual bool merge() const { return false; }
+ }
+
+private:
+ BSONObj _oldTop;
+};
+
+template <class OnDiskFormat>
+class BalanceLeftEmpty : public MergeLeftEmpty<OnDiskFormat> {
+protected:
+ virtual int rightSize() const {
+ return OnDiskFormat::BucketBodySize - MergeSizeTestBase<OnDiskFormat>::biggestSize() -
+ sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) + 1;
+ }
+
+ virtual bool merge() const {
+ return false;
+ }
+
+ virtual void initCheck() {
+ OperationContextNoop txn;
+ _oldTop = this->getKey(this->_helper.headManager.getHead(&txn), 0).data.toBson();
+ }
- protected:
- BSONObj _oldTop;
- };
+ virtual void validate() {
+ OperationContextNoop txn;
+ ASSERT_TRUE(_oldTop !=
+ this->getKey(this->_helper.headManager.getHead(&txn), 0).data.toBson());
+ }
+
+private:
+ BSONObj _oldTop;
+};
+
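+// Deleting $40 underflows the middle child; exactly one key rotates through the
+// parent from the heavier left child: $10 drops into the middle and $6 moves up.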
+template <class OnDiskFormat>
+class BalanceOneLeftToRight : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree(
+ "{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},"
+ "b:{$20:null,$30:null,$40:null,$50:null,a:null},"
+ "_:{c:null}}");
+
+ ASSERT_EQUALS(14, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
+
+ const BSONObj k = BSON("" << bigNumString(0x40, 800));
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
+
+ builder.checkStructure(
+ "{$6:{$1:null,$2:null,$3:null,$4:null,$5:null},"
+ "b:{$10:null,$20:null,$30:null,$50:null,a:null},"
+ "_:{c:null}}");
+ }
+};
+
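+// The mirror case: deleting $3 underflows the left child, so one key rotates in
+// from the right sibling ($10 drops down, $20 moves up into the parent).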
+template <class OnDiskFormat>
+class BalanceOneRightToLeft : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree(
+ "{$10:{$1:null,$2:null,$3:null,$4:null},"
+ "b:{$20:null,$30:null,$40:null,$50:null,$60:null,$70:null},"
+ "_:{c:null}}");
+
+ ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
+
+ const BSONObj k = BSON("" << bigNumString(0x3, 800));
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
+
+ builder.checkStructure(
+ "{$20:{$1:null,$2:null,$4:null,$10:null},"
+ "b:{$30:null,$40:null,$50:null,$60:null,$70:null},"
+ "_:{c:null}}");
+ }
+};
+
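+// Same rotation with bucket-valued keys: three keys, together with their child
+// subtrees, shift from the heavy left child through the parent to the right.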
+template <class OnDiskFormat>
+class BalanceThreeLeftToRight : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree(
+ "{$20:{$1:{$0:null},$3:{$2:null},$5:{$4:null},$7:{$6:null},"
+ "$9:{$8:null},$11:{$10:null},$13:{$12:null},_:{$14:null}},"
+ "b:{$30:null,$40:{$35:null},$50:{$45:null}},"
+ "_:{c:null}}");
+
+ ASSERT_EQUALS(23, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+
+ // The tree has 14 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(15, this->_helper.recordStore.numRecords(NULL));
+
+ const BSONObj k = BSON("" << bigNumString(0x30, 800));
+ ASSERT(this->unindex(k));
- template<class OnDiskFormat>
- class MoveBelowLowWaterMarkLeft : public NoMoveAtLowWaterMarkLeft<OnDiskFormat> {
- virtual int leftSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::leftSize(); }
- virtual int rightSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() + 1; }
+ ASSERT_EQUALS(22, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+
+ // The tree has 14 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(15, this->_helper.recordStore.numRecords(NULL));
- virtual void validate() {
- OperationContextNoop txn;
- // Different top means we rebalanced
- ASSERT_NOT_EQUALS(this->_oldTop,
- this->getKey(this->_helper.headManager.getHead(&txn), 0).data.toBson());
- }
- };
+ builder.checkStructure(
+ "{$9:{$1:{$0:null},$3:{$2:null},"
+ "$5:{$4:null},$7:{$6:null},_:{$8:null}},"
+ "b:{$11:{$10:null},$13:{$12:null},$20:{$14:null},"
+ "$40:{$35:null},$50:{$45:null}},"
+ "_:{c:null}}");
+ }
+};
+
+template <class OnDiskFormat>
+class BalanceThreeRightToLeft : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- template<class OnDiskFormat>
- class PreferBalanceLeft : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+ builder.makeTree(
+ "{$20:{$1:{$0:null},$3:{$2:null},$5:null,_:{$14:null}},"
+ "b:{$30:{$25:null},$40:{$35:null},$50:{$45:null},$60:{$55:null},"
+ "$70:{$65:null},$80:{$75:null},"
+ "$90:{$85:null},$100:{$95:null}},"
+ "_:{c:null}}");
- builder.makeTree("{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},"
- "$20:{$11:null,$12:null,$13:null,$14:null},"
- "_:{$30:null}}");
+ ASSERT_EQUALS(25, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ // The tree has 15 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(16, this->_helper.recordStore.numRecords(NULL));
+
+ const BSONObj k = BSON("" << bigNumString(0x5, 800));
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(24, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+
+ // The tree has 15 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(16, this->_helper.recordStore.numRecords(NULL));
+
+ builder.checkStructure(
+ "{$50:{$1:{$0:null},$3:{$2:null},$20:{$14:null},"
+ "$30:{$25:null},$40:{$35:null},_:{$45:null}},"
+ "b:{$60:{$55:null},$70:{$65:null},$80:{$75:null},"
+ "$90:{$85:null},$100:{$95:null}},"
+ "_:{c:null}}");
+ }
+};
- // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
+template <class OnDiskFormat>
+class BalanceSingleParentKey : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- const BSONObj k = BSON("" << bigNumString(0x12, 800));
- ASSERT(this->unindex(k));
+ builder.makeTree(
+ "{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},"
+ "_:{$20:null,$30:null,$40:null,$50:null,a:null}}");
- ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
- builder.checkStructure("{$5:{$1:null,$2:null,$3:null,$4:null},"
- "$20:{$6:null,$10:null,$11:null,$13:null,$14:null},"
- "_:{$30:null}}");
- }
- };
+ const BSONObj k = BSON("" << bigNumString(0x40, 800));
+ ASSERT(this->unindex(k));
- template<class OnDiskFormat>
- class PreferBalanceRight : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+ ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
+
+ builder.checkStructure(
+ "{$6:{$1:null,$2:null,$3:null,$4:null,$5:null},"
+ "_:{$10:null,$20:null,$30:null,$50:null,a:null}}");
+ }
+};
+
+template <class OnDiskFormat>
+class PackEmptyBucket : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- builder.makeTree("{$10:{$1:null},"
- "$20:{$11:null,$12:null,$13:null,$14:null},"
- "_:{$31:null,$32:null,$33:null,$34:null,$35:null,$36:null}}");
+ builder.makeTree("{a:null}");
- ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ const BSONObj k = BSON(""
+ << "a");
+ ASSERT(this->unindex(k));
- // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
+ this->forcePackBucket(this->_helper.headManager.getHead(&txn));
- const BSONObj k = BSON("" << bigNumString(0x12, 800));
- ASSERT(this->unindex(k));
+ typename BtreeLogicTestBase<OnDiskFormat>::BucketType* headBucket = this->head();
- ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ ASSERT_EQUALS(0, headBucket->n);
+ ASSERT_FALSE(headBucket->flags & Packed);
- // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
+ int unused = 0;
+ this->truncateBucket(headBucket, 0, unused);
+
+ ASSERT_EQUALS(0, headBucket->n);
+ ASSERT_EQUALS(0, headBucket->topSize);
+ ASSERT_EQUALS((int)OnDiskFormat::BucketBodySize, headBucket->emptySize);
+ ASSERT_TRUE(headBucket->flags & Packed);
+ }
+};
+
+template <class OnDiskFormat>
+class PackedDataSizeEmptyBucket : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- builder.checkStructure("{$10:{$1:null},"
- "$31:{$11:null,$13:null,$14:null,$20:null},"
- "_:{$32:null,$33:null,$34:null,$35:null,$36:null}}");
- }
- };
+ builder.makeTree("{a:null}");
- template<class OnDiskFormat>
- class RecursiveMergeThenBalance : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+ const BSONObj k = BSON(""
+ << "a");
+ ASSERT(this->unindex(k));
- builder.makeTree("{$10:{$5:{$1:null,$2:null},$8:{$6:null,$7:null}},"
- "_:{$20:null,$30:null,$40:null,$50:null,"
- "$60:null,$70:null,$80:null,$90:null}}");
+ this->forcePackBucket(this->_helper.headManager.getHead(&txn));
- ASSERT_EQUALS(15, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ typename BtreeLogicTestBase<OnDiskFormat>::BucketType* headBucket = this->head();
- // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(6, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(0, headBucket->n);
+ ASSERT_FALSE(headBucket->flags & Packed);
+ ASSERT_EQUALS(0, this->bucketPackedDataSize(headBucket, 0));
+ ASSERT_FALSE(headBucket->flags & Packed);
+ }
+};
+
+template <class OnDiskFormat>
+class BalanceSingleParentKeyPackParent : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree(
+ "{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},"
+ "_:{$20:null,$30:null,$40:null,$50:null,a:null}}");
+
+ ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
+
+        // Force the parent (head) bucket into its packed state before the delete.
+ this->forcePackBucket(this->_helper.headManager.getHead(&txn));
+
+ const BSONObj k = BSON("" << bigNumString(0x40, 800));
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
+
+ builder.checkStructure(
+ "{$6:{$1:null,$2:null,$3:null,$4:null,$5:null},"
+ "_:{$10:null,$20:null,$30:null,$50:null,a:null}}");
+ }
+};
+
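+// Balancing pushes a key up into an already-full parent, forcing the parent
+// itself to split: the bucket count grows from 4 to 6 below.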
+template <class OnDiskFormat>
+class BalanceSplitParent : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree(
+ "{$10$10:{$1:null,$2:null,$3:null,$4:null},"
+ "$100:{$20:null,$30:null,$40:null,$50:null,$60:null,$70:null,$80:null},"
+ "$200:null,$300:null,$400:null,$500:null,$600:null,"
+ "$700:null,$800:null,$900:null,_:{c:null}}");
+
+ ASSERT_EQUALS(22, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
+
+ const BSONObj k = BSON("" << bigNumString(0x3, 800));
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(21, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+
+ // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
+
+ builder.checkStructure(
+ "{$500:{ $30:{$1:null,$2:null,$4:null,$10$10:null,$20:null},"
+ "$100:{$40:null,$50:null,$60:null,$70:null,$80:null},"
+ "$200:null,$300:null,$400:null},"
+ "_:{$600:null,$700:null,$800:null,$900:null,_:{c:null}}}");
+ }
+};
+
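+// Fixture for the separator-position tests that follow: builds a two-child tree
+// from treeSpec(), optionally mutates it via modTree(), and checks which key
+// index the rebalance logic would pick as the new separator in the parent.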
+template <class OnDiskFormat>
+class RebalancedSeparatorBase : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree(treeSpec());
+ modTree();
+
+ ASSERT_EQUALS(
+ expectedSeparator(),
+ this->bucketRebalancedSeparatorPos(this->_helper.headManager.getHead(&txn), 0));
+ }
+
+ virtual string treeSpec() const = 0;
+ virtual int expectedSeparator() const = 0;
+ virtual void modTree() {}
+};
+
+template <class OnDiskFormat>
+class EvenRebalanceLeft : public RebalancedSeparatorBase<OnDiskFormat> {
+ virtual string treeSpec() const {
+ return "{$7:{$1:null,$2$31f:null,$3:null,"
+ "$4$31f:null,$5:null,$6:null},"
+ "_:{$8:null,$9:null,$10$31e:null}}";
+ }
+ virtual int expectedSeparator() const {
+ return 4;
+ }
+};
+
+template <class OnDiskFormat>
+class EvenRebalanceLeftCusp : public RebalancedSeparatorBase<OnDiskFormat> {
+ virtual string treeSpec() const {
+ return "{$6:{$1:null,$2$31f:null,$3:null,$4$31f:null,$5:null},"
+ "_:{$7:null,$8:null,$9$31e:null,$10:null}}";
+ }
+ virtual int expectedSeparator() const {
+ return 4;
+ }
+};
+
+template <class OnDiskFormat>
+class EvenRebalanceRight : public RebalancedSeparatorBase<OnDiskFormat> {
+ virtual string treeSpec() const {
+ return "{$3:{$1:null,$2$31f:null},_:{$4$31f:null,$5:null,$6:null,$7:null,$8$31e:null,$9:"
+ "null,$10:null}}";
+ }
+ virtual int expectedSeparator() const {
+ return 4;
+ }
+};
+
+template <class OnDiskFormat>
+class EvenRebalanceRightCusp : public RebalancedSeparatorBase<OnDiskFormat> {
+ virtual string treeSpec() const {
+ return "{$4$31f:{$1:null,$2$31f:null,$3:null},_:{$5:null,$6:null,$7$31e:null,$8:null,$9:"
+ "null,$10:null}}";
+ }
+ virtual int expectedSeparator() const {
+ return 4;
+ }
+};
+
+template <class OnDiskFormat>
+class EvenRebalanceCenter : public RebalancedSeparatorBase<OnDiskFormat> {
+ virtual string treeSpec() const {
+ return "{$5:{$1:null,$2$31f:null,$3:null,$4$31f:null},_:{$6:null,$7$31e:null,$8:null,$9:"
+ "null,$10:null}}";
+ }
+ virtual int expectedSeparator() const {
+ return 4;
+ }
+};
+
+template <class OnDiskFormat>
+class OddRebalanceLeft : public RebalancedSeparatorBase<OnDiskFormat> {
+ virtual string treeSpec() const {
+ return "{$6$31f:{$1:null,$2:null,$3:null,$4:null,$5:null},_:{$7:null,$8:null,$9:null,$10:"
+ "null}}";
+ }
+ virtual int expectedSeparator() const {
+ return 4;
+ }
+};
+
+template <class OnDiskFormat>
+class OddRebalanceRight : public RebalancedSeparatorBase<OnDiskFormat> {
+ virtual string treeSpec() const {
+ return "{$4:{$1:null,$2:null,$3:null},_:{$5:null,$6:null,$7:null,$8$31f:null,$9:null,$10:"
+ "null}}";
+ }
+ virtual int expectedSeparator() const {
+ return 4;
+ }
+};
+
+template <class OnDiskFormat>
+class OddRebalanceCenter : public RebalancedSeparatorBase<OnDiskFormat> {
+ virtual string treeSpec() const {
+ return "{$5:{$1:null,$2:null,$3:null,$4:null},_:{$6:null,$7:null,$8:null,$9:null,$10$31f:"
+ "null}}";
+ }
+ virtual int expectedSeparator() const {
+ return 4;
+ }
+};
+
+template <class OnDiskFormat>
+class RebalanceEmptyRight : public RebalancedSeparatorBase<OnDiskFormat> {
+ virtual string treeSpec() const {
+ return "{$a:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null,$7:null,$8:null,$9:null},_:{$"
+ "b:null}}";
+ }
+ virtual void modTree() {
+ BSONObj k = BSON("" << bigNumString(0xb, 800));
+ ASSERT(this->unindex(k));
+ }
+ virtual int expectedSeparator() const {
+ return 4;
+ }
+};
+
+template <class OnDiskFormat>
+class RebalanceEmptyLeft : public RebalancedSeparatorBase<OnDiskFormat> {
+ virtual string treeSpec() const {
+ return "{$a:{$1:null},_:{$11:null,$12:null,$13:null,$14:null,$15:null,$16:null,$17:null,$"
+ "18:null,$19:null}}";
+ }
+ virtual void modTree() {
+ BSONObj k = BSON("" << bigNumString(0x1, 800));
+ ASSERT(this->unindex(k));
+ }
+ virtual int expectedSeparator() const {
+ return 4;
+ }
+};
+
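+// With the right sibling sitting exactly at the low-water mark, nothing should
+// move: no merge, and the parent's top key stays the same.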
+template <class OnDiskFormat>
+class NoMoveAtLowWaterMarkRight : public MergeSizeJustRightRight<OnDiskFormat> {
+ virtual int rightSize() const {
+ return MergeSizeJustRightRight<OnDiskFormat>::rightSize() + 1;
+ }
+
+ virtual void initCheck() {
+ OperationContextNoop txn;
+ _oldTop = this->getKey(this->_helper.headManager.getHead(&txn), 0).data.toBson();
+ }
+
+ virtual void validate() {
+ OperationContextNoop txn;
+ ASSERT_EQUALS(_oldTop,
+ this->getKey(this->_helper.headManager.getHead(&txn), 0).data.toBson());
+ }
+
+ virtual bool merge() const {
+ return false;
+ }
+
+protected:
+ BSONObj _oldTop;
+};
+
+template <class OnDiskFormat>
+class MoveBelowLowWaterMarkRight : public NoMoveAtLowWaterMarkRight<OnDiskFormat> {
+ virtual int rightSize() const {
+ return MergeSizeJustRightRight<OnDiskFormat>::rightSize();
+ }
+ virtual int leftSize() const {
+ return MergeSizeJustRightRight<OnDiskFormat>::leftSize() + 1;
+ }
+
+ virtual void validate() {
+ OperationContextNoop txn;
+        // A different top key means the buckets were rebalanced rather than merged
+ ASSERT_NOT_EQUALS(this->_oldTop,
+ this->getKey(this->_helper.headManager.getHead(&txn), 0).data.toBson());
+ }
+};
+
+template <class OnDiskFormat>
+class NoMoveAtLowWaterMarkLeft : public MergeSizeJustRightLeft<OnDiskFormat> {
+ virtual int leftSize() const {
+ return MergeSizeJustRightLeft<OnDiskFormat>::leftSize() + 1;
+ }
+ virtual void initCheck() {
+ OperationContextNoop txn;
+ this->_oldTop = this->getKey(this->_helper.headManager.getHead(&txn), 0).data.toBson();
+ }
+
+ virtual void validate() {
+ OperationContextNoop txn;
+ ASSERT_EQUALS(this->_oldTop,
+ this->getKey(this->_helper.headManager.getHead(&txn), 0).data.toBson());
+ }
+ virtual bool merge() const {
+ return false;
+ }
+
+protected:
+ BSONObj _oldTop;
+};
+
+template <class OnDiskFormat>
+class MoveBelowLowWaterMarkLeft : public NoMoveAtLowWaterMarkLeft<OnDiskFormat> {
+ virtual int leftSize() const {
+ return MergeSizeJustRightLeft<OnDiskFormat>::leftSize();
+ }
+ virtual int rightSize() const {
+ return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() + 1;
+ }
+
+ virtual void validate() {
+ OperationContextNoop txn;
+        // A different top key means the buckets were rebalanced rather than merged
+ ASSERT_NOT_EQUALS(this->_oldTop,
+ this->getKey(this->_helper.headManager.getHead(&txn), 0).data.toBson());
+ }
+};
- const BSONObj k = BSON("" << bigNumString(0x7, 800));
- ASSERT(this->unindex(k));
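+// The underflowing middle child balances with whichever neighbour is fuller,
+// here the left one (PreferBalanceRight below checks the opposite case).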
+template <class OnDiskFormat>
+class PreferBalanceLeft : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- ASSERT_EQUALS(14, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ builder.makeTree(
+ "{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},"
+ "$20:{$11:null,$12:null,$13:null,$14:null},"
+ "_:{$30:null}}");
- // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- builder.checkStructure(
- "{$40:{$8:{$1:null,$2:null,$5:null,$6:null},$10:null,$20:null,$30:null},"
- "_:{$50:null,$60:null,$70:null,$80:null,$90:null}}");
- }
- };
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
- template<class OnDiskFormat>
- class DelEmptyNoNeighbors : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+ const BSONObj k = BSON("" << bigNumString(0x12, 800));
+ ASSERT(this->unindex(k));
- builder.makeTree("{b:{a:null}}");
+ ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
- // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL));
+ builder.checkStructure(
+ "{$5:{$1:null,$2:null,$3:null,$4:null},"
+ "$20:{$6:null,$10:null,$11:null,$13:null,$14:null},"
+ "_:{$30:null}}");
+ }
+};
- const BSONObj k = BSON("" << "a");
- ASSERT(this->unindex(k));
+template <class OnDiskFormat>
+class PreferBalanceRight : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- ASSERT_EQUALS(1, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ builder.makeTree(
+ "{$10:{$1:null},"
+ "$20:{$11:null,$12:null,$13:null,$14:null},"
+ "_:{$31:null,$32:null,$33:null,$34:null,$35:null,$36:null}}");
- // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- builder.checkStructure("{b:null}");
- }
- };
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
- template<class OnDiskFormat>
- class DelEmptyEmptyNeighbors : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+ const BSONObj k = BSON("" << bigNumString(0x12, 800));
+ ASSERT(this->unindex(k));
- builder.makeTree("{a:null,c:{b:null},d:null}");
+ ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
- // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL));
+ builder.checkStructure(
+ "{$10:{$1:null},"
+ "$31:{$11:null,$13:null,$14:null,$20:null},"
+ "_:{$32:null,$33:null,$34:null,$35:null,$36:null}}");
+ }
+};
- const BSONObj k = BSON("" << "b");
- ASSERT(this->unindex(k));
+template <class OnDiskFormat>
+class RecursiveMergeThenBalance : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
+ builder.makeTree(
+ "{$10:{$5:{$1:null,$2:null},$8:{$6:null,$7:null}},"
+ "_:{$20:null,$30:null,$40:null,$50:null,"
+ "$60:null,$70:null,$80:null,$90:null}}");
- // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(15, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- builder.checkStructure("{a:null,c:null,d:null}");
- }
- };
+ // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(6, this->_helper.recordStore.numRecords(NULL));
- template<class OnDiskFormat>
- class DelInternal : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+ const BSONObj k = BSON("" << bigNumString(0x7, 800));
+ ASSERT(this->unindex(k));
- builder.makeTree("{a:null,c:{b:null},d:null}");
+ ASSERT_EQUALS(14, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- long long unused = 0;
- ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
- // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL));
- ASSERT_EQUALS(0, unused);
+ builder.checkStructure(
+ "{$40:{$8:{$1:null,$2:null,$5:null,$6:null},$10:null,$20:null,$30:null},"
+ "_:{$50:null,$60:null,$70:null,$80:null,$90:null}}");
+ }
+};
- const BSONObj k = BSON("" << "c");
- ASSERT(this->unindex(k));
+template <class OnDiskFormat>
+class DelEmptyNoNeighbors : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+ builder.makeTree("{b:{a:null}}");
- // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL));
- ASSERT_EQUALS(0, unused);
+ ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- builder.checkStructure("{a:null,b:null,d:null}");
- }
- };
+ // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL));
- template<class OnDiskFormat>
- class DelInternalReplaceWithUnused : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+ const BSONObj k = BSON(""
+ << "a");
+ ASSERT(this->unindex(k));
- builder.makeTree("{a:null,c:{b:null},d:null}");
+ ASSERT_EQUALS(1, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- const DiskLoc prevChildBucket =
- this->getKey(this->_helper.headManager.getHead(&txn), 1).prevChildBucket;
- this->markKeyUnused(prevChildBucket, 0);
+ // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL));
- long long unused = 0;
- ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+ builder.checkStructure("{b:null}");
+ }
+};
- // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL));
- ASSERT_EQUALS(1, unused);
+template <class OnDiskFormat>
+class DelEmptyEmptyNeighbors : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- const BSONObj k = BSON("" << "c");
- ASSERT(this->unindex(k));
+ builder.makeTree("{a:null,c:{b:null},d:null}");
- unused = 0;
- ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+ ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL));
- ASSERT_EQUALS(1, unused);
+ // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL));
- // doesn't discriminate between used and unused
- builder.checkStructure("{a:null,b:null,d:null}");
- }
- };
+ const BSONObj k = BSON(""
+ << "b");
+ ASSERT(this->unindex(k));
- template<class OnDiskFormat>
- class DelInternalReplaceRight : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+ ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, NULL, true, false, 0));
- builder.makeTree("{a:null,_:{b:null}}");
+ // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL));
- long long unused = 0;
- ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+ builder.checkStructure("{a:null,c:null,d:null}");
+ }
+};
- // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL));
- ASSERT_EQUALS(0, unused);
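+// Deletes a key that lives in an internal bucket: its child's key is pulled up
+// to replace it and the emptied child bucket is freed (2 buckets become 1).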
+template <class OnDiskFormat>
+class DelInternal : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- const BSONObj k = BSON("" << "a");
- ASSERT(this->unindex(k));
+ builder.makeTree("{a:null,c:{b:null},d:null}");
- unused = 0;
- ASSERT_EQUALS(1, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+ long long unused = 0;
+ ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
- // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL));
- ASSERT_EQUALS(0, unused);
+ // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(0, unused);
- builder.checkStructure("{b:null}");
- }
- };
+ const BSONObj k = BSON(""
+ << "c");
+ ASSERT(this->unindex(k));
- template<class OnDiskFormat>
- class DelInternalPromoteKey : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+ ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
- builder.makeTree("{a:null,y:{d:{c:{b:null}},_:{e:null}},z:null}");
+ // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(0, unused);
- long long unused = 0;
- ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+ builder.checkStructure("{a:null,b:null,d:null}");
+ }
+};
- // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(6, this->_helper.recordStore.numRecords(NULL));
- ASSERT_EQUALS(0, unused);
+template <class OnDiskFormat>
+class DelInternalReplaceWithUnused : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- const BSONObj k = BSON("" << "y");
- ASSERT(this->unindex(k));
+ builder.makeTree("{a:null,c:{b:null},d:null}");
- unused = 0;
- ASSERT_EQUALS(6, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+ const DiskLoc prevChildBucket =
+ this->getKey(this->_helper.headManager.getHead(&txn), 1).prevChildBucket;
+ this->markKeyUnused(prevChildBucket, 0);
- // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
- ASSERT_EQUALS(0, unused);
+ long long unused = 0;
+ ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
- builder.checkStructure("{a:null,e:{c:{b:null},d:null},z:null}");
- }
- };
+ // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(1, unused);
- template<class OnDiskFormat>
- class DelInternalPromoteRightKey : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+ const BSONObj k = BSON(""
+ << "c");
+ ASSERT(this->unindex(k));
- builder.makeTree("{a:null,_:{e:{c:null},_:{f:null}}}");
+ unused = 0;
+ ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
- long long unused = 0;
- ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+ // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(1, unused);
- // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
- ASSERT_EQUALS(0, unused);
+        // checkStructure() does not discriminate between used and unused keys
+ builder.checkStructure("{a:null,b:null,d:null}");
+ }
+};
- const BSONObj k = BSON("" << "a");
- ASSERT(this->unindex(k));
+template <class OnDiskFormat>
+class DelInternalReplaceRight : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- unused = 0;
- ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+ builder.makeTree("{a:null,_:{b:null}}");
- // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL));
- ASSERT_EQUALS(0, unused);
+ long long unused = 0;
+ ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
- builder.checkStructure("{c:null,_:{e:null,f:null}}");
- }
- };
+ // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(0, unused);
- template<class OnDiskFormat>
- class DelInternalReplacementPrevNonNull : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+ const BSONObj k = BSON(""
+ << "a");
+ ASSERT(this->unindex(k));
- builder.makeTree("{a:null,d:{c:{b:null}},e:null}");
+ unused = 0;
+ ASSERT_EQUALS(1, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
- long long unused = 0;
- ASSERT_EQUALS(5, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+ // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(0, unused);
- // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
- ASSERT_EQUALS(0, unused);
+ builder.checkStructure("{b:null}");
+ }
+};
- const BSONObj k = BSON("" << "d");
- ASSERT(this->unindex(k));
+template <class OnDiskFormat>
+class DelInternalPromoteKey : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+ builder.makeTree("{a:null,y:{d:{c:{b:null}},_:{e:null}},z:null}");
- // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
- ASSERT_EQUALS(1, unused);
+ long long unused = 0;
+ ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
- builder.checkStructure("{a:null,d:{c:{b:null}},e:null}");
+ // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(6, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(0, unused);
- // Check 'unused' key
- ASSERT(this->getKey(this->_helper.headManager.getHead(&txn), 1).recordLoc.getOfs() & 1);
- }
- };
+ const BSONObj k = BSON(""
+ << "y");
+ ASSERT(this->unindex(k));
- template<class OnDiskFormat>
- class DelInternalReplacementNextNonNull : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+ unused = 0;
+ ASSERT_EQUALS(6, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
- builder.makeTree("{a:null,_:{c:null,_:{d:null}}}");
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(0, unused);
- long long unused = 0;
- ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+ builder.checkStructure("{a:null,e:{c:{b:null},d:null},z:null}");
+ }
+};
- // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
- ASSERT_EQUALS(0, unused);
-
- const BSONObj k = BSON("" << "a");
- ASSERT(this->unindex(k));
-
- ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
-
- // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
- ASSERT_EQUALS(1, unused);
-
- builder.checkStructure("{a:null,_:{c:null,_:{d:null}}}");
+template <class OnDiskFormat>
+class DelInternalPromoteRightKey : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- // Check 'unused' key
- ASSERT(this->getKey(this->_helper.headManager.getHead(&txn), 0).recordLoc.getOfs() & 1);
- }
- };
+ builder.makeTree("{a:null,_:{e:{c:null},_:{f:null}}}");
- template<class OnDiskFormat>
- class DelInternalSplitPromoteLeft : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+ long long unused = 0;
+ ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
- builder.makeTree("{$10:null,$20:null,$30$10:{$25:{$23:null},_:{$27:null}},"
- "$40:null,$50:null,$60:null,$70:null,$80:null,$90:null,$100:null}");
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(0, unused);
- long long unused = 0;
- ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+ const BSONObj k = BSON(""
+ << "a");
+ ASSERT(this->unindex(k));
- // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
- ASSERT_EQUALS(0, unused);
-
- const BSONObj k = BSON("" << bigNumString(0x30, 0x10));
- ASSERT(this->unindex(k));
-
- ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
-
- // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
- ASSERT_EQUALS(0, unused);
+ unused = 0;
+ ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
- builder.checkStructure("{$60:{$10:null,$20:null,"
- "$27:{$23:null,$25:null},$40:null,$50:null},"
- "_:{$70:null,$80:null,$90:null,$100:null}}");
- }
- };
+ // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(0, unused);
- template<class OnDiskFormat>
- class DelInternalSplitPromoteRight : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+ builder.checkStructure("{c:null,_:{e:null,f:null}}");
+ }
+};
- builder.makeTree("{$10:null,$20:null,$30:null,$40:null,$50:null,$60:null,$70:null,"
- "$80:null,$90:null,$100$10:{$95:{$93:null},_:{$97:null}}}");
+template <class OnDiskFormat>
+class DelInternalReplacementPrevNonNull : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
- long long unused = 0;
- ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+ builder.makeTree("{a:null,d:{c:{b:null}},e:null}");
- // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
- ASSERT_EQUALS(0, unused);
+ long long unused = 0;
+ ASSERT_EQUALS(5, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
- const BSONObj k = BSON("" << bigNumString(0x100, 0x10));
- ASSERT(this->unindex(k));
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(0, unused);
- ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+ const BSONObj k = BSON(""
+ << "d");
+ ASSERT(this->unindex(k));
- // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
- ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
- ASSERT_EQUALS(0, unused);
+ ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
- builder.checkStructure(
- "{$80:{$10:null,$20:null,$30:null,$40:null,$50:null,$60:null,$70:null},"
- "_:{$90:null,$97:{$93:null,$95:null}}}");
- }
- };
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(1, unused);
- template<class OnDiskFormat>
- class LocateEmptyForward : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- this->_helper.btree.initAsEmpty(&txn);
-
- BSONObj key1 = simpleKey('a');
- this->insert(key1, this->_helper.dummyDiskLoc);
- BSONObj key2 = simpleKey('b');
- this->insert(key2, this->_helper.dummyDiskLoc);
- BSONObj key3 = simpleKey('c');
- this->insert(key3, this->_helper.dummyDiskLoc);
-
- this->checkValidNumKeys(3);
- this->locate(BSONObj(), 0, false, this->_helper.headManager.getHead(&txn), 1);
- }
- };
+ builder.checkStructure("{a:null,d:{c:{b:null}},e:null}");
- template<class OnDiskFormat>
- class LocateEmptyReverse : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- this->_helper.btree.initAsEmpty(&txn);
-
- BSONObj key1 = simpleKey('a');
- this->insert(key1, this->_helper.dummyDiskLoc);
- BSONObj key2 = simpleKey('b');
- this->insert(key2, this->_helper.dummyDiskLoc);
- BSONObj key3 = simpleKey('c');
- this->insert(key3, this->_helper.dummyDiskLoc);
-
- this->checkValidNumKeys(3);
- this->locate(BSONObj(), -1, false, DiskLoc(), -1);
- }
- };
+        // Check that the key was marked 'unused' (low bit set in its recordLoc ofs)
+ ASSERT(this->getKey(this->_helper.headManager.getHead(&txn), 1).recordLoc.getOfs() & 1);
+ }
+};
- template<class OnDiskFormat>
- class DuplicateKeys : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- OperationContextNoop txn;
- this->_helper.btree.initAsEmpty(&txn);
-
- BSONObj key1 = simpleKey('z');
- ASSERT_OK(this->insert(key1, this->_helper.dummyDiskLoc, true));
- this->checkValidNumKeys(1);
- this->locate(key1, 0, true, this->_helper.headManager.getHead(&txn), 1);
-
- // Attempt to insert a dup key/value.
- ASSERT_EQUALS(ErrorCodes::DuplicateKeyValue,
- this->insert(key1, this->_helper.dummyDiskLoc, true));
- this->checkValidNumKeys(1);
- this->locate(key1, 0, true, this->_helper.headManager.getHead(&txn), 1);
-
- // Attempt to insert a dup key/value with dupsAllowed=false.
- ASSERT_EQUALS(ErrorCodes::DuplicateKeyValue,
- this->insert(key1, this->_helper.dummyDiskLoc, false));
- this->checkValidNumKeys(1);
- this->locate(key1, 0, true, this->_helper.headManager.getHead(&txn), 1);
-
- // Add another record to produce another diskloc.
- StatusWith<RecordId> s = this->_helper.recordStore.insertRecord(&txn, "a", 1, false);
-
- ASSERT_TRUE(s.isOK());
- ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL));
-
- const DiskLoc dummyDiskLoc2 = DiskLoc::fromRecordId(s.getValue());
-
- // Attempt to insert a dup key but this time with a different value.
- ASSERT_EQUALS(ErrorCodes::DuplicateKey, this->insert(key1, dummyDiskLoc2, false));
- this->checkValidNumKeys(1);
-
- // Insert a dup key with dupsAllowed=true, should succeed.
- ASSERT_OK(this->insert(key1, dummyDiskLoc2, true));
- this->checkValidNumKeys(2);
-
- // Clean up.
- this->_helper.recordStore.deleteRecord(&txn, s.getValue());
- ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL));
- }
- };
+template <class OnDiskFormat>
+class DelInternalReplacementNextNonNull : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+ builder.makeTree("{a:null,_:{c:null,_:{d:null}}}");
- /* This test requires the entire server to be linked-in and it is better implemented using
- the JS framework. Disabling here and will put in jsCore.
+ long long unused = 0;
+ ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
- template<class OnDiskFormat>
- class SignedZeroDuplication : public BtreeLogicTestBase<OnDiskFormat> {
- public:
- void run() {
- ASSERT_EQUALS(0.0, -0.0);
- DBDirectClient c;
-
- static const string ns("unittests.SignedZeroDuplication");
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(0, unused);
- c.ensureIndex(ns, BSON("b" << 1), true);
- c.insert(ns, BSON("b" << 0.0));
- c.insert(ns, BSON("b" << 1.0));
- c.update(ns, BSON("b" << 1.0), BSON("b" << -0.0));
+ const BSONObj k = BSON(""
+ << "a");
+ ASSERT(this->unindex(k));
- ASSERT_EQUALS(1U, c.count(ns, BSON("b" << 0.0)));
- }
- };
- */
+ ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(1, unused);
+
+ builder.checkStructure("{a:null,_:{c:null,_:{d:null}}}");
+
+        // Check that the key was marked 'unused' (low bit set in its recordLoc ofs)
+ ASSERT(this->getKey(this->_helper.headManager.getHead(&txn), 0).recordLoc.getOfs() & 1);
+ }
+};
+
+template <class OnDiskFormat>
+class DelInternalSplitPromoteLeft : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree(
+ "{$10:null,$20:null,$30$10:{$25:{$23:null},_:{$27:null}},"
+ "$40:null,$50:null,$60:null,$70:null,$80:null,$90:null,$100:null}");
+
+ long long unused = 0;
+ ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(0, unused);
+
+ const BSONObj k = BSON("" << bigNumString(0x30, 0x10));
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(0, unused);
+
+ builder.checkStructure(
+ "{$60:{$10:null,$20:null,"
+ "$27:{$23:null,$25:null},$40:null,$50:null},"
+ "_:{$70:null,$80:null,$90:null,$100:null}}");
+ }
+};
+
+template <class OnDiskFormat>
+class DelInternalSplitPromoteRight : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree(
+ "{$10:null,$20:null,$30:null,$40:null,$50:null,$60:null,$70:null,"
+ "$80:null,$90:null,$100$10:{$95:{$93:null},_:{$97:null}}}");
+
+ long long unused = 0;
+ ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(0, unused);
+
+ const BSONObj k = BSON("" << bigNumString(0x100, 0x10));
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, &unused, true, false, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
+ ASSERT_EQUALS(0, unused);
+
+ builder.checkStructure(
+ "{$80:{$10:null,$20:null,$30:null,$40:null,$50:null,$60:null,$70:null},"
+ "_:{$90:null,$97:{$93:null,$95:null}}}");
+ }
+};
+
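+// An empty query key sorts before every real key: a forward locate lands on the
+// first key of the head bucket, while the reverse variant below finds nothing.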
+template <class OnDiskFormat>
+class LocateEmptyForward : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ BSONObj key1 = simpleKey('a');
+ this->insert(key1, this->_helper.dummyDiskLoc);
+ BSONObj key2 = simpleKey('b');
+ this->insert(key2, this->_helper.dummyDiskLoc);
+ BSONObj key3 = simpleKey('c');
+ this->insert(key3, this->_helper.dummyDiskLoc);
+
+ this->checkValidNumKeys(3);
+ this->locate(BSONObj(), 0, false, this->_helper.headManager.getHead(&txn), 1);
+ }
+};
+
+template <class OnDiskFormat>
+class LocateEmptyReverse : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ BSONObj key1 = simpleKey('a');
+ this->insert(key1, this->_helper.dummyDiskLoc);
+ BSONObj key2 = simpleKey('b');
+ this->insert(key2, this->_helper.dummyDiskLoc);
+ BSONObj key3 = simpleKey('c');
+ this->insert(key3, this->_helper.dummyDiskLoc);
+
+ this->checkValidNumKeys(3);
+ this->locate(BSONObj(), -1, false, DiskLoc(), -1);
+ }
+};
+
+template <class OnDiskFormat>
+class DuplicateKeys : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ BSONObj key1 = simpleKey('z');
+ ASSERT_OK(this->insert(key1, this->_helper.dummyDiskLoc, true));
+ this->checkValidNumKeys(1);
+ this->locate(key1, 0, true, this->_helper.headManager.getHead(&txn), 1);
+
+ // Attempt to insert a dup key/value.
+ ASSERT_EQUALS(ErrorCodes::DuplicateKeyValue,
+ this->insert(key1, this->_helper.dummyDiskLoc, true));
+ this->checkValidNumKeys(1);
+ this->locate(key1, 0, true, this->_helper.headManager.getHead(&txn), 1);
+
+ // Attempt to insert a dup key/value with dupsAllowed=false.
+ ASSERT_EQUALS(ErrorCodes::DuplicateKeyValue,
+ this->insert(key1, this->_helper.dummyDiskLoc, false));
+ this->checkValidNumKeys(1);
+ this->locate(key1, 0, true, this->_helper.headManager.getHead(&txn), 1);
+
+ // Add another record to produce another diskloc.
+ StatusWith<RecordId> s = this->_helper.recordStore.insertRecord(&txn, "a", 1, false);
+
+ ASSERT_TRUE(s.isOK());
+ ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL));
+
+ const DiskLoc dummyDiskLoc2 = DiskLoc::fromRecordId(s.getValue());
+
+ // Attempt to insert a dup key but this time with a different value.
+ ASSERT_EQUALS(ErrorCodes::DuplicateKey, this->insert(key1, dummyDiskLoc2, false));
+ this->checkValidNumKeys(1);
+
+ // Insert a dup key with dupsAllowed=true, should succeed.
+ ASSERT_OK(this->insert(key1, dummyDiskLoc2, true));
+ this->checkValidNumKeys(2);
+
+ // Clean up.
+ this->_helper.recordStore.deleteRecord(&txn, s.getValue());
+ ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL));
+ }
+};
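
As exercised above, the two error codes draw a useful distinction: re-inserting an identical (key, DiskLoc) pair yields DuplicateKeyValue regardless of the dupsAllowed flag, while inserting the same key with a different DiskLoc yields DuplicateKey only when dupsAllowed is false, and succeeds when it is true.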
+
+
+/* This test requires the entire server to be linked-in and it is better implemented using
+ the JS framework. It is disabled here and will be added to jsCore.
+
+template<class OnDiskFormat>
+class SignedZeroDuplication : public BtreeLogicTestBase<OnDiskFormat> {
+public:
+ void run() {
+ ASSERT_EQUALS(0.0, -0.0);
+ DBDirectClient c;
+
+ static const string ns("unittests.SignedZeroDuplication");
+
+ c.ensureIndex(ns, BSON("b" << 1), true);
+ c.insert(ns, BSON("b" << 0.0));
+ c.insert(ns, BSON("b" << 1.0));
+ c.update(ns, BSON("b" << 1.0), BSON("b" << -0.0));
+
+ ASSERT_EQUALS(1U, c.count(ns, BSON("b" << 0.0)));
+ }
+};
+*/
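
For reference, the disabled test above hinges on a plain IEEE-754 property rather than anything btree-specific. A minimal standalone sketch (independent of any MongoDB code) of the comparison it relies on:

    #include <cassert>
    #include <cmath>

    int main() {
        double pos = 0.0;
        double neg = -0.0;
        assert(pos == neg);         // equal under operator==, hence a duplicate key
        assert(std::signbit(neg));  // even though the bit patterns differ
        assert(!std::signbit(pos));
        return 0;
    }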
/*
// QUERY_MIGRATION: port later
@@ -2217,111 +2391,107 @@ namespace mongo {
};
*/
- //
- // TEST SUITE DEFINITION
- //
-
- template<class OnDiskFormat>
- class BtreeLogicTestSuite : public unittest::Suite {
- public:
- BtreeLogicTestSuite(const std::string& name) : Suite(name) {
-
- }
-
- void setupTests() {
- add< SimpleCreate<OnDiskFormat> >();
- add< SimpleInsertDelete<OnDiskFormat> >();
- add< SplitRightHeavyBucket<OnDiskFormat> >();
- add< SplitLeftHeavyBucket<OnDiskFormat> >();
- add< MissingLocate<OnDiskFormat> >();
- add< MissingLocateMultiBucket<OnDiskFormat> >();
- add< SERVER983<OnDiskFormat> >();
- add< DontReuseUnused<OnDiskFormat> >();
- add< MergeBucketsLeft<OnDiskFormat> >();
- add< MergeBucketsRight<OnDiskFormat> >();
- add< MergeBucketsDontReplaceHead<OnDiskFormat> >();
- add< MergeBucketsDelInternal<OnDiskFormat> >();
- add< MergeBucketsRightNull<OnDiskFormat> >();
- add< DontMergeSingleBucket<OnDiskFormat> >();
- add< ParentMergeNonRightToLeft<OnDiskFormat> >();
- add< ParentMergeNonRightToRight<OnDiskFormat> >();
- add< CantMergeRightNoMerge<OnDiskFormat> >();
- add< CantMergeLeftNoMerge<OnDiskFormat> >();
- add< MergeOption<OnDiskFormat> >();
- add< ForceMergeLeft<OnDiskFormat> >();
- add< ForceMergeRight<OnDiskFormat> >();
- add< RecursiveMerge<OnDiskFormat> >();
- add< RecursiveMergeRightBucket<OnDiskFormat> >();
- add< RecursiveMergeDoubleRightBucket<OnDiskFormat> >();
-
- add< MergeSizeJustRightRight<OnDiskFormat> >();
- add< MergeSizeJustRightLeft<OnDiskFormat> >();
- add< MergeSizeRight<OnDiskFormat> >();
- add< MergeSizeLeft<OnDiskFormat> >();
- add< NoMergeBelowMarkRight<OnDiskFormat> >();
- add< NoMergeBelowMarkLeft<OnDiskFormat> >();
- add< MergeSizeRightTooBig<OnDiskFormat> >();
- add< MergeSizeLeftTooBig<OnDiskFormat> >();
- add< MergeRightEmpty<OnDiskFormat> >();
- add< MergeMinRightEmpty<OnDiskFormat> >();
- add< MergeLeftEmpty<OnDiskFormat> >();
- add< MergeMinLeftEmpty<OnDiskFormat> >();
- add< BalanceRightEmpty<OnDiskFormat> >();
- add< BalanceLeftEmpty<OnDiskFormat> >();
-
- add< BalanceOneLeftToRight<OnDiskFormat> >();
- add< BalanceOneRightToLeft<OnDiskFormat> >();
- add< BalanceThreeLeftToRight<OnDiskFormat> >();
- add< BalanceThreeRightToLeft<OnDiskFormat> >();
- add< BalanceSingleParentKey<OnDiskFormat> >();
-
- add< PackEmptyBucket<OnDiskFormat> >();
- add< PackedDataSizeEmptyBucket<OnDiskFormat> >();
-
- add< BalanceSingleParentKeyPackParent<OnDiskFormat> >();
- add< BalanceSplitParent<OnDiskFormat> >();
- add< EvenRebalanceLeft<OnDiskFormat> >();
- add< EvenRebalanceLeftCusp<OnDiskFormat> >();
- add< EvenRebalanceRight<OnDiskFormat> >();
- add< EvenRebalanceRightCusp<OnDiskFormat> >();
- add< EvenRebalanceCenter<OnDiskFormat> >();
- add< OddRebalanceLeft<OnDiskFormat> >();
- add< OddRebalanceRight<OnDiskFormat> >();
- add< OddRebalanceCenter<OnDiskFormat> >();
- add< RebalanceEmptyRight<OnDiskFormat> >();
- add< RebalanceEmptyLeft<OnDiskFormat> >();
-
- add< NoMoveAtLowWaterMarkRight<OnDiskFormat> >();
- add< MoveBelowLowWaterMarkRight<OnDiskFormat> >();
- add< NoMoveAtLowWaterMarkLeft<OnDiskFormat> >();
- add< MoveBelowLowWaterMarkLeft<OnDiskFormat> >();
-
- add< PreferBalanceLeft<OnDiskFormat> >();
- add< PreferBalanceRight<OnDiskFormat> >();
- add< RecursiveMergeThenBalance<OnDiskFormat> >();
- add< DelEmptyNoNeighbors<OnDiskFormat> >();
- add< DelEmptyEmptyNeighbors<OnDiskFormat> >();
- add< DelInternal<OnDiskFormat> >();
- add< DelInternalReplaceWithUnused<OnDiskFormat> >();
- add< DelInternalReplaceRight<OnDiskFormat> >();
- add< DelInternalPromoteKey<OnDiskFormat> >();
- add< DelInternalPromoteRightKey<OnDiskFormat> >();
- add< DelInternalReplacementPrevNonNull<OnDiskFormat> >();
- add< DelInternalReplacementNextNonNull<OnDiskFormat> >();
- add< DelInternalSplitPromoteLeft<OnDiskFormat> >();
- add< DelInternalSplitPromoteRight<OnDiskFormat> >();
-
- add< LocateEmptyForward<OnDiskFormat> >();
- add< LocateEmptyReverse<OnDiskFormat> >();
-
- add< DuplicateKeys<OnDiskFormat> >();
- }
- };
-
- // Test suite for both V0 and V1
- static unittest::SuiteInstance< BtreeLogicTestSuite<BtreeLayoutV0> > SUITE_V0(
- "BTreeLogicTests_V0");
+//
+// TEST SUITE DEFINITION
+//
- static unittest::SuiteInstance< BtreeLogicTestSuite<BtreeLayoutV1> > SUITE_V1(
- "BTreeLogicTests_V1");
+template <class OnDiskFormat>
+class BtreeLogicTestSuite : public unittest::Suite {
+public:
+ BtreeLogicTestSuite(const std::string& name) : Suite(name) {}
+
+ void setupTests() {
+ add<SimpleCreate<OnDiskFormat>>();
+ add<SimpleInsertDelete<OnDiskFormat>>();
+ add<SplitRightHeavyBucket<OnDiskFormat>>();
+ add<SplitLeftHeavyBucket<OnDiskFormat>>();
+ add<MissingLocate<OnDiskFormat>>();
+ add<MissingLocateMultiBucket<OnDiskFormat>>();
+ add<SERVER983<OnDiskFormat>>();
+ add<DontReuseUnused<OnDiskFormat>>();
+ add<MergeBucketsLeft<OnDiskFormat>>();
+ add<MergeBucketsRight<OnDiskFormat>>();
+ add<MergeBucketsDontReplaceHead<OnDiskFormat>>();
+ add<MergeBucketsDelInternal<OnDiskFormat>>();
+ add<MergeBucketsRightNull<OnDiskFormat>>();
+ add<DontMergeSingleBucket<OnDiskFormat>>();
+ add<ParentMergeNonRightToLeft<OnDiskFormat>>();
+ add<ParentMergeNonRightToRight<OnDiskFormat>>();
+ add<CantMergeRightNoMerge<OnDiskFormat>>();
+ add<CantMergeLeftNoMerge<OnDiskFormat>>();
+ add<MergeOption<OnDiskFormat>>();
+ add<ForceMergeLeft<OnDiskFormat>>();
+ add<ForceMergeRight<OnDiskFormat>>();
+ add<RecursiveMerge<OnDiskFormat>>();
+ add<RecursiveMergeRightBucket<OnDiskFormat>>();
+ add<RecursiveMergeDoubleRightBucket<OnDiskFormat>>();
+
+ add<MergeSizeJustRightRight<OnDiskFormat>>();
+ add<MergeSizeJustRightLeft<OnDiskFormat>>();
+ add<MergeSizeRight<OnDiskFormat>>();
+ add<MergeSizeLeft<OnDiskFormat>>();
+ add<NoMergeBelowMarkRight<OnDiskFormat>>();
+ add<NoMergeBelowMarkLeft<OnDiskFormat>>();
+ add<MergeSizeRightTooBig<OnDiskFormat>>();
+ add<MergeSizeLeftTooBig<OnDiskFormat>>();
+ add<MergeRightEmpty<OnDiskFormat>>();
+ add<MergeMinRightEmpty<OnDiskFormat>>();
+ add<MergeLeftEmpty<OnDiskFormat>>();
+ add<MergeMinLeftEmpty<OnDiskFormat>>();
+ add<BalanceRightEmpty<OnDiskFormat>>();
+ add<BalanceLeftEmpty<OnDiskFormat>>();
+
+ add<BalanceOneLeftToRight<OnDiskFormat>>();
+ add<BalanceOneRightToLeft<OnDiskFormat>>();
+ add<BalanceThreeLeftToRight<OnDiskFormat>>();
+ add<BalanceThreeRightToLeft<OnDiskFormat>>();
+ add<BalanceSingleParentKey<OnDiskFormat>>();
+
+ add<PackEmptyBucket<OnDiskFormat>>();
+ add<PackedDataSizeEmptyBucket<OnDiskFormat>>();
+
+ add<BalanceSingleParentKeyPackParent<OnDiskFormat>>();
+ add<BalanceSplitParent<OnDiskFormat>>();
+ add<EvenRebalanceLeft<OnDiskFormat>>();
+ add<EvenRebalanceLeftCusp<OnDiskFormat>>();
+ add<EvenRebalanceRight<OnDiskFormat>>();
+ add<EvenRebalanceRightCusp<OnDiskFormat>>();
+ add<EvenRebalanceCenter<OnDiskFormat>>();
+ add<OddRebalanceLeft<OnDiskFormat>>();
+ add<OddRebalanceRight<OnDiskFormat>>();
+ add<OddRebalanceCenter<OnDiskFormat>>();
+ add<RebalanceEmptyRight<OnDiskFormat>>();
+ add<RebalanceEmptyLeft<OnDiskFormat>>();
+
+ add<NoMoveAtLowWaterMarkRight<OnDiskFormat>>();
+ add<MoveBelowLowWaterMarkRight<OnDiskFormat>>();
+ add<NoMoveAtLowWaterMarkLeft<OnDiskFormat>>();
+ add<MoveBelowLowWaterMarkLeft<OnDiskFormat>>();
+
+ add<PreferBalanceLeft<OnDiskFormat>>();
+ add<PreferBalanceRight<OnDiskFormat>>();
+ add<RecursiveMergeThenBalance<OnDiskFormat>>();
+ add<DelEmptyNoNeighbors<OnDiskFormat>>();
+ add<DelEmptyEmptyNeighbors<OnDiskFormat>>();
+ add<DelInternal<OnDiskFormat>>();
+ add<DelInternalReplaceWithUnused<OnDiskFormat>>();
+ add<DelInternalReplaceRight<OnDiskFormat>>();
+ add<DelInternalPromoteKey<OnDiskFormat>>();
+ add<DelInternalPromoteRightKey<OnDiskFormat>>();
+ add<DelInternalReplacementPrevNonNull<OnDiskFormat>>();
+ add<DelInternalReplacementNextNonNull<OnDiskFormat>>();
+ add<DelInternalSplitPromoteLeft<OnDiskFormat>>();
+ add<DelInternalSplitPromoteRight<OnDiskFormat>>();
+
+ add<LocateEmptyForward<OnDiskFormat>>();
+ add<LocateEmptyReverse<OnDiskFormat>>();
+
+ add<DuplicateKeys<OnDiskFormat>>();
+ }
+};
+
+// Test suite for both V0 and V1
+static unittest::SuiteInstance<BtreeLogicTestSuite<BtreeLayoutV0>> SUITE_V0("BTreeLogicTests_V0");
+
+static unittest::SuiteInstance<BtreeLogicTestSuite<BtreeLayoutV1>> SUITE_V1("BTreeLogicTests_V1");
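
Because the suite is templated on the on-disk format, the identical test bodies validate both bucket layouts. As an illustration (BtreeLayoutV2 is a hypothetical name, not a real layout), covering a further format would be one more instantiation:

    static unittest::SuiteInstance<BtreeLogicTestSuite<BtreeLayoutV2>>
        SUITE_V2("BTreeLogicTests_V2");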
}
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.cpp
index 15997d5681c..91b7141e7ed 100644
--- a/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.cpp
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.cpp
@@ -37,23 +37,23 @@
namespace mongo {
- void DiskLoc56Bit::operator=(const DiskLoc& loc) {
- ofs = loc.getOfs();
- int la = loc.a();
- if (la == DiskLoc::max().a()) {
- invariant(ofs == DiskLoc::max().getOfs());
- la = OurMaxA;
- }
- invariant( la <= OurMaxA ); // must fit in 3 bytes
- if( la < 0 ) {
- if ( la != -1 ) {
- log() << "btree diskloc isn't negative 1: " << la << std::endl;
- invariant ( la == -1 );
- }
- la = 0;
- ofs = OurNullOfs;
+void DiskLoc56Bit::operator=(const DiskLoc& loc) {
+ ofs = loc.getOfs();
+ int la = loc.a();
+ if (la == DiskLoc::max().a()) {
+ invariant(ofs == DiskLoc::max().getOfs());
+ la = OurMaxA;
+ }
+ invariant(la <= OurMaxA); // must fit in 3 bytes
+ if (la < 0) {
+ if (la != -1) {
+ log() << "btree diskloc isn't negative 1: " << la << std::endl;
+ invariant(la == -1);
}
- memcpy(_a, &la, 3); // endian
+ la = 0;
+ ofs = OurNullOfs;
}
+ memcpy(_a, &la, 3); // endian
+}
} // namespace mongo
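
The operator= above packs the 32-bit file number into the three _a bytes, and the conversion back to DiskLoc (in btree_ondisk.h below) recovers it by reading four bytes starting one byte early and shifting the junk byte out. A standalone sketch of that round trip, assuming a little-endian host with a 4-byte unsigned, as the "// endian" comments in the real code do:

    #include <cassert>
    #include <cstring>

    int main() {
        int fileNum = 0x00ABCDEF;           // must fit in 3 bytes (<= OurMaxA)
        unsigned char a[4] = {0, 0, 0, 0};  // a[1..3] play the role of _a[0..2]

        std::memcpy(a + 1, &fileNum, 3);    // store the low 3 bytes, little-endian

        unsigned w;                         // assumes sizeof(unsigned) == 4
        std::memcpy(&w, a, 4);              // read starting one byte early...
        assert(static_cast<int>(w >> 8) == fileNum);  // ...and shift the pad byte out
        return 0;
    }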
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.h b/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.h
index a5ddec6bccd..3238ec64179 100644
--- a/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.h
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.h
@@ -34,337 +34,342 @@
namespace mongo {
- const int OldBucketSize = 8192;
+const int OldBucketSize = 8192;
+//
+// On-disk index format
+//
+
+#pragma pack(1)
+/**
+ * This is the fixed width data component for storage of a key within a bucket. It contains an
+ * offset pointer to the variable width bson data component. This may be 'unused'; see
+ * below.
+ *
+ * Why is this templated on Loc? Because V0 and V1 have different size DiskLoc(s) but otherwise
+ * the same layout.
+ */
+template <class LocType>
+struct FixedWidthKey {
//
- // On-disk index format
+ // Data
//
-#pragma pack(1)
/**
- * This is the fixed width data component for storage of a key within a bucket. It contains an
- * offset pointer to the variable width bson data component. This may be 'unused', please see
- * below.
- *
- * Why is this templated on Loc? Because V0 and V1 have different size DiskLoc(s) but otherwise
- * the same layout.
+ * The 'left' child bucket of this key. If this is the i-th key, it points to the child
+ * bucket at index i.
*/
- template <class LocType>
- struct FixedWidthKey {
- //
- // Data
- //
-
- /**
- * The 'left' child bucket of this key. If this is the i-th key, it points to the i index
- * child bucket.
- */
- LocType prevChildBucket;
-
- /**
- * The location of the record associated with this key.
- */
- LocType recordLoc;
-
- /**
- * Offset within current bucket of the variable width bson key for this _KeyNode.
- */
- unsigned short _kdo;
-
- //
- // Accessors / mutators
- //
-
- short keyDataOfs() const {
- return static_cast<short>(_kdo);
- }
+ LocType prevChildBucket;
- void setKeyDataOfs(short s) {
- _kdo = s;
- invariant(s>=0);
- }
+ /**
+ * The location of the record associated with this key.
+ */
+ LocType recordLoc;
- void setKeyDataOfsSavingUse(short s) {
- // XXX kill this func
- setKeyDataOfs(s);
- }
+ /**
+ * Offset within current bucket of the variable width bson key for this _KeyNode.
+ */
+ unsigned short _kdo;
- /**
- * Unused keys are not returned by read operations. Keys may be marked
- * as unused in cases where it is difficult to delete them while
- * maintaining the constraints required of a btree.
- *
- * Setting ofs to odd is the sentinel for unused, as real recordLoc's
- * are always even numbers. Note we need to keep its value basically
- * the same as we use the recordLoc as part of the key in the index
- * (to handle duplicate keys efficiently).
- *
- * Flagging keys as unused is a feature that is being phased out in favor
- * of deleting the keys outright. The current btree implementation is
- * not expected to mark a key as unused in a non legacy btree.
- */
- void setUnused() {
- recordLoc.GETOFS() |= 1;
- }
+ //
+ // Accessors / mutators
+ //
- void setUsed() { recordLoc.GETOFS() &= ~1; }
+ short keyDataOfs() const {
+ return static_cast<short>(_kdo);
+ }
- int isUnused() const {
- return recordLoc.getOfs() & 1;
- }
+ void setKeyDataOfs(short s) {
+ _kdo = s;
+ invariant(s >= 0);
+ }
- int isUsed() const {
- return !isUnused();
- }
- };
+ void setKeyDataOfsSavingUse(short s) {
+ // XXX kill this func
+ setKeyDataOfs(s);
+ }
/**
- * This structure represents header data for a btree bucket. An object of
- * this type is typically allocated inside of a buffer of size BucketSize,
- * resulting in a full bucket with an appropriate header.
+ * Unused keys are not returned by read operations. Keys may be marked
+ * as unused in cases where it is difficult to delete them while
+ * maintaining the constraints required of a btree.
*
- * The body of a btree bucket contains an array of _KeyNode objects starting
- * from its lowest indexed bytes and growing to higher indexed bytes. The
- * body also contains variable width bson keys, which are allocated from the
- * highest indexed bytes toward lower indexed bytes.
+ * Setting ofs to odd is the sentinel for unused, as real recordLoc's
+ * are always even numbers. Note we need to keep its value basically
+ * the same as we use the recordLoc as part of the key in the index
+ * (to handle duplicate keys efficiently).
*
- * |hhhh|kkkkkkk--------bbbbbbbbbbbuuubbbuubbb|
- * h = header data
- * k = KeyNode data
- * - = empty space
- * b = bson key data
- * u = unused (old) bson key data, that may be garbage collected
+ * Flagging keys as unused is a feature that is being phased out in favor
+ * of deleting the keys outright. The current btree implementation is
+ * not expected to mark a key as unused in a non legacy btree.
*/
- struct BtreeBucketV0 {
- /**
- * Parent bucket of this bucket, which isNull() for the root bucket.
- */
- DiskLoc parent;
+ void setUnused() {
+ recordLoc.GETOFS() |= 1;
+ }
- /**
- * Given that there are n keys, this is the n index child.
- */
- DiskLoc nextChild;
+ void setUsed() {
+ recordLoc.GETOFS() &= ~1;
+ }
- /**
- * Can be reused, value is 8192 in current pdfile version Apr2010
- */
- unsigned short _wasSize;
+ int isUnused() const {
+ return recordLoc.getOfs() & 1;
+ }
- /**
- * zero
- */
- unsigned short _reserved1;
+ int isUsed() const {
+ return !isUnused();
+ }
+};
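
A tiny sketch of the sentinel trick documented above, with a plain int standing in for a record offset (record offsets are always even, so the low bit is free to carry the 'unused' flag):

    #include <cassert>

    int main() {
        int ofs = 0x1000;       // a real (even) record offset
        ofs |= 1;               // setUnused()
        assert(ofs & 1);        // isUnused()
        ofs &= ~1;              // setUsed()
        assert(ofs == 0x1000);  // the original offset is recovered intact
        return 0;
    }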
- int flags;
+/**
+ * This structure represents header data for a btree bucket. An object of
+ * this type is typically allocated inside of a buffer of size BucketSize,
+ * resulting in a full bucket with an appropriate header.
+ *
+ * The body of a btree bucket contains an array of _KeyNode objects starting
+ * from its lowest indexed bytes and growing to higher indexed bytes. The
+ * body also contains variable width bson keys, which are allocated from the
+ * highest indexed bytes toward lower indexed bytes.
+ *
+ * |hhhh|kkkkkkk--------bbbbbbbbbbbuuubbbuubbb|
+ * h = header data
+ * k = KeyNode data
+ * - = empty space
+ * b = bson key data
+ * u = unused (old) bson key data, that may be garbage collected
+ */
+struct BtreeBucketV0 {
+ /**
+ * Parent bucket of this bucket, which isNull() for the root bucket.
+ */
+ DiskLoc parent;
- /** basicInsert() assumes the next three members are consecutive and in this order: */
+ /**
+ * Given that there are n keys, this is the child at index n (i.e. the rightmost child).
+ */
+ DiskLoc nextChild;
- /** Size of the empty region. */
- int emptySize;
+ /**
+ * Can be reused, value is 8192 in current pdfile version Apr2010
+ */
+ unsigned short _wasSize;
- /** Size used for bson storage, including storage of old keys. */
- int topSize;
+ /**
+ * zero
+ */
+ unsigned short _reserved1;
- /* Number of keys in the bucket. */
- int n;
+ int flags;
- int reserved;
+ /** basicInsert() assumes the next three members are consecutive and in this order: */
- /* Beginning of the bucket's body */
- char data[4];
+ /** Size of the empty region. */
+ int emptySize;
- // Precalculated size constants
- enum { HeaderSize = 40 };
- };
+ /** Size used for bson storage, including storage of old keys. */
+ int topSize;
- // BtreeBucketV0 is part of the on-disk format, so it should never be changed
- BOOST_STATIC_ASSERT(
- sizeof(BtreeBucketV0) - sizeof(static_cast<BtreeBucketV0*>(NULL)->data)
- == BtreeBucketV0::HeaderSize);
+ /* Number of keys in the bucket. */
+ int n;
- /**
- * A variant of DiskLoc Used by the V1 bucket type.
- */
- struct DiskLoc56Bit {
- //
- // Data
- //
+ int reserved;
- int ofs;
+ /* Beginning of the bucket's body */
+ char data[4];
- unsigned char _a[3];
+ // Precalculated size constants
+ enum { HeaderSize = 40 };
+};
- //
- // Accessors XXX rename these, this is terrible
- //
+// BtreeBucketV0 is part of the on-disk format, so it should never be changed
+BOOST_STATIC_ASSERT(sizeof(BtreeBucketV0) - sizeof(static_cast<BtreeBucketV0*>(NULL)->data) ==
+ BtreeBucketV0::HeaderSize);
- int& GETOFS() { return ofs; }
+/**
+ * A variant of DiskLoc used by the V1 bucket type.
+ */
+struct DiskLoc56Bit {
+ //
+ // Data
+ //
- int getOfs() const { return ofs; }
+ int ofs;
- //
- // Comparison
- //
+ unsigned char _a[3];
- bool isNull() const { return ofs < 0; }
+ //
+ // Accessors XXX rename these, this is terrible
+ //
- unsigned long long toLongLong() const {
- // endian
- unsigned long long result = ofs;
- char* cursor = reinterpret_cast<char *>(&result);
- *reinterpret_cast<uint16_t*>(cursor + 4) = *reinterpret_cast<const uint16_t*>(&_a[0]);
- *reinterpret_cast<uint8_t*>(cursor + 6) = *reinterpret_cast<const uint8_t*>(&_a[2]);
- *reinterpret_cast<uint8_t*>(cursor + 7) = uint8_t(0);
- return result;
- }
+ int& GETOFS() {
+ return ofs;
+ }
- bool operator<(const DiskLoc56Bit& rhs) const {
- // the orderering of dup keys in btrees isn't too critical, but we'd like to put items
- // that are close together on disk close together in the tree, so we do want the file #
- // to be the most significant bytes
- return toLongLong() < rhs.toLongLong();
- }
+ int getOfs() const {
+ return ofs;
+ }
- int compare(const DiskLoc56Bit& rhs) const {
- unsigned long long a = toLongLong();
- unsigned long long b = rhs.toLongLong();
- if ( a < b ) {
- return -1;
- }
- else {
- return a == b ? 0 : 1;
- }
- }
+ //
+ // Comparison
+ //
- bool operator==(const DiskLoc56Bit& rhs) const {
- return toLongLong() == rhs.toLongLong();
+ bool isNull() const {
+ return ofs < 0;
+ }
+
+ unsigned long long toLongLong() const {
+ // endian
+ unsigned long long result = ofs;
+ char* cursor = reinterpret_cast<char*>(&result);
+ *reinterpret_cast<uint16_t*>(cursor + 4) = *reinterpret_cast<const uint16_t*>(&_a[0]);
+ *reinterpret_cast<uint8_t*>(cursor + 6) = *reinterpret_cast<const uint8_t*>(&_a[2]);
+ *reinterpret_cast<uint8_t*>(cursor + 7) = uint8_t(0);
+ return result;
+ }
+
+ bool operator<(const DiskLoc56Bit& rhs) const {
+ // the ordering of dup keys in btrees isn't too critical, but we'd like to put items
+ // that are close together on disk close together in the tree, so we do want the file #
+ // to be the most significant bytes
+ return toLongLong() < rhs.toLongLong();
+ }
+
+ int compare(const DiskLoc56Bit& rhs) const {
+ unsigned long long a = toLongLong();
+ unsigned long long b = rhs.toLongLong();
+ if (a < b) {
+ return -1;
+ } else {
+ return a == b ? 0 : 1;
}
+ }
- bool operator!=(const DiskLoc56Bit& rhs) const {
- return toLongLong() != rhs.toLongLong();
- }
+ bool operator==(const DiskLoc56Bit& rhs) const {
+ return toLongLong() == rhs.toLongLong();
+ }
- bool operator==(const DiskLoc& rhs) const {
- return DiskLoc(*this) == rhs;
- }
+ bool operator!=(const DiskLoc56Bit& rhs) const {
+ return toLongLong() != rhs.toLongLong();
+ }
- bool operator!=(const DiskLoc& rhs) const {
- return !(*this==rhs);
- }
+ bool operator==(const DiskLoc& rhs) const {
+ return DiskLoc(*this) == rhs;
+ }
- //
- // Mutation
- //
+ bool operator!=(const DiskLoc& rhs) const {
+ return !(*this == rhs);
+ }
- enum {
- OurNullOfs = -2, // first bit of offsets used in _KeyNode we don't use -1 here
- OurMaxA = 0xffffff, // highest 3-byte value
- };
+ //
+ // Mutation
+ //
- void Null() {
- ofs = OurNullOfs;
- _a[0] = _a[1] = _a[2] = 0;
- }
+ enum {
+ OurNullOfs = -2, // the low bit of _KeyNode offsets flags 'unused', so we don't use -1 here
+ OurMaxA = 0xffffff, // highest 3-byte value
+ };
- void operator=(const DiskLoc& loc);
+ void Null() {
+ ofs = OurNullOfs;
+ _a[0] = _a[1] = _a[2] = 0;
+ }
- //
- // Type Conversion
- //
+ void operator=(const DiskLoc& loc);
- RecordId toRecordId() const {
- return DiskLoc(*this).toRecordId();
- }
+ //
+ // Type Conversion
+ //
- operator DiskLoc() const {
- // endian
- if( isNull() ) return DiskLoc();
- unsigned a = *((unsigned *) (_a-1));
- return DiskLoc(a >> 8, ofs);
- }
+ RecordId toRecordId() const {
+ return DiskLoc(*this).toRecordId();
+ }
- std::string toString() const { return DiskLoc(*this).toString(); }
- };
+ operator DiskLoc() const {
+ // endian
+ if (isNull())
+ return DiskLoc();
+ unsigned a = *((unsigned*)(_a - 1));
+ return DiskLoc(a >> 8, ofs);
+ }
- struct BtreeBucketV1 {
- /** Parent bucket of this bucket, which isNull() for the root bucket. */
- DiskLoc56Bit parent;
+ std::string toString() const {
+ return DiskLoc(*this).toString();
+ }
+};
- /** Given that there are n keys, this is the n index child. */
- DiskLoc56Bit nextChild;
+struct BtreeBucketV1 {
+ /** Parent bucket of this bucket, which isNull() for the root bucket. */
+ DiskLoc56Bit parent;
- unsigned short flags;
+ /** Given that there are n keys, this is the child at index n (i.e. the rightmost child). */
+ DiskLoc56Bit nextChild;
- /** Size of the empty region. */
- unsigned short emptySize;
+ unsigned short flags;
- /** Size used for bson storage, including storage of old keys. */
- unsigned short topSize;
+ /** Size of the empty region. */
+ unsigned short emptySize;
- /* Number of keys in the bucket. */
- unsigned short n;
+ /** Size used for bson storage, including storage of old keys. */
+ unsigned short topSize;
- /* Beginning of the bucket's body */
- char data[4];
+ /* Number of keys in the bucket. */
+ unsigned short n;
- // Precalculated size constants
- enum { HeaderSize = 22 };
- };
+ /* Beginning of the bucket's body */
+ char data[4];
- // BtreeBucketV1 is part of the on-disk format, so it should never be changed
- BOOST_STATIC_ASSERT(
- sizeof(BtreeBucketV1) - sizeof(static_cast<BtreeBucketV1*>(NULL)->data)
- == BtreeBucketV1::HeaderSize);
+ // Precalculated size constants
+ enum { HeaderSize = 22 };
+};
- enum Flags {
- Packed = 1
- };
+// BtreeBucketV1 is part of the on-disk format, so it should never be changed
+BOOST_STATIC_ASSERT(sizeof(BtreeBucketV1) - sizeof(static_cast<BtreeBucketV1*>(NULL)->data) ==
+ BtreeBucketV1::HeaderSize);
- struct BtreeLayoutV0 {
- typedef FixedWidthKey<DiskLoc> FixedWidthKeyType;
- typedef DiskLoc LocType;
- typedef KeyBson KeyType;
- typedef KeyBson KeyOwnedType;
- typedef BtreeBucketV0 BucketType;
+enum Flags { Packed = 1 };
- enum { BucketSize = 8192,
- BucketBodySize = BucketSize - BucketType::HeaderSize
- };
+struct BtreeLayoutV0 {
+ typedef FixedWidthKey<DiskLoc> FixedWidthKeyType;
+ typedef DiskLoc LocType;
+ typedef KeyBson KeyType;
+ typedef KeyBson KeyOwnedType;
+ typedef BtreeBucketV0 BucketType;
- // largest key size we allow. note we very much need to support bigger keys (somehow) in
- // the future.
+ enum { BucketSize = 8192, BucketBodySize = BucketSize - BucketType::HeaderSize };
- static const int KeyMax = OldBucketSize / 10;
+ // largest key size we allow. note we very much need to support bigger keys (somehow) in
+ // the future.
- // A sentinel value sometimes used to identify a deallocated bucket.
- static const int INVALID_N_SENTINEL = -1;
+ static const int KeyMax = OldBucketSize / 10;
- static void initBucket(BucketType* bucket) {
- bucket->_reserved1 = 0;
- bucket->_wasSize = BucketSize;
- bucket->reserved = 0;
- }
- };
+ // A sentinel value sometimes used to identify a deallocated bucket.
+ static const int INVALID_N_SENTINEL = -1;
+
+ static void initBucket(BucketType* bucket) {
+ bucket->_reserved1 = 0;
+ bucket->_wasSize = BucketSize;
+ bucket->reserved = 0;
+ }
+};
- struct BtreeLayoutV1 {
- typedef FixedWidthKey<DiskLoc56Bit> FixedWidthKeyType;
- typedef KeyV1 KeyType;
- typedef KeyV1Owned KeyOwnedType;
- typedef DiskLoc56Bit LocType;
- typedef BtreeBucketV1 BucketType;
+struct BtreeLayoutV1 {
+ typedef FixedWidthKey<DiskLoc56Bit> FixedWidthKeyType;
+ typedef KeyV1 KeyType;
+ typedef KeyV1Owned KeyOwnedType;
+ typedef DiskLoc56Bit LocType;
+ typedef BtreeBucketV1 BucketType;
- enum { BucketSize = 8192 - 16, // The -16 is to leave room for the MmapV1RecordHeader header
- BucketBodySize = BucketSize - BucketType::HeaderSize
- };
+ enum {
+ BucketSize = 8192 - 16, // The -16 is to leave room for the MmapV1RecordHeader header
+ BucketBodySize = BucketSize - BucketType::HeaderSize
+ };
- static const int KeyMax = 1024;
+ static const int KeyMax = 1024;
- // A sentinel value sometimes used to identify a deallocated bucket.
- static const unsigned short INVALID_N_SENTINEL = 0xffff;
+ // A sentinel value sometimes used to identify a deallocated bucket.
+ static const unsigned short INVALID_N_SENTINEL = 0xffff;
- static void initBucket(BucketType* bucket) { }
- };
+ static void initBucket(BucketType* bucket) {}
+};
#pragma pack()
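
A quick arithmetic check of the layout constants above: V1 shrinks the bucket by 16 bytes to leave room for the MmapV1RecordHeader, but its smaller header still nets slightly more body space per bucket:

    #include <cassert>

    int main() {
        const int v0Body = 8192 - 40;         // BtreeLayoutV0::BucketBodySize == 8152
        const int v1Body = (8192 - 16) - 22;  // BtreeLayoutV1::BucketBodySize == 8154
        assert(v0Body == 8152 && v1Body == 8154);
        return 0;
    }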
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_test_help.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_test_help.cpp
index fe0cdf7e82e..760095898be 100644
--- a/src/mongo/db/storage/mmap_v1/btree/btree_test_help.cpp
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_test_help.cpp
@@ -37,210 +37,203 @@
namespace mongo {
- using std::string;
-
- string bigNumString(long long n, int len) {
- char sub[17];
- sprintf(sub, "%.16llx", n);
- string val(len, ' ');
- for (int i = 0; i < len; ++i) {
- val[i] = sub[i % 16];
- }
- return val;
- }
-
- BSONObj simpleKey(char c, int n) {
- BSONObjBuilder builder;
- string val(n, c);
- builder.append("a", val);
- return builder.obj();
+using std::string;
+
+string bigNumString(long long n, int len) {
+ char sub[17];
+ sprintf(sub, "%.16llx", n);
+ string val(len, ' ');
+ for (int i = 0; i < len; ++i) {
+ val[i] = sub[i % 16];
}
+ return val;
+}
- //
- // BtreeLogicTestHelper
- //
-
- template <class OnDiskFormat>
- BtreeLogicTestHelper<OnDiskFormat>::BtreeLogicTestHelper(const BSONObj& order)
- : recordStore("TestRecordStore"),
- btree(&headManager,
- &recordStore,
- &cursorRegistry,
- Ordering::make(order),
- "TestIndex") {
- static const string randomData("RandomStuff");
-
- // Generate a valid record location for a "fake" record, which we will repeatedly use
- // thoughout the tests.
- OperationContextNoop txn;
- StatusWith<RecordId> s =
- recordStore.insertRecord(&txn, randomData.c_str(), randomData.length(), false);
-
- ASSERT_TRUE(s.isOK());
- ASSERT_EQUALS(1, recordStore.numRecords(NULL));
-
- dummyDiskLoc = DiskLoc::fromRecordId(s.getValue());
- }
+BSONObj simpleKey(char c, int n) {
+ BSONObjBuilder builder;
+ string val(n, c);
+ builder.append("a", val);
+ return builder.obj();
+}
+//
+// BtreeLogicTestHelper
+//
- //
- // ArtificialTreeBuilder
- //
+template <class OnDiskFormat>
+BtreeLogicTestHelper<OnDiskFormat>::BtreeLogicTestHelper(const BSONObj& order)
+ : recordStore("TestRecordStore"),
+ btree(&headManager, &recordStore, &cursorRegistry, Ordering::make(order), "TestIndex") {
+ static const string randomData("RandomStuff");
- template <class OnDiskFormat>
- void ArtificialTreeBuilder<OnDiskFormat>::makeTree(const string &spec) {
- _helper->headManager.setHead(_txn, makeTree(fromjson(spec)).toRecordId());
- }
+ // Generate a valid record location for a "fake" record, which we will repeatedly use
+ // throughout the tests.
+ OperationContextNoop txn;
+ StatusWith<RecordId> s =
+ recordStore.insertRecord(&txn, randomData.c_str(), randomData.length(), false);
- template <class OnDiskFormat>
- DiskLoc ArtificialTreeBuilder<OnDiskFormat>::makeTree(const BSONObj &spec) {
- DiskLoc bucketLoc = _helper->btree._addBucket(_txn);
- BucketType* bucket = _helper->btree.getBucket(_txn, bucketLoc);
-
- BSONObjIterator i(spec);
- while (i.more()) {
- BSONElement e = i.next();
- DiskLoc child;
- if (e.type() == Object) {
- child = makeTree(e.embeddedObject());
- }
-
- if (e.fieldName() == string("_")) {
- bucket->nextChild = child;
- }
- else {
- KeyDataOwnedType key(BSON("" << expectedKey(e.fieldName())));
- invariant(_helper->btree.pushBack(bucket, _helper->dummyDiskLoc, key, child));
- }
- }
+ ASSERT_TRUE(s.isOK());
+ ASSERT_EQUALS(1, recordStore.numRecords(NULL));
- _helper->btree.fixParentPtrs(_txn, bucket, bucketLoc);
- return bucketLoc;
- }
+ dummyDiskLoc = DiskLoc::fromRecordId(s.getValue());
+}
- template <class OnDiskFormat>
- void ArtificialTreeBuilder<OnDiskFormat>::checkStructure(const string &spec) const {
- checkStructure(fromjson(spec), DiskLoc::fromRecordId(_helper->headManager.getHead(_txn)));
- }
- template <class OnDiskFormat>
- void ArtificialTreeBuilder<OnDiskFormat>::push(
- const DiskLoc bucketLoc, const BSONObj& key, const DiskLoc child) {
- KeyDataOwnedType k(key);
- BucketType* bucket = _helper->btree.getBucket(_txn, bucketLoc);
+//
+// ArtificialTreeBuilder
+//
- invariant(_helper->btree.pushBack(bucket, _helper->dummyDiskLoc, k, child));
- _helper->btree.fixParentPtrs(_txn, bucket, bucketLoc);
- }
+template <class OnDiskFormat>
+void ArtificialTreeBuilder<OnDiskFormat>::makeTree(const string& spec) {
+ _helper->headManager.setHead(_txn, makeTree(fromjson(spec)).toRecordId());
+}
- template <class OnDiskFormat>
- void ArtificialTreeBuilder<OnDiskFormat>::checkStructure(
- const BSONObj &spec, const DiskLoc node) const {
- BucketType* bucket = _helper->btree.getBucket(_txn, node);
-
- BSONObjIterator j(spec);
- for (int i = 0; i < bucket->n; ++i) {
- ASSERT(j.more());
- BSONElement e = j.next();
- KeyHeaderType kn = BtreeLogic<OnDiskFormat>::getKeyHeader(bucket, i);
- string expected = expectedKey(e.fieldName());
- ASSERT(isPresent(BSON("" << expected), 1));
- ASSERT(isPresent(BSON("" << expected), -1));
-
- // ASSERT_EQUALS(expected, kn.key.toBson().firstElement().valuestr());
- if (kn.prevChildBucket.isNull()) {
- ASSERT(e.type() == jstNULL);
- }
- else {
- ASSERT(e.type() == Object);
- checkStructure(e.embeddedObject(), kn.prevChildBucket);
- }
+template <class OnDiskFormat>
+DiskLoc ArtificialTreeBuilder<OnDiskFormat>::makeTree(const BSONObj& spec) {
+ DiskLoc bucketLoc = _helper->btree._addBucket(_txn);
+ BucketType* bucket = _helper->btree.getBucket(_txn, bucketLoc);
+
+ BSONObjIterator i(spec);
+ while (i.more()) {
+ BSONElement e = i.next();
+ DiskLoc child;
+ if (e.type() == Object) {
+ child = makeTree(e.embeddedObject());
}
- if (bucket->nextChild.isNull()) {
- // maybe should allow '_' field with null value?
- ASSERT(!j.more());
- }
- else {
- BSONElement e = j.next();
- ASSERT_EQUALS(string("_"), e.fieldName());
- ASSERT(e.type() == Object);
- checkStructure(e.embeddedObject(), bucket->nextChild);
- }
- ASSERT(!j.more());
- }
- template <class OnDiskFormat>
- bool ArtificialTreeBuilder<OnDiskFormat>::isPresent(const BSONObj &key, int direction) const {
- int pos;
- DiskLoc loc;
- OperationContextNoop txn;
- return _helper->btree.locate(&txn, key, _helper->dummyDiskLoc, direction, &pos, &loc);
+ if (e.fieldName() == string("_")) {
+ bucket->nextChild = child;
+ } else {
+ KeyDataOwnedType key(BSON("" << expectedKey(e.fieldName())));
+ invariant(_helper->btree.pushBack(bucket, _helper->dummyDiskLoc, key, child));
+ }
}
- // Static
- template <class OnDiskFormat>
- string ArtificialTreeBuilder<OnDiskFormat>::expectedKey(const char *spec) {
- if (spec[0] != '$') {
- return spec;
- }
- char *endPtr;
+ _helper->btree.fixParentPtrs(_txn, bucket, bucketLoc);
+ return bucketLoc;
+}
- // parsing a long long is a pain, so just allow shorter keys for now
- unsigned long long num = strtol(spec + 1, &endPtr, 16);
- int len = 800;
- if (*endPtr == '$') {
- len = strtol(endPtr + 1, 0, 16);
- }
+template <class OnDiskFormat>
+void ArtificialTreeBuilder<OnDiskFormat>::checkStructure(const string& spec) const {
+ checkStructure(fromjson(spec), DiskLoc::fromRecordId(_helper->headManager.getHead(_txn)));
+}
- return bigNumString(num, len);
- }
+template <class OnDiskFormat>
+void ArtificialTreeBuilder<OnDiskFormat>::push(const DiskLoc bucketLoc,
+ const BSONObj& key,
+ const DiskLoc child) {
+ KeyDataOwnedType k(key);
+ BucketType* bucket = _helper->btree.getBucket(_txn, bucketLoc);
- template <class OnDiskFormat>
- int ArtificialTreeBuilder<OnDiskFormat>::fillBucketToExactSize(
- const DiskLoc bucketLoc, int targetSize, char startKey) {
- ASSERT_FALSE(bucketLoc.isNull());
+ invariant(_helper->btree.pushBack(bucket, _helper->dummyDiskLoc, k, child));
+ _helper->btree.fixParentPtrs(_txn, bucket, bucketLoc);
+}
- BucketType* bucket = _helper->btree.getBucket(_txn, bucketLoc);
- ASSERT_EQUALS(0, bucket->n);
+template <class OnDiskFormat>
+void ArtificialTreeBuilder<OnDiskFormat>::checkStructure(const BSONObj& spec,
+ const DiskLoc node) const {
+ BucketType* bucket = _helper->btree.getBucket(_txn, node);
+
+ BSONObjIterator j(spec);
+ for (int i = 0; i < bucket->n; ++i) {
+ ASSERT(j.more());
+ BSONElement e = j.next();
+ KeyHeaderType kn = BtreeLogic<OnDiskFormat>::getKeyHeader(bucket, i);
+ string expected = expectedKey(e.fieldName());
+ ASSERT(isPresent(BSON("" << expected), 1));
+ ASSERT(isPresent(BSON("" << expected), -1));
+
+ // ASSERT_EQUALS(expected, kn.key.toBson().firstElement().valuestr());
+ if (kn.prevChildBucket.isNull()) {
+ ASSERT(e.type() == jstNULL);
+ } else {
+ ASSERT(e.type() == Object);
+ checkStructure(e.embeddedObject(), kn.prevChildBucket);
+ }
+ }
+ if (bucket->nextChild.isNull()) {
+ // maybe should allow '_' field with null value?
+ ASSERT(!j.more());
+ } else {
+ BSONElement e = j.next();
+ ASSERT_EQUALS(string("_"), e.fieldName());
+ ASSERT(e.type() == Object);
+ checkStructure(e.embeddedObject(), bucket->nextChild);
+ }
+ ASSERT(!j.more());
+}
- static const int bigSize = KeyDataOwnedType(simpleKey('a', 801)).dataSize();
+template <class OnDiskFormat>
+bool ArtificialTreeBuilder<OnDiskFormat>::isPresent(const BSONObj& key, int direction) const {
+ int pos;
+ DiskLoc loc;
+ OperationContextNoop txn;
+ return _helper->btree.locate(&txn, key, _helper->dummyDiskLoc, direction, &pos, &loc);
+}
- int size = 0;
- int keyCount = 0;
- while (size < targetSize) {
- int space = targetSize - size;
- int nextSize = space - sizeof(FixedWidthKeyType);
- verify(nextSize > 0);
+// Static
+template <class OnDiskFormat>
+string ArtificialTreeBuilder<OnDiskFormat>::expectedKey(const char* spec) {
+ if (spec[0] != '$') {
+ return spec;
+ }
+ char* endPtr;
- BSONObj newKey;
- if (nextSize >= bigSize) {
- newKey = simpleKey(startKey++, 801);
- }
- else {
- newKey = simpleKey(startKey++, nextSize - (bigSize - 801));
- }
+ // parsing a long long is a pain, so just allow shorter keys for now
+ unsigned long long num = strtol(spec + 1, &endPtr, 16);
+ int len = 800;
+ if (*endPtr == '$') {
+ len = strtol(endPtr + 1, 0, 16);
+ }
- push(bucketLoc, newKey, DiskLoc());
+ return bigNumString(num, len);
+}
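
For example, the spec fragment "$30$10" seen in the tree-shape tests decodes as value 0x30 with length 0x10, i.e. bigNumString(0x30, 16). A standalone reimplementation (for illustration only) showing the resulting key string:

    #include <cstdio>
    #include <string>

    static std::string bigNumStringSketch(long long n, int len) {
        char sub[17];
        std::snprintf(sub, sizeof(sub), "%.16llx", n);  // 16 hex digits
        std::string val(len, ' ');
        for (int i = 0; i < len; ++i)
            val[i] = sub[i % 16];  // repeat the 16-char pattern to the requested length
        return val;
    }

    int main() {
        // prints "0000000000000030"
        std::printf("%s\n", bigNumStringSketch(0x30, 0x10).c_str());
        return 0;
    }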
- size += KeyDataOwnedType(newKey).dataSize() +
- sizeof(FixedWidthKeyType);
- keyCount += 1;
+template <class OnDiskFormat>
+int ArtificialTreeBuilder<OnDiskFormat>::fillBucketToExactSize(const DiskLoc bucketLoc,
+ int targetSize,
+ char startKey) {
+ ASSERT_FALSE(bucketLoc.isNull());
+
+ BucketType* bucket = _helper->btree.getBucket(_txn, bucketLoc);
+ ASSERT_EQUALS(0, bucket->n);
+
+ static const int bigSize = KeyDataOwnedType(simpleKey('a', 801)).dataSize();
+
+ int size = 0;
+ int keyCount = 0;
+ while (size < targetSize) {
+ int space = targetSize - size;
+ int nextSize = space - sizeof(FixedWidthKeyType);
+ verify(nextSize > 0);
+
+ BSONObj newKey;
+ if (nextSize >= bigSize) {
+ newKey = simpleKey(startKey++, 801);
+ } else {
+ newKey = simpleKey(startKey++, nextSize - (bigSize - 801));
}
- ASSERT_EQUALS(_helper->btree._packedDataSize(bucket, 0), targetSize);
+ push(bucketLoc, newKey, DiskLoc());
- return keyCount;
+ size += KeyDataOwnedType(newKey).dataSize() + sizeof(FixedWidthKeyType);
+ keyCount += 1;
}
- //
- // This causes actual code to be generated for the usages of the templates in this file.
- //
+ ASSERT_EQUALS(_helper->btree._packedDataSize(bucket, 0), targetSize);
+
+ return keyCount;
+}
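
The sizing arithmetic above works because, as the code relies on, a key's dataSize() grows linearly with its string length: bigSize is the dataSize of an 801-character key, so bigSize - 801 is the fixed per-key bson overhead, and requesting a string of nextSize - (bigSize - 801) characters yields a key whose dataSize is exactly nextSize. Each iteration therefore consumes precisely dataSize + sizeof(FixedWidthKeyType) = space bytes, and the loop lands on targetSize exactly.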
+
+//
+// This causes actual code to be generated for the usages of the templates in this file.
+//
- // V0 format.
- template struct BtreeLogicTestHelper<BtreeLayoutV0>;
- template class ArtificialTreeBuilder<BtreeLayoutV0>;
+// V0 format.
+template struct BtreeLogicTestHelper<BtreeLayoutV0>;
+template class ArtificialTreeBuilder<BtreeLayoutV0>;
- // V1 format.
- template struct BtreeLogicTestHelper<BtreeLayoutV1>;
- template class ArtificialTreeBuilder<BtreeLayoutV1>;
+// V1 format.
+template struct BtreeLogicTestHelper<BtreeLayoutV1>;
+template class ArtificialTreeBuilder<BtreeLayoutV1>;
}
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_test_help.h b/src/mongo/db/storage/mmap_v1/btree/btree_test_help.h
index b282e72d827..5aeec516528 100644
--- a/src/mongo/db/storage/mmap_v1/btree/btree_test_help.h
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_test_help.h
@@ -37,118 +37,114 @@
namespace mongo {
+/**
+ * Generates a string of the specified length containing repeated concatenation of the
+ * hexadecimal representation of the input value.
+ */
+std::string bigNumString(long long n, int len);
+
+/**
+ * Generates key on a field 'a', with the specified number of repetitions of the character.
+ */
+BSONObj simpleKey(char c, int n = 1);
+
+/**
+ * Simple head manager, which performs no validity checking or persistence.
+ */
+class TestHeadManager : public HeadManager {
+public:
+ virtual const RecordId getHead(OperationContext* txn) const {
+ return _head;
+ }
+
+ virtual void setHead(OperationContext* txn, const RecordId newHead) {
+ _head = newHead;
+ }
+
+private:
+ RecordId _head;
+};
+
+
+/**
+ * This structure encapsulates a Btree and all the infrastructure needed by it (head manager,
+ * record store and a valid disk location to use by the tests).
+ */
+template <class OnDiskFormat>
+struct BtreeLogicTestHelper {
+ BtreeLogicTestHelper(const BSONObj& order);
+
+ // Everything needed for a fully-functional Btree logic
+ TestHeadManager headManager;
+ HeapRecordStoreBtree recordStore;
+ SavedCursorRegistry cursorRegistry;
+ BtreeLogic<OnDiskFormat> btree;
+ DiskLoc dummyDiskLoc;
+};
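
A hedged usage sketch (mirroring the test classes earlier in this patch; the index order and spec strings are illustrative): the helper owns the btree plus its infrastructure, and a builder constructs and verifies an explicit tree shape over it:

    OperationContextNoop txn;
    BtreeLogicTestHelper<BtreeLayoutV1> helper(BSON("a" << 1));
    ArtificialTreeBuilder<BtreeLayoutV1> builder(&txn, &helper);

    builder.makeTree("{b:{a:null},d:{c:null},_:{e:null}}");
    builder.checkStructure("{b:{a:null},d:{c:null},_:{e:null}}");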
+
+
+/**
+ * Tool to construct custom tree shapes for tests.
+ */
+template <class OnDiskFormat>
+class ArtificialTreeBuilder {
+public:
+ typedef typename BtreeLogic<OnDiskFormat>::BucketType BucketType;
+ typedef typename BtreeLogic<OnDiskFormat>::KeyDataOwnedType KeyDataOwnedType;
+ typedef typename BtreeLogic<OnDiskFormat>::KeyHeaderType KeyHeaderType;
+
+ typedef typename OnDiskFormat::FixedWidthKeyType FixedWidthKeyType;
+
/**
- * Generates a string of the specified length containing repeated concatenation of the
- * hexadecimal representation of the input value.
+ * The tree builder wraps around the passed-in helper and will invoke methods on it. It
+ * does not do any cleanup, so constructing multiple trees over the same helper will
+ * cause leaked records.
*/
- std::string bigNumString(long long n, int len);
+ ArtificialTreeBuilder(OperationContext* txn, BtreeLogicTestHelper<OnDiskFormat>* helper)
+ : _txn(txn), _helper(helper) {}
/**
- * Generates key on a field 'a', with the specified number of repetitions of the character.
+ * Causes the specified tree shape to be built on the associated helper and the tree's
+ * root installed as the head. Uses a custom JSON-based language with the following
+ * syntax:
+ *
+ * Btree := BTreeBucket
+ * BtreeBucket := { Child_1_Key: <BtreeBucket | null>,
+ * Child_2_Key: <BtreeBucket | null>,
+ * ...,
+ * _: <BtreeBucket | null> }
+ *
+ * The _ key name specifies the content of the nextChild pointer. The value null means
+ * use a fixed disk loc.
*/
- BSONObj simpleKey(char c, int n = 1);
+ void makeTree(const std::string& spec);
/**
- * Simple head manager, which performs no validity checking or persistence.
+ * Validates that the structure of the Btree in the helper matches the specification.
*/
- class TestHeadManager : public HeadManager {
- public:
- virtual const RecordId getHead( OperationContext* txn ) const {
- return _head;
- }
-
- virtual void setHead(OperationContext* txn, const RecordId newHead) {
- _head = newHead;
- }
-
- private:
- RecordId _head;
- };
+ void checkStructure(const std::string& spec) const;
+ /**
+ * Adds the given key to the bucket and fixes up the child pointers.
+ */
+ void push(const DiskLoc bucketLoc, const BSONObj& key, const DiskLoc child);
/**
- * This structure encapsulates a Btree and all the infrastructure needed by it (head manager,
- * record store and a valid disk location to use by the tests).
+ * @return The number of keys inserted.
*/
- template <class OnDiskFormat>
- struct BtreeLogicTestHelper {
- BtreeLogicTestHelper(const BSONObj& order);
+ int fillBucketToExactSize(const DiskLoc bucketLoc, int targetSize, char startKey);
- // Everything needed for a fully-functional Btree logic
- TestHeadManager headManager;
- HeapRecordStoreBtree recordStore;
- SavedCursorRegistry cursorRegistry;
- BtreeLogic<OnDiskFormat> btree;
- DiskLoc dummyDiskLoc;
- };
+private:
+ DiskLoc makeTree(const BSONObj& spec);
+ void checkStructure(const BSONObj& spec, const DiskLoc node) const;
- /**
- * Tool to construct custom tree shapes for tests.
- */
- template <class OnDiskFormat>
- class ArtificialTreeBuilder {
- public:
-
- typedef typename BtreeLogic<OnDiskFormat>::BucketType BucketType;
- typedef typename BtreeLogic<OnDiskFormat>::KeyDataOwnedType KeyDataOwnedType;
- typedef typename BtreeLogic<OnDiskFormat>::KeyHeaderType KeyHeaderType;
-
- typedef typename OnDiskFormat::FixedWidthKeyType FixedWidthKeyType;
-
- /**
- * The tree builder wraps around the passed-in helper and will invoke methods on it. It
- * does not do any cleanup, so constructing multiple trees over the same helper will
- * cause leaked records.
- */
- ArtificialTreeBuilder(OperationContext* txn,
- BtreeLogicTestHelper<OnDiskFormat>* helper)
- : _txn(txn), _helper(helper) {
-
- }
-
- /**
- * Causes the specified tree shape to be built on the associated helper and the tree's
- * root installed as the head. Uses a custom JSON-based language with the following
- * syntax:
- *
- * Btree := BTreeBucket
- * BtreeBucket := { Child_1_Key: <BtreeBucket | null>,
- * Child_2_Key: <BtreeBucket | null>,
- * ...,
- * _: <BtreeBucket | null> }
- *
- * The _ key name specifies the content of the nextChild pointer. The value null means
- * use a fixed disk loc.
- */
- void makeTree(const std::string& spec);
-
- /**
- * Validates that the structure of the Btree in the helper matches the specification.
- */
- void checkStructure(const std::string& spec) const;
-
- /**
- * Adds the following key to the bucket and fixes up the child pointers.
- */
- void push(const DiskLoc bucketLoc, const BSONObj& key, const DiskLoc child);
-
- /**
- * @return The number of keys inserted.
- */
- int fillBucketToExactSize(const DiskLoc bucketLoc, int targetSize, char startKey);
-
- private:
- DiskLoc makeTree(const BSONObj& spec);
-
- void checkStructure(const BSONObj& spec, const DiskLoc node) const;
-
- bool isPresent(const BSONObj& key, int direction) const;
-
- static std::string expectedKey(const char* spec);
-
- OperationContext* _txn;
- BtreeLogicTestHelper<OnDiskFormat>* _helper;
- };
-
-} // namespace mongo
+ bool isPresent(const BSONObj& key, int direction) const;
+
+ static std::string expectedKey(const char* spec);
+
+ OperationContext* _txn;
+ BtreeLogicTestHelper<OnDiskFormat>* _helper;
+};
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/btree/key.cpp b/src/mongo/db/storage/mmap_v1/btree/key.cpp
index 5cc1afbdc69..cbb89d8fab9 100644
--- a/src/mongo/db/storage/mmap_v1/btree/key.cpp
+++ b/src/mongo/db/storage/mmap_v1/btree/key.cpp
@@ -39,26 +39,26 @@
namespace mongo {
- using std::endl;
- using std::numeric_limits;
- using std::min;
+using std::endl;
+using std::numeric_limits;
+using std::min;
- extern const Ordering nullOrdering = Ordering::make(BSONObj());
+extern const Ordering nullOrdering = Ordering::make(BSONObj());
- // KeyBson is for V0 (version #0) indexes
+// KeyBson is for V0 (version #0) indexes
- int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o);
+int oldCompare(const BSONObj& l, const BSONObj& r, const Ordering& o);
- // "old" = pre signed dates & such; i.e. btree V0
- /* must be same canon type when called */
- int oldCompareElementValues(const BSONElement& l, const BSONElement& r) {
- dassert( l.canonicalType() == r.canonicalType() );
- int f;
- double x;
+// "old" = pre signed dates & such; i.e. btree V0
+/* must be same canon type when called */
+int oldCompareElementValues(const BSONElement& l, const BSONElement& r) {
+ dassert(l.canonicalType() == r.canonicalType());
+ int f;
+ double x;
- switch ( l.type() ) {
+ switch (l.type()) {
case EOO:
- case Undefined: // EOO and Undefined are same canonicalType
+ case Undefined: // EOO and Undefined are same canonicalType
case jstNULL:
case MaxKey:
case MinKey:
@@ -75,35 +75,36 @@ namespace mongo {
return lULL == rULL ? 0 : 1;
}
case NumberLong:
- if( r.type() == NumberLong ) {
+ if (r.type() == NumberLong) {
long long L = l._numberLong();
long long R = r._numberLong();
- if( L < R ) return -1;
- if( L == R ) return 0;
+ if (L < R)
+ return -1;
+ if (L == R)
+ return 0;
return 1;
}
- // else fall through
+ // else fall through
case NumberInt:
case NumberDouble: {
double left = l.number();
double right = r.number();
- bool lNan = !( left <= numeric_limits< double >::max() &&
- left >= -numeric_limits< double >::max() );
- bool rNan = !( right <= numeric_limits< double >::max() &&
- right >= -numeric_limits< double >::max() );
- if ( lNan ) {
- if ( rNan ) {
+ bool lNan =
+ !(left <= numeric_limits<double>::max() && left >= -numeric_limits<double>::max());
+ bool rNan = !(right <= numeric_limits<double>::max() &&
+ right >= -numeric_limits<double>::max());
+ if (lNan) {
+ if (rNan) {
return 0;
- }
- else {
+ } else {
return -1;
}
- }
- else if ( rNan ) {
+ } else if (rNan) {
return 1;
}
x = left - right;
- if ( x < 0 ) return -1;
+ if (x < 0)
+ return -1;
return x == 0 ? 0 : 1;
}
case jstOID:
@@ -119,562 +120,569 @@ namespace mongo {
case DBRef: {
int lsz = l.valuesize();
int rsz = r.valuesize();
- if ( lsz - rsz != 0 ) return lsz - rsz;
+ if (lsz - rsz != 0)
+ return lsz - rsz;
return memcmp(l.value(), r.value(), lsz);
}
case BinData: {
- int lsz = l.objsize(); // our bin data size in bytes, not including the subtype byte
+ int lsz = l.objsize(); // our bin data size in bytes, not including the subtype byte
int rsz = r.objsize();
- if ( lsz - rsz != 0 ) return lsz - rsz;
- return memcmp(l.value()+4, r.value()+4, lsz+1);
+ if (lsz - rsz != 0)
+ return lsz - rsz;
+ return memcmp(l.value() + 4, r.value() + 4, lsz + 1);
}
case RegEx: {
int c = strcmp(l.regex(), r.regex());
- if ( c )
+ if (c)
return c;
return strcmp(l.regexFlags(), r.regexFlags());
}
- case CodeWScope : {
+ case CodeWScope: {
f = l.canonicalType() - r.canonicalType();
- if ( f )
+ if (f)
return f;
- f = strcmp( l.codeWScopeCode() , r.codeWScopeCode() );
- if ( f )
+ f = strcmp(l.codeWScopeCode(), r.codeWScopeCode());
+ if (f)
return f;
- f = strcmp( l.codeWScopeScopeDataUnsafe() , r.codeWScopeScopeDataUnsafe() );
- if ( f )
+ f = strcmp(l.codeWScopeScopeDataUnsafe(), r.codeWScopeScopeDataUnsafe());
+ if (f)
return f;
return 0;
}
default:
- log() << "oldCompareElementValues: bad type " << (int) l.type() << endl;
+ log() << "oldCompareElementValues: bad type " << (int)l.type() << endl;
verify(false);
- }
- return -1;
- }
-
- int oldElemCompare(const BSONElement&l , const BSONElement& r) {
- int lt = (int) l.canonicalType();
- int rt = (int) r.canonicalType();
- int x = lt - rt;
- if( x )
- return x;
- return oldCompareElementValues(l, r);
}
-
- // pre signed dates & such
- int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o) {
- BSONObjIterator i(l);
- BSONObjIterator j(r);
- unsigned mask = 1;
- while ( 1 ) {
- // so far, equal...
-
- BSONElement l = i.next();
- BSONElement r = j.next();
- if ( l.eoo() )
- return r.eoo() ? 0 : -1;
- if ( r.eoo() )
- return 1;
-
- int x;
- {
- x = oldElemCompare(l, r);
- if( o.descending(mask) )
- x = -x;
- }
- if ( x != 0 )
- return x;
- mask <<= 1;
+ return -1;
+}
+
+int oldElemCompare(const BSONElement& l, const BSONElement& r) {
+ int lt = (int)l.canonicalType();
+ int rt = (int)r.canonicalType();
+ int x = lt - rt;
+ if (x)
+ return x;
+ return oldCompareElementValues(l, r);
+}
+
+// pre signed dates & such
+int oldCompare(const BSONObj& l, const BSONObj& r, const Ordering& o) {
+ BSONObjIterator i(l);
+ BSONObjIterator j(r);
+ unsigned mask = 1;
+ while (1) {
+ // so far, equal...
+
+ BSONElement l = i.next();
+ BSONElement r = j.next();
+ if (l.eoo())
+ return r.eoo() ? 0 : -1;
+ if (r.eoo())
+ return 1;
+
+ int x;
+ {
+ x = oldElemCompare(l, r);
+ if (o.descending(mask))
+ x = -x;
}
- return -1;
- }
-
- /* old style compares:
- - dates are unsigned
- - strings no nulls
- */
- int KeyBson::woCompare(const KeyBson& r, const Ordering &o) const {
- return oldCompare(_o, r._o, o);
- }
-
- // woEqual could be made faster than woCompare but this is for backward compatibility so not worth a big effort
- bool KeyBson::woEqual(const KeyBson& r) const {
- return oldCompare(_o, r._o, nullOrdering) == 0;
- }
-
- // [ ][HASMORE][x][y][canontype_4bits]
- enum CanonicalsEtc {
- cminkey=1,
- cnull=2,
- cdouble=4,
- cstring=6,
- cbindata=7,
- coid=8,
- cfalse=10,
- ctrue=11,
- cdate=12,
- cmaxkey=14,
- cCANONTYPEMASK = 0xf,
- cY = 0x10,
- cint = cY | cdouble,
- cX = 0x20,
- clong = cX | cdouble,
- cHASMORE = 0x40,
- cNOTUSED = 0x80 // but see IsBSON sentinel - this bit not usable without great care
- };
-
- // bindata bson type
- const unsigned BinDataLenMask = 0xf0; // lengths are powers of 2 of this value
- const unsigned BinDataTypeMask = 0x0f; // 0-7 as you would expect, 8-15 are 128+value. see BinDataType.
- const int BinDataLenMax = 32;
- const int BinDataLengthToCode[] = {
- 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
- 0x80, -1/*9*/, 0x90/*10*/, -1/*11*/, 0xa0/*12*/, -1/*13*/, 0xb0/*14*/, -1/*15*/,
- 0xc0/*16*/, -1, -1, -1, 0xd0/*20*/, -1, -1, -1,
- 0xe0/*24*/, -1, -1, -1, -1, -1, -1, -1,
- 0xf0/*32*/
- };
- const int BinDataCodeToLength[] = {
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 32
- };
-
- int binDataCodeToLength(int codeByte) {
- return BinDataCodeToLength[codeByte >> 4];
- }
-
- /** object cannot be represented in compact format. so store in traditional bson format
- with a leading sentinel byte IsBSON to indicate it's in that format.
-
- Given that the KeyV1Owned constructor already grabbed a bufbuilder, we reuse it here
- so that we don't have to do an extra malloc.
- */
- void KeyV1Owned::traditional(const BSONObj& obj) {
- b.reset();
- b.appendUChar(IsBSON);
- b.appendBuf(obj.objdata(), obj.objsize());
- _keyData = (const unsigned char *) b.buf();
- }
-
- KeyV1Owned::KeyV1Owned(const KeyV1& rhs) {
- b.appendBuf( rhs.data(), rhs.dataSize() );
- _keyData = (const unsigned char *) b.buf();
- dassert( b.len() == dataSize() ); // check datasize method is correct
- dassert( (*_keyData & cNOTUSED) == 0 );
+ if (x != 0)
+ return x;
+ mask <<= 1;
}
-
- // fromBSON to Key format
- KeyV1Owned::KeyV1Owned(const BSONObj& obj) {
- BSONObj::iterator i(obj);
- unsigned char bits = 0;
- while( 1 ) {
- BSONElement e = i.next();
- if( i.more() )
- bits |= cHASMORE;
- switch( e.type() ) {
+ return -1;
+}
+
+/* old style compares:
+ - dates are unsigned
+ - strings no nulls
+*/
+int KeyBson::woCompare(const KeyBson& r, const Ordering& o) const {
+ return oldCompare(_o, r._o, o);
+}
+
+// woEqual could be made faster than woCompare but this is for backward compatibility so not worth a big effort
+bool KeyBson::woEqual(const KeyBson& r) const {
+ return oldCompare(_o, r._o, nullOrdering) == 0;
+}
+
+// [ ][HASMORE][x][y][canontype_4bits]
+enum CanonicalsEtc {
+ cminkey = 1,
+ cnull = 2,
+ cdouble = 4,
+ cstring = 6,
+ cbindata = 7,
+ coid = 8,
+ cfalse = 10,
+ ctrue = 11,
+ cdate = 12,
+ cmaxkey = 14,
+ cCANONTYPEMASK = 0xf,
+ cY = 0x10,
+ cint = cY | cdouble,
+ cX = 0x20,
+ clong = cX | cdouble,
+ cHASMORE = 0x40,
+ cNOTUSED = 0x80 // but see IsBSON sentinel - this bit not usable without great care
+};
+
+// bindata bson type
+const unsigned BinDataLenMask = 0xf0; // lengths are powers of 2 of this value
+const unsigned BinDataTypeMask =
+ 0x0f; // 0-7 as you would expect, 8-15 are 128+value. see BinDataType.
+const int BinDataLenMax = 32;
+const int BinDataLengthToCode[] = {
+ 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60,
+ 0x70, 0x80, -1 /*9*/, 0x90 /*10*/, -1 /*11*/, 0xa0 /*12*/, -1 /*13*/,
+ 0xb0 /*14*/, -1 /*15*/, 0xc0 /*16*/, -1, -1, -1, 0xd0 /*20*/,
+ -1, -1, -1, 0xe0 /*24*/, -1, -1, -1,
+ -1, -1, -1, -1, 0xf0 /*32*/
+};
+const int BinDataCodeToLength[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 32};
+
+int binDataCodeToLength(int codeByte) {
+ return BinDataCodeToLength[codeByte >> 4];
+}
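
The high nibble of a BinData code byte is a 4-bit length code and the low nibble is the
subtype, so only lengths 0-8, 10, 12, 14, 16, 20, 24 and 32 fit the compact form. A
minimal standalone sketch of the round trip; the two arrays are copied from above, the
rest is illustrative:

    #include <cassert>

    const int BinDataLengthToCode[] = {
        0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
        0x80, -1,   0x90, -1,   0xa0, -1,   0xb0, -1,
        0xc0, -1,   -1,   -1,   0xd0, -1,   -1,   -1,
        0xe0, -1,   -1,   -1,   -1,   -1,   -1,   -1,
        0xf0};
    const int BinDataCodeToLength[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 32};

    int main() {
        int code = BinDataLengthToCode[20] | 5;        // length 20, subtype 5 -> 0xd5
        assert(BinDataCodeToLength[code >> 4] == 20);  // high nibble decodes the length
        assert((code & 0x0f) == 5);                    // low nibble is the subtype
        assert(BinDataLengthToCode[9] == -1);          // length 9 has no code, so such a
                                                       // key falls back to traditional BSON
        return 0;
    }
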
+
+/** The object cannot be represented in compact format, so store it in traditional bson format
+    with a leading sentinel byte IsBSON to indicate it's in that format.
+
+ Given that the KeyV1Owned constructor already grabbed a bufbuilder, we reuse it here
+ so that we don't have to do an extra malloc.
+*/
+void KeyV1Owned::traditional(const BSONObj& obj) {
+ b.reset();
+ b.appendUChar(IsBSON);
+ b.appendBuf(obj.objdata(), obj.objsize());
+ _keyData = (const unsigned char*)b.buf();
+}
+
+KeyV1Owned::KeyV1Owned(const KeyV1& rhs) {
+ b.appendBuf(rhs.data(), rhs.dataSize());
+ _keyData = (const unsigned char*)b.buf();
+ dassert(b.len() == dataSize()); // check datasize method is correct
+ dassert((*_keyData & cNOTUSED) == 0);
+}
+
+// fromBSON to Key format
+KeyV1Owned::KeyV1Owned(const BSONObj& obj) {
+ BSONObj::iterator i(obj);
+ unsigned char bits = 0;
+ while (1) {
+ BSONElement e = i.next();
+ if (i.more())
+ bits |= cHASMORE;
+ switch (e.type()) {
case MinKey:
- b.appendUChar(cminkey|bits);
+ b.appendUChar(cminkey | bits);
break;
case jstNULL:
- b.appendUChar(cnull|bits);
+ b.appendUChar(cnull | bits);
break;
case MaxKey:
- b.appendUChar(cmaxkey|bits);
+ b.appendUChar(cmaxkey | bits);
break;
case Bool:
- b.appendUChar( (e.boolean()?ctrue:cfalse) | bits );
+ b.appendUChar((e.boolean() ? ctrue : cfalse) | bits);
break;
case jstOID:
- b.appendUChar(coid|bits);
+ b.appendUChar(coid | bits);
b.appendBuf(e.__oid().view().view(), OID::kOIDSize);
break;
- case BinData:
- {
- int t = e.binDataType();
- // 0-7 and 0x80 to 0x87 are supported by Key
- if( (t & 0x78) == 0 && t != ByteArrayDeprecated ) {
- int len;
- const char * d = e.binData(len);
- if( len <= BinDataLenMax ) {
- int code = BinDataLengthToCode[len];
- if( code >= 0 ) {
- if( t >= 128 )
- t = (t-128) | 0x08;
- dassert( (code&t) == 0 );
- b.appendUChar( cbindata|bits );
- b.appendUChar( code | t );
- b.appendBuf(d, len);
- break;
- }
+ case BinData: {
+ int t = e.binDataType();
+ // 0-7 and 0x80 to 0x87 are supported by Key
+ if ((t & 0x78) == 0 && t != ByteArrayDeprecated) {
+ int len;
+ const char* d = e.binData(len);
+ if (len <= BinDataLenMax) {
+ int code = BinDataLengthToCode[len];
+ if (code >= 0) {
+ if (t >= 128)
+ t = (t - 128) | 0x08;
+ dassert((code & t) == 0);
+ b.appendUChar(cbindata | bits);
+ b.appendUChar(code | t);
+ b.appendBuf(d, len);
+ break;
}
}
- traditional(obj);
- return;
}
+ traditional(obj);
+ return;
+ }
case Date:
- b.appendUChar(cdate|bits);
+ b.appendUChar(cdate | bits);
b.appendStruct(e.date());
break;
- case String:
- {
- b.appendUChar(cstring|bits);
- // note we do not store the terminating null, to save space.
- unsigned x = (unsigned) e.valuestrsize() - 1;
- if( x > 255 ) {
- traditional(obj);
- return;
- }
- b.appendUChar(x);
- b.appendBuf(e.valuestr(), x);
- break;
+ case String: {
+ b.appendUChar(cstring | bits);
+ // note we do not store the terminating null, to save space.
+ unsigned x = (unsigned)e.valuestrsize() - 1;
+ if (x > 255) {
+ traditional(obj);
+ return;
}
+ b.appendUChar(x);
+ b.appendBuf(e.valuestr(), x);
+ break;
+ }
case NumberInt:
- b.appendUChar(cint|bits);
- b.appendNum((double) e._numberInt());
+ b.appendUChar(cint | bits);
+ b.appendNum((double)e._numberInt());
break;
- case NumberLong:
- {
- long long n = e._numberLong();
- long long m = 2LL << 52;
- DEV {
- long long d = m-1;
- verify( ((long long) ((double) -d)) == -d );
- }
- if( n >= m || n <= -m ) {
- // can't represent exactly as a double
- traditional(obj);
- return;
- }
- b.appendUChar(clong|bits);
- b.appendNum((double) n);
- break;
+ case NumberLong: {
+ long long n = e._numberLong();
+ long long m = 2LL << 52;
+ DEV {
+ long long d = m - 1;
+ verify(((long long)((double)-d)) == -d);
}
- case NumberDouble:
- {
- double d = e._numberDouble();
- if( std::isnan(d) ) {
- traditional(obj);
- return;
- }
- b.appendUChar(cdouble|bits);
- b.appendNum(d);
- break;
+ if (n >= m || n <= -m) {
+ // can't represent exactly as a double
+ traditional(obj);
+ return;
+ }
+ b.appendUChar(clong | bits);
+ b.appendNum((double)n);
+ break;
+ }
+ case NumberDouble: {
+ double d = e._numberDouble();
+ if (std::isnan(d)) {
+ traditional(obj);
+ return;
}
+ b.appendUChar(cdouble | bits);
+ b.appendNum(d);
+ break;
+ }
default:
// if other types involved, store as traditional BSON
traditional(obj);
return;
- }
- if( !i.more() )
- break;
- bits = 0;
}
- _keyData = (const unsigned char *) b.buf();
- dassert( b.len() == dataSize() ); // check datasize method is correct
- dassert( (*_keyData & cNOTUSED) == 0 );
+ if (!i.more())
+ break;
+ bits = 0;
}
-
- BSONObj KeyV1::toBson() const {
- verify( _keyData != 0 );
- if( !isCompactFormat() )
- return bson();
-
- BSONObjBuilder b(512);
- const unsigned char *p = _keyData;
- while( 1 ) {
- unsigned bits = *p++;
-
- switch( bits & 0x3f ) {
- case cminkey: b.appendMinKey(""); break;
- case cnull: b.appendNull(""); break;
- case cfalse: b.appendBool("", false); break;
- case ctrue: b.appendBool("", true); break;
- case cmaxkey:
- b.appendMaxKey("");
- break;
- case cstring:
- {
- unsigned sz = *p++;
- // we build the element ourself as we have to null terminate it
- BufBuilder &bb = b.bb();
- bb.appendNum((char) String);
- bb.appendUChar(0); // fieldname ""
- bb.appendNum(sz+1);
- bb.appendBuf(p, sz);
- bb.appendUChar(0); // null char at end of string
- p += sz;
- break;
- }
- case coid:
- {
- OID oid = OID::from(p);
- b.appendOID("", &oid);
- p += OID::kOIDSize;
- break;
- }
- case cbindata:
- {
- int len = binDataCodeToLength(*p);
- int subtype = (*p) & BinDataTypeMask;
- if( subtype & 0x8 ) {
- subtype = (subtype & 0x7) | 0x80;
- }
- b.appendBinData("", len, (BinDataType) subtype, ++p);
- p += len;
- break;
- }
- case cdate:
- b.appendDate("", (Date_t&) *p);
- p += 8;
- break;
- case cdouble:
- b.append("", (double&) *p);
- p += sizeof(double);
- break;
- case cint:
- b.append("", static_cast< int >((reinterpret_cast< const PackedDouble& >(*p)).d));
- p += sizeof(double);
- break;
- case clong:
- b.append("", static_cast< long long>((reinterpret_cast< const PackedDouble& >(*p)).d));
- p += sizeof(double);
- break;
- default:
- verify(false);
- }
-
- if( (bits & cHASMORE) == 0 )
+ _keyData = (const unsigned char*)b.buf();
+ dassert(b.len() == dataSize()); // check datasize method is correct
+ dassert((*_keyData & cNOTUSED) == 0);
+}
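
The first byte of every compact element carries all of its framing: the low four bits
are the canonical type, cHASMORE (0x40) marks that another key field follows, and the
cY/cX bits tag values that were originally NumberInt or NumberLong but are stored as
doubles. A hypothetical sketch of the two type bytes the constructor above emits for a
key with a NumberInt field followed by a boolean true, using a trimmed copy of the enum:

    #include <cassert>

    enum CanonicalsEtc {
        cdouble = 4,
        ctrue = 11,
        cCANONTYPEMASK = 0xf,
        cY = 0x10,
        cint = cY | cdouble,
        cHASMORE = 0x40
    };

    int main() {
        unsigned char first = cint | cHASMORE;        // 0x54: int value, more fields follow
        unsigned char last = ctrue;                   // 0x0b: final field, boolean true
        assert((first & cCANONTYPEMASK) == cdouble);  // cint still compares as a double
        assert((first & cHASMORE) != 0);
        assert((last & cHASMORE) == 0);               // terminates the key walk
        return 0;
    }
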
+
+BSONObj KeyV1::toBson() const {
+ verify(_keyData != 0);
+ if (!isCompactFormat())
+ return bson();
+
+ BSONObjBuilder b(512);
+ const unsigned char* p = _keyData;
+ while (1) {
+ unsigned bits = *p++;
+
+ switch (bits & 0x3f) {
+ case cminkey:
+ b.appendMinKey("");
break;
- }
- return b.obj();
- }
-
- static int compare(const unsigned char *&l, const unsigned char *&r) {
- int lt = (*l & cCANONTYPEMASK);
- int rt = (*r & cCANONTYPEMASK);
- int x = lt - rt;
- if( x )
- return x;
-
- l++; r++;
-
- // same type
- switch( lt ) {
- case cdouble:
- {
- double L = (reinterpret_cast< const PackedDouble* >(l))->d;
- double R = (reinterpret_cast< const PackedDouble* >(r))->d;
- if( L < R )
- return -1;
- if( L != R )
- return 1;
- l += 8; r += 8;
+ case cnull:
+ b.appendNull("");
+ break;
+ case cfalse:
+ b.appendBool("", false);
+ break;
+ case ctrue:
+ b.appendBool("", true);
+ break;
+ case cmaxkey:
+ b.appendMaxKey("");
+ break;
+ case cstring: {
+ unsigned sz = *p++;
+                // we build the element ourselves as we have to null-terminate it
+ BufBuilder& bb = b.bb();
+ bb.appendNum((char)String);
+ bb.appendUChar(0); // fieldname ""
+ bb.appendNum(sz + 1);
+ bb.appendBuf(p, sz);
+ bb.appendUChar(0); // null char at end of string
+ p += sz;
break;
}
- case cstring:
- {
- int lsz = *l;
- int rsz = *r;
- int common = min(lsz, rsz);
- l++; r++; // skip the size byte
- // use memcmp as we (will) allow zeros in UTF8 strings
- int res = memcmp(l, r, common);
- if( res )
- return res;
- // longer string is the greater one
- int diff = lsz-rsz;
- if( diff )
- return diff;
- l += lsz; r += lsz;
+ case coid: {
+ OID oid = OID::from(p);
+ b.appendOID("", &oid);
+ p += OID::kOIDSize;
break;
}
- case cbindata:
- {
- int L = *l;
- int R = *r;
- int llen = binDataCodeToLength(L);
- int diff = L-R; // checks length and subtype simultaneously
- if( diff ) {
- // unfortunately nibbles are backwards to do subtype and len in one check (could bit swap...)
- int rlen = binDataCodeToLength(R);
- if( llen != rlen )
- return llen - rlen;
- return diff;
+ case cbindata: {
+ int len = binDataCodeToLength(*p);
+ int subtype = (*p) & BinDataTypeMask;
+ if (subtype & 0x8) {
+ subtype = (subtype & 0x7) | 0x80;
}
- // same length, same type
- l++; r++;
- int res = memcmp(l, r, llen);
- if( res )
- return res;
- l += llen; r += llen;
+ b.appendBinData("", len, (BinDataType)subtype, ++p);
+ p += len;
break;
}
- case cdate:
- {
- long long L = *((long long *) l);
- long long R = *((long long *) r);
- if( L < R )
- return -1;
- if( L > R )
- return 1;
- l += 8; r += 8;
+ case cdate:
+ b.appendDate("", (Date_t&)*p);
+ p += 8;
break;
- }
- case coid:
- {
- int res = memcmp(l, r, OID::kOIDSize);
- if( res )
- return res;
- l += OID::kOIDSize; r += OID::kOIDSize;
+ case cdouble:
+ b.append("", (double&)*p);
+ p += sizeof(double);
+ break;
+ case cint:
+ b.append("", static_cast<int>((reinterpret_cast<const PackedDouble&>(*p)).d));
+ p += sizeof(double);
break;
+ case clong:
+ b.append("", static_cast<long long>((reinterpret_cast<const PackedDouble&>(*p)).d));
+ p += sizeof(double);
+ break;
+ default:
+ verify(false);
+ }
+
+ if ((bits & cHASMORE) == 0)
+ break;
+ }
+ return b.obj();
+}
+
+static int compare(const unsigned char*& l, const unsigned char*& r) {
+ int lt = (*l & cCANONTYPEMASK);
+ int rt = (*r & cCANONTYPEMASK);
+ int x = lt - rt;
+ if (x)
+ return x;
+
+ l++;
+ r++;
+
+ // same type
+ switch (lt) {
+ case cdouble: {
+ double L = (reinterpret_cast<const PackedDouble*>(l))->d;
+ double R = (reinterpret_cast<const PackedDouble*>(r))->d;
+ if (L < R)
+ return -1;
+ if (L != R)
+ return 1;
+ l += 8;
+ r += 8;
+ break;
+ }
+ case cstring: {
+ int lsz = *l;
+ int rsz = *r;
+ int common = min(lsz, rsz);
+ l++;
+ r++; // skip the size byte
+ // use memcmp as we (will) allow zeros in UTF8 strings
+ int res = memcmp(l, r, common);
+ if (res)
+ return res;
+ // longer string is the greater one
+ int diff = lsz - rsz;
+ if (diff)
+ return diff;
+ l += lsz;
+ r += lsz;
+ break;
+ }
+ case cbindata: {
+ int L = *l;
+ int R = *r;
+ int llen = binDataCodeToLength(L);
+ int diff = L - R; // checks length and subtype simultaneously
+ if (diff) {
+                // unfortunately the nibbles are ordered backwards for doing subtype and len in one check (could bit-swap...)
+ int rlen = binDataCodeToLength(R);
+ if (llen != rlen)
+ return llen - rlen;
+ return diff;
}
+ // same length, same type
+ l++;
+ r++;
+ int res = memcmp(l, r, llen);
+ if (res)
+ return res;
+ l += llen;
+ r += llen;
+ break;
+ }
+ case cdate: {
+ long long L = *((long long*)l);
+ long long R = *((long long*)r);
+ if (L < R)
+ return -1;
+ if (L > R)
+ return 1;
+ l += 8;
+ r += 8;
+ break;
+ }
+ case coid: {
+ int res = memcmp(l, r, OID::kOIDSize);
+ if (res)
+ return res;
+ l += OID::kOIDSize;
+ r += OID::kOIDSize;
+ break;
+ }
default:
// all the others are a match -- e.g. null == null
;
- }
-
- return 0;
- }
-
- // at least one of this and right are traditional BSON format
- int NOINLINE_DECL KeyV1::compareHybrid(const KeyV1& right, const Ordering& order) const {
- BSONObj L = toBson();
- BSONObj R = right.toBson();
- return L.woCompare(R, order, /*considerfieldname*/false);
}
- int KeyV1::woCompare(const KeyV1& right, const Ordering &order) const {
- const unsigned char *l = _keyData;
- const unsigned char *r = right._keyData;
-
- if( (*l|*r) == IsBSON ) // only can do this if cNOTUSED maintained
- return compareHybrid(right, order);
-
- unsigned mask = 1;
- while( 1 ) {
- char lval = *l;
- char rval = *r;
- {
- int x = compare(l, r); // updates l and r pointers
- if( x ) {
- if( order.descending(mask) )
- x = -x;
- return x;
- }
- }
-
- {
- int x = ((int)(lval & cHASMORE)) - ((int)(rval & cHASMORE));
- if( x )
- return x;
- if( (lval & cHASMORE) == 0 )
- break;
+ return 0;
+}
+
+// at least one of this and right is in traditional BSON format
+int NOINLINE_DECL KeyV1::compareHybrid(const KeyV1& right, const Ordering& order) const {
+ BSONObj L = toBson();
+ BSONObj R = right.toBson();
+ return L.woCompare(R, order, /*considerfieldname*/ false);
+}
+
+int KeyV1::woCompare(const KeyV1& right, const Ordering& order) const {
+ const unsigned char* l = _keyData;
+ const unsigned char* r = right._keyData;
+
+    if ((*l | *r) == IsBSON)  // can only do this if cNOTUSED is maintained
+ return compareHybrid(right, order);
+
+ unsigned mask = 1;
+ while (1) {
+ char lval = *l;
+ char rval = *r;
+ {
+ int x = compare(l, r); // updates l and r pointers
+ if (x) {
+ if (order.descending(mask))
+ x = -x;
+ return x;
}
+ }
- mask <<= 1;
+ {
+ int x = ((int)(lval & cHASMORE)) - ((int)(rval & cHASMORE));
+ if (x)
+ return x;
+ if ((lval & cHASMORE) == 0)
+ break;
}
- return 0;
+ mask <<= 1;
}
- static unsigned sizes[] = {
- 0,
- 1, //cminkey=1,
- 1, //cnull=2,
- 0,
- 9, //cdouble=4,
- 0,
- 0, //cstring=6,
- 0,
- 13, //coid=8,
- 0,
- 1, //cfalse=10,
- 1, //ctrue=11,
- 9, //cdate=12,
- 0,
- 1, //cmaxkey=14,
- 0
- };
-
- inline unsigned sizeOfElement(const unsigned char *p) {
- unsigned type = *p & cCANONTYPEMASK;
- unsigned sz = sizes[type];
- if( sz == 0 ) {
- if( type == cstring ) {
- sz = ((unsigned) p[1]) + 2;
- }
- else {
- verify( type == cbindata );
- sz = binDataCodeToLength(p[1]) + 2;
- }
+ return 0;
+}
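
The mask that woCompare shifts once per field is how a compound index's per-field sort
directions are applied: order.descending(mask) says whether the field at that bit
position is descending, in which case the comparison result is negated. A stand-in
sketch of the same pattern; FakeOrdering is an assumption here, not the real Ordering:

    #include <cassert>

    struct FakeOrdering {
        unsigned bits;  // bit i set == key field i is descending
        bool descending(unsigned mask) const {
            return (bits & mask) != 0;
        }
    };

    int compareKeys(const int* l, const int* r, int nFields, FakeOrdering order) {
        unsigned mask = 1;
        for (int i = 0; i < nFields; i++) {
            int x = (l[i] < r[i]) ? -1 : (l[i] > r[i] ? 1 : 0);
            if (x)
                return order.descending(mask) ? -x : x;
            mask <<= 1;  // next field, next direction bit
        }
        return 0;
    }

    int main() {
        int a[] = {1, 5};
        int b[] = {1, 7};
        assert(compareKeys(a, b, 2, FakeOrdering{0x0}) < 0);  // both ascending
        assert(compareKeys(a, b, 2, FakeOrdering{0x2}) > 0);  // second field descending
        return 0;
    }
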
+
+static unsigned sizes[] = {0,
+ 1, // cminkey=1,
+ 1, // cnull=2,
+ 0,
+ 9, // cdouble=4,
+ 0,
+ 0, // cstring=6,
+ 0,
+ 13, // coid=8,
+ 0,
+ 1, // cfalse=10,
+ 1, // ctrue=11,
+ 9, // cdate=12,
+ 0,
+ 1, // cmaxkey=14,
+ 0};
+
+inline unsigned sizeOfElement(const unsigned char* p) {
+ unsigned type = *p & cCANONTYPEMASK;
+ unsigned sz = sizes[type];
+ if (sz == 0) {
+ if (type == cstring) {
+ sz = ((unsigned)p[1]) + 2;
+ } else {
+ verify(type == cbindata);
+ sz = binDataCodeToLength(p[1]) + 2;
}
- return sz;
}
+ return sz;
+}
- int KeyV1::dataSize() const {
- const unsigned char *p = _keyData;
- if( !isCompactFormat() ) {
- return bson().objsize() + 1;
- }
-
- bool more;
- do {
- unsigned z = sizeOfElement(p);
- more = (*p & cHASMORE) != 0;
- p += z;
- } while( more );
- return p - _keyData;
+int KeyV1::dataSize() const {
+ const unsigned char* p = _keyData;
+ if (!isCompactFormat()) {
+ return bson().objsize() + 1;
}
- bool KeyV1::woEqual(const KeyV1& right) const {
- const unsigned char *l = _keyData;
- const unsigned char *r = right._keyData;
-
- if( (*l|*r) == IsBSON ) {
- return toBson().equal(right.toBson());
- }
+ bool more;
+ do {
+ unsigned z = sizeOfElement(p);
+ more = (*p & cHASMORE) != 0;
+ p += z;
+ } while (more);
+ return p - _keyData;
+}
+
+bool KeyV1::woEqual(const KeyV1& right) const {
+ const unsigned char* l = _keyData;
+ const unsigned char* r = right._keyData;
+
+ if ((*l | *r) == IsBSON) {
+ return toBson().equal(right.toBson());
+ }
- while( 1 ) {
- char lval = *l;
- char rval = *r;
- if( (lval&(cCANONTYPEMASK|cHASMORE)) != (rval&(cCANONTYPEMASK|cHASMORE)) )
- return false;
- l++; r++;
- switch( lval&cCANONTYPEMASK ) {
+ while (1) {
+ char lval = *l;
+ char rval = *r;
+ if ((lval & (cCANONTYPEMASK | cHASMORE)) != (rval & (cCANONTYPEMASK | cHASMORE)))
+ return false;
+ l++;
+ r++;
+ switch (lval & cCANONTYPEMASK) {
case coid:
- if( *((unsigned*) l) != *((unsigned*) r) )
+ if (*((unsigned*)l) != *((unsigned*)r))
return false;
- l += 4; r += 4;
+ l += 4;
+ r += 4;
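+                // fall through: the cdate case compares the remaining 8 bytes of the 12-byte OID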
case cdate:
- if( *((unsigned long long *) l) != *((unsigned long long *) r) )
+ if (*((unsigned long long*)l) != *((unsigned long long*)r))
return false;
- l += 8; r += 8;
+ l += 8;
+ r += 8;
break;
case cdouble:
- if( (reinterpret_cast< const PackedDouble* > (l))->d != (reinterpret_cast< const PackedDouble* >(r))->d )
+ if ((reinterpret_cast<const PackedDouble*>(l))->d !=
+ (reinterpret_cast<const PackedDouble*>(r))->d)
return false;
- l += 8; r += 8;
+ l += 8;
+ r += 8;
break;
- case cstring:
- {
- if( *l != *r )
- return false; // not same length
- unsigned sz = ((unsigned) *l) + 1;
- if( memcmp(l, r, sz) )
- return false;
- l += sz; r += sz;
- break;
- }
- case cbindata:
- {
- if( *l != *r )
- return false; // len or subtype mismatch
- int len = binDataCodeToLength(*l) + 1;
- if( memcmp(l, r, len) )
- return false;
- l += len; r += len;
- break;
- }
+ case cstring: {
+ if (*l != *r)
+ return false; // not same length
+ unsigned sz = ((unsigned)*l) + 1;
+ if (memcmp(l, r, sz))
+ return false;
+ l += sz;
+ r += sz;
+ break;
+ }
+ case cbindata: {
+ if (*l != *r)
+ return false; // len or subtype mismatch
+ int len = binDataCodeToLength(*l) + 1;
+ if (memcmp(l, r, len))
+ return false;
+ l += len;
+ r += len;
+ break;
+ }
case cminkey:
case cnull:
case cfalse:
@@ -683,23 +691,23 @@ namespace mongo {
break;
default:
verify(false);
- }
- if( (lval&cHASMORE) == 0 )
- break;
}
- return true;
+ if ((lval & cHASMORE) == 0)
+ break;
}
-
- struct CmpUnitTest : public StartupTest {
- void run() {
- char a[2];
- char b[2];
- a[0] = -3;
- a[1] = 0;
- b[0] = 3;
- b[1] = 0;
- verify( strcmp(a,b)>0 && memcmp(a,b,2)>0 );
- }
- } cunittest;
+ return true;
+}
+
+struct CmpUnitTest : public StartupTest {
+ void run() {
+ char a[2];
+ char b[2];
+ a[0] = -3;
+ a[1] = 0;
+ b[0] = 3;
+ b[1] = 0;
+ verify(strcmp(a, b) > 0 && memcmp(a, b, 2) > 0);
+ }
+} cunittest;
} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/btree/key.h b/src/mongo/db/storage/mmap_v1/btree/key.h
index 7f886552067..4787d83281a 100644
--- a/src/mongo/db/storage/mmap_v1/btree/key.h
+++ b/src/mongo/db/storage/mmap_v1/btree/key.h
@@ -35,97 +35,132 @@
namespace mongo {
- /** Key class for precomputing a small format index key that is denser than a traditional BSONObj.
+/** Key class for precomputing a small format index key that is denser than a traditional BSONObj.
- KeyBson is a legacy wrapper implementation for old BSONObj style keys for v:0 indexes.
+ KeyBson is a legacy wrapper implementation for old BSONObj style keys for v:0 indexes.
- KeyV1 is the new implementation.
+ KeyV1 is the new implementation.
+*/
+class KeyBson /* "KeyV0" */ {
+public:
+ KeyBson() {}
+ explicit KeyBson(const char* keyData) : _o(keyData) {}
+ explicit KeyBson(const BSONObj& obj) : _o(obj) {}
+ int woCompare(const KeyBson& r, const Ordering& o) const;
+ BSONObj toBson() const {
+ return _o;
+ }
+ std::string toString() const {
+ return _o.toString();
+ }
+ int dataSize() const {
+ return _o.objsize();
+ }
+ const char* data() const {
+ return _o.objdata();
+ }
+ BSONElement _firstElement() const {
+ return _o.firstElement();
+ }
+ bool isCompactFormat() const {
+ return false;
+ }
+ bool woEqual(const KeyBson& r) const;
+ void assign(const KeyBson& rhs) {
+ *this = rhs;
+ }
+ bool isValid() const {
+ return true;
+ }
+
+private:
+ BSONObj _o;
+};
+
+class KeyV1Owned;
+
+// corresponding to BtreeData_V1
+class KeyV1 {
+ void operator=(
+ const KeyV1&); // disallowed just to make people be careful as we don't own the buffer
+ KeyV1(
+        const KeyV1Owned&);  // disallowed because the KeyV1Owned will likely go out of scope, leaving a dangling buffer
+public:
+ KeyV1() {
+ _keyData = 0;
+ }
+ ~KeyV1() {
+ DEV _keyData = (const unsigned char*)1;
+ }
+
+ KeyV1(const KeyV1& rhs) : _keyData(rhs._keyData) {
+ dassert(_keyData > (const unsigned char*)1);
+ }
+
+ // explicit version of operator= to be safe
+ void assign(const KeyV1& rhs) {
+ _keyData = rhs._keyData;
+ }
+
+ /** @param keyData can be a buffer containing data in either BSON format, OR in KeyV1 format.
+ when BSON, we are just a wrapper
*/
- class KeyBson /* "KeyV0" */ {
- public:
- KeyBson() { }
- explicit KeyBson(const char *keyData) : _o(keyData) { }
- explicit KeyBson(const BSONObj& obj) : _o(obj) { }
- int woCompare(const KeyBson& r, const Ordering &o) const;
- BSONObj toBson() const { return _o; }
- std::string toString() const { return _o.toString(); }
- int dataSize() const { return _o.objsize(); }
- const char * data() const { return _o.objdata(); }
- BSONElement _firstElement() const { return _o.firstElement(); }
- bool isCompactFormat() const { return false; }
- bool woEqual(const KeyBson& r) const;
- void assign(const KeyBson& rhs) { *this = rhs; }
- bool isValid() const { return true; }
- private:
- BSONObj _o;
- };
-
- class KeyV1Owned;
-
- // corresponding to BtreeData_V1
- class KeyV1 {
- void operator=(const KeyV1&); // disallowed just to make people be careful as we don't own the buffer
- KeyV1(const KeyV1Owned&); // disallowed as this is not a great idea as KeyV1Owned likely will go out of scope
- public:
- KeyV1() { _keyData = 0; }
- ~KeyV1() { DEV _keyData = (const unsigned char *) 1; }
-
- KeyV1(const KeyV1& rhs) : _keyData(rhs._keyData) {
- dassert( _keyData > (const unsigned char *) 1 );
- }
-
- // explicit version of operator= to be safe
- void assign(const KeyV1& rhs) {
- _keyData = rhs._keyData;
- }
-
- /** @param keyData can be a buffer containing data in either BSON format, OR in KeyV1 format.
- when BSON, we are just a wrapper
- */
- explicit KeyV1(const char *keyData) : _keyData((unsigned char *) keyData) { }
-
- int woCompare(const KeyV1& r, const Ordering &o) const;
- bool woEqual(const KeyV1& r) const;
- BSONObj toBson() const;
- std::string toString() const { return toBson().toString(); }
-
- /** get the key data we want to store in the btree bucket */
- const char * data() const { return (const char *) _keyData; }
-
- /** @return size of data() */
- int dataSize() const;
-
- /** only used by geo, which always has bson keys */
- BSONElement _firstElement() const { return bson().firstElement(); }
- bool isCompactFormat() const { return *_keyData != IsBSON; }
-
- bool isValid() const { return _keyData > (const unsigned char*)1; }
- protected:
- enum { IsBSON = 0xff };
- const unsigned char *_keyData;
- BSONObj bson() const {
- dassert( !isCompactFormat() );
- return BSONObj((const char *) _keyData+1);
- }
- private:
- int compareHybrid(const KeyV1& right, const Ordering& order) const;
- };
-
- class KeyV1Owned : public KeyV1 {
- void operator=(const KeyV1Owned&);
- public:
- /** @obj a BSON object to be translated to KeyV1 format. If the object isn't
- representable in KeyV1 format (which happens, intentionally, at times)
- it will stay as bson herein.
- */
- KeyV1Owned(const BSONObj& obj);
-
- /** makes a copy (memcpy's the whole thing) */
- KeyV1Owned(const KeyV1& rhs);
-
- private:
- StackBufBuilder b;
- void traditional(const BSONObj& obj); // store as traditional bson not as compact format
- };
+ explicit KeyV1(const char* keyData) : _keyData((unsigned char*)keyData) {}
+
+ int woCompare(const KeyV1& r, const Ordering& o) const;
+ bool woEqual(const KeyV1& r) const;
+ BSONObj toBson() const;
+ std::string toString() const {
+ return toBson().toString();
+ }
+
+ /** get the key data we want to store in the btree bucket */
+ const char* data() const {
+ return (const char*)_keyData;
+ }
+
+ /** @return size of data() */
+ int dataSize() const;
+
+ /** only used by geo, which always has bson keys */
+ BSONElement _firstElement() const {
+ return bson().firstElement();
+ }
+ bool isCompactFormat() const {
+ return *_keyData != IsBSON;
+ }
+
+ bool isValid() const {
+ return _keyData > (const unsigned char*)1;
+ }
+
+protected:
+ enum { IsBSON = 0xff };
+ const unsigned char* _keyData;
+ BSONObj bson() const {
+ dassert(!isCompactFormat());
+ return BSONObj((const char*)_keyData + 1);
+ }
+
+private:
+ int compareHybrid(const KeyV1& right, const Ordering& order) const;
+};
+
+class KeyV1Owned : public KeyV1 {
+ void operator=(const KeyV1Owned&);
+public:
+    /** @param obj a BSON object to be translated to KeyV1 format. If the object isn't
+ representable in KeyV1 format (which happens, intentionally, at times)
+ it will stay as bson herein.
+ */
+ KeyV1Owned(const BSONObj& obj);
+
+ /** makes a copy (memcpy's the whole thing) */
+ KeyV1Owned(const KeyV1& rhs);
+
+private:
+ StackBufBuilder b;
+ void traditional(const BSONObj& obj); // store as traditional bson not as compact format
+};
};
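
As a usage sketch for the classes above: build each key once as a KeyV1Owned, which owns
its buffer, and compare with woCompare/woEqual; a plain KeyV1 is a non-owning view over
the same bytes. Illustrative only, assuming the usual mongo BSON, Ordering and invariant
facilities are in scope:

    void exampleKeyUsage() {
        BSONObj a = BSON("" << 1 << "" << "abc");
        BSONObj b = BSON("" << 1 << "" << "abd");

        KeyV1Owned ka(a);  // compact format, or traditional bson if not representable
        KeyV1Owned kb(b);

        const Ordering order = Ordering::make(BSONObj());  // all fields ascending
        invariant(ka.woCompare(kb, order) < 0);             // "abc" sorts before "abd"
        invariant(ka.woEqual(ka));

        KeyV1 view(ka.data());  // wrapper only; must not outlive ka's buffer
        invariant(view.dataSize() == ka.dataSize());
    }
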
diff --git a/src/mongo/db/storage/mmap_v1/catalog/hashtab.cpp b/src/mongo/db/storage/mmap_v1/catalog/hashtab.cpp
index 9c86a4fffba..df766917fac 100644
--- a/src/mongo/db/storage/mmap_v1/catalog/hashtab.cpp
+++ b/src/mongo/db/storage/mmap_v1/catalog/hashtab.cpp
@@ -36,52 +36,50 @@
namespace mongo {
- int NamespaceHashTable::_find(const Namespace& k, bool& found) const {
- found = false;
- int h = k.hash();
- int i = h % n;
- int start = i;
- int chain = 0;
- int firstNonUsed = -1;
- while ( 1 ) {
- if ( !_nodes(i).inUse() ) {
- if ( firstNonUsed < 0 )
- firstNonUsed = i;
- }
-
- if ( _nodes(i).hash == h && _nodes(i).key == k ) {
- if ( chain >= 200 )
- log() << "warning: hashtable " << _name << " long chain " << std::endl;
- found = true;
- return i;
- }
- chain++;
- i = (i+1) % n;
- if ( i == start ) {
- // shouldn't get here / defensive for infinite loops
- log() << "error: hashtable " << _name << " is full n:" << n << std::endl;
- return -1;
- }
- if( chain >= maxChain ) {
- if ( firstNonUsed >= 0 )
- return firstNonUsed;
- log() << "error: hashtable " << _name << " max chain reached:" << maxChain << std::endl;
- return -1;
- }
+int NamespaceHashTable::_find(const Namespace& k, bool& found) const {
+ found = false;
+ int h = k.hash();
+ int i = h % n;
+ int start = i;
+ int chain = 0;
+ int firstNonUsed = -1;
+ while (1) {
+ if (!_nodes(i).inUse()) {
+ if (firstNonUsed < 0)
+ firstNonUsed = i;
}
- }
- /* buf must be all zeroes on initialization. */
- NamespaceHashTable::NamespaceHashTable(void* buf, int buflen, const char* name)
- : _name(name),
- _buf(buf) {
-
- n = buflen / sizeof(Node);
- if ((n & 1) == 0) {
- n--;
+ if (_nodes(i).hash == h && _nodes(i).key == k) {
+ if (chain >= 200)
+ log() << "warning: hashtable " << _name << " long chain " << std::endl;
+ found = true;
+ return i;
+ }
+ chain++;
+ i = (i + 1) % n;
+ if (i == start) {
+            // shouldn't get here / defensive against infinite loops
+ log() << "error: hashtable " << _name << " is full n:" << n << std::endl;
+ return -1;
}
+ if (chain >= maxChain) {
+ if (firstNonUsed >= 0)
+ return firstNonUsed;
+ log() << "error: hashtable " << _name << " max chain reached:" << maxChain << std::endl;
+ return -1;
+ }
+ }
+}
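
_find above is open addressing with linear probing: hash to a bucket, then walk forward
until the key is found, a free slot turns up, or a chain limit trips. A self-contained
miniature of the same loop; note the real table keeps scanning past unused slots because
kill() can leave holes mid-chain, a detail this toy version ignores:

    #include <cassert>
    #include <cstring>

    const int N = 7;  // kept odd, as the constructor below forces n odd

    struct Slot {
        char key[16];  // key[0] == 0 means the slot is unused
        int value;
    };

    // Returns the slot for key: the match if present, else the first free slot.
    int find(Slot* slots, const char* key, bool& found) {
        found = false;
        unsigned h = 0;
        for (const char* p = key; *p; p++)
            h = h * 131 + *p;  // same polynomial shape as Namespace::hash()
        int i = h % N;
        for (int chain = 0; chain < N; chain++) {
            if (slots[i].key[0] == 0)
                return i;  // free slot: key absent, caller may insert here
            if (strcmp(slots[i].key, key) == 0) {
                found = true;
                return i;
            }
            i = (i + 1) % N;  // linear probe to the next bucket
        }
        return -1;  // table full
    }

    int main() {
        Slot slots[N];
        memset(slots, 0, sizeof(slots));
        bool found;
        int i = find(slots, "test.foo", found);
        assert(!found && i >= 0);
        strcpy(slots[i].key, "test.foo");
        slots[i].value = 42;
        assert(find(slots, "test.foo", found) == i && found);
        return 0;
    }
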
- maxChain = (int)(n * 0.05);
+/* buf must be all zeroes on initialization. */
+NamespaceHashTable::NamespaceHashTable(void* buf, int buflen, const char* name)
+ : _name(name), _buf(buf) {
+ n = buflen / sizeof(Node);
+ if ((n & 1) == 0) {
+ n--;
}
+ maxChain = (int)(n * 0.05);
+}
+
} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/catalog/hashtab.h b/src/mongo/db/storage/mmap_v1/catalog/hashtab.h
index b4ab9d858fa..286de349138 100644
--- a/src/mongo/db/storage/mmap_v1/catalog/hashtab.h
+++ b/src/mongo/db/storage/mmap_v1/catalog/hashtab.h
@@ -35,105 +35,103 @@
namespace mongo {
- /**
- * Simple, fixed size hash table used for namespace mapping (effectively the contents of the
- * MMAP V1 .ns file). Uses a contiguous block of memory, so you can put it in a memory mapped
- * file very easily.
- */
- class NamespaceHashTable {
- MONGO_DISALLOW_COPYING(NamespaceHashTable);
- public:
-
- typedef stdx::function< void(const Namespace& k, NamespaceDetails& v) > IteratorCallback;
-
-
- /* buf must be all zeroes on initialization. */
- NamespaceHashTable(void* buf, int buflen, const char *name);
-
- NamespaceDetails* get(const Namespace& k) const {
- bool found;
- int i = _find(k, found);
- if (found) {
- return &_nodes(i).value;
- }
+/**
+ * Simple, fixed size hash table used for namespace mapping (effectively the contents of the
+ * MMAP V1 .ns file). Uses a contiguous block of memory, so you can put it in a memory mapped
+ * file very easily.
+ */
+class NamespaceHashTable {
+ MONGO_DISALLOW_COPYING(NamespaceHashTable);
- return 0;
- }
+public:
+ typedef stdx::function<void(const Namespace& k, NamespaceDetails& v)> IteratorCallback;
- void kill(OperationContext* txn, const Namespace& k) {
- bool found;
- int i = _find(k, found);
- if ( i >= 0 && found ) {
- Node* n = &_nodes(i);
- n = txn->recoveryUnit()->writing(n);
- n->key.kill();
- n->setUnused();
- }
- }
- /** returns false if too full */
- bool put(OperationContext* txn, const Namespace& k, const NamespaceDetails& value) {
- bool found;
- int i = _find(k, found);
- if (i < 0)
- return false;
-
- Node* n = txn->recoveryUnit()->writing(&_nodes(i));
- if (!found) {
- n->key = k;
- n->hash = k.hash();
- }
- else {
- invariant(n->hash == k.hash());
- }
+ /* buf must be all zeroes on initialization. */
+ NamespaceHashTable(void* buf, int buflen, const char* name);
- n->value = value;
- return true;
+ NamespaceDetails* get(const Namespace& k) const {
+ bool found;
+ int i = _find(k, found);
+ if (found) {
+ return &_nodes(i).value;
}
- void iterAll(IteratorCallback callback) {
- for (int i = 0; i < n; i++) {
- if (_nodes(i).inUse()) {
- callback(_nodes(i).key, _nodes(i).value);
- }
- }
+ return 0;
+ }
+
+ void kill(OperationContext* txn, const Namespace& k) {
+ bool found;
+ int i = _find(k, found);
+ if (i >= 0 && found) {
+ Node* n = &_nodes(i);
+ n = txn->recoveryUnit()->writing(n);
+ n->key.kill();
+ n->setUnused();
+ }
+ }
+
+ /** returns false if too full */
+ bool put(OperationContext* txn, const Namespace& k, const NamespaceDetails& value) {
+ bool found;
+ int i = _find(k, found);
+ if (i < 0)
+ return false;
+
+ Node* n = txn->recoveryUnit()->writing(&_nodes(i));
+ if (!found) {
+ n->key = k;
+ n->hash = k.hash();
+ } else {
+ invariant(n->hash == k.hash());
}
+ n->value = value;
+ return true;
+ }
+
+ void iterAll(IteratorCallback callback) {
+ for (int i = 0; i < n; i++) {
+ if (_nodes(i).inUse()) {
+ callback(_nodes(i).key, _nodes(i).value);
+ }
+ }
+ }
- private:
+private:
#pragma pack(1)
- struct Node {
- int hash;
- Namespace key;
- NamespaceDetails value;
+ struct Node {
+ int hash;
+ Namespace key;
+ NamespaceDetails value;
- bool inUse() const {
- return hash != 0;
- }
+ bool inUse() const {
+ return hash != 0;
+ }
- void setUnused() {
- hash = 0;
- }
- };
+ void setUnused() {
+ hash = 0;
+ }
+ };
#pragma pack()
- BOOST_STATIC_ASSERT(sizeof(Node) == 628);
+ BOOST_STATIC_ASSERT(sizeof(Node) == 628);
- int _find(const Namespace& k, bool& found) const;
+ int _find(const Namespace& k, bool& found) const;
- Node& _nodes(int i) const {
- Node *nodes = (Node *)_buf;
- return nodes[i];
- }
+ Node& _nodes(int i) const {
+ Node* nodes = (Node*)_buf;
+ return nodes[i];
+ }
- const char* _name;
- void* const _buf;
+ const char* _name;
+ void* const _buf;
- int n; // number of hashtable buckets
- int maxChain;
- };
+ int n; // number of hashtable buckets
+ int maxChain;
+};
-} // namespace mongo
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/catalog/index_details.cpp b/src/mongo/db/storage/mmap_v1/catalog/index_details.cpp
index bc9cc3ee791..fa9093196f8 100644
--- a/src/mongo/db/storage/mmap_v1/catalog/index_details.cpp
+++ b/src/mongo/db/storage/mmap_v1/catalog/index_details.cpp
@@ -32,9 +32,8 @@
namespace mongo {
- void IndexDetails::_reset() {
- head.setInvalid();
- info.setInvalid();
- }
-
+void IndexDetails::_reset() {
+ head.setInvalid();
+ info.setInvalid();
+}
}
diff --git a/src/mongo/db/storage/mmap_v1/catalog/index_details.h b/src/mongo/db/storage/mmap_v1/catalog/index_details.h
index 8b343d2ee66..1ee5387c57c 100644
--- a/src/mongo/db/storage/mmap_v1/catalog/index_details.h
+++ b/src/mongo/db/storage/mmap_v1/catalog/index_details.h
@@ -34,38 +34,37 @@
namespace mongo {
- /* Details about a particular index. There is one of these effectively for each object in
- system.namespaces (although this also includes the head pointer, which is not in that
- collection).
+/* Details about a particular index. There is one of these effectively for each object in
+ system.namespaces (although this also includes the head pointer, which is not in that
+ collection).
- This is an internal part of the catalog. Nothing outside of the catalog should use this.
+ This is an internal part of the catalog. Nothing outside of the catalog should use this.
- ** MemoryMapped in NamespaceDetails ** (i.e., this is on disk data)
- */
+ ** MemoryMapped in NamespaceDetails ** (i.e., this is on disk data)
+ */
#pragma pack(1)
- struct IndexDetails {
- /**
- * btree head disk location
- */
- DiskLoc head;
-
- /* Location of index info object. Format:
+struct IndexDetails {
+ /**
+ * btree head disk location
+ */
+ DiskLoc head;
- { name:"nameofindex", ns:"parentnsname", key: {keypattobject}
- [, unique: <bool>, background: <bool>, v:<version>]
- }
+ /* Location of index info object. Format:
- This object is in the system.indexes collection. Note that since we
- have a pointer to the object here, the object in system.indexes MUST NEVER MOVE.
- */
- DiskLoc info;
+ { name:"nameofindex", ns:"parentnsname", key: {keypattobject}
+ [, unique: <bool>, background: <bool>, v:<version>]
+ }
- /**
- * makes head and info invalid
- */
- void _reset();
+ This object is in the system.indexes collection. Note that since we
+ have a pointer to the object here, the object in system.indexes MUST NEVER MOVE.
+ */
+ DiskLoc info;
- };
+ /**
+ * makes head and info invalid
+ */
+ void _reset();
+};
#pragma pack()
-} // namespace mongo
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace-inl.h b/src/mongo/db/storage/mmap_v1/catalog/namespace-inl.h
index 318106dc5a7..6ed1bd661ca 100644
--- a/src/mongo/db/storage/mmap_v1/catalog/namespace-inl.h
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace-inl.h
@@ -36,43 +36,44 @@
namespace mongo {
- inline Namespace& Namespace::operator=(StringData ns) {
- // we fill the remaining space with all zeroes here. as the full Namespace struct is in
- // the datafiles (the .ns files specifically), that is helpful as then they are deterministic
- // in the bytes they have for a given sequence of operations. that makes testing and debugging
- // the data files easier.
- //
- // if profiling indicates this method is a significant bottleneck, we could have a version we
- // use for reads which does not fill with zeroes, and keep the zeroing behavior on writes.
- //
- memset( buf, 0, sizeof(buf) );
- uassert( 10080 , "ns name too long, max size is 127 bytes", ns.size() <= MaxNsLen);
- uassert( 17380 , "ns name can't contain embedded '\0' byte", ns.find('\0') == std::string::npos);
- ns.copyTo( buf, true );
- return *this;
- }
+inline Namespace& Namespace::operator=(StringData ns) {
+ // we fill the remaining space with all zeroes here. as the full Namespace struct is in
+ // the datafiles (the .ns files specifically), that is helpful as then they are deterministic
+ // in the bytes they have for a given sequence of operations. that makes testing and debugging
+ // the data files easier.
+ //
+ // if profiling indicates this method is a significant bottleneck, we could have a version we
+ // use for reads which does not fill with zeroes, and keep the zeroing behavior on writes.
+ //
+ memset(buf, 0, sizeof(buf));
+ uassert(10080, "ns name too long, max size is 127 bytes", ns.size() <= MaxNsLen);
+ uassert(17380, "ns name can't contain embedded '\0' byte", ns.find('\0') == std::string::npos);
+ ns.copyTo(buf, true);
+ return *this;
+}
- inline std::string Namespace::extraName(int i) const {
- char ex[] = "$extra";
- ex[5] += i;
- std::string s = std::string(buf) + ex;
- massert( 10348 , "$extra: ns name too long", s.size() <= MaxNsLen);
- return s;
- }
+inline std::string Namespace::extraName(int i) const {
+ char ex[] = "$extra";
+ ex[5] += i;
+ std::string s = std::string(buf) + ex;
+ massert(10348, "$extra: ns name too long", s.size() <= MaxNsLen);
+ return s;
+}
- inline bool Namespace::isExtra() const {
- const char *p = strstr(buf, "$extr");
- return p && p[5] && p[6] == 0; //==0 important in case an index uses name "$extra_1" for example
- }
+inline bool Namespace::isExtra() const {
+ const char* p = strstr(buf, "$extr");
+ return p && p[5] &&
+ p[6] == 0; //==0 important in case an index uses name "$extra_1" for example
+}
- inline int Namespace::hash() const {
- unsigned x = 0;
- const char *p = buf;
- while ( *p ) {
- x = x * 131 + *p;
- p++;
- }
- return (x & 0x7fffffff) | 0x8000000; // must be > 0
+inline int Namespace::hash() const {
+ unsigned x = 0;
+ const char* p = buf;
+ while (*p) {
+ x = x * 131 + *p;
+ p++;
}
+ return (x & 0x7fffffff) | 0x8000000; // must be > 0
+}
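
hash() is a base-131 polynomial over the name's bytes, and the final mask-and-OR forces
a positive, nonzero result -- which matters because the .ns hash table treats hash != 0
as the in-use marker for a slot. A worked instance using the same arithmetic, assuming
nothing beyond the function above:

    #include <cassert>

    int nsHash(const char* buf) {
        unsigned x = 0;
        for (const char* p = buf; *p; p++)
            x = x * 131 + *p;
        return (x & 0x7fffffff) | 0x8000000;  // must be > 0
    }

    int main() {
        // "ab": ('a' * 131) + 'b' = 97 * 131 + 98 = 12805, then bit 27 is OR'd in.
        assert(nsHash("ab") == (12805 | 0x8000000));
        assert(nsHash("test.foo") > 0);  // always strictly positive
        return 0;
    }
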
} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace.cpp
index 374761fe386..c9dec65d520 100644
--- a/src/mongo/db/storage/mmap_v1/catalog/namespace.cpp
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace.cpp
@@ -37,13 +37,12 @@
#include "mongo/db/namespace_string.h"
namespace mongo {
- namespace {
- BOOST_STATIC_ASSERT( sizeof(Namespace) == 128 );
- BOOST_STATIC_ASSERT( Namespace::MaxNsLenWithNUL == MaxDatabaseNameLen );
- BOOST_STATIC_ASSERT((int)Namespace::MaxNsLenWithNUL == (int)NamespaceString::MaxNsLenWithNUL);
- BOOST_STATIC_ASSERT((int)Namespace::MaxNsLen == (int)NamespaceString::MaxNsLen);
- // Note the typo.
- BOOST_STATIC_ASSERT((int)Namespace::MaxNsColletionLen == (int)NamespaceString::MaxNsCollectionLen);
- }
+namespace {
+BOOST_STATIC_ASSERT(sizeof(Namespace) == 128);
+BOOST_STATIC_ASSERT(Namespace::MaxNsLenWithNUL == MaxDatabaseNameLen);
+BOOST_STATIC_ASSERT((int)Namespace::MaxNsLenWithNUL == (int)NamespaceString::MaxNsLenWithNUL);
+BOOST_STATIC_ASSERT((int)Namespace::MaxNsLen == (int)NamespaceString::MaxNsLen);
+// Note the typo.
+BOOST_STATIC_ASSERT((int)Namespace::MaxNsColletionLen == (int)NamespaceString::MaxNsCollectionLen);
+}
}
-
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace.h b/src/mongo/db/storage/mmap_v1/catalog/namespace.h
index 556e7adf889..f93112de47f 100644
--- a/src/mongo/db/storage/mmap_v1/catalog/namespace.h
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace.h
@@ -38,55 +38,77 @@
namespace mongo {
#pragma pack(1)
- /**
- * This is used for storing a namespace on disk in a fixed witdh form
- * it should only be used for that, not for passing internally
- * for that, please use NamespaceString
- */
- class Namespace {
- public:
- Namespace(StringData ns) { *this = ns; }
- Namespace& operator=(StringData ns);
-
- void kill() { buf[0] = 0x7f; }
-
- bool operator==(const char *r) const { return strcmp(buf, r) == 0; }
- bool operator==(const Namespace& r) const { return strcmp(buf, r.buf) == 0; }
- bool operator!=(const char *r) const { return strcmp(buf, r) != 0; }
- bool operator!=(const Namespace& r) const { return strcmp(buf, r.buf) != 0; }
-
- bool hasDollarSign() const { return strchr( buf , '$' ) != NULL; }
-
- int hash() const; // value returned is always > 0
-
- size_t size() const { return strlen( buf ); }
-
- std::string toString() const { return buf; }
- operator std::string() const { return buf; }
-
- /* NamespaceDetails::Extra was added after fact to allow chaining of data blocks to support more than 10 indexes
- (more than 10 IndexDetails). It's a bit hacky because of this late addition with backward
- file support. */
- std::string extraName(int i) const;
- bool isExtra() const; /* ends with $extr... -- when true an extra block not a normal NamespaceDetails block */
-
- enum MaxNsLenValue {
- // Maximum possible length of name any namespace, including special ones like $extra.
- // This includes rum for the NUL byte so it can be used when sizing buffers.
- MaxNsLenWithNUL = 128,
-
- // MaxNsLenWithNUL excluding the NUL byte. Use this when comparing std::string lengths.
- MaxNsLen = MaxNsLenWithNUL - 1,
-
- // Maximum allowed length of fully qualified namespace name of any real collection.
- // Does not include NUL so it can be directly compared to std::string lengths.
- MaxNsColletionLen = MaxNsLen - 7/*strlen(".$extra")*/,
- };
- private:
- char buf[MaxNsLenWithNUL];
+/**
+ * This is used for storing a namespace on disk in a fixed-width form.
+ * It should only be used for that, not for passing namespaces around internally;
+ * for that, please use NamespaceString.
+ */
+class Namespace {
+public:
+ Namespace(StringData ns) {
+ *this = ns;
+ }
+ Namespace& operator=(StringData ns);
+
+ void kill() {
+ buf[0] = 0x7f;
+ }
+
+ bool operator==(const char* r) const {
+ return strcmp(buf, r) == 0;
+ }
+ bool operator==(const Namespace& r) const {
+ return strcmp(buf, r.buf) == 0;
+ }
+ bool operator!=(const char* r) const {
+ return strcmp(buf, r) != 0;
+ }
+ bool operator!=(const Namespace& r) const {
+ return strcmp(buf, r.buf) != 0;
+ }
+
+ bool hasDollarSign() const {
+ return strchr(buf, '$') != NULL;
+ }
+
+ int hash() const; // value returned is always > 0
+
+ size_t size() const {
+ return strlen(buf);
+ }
+
+ std::string toString() const {
+ return buf;
+ }
+ operator std::string() const {
+ return buf;
+ }
+
+    /* NamespaceDetails::Extra was added after the fact to allow chaining of data blocks to support more than 10 indexes
+ (more than 10 IndexDetails). It's a bit hacky because of this late addition with backward
+ file support. */
+ std::string extraName(int i) const;
+ bool isExtra()
+ const; /* ends with $extr... -- when true an extra block not a normal NamespaceDetails block */
+
+ enum MaxNsLenValue {
+        // Maximum possible length of the name of any namespace, including special ones like
+        // $extra. This includes room for the NUL byte so it can be used when sizing buffers.
+ MaxNsLenWithNUL = 128,
+
+ // MaxNsLenWithNUL excluding the NUL byte. Use this when comparing std::string lengths.
+ MaxNsLen = MaxNsLenWithNUL - 1,
+
+ // Maximum allowed length of fully qualified namespace name of any real collection.
+ // Does not include NUL so it can be directly compared to std::string lengths.
+ MaxNsColletionLen = MaxNsLen - 7 /*strlen(".$extra")*/,
};
+
+private:
+ char buf[MaxNsLenWithNUL];
+};
#pragma pack()
-} // namespace mongo
+} // namespace mongo
#include "mongo/db/storage/mmap_v1/catalog/namespace-inl.h"
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_details.cpp
index 38fa8a7ae00..538a4500906 100644
--- a/src/mongo/db/storage/mmap_v1/catalog/namespace_details.cpp
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details.cpp
@@ -51,195 +51,193 @@
namespace mongo {
- NamespaceDetails::NamespaceDetails( const DiskLoc &loc, bool capped ) {
- BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) <= sizeof(NamespaceDetails) );
-
- /* be sure to initialize new fields here -- doesn't default to zeroes the way we use it */
- firstExtent = lastExtent = capExtent = loc;
- stats.datasize = stats.nrecords = 0;
- lastExtentSize = 0;
- nIndexes = 0;
- isCapped = capped;
- maxDocsInCapped = 0x7fffffff; // no limit (value is for pre-v2.3.2 compatibility)
- paddingFactorOldDoNotUse = 1.0;
- systemFlagsOldDoNotUse = 0;
- userFlags = 0;
- capFirstNewRecord = DiskLoc();
- // Signal that we are on first allocation iteration through extents.
- capFirstNewRecord.setInvalid();
- // For capped case, signal that we are doing initial extent allocation.
- if ( capped ) {
- // WAS: cappedLastDelRecLastExtent().setInvalid();
- deletedListSmall[1].setInvalid();
- }
- verify( sizeof(_dataFileVersion) == 2 );
- _dataFileVersion = 0;
- _indexFileVersion = 0;
- multiKeyIndexBits = 0;
- _reservedA = 0;
- _extraOffset = 0;
- indexBuildsInProgress = 0;
- memset(_reserved, 0, sizeof(_reserved));
+NamespaceDetails::NamespaceDetails(const DiskLoc& loc, bool capped) {
+ BOOST_STATIC_ASSERT(sizeof(NamespaceDetails::Extra) <= sizeof(NamespaceDetails));
+
+ /* be sure to initialize new fields here -- doesn't default to zeroes the way we use it */
+ firstExtent = lastExtent = capExtent = loc;
+ stats.datasize = stats.nrecords = 0;
+ lastExtentSize = 0;
+ nIndexes = 0;
+ isCapped = capped;
+ maxDocsInCapped = 0x7fffffff; // no limit (value is for pre-v2.3.2 compatibility)
+ paddingFactorOldDoNotUse = 1.0;
+ systemFlagsOldDoNotUse = 0;
+ userFlags = 0;
+ capFirstNewRecord = DiskLoc();
+ // Signal that we are on first allocation iteration through extents.
+ capFirstNewRecord.setInvalid();
+ // For capped case, signal that we are doing initial extent allocation.
+ if (capped) {
+ // WAS: cappedLastDelRecLastExtent().setInvalid();
+ deletedListSmall[1].setInvalid();
}
-
- NamespaceDetails::Extra* NamespaceDetails::allocExtra( OperationContext* txn,
- StringData ns,
- NamespaceIndex& ni,
- int nindexessofar) {
-
- // Namespace details must always be changed under an exclusive DB lock
- const NamespaceString nss(ns);
- invariant(txn->lockState()->isDbLockedForMode(nss.db(), MODE_X));
-
- int i = (nindexessofar - NIndexesBase) / NIndexesExtra;
- verify( i >= 0 && i <= 1 );
-
- Namespace fullns( ns );
- Namespace extrans( fullns.extraName(i) ); // throws UserException if ns name too long
-
- massert( 10350, "allocExtra: base ns missing?", this );
- massert( 10351, "allocExtra: extra already exists", ni.details(extrans) == 0 );
-
- Extra temp;
- temp.init();
-
- ni.add_ns( txn, extrans, reinterpret_cast<NamespaceDetails*>( &temp ) );
- Extra* e = reinterpret_cast<NamespaceDetails::Extra*>( ni.details( extrans ) );
-
- long ofs = e->ofsFrom(this);
- if( i == 0 ) {
- verify( _extraOffset == 0 );
- *txn->recoveryUnit()->writing(&_extraOffset) = ofs;
- verify( extra() == e );
- }
- else {
- Extra *hd = extra();
- verify( hd->next(this) == 0 );
- hd->setNext(txn, ofs);
- }
- return e;
+ verify(sizeof(_dataFileVersion) == 2);
+ _dataFileVersion = 0;
+ _indexFileVersion = 0;
+ multiKeyIndexBits = 0;
+ _reservedA = 0;
+ _extraOffset = 0;
+ indexBuildsInProgress = 0;
+ memset(_reserved, 0, sizeof(_reserved));
+}
+
+NamespaceDetails::Extra* NamespaceDetails::allocExtra(OperationContext* txn,
+ StringData ns,
+ NamespaceIndex& ni,
+ int nindexessofar) {
+ // Namespace details must always be changed under an exclusive DB lock
+ const NamespaceString nss(ns);
+ invariant(txn->lockState()->isDbLockedForMode(nss.db(), MODE_X));
+
+ int i = (nindexessofar - NIndexesBase) / NIndexesExtra;
+ verify(i >= 0 && i <= 1);
+
+ Namespace fullns(ns);
+ Namespace extrans(fullns.extraName(i)); // throws UserException if ns name too long
+
+ massert(10350, "allocExtra: base ns missing?", this);
+ massert(10351, "allocExtra: extra already exists", ni.details(extrans) == 0);
+
+ Extra temp;
+ temp.init();
+
+ ni.add_ns(txn, extrans, reinterpret_cast<NamespaceDetails*>(&temp));
+ Extra* e = reinterpret_cast<NamespaceDetails::Extra*>(ni.details(extrans));
+
+ long ofs = e->ofsFrom(this);
+ if (i == 0) {
+ verify(_extraOffset == 0);
+ *txn->recoveryUnit()->writing(&_extraOffset) = ofs;
+ verify(extra() == e);
+ } else {
+ Extra* hd = extra();
+ verify(hd->next(this) == 0);
+ hd->setNext(txn, ofs);
}
+ return e;
+}
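
allocExtra links Extra blocks by byte offsets from the owning NamespaceDetails rather
than by raw pointers, so the chain stays valid wherever the memory-mapped .ns file lands
in the address space. A minimal illustration of that offset arithmetic, with hypothetical
stand-in structs rather than the real on-disk layout:

    #include <cassert>
    #include <cstring>

    struct Hdr;  // stand-in for NamespaceDetails

    struct Ex {           // stand-in for NamespaceDetails::Extra
        long long _next;  // byte offset of the next Ex from the Hdr; 0 means end of chain
        Ex* next(const Hdr* d) const {
            return _next == 0 ? 0 : (Ex*)(((char*)d) + _next);
        }
        long ofsFrom(const Hdr* d) const {
            return (long)(((const char*)this) - ((const char*)d));
        }
    };

    struct Hdr {
        long long extraOffset;  // plays the role of _extraOffset
    };

    int main() {
        // Lay a header and two extras out in one buffer, as the .ns file would.
        char buf[4096];
        memset(buf, 0, sizeof(buf));
        Hdr* d = (Hdr*)buf;
        Ex* e1 = (Ex*)(buf + 512);
        Ex* e2 = (Ex*)(buf + 1024);

        d->extraOffset = e1->ofsFrom(d);  // 512
        e1->_next = e2->ofsFrom(d);       // 1024
        assert(((Ex*)((char*)d + d->extraOffset))->next(d) == e2);
        assert(e2->next(d) == 0);  // end of chain
        return 0;
    }
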
- IndexDetails& NamespaceDetails::idx(int idxNo, bool missingExpected) {
- if( idxNo < NIndexesBase ) {
- IndexDetails& id = _indexes[idxNo];
- return id;
- }
- Extra *e = extra();
- if ( ! e ) {
- if ( missingExpected )
- throw MsgAssertionException( 13283 , "Missing Extra" );
- massert(14045, "missing Extra", e);
- }
- int i = idxNo - NIndexesBase;
- if( i >= NIndexesExtra ) {
- e = e->next(this);
- if ( ! e ) {
- if ( missingExpected )
- throw MsgAssertionException( 14823 , "missing extra" );
- massert(14824, "missing Extra", e);
- }
- i -= NIndexesExtra;
- }
- return e->details[i];
+IndexDetails& NamespaceDetails::idx(int idxNo, bool missingExpected) {
+ if (idxNo < NIndexesBase) {
+ IndexDetails& id = _indexes[idxNo];
+ return id;
}
-
-
- const IndexDetails& NamespaceDetails::idx(int idxNo, bool missingExpected) const {
- if( idxNo < NIndexesBase ) {
- const IndexDetails& id = _indexes[idxNo];
- return id;
- }
- const Extra *e = extra();
- if ( ! e ) {
- if ( missingExpected )
- throw MsgAssertionException( 17421 , "Missing Extra" );
- massert(17422, "missing Extra", e);
- }
- int i = idxNo - NIndexesBase;
- if( i >= NIndexesExtra ) {
- e = e->next(this);
- if ( ! e ) {
- if ( missingExpected )
- throw MsgAssertionException( 17423 , "missing extra" );
- massert(17424, "missing Extra", e);
- }
- i -= NIndexesExtra;
+ Extra* e = extra();
+ if (!e) {
+ if (missingExpected)
+ throw MsgAssertionException(13283, "Missing Extra");
+ massert(14045, "missing Extra", e);
+ }
+ int i = idxNo - NIndexesBase;
+ if (i >= NIndexesExtra) {
+ e = e->next(this);
+ if (!e) {
+ if (missingExpected)
+ throw MsgAssertionException(14823, "missing extra");
+ massert(14824, "missing Extra", e);
}
- return e->details[i];
+ i -= NIndexesExtra;
}
+ return e->details[i];
+}
- NamespaceDetails::IndexIterator::IndexIterator(const NamespaceDetails *_d,
- bool includeBackgroundInProgress) {
- d = _d;
- i = 0;
- n = d->nIndexes;
- if ( includeBackgroundInProgress )
- n += d->indexBuildsInProgress;
- }
- // must be called when renaming a NS to fix up extra
- void NamespaceDetails::copyingFrom( OperationContext* txn,
- StringData thisns,
- NamespaceIndex& ni,
- NamespaceDetails* src) {
- _extraOffset = 0; // we are a copy -- the old value is wrong. fixing it up below.
- Extra *se = src->extra();
- int n = NIndexesBase;
- if( se ) {
- Extra *e = allocExtra(txn, thisns, ni, n);
- while( 1 ) {
- n += NIndexesExtra;
- e->copy(this, *se);
- se = se->next(src);
- if( se == 0 ) break;
- Extra *nxt = allocExtra(txn, thisns, ni, n);
- e->setNext( txn, nxt->ofsFrom(this) );
- e = nxt;
- }
- verify( _extraOffset );
- }
+const IndexDetails& NamespaceDetails::idx(int idxNo, bool missingExpected) const {
+ if (idxNo < NIndexesBase) {
+ const IndexDetails& id = _indexes[idxNo];
+ return id;
}
-
- NamespaceDetails* NamespaceDetails::writingWithoutExtra( OperationContext* txn ) {
- return txn->recoveryUnit()->writing( this );
+ const Extra* e = extra();
+ if (!e) {
+ if (missingExpected)
+ throw MsgAssertionException(17421, "Missing Extra");
+ massert(17422, "missing Extra", e);
}
-
-
- // XXX - this method should go away
- NamespaceDetails *NamespaceDetails::writingWithExtra( OperationContext* txn ) {
- for( Extra *e = extra(); e; e = e->next( this ) ) {
- txn->recoveryUnit()->writing( e );
+ int i = idxNo - NIndexesBase;
+ if (i >= NIndexesExtra) {
+ e = e->next(this);
+ if (!e) {
+ if (missingExpected)
+ throw MsgAssertionException(17423, "missing extra");
+ massert(17424, "missing Extra", e);
}
- return writingWithoutExtra( txn );
+ i -= NIndexesExtra;
}
-
- void NamespaceDetails::setMaxCappedDocs( OperationContext* txn, long long max ) {
- massert( 16499,
- "max in a capped collection has to be < 2^31 or -1",
- CollectionOptions::validMaxCappedDocs( &max ) );
- maxDocsInCapped = max;
+ return e->details[i];
+}
+
+NamespaceDetails::IndexIterator::IndexIterator(const NamespaceDetails* _d,
+ bool includeBackgroundInProgress) {
+ d = _d;
+ i = 0;
+ n = d->nIndexes;
+ if (includeBackgroundInProgress)
+ n += d->indexBuildsInProgress;
+}
+
+// must be called when renaming a NS to fix up extra
+void NamespaceDetails::copyingFrom(OperationContext* txn,
+ StringData thisns,
+ NamespaceIndex& ni,
+ NamespaceDetails* src) {
+ _extraOffset = 0; // we are a copy -- the old value is wrong. fixing it up below.
+ Extra* se = src->extra();
+ int n = NIndexesBase;
+ if (se) {
+ Extra* e = allocExtra(txn, thisns, ni, n);
+ while (1) {
+ n += NIndexesExtra;
+ e->copy(this, *se);
+ se = se->next(src);
+ if (se == 0)
+ break;
+ Extra* nxt = allocExtra(txn, thisns, ni, n);
+ e->setNext(txn, nxt->ofsFrom(this));
+ e = nxt;
+ }
+ verify(_extraOffset);
}
+}
- /* ------------------------------------------------------------------------- */
+NamespaceDetails* NamespaceDetails::writingWithoutExtra(OperationContext* txn) {
+ return txn->recoveryUnit()->writing(this);
+}
- int NamespaceDetails::_catalogFindIndexByName(OperationContext* txn,
- const Collection* coll,
- StringData name,
- bool includeBackgroundInProgress) const {
- IndexIterator i = ii(includeBackgroundInProgress);
- while( i.more() ) {
- const BSONObj obj = coll->docFor(txn, i.next().info.toRecordId()).value();
- if ( name == obj.getStringField("name") )
- return i.pos()-1;
- }
- return -1;
+// XXX - this method should go away
+NamespaceDetails* NamespaceDetails::writingWithExtra(OperationContext* txn) {
+ for (Extra* e = extra(); e; e = e->next(this)) {
+ txn->recoveryUnit()->writing(e);
}
-
- void NamespaceDetails::Extra::setNext( OperationContext* txn,
- long ofs ) {
- *txn->recoveryUnit()->writing(&_next) = ofs;
+ return writingWithoutExtra(txn);
+}
+
+void NamespaceDetails::setMaxCappedDocs(OperationContext* txn, long long max) {
+ massert(16499,
+ "max in a capped collection has to be < 2^31 or -1",
+ CollectionOptions::validMaxCappedDocs(&max));
+ maxDocsInCapped = max;
+}
+
+/* ------------------------------------------------------------------------- */
+
+
+int NamespaceDetails::_catalogFindIndexByName(OperationContext* txn,
+ const Collection* coll,
+ StringData name,
+ bool includeBackgroundInProgress) const {
+ IndexIterator i = ii(includeBackgroundInProgress);
+ while (i.more()) {
+ const BSONObj obj = coll->docFor(txn, i.next().info.toRecordId()).value();
+ if (name == obj.getStringField("name"))
+ return i.pos() - 1;
}
+ return -1;
+}
+
+void NamespaceDetails::Extra::setNext(OperationContext* txn, long ofs) {
+ *txn->recoveryUnit()->writing(&_next) = ofs;
+}
-} // namespace mongo
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details.h b/src/mongo/db/storage/mmap_v1/catalog/namespace_details.h
index 9011d6d27f3..5002bf267c7 100644
--- a/src/mongo/db/storage/mmap_v1/catalog/namespace_details.h
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details.h
@@ -35,200 +35,216 @@
namespace mongo {
- class Collection;
- class NamespaceIndex;
- class OperationContext;
+class Collection;
+class NamespaceIndex;
+class OperationContext;
#pragma pack(1)
- /* NamespaceDetails : this is the "header" for a collection that has all its details.
- It's in the .ns file and this is a memory mapped region (thus the pack pragma above).
+/* NamespaceDetails : this is the "header" for a collection that has all its details.
+ It's in the .ns file and this is a memory mapped region (thus the pack pragma above).
+*/
+class NamespaceDetails {
+public:
+ enum { NIndexesMax = 64, NIndexesExtra = 30, NIndexesBase = 10 };
+
+ // deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various
+ // sizes so you can look for a deleted record of about the right size. These buckets are
+ // split into small and large groups for compatibility with old versions.
+ static const int SmallBuckets = 18;
+ static const int LargeBuckets = 8;
+
+
+ /*-------- data fields, as present on disk : */
+
+ DiskLoc firstExtent;
+ DiskLoc lastExtent;
+
+ /* NOTE: capped collections v1 override the meaning of deletedList.
+ deletedList[0] points to a list of free records (DeletedRecord's) for all extents in
+ the capped namespace.
+ deletedList[1] points to the last record in the prev extent. When the "current extent"
+ changes, this value is updated. !deletedList[1].isValid() when this value is not
+ yet computed.
*/
- class NamespaceDetails {
- public:
- enum { NIndexesMax = 64, NIndexesExtra = 30, NIndexesBase = 10 };
+ DiskLoc deletedListSmall[SmallBuckets];
+ DiskLoc deletedListLegacyGrabBag; // old implementations put records of multiple sizes here.
- // deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various
- // sizes so you can look for a deleted record of about the right size. These buckets are
- // split into small and large groups for compatibility with old versions.
- static const int SmallBuckets = 18;
- static const int LargeBuckets = 8;
+ // ofs 168 (8 byte aligned)
+ struct Stats {
+        // datasize and nrecords MUST be adjacent; code assumes this!
+ long long datasize; // this includes padding, but not record headers
+ long long nrecords;
+ } stats;
- /*-------- data fields, as present on disk : */
+ int lastExtentSize;
- DiskLoc firstExtent;
- DiskLoc lastExtent;
+ int nIndexes;
- /* NOTE: capped collections v1 override the meaning of deletedList.
- deletedList[0] points to a list of free records (DeletedRecord's) for all extents in
- the capped namespace.
- deletedList[1] points to the last record in the prev extent. When the "current extent"
- changes, this value is updated. !deletedList[1].isValid() when this value is not
- yet computed.
- */
- DiskLoc deletedListSmall[SmallBuckets];
- DiskLoc deletedListLegacyGrabBag; // old implementations put records of multiple sizes here.
+ // ofs 192
+ IndexDetails _indexes[NIndexesBase];
- // ofs 168 (8 byte aligned)
- struct Stats {
- // datasize and nrecords MUST Be adjacent code assumes!
- long long datasize; // this includes padding, but not record headers
- long long nrecords;
- } stats;
+public:
+ // ofs 352 (16 byte aligned)
+ int isCapped; // there is wasted space here if I'm right (ERH)
+ int maxDocsInCapped; // max # of objects for a capped table, -1 for inf.
- int lastExtentSize;
+ double paddingFactorOldDoNotUse;
+ // ofs 368 (16)
+ int systemFlagsOldDoNotUse; // things that the system sets/cares about
- int nIndexes;
+ DiskLoc capExtent; // the "current" extent we're writing too for a capped collection
+ DiskLoc capFirstNewRecord;
- // ofs 192
- IndexDetails _indexes[NIndexesBase];
+ unsigned short
+ _dataFileVersion; // NamespaceDetails version. So we can do backward compatibility in the future. See filever.h
+ unsigned short _indexFileVersion;
- public:
- // ofs 352 (16 byte aligned)
- int isCapped; // there is wasted space here if I'm right (ERH)
+ unsigned long long multiKeyIndexBits;
- int maxDocsInCapped; // max # of objects for a capped table, -1 for inf.
+ // ofs 400 (16)
+ unsigned long long _reservedA;
+ long long _extraOffset; // where the $extra info is located (bytes relative to this)
- double paddingFactorOldDoNotUse;
- // ofs 368 (16)
- int systemFlagsOldDoNotUse; // things that the system sets/cares about
+public:
+ int indexBuildsInProgress; // Number of indexes currently being built
- DiskLoc capExtent; // the "current" extent we're writing too for a capped collection
- DiskLoc capFirstNewRecord;
+ int userFlags;
- unsigned short _dataFileVersion; // NamespaceDetails version. So we can do backward compatibility in the future. See filever.h
- unsigned short _indexFileVersion;
+ DiskLoc deletedListLarge[LargeBuckets];
- unsigned long long multiKeyIndexBits;
+ // Think carefully before using this. We need at least 8 bytes reserved to leave room for a
+ // DiskLoc pointing to more data (eg in a dummy MmapV1RecordHeader or Extent). There is still _reservedA
+ // above, but these are the final two reserved 8-byte regions.
+ char _reserved[8];
+ /*-------- end data 496 bytes */
+public:
+ explicit NamespaceDetails(const DiskLoc& loc, bool _capped);
- // ofs 400 (16)
- unsigned long long _reservedA;
- long long _extraOffset; // where the $extra info is located (bytes relative to this)
+ class Extra {
+ long long _next;
public:
- int indexBuildsInProgress; // Number of indexes currently being built
-
- int userFlags;
+ IndexDetails details[NIndexesExtra];
- DiskLoc deletedListLarge[LargeBuckets];
+ private:
+ unsigned reserved2;
+ unsigned reserved3;
+ Extra(const Extra&) {
+ verify(false);
+ }
+ Extra& operator=(const Extra& r) {
+ verify(false);
+ return *this;
+ }
- // Think carefully before using this. We need at least 8 bytes reserved to leave room for a
- // DiskLoc pointing to more data (eg in a dummy MmapV1RecordHeader or Extent). There is still _reservedA
- // above, but these are the final two reserved 8-byte regions.
- char _reserved[8];
- /*-------- end data 496 bytes */
public:
- explicit NamespaceDetails( const DiskLoc &loc, bool _capped );
-
- class Extra {
- long long _next;
- public:
- IndexDetails details[NIndexesExtra];
- private:
- unsigned reserved2;
- unsigned reserved3;
- Extra(const Extra&) { verify(false); }
- Extra& operator=(const Extra& r) { verify(false); return *this; }
- public:
- Extra() { }
- long ofsFrom(NamespaceDetails *d) {
- return ((char *) this) - ((char *) d);
- }
- void init() { memset(this, 0, sizeof(Extra)); }
- Extra* next(const NamespaceDetails *d) const {
- if( _next == 0 ) return 0;
- return (Extra*) (((char *) d) + _next);
- }
- void setNext(OperationContext* txn, long ofs);
- void copy(NamespaceDetails *d, const Extra& e) {
- memcpy(this, &e, sizeof(Extra));
- _next = 0;
- }
- };
- Extra* extra() const {
- if( _extraOffset == 0 ) return 0;
- return (Extra *) (((char *) this) + _extraOffset);
+ Extra() {}
+ long ofsFrom(NamespaceDetails* d) {
+ return ((char*)this) - ((char*)d);
}
- /* add extra space for indexes when more than 10 */
- Extra* allocExtra( OperationContext* txn,
- StringData ns,
- NamespaceIndex& ni,
- int nindexessofar );
-
- void copyingFrom( OperationContext* txn,
- StringData thisns,
- NamespaceIndex& ni,
- NamespaceDetails *src); // must be called when renaming a NS to fix up extra
-
+ void init() {
+ memset(this, 0, sizeof(Extra));
+ }
+ Extra* next(const NamespaceDetails* d) const {
+ if (_next == 0)
+ return 0;
+ return (Extra*)(((char*)d) + _next);
+ }
+ void setNext(OperationContext* txn, long ofs);
+ void copy(NamespaceDetails* d, const Extra& e) {
+ memcpy(this, &e, sizeof(Extra));
+ _next = 0;
+ }
+ };
+ Extra* extra() const {
+ if (_extraOffset == 0)
+ return 0;
+ return (Extra*)(((char*)this) + _extraOffset);
+ }
+ /* add extra space for indexes when more than 10 */
+ Extra* allocExtra(OperationContext* txn, StringData ns, NamespaceIndex& ni, int nindexessofar);
+
+ void copyingFrom(OperationContext* txn,
+ StringData thisns,
+ NamespaceIndex& ni,
+ NamespaceDetails* src); // must be called when renaming a NS to fix up extra
+
+public:
+ void setMaxCappedDocs(OperationContext* txn, long long max);
+
+ enum UserFlags {
+ Flag_UsePowerOf2Sizes = 1 << 0,
+ Flag_NoPadding = 1 << 1,
+ };
+
+ IndexDetails& idx(int idxNo, bool missingExpected = false);
+ const IndexDetails& idx(int idxNo, bool missingExpected = false) const;
+
+ class IndexIterator {
public:
- void setMaxCappedDocs( OperationContext* txn, long long max );
-
- enum UserFlags {
- Flag_UsePowerOf2Sizes = 1 << 0,
- Flag_NoPadding = 1 << 1,
- };
-
- IndexDetails& idx(int idxNo, bool missingExpected = false );
- const IndexDetails& idx(int idxNo, bool missingExpected = false ) const;
-
- class IndexIterator {
- public:
- int pos() { return i; } // note this is the next one to come
- bool more() { return i < n; }
- const IndexDetails& next() { return d->idx(i++); }
- private:
- friend class NamespaceDetails;
- int i, n;
- const NamespaceDetails *d;
- IndexIterator(const NamespaceDetails *_d, bool includeBackgroundInProgress);
- };
-
- IndexIterator ii( bool includeBackgroundInProgress = false ) const {
- return IndexIterator(this, includeBackgroundInProgress);
+ int pos() {
+ return i;
+ } // note this is the next one to come
+ bool more() {
+ return i < n;
+ }
+ const IndexDetails& next() {
+ return d->idx(i++);
}
-
- /**
- * This fetches the IndexDetails for the next empty index slot. The caller must populate
- * returned object. This handles allocating extra index space, if necessary.
- */
- IndexDetails& getNextIndexDetails(OperationContext* txn, Collection* collection);
-
- NamespaceDetails *writingWithoutExtra( OperationContext* txn );
-
- /** Make all linked Extra objects writeable as well */
- NamespaceDetails *writingWithExtra( OperationContext* txn );
-
- /**
- * Returns the offset of the specified index name within the array of indexes. Must be
- * passed-in the owning collection to resolve the index record entries to objects.
- *
- * @return > 0 if index name was found, -1 otherwise.
- */
- int _catalogFindIndexByName(OperationContext* txn,
- const Collection* coll,
- StringData name,
- bool includeBackgroundInProgress) const;
private:
-
- /**
- * swaps all meta data for 2 indexes
- * a and b are 2 index ids, whose contents will be swapped
- * must have a lock on the entire collection to do this
- */
- void swapIndex( OperationContext* txn, int a, int b );
-
- friend class IndexCatalog;
- friend class IndexCatalogEntry;
-
- /** Update cappedLastDelRecLastExtent() after capExtent changed in cappedTruncateAfter() */
- void cappedTruncateLastDelUpdate();
- BOOST_STATIC_ASSERT( NIndexesMax <= NIndexesBase + NIndexesExtra*2 );
- BOOST_STATIC_ASSERT( NIndexesMax <= 64 ); // multiKey bits
- BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) == 496 );
- }; // NamespaceDetails
- BOOST_STATIC_ASSERT( sizeof(NamespaceDetails) == 496 );
+ friend class NamespaceDetails;
+ int i, n;
+ const NamespaceDetails* d;
+ IndexIterator(const NamespaceDetails* _d, bool includeBackgroundInProgress);
+ };
+
+ IndexIterator ii(bool includeBackgroundInProgress = false) const {
+ return IndexIterator(this, includeBackgroundInProgress);
+ }
+
+ /**
+ * This fetches the IndexDetails for the next empty index slot. The caller must populate
+ * returned object. This handles allocating extra index space, if necessary.
+ */
+ IndexDetails& getNextIndexDetails(OperationContext* txn, Collection* collection);
+
+ NamespaceDetails* writingWithoutExtra(OperationContext* txn);
+
+ /** Make all linked Extra objects writeable as well */
+ NamespaceDetails* writingWithExtra(OperationContext* txn);
+
+ /**
+ * Returns the offset of the specified index name within the array of indexes. Must be
+ * passed-in the owning collection to resolve the index record entries to objects.
+ *
+ * @return >= 0 if index name was found, -1 otherwise.
+ */
+ int _catalogFindIndexByName(OperationContext* txn,
+ const Collection* coll,
+ StringData name,
+ bool includeBackgroundInProgress) const;
+
+private:
+ /**
+ * swaps all meta data for 2 indexes
+ * a and b are 2 index ids, whose contents will be swapped
+ * must have a lock on the entire collection to do this
+ */
+ void swapIndex(OperationContext* txn, int a, int b);
+
+ friend class IndexCatalog;
+ friend class IndexCatalogEntry;
+
+ /** Update cappedLastDelRecLastExtent() after capExtent changed in cappedTruncateAfter() */
+ void cappedTruncateLastDelUpdate();
+ BOOST_STATIC_ASSERT(NIndexesMax <= NIndexesBase + NIndexesExtra * 2);
+ BOOST_STATIC_ASSERT(NIndexesMax <= 64); // multiKey bits
+ BOOST_STATIC_ASSERT(sizeof(NamespaceDetails::Extra) == 496);
+}; // NamespaceDetails
+BOOST_STATIC_ASSERT(sizeof(NamespaceDetails) == 496);
#pragma pack()
-} // namespace mongo
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.cpp
index 1d3fef7b918..7e79cfdca9d 100644
--- a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.cpp
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.cpp
@@ -43,359 +43,350 @@
namespace mongo {
- using std::string;
-
- NamespaceDetailsCollectionCatalogEntry::NamespaceDetailsCollectionCatalogEntry(
- StringData ns,
- NamespaceDetails* details,
- RecordStore* namespacesRecordStore,
- RecordStore* indexRecordStore,
- MMAPV1DatabaseCatalogEntry* db )
- : CollectionCatalogEntry( ns ),
- _details( details ),
- _namespacesRecordStore(namespacesRecordStore),
- _indexRecordStore( indexRecordStore ),
- _db( db ) {
- }
-
- CollectionOptions NamespaceDetailsCollectionCatalogEntry::getCollectionOptions(OperationContext* txn) const {
- CollectionOptions options = _db->getCollectionOptions( txn, ns().ns() );
-
- if (options.flagsSet) {
- if (options.flags != _details->userFlags) {
- warning() << "system.namespaces and NamespaceDetails disagree about userFlags."
- << " system.namespaces: " << options.flags
- << " NamespaceDetails: " << _details->userFlags;
- dassert(options.flags == _details->userFlags);
- }
+using std::string;
+
+NamespaceDetailsCollectionCatalogEntry::NamespaceDetailsCollectionCatalogEntry(
+ StringData ns,
+ NamespaceDetails* details,
+ RecordStore* namespacesRecordStore,
+ RecordStore* indexRecordStore,
+ MMAPV1DatabaseCatalogEntry* db)
+ : CollectionCatalogEntry(ns),
+ _details(details),
+ _namespacesRecordStore(namespacesRecordStore),
+ _indexRecordStore(indexRecordStore),
+ _db(db) {}
+
+CollectionOptions NamespaceDetailsCollectionCatalogEntry::getCollectionOptions(
+ OperationContext* txn) const {
+ CollectionOptions options = _db->getCollectionOptions(txn, ns().ns());
+
+ if (options.flagsSet) {
+ if (options.flags != _details->userFlags) {
+ warning() << "system.namespaces and NamespaceDetails disagree about userFlags."
+ << " system.namespaces: " << options.flags
+ << " NamespaceDetails: " << _details->userFlags;
+ dassert(options.flags == _details->userFlags);
}
-
- // Fill in the actual flags from the NamespaceDetails.
- // Leaving flagsSet alone since it indicates whether the user actively set the flags.
- options.flags = _details->userFlags;
-
- return options;
}
- int NamespaceDetailsCollectionCatalogEntry::getTotalIndexCount( OperationContext* txn ) const {
- return _details->nIndexes + _details->indexBuildsInProgress;
- }
+ // Fill in the actual flags from the NamespaceDetails.
+ // Leaving flagsSet alone since it indicates whether the user actively set the flags.
+ options.flags = _details->userFlags;
- int NamespaceDetailsCollectionCatalogEntry::getCompletedIndexCount( OperationContext* txn ) const {
- return _details->nIndexes;
- }
+ return options;
+}
- int NamespaceDetailsCollectionCatalogEntry::getMaxAllowedIndexes() const {
- return NamespaceDetails::NIndexesMax;
- }
+int NamespaceDetailsCollectionCatalogEntry::getTotalIndexCount(OperationContext* txn) const {
+ return _details->nIndexes + _details->indexBuildsInProgress;
+}
- void NamespaceDetailsCollectionCatalogEntry::getAllIndexes( OperationContext* txn,
- std::vector<std::string>* names ) const {
- NamespaceDetails::IndexIterator i = _details->ii( true );
- while ( i.more() ) {
- const IndexDetails& id = i.next();
- const BSONObj obj = _indexRecordStore->dataFor( txn, id.info.toRecordId() ).toBson();
- names->push_back( obj.getStringField("name") );
- }
- }
+int NamespaceDetailsCollectionCatalogEntry::getCompletedIndexCount(OperationContext* txn) const {
+ return _details->nIndexes;
+}
- bool NamespaceDetailsCollectionCatalogEntry::isIndexMultikey(OperationContext* txn,
- StringData idxName) const {
- int idxNo = _findIndexNumber( txn, idxName );
- invariant( idxNo >= 0 );
- return isIndexMultikey( idxNo );
- }
+int NamespaceDetailsCollectionCatalogEntry::getMaxAllowedIndexes() const {
+ return NamespaceDetails::NIndexesMax;
+}
- bool NamespaceDetailsCollectionCatalogEntry::isIndexMultikey(int idxNo) const {
- return (_details->multiKeyIndexBits & (((unsigned long long) 1) << idxNo)) != 0;
+void NamespaceDetailsCollectionCatalogEntry::getAllIndexes(OperationContext* txn,
+ std::vector<std::string>* names) const {
+ NamespaceDetails::IndexIterator i = _details->ii(true);
+ while (i.more()) {
+ const IndexDetails& id = i.next();
+ const BSONObj obj = _indexRecordStore->dataFor(txn, id.info.toRecordId()).toBson();
+ names->push_back(obj.getStringField("name"));
}
+}
- bool NamespaceDetailsCollectionCatalogEntry::setIndexIsMultikey(OperationContext* txn,
- StringData indexName,
- bool multikey ) {
+bool NamespaceDetailsCollectionCatalogEntry::isIndexMultikey(OperationContext* txn,
+ StringData idxName) const {
+ int idxNo = _findIndexNumber(txn, idxName);
+ invariant(idxNo >= 0);
+ return isIndexMultikey(idxNo);
+}
- int idxNo = _findIndexNumber( txn, indexName );
- invariant( idxNo >= 0 );
- return setIndexIsMultikey( txn, idxNo, multikey );
- }
+bool NamespaceDetailsCollectionCatalogEntry::isIndexMultikey(int idxNo) const {
+ return (_details->multiKeyIndexBits & (((unsigned long long)1) << idxNo)) != 0;
+}
- bool NamespaceDetailsCollectionCatalogEntry::setIndexIsMultikey(OperationContext* txn,
- int idxNo,
- bool multikey ) {
- unsigned long long mask = 1ULL << idxNo;
+bool NamespaceDetailsCollectionCatalogEntry::setIndexIsMultikey(OperationContext* txn,
+ StringData indexName,
+ bool multikey) {
+ int idxNo = _findIndexNumber(txn, indexName);
+ invariant(idxNo >= 0);
+ return setIndexIsMultikey(txn, idxNo, multikey);
+}
- if (multikey) {
- // Shortcut if the bit is already set correctly
- if (_details->multiKeyIndexBits & mask) {
- return false;
- }
+bool NamespaceDetailsCollectionCatalogEntry::setIndexIsMultikey(OperationContext* txn,
+ int idxNo,
+ bool multikey) {
+ unsigned long long mask = 1ULL << idxNo;
- *txn->recoveryUnit()->writing(&_details->multiKeyIndexBits) |= mask;
+ if (multikey) {
+ // Shortcut if the bit is already set correctly
+ if (_details->multiKeyIndexBits & mask) {
+ return false;
}
- else {
- // Shortcut if the bit is already set correctly
- if (!(_details->multiKeyIndexBits & mask)) {
- return false;
- }
-
- // Invert mask: all 1's except a 0 at the ith bit
- mask = ~mask;
- *txn->recoveryUnit()->writing(&_details->multiKeyIndexBits) &= mask;
+
+ *txn->recoveryUnit()->writing(&_details->multiKeyIndexBits) |= mask;
+ } else {
+ // Shortcut if the bit is already set correctly
+ if (!(_details->multiKeyIndexBits & mask)) {
+ return false;
}
- return true;
+ // Invert mask: all 1's except a 0 at the ith bit
+ mask = ~mask;
+ *txn->recoveryUnit()->writing(&_details->multiKeyIndexBits) &= mask;
}
- RecordId NamespaceDetailsCollectionCatalogEntry::getIndexHead(OperationContext* txn,
- StringData idxName) const {
- int idxNo = _findIndexNumber( txn, idxName );
- invariant( idxNo >= 0 );
- return _details->idx( idxNo ).head.toRecordId();
- }
+ return true;
+}
- BSONObj NamespaceDetailsCollectionCatalogEntry::getIndexSpec( OperationContext* txn,
- StringData idxName ) const {
- int idxNo = _findIndexNumber( txn, idxName );
- invariant( idxNo >= 0 );
- const IndexDetails& id = _details->idx( idxNo );
- return _indexRecordStore->dataFor( txn, id.info.toRecordId() ).toBson();
- }
+RecordId NamespaceDetailsCollectionCatalogEntry::getIndexHead(OperationContext* txn,
+ StringData idxName) const {
+ int idxNo = _findIndexNumber(txn, idxName);
+ invariant(idxNo >= 0);
+ return _details->idx(idxNo).head.toRecordId();
+}
- void NamespaceDetailsCollectionCatalogEntry::setIndexHead( OperationContext* txn,
- StringData idxName,
- const RecordId& newHead ) {
- int idxNo = _findIndexNumber( txn, idxName );
- invariant( idxNo >= 0 );
- *txn->recoveryUnit()->writing(&_details->idx(idxNo).head) = DiskLoc::fromRecordId(newHead);
- }
+BSONObj NamespaceDetailsCollectionCatalogEntry::getIndexSpec(OperationContext* txn,
+ StringData idxName) const {
+ int idxNo = _findIndexNumber(txn, idxName);
+ invariant(idxNo >= 0);
+ const IndexDetails& id = _details->idx(idxNo);
+ return _indexRecordStore->dataFor(txn, id.info.toRecordId()).toBson();
+}
- bool NamespaceDetailsCollectionCatalogEntry::isIndexReady( OperationContext* txn,
- StringData idxName ) const {
- int idxNo = _findIndexNumber( txn, idxName );
- invariant( idxNo >= 0 );
- return idxNo < getCompletedIndexCount( txn );
- }
+void NamespaceDetailsCollectionCatalogEntry::setIndexHead(OperationContext* txn,
+ StringData idxName,
+ const RecordId& newHead) {
+ int idxNo = _findIndexNumber(txn, idxName);
+ invariant(idxNo >= 0);
+ *txn->recoveryUnit()->writing(&_details->idx(idxNo).head) = DiskLoc::fromRecordId(newHead);
+}
- int NamespaceDetailsCollectionCatalogEntry::_findIndexNumber( OperationContext* txn,
- StringData idxName ) const {
- NamespaceDetails::IndexIterator i = _details->ii( true );
- while ( i.more() ) {
- const IndexDetails& id = i.next();
- int idxNo = i.pos() - 1;
- const BSONObj obj = _indexRecordStore->dataFor( txn, id.info.toRecordId() ).toBson();
- if ( idxName == obj.getStringField("name") )
- return idxNo;
- }
- return -1;
+bool NamespaceDetailsCollectionCatalogEntry::isIndexReady(OperationContext* txn,
+ StringData idxName) const {
+ int idxNo = _findIndexNumber(txn, idxName);
+ invariant(idxNo >= 0);
+ return idxNo < getCompletedIndexCount(txn);
+}
+
+int NamespaceDetailsCollectionCatalogEntry::_findIndexNumber(OperationContext* txn,
+ StringData idxName) const {
+ NamespaceDetails::IndexIterator i = _details->ii(true);
+ while (i.more()) {
+ const IndexDetails& id = i.next();
+ int idxNo = i.pos() - 1;
+ const BSONObj obj = _indexRecordStore->dataFor(txn, id.info.toRecordId()).toBson();
+ if (idxName == obj.getStringField("name"))
+ return idxNo;
}
+ return -1;
+}
+
+/* remove bit from a bit array - actually remove its slot, not just clear it
+ note: this function does not work with x == 63 -- that is ok
+ but keep in mind in the future if max indexes were extended to
+ exactly 64 it would be a problem
+*/
+unsigned long long removeAndSlideBit(unsigned long long b, int x) {
+ unsigned long long tmp = b;
+ return (tmp & ((((unsigned long long)1) << x) - 1)) | ((tmp >> (x + 1)) << x);
+}
- /* remove bit from a bit array - actually remove its slot, not a clear
- note: this function does not work with x == 63 -- that is ok
- but keep in mind in the future if max indexes were extended to
- exactly 64 it would be a problem
- */
- unsigned long long removeAndSlideBit(unsigned long long b, int x) {
- unsigned long long tmp = b;
- return
- (tmp & ((((unsigned long long) 1) << x)-1)) |
- ((tmp >> (x+1)) << x);
+class IndexUpdateTest : public StartupTest {
+public:
+ void run() {
+ verify(removeAndSlideBit(1, 0) == 0);
+ verify(removeAndSlideBit(2, 0) == 1);
+ verify(removeAndSlideBit(2, 1) == 0);
+ verify(removeAndSlideBit(255, 1) == 127);
+ verify(removeAndSlideBit(21, 2) == 9);
+ verify(removeAndSlideBit(0x4000000000000001ULL, 62) == 1);
}
+} iu_unittest;
- class IndexUpdateTest : public StartupTest {
- public:
- void run() {
- verify( removeAndSlideBit(1, 0) == 0 );
- verify( removeAndSlideBit(2, 0) == 1 );
- verify( removeAndSlideBit(2, 1) == 0 );
- verify( removeAndSlideBit(255, 1) == 127 );
- verify( removeAndSlideBit(21, 2) == 9 );
- verify( removeAndSlideBit(0x4000000000000001ULL, 62) == 1 );
- }
- } iu_unittest;
+Status NamespaceDetailsCollectionCatalogEntry::removeIndex(OperationContext* txn,
+ StringData indexName) {
+ int idxNo = _findIndexNumber(txn, indexName);
+ if (idxNo < 0)
+ return Status(ErrorCodes::NamespaceNotFound, "index not found to remove");
- Status NamespaceDetailsCollectionCatalogEntry::removeIndex( OperationContext* txn,
- StringData indexName ) {
- int idxNo = _findIndexNumber( txn, indexName );
- if ( idxNo < 0 )
- return Status( ErrorCodes::NamespaceNotFound, "index not found to remove" );
+ RecordId infoLocation = _details->idx(idxNo).info.toRecordId();
- RecordId infoLocation = _details->idx( idxNo ).info.toRecordId();
+ { // sanity check
+ BSONObj info = _indexRecordStore->dataFor(txn, infoLocation).toBson();
+ invariant(info["name"].String() == indexName);
+ }
- { // sanity check
- BSONObj info = _indexRecordStore->dataFor( txn, infoLocation ).toBson();
- invariant( info["name"].String() == indexName );
+ { // drop the namespace
+ string indexNamespace = IndexDescriptor::makeIndexNamespace(ns().ns(), indexName);
+ Status status = _db->dropCollection(txn, indexNamespace);
+ if (!status.isOK()) {
+ return status;
}
+ }
- { // drop the namespace
- string indexNamespace = IndexDescriptor::makeIndexNamespace( ns().ns(), indexName );
- Status status = _db->dropCollection( txn, indexNamespace );
- if ( !status.isOK() ) {
- return status;
- }
- }
+ { // all info in the .ns file
+ NamespaceDetails* d = _details->writingWithExtra(txn);
- { // all info in the .ns file
- NamespaceDetails* d = _details->writingWithExtra( txn );
+ // fix multiKeyIndexBits by moving all bits above the removed slot down one
+ d->multiKeyIndexBits = removeAndSlideBit(d->multiKeyIndexBits, idxNo);
- // fix the _multiKeyIndexBits, by moving all bits above me down one
- d->multiKeyIndexBits = removeAndSlideBit(d->multiKeyIndexBits, idxNo);
+ if (idxNo >= d->nIndexes)
+ d->indexBuildsInProgress--;
+ else
+ d->nIndexes--;
- if ( idxNo >= d->nIndexes )
- d->indexBuildsInProgress--;
- else
- d->nIndexes--;
+ for (int i = idxNo; i < getTotalIndexCount(txn); i++)
+ d->idx(i) = d->idx(i + 1);
- for ( int i = idxNo; i < getTotalIndexCount( txn ); i++ )
- d->idx(i) = d->idx(i+1);
+ d->idx(getTotalIndexCount(txn)) = IndexDetails();
+ }
- d->idx( getTotalIndexCount( txn ) ) = IndexDetails();
- }
+ // remove from system.indexes
+ _indexRecordStore->deleteRecord(txn, infoLocation);
- // remove from system.indexes
- _indexRecordStore->deleteRecord( txn, infoLocation );
+ return Status::OK();
+}
- return Status::OK();
+Status NamespaceDetailsCollectionCatalogEntry::prepareForIndexBuild(OperationContext* txn,
+ const IndexDescriptor* desc) {
+ BSONObj spec = desc->infoObj();
+ // 1) entry in system.indexes
+ StatusWith<RecordId> systemIndexesEntry =
+ _indexRecordStore->insertRecord(txn, spec.objdata(), spec.objsize(), false);
+ if (!systemIndexesEntry.isOK())
+ return systemIndexesEntry.getStatus();
+
+ // 2) NamespaceDetails mods
+ IndexDetails* id;
+ try {
+ id = &_details->idx(getTotalIndexCount(txn), true);
+ } catch (DBException&) {
+ _details->allocExtra(txn, ns().ns(), _db->_namespaceIndex, getTotalIndexCount(txn));
+ id = &_details->idx(getTotalIndexCount(txn), false);
}
- Status NamespaceDetailsCollectionCatalogEntry::prepareForIndexBuild( OperationContext* txn,
- const IndexDescriptor* desc ) {
- BSONObj spec = desc->infoObj();
- // 1) entry in system.indexs
- StatusWith<RecordId> systemIndexesEntry = _indexRecordStore->insertRecord( txn,
- spec.objdata(),
- spec.objsize(),
- false );
- if ( !systemIndexesEntry.isOK() )
- return systemIndexesEntry.getStatus();
-
- // 2) NamespaceDetails mods
- IndexDetails *id;
- try {
- id = &_details->idx(getTotalIndexCount( txn ), true);
- }
- catch( DBException& ) {
- _details->allocExtra(txn,
- ns().ns(),
- _db->_namespaceIndex,
- getTotalIndexCount( txn ));
- id = &_details->idx(getTotalIndexCount( txn ), false);
- }
-
- const DiskLoc infoLoc = DiskLoc::fromRecordId(systemIndexesEntry.getValue());
- *txn->recoveryUnit()->writing( &id->info ) = infoLoc;
- *txn->recoveryUnit()->writing( &id->head ) = DiskLoc();
+ const DiskLoc infoLoc = DiskLoc::fromRecordId(systemIndexesEntry.getValue());
+ *txn->recoveryUnit()->writing(&id->info) = infoLoc;
+ *txn->recoveryUnit()->writing(&id->head) = DiskLoc();
- txn->recoveryUnit()->writingInt( _details->indexBuildsInProgress ) += 1;
+ txn->recoveryUnit()->writingInt(_details->indexBuildsInProgress) += 1;
- // 3) indexes entry in .ns file and system.namespaces
- _db->createNamespaceForIndex(txn, desc->indexNamespace());
+ // 3) indexes entry in .ns file and system.namespaces
+ _db->createNamespaceForIndex(txn, desc->indexNamespace());
- return Status::OK();
- }
+ return Status::OK();
+}
- void NamespaceDetailsCollectionCatalogEntry::indexBuildSuccess( OperationContext* txn,
- StringData indexName ) {
- int idxNo = _findIndexNumber( txn, indexName );
- fassert( 17202, idxNo >= 0 );
+void NamespaceDetailsCollectionCatalogEntry::indexBuildSuccess(OperationContext* txn,
+ StringData indexName) {
+ int idxNo = _findIndexNumber(txn, indexName);
+ fassert(17202, idxNo >= 0);
- // Make sure the newly created index is relocated to nIndexes, if it isn't already there
- if ( idxNo != getCompletedIndexCount( txn ) ) {
- int toIdxNo = getCompletedIndexCount( txn );
+ // Make sure the newly created index is relocated to nIndexes, if it isn't already there
+ if (idxNo != getCompletedIndexCount(txn)) {
+ int toIdxNo = getCompletedIndexCount(txn);
- //_details->swapIndex( txn, idxNo, toIdxNo );
+ //_details->swapIndex( txn, idxNo, toIdxNo );
- // flip main meta data
- IndexDetails temp = _details->idx(idxNo);
- *txn->recoveryUnit()->writing(&_details->idx(idxNo)) = _details->idx(toIdxNo);
- *txn->recoveryUnit()->writing(&_details->idx(toIdxNo)) = temp;
+ // flip main meta data
+ IndexDetails temp = _details->idx(idxNo);
+ *txn->recoveryUnit()->writing(&_details->idx(idxNo)) = _details->idx(toIdxNo);
+ *txn->recoveryUnit()->writing(&_details->idx(toIdxNo)) = temp;
- // flip multi key bits
- bool tempMultikey = isIndexMultikey(idxNo);
- setIndexIsMultikey( txn, idxNo, isIndexMultikey(toIdxNo) );
- setIndexIsMultikey( txn, toIdxNo, tempMultikey );
+ // flip multi key bits
+ bool tempMultikey = isIndexMultikey(idxNo);
+ setIndexIsMultikey(txn, idxNo, isIndexMultikey(toIdxNo));
+ setIndexIsMultikey(txn, toIdxNo, tempMultikey);
- idxNo = toIdxNo;
- invariant( (idxNo = _findIndexNumber( txn, indexName )) );
- }
+ idxNo = toIdxNo;
+ invariant((idxNo = _findIndexNumber(txn, indexName)));
+ }
- txn->recoveryUnit()->writingInt( _details->indexBuildsInProgress ) -= 1;
- txn->recoveryUnit()->writingInt( _details->nIndexes ) += 1;
+ txn->recoveryUnit()->writingInt(_details->indexBuildsInProgress) -= 1;
+ txn->recoveryUnit()->writingInt(_details->nIndexes) += 1;
- invariant( isIndexReady( txn, indexName ) );
- }
+ invariant(isIndexReady(txn, indexName));
+}
- void NamespaceDetailsCollectionCatalogEntry::updateTTLSetting( OperationContext* txn,
- StringData idxName,
- long long newExpireSeconds ) {
- int idx = _findIndexNumber( txn, idxName );
- invariant( idx >= 0 );
+void NamespaceDetailsCollectionCatalogEntry::updateTTLSetting(OperationContext* txn,
+ StringData idxName,
+ long long newExpireSeconds) {
+ int idx = _findIndexNumber(txn, idxName);
+ invariant(idx >= 0);
- IndexDetails& indexDetails = _details->idx( idx );
+ IndexDetails& indexDetails = _details->idx(idx);
- BSONObj obj = _indexRecordStore->dataFor( txn, indexDetails.info.toRecordId() ).toBson();
- const BSONElement oldExpireSecs = obj.getField("expireAfterSeconds");
+ BSONObj obj = _indexRecordStore->dataFor(txn, indexDetails.info.toRecordId()).toBson();
+ const BSONElement oldExpireSecs = obj.getField("expireAfterSeconds");
- // Important that we set the new value in-place. We are writing directly to the
- // object here so must be careful not to overwrite with a longer numeric type.
+ // Important that we set the new value in-place. We are writing directly to the
+ // object here so must be careful not to overwrite with a longer numeric type.
- char* nonConstPtr = const_cast<char*>(oldExpireSecs.value());
- switch( oldExpireSecs.type() ) {
+ char* nonConstPtr = const_cast<char*>(oldExpireSecs.value());
+ switch (oldExpireSecs.type()) {
case EOO:
- massert( 16631, "index does not have an 'expireAfterSeconds' field", false );
+ massert(16631, "index does not have an 'expireAfterSeconds' field", false);
break;
case NumberInt:
*txn->recoveryUnit()->writing(reinterpret_cast<int*>(nonConstPtr)) = newExpireSeconds;
break;
case NumberDouble:
- *txn->recoveryUnit()->writing(reinterpret_cast<double*>(nonConstPtr)) = newExpireSeconds;
+ *txn->recoveryUnit()->writing(reinterpret_cast<double*>(nonConstPtr)) =
+ newExpireSeconds;
break;
case NumberLong:
- *txn->recoveryUnit()->writing(reinterpret_cast<long long*>(nonConstPtr)) = newExpireSeconds;
+ *txn->recoveryUnit()->writing(reinterpret_cast<long long*>(nonConstPtr)) =
+ newExpireSeconds;
break;
default:
- massert( 16632, "current 'expireAfterSeconds' is not a number", false );
- }
+ massert(16632, "current 'expireAfterSeconds' is not a number", false);
}
+}
namespace {
- void updateSystemNamespaces(OperationContext* txn, RecordStore* namespaces,
- const NamespaceString& ns, const BSONObj& update) {
-
- if (!namespaces)
- return;
-
- auto cursor = namespaces->getCursor(txn);
- while (auto record = cursor->next()) {
- BSONObj oldEntry = record->data.releaseToBson();
- BSONElement e = oldEntry["name"];
- if (e.type() != String)
- continue;
-
- if (e.String() != ns.ns())
- continue;
-
- const BSONObj newEntry = applyUpdateOperators(oldEntry, update);
- StatusWith<RecordId> result = namespaces->updateRecord(txn, record->id,
- newEntry.objdata(),
- newEntry.objsize(),
- false, NULL);
- fassert(17486, result.getStatus());
- return;
- }
- fassertFailed(17488);
+void updateSystemNamespaces(OperationContext* txn,
+ RecordStore* namespaces,
+ const NamespaceString& ns,
+ const BSONObj& update) {
+ if (!namespaces)
+ return;
+
+ auto cursor = namespaces->getCursor(txn);
+ while (auto record = cursor->next()) {
+ BSONObj oldEntry = record->data.releaseToBson();
+ BSONElement e = oldEntry["name"];
+ if (e.type() != String)
+ continue;
+
+ if (e.String() != ns.ns())
+ continue;
+
+ const BSONObj newEntry = applyUpdateOperators(oldEntry, update);
+ StatusWith<RecordId> result = namespaces->updateRecord(
+ txn, record->id, newEntry.objdata(), newEntry.objsize(), false, NULL);
+ fassert(17486, result.getStatus());
+ return;
}
+ fassertFailed(17488);
+}
}
- void NamespaceDetailsCollectionCatalogEntry::updateFlags(OperationContext* txn, int newValue) {
- NamespaceDetailsRSV1MetaData md(ns().ns(), _details);
- md.replaceUserFlags(txn, newValue);
- updateSystemNamespaces(txn, _namespacesRecordStore, ns(),
- BSON("$set" << BSON("options.flags" << newValue)));
- }
+void NamespaceDetailsCollectionCatalogEntry::updateFlags(OperationContext* txn, int newValue) {
+ NamespaceDetailsRSV1MetaData md(ns().ns(), _details);
+ md.replaceUserFlags(txn, newValue);
+ updateSystemNamespaces(
+ txn, _namespacesRecordStore, ns(), BSON("$set" << BSON("options.flags" << newValue)));
+}
- void NamespaceDetailsCollectionCatalogEntry::updateValidator(OperationContext* txn,
- const BSONObj& validator) {
- updateSystemNamespaces(txn, _namespacesRecordStore, ns(),
- BSON("$set" << BSON("options.validator" << validator)));
- }
+void NamespaceDetailsCollectionCatalogEntry::updateValidator(OperationContext* txn,
+ const BSONObj& validator) {
+ updateSystemNamespaces(
+ txn, _namespacesRecordStore, ns(), BSON("$set" << BSON("options.validator" << validator)));
+}
}
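
The removeAndSlideBit arithmetic above splits the word at slot x: bits below x are kept by masking with (1 << x) - 1, and bits above x are shifted down one slot and re-based at x. Worked through the unit-test case removeAndSlideBit(21, 2): 21 is 0b10101; the low mask keeps 0b01, 21 >> 3 gives 0b10 which lands as 0b1000, and the union is 0b1001 = 9. A standalone re-check of that behaviour:

    #include <cassert>

    // Same formula as removeAndSlideBit above: drop slot x, slide higher bits down.
    unsigned long long removeAndSlideBit(unsigned long long b, int x) {
        return (b & ((1ULL << x) - 1)) | ((b >> (x + 1)) << x);
    }

    int main() {
        assert(removeAndSlideBit(21, 2) == 9);     // 0b10101 -> 0b1001
        assert(removeAndSlideBit(255, 1) == 127);  // 0b11111111 -> 0b1111111
        assert(removeAndSlideBit(0x4000000000000001ULL, 62) == 1);
        return 0;
    }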
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h
index 9080c24c776..2d6751345d6 100644
--- a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h
@@ -37,84 +37,73 @@
namespace mongo {
- class NamespaceDetails;
+class NamespaceDetails;
- class MMAPV1DatabaseCatalogEntry;;
- class RecordStore;
- class OperationContext;
+class MMAPV1DatabaseCatalogEntry;
+class RecordStore;
+class OperationContext;
- class NamespaceDetailsCollectionCatalogEntry : public CollectionCatalogEntry {
- public:
- NamespaceDetailsCollectionCatalogEntry( StringData ns,
- NamespaceDetails* details,
- RecordStore* namespacesRecordStore,
- RecordStore* indexRecordStore,
- MMAPV1DatabaseCatalogEntry* db );
+class NamespaceDetailsCollectionCatalogEntry : public CollectionCatalogEntry {
+public:
+ NamespaceDetailsCollectionCatalogEntry(StringData ns,
+ NamespaceDetails* details,
+ RecordStore* namespacesRecordStore,
+ RecordStore* indexRecordStore,
+ MMAPV1DatabaseCatalogEntry* db);
- ~NamespaceDetailsCollectionCatalogEntry(){}
+ ~NamespaceDetailsCollectionCatalogEntry() {}
- CollectionOptions getCollectionOptions(OperationContext* txn) const final;
+ CollectionOptions getCollectionOptions(OperationContext* txn) const final;
- int getTotalIndexCount(OperationContext* txn) const final;
+ int getTotalIndexCount(OperationContext* txn) const final;
- int getCompletedIndexCount(OperationContext* txn) const final;
+ int getCompletedIndexCount(OperationContext* txn) const final;
- int getMaxAllowedIndexes() const final;
+ int getMaxAllowedIndexes() const final;
- void getAllIndexes( OperationContext* txn,
- std::vector<std::string>* names ) const final;
+ void getAllIndexes(OperationContext* txn, std::vector<std::string>* names) const final;
- BSONObj getIndexSpec( OperationContext* txn,
- StringData idxName ) const final;
+ BSONObj getIndexSpec(OperationContext* txn, StringData idxName) const final;
- bool isIndexMultikey(OperationContext* txn,
- StringData indexName) const final;
- bool isIndexMultikey(int idxNo) const;
+ bool isIndexMultikey(OperationContext* txn, StringData indexName) const final;
+ bool isIndexMultikey(int idxNo) const;
- bool setIndexIsMultikey(OperationContext* txn,
- int idxNo,
- bool multikey = true);
- bool setIndexIsMultikey(OperationContext* txn,
- StringData indexName,
- bool multikey = true) final;
+ bool setIndexIsMultikey(OperationContext* txn, int idxNo, bool multikey = true);
+ bool setIndexIsMultikey(OperationContext* txn,
+ StringData indexName,
+ bool multikey = true) final;
- RecordId getIndexHead( OperationContext* txn,
- StringData indexName ) const final;
+ RecordId getIndexHead(OperationContext* txn, StringData indexName) const final;
- void setIndexHead( OperationContext* txn,
- StringData indexName,
- const RecordId& newHead ) final;
+ void setIndexHead(OperationContext* txn, StringData indexName, const RecordId& newHead) final;
- bool isIndexReady( OperationContext* txn,
- StringData indexName ) const final;
+ bool isIndexReady(OperationContext* txn, StringData indexName) const final;
- Status removeIndex( OperationContext* txn,
- StringData indexName ) final;
+ Status removeIndex(OperationContext* txn, StringData indexName) final;
- Status prepareForIndexBuild( OperationContext* txn,
- const IndexDescriptor* spec ) final;
+ Status prepareForIndexBuild(OperationContext* txn, const IndexDescriptor* spec) final;
- void indexBuildSuccess( OperationContext* txn,
- StringData indexName ) final;
+ void indexBuildSuccess(OperationContext* txn, StringData indexName) final;
- void updateTTLSetting( OperationContext* txn,
- StringData idxName,
- long long newExpireSeconds ) final;
+ void updateTTLSetting(OperationContext* txn,
+ StringData idxName,
+ long long newExpireSeconds) final;
- void updateFlags(OperationContext* txn, int newValue) final;
+ void updateFlags(OperationContext* txn, int newValue) final;
- void updateValidator(OperationContext* txn, const BSONObj& validator) final;
+ void updateValidator(OperationContext* txn, const BSONObj& validator) final;
- // not part of interface, but available to my storage engine
+ // not part of interface, but available to my storage engine
- int _findIndexNumber( OperationContext* txn, StringData indexName) const;
+ int _findIndexNumber(OperationContext* txn, StringData indexName) const;
- private:
- NamespaceDetails* _details;
- RecordStore* _namespacesRecordStore;
- RecordStore* _indexRecordStore;
- MMAPV1DatabaseCatalogEntry* _db;
+private:
+ NamespaceDetails* _details;
+ RecordStore* _namespacesRecordStore;
+ RecordStore* _indexRecordStore;
+ MMAPV1DatabaseCatalogEntry* _db;
- friend class MMAPV1DatabaseCatalogEntry;
- };
+ friend class MMAPV1DatabaseCatalogEntry;
+};
}
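
Behind the multikey methods declared above sits a single 64-bit word, one bit per index slot, which is why NIndexesMax is capped at 64. A minimal sketch of that bookkeeping, including the returns-whether-anything-changed convention of setIndexIsMultikey; MultikeyBits is a hypothetical reduction, not the catalog class:

    #include <cassert>

    struct MultikeyBits {
        unsigned long long bits = 0;  // one bit per index slot, slots 0..63

        bool isMultikey(int idxNo) const {
            return (bits & (1ULL << idxNo)) != 0;
        }

        // Returns true only when the stored state actually changed, so callers
        // can skip the journaled write when the bit is already correct.
        bool set(int idxNo, bool multikey) {
            const unsigned long long mask = 1ULL << idxNo;
            if (multikey == ((bits & mask) != 0))
                return false;
            if (multikey)
                bits |= mask;
            else
                bits &= ~mask;  // all 1's except a 0 at the idxNo-th bit
            return true;
        }
    };

    int main() {
        MultikeyBits m;
        assert(m.set(3, true));   // first transition reports a change
        assert(!m.set(3, true));  // second call is a no-op shortcut
        assert(m.isMultikey(3) && !m.isMultikey(4));
        assert(m.set(3, false) && m.bits == 0);
        return 0;
    }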
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.cpp
index 5c95ec2bbc7..51fc1c1ed75 100644
--- a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.cpp
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.cpp
@@ -35,169 +35,165 @@
namespace mongo {
- using std::unique_ptr;
- using std::numeric_limits;
+using std::unique_ptr;
+using std::numeric_limits;
- BOOST_STATIC_ASSERT(RecordStoreV1Base::Buckets
- == NamespaceDetails::SmallBuckets + NamespaceDetails::LargeBuckets);
+BOOST_STATIC_ASSERT(RecordStoreV1Base::Buckets ==
+ NamespaceDetails::SmallBuckets + NamespaceDetails::LargeBuckets);
- NamespaceDetailsRSV1MetaData::NamespaceDetailsRSV1MetaData( StringData ns,
- NamespaceDetails* details )
- : _ns( ns.toString() ),
- _details( details ) {
- }
-
- const DiskLoc& NamespaceDetailsRSV1MetaData::capExtent() const {
- return _details->capExtent;
- }
+NamespaceDetailsRSV1MetaData::NamespaceDetailsRSV1MetaData(StringData ns, NamespaceDetails* details)
+ : _ns(ns.toString()), _details(details) {}
- void NamespaceDetailsRSV1MetaData::setCapExtent( OperationContext* txn, const DiskLoc& loc ) {
- *txn->recoveryUnit()->writing( &_details->capExtent ) = loc;
- }
+const DiskLoc& NamespaceDetailsRSV1MetaData::capExtent() const {
+ return _details->capExtent;
+}
- const DiskLoc& NamespaceDetailsRSV1MetaData::capFirstNewRecord() const {
- return _details->capFirstNewRecord;
- }
+void NamespaceDetailsRSV1MetaData::setCapExtent(OperationContext* txn, const DiskLoc& loc) {
+ *txn->recoveryUnit()->writing(&_details->capExtent) = loc;
+}
- void NamespaceDetailsRSV1MetaData::setCapFirstNewRecord( OperationContext* txn,
- const DiskLoc& loc ) {
- *txn->recoveryUnit()->writing( &_details->capFirstNewRecord ) = loc;
- }
+const DiskLoc& NamespaceDetailsRSV1MetaData::capFirstNewRecord() const {
+ return _details->capFirstNewRecord;
+}
- bool NamespaceDetailsRSV1MetaData::capLooped() const {
- return _details->capFirstNewRecord.isValid();
- }
+void NamespaceDetailsRSV1MetaData::setCapFirstNewRecord(OperationContext* txn, const DiskLoc& loc) {
+ *txn->recoveryUnit()->writing(&_details->capFirstNewRecord) = loc;
+}
- long long NamespaceDetailsRSV1MetaData::dataSize() const {
- return _details->stats.datasize;
- }
- long long NamespaceDetailsRSV1MetaData::numRecords() const {
- return _details->stats.nrecords;
- }
+bool NamespaceDetailsRSV1MetaData::capLooped() const {
+ return _details->capFirstNewRecord.isValid();
+}
- void NamespaceDetailsRSV1MetaData::incrementStats( OperationContext* txn,
- long long dataSizeIncrement,
- long long numRecordsIncrement ) {
- // durability todo : this could be a bit annoying / slow to record constantly
- NamespaceDetails::Stats* s = txn->recoveryUnit()->writing( &_details->stats );
- s->datasize += dataSizeIncrement;
- s->nrecords += numRecordsIncrement;
- }
+long long NamespaceDetailsRSV1MetaData::dataSize() const {
+ return _details->stats.datasize;
+}
+long long NamespaceDetailsRSV1MetaData::numRecords() const {
+ return _details->stats.nrecords;
+}
- void NamespaceDetailsRSV1MetaData::setStats( OperationContext* txn,
- long long dataSize,
- long long numRecords ) {
- NamespaceDetails::Stats* s = txn->recoveryUnit()->writing( &_details->stats );
- s->datasize = dataSize;
- s->nrecords = numRecords;
- }
+void NamespaceDetailsRSV1MetaData::incrementStats(OperationContext* txn,
+ long long dataSizeIncrement,
+ long long numRecordsIncrement) {
+ // durability todo : this could be a bit annoying / slow to record constantly
+ NamespaceDetails::Stats* s = txn->recoveryUnit()->writing(&_details->stats);
+ s->datasize += dataSizeIncrement;
+ s->nrecords += numRecordsIncrement;
+}
- DiskLoc NamespaceDetailsRSV1MetaData::deletedListEntry( int bucket ) const {
- invariant(bucket >= 0 && bucket < RecordStoreV1Base::Buckets);
- const DiskLoc head = (bucket < NamespaceDetails::SmallBuckets)
- ? _details->deletedListSmall[bucket]
- : _details->deletedListLarge[bucket - NamespaceDetails::SmallBuckets];
+void NamespaceDetailsRSV1MetaData::setStats(OperationContext* txn,
+ long long dataSize,
+ long long numRecords) {
+ NamespaceDetails::Stats* s = txn->recoveryUnit()->writing(&_details->stats);
+ s->datasize = dataSize;
+ s->nrecords = numRecords;
+}
- if (head == DiskLoc(0,0)) {
- // This will happen the first time we use a "large" bucket since they were previously
- // zero-initialized.
- return DiskLoc();
- }
+DiskLoc NamespaceDetailsRSV1MetaData::deletedListEntry(int bucket) const {
+ invariant(bucket >= 0 && bucket < RecordStoreV1Base::Buckets);
+ const DiskLoc head = (bucket < NamespaceDetails::SmallBuckets)
+ ? _details->deletedListSmall[bucket]
+ : _details->deletedListLarge[bucket - NamespaceDetails::SmallBuckets];
- return head;
+ if (head == DiskLoc(0, 0)) {
+ // This will happen the first time we use a "large" bucket since they were previously
+ // zero-initialized.
+ return DiskLoc();
}
- void NamespaceDetailsRSV1MetaData::setDeletedListEntry( OperationContext* txn,
- int bucket,
- const DiskLoc& loc ) {
- DiskLoc* head = (bucket < NamespaceDetails::SmallBuckets)
- ? &_details->deletedListSmall[bucket]
- : &_details->deletedListLarge[bucket - NamespaceDetails::SmallBuckets];
- *txn->recoveryUnit()->writing( head ) = loc;
- }
+ return head;
+}
- DiskLoc NamespaceDetailsRSV1MetaData::deletedListLegacyGrabBag() const {
- return _details->deletedListLegacyGrabBag;
- }
+void NamespaceDetailsRSV1MetaData::setDeletedListEntry(OperationContext* txn,
+ int bucket,
+ const DiskLoc& loc) {
+ DiskLoc* head = (bucket < NamespaceDetails::SmallBuckets)
+ ? &_details->deletedListSmall[bucket]
+ : &_details->deletedListLarge[bucket - NamespaceDetails::SmallBuckets];
+ *txn->recoveryUnit()->writing(head) = loc;
+}
- void NamespaceDetailsRSV1MetaData::setDeletedListLegacyGrabBag(OperationContext* txn,
- const DiskLoc& loc) {
- *txn->recoveryUnit()->writing(&_details->deletedListLegacyGrabBag) = loc;
- }
+DiskLoc NamespaceDetailsRSV1MetaData::deletedListLegacyGrabBag() const {
+ return _details->deletedListLegacyGrabBag;
+}
- void NamespaceDetailsRSV1MetaData::orphanDeletedList( OperationContext* txn ) {
- for( int i = 0; i < RecordStoreV1Base::Buckets; i++ ) {
- setDeletedListEntry( txn, i, DiskLoc() );
- }
- setDeletedListLegacyGrabBag(txn, DiskLoc());
- }
+void NamespaceDetailsRSV1MetaData::setDeletedListLegacyGrabBag(OperationContext* txn,
+ const DiskLoc& loc) {
+ *txn->recoveryUnit()->writing(&_details->deletedListLegacyGrabBag) = loc;
+}
- const DiskLoc& NamespaceDetailsRSV1MetaData::firstExtent( OperationContext* txn ) const {
- return _details->firstExtent;
+void NamespaceDetailsRSV1MetaData::orphanDeletedList(OperationContext* txn) {
+ for (int i = 0; i < RecordStoreV1Base::Buckets; i++) {
+ setDeletedListEntry(txn, i, DiskLoc());
}
+ setDeletedListLegacyGrabBag(txn, DiskLoc());
+}
- void NamespaceDetailsRSV1MetaData::setFirstExtent( OperationContext* txn, const DiskLoc& loc ) {
- *txn->recoveryUnit()->writing( &_details->firstExtent ) = loc;
- }
+const DiskLoc& NamespaceDetailsRSV1MetaData::firstExtent(OperationContext* txn) const {
+ return _details->firstExtent;
+}
- const DiskLoc& NamespaceDetailsRSV1MetaData::lastExtent( OperationContext* txn ) const {
- return _details->lastExtent;
- }
+void NamespaceDetailsRSV1MetaData::setFirstExtent(OperationContext* txn, const DiskLoc& loc) {
+ *txn->recoveryUnit()->writing(&_details->firstExtent) = loc;
+}
- void NamespaceDetailsRSV1MetaData::setLastExtent( OperationContext* txn, const DiskLoc& loc ) {
- *txn->recoveryUnit()->writing( &_details->lastExtent ) = loc;
- }
+const DiskLoc& NamespaceDetailsRSV1MetaData::lastExtent(OperationContext* txn) const {
+ return _details->lastExtent;
+}
- bool NamespaceDetailsRSV1MetaData::isCapped() const {
- return _details->isCapped;
- }
+void NamespaceDetailsRSV1MetaData::setLastExtent(OperationContext* txn, const DiskLoc& loc) {
+ *txn->recoveryUnit()->writing(&_details->lastExtent) = loc;
+}
- bool NamespaceDetailsRSV1MetaData::isUserFlagSet( int flag ) const {
- return _details->userFlags & flag;
- }
+bool NamespaceDetailsRSV1MetaData::isCapped() const {
+ return _details->isCapped;
+}
- int NamespaceDetailsRSV1MetaData::userFlags() const {
- return _details->userFlags;
- }
+bool NamespaceDetailsRSV1MetaData::isUserFlagSet(int flag) const {
+ return _details->userFlags & flag;
+}
- bool NamespaceDetailsRSV1MetaData::setUserFlag( OperationContext* txn, int flag ) {
- if ( ( _details->userFlags & flag ) == flag )
- return false;
+int NamespaceDetailsRSV1MetaData::userFlags() const {
+ return _details->userFlags;
+}
- txn->recoveryUnit()->writingInt( _details->userFlags) |= flag;
- return true;
- }
+bool NamespaceDetailsRSV1MetaData::setUserFlag(OperationContext* txn, int flag) {
+ if ((_details->userFlags & flag) == flag)
+ return false;
- bool NamespaceDetailsRSV1MetaData::clearUserFlag( OperationContext* txn, int flag ) {
- if ( ( _details->userFlags & flag ) == 0 )
- return false;
+ txn->recoveryUnit()->writingInt(_details->userFlags) |= flag;
+ return true;
+}
- txn->recoveryUnit()->writingInt(_details->userFlags) &= ~flag;
- return true;
- }
+bool NamespaceDetailsRSV1MetaData::clearUserFlag(OperationContext* txn, int flag) {
+ if ((_details->userFlags & flag) == 0)
+ return false;
- bool NamespaceDetailsRSV1MetaData::replaceUserFlags( OperationContext* txn, int flags ) {
- if ( _details->userFlags == flags )
- return false;
+ txn->recoveryUnit()->writingInt(_details->userFlags) &= ~flag;
+ return true;
+}
- txn->recoveryUnit()->writingInt(_details->userFlags) = flags;
- return true;
- }
+bool NamespaceDetailsRSV1MetaData::replaceUserFlags(OperationContext* txn, int flags) {
+ if (_details->userFlags == flags)
+ return false;
- int NamespaceDetailsRSV1MetaData::lastExtentSize( OperationContext* txn ) const {
- return _details->lastExtentSize;
- }
+ txn->recoveryUnit()->writingInt(_details->userFlags) = flags;
+ return true;
+}
- void NamespaceDetailsRSV1MetaData::setLastExtentSize( OperationContext* txn, int newMax ) {
- if ( _details->lastExtentSize == newMax )
- return;
- txn->recoveryUnit()->writingInt(_details->lastExtentSize) = newMax;
- }
+int NamespaceDetailsRSV1MetaData::lastExtentSize(OperationContext* txn) const {
+ return _details->lastExtentSize;
+}
- long long NamespaceDetailsRSV1MetaData::maxCappedDocs() const {
- invariant( _details->isCapped );
- if ( _details->maxDocsInCapped == 0x7fffffff )
- return numeric_limits<long long>::max();
- return _details->maxDocsInCapped;
- }
+void NamespaceDetailsRSV1MetaData::setLastExtentSize(OperationContext* txn, int newMax) {
+ if (_details->lastExtentSize == newMax)
+ return;
+ txn->recoveryUnit()->writingInt(_details->lastExtentSize) = newMax;
+}
+
+long long NamespaceDetailsRSV1MetaData::maxCappedDocs() const {
+ invariant(_details->isCapped);
+ if (_details->maxDocsInCapped == 0x7fffffff)
+ return numeric_limits<long long>::max();
+ return _details->maxDocsInCapped;
+}
}
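
deletedListEntry above hides a compatibility seam: the first SmallBuckets heads live in the original on-disk array, while the remaining LargeBuckets heads were added later in space that ships zero-filled, so a (0, 0) DiskLoc there means "never used" and is normalized to the null DiskLoc. A reduced sketch of that lookup, with Loc standing in for DiskLoc:

    #include <cassert>

    static const int SmallBuckets = 18;
    static const int LargeBuckets = 8;

    struct Loc {  // stand-in for DiskLoc: (-1, -1) plays the "null" value
        int a, ofs;
        Loc(int a_ = -1, int ofs_ = -1) : a(a_), ofs(ofs_) {}
        bool operator==(const Loc& r) const { return a == r.a && ofs == r.ofs; }
    };

    Loc deletedListEntry(const Loc small[], const Loc large[], int bucket) {
        assert(bucket >= 0 && bucket < SmallBuckets + LargeBuckets);
        const Loc head = (bucket < SmallBuckets) ? small[bucket]
                                                 : large[bucket - SmallBuckets];
        if (head == Loc(0, 0))
            return Loc();  // zero-filled large bucket: report the null Loc
        return head;
    }

    int main() {
        Loc small[SmallBuckets];  // default heads are already the null Loc
        Loc large[LargeBuckets];
        large[0] = Loc(0, 0);     // simulate a never-touched zero-filled slot
        assert(deletedListEntry(small, large, SmallBuckets) == Loc());
        small[2] = Loc(1, 64);
        assert(deletedListEntry(small, large, 2) == Loc(1, 64));
        return 0;
    }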
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h
index 5bc9c475506..a6fde4807b5 100644
--- a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h
@@ -38,70 +38,65 @@
namespace mongo {
- class RecordStore;
+class RecordStore;
- /*
- * NOTE: NamespaceDetails will become a struct
- * all dur, etc... will move here
- */
- class NamespaceDetailsRSV1MetaData : public RecordStoreV1MetaData {
- public:
- explicit NamespaceDetailsRSV1MetaData( StringData ns, NamespaceDetails* details);
-
- virtual ~NamespaceDetailsRSV1MetaData(){}
+/*
+ * NOTE: NamespaceDetails will become a struct
+ * all dur, etc... will move here
+ */
+class NamespaceDetailsRSV1MetaData : public RecordStoreV1MetaData {
+public:
+ explicit NamespaceDetailsRSV1MetaData(StringData ns, NamespaceDetails* details);
- virtual const DiskLoc& capExtent() const;
- virtual void setCapExtent( OperationContext* txn, const DiskLoc& loc );
+ virtual ~NamespaceDetailsRSV1MetaData() {}
- virtual const DiskLoc& capFirstNewRecord() const;
- virtual void setCapFirstNewRecord( OperationContext* txn, const DiskLoc& loc );
+ virtual const DiskLoc& capExtent() const;
+ virtual void setCapExtent(OperationContext* txn, const DiskLoc& loc);
- virtual bool capLooped() const;
+ virtual const DiskLoc& capFirstNewRecord() const;
+ virtual void setCapFirstNewRecord(OperationContext* txn, const DiskLoc& loc);
- virtual long long dataSize() const;
- virtual long long numRecords() const;
+ virtual bool capLooped() const;
- virtual void incrementStats( OperationContext* txn,
- long long dataSizeIncrement,
- long long numRecordsIncrement );
+ virtual long long dataSize() const;
+ virtual long long numRecords() const;
- virtual void setStats( OperationContext* txn,
- long long dataSize,
- long long numRecords );
+ virtual void incrementStats(OperationContext* txn,
+ long long dataSizeIncrement,
+ long long numRecordsIncrement);
- virtual DiskLoc deletedListEntry( int bucket ) const;
- virtual void setDeletedListEntry( OperationContext* txn,
- int bucket,
- const DiskLoc& loc );
+ virtual void setStats(OperationContext* txn, long long dataSize, long long numRecords);
- virtual DiskLoc deletedListLegacyGrabBag() const;
- virtual void setDeletedListLegacyGrabBag(OperationContext* txn, const DiskLoc& loc);
+ virtual DiskLoc deletedListEntry(int bucket) const;
+ virtual void setDeletedListEntry(OperationContext* txn, int bucket, const DiskLoc& loc);
- virtual void orphanDeletedList(OperationContext* txn);
+ virtual DiskLoc deletedListLegacyGrabBag() const;
+ virtual void setDeletedListLegacyGrabBag(OperationContext* txn, const DiskLoc& loc);
- virtual const DiskLoc& firstExtent( OperationContext* txn ) const;
- virtual void setFirstExtent( OperationContext* txn, const DiskLoc& loc );
+ virtual void orphanDeletedList(OperationContext* txn);
- virtual const DiskLoc& lastExtent( OperationContext* txn ) const;
- virtual void setLastExtent( OperationContext* txn, const DiskLoc& loc );
+ virtual const DiskLoc& firstExtent(OperationContext* txn) const;
+ virtual void setFirstExtent(OperationContext* txn, const DiskLoc& loc);
- virtual bool isCapped() const;
+ virtual const DiskLoc& lastExtent(OperationContext* txn) const;
+ virtual void setLastExtent(OperationContext* txn, const DiskLoc& loc);
- virtual bool isUserFlagSet( int flag ) const;
- virtual int userFlags() const;
- virtual bool setUserFlag( OperationContext* txn, int flag );
- virtual bool clearUserFlag( OperationContext* txn, int flag );
- virtual bool replaceUserFlags( OperationContext* txn, int flags );
+ virtual bool isCapped() const;
- virtual int lastExtentSize( OperationContext* txn ) const;
- virtual void setLastExtentSize( OperationContext* txn, int newMax );
+ virtual bool isUserFlagSet(int flag) const;
+ virtual int userFlags() const;
+ virtual bool setUserFlag(OperationContext* txn, int flag);
+ virtual bool clearUserFlag(OperationContext* txn, int flag);
+ virtual bool replaceUserFlags(OperationContext* txn, int flags);
- virtual long long maxCappedDocs() const;
+ virtual int lastExtentSize(OperationContext* txn) const;
+ virtual void setLastExtentSize(OperationContext* txn, int newMax);
- private:
- std::string _ns;
- NamespaceDetails* _details;
- RecordStore* _namespaceRecordStore;
- };
+ virtual long long maxCappedDocs() const;
+private:
+ std::string _ns;
+ NamespaceDetails* _details;
+ RecordStore* _namespaceRecordStore;
+};
}
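
One detail worth calling out from the implementation of maxCappedDocs() earlier in this change: the on-disk maxDocsInCapped field is a 32-bit int whose INT_MAX value (0x7fffffff) is a sentinel for "no limit", and the accessor widens it to the full long long range callers expect. A standalone restatement of that mapping:

    #include <cassert>
    #include <limits>

    long long maxCappedDocs(int maxDocsInCapped) {
        if (maxDocsInCapped == 0x7fffffff)  // on-disk marker for "unlimited"
            return std::numeric_limits<long long>::max();
        return maxDocsInCapped;
    }

    int main() {
        assert(maxCappedDocs(1000) == 1000);
        assert(maxCappedDocs(0x7fffffff) == std::numeric_limits<long long>::max());
        return 0;
    }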
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_index.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_index.cpp
index 8f1bb505197..12e90d2db57 100644
--- a/src/mongo/db/storage/mmap_v1/catalog/namespace_index.cpp
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_index.cpp
@@ -47,211 +47,194 @@
namespace mongo {
- using std::endl;
- using std::list;
- using std::string;
+using std::endl;
+using std::list;
+using std::string;
- NamespaceIndex::NamespaceIndex(const std::string& dir, const std::string& database)
- : _dir(dir),
- _database(database),
- _ht(nullptr) {
+NamespaceIndex::NamespaceIndex(const std::string& dir, const std::string& database)
+ : _dir(dir), _database(database), _ht(nullptr) {}
- }
-
- NamespaceIndex::~NamespaceIndex() {
-
- }
-
- NamespaceDetails* NamespaceIndex::details(StringData ns) const {
- const Namespace n(ns);
- return details(n);
- }
+NamespaceIndex::~NamespaceIndex() {}
- NamespaceDetails* NamespaceIndex::details(const Namespace& ns) const {
- return _ht->get(ns);
- }
-
- void NamespaceIndex::add_ns( OperationContext* txn,
- StringData ns, const DiskLoc& loc, bool capped) {
- NamespaceDetails details( loc, capped );
- add_ns( txn, ns, &details );
- }
+NamespaceDetails* NamespaceIndex::details(StringData ns) const {
+ const Namespace n(ns);
+ return details(n);
+}
- void NamespaceIndex::add_ns( OperationContext* txn,
- StringData ns,
- const NamespaceDetails* details ) {
- Namespace n(ns);
- add_ns( txn, n, details );
- }
+NamespaceDetails* NamespaceIndex::details(const Namespace& ns) const {
+ return _ht->get(ns);
+}
- void NamespaceIndex::add_ns( OperationContext* txn,
- const Namespace& ns,
- const NamespaceDetails* details ) {
- const NamespaceString nss(ns.toString());
- invariant(txn->lockState()->isDbLockedForMode(nss.db(), MODE_X));
+void NamespaceIndex::add_ns(OperationContext* txn, StringData ns, const DiskLoc& loc, bool capped) {
+ NamespaceDetails details(loc, capped);
+ add_ns(txn, ns, &details);
+}
- massert(17315, "no . in ns", nsIsFull(nss.toString()));
+void NamespaceIndex::add_ns(OperationContext* txn, StringData ns, const NamespaceDetails* details) {
+ Namespace n(ns);
+ add_ns(txn, n, details);
+}
- uassert(10081, "too many namespaces/collections", _ht->put(txn, ns, *details));
- }
+void NamespaceIndex::add_ns(OperationContext* txn,
+ const Namespace& ns,
+ const NamespaceDetails* details) {
+ const NamespaceString nss(ns.toString());
+ invariant(txn->lockState()->isDbLockedForMode(nss.db(), MODE_X));
- void NamespaceIndex::kill_ns( OperationContext* txn, StringData ns) {
- const NamespaceString nss(ns.toString());
- invariant(txn->lockState()->isDbLockedForMode(nss.db(), MODE_X));
+ massert(17315, "no . in ns", nsIsFull(nss.toString()));
- const Namespace n(ns);
- _ht->kill(txn, n);
+ uassert(10081, "too many namespaces/collections", _ht->put(txn, ns, *details));
+}
- if (ns.size() <= Namespace::MaxNsColletionLen) {
- // Larger namespace names don't have room for $extras so they can't exist. The code
- // below would cause an "$extra: ns too large" error and stacktrace to be printed to the
- // log even though everything is fine.
- for( int i = 0; i<=1; i++ ) {
- try {
- Namespace extra(n.extraName(i));
- _ht->kill(txn, extra);
- }
- catch(DBException&) {
- LOG(3) << "caught exception in kill_ns" << endl;
- }
+void NamespaceIndex::kill_ns(OperationContext* txn, StringData ns) {
+ const NamespaceString nss(ns.toString());
+ invariant(txn->lockState()->isDbLockedForMode(nss.db(), MODE_X));
+
+ const Namespace n(ns);
+ _ht->kill(txn, n);
+
+ if (ns.size() <= Namespace::MaxNsColletionLen) {
+ // Larger namespace names don't have room for $extras so they can't exist. The code
+ // below would cause an "$extra: ns too large" error and stacktrace to be printed to the
+ // log even though everything is fine.
+ for (int i = 0; i <= 1; i++) {
+ try {
+ Namespace extra(n.extraName(i));
+ _ht->kill(txn, extra);
+ } catch (DBException&) {
+ LOG(3) << "caught exception in kill_ns" << endl;
}
}
}
+}
- bool NamespaceIndex::pathExists() const {
- return boost::filesystem::exists(path());
- }
-
- boost::filesystem::path NamespaceIndex::path() const {
- boost::filesystem::path ret( _dir );
- if (storageGlobalParams.directoryperdb)
- ret /= _database;
- ret /= ( _database + ".ns" );
- return ret;
- }
+bool NamespaceIndex::pathExists() const {
+ return boost::filesystem::exists(path());
+}
- static void namespaceGetNamespacesCallback( const Namespace& k , NamespaceDetails& v , list<string>* l ) {
- if ( ! k.hasDollarSign() || k == "local.oplog.$main" ) {
- // we call out local.oplog.$main specifically as its the only "normal"
- // collection that has a $, so we make sure it gets added
- l->push_back( k.toString() );
- }
- }
+boost::filesystem::path NamespaceIndex::path() const {
+ boost::filesystem::path ret(_dir);
+ if (storageGlobalParams.directoryperdb)
+ ret /= _database;
+ ret /= (_database + ".ns");
+ return ret;
+}
- void NamespaceIndex::getCollectionNamespaces( list<string>* tofill ) const {
- _ht->iterAll(stdx::bind(namespaceGetNamespacesCallback,
- stdx::placeholders::_1,
- stdx::placeholders::_2,
- tofill));
+static void namespaceGetNamespacesCallback(const Namespace& k,
+ NamespaceDetails& v,
+ list<string>* l) {
+ if (!k.hasDollarSign() || k == "local.oplog.$main") {
+        // we call out local.oplog.$main specifically as it's the only "normal"
+        // collection that has a $, so we make sure it gets added
+ l->push_back(k.toString());
}
+}
- void NamespaceIndex::maybeMkdir() const {
- if (!storageGlobalParams.directoryperdb)
- return;
- boost::filesystem::path dir( _dir );
- dir /= _database;
- if ( !boost::filesystem::exists( dir ) )
- MONGO_ASSERT_ON_EXCEPTION_WITH_MSG( boost::filesystem::create_directory( dir ), "create dir for db " );
- }
+void NamespaceIndex::getCollectionNamespaces(list<string>* tofill) const {
+ _ht->iterAll(stdx::bind(
+ namespaceGetNamespacesCallback, stdx::placeholders::_1, stdx::placeholders::_2, tofill));
+}
- void NamespaceIndex::init(OperationContext* txn) {
- invariant(!_ht.get());
+void NamespaceIndex::maybeMkdir() const {
+ if (!storageGlobalParams.directoryperdb)
+ return;
+ boost::filesystem::path dir(_dir);
+ dir /= _database;
+ if (!boost::filesystem::exists(dir))
+ MONGO_ASSERT_ON_EXCEPTION_WITH_MSG(boost::filesystem::create_directory(dir),
+ "create dir for db ");
+}
- unsigned long long len = 0;
+void NamespaceIndex::init(OperationContext* txn) {
+ invariant(!_ht.get());
- const boost::filesystem::path nsPath = path();
- const std::string pathString = nsPath.string();
+ unsigned long long len = 0;
- void* p = 0;
+ const boost::filesystem::path nsPath = path();
+ const std::string pathString = nsPath.string();
- if (boost::filesystem::exists(nsPath)) {
- if (_f.open(pathString, true)) {
- len = _f.length();
+ void* p = 0;
- if (len % (1024 * 1024) != 0) {
- StringBuilder sb;
- sb << "Invalid length: " << len
- << " for .ns file: " << pathString << ". Cannot open database";
+ if (boost::filesystem::exists(nsPath)) {
+ if (_f.open(pathString, true)) {
+ len = _f.length();
- log() << sb.str();
- uassert(10079, sb.str(), len % (1024 * 1024) == 0);
- }
+ if (len % (1024 * 1024) != 0) {
+ StringBuilder sb;
+ sb << "Invalid length: " << len << " for .ns file: " << pathString
+ << ". Cannot open database";
- p = _f.getView();
+ log() << sb.str();
+ uassert(10079, sb.str(), len % (1024 * 1024) == 0);
}
+
+ p = _f.getView();
}
- else {
- // use mmapv1GlobalOptions.lenForNewNsFiles, we are making a new database
- massert(10343,
- "bad mmapv1GlobalOptions.lenForNewNsFiles",
- mmapv1GlobalOptions.lenForNewNsFiles >= 1024*1024);
+ } else {
+ // use mmapv1GlobalOptions.lenForNewNsFiles, we are making a new database
+ massert(10343,
+ "bad mmapv1GlobalOptions.lenForNewNsFiles",
+ mmapv1GlobalOptions.lenForNewNsFiles >= 1024 * 1024);
- maybeMkdir();
+ maybeMkdir();
- unsigned long long l = mmapv1GlobalOptions.lenForNewNsFiles;
- log() << "allocating new ns file " << pathString << ", filling with zeroes..." << endl;
+ unsigned long long l = mmapv1GlobalOptions.lenForNewNsFiles;
+ log() << "allocating new ns file " << pathString << ", filling with zeroes..." << endl;
- {
- // Due to SERVER-15369 we need to explicitly write zero-bytes to the NS file.
- const unsigned long long kBlockSize = 1024*1024;
- invariant(l % kBlockSize == 0); // ns files can only be multiples of 1MB
- const std::vector<char> zeros(kBlockSize, 0);
+ {
+ // Due to SERVER-15369 we need to explicitly write zero-bytes to the NS file.
+ const unsigned long long kBlockSize = 1024 * 1024;
+ invariant(l % kBlockSize == 0); // ns files can only be multiples of 1MB
+ const std::vector<char> zeros(kBlockSize, 0);
- File file;
- file.open(pathString.c_str());
+ File file;
+ file.open(pathString.c_str());
- massert(18825,
- str::stream() << "couldn't create file " << pathString,
- file.is_open());
+ massert(18825, str::stream() << "couldn't create file " << pathString, file.is_open());
- for (fileofs ofs = 0; ofs < l && !file.bad(); ofs += kBlockSize) {
- file.write(ofs, &zeros[0], kBlockSize);
- }
+ for (fileofs ofs = 0; ofs < l && !file.bad(); ofs += kBlockSize) {
+ file.write(ofs, &zeros[0], kBlockSize);
+ }
- if (file.bad()) {
- try {
- boost::filesystem::remove(pathString);
- } catch (const std::exception& e) {
- StringBuilder ss;
- ss << "error removing file: " << e.what();
- massert(18909, ss.str(), 0);
- }
- }
- else {
- file.fsync();
+ if (file.bad()) {
+ try {
+ boost::filesystem::remove(pathString);
+ } catch (const std::exception& e) {
+ StringBuilder ss;
+ ss << "error removing file: " << e.what();
+ massert(18909, ss.str(), 0);
}
-
- massert(18826,
- str::stream() << "failure writing file " << pathString,
- !file.bad());
+ } else {
+ file.fsync();
}
- if (_f.create(pathString, l, true)) {
- // The writes done in this function must not be rolled back. This will leave the
- // file empty, but available for future use. That is why we go directly to the
- // global dur dirty list rather than going through the OperationContext.
- getDur().createdFile(pathString, l);
+ massert(18826, str::stream() << "failure writing file " << pathString, !file.bad());
+ }
- // Commit the journal and all changes to disk so that even if exceptions occur
- // during subsequent initialization, we won't have uncommited changes during file
- // close.
- getDur().commitNow(txn);
+ if (_f.create(pathString, l, true)) {
+ // The writes done in this function must not be rolled back. This will leave the
+ // file empty, but available for future use. That is why we go directly to the
+ // global dur dirty list rather than going through the OperationContext.
+ getDur().createdFile(pathString, l);
- len = l;
- invariant(len == mmapv1GlobalOptions.lenForNewNsFiles);
+ // Commit the journal and all changes to disk so that even if exceptions occur
+        // during subsequent initialization, we won't have uncommitted changes during file
+ // close.
+ getDur().commitNow(txn);
- p = _f.getView();
- }
- }
+ len = l;
+ invariant(len == mmapv1GlobalOptions.lenForNewNsFiles);
- if (p == 0) {
- severe() << "error couldn't open file " << pathString << " terminating" << endl;
- invariant(false);
+ p = _f.getView();
}
+ }
- invariant(len <= 0x7fffffff);
- _ht.reset(new NamespaceHashTable(p, (int) len, "namespace index"));
+ if (p == 0) {
+ severe() << "error couldn't open file " << pathString << " terminating" << endl;
+ invariant(false);
}
+ invariant(len <= 0x7fffffff);
+ _ht.reset(new NamespaceHashTable(p, (int)len, "namespace index"));
+}
}
-
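
For context on the allocation path in init() above: due to SERVER-15369, a brand-new .ns file is zero-filled explicitly, one 1 MB block at a time, and removed again if any write fails. A minimal standalone sketch of that pattern, using only the C++ standard library rather than mongo's File class (the helper name and the caller's path and size are hypothetical):

#include <cstdint>
#include <fstream>
#include <vector>

// Zero-fill `len` bytes at `path` in 1 MB blocks, as NamespaceIndex::init()
// does for a new .ns file. Returns false on any stream error.
bool zeroFillNsFile(const char* path, uint64_t len) {
    const uint64_t kBlockSize = 1024 * 1024;  // .ns files are multiples of 1 MB
    if (len == 0 || len % kBlockSize != 0)
        return false;  // mirrors the invariant(l % kBlockSize == 0) check above
    const std::vector<char> zeros(kBlockSize, 0);
    std::ofstream f(path, std::ios::binary | std::ios::trunc);
    for (uint64_t ofs = 0; f && ofs < len; ofs += kBlockSize)
        f.write(zeros.data(), kBlockSize);
    f.flush();  // the real code fsyncs before mapping the file
    return static_cast<bool>(f);
}

// Example: zeroFillNsFile("foo.ns", 16 * 1024 * 1024);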
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_index.h b/src/mongo/db/storage/mmap_v1/catalog/namespace_index.h
index 44f429311ba..53d162bc601 100644
--- a/src/mongo/db/storage/mmap_v1/catalog/namespace_index.h
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_index.h
@@ -40,53 +40,53 @@
namespace mongo {
- class NamespaceDetails;
- class NamespaceHashTable;
- class OperationContext;
+class NamespaceDetails;
+class NamespaceHashTable;
+class OperationContext;
- /* NamespaceIndex is the ".ns" file you see in the data directory. It is the "system catalog"
- if you will: at least the core parts. (Additional info in system.* collections.)
- */
- class NamespaceIndex {
- MONGO_DISALLOW_COPYING(NamespaceIndex);
- public:
- NamespaceIndex(const std::string& dir, const std::string& database);
- ~NamespaceIndex();
+/* NamespaceIndex is the ".ns" file you see in the data directory. It is the "system catalog"
+ if you will: at least the core parts. (Additional info in system.* collections.)
+*/
+class NamespaceIndex {
+ MONGO_DISALLOW_COPYING(NamespaceIndex);
- /* returns true if the file represented by this file exists on disk */
- bool pathExists() const;
+public:
+ NamespaceIndex(const std::string& dir, const std::string& database);
+ ~NamespaceIndex();
- void init(OperationContext* txn);
+ /* returns true if the file represented by this file exists on disk */
+ bool pathExists() const;
- void add_ns( OperationContext* txn,
- StringData ns, const DiskLoc& loc, bool capped);
- void add_ns( OperationContext* txn,
- StringData ns, const NamespaceDetails* details );
- void add_ns( OperationContext* txn,
- const Namespace& ns, const NamespaceDetails* details );
+ void init(OperationContext* txn);
- NamespaceDetails* details(StringData ns) const;
- NamespaceDetails* details(const Namespace& ns) const;
+ void add_ns(OperationContext* txn, StringData ns, const DiskLoc& loc, bool capped);
+ void add_ns(OperationContext* txn, StringData ns, const NamespaceDetails* details);
+ void add_ns(OperationContext* txn, const Namespace& ns, const NamespaceDetails* details);
- void kill_ns( OperationContext* txn,
- StringData ns);
+ NamespaceDetails* details(StringData ns) const;
+ NamespaceDetails* details(const Namespace& ns) const;
- bool allocated() const { return _ht.get() != 0; }
+ void kill_ns(OperationContext* txn, StringData ns);
- void getCollectionNamespaces( std::list<std::string>* tofill ) const;
+ bool allocated() const {
+ return _ht.get() != 0;
+ }
- boost::filesystem::path path() const;
+ void getCollectionNamespaces(std::list<std::string>* tofill) const;
- unsigned long long fileLength() const { return _f.length(); }
+ boost::filesystem::path path() const;
- private:
- void maybeMkdir() const;
+ unsigned long long fileLength() const {
+ return _f.length();
+ }
- const std::string _dir;
- const std::string _database;
+private:
+ void maybeMkdir() const;
- DurableMappedFile _f;
- std::unique_ptr<NamespaceHashTable> _ht;
- };
+ const std::string _dir;
+ const std::string _database;
+ DurableMappedFile _f;
+ std::unique_ptr<NamespaceHashTable> _ht;
+};
}
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_test.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_test.cpp
index 6a0edb79ea4..85cd79be43b 100644
--- a/src/mongo/db/storage/mmap_v1/catalog/namespace_test.cpp
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_test.cpp
@@ -34,36 +34,35 @@
namespace mongo {
- using std::string;
+using std::string;
- TEST( NamespaceTest, Basics ) {
- Namespace foo( "foo.bar" );
- Namespace bar( "bar.foo" );
+TEST(NamespaceTest, Basics) {
+ Namespace foo("foo.bar");
+ Namespace bar("bar.foo");
- ASSERT_EQUALS( foo.toString(), foo.toString() );
- ASSERT_EQUALS( foo.hash(), foo.hash() );
+ ASSERT_EQUALS(foo.toString(), foo.toString());
+ ASSERT_EQUALS(foo.hash(), foo.hash());
- ASSERT_NOT_EQUALS( foo.hash(), bar.hash() );
+ ASSERT_NOT_EQUALS(foo.hash(), bar.hash());
- ASSERT( foo == foo );
- ASSERT( !( foo != foo ) );
- ASSERT( foo != bar );
- ASSERT( !( foo == bar ) );
- }
-
- TEST( NamespaceTest, ExtraName ) {
- Namespace foo( "foo.bar" );
- ASSERT_FALSE( foo.isExtra() );
+ ASSERT(foo == foo);
+ ASSERT(!(foo != foo));
+ ASSERT(foo != bar);
+ ASSERT(!(foo == bar));
+}
- string str0 = foo.extraName( 0 );
- ASSERT_EQUALS( "foo.bar$extra", str0 );
- Namespace ex0( str0 );
- ASSERT_TRUE( ex0.isExtra() );
+TEST(NamespaceTest, ExtraName) {
+ Namespace foo("foo.bar");
+ ASSERT_FALSE(foo.isExtra());
- string str1 = foo.extraName( 1 );
- ASSERT_EQUALS( "foo.bar$extrb", str1 );
- Namespace ex1( str1 );
- ASSERT_TRUE( ex1.isExtra() );
+ string str0 = foo.extraName(0);
+ ASSERT_EQUALS("foo.bar$extra", str0);
+ Namespace ex0(str0);
+ ASSERT_TRUE(ex0.isExtra());
- }
+ string str1 = foo.extraName(1);
+ ASSERT_EQUALS("foo.bar$extrb", str1);
+ Namespace ex1(str1);
+ ASSERT_TRUE(ex1.isExtra());
+}
}
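
The test pins down the overflow-namespace naming scheme: extraName(i) appends "$extr" plus the letter 'a' + i to the base name. A standalone sketch of that mapping under the behavior the assertions above require (the free function here is hypothetical; the real logic lives in Namespace::extraName):

#include <cassert>
#include <string>

// i-th $extra namespace for `base`: slot 0 -> "foo.bar$extra",
// slot 1 -> "foo.bar$extrb", matching the test expectations above.
std::string extraNameFor(const std::string& base, int i) {
    assert(i >= 0 && i <= 1);  // only slots 0 and 1 are exercised by the test
    std::string s = base + "$extr";
    s += static_cast<char>('a' + i);
    return s;
}

int main() {
    assert(extraNameFor("foo.bar", 0) == "foo.bar$extra");
    assert(extraNameFor("foo.bar", 1) == "foo.bar$extrb");
    return 0;
}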
diff --git a/src/mongo/db/storage/mmap_v1/compress.cpp b/src/mongo/db/storage/mmap_v1/compress.cpp
index bae8bc5acba..8f8dce527ed 100644
--- a/src/mongo/db/storage/mmap_v1/compress.cpp
+++ b/src/mongo/db/storage/mmap_v1/compress.cpp
@@ -36,24 +36,22 @@
namespace mongo {
- void rawCompress(const char* input,
- size_t input_length,
- char* compressed,
- size_t* compressed_length)
- {
- snappy::RawCompress(input, input_length, compressed, compressed_length);
- }
-
- size_t maxCompressedLength(size_t source_len) {
- return snappy::MaxCompressedLength(source_len);
- }
-
- size_t compress(const char* input, size_t input_length, std::string* output) {
- return snappy::Compress(input, input_length, output);
- }
-
- bool uncompress(const char* compressed, size_t compressed_length, std::string* uncompressed) {
- return snappy::Uncompress(compressed, compressed_length, uncompressed);
- }
+void rawCompress(const char* input,
+ size_t input_length,
+ char* compressed,
+ size_t* compressed_length) {
+ snappy::RawCompress(input, input_length, compressed, compressed_length);
+}
+
+size_t maxCompressedLength(size_t source_len) {
+ return snappy::MaxCompressedLength(source_len);
+}
+size_t compress(const char* input, size_t input_length, std::string* output) {
+ return snappy::Compress(input, input_length, output);
+}
+
+bool uncompress(const char* compressed, size_t compressed_length, std::string* uncompressed) {
+ return snappy::Uncompress(compressed, compressed_length, uncompressed);
+}
}
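
These functions are thin forwarding wrappers over snappy, so a round trip is one call each way. A minimal sketch calling snappy directly, mirroring compress() and uncompress() above (assumes the snappy headers and library are available; error handling reduced to asserts):

#include <cassert>
#include <string>
#include <snappy.h>

int main() {
    const std::string input(1024, 'x');  // highly compressible payload
    std::string compressed;
    // compress() above forwards here; the return value is the compressed size.
    size_t n = snappy::Compress(input.data(), input.size(), &compressed);
    assert(n == compressed.size() && n < input.size());

    std::string restored;
    // uncompress() above forwards here; false signals corrupt input.
    bool ok = snappy::Uncompress(compressed.data(), compressed.size(), &restored);
    assert(ok && restored == input);
    return 0;
}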
diff --git a/src/mongo/db/storage/mmap_v1/compress.h b/src/mongo/db/storage/mmap_v1/compress.h
index b8afa4d90c5..8ff828a93a6 100644
--- a/src/mongo/db/storage/mmap_v1/compress.h
+++ b/src/mongo/db/storage/mmap_v1/compress.h
@@ -32,18 +32,15 @@
#include <string>
-namespace mongo {
+namespace mongo {
- size_t compress(const char* input, size_t input_length, std::string* output);
+size_t compress(const char* input, size_t input_length, std::string* output);
- bool uncompress(const char* compressed, size_t compressed_length, std::string* uncompressed);
-
- size_t maxCompressedLength(size_t source_len);
- void rawCompress(const char* input,
- size_t input_length,
- char* compressed,
- size_t* compressed_length);
+bool uncompress(const char* compressed, size_t compressed_length, std::string* uncompressed);
+size_t maxCompressedLength(size_t source_len);
+void rawCompress(const char* input,
+ size_t input_length,
+ char* compressed,
+ size_t* compressed_length);
}
-
-
diff --git a/src/mongo/db/storage/mmap_v1/data_file.cpp b/src/mongo/db/storage/mmap_v1/data_file.cpp
index 15fbaba024d..90f6b71b7c6 100644
--- a/src/mongo/db/storage/mmap_v1/data_file.cpp
+++ b/src/mongo/db/storage/mmap_v1/data_file.cpp
@@ -47,216 +47,201 @@
namespace mongo {
- using std::endl;
+using std::endl;
namespace {
- void data_file_check(void *_mb) {
- if (sizeof(char *) == 4) {
- uassert(10084,
- "can't map file memory - mongo requires 64 bit build for larger datasets",
- _mb != NULL);
- }
- else {
- uassert(10085, "can't map file memory", _mb != NULL);
- }
+void data_file_check(void* _mb) {
+ if (sizeof(char*) == 4) {
+ uassert(10084,
+ "can't map file memory - mongo requires 64 bit build for larger datasets",
+ _mb != NULL);
+ } else {
+ uassert(10085, "can't map file memory", _mb != NULL);
}
+}
-} // namespace
+} // namespace
- BOOST_STATIC_ASSERT(DataFileHeader::HeaderSize == 8192);
- BOOST_STATIC_ASSERT(sizeof(static_cast<DataFileHeader*>(NULL)->data) == 4);
- BOOST_STATIC_ASSERT(
- sizeof(DataFileHeader) - sizeof(static_cast<DataFileHeader*>(NULL)->data)
- == DataFileHeader::HeaderSize);
+BOOST_STATIC_ASSERT(DataFileHeader::HeaderSize == 8192);
+BOOST_STATIC_ASSERT(sizeof(static_cast<DataFileHeader*>(NULL)->data) == 4);
+BOOST_STATIC_ASSERT(sizeof(DataFileHeader) - sizeof(static_cast<DataFileHeader*>(NULL)->data) ==
+ DataFileHeader::HeaderSize);
- int DataFile::maxSize() {
- if ( sizeof( int* ) == 4 ) {
- return 512 * 1024 * 1024;
- }
- else if (mmapv1GlobalOptions.smallfiles) {
- return 0x7ff00000 >> 2;
- }
- else {
- return 0x7ff00000;
- }
+int DataFile::maxSize() {
+ if (sizeof(int*) == 4) {
+ return 512 * 1024 * 1024;
+ } else if (mmapv1GlobalOptions.smallfiles) {
+ return 0x7ff00000 >> 2;
+ } else {
+ return 0x7ff00000;
}
+}
+
+NOINLINE_DECL void DataFile::badOfs(int ofs) const {
+ msgasserted(13440,
+ str::stream() << "bad offset:" << ofs << " accessing file: " << mmf.filename()
+ << ". See http://dochub.mongodb.org/core/data-recovery");
+}
- NOINLINE_DECL void DataFile::badOfs(int ofs) const {
- msgasserted(13440, str::stream() << "bad offset:" << ofs
- << " accessing file: " << mmf.filename()
- << ". See http://dochub.mongodb.org/core/data-recovery");
+int DataFile::_defaultSize() const {
+ int size;
+
+ if (_fileNo <= 4) {
+ size = (64 * 1024 * 1024) << _fileNo;
+ } else {
+ size = 0x7ff00000;
}
- int DataFile::_defaultSize() const {
- int size;
+ if (mmapv1GlobalOptions.smallfiles) {
+ size = size >> 2;
+ }
- if (_fileNo <= 4) {
- size = (64 * 1024 * 1024) << _fileNo;
- }
- else {
- size = 0x7ff00000;
- }
+ return size;
+}
- if (mmapv1GlobalOptions.smallfiles) {
- size = size >> 2;
- }
+/** @return true if found and opened. if uninitialized (prealloc only) does not open. */
+Status DataFile::openExisting(const char* filename) {
+ invariant(_mb == 0);
- return size;
+ if (!boost::filesystem::exists(filename)) {
+ return Status(ErrorCodes::InvalidPath, "DataFile::openExisting - file does not exist");
}
- /** @return true if found and opened. if uninitialized (prealloc only) does not open. */
- Status DataFile::openExisting(const char *filename) {
- invariant(_mb == 0);
-
- if (!boost::filesystem::exists(filename)) {
- return Status(ErrorCodes::InvalidPath, "DataFile::openExisting - file does not exist");
- }
+ if (!mmf.open(filename, false)) {
+ return Status(ErrorCodes::InternalError, "DataFile::openExisting - mmf.open failed");
+ }
- if (!mmf.open(filename, false)) {
- return Status(ErrorCodes::InternalError, "DataFile::openExisting - mmf.open failed");
- }
+ // The mapped view of the file should never be NULL if the open call above succeeded.
+ _mb = mmf.getView();
+ invariant(_mb);
- // The mapped view of the file should never be NULL if the open call above succeeded.
- _mb = mmf.getView();
- invariant(_mb);
+ const uint64_t sz = mmf.length();
+ invariant(sz <= 0x7fffffff);
+ invariant(sz % 4096 == 0);
- const uint64_t sz = mmf.length();
- invariant(sz <= 0x7fffffff);
- invariant(sz % 4096 == 0);
-
- if (sz < 64*1024*1024 && !mmapv1GlobalOptions.smallfiles) {
- if( sz >= 16*1024*1024 && sz % (1024*1024) == 0 ) {
- log() << "info openExisting file size " << sz
- << " but mmapv1GlobalOptions.smallfiles=false: "
- << filename << endl;
- }
- else {
- log() << "openExisting size " << sz << " less than minimum file size expectation "
- << filename << endl;
- verify(false);
- }
+ if (sz < 64 * 1024 * 1024 && !mmapv1GlobalOptions.smallfiles) {
+ if (sz >= 16 * 1024 * 1024 && sz % (1024 * 1024) == 0) {
+ log() << "info openExisting file size " << sz
+ << " but mmapv1GlobalOptions.smallfiles=false: " << filename << endl;
+ } else {
+ log() << "openExisting size " << sz << " less than minimum file size expectation "
+ << filename << endl;
+ verify(false);
}
-
- data_file_check(_mb);
- return Status::OK();
}
- void DataFile::open( OperationContext* txn,
- const char *filename,
- int minSize,
- bool preallocateOnly ) {
-
- long size = _defaultSize();
-
- while (size < minSize) {
- if (size < maxSize() / 2) {
- size *= 2;
- }
- else {
- size = maxSize();
- break;
- }
- }
+ data_file_check(_mb);
+ return Status::OK();
+}
+
+void DataFile::open(OperationContext* txn,
+ const char* filename,
+ int minSize,
+ bool preallocateOnly) {
+ long size = _defaultSize();
- if (size > maxSize()) {
+ while (size < minSize) {
+ if (size < maxSize() / 2) {
+ size *= 2;
+ } else {
size = maxSize();
+ break;
}
+ }
- invariant(size >= 64 * 1024 * 1024 || mmapv1GlobalOptions.smallfiles);
- invariant( size % 4096 == 0 );
+ if (size > maxSize()) {
+ size = maxSize();
+ }
- if ( preallocateOnly ) {
- if (mmapv1GlobalOptions.prealloc) {
- FileAllocator::get()->requestAllocation( filename, size );
- }
- return;
- }
+ invariant(size >= 64 * 1024 * 1024 || mmapv1GlobalOptions.smallfiles);
+ invariant(size % 4096 == 0);
- {
- invariant(_mb == 0);
- unsigned long long sz = size;
- if (mmf.create(filename, sz, false)) {
- _mb = mmf.getView();
- }
+ if (preallocateOnly) {
+ if (mmapv1GlobalOptions.prealloc) {
+ FileAllocator::get()->requestAllocation(filename, size);
+ }
+ return;
+ }
- invariant(sz <= 0x7fffffff);
- size = (int)sz;
+ {
+ invariant(_mb == 0);
+ unsigned long long sz = size;
+ if (mmf.create(filename, sz, false)) {
+ _mb = mmf.getView();
}
- data_file_check(_mb);
- header()->init(txn, _fileNo, size, filename);
+ invariant(sz <= 0x7fffffff);
+ size = (int)sz;
}
- void DataFile::flush( bool sync ) {
- mmf.flush( sync );
- }
+ data_file_check(_mb);
+ header()->init(txn, _fileNo, size, filename);
+}
- DiskLoc DataFile::allocExtentArea( OperationContext* txn, int size ) {
- // The header would be NULL if file open failed. However, if file open failed we should
- // never be entering here.
- invariant(header());
- invariant(size <= header()->unusedLength);
+void DataFile::flush(bool sync) {
+ mmf.flush(sync);
+}
- int offset = header()->unused.getOfs();
+DiskLoc DataFile::allocExtentArea(OperationContext* txn, int size) {
+ // The header would be NULL if file open failed. However, if file open failed we should
+ // never be entering here.
+ invariant(header());
+ invariant(size <= header()->unusedLength);
- DataFileHeader *h = header();
- *txn->recoveryUnit()->writing(&h->unused) = DiskLoc(_fileNo, offset + size);
- txn->recoveryUnit()->writingInt(h->unusedLength) = h->unusedLength - size;
+ int offset = header()->unused.getOfs();
- return DiskLoc(_fileNo, offset);
- }
+ DataFileHeader* h = header();
+ *txn->recoveryUnit()->writing(&h->unused) = DiskLoc(_fileNo, offset + size);
+ txn->recoveryUnit()->writingInt(h->unusedLength) = h->unusedLength - size;
- // -------------------------------------------------------------------------------
-
- void DataFileHeader::init(OperationContext* txn,
- int fileno,
- int filelength,
- const char* filename) {
-
- if (uninitialized()) {
- DEV log() << "datafileheader::init initializing " << filename << " n:" << fileno << endl;
-
- massert(13640,
- str::stream() << "DataFileHeader looks corrupt at file open filelength:"
- << filelength << " fileno:" << fileno,
- filelength > 32768);
-
- // The writes done in this function must not be rolled back. If the containing
- // UnitOfWork rolls back it should roll back to the state *after* these writes. This
- // will leave the file empty, but available for future use. That is why we go directly
- // to the global dur dirty list rather than going through the RecoveryUnit.
- getDur().createdFile(filename, filelength);
-
- typedef std::pair<void*, unsigned> Intent;
- std::vector<Intent> intent;
- intent.push_back(std::make_pair(this, sizeof(DataFileHeader)));
- privateViews.makeWritable(this, sizeof(DataFileHeader));
- getDur().declareWriteIntents(intent);
-
- fileLength = filelength;
- version = DataFileVersion::defaultForNewFiles();
- unused.set(fileno, HeaderSize);
- unusedLength = fileLength - HeaderSize - 16;
- freeListStart.Null();
- freeListEnd.Null();
- }
- else {
- checkUpgrade(txn);
- }
- }
+ return DiskLoc(_fileNo, offset);
+}
- void DataFileHeader::checkUpgrade(OperationContext* txn) {
- if ( freeListStart == DiskLoc(0, 0) ) {
- // we are upgrading from 2.4 to 2.6
- invariant(freeListEnd == DiskLoc(0, 0)); // both start and end should be (0,0) or real
- WriteUnitOfWork wunit(txn);
- *txn->recoveryUnit()->writing( &freeListStart ) = DiskLoc();
- *txn->recoveryUnit()->writing( &freeListEnd ) = DiskLoc();
- wunit.commit();
- }
+// -------------------------------------------------------------------------------
+
+void DataFileHeader::init(OperationContext* txn, int fileno, int filelength, const char* filename) {
+ if (uninitialized()) {
+ DEV log() << "datafileheader::init initializing " << filename << " n:" << fileno << endl;
+
+ massert(13640,
+ str::stream() << "DataFileHeader looks corrupt at file open filelength:"
+ << filelength << " fileno:" << fileno,
+ filelength > 32768);
+
+ // The writes done in this function must not be rolled back. If the containing
+ // UnitOfWork rolls back it should roll back to the state *after* these writes. This
+ // will leave the file empty, but available for future use. That is why we go directly
+ // to the global dur dirty list rather than going through the RecoveryUnit.
+ getDur().createdFile(filename, filelength);
+
+ typedef std::pair<void*, unsigned> Intent;
+ std::vector<Intent> intent;
+ intent.push_back(std::make_pair(this, sizeof(DataFileHeader)));
+ privateViews.makeWritable(this, sizeof(DataFileHeader));
+ getDur().declareWriteIntents(intent);
+
+ fileLength = filelength;
+ version = DataFileVersion::defaultForNewFiles();
+ unused.set(fileno, HeaderSize);
+ unusedLength = fileLength - HeaderSize - 16;
+ freeListStart.Null();
+ freeListEnd.Null();
+ } else {
+ checkUpgrade(txn);
}
+}
+void DataFileHeader::checkUpgrade(OperationContext* txn) {
+ if (freeListStart == DiskLoc(0, 0)) {
+ // we are upgrading from 2.4 to 2.6
+ invariant(freeListEnd == DiskLoc(0, 0)); // both start and end should be (0,0) or real
+ WriteUnitOfWork wunit(txn);
+ *txn->recoveryUnit()->writing(&freeListStart) = DiskLoc();
+ *txn->recoveryUnit()->writing(&freeListEnd) = DiskLoc();
+ wunit.commit();
+ }
+}
}
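
The sizing policy is split across _defaultSize() and open() above: files 0 through 4 start at 64 MB << fileNo (64 MB up to 1 GB), later files at the 0x7ff00000 cap, everything quartered under --smallfiles, then doubled until the requested minimum fits, never exceeding maxSize(). A standalone sketch of that arithmetic, assuming a 64-bit build so that maxSize() is 0x7ff00000 (the function name is hypothetical):

#include <cstdio>

const int kMaxSize = 0x7ff00000;  // DataFile::maxSize() on a 64-bit build

// Mirrors DataFile::_defaultSize() plus the growth loop in DataFile::open().
int dataFileSize(int fileNo, int minSize, bool smallfiles) {
    int size = (fileNo <= 4) ? (64 * 1024 * 1024) << fileNo : kMaxSize;
    if (smallfiles)
        size >>= 2;
    while (size < minSize) {
        if (size < kMaxSize / 2) {
            size *= 2;
        } else {
            size = kMaxSize;
            break;
        }
    }
    return size > kMaxSize ? kMaxSize : size;
}

int main() {
    // Files 0..4 double from 64 MB to 1 GB; file 5 and later jump to the cap.
    for (int n = 0; n <= 5; n++)
        printf("file %d -> %d MB\n", n, dataFileSize(n, 0, false) / (1024 * 1024));
    return 0;
}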
diff --git a/src/mongo/db/storage/mmap_v1/data_file.h b/src/mongo/db/storage/mmap_v1/data_file.h
index 6eddb092478..ed6e08e7931 100644
--- a/src/mongo/db/storage/mmap_v1/data_file.h
+++ b/src/mongo/db/storage/mmap_v1/data_file.h
@@ -35,158 +35,181 @@
namespace mongo {
- class OperationContext;
+class OperationContext;
#pragma pack(1)
- class DataFileVersion {
- public:
- DataFileVersion(uint32_t major, uint32_t minor) :_major(major), _minor(minor) {}
-
- static DataFileVersion defaultForNewFiles() {
- return DataFileVersion(kCurrentMajor, kIndexes24AndNewer
- | kMayHave28Freelist
- );
- }
-
- bool isCompatibleWithCurrentCode() const {
- if (_major != kCurrentMajor)
- return false;
-
- if (_minor & ~kUsedMinorFlagsMask)
- return false;
-
- const uint32_t indexCleanliness = _minor & kIndexPluginMask;
- if (indexCleanliness != kIndexes24AndNewer && indexCleanliness != kIndexes22AndOlder)
- return false;
-
- // We are compatible with either setting of kMayHave28Freelist.
-
- return true;
- }
-
- bool is24IndexClean() const { return (_minor & kIndexPluginMask) == kIndexes24AndNewer; }
- void setIs24IndexClean() { _minor = ((_minor & ~kIndexPluginMask) | kIndexes24AndNewer); }
-
- bool mayHave28Freelist() const { return _minor & kMayHave28Freelist; }
- void setMayHave28Freelist() { _minor |= kMayHave28Freelist; }
-
- uint32_t majorRaw() const { return _major; }
- uint32_t minorRaw() const { return _minor; }
-
- private:
- static const uint32_t kCurrentMajor = 4;
-
- // minor layout:
- // first 4 bits - index plugin cleanliness.
- // see IndexCatalog::_upgradeDatabaseMinorVersionIfNeeded for details
- // 5th bit - 1 if started with 3.0-style freelist implementation (SERVER-14081)
- // 6th through 31st bit - reserved and must be set to 0.
- static const uint32_t kIndexPluginMask = 0xf;
- static const uint32_t kIndexes22AndOlder = 5;
- static const uint32_t kIndexes24AndNewer = 6;
-
- static const uint32_t kMayHave28Freelist = (1 << 4);
-
- // All set bits we know about are covered by this mask.
- static const uint32_t kUsedMinorFlagsMask = 0x1f;
-
- uint32_t _major;
- uint32_t _minor;
- };
-
- // Note: Intentionally not defining relational operators for DataFileVersion as there is no
- // total ordering of all versions now that '_minor' is used as a bit vector.
+class DataFileVersion {
+public:
+ DataFileVersion(uint32_t major, uint32_t minor) : _major(major), _minor(minor) {}
+
+ static DataFileVersion defaultForNewFiles() {
+ return DataFileVersion(kCurrentMajor, kIndexes24AndNewer | kMayHave28Freelist);
+ }
+
+ bool isCompatibleWithCurrentCode() const {
+ if (_major != kCurrentMajor)
+ return false;
+
+ if (_minor & ~kUsedMinorFlagsMask)
+ return false;
+
+ const uint32_t indexCleanliness = _minor & kIndexPluginMask;
+ if (indexCleanliness != kIndexes24AndNewer && indexCleanliness != kIndexes22AndOlder)
+ return false;
+
+ // We are compatible with either setting of kMayHave28Freelist.
+
+ return true;
+ }
+
+ bool is24IndexClean() const {
+ return (_minor & kIndexPluginMask) == kIndexes24AndNewer;
+ }
+ void setIs24IndexClean() {
+ _minor = ((_minor & ~kIndexPluginMask) | kIndexes24AndNewer);
+ }
+
+ bool mayHave28Freelist() const {
+ return _minor & kMayHave28Freelist;
+ }
+ void setMayHave28Freelist() {
+ _minor |= kMayHave28Freelist;
+ }
+
+ uint32_t majorRaw() const {
+ return _major;
+ }
+ uint32_t minorRaw() const {
+ return _minor;
+ }
+
+private:
+ static const uint32_t kCurrentMajor = 4;
+
+ // minor layout:
+ // first 4 bits - index plugin cleanliness.
+ // see IndexCatalog::_upgradeDatabaseMinorVersionIfNeeded for details
+ // 5th bit - 1 if started with 3.0-style freelist implementation (SERVER-14081)
+ // 6th through 31st bit - reserved and must be set to 0.
+ static const uint32_t kIndexPluginMask = 0xf;
+ static const uint32_t kIndexes22AndOlder = 5;
+ static const uint32_t kIndexes24AndNewer = 6;
+
+ static const uint32_t kMayHave28Freelist = (1 << 4);
+
+ // All set bits we know about are covered by this mask.
+ static const uint32_t kUsedMinorFlagsMask = 0x1f;
+
+ uint32_t _major;
+ uint32_t _minor;
+};
+
+// Note: Intentionally not defining relational operators for DataFileVersion as there is no
+// total ordering of all versions now that '_minor' is used as a bit vector.
#pragma pack()
- /* a datafile - i.e. the "dbname.<#>" files :
-
- ----------------------
- DataFileHeader
- ----------------------
- Extent (for a particular namespace)
- MmapV1RecordHeader
- ...
- MmapV1RecordHeader (some chained for unused space)
- ----------------------
- more Extents...
- ----------------------
- */
+/* a datafile - i.e. the "dbname.<#>" files :
+
+ ----------------------
+ DataFileHeader
+ ----------------------
+ Extent (for a particular namespace)
+ MmapV1RecordHeader
+ ...
+ MmapV1RecordHeader (some chained for unused space)
+ ----------------------
+ more Extents...
+ ----------------------
+*/
#pragma pack(1)
- class DataFileHeader {
- public:
- DataFileVersion version;
- int fileLength;
- DiskLoc unused; /* unused is the portion of the file that doesn't belong to any allocated extents. -1 = no more */
- int unusedLength;
- DiskLoc freeListStart;
- DiskLoc freeListEnd;
- char reserved[8192 - 4*4 - 8*3];
+class DataFileHeader {
+public:
+ DataFileVersion version;
+ int fileLength;
+ DiskLoc
+ unused; /* unused is the portion of the file that doesn't belong to any allocated extents. -1 = no more */
+ int unusedLength;
+ DiskLoc freeListStart;
+ DiskLoc freeListEnd;
+ char reserved[8192 - 4 * 4 - 8 * 3];
- char data[4]; // first extent starts here
+ char data[4]; // first extent starts here
- enum { HeaderSize = 8192 };
+ enum { HeaderSize = 8192 };
- bool uninitialized() const { return version.majorRaw() == 0; }
+ bool uninitialized() const {
+ return version.majorRaw() == 0;
+ }
- void init(OperationContext* txn, int fileno, int filelength, const char* filename);
+ void init(OperationContext* txn, int fileno, int filelength, const char* filename);
- void checkUpgrade(OperationContext* txn);
+ void checkUpgrade(OperationContext* txn);
- bool isEmpty() const {
- return uninitialized() || ( unusedLength == fileLength - HeaderSize - 16 );
- }
- };
+ bool isEmpty() const {
+ return uninitialized() || (unusedLength == fileLength - HeaderSize - 16);
+ }
+};
#pragma pack()
- class DataFile {
- public:
- DataFile(int fn) : _fileNo(fn), _mb(NULL) {
-
- }
-
- /** @return true if found and opened. if uninitialized (prealloc only) does not open. */
- Status openExisting(const char *filename );
-
- /** creates if DNE */
- void open(OperationContext* txn,
- const char *filename,
- int requestedDataSize = 0,
- bool preallocateOnly = false);
+class DataFile {
+public:
+ DataFile(int fn) : _fileNo(fn), _mb(NULL) {}
- DiskLoc allocExtentArea( OperationContext* txn, int size );
+ /** @return true if found and opened. if uninitialized (prealloc only) does not open. */
+ Status openExisting(const char* filename);
- DataFileHeader* getHeader() { return header(); }
- const DataFileHeader* getHeader() const { return header(); }
+ /** creates if DNE */
+ void open(OperationContext* txn,
+ const char* filename,
+ int requestedDataSize = 0,
+ bool preallocateOnly = false);
- HANDLE getFd() { return mmf.getFd(); }
- unsigned long long length() const { return mmf.length(); }
+ DiskLoc allocExtentArea(OperationContext* txn, int size);
- /* return max size an extent may be */
- static int maxSize();
+ DataFileHeader* getHeader() {
+ return header();
+ }
+ const DataFileHeader* getHeader() const {
+ return header();
+ }
- /** fsync */
- void flush( bool sync );
+ HANDLE getFd() {
+ return mmf.getFd();
+ }
+ unsigned long long length() const {
+ return mmf.length();
+ }
- private:
- friend class MmapV1ExtentManager;
+ /* return max size an extent may be */
+ static int maxSize();
+ /** fsync */
+ void flush(bool sync);
- void badOfs(int) const;
- int _defaultSize() const;
+private:
+ friend class MmapV1ExtentManager;
- void grow(DiskLoc dl, int size);
- char* p() const { return (char *) _mb; }
- DataFileHeader* header() { return static_cast<DataFileHeader*>( _mb ); }
- const DataFileHeader* header() const { return static_cast<DataFileHeader*>( _mb ); }
+ void badOfs(int) const;
+ int _defaultSize() const;
+ void grow(DiskLoc dl, int size);
- const int _fileNo;
+ char* p() const {
+ return (char*)_mb;
+ }
+ DataFileHeader* header() {
+ return static_cast<DataFileHeader*>(_mb);
+ }
+ const DataFileHeader* header() const {
+ return static_cast<DataFileHeader*>(_mb);
+ }
- DurableMappedFile mmf;
- void *_mb; // the memory mapped view
- };
+ const int _fileNo;
+ DurableMappedFile mmf;
+ void* _mb; // the memory mapped view
+};
}
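
The _minor word above is a bit vector rather than a counter: bits 0-3 hold the index-plugin cleanliness code (5 for 2.2-and-older, 6 for 2.4-and-newer), bit 4 flags a possible 2.8-style freelist, and bits 5-31 are reserved and must be zero. A standalone sketch of the minor-word half of isCompatibleWithCurrentCode(), with the constants copied from the header above:

#include <cassert>
#include <cstdint>

const uint32_t kIndexPluginMask = 0xf;
const uint32_t kIndexes22AndOlder = 5;
const uint32_t kIndexes24AndNewer = 6;
const uint32_t kMayHave28Freelist = 1 << 4;
const uint32_t kUsedMinorFlagsMask = 0x1f;

// Mirrors the _minor checks in DataFileVersion::isCompatibleWithCurrentCode().
bool minorIsCompatible(uint32_t minor) {
    if (minor & ~kUsedMinorFlagsMask)
        return false;  // a reserved bit is set
    const uint32_t idx = minor & kIndexPluginMask;
    return idx == kIndexes24AndNewer || idx == kIndexes22AndOlder;
}

int main() {
    // defaultForNewFiles() produces minor = 6 | (1 << 4) = 22.
    assert(minorIsCompatible(kIndexes24AndNewer | kMayHave28Freelist));
    assert(minorIsCompatible(kIndexes22AndOlder));  // freelist bit is optional
    assert(!minorIsCompatible(7));                  // unknown index plugin code
    assert(!minorIsCompatible(kIndexes24AndNewer | (1u << 5)));  // reserved bit
    return 0;
}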
diff --git a/src/mongo/db/storage/mmap_v1/data_file_sync.cpp b/src/mongo/db/storage/mmap_v1/data_file_sync.cpp
index 9579278ded1..013877cb08b 100644
--- a/src/mongo/db/storage/mmap_v1/data_file_sync.cpp
+++ b/src/mongo/db/storage/mmap_v1/data_file_sync.cpp
@@ -44,95 +44,90 @@
namespace mongo {
- using std::endl;
+using std::endl;
- DataFileSync dataFileSync;
+DataFileSync dataFileSync;
- DataFileSync::DataFileSync()
- : ServerStatusSection( "backgroundFlushing" ),
- _total_time( 0 ),
- _flushes( 0 ),
- _last() {
+DataFileSync::DataFileSync()
+ : ServerStatusSection("backgroundFlushing"), _total_time(0), _flushes(0), _last() {}
- }
-
- void DataFileSync::run() {
- Client::initThread( name().c_str() );
+void DataFileSync::run() {
+ Client::initThread(name().c_str());
+ if (storageGlobalParams.syncdelay == 0) {
+ log() << "warning: --syncdelay 0 is not recommended and can have strange performance"
+ << endl;
+ } else if (storageGlobalParams.syncdelay == 1) {
+ log() << "--syncdelay 1" << endl;
+ } else if (storageGlobalParams.syncdelay != 60) {
+ LOG(1) << "--syncdelay " << storageGlobalParams.syncdelay << endl;
+ }
+ int time_flushing = 0;
+ while (!inShutdown()) {
+ _diaglog.flush();
if (storageGlobalParams.syncdelay == 0) {
- log() << "warning: --syncdelay 0 is not recommended and can have strange performance" << endl;
- }
- else if (storageGlobalParams.syncdelay == 1) {
- log() << "--syncdelay 1" << endl;
+ // in case at some point we add an option to change at runtime
+ sleepsecs(5);
+ continue;
}
- else if (storageGlobalParams.syncdelay != 60) {
- LOG(1) << "--syncdelay " << storageGlobalParams.syncdelay << endl;
- }
- int time_flushing = 0;
- while ( ! inShutdown() ) {
- _diaglog.flush();
- if (storageGlobalParams.syncdelay == 0) {
- // in case at some point we add an option to change at runtime
- sleepsecs(5);
- continue;
- }
-
- sleepmillis((long long) std::max(0.0, (storageGlobalParams.syncdelay * 1000) - time_flushing));
-
- if ( inShutdown() ) {
- // occasional issue trying to flush during shutdown when sleep interrupted
- break;
- }
-
- Date_t start = jsTime();
- StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
- int numFiles = storageEngine->flushAllFiles( true );
- time_flushing = (jsTime() - start).count();
-
- _flushed(time_flushing);
-
- if( shouldLog(logger::LogSeverity::Debug(1)) || time_flushing >= 10000 ) {
- log() << "flushing mmaps took " << time_flushing << "ms " << " for " << numFiles << " files" << endl;
- }
- }
- }
- BSONObj DataFileSync::generateSection(OperationContext* txn,
- const BSONElement& configElement) const {
- if (!running()) {
- return BSONObj();
+ sleepmillis(
+ (long long)std::max(0.0, (storageGlobalParams.syncdelay * 1000) - time_flushing));
+
+ if (inShutdown()) {
+ // occasional issue trying to flush during shutdown when sleep interrupted
+ break;
}
- BSONObjBuilder b;
- b.appendNumber( "flushes" , _flushes );
- b.appendNumber( "total_ms" , _total_time );
- b.appendNumber( "average_ms" , (_flushes ? (_total_time / double(_flushes)) : 0.0) );
- b.appendNumber( "last_ms" , _last_time );
- b.append("last_finished", _last);
- return b.obj();
+ Date_t start = jsTime();
+ StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
+ int numFiles = storageEngine->flushAllFiles(true);
+ time_flushing = (jsTime() - start).count();
+
+ _flushed(time_flushing);
+
+ if (shouldLog(logger::LogSeverity::Debug(1)) || time_flushing >= 10000) {
+ log() << "flushing mmaps took " << time_flushing << "ms "
+ << " for " << numFiles << " files" << endl;
+ }
}
+}
- void DataFileSync::_flushed(int ms) {
- _flushes++;
- _total_time += ms;
- _last_time = ms;
- _last = jsTime();
+BSONObj DataFileSync::generateSection(OperationContext* txn,
+ const BSONElement& configElement) const {
+ if (!running()) {
+ return BSONObj();
}
+ BSONObjBuilder b;
+ b.appendNumber("flushes", _flushes);
+ b.appendNumber("total_ms", _total_time);
+ b.appendNumber("average_ms", (_flushes ? (_total_time / double(_flushes)) : 0.0));
+ b.appendNumber("last_ms", _last_time);
+ b.append("last_finished", _last);
+ return b.obj();
+}
+
+void DataFileSync::_flushed(int ms) {
+ _flushes++;
+ _total_time += ms;
+ _last_time = ms;
+ _last = jsTime();
+}
- class MemJournalServerStatusMetric : public ServerStatusMetric {
- public:
- MemJournalServerStatusMetric() : ServerStatusMetric(".mem.mapped") {}
- virtual void appendAtLeaf( BSONObjBuilder& b ) const {
- int m = static_cast<int>(MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ));
- b.appendNumber( "mapped" , m );
- if (storageGlobalParams.dur) {
- m *= 2;
- b.appendNumber( "mappedWithJournal" , m );
- }
+class MemJournalServerStatusMetric : public ServerStatusMetric {
+public:
+ MemJournalServerStatusMetric() : ServerStatusMetric(".mem.mapped") {}
+ virtual void appendAtLeaf(BSONObjBuilder& b) const {
+ int m = static_cast<int>(MemoryMappedFile::totalMappedLength() / (1024 * 1024));
+ b.appendNumber("mapped", m);
+ if (storageGlobalParams.dur) {
+ m *= 2;
+ b.appendNumber("mappedWithJournal", m);
}
+ }
- } memJournalServerStatusMetric;
+} memJournalServerStatusMetric;
}
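
The loop above charges each flush against the syncdelay interval: it sleeps max(0, syncdelay * 1000 - time_flushing) milliseconds, so a flush that takes longer than the whole interval starts the next cycle immediately. A tiny standalone sketch of that budget calculation (names and values illustrative):

#include <algorithm>
#include <cstdio>

// Milliseconds to sleep before the next flush, as computed in DataFileSync::run().
long long sleepBudgetMs(double syncdelaySecs, int lastFlushMs) {
    return (long long)std::max(0.0, syncdelaySecs * 1000 - lastFlushMs);
}

int main() {
    printf("%lld\n", sleepBudgetMs(60.0, 250));    // 59750: normal cycle
    printf("%lld\n", sleepBudgetMs(60.0, 65000));  // 0: flush overran the interval
    return 0;
}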
diff --git a/src/mongo/db/storage/mmap_v1/data_file_sync.h b/src/mongo/db/storage/mmap_v1/data_file_sync.h
index a92f55b64f8..b204fdad019 100644
--- a/src/mongo/db/storage/mmap_v1/data_file_sync.h
+++ b/src/mongo/db/storage/mmap_v1/data_file_sync.h
@@ -33,30 +33,32 @@
namespace mongo {
- /**
- * does background async flushes of mmapped files
- */
- class DataFileSync : public BackgroundJob , public ServerStatusSection {
- public:
- DataFileSync();
-
- virtual bool includeByDefault() const { return true; }
- virtual std::string name() const { return "DataFileSync"; }
+/**
+ * does background async flushes of mmapped files
+ */
+class DataFileSync : public BackgroundJob, public ServerStatusSection {
+public:
+ DataFileSync();
- void run();
+ virtual bool includeByDefault() const {
+ return true;
+ }
+ virtual std::string name() const {
+ return "DataFileSync";
+ }
- virtual BSONObj generateSection(OperationContext* txn,
- const BSONElement& configElement) const;
+ void run();
- private:
- void _flushed(int ms);
+ virtual BSONObj generateSection(OperationContext* txn, const BSONElement& configElement) const;
- long long _total_time;
- long long _flushes;
- int _last_time;
- Date_t _last;
+private:
+ void _flushed(int ms);
- };
+ long long _total_time;
+ long long _flushes;
+ int _last_time;
+ Date_t _last;
+};
- extern DataFileSync dataFileSync;
+extern DataFileSync dataFileSync;
}
diff --git a/src/mongo/db/storage/mmap_v1/diskloc.h b/src/mongo/db/storage/mmap_v1/diskloc.h
index 9d3adc64da7..662daf074d5 100644
--- a/src/mongo/db/storage/mmap_v1/diskloc.h
+++ b/src/mongo/db/storage/mmap_v1/diskloc.h
@@ -43,149 +43,176 @@
namespace mongo {
- template< class Version > class BtreeBucket;
+template <class Version>
+class BtreeBucket;
#pragma pack(1)
- /** represents a disk location/offset on disk in a database. 64 bits.
- it is assumed these will be passed around by value a lot so don't do anything to make them large
- (such as adding a virtual function)
- */
- class DiskLoc {
- int _a; // this will be volume, file #, etc. but is a logical value could be anything depending on storage engine
- int ofs;
-
- public:
-
- enum SentinelValues {
- /* note NullOfs is different. todo clean up. see refs to NullOfs in code - use is valid but outside DiskLoc context so confusing as-is. */
- NullOfs = -1,
-
- // Caps the number of files that may be allocated in a database, allowing about 32TB of
- // data per db. Note that the DiskLoc and DiskLoc56Bit types supports more files than
- // this value, as does the data storage format.
- MaxFiles=16000,
-
- // How invalid DiskLocs are represented in RecordIds.
- InvalidRepr = -2LL,
- };
-
- DiskLoc(int a, int Ofs) : _a(a), ofs(Ofs) { }
- DiskLoc() { Null(); }
-
- // Minimum allowed DiskLoc. No MmapV1RecordHeader may begin at this location because file and extent
- // headers must precede Records in a file.
- static DiskLoc min() { return DiskLoc(0, 0); }
-
- // Maximum allowed DiskLoc.
- // No MmapV1RecordHeader may begin at this location because the minimum size of a MmapV1RecordHeader is larger than
- // one byte. Also, the last bit is not able to be used because mmapv1 uses that for "used".
- static DiskLoc max() { return DiskLoc(0x7fffffff, 0x7ffffffe); }
-
- bool questionable() const {
- return ofs < -1 ||
- _a < -1 ||
- _a > 524288;
- }
+/** represents a disk location/offset on disk in a database. 64 bits.
+ it is assumed these will be passed around by value a lot so don't do anything to make them large
+ (such as adding a virtual function)
+ */
+class DiskLoc {
+    int _a;  // this will be volume, file #, etc., but is a logical value; it could be anything depending on the storage engine
+ int ofs;
+
+public:
+ enum SentinelValues {
+ /* note NullOfs is different. todo clean up. see refs to NullOfs in code - use is valid but outside DiskLoc context so confusing as-is. */
+ NullOfs = -1,
+
+ // Caps the number of files that may be allocated in a database, allowing about 32TB of
+        // data per db. Note that the DiskLoc and DiskLoc56Bit types support more files than
+ // this value, as does the data storage format.
+ MaxFiles = 16000,
+
+ // How invalid DiskLocs are represented in RecordIds.
+ InvalidRepr = -2LL,
+ };
- bool isNull() const { return _a == -1; }
- DiskLoc& Null() {
- _a = -1;
- ofs = 0; /* note NullOfs is different. todo clean up. see refs to NullOfs in code - use is valid but outside DiskLoc context so confusing as-is. */
- return *this;
- }
- void assertOk() const { verify(!isNull()); }
- DiskLoc& setInvalid() {
- _a = -2;
- ofs = 0;
- return *this;
- }
- bool isValid() const { return _a != -2; }
-
- std::string toString() const {
- if ( isNull() )
- return "null";
- std::stringstream ss;
- ss << _a << ':' << std::hex << ofs;
- return ss.str();
- }
+ DiskLoc(int a, int Ofs) : _a(a), ofs(Ofs) {}
+ DiskLoc() {
+ Null();
+ }
- BSONObj toBSONObj() const { return BSON( "file" << _a << "offset" << ofs ); }
+ // Minimum allowed DiskLoc. No MmapV1RecordHeader may begin at this location because file and extent
+ // headers must precede Records in a file.
+ static DiskLoc min() {
+ return DiskLoc(0, 0);
+ }
- int a() const { return _a; }
+ // Maximum allowed DiskLoc.
+ // No MmapV1RecordHeader may begin at this location because the minimum size of a MmapV1RecordHeader is larger than
+ // one byte. Also, the last bit is not able to be used because mmapv1 uses that for "used".
+ static DiskLoc max() {
+ return DiskLoc(0x7fffffff, 0x7ffffffe);
+ }
- int& GETOFS() { return ofs; }
- int getOfs() const { return ofs; }
- void set(int a, int b) {
- _a=a;
- ofs=b;
- }
+ bool questionable() const {
+ return ofs < -1 || _a < -1 || _a > 524288;
+ }
- void inc(int amt) {
- verify( !isNull() );
- ofs += amt;
- }
+ bool isNull() const {
+ return _a == -1;
+ }
+ DiskLoc& Null() {
+ _a = -1;
+ ofs =
+ 0; /* note NullOfs is different. todo clean up. see refs to NullOfs in code - use is valid but outside DiskLoc context so confusing as-is. */
+ return *this;
+ }
+ void assertOk() const {
+ verify(!isNull());
+ }
+ DiskLoc& setInvalid() {
+ _a = -2;
+ ofs = 0;
+ return *this;
+ }
+ bool isValid() const {
+ return _a != -2;
+ }
- bool sameFile(DiskLoc b) {
- return _a== b._a;
- }
+ std::string toString() const {
+ if (isNull())
+ return "null";
+ std::stringstream ss;
+ ss << _a << ':' << std::hex << ofs;
+ return ss.str();
+ }
- bool operator==(const DiskLoc& b) const {
- return _a==b._a&& ofs == b.ofs;
- }
- bool operator!=(const DiskLoc& b) const {
- return !(*this==b);
- }
- int compare(const DiskLoc& b) const {
- int x = _a - b._a;
- if ( x )
- return x;
- return ofs - b.ofs;
- }
+ BSONObj toBSONObj() const {
+ return BSON("file" << _a << "offset" << ofs);
+ }
- static DiskLoc fromRecordId(RecordId id) {
- if (id.isNormal())
- return DiskLoc((id.repr() >> 32), uint32_t(id.repr()));
+ int a() const {
+ return _a;
+ }
- if (id.isNull())
- return DiskLoc();
+ int& GETOFS() {
+ return ofs;
+ }
+ int getOfs() const {
+ return ofs;
+ }
+ void set(int a, int b) {
+ _a = a;
+ ofs = b;
+ }
- if (id == RecordId::max())
- return DiskLoc::max();
+ void inc(int amt) {
+ verify(!isNull());
+ ofs += amt;
+ }
- if (id == RecordId::min())
- return DiskLoc::min();
+ bool sameFile(DiskLoc b) {
+ return _a == b._a;
+ }
- dassert(id.repr() == InvalidRepr);
- return DiskLoc().setInvalid();
- }
+ bool operator==(const DiskLoc& b) const {
+ return _a == b._a && ofs == b.ofs;
+ }
+ bool operator!=(const DiskLoc& b) const {
+ return !(*this == b);
+ }
+ int compare(const DiskLoc& b) const {
+ int x = _a - b._a;
+ if (x)
+ return x;
+ return ofs - b.ofs;
+ }
+
+ static DiskLoc fromRecordId(RecordId id) {
+ if (id.isNormal())
+ return DiskLoc((id.repr() >> 32), uint32_t(id.repr()));
- RecordId toRecordId() const {
- if (_a >= 0) {
- if (*this == DiskLoc::min())
- return RecordId::min();
+ if (id.isNull())
+ return DiskLoc();
- if (*this == DiskLoc::max())
- return RecordId::max();
+ if (id == RecordId::max())
+ return DiskLoc::max();
- return RecordId(uint64_t(_a) << 32 | uint32_t(ofs));
- }
+ if (id == RecordId::min())
+ return DiskLoc::min();
+
+ dassert(id.repr() == InvalidRepr);
+ return DiskLoc().setInvalid();
+ }
- if (isNull())
- return RecordId();
+ RecordId toRecordId() const {
+ if (_a >= 0) {
+ if (*this == DiskLoc::min())
+ return RecordId::min();
- dassert(!isValid());
- return RecordId(InvalidRepr);
+ if (*this == DiskLoc::max())
+ return RecordId::max();
+
+ return RecordId(uint64_t(_a) << 32 | uint32_t(ofs));
}
- };
-#pragma pack()
- inline bool operator< (const DiskLoc& rhs, const DiskLoc& lhs) { return rhs.compare(lhs) < 0; }
- inline bool operator<=(const DiskLoc& rhs, const DiskLoc& lhs) { return rhs.compare(lhs) <= 0; }
- inline bool operator> (const DiskLoc& rhs, const DiskLoc& lhs) { return rhs.compare(lhs) > 0; }
- inline bool operator>=(const DiskLoc& rhs, const DiskLoc& lhs) { return rhs.compare(lhs) >= 0; }
+ if (isNull())
+ return RecordId();
- inline std::ostream& operator<<( std::ostream &stream, const DiskLoc &loc ) {
- return stream << loc.toString();
+ dassert(!isValid());
+ return RecordId(InvalidRepr);
}
+};
+#pragma pack()
-} // namespace mongo
+inline bool operator<(const DiskLoc& rhs, const DiskLoc& lhs) {
+ return rhs.compare(lhs) < 0;
+}
+inline bool operator<=(const DiskLoc& rhs, const DiskLoc& lhs) {
+ return rhs.compare(lhs) <= 0;
+}
+inline bool operator>(const DiskLoc& rhs, const DiskLoc& lhs) {
+ return rhs.compare(lhs) > 0;
+}
+inline bool operator>=(const DiskLoc& rhs, const DiskLoc& lhs) {
+ return rhs.compare(lhs) >= 0;
+}
+
+inline std::ostream& operator<<(std::ostream& stream, const DiskLoc& loc) {
+ return stream << loc.toString();
+}
+
+} // namespace mongo
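
toRecordId() and fromRecordId() above pack the two 32-bit halves into one 64-bit value: file number in the high word, offset in the low word, with min/max/null/invalid handled as sentinels. A standalone round-trip sketch of the normal (non-sentinel) case:

#include <cassert>
#include <cstdint>

// Pack (file, offset) as DiskLoc::toRecordId() does for a normal location.
int64_t toRepr(int32_t a, int32_t ofs) {
    return (int64_t)((uint64_t)(uint32_t)a << 32 | (uint32_t)ofs);
}

// Unpack as DiskLoc::fromRecordId() does for a normal RecordId.
void fromRepr(int64_t repr, int32_t* a, int32_t* ofs) {
    *a = (int32_t)((uint64_t)repr >> 32);
    *ofs = (int32_t)(uint32_t)repr;
}

int main() {
    int32_t a = 0, ofs = 0;
    fromRepr(toRepr(3, 0x2000), &a, &ofs);
    assert(a == 3 && ofs == 0x2000);  // both halves survive the round trip
    return 0;
}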
diff --git a/src/mongo/db/storage/mmap_v1/dur.cpp b/src/mongo/db/storage/mmap_v1/dur.cpp
index a596bba061f..21c729eea17 100644
--- a/src/mongo/db/storage/mmap_v1/dur.cpp
+++ b/src/mongo/db/storage/mmap_v1/dur.cpp
@@ -38,15 +38,15 @@
have to handle falling behind which would use too much ram (going back into a read lock would suffice to stop that).
for now (1.7.5/1.8.0) we are in read lock which is not ideal.
WRITETODATAFILES
- actually write to the database data files in this phase. currently done by memcpy'ing the writes back to
- the non-private MMF. alternatively one could write to the files the traditional way; however the way our
+ actually write to the database data files in this phase. currently done by memcpy'ing the writes back to
+ the non-private MMF. alternatively one could write to the files the traditional way; however the way our
storage engine works that isn't any faster (actually measured a tiny bit slower).
REMAPPRIVATEVIEW
we could in a write lock quickly flip readers back to the main view, then stay in read lock and do our real
remapping. with many files (e.g., 1000), remapping could be time consuming (several ms), so we don't want
to be too frequent.
there could be a slow down immediately after remapping as fresh copy-on-writes for commonly written pages will
- be required. so doing these remaps fractionally is helpful.
+ be required. so doing these remaps fractionally is helpful.
mutexes:
@@ -99,820 +99,788 @@
namespace mongo {
- using std::endl;
- using std::fixed;
- using std::hex;
- using std::set;
- using std::setprecision;
- using std::setw;
- using std::string;
- using std::stringstream;
+using std::endl;
+using std::fixed;
+using std::hex;
+using std::set;
+using std::setprecision;
+using std::setw;
+using std::string;
+using std::stringstream;
namespace dur {
namespace {
- // Used to activate the flush thread
- stdx::mutex flushMutex;
- stdx::condition_variable flushRequested;
+// Used to activate the flush thread
+stdx::mutex flushMutex;
+stdx::condition_variable flushRequested;
- // This is waited on for getlasterror acknowledgements. It means that data has been written to
- // the journal, but not necessarily applied to the shared view, so it is all right to
- // acknowledge the user operation, but NOT all right to delete the journal files for example.
- NotifyAll commitNotify;
+// This is waited on for getlasterror acknowledgements. It means that data has been written to
+// the journal, but not necessarily applied to the shared view, so it is all right to
+// acknowledge the user operation, but NOT all right to delete the journal files for example.
+NotifyAll commitNotify;
- // This is waited on for complete flush. It means that data has been both written to journal
- // and applied to the shared view, so it is allowed to delete the journal files. Used for
- // fsync:true, close DB, shutdown acknowledgements.
- NotifyAll applyToDataFilesNotify;
+// This is waited on for complete flush. It means that data has been both written to journal
+// and applied to the shared view, so it is allowed to delete the journal files. Used for
+// fsync:true, close DB, shutdown acknowledgements.
+NotifyAll applyToDataFilesNotify;
- // When set, the flush thread will exit
- AtomicUInt32 shutdownRequested(0);
+// When set, the flush thread will exit
+AtomicUInt32 shutdownRequested(0);
- enum {
- // How many commit cycles to do before considering doing a remap
- NumCommitsBeforeRemap = 10,
+enum {
+ // How many commit cycles to do before considering doing a remap
+ NumCommitsBeforeRemap = 10,
- // How many outstanding journal flushes should be allowed before applying writer back
- // pressure. Size of 1 allows two journal blocks to be in the process of being written -
- // one on the journal writer's buffer and one blocked waiting to be picked up.
- NumAsyncJournalWrites = 1,
- };
+ // How many outstanding journal flushes should be allowed before applying writer back
+ // pressure. Size of 1 allows two journal blocks to be in the process of being written -
+ // one on the journal writer's buffer and one blocked waiting to be picked up.
+ NumAsyncJournalWrites = 1,
+};
- // Remap loop state
- unsigned remapFileToStartAt;
+// Remap loop state
+unsigned remapFileToStartAt;
- // How frequently to reset the durability statistics
- enum { DurStatsResetIntervalMillis = 3 * 1000 };
+// How frequently to reset the durability statistics
+enum { DurStatsResetIntervalMillis = 3 * 1000 };
- // Size sanity checks
- BOOST_STATIC_ASSERT(UncommittedBytesLimit > BSONObjMaxInternalSize * 3);
- BOOST_STATIC_ASSERT(sizeof(void*) == 4 || UncommittedBytesLimit > BSONObjMaxInternalSize * 6);
+// Size sanity checks
+BOOST_STATIC_ASSERT(UncommittedBytesLimit > BSONObjMaxInternalSize * 3);
+BOOST_STATIC_ASSERT(sizeof(void*) == 4 || UncommittedBytesLimit > BSONObjMaxInternalSize * 6);
- /**
- * MMAP V1 durability server status section.
- */
- class DurSSS : public ServerStatusSection {
- public:
- DurSSS() : ServerStatusSection("dur") {
+/**
+ * MMAP V1 durability server status section.
+ */
+class DurSSS : public ServerStatusSection {
+public:
+ DurSSS() : ServerStatusSection("dur") {}
- }
+ virtual bool includeByDefault() const {
+ return true;
+ }
- virtual bool includeByDefault() const { return true; }
+ virtual BSONObj generateSection(OperationContext* txn, const BSONElement& configElement) const {
+ if (!getDur().isDurable()) {
+ return BSONObj();
+ }
- virtual BSONObj generateSection(OperationContext* txn,
- const BSONElement& configElement) const {
+ return dur::stats.asObj();
+ }
- if (!getDur().isDurable()) {
- return BSONObj();
- }
+} durSSS;
- return dur::stats.asObj();
- }
- } durSSS;
+/**
+ * A no-op durability interface. Used for the case when journaling is not enabled.
+ */
+class NonDurableImpl : public DurableInterface {
+public:
+ NonDurableImpl() {}
+ // DurableInterface virtual methods
+ virtual void* writingPtr(void* x, unsigned len) {
+ return x;
+ }
+ virtual void declareWriteIntent(void*, unsigned) {}
+ virtual void declareWriteIntents(const std::vector<std::pair<void*, unsigned>>& intents) {}
+ virtual void createdFile(const std::string& filename, unsigned long long len) {}
+ virtual bool waitUntilDurable() {
+ return false;
+ }
+ virtual bool commitNow(OperationContext* txn) {
+ return false;
+ }
+ virtual bool commitIfNeeded() {
+ return false;
+ }
+ virtual void syncDataAndTruncateJournal(OperationContext* txn) {}
+ virtual bool isDurable() const {
+ return false;
+ }
+ virtual void closingFileNotification() {}
+ virtual void commitAndStopDurThread() {}
+};
- /**
- * A no-op durability interface. Used for the case when journaling is not enabled.
- */
- class NonDurableImpl : public DurableInterface {
- public:
- NonDurableImpl() { }
- // DurableInterface virtual methods
- virtual void* writingPtr(void *x, unsigned len) { return x; }
- virtual void declareWriteIntent(void*, unsigned) { }
- virtual void declareWriteIntents(const std::vector<std::pair<void*, unsigned> >& intents) {
+/**
+ * The actual durability interface, when journaling is enabled.
+ */
+class DurableImpl : public DurableInterface {
+public:
+ DurableImpl() {}
+
+ // DurableInterface virtual methods
+ virtual void declareWriteIntents(const std::vector<std::pair<void*, unsigned>>& intents);
+ virtual void createdFile(const std::string& filename, unsigned long long len);
+ virtual bool waitUntilDurable();
+ virtual bool commitNow(OperationContext* txn);
+ virtual bool commitIfNeeded();
+ virtual void syncDataAndTruncateJournal(OperationContext* txn);
+ virtual bool isDurable() const {
+ return true;
+ }
+ virtual void closingFileNotification();
+ virtual void commitAndStopDurThread();
- }
- virtual void createdFile(const std::string& filename, unsigned long long len) { }
- virtual bool waitUntilDurable() { return false; }
- virtual bool commitNow(OperationContext* txn) { return false; }
- virtual bool commitIfNeeded() { return false; }
- virtual void syncDataAndTruncateJournal(OperationContext* txn) {}
- virtual bool isDurable() const { return false; }
- virtual void closingFileNotification() { }
- virtual void commitAndStopDurThread() { }
- };
-
-
- /**
- * The actual durability interface, when journaling is enabled.
- */
- class DurableImpl : public DurableInterface {
- public:
- DurableImpl() { }
-
- // DurableInterface virtual methods
- virtual void declareWriteIntents(const std::vector<std::pair<void*, unsigned> >& intents);
- virtual void createdFile(const std::string& filename, unsigned long long len);
- virtual bool waitUntilDurable();
- virtual bool commitNow(OperationContext* txn);
- virtual bool commitIfNeeded();
- virtual void syncDataAndTruncateJournal(OperationContext* txn);
- virtual bool isDurable() const { return true; }
- virtual void closingFileNotification();
- virtual void commitAndStopDurThread();
-
- void start();
-
- private:
- stdx::thread _durThreadHandle;
- };
-
-
- /**
- * Diagnostic to check that the private view and the non-private view are in sync after
- * applying the journal changes. This function is very slow and only runs when paranoid checks
- * are enabled.
- *
- * Must be called under at least S flush lock to ensure that there are no concurrent writes
- * happening.
- */
- void debugValidateFileMapsMatch(const DurableMappedFile* mmf) {
- const unsigned char *p = (const unsigned char *)mmf->getView();
- const unsigned char *w = (const unsigned char *)mmf->view_write();
-
- // Ignore pre-allocated files that are not fully created yet
- if (!p || !w) {
- return;
- }
+ void start();
- if (memcmp(p, w, (unsigned)mmf->length()) == 0) {
- return;
- }
+private:
+ stdx::thread _durThreadHandle;
+};
- unsigned low = 0xffffffff;
- unsigned high = 0;
- log() << "DurParanoid mismatch in " << mmf->filename();
+/**
+ * Diagnostic to check that the private view and the non-private view are in sync after
+ * applying the journal changes. This function is very slow and only runs when paranoid checks
+ * are enabled.
+ *
+ * Must be called under at least S flush lock to ensure that there are no concurrent writes
+ * happening.
+ */
+void debugValidateFileMapsMatch(const DurableMappedFile* mmf) {
+ const unsigned char* p = (const unsigned char*)mmf->getView();
+ const unsigned char* w = (const unsigned char*)mmf->view_write();
- int logged = 0;
- unsigned lastMismatch = 0xffffffff;
+ // Ignore pre-allocated files that are not fully created yet
+ if (!p || !w) {
+ return;
+ }
- for (unsigned i = 0; i < mmf->length(); i++) {
- if (p[i] != w[i]) {
+ if (memcmp(p, w, (unsigned)mmf->length()) == 0) {
+ return;
+ }
- if (lastMismatch != 0xffffffff && lastMismatch + 1 != i) {
- // Separate blocks of mismatches
- log() << std::endl;
- }
+ unsigned low = 0xffffffff;
+ unsigned high = 0;
- lastMismatch = i;
+ log() << "DurParanoid mismatch in " << mmf->filename();
- if (++logged < 60) {
- if (logged == 1) {
- // For .ns files to find offset in record
- log() << "ofs % 628 = 0x" << hex << (i % 628) << endl;
- }
+ int logged = 0;
+ unsigned lastMismatch = 0xffffffff;
- stringstream ss;
- ss << "mismatch ofs:" << hex << i
- << "\tfilemap:" << setw(2) << (unsigned)w[i]
- << "\tprivmap:" << setw(2) << (unsigned)p[i];
+ for (unsigned i = 0; i < mmf->length(); i++) {
+ if (p[i] != w[i]) {
+ if (lastMismatch != 0xffffffff && lastMismatch + 1 != i) {
+ // Separate blocks of mismatches
+ log() << std::endl;
+ }
- if (p[i] > 32 && p[i] <= 126) {
- ss << '\t' << p[i];
- }
+ lastMismatch = i;
- log() << ss.str() << endl;
+ if (++logged < 60) {
+ if (logged == 1) {
+ // For .ns files to find offset in record
+ log() << "ofs % 628 = 0x" << hex << (i % 628) << endl;
}
- if (logged == 60) {
- log() << "..." << endl;
+ stringstream ss;
+ ss << "mismatch ofs:" << hex << i << "\tfilemap:" << setw(2) << (unsigned)w[i]
+ << "\tprivmap:" << setw(2) << (unsigned)p[i];
+
+ if (p[i] > 32 && p[i] <= 126) {
+ ss << '\t' << p[i];
}
- if (i < low) low = i;
- if (i > high) high = i;
+ log() << ss.str() << endl;
}
+
+ if (logged == 60) {
+ log() << "..." << endl;
+ }
+
+ if (i < low)
+ low = i;
+ if (i > high)
+ high = i;
}
+ }
- if (low != 0xffffffff) {
- std::stringstream ss;
- ss << "journal error warning views mismatch " << mmf->filename() << ' '
- << hex << low << ".." << high
- << " len:" << high - low + 1;
+ if (low != 0xffffffff) {
+ std::stringstream ss;
+ ss << "journal error warning views mismatch " << mmf->filename() << ' ' << hex << low
+ << ".." << high << " len:" << high - low + 1;
- log() << ss.str() << endl;
- log() << "priv loc: " << (void*)(p + low) << ' ' << endl;
+ log() << ss.str() << endl;
+ log() << "priv loc: " << (void*)(p + low) << ' ' << endl;
- severe() << "Written data does not match in-memory view. Missing WriteIntent?";
- invariant(false);
- }
+ severe() << "Written data does not match in-memory view. Missing WriteIntent?";
+ invariant(false);
}
+}
- /**
- * Main code of the remap private view function.
- */
- void remapPrivateViewImpl(double fraction) {
- LOG(4) << "journal REMAPPRIVATEVIEW" << endl;
-
- // There is no way that the set of files can change while we are in this method, because
- // we hold the flush lock in X mode. For files to go away, a database needs to be dropped,
- // which means acquiring the flush lock in at least IX mode.
- //
- // However, the record fetcher logic unfortunately operates without any locks and on
- // Windows and Solaris remap is not atomic and there is a window where the record fetcher
- // might get an access violation. That's why we acquire the mongo files mutex here in X
- // mode and the record fetcher takes in in S-mode (see MmapV1RecordFetcher for more
- // detail).
- //
- // See SERVER-5723 for performance improvement.
- // See SERVER-5680 to see why this code is necessary on Windows.
- // See SERVER-8795 to see why this code is necessary on Solaris.
+/**
+ * Main code of the remap private view function.
+ */
+void remapPrivateViewImpl(double fraction) {
+ LOG(4) << "journal REMAPPRIVATEVIEW" << endl;
+
+// There is no way that the set of files can change while we are in this method, because
+// we hold the flush lock in X mode. For files to go away, a database needs to be dropped,
+// which means acquiring the flush lock in at least IX mode.
+//
+// However, the record fetcher logic unfortunately operates without any locks and on
+// Windows and Solaris remap is not atomic and there is a window where the record fetcher
+// might get an access violation. That's why we acquire the mongo files mutex here in X
+// mode and the record fetcher takes it in S-mode (see MmapV1RecordFetcher for more
+// detail).
+//
+// See SERVER-5723 for performance improvement.
+// See SERVER-5680 to see why this code is necessary on Windows.
+// See SERVER-8795 to see why this code is necessary on Solaris.
#if defined(_WIN32) || defined(__sun)
- LockMongoFilesExclusive lk;
+ LockMongoFilesExclusive lk;
#else
- LockMongoFilesShared lk;
+ LockMongoFilesShared lk;
#endif
- std::set<MongoFile*>& files = MongoFile::getAllFiles();
+ std::set<MongoFile*>& files = MongoFile::getAllFiles();
- const unsigned sz = files.size();
- if (sz == 0) {
- return;
- }
+ const unsigned sz = files.size();
+ if (sz == 0) {
+ return;
+ }
- unsigned ntodo = (unsigned) (sz * fraction);
- if( ntodo < 1 ) ntodo = 1;
- if( ntodo > sz ) ntodo = sz;
+ unsigned ntodo = (unsigned)(sz * fraction);
+ if (ntodo < 1)
+ ntodo = 1;
+ if (ntodo > sz)
+ ntodo = sz;
+
+ const set<MongoFile*>::iterator b = files.begin();
+ const set<MongoFile*>::iterator e = files.end();
+ set<MongoFile*>::iterator i = b;
+
+ // Skip to our starting position as remembered from the last remap cycle
+ for (unsigned x = 0; x < remapFileToStartAt; x++) {
+ i++;
+ if (i == e)
+ i = b;
+ }
- const set<MongoFile*>::iterator b = files.begin();
- const set<MongoFile*>::iterator e = files.end();
- set<MongoFile*>::iterator i = b;
+ // Mark where to start on the next cycle
+ const unsigned startedAt = remapFileToStartAt;
+ remapFileToStartAt = (remapFileToStartAt + ntodo) % sz;
- // Skip to our starting position as remembered from the last remap cycle
- for (unsigned x = 0; x < remapFileToStartAt; x++) {
- i++;
- if (i == e) i = b;
- }
+ Timer t;
- // Mark where to start on the next cycle
- const unsigned startedAt = remapFileToStartAt;
- remapFileToStartAt = (remapFileToStartAt + ntodo) % sz;
+ for (unsigned x = 0; x < ntodo; x++) {
+ if ((*i)->isDurableMappedFile()) {
+ DurableMappedFile* const mmf = (DurableMappedFile*)*i;
- Timer t;
+ // Sanity check that the contents of the shared and the private view match so we
+ // don't end up overwriting data.
+ if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalParanoid) {
+ debugValidateFileMapsMatch(mmf);
+ }
- for (unsigned x = 0; x < ntodo; x++) {
- if ((*i)->isDurableMappedFile()) {
- DurableMappedFile* const mmf = (DurableMappedFile*) *i;
+ if (mmf->willNeedRemap()) {
+ mmf->remapThePrivateView();
+ }
- // Sanity check that the contents of the shared and the private view match so we
- // don't end up overwriting data.
- if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalParanoid) {
- debugValidateFileMapsMatch(mmf);
- }
+ i++;
- if (mmf->willNeedRemap()) {
- mmf->remapThePrivateView();
- }
+ if (i == e)
+ i = b;
+ }
+ }
- i++;
+ LOG(3) << "journal REMAPPRIVATEVIEW done startedAt: " << startedAt << " n:" << ntodo << ' '
+ << t.millis() << "ms";
+}
- if (i == e) i = b;
- }
- }
- LOG(3) << "journal REMAPPRIVATEVIEW done startedAt: " << startedAt << " n:" << ntodo
- << ' ' << t.millis() << "ms";
- }
+// One instance of each durability interface
+DurableImpl durableImpl;
+NonDurableImpl nonDurableImpl;
+} // namespace
- // One instance of each durability interface
- DurableImpl durableImpl;
- NonDurableImpl nonDurableImpl;
-} // namespace
+// Declared in dur_preplogbuffer.cpp
+void PREPLOGBUFFER(JSectHeader& outHeader, AlignedBuilder& outBuffer);
+// Declared in dur_journal.cpp
+boost::filesystem::path getJournalDir();
+void preallocateFiles();
- // Declared in dur_preplogbuffer.cpp
- void PREPLOGBUFFER(JSectHeader& outHeader, AlignedBuilder& outBuffer);
+// Forward declaration
+static void durThread();
- // Declared in dur_journal.cpp
- boost::filesystem::path getJournalDir();
- void preallocateFiles();
+// Durability activity statistics
+Stats stats;
- // Forward declaration
- static void durThread();
+// Reference to the write intents tracking object
+CommitJob commitJob;
- // Durability activity statistics
- Stats stats;
+// Reference to the active durability interface
+DurableInterface* DurableInterface::_impl(&nonDurableImpl);
- // Reference to the write intents tracking object
- CommitJob commitJob;
- // Reference to the active durability interface
- DurableInterface* DurableInterface::_impl(&nonDurableImpl);
+//
+// Stats
+//
+Stats::Stats() : _currIdx(0) {}
- //
- // Stats
- //
+void Stats::reset() {
+ // Seal the current metrics
+ _stats[_currIdx]._durationMillis = _stats[_currIdx].getCurrentDurationMillis();
- Stats::Stats() : _currIdx(0) {
+ // Use a new metric
+ const unsigned newCurrIdx = (_currIdx + 1) % (sizeof(_stats) / sizeof(_stats[0]));
+ _stats[newCurrIdx].reset();
- }
+ _currIdx = newCurrIdx;
+}
- void Stats::reset() {
- // Seal the current metrics
- _stats[_currIdx]._durationMillis = _stats[_currIdx].getCurrentDurationMillis();
+BSONObj Stats::asObj() const {
+ // Use the previous statistic
+ const S& stats = _stats[(_currIdx - 1) % (sizeof(_stats) / sizeof(_stats[0]))];
- // Use a new metric
- const unsigned newCurrIdx = (_currIdx + 1) % (sizeof(_stats) / sizeof(_stats[0]));
- _stats[newCurrIdx].reset();
+ BSONObjBuilder builder;
+ stats._asObj(&builder);
- _currIdx = newCurrIdx;
- }
+ return builder.obj();
+}
- BSONObj Stats::asObj() const {
- // Use the previous statistic
- const S& stats = _stats[(_currIdx - 1) % (sizeof(_stats) / sizeof(_stats[0]))];
+void Stats::S::reset() {
+ memset(this, 0, sizeof(*this));
+ _startTimeMicros = curTimeMicros64();
+}
- BSONObjBuilder builder;
- stats._asObj(&builder);
+std::string Stats::S::_CSVHeader() const {
+ return "cmts\t jrnMB\t wrDFMB\t cIWLk\t early\t prpLgB\t wrToJ\t wrToDF\t rmpPrVw";
+}
- return builder.obj();
- }
+std::string Stats::S::_asCSV() const {
+ stringstream ss;
+ ss << setprecision(2) << _commits << '\t' << _journaledBytes / 1000000.0 << '\t'
+ << _writeToDataFilesBytes / 1000000.0 << '\t' << _commitsInWriteLock << '\t' << 0 << '\t'
+ << (unsigned)(_prepLogBufferMicros / 1000) << '\t'
+ << (unsigned)(_writeToJournalMicros / 1000) << '\t'
+ << (unsigned)(_writeToDataFilesMicros / 1000) << '\t'
+ << (unsigned)(_remapPrivateViewMicros / 1000) << '\t' << (unsigned)(_commitsMicros / 1000)
+ << '\t' << (unsigned)(_commitsInWriteLockMicros / 1000) << '\t';
- void Stats::S::reset() {
- memset(this, 0, sizeof(*this));
- _startTimeMicros = curTimeMicros64();
- }
+ return ss.str();
+}
- std::string Stats::S::_CSVHeader() const {
- return "cmts\t jrnMB\t wrDFMB\t cIWLk\t early\t prpLgB\t wrToJ\t wrToDF\t rmpPrVw";
- }
+void Stats::S::_asObj(BSONObjBuilder* builder) const {
+ BSONObjBuilder& b = *builder;
+ b << "commits" << _commits << "journaledMB" << _journaledBytes / 1000000.0
+ << "writeToDataFilesMB" << _writeToDataFilesBytes / 1000000.0 << "compression"
+ << _journaledBytes / (_uncompressedBytes + 1.0) << "commitsInWriteLock" << _commitsInWriteLock
+ << "earlyCommits" << 0 << "timeMs"
+ << BSON("dt" << _durationMillis << "prepLogBuffer" << (unsigned)(_prepLogBufferMicros / 1000)
+ << "writeToJournal" << (unsigned)(_writeToJournalMicros / 1000)
+ << "writeToDataFiles" << (unsigned)(_writeToDataFilesMicros / 1000)
+ << "remapPrivateView" << (unsigned)(_remapPrivateViewMicros / 1000) << "commits"
+ << (unsigned)(_commitsMicros / 1000) << "commitsInWriteLock"
+ << (unsigned)(_commitsInWriteLockMicros / 1000));
- std::string Stats::S::_asCSV() const {
- stringstream ss;
- ss << setprecision(2)
- << _commits << '\t'
- << _journaledBytes / 1000000.0 << '\t'
- << _writeToDataFilesBytes / 1000000.0 << '\t'
- << _commitsInWriteLock << '\t'
- << 0 << '\t'
- << (unsigned) (_prepLogBufferMicros / 1000) << '\t'
- << (unsigned) (_writeToJournalMicros / 1000) << '\t'
- << (unsigned) (_writeToDataFilesMicros / 1000) << '\t'
- << (unsigned) (_remapPrivateViewMicros / 1000) << '\t'
- << (unsigned) (_commitsMicros / 1000) << '\t'
- << (unsigned) (_commitsInWriteLockMicros / 1000) << '\t';
-
- return ss.str();
+ if (mmapv1GlobalOptions.journalCommitInterval != 0) {
+ b << "journalCommitIntervalMs" << mmapv1GlobalOptions.journalCommitInterval;
}
+}
- void Stats::S::_asObj(BSONObjBuilder* builder) const {
- BSONObjBuilder& b = *builder;
- b << "commits" << _commits
- << "journaledMB" << _journaledBytes / 1000000.0
- << "writeToDataFilesMB" << _writeToDataFilesBytes / 1000000.0
- << "compression" << _journaledBytes / (_uncompressedBytes + 1.0)
- << "commitsInWriteLock" << _commitsInWriteLock
- << "earlyCommits" << 0
- << "timeMs" << BSON("dt" << _durationMillis <<
- "prepLogBuffer" << (unsigned) (_prepLogBufferMicros / 1000) <<
- "writeToJournal" << (unsigned) (_writeToJournalMicros / 1000) <<
- "writeToDataFiles" << (unsigned) (_writeToDataFilesMicros / 1000) <<
- "remapPrivateView" << (unsigned) (_remapPrivateViewMicros / 1000) <<
- "commits" << (unsigned)(_commitsMicros / 1000) <<
- "commitsInWriteLock"
- << (unsigned)(_commitsInWriteLockMicros / 1000));
-
- if (mmapv1GlobalOptions.journalCommitInterval != 0) {
- b << "journalCommitIntervalMs" << mmapv1GlobalOptions.journalCommitInterval;
- }
- }
+//
+// DurableInterface
+//
- //
- // DurableInterface
- //
+DurableInterface::DurableInterface() {}
- DurableInterface::DurableInterface() {
+DurableInterface::~DurableInterface() {}
- }
- DurableInterface::~DurableInterface() {
+//
+// DurableImpl
+//
- }
+bool DurableImpl::commitNow(OperationContext* txn) {
+ NotifyAll::When when = commitNotify.now();
+ AutoYieldFlushLockForMMAPV1Commit flushLockYield(txn->lockState());
- //
- // DurableImpl
- //
+ // There is always just one waiting anyways
+ flushRequested.notify_one();
- bool DurableImpl::commitNow(OperationContext* txn) {
- NotifyAll::When when = commitNotify.now();
+ // commitNotify.waitFor ensures that whatever was scheduled for journaling before this
+ // call has been persisted to the journal file. This does not mean that this data has been
+    // applied to the shared view yet, though; that's why we wait for applyToDataFilesNotify.
+ applyToDataFilesNotify.waitFor(when);
- AutoYieldFlushLockForMMAPV1Commit flushLockYield(txn->lockState());
+ return true;
+}
- // There is always just one waiting anyways
- flushRequested.notify_one();
+bool DurableImpl::waitUntilDurable() {
+ commitNotify.awaitBeyondNow();
+ return true;
+}
- // commitNotify.waitFor ensures that whatever was scheduled for journaling before this
- // call has been persisted to the journal file. This does not mean that this data has been
- // applied to the shared view yet though, that's why we wait for applyToDataFilesNotify.
- applyToDataFilesNotify.waitFor(when);
+void DurableImpl::createdFile(const std::string& filename, unsigned long long len) {
+ std::shared_ptr<DurOp> op(new FileCreatedOp(filename, len));
+ commitJob.noteOp(op);
+}
- return true;
- }
- bool DurableImpl::waitUntilDurable() {
- commitNotify.awaitBeyondNow();
- return true;
+void DurableImpl::declareWriteIntents(const std::vector<std::pair<void*, unsigned>>& intents) {
+ typedef std::vector<std::pair<void*, unsigned>> Intents;
+ stdx::lock_guard<SimpleMutex> lk(commitJob.groupCommitMutex);
+ for (Intents::const_iterator it(intents.begin()), end(intents.end()); it != end; ++it) {
+ commitJob.note(it->first, it->second);
}
+}
- void DurableImpl::createdFile(const std::string& filename, unsigned long long len) {
- std::shared_ptr<DurOp> op(new FileCreatedOp(filename, len));
- commitJob.noteOp(op);
+bool DurableImpl::commitIfNeeded() {
+ if (MONGO_likely(commitJob.bytes() < UncommittedBytesLimit)) {
+ return false;
}
+ // Just wake up the flush thread
+ flushRequested.notify_one();
+ return true;
+}
- void DurableImpl::declareWriteIntents(
- const std::vector<std::pair<void*, unsigned> >& intents) {
- typedef std::vector<std::pair<void*, unsigned> > Intents;
- stdx::lock_guard<SimpleMutex> lk(commitJob.groupCommitMutex);
- for (Intents::const_iterator it(intents.begin()), end(intents.end()); it != end; ++it) {
- commitJob.note(it->first, it->second);
- }
- }
-
- bool DurableImpl::commitIfNeeded() {
- if (MONGO_likely(commitJob.bytes() < UncommittedBytesLimit)) {
- return false;
- }
+void DurableImpl::syncDataAndTruncateJournal(OperationContext* txn) {
+ invariant(txn->lockState()->isW());
- // Just wake up the flush thread
- flushRequested.notify_one();
- return true;
- }
+ // Once this returns, all the outstanding journal has been applied to the data files and
+ // so it's safe to do the flushAll/journalCleanup below.
+ commitNow(txn);
- void DurableImpl::syncDataAndTruncateJournal(OperationContext* txn) {
- invariant(txn->lockState()->isW());
+ // Flush the shared view to disk.
+ MongoFile::flushAll(true);
- // Once this returns, all the outstanding journal has been applied to the data files and
- // so it's safe to do the flushAll/journalCleanup below.
- commitNow(txn);
+ // Once the shared view has been flushed, we do not need the journal files anymore.
+ journalCleanup(true);
- // Flush the shared view to disk.
- MongoFile::flushAll(true);
+ // Double check post-conditions
+ invariant(!haveJournalFiles());
+}
- // Once the shared view has been flushed, we do not need the journal files anymore.
- journalCleanup(true);
+void DurableImpl::closingFileNotification() {
+ if (commitJob.hasWritten()) {
+ severe() << "journal warning files are closing outside locks with writes pending";
- // Double check post-conditions
- invariant(!haveJournalFiles());
+ // File is closing while there are unwritten changes
+ invariant(false);
}
+}
- void DurableImpl::closingFileNotification() {
- if (commitJob.hasWritten()) {
- severe() << "journal warning files are closing outside locks with writes pending";
+void DurableImpl::commitAndStopDurThread() {
+ NotifyAll::When when = commitNotify.now();
- // File is closing while there are unwritten changes
- invariant(false);
- }
- }
+ // There is always just one waiting anyways
+ flushRequested.notify_one();
- void DurableImpl::commitAndStopDurThread() {
- NotifyAll::When when = commitNotify.now();
+ // commitNotify.waitFor ensures that whatever was scheduled for journaling before this
+ // call has been persisted to the journal file. This does not mean that this data has been
+    // applied to the shared view yet, though; that's why we wait for applyToDataFilesNotify.
+ applyToDataFilesNotify.waitFor(when);
- // There is always just one waiting anyways
- flushRequested.notify_one();
+ // Flush the shared view to disk.
+ MongoFile::flushAll(true);
- // commitNotify.waitFor ensures that whatever was scheduled for journaling before this
- // call has been persisted to the journal file. This does not mean that this data has been
- // applied to the shared view yet though, that's why we wait for applyToDataFilesNotify.
- applyToDataFilesNotify.waitFor(when);
+ // Once the shared view has been flushed, we do not need the journal files anymore.
+ journalCleanup(true);
- // Flush the shared view to disk.
- MongoFile::flushAll(true);
+ // Double check post-conditions
+ invariant(!haveJournalFiles());
- // Once the shared view has been flushed, we do not need the journal files anymore.
- journalCleanup(true);
+ shutdownRequested.store(1);
- // Double check post-conditions
- invariant(!haveJournalFiles());
+ // Wait for the durability thread to terminate
+ log() << "Terminating durability thread ...";
+ _durThreadHandle.join();
+}
- shutdownRequested.store(1);
+void DurableImpl::start() {
+ // Start the durability thread
+ stdx::thread t(durThread);
+ _durThreadHandle.swap(t);
+}
- // Wait for the durability thread to terminate
- log() << "Terminating durability thread ...";
- _durThreadHandle.join();
- }
- void DurableImpl::start() {
- // Start the durability thread
- stdx::thread t(durThread);
- _durThreadHandle.swap(t);
+/**
+ * Remaps the private view from the shared view so that it does not consume too much
+ * copy-on-write/swap space. Must only be called after the in-memory journal has been flushed
+ * to disk and applied on top of the shared view.
+ *
+ * @param fraction Value between (0, 1] indicating what fraction of the memory to remap.
+ * Remapping too much or too frequently incurs copy-on-write page fault cost.
+ */
+static void remapPrivateView(double fraction) {
+    // Remapping private views must occur after WRITETODATAFILES; otherwise we wouldn't see any
+ // newly written data on reads.
+ invariant(!commitJob.hasWritten());
+
+ try {
+ Timer t;
+ remapPrivateViewImpl(fraction);
+ stats.curr()->_remapPrivateViewMicros += t.micros();
+
+ LOG(4) << "remapPrivateView end";
+ return;
+ } catch (DBException& e) {
+ severe() << "dbexception in remapPrivateView causing immediate shutdown: " << e.toString();
+ } catch (std::ios_base::failure& e) {
+ severe() << "ios_base exception in remapPrivateView causing immediate shutdown: "
+ << e.what();
+ } catch (std::bad_alloc& e) {
+ severe() << "bad_alloc exception in remapPrivateView causing immediate shutdown: "
+ << e.what();
+ } catch (std::exception& e) {
+ severe() << "exception in remapPrivateView causing immediate shutdown: " << e.what();
+ } catch (...) {
+        severe() << "unknown exception in remapPrivateView causing immediate shutdown";
}
+ invariant(false);
+}
- /**
- * Remaps the private view from the shared view so that it does not consume too much
- * copy-on-write/swap space. Must only be called after the in-memory journal has been flushed
- * to disk and applied on top of the shared view.
- *
- * @param fraction Value between (0, 1] indicating what fraction of the memory to remap.
- * Remapping too much or too frequently incurs copy-on-write page fault cost.
- */
- static void remapPrivateView(double fraction) {
- // Remapping private views must occur after WRITETODATAFILES otherwise we wouldn't see any
- // newly written data on reads.
- invariant(!commitJob.hasWritten());
- try {
- Timer t;
- remapPrivateViewImpl(fraction);
- stats.curr()->_remapPrivateViewMicros += t.micros();
+/**
+ * The main durability thread loop. There is a single instance of this function running.
+ */
+static void durThread() {
+ Client::initThread("durability");
- LOG(4) << "remapPrivateView end";
- return;
- }
- catch (DBException& e) {
- severe() << "dbexception in remapPrivateView causing immediate shutdown: "
- << e.toString();
- }
- catch (std::ios_base::failure& e) {
- severe() << "ios_base exception in remapPrivateView causing immediate shutdown: "
- << e.what();
- }
- catch (std::bad_alloc& e) {
- severe() << "bad_alloc exception in remapPrivateView causing immediate shutdown: "
- << e.what();
- }
- catch (std::exception& e) {
- severe() << "exception in remapPrivateView causing immediate shutdown: "
- << e.what();
- }
- catch (...) {
- severe() << "unknown exception in remapPrivateView causing immediate shutdown: ";
- }
+ log() << "Durability thread started";
- invariant(false);
+ bool samePartition = true;
+ try {
+ const std::string dbpathDir = boost::filesystem::path(storageGlobalParams.dbpath).string();
+ samePartition = onSamePartition(getJournalDir().string(), dbpathDir);
+ } catch (...) {
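+        // Best effort only: on failure, keep the default assumption that the journal
+        // and the data files reside on the same partition.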
}
+ // Spawn the journal writer thread
+ JournalWriter journalWriter(&commitNotify, &applyToDataFilesNotify, NumAsyncJournalWrites);
+ journalWriter.start();
- /**
- * The main durability thread loop. There is a single instance of this function running.
- */
- static void durThread() {
- Client::initThread("durability");
+ // Used as an estimate of how much / how fast to remap
+ uint64_t commitCounter(0);
+ uint64_t estimatedPrivateMapSize(0);
+ uint64_t remapLastTimestamp(0);
- log() << "Durability thread started";
-
- bool samePartition = true;
- try {
- const std::string dbpathDir =
- boost::filesystem::path(storageGlobalParams.dbpath).string();
- samePartition = onSamePartition(getJournalDir().string(), dbpathDir);
+ while (shutdownRequested.loadRelaxed() == 0) {
+ unsigned ms = mmapv1GlobalOptions.journalCommitInterval;
+ if (ms == 0) {
+ ms = samePartition ? 100 : 30;
}
- catch(...) {
- }
+ // +1 so it never goes down to zero
+ const unsigned oneThird = (ms / 3) + 1;
- // Spawn the journal writer thread
- JournalWriter journalWriter(&commitNotify, &applyToDataFilesNotify, NumAsyncJournalWrites);
- journalWriter.start();
+ // Reset the stats based on the reset interval
+ if (stats.curr()->getCurrentDurationMillis() > DurStatsResetIntervalMillis) {
+ stats.reset();
+ }
- // Used as an estimate of how much / how fast to remap
- uint64_t commitCounter(0);
- uint64_t estimatedPrivateMapSize(0);
- uint64_t remapLastTimestamp(0);
+ try {
+ stdx::unique_lock<stdx::mutex> lock(flushMutex);
- while (shutdownRequested.loadRelaxed() == 0) {
- unsigned ms = mmapv1GlobalOptions.journalCommitInterval;
- if (ms == 0) {
- ms = samePartition ? 100 : 30;
- }
+ for (unsigned i = 0; i <= 2; i++) {
+ if (boost::cv_status::no_timeout ==
+ flushRequested.wait_for(lock, Milliseconds(oneThird))) {
+ // Someone forced a flush
+ break;
+ }
- // +1 so it never goes down to zero
- const unsigned oneThird = (ms / 3) + 1;
+ if (commitNotify.nWaiting()) {
+ // One or more getLastError j:true is pending
+ break;
+ }
- // Reset the stats based on the reset interval
- if (stats.curr()->getCurrentDurationMillis() > DurStatsResetIntervalMillis) {
- stats.reset();
+ if (commitJob.bytes() > UncommittedBytesLimit / 2) {
+ // The number of written bytes is growing
+ break;
+ }
}
- try {
- stdx::unique_lock<stdx::mutex> lock(flushMutex);
+ // The commit logic itself
+ LOG(4) << "groupCommit begin";
- for (unsigned i = 0; i <= 2; i++) {
- if (boost::cv_status::no_timeout == flushRequested.wait_for(
- lock, Milliseconds(oneThird))) {
- // Someone forced a flush
- break;
- }
+ Timer t;
- if (commitNotify.nWaiting()) {
- // One or more getLastError j:true is pending
- break;
+ OperationContextImpl txn;
+ AutoAcquireFlushLockForMMAPV1Commit autoFlushLock(txn.lockState());
+
+ // We need to snapshot the commitNumber after the flush lock has been obtained,
+ // because at this point we know that we have a stable snapshot of the data.
+ const NotifyAll::When commitNumber(commitNotify.now());
+
+ LOG(4) << "Processing commit number " << commitNumber;
+
+ if (!commitJob.hasWritten()) {
+ // We do not need the journal lock anymore. Free it here, for the really
+ // unlikely possibility that the writeBuffer command below blocks.
+ autoFlushLock.release();
+
+                // getlasterror request could have come after the data was already committed.
+ // No need to call committingReset though, because we have not done any
+ // writes (hasWritten == false).
+ JournalWriter::Buffer* const buffer = journalWriter.newBuffer();
+ buffer->setNoop();
+
+ journalWriter.writeBuffer(buffer, commitNumber);
+ } else {
+ // This copies all the in-memory changes into the journal writer's buffer.
+ JournalWriter::Buffer* const buffer = journalWriter.newBuffer();
+ PREPLOGBUFFER(buffer->getHeader(), buffer->getBuilder());
+
+ estimatedPrivateMapSize += commitJob.bytes();
+ commitCounter++;
+
+ // Now that the write intents have been copied to the buffer, the commit job is
+ // free to be reused. We need to reset the commit job's contents while under
+ // the S flush lock, because otherwise someone might have done a write and this
+ // would wipe out their changes without ever being committed.
+ commitJob.committingReset();
+
+ double systemMemoryPressurePercentage =
+ ProcessInfo::getSystemMemoryPressurePercentage();
+
+ // Now that the in-memory modifications have been collected, we can potentially
+ // release the flush lock if remap is not necessary.
+ // When we remap due to memory pressure, we look at two criteria
+ // 1. If the amount of 4k pages touched exceeds 512 MB,
+ // a reasonable estimate of memory pressure on Linux.
+ // 2. Check if the amount of free memory on the machine is running low,
+                //    since #1 underestimates the memory pressure on Windows, where
+                //    commits happen in 64MB chunks.
+ const bool shouldRemap = (estimatedPrivateMapSize >= UncommittedBytesLimit) ||
+ (systemMemoryPressurePercentage > 0.0) ||
+ (commitCounter % NumCommitsBeforeRemap == 0) ||
+ (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalAlwaysRemap);
+
+ double remapFraction = 0.0;
+
+ if (shouldRemap) {
+ // We want to remap all private views about every 2 seconds. There could be
+                    // ~1000 views so we do a little each pass. There will be copy-on-write
+ // faults after remapping, so doing a little bit at a time will avoid big
+ // load spikes when the pages are touched.
+ //
+ // TODO: Instead of the time-based logic above, consider using ProcessInfo
+ // and watching for getResidentSize to drop, which is more precise.
+ remapFraction = (curTimeMicros64() - remapLastTimestamp) / 2000000.0;
+
+ if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalAlwaysRemap) {
+ remapFraction = 1;
+ } else {
+ // We don't want to get close to the UncommittedBytesLimit
+ const double remapMemFraction =
+ estimatedPrivateMapSize / ((double)UncommittedBytesLimit);
+
+ remapFraction = std::max(remapMemFraction, remapFraction);
+
+ remapFraction = std::max(systemMemoryPressurePercentage, remapFraction);
}
+ } else {
+ LOG(4) << "Early release flush lock";
- if (commitJob.bytes() > UncommittedBytesLimit / 2) {
- // The number of written bytes is growing
- break;
- }
+ // We will not be doing a remap so drop the flush lock. That way we will be
+                    // doing the journal I/O outside of the lock, so other threads can proceed.
+ invariant(!shouldRemap);
+ autoFlushLock.release();
}
- // The commit logic itself
- LOG(4) << "groupCommit begin";
+ // Request async I/O to the journal. This may block.
+ journalWriter.writeBuffer(buffer, commitNumber);
+
+ // Data has now been written to the shared view. If remap was requested, we
+ // would still be holding the S flush lock here, so just upgrade it and
+ // perform the remap.
+ if (shouldRemap) {
+ // Need to wait for the previously scheduled journal writes to complete
+ // before any remap is attempted.
+ journalWriter.flush();
+ journalWriter.assertIdle();
+
+ // Upgrading the journal lock to flush stops all activity on the system,
+ // because we will be remapping memory and we don't want readers to be
+                    // accessing it. Technically this step could be avoided on systems that
+ // support atomic remap.
+ autoFlushLock.upgradeFlushLockToExclusive();
+ remapPrivateView(remapFraction);
- Timer t;
-
- OperationContextImpl txn;
- AutoAcquireFlushLockForMMAPV1Commit autoFlushLock(txn.lockState());
-
- // We need to snapshot the commitNumber after the flush lock has been obtained,
- // because at this point we know that we have a stable snapshot of the data.
- const NotifyAll::When commitNumber(commitNotify.now());
-
- LOG(4) << "Processing commit number " << commitNumber;
-
- if (!commitJob.hasWritten()) {
- // We do not need the journal lock anymore. Free it here, for the really
- // unlikely possibility that the writeBuffer command below blocks.
autoFlushLock.release();
- // getlasterror request could have came after the data was already committed.
- // No need to call committingReset though, because we have not done any
- // writes (hasWritten == false).
- JournalWriter::Buffer* const buffer = journalWriter.newBuffer();
- buffer->setNoop();
-
- journalWriter.writeBuffer(buffer, commitNumber);
- }
- else {
- // This copies all the in-memory changes into the journal writer's buffer.
- JournalWriter::Buffer* const buffer = journalWriter.newBuffer();
- PREPLOGBUFFER(buffer->getHeader(), buffer->getBuilder());
-
- estimatedPrivateMapSize += commitJob.bytes();
- commitCounter++;
-
- // Now that the write intents have been copied to the buffer, the commit job is
- // free to be reused. We need to reset the commit job's contents while under
- // the S flush lock, because otherwise someone might have done a write and this
- // would wipe out their changes without ever being committed.
- commitJob.committingReset();
-
- double systemMemoryPressurePercentage =
- ProcessInfo::getSystemMemoryPressurePercentage();
-
- // Now that the in-memory modifications have been collected, we can potentially
- // release the flush lock if remap is not necessary.
- // When we remap due to memory pressure, we look at two criteria
- // 1. If the amount of 4k pages touched exceeds 512 MB,
- // a reasonable estimate of memory pressure on Linux.
- // 2. Check if the amount of free memory on the machine is running low,
- // since #1 is underestimates the memory pressure on Windows since
- // commits in 64MB chunks.
- const bool shouldRemap =
- (estimatedPrivateMapSize >= UncommittedBytesLimit) ||
- (systemMemoryPressurePercentage > 0.0) ||
- (commitCounter % NumCommitsBeforeRemap == 0) ||
- (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalAlwaysRemap);
-
- double remapFraction = 0.0;
-
- if (shouldRemap) {
- // We want to remap all private views about every 2 seconds. There could be
- // ~1000 views so we do a little each pass. There will be copy on write
- // faults after remapping, so doing a little bit at a time will avoid big
- // load spikes when the pages are touched.
- //
- // TODO: Instead of the time-based logic above, consider using ProcessInfo
- // and watching for getResidentSize to drop, which is more precise.
- remapFraction = (curTimeMicros64() - remapLastTimestamp) / 2000000.0;
-
- if (mmapv1GlobalOptions.journalOptions &
- MMAPV1Options::JournalAlwaysRemap) {
- remapFraction = 1;
- }
- else {
- // We don't want to get close to the UncommittedBytesLimit
- const double remapMemFraction =
- estimatedPrivateMapSize / ((double)UncommittedBytesLimit);
-
- remapFraction = std::max(remapMemFraction, remapFraction);
-
- remapFraction = std::max(systemMemoryPressurePercentage, remapFraction);
- }
- }
- else {
- LOG(4) << "Early release flush lock";
-
- // We will not be doing a remap so drop the flush lock. That way we will be
- // doing the journal I/O outside of lock, so other threads can proceed.
- invariant(!shouldRemap);
- autoFlushLock.release();
- }
+ // Reset the private map estimate outside of the lock
+ estimatedPrivateMapSize = 0;
+ remapLastTimestamp = curTimeMicros64();
- // Request async I/O to the journal. This may block.
- journalWriter.writeBuffer(buffer, commitNumber);
-
- // Data has now been written to the shared view. If remap was requested, we
- // would still be holding the S flush lock here, so just upgrade it and
- // perform the remap.
- if (shouldRemap) {
- // Need to wait for the previously scheduled journal writes to complete
- // before any remap is attempted.
- journalWriter.flush();
- journalWriter.assertIdle();
-
- // Upgrading the journal lock to flush stops all activity on the system,
- // because we will be remapping memory and we don't want readers to be
- // accessing it. Technically this step could be avoided on systems, which
- // support atomic remap.
- autoFlushLock.upgradeFlushLockToExclusive();
- remapPrivateView(remapFraction);
-
- autoFlushLock.release();
-
- // Reset the private map estimate outside of the lock
- estimatedPrivateMapSize = 0;
- remapLastTimestamp = curTimeMicros64();
-
- stats.curr()->_commitsInWriteLock++;
- stats.curr()->_commitsInWriteLockMicros += t.micros();
- }
+ stats.curr()->_commitsInWriteLock++;
+ stats.curr()->_commitsInWriteLockMicros += t.micros();
}
-
- stats.curr()->_commits++;
- stats.curr()->_commitsMicros += t.micros();
-
- LOG(4) << "groupCommit end";
- }
- catch (DBException& e) {
- severe() << "dbexception in durThread causing immediate shutdown: "
- << e.toString();
- invariant(false);
- }
- catch (std::ios_base::failure& e) {
- severe() << "ios_base exception in durThread causing immediate shutdown: "
- << e.what();
- invariant(false);
}
- catch (std::bad_alloc& e) {
- severe() << "bad_alloc exception in durThread causing immediate shutdown: "
- << e.what();
- invariant(false);
- }
- catch (std::exception& e) {
- severe() << "exception in durThread causing immediate shutdown: "
- << e.what();
- invariant(false);
- }
- catch (...) {
- severe() << "unhandled exception in durThread causing immediate shutdown";
- invariant(false);
- }
- }
- // Stops the journal thread and ensures everything was written
- invariant(!commitJob.hasWritten());
+ stats.curr()->_commits++;
+ stats.curr()->_commitsMicros += t.micros();
- journalWriter.flush();
- journalWriter.shutdown();
-
- log() << "Durability thread stopped";
+ LOG(4) << "groupCommit end";
+ } catch (DBException& e) {
+ severe() << "dbexception in durThread causing immediate shutdown: " << e.toString();
+ invariant(false);
+ } catch (std::ios_base::failure& e) {
+ severe() << "ios_base exception in durThread causing immediate shutdown: " << e.what();
+ invariant(false);
+ } catch (std::bad_alloc& e) {
+ severe() << "bad_alloc exception in durThread causing immediate shutdown: " << e.what();
+ invariant(false);
+ } catch (std::exception& e) {
+ severe() << "exception in durThread causing immediate shutdown: " << e.what();
+ invariant(false);
+ } catch (...) {
+ severe() << "unhandled exception in durThread causing immediate shutdown";
+ invariant(false);
+ }
}
+ // Stops the journal thread and ensures everything was written
+ invariant(!commitJob.hasWritten());
- /**
- * Invoked at server startup. Recovers the database by replaying journal files and then
- * starts the durability thread.
- */
- void startup() {
- if (!storageGlobalParams.dur) {
- return;
- }
+ journalWriter.flush();
+ journalWriter.shutdown();
- journalMakeDir();
+ log() << "Durability thread stopped";
+}
- try {
- replayJournalFilesAtStartup();
- }
- catch (DBException& e) {
- severe() << "dbexception during recovery: " << e.toString();
- throw;
- }
- catch (std::exception& e) {
- severe() << "std::exception during recovery: " << e.what();
- throw;
- }
- catch (...) {
- severe() << "exception during recovery";
- throw;
- }
- preallocateFiles();
+/**
+ * Invoked at server startup. Recovers the database by replaying journal files and then
+ * starts the durability thread.
+ */
+void startup() {
+ if (!storageGlobalParams.dur) {
+ return;
+ }
- durableImpl.start();
- DurableInterface::_impl = &durableImpl;
+ journalMakeDir();
+
+ try {
+ replayJournalFilesAtStartup();
+ } catch (DBException& e) {
+ severe() << "dbexception during recovery: " << e.toString();
+ throw;
+ } catch (std::exception& e) {
+ severe() << "std::exception during recovery: " << e.what();
+ throw;
+ } catch (...) {
+ severe() << "exception during recovery";
+ throw;
}
-} // namespace dur
-} // namespace mongo
+ preallocateFiles();
+
+ durableImpl.start();
+ DurableInterface::_impl = &durableImpl;
+}
+
+} // namespace dur
+} // namespace mongo
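
A minimal standalone sketch (illustrative only, not part of this change) of the
fraction-based round-robin scheduling that remapPrivateViewImpl() implements above:
each pass clamps ntodo to [1, size] and advances the remembered start position so
every mapped file is eventually visited.

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> files(10);  // stand-ins for the memory mapped files
    unsigned startAt = 0;        // persists across passes, like remapFileToStartAt

    const double fraction = 0.3;  // remap roughly 30% of the files per pass
    for (int pass = 0; pass < 4; pass++) {
        unsigned ntodo = static_cast<unsigned>(files.size() * fraction);
        ntodo = std::max(1u, std::min(ntodo, static_cast<unsigned>(files.size())));

        std::printf("pass %d remaps files:", pass);
        for (unsigned x = 0; x < ntodo; x++) {
            std::printf(" %u", (startAt + x) % static_cast<unsigned>(files.size()));
        }
        std::printf("\n");

        // Remember where to start on the next cycle, as the function above does
        startAt = (startAt + ntodo) % static_cast<unsigned>(files.size());
    }
    return 0;
}
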
diff --git a/src/mongo/db/storage/mmap_v1/dur.h b/src/mongo/db/storage/mmap_v1/dur.h
index 2915ece1439..7cfd46fada3 100644
--- a/src/mongo/db/storage/mmap_v1/dur.h
+++ b/src/mongo/db/storage/mmap_v1/dur.h
@@ -35,126 +35,130 @@
namespace mongo {
- class OperationContext;
+class OperationContext;
namespace dur {
- // a smaller limit is likely better on 32 bit
- const unsigned UncommittedBytesLimit = (sizeof(void*) == 4) ? 50 * 1024 * 1024 : 512 * 1024 * 1024;
+// a smaller limit is likely better on 32 bit
+const unsigned UncommittedBytesLimit = (sizeof(void*) == 4) ? 50 * 1024 * 1024 : 512 * 1024 * 1024;
- class DurableInterface {
- MONGO_DISALLOW_COPYING(DurableInterface);
- public:
- virtual ~DurableInterface();
+class DurableInterface {
+ MONGO_DISALLOW_COPYING(DurableInterface);
- /**
- * Declare that a file has been created. Normally writes are applied only after journaling
- * for safety. But here the file is created first, and the journal will just replay the
- * creation if the create didn't happen due to a crash.
- */
- virtual void createdFile(const std::string& filename, unsigned long long len) = 0;
+public:
+ virtual ~DurableInterface();
- // Declare write intents. Use these methods to declare "i'm about to write to x and it
- // should be logged for redo."
- //
- // Failure to call declare write intents is checked in MONGO_CONFIG_DEBUG_BUILD mode by
- // using a read only mapped view (i.e., you'll segfault if the code is covered in that
- // situation). The debug check doesn't verify that your length is correct though.
- virtual void declareWriteIntents(
- const std::vector<std::pair<void*, unsigned> >& intents) = 0;
+ /**
+ * Declare that a file has been created. Normally writes are applied only after journaling
+ * for safety. But here the file is created first, and the journal will just replay the
+ * creation if the create didn't happen due to a crash.
+ */
+ virtual void createdFile(const std::string& filename, unsigned long long len) = 0;
- /** Wait for acknowledgement of the next group commit.
- @return true if --dur is on. There will be delay.
- @return false if --dur is off.
- */
- virtual bool waitUntilDurable() = 0;
+    // Declare write intents. Use these methods to declare "I'm about to write to x and it
+    // should be logged for redo."
+    //
+    // Failure to declare write intents is checked in MONGO_CONFIG_DEBUG_BUILD mode by
+    // using a read-only mapped view (i.e., you'll segfault if the code is covered in that
+    // situation). The debug check doesn't verify that your length is correct, though.
+ virtual void declareWriteIntents(const std::vector<std::pair<void*, unsigned>>& intents) = 0;
- /** Commit immediately.
+ /** Wait for acknowledgement of the next group commit.
+ @return true if --dur is on. There will be delay.
+ @return false if --dur is off.
+ */
+ virtual bool waitUntilDurable() = 0;
- Generally, you do not want to do this often, as highly granular committing may affect
- performance.
+ /** Commit immediately.
- Does not return until the commit is complete.
+ Generally, you do not want to do this often, as highly granular committing may affect
+ performance.
- You must be at least read locked when you call this. Ideally, you are not write locked
- and then read operations can occur concurrently.
+ Does not return until the commit is complete.
- Do not use this. Use commitIfNeeded() instead.
+ You must be at least read locked when you call this. Ideally, you are not write locked
+ and then read operations can occur concurrently.
- @return true if --dur is on.
- @return false if --dur is off. (in which case there is action)
- */
- virtual bool commitNow(OperationContext* txn) = 0;
+ Do not use this. Use commitIfNeeded() instead.
- /** Commit if enough bytes have been modified. Current threshold is 50MB
+ @return true if --dur is on.
+ @return false if --dur is off. (in which case there is action)
+ */
+ virtual bool commitNow(OperationContext* txn) = 0;
- The idea is that long running write operations that don't yield
- (like creating an index or update with $atomic) can call this
- whenever the db is in a sane state and it will prevent commits
- from growing too large.
- @return true if commited
- */
- virtual bool commitIfNeeded() = 0;
+ /** Commit if enough bytes have been modified. Current threshold is 50MB
+ The idea is that long running write operations that don't yield
+ (like creating an index or update with $atomic) can call this
+ whenever the db is in a sane state and it will prevent commits
+ from growing too large.
+       @return true if committed
+ */
+ virtual bool commitIfNeeded() = 0;
- /**
- * Called when a DurableMappedFile is closing. Asserts that there are no unwritten changes,
- * because that would mean journal replay on recovery would try to write to non-existent
- * files and fail.
- */
- virtual void closingFileNotification() = 0;
- /**
- * Invoked at clean shutdown time. Performs one last commit/flush and terminates the
- * flush thread.
- *
- * Must be called under the global X lock.
- */
- virtual void commitAndStopDurThread() = 0;
+ /**
+ * Called when a DurableMappedFile is closing. Asserts that there are no unwritten changes,
+ * because that would mean journal replay on recovery would try to write to non-existent
+ * files and fail.
+ */
+ virtual void closingFileNotification() = 0;
- /**
- * Commits pending changes, flushes all changes to main data files, then removes the
- * journal.
- *
- * WARNING: Data *must* be in a crash-recoverable state when this is called and must
- * not be inside of a write unit of work.
- *
- * This is useful as a "barrier" to ensure that writes before this call will never go
- * through recovery and be applied to files that have had changes made after this call
- * applied.
- */
- virtual void syncDataAndTruncateJournal(OperationContext* txn) = 0;
+ /**
+ * Invoked at clean shutdown time. Performs one last commit/flush and terminates the
+ * flush thread.
+ *
+ * Must be called under the global X lock.
+ */
+ virtual void commitAndStopDurThread() = 0;
- virtual bool isDurable() const = 0;
+ /**
+ * Commits pending changes, flushes all changes to main data files, then removes the
+ * journal.
+ *
+ * WARNING: Data *must* be in a crash-recoverable state when this is called and must
+ * not be inside of a write unit of work.
+ *
+ * This is useful as a "barrier" to ensure that writes before this call will never go
+ * through recovery and be applied to files that have had changes made after this call
+ * applied.
+ */
+ virtual void syncDataAndTruncateJournal(OperationContext* txn) = 0;
- static DurableInterface& getDur() { return *_impl; }
+ virtual bool isDurable() const = 0;
- protected:
- DurableInterface();
+ static DurableInterface& getDur() {
+ return *_impl;
+ }
- private:
- friend void startup();
+protected:
+ DurableInterface();
- static DurableInterface* _impl;
- };
+private:
+ friend void startup();
+ static DurableInterface* _impl;
+};
- /**
- * Called during startup to startup the durability module.
- * Does nothing if storageGlobalParams.dur is false
- */
- void startup();
-} // namespace dur
+/**
+ * Called during startup to start up the durability module.
+ * Does nothing if storageGlobalParams.dur is false.
+ */
+void startup();
+} // namespace dur
- /**
- * Provides a reference to the active durability interface.
- *
- * TODO: The only reason this is an inline function is that tests try to link it and fail if
- * the MMAP V1 engine is not included.
- */
- inline dur::DurableInterface& getDur() { return dur::DurableInterface::getDur(); }
-} // namespace mongo
+/**
+ * Provides a reference to the active durability interface.
+ *
+ * TODO: The only reason this is an inline function is that tests try to link it and fail if
+ * the MMAP V1 engine is not included.
+ */
+inline dur::DurableInterface& getDur() {
+ return dur::DurableInterface::getDur();
+}
+
+} // namespace mongo
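
To make the declare-then-write discipline documented above concrete, here is a
hypothetical caller sketch (zeroRegion and privateView are invented for
illustration; only functions this header declares are used):

#include <cstring>
#include <utility>
#include <vector>

#include "mongo/db/storage/mmap_v1/dur.h"

// Zero a region of the privately mapped view in a journal-safe way: declare the
// write intent first so the change can be redone from the journal after a crash.
void zeroRegion(char* privateView, unsigned len) {
    std::vector<std::pair<void*, unsigned>> intents;
    intents.emplace_back(privateView, len);

    mongo::getDur().declareWriteIntents(intents);  // log the intent before writing
    std::memset(privateView, 0, len);              // now the write can be replayed

    // Let the durability layer group-commit early if enough bytes accumulated.
    mongo::getDur().commitIfNeeded();
}
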
diff --git a/src/mongo/db/storage/mmap_v1/dur_commitjob.cpp b/src/mongo/db/storage/mmap_v1/dur_commitjob.cpp
index 27e7681b17c..aff01c1c7bf 100644
--- a/src/mongo/db/storage/mmap_v1/dur_commitjob.cpp
+++ b/src/mongo/db/storage/mmap_v1/dur_commitjob.cpp
@@ -44,83 +44,76 @@
namespace mongo {
- using std::shared_ptr;
- using std::endl;
- using std::max;
- using std::min;
+using std::shared_ptr;
+using std::endl;
+using std::max;
+using std::min;
namespace dur {
- void WriteIntent::absorb(const WriteIntent& other) {
- dassert(overlaps(other));
+void WriteIntent::absorb(const WriteIntent& other) {
+ dassert(overlaps(other));
- void* newStart = min(start(), other.start());
- p = max(p, other.p);
- len = (char*)p - (char*)newStart;
+ void* newStart = min(start(), other.start());
+ p = max(p, other.p);
+ len = (char*)p - (char*)newStart;
- dassert(contains(other));
- }
+ dassert(contains(other));
+}
- CommitJob::CommitJob() :
- _hasWritten(false),
- _lastNotedPos(0),
- _bytes(0) {
+CommitJob::CommitJob() : _hasWritten(false), _lastNotedPos(0), _bytes(0) {}
- }
+CommitJob::~CommitJob() {}
- CommitJob::~CommitJob() {
+void CommitJob::noteOp(shared_ptr<DurOp> p) {
+ stdx::lock_guard<SimpleMutex> lk(groupCommitMutex);
+ _hasWritten = true;
+ _durOps.push_back(p);
+}
- }
-
- void CommitJob::noteOp(shared_ptr<DurOp> p) {
- stdx::lock_guard<SimpleMutex> lk(groupCommitMutex);
- _hasWritten = true;
- _durOps.push_back(p);
- }
+void CommitJob::note(void* p, int len) {
+ _hasWritten = true;
- void CommitJob::note(void* p, int len) {
- _hasWritten = true;
+ if (!_alreadyNoted.checkAndSet(p, len)) {
+ // Remember intent. We will journal it in a bit.
+ _insertWriteIntent(p, len);
- if (!_alreadyNoted.checkAndSet(p, len)) {
- // Remember intent. We will journal it in a bit.
- _insertWriteIntent(p, len);
+ // Round off to page address (4KB).
+ const size_t x = ((size_t)p) & ~0xfff;
- // Round off to page address (4KB).
- const size_t x = ((size_t)p) & ~0xfff;
+ if (x != _lastNotedPos) {
+ _lastNotedPos = x;
- if (x != _lastNotedPos) {
- _lastNotedPos = x;
+ // Add the full page amount
+ _bytes += (len + 4095) & ~0xfff;
- // Add the full page amount
- _bytes += (len + 4095) & ~0xfff;
+ if (_bytes > UncommittedBytesLimit * 3) {
+ _complains++;
- if (_bytes > UncommittedBytesLimit * 3) {
- _complains++;
+ // Throttle logging
+ if (_complains < 100 || (curTimeMillis64() - _lastComplainMs >= 60000)) {
+ _lastComplainMs = curTimeMillis64();
- // Throttle logging
- if (_complains < 100 || (curTimeMillis64() - _lastComplainMs >= 60000)) {
- _lastComplainMs = curTimeMillis64();
+ warning() << "DR102 too much data written uncommitted (" << _bytes / 1000000.0
+ << "MB)";
- warning() << "DR102 too much data written uncommitted ("
- << _bytes / 1000000.0 << "MB)";
-
- if (_complains < 10 || _complains % 10 == 0) {
- printStackTrace();
- }
+ if (_complains < 10 || _complains % 10 == 0) {
+ printStackTrace();
}
}
}
}
}
-
- void CommitJob::committingReset() {
- _hasWritten = false;
- _alreadyNoted.clear();
- _intents.clear();
- _durOps.clear();
- _bytes = 0;
- }
-
-} // namespace "dur"
-} // namespace "mongo"
+}
+
+void CommitJob::committingReset() {
+ _hasWritten = false;
+ _alreadyNoted.clear();
+ _intents.clear();
+ _durOps.clear();
+ _bytes = 0;
+}
+
+} // namespace "dur"
+} // namespace "mongo"
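
The bit arithmetic in CommitJob::note() above is easy to misread, so this
standalone sketch (illustrative only) isolates the 4KB page rounding:

#include <cstdio>

int main() {
    const size_t p = 0x12345678;  // an example write address
    const int len = 100;          // an example write length

    // Round the address down to its 4KB page, as note() does for _lastNotedPos
    const size_t pageAddr = p & ~static_cast<size_t>(0xfff);

    // Charge the write in whole 4KB pages, as note() does when growing _bytes
    const size_t charged = (len + 4095) & ~static_cast<size_t>(0xfff);

    std::printf("page address: 0x%zx, bytes charged: %zu\n", pageAddr, charged);
    return 0;
}
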
diff --git a/src/mongo/db/storage/mmap_v1/dur_commitjob.h b/src/mongo/db/storage/mmap_v1/dur_commitjob.h
index b2d07c3b293..8261b613c57 100644
--- a/src/mongo/db/storage/mmap_v1/dur_commitjob.h
+++ b/src/mongo/db/storage/mmap_v1/dur_commitjob.h
@@ -35,179 +35,191 @@
namespace mongo {
namespace dur {
- typedef std::vector<std::shared_ptr<DurOp> > DurOpsVector;
+typedef std::vector<std::shared_ptr<DurOp>> DurOpsVector;
- /**
- * Declaration of an intent to write to a region of a memory mapped view. We store the end
- * rather than the start pointer to make operator < faster since that is heavily used in
- * set lookup.
- */
- struct WriteIntent {
- WriteIntent() : p(0) { }
- WriteIntent(void *a, unsigned b) : p((char*)a + b), len(b) { }
+/**
+ * Declaration of an intent to write to a region of a memory mapped view. We store the end
+ * rather than the start pointer to make operator < faster since that is heavily used in
+ * set lookup.
+ */
+struct WriteIntent {
+ WriteIntent() : p(0) {}
+ WriteIntent(void* a, unsigned b) : p((char*)a + b), len(b) {}
+
+ void* start() const {
+ return (char*)p - len;
+ }
+ void* end() const {
+ return p;
+ }
+ unsigned length() const {
+ return len;
+ }
+ bool operator<(const WriteIntent& rhs) const {
+ return end() < rhs.end();
+ }
+
+ bool overlaps(const WriteIntent& rhs) const {
+ return (start() <= rhs.end() && end() >= rhs.start());
+ }
+
+ bool contains(const WriteIntent& rhs) const {
+ return (start() <= rhs.start() && end() >= rhs.end());
+ }
+
+ // merge into me:
+ void absorb(const WriteIntent& other);
+
+ friend std::ostream& operator<<(std::ostream& out, const WriteIntent& wi) {
+ return (out << "p: " << wi.p << " end: " << wi.end() << " len: " << wi.len);
+ }
+
+private:
+ void* p; // intent to write up to p
+ unsigned len; // up to this len
+};
+
+typedef std::vector<WriteIntent> WriteIntentsVector;
- void* start() const { return (char*)p - len; }
- void* end() const { return p; }
- unsigned length() const { return len; }
- bool operator < (const WriteIntent& rhs) const { return end() < rhs.end(); }
- bool overlaps(const WriteIntent& rhs) const {
- return (start() <= rhs.end() && end() >= rhs.start());
- }
+/**
+ * Bitmap to remember things we have already marked for journaling. False negatives are ok
+ * if infrequent, since they only cost performance, not correctness.
+ */
+template <int Prime>
+class Already {
+ MONGO_DISALLOW_COPYING(Already);
+
+public:
+ Already() {
+ clear();
+ }
+
+ void clear() {
+ memset(this, 0, sizeof(*this));
+ }
- bool contains(const WriteIntent& rhs) const {
- return (start() <= rhs.start() && end() >= rhs.end());
+ /**
+ * Checks if we have Already recorded/indicated our write intent for this region of
+ * memory and automatically upgrades the length if the length was shorter previously.
+ *
+ * @return true if already indicated.
+ */
+ bool checkAndSet(void* p, int len) {
+ const unsigned x = hashPointer(p);
+ std::pair<void*, int>& nd = nodes[x % Prime];
+
+ if (nd.first == p) {
+ if (nd.second < len) {
+ nd.second = len;
+ return false; // haven't indicated this len yet
+ }
+ return true; // already indicated
}
- // merge into me:
- void absorb(const WriteIntent& other);
-
- friend std::ostream& operator << (std::ostream& out, const WriteIntent& wi) {
- return (out << "p: " << wi.p << " end: " << wi.end() << " len: " << wi.len);
+ nd.first = p;
+ nd.second = len;
+ return false; // a new set
+ }
+
+private:
+ static unsigned hashPointer(void* v) {
+ unsigned x = 0;
+ unsigned char* p = (unsigned char*)&v;
+ for (unsigned i = 0; i < sizeof(void*); i++) {
+ x = x * 131 + p[i];
}
+ return x;
+ }
+
+ std::pair<void*, int> nodes[Prime];
+};
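
A note on the class above: a hash collision simply overwrites the slot, so a false negative only means a write intent gets noted twice, costing work but never correctness. A self-contained sketch of the same overwrite-on-collision table and 131-multiplier pointer hash, using illustrative names that are not from the tree:

    #include <cassert>
    #include <utility>

    // Sketch: fixed-size "have we noted this pointer yet?" table. A collision
    // evicts the old entry, giving an occasional false negative (re-noting an
    // intent) but never a false positive.
    template <int Prime>
    class NotedTable {
    public:
        bool checkAndSet(void* p, int len) {
            std::pair<void*, int>& nd = nodes[hashPointer(p) % Prime];
            if (nd.first == p) {
                if (nd.second < len) {
                    nd.second = len;  // longer than before: must note again
                    return false;
                }
                return true;  // already noted at this length or longer
            }
            nd = std::make_pair(p, len);  // new entry, possibly evicting another
            return false;
        }

    private:
        static unsigned hashPointer(void* v) {
            unsigned x = 0;
            unsigned char* p = (unsigned char*)&v;
            for (unsigned i = 0; i < sizeof(void*); i++)
                x = x * 131 + p[i];
            return x;
        }

        std::pair<void*, int> nodes[Prime] = {};
    };

    int main() {
        NotedTable<127> t;
        int dummy;
        assert(!t.checkAndSet(&dummy, 4));  // first note: not seen yet
        assert(t.checkAndSet(&dummy, 4));   // same region: already noted
        assert(!t.checkAndSet(&dummy, 8));  // longer write: note again
        return 0;
    }
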
- private:
- void *p; // intent to write up to p
- unsigned len; // up to this len
- };
- typedef std::vector<WriteIntent> WriteIntentsVector;
+/**
+ * Tracks all write operations on the private view so they can be journaled.
+ */
+class CommitJob {
+ MONGO_DISALLOW_COPYING(CommitJob);
+public:
+ CommitJob();
+ ~CommitJob();
/**
- * Bitmap to remember things we have already marked for journaling. False negatives are ok
- * if infrequent, since they impact performance.
+ * Note an operation other than a "basic write".
*/
- template<int Prime>
- class Already {
- MONGO_DISALLOW_COPYING(Already);
- public:
- Already() {
- clear();
- }
-
- void clear() {
- memset(this, 0, sizeof(*this));
- }
-
- /**
- * Checks if we have Already recorded/indicated our write intent for this region of
- * memory and automatically upgrades the length if the length was shorter previously.
- *
- * @return true if already indicated.
- */
- bool checkAndSet(void* p, int len) {
- const unsigned x = hashPointer(p);
- std::pair<void*, int>& nd = nodes[x % Prime];
-
- if (nd.first == p) {
- if (nd.second < len) {
- nd.second = len;
- return false; // haven't indicated this len yet
- }
- return true; // already indicated
- }
-
- nd.first = p;
- nd.second = len;
- return false; // a new set
- }
+ void noteOp(std::shared_ptr<DurOp> p);
- private:
-
- static unsigned hashPointer(void *v) {
- unsigned x = 0;
- unsigned char *p = (unsigned char *)&v;
- for (unsigned i = 0; i < sizeof(void*); i++) {
- x = x * 131 + p[i];
- }
- return x;
- }
+ /**
+ * Record/note an intent to write.
+ *
+ * NOTE: Not thread safe. Requires the mutex to be locked.
+ */
+ void note(void* p, int len);
- std::pair<void*, int> nodes[Prime];
- };
+ /**
+ * When this value is false we don't have to do any group commit.
+ */
+ bool hasWritten() const {
+ return _hasWritten;
+ }
+ /**
+ * We use the commitjob object over and over, calling committingReset() rather than
+ * reconstructing.
+ */
+ void committingReset();
/**
- * Tracks all write operations on the private view so they can be journaled.
+ * We check how much has been written and, if it is getting to be a lot, we commit sooner.
*/
- class CommitJob {
- MONGO_DISALLOW_COPYING(CommitJob);
- public:
- CommitJob();
- ~CommitJob();
-
- /**
- * Note an operation other than a "basic write".
- */
- void noteOp(std::shared_ptr<DurOp> p);
-
- /**
- * Record/note an intent to write.
- *
- * NOTE: Not thread safe. Requires the mutex to be locked.
- */
- void note(void* p, int len);
-
- /**
- * When this value is false we don't have to do any group commit.
- */
- bool hasWritten() const { return _hasWritten; }
-
- /**
- * We use the commitjob object over and over, calling committingReset() rather than
- * reconstructing.
- */
- void committingReset();
-
- /**
- * We check how much written and if it is getting to be a lot, we commit sooner.
- */
- size_t bytes() const { return _bytes; }
-
- /**
- * Sorts the internal list of write intents so that overlapping and duplicate items can be
- * merged. We do the sort here so the caller receives something they must keep const from
- * their POV.
- */
- const WriteIntentsVector& getIntentsSorted() {
- sort(_intents.begin(), _intents.end());
- return _intents;
- }
+ size_t bytes() const {
+ return _bytes;
+ }
- const DurOpsVector& ops() const {
- return _durOps;
- }
+ /**
+ * Sorts the internal list of write intents so that overlapping and duplicate items can be
+ * merged. We do the sort here so the caller receives something they must keep const from
+ * their POV.
+ */
+ const WriteIntentsVector& getIntentsSorted() {
+ sort(_intents.begin(), _intents.end());
+ return _intents;
+ }
- SimpleMutex groupCommitMutex;
+ const DurOpsVector& ops() const {
+ return _durOps;
+ }
- private:
+ SimpleMutex groupCommitMutex;
- void _insertWriteIntent(void* p, int len) {
- _intents.push_back(WriteIntent(p, len));
- wassert(_intents.size() < 2000000);
- }
+private:
+ void _insertWriteIntent(void* p, int len) {
+ _intents.push_back(WriteIntent(p, len));
+ wassert(_intents.size() < 2000000);
+ }
- // Whether we put write intents or durops
- bool _hasWritten;
+ // Whether we put write intents or durops
+ bool _hasWritten;
- // Write intents along with a bitmask for whether we have already noted them
- Already<127> _alreadyNoted;
- WriteIntentsVector _intents;
+ // Write intents along with a bitmask for whether we have already noted them
+ Already<127> _alreadyNoted;
+ WriteIntentsVector _intents;
- // All the ops other than basic writes
- DurOpsVector _durOps;
+ // All the ops other than basic writes
+ DurOpsVector _durOps;
- // Used to count the private map used bytes. Note that _lastNotedPos doesn't reset with
- // each commit, but that is ok we aren't being that precise.
- size_t _lastNotedPos;
- size_t _bytes;
+ // Used to count the private map used bytes. Note that _lastNotedPos doesn't reset with
+ // each commit, but that is ok we aren't being that precise.
+ size_t _lastNotedPos;
+ size_t _bytes;
- // Warning logging for large commits
- uint64_t _lastComplainMs;
- unsigned _complains;
- };
+ // Warning logging for large commits
+ uint64_t _lastComplainMs;
+ unsigned _complains;
+};
-} // namespace "dur"
-} // namespace "mongo"
+} // namespace "dur"
+} // namespace "mongo"
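
Why sort by end pointer: once sorted, overlapping and duplicate intents sit next to each other and can be merged in one sweep (absorb() is defined in the .cpp, so the merge policy below is a conventional stand-in, not the tree's exact code). A standalone sketch over [start, end) ranges:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    struct Range {
        char* start;
        char* end;  // one past the last byte, mirroring WriteIntent's end pointer
        bool operator<(const Range& rhs) const {
            return end < rhs.end;
        }
    };

    // Sort ascending by end, then sweep from the back, absorbing any neighbor
    // that overlaps the most recently emitted range.
    std::vector<Range> coalesce(std::vector<Range> v) {
        std::sort(v.begin(), v.end());
        std::vector<Range> out;
        for (auto it = v.rbegin(); it != v.rend(); ++it) {
            if (!out.empty() && it->end >= out.back().start) {
                // overlap: widen the previously emitted range
                out.back().start = std::min(out.back().start, it->start);
            } else {
                out.push_back(*it);
            }
        }
        return out;  // coalesced, in descending order of end pointer
    }

    int main() {
        char buf[64];
        std::vector<Range> v = {{buf, buf + 8}, {buf + 4, buf + 12}, {buf + 20, buf + 24}};
        std::vector<Range> merged = coalesce(v);
        assert(merged.size() == 2);
        assert(merged[1].start == buf && merged[1].end == buf + 12);
        return 0;
    }
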
diff --git a/src/mongo/db/storage/mmap_v1/dur_journal.cpp b/src/mongo/db/storage/mmap_v1/dur_journal.cpp
index a76ade46128..66c88e3e156 100644
--- a/src/mongo/db/storage/mmap_v1/dur_journal.cpp
+++ b/src/mongo/db/storage/mmap_v1/dur_journal.cpp
@@ -58,7 +58,7 @@
#include "mongo/util/hex.h"
#include "mongo/util/log.h"
#include "mongo/util/mongoutils/str.h"
-#include "mongo/util/net/listen.h" // getelapsedtimemillis
+#include "mongo/util/net/listen.h" // getelapsedtimemillis
#include "mongo/util/progress_meter.h"
#include "mongo/util/timer.h"
@@ -66,732 +66,727 @@ using namespace mongoutils;
namespace mongo {
- using std::endl;
- using std::hex;
- using std::string;
+using std::endl;
+using std::hex;
+using std::string;
- class AlignedBuilder;
+class AlignedBuilder;
- namespace dur {
- // Rotate after reaching this data size in a journal (j._<n>) file
- // We use a smaller size for 32 bit as the journal is mmapped during recovery (only)
- // Note if you take a set of datafiles, including journal files, from 32->64 or vice-versa, it must
- // work. (and should as-is)
- // --smallfiles makes the limit small.
+namespace dur {
+// Rotate after reaching this data size in a journal (j._<n>) file
+// We use a smaller size for 32 bit as the journal is mmapped during recovery (only)
+// Note if you take a set of datafiles, including journal files, from 32->64 or vice-versa, it must
+// work. (and should as-is)
+// --smallfiles makes the limit small.
#if defined(MONGO_CONFIG_DEBUG_BUILD)
- unsigned long long DataLimitPerJournalFile = 128 * 1024 * 1024;
+unsigned long long DataLimitPerJournalFile = 128 * 1024 * 1024;
#elif defined(__APPLE__)
- // assuming a developer box if OS X
- unsigned long long DataLimitPerJournalFile = 256 * 1024 * 1024;
+// assuming a developer box if OS X
+unsigned long long DataLimitPerJournalFile = 256 * 1024 * 1024;
#else
- unsigned long long DataLimitPerJournalFile = (sizeof(void*)==4) ? 256 * 1024 * 1024 : 1 * 1024 * 1024 * 1024;
+unsigned long long DataLimitPerJournalFile =
+ (sizeof(void*) == 4) ? 256 * 1024 * 1024 : 1 * 1024 * 1024 * 1024;
#endif
- MONGO_INITIALIZER(InitializeJournalingParams)(InitializerContext* context) {
- if (mmapv1GlobalOptions.smallfiles == true) {
- verify(dur::DataLimitPerJournalFile >= 128 * 1024 * 1024);
- dur::DataLimitPerJournalFile = 128 * 1024 * 1024;
- }
- return Status::OK();
- }
+MONGO_INITIALIZER(InitializeJournalingParams)(InitializerContext* context) {
+ if (mmapv1GlobalOptions.smallfiles == true) {
+ verify(dur::DataLimitPerJournalFile >= 128 * 1024 * 1024);
+ dur::DataLimitPerJournalFile = 128 * 1024 * 1024;
+ }
+ return Status::OK();
+}
- BOOST_STATIC_ASSERT( sizeof(Checksum) == 16 );
- BOOST_STATIC_ASSERT( sizeof(JHeader) == 8192 );
- BOOST_STATIC_ASSERT( sizeof(JSectHeader) == 20 );
- BOOST_STATIC_ASSERT( sizeof(JSectFooter) == 32 );
- BOOST_STATIC_ASSERT( sizeof(JEntry) == 12 );
- BOOST_STATIC_ASSERT( sizeof(LSNFile) == 88 );
+BOOST_STATIC_ASSERT(sizeof(Checksum) == 16);
+BOOST_STATIC_ASSERT(sizeof(JHeader) == 8192);
+BOOST_STATIC_ASSERT(sizeof(JSectHeader) == 20);
+BOOST_STATIC_ASSERT(sizeof(JSectFooter) == 32);
+BOOST_STATIC_ASSERT(sizeof(JEntry) == 12);
+BOOST_STATIC_ASSERT(sizeof(LSNFile) == 88);
- bool usingPreallocate = false;
+bool usingPreallocate = false;
- void removeOldJournalFile(boost::filesystem::path p);
+void removeOldJournalFile(boost::filesystem::path p);
- boost::filesystem::path getJournalDir() {
- boost::filesystem::path p(storageGlobalParams.dbpath);
- p /= "journal";
- return p;
- }
+boost::filesystem::path getJournalDir() {
+ boost::filesystem::path p(storageGlobalParams.dbpath);
+ p /= "journal";
+ return p;
+}
- boost::filesystem::path lsnPath() {
- return getJournalDir()/"lsn";
- }
+boost::filesystem::path lsnPath() {
+ return getJournalDir() / "lsn";
+}
- /** this should be called when something really bad happens so that we can flag appropriately
- */
- void journalingFailure(const char *msg) {
- /** todo:
- (1) don't log too much
- (2) make an indicator in the journal dir that something bad happened.
- (2b) refuse to do a recovery startup if that is there without manual override.
- */
- log() << "journaling failure/error: " << msg << endl;
- verify(false);
- }
+/** this should be called when something really bad happens so that we can flag appropriately
+*/
+void journalingFailure(const char* msg) {
+ /** todo:
+ (1) don't log too much
+ (2) make an indicator in the journal dir that something bad happened.
+ (2b) refuse to do a recovery startup if that is there without manual override.
+ */
+ log() << "journaling failure/error: " << msg << endl;
+ verify(false);
+}
- JSectFooter::JSectFooter() {
- memset(this, 0, sizeof(*this));
- sentinel = JEntry::OpCode_Footer;
- }
+JSectFooter::JSectFooter() {
+ memset(this, 0, sizeof(*this));
+ sentinel = JEntry::OpCode_Footer;
+}
- JSectFooter::JSectFooter(const void* begin, int len) { // needs buffer to compute hash
- sentinel = JEntry::OpCode_Footer;
- reserved = 0;
- magic[0] = magic[1] = magic[2] = magic[3] = '\n';
+JSectFooter::JSectFooter(const void* begin, int len) { // needs buffer to compute hash
+ sentinel = JEntry::OpCode_Footer;
+ reserved = 0;
+ magic[0] = magic[1] = magic[2] = magic[3] = '\n';
- Checksum c;
- c.gen(begin, (unsigned) len);
- memcpy(hash, c.bytes, sizeof(hash));
- }
+ Checksum c;
+ c.gen(begin, (unsigned)len);
+ memcpy(hash, c.bytes, sizeof(hash));
+}
- bool JSectFooter::checkHash(const void* begin, int len) const {
- if( !magicOk() ) {
- log() << "journal footer not valid" << endl;
- return false;
- }
- Checksum c;
- c.gen(begin, len);
- DEV log() << "checkHash len:" << len << " hash:" << toHex(hash, 16) << " current:" << toHex(c.bytes, 16) << endl;
- if( memcmp(hash, c.bytes, sizeof(hash)) == 0 )
- return true;
- log() << "journal checkHash mismatch, got: " << toHex(c.bytes, 16) << " expected: " << toHex(hash,16) << endl;
- return false;
- }
+bool JSectFooter::checkHash(const void* begin, int len) const {
+ if (!magicOk()) {
+ log() << "journal footer not valid" << endl;
+ return false;
+ }
+ Checksum c;
+ c.gen(begin, len);
+ DEV log() << "checkHash len:" << len << " hash:" << toHex(hash, 16)
+ << " current:" << toHex(c.bytes, 16) << endl;
+ if (memcmp(hash, c.bytes, sizeof(hash)) == 0)
+ return true;
+ log() << "journal checkHash mismatch, got: " << toHex(c.bytes, 16)
+ << " expected: " << toHex(hash, 16) << endl;
+ return false;
+}
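
The footer above is the usual append-then-verify shape: hash the section body as written, store the 16-byte digest after it, and on recovery recompute and memcmp. A toy version with a stand-in digest (the real Checksum type lives elsewhere in this directory; the one here is illustrative only):

    #include <cassert>
    #include <cstring>

    // Stand-in 16-byte digest; NOT the real dur::Checksum.
    struct ToyChecksum {
        unsigned char bytes[16];
        void gen(const void* p, unsigned len) {
            std::memset(bytes, 0, sizeof(bytes));
            const unsigned char* c = static_cast<const unsigned char*>(p);
            for (unsigned i = 0; i < len; i++)
                bytes[i % 16] = static_cast<unsigned char>(bytes[i % 16] * 131 + c[i]);
        }
    };

    struct ToyFooter {
        unsigned char hash[16];
        ToyFooter(const void* begin, int len) {  // written right after the section
            ToyChecksum c;
            c.gen(begin, (unsigned)len);
            std::memcpy(hash, c.bytes, sizeof(hash));
        }
        bool checkHash(const void* begin, int len) const {  // recovery side
            ToyChecksum c;
            c.gen(begin, (unsigned)len);
            return std::memcmp(hash, c.bytes, sizeof(hash)) == 0;
        }
    };

    int main() {
        char section[] = "journaled bytes";
        ToyFooter f(section, sizeof(section));
        assert(f.checkHash(section, sizeof(section)));
        section[0] ^= 1;                                 // flip one bit...
        assert(!f.checkHash(section, sizeof(section)));  // ...and it is caught
        return 0;
    }
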
- namespace {
- SecureRandom* mySecureRandom = NULL;
- stdx::mutex mySecureRandomMutex;
- int64_t getMySecureRandomNumber() {
- stdx::lock_guard<stdx::mutex> lk( mySecureRandomMutex );
- if ( ! mySecureRandom )
- mySecureRandom = SecureRandom::create();
- return mySecureRandom->nextInt64();
- }
- }
+namespace {
+SecureRandom* mySecureRandom = NULL;
+stdx::mutex mySecureRandomMutex;
+int64_t getMySecureRandomNumber() {
+ stdx::lock_guard<stdx::mutex> lk(mySecureRandomMutex);
+ if (!mySecureRandom)
+ mySecureRandom = SecureRandom::create();
+ return mySecureRandom->nextInt64();
+}
+}
- JHeader::JHeader(string fname) {
- magic[0] = 'j'; magic[1] = '\n';
- _version = CurrentVersion;
- memset(ts, 0, sizeof(ts));
- time_t t = time(0);
- strncpy(ts, time_t_to_String_short(t).c_str(), sizeof(ts)-1);
- memset(dbpath, 0, sizeof(dbpath));
- strncpy(dbpath, fname.c_str(), sizeof(dbpath)-1);
- {
- fileId = t&0xffffffff;
- fileId |= static_cast<unsigned long long>( getMySecureRandomNumber() ) << 32;
- }
- memset(reserved3, 0, sizeof(reserved3));
- txt2[0] = txt2[1] = '\n';
- n1 = n2 = n3 = n4 = '\n';
- }
+JHeader::JHeader(string fname) {
+ magic[0] = 'j';
+ magic[1] = '\n';
+ _version = CurrentVersion;
+ memset(ts, 0, sizeof(ts));
+ time_t t = time(0);
+ strncpy(ts, time_t_to_String_short(t).c_str(), sizeof(ts) - 1);
+ memset(dbpath, 0, sizeof(dbpath));
+ strncpy(dbpath, fname.c_str(), sizeof(dbpath) - 1);
+ {
+ fileId = t & 0xffffffff;
+ fileId |= static_cast<unsigned long long>(getMySecureRandomNumber()) << 32;
+ }
+ memset(reserved3, 0, sizeof(reserved3));
+ txt2[0] = txt2[1] = '\n';
+ n1 = n2 = n3 = n4 = '\n';
+}
- Journal j;
+Journal j;
- const unsigned long long LsnShutdownSentinel = ~((unsigned long long)0);
+const unsigned long long LsnShutdownSentinel = ~((unsigned long long)0);
- Journal::Journal() {
- _written = 0;
- _nextFileNumber = 0;
- _curLogFile = 0;
- _curFileId = 0;
- _preFlushTime = 0;
- _lastFlushTime = 0;
- _writeToLSNNeeded = false;
- }
+Journal::Journal() {
+ _written = 0;
+ _nextFileNumber = 0;
+ _curLogFile = 0;
+ _curFileId = 0;
+ _preFlushTime = 0;
+ _lastFlushTime = 0;
+ _writeToLSNNeeded = false;
+}
- boost::filesystem::path Journal::getFilePathFor(int filenumber) const {
- boost::filesystem::path p(dir);
- p /= string(str::stream() << "j._" << filenumber);
- return p;
- }
+boost::filesystem::path Journal::getFilePathFor(int filenumber) const {
+ boost::filesystem::path p(dir);
+ p /= string(str::stream() << "j._" << filenumber);
+ return p;
+}
- /** never throws
- @param anyFiles by default we only look at j._* files. If anyFiles is true, return true
- if there are any files in the journal directory. acquirePathLock() uses this to
- make sure that the journal directory is mounted.
- @return true if journal dir is not empty
- */
- bool haveJournalFiles(bool anyFiles) {
- try {
- boost::filesystem::path jdir = getJournalDir();
- if ( !boost::filesystem::exists( jdir ) )
- return false;
-
- for ( boost::filesystem::directory_iterator i( jdir );
- i != boost::filesystem::directory_iterator();
- ++i ) {
- string fileName = boost::filesystem::path(*i).leaf().string();
- if( anyFiles || str::startsWith(fileName, "j._") )
- return true;
- }
- }
- catch(const std::exception& e) {
- log() << "Unable to check for journal files due to: " << e.what() << endl;
- }
+/** never throws
+ @param anyFiles by default we only look at j._* files. If anyFiles is true, return true
+ if there are any files in the journal directory. acquirePathLock() uses this to
+ make sure that the journal directory is mounted.
+ @return true if journal dir is not empty
+*/
+bool haveJournalFiles(bool anyFiles) {
+ try {
+ boost::filesystem::path jdir = getJournalDir();
+ if (!boost::filesystem::exists(jdir))
return false;
+
+ for (boost::filesystem::directory_iterator i(jdir);
+ i != boost::filesystem::directory_iterator();
+ ++i) {
+ string fileName = boost::filesystem::path(*i).leaf().string();
+ if (anyFiles || str::startsWith(fileName, "j._"))
+ return true;
}
+ } catch (const std::exception& e) {
+ log() << "Unable to check for journal files due to: " << e.what() << endl;
+ }
+ return false;
+}
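
haveJournalFiles() deliberately swallows exceptions so callers like acquirePathLock() can probe an unmounted or unreadable directory safely. The same scan expressed with std::filesystem (C++17) as a stand-in for the boost::filesystem calls above:

    #include <exception>
    #include <filesystem>
    #include <iostream>
    #include <string>

    // Never throws: any filesystem error is logged and reads as "no journal files".
    bool haveJournalFilesIn(const std::filesystem::path& jdir, bool anyFiles = false) {
        try {
            if (!std::filesystem::exists(jdir))
                return false;
            for (const auto& entry : std::filesystem::directory_iterator(jdir)) {
                const std::string name = entry.path().filename().string();
                if (anyFiles || name.rfind("j._", 0) == 0)  // startsWith("j._")
                    return true;
            }
        } catch (const std::exception& e) {
            std::cerr << "Unable to check for journal files due to: " << e.what() << '\n';
        }
        return false;
    }

    int main() {
        std::cout << haveJournalFilesIn("journal") << '\n';
        return 0;
    }
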
- /** throws */
- void removeJournalFiles() {
- log() << "removeJournalFiles" << endl;
- try {
- for ( boost::filesystem::directory_iterator i( getJournalDir() );
- i != boost::filesystem::directory_iterator();
- ++i ) {
- string fileName = boost::filesystem::path(*i).leaf().string();
- if( str::startsWith(fileName, "j._") ) {
- try {
- removeOldJournalFile(*i);
- }
- catch(std::exception& e) {
- log() << "couldn't remove " << fileName << ' ' << e.what() << endl;
- throw;
- }
- }
- }
+/** throws */
+void removeJournalFiles() {
+ log() << "removeJournalFiles" << endl;
+ try {
+ for (boost::filesystem::directory_iterator i(getJournalDir());
+ i != boost::filesystem::directory_iterator();
+ ++i) {
+ string fileName = boost::filesystem::path(*i).leaf().string();
+ if (str::startsWith(fileName, "j._")) {
try {
- boost::filesystem::remove(lsnPath());
- }
- catch(...) {
- // std::exception details logged in catch below
- log() << "couldn't remove " << lsnPath().string() << endl;
+ removeOldJournalFile(*i);
+ } catch (std::exception& e) {
+ log() << "couldn't remove " << fileName << ' ' << e.what() << endl;
throw;
}
}
- catch( std::exception& e ) {
- log() << "error removing journal files " << e.what() << endl;
- throw;
- }
- verify(!haveJournalFiles());
-
- flushMyDirectory(getJournalDir() / "file"); // flushes parent of argument (in this case journal dir)
-
- LOG(1) << "removeJournalFiles end" << endl;
}
+ try {
+ boost::filesystem::remove(lsnPath());
+ } catch (...) {
+ // std::exception details logged in catch below
+ log() << "couldn't remove " << lsnPath().string() << endl;
+ throw;
+ }
+ } catch (std::exception& e) {
+ log() << "error removing journal files " << e.what() << endl;
+ throw;
+ }
+ verify(!haveJournalFiles());
- /** at clean shutdown */
- bool okToCleanUp = false; // successful recovery would set this to true
- void Journal::cleanup(bool _log) {
- if( !okToCleanUp )
- return;
+ flushMyDirectory(getJournalDir() /
+ "file"); // flushes parent of argument (in this case journal dir)
- if( _log )
- log() << "journalCleanup..." << endl;
- try {
- stdx::lock_guard<SimpleMutex> lk(_curLogFileMutex);
- closeCurrentJournalFile();
- removeJournalFiles();
- }
- catch(std::exception& e) {
- log() << "error couldn't remove journal file during shutdown " << e.what() << endl;
- throw;
- }
- }
- void journalCleanup(bool log) { j.cleanup(log); }
+ LOG(1) << "removeJournalFiles end" << endl;
+}
- bool _preallocateIsFaster() {
- bool faster = false;
- boost::filesystem::path p = getJournalDir() / "tempLatencyTest";
- if (boost::filesystem::exists(p)) {
- try {
- remove(p);
- }
- catch(const std::exception& e) {
- log() << "Unable to remove temporary file due to: " << e.what() << endl;
- }
- }
- try {
- AlignedBuilder b(8192);
- int millis[2];
- const int N = 50;
- for( int pass = 0; pass < 2; pass++ ) {
- LogFile f(p.string());
- Timer t;
- for( int i = 0 ; i < N; i++ ) {
- f.synchronousAppend(b.buf(), 8192);
- }
- millis[pass] = t.millis();
- // second time through, file exists and is prealloc case
- }
- int diff = millis[0] - millis[1];
- if( diff > 2 * N ) {
- // at least 2ms faster for prealloc case?
- faster = true;
- log() << "preallocateIsFaster=true " << diff / (1.0*N) << endl;
- }
- }
- catch (const std::exception& e) {
- log() << "info preallocateIsFaster couldn't run due to: " << e.what()
- << "; returning false" << endl;
- }
- if (boost::filesystem::exists(p)) {
- try {
- remove(p);
- }
- catch(const std::exception& e) {
- log() << "Unable to remove temporary file due to: " << e.what() << endl;
- }
- }
- return faster;
+/** at clean shutdown */
+bool okToCleanUp = false; // successful recovery would set this to true
+void Journal::cleanup(bool _log) {
+ if (!okToCleanUp)
+ return;
+
+ if (_log)
+ log() << "journalCleanup..." << endl;
+ try {
+ stdx::lock_guard<SimpleMutex> lk(_curLogFileMutex);
+ closeCurrentJournalFile();
+ removeJournalFiles();
+ } catch (std::exception& e) {
+ log() << "error couldn't remove journal file during shutdown " << e.what() << endl;
+ throw;
+ }
+}
+void journalCleanup(bool log) {
+ j.cleanup(log);
+}
+
+bool _preallocateIsFaster() {
+ bool faster = false;
+ boost::filesystem::path p = getJournalDir() / "tempLatencyTest";
+ if (boost::filesystem::exists(p)) {
+ try {
+ remove(p);
+ } catch (const std::exception& e) {
+ log() << "Unable to remove temporary file due to: " << e.what() << endl;
}
- bool preallocateIsFaster() {
+ }
+ try {
+ AlignedBuilder b(8192);
+ int millis[2];
+ const int N = 50;
+ for (int pass = 0; pass < 2; pass++) {
+ LogFile f(p.string());
Timer t;
- bool res = false;
- if( _preallocateIsFaster() && _preallocateIsFaster() ) {
- // maybe system is just super busy at the moment? sleep a second to let it calm down.
- // deciding to to prealloc is a medium big decision:
- sleepsecs(1);
- res = _preallocateIsFaster();
+ for (int i = 0; i < N; i++) {
+ f.synchronousAppend(b.buf(), 8192);
}
- if( t.millis() > 3000 )
- log() << "preallocateIsFaster check took " << t.millis()/1000.0 << " secs" << endl;
- return res;
+ millis[pass] = t.millis();
+ // second time through, file exists and is prealloc case
}
-
- // throws
- void preallocateFile(boost::filesystem::path p, unsigned long long len) {
- if( exists(p) )
- return;
-
- log() << "preallocating a journal file " << p.string() << endl;
-
- const unsigned BLKSZ = 1024 * 1024;
- verify( len % BLKSZ == 0 );
-
- AlignedBuilder b(BLKSZ);
- memset((void*)b.buf(), 0, BLKSZ);
-
- ProgressMeter m(len, 3/*secs*/, 10/*hits between time check (once every 6.4MB)*/);
- m.setName("File Preallocator Progress");
-
- File f;
- f.open( p.string().c_str() , /*read-only*/false , /*direct-io*/false );
- verify( f.is_open() );
- fileofs loc = 0;
- while ( loc < len ) {
- f.write( loc , b.buf() , BLKSZ );
- loc += BLKSZ;
- m.hit(BLKSZ);
- }
- verify( loc == len );
- f.fsync();
+ int diff = millis[0] - millis[1];
+ if (diff > 2 * N) {
+ // at least 2ms faster for prealloc case?
+ faster = true;
+ log() << "preallocateIsFaster=true " << diff / (1.0 * N) << endl;
}
-
- const int NUM_PREALLOC_FILES = 3;
- inline boost::filesystem::path preallocPath(int n) {
- verify(n >= 0);
- verify(n < NUM_PREALLOC_FILES);
- string fn = str::stream() << "prealloc." << n;
- return getJournalDir() / fn;
+ } catch (const std::exception& e) {
+ log() << "info preallocateIsFaster couldn't run due to: " << e.what() << "; returning false"
+ << endl;
+ }
+ if (boost::filesystem::exists(p)) {
+ try {
+ remove(p);
+ } catch (const std::exception& e) {
+ log() << "Unable to remove temporary file due to: " << e.what() << endl;
}
+ }
+ return faster;
+}
+bool preallocateIsFaster() {
+ Timer t;
+ bool res = false;
+ if (_preallocateIsFaster() && _preallocateIsFaster()) {
+ // maybe system is just super busy at the moment? sleep a second to let it calm down.
+        // deciding to prealloc is a medium-big decision:
+ sleepsecs(1);
+ res = _preallocateIsFaster();
+ }
+ if (t.millis() > 3000)
+ log() << "preallocateIsFaster check took " << t.millis() / 1000.0 << " secs" << endl;
+ return res;
+}
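
Spelled out, the heuristic is: time N=50 synchronous 8KB appends twice, where the second pass hits a file that already has its blocks allocated, and call prealloc "faster" only if the second pass saved more than 2*N ms, i.e. at least ~2ms per append, across two agreeing runs plus a third after a one-second pause. A sketch of the timing skeleton with a buffered stand-in for LogFile::synchronousAppend (the real call also fsyncs):

    #include <chrono>
    #include <cstdio>

    int main() {
        const int N = 50;
        static const char block[8192] = {};
        long long millis[2];

        for (int pass = 0; pass < 2; pass++) {
            // "r+b" keeps existing bytes (the preallocated case); fall back to
            // creating the file on the first pass.
            std::FILE* f = std::fopen("tempLatencyTest", "r+b");
            if (!f)
                f = std::fopen("tempLatencyTest", "w+b");
            if (!f)
                return 1;
            auto t0 = std::chrono::steady_clock::now();
            for (int i = 0; i < N; i++) {
                std::fwrite(block, 1, sizeof(block), f);
                std::fflush(f);  // a durable version would fsync here
            }
            auto t1 = std::chrono::steady_clock::now();
            millis[pass] =
                std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
            std::fclose(f);
            // second time through, the file exists: the prealloc case
        }

        long long diff = millis[0] - millis[1];
        bool faster = diff > 2 * N;  // at least ~2ms faster per append?
        std::printf("preallocateIsFaster=%d, %.3f ms/append saved\n",
                    (int)faster, diff / (1.0 * N));
        return 0;
    }
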
- // throws
- void _preallocateFiles() {
- for( int i = 0; i < NUM_PREALLOC_FILES; i++ ) {
- boost::filesystem::path filepath = preallocPath(i);
+// throws
+void preallocateFile(boost::filesystem::path p, unsigned long long len) {
+ if (exists(p))
+ return;
- unsigned long long limit = DataLimitPerJournalFile;
- if( kDebugBuild && i == 1 ) {
- // moving 32->64, the prealloc files would be short. that is "ok", but we
- // want to exercise that case, so we force exercising here when
- // MONGO_CONFIG_DEBUG_BUILD is set by arbitrarily stopping prealloc at a
- // low limit for a file. also we want to be able to change in the future
- // the constant without a lot of work anyway.
- limit = 16 * 1024 * 1024;
- }
- preallocateFile(filepath, limit);
- }
- }
+ log() << "preallocating a journal file " << p.string() << endl;
- void checkFreeSpace() {
- unsigned long long spaceNeeded = static_cast<unsigned long long>(3 * DataLimitPerJournalFile * 1.1); // add 10% for headroom
- unsigned long long freeSpace = File::freeSpace(getJournalDir().string());
- unsigned long long prealloced = 0;
- for( int i = 0; i < NUM_PREALLOC_FILES; i++ ) {
- boost::filesystem::path filepath = preallocPath(i);
- if (exists(filepath))
- prealloced += file_size(filepath);
- }
+ const unsigned BLKSZ = 1024 * 1024;
+ verify(len % BLKSZ == 0);
- if (freeSpace + prealloced < spaceNeeded) {
- log() << endl;
- error() << "Insufficient free space for journal files" << endl;
- log() << "Please make at least " << spaceNeeded/(1024*1024) << "MB available in " << getJournalDir().string() << " or use --smallfiles" << endl;
- log() << endl;
- throw UserException(15926, "Insufficient free space for journals");
- }
- }
+ AlignedBuilder b(BLKSZ);
+ memset((void*)b.buf(), 0, BLKSZ);
- void preallocateFiles() {
- if (!(mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalNoCheckSpace))
- checkFreeSpace();
+ ProgressMeter m(len, 3 /*secs*/, 10 /*hits between time check (once every 6.4MB)*/);
+ m.setName("File Preallocator Progress");
- if( exists(preallocPath(0)) || // if enabled previously, keep using
- exists(preallocPath(1)) ||
- (mmapv1GlobalOptions.preallocj && preallocateIsFaster()) ) {
- usingPreallocate = true;
- try {
- _preallocateFiles();
- }
- catch (const std::exception& e) {
- log() << "warning caught exception (" << e.what()
- << ") in preallocateFiles, continuing" << endl;
- }
- }
- j.open();
- }
+ File f;
+ f.open(p.string().c_str(), /*read-only*/ false, /*direct-io*/ false);
+ verify(f.is_open());
+ fileofs loc = 0;
+ while (loc < len) {
+ f.write(loc, b.buf(), BLKSZ);
+ loc += BLKSZ;
+ m.hit(BLKSZ);
+ }
+ verify(loc == len);
+ f.fsync();
+}
- void removeOldJournalFile(boost::filesystem::path p) {
- if( usingPreallocate ) {
- try {
- for( int i = 0; i < NUM_PREALLOC_FILES; i++ ) {
- boost::filesystem::path filepath = preallocPath(i);
- if( !boost::filesystem::exists(filepath) ) {
- // we can recycle this file into this prealloc file location
- boost::filesystem::path temppath = filepath.string() + ".temp";
- boost::filesystem::rename(p, temppath);
- {
- // zero the header
- File f;
- f.open(temppath.string().c_str(), false, false);
- char buf[8192];
- memset(buf, 0, 8192);
- f.write(0, buf, 8192);
- f.truncate(DataLimitPerJournalFile);
- f.fsync();
- }
- boost::filesystem::rename(temppath, filepath);
- return;
- }
- }
- } catch (const std::exception& e) {
- log() << "warning exception in dur::removeOldJournalFile " << p.string()
- << ": " << e.what() << endl;
- // fall through and try to delete the file
- }
- }
+const int NUM_PREALLOC_FILES = 3;
+inline boost::filesystem::path preallocPath(int n) {
+ verify(n >= 0);
+ verify(n < NUM_PREALLOC_FILES);
+ string fn = str::stream() << "prealloc." << n;
+ return getJournalDir() / fn;
+}
- // already have 3 prealloc files, so delete this file
- try {
- boost::filesystem::remove(p);
- }
- catch (const std::exception& e) {
- log() << "warning exception removing " << p.string() << ": " << e.what() << endl;
- }
- }
+// throws
+void _preallocateFiles() {
+ for (int i = 0; i < NUM_PREALLOC_FILES; i++) {
+ boost::filesystem::path filepath = preallocPath(i);
+
+ unsigned long long limit = DataLimitPerJournalFile;
+ if (kDebugBuild && i == 1) {
+ // moving 32->64, the prealloc files would be short. that is "ok", but we
+ // want to exercise that case, so we force exercising here when
+ // MONGO_CONFIG_DEBUG_BUILD is set by arbitrarily stopping prealloc at a
+ // low limit for a file. also we want to be able to change in the future
+ // the constant without a lot of work anyway.
+ limit = 16 * 1024 * 1024;
+ }
+ preallocateFile(filepath, limit);
+ }
+}
- // find a prealloc.<n> file, presumably to take and use
- boost::filesystem::path findPrealloced() {
- try {
- for( int i = 0; i < NUM_PREALLOC_FILES; i++ ) {
- boost::filesystem::path filepath = preallocPath(i);
- if( boost::filesystem::exists(filepath) )
- return filepath;
- }
- } catch (const std::exception& e) {
- log() << "warning exception in dur::findPrealloced(): " << e.what() << endl;
- }
- return boost::filesystem::path();
- }
+void checkFreeSpace() {
+ unsigned long long spaceNeeded =
+ static_cast<unsigned long long>(3 * DataLimitPerJournalFile * 1.1); // add 10% for headroom
+ unsigned long long freeSpace = File::freeSpace(getJournalDir().string());
+ unsigned long long prealloced = 0;
+ for (int i = 0; i < NUM_PREALLOC_FILES; i++) {
+ boost::filesystem::path filepath = preallocPath(i);
+ if (exists(filepath))
+ prealloced += file_size(filepath);
+ }
- /** assure journal/ dir exists. throws. call during startup. */
- void journalMakeDir() {
- j.init();
+ if (freeSpace + prealloced < spaceNeeded) {
+ log() << endl;
+ error() << "Insufficient free space for journal files" << endl;
+ log() << "Please make at least " << spaceNeeded / (1024 * 1024) << "MB available in "
+ << getJournalDir().string() << " or use --smallfiles" << endl;
+ log() << endl;
+ throw UserException(15926, "Insufficient free space for journals");
+ }
+}
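
Concretely, with the 64-bit non-debug default of 1GB per journal file, checkFreeSpace() requires free space plus already-preallocated bytes to cover 3 * 1GB * 1.1, roughly 3.3GB. The arithmetic in isolation:

    #include <cstdio>

    int main() {
        // 64-bit, non-debug default from dur_journal.cpp
        unsigned long long DataLimitPerJournalFile = 1ULL * 1024 * 1024 * 1024;

        // three journal files plus 10% headroom
        unsigned long long spaceNeeded =
            static_cast<unsigned long long>(3 * DataLimitPerJournalFile * 1.1);

        std::printf("spaceNeeded = %lluMB\n", spaceNeeded / (1024 * 1024));
        // prints: spaceNeeded = 3379MB; freeSpace + prealloced must be at least this
        return 0;
    }
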
- boost::filesystem::path p = getJournalDir();
- j.dir = p.string();
- log() << "journal dir=" << j.dir << endl;
- if( !boost::filesystem::exists(j.dir) ) {
- try {
- boost::filesystem::create_directory(j.dir);
- }
- catch(std::exception& e) {
- log() << "error creating directory " << j.dir << ' ' << e.what() << endl;
- throw;
- }
- }
+void preallocateFiles() {
+ if (!(mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalNoCheckSpace))
+ checkFreeSpace();
+
+ if (exists(preallocPath(0)) || // if enabled previously, keep using
+ exists(preallocPath(1)) ||
+ (mmapv1GlobalOptions.preallocj && preallocateIsFaster())) {
+ usingPreallocate = true;
+ try {
+ _preallocateFiles();
+ } catch (const std::exception& e) {
+ log() << "warning caught exception (" << e.what() << ") in preallocateFiles, continuing"
+ << endl;
}
+ }
+ j.open();
+}
- void Journal::_open() {
- _curFileId = 0;
- verify( _curLogFile == 0 );
- boost::filesystem::path fname = getFilePathFor(_nextFileNumber);
-
- // if we have a prealloced file, use it
- {
- boost::filesystem::path p = findPrealloced();
- if( !p.empty() ) {
- try {
- {
- // JHeader::fileId must be updated before renaming to be race-safe
- LogFile f(p.string());
- JHeader h(p.string());
- AlignedBuilder b(8192);
- b.appendStruct(h);
- f.synchronousAppend(b.buf(), b.len());
- }
- boost::filesystem::rename(p, fname);
- }
- catch (const std::exception& e) {
- log() << "warning couldn't write to / rename file " << p.string()
- << ": " << e.what() << endl;
+void removeOldJournalFile(boost::filesystem::path p) {
+ if (usingPreallocate) {
+ try {
+ for (int i = 0; i < NUM_PREALLOC_FILES; i++) {
+ boost::filesystem::path filepath = preallocPath(i);
+ if (!boost::filesystem::exists(filepath)) {
+ // we can recycle this file into this prealloc file location
+ boost::filesystem::path temppath = filepath.string() + ".temp";
+ boost::filesystem::rename(p, temppath);
+ {
+ // zero the header
+ File f;
+ f.open(temppath.string().c_str(), false, false);
+ char buf[8192];
+ memset(buf, 0, 8192);
+ f.write(0, buf, 8192);
+ f.truncate(DataLimitPerJournalFile);
+ f.fsync();
}
+ boost::filesystem::rename(temppath, filepath);
+ return;
}
}
-
- _curLogFile = new LogFile(fname.string());
- _nextFileNumber++;
- {
- JHeader h(fname.string());
- _curFileId = h.fileId;
- verify(_curFileId);
- AlignedBuilder b(8192);
- b.appendStruct(h);
- _curLogFile->synchronousAppend(b.buf(), b.len());
- }
+ } catch (const std::exception& e) {
+ log() << "warning exception in dur::removeOldJournalFile " << p.string() << ": "
+ << e.what() << endl;
+ // fall through and try to delete the file
}
+ }
- void Journal::init() {
- verify( _curLogFile == 0 );
- MongoFile::notifyPreFlush = preFlush;
- MongoFile::notifyPostFlush = postFlush;
- }
+ // already have 3 prealloc files, so delete this file
+ try {
+ boost::filesystem::remove(p);
+ } catch (const std::exception& e) {
+ log() << "warning exception removing " << p.string() << ": " << e.what() << endl;
+ }
+}
- void Journal::open() {
- verify( MongoFile::notifyPreFlush == preFlush );
- stdx::lock_guard<SimpleMutex> lk(_curLogFileMutex);
- _open();
+// find a prealloc.<n> file, presumably to take and use
+boost::filesystem::path findPrealloced() {
+ try {
+ for (int i = 0; i < NUM_PREALLOC_FILES; i++) {
+ boost::filesystem::path filepath = preallocPath(i);
+ if (boost::filesystem::exists(filepath))
+ return filepath;
}
+ } catch (const std::exception& e) {
+ log() << "warning exception in dur::findPrealloced(): " << e.what() << endl;
+ }
+ return boost::filesystem::path();
+}
- void LSNFile::set(unsigned long long x) {
- memset(this, 0, sizeof(*this));
- lsn = x;
- checkbytes = ~x;
- }
+/** assure journal/ dir exists. throws. call during startup. */
+void journalMakeDir() {
+ j.init();
- /** logs details of the situation, and returns 0, if anything surprising in the LSNFile
- if something highly surprising, throws to abort
- */
- unsigned long long LSNFile::get() {
- uassert(13614, str::stream() << "unexpected version number of lsn file in journal/ directory got: " << ver , ver == 0);
- if( ~lsn != checkbytes ) {
- log() << "lsnfile not valid. recovery will be from log start. lsn: " << hex << lsn << " checkbytes: " << hex << checkbytes << endl;
- return 0;
- }
- return lsn;
+ boost::filesystem::path p = getJournalDir();
+ j.dir = p.string();
+ log() << "journal dir=" << j.dir << endl;
+ if (!boost::filesystem::exists(j.dir)) {
+ try {
+ boost::filesystem::create_directory(j.dir);
+ } catch (std::exception& e) {
+ log() << "error creating directory " << j.dir << ' ' << e.what() << endl;
+ throw;
}
+ }
+}
- /** called during recovery (the error message text below assumes that)
- */
- unsigned long long journalReadLSN() {
- if( !exists(lsnPath()) ) {
- log() << "info no lsn file in journal/ directory" << endl;
- return 0;
- }
+void Journal::_open() {
+ _curFileId = 0;
+ verify(_curLogFile == 0);
+ boost::filesystem::path fname = getFilePathFor(_nextFileNumber);
+ // if we have a prealloced file, use it
+ {
+ boost::filesystem::path p = findPrealloced();
+ if (!p.empty()) {
try {
- // os can flush as it likes. if it flushes slowly, we will just do extra work on recovery.
- // however, given we actually close the file when writing, that seems unlikely.
- LSNFile L;
- File f;
- f.open(lsnPath().string().c_str());
- verify(f.is_open());
- if( f.len() == 0 ) {
- // this could be 'normal' if we crashed at the right moment
- log() << "info lsn file is zero bytes long" << endl;
- return 0;
+ {
+ // JHeader::fileId must be updated before renaming to be race-safe
+ LogFile f(p.string());
+ JHeader h(p.string());
+ AlignedBuilder b(8192);
+ b.appendStruct(h);
+ f.synchronousAppend(b.buf(), b.len());
}
- f.read(0,(char*)&L, sizeof(L));
- unsigned long long lsn = L.get();
- return lsn;
- }
- catch(std::exception& e) {
- uasserted(13611, str::stream() << "can't read lsn file in journal directory : " << e.what());
+ boost::filesystem::rename(p, fname);
+ } catch (const std::exception& e) {
+ log() << "warning couldn't write to / rename file " << p.string() << ": "
+ << e.what() << endl;
}
- return 0;
}
+ }
- unsigned long long getLastDataFileFlushTime() {
- return j.lastFlushTime();
- }
+ _curLogFile = new LogFile(fname.string());
+ _nextFileNumber++;
+ {
+ JHeader h(fname.string());
+ _curFileId = h.fileId;
+ verify(_curFileId);
+ AlignedBuilder b(8192);
+ b.appendStruct(h);
+ _curLogFile->synchronousAppend(b.buf(), b.len());
+ }
+}
- /** remember "last sequence number" to speed recoveries
- concurrency: called by durThread only.
- */
- void Journal::updateLSNFile() {
- if( !_writeToLSNNeeded )
- return;
- _writeToLSNNeeded = false;
- try {
- // os can flush as it likes. if it flushes slowly, we will just do extra work on recovery.
- // however, given we actually close the file, that seems unlikely.
- File f;
- f.open(lsnPath().string().c_str());
- if( !f.is_open() ) {
- // can get 0 if an i/o error
- log() << "warning: open of lsn file failed" << endl;
- return;
- }
- LOG(1) << "lsn set " << _lastFlushTime << endl;
- LSNFile lsnf;
- lsnf.set(_lastFlushTime);
- f.write(0, (char*)&lsnf, sizeof(lsnf));
- // do we want to fsync here? if we do it probably needs to be async so the durthread
- // is not delayed.
- }
- catch(std::exception& e) {
- log() << "warning: write to lsn file failed " << e.what() << endl;
- // keep running (ignore the error). recovery will be slow.
- }
- }
+void Journal::init() {
+ verify(_curLogFile == 0);
+ MongoFile::notifyPreFlush = preFlush;
+ MongoFile::notifyPostFlush = postFlush;
+}
- void Journal::preFlush() {
- j._preFlushTime = Listener::getElapsedTimeMillis();
- }
+void Journal::open() {
+ verify(MongoFile::notifyPreFlush == preFlush);
+ stdx::lock_guard<SimpleMutex> lk(_curLogFileMutex);
+ _open();
+}
- void Journal::postFlush() {
- j._lastFlushTime = j._preFlushTime;
- j._writeToLSNNeeded = true;
- }
+void LSNFile::set(unsigned long long x) {
+ memset(this, 0, sizeof(*this));
+ lsn = x;
+ checkbytes = ~x;
+}
- // call from within _curLogFileMutex
- void Journal::closeCurrentJournalFile() {
- if (!_curLogFile)
- return;
+/** logs details of the situation and returns 0 if anything in the LSNFile is surprising;
+    if something is highly surprising, throws to abort
+*/
+unsigned long long LSNFile::get() {
+ uassert(
+ 13614,
+ str::stream() << "unexpected version number of lsn file in journal/ directory got: " << ver,
+ ver == 0);
+ if (~lsn != checkbytes) {
+ log() << "lsnfile not valid. recovery will be from log start. lsn: " << hex << lsn
+ << " checkbytes: " << hex << checkbytes << endl;
+ return 0;
+ }
+ return lsn;
+}
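
The lsn file's integrity check is a bitwise complement: set() stores checkbytes = ~lsn, and get() treats any mismatch as "recover from the start of the log" rather than an error, since a torn or partial write is very unlikely to satisfy the identity. A self-contained sketch of the pattern:

    #include <cassert>
    #include <cstring>

    struct ToyLSNFile {
        unsigned ver;
        unsigned long long lsn;
        unsigned long long checkbytes;

        void set(unsigned long long x) {
            std::memset(this, 0, sizeof(*this));
            lsn = x;
            checkbytes = ~x;
        }

        // 0 means "recover from log start" (the real code uasserts on a bad ver)
        unsigned long long get() const {
            if (ver != 0)
                return 0;
            if (~lsn != checkbytes)
                return 0;
            return lsn;
        }
    };

    int main() {
        ToyLSNFile f;
        f.set(12345);
        assert(f.get() == 12345);
        f.checkbytes ^= 1;     // simulate a torn or corrupt write
        assert(f.get() == 0);  // falls back to full recovery
        return 0;
    }
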
- JFile jf;
- jf.filename = _curLogFile->_name;
- jf.lastEventTimeMs = Listener::getElapsedTimeMillis();
- _oldJournalFiles.push_back(jf);
+/** called during recovery (the error message text below assumes that)
+*/
+unsigned long long journalReadLSN() {
+ if (!exists(lsnPath())) {
+ log() << "info no lsn file in journal/ directory" << endl;
+ return 0;
+ }
- delete _curLogFile; // close
- _curLogFile = 0;
- _written = 0;
+ try {
+ // os can flush as it likes. if it flushes slowly, we will just do extra work on recovery.
+ // however, given we actually close the file when writing, that seems unlikely.
+ LSNFile L;
+ File f;
+ f.open(lsnPath().string().c_str());
+ verify(f.is_open());
+ if (f.len() == 0) {
+ // this could be 'normal' if we crashed at the right moment
+ log() << "info lsn file is zero bytes long" << endl;
+ return 0;
}
+ f.read(0, (char*)&L, sizeof(L));
+ unsigned long long lsn = L.get();
+ return lsn;
+ } catch (std::exception& e) {
+ uasserted(13611,
+ str::stream() << "can't read lsn file in journal directory : " << e.what());
+ }
+ return 0;
+}
- /** remove older journal files.
- be in _curLogFileMutex but not dbMutex when calling
- */
- void Journal::removeUnneededJournalFiles() {
- while( !_oldJournalFiles.empty() ) {
- JFile f = _oldJournalFiles.front();
-
- if( f.lastEventTimeMs < _lastFlushTime + ExtraKeepTimeMs ) {
- // eligible for deletion
- boost::filesystem::path p( f.filename );
- log() << "old journal file will be removed: " << f.filename << endl;
- removeOldJournalFile(p);
- }
- else {
- break;
- }
+unsigned long long getLastDataFileFlushTime() {
+ return j.lastFlushTime();
+}
- _oldJournalFiles.pop_front();
- }
- }
+/** remember "last sequence number" to speed recoveries
+ concurrency: called by durThread only.
+*/
+void Journal::updateLSNFile() {
+ if (!_writeToLSNNeeded)
+ return;
+ _writeToLSNNeeded = false;
+ try {
+ // os can flush as it likes. if it flushes slowly, we will just do extra work on recovery.
+ // however, given we actually close the file, that seems unlikely.
+ File f;
+ f.open(lsnPath().string().c_str());
+ if (!f.is_open()) {
+ // can get 0 if an i/o error
+ log() << "warning: open of lsn file failed" << endl;
+ return;
+ }
+ LOG(1) << "lsn set " << _lastFlushTime << endl;
+ LSNFile lsnf;
+ lsnf.set(_lastFlushTime);
+ f.write(0, (char*)&lsnf, sizeof(lsnf));
+ // do we want to fsync here? if we do it probably needs to be async so the durthread
+ // is not delayed.
+ } catch (std::exception& e) {
+ log() << "warning: write to lsn file failed " << e.what() << endl;
+ // keep running (ignore the error). recovery will be slow.
+ }
+}
- void Journal::_rotate() {
+void Journal::preFlush() {
+ j._preFlushTime = Listener::getElapsedTimeMillis();
+}
- if ( inShutdown() || !_curLogFile )
- return;
+void Journal::postFlush() {
+ j._lastFlushTime = j._preFlushTime;
+ j._writeToLSNNeeded = true;
+}
- j.updateLSNFile();
+// call from within _curLogFileMutex
+void Journal::closeCurrentJournalFile() {
+ if (!_curLogFile)
+ return;
- if( _curLogFile && _written < DataLimitPerJournalFile )
- return;
+ JFile jf;
+ jf.filename = _curLogFile->_name;
+ jf.lastEventTimeMs = Listener::getElapsedTimeMillis();
+ _oldJournalFiles.push_back(jf);
- if( _curLogFile ) {
- _curLogFile->truncate();
- closeCurrentJournalFile();
- removeUnneededJournalFiles();
- }
+ delete _curLogFile; // close
+ _curLogFile = 0;
+ _written = 0;
+}
- try {
- Timer t;
- _open();
- int ms = t.millis();
- if( ms >= 200 ) {
- log() << "DR101 latency warning on journal file open " << ms << "ms" << endl;
- }
- }
- catch(std::exception& e) {
- log() << "warning exception opening journal file " << e.what() << endl;
- throw;
- }
- }
+/** remove older journal files.
+ be in _curLogFileMutex but not dbMutex when calling
+*/
+void Journal::removeUnneededJournalFiles() {
+ while (!_oldJournalFiles.empty()) {
+ JFile f = _oldJournalFiles.front();
- /** write (append) the buffer we have built to the journal and fsync it.
- outside of dbMutex lock as this could be slow.
- @param uncompressed - a buffer that will be written to the journal after compression
- will not return until on disk
- */
- void WRITETOJOURNAL(const JSectHeader& h, const AlignedBuilder& uncompressed) {
- Timer t;
- j.journal(h, uncompressed);
- stats.curr()->_writeToJournalMicros += t.micros();
+ if (f.lastEventTimeMs < _lastFlushTime + ExtraKeepTimeMs) {
+ // eligible for deletion
+ boost::filesystem::path p(f.filename);
+ log() << "old journal file will be removed: " << f.filename << endl;
+ removeOldJournalFile(p);
+ } else {
+ break;
}
- void Journal::journal(const JSectHeader& h, const AlignedBuilder& uncompressed) {
- static AlignedBuilder b(32*1024*1024);
- /* buffer to journal will be
- JSectHeader
- compressed operations
- JSectFooter
- */
- const unsigned headTailSize = sizeof(JSectHeader) + sizeof(JSectFooter);
- const unsigned max = maxCompressedLength(uncompressed.len()) + headTailSize;
- b.reset(max);
-
- {
- dassert( h.sectionLen() == (unsigned) 0xffffffff ); // we will backfill later
- b.appendStruct(h);
- }
+ _oldJournalFiles.pop_front();
+ }
+}
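
Closed journal files are retired through a time-gated FIFO: each is queued with the timestamp at which it was closed, and the loop above walks from the front, deleting files whose timestamp clears the comparison against the last data-file flush (ExtraKeepTimeMs exists because a disk controller may buffer writes). A sketch mirroring that comparison, with illustrative names:

    #include <cstdint>
    #include <deque>
    #include <iostream>
    #include <string>

    const long long ExtraKeepTimeMs = 10000;  // from dur_journal.h

    struct JFileEntry {
        std::string filename;
        uint64_t lastEventTimeMs;
    };

    // Files are queued in close order, so once the front fails the test,
    // everything behind it is newer and the sweep can stop.
    void removeUnneeded(std::deque<JFileEntry>& oldFiles, uint64_t lastFlushTimeMs) {
        while (!oldFiles.empty()) {
            const JFileEntry& f = oldFiles.front();
            if (f.lastEventTimeMs < lastFlushTimeMs + ExtraKeepTimeMs) {
                // eligible for deletion (same test as removeUnneededJournalFiles)
                std::cout << "old journal file will be removed: " << f.filename << '\n';
            } else {
                break;
            }
            oldFiles.pop_front();  // the real code deletes or recycles the file
        }
    }

    int main() {
        std::deque<JFileEntry> q = {{"j._0", 1000}, {"j._1", 115000}};
        removeUnneeded(q, 100000);  // j._0 goes, j._1 stays
        std::cout << q.size() << " file(s) retained\n";  // prints 1
        return 0;
    }
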
- size_t compressedLength = 0;
- rawCompress(uncompressed.buf(), uncompressed.len(), b.cur(), &compressedLength);
- verify( compressedLength < 0xffffffff );
- verify( compressedLength < max );
- b.skip(compressedLength);
-
- // footer
- unsigned L = 0xffffffff;
- {
- // pad to alignment, and set the total section length in the JSectHeader
- verify( 0xffffe000 == (~(Alignment-1)) );
- unsigned lenUnpadded = b.len() + sizeof(JSectFooter);
- L = (lenUnpadded + Alignment-1) & (~(Alignment-1));
- dassert( L >= lenUnpadded );
-
- ((JSectHeader*)b.atOfs(0))->setSectionLen(lenUnpadded);
-
- JSectFooter f(b.buf(), b.len()); // computes checksum
- b.appendStruct(f);
- dassert( b.len() == lenUnpadded );
-
- b.skip(L - lenUnpadded);
- dassert( b.len() % Alignment == 0 );
- }
+void Journal::_rotate() {
+ if (inShutdown() || !_curLogFile)
+ return;
- try {
- stdx::lock_guard<SimpleMutex> lk(_curLogFileMutex);
-
- // must already be open -- so that _curFileId is correct for previous buffer building
- verify( _curLogFile );
-
- stats.curr()->_uncompressedBytes += uncompressed.len();
- unsigned w = b.len();
- _written += w;
- verify( w <= L );
- stats.curr()->_journaledBytes += L;
- _curLogFile->synchronousAppend((const void *) b.buf(), L);
- _rotate();
- }
- catch(std::exception& e) {
- log() << "error exception in dur::journal " << e.what() << endl;
- throw;
- }
+ j.updateLSNFile();
+
+ if (_curLogFile && _written < DataLimitPerJournalFile)
+ return;
+
+ if (_curLogFile) {
+ _curLogFile->truncate();
+ closeCurrentJournalFile();
+ removeUnneededJournalFiles();
+ }
+
+ try {
+ Timer t;
+ _open();
+ int ms = t.millis();
+ if (ms >= 200) {
+ log() << "DR101 latency warning on journal file open " << ms << "ms" << endl;
}
+ } catch (std::exception& e) {
+ log() << "warning exception opening journal file " << e.what() << endl;
+ throw;
+ }
+}
+
+/** write (append) the buffer we have built to the journal and fsync it.
+    called outside of the dbMutex lock, as this could be slow.
+    @param uncompressed - a buffer that will be written to the journal after compression
+    does not return until the data is on disk
+*/
+void WRITETOJOURNAL(const JSectHeader& h, const AlignedBuilder& uncompressed) {
+ Timer t;
+ j.journal(h, uncompressed);
+ stats.curr()->_writeToJournalMicros += t.micros();
+}
+void Journal::journal(const JSectHeader& h, const AlignedBuilder& uncompressed) {
+ static AlignedBuilder b(32 * 1024 * 1024);
+ /* buffer to journal will be
+ JSectHeader
+ compressed operations
+ JSectFooter
+ */
+ const unsigned headTailSize = sizeof(JSectHeader) + sizeof(JSectFooter);
+ const unsigned max = maxCompressedLength(uncompressed.len()) + headTailSize;
+ b.reset(max);
+
+ {
+ dassert(h.sectionLen() == (unsigned)0xffffffff); // we will backfill later
+ b.appendStruct(h);
}
+
+ size_t compressedLength = 0;
+ rawCompress(uncompressed.buf(), uncompressed.len(), b.cur(), &compressedLength);
+ verify(compressedLength < 0xffffffff);
+ verify(compressedLength < max);
+ b.skip(compressedLength);
+
+ // footer
+ unsigned L = 0xffffffff;
+ {
+ // pad to alignment, and set the total section length in the JSectHeader
+ verify(0xffffe000 == (~(Alignment - 1)));
+ unsigned lenUnpadded = b.len() + sizeof(JSectFooter);
+ L = (lenUnpadded + Alignment - 1) & (~(Alignment - 1));
+ dassert(L >= lenUnpadded);
+
+ ((JSectHeader*)b.atOfs(0))->setSectionLen(lenUnpadded);
+
+ JSectFooter f(b.buf(), b.len()); // computes checksum
+ b.appendStruct(f);
+ dassert(b.len() == lenUnpadded);
+
+ b.skip(L - lenUnpadded);
+ dassert(b.len() % Alignment == 0);
+ }
+
+ try {
+ stdx::lock_guard<SimpleMutex> lk(_curLogFileMutex);
+
+ // must already be open -- so that _curFileId is correct for previous buffer building
+ verify(_curLogFile);
+
+ stats.curr()->_uncompressedBytes += uncompressed.len();
+ unsigned w = b.len();
+ _written += w;
+ verify(w <= L);
+ stats.curr()->_journaledBytes += L;
+ _curLogFile->synchronousAppend((const void*)b.buf(), L);
+ _rotate();
+ } catch (std::exception& e) {
+ log() << "error exception in dur::journal " << e.what() << endl;
+ throw;
+ }
+}
+}
}
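
The padding step in Journal::journal() is the standard power-of-two round-up, and the verify that 0xffffe000 == ~(Alignment - 1) pins Alignment at 0x2000 (8192) under 32-bit unsigned arithmetic: L = (len + Alignment - 1) & ~(Alignment - 1) rounds the section length up to the next 8KB boundary. In isolation:

    #include <cassert>

    const unsigned Alignment = 8192;  // implied by verify(0xffffe000 == ~(Alignment - 1))

    // Round len up to the next multiple of a power-of-two alignment.
    unsigned roundUp(unsigned len) {
        return (len + Alignment - 1) & ~(Alignment - 1);
    }

    int main() {
        static_assert((Alignment & (Alignment - 1)) == 0, "power of two required");
        assert(~(Alignment - 1) == 0xffffe000u);  // the same check the journal makes
        assert(roundUp(1) == 8192);
        assert(roundUp(8192) == 8192);  // already aligned: unchanged
        assert(roundUp(8193) == 16384);
        // a section of 3 full blocks plus a 20-byte header and 32-byte footer
        // (the sizes pinned by the BOOST_STATIC_ASSERTs above) pads to 4 blocks
        assert(roundUp(8192 * 3 + 20 + 32) == 8192 * 4);
        return 0;
    }
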
diff --git a/src/mongo/db/storage/mmap_v1/dur_journal.h b/src/mongo/db/storage/mmap_v1/dur_journal.h
index 469732c59a2..07def586090 100644
--- a/src/mongo/db/storage/mmap_v1/dur_journal.h
+++ b/src/mongo/db/storage/mmap_v1/dur_journal.h
@@ -32,59 +32,58 @@
namespace mongo {
- class AlignedBuilder;
- class JSectHeader;
+class AlignedBuilder;
+class JSectHeader;
- namespace dur {
+namespace dur {
- /** true if ok to cleanup journal files at termination. otherwise, files journal will be retained.
- */
- extern bool okToCleanUp;
-
- /** at termination after db files closed & fsynced
- also after recovery
- closes and removes journal files
- @param log report in log that we are cleaning up if we actually do any work
- */
- void journalCleanup(bool log = false);
+/** true if ok to clean up journal files at termination. otherwise, journal files will be retained.
+*/
+extern bool okToCleanUp;
- /** assure journal/ dir exists. throws */
- void journalMakeDir();
+/** at termination after db files closed & fsynced
+ also after recovery
+ closes and removes journal files
+ @param log report in log that we are cleaning up if we actually do any work
+*/
+void journalCleanup(bool log = false);
- /** check if time to rotate files; assure a file is open.
- done separately from the journal() call as we can do this part
- outside of lock.
- only called by durThread.
- */
- void journalRotate();
+/** assure journal/ dir exists. throws */
+void journalMakeDir();
- /** flag that something has gone wrong during writing to the journal
- (not for recovery mode)
- */
- void journalingFailure(const char *msg);
+/** check if time to rotate files; assure a file is open.
+ done separately from the journal() call as we can do this part
+ outside of lock.
+ only called by durThread.
+ */
+void journalRotate();
- /** read lsn from disk from the last run before doing recovery */
- unsigned long long journalReadLSN();
+/** flag that something has gone wrong during writing to the journal
+ (not for recovery mode)
+*/
+void journalingFailure(const char* msg);
- unsigned long long getLastDataFileFlushTime();
+/** read lsn from disk from the last run before doing recovery */
+unsigned long long journalReadLSN();
- /** never throws.
- @param anyFiles by default we only look at j._* files. If anyFiles is true, return true
- if there are any files in the journal directory. acquirePathLock() uses this to
- make sure that the journal directory is mounted.
- @return true if there are any journal files in the journal dir.
- */
- bool haveJournalFiles(bool anyFiles=false);
+unsigned long long getLastDataFileFlushTime();
- /**
- * Writes the specified uncompressed buffer to the journal.
- */
- void WRITETOJOURNAL(const JSectHeader& h, const AlignedBuilder& uncompressed);
+/** never throws.
+ @param anyFiles by default we only look at j._* files. If anyFiles is true, return true
+ if there are any files in the journal directory. acquirePathLock() uses this to
+ make sure that the journal directory is mounted.
+ @return true if there are any journal files in the journal dir.
+*/
+bool haveJournalFiles(bool anyFiles = false);
- // in case disk controller buffers writes
- const long long ExtraKeepTimeMs = 10000;
+/**
+ * Writes the specified uncompressed buffer to the journal.
+ */
+void WRITETOJOURNAL(const JSectHeader& h, const AlignedBuilder& uncompressed);
- const unsigned JournalCommitIntervalDefault = 100;
+// in case disk controller buffers writes
+const long long ExtraKeepTimeMs = 10000;
- }
+const unsigned JournalCommitIntervalDefault = 100;
+}
}
diff --git a/src/mongo/db/storage/mmap_v1/dur_journal_writer.cpp b/src/mongo/db/storage/mmap_v1/dur_journal_writer.cpp
index 4c6eb8ec8cc..971f2aa0e60 100644
--- a/src/mongo/db/storage/mmap_v1/dur_journal_writer.cpp
+++ b/src/mongo/db/storage/mmap_v1/dur_journal_writer.cpp
@@ -47,268 +47,251 @@ namespace dur {
namespace {
- /**
- * Apply the writes back to the non-private MMF after they are for certain in the journal.
- *
- * (1) TODO we don't need to write back everything every group commit. We MUST write back that
- * which is going to be a remapped on its private view - but that might not be all views.
- *
- * (2) TODO should we do this using N threads? Would be quite easy see Hackenberg paper table
- * 5 and 6. 2 threads might be a good balance.
- */
- void WRITETODATAFILES(const JSectHeader& h, const AlignedBuilder& uncompressed) {
- Timer t;
-
- LOG(4) << "WRITETODATAFILES BEGIN";
-
- RecoveryJob::get().processSection(&h, uncompressed.buf(), uncompressed.len(), NULL);
-
- const long long m = t.micros();
- stats.curr()->_writeToDataFilesMicros += m;
-
- LOG(4) << "journal WRITETODATAFILES " << m / 1000.0 << "ms";
- }
-
-} // namespace
-
+/**
+ * Apply the writes back to the non-private MMF after they are for certain in the journal.
+ *
+ * (1) TODO we don't need to write back everything every group commit. We MUST write back that
+ * which is going to be remapped on its private view - but that might not be all views.
+ *
+ * (2) TODO should we do this using N threads? Would be quite easy; see the Hackenberg paper,
+ * tables 5 and 6. 2 threads might be a good balance.
+ */
+void WRITETODATAFILES(const JSectHeader& h, const AlignedBuilder& uncompressed) {
+ Timer t;
- /**
- * Used inside the journal writer thread to ensure that used buffers are cleaned up properly.
- */
- class BufferGuard {
- MONGO_DISALLOW_COPYING(BufferGuard);
- public:
- BufferGuard(JournalWriter::Buffer* buffer, JournalWriter::BufferQueue* bufferQueue)
- : _buffer(buffer),
- _bufferQueue(bufferQueue) {
+ LOG(4) << "WRITETODATAFILES BEGIN";
- }
+ RecoveryJob::get().processSection(&h, uncompressed.buf(), uncompressed.len(), NULL);
- ~BufferGuard() {
- // This buffer is done. Reset and remove it from the journal queue and put it on
- // the ready queue.
- _buffer->_reset();
+ const long long m = t.micros();
+ stats.curr()->_writeToDataFilesMicros += m;
- // This should never block. Otherwise we will stall the journaling pipeline
- // permanently and cause deadlock.
- invariant(_bufferQueue->count() < _bufferQueue->maxSize());
- _bufferQueue->push(_buffer);
- }
+ LOG(4) << "journal WRITETODATAFILES " << m / 1000.0 << "ms";
+}
- private:
- // Buffer that this scoped object is managing. Owned until destruction time. Then, the
- // bufferQueue owns it.
- JournalWriter::Buffer* const _buffer;
+} // namespace
- // Queue where the buffer should be returned to at destruction time. Not owned.
- JournalWriter::BufferQueue* const _bufferQueue;
- };
+/**
+ * Used inside the journal writer thread to ensure that used buffers are cleaned up properly.
+ */
+class BufferGuard {
+ MONGO_DISALLOW_COPYING(BufferGuard);
+
+public:
+ BufferGuard(JournalWriter::Buffer* buffer, JournalWriter::BufferQueue* bufferQueue)
+ : _buffer(buffer), _bufferQueue(bufferQueue) {}
+
+ ~BufferGuard() {
+ // This buffer is done. Reset and remove it from the journal queue and put it on
+ // the ready queue.
+ _buffer->_reset();
+
+ // This should never block. Otherwise we will stall the journaling pipeline
+ // permanently and cause deadlock.
+ invariant(_bufferQueue->count() < _bufferQueue->maxSize());
+ _bufferQueue->push(_buffer);
+ }
- //
- // JournalWriter
- //
+private:
+ // Buffer that this scoped object is managing. Owned until destruction time. Then, the
+ // bufferQueue owns it.
+ JournalWriter::Buffer* const _buffer;
+
+ // Queue where the buffer should be returned to at destruction time. Not owned.
+ JournalWriter::BufferQueue* const _bufferQueue;
+};
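
BufferGuard is plain RAII: however the writer loop exits its scope, the buffer is reset and pushed back on the ready queue. A generic miniature of the pattern (the queue type here is a toy stand-in, not the real BufferQueue):

    #include <cassert>
    #include <queue>

    struct ToyBuffer {
        bool dirty = false;
        void reset() {
            dirty = false;
        }
    };

    using ToyQueue = std::queue<ToyBuffer*>;

    // Owns the buffer for one scope, then resets it and returns it to the
    // ready queue, even on early return or exception.
    class ReturnToQueueGuard {
        ReturnToQueueGuard(const ReturnToQueueGuard&) = delete;
        ReturnToQueueGuard& operator=(const ReturnToQueueGuard&) = delete;

    public:
        ReturnToQueueGuard(ToyBuffer* buffer, ToyQueue* readyQueue)
            : _buffer(buffer), _readyQueue(readyQueue) {}

        ~ReturnToQueueGuard() {
            _buffer->reset();
            _readyQueue->push(_buffer);
        }

    private:
        ToyBuffer* const _buffer;
        ToyQueue* const _readyQueue;
    };

    int main() {
        ToyQueue ready;
        ToyBuffer b;
        {
            ReturnToQueueGuard guard(&b, &ready);
            b.dirty = true;  // ... the journal writer would use the buffer here ...
        }                    // guard destructor runs
        assert(ready.size() == 1 && !b.dirty);
        return 0;
    }
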
+
+
+//
+// JournalWriter
+//
+
+JournalWriter::JournalWriter(NotifyAll* commitNotify,
+ NotifyAll* applyToDataFilesNotify,
+ size_t numBuffers)
+ : _commitNotify(commitNotify),
+ _applyToDataFilesNotify(applyToDataFilesNotify),
+ _shutdownRequested(false),
+ _journalQueue(numBuffers),
+ _lastCommitNumber(0),
+ _readyQueue(numBuffers) {
+ invariant(_journalQueue.maxSize() == _readyQueue.maxSize());
+}
+
+JournalWriter::~JournalWriter() {
+ // Never close the journal writer with outstanding or unaccounted writes
+ invariant(_journalQueue.empty());
+ invariant(_readyQueue.empty());
+}
+
+void JournalWriter::start() {
+ // Do not allow reuse
+ invariant(!_shutdownRequested);
+
+ // Pre-allocate the journal buffers and push them on the ready queue
+ for (size_t i = 0; i < _readyQueue.maxSize(); i++) {
+ _readyQueue.push(new Buffer(InitialBufferSizeBytes));
+ }
- JournalWriter::JournalWriter(NotifyAll* commitNotify,
- NotifyAll* applyToDataFilesNotify,
- size_t numBuffers)
- : _commitNotify(commitNotify),
- _applyToDataFilesNotify(applyToDataFilesNotify),
- _shutdownRequested(false),
- _journalQueue(numBuffers),
- _lastCommitNumber(0),
- _readyQueue(numBuffers) {
+ // Start the thread
+ stdx::thread t(stdx::bind(&JournalWriter::_journalWriterThread, this));
+ _journalWriterThreadHandle.swap(t);
+}
- invariant(_journalQueue.maxSize() == _readyQueue.maxSize());
- }
+void JournalWriter::shutdown() {
+ // There is no reason to call shutdown multiple times
+ invariant(!_shutdownRequested);
+ _shutdownRequested = true;
- JournalWriter::~JournalWriter() {
- // Never close the journal writer with outstanding or unaccounted writes
- invariant(_journalQueue.empty());
- invariant(_readyQueue.empty());
- }
+ // Never terminate the journal writer with outstanding or unaccounted writes
+ assertIdle();
- void JournalWriter::start() {
- // Do not allow reuse
- invariant(!_shutdownRequested);
+ Buffer* const shutdownBuffer = newBuffer();
+ shutdownBuffer->_setShutdown();
- // Pre-allocate the journal buffers and push them on the ready queue
- for (size_t i = 0; i < _readyQueue.maxSize(); i++) {
- _readyQueue.push(new Buffer(InitialBufferSizeBytes));
- }
+ // This will terminate the journal thread. No need to specify commit number, since we are
+ // shutting down and nothing will be notified anyways.
+ writeBuffer(shutdownBuffer, 0);
- // Start the thread
- stdx::thread t(stdx::bind(&JournalWriter::_journalWriterThread, this));
- _journalWriterThreadHandle.swap(t);
- }
+    // Ensure the journal thread has stopped and that everything is accounted for.
+ _journalWriterThreadHandle.join();
+ assertIdle();
- void JournalWriter::shutdown() {
- // There is no reason to call shutdown multiple times
- invariant(!_shutdownRequested);
- _shutdownRequested = true;
-
- // Never terminate the journal writer with outstanding or unaccounted writes
- assertIdle();
-
- Buffer* const shutdownBuffer = newBuffer();
- shutdownBuffer->_setShutdown();
-
- // This will terminate the journal thread. No need to specify commit number, since we are
- // shutting down and nothing will be notified anyways.
- writeBuffer(shutdownBuffer, 0);
-
- // Ensure the journal thread has stopped and everything accounted for.
- _journalWriterThreadHandle.join();
- assertIdle();
-
- // Delete the buffers (this deallocates the journal buffer memory)
- while (!_readyQueue.empty()) {
- Buffer* const buffer = _readyQueue.blockingPop();
- delete buffer;
- }
+ // Delete the buffers (this deallocates the journal buffer memory)
+ while (!_readyQueue.empty()) {
+ Buffer* const buffer = _readyQueue.blockingPop();
+ delete buffer;
}
+}
- void JournalWriter::assertIdle() {
- // All buffers are in the ready queue means there is nothing pending.
- invariant(_journalQueue.empty());
- invariant(_readyQueue.count() == _readyQueue.maxSize());
- }
+void JournalWriter::assertIdle() {
+    // All buffers being in the ready queue means there is nothing pending.
+ invariant(_journalQueue.empty());
+ invariant(_readyQueue.count() == _readyQueue.maxSize());
+}
- JournalWriter::Buffer* JournalWriter::newBuffer() {
- Buffer* const buffer = _readyQueue.blockingPop();
- buffer->_assertEmpty();
+JournalWriter::Buffer* JournalWriter::newBuffer() {
+ Buffer* const buffer = _readyQueue.blockingPop();
+ buffer->_assertEmpty();
- return buffer;
- }
+ return buffer;
+}
- void JournalWriter::writeBuffer(Buffer* buffer, NotifyAll::When commitNumber) {
- invariant(buffer->_commitNumber == 0);
- invariant((commitNumber > _lastCommitNumber) ||
- (buffer->_isShutdown && (commitNumber == 0)));
+void JournalWriter::writeBuffer(Buffer* buffer, NotifyAll::When commitNumber) {
+ invariant(buffer->_commitNumber == 0);
+ invariant((commitNumber > _lastCommitNumber) || (buffer->_isShutdown && (commitNumber == 0)));
- buffer->_commitNumber = commitNumber;
+ buffer->_commitNumber = commitNumber;
- _journalQueue.push(buffer);
- }
+ _journalQueue.push(buffer);
+}
- void JournalWriter::flush() {
- std::vector<Buffer*> buffers;
+void JournalWriter::flush() {
+ std::vector<Buffer*> buffers;
- // Pop the expected number of buffers from the ready queue. This will block until all
- // in-progress buffers have completed.
- for (size_t i = 0; i < _readyQueue.maxSize(); i++) {
- buffers.push_back(_readyQueue.blockingPop());
- }
+ // Pop the expected number of buffers from the ready queue. This will block until all
+ // in-progress buffers have completed.
+ for (size_t i = 0; i < _readyQueue.maxSize(); i++) {
+ buffers.push_back(_readyQueue.blockingPop());
+ }
- // Put them back in to restore the original state.
- for (size_t i = 0; i < buffers.size(); i++) {
- _readyQueue.push(buffers[i]);
- }
+ // Put them back in to restore the original state.
+ for (size_t i = 0; i < buffers.size(); i++) {
+ _readyQueue.push(buffers[i]);
}
+}
- void JournalWriter::_journalWriterThread() {
- Client::initThread("journal writer");
+void JournalWriter::_journalWriterThread() {
+ Client::initThread("journal writer");
- log() << "Journal writer thread started";
+ log() << "Journal writer thread started";
- try {
- while (true) {
- Buffer* const buffer = _journalQueue.blockingPop();
- BufferGuard bufferGuard(buffer, &_readyQueue);
+ try {
+ while (true) {
+ Buffer* const buffer = _journalQueue.blockingPop();
+ BufferGuard bufferGuard(buffer, &_readyQueue);
- if (buffer->_isShutdown) {
- invariant(buffer->_builder.len() == 0);
+ if (buffer->_isShutdown) {
+ invariant(buffer->_builder.len() == 0);
- // The journal writer thread is terminating. Nothing to notify or write.
- break;
- }
+ // The journal writer thread is terminating. Nothing to notify or write.
+ break;
+ }
- if (buffer->_isNoop) {
- invariant(buffer->_builder.len() == 0);
+ if (buffer->_isNoop) {
+ invariant(buffer->_builder.len() == 0);
- // There's nothing to be writen, but we still need to notify this commit number
- _commitNotify->notifyAll(buffer->_commitNumber);
- _applyToDataFilesNotify->notifyAll(buffer->_commitNumber);
- continue;
- }
+                // There's nothing to be written, but we still need to notify this commit number
+ _commitNotify->notifyAll(buffer->_commitNumber);
+ _applyToDataFilesNotify->notifyAll(buffer->_commitNumber);
+ continue;
+ }
- LOG(4) << "Journaling commit number " << buffer->_commitNumber
- << " (journal file " << buffer->_header.fileId
- << ", sequence " << buffer->_header.seqNumber
- << ", size " << buffer->_builder.len() << " bytes)";
+ LOG(4) << "Journaling commit number " << buffer->_commitNumber << " (journal file "
+ << buffer->_header.fileId << ", sequence " << buffer->_header.seqNumber
+ << ", size " << buffer->_builder.len() << " bytes)";
- // This performs synchronous I/O to the journal file and will block.
- WRITETOJOURNAL(buffer->_header, buffer->_builder);
+ // This performs synchronous I/O to the journal file and will block.
+ WRITETOJOURNAL(buffer->_header, buffer->_builder);
- // Data is now persisted in the journal, which is sufficient for acknowledging
- // getLastError
- _commitNotify->notifyAll(buffer->_commitNumber);
+ // Data is now persisted in the journal, which is sufficient for acknowledging
+ // getLastError
+ _commitNotify->notifyAll(buffer->_commitNumber);
- // Apply the journal entries on top of the shared view so that when flush is
- // requested it would write the latest.
- WRITETODATAFILES(buffer->_header, buffer->_builder);
+            // Apply the journal entries on top of the shared view so that when a flush is
+            // requested it writes the latest data.
+ WRITETODATAFILES(buffer->_header, buffer->_builder);
- // Data is now persisted on the shared view, so notify any potential journal file
- // cleanup waiters.
- _applyToDataFilesNotify->notifyAll(buffer->_commitNumber);
- }
+ // Data is now persisted on the shared view, so notify any potential journal file
+ // cleanup waiters.
+ _applyToDataFilesNotify->notifyAll(buffer->_commitNumber);
}
- catch (const DBException& e) {
- severe() << "dbexception in journalWriterThread causing immediate shutdown: "
- << e.toString();
- invariant(false);
- }
- catch (const std::ios_base::failure& e) {
- severe() << "ios_base exception in journalWriterThread causing immediate shutdown: "
- << e.what();
- invariant(false);
- }
- catch (const std::bad_alloc& e) {
- severe() << "bad_alloc exception in journalWriterThread causing immediate shutdown: "
- << e.what();
- invariant(false);
- }
- catch (const std::exception& e) {
- severe() << "exception in journalWriterThread causing immediate shutdown: "
- << e.what();
- invariant(false);
- }
- catch (...) {
- severe() << "unhandled exception in journalWriterThread causing immediate shutdown";
- invariant(false);
- }
-
- log() << "Journal writer thread stopped";
+ } catch (const DBException& e) {
+ severe() << "dbexception in journalWriterThread causing immediate shutdown: "
+ << e.toString();
+ invariant(false);
+ } catch (const std::ios_base::failure& e) {
+ severe() << "ios_base exception in journalWriterThread causing immediate shutdown: "
+ << e.what();
+ invariant(false);
+ } catch (const std::bad_alloc& e) {
+ severe() << "bad_alloc exception in journalWriterThread causing immediate shutdown: "
+ << e.what();
+ invariant(false);
+ } catch (const std::exception& e) {
+ severe() << "exception in journalWriterThread causing immediate shutdown: " << e.what();
+ invariant(false);
+ } catch (...) {
+ severe() << "unhandled exception in journalWriterThread causing immediate shutdown";
+ invariant(false);
}
+ log() << "Journal writer thread stopped";
+}
- //
- // Buffer
- //
-
- JournalWriter::Buffer::Buffer(size_t initialSize)
- : _commitNumber(0),
- _isNoop(false),
- _isShutdown(false),
- _header(),
- _builder(initialSize) {
- }
+//
+// Buffer
+//
- JournalWriter::Buffer::~Buffer() {
- _assertEmpty();
- }
+JournalWriter::Buffer::Buffer(size_t initialSize)
+ : _commitNumber(0), _isNoop(false), _isShutdown(false), _header(), _builder(initialSize) {}
- void JournalWriter::Buffer::_assertEmpty() {
- invariant(_commitNumber == 0);
- invariant(_builder.len() == 0);
- }
+JournalWriter::Buffer::~Buffer() {
+ _assertEmpty();
+}
- void JournalWriter::Buffer::_reset() {
- _commitNumber = 0;
- _isNoop = false;
- _builder.reset();
- }
+void JournalWriter::Buffer::_assertEmpty() {
+ invariant(_commitNumber == 0);
+ invariant(_builder.len() == 0);
+}
+
+void JournalWriter::Buffer::_reset() {
+ _commitNumber = 0;
+ _isNoop = false;
+ _builder.reset();
+}
-} // namespace dur
-} // namespace mongo
+} // namespace dur
+} // namespace mongo
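
For readers tracing the pipeline above, the producer side uses the same API: take a free
buffer, fill in the section header and payload, then hand the buffer back together with its
commit number. A minimal sketch follows; the driver function is hypothetical and not part of
this commit, but it uses only calls declared in dur_journal_writer.h (newBuffer, getHeader,
getBuilder, writeBuffer).

    // Hypothetical producer sketch -- not part of this commit.
    #include "mongo/db/storage/mmap_v1/dur_journal_writer.h"

    namespace mongo {
    namespace dur {

    void exampleGroupCommit(JournalWriter* writer, NotifyAll::When commitNumber) {
        // Blocks if all numBuffers buffers are still in flight.
        JournalWriter::Buffer* const buffer = writer->newBuffer();

        // In the real pipeline PREPLOGBUFFER populates these; the values here
        // are illustrative only.
        buffer->getHeader().seqNumber = 0;
        buffer->getBuilder().appendStr("payload");

        // Ownership passes to the journal writer thread. Per the invariant in
        // writeBuffer, commitNumber must exceed any previously passed value.
        // commitNotify fires once the buffer is durable in the journal;
        // applyToDataFilesNotify fires after the entries reach the shared view.
        writer->writeBuffer(buffer, commitNumber);
    }

    }  // namespace dur
    }  // namespace mongo
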
diff --git a/src/mongo/db/storage/mmap_v1/dur_journal_writer.h b/src/mongo/db/storage/mmap_v1/dur_journal_writer.h
index 6ac91de6532..2f738cbb380 100644
--- a/src/mongo/db/storage/mmap_v1/dur_journal_writer.h
+++ b/src/mongo/db/storage/mmap_v1/dur_journal_writer.h
@@ -38,150 +38,158 @@
namespace mongo {
namespace dur {
+/**
+ * Manages the thread and queues used for writing the journal to disk and for notifying
+ * parties that are waiting on the write concern.
+ *
+ * NOTE: Not thread-safe and must not be used from more than one thread.
+ */
+class JournalWriter {
+ MONGO_DISALLOW_COPYING(JournalWriter);
+
+public:
/**
- * Manages the thread and queues used for writing the journal to disk and notify parties with
- * are waiting on the write concern.
- *
- * NOTE: Not thread-safe and must not be used from more than one thread.
+ * Stores the memory and the header for a complete journal buffer which is pending to be
+ * written by the journal writer thread.
*/
- class JournalWriter {
- MONGO_DISALLOW_COPYING(JournalWriter);
+ class Buffer {
public:
+ Buffer(size_t initialSize);
+ ~Buffer();
+
+ JSectHeader& getHeader() {
+ return _header;
+ }
+ AlignedBuilder& getBuilder() {
+ return _builder;
+ }
- /**
- * Stores the memory and the header for a complete journal buffer which is pending to be
- * written by the journal writer thread.
- */
- class Buffer {
- public:
- Buffer(size_t initialSize);
- ~Buffer();
-
- JSectHeader& getHeader() { return _header; }
- AlignedBuilder& getBuilder() { return _builder; }
-
- void setNoop() { _isNoop = true; }
-
- private:
- friend class BufferGuard;
- friend class JournalWriter;
-
-
- void _assertEmpty();
- void _reset();
- void _setShutdown() { _isShutdown = true; }
-
- // Specifies the commit number which flushing this buffer would notify. This value is
- // zero, if there is no data to be flushed or if the buffer is noop/shutdown.
- NotifyAll::When _commitNumber;
-
- // Special buffer that's posted when there is nothing to be written to the journal,
- // but we want to order a notification so it happens after all other writes have
- // completed.
- bool _isNoop;
-
- // Special buffer that's posted when the receiving thread must terminate. This should
- // be the last entry posted to the queue and the commit number should be zero.
- bool _isShutdown;
-
- JSectHeader _header;
- AlignedBuilder _builder;
- };
-
-
- /**
- * Initializes the journal writer.
- *
- * @param commitNotify Notification object to be called after journal entries have been
- * written to disk. The caller retains ownership and the notify object must outlive
- * the journal writer object.
- * @param applyToDataFilesNotify Notification object to be called after journal entries
- * have been applied to the shared view. This means that if the shared view were to be
- * flushed at this point, the journal files before this point are not necessary. The
- * caller retains ownership and the notify object must outlive the journal writer
- * object.
- * @param numBuffers How many buffers to create to hold outstanding writes. If there are
- * more than this number of journal writes that have not completed, the write calls
- * will block.
- */
- JournalWriter(NotifyAll* commitNotify, NotifyAll* applyToDataFilesNotify, size_t numBuffers);
- ~JournalWriter();
-
- /**
- * Allocates buffer memory and starts the journal writer thread.
- */
- void start();
-
- /**
- * Terminates the journal writer thread and frees memory for the buffers. Must not be
- * called if there are any pending journal writes.
- */
- void shutdown();
-
- /**
- * Asserts that there are no pending journal writes.
- */
- void assertIdle();
-
- /**
- * Obtains a new empty buffer into which a journal entry should be written.
- *
- * This method may block if there are no free buffers.
- *
- * The caller does not own the buffer and needs to "return" it to the writer by calling
- * writeBuffer. Buffers with data on them should never be discarded until they are written.
- */
- Buffer* newBuffer();
-
- /**
- * Requests that the specified buffer be written asynchronously.
- *
- * This method may block if there are too many outstanding unwritten buffers.
- *
- * @param buffer Buffer entry to be written. The buffer object must not be used anymore
- * after it has been given to this function.
- * @param commitNumber What commit number to be notified once the buffer has been written
- * to disk.
- */
- void writeBuffer(Buffer* buffer, NotifyAll::When commitNumber);
-
- /**
- * Ensures that all previously submitted write requests complete. This call is blocking.
- */
- void flush();
+ void setNoop() {
+ _isNoop = true;
+ }
private:
friend class BufferGuard;
+ friend class JournalWriter;
- typedef BlockingQueue<Buffer*> BufferQueue;
- // Start all buffers with 4MB of size
- enum { InitialBufferSizeBytes = 4 * 1024 * 1024 };
+ void _assertEmpty();
+ void _reset();
+ void _setShutdown() {
+ _isShutdown = true;
+ }
+    // Specifies the commit number that will be notified when this buffer is flushed. This
+    // value is zero if there is no data to be flushed or if the buffer is noop/shutdown.
+ NotifyAll::When _commitNumber;
- void _journalWriterThread();
+ // Special buffer that's posted when there is nothing to be written to the journal,
+ // but we want to order a notification so it happens after all other writes have
+ // completed.
+ bool _isNoop;
+ // Special buffer that's posted when the receiving thread must terminate. This should
+ // be the last entry posted to the queue and the commit number should be zero.
+ bool _isShutdown;
- // This gets notified as journal buffers are written. It is not owned and needs to outlive
- // the journal writer object.
- NotifyAll* const _commitNotify;
+ JSectHeader _header;
+ AlignedBuilder _builder;
+ };
- // This gets notified as journal buffers are done being applied to the shared view
- NotifyAll* const _applyToDataFilesNotify;
- // Wraps and controls the journal writer thread
- stdx::thread _journalWriterThreadHandle;
+ /**
+ * Initializes the journal writer.
+ *
+ * @param commitNotify Notification object to be called after journal entries have been
+ * written to disk. The caller retains ownership and the notify object must outlive
+ * the journal writer object.
+ * @param applyToDataFilesNotify Notification object to be called after journal entries
+ * have been applied to the shared view. This means that if the shared view were to be
+ * flushed at this point, the journal files before this point are not necessary. The
+ * caller retains ownership and the notify object must outlive the journal writer
+ * object.
+ * @param numBuffers How many buffers to create to hold outstanding writes. If there are
+ * more than this number of journal writes that have not completed, the write calls
+ * will block.
+ */
+ JournalWriter(NotifyAll* commitNotify, NotifyAll* applyToDataFilesNotify, size_t numBuffers);
+ ~JournalWriter();
- // Indicates that shutdown has been requested. Used for idempotency of the shutdown call.
- bool _shutdownRequested;
+ /**
+ * Allocates buffer memory and starts the journal writer thread.
+ */
+ void start();
- // Queue of buffers, which need to be written by the journal writer thread
- BufferQueue _journalQueue;
- NotifyAll::When _lastCommitNumber;
+ /**
+ * Terminates the journal writer thread and frees memory for the buffers. Must not be
+ * called if there are any pending journal writes.
+ */
+ void shutdown();
- // Queue of buffers, whose write has been completed by the journal writer thread.
- BufferQueue _readyQueue;
- };
+ /**
+ * Asserts that there are no pending journal writes.
+ */
+ void assertIdle();
+
+ /**
+ * Obtains a new empty buffer into which a journal entry should be written.
+ *
+ * This method may block if there are no free buffers.
+ *
+ * The caller does not own the buffer and needs to "return" it to the writer by calling
+ * writeBuffer. Buffers with data on them should never be discarded until they are written.
+ */
+ Buffer* newBuffer();
+
+ /**
+ * Requests that the specified buffer be written asynchronously.
+ *
+ * This method may block if there are too many outstanding unwritten buffers.
+ *
+ * @param buffer Buffer entry to be written. The buffer object must not be used anymore
+ * after it has been given to this function.
+ * @param commitNumber What commit number to be notified once the buffer has been written
+ * to disk.
+ */
+ void writeBuffer(Buffer* buffer, NotifyAll::When commitNumber);
+
+ /**
+ * Ensures that all previously submitted write requests complete. This call is blocking.
+ */
+ void flush();
+
+private:
+ friend class BufferGuard;
+
+ typedef BlockingQueue<Buffer*> BufferQueue;
+
+    // Start all buffers with a size of 4MB
+ enum { InitialBufferSizeBytes = 4 * 1024 * 1024 };
+
+
+ void _journalWriterThread();
+
+
+ // This gets notified as journal buffers are written. It is not owned and needs to outlive
+ // the journal writer object.
+ NotifyAll* const _commitNotify;
+
+ // This gets notified as journal buffers are done being applied to the shared view
+ NotifyAll* const _applyToDataFilesNotify;
+
+ // Wraps and controls the journal writer thread
+ stdx::thread _journalWriterThreadHandle;
+
+    // Indicates that shutdown has been requested. Used to detect a repeated shutdown call.
+ bool _shutdownRequested;
+
+    // Queue of buffers which need to be written by the journal writer thread
+ BufferQueue _journalQueue;
+ NotifyAll::When _lastCommitNumber;
+
+    // Queue of buffers whose write has been completed by the journal writer thread.
+ BufferQueue _readyQueue;
+};
-} // namespace dur
-} // namespace mongo
+} // namespace dur
+} // namespace mongo
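
The Buffer/BufferGuard pairing declared here and defined in the .cpp above is an instance of
the return-to-pool RAII idiom: the guard's destructor runs on every exit path of the writer
loop, so a buffer can never leak out of the two-queue cycle. A self-contained sketch of the
same idea in standard C++ (types and names are illustrative, not from the tree):

    #include <cassert>
    #include <cstddef>
    #include <queue>

    struct PoolBuffer {
        void reset() { /* clear contents for reuse */ }
    };

    class Pool {
    public:
        explicit Pool(std::size_t maxSize) : _maxSize(maxSize) {}
        void push(PoolBuffer* b) {
            assert(_q.size() < _maxSize);  // mirrors the invariant in ~BufferGuard
            _q.push(b);
        }

    private:
        std::queue<PoolBuffer*> _q;
        const std::size_t _maxSize;
    };

    // On scope exit the buffer is reset and returned to the pool, whether the
    // enclosing code returned normally or threw.
    class Guard {
    public:
        Guard(PoolBuffer* b, Pool* pool) : _b(b), _pool(pool) {}
        ~Guard() {
            _b->reset();
            _pool->push(_b);
        }

    private:
        PoolBuffer* const _b;
        Pool* const _pool;
    };

    int main() {
        Pool pool(2);
        PoolBuffer buf;
        {
            Guard g(&buf, &pool);
            // use buf; even if this scope throws, buf goes back to the pool
        }
        return 0;
    }

The size check mirrors the original's concern: pushing to a full blocking queue from inside a
destructor would stall the pipeline, so it is asserted never to happen.
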
diff --git a/src/mongo/db/storage/mmap_v1/dur_journalformat.h b/src/mongo/db/storage/mmap_v1/dur_journalformat.h
index 80ea90bd78a..3c31c2686dd 100644
--- a/src/mongo/db/storage/mmap_v1/dur_journalformat.h
+++ b/src/mongo/db/storage/mmap_v1/dur_journalformat.h
@@ -37,155 +37,181 @@
namespace mongo {
- namespace dur {
+namespace dur {
- const unsigned Alignment = 8192;
+const unsigned Alignment = 8192;
#pragma pack(1)
- /** beginning header for a journal/j._<n> file
- there is nothing important int this header at this time. except perhaps version #.
- */
- struct JHeader {
- JHeader() { }
- JHeader(std::string fname);
+/** beginning header for a journal/j._<n> file
+    there is nothing important in this header at this time, except perhaps the version #.
+*/
+struct JHeader {
+ JHeader() {}
+ JHeader(std::string fname);
- char magic[2]; // "j\n". j means journal, then a linefeed, fwiw if you were to run "less" on the file or something...
+    // "j\n". j means journal, then a linefeed; fwiw if you were to run "less" on the
+    // file or something...
+    char magic[2];
- // x4142 is asci--readable if you look at the file with head/less -- thus the starting values were near
- // that. simply incrementing the version # is safe on a fwd basis.
+// 0x4142 is ascii-readable if you look at the file with head/less -- thus the starting values
+// were near that. simply incrementing the version # is safe on a fwd basis.
#if defined(_NOCOMPRESS)
- enum { CurrentVersion = 0x4148 };
+ enum { CurrentVersion = 0x4148 };
#else
- enum { CurrentVersion = 0x4149 };
+ enum { CurrentVersion = 0x4149 };
#endif
- unsigned short _version;
-
- // these are just for diagnostic ease (make header more useful as plain text)
- char n1; // '\n'
- char ts[20]; // ascii timestamp of file generation. for user reading, not used by code.
- char n2; // '\n'
- char dbpath[128]; // path/filename of this file for human reading and diagnostics. not used by code.
- char n3, n4; // '\n', '\n'
-
- unsigned long long fileId; // unique identifier that will be in each JSectHeader. important as we recycle prealloced files
-
- char reserved3[8026]; // 8KB total for the file header
- char txt2[2]; // "\n\n" at the end
-
- bool versionOk() const { return _version == CurrentVersion; }
- bool valid() const { return magic[0] == 'j' && txt2[1] == '\n' && fileId; }
- };
-
- /** "Section" header. A section corresponds to a group commit.
- len is length of the entire section including header and footer.
- header and footer are not compressed, just the stuff in between.
- */
- struct JSectHeader {
- private:
- unsigned _sectionLen; // unpadded length in bytes of the whole section
- public:
- unsigned long long seqNumber; // sequence number that can be used on recovery to not do too much work
- unsigned long long fileId; // matches JHeader::fileId
- unsigned sectionLen() const { return _sectionLen; }
-
- // we store the unpadded length so we can use that when we uncompress. to
- // get the true total size this must be rounded up to the Alignment.
- void setSectionLen(unsigned lenUnpadded) { _sectionLen = lenUnpadded; }
-
- unsigned sectionLenWithPadding() const {
- unsigned x = (sectionLen() + (Alignment-1)) & (~(Alignment-1));
- dassert( x % Alignment == 0 );
- return x;
- }
- };
-
- /** an individual write operation within a group commit section. Either the entire section should
- be applied, or nothing. (We check the md5 for the whole section before doing anything on recovery.)
- */
- struct JEntry {
- enum OpCodes {
- OpCode_Footer = 0xffffffff,
- OpCode_DbContext = 0xfffffffe,
- OpCode_FileCreated = 0xfffffffd,
- OpCode_DropDb = 0xfffffffc,
- OpCode_Min = 0xfffff000
- };
- union {
- unsigned len; // length in bytes of the data of the JEntry. does not include the JEntry header
- OpCodes opcode;
- };
-
- unsigned ofs; // offset in file
-
- // sentinel and masks for _fileNo
- enum {
- DotNsSuffix = 0x7fffffff, // ".ns" file
- LocalDbBit = 0x80000000 // assuming "local" db instead of using the JDbContext
- };
- int _fileNo; // high bit is set to indicate it should be the <dbpath>/local database
- // char data[len] follows
-
- const char * srcData() const {
- const int *i = &_fileNo;
- return (const char *) (i+1);
- }
-
- int getFileNo() const { return _fileNo & (~LocalDbBit); }
- void setFileNo(int f) { _fileNo = f; }
- bool isNsSuffix() const { return getFileNo() == DotNsSuffix; }
-
- void setLocalDbContextBit() { _fileNo |= LocalDbBit; }
- bool isLocalDbContext() const { return _fileNo & LocalDbBit; }
- void clearLocalDbContextBit() { _fileNo = getFileNo(); }
-
- static std::string suffix(int fileno) {
- if( fileno == DotNsSuffix ) return "ns";
- std::stringstream ss;
- ss << fileno;
- return ss.str();
- }
- };
-
- /** group commit section footer. md5 is a key field. */
- struct JSectFooter {
- JSectFooter();
- JSectFooter(const void* begin, int len); // needs buffer to compute hash
- unsigned sentinel;
- unsigned char hash[16];
- unsigned long long reserved;
- char magic[4]; // "\n\n\n\n"
-
- /** used by recovery to see if buffer is valid
- @param begin the buffer
- @param len buffer len
- @return true if buffer looks valid
- */
- bool checkHash(const void* begin, int len) const;
-
- bool magicOk() const { return *((unsigned*)magic) == 0x0a0a0a0a; }
- };
-
- /** declares "the next entry(s) are for this database / file path prefix" */
- struct JDbContext {
- JDbContext() : sentinel(JEntry::OpCode_DbContext) { }
- const unsigned sentinel; // compare to JEntry::len -- zero is our sentinel
- //char dbname[];
- };
-
- /** "last sequence number" */
- struct LSNFile {
- unsigned ver;
- unsigned reserved2;
- unsigned long long lsn;
- unsigned long long checkbytes;
- unsigned long long reserved[8];
-
- void set(unsigned long long lsn);
- unsigned long long get();
- };
+ unsigned short _version;
-#pragma pack()
+ // these are just for diagnostic ease (make header more useful as plain text)
+ char n1; // '\n'
+ char ts[20]; // ascii timestamp of file generation. for user reading, not used by code.
+ char n2; // '\n'
+    // path/filename of this file for human reading and diagnostics. not used by code.
+    char dbpath[128];
+ char n3, n4; // '\n', '\n'
+
+    // unique identifier that will be in each JSectHeader. important as we recycle prealloced files.
+    unsigned long long fileId;
+
+ char reserved3[8026]; // 8KB total for the file header
+ char txt2[2]; // "\n\n" at the end
+
+ bool versionOk() const {
+ return _version == CurrentVersion;
+ }
+ bool valid() const {
+ return magic[0] == 'j' && txt2[1] == '\n' && fileId;
+ }
+};
+
+/** "Section" header. A section corresponds to a group commit.
+ len is length of the entire section including header and footer.
+ header and footer are not compressed, just the stuff in between.
+*/
+struct JSectHeader {
+private:
+ unsigned _sectionLen; // unpadded length in bytes of the whole section
+public:
+    // sequence number that can be used on recovery to not do too much work
+    unsigned long long seqNumber;
+ unsigned long long fileId; // matches JHeader::fileId
+ unsigned sectionLen() const {
+ return _sectionLen;
+ }
+
+ // we store the unpadded length so we can use that when we uncompress. to
+ // get the true total size this must be rounded up to the Alignment.
+ void setSectionLen(unsigned lenUnpadded) {
+ _sectionLen = lenUnpadded;
+ }
+
+ unsigned sectionLenWithPadding() const {
+ unsigned x = (sectionLen() + (Alignment - 1)) & (~(Alignment - 1));
+ dassert(x % Alignment == 0);
+ return x;
+ }
+};
+/** an individual write operation within a group commit section. Either the entire section should
+ be applied, or nothing. (We check the md5 for the whole section before doing anything on recovery.)
+*/
+struct JEntry {
+ enum OpCodes {
+ OpCode_Footer = 0xffffffff,
+ OpCode_DbContext = 0xfffffffe,
+ OpCode_FileCreated = 0xfffffffd,
+ OpCode_DropDb = 0xfffffffc,
+ OpCode_Min = 0xfffff000
+ };
+ union {
+        // length in bytes of the data of the JEntry. does not include the JEntry header.
+        unsigned len;
+ OpCodes opcode;
+ };
+
+ unsigned ofs; // offset in file
+
+ // sentinel and masks for _fileNo
+ enum {
+ DotNsSuffix = 0x7fffffff, // ".ns" file
+ LocalDbBit = 0x80000000 // assuming "local" db instead of using the JDbContext
+ };
+ int _fileNo; // high bit is set to indicate it should be the <dbpath>/local database
+ // char data[len] follows
+
+ const char* srcData() const {
+ const int* i = &_fileNo;
+ return (const char*)(i + 1);
+ }
+
+ int getFileNo() const {
+ return _fileNo & (~LocalDbBit);
+ }
+ void setFileNo(int f) {
+ _fileNo = f;
+ }
+ bool isNsSuffix() const {
+ return getFileNo() == DotNsSuffix;
+ }
+
+ void setLocalDbContextBit() {
+ _fileNo |= LocalDbBit;
+ }
+ bool isLocalDbContext() const {
+ return _fileNo & LocalDbBit;
+ }
+ void clearLocalDbContextBit() {
+ _fileNo = getFileNo();
}
+ static std::string suffix(int fileno) {
+ if (fileno == DotNsSuffix)
+ return "ns";
+ std::stringstream ss;
+ ss << fileno;
+ return ss.str();
+ }
+};
+
+/** group commit section footer. md5 is a key field. */
+struct JSectFooter {
+ JSectFooter();
+ JSectFooter(const void* begin, int len); // needs buffer to compute hash
+ unsigned sentinel;
+ unsigned char hash[16];
+ unsigned long long reserved;
+ char magic[4]; // "\n\n\n\n"
+
+ /** used by recovery to see if buffer is valid
+ @param begin the buffer
+ @param len buffer len
+ @return true if buffer looks valid
+ */
+ bool checkHash(const void* begin, int len) const;
+
+ bool magicOk() const {
+ return *((unsigned*)magic) == 0x0a0a0a0a;
+ }
+};
+
+/** declares "the next entry(s) are for this database / file path prefix" */
+struct JDbContext {
+ JDbContext() : sentinel(JEntry::OpCode_DbContext) {}
+ const unsigned sentinel; // compare to JEntry::len -- zero is our sentinel
+ // char dbname[];
+};
+
+/** "last sequence number" */
+struct LSNFile {
+ unsigned ver;
+ unsigned reserved2;
+ unsigned long long lsn;
+ unsigned long long checkbytes;
+ unsigned long long reserved[8];
+
+ void set(unsigned long long lsn);
+ unsigned long long get();
+};
+
+#pragma pack()
+}  // namespace dur
}
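
The rounding in sectionLenWithPadding is the standard power-of-two round-up: add
Alignment - 1, then mask off the low bits. A standalone check of the arithmetic (the test
values are illustrative):

    #include <cassert>

    int main() {
        const unsigned Alignment = 8192;  // same constant as above
        // Rounds len up to the next multiple of Alignment; valid because
        // Alignment is a power of two.
        const unsigned lens[] = {1, 8192, 8193};
        const unsigned want[] = {8192, 8192, 16384};
        for (int i = 0; i < 3; i++) {
            unsigned x = (lens[i] + (Alignment - 1)) & (~(Alignment - 1));
            assert(x == want[i]);
            assert(x % Alignment == 0);  // the dassert in sectionLenWithPadding
        }
        return 0;
    }

So a section whose unpadded length is 8193 bytes occupies 16384 bytes on disk, while the
unpadded figure stored in the header is what uncompression works from.
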
diff --git a/src/mongo/db/storage/mmap_v1/dur_journalimpl.h b/src/mongo/db/storage/mmap_v1/dur_journalimpl.h
index 365f38aec71..86a2d19de97 100644
--- a/src/mongo/db/storage/mmap_v1/dur_journalimpl.h
+++ b/src/mongo/db/storage/mmap_v1/dur_journalimpl.h
@@ -34,80 +34,84 @@
#include "mongo/db/storage/mmap_v1/logfile.h"
namespace mongo {
- namespace dur {
+namespace dur {
- /** the writeahead journal for durability */
- class Journal {
- public:
- std::string dir; // set by journalMakeDir() during initialization
+/** the writeahead journal for durability */
+class Journal {
+public:
+ std::string dir; // set by journalMakeDir() during initialization
- Journal();
+ Journal();
- /** call during startup by journalMakeDir() */
- void init();
+ /** call during startup by journalMakeDir() */
+ void init();
- /** check if time to rotate files. assure a file is open.
- done separately from the journal() call as we can do this part
- outside of lock.
- thread: durThread()
- */
- void rotate();
+ /** check if time to rotate files. assure a file is open.
+ done separately from the journal() call as we can do this part
+ outside of lock.
+ thread: durThread()
+ */
+ void rotate();
- /** append to the journal file
- */
- void journal(const JSectHeader& h, const AlignedBuilder& b);
+ /** append to the journal file
+ */
+ void journal(const JSectHeader& h, const AlignedBuilder& b);
- boost::filesystem::path getFilePathFor(int filenumber) const;
+ boost::filesystem::path getFilePathFor(int filenumber) const;
- unsigned long long lastFlushTime() const { return _lastFlushTime; }
- void cleanup(bool log); // closes and removes journal files
-
- unsigned long long curFileId() const { return _curFileId; }
-
- void assureLogFileOpen() {
- stdx::lock_guard<SimpleMutex> lk(_curLogFileMutex);
- if( _curLogFile == 0 )
- _open();
- }
-
- /** open a journal file to journal operations to. */
- void open();
-
- private:
- /** check if time to rotate files. assure a file is open.
- * internally called with every commit
- */
- void _rotate();
-
- void _open();
- void closeCurrentJournalFile();
- void removeUnneededJournalFiles();
-
- unsigned long long _written; // bytes written so far to the current journal (log) file
- unsigned _nextFileNumber;
-
- SimpleMutex _curLogFileMutex;
-
- LogFile *_curLogFile; // use _curLogFileMutex
- unsigned long long _curFileId; // current file id see JHeader::fileId
-
- struct JFile {
- std::string filename;
- unsigned long long lastEventTimeMs;
- };
-
- // files which have been closed but not unlinked (rotated out) yet
- // ordered oldest to newest
- std::list<JFile> _oldJournalFiles; // use _curLogFileMutex
+ unsigned long long lastFlushTime() const {
+ return _lastFlushTime;
+ }
+ void cleanup(bool log); // closes and removes journal files
- // lsn related
- static void preFlush();
- static void postFlush();
- unsigned long long _preFlushTime;
- unsigned long long _lastFlushTime; // data < this time is fsynced in the datafiles (unless hard drive controller is caching)
- bool _writeToLSNNeeded;
- void updateLSNFile();
- };
+ unsigned long long curFileId() const {
+ return _curFileId;
+ }
+ void assureLogFileOpen() {
+ stdx::lock_guard<SimpleMutex> lk(_curLogFileMutex);
+ if (_curLogFile == 0)
+ _open();
}
+
+ /** open a journal file to journal operations to. */
+ void open();
+
+private:
+ /** check if time to rotate files. assure a file is open.
+ * internally called with every commit
+ */
+ void _rotate();
+
+ void _open();
+ void closeCurrentJournalFile();
+ void removeUnneededJournalFiles();
+
+ unsigned long long _written; // bytes written so far to the current journal (log) file
+ unsigned _nextFileNumber;
+
+ SimpleMutex _curLogFileMutex;
+
+ LogFile* _curLogFile; // use _curLogFileMutex
+ unsigned long long _curFileId; // current file id see JHeader::fileId
+
+ struct JFile {
+ std::string filename;
+ unsigned long long lastEventTimeMs;
+ };
+
+ // files which have been closed but not unlinked (rotated out) yet
+ // ordered oldest to newest
+ std::list<JFile> _oldJournalFiles; // use _curLogFileMutex
+
+ // lsn related
+ static void preFlush();
+ static void postFlush();
+ unsigned long long _preFlushTime;
+    // data < this time is fsynced in the datafiles (unless hard drive controller is caching)
+    unsigned long long _lastFlushTime;
+ bool _writeToLSNNeeded;
+ void updateLSNFile();
+};
+}  // namespace dur
}
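
assureLogFileOpen above is lock-then-check lazy initialization: every caller takes
_curLogFileMutex, and _open() runs only for the first caller to find _curLogFile unset. The
same shape in self-contained standard C++ (illustrative names, not MongoDB types):

    #include <mutex>

    struct File {};  // stand-in for LogFile

    class LazyLog {
    public:
        void assureOpen() {
            std::lock_guard<std::mutex> lk(_mutex);  // every caller locks first
            if (_file == nullptr)                    // only the first opener runs
                _file = new File();
        }

    private:
        std::mutex _mutex;
        File* _file = nullptr;
    };

Checking only under the lock costs one uncontended mutex acquisition per call, but it avoids
the data race that a naive check-before-lock pattern would introduce.
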
diff --git a/src/mongo/db/storage/mmap_v1/dur_preplogbuffer.cpp b/src/mongo/db/storage/mmap_v1/dur_preplogbuffer.cpp
index 171254eb946..dc9d7fb2b7a 100644
--- a/src/mongo/db/storage/mmap_v1/dur_preplogbuffer.cpp
+++ b/src/mongo/db/storage/mmap_v1/dur_preplogbuffer.cpp
@@ -53,152 +53,147 @@
namespace mongo {
- using std::endl;
- using std::min;
- using std::stringstream;
+using std::endl;
+using std::min;
+using std::stringstream;
- namespace dur {
+namespace dur {
- extern Journal j;
- extern CommitJob commitJob;
+extern Journal j;
+extern CommitJob commitJob;
- const RelativePath local = RelativePath::fromRelativePath("local");
+const RelativePath local = RelativePath::fromRelativePath("local");
- static DurableMappedFile* findMMF_inlock(void *ptr, size_t &ofs) {
- DurableMappedFile *f = privateViews.find_inlock(ptr, ofs);
- if( f == 0 ) {
- error() << "findMMF_inlock failed " << privateViews.numberOfViews_inlock() << endl;
- printStackTrace(); // we want a stack trace and the assert below didn't print a trace once in the real world - not sure why
- stringstream ss;
- ss << "view pointer cannot be resolved " << std::hex << (size_t) ptr;
- journalingFailure(ss.str().c_str()); // asserts, which then abends
- }
- return f;
- }
-
- /** put the basic write operation into the buffer (bb) to be journaled */
- static void prepBasicWrite_inlock(AlignedBuilder&bb, const WriteIntent *i, RelativePath& lastDbPath) {
- size_t ofs = 1;
- DurableMappedFile *mmf = findMMF_inlock(i->start(), /*out*/ofs);
+static DurableMappedFile* findMMF_inlock(void* ptr, size_t& ofs) {
+ DurableMappedFile* f = privateViews.find_inlock(ptr, ofs);
+ if (f == 0) {
+ error() << "findMMF_inlock failed " << privateViews.numberOfViews_inlock() << endl;
+ printStackTrace(); // we want a stack trace and the assert below didn't print a trace once in the real world - not sure why
+ stringstream ss;
+ ss << "view pointer cannot be resolved " << std::hex << (size_t)ptr;
+ journalingFailure(ss.str().c_str()); // asserts, which then abends
+ }
+ return f;
+}
- if( MONGO_unlikely(!mmf->willNeedRemap()) ) {
- // tag this mmf as needed a remap of its private view later.
- // usually it will already be dirty/already set, so we do the if above first
- // to avoid possibility of cpu cache line contention
- mmf->setWillNeedRemap();
- }
+/** put the basic write operation into the buffer (bb) to be journaled */
+static void prepBasicWrite_inlock(AlignedBuilder& bb,
+ const WriteIntent* i,
+ RelativePath& lastDbPath) {
+ size_t ofs = 1;
+ DurableMappedFile* mmf = findMMF_inlock(i->start(), /*out*/ ofs);
+
+ if (MONGO_unlikely(!mmf->willNeedRemap())) {
+        // tag this mmf as needing a remap of its private view later.
+ // usually it will already be dirty/already set, so we do the if above first
+ // to avoid possibility of cpu cache line contention
+ mmf->setWillNeedRemap();
+ }
- // since we have already looked up the mmf, we go ahead and remember the write view location
- // so we don't have to find the DurableMappedFile again later in WRITETODATAFILES()
- //
- // this was for WRITETODATAFILES_Impl2 so commented out now
- //
- /*
- dassert( i->w_ptr == 0 );
- i->w_ptr = ((char*)mmf->view_write()) + ofs;
- */
-
- JEntry e;
- e.len = min(i->length(), (unsigned)(mmf->length() - ofs)); //don't write past end of file
- verify( ofs <= 0x80000000 );
- e.ofs = (unsigned) ofs;
- e.setFileNo( mmf->fileSuffixNo() );
-
- if( mmf->relativePath() == local ) {
- e.setLocalDbContextBit();
- }
- else if( mmf->relativePath() != lastDbPath ) {
- lastDbPath = mmf->relativePath();
- JDbContext c;
- bb.appendStruct(c);
- bb.appendStr(lastDbPath.toString());
- }
+ // since we have already looked up the mmf, we go ahead and remember the write view location
+ // so we don't have to find the DurableMappedFile again later in WRITETODATAFILES()
+ //
+ // this was for WRITETODATAFILES_Impl2 so commented out now
+ //
+ /*
+ dassert( i->w_ptr == 0 );
+ i->w_ptr = ((char*)mmf->view_write()) + ofs;
+ */
+
+ JEntry e;
+ e.len = min(i->length(), (unsigned)(mmf->length() - ofs)); // don't write past end of file
+ verify(ofs <= 0x80000000);
+ e.ofs = (unsigned)ofs;
+ e.setFileNo(mmf->fileSuffixNo());
+
+ if (mmf->relativePath() == local) {
+ e.setLocalDbContextBit();
+ } else if (mmf->relativePath() != lastDbPath) {
+ lastDbPath = mmf->relativePath();
+ JDbContext c;
+ bb.appendStruct(c);
+ bb.appendStr(lastDbPath.toString());
+ }
- bb.appendStruct(e);
- bb.appendBuf(i->start(), e.len);
+ bb.appendStruct(e);
+ bb.appendBuf(i->start(), e.len);
- if (MONGO_unlikely(e.len != (unsigned)i->length())) {
- log() << "journal info splitting prepBasicWrite at boundary" << endl;
+ if (MONGO_unlikely(e.len != (unsigned)i->length())) {
+ log() << "journal info splitting prepBasicWrite at boundary" << endl;
- // This only happens if we write to the last byte in a file and
- // the fist byte in another file that is mapped adjacently. I
- // think most OSs leave at least a one page gap between
- // mappings, but better to be safe.
+ // This only happens if we write to the last byte in a file and
+        // the first byte in another file that is mapped adjacently. I
+ // think most OSs leave at least a one page gap between
+ // mappings, but better to be safe.
- WriteIntent next ((char*)i->start() + e.len, i->length() - e.len);
- prepBasicWrite_inlock(bb, &next, lastDbPath);
- }
- }
+ WriteIntent next((char*)i->start() + e.len, i->length() - e.len);
+ prepBasicWrite_inlock(bb, &next, lastDbPath);
+ }
+}
- /** basic write ops / write intents. note there is no particular order to these : if we have
- two writes to the same location during the group commit interval, it is likely
- (although not assured) that it is journaled here once.
- */
- static void prepBasicWrites(AlignedBuilder& bb, const std::vector<WriteIntent>& intents) {
- stdx::lock_guard<stdx::mutex> lk(privateViews._mutex());
-
- // Each time write intents switch to a different database we journal a JDbContext.
- // Switches will be rare as we sort by memory location first and we batch commit.
- RelativePath lastDbPath;
-
- invariant(!intents.empty());
-
- WriteIntent last;
- for (std::vector<WriteIntent>::const_iterator i = intents.begin();
- i != intents.end();
- i++) {
-
- if( i->start() < last.end() ) {
- // overlaps
- last.absorb(*i);
- }
- else {
- // discontinuous
- if (i != intents.begin()) {
- prepBasicWrite_inlock(bb, &last, lastDbPath);
- }
-
- last = *i;
- }
+/** basic write ops / write intents. note there is no particular order to these: if we have
+ two writes to the same location during the group commit interval, it is likely
+ (although not assured) that it is journaled here once.
+*/
+static void prepBasicWrites(AlignedBuilder& bb, const std::vector<WriteIntent>& intents) {
+ stdx::lock_guard<stdx::mutex> lk(privateViews._mutex());
+
+ // Each time write intents switch to a different database we journal a JDbContext.
+ // Switches will be rare as we sort by memory location first and we batch commit.
+ RelativePath lastDbPath;
+
+ invariant(!intents.empty());
+
+ WriteIntent last;
+ for (std::vector<WriteIntent>::const_iterator i = intents.begin(); i != intents.end(); i++) {
+ if (i->start() < last.end()) {
+ // overlaps
+ last.absorb(*i);
+ } else {
+ // discontinuous
+ if (i != intents.begin()) {
+ prepBasicWrite_inlock(bb, &last, lastDbPath);
}
- prepBasicWrite_inlock(bb, &last, lastDbPath);
+ last = *i;
}
+ }
- /** we will build an output buffer ourself and then use O_DIRECT
- we could be in read lock for this
- caller handles locking
- @return partially populated sectheader and _ab set
- */
- static void _PREPLOGBUFFER(JSectHeader& h, AlignedBuilder& bb) {
- // Add the JSectHeader
-
- // Invalidate the total length, we will fill it in later.
- h.setSectionLen(0xffffffff);
- h.seqNumber = getLastDataFileFlushTime();
- h.fileId = j.curFileId();
-
- // Ops other than basic writes (DurOp's) go first
- const std::vector<std::shared_ptr<DurOp> >& durOps = commitJob.ops();
- for (std::vector<std::shared_ptr<DurOp> >::const_iterator i = durOps.begin();
- i != durOps.end();
- i++) {
-
- (*i)->serialize(bb);
- }
+ prepBasicWrite_inlock(bb, &last, lastDbPath);
+}
- // Write intents
- const std::vector<WriteIntent>& intents = commitJob.getIntentsSorted();
- if (!intents.empty()) {
- prepBasicWrites(bb, intents);
- }
- }
+/** we will build an output buffer ourselves and then use O_DIRECT
+ we could be in read lock for this
+ caller handles locking
+ @return partially populated sectheader and _ab set
+*/
+static void _PREPLOGBUFFER(JSectHeader& h, AlignedBuilder& bb) {
+ // Add the JSectHeader
+
+    // Invalidate the total length; we will fill it in later.
+ h.setSectionLen(0xffffffff);
+ h.seqNumber = getLastDataFileFlushTime();
+ h.fileId = j.curFileId();
+
+ // Ops other than basic writes (DurOp's) go first
+ const std::vector<std::shared_ptr<DurOp>>& durOps = commitJob.ops();
+ for (std::vector<std::shared_ptr<DurOp>>::const_iterator i = durOps.begin(); i != durOps.end();
+ i++) {
+ (*i)->serialize(bb);
+ }
- void PREPLOGBUFFER(/*out*/ JSectHeader& outHeader, AlignedBuilder& outBuffer) {
- Timer t;
- j.assureLogFileOpen(); // so fileId is set
- _PREPLOGBUFFER(outHeader, outBuffer);
- stats.curr()->_prepLogBufferMicros += t.micros();
- }
+ // Write intents
+ const std::vector<WriteIntent>& intents = commitJob.getIntentsSorted();
+ if (!intents.empty()) {
+ prepBasicWrites(bb, intents);
}
}
+
+void PREPLOGBUFFER(/*out*/ JSectHeader& outHeader, AlignedBuilder& outBuffer) {
+ Timer t;
+ j.assureLogFileOpen(); // so fileId is set
+ _PREPLOGBUFFER(outHeader, outBuffer);
+ stats.curr()->_prepLogBufferMicros += t.micros();
+}
+}  // namespace dur
+}  // namespace mongo
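
The loop in prepBasicWrites assumes the intents arrive sorted by start address and folds each
overlapping run into a single write via WriteIntent::absorb. The merge logic in isolation,
sketched over plain [start, end) intervals in standard C++ (types are illustrative):

    #include <cassert>
    #include <utility>
    #include <vector>

    typedef std::pair<const char*, const char*> Interval;  // [start, end)

    // Given intervals sorted by start, emit one interval per overlapping run --
    // the same flush-on-discontinuity shape as the loop above.
    std::vector<Interval> coalesce(const std::vector<Interval>& sorted) {
        std::vector<Interval> out;
        Interval last(0, 0);
        for (std::vector<Interval>::const_iterator i = sorted.begin(); i != sorted.end(); ++i) {
            if (i->first < last.second) {
                // overlaps: absorb into the running interval
                if (i->second > last.second)
                    last.second = i->second;
            } else {
                // discontinuous: flush the previous run
                if (i != sorted.begin())
                    out.push_back(last);
                last = *i;
            }
        }
        if (!sorted.empty())
            out.push_back(last);
        return out;
    }

    int main() {
        char buf[10];
        std::vector<Interval> v;
        v.push_back(Interval(buf, buf + 4));
        v.push_back(Interval(buf + 2, buf + 6));   // overlaps the first
        v.push_back(Interval(buf + 8, buf + 10));  // discontinuous
        std::vector<Interval> m = coalesce(v);
        assert(m.size() == 2);
        assert(m[0].second == buf + 6);
        return 0;
    }

This is why two writes to the same location within one group commit interval are usually
journaled once, as the comment above notes.
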
diff --git a/src/mongo/db/storage/mmap_v1/dur_recover.cpp b/src/mongo/db/storage/mmap_v1/dur_recover.cpp
index bfd023affab..a6958ad1aec 100644
--- a/src/mongo/db/storage/mmap_v1/dur_recover.cpp
+++ b/src/mongo/db/storage/mmap_v1/dur_recover.cpp
@@ -58,571 +58,564 @@
namespace mongo {
- using std::shared_ptr;
- using std::unique_ptr;
- using std::endl;
- using std::hex;
- using std::map;
- using std::pair;
- using std::setw;
- using std::string;
- using std::stringstream;
- using std::vector;
-
- /**
- * Thrown when a journal section is corrupt. This is considered OK as long as it occurs while
- * processing the last file. Processing stops at the first corrupt section.
- *
- * Any logging about the nature of the corruption should happen before throwing as this class
- * contains no data.
- */
- class JournalSectionCorruptException {};
+using std::shared_ptr;
+using std::unique_ptr;
+using std::endl;
+using std::hex;
+using std::map;
+using std::pair;
+using std::setw;
+using std::string;
+using std::stringstream;
+using std::vector;
- namespace dur {
+/**
+ * Thrown when a journal section is corrupt. This is considered OK as long as it occurs while
+ * processing the last file. Processing stops at the first corrupt section.
+ *
+ * Any logging about the nature of the corruption should happen before throwing as this class
+ * contains no data.
+ */
+class JournalSectionCorruptException {};
- // The singleton recovery job object
- RecoveryJob& RecoveryJob::_instance = *(new RecoveryJob());
+namespace dur {
+// The singleton recovery job object
+RecoveryJob& RecoveryJob::_instance = *(new RecoveryJob());
- void removeJournalFiles();
- boost::filesystem::path getJournalDir();
+void removeJournalFiles();
+boost::filesystem::path getJournalDir();
- struct ParsedJournalEntry { /*copyable*/
- ParsedJournalEntry() : e(0) { }
- // relative path of database for the operation.
- // might be a pointer into mmaped Journal file
- const char *dbName;
+struct ParsedJournalEntry { /*copyable*/
+ ParsedJournalEntry() : e(0) {}
- // those are pointers into the memory mapped journal file
- const JEntry *e; // local db sentinel is already parsed out here into dbName
+ // relative path of database for the operation.
+ // might be a pointer into mmaped Journal file
+ const char* dbName;
- // if not one of the two simple JEntry's above, this is the operation:
- std::shared_ptr<DurOp> op;
- };
+ // those are pointers into the memory mapped journal file
+ const JEntry* e; // local db sentinel is already parsed out here into dbName
+ // if not one of the two simple JEntry's above, this is the operation:
+ std::shared_ptr<DurOp> op;
+};
- /**
- * Get journal filenames, in order. Throws if unexpected content found.
- */
- static void getFiles(boost::filesystem::path dir, vector<boost::filesystem::path>& files) {
- map<unsigned,boost::filesystem::path> m;
- for ( boost::filesystem::directory_iterator i( dir );
- i != boost::filesystem::directory_iterator();
- ++i ) {
- boost::filesystem::path filepath = *i;
- string fileName = boost::filesystem::path(*i).leaf().string();
- if( str::startsWith(fileName, "j._") ) {
- unsigned u = str::toUnsigned( str::after(fileName, '_') );
- if( m.count(u) ) {
- uasserted(13531, str::stream() << "unexpected files in journal directory " << dir.string() << " : " << fileName);
- }
- m.insert( pair<unsigned,boost::filesystem::path>(u,filepath) );
- }
- }
- for( map<unsigned,boost::filesystem::path>::iterator i = m.begin(); i != m.end(); ++i ) {
- if( i != m.begin() && m.count(i->first - 1) == 0 ) {
- uasserted(13532,
- str::stream() << "unexpected file in journal directory " << dir.string()
- << " : " << boost::filesystem::path(i->second).leaf().string() << " : can't find its preceding file");
- }
- files.push_back(i->second);
+
+/**
+ * Get journal filenames, in order. Throws if unexpected content found.
+ */
+static void getFiles(boost::filesystem::path dir, vector<boost::filesystem::path>& files) {
+ map<unsigned, boost::filesystem::path> m;
+ for (boost::filesystem::directory_iterator i(dir); i != boost::filesystem::directory_iterator();
+ ++i) {
+ boost::filesystem::path filepath = *i;
+ string fileName = boost::filesystem::path(*i).leaf().string();
+ if (str::startsWith(fileName, "j._")) {
+ unsigned u = str::toUnsigned(str::after(fileName, '_'));
+ if (m.count(u)) {
+ uasserted(13531,
+ str::stream() << "unexpected files in journal directory " << dir.string()
+ << " : " << fileName);
}
+ m.insert(pair<unsigned, boost::filesystem::path>(u, filepath));
}
+ }
+ for (map<unsigned, boost::filesystem::path>::iterator i = m.begin(); i != m.end(); ++i) {
+ if (i != m.begin() && m.count(i->first - 1) == 0) {
+ uasserted(13532,
+ str::stream() << "unexpected file in journal directory " << dir.string()
+ << " : " << boost::filesystem::path(i->second).leaf().string()
+ << " : can't find its preceding file");
+ }
+ files.push_back(i->second);
+ }
+}
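
A quick illustration of the numbering rule getFiles enforces: the j._<n> files must form a
gap-free run, and a missing predecessor aborts recovery (the uassert(13532) above). A
standalone sketch in standard C++ (illustrative code, not from the tree):

    #include <cassert>
    #include <map>
    #include <string>

    // True when the collected journal file numbers have no gaps -- the
    // m.count(i->first - 1) check from getFiles.
    bool consecutive(const std::map<unsigned, std::string>& files) {
        typedef std::map<unsigned, std::string>::const_iterator It;
        for (It i = files.begin(); i != files.end(); ++i) {
            if (i != files.begin() && files.count(i->first - 1) == 0)
                return false;  // getFiles would uassert(13532) here
        }
        return true;
    }

    int main() {
        std::map<unsigned, std::string> ok;
        ok[0] = "j._0";
        ok[1] = "j._1";
        assert(consecutive(ok));

        std::map<unsigned, std::string> gap;
        gap[0] = "j._0";
        gap[2] = "j._2";  // j._1 is missing
        assert(!consecutive(gap));
        return 0;
    }
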
- /** read through the memory mapped data of a journal file (journal/j._<n> file)
- throws
- */
- class JournalSectionIterator {
- MONGO_DISALLOW_COPYING(JournalSectionIterator);
- public:
- JournalSectionIterator(const JSectHeader& h,
- const void *compressed,
- unsigned compressedLen,
- bool doDurOpsRecovering)
- : _h(h),
- _lastDbName(0),
- _doDurOps(doDurOpsRecovering) {
-
- verify(doDurOpsRecovering);
-
- if (!uncompress((const char *)compressed, compressedLen, &_uncompressed)) {
- // We check the checksum before we uncompress, but this may still fail as the
- // checksum isn't foolproof.
- log() << "couldn't uncompress journal section" << endl;
- throw JournalSectionCorruptException();
- }
-
- const char *p = _uncompressed.c_str();
- verify(compressedLen == _h.sectionLen() - sizeof(JSectFooter) - sizeof(JSectHeader));
-
- _entries = unique_ptr<BufReader>(new BufReader(p, _uncompressed.size()));
- }
+/** read through the memory mapped data of a journal file (journal/j._<n> file)
+ throws
+*/
+class JournalSectionIterator {
+ MONGO_DISALLOW_COPYING(JournalSectionIterator);
+
+public:
+ JournalSectionIterator(const JSectHeader& h,
+ const void* compressed,
+ unsigned compressedLen,
+ bool doDurOpsRecovering)
+ : _h(h), _lastDbName(0), _doDurOps(doDurOpsRecovering) {
+ verify(doDurOpsRecovering);
+
+ if (!uncompress((const char*)compressed, compressedLen, &_uncompressed)) {
+ // We check the checksum before we uncompress, but this may still fail as the
+ // checksum isn't foolproof.
+ log() << "couldn't uncompress journal section" << endl;
+ throw JournalSectionCorruptException();
+ }
- // We work with the uncompressed buffer when doing a WRITETODATAFILES (for speed)
- JournalSectionIterator(const JSectHeader &h, const void *p, unsigned len)
- : _entries(new BufReader((const char *)p, len)),
- _h(h),
- _lastDbName(0),
- _doDurOps(false) {
+ const char* p = _uncompressed.c_str();
+ verify(compressedLen == _h.sectionLen() - sizeof(JSectFooter) - sizeof(JSectHeader));
- }
+ _entries = unique_ptr<BufReader>(new BufReader(p, _uncompressed.size()));
+ }
- bool atEof() const { return _entries->atEof(); }
+ // We work with the uncompressed buffer when doing a WRITETODATAFILES (for speed)
+ JournalSectionIterator(const JSectHeader& h, const void* p, unsigned len)
+ : _entries(new BufReader((const char*)p, len)), _h(h), _lastDbName(0), _doDurOps(false) {}
- unsigned long long seqNumber() const { return _h.seqNumber; }
+ bool atEof() const {
+ return _entries->atEof();
+ }
- /** get the next entry from the log. this function parses and combines JDbContext and JEntry's.
- * throws on premature end of section.
- */
- void next(ParsedJournalEntry& e) {
- unsigned lenOrOpCode;
- _entries->read(lenOrOpCode);
+ unsigned long long seqNumber() const {
+ return _h.seqNumber;
+ }
- if (lenOrOpCode > JEntry::OpCode_Min) {
- switch( lenOrOpCode ) {
+ /** get the next entry from the log. this function parses and combines JDbContext and JEntry's.
+ * throws on premature end of section.
+ */
+ void next(ParsedJournalEntry& e) {
+ unsigned lenOrOpCode;
+ _entries->read(lenOrOpCode);
- case JEntry::OpCode_Footer: {
- verify( false );
- }
+ if (lenOrOpCode > JEntry::OpCode_Min) {
+ switch (lenOrOpCode) {
+ case JEntry::OpCode_Footer: {
+ verify(false);
+ }
- case JEntry::OpCode_FileCreated:
- case JEntry::OpCode_DropDb: {
- e.dbName = 0;
- std::shared_ptr<DurOp> op = DurOp::read(lenOrOpCode, *_entries);
- if (_doDurOps) {
- e.op = op;
- }
- return;
+ case JEntry::OpCode_FileCreated:
+ case JEntry::OpCode_DropDb: {
+ e.dbName = 0;
+ std::shared_ptr<DurOp> op = DurOp::read(lenOrOpCode, *_entries);
+ if (_doDurOps) {
+ e.op = op;
}
+ return;
+ }
- case JEntry::OpCode_DbContext: {
- _lastDbName = (const char*) _entries->pos();
- const unsigned limit = _entries->remaining();
- const unsigned len = strnlen(_lastDbName, limit);
- if (_lastDbName[len] != '\0') {
- log() << "problem processing journal file during recovery";
- throw JournalSectionCorruptException();
- }
-
- _entries->skip(len+1); // skip '\0' too
- _entries->read(lenOrOpCode); // read this for the fall through
+ case JEntry::OpCode_DbContext: {
+ _lastDbName = (const char*)_entries->pos();
+ const unsigned limit = _entries->remaining();
+ const unsigned len = strnlen(_lastDbName, limit);
+ if (_lastDbName[len] != '\0') {
+ log() << "problem processing journal file during recovery";
+ throw JournalSectionCorruptException();
}
- // fall through as a basic operation always follows jdbcontext, and we don't have anything to return yet
- default:
- // fall through
- ;
- }
+ _entries->skip(len + 1); // skip '\0' too
+ _entries->read(lenOrOpCode); // read this for the fall through
}
+ // fall through as a basic operation always follows jdbcontext, and we don't have anything to return yet
- // JEntry - a basic write
- verify( lenOrOpCode && lenOrOpCode < JEntry::OpCode_Min );
- _entries->rewind(4);
- e.e = (JEntry *) _entries->skip(sizeof(JEntry));
- e.dbName = e.e->isLocalDbContext() ? "local" : _lastDbName;
- verify( e.e->len == lenOrOpCode );
- _entries->skip(e.e->len);
+ default:
+ // fall through
+ ;
}
+ }
-
- private:
- unique_ptr<BufReader> _entries;
- const JSectHeader _h;
- const char *_lastDbName; // pointer into mmaped journal file
- const bool _doDurOps;
- string _uncompressed;
- };
-
-
- static string fileName(const char* dbName, int fileNo) {
+ // JEntry - a basic write
+ verify(lenOrOpCode && lenOrOpCode < JEntry::OpCode_Min);
+ _entries->rewind(4);
+ e.e = (JEntry*)_entries->skip(sizeof(JEntry));
+ e.dbName = e.e->isLocalDbContext() ? "local" : _lastDbName;
+ verify(e.e->len == lenOrOpCode);
+ _entries->skip(e.e->len);
+ }
+
+
+private:
+ unique_ptr<BufReader> _entries;
+ const JSectHeader _h;
+ const char* _lastDbName; // pointer into mmaped journal file
+ const bool _doDurOps;
+ string _uncompressed;
+};
+
+
+static string fileName(const char* dbName, int fileNo) {
+ stringstream ss;
+ ss << dbName << '.';
+ verify(fileNo >= 0);
+ if (fileNo == JEntry::DotNsSuffix)
+ ss << "ns";
+ else
+ ss << fileNo;
+
+ // relative name -> full path name
+ boost::filesystem::path full(storageGlobalParams.dbpath);
+ full /= ss.str();
+ return full.string();
+}
+
+
+RecoveryJob::RecoveryJob()
+ : _recovering(false), _lastDataSyncedFromLastRun(0), _lastSeqMentionedInConsoleLog(1) {}
+
+RecoveryJob::~RecoveryJob() {
+ DESTRUCTOR_GUARD(if (!_mmfs.empty()) {} close();)
+}
+
+void RecoveryJob::close() {
+ stdx::lock_guard<stdx::mutex> lk(_mx);
+ _close();
+}
+
+void RecoveryJob::_close() {
+ MongoFile::flushAll(true);
+ _mmfs.clear();
+}
+
+RecoveryJob::Last::Last() : mmf(NULL), fileNo(-1) {
+ // Make sure the files list does not change from underneath
+ LockMongoFilesShared::assertAtLeastReadLocked();
+}
+
+DurableMappedFile* RecoveryJob::Last::newEntry(const dur::ParsedJournalEntry& entry,
+ RecoveryJob& rj) {
+ int num = entry.e->getFileNo();
+ if (num == fileNo && entry.dbName == dbName)
+ return mmf;
+
+ string fn = fileName(entry.dbName, num);
+ MongoFile* file;
+ {
+ MongoFileFinder finder; // must release lock before creating new DurableMappedFile
+ file = finder.findByPath(fn);
+ }
+
+ if (file) {
+ verify(file->isDurableMappedFile());
+ mmf = (DurableMappedFile*)file;
+ } else {
+ if (!rj._recovering) {
+ log() << "journal error applying writes, file " << fn << " is not open" << endl;
+ verify(false);
+ }
+ std::shared_ptr<DurableMappedFile> sp(new DurableMappedFile);
+ verify(sp->open(fn, false));
+ rj._mmfs.push_back(sp);
+ mmf = sp.get();
+ }
+
+    // we do this last so that if an exception were thrown above, the cached dbName/fileNo
+    // would not be left pointing at the wrong file
+ dbName = entry.dbName;
+ fileNo = num;
+ return mmf;
+}
+
+void RecoveryJob::write(Last& last, const ParsedJournalEntry& entry) {
+ // TODO(mathias): look into making some of these dasserts
+ verify(entry.e);
+ verify(entry.dbName);
+
+ DurableMappedFile* mmf = last.newEntry(entry, *this);
+
+ if ((entry.e->ofs + entry.e->len) <= mmf->length()) {
+ verify(mmf->view_write());
+ verify(entry.e->srcData());
+
+ void* dest = (char*)mmf->view_write() + entry.e->ofs;
+ memcpy(dest, entry.e->srcData(), entry.e->len);
+ stats.curr()->_writeToDataFilesBytes += entry.e->len;
+ } else {
+ massert(13622, "Trying to write past end of file in WRITETODATAFILES", _recovering);
+ }
+}
+
+void RecoveryJob::applyEntry(Last& last, const ParsedJournalEntry& entry, bool apply, bool dump) {
+ if (entry.e) {
+ if (dump) {
stringstream ss;
- ss << dbName << '.';
- verify( fileNo >= 0 );
- if( fileNo == JEntry::DotNsSuffix )
+ ss << " BASICWRITE " << setw(20) << entry.dbName << '.';
+ if (entry.e->isNsSuffix())
ss << "ns";
else
- ss << fileNo;
-
- // relative name -> full path name
- boost::filesystem::path full(storageGlobalParams.dbpath);
- full /= ss.str();
- return full.string();
- }
-
-
- RecoveryJob::RecoveryJob()
- : _recovering(false),
- _lastDataSyncedFromLastRun(0),
- _lastSeqMentionedInConsoleLog(1) {
-
- }
-
- RecoveryJob::~RecoveryJob() {
- DESTRUCTOR_GUARD(
- if (!_mmfs.empty()) {}
- close();
- )
+ ss << setw(2) << entry.e->getFileNo();
+ ss << ' ' << setw(6) << entry.e->len << ' '
+ << /*hex << setw(8) << (size_t) fqe.srcData << dec <<*/
+ " " << hexdump(entry.e->srcData(), entry.e->len);
+ log() << ss.str() << endl;
}
-
- void RecoveryJob::close() {
- stdx::lock_guard<stdx::mutex> lk(_mx);
- _close();
+ if (apply) {
+ write(last, entry);
}
-
- void RecoveryJob::_close() {
- MongoFile::flushAll(true);
- _mmfs.clear();
+ } else if (entry.op) {
+ // a DurOp subclass operation
+ if (dump) {
+ log() << " OP " << entry.op->toString() << endl;
}
-
- RecoveryJob::Last::Last() : mmf(NULL), fileNo(-1) {
- // Make sure the files list does not change from underneath
- LockMongoFilesShared::assertAtLeastReadLocked();
- }
-
- DurableMappedFile* RecoveryJob::Last::newEntry(const dur::ParsedJournalEntry& entry, RecoveryJob& rj) {
- int num = entry.e->getFileNo();
- if( num == fileNo && entry.dbName == dbName )
- return mmf;
-
- string fn = fileName(entry.dbName, num);
- MongoFile *file;
- {
- MongoFileFinder finder; // must release lock before creating new DurableMappedFile
- file = finder.findByPath(fn);
- }
-
- if (file) {
- verify(file->isDurableMappedFile());
- mmf = (DurableMappedFile*)file;
- }
- else {
- if( !rj._recovering ) {
- log() << "journal error applying writes, file " << fn << " is not open" << endl;
- verify(false);
- }
- std::shared_ptr<DurableMappedFile> sp (new DurableMappedFile);
- verify(sp->open(fn, false));
- rj._mmfs.push_back(sp);
- mmf = sp.get();
+ if (apply) {
+ if (entry.op->needFilesClosed()) {
+ _close(); // locked in processSection
}
-
-        // we do this last so that if an exception were thrown above, no stale dbName/fileNo state is left behind
- dbName = entry.dbName;
- fileNo = num;
- return mmf;
+ entry.op->replay();
}
-
- void RecoveryJob::write(Last& last, const ParsedJournalEntry& entry) {
- //TODO(mathias): look into making some of these dasserts
- verify(entry.e);
- verify(entry.dbName);
-
- DurableMappedFile *mmf = last.newEntry(entry, *this);
-
- if ((entry.e->ofs + entry.e->len) <= mmf->length()) {
- verify(mmf->view_write());
- verify(entry.e->srcData());
-
- void* dest = (char*)mmf->view_write() + entry.e->ofs;
- memcpy(dest, entry.e->srcData(), entry.e->len);
- stats.curr()->_writeToDataFilesBytes += entry.e->len;
- }
- else {
- massert(13622, "Trying to write past end of file in WRITETODATAFILES", _recovering);
- }
+ }
+}
+
+void RecoveryJob::applyEntries(const vector<ParsedJournalEntry>& entries) {
+ const bool apply = (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalScanOnly) == 0;
+ const bool dump = (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalDumpJournal);
+
+ if (dump) {
+ log() << "BEGIN section" << endl;
+ }
+
+ Last last;
+ for (vector<ParsedJournalEntry>::const_iterator i = entries.begin(); i != entries.end(); ++i) {
+ applyEntry(last, *i, apply, dump);
+ }
+
+ if (dump) {
+ log() << "END section" << endl;
+ }
+}
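+// With MMAPV1Options::JournalScanOnly set, apply stays false and this pass only
+// parses and validates entries; JournalDumpJournal additionally prints each one
+// (the BASICWRITE / OP lines emitted by applyEntry above).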
+
+void RecoveryJob::processSection(const JSectHeader* h,
+ const void* p,
+ unsigned len,
+ const JSectFooter* f) {
+ LockMongoFilesShared lkFiles; // for RecoveryJob::Last
+ stdx::lock_guard<stdx::mutex> lk(_mx);
+
+ // Check the footer checksum before doing anything else.
+ if (_recovering) {
+ verify(((const char*)h) + sizeof(JSectHeader) == p);
+ if (!f->checkHash(h, len + sizeof(JSectHeader))) {
+ log() << "journal section checksum doesn't match";
+ throw JournalSectionCorruptException();
}
-
- void RecoveryJob::applyEntry(Last& last, const ParsedJournalEntry& entry, bool apply, bool dump) {
- if( entry.e ) {
- if( dump ) {
- stringstream ss;
- ss << " BASICWRITE " << setw(20) << entry.dbName << '.';
- if( entry.e->isNsSuffix() )
- ss << "ns";
- else
- ss << setw(2) << entry.e->getFileNo();
- ss << ' ' << setw(6) << entry.e->len << ' ' << /*hex << setw(8) << (size_t) fqe.srcData << dec <<*/
- " " << hexdump(entry.e->srcData(), entry.e->len);
- log() << ss.str() << endl;
- }
- if( apply ) {
- write(last, entry);
- }
- }
- else if(entry.op) {
- // a DurOp subclass operation
- if( dump ) {
- log() << " OP " << entry.op->toString() << endl;
- }
- if( apply ) {
- if( entry.op->needFilesClosed() ) {
- _close(); // locked in processSection
- }
- entry.op->replay();
- }
+ }
+
+ if (_recovering && _lastDataSyncedFromLastRun > h->seqNumber + ExtraKeepTimeMs) {
+ if (h->seqNumber != _lastSeqMentionedInConsoleLog) {
+ static int n;
+ if (++n < 10) {
+ log() << "recover skipping application of section seq:" << h->seqNumber
+ << " < lsn:" << _lastDataSyncedFromLastRun << endl;
+ } else if (n == 10) {
+ log() << "recover skipping application of section more..." << endl;
}
+ _lastSeqMentionedInConsoleLog = h->seqNumber;
}
-
- void RecoveryJob::applyEntries(const vector<ParsedJournalEntry> &entries) {
- const bool apply =
- (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalScanOnly) == 0;
- const bool dump =
- (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalDumpJournal);
-
- if (dump) {
- log() << "BEGIN section" << endl;
- }
-
- Last last;
- for (vector<ParsedJournalEntry>::const_iterator i = entries.begin(); i != entries.end(); ++i) {
- applyEntry(last, *i, apply, dump);
- }
-
- if (dump) {
- log() << "END section" << endl;
- }
- }
-
- void RecoveryJob::processSection(const JSectHeader *h, const void *p, unsigned len, const JSectFooter *f) {
- LockMongoFilesShared lkFiles; // for RecoveryJob::Last
- stdx::lock_guard<stdx::mutex> lk(_mx);
-
- // Check the footer checksum before doing anything else.
- if (_recovering) {
- verify( ((const char *)h) + sizeof(JSectHeader) == p );
- if (!f->checkHash(h, len + sizeof(JSectHeader))) {
- log() << "journal section checksum doesn't match";
- throw JournalSectionCorruptException();
- }
- }
-
- if( _recovering && _lastDataSyncedFromLastRun > h->seqNumber + ExtraKeepTimeMs ) {
- if( h->seqNumber != _lastSeqMentionedInConsoleLog ) {
- static int n;
- if( ++n < 10 ) {
- log() << "recover skipping application of section seq:" << h->seqNumber << " < lsn:" << _lastDataSyncedFromLastRun << endl;
+ return;
+ }
+
+ unique_ptr<JournalSectionIterator> i;
+ if (_recovering) {
+ i = unique_ptr<JournalSectionIterator>(new JournalSectionIterator(*h, p, len, _recovering));
+ } else {
+ i = unique_ptr<JournalSectionIterator>(
+ new JournalSectionIterator(*h, /*after header*/ p, /*w/out header*/ len));
+ }
+
+    // we use a static so that we don't have to reallocate on every pass. occasionally we
+    // could shrink back to a small allocation (see the commented-out block below) so that a
+    // growth spike doesn't stick forever.
+ static vector<ParsedJournalEntry> entries;
+ entries.clear();
+ /** TEMP uncomment
+ RARELY OCCASIONALLY {
+ if( entries.capacity() > 2048 ) {
+ entries.shrink_to_fit();
+ entries.reserve(2048);
}
- else if( n == 10 ) {
- log() << "recover skipping application of section more..." << endl;
- }
- _lastSeqMentionedInConsoleLog = h->seqNumber;
- }
- return;
- }
-
- unique_ptr<JournalSectionIterator> i;
- if( _recovering ) {
- i = unique_ptr<JournalSectionIterator>(new JournalSectionIterator(*h, p, len, _recovering));
- }
- else {
- i = unique_ptr<JournalSectionIterator>(new JournalSectionIterator(*h, /*after header*/p, /*w/out header*/len));
- }
-
-            // we use a static so that we don't have to reallocate on every pass. occasionally we
-            // could shrink back to a small allocation (see the commented-out block below) so that a
-            // growth spike doesn't stick forever.
- static vector<ParsedJournalEntry> entries;
- entries.clear();
-/** TEMP uncomment
- RARELY OCCASIONALLY {
- if( entries.capacity() > 2048 ) {
- entries.shrink_to_fit();
- entries.reserve(2048);
}
- }
+ */
+
+ // first read all entries to make sure this section is valid
+ ParsedJournalEntry e;
+ while (!i->atEof()) {
+ i->next(e);
+ entries.push_back(e);
+ }
+
+ // got all the entries for one group commit. apply them:
+ applyEntries(entries);
+}
+
+/** apply a specific journal file that is already mmap'd
+ @param p start of the memory mapped file
+ @return true if this is detected to be the last file (ends abruptly)
*/
-
- // first read all entries to make sure this section is valid
- ParsedJournalEntry e;
- while( !i->atEof() ) {
- i->next(e);
- entries.push_back(e);
+bool RecoveryJob::processFileBuffer(const void* p, unsigned len) {
+ try {
+ unsigned long long fileId;
+ BufReader br(p, len);
+
+ {
+ // read file header
+ JHeader h;
+ br.read(h);
+
+ if (!h.valid()) {
+ log() << "Journal file header invalid. This could indicate corruption, or "
+ << "an unclean shutdown while writing the first section in a journal "
+ << "file.";
+ throw JournalSectionCorruptException();
}
- // got all the entries for one group commit. apply them:
- applyEntries(entries);
- }
-
-    /** apply a specific journal file that is already mmap'd
- @param p start of the memory mapped file
- @return true if this is detected to be the last file (ends abruptly)
- */
- bool RecoveryJob::processFileBuffer(const void *p, unsigned len) {
- try {
- unsigned long long fileId;
- BufReader br(p,len);
-
- {
- // read file header
- JHeader h;
- br.read(h);
-
- if (!h.valid()) {
- log() << "Journal file header invalid. This could indicate corruption, or "
- << "an unclean shutdown while writing the first section in a journal "
- << "file.";
- throw JournalSectionCorruptException();
- }
-
- if( !h.versionOk() ) {
- log() << "journal file version number mismatch got:" << hex << h._version
- << " expected:" << hex << (unsigned) JHeader::CurrentVersion
- << ". if you have just upgraded, recover with old version of mongod, terminate cleanly, then upgrade."
- << endl;
-                    // Not using JournalSectionCorruptException as we don't want to ignore
- // journal files on upgrade.
- uasserted(13536, str::stream() << "journal version number mismatch " << h._version);
- }
- fileId = h.fileId;
- if (mmapv1GlobalOptions.journalOptions &
- MMAPV1Options::JournalDumpJournal) {
- log() << "JHeader::fileId=" << fileId << endl;
- }
- }
-
- // read sections
- while ( !br.atEof() ) {
- JSectHeader h;
- br.peek(h);
- if( h.fileId != fileId ) {
- if (kDebugBuild || (mmapv1GlobalOptions.journalOptions &
- MMAPV1Options::JournalDumpJournal)) {
- log() << "Ending processFileBuffer at differing fileId want:" << fileId << " got:" << h.fileId << endl;
- log() << " sect len:" << h.sectionLen() << " seqnum:" << h.seqNumber << endl;
- }
- return true;
- }
- unsigned slen = h.sectionLen();
- unsigned dataLen = slen - sizeof(JSectHeader) - sizeof(JSectFooter);
- const char *hdr = (const char *) br.skip(h.sectionLenWithPadding());
- const char *data = hdr + sizeof(JSectHeader);
- const char *footer = data + dataLen;
- processSection((const JSectHeader*) hdr, data, dataLen, (const JSectFooter*) footer);
-
- // ctrl c check
- uassert(ErrorCodes::Interrupted, "interrupted during journal recovery", !inShutdown());
- }
+ if (!h.versionOk()) {
+ log() << "journal file version number mismatch got:" << hex << h._version
+ << " expected:" << hex << (unsigned)JHeader::CurrentVersion
+ << ". if you have just upgraded, recover with old version of mongod, "
+ "terminate cleanly, then upgrade." << endl;
+            // Not using JournalSectionCorruptException as we don't want to ignore
+ // journal files on upgrade.
+ uasserted(13536, str::stream() << "journal version number mismatch " << h._version);
}
- catch (const BufReader::eof&) {
- if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalDumpJournal)
- log() << "ABRUPT END" << endl;
- return true; // abrupt end
+ fileId = h.fileId;
+ if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalDumpJournal) {
+ log() << "JHeader::fileId=" << fileId << endl;
}
- catch (const JournalSectionCorruptException&) {
- if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalDumpJournal)
- log() << "ABRUPT END" << endl;
- return true; // abrupt end
- }
-
- return false; // non-abrupt end
}
- /** apply a specific journal file */
- bool RecoveryJob::processFile(boost::filesystem::path journalfile) {
- log() << "recover " << journalfile.string() << endl;
-
- try {
- if( boost::filesystem::file_size( journalfile.string() ) == 0 ) {
- log() << "recover info " << journalfile.string() << " has zero length" << endl;
- return true;
+ // read sections
+ while (!br.atEof()) {
+ JSectHeader h;
+ br.peek(h);
+ if (h.fileId != fileId) {
+ if (kDebugBuild ||
+ (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalDumpJournal)) {
+ log() << "Ending processFileBuffer at differing fileId want:" << fileId
+ << " got:" << h.fileId << endl;
+ log() << " sect len:" << h.sectionLen() << " seqnum:" << h.seqNumber << endl;
}
- } catch(...) {
-            // if something weird happens, like a permissions problem, keep going so the massert below can fire (presumably)
- log() << "recover exception checking filesize" << endl;
+ return true;
}
-
- MemoryMappedFile f;
- void *p = f.mapWithOptions(journalfile.string().c_str(), MongoFile::READONLY | MongoFile::SEQUENTIAL);
- massert(13544, str::stream() << "recover error couldn't open " << journalfile.string(), p);
- return processFileBuffer(p, (unsigned) f.length());
+ unsigned slen = h.sectionLen();
+ unsigned dataLen = slen - sizeof(JSectHeader) - sizeof(JSectFooter);
+ const char* hdr = (const char*)br.skip(h.sectionLenWithPadding());
+ const char* data = hdr + sizeof(JSectHeader);
+ const char* footer = data + dataLen;
+ processSection((const JSectHeader*)hdr, data, dataLen, (const JSectFooter*)footer);
+
+ // ctrl c check
+ uassert(ErrorCodes::Interrupted, "interrupted during journal recovery", !inShutdown());
}
-
- /** @param files all the j._0 style files we need to apply for recovery */
- void RecoveryJob::go(vector<boost::filesystem::path>& files) {
- log() << "recover begin" << endl;
- LockMongoFilesExclusive lkFiles; // for RecoveryJob::Last
- _recovering = true;
-
- // load the last sequence number synced to the datafiles on disk before the last crash
- _lastDataSyncedFromLastRun = journalReadLSN();
- log() << "recover lsn: " << _lastDataSyncedFromLastRun << endl;
-
- for( unsigned i = 0; i != files.size(); ++i ) {
- bool abruptEnd = processFile(files[i]);
- if( abruptEnd && i+1 < files.size() ) {
- log() << "recover error: abrupt end to file " << files[i].string() << ", yet it isn't the last journal file" << endl;
- close();
- uasserted(13535, "recover abrupt journal file end");
- }
- }
-
- close();
-
- if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalScanOnly) {
- uasserted(13545, str::stream() << "--durOptions "
- << (int) MMAPV1Options::JournalScanOnly
- << " (scan only) specified");
- }
-
- log() << "recover cleaning up" << endl;
- removeJournalFiles();
- log() << "recover done" << endl;
- okToCleanUp = true;
- _recovering = false;
+ } catch (const BufReader::eof&) {
+ if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalDumpJournal)
+ log() << "ABRUPT END" << endl;
+ return true; // abrupt end
+ } catch (const JournalSectionCorruptException&) {
+ if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalDumpJournal)
+ log() << "ABRUPT END" << endl;
+ return true; // abrupt end
+ }
+
+ return false; // non-abrupt end
+}
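+// Sketch of the on-disk layout this function walks (field details live in
+// dur_journalformat.h):
+//
+//   JHeader                                   file header (version, fileId)
+//   JSectHeader | entries... | JSectFooter    one group commit, padded
+//   JSectHeader | entries... | JSectFooter
+//   ...
+//
+// A section whose fileId differs from the file's JHeader.fileId ends the scan,
+// and hitting EOF mid-read is treated as an abrupt (but tolerated) end.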
+
+/** apply a specific journal file */
+bool RecoveryJob::processFile(boost::filesystem::path journalfile) {
+ log() << "recover " << journalfile.string() << endl;
+
+ try {
+ if (boost::filesystem::file_size(journalfile.string()) == 0) {
+ log() << "recover info " << journalfile.string() << " has zero length" << endl;
+ return true;
}
-
- void _recover() {
- verify(storageGlobalParams.dur);
-
- boost::filesystem::path p = getJournalDir();
- if( !exists(p) ) {
- log() << "directory " << p.string() << " does not exist, there will be no recovery startup step" << endl;
- okToCleanUp = true;
- return;
- }
-
- vector<boost::filesystem::path> journalFiles;
- getFiles(p, journalFiles);
-
- if( journalFiles.empty() ) {
- log() << "recover : no journal files present, no recovery needed" << endl;
- okToCleanUp = true;
- return;
- }
-
- RecoveryJob::get().go(journalFiles);
- }
-
- /** recover from a crash
- called during startup
- throws on error
- */
- void replayJournalFilesAtStartup() {
- // we use a lock so that exitCleanly will wait for us
- // to finish (or at least to notice what is up and stop)
- OperationContextImpl txn;
- ScopedTransaction transaction(&txn, MODE_X);
- Lock::GlobalWrite lk(txn.lockState());
-
- _recover(); // throws on interruption
+ } catch (...) {
+        // if something weird happens, like a permissions problem, keep going so the
+        // massert below can fire (presumably)
+ log() << "recover exception checking filesize" << endl;
+ }
+
+ MemoryMappedFile f;
+ void* p =
+ f.mapWithOptions(journalfile.string().c_str(), MongoFile::READONLY | MongoFile::SEQUENTIAL);
+ massert(13544, str::stream() << "recover error couldn't open " << journalfile.string(), p);
+ return processFileBuffer(p, (unsigned)f.length());
+}
+
+/** @param files all the j._0 style files we need to apply for recovery */
+void RecoveryJob::go(vector<boost::filesystem::path>& files) {
+ log() << "recover begin" << endl;
+ LockMongoFilesExclusive lkFiles; // for RecoveryJob::Last
+ _recovering = true;
+
+ // load the last sequence number synced to the datafiles on disk before the last crash
+ _lastDataSyncedFromLastRun = journalReadLSN();
+ log() << "recover lsn: " << _lastDataSyncedFromLastRun << endl;
+
+ for (unsigned i = 0; i != files.size(); ++i) {
+ bool abruptEnd = processFile(files[i]);
+ if (abruptEnd && i + 1 < files.size()) {
+ log() << "recover error: abrupt end to file " << files[i].string()
+ << ", yet it isn't the last journal file" << endl;
+ close();
+ uasserted(13535, "recover abrupt journal file end");
}
-
- struct BufReaderY { int a,b; };
- class BufReaderUnitTest : public StartupTest {
- public:
- void run() {
- BufReader r((void*) "abcdabcdabcd", 12);
- char x;
- BufReaderY y;
- r.read(x); //cout << x; // a
- verify( x == 'a' );
- r.read(y);
- r.read(x);
- verify( x == 'b' );
- }
- } brunittest;
-
- } // namespace dur
-} // namespace mongo
-
+ }
+
+ close();
+
+ if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalScanOnly) {
+ uasserted(13545,
+ str::stream() << "--durOptions " << (int)MMAPV1Options::JournalScanOnly
+ << " (scan only) specified");
+ }
+
+ log() << "recover cleaning up" << endl;
+ removeJournalFiles();
+ log() << "recover done" << endl;
+ okToCleanUp = true;
+ _recovering = false;
+}
+
+void _recover() {
+ verify(storageGlobalParams.dur);
+
+ boost::filesystem::path p = getJournalDir();
+ if (!exists(p)) {
+ log() << "directory " << p.string()
+ << " does not exist, there will be no recovery startup step" << endl;
+ okToCleanUp = true;
+ return;
+ }
+
+ vector<boost::filesystem::path> journalFiles;
+ getFiles(p, journalFiles);
+
+ if (journalFiles.empty()) {
+ log() << "recover : no journal files present, no recovery needed" << endl;
+ okToCleanUp = true;
+ return;
+ }
+
+ RecoveryJob::get().go(journalFiles);
+}
+
+/** recover from a crash
+ called during startup
+ throws on error
+*/
+void replayJournalFilesAtStartup() {
+ // we use a lock so that exitCleanly will wait for us
+ // to finish (or at least to notice what is up and stop)
+ OperationContextImpl txn;
+ ScopedTransaction transaction(&txn, MODE_X);
+ Lock::GlobalWrite lk(txn.lockState());
+
+ _recover(); // throws on interruption
+}
+
+struct BufReaderY {
+ int a, b;
+};
+class BufReaderUnitTest : public StartupTest {
+public:
+ void run() {
+ BufReader r((void*)"abcdabcdabcd", 12);
+ char x;
+ BufReaderY y;
+ r.read(x); // cout << x; // a
+ verify(x == 'a');
+ r.read(y);
+ r.read(x);
+ verify(x == 'b');
+ }
+} brunittest;
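+// (The test assumes BufReaderY occupies 8 bytes: after reading one char and one
+// BufReaderY, the cursor sits at byte offset 9 of "abcdabcdabcd", which is 'b'.)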
+
+} // namespace dur
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/dur_recover.h b/src/mongo/db/storage/mmap_v1/dur_recover.h
index 886f278a66a..e05e7926215 100644
--- a/src/mongo/db/storage/mmap_v1/dur_recover.h
+++ b/src/mongo/db/storage/mmap_v1/dur_recover.h
@@ -38,67 +38,69 @@
namespace mongo {
- class DurableMappedFile;
+class DurableMappedFile;
- namespace dur {
+namespace dur {
- struct ParsedJournalEntry;
+struct ParsedJournalEntry;
- /** call go() to execute a recovery from existing journal files.
- */
- class RecoveryJob {
- MONGO_DISALLOW_COPYING(RecoveryJob);
- public:
- RecoveryJob();
- ~RecoveryJob();
+/** call go() to execute a recovery from existing journal files.
+ */
+class RecoveryJob {
+ MONGO_DISALLOW_COPYING(RecoveryJob);
- void go(std::vector<boost::filesystem::path>& files);
+public:
+ RecoveryJob();
+ ~RecoveryJob();
- /** @param data data between header and footer. compressed if recovering. */
- void processSection(const JSectHeader *h, const void *data, unsigned len, const JSectFooter *f);
+ void go(std::vector<boost::filesystem::path>& files);
- // locks and calls _close()
- void close();
+ /** @param data data between header and footer. compressed if recovering. */
+ void processSection(const JSectHeader* h, const void* data, unsigned len, const JSectFooter* f);
- static RecoveryJob& get() { return _instance; }
+ // locks and calls _close()
+ void close();
- private:
+ static RecoveryJob& get() {
+ return _instance;
+ }
- class Last {
- public:
- Last();
- DurableMappedFile* newEntry(const ParsedJournalEntry&, RecoveryJob&);
+private:
+ class Last {
+ public:
+ Last();
+ DurableMappedFile* newEntry(const ParsedJournalEntry&, RecoveryJob&);
- private:
- DurableMappedFile* mmf;
- std::string dbName;
- int fileNo;
- };
+ private:
+ DurableMappedFile* mmf;
+ std::string dbName;
+ int fileNo;
+ };
- void write(Last& last, const ParsedJournalEntry& entry); // actually writes to the file
- void applyEntry(Last& last, const ParsedJournalEntry& entry, bool apply, bool dump);
- void applyEntries(const std::vector<ParsedJournalEntry> &entries);
- bool processFileBuffer(const void *, unsigned len);
- bool processFile(boost::filesystem::path journalfile);
- void _close(); // doesn't lock
+ void write(Last& last, const ParsedJournalEntry& entry); // actually writes to the file
+ void applyEntry(Last& last, const ParsedJournalEntry& entry, bool apply, bool dump);
+ void applyEntries(const std::vector<ParsedJournalEntry>& entries);
+ bool processFileBuffer(const void*, unsigned len);
+ bool processFile(boost::filesystem::path journalfile);
+ void _close(); // doesn't lock
- // Set of memory mapped files and a mutex to protect them
- stdx::mutex _mx;
- std::list<std::shared_ptr<DurableMappedFile> > _mmfs;
+ // Set of memory mapped files and a mutex to protect them
+ stdx::mutex _mx;
+ std::list<std::shared_ptr<DurableMappedFile>> _mmfs;
- // Are we in recovery or WRITETODATAFILES
- bool _recovering;
+ // Are we in recovery or WRITETODATAFILES
+ bool _recovering;
- unsigned long long _lastDataSyncedFromLastRun;
- unsigned long long _lastSeqMentionedInConsoleLog;
+ unsigned long long _lastDataSyncedFromLastRun;
+ unsigned long long _lastSeqMentionedInConsoleLog;
- static RecoveryJob& _instance;
- };
+ static RecoveryJob& _instance;
+};
- void replayJournalFilesAtStartup();
- }
+void replayJournalFilesAtStartup();
+}
}
diff --git a/src/mongo/db/storage/mmap_v1/dur_recovery_unit.cpp b/src/mongo/db/storage/mmap_v1/dur_recovery_unit.cpp
index e826277e7ff..0c9f58988e2 100644
--- a/src/mongo/db/storage/mmap_v1/dur_recovery_unit.cpp
+++ b/src/mongo/db/storage/mmap_v1/dur_recovery_unit.cpp
@@ -45,284 +45,272 @@
namespace mongo {
- DurRecoveryUnit::DurRecoveryUnit()
- : _writeCount(0), _writeBytes(0), _inUnitOfWork(false), _rollbackWritesDisabled(false) {
- }
+DurRecoveryUnit::DurRecoveryUnit()
+ : _writeCount(0), _writeBytes(0), _inUnitOfWork(false), _rollbackWritesDisabled(false) {}
- void DurRecoveryUnit::beginUnitOfWork(OperationContext* opCtx) {
- invariant(!_inUnitOfWork);
- _inUnitOfWork = true;
- }
+void DurRecoveryUnit::beginUnitOfWork(OperationContext* opCtx) {
+ invariant(!_inUnitOfWork);
+ _inUnitOfWork = true;
+}
- void DurRecoveryUnit::commitUnitOfWork() {
- invariant(_inUnitOfWork);
+void DurRecoveryUnit::commitUnitOfWork() {
+ invariant(_inUnitOfWork);
- commitChanges();
+ commitChanges();
- // global journal flush opportunity
- getDur().commitIfNeeded();
+ // global journal flush opportunity
+ getDur().commitIfNeeded();
- resetChanges();
- }
+ resetChanges();
+}
- void DurRecoveryUnit::abortUnitOfWork() {
- invariant(_inUnitOfWork);
+void DurRecoveryUnit::abortUnitOfWork() {
+ invariant(_inUnitOfWork);
- rollbackChanges();
- resetChanges();
- }
+ rollbackChanges();
+ resetChanges();
+}
- void DurRecoveryUnit::abandonSnapshot() {
- invariant(!_inUnitOfWork);
- // no-op since we have no transaction
- }
+void DurRecoveryUnit::abandonSnapshot() {
+ invariant(!_inUnitOfWork);
+ // no-op since we have no transaction
+}
- void DurRecoveryUnit::commitChanges() {
- if (getDur().isDurable())
- markWritesForJournaling();
+void DurRecoveryUnit::commitChanges() {
+ if (getDur().isDurable())
+ markWritesForJournaling();
- try {
- for (Changes::const_iterator it = _changes.begin(), end = _changes.end();
- it != end; ++it) {
- (*it)->commit();
- }
- }
- catch (...) {
- std::terminate();
+ try {
+ for (Changes::const_iterator it = _changes.begin(), end = _changes.end(); it != end; ++it) {
+ (*it)->commit();
}
+ } catch (...) {
+ std::terminate();
}
-
- void DurRecoveryUnit::markWritesForJournaling() {
- if (!_writeCount)
- return;
-
- typedef std::pair<void*, unsigned> Intent;
- std::vector<Intent> intents;
- const size_t numStoredWrites = _initialWrites.size() + _mergedWrites.size();
- intents.reserve(numStoredWrites);
-
- // Show very large units of work at LOG(1) level as they may hint at performance issues
- const int logLevel = (_writeCount > 100*1000 || _writeBytes > 50*1024*1024) ? 1 : 3;
-
- LOG(logLevel) << _writeCount << " writes (" << _writeBytes / 1024 << " kB) covered by "
- << numStoredWrites << " pre-images ("
- << _preimageBuffer.size() / 1024 << " kB) ";
-
- // orders the initial, unmerged writes, by address so we can coalesce overlapping and
- // adjacent writes
- std::sort(_initialWrites.begin(), _initialWrites.end());
-
- if (!_initialWrites.empty()) {
- intents.push_back(std::make_pair(_initialWrites.front().addr,
- _initialWrites.front().len));
- for (InitialWrites::iterator it = (_initialWrites.begin() + 1),
- end = _initialWrites.end();
- it != end;
- ++it) {
- Intent& lastIntent = intents.back();
- char* lastEnd = static_cast<char*>(lastIntent.first) + lastIntent.second;
- if (it->addr <= lastEnd) {
- // overlapping or adjacent, so extend.
- ptrdiff_t extendedLen = (it->end()) - static_cast<char*>(lastIntent.first);
- lastIntent.second = std::max(lastIntent.second, unsigned(extendedLen));
- }
- else {
- // not overlapping, so create a new intent
- intents.push_back(std::make_pair(it->addr, it->len));
- }
+}
+
+void DurRecoveryUnit::markWritesForJournaling() {
+ if (!_writeCount)
+ return;
+
+ typedef std::pair<void*, unsigned> Intent;
+ std::vector<Intent> intents;
+ const size_t numStoredWrites = _initialWrites.size() + _mergedWrites.size();
+ intents.reserve(numStoredWrites);
+
+ // Show very large units of work at LOG(1) level as they may hint at performance issues
+ const int logLevel = (_writeCount > 100 * 1000 || _writeBytes > 50 * 1024 * 1024) ? 1 : 3;
+
+ LOG(logLevel) << _writeCount << " writes (" << _writeBytes / 1024 << " kB) covered by "
+ << numStoredWrites << " pre-images (" << _preimageBuffer.size() / 1024 << " kB) ";
+
+ // orders the initial, unmerged writes, by address so we can coalesce overlapping and
+ // adjacent writes
+ std::sort(_initialWrites.begin(), _initialWrites.end());
+
+ if (!_initialWrites.empty()) {
+ intents.push_back(std::make_pair(_initialWrites.front().addr, _initialWrites.front().len));
+ for (InitialWrites::iterator it = (_initialWrites.begin() + 1), end = _initialWrites.end();
+ it != end;
+ ++it) {
+ Intent& lastIntent = intents.back();
+ char* lastEnd = static_cast<char*>(lastIntent.first) + lastIntent.second;
+ if (it->addr <= lastEnd) {
+ // overlapping or adjacent, so extend.
+ ptrdiff_t extendedLen = (it->end()) - static_cast<char*>(lastIntent.first);
+ lastIntent.second = std::max(lastIntent.second, unsigned(extendedLen));
+ } else {
+ // not overlapping, so create a new intent
+ intents.push_back(std::make_pair(it->addr, it->len));
}
}
+ }
- MergedWrites::iterator it = _mergedWrites.begin();
- if (it != _mergedWrites.end()) {
- intents.push_back(std::make_pair(it->addr, it->len));
- while (++it != _mergedWrites.end()) {
- // Check the property that write intents are sorted and don't overlap.
- invariant(it->addr >= intents.back().first);
- Intent& lastIntent = intents.back();
- char* lastEnd = static_cast<char*>(lastIntent.first) + lastIntent.second;
- if (it->addr == lastEnd) {
- // adjacent, so extend.
- lastIntent.second += it->len;
- }
- else {
- // not overlapping, so create a new intent
- invariant(it->addr > lastEnd);
- intents.push_back(std::make_pair(it->addr, it->len));
- }
+ MergedWrites::iterator it = _mergedWrites.begin();
+ if (it != _mergedWrites.end()) {
+ intents.push_back(std::make_pair(it->addr, it->len));
+ while (++it != _mergedWrites.end()) {
+ // Check the property that write intents are sorted and don't overlap.
+ invariant(it->addr >= intents.back().first);
+ Intent& lastIntent = intents.back();
+ char* lastEnd = static_cast<char*>(lastIntent.first) + lastIntent.second;
+ if (it->addr == lastEnd) {
+ // adjacent, so extend.
+ lastIntent.second += it->len;
+ } else {
+ // not overlapping, so create a new intent
+ invariant(it->addr > lastEnd);
+ intents.push_back(std::make_pair(it->addr, it->len));
}
}
- LOG(logLevel) << _mergedWrites.size() << " pre-images " << "coalesced into "
- << intents.size() << " write intents";
-
- getDur().declareWriteIntents(intents);
- }
-
- void DurRecoveryUnit::resetChanges() {
- _writeCount = 0;
- _writeBytes = 0;
- _initialWrites.clear();
- _mergedWrites.clear();
- _changes.clear();
- _preimageBuffer.clear();
- _rollbackWritesDisabled = false;
- _inUnitOfWork = false;
}
-
- void DurRecoveryUnit::rollbackChanges() {
- // First rollback disk writes, then Changes. This matches behavior in other storage engines
- // that either rollback a transaction or don't write a writebatch.
-
- if (_rollbackWritesDisabled) {
- LOG(2) << " ***** NOT ROLLING BACK " << _writeCount << " disk writes";
+ LOG(logLevel) << _mergedWrites.size() << " pre-images "
+ << "coalesced into " << intents.size() << " write intents";
+
+ getDur().declareWriteIntents(intents);
+}
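+// Worked example (hypothetical addresses): initial writes (0x1000, 16) and
+// (0x1008, 16) sort by address and coalesce into the single intent (0x1000, 24),
+// since the second write begins before the first one ends. Merged writes never
+// overlap, so only exactly-adjacent entries are fused.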
+
+void DurRecoveryUnit::resetChanges() {
+ _writeCount = 0;
+ _writeBytes = 0;
+ _initialWrites.clear();
+ _mergedWrites.clear();
+ _changes.clear();
+ _preimageBuffer.clear();
+ _rollbackWritesDisabled = false;
+ _inUnitOfWork = false;
+}
+
+void DurRecoveryUnit::rollbackChanges() {
+ // First rollback disk writes, then Changes. This matches behavior in other storage engines
+ // that either rollback a transaction or don't write a writebatch.
+
+ if (_rollbackWritesDisabled) {
+ LOG(2) << " ***** NOT ROLLING BACK " << _writeCount << " disk writes";
+ } else {
+ LOG(2) << " ***** ROLLING BACK " << _writeCount << " disk writes";
+
+ // First roll back the merged writes. These have no overlap or ordering requirement
+ // other than needing to be rolled back before all _initialWrites.
+ for (MergedWrites::iterator it = _mergedWrites.begin(); it != _mergedWrites.end(); ++it) {
+ _preimageBuffer.copy(it->addr, it->len, it->offset);
}
- else {
- LOG(2) << " ***** ROLLING BACK " << _writeCount << " disk writes";
-
- // First roll back the merged writes. These have no overlap or ordering requirement
- // other than needing to be rolled back before all _initialWrites.
- for (MergedWrites::iterator it = _mergedWrites.begin();
- it != _mergedWrites.end();
- ++it) {
- _preimageBuffer.copy(it->addr, it->len, it->offset);
- }
- // Then roll back the initial writes in LIFO order, as these might have overlaps.
- for (InitialWrites::reverse_iterator rit = _initialWrites.rbegin();
- rit != _initialWrites.rend();
- ++rit) {
- _preimageBuffer.copy(rit->addr, rit->len, rit->offset);
- }
+ // Then roll back the initial writes in LIFO order, as these might have overlaps.
+ for (InitialWrites::reverse_iterator rit = _initialWrites.rbegin();
+ rit != _initialWrites.rend();
+ ++rit) {
+ _preimageBuffer.copy(rit->addr, rit->len, rit->offset);
}
+ }
- LOG(2) << " ***** ROLLING BACK " << (_changes.size()) << " custom changes";
+ LOG(2) << " ***** ROLLING BACK " << (_changes.size()) << " custom changes";
- try {
- for (int i = _changes.size() - 1; i >= 0; i--) {
- LOG(2) << "CUSTOM ROLLBACK " << demangleName(typeid(*_changes[i]));
- _changes[i]->rollback();
- }
- }
- catch (...) {
- std::terminate();
+ try {
+ for (int i = _changes.size() - 1; i >= 0; i--) {
+ LOG(2) << "CUSTOM ROLLBACK " << demangleName(typeid(*_changes[i]));
+ _changes[i]->rollback();
}
+ } catch (...) {
+ std::terminate();
}
+}
- bool DurRecoveryUnit::waitUntilDurable() {
- invariant(!_inUnitOfWork);
- return getDur().waitUntilDurable();
- }
+bool DurRecoveryUnit::waitUntilDurable() {
+ invariant(!_inUnitOfWork);
+ return getDur().waitUntilDurable();
+}
- void DurRecoveryUnit::mergingWritingPtr(char* addr, size_t len) {
- // The invariant is that all writes are non-overlapping and non-empty. So, a single
- // writingPtr call may result in a number of new segments added. At this point, we cannot
- // in general merge adjacent writes, as that would require inefficient operations on the
- // preimage buffer.
+void DurRecoveryUnit::mergingWritingPtr(char* addr, size_t len) {
+ // The invariant is that all writes are non-overlapping and non-empty. So, a single
+ // writingPtr call may result in a number of new segments added. At this point, we cannot
+ // in general merge adjacent writes, as that would require inefficient operations on the
+ // preimage buffer.
- MergedWrites::iterator coveringWrite = _mergedWrites.upper_bound(Write(addr, 0, 0));
+ MergedWrites::iterator coveringWrite = _mergedWrites.upper_bound(Write(addr, 0, 0));
- char* const end = addr + len;
- while (addr < end) {
- dassert(coveringWrite == _mergedWrites.end() || coveringWrite->end() > addr);
+ char* const end = addr + len;
+ while (addr < end) {
+ dassert(coveringWrite == _mergedWrites.end() || coveringWrite->end() > addr);
- // Determine whether addr[0] is already covered by a write or not.
- // If covered, adjust addr and len to exclude the covered run from addr[0] onwards.
+ // Determine whether addr[0] is already covered by a write or not.
+ // If covered, adjust addr and len to exclude the covered run from addr[0] onwards.
- if (coveringWrite != _mergedWrites.end()) {
- char* const cwEnd = coveringWrite->end();
+ if (coveringWrite != _mergedWrites.end()) {
+ char* const cwEnd = coveringWrite->end();
- if (coveringWrite->addr <= addr) {
-                    // If the covering write begins at or before addr[0], then addr[0] is covered.
- // While the existing pre-image will not generally be the same as the data
- // being written now, during rollback only the oldest pre-image matters.
+ if (coveringWrite->addr <= addr) {
+            // If the covering write begins at or before addr[0], then addr[0] is covered.
+ // While the existing pre-image will not generally be the same as the data
+ // being written now, during rollback only the oldest pre-image matters.
- if (end <= cwEnd) {
- break; // fully covered
- }
-
- addr = cwEnd;
- coveringWrite++;
- dassert(coveringWrite == _mergedWrites.end() || coveringWrite->addr >= cwEnd);
+ if (end <= cwEnd) {
+ break; // fully covered
}
- }
- dassert(coveringWrite == _mergedWrites.end() || coveringWrite->end() > addr);
- // If the next coveringWrite overlaps, adjust the end of the uncovered region.
- char* uncoveredEnd = end;
- if (coveringWrite != _mergedWrites.end() && coveringWrite->addr < end) {
- uncoveredEnd = coveringWrite->addr;
+ addr = cwEnd;
+ coveringWrite++;
+ dassert(coveringWrite == _mergedWrites.end() || coveringWrite->addr >= cwEnd);
}
+ }
+ dassert(coveringWrite == _mergedWrites.end() || coveringWrite->end() > addr);
- const size_t uncoveredLen = uncoveredEnd - addr;
- if (uncoveredLen) {
- // We are writing to a region that hasn't been declared previously.
- _mergedWrites.insert(Write(addr, uncoveredLen, _preimageBuffer.size()));
+ // If the next coveringWrite overlaps, adjust the end of the uncovered region.
+ char* uncoveredEnd = end;
+ if (coveringWrite != _mergedWrites.end() && coveringWrite->addr < end) {
+ uncoveredEnd = coveringWrite->addr;
+ }
- // Windows requires us to adjust the address space *before* we write to anything.
- privateViews.makeWritable(addr, uncoveredLen);
+ const size_t uncoveredLen = uncoveredEnd - addr;
+ if (uncoveredLen) {
+ // We are writing to a region that hasn't been declared previously.
+ _mergedWrites.insert(Write(addr, uncoveredLen, _preimageBuffer.size()));
- if (!_rollbackWritesDisabled) {
- _preimageBuffer.append(addr, uncoveredLen);
- }
- addr = uncoveredEnd;
+ // Windows requires us to adjust the address space *before* we write to anything.
+ privateViews.makeWritable(addr, uncoveredLen);
+
+ if (!_rollbackWritesDisabled) {
+ _preimageBuffer.append(addr, uncoveredLen);
}
+ addr = uncoveredEnd;
}
}
+}
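+// Worked example (hypothetical): if a merged write already covers [a, a+8) and
+// writingPtr hands us [a, a+16), the loop above skips the covered first 8 bytes
+// and records a new Write plus pre-image only for [a+8, a+16).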
- void* DurRecoveryUnit::writingPtr(void* addr, size_t len) {
- invariant(_inUnitOfWork);
-
- if (len == 0) {
- return addr; // Don't need to do anything for empty ranges.
- }
+void* DurRecoveryUnit::writingPtr(void* addr, size_t len) {
+ invariant(_inUnitOfWork);
- invariant(len < size_t(std::numeric_limits<int>::max()));
+ if (len == 0) {
+ return addr; // Don't need to do anything for empty ranges.
+ }
- _writeCount++;
- _writeBytes += len;
- char* const data = static_cast<char*>(addr);
+ invariant(len < size_t(std::numeric_limits<int>::max()));
- // The initial writes are stored in a faster, but less memory-efficient way. This will
- // typically be enough for simple operations, where the extra cost of incremental
-        // coalescing and merging would be too much. For larger writes, more redundancy
-        // is expected, so the cost of checking for duplicates is offset by savings in copying
-        // and allocating preimage buffers. Total memory use of the preimage buffer may be up to
-        // kMaxUnmergedPreimageBytes larger than the amount of memory covered by the write intents.
+ _writeCount++;
+ _writeBytes += len;
+ char* const data = static_cast<char*>(addr);
- const size_t kMaxUnmergedPreimageBytes = kDebugBuild ? 16*1024 : 10*1024*1024;
+ // The initial writes are stored in a faster, but less memory-efficient way. This will
+ // typically be enough for simple operations, where the extra cost of incremental
+    // coalescing and merging would be too much. For larger writes, more redundancy
+    // is expected, so the cost of checking for duplicates is offset by savings in copying
+    // and allocating preimage buffers. Total memory use of the preimage buffer may be up to
+    // kMaxUnmergedPreimageBytes larger than the amount of memory covered by the write intents.
- if (_preimageBuffer.size() + len > kMaxUnmergedPreimageBytes) {
- mergingWritingPtr(data, len);
+ const size_t kMaxUnmergedPreimageBytes = kDebugBuild ? 16 * 1024 : 10 * 1024 * 1024;
- // After a merged write, no more initial writes can occur or there would be an
- // ordering violation during rollback. So, ensure that the if-condition will be true
- // for any future write regardless of length. This is true now because
-            // mergingWritingPtr also stores its first write in _preimageBuffer.
- invariant(_preimageBuffer.size() >= kMaxUnmergedPreimageBytes);
+ if (_preimageBuffer.size() + len > kMaxUnmergedPreimageBytes) {
+ mergingWritingPtr(data, len);
- return addr;
- }
+ // After a merged write, no more initial writes can occur or there would be an
+ // ordering violation during rollback. So, ensure that the if-condition will be true
+ // for any future write regardless of length. This is true now because
+        // mergingWritingPtr also stores its first write in _preimageBuffer.
+ invariant(_preimageBuffer.size() >= kMaxUnmergedPreimageBytes);
- // Windows requires us to adjust the address space *before* we write to anything.
- privateViews.makeWritable(data, len);
+ return addr;
+ }
- _initialWrites.push_back(Write(data, len, _preimageBuffer.size()));
+ // Windows requires us to adjust the address space *before* we write to anything.
+ privateViews.makeWritable(data, len);
- if (!_rollbackWritesDisabled) {
- _preimageBuffer.append(data, len);
- }
+ _initialWrites.push_back(Write(data, len, _preimageBuffer.size()));
- return addr;
+ if (!_rollbackWritesDisabled) {
+ _preimageBuffer.append(data, len);
}
- void DurRecoveryUnit::setRollbackWritesDisabled() {
- invariant(_inUnitOfWork);
- _rollbackWritesDisabled = true;
- }
+ return addr;
+}
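+// Once _preimageBuffer reaches kMaxUnmergedPreimageBytes, every later write in
+// the unit of work takes the mergingWritingPtr path: the invariant above keeps
+// the size check true for any length, which preserves the rollback ordering
+// (merged writes first, then initial writes in LIFO order).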
- void DurRecoveryUnit::registerChange(Change* change) {
- invariant(_inUnitOfWork);
- _changes.push_back(change);
- }
+void DurRecoveryUnit::setRollbackWritesDisabled() {
+ invariant(_inUnitOfWork);
+ _rollbackWritesDisabled = true;
+}
+
+void DurRecoveryUnit::registerChange(Change* change) {
+ invariant(_inUnitOfWork);
+ _changes.push_back(change);
+}
} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/dur_recovery_unit.h b/src/mongo/db/storage/mmap_v1/dur_recovery_unit.h
index d26032e8f26..52f717d29b2 100644
--- a/src/mongo/db/storage/mmap_v1/dur_recovery_unit.h
+++ b/src/mongo/db/storage/mmap_v1/dur_recovery_unit.h
@@ -39,127 +39,131 @@
namespace mongo {
+/**
+ * Just pass through to getDur().
+ */
+class DurRecoveryUnit : public RecoveryUnit {
+public:
+ DurRecoveryUnit();
+
+ void beginUnitOfWork(OperationContext* opCtx) final;
+ void commitUnitOfWork() final;
+ void abortUnitOfWork() final;
+
+ virtual bool waitUntilDurable();
+
+ virtual void abandonSnapshot();
+
+ // The recovery unit takes ownership of change.
+ virtual void registerChange(Change* change);
+
+ virtual void* writingPtr(void* addr, size_t len);
+
+ virtual void setRollbackWritesDisabled();
+
+ virtual SnapshotId getSnapshotId() const {
+ return SnapshotId();
+ }
+
+private:
/**
- * Just pass through to getDur().
+ * Marks writes for journaling, if enabled, and then commits all other Changes in order.
+ * Returns with empty _initialWrites, _mergedWrites, _changes and _preimageBuffer, but
+ * does not reset the _rollbackWritesDisabled or _mustRollback flags. This leaves the
+ * RecoveryUnit ready for more changes that may be committed or rolled back.
*/
- class DurRecoveryUnit : public RecoveryUnit {
- public:
- DurRecoveryUnit();
-
- void beginUnitOfWork(OperationContext* opCtx) final;
- void commitUnitOfWork() final;
- void abortUnitOfWork() final;
-
- virtual bool waitUntilDurable();
-
- virtual void abandonSnapshot();
-
- // The recovery unit takes ownership of change.
- virtual void registerChange(Change* change);
-
- virtual void* writingPtr(void* addr, size_t len);
-
- virtual void setRollbackWritesDisabled();
-
- virtual SnapshotId getSnapshotId() const { return SnapshotId(); }
-
- private:
- /**
- * Marks writes for journaling, if enabled, and then commits all other Changes in order.
- * Returns with empty _initialWrites, _mergedWrites, _changes and _preimageBuffer, but
- * does not reset the _rollbackWritesDisabled or _mustRollback flags. This leaves the
- * RecoveryUnit ready for more changes that may be committed or rolled back.
- */
- void commitChanges();
-
- /**
-         * Creates a list of write intents to be journaled, and hands it off to the active
- * DurabilityInterface.
- */
- void markWritesForJournaling();
-
- /**
- * Restores state by rolling back all writes using the saved pre-images, and then
- * rolling back all other Changes in LIFO order. Resets internal state.
- */
- void rollbackChanges();
-
-
- /**
- * Version of writingPtr that checks existing writes for overlap and only stores those
- * changes not yet covered by an existing write intent and pre-image.
- */
- void mergingWritingPtr(char* data, size_t len);
-
- /**
-         * Reset to a clean state without any uncommitted changes or writes.
- */
- void resetChanges();
-
- // Changes are ordered from oldest to newest.
- typedef OwnedPointerVector<Change> Changes;
- Changes _changes;
-
-
-            // Number of pending uncommitted writes. Incremented even if the new write is fully covered by
- // existing writes.
- size_t _writeCount;
- // Total size of the pending uncommitted writes.
- size_t _writeBytes;
-
- /**
-         * These are memory writes inside the mmapv1 mmap-ed files. A pointer just past the
-         * end is exposed (end()) for the benefit of MergedWrites, which orders writes by
-         * their ending address.
- */
- struct Write {
- Write(char* addr, int len, int offset) : addr(addr), len(len), offset(offset) { }
- Write(const Write& rhs) : addr(rhs.addr), len(rhs.len), offset(rhs.offset) { }
- Write() : addr(0), len(0), offset(0) { }
- bool operator< (const Write& rhs) const { return addr < rhs.addr; }
-
- struct compareEnd {
- bool operator() (const Write& lhs, const Write& rhs) const {
- return lhs.addr + lhs.len < rhs.addr + rhs.len;
- }
- };
-
- char* end() const {
- return addr + len;
- }
+ void commitChanges();
+
+ /**
+     * Creates a list of write intents to be journaled, and hands it off to the active
+ * DurabilityInterface.
+ */
+ void markWritesForJournaling();
+
+ /**
+ * Restores state by rolling back all writes using the saved pre-images, and then
+ * rolling back all other Changes in LIFO order. Resets internal state.
+ */
+ void rollbackChanges();
+
- char* addr;
- int len;
- int offset; // index into _preimageBuffer
+ /**
+ * Version of writingPtr that checks existing writes for overlap and only stores those
+ * changes not yet covered by an existing write intent and pre-image.
+ */
+ void mergingWritingPtr(char* data, size_t len);
+
+ /**
+     * Reset to a clean state without any uncommitted changes or writes.
+ */
+ void resetChanges();
+
+ // Changes are ordered from oldest to newest.
+ typedef OwnedPointerVector<Change> Changes;
+ Changes _changes;
+
+
+    // Number of pending uncommitted writes. Incremented even if the new write is fully covered by
+ // existing writes.
+ size_t _writeCount;
+ // Total size of the pending uncommitted writes.
+ size_t _writeBytes;
+
+ /**
+     * These are memory writes inside the mmapv1 mmap-ed files. A pointer just past the
+     * end is exposed (end()) for the benefit of MergedWrites, which orders writes by
+     * their ending address.
+ */
+ struct Write {
+ Write(char* addr, int len, int offset) : addr(addr), len(len), offset(offset) {}
+ Write(const Write& rhs) : addr(rhs.addr), len(rhs.len), offset(rhs.offset) {}
+ Write() : addr(0), len(0), offset(0) {}
+ bool operator<(const Write& rhs) const {
+ return addr < rhs.addr;
+ }
+
+ struct compareEnd {
+ bool operator()(const Write& lhs, const Write& rhs) const {
+ return lhs.addr + lhs.len < rhs.addr + rhs.len;
+ }
};
- /**
- * Writes are ordered by ending address, so MergedWrites::upper_bound() can find the first
- * overlapping write, if any. Overlapping and duplicate regions are forbidden, as rollback
-         * of MergedWrites undoes changes by address rather than LIFO order. In addition, empty
- * regions are not allowed. Storing writes by age does not work well for large indexed
- * arrays, as coalescing is needed to bound the size of the preimage buffer.
- */
- typedef std::set<Write, Write::compareEnd> MergedWrites;
- MergedWrites _mergedWrites;
-
- // Generally it's more efficient to just store pre-images unconditionally and then
- // sort/eliminate duplicates at commit time. However, this can lead to excessive memory
-        // use in cases involving large indexed arrays, where the same memory is written many
- // times. To keep the speed for the general case and bound memory use, the first few MB of
- // pre-images are stored unconditionally, but once the threshold has been exceeded, the
- // remainder is stored in a more space-efficient datastructure.
- typedef std::vector<Write> InitialWrites;
- InitialWrites _initialWrites;
-
- std::string _preimageBuffer;
-
- bool _inUnitOfWork;
-
-
- // Default is false.
- // If true, no preimages are tracked. If rollback is subsequently attempted, the process
- // will abort.
- bool _rollbackWritesDisabled;
+ char* end() const {
+ return addr + len;
+ }
+
+ char* addr;
+ int len;
+ int offset; // index into _preimageBuffer
};
+ /**
+ * Writes are ordered by ending address, so MergedWrites::upper_bound() can find the first
+ * overlapping write, if any. Overlapping and duplicate regions are forbidden, as rollback
+     * of MergedWrites undoes changes by address rather than LIFO order. In addition, empty
+ * regions are not allowed. Storing writes by age does not work well for large indexed
+ * arrays, as coalescing is needed to bound the size of the preimage buffer.
+ */
+ typedef std::set<Write, Write::compareEnd> MergedWrites;
+ MergedWrites _mergedWrites;
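+    // (Given compareEnd, _mergedWrites.upper_bound(Write(addr, 0, 0)) returns the
+    // first stored write whose end lies strictly past addr -- exactly the one
+    // candidate that could cover addr[0] in mergingWritingPtr.)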
+
+ // Generally it's more efficient to just store pre-images unconditionally and then
+ // sort/eliminate duplicates at commit time. However, this can lead to excessive memory
+    // use in cases involving large indexed arrays, where the same memory is written many
+ // times. To keep the speed for the general case and bound memory use, the first few MB of
+ // pre-images are stored unconditionally, but once the threshold has been exceeded, the
+ // remainder is stored in a more space-efficient datastructure.
+ typedef std::vector<Write> InitialWrites;
+ InitialWrites _initialWrites;
+
+ std::string _preimageBuffer;
+
+ bool _inUnitOfWork;
+
+
+ // Default is false.
+ // If true, no preimages are tracked. If rollback is subsequently attempted, the process
+ // will abort.
+ bool _rollbackWritesDisabled;
+};
+
} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/dur_stats.h b/src/mongo/db/storage/mmap_v1/dur_stats.h
index 27532e9ee59..8ec6f8c024f 100644
--- a/src/mongo/db/storage/mmap_v1/dur_stats.h
+++ b/src/mongo/db/storage/mmap_v1/dur_stats.h
@@ -31,61 +31,64 @@
#include "mongo/db/jsobj.h"
namespace mongo {
- namespace dur {
+namespace dur {
- /** journaling stats. the model here is that the commit thread is the only writer, and that reads are
- uncommon (from a serverStatus command and such). Thus, there should not be multicore chatter overhead.
- */
- struct Stats {
-
- struct S {
- std::string _CSVHeader() const;
- std::string _asCSV() const;
+/** journaling stats. the model here is that the commit thread is the only writer, and that reads are
+ uncommon (from a serverStatus command and such). Thus, there should not be multicore chatter overhead.
+*/
+struct Stats {
+ struct S {
+ std::string _CSVHeader() const;
+ std::string _asCSV() const;
- void _asObj(BSONObjBuilder* builder) const;
+ void _asObj(BSONObjBuilder* builder) const;
- void reset();
+ void reset();
- uint64_t getCurrentDurationMillis() const {
- return ((curTimeMicros64() - _startTimeMicros) / 1000);
- }
+ uint64_t getCurrentDurationMillis() const {
+ return ((curTimeMicros64() - _startTimeMicros) / 1000);
+ }
- // Not reported. Internal use only.
- uint64_t _startTimeMicros;
+ // Not reported. Internal use only.
+ uint64_t _startTimeMicros;
- // Reported statistics
- unsigned _durationMillis;
+ // Reported statistics
+ unsigned _durationMillis;
- unsigned _commits;
- unsigned _commitsInWriteLock;
+ unsigned _commits;
+ unsigned _commitsInWriteLock;
- uint64_t _journaledBytes;
- uint64_t _uncompressedBytes;
- uint64_t _writeToDataFilesBytes;
+ uint64_t _journaledBytes;
+ uint64_t _uncompressedBytes;
+ uint64_t _writeToDataFilesBytes;
- uint64_t _prepLogBufferMicros;
- uint64_t _writeToJournalMicros;
- uint64_t _writeToDataFilesMicros;
- uint64_t _remapPrivateViewMicros;
- uint64_t _commitsMicros;
- uint64_t _commitsInWriteLockMicros;
- };
+ uint64_t _prepLogBufferMicros;
+ uint64_t _writeToJournalMicros;
+ uint64_t _writeToDataFilesMicros;
+ uint64_t _remapPrivateViewMicros;
+ uint64_t _commitsMicros;
+ uint64_t _commitsInWriteLockMicros;
+ };
- Stats();
- void reset();
+ Stats();
+ void reset();
- BSONObj asObj() const;
+ BSONObj asObj() const;
- const S* curr() const { return &_stats[_currIdx]; }
- S* curr() { return &_stats[_currIdx]; }
+ const S* curr() const {
+ return &_stats[_currIdx];
+ }
+ S* curr() {
+ return &_stats[_currIdx];
+ }
- private:
- S _stats[5];
- unsigned _currIdx;
- };
+private:
+ S _stats[5];
+ unsigned _currIdx;
+};
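+// (_stats is a small ring of buckets selected by _currIdx; curr() returns the
+// bucket the commit thread is currently filling, presumably rotated elsewhere
+// so readers can sample a recently completed interval.)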
- extern Stats stats;
- }
+extern Stats stats;
+}
}
diff --git a/src/mongo/db/storage/mmap_v1/durable_mapped_file.cpp b/src/mongo/db/storage/mmap_v1/durable_mapped_file.cpp
index e32c0b15ffe..fad28753372 100644
--- a/src/mongo/db/storage/mmap_v1/durable_mapped_file.cpp
+++ b/src/mongo/db/storage/mmap_v1/durable_mapped_file.cpp
@@ -53,258 +53,260 @@ using namespace mongoutils;
namespace mongo {
- using std::dec;
- using std::endl;
- using std::hex;
- using std::map;
- using std::pair;
- using std::string;
-
- void DurableMappedFile::remapThePrivateView() {
- verify(storageGlobalParams.dur);
-
- _willNeedRemap = false;
-
- // todo 1.9 : it turns out we require that we always remap to the same address.
- // so the remove / add isn't necessary and can be removed?
- void *old = _view_private;
- //privateViews.remove(_view_private);
- _view_private = remapPrivateView(_view_private);
- //privateViews.add(_view_private, this);
- fassert( 16112, _view_private == old );
- }
+using std::dec;
+using std::endl;
+using std::hex;
+using std::map;
+using std::pair;
+using std::string;
+
+void DurableMappedFile::remapThePrivateView() {
+ verify(storageGlobalParams.dur);
+
+ _willNeedRemap = false;
+
+ // todo 1.9 : it turns out we require that we always remap to the same address.
+ // so the remove / add isn't necessary and can be removed?
+ void* old = _view_private;
+ // privateViews.remove(_view_private);
+ _view_private = remapPrivateView(_view_private);
+ // privateViews.add(_view_private, this);
+ fassert(16112, _view_private == old);
+}
- /** register view. threadsafe */
- void PointerToDurableMappedFile::add_inlock(void *view, DurableMappedFile *f) {
- verify(view);
- verify(f);
- clearWritableBits_inlock(view, f->length());
- _views.insert(pair<void*, DurableMappedFile*>(view, f));
- }
+/** register view. threadsafe */
+void PointerToDurableMappedFile::add_inlock(void* view, DurableMappedFile* f) {
+ verify(view);
+ verify(f);
+ clearWritableBits_inlock(view, f->length());
+ _views.insert(pair<void*, DurableMappedFile*>(view, f));
+}
- /** de-register view. threadsafe */
- void PointerToDurableMappedFile::remove(void *view, size_t len) {
- if( view ) {
- stdx::lock_guard<stdx::mutex> lk(_m);
- clearWritableBits_inlock(view, len);
- _views.erase(view);
- }
+/** de-register view. threadsafe */
+void PointerToDurableMappedFile::remove(void* view, size_t len) {
+ if (view) {
+ stdx::lock_guard<stdx::mutex> lk(_m);
+ clearWritableBits_inlock(view, len);
+ _views.erase(view);
}
+}
#ifdef _WIN32
- void PointerToDurableMappedFile::clearWritableBits(void *privateView, size_t len) {
- stdx::lock_guard<stdx::mutex> lk(_m);
- clearWritableBits_inlock(privateView, len);
- }
+void PointerToDurableMappedFile::clearWritableBits(void* privateView, size_t len) {
+ stdx::lock_guard<stdx::mutex> lk(_m);
+ clearWritableBits_inlock(privateView, len);
+}
- /** notification on unmapping so we can clear writable bits */
- void PointerToDurableMappedFile::clearWritableBits_inlock(void *privateView, size_t len) {
- for (unsigned i = reinterpret_cast<size_t>(privateView) / MemoryMappedCOWBitset::ChunkSize;
- i <= (reinterpret_cast<size_t>(privateView) + len) / MemoryMappedCOWBitset::ChunkSize;
- ++i) {
- writable.clear(i);
- dassert(!writable.get(i));
- }
+/** notification on unmapping so we can clear writable bits */
+void PointerToDurableMappedFile::clearWritableBits_inlock(void* privateView, size_t len) {
+ for (unsigned i = reinterpret_cast<size_t>(privateView) / MemoryMappedCOWBitset::ChunkSize;
+ i <= (reinterpret_cast<size_t>(privateView) + len) / MemoryMappedCOWBitset::ChunkSize;
+ ++i) {
+ writable.clear(i);
+ dassert(!writable.get(i));
}
+}
- extern stdx::mutex mapViewMutex;
+extern stdx::mutex mapViewMutex;
- __declspec(noinline) void PointerToDurableMappedFile::makeChunkWritable(size_t chunkno) {
- stdx::lock_guard<stdx::mutex> lkPrivateViews(_m);
+__declspec(noinline) void PointerToDurableMappedFile::makeChunkWritable(size_t chunkno) {
+ stdx::lock_guard<stdx::mutex> lkPrivateViews(_m);
- if (writable.get(chunkno)) // double check lock
- return;
+ if (writable.get(chunkno)) // double check lock
+ return;
- // remap all maps in this chunk.
- // common case is a single map, but could have more than one with smallfiles or .ns files
- size_t chunkStart = chunkno * MemoryMappedCOWBitset::ChunkSize;
- size_t chunkNext = chunkStart + MemoryMappedCOWBitset::ChunkSize;
+ // remap all maps in this chunk.
+ // common case is a single map, but could have more than one with smallfiles or .ns files
+ size_t chunkStart = chunkno * MemoryMappedCOWBitset::ChunkSize;
+ size_t chunkNext = chunkStart + MemoryMappedCOWBitset::ChunkSize;
- stdx::lock_guard<stdx::mutex> lkMapView(mapViewMutex);
+ stdx::lock_guard<stdx::mutex> lkMapView(mapViewMutex);
- map<void*, DurableMappedFile*>::iterator i = _views.upper_bound((void*)(chunkNext - 1));
- while (1) {
- const pair<void*, DurableMappedFile*> x = *(--i);
- DurableMappedFile *mmf = x.second;
- if (mmf == 0)
- break;
+ map<void*, DurableMappedFile*>::iterator i = _views.upper_bound((void*)(chunkNext - 1));
+ while (1) {
+ const pair<void*, DurableMappedFile*> x = *(--i);
+ DurableMappedFile* mmf = x.second;
+ if (mmf == 0)
+ break;
- size_t viewStart = reinterpret_cast<size_t>(x.first);
- size_t viewEnd = viewStart + mmf->length();
- if (viewEnd <= chunkStart)
- break;
+ size_t viewStart = reinterpret_cast<size_t>(x.first);
+ size_t viewEnd = viewStart + mmf->length();
+ if (viewEnd <= chunkStart)
+ break;
- size_t protectStart = std::max(viewStart, chunkStart);
- dassert(protectStart < chunkNext);
+ size_t protectStart = std::max(viewStart, chunkStart);
+ dassert(protectStart < chunkNext);
- size_t protectEnd = std::min(viewEnd, chunkNext);
- size_t protectSize = protectEnd - protectStart;
- dassert(protectSize > 0 && protectSize <= MemoryMappedCOWBitset::ChunkSize);
+ size_t protectEnd = std::min(viewEnd, chunkNext);
+ size_t protectSize = protectEnd - protectStart;
+ dassert(protectSize > 0 && protectSize <= MemoryMappedCOWBitset::ChunkSize);
- DWORD oldProtection;
- bool ok = VirtualProtect(reinterpret_cast<void*>(protectStart),
- protectSize,
- PAGE_WRITECOPY,
- &oldProtection);
- if (!ok) {
- DWORD dosError = GetLastError();
+ DWORD oldProtection;
+ bool ok = VirtualProtect(
+ reinterpret_cast<void*>(protectStart), protectSize, PAGE_WRITECOPY, &oldProtection);
+ if (!ok) {
+ DWORD dosError = GetLastError();
- if (dosError == ERROR_COMMITMENT_LIMIT) {
- // System has run out of memory between physical RAM & page file, tell the user
- BSONObjBuilder bb;
+ if (dosError == ERROR_COMMITMENT_LIMIT) {
+            // The system has run out of memory (physical RAM plus page file); tell the user
+ BSONObjBuilder bb;
- ProcessInfo p;
- p.getExtraInfo(bb);
+ ProcessInfo p;
+ p.getExtraInfo(bb);
- severe() << "MongoDB has exhausted the system memory capacity.";
- severe() << "Current Memory Status: " << bb.obj().toString();
- }
+ severe() << "MongoDB has exhausted the system memory capacity.";
+ severe() << "Current Memory Status: " << bb.obj().toString();
+ }
- severe() << "VirtualProtect for " << mmf->filename()
- << " chunk " << chunkno
- << " failed with " << errnoWithDescription(dosError)
- << " (chunk size is " << protectSize
- << ", address is " << hex << protectStart << dec << ")"
- << " in mongo::makeChunkWritable, terminating"
- << endl;
+ severe() << "VirtualProtect for " << mmf->filename() << " chunk " << chunkno
+ << " failed with " << errnoWithDescription(dosError) << " (chunk size is "
+ << protectSize << ", address is " << hex << protectStart << dec << ")"
+ << " in mongo::makeChunkWritable, terminating" << endl;
- fassertFailed(16362);
- }
+ fassertFailed(16362);
}
-
- writable.set(chunkno);
}
+
+ writable.set(chunkno);
+}
#else
- void PointerToDurableMappedFile::clearWritableBits(void *privateView, size_t len) {
- }
+void PointerToDurableMappedFile::clearWritableBits(void* privateView, size_t len) {}
- void PointerToDurableMappedFile::clearWritableBits_inlock(void *privateView, size_t len) {
- }
+void PointerToDurableMappedFile::clearWritableBits_inlock(void* privateView, size_t len) {}
#endif
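For orientation, makeChunkWritable above is classic double-checked locking: a pointer's chunk number is its address divided by the 64MB ChunkSize, the bitset is probed once without the mutex and once again under it, and only then is the range remapped. A minimal standalone sketch of that shape, assuming nothing beyond the standard library (protectRange is a hypothetical stand-in for the VirtualProtect(PAGE_WRITECOPY) call):

#include <atomic>
#include <cstdint>
#include <mutex>

namespace sketch {
constexpr std::uintptr_t kChunkSize = 64ULL * 1024 * 1024;  // mirrors MemoryMappedCOWBitset::ChunkSize

std::mutex m;
std::atomic<bool> chunkWritable[1024];  // toy stand-in for the AtomicUInt32 bitset; zero-initialized (static storage)

// Hypothetical stand-in for VirtualProtect(..., PAGE_WRITECOPY, ...).
void protectRange(std::uintptr_t /*start*/, std::size_t /*len*/) {}

void makeChunkWritable(void* p) {
    const std::uintptr_t chunkno = reinterpret_cast<std::uintptr_t>(p) / kChunkSize;
    if (chunkWritable[chunkno].load(std::memory_order_relaxed))  // cheap unlocked probe
        return;
    std::lock_guard<std::mutex> lk(m);
    if (chunkWritable[chunkno].load(std::memory_order_relaxed))  // re-check under the lock
        return;
    protectRange(chunkno * kChunkSize, kChunkSize);
    chunkWritable[chunkno].store(true);
}
}  // namespace sketch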
- PointerToDurableMappedFile::PointerToDurableMappedFile() {
+PointerToDurableMappedFile::PointerToDurableMappedFile() {
#if defined(SIZE_MAX)
- size_t max = SIZE_MAX;
+ size_t max = SIZE_MAX;
#else
- size_t max = ~((size_t)0);
+ size_t max = ~((size_t)0);
#endif
- verify( max > (size_t) this ); // just checking that no one redef'd SIZE_MAX and that it is sane
+ verify(max > (size_t) this); // just checking that no one redef'd SIZE_MAX and that it is sane
- // this way we don't need any boundary checking in _find()
- _views.insert( pair<void*,DurableMappedFile*>((void*)0,(DurableMappedFile*)0) );
- _views.insert( pair<void*,DurableMappedFile*>((void*)max,(DurableMappedFile*)0) );
- }
+ // this way we don't need any boundary checking in _find()
+ _views.insert(pair<void*, DurableMappedFile*>((void*)0, (DurableMappedFile*)0));
+ _views.insert(pair<void*, DurableMappedFile*>((void*)max, (DurableMappedFile*)0));
+}
- /** underscore version of find is for when you are already locked
- @param ofs out return our offset in the view
- @return the DurableMappedFile to which this pointer belongs
- */
- DurableMappedFile* PointerToDurableMappedFile::find_inlock(void *p, /*out*/ size_t& ofs) {
- //
- // .................memory..........................
- // v1 p v2
- // [--------------------] [-------]
- //
- // e.g., _find(p) == v1
- //
- const pair<void*,DurableMappedFile*> x = *(--_views.upper_bound(p));
- DurableMappedFile *mmf = x.second;
- if( mmf ) {
- size_t o = ((char *)p) - ((char*)x.first);
- if( o < mmf->length() ) {
- ofs = o;
- return mmf;
- }
+/** underscore version of find is for when you are already locked
+ @param ofs out return our offset in the view
+ @return the DurableMappedFile to which this pointer belongs
+*/
+DurableMappedFile* PointerToDurableMappedFile::find_inlock(void* p, /*out*/ size_t& ofs) {
+ //
+ // .................memory..........................
+ // v1 p v2
+ // [--------------------] [-------]
+ //
+ // e.g., _find(p) == v1
+ //
+ const pair<void*, DurableMappedFile*> x = *(--_views.upper_bound(p));
+ DurableMappedFile* mmf = x.second;
+ if (mmf) {
+ size_t o = ((char*)p) - ((char*)x.first);
+ if (o < mmf->length()) {
+ ofs = o;
+ return mmf;
}
- return 0;
}
+ return 0;
+}
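The reason find_inlock can unconditionally dereference --upper_bound(p) is the pair of sentinel entries the constructor inserts at address 0 and SIZE_MAX: every probe lands strictly between two keys, so the decrement is always valid and no boundary checks are needed. A self-contained illustration of the same trick, with plain integers standing in for view pointers and lengths:

#include <cassert>
#include <climits>
#include <map>

int main() {
    // key = view start, value = view length (0 marks a sentinel)
    std::map<unsigned long, unsigned long> views;
    views[0] = 0;          // low sentinel
    views[ULONG_MAX] = 0;  // high sentinel
    views[1000] = 200;     // a "mapped view" covering [1000, 1200)

    auto find = [&](unsigned long p) -> bool {
        auto it = --views.upper_bound(p);  // safe: sentinels guarantee a predecessor
        return it->second != 0 && p - it->first < it->second;
    };

    assert(find(1100));   // inside the view
    assert(!find(1300));  // past the end of the view
    assert(!find(10));    // resolves to the low sentinel -> not found
    return 0;
}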
- /** find associated MMF object for a given pointer.
- threadsafe
- @param ofs out returns offset into the view of the pointer, if found.
- @return the DurableMappedFile to which this pointer belongs. null if not found.
- */
- DurableMappedFile* PointerToDurableMappedFile::find(void *p, /*out*/ size_t& ofs) {
- stdx::lock_guard<stdx::mutex> lk(_m);
- return find_inlock(p, ofs);
- }
-
- PointerToDurableMappedFile privateViews;
-
- // here so that it is precomputed...
- void DurableMappedFile::setPath(const std::string& f) {
- string suffix;
- string prefix;
- bool ok = str::rSplitOn(f, '.', prefix, suffix);
- uassert(13520, str::stream() << "DurableMappedFile only supports filenames in a certain format " << f, ok);
- if( suffix == "ns" )
- _fileSuffixNo = dur::JEntry::DotNsSuffix;
- else
- _fileSuffixNo = (int) str::toUnsigned(suffix);
+/** find associated MMF object for a given pointer.
+ threadsafe
+ @param ofs out returns offset into the view of the pointer, if found.
+ @return the DurableMappedFile to which this pointer belongs. null if not found.
+*/
+DurableMappedFile* PointerToDurableMappedFile::find(void* p, /*out*/ size_t& ofs) {
+ stdx::lock_guard<stdx::mutex> lk(_m);
+ return find_inlock(p, ofs);
+}
- _p = RelativePath::fromFullPath(storageGlobalParams.dbpath, prefix);
- }
+PointerToDurableMappedFile privateViews;
+
+// here so that it is precomputed...
+void DurableMappedFile::setPath(const std::string& f) {
+ string suffix;
+ string prefix;
+ bool ok = str::rSplitOn(f, '.', prefix, suffix);
+ uassert(13520,
+ str::stream() << "DurableMappedFile only supports filenames in a certain format " << f,
+ ok);
+ if (suffix == "ns")
+ _fileSuffixNo = dur::JEntry::DotNsSuffix;
+ else
+ _fileSuffixNo = (int)str::toUnsigned(suffix);
+
+ _p = RelativePath::fromFullPath(storageGlobalParams.dbpath, prefix);
+}
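setPath just splits the data file name on its last dot: for "somepath/mydb.3" the prefix is "somepath/mydb" and the suffix number is 3, while the ".ns" catalog file gets the DotNsSuffix sentinel (-1, per the header comment). A hedged sketch of the same split using only the standard library; kDotNsSuffix and the error handling are assumptions standing in for dur::JEntry::DotNsSuffix and uassert:

#include <cassert>
#include <string>

const int kDotNsSuffix = -1;  // hypothetical stand-in for dur::JEntry::DotNsSuffix

int suffixNo(const std::string& fname) {
    std::string::size_type dot = fname.rfind('.');
    assert(dot != std::string::npos);  // DurableMappedFile requires "name.suffix"
    std::string suffix = fname.substr(dot + 1);
    return suffix == "ns" ? kDotNsSuffix : std::stoi(suffix);
}

// suffixNo("somepath/mydb.3") == 3
// suffixNo("somepath/mydb.ns") == kDotNsSuffix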
- bool DurableMappedFile::open(const std::string& fname, bool sequentialHint) {
- LOG(3) << "mmf open " << fname;
- invariant(!_view_write);
+bool DurableMappedFile::open(const std::string& fname, bool sequentialHint) {
+ LOG(3) << "mmf open " << fname;
+ invariant(!_view_write);
- setPath(fname);
- _view_write = mapWithOptions(fname.c_str(), sequentialHint ? SEQUENTIAL : 0);
- return finishOpening();
- }
+ setPath(fname);
+ _view_write = mapWithOptions(fname.c_str(), sequentialHint ? SEQUENTIAL : 0);
+ return finishOpening();
+}
- bool DurableMappedFile::create(const std::string& fname, unsigned long long& len, bool sequentialHint) {
- LOG(3) << "mmf create " << fname;
- invariant(!_view_write);
+bool DurableMappedFile::create(const std::string& fname,
+ unsigned long long& len,
+ bool sequentialHint) {
+ LOG(3) << "mmf create " << fname;
+ invariant(!_view_write);
- setPath(fname);
- _view_write = map(fname.c_str(), len, sequentialHint ? SEQUENTIAL : 0);
- return finishOpening();
- }
+ setPath(fname);
+ _view_write = map(fname.c_str(), len, sequentialHint ? SEQUENTIAL : 0);
+ return finishOpening();
+}
- bool DurableMappedFile::finishOpening() {
- LOG(3) << "mmf finishOpening " << (void*) _view_write << ' ' << filename() << " len:" << length();
- if( _view_write ) {
- if (storageGlobalParams.dur) {
- stdx::lock_guard<stdx::mutex> lk2(privateViews._mutex());
-
- _view_private = createPrivateMap();
- if( _view_private == 0 ) {
- msgasserted(13636, str::stream() << "file " << filename() << " open/create failed in createPrivateMap (look in log for more information)");
- }
- privateViews.add_inlock(_view_private, this); // note that testIntent builds use this, even though it points to view_write then...
- }
- else {
- _view_private = _view_write;
+bool DurableMappedFile::finishOpening() {
+ LOG(3) << "mmf finishOpening " << (void*)_view_write << ' ' << filename()
+ << " len:" << length();
+ if (_view_write) {
+ if (storageGlobalParams.dur) {
+ stdx::lock_guard<stdx::mutex> lk2(privateViews._mutex());
+
+ _view_private = createPrivateMap();
+ if (_view_private == 0) {
+ msgasserted(13636,
+ str::stream() << "file " << filename() << " open/create failed "
+ "in createPrivateMap "
+ "(look in log for "
+ "more information)");
}
- return true;
+ privateViews.add_inlock(
+ _view_private,
+ this); // note that testIntent builds use this, even though it points to view_write then...
+ } else {
+ _view_private = _view_write;
}
- return false;
+ return true;
}
+ return false;
+}
- DurableMappedFile::DurableMappedFile() : _willNeedRemap(false) {
- _view_write = _view_private = 0;
- }
+DurableMappedFile::DurableMappedFile() : _willNeedRemap(false) {
+ _view_write = _view_private = 0;
+}
- DurableMappedFile::~DurableMappedFile() {
- try {
- LOG(3) << "mmf close " << filename();
+DurableMappedFile::~DurableMappedFile() {
+ try {
+ LOG(3) << "mmf close " << filename();
- // If _view_private was not set, this means file open failed
- if (_view_private) {
- // Notify the durability system that we are closing a file so it can ensure we
- // will not have journaled operations with no corresponding file.
- getDur().closingFileNotification();
- }
+ // If _view_private was not set, this means file open failed
+ if (_view_private) {
+ // Notify the durability system that we are closing a file so it can ensure we
+ // will not have journaled operations with no corresponding file.
+ getDur().closingFileNotification();
+ }
- LockMongoFilesExclusive lk;
- privateViews.remove(_view_private, length());
+ LockMongoFilesExclusive lk;
+ privateViews.remove(_view_private, length());
- MemoryMappedFile::close();
- }
- catch (...) {
- error() << "exception in ~DurableMappedFile";
- }
+ MemoryMappedFile::close();
+ } catch (...) {
+ error() << "exception in ~DurableMappedFile";
}
}
+}
diff --git a/src/mongo/db/storage/mmap_v1/durable_mapped_file.h b/src/mongo/db/storage/mmap_v1/durable_mapped_file.h
index c4cfb5a6131..02906f112fe 100644
--- a/src/mongo/db/storage/mmap_v1/durable_mapped_file.h
+++ b/src/mongo/db/storage/mmap_v1/durable_mapped_file.h
@@ -37,220 +37,245 @@
namespace mongo {
- /** DurableMappedFile adds some layers atop memory mapped files - specifically our handling of private views & such.
- if you don't care about journaling/durability (temp sort files & such) use MemoryMappedFile class,
- not this.
- */
- class DurableMappedFile : private MemoryMappedFile {
- protected:
- virtual void* viewForFlushing() { return _view_write; }
+/** DurableMappedFile adds some layers atop memory mapped files - specifically our handling of private views & such.
+ if you don't care about journaling/durability (temp sort files & such) use MemoryMappedFile class,
+ not this.
+*/
+class DurableMappedFile : private MemoryMappedFile {
+protected:
+ virtual void* viewForFlushing() {
+ return _view_write;
+ }
- public:
- DurableMappedFile();
- virtual ~DurableMappedFile();
+public:
+ DurableMappedFile();
+ virtual ~DurableMappedFile();
- /** @return true if opened ok. */
- bool open(const std::string& fname, bool sequentialHint /*typically we open with this false*/);
+ /** @return true if opened ok. */
+ bool open(const std::string& fname, bool sequentialHint /*typically we open with this false*/);
- /** @return file length */
- unsigned long long length() const { return MemoryMappedFile::length(); }
+ /** @return file length */
+ unsigned long long length() const {
+ return MemoryMappedFile::length();
+ }
- std::string filename() const { return MemoryMappedFile::filename(); }
+ std::string filename() const {
+ return MemoryMappedFile::filename();
+ }
- void flush(bool sync) { MemoryMappedFile::flush(sync); }
+ void flush(bool sync) {
+ MemoryMappedFile::flush(sync);
+ }
- /* Creates with length if DNE, otherwise uses existing file length,
- passed length.
- @param sequentialHint if true will be sequentially accessed
- @return true for ok
- */
- bool create(const std::string& fname, unsigned long long& len, bool sequentialHint);
+ /* Creates with length if DNE, otherwise uses existing file length,
+ passed length.
+ @param sequentialHint if true will be sequentially accessed
+ @return true for ok
+ */
+ bool create(const std::string& fname, unsigned long long& len, bool sequentialHint);
- /* Get the "standard" view (which is the private one).
- @return the private view.
- */
- void* getView() const { return _view_private; }
-
- /* Get the "write" view (which is required for writing).
- @return the write view.
- */
- void* view_write() const { return _view_write; }
+ /* Get the "standard" view (which is the private one).
+ @return the private view.
+ */
+ void* getView() const {
+ return _view_private;
+ }
- /** for a filename a/b/c.3
- filePath() is "a/b/c"
- fileSuffixNo() is 3
- if the suffix is "ns", fileSuffixNo -1
- */
- const RelativePath& relativePath() const {
- DEV verify( !_p._p.empty() );
- return _p;
- }
+ /* Get the "write" view (which is required for writing).
+ @return the write view.
+ */
+ void* view_write() const {
+ return _view_write;
+ }
- int fileSuffixNo() const { return _fileSuffixNo; }
- HANDLE getFd() { return MemoryMappedFile::getFd(); }
+ /** for a filename a/b/c.3
+ filePath() is "a/b/c"
+ fileSuffixNo() is 3
+       if the suffix is "ns", fileSuffixNo() is -1
+ */
+ const RelativePath& relativePath() const {
+ DEV verify(!_p._p.empty());
+ return _p;
+ }
- /** true if we have written.
- set in PREPLOGBUFFER, it is NOT set immediately on write intent declaration.
- reset to false in REMAPPRIVATEVIEW
- */
- bool willNeedRemap() { return _willNeedRemap; }
- void setWillNeedRemap() { _willNeedRemap = true; }
+ int fileSuffixNo() const {
+ return _fileSuffixNo;
+ }
+ HANDLE getFd() {
+ return MemoryMappedFile::getFd();
+ }
- void remapThePrivateView();
+ /** true if we have written.
+ set in PREPLOGBUFFER, it is NOT set immediately on write intent declaration.
+ reset to false in REMAPPRIVATEVIEW
+ */
+ bool willNeedRemap() {
+ return _willNeedRemap;
+ }
+ void setWillNeedRemap() {
+ _willNeedRemap = true;
+ }
- virtual bool isDurableMappedFile() { return true; }
+ void remapThePrivateView();
- private:
+ virtual bool isDurableMappedFile() {
+ return true;
+ }
- void *_view_write;
- void *_view_private;
- bool _willNeedRemap;
- RelativePath _p; // e.g. "somepath/dbname"
- int _fileSuffixNo; // e.g. 3. -1="ns"
+private:
+ void* _view_write;
+ void* _view_private;
+ bool _willNeedRemap;
+ RelativePath _p; // e.g. "somepath/dbname"
+ int _fileSuffixNo; // e.g. 3. -1="ns"
- void setPath(const std::string& pathAndFileName);
- bool finishOpening();
- };
+ void setPath(const std::string& pathAndFileName);
+ bool finishOpening();
+};
#ifdef _WIN32
- // Simple array based bitset to track COW chunks in memory mapped files on Windows
- // A chunk is a 64MB granular region in virtual memory that we mark as COW everytime we need
- // to write to a memory mapped files on Windows
- //
- class MemoryMappedCOWBitset {
- MONGO_DISALLOW_COPYING(MemoryMappedCOWBitset);
- public:
- // Size of the chunks we mark Copy-On-Write with VirtualProtect
- static const unsigned long long ChunkSize = 64 * 1024 * 1024;
-
- // Number of chunks we store in our bitset which are really 32-bit ints
- static const unsigned long long NChunks = 64 * 1024;
-
- // Total Virtual Memory space we can cover with the bitset
- static const unsigned long long MaxChunkMemory = ChunkSize * NChunks
- * sizeof(unsigned int) * 8;
-
- // Size in bytes of the bitset we allocate
- static const unsigned long long MaxChunkBytes = NChunks * sizeof(unsigned int);
-
- // 128 TB Virtual Memory space in Windows 8.1/2012 R2, 8TB before
- static const unsigned long long MaxWinMemory =
- 128ULL * 1024 * 1024 * 1024 * 1024;
-
- // Make sure that the chunk memory covers the Max Windows user process VM space
- static_assert(MaxChunkMemory == MaxWinMemory,
- "Need a larger bitset to cover max process VM space");
- public:
- MemoryMappedCOWBitset() {
- static_assert(MemoryMappedCOWBitset::MaxChunkBytes == sizeof(bits),
- "Validate our predicted bitset size is correct");
- }
+// Simple array based bitset to track COW chunks in memory mapped files on Windows
+// A chunk is a 64MB granular region in virtual memory that we mark as COW every time we need
+// to write to a memory mapped file on Windows
+//
+class MemoryMappedCOWBitset {
+ MONGO_DISALLOW_COPYING(MemoryMappedCOWBitset);
- bool get(uintptr_t i) const {
- uintptr_t x = i / 32;
- verify(x < MemoryMappedCOWBitset::NChunks);
- return (bits[x].loadRelaxed() & (1 << (i % 32))) != 0;
- }
+public:
+ // Size of the chunks we mark Copy-On-Write with VirtualProtect
+ static const unsigned long long ChunkSize = 64 * 1024 * 1024;
- // Note: assumes caller holds privateViews.mutex
- void set(uintptr_t i) {
- uintptr_t x = i / 32;
- verify(x < MemoryMappedCOWBitset::NChunks);
- bits[x].store( bits[x].loadRelaxed() | (1 << (i % 32)));
- }
+ // Number of chunks we store in our bitset which are really 32-bit ints
+ static const unsigned long long NChunks = 64 * 1024;
- // Note: assumes caller holds privateViews.mutex
- void clear(uintptr_t i) {
- uintptr_t x = i / 32;
- verify(x < MemoryMappedCOWBitset::NChunks);
- bits[x].store(bits[x].loadRelaxed() & ~(1 << (i % 32)));
- }
+ // Total Virtual Memory space we can cover with the bitset
+ static const unsigned long long MaxChunkMemory = ChunkSize * NChunks * sizeof(unsigned int) * 8;
+
+ // Size in bytes of the bitset we allocate
+ static const unsigned long long MaxChunkBytes = NChunks * sizeof(unsigned int);
+
+ // 128 TB Virtual Memory space in Windows 8.1/2012 R2, 8TB before
+ static const unsigned long long MaxWinMemory = 128ULL * 1024 * 1024 * 1024 * 1024;
+
+ // Make sure that the chunk memory covers the Max Windows user process VM space
+ static_assert(MaxChunkMemory == MaxWinMemory,
+ "Need a larger bitset to cover max process VM space");
+
+public:
+ MemoryMappedCOWBitset() {
+ static_assert(MemoryMappedCOWBitset::MaxChunkBytes == sizeof(bits),
+ "Validate our predicted bitset size is correct");
+ }
+
+ bool get(uintptr_t i) const {
+ uintptr_t x = i / 32;
+ verify(x < MemoryMappedCOWBitset::NChunks);
+ return (bits[x].loadRelaxed() & (1 << (i % 32))) != 0;
+ }
+
+ // Note: assumes caller holds privateViews.mutex
+ void set(uintptr_t i) {
+ uintptr_t x = i / 32;
+ verify(x < MemoryMappedCOWBitset::NChunks);
+ bits[x].store(bits[x].loadRelaxed() | (1 << (i % 32)));
+ }
- private:
- // atomic as we are doing double check locking
- AtomicUInt32 bits[MemoryMappedCOWBitset::NChunks];
- };
+ // Note: assumes caller holds privateViews.mutex
+ void clear(uintptr_t i) {
+ uintptr_t x = i / 32;
+ verify(x < MemoryMappedCOWBitset::NChunks);
+ bits[x].store(bits[x].loadRelaxed() & ~(1 << (i % 32)));
+ }
+
+private:
+ // atomic as we are doing double check locking
+ AtomicUInt32 bits[MemoryMappedCOWBitset::NChunks];
+};
#endif
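The bitset packs one bit per 64MB chunk into 32-bit words, so chunk i lives at word i / 32, bit i % 32; get() can use a relaxed load because makeChunkWritable re-checks under the mutex before acting. The word/bit arithmetic on its own, as a runnable sketch:

#include <atomic>
#include <cassert>
#include <cstdint>

std::atomic<std::uint32_t> bits[4];  // zero-initialized (static storage)

bool get(std::uintptr_t i) {
    return (bits[i / 32].load(std::memory_order_relaxed) & (1u << (i % 32))) != 0;
}

void set(std::uintptr_t i) {  // caller serializes writers, as privateViews' mutex does
    bits[i / 32].store(bits[i / 32].load(std::memory_order_relaxed) | (1u << (i % 32)));
}

int main() {
    set(37);  // word 1, bit 5
    assert(get(37) && !get(36));
    return 0;
}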
- /** for durability support we want to be able to map pointers to specific DurableMappedFile objects.
- */
- class PointerToDurableMappedFile {
- MONGO_DISALLOW_COPYING(PointerToDurableMappedFile);
- public:
- PointerToDurableMappedFile();
+/** for durability support we want to be able to map pointers to specific DurableMappedFile objects.
+*/
+class PointerToDurableMappedFile {
+ MONGO_DISALLOW_COPYING(PointerToDurableMappedFile);
- /** register view.
- not-threadsafe, caller must hold _mutex()
- */
- void add_inlock(void *view, DurableMappedFile *f);
+public:
+ PointerToDurableMappedFile();
- /** de-register view.
- threadsafe
- */
- void remove(void *view, size_t length);
+ /** register view.
+ not-threadsafe, caller must hold _mutex()
+ */
+ void add_inlock(void* view, DurableMappedFile* f);
- /** find associated MMF object for a given pointer.
- threadsafe
- @param ofs out returns offset into the view of the pointer, if found.
- @return the DurableMappedFile to which this pointer belongs. null if not found.
+ /** de-register view.
+ threadsafe
*/
- DurableMappedFile* find(void *p, /*out*/ size_t& ofs);
+ void remove(void* view, size_t length);
- /** for doing many finds in a row with one lock operation */
- stdx::mutex& _mutex() { return _m; }
+ /** find associated MMF object for a given pointer.
+ threadsafe
+ @param ofs out returns offset into the view of the pointer, if found.
+ @return the DurableMappedFile to which this pointer belongs. null if not found.
+ */
+ DurableMappedFile* find(void* p, /*out*/ size_t& ofs);
- /** not-threadsafe, caller must hold _mutex() */
- DurableMappedFile* find_inlock(void *p, /*out*/ size_t& ofs);
+ /** for doing many finds in a row with one lock operation */
+ stdx::mutex& _mutex() {
+ return _m;
+ }
+
+ /** not-threadsafe, caller must hold _mutex() */
+ DurableMappedFile* find_inlock(void* p, /*out*/ size_t& ofs);
- /** not-threadsafe, caller must hold _mutex() */
- unsigned numberOfViews_inlock() const { return _views.size(); }
+ /** not-threadsafe, caller must hold _mutex() */
+ unsigned numberOfViews_inlock() const {
+ return _views.size();
+ }
- /** make the private map range writable (necessary for our windows implementation) */
- void makeWritable(void *, unsigned len);
+ /** make the private map range writable (necessary for our windows implementation) */
+ void makeWritable(void*, unsigned len);
- void clearWritableBits(void *privateView, size_t len);
+ void clearWritableBits(void* privateView, size_t len);
- private:
- void clearWritableBits_inlock(void *privateView, size_t len);
+private:
+ void clearWritableBits_inlock(void* privateView, size_t len);
#ifdef _WIN32
- void makeChunkWritable(size_t chunkno);
+ void makeChunkWritable(size_t chunkno);
#endif
- private:
- // PointerToDurableMappedFile Mutex
- //
- // Protects:
- // Protects internal consistency of data structure
- // Lock Ordering:
- // Must be taken before MapViewMutex if both are taken to prevent deadlocks
- stdx::mutex _m;
- std::map<void*, DurableMappedFile*> _views;
+private:
+ // PointerToDurableMappedFile Mutex
+ //
+ // Protects:
+ // Protects internal consistency of data structure
+ // Lock Ordering:
+ // Must be taken before MapViewMutex if both are taken to prevent deadlocks
+ stdx::mutex _m;
+ std::map<void*, DurableMappedFile*> _views;
#ifdef _WIN32
- // Tracks which memory mapped regions are marked as Copy on Write
- MemoryMappedCOWBitset writable;
+ // Tracks which memory mapped regions are marked as Copy on Write
+ MemoryMappedCOWBitset writable;
#endif
- };
+};
#ifdef _WIN32
- inline void PointerToDurableMappedFile::makeWritable(void *privateView, unsigned len) {
- size_t p = reinterpret_cast<size_t>(privateView);
- unsigned a = p / MemoryMappedCOWBitset::ChunkSize;
- unsigned b = (p + len) / MemoryMappedCOWBitset::ChunkSize;
-
- for (unsigned i = a; i <= b; i++) {
- if (!writable.get(i)) {
- makeChunkWritable(i);
- }
+inline void PointerToDurableMappedFile::makeWritable(void* privateView, unsigned len) {
+ size_t p = reinterpret_cast<size_t>(privateView);
+ unsigned a = p / MemoryMappedCOWBitset::ChunkSize;
+ unsigned b = (p + len) / MemoryMappedCOWBitset::ChunkSize;
+
+ for (unsigned i = a; i <= b; i++) {
+ if (!writable.get(i)) {
+ makeChunkWritable(i);
}
}
+}
#else
- inline void PointerToDurableMappedFile::makeWritable(void *_p, unsigned len) {
- }
+inline void PointerToDurableMappedFile::makeWritable(void* _p, unsigned len) {}
#endif
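Note that makeWritable's loop is inclusive of b, so a write that straddles a 64MB chunk boundary marks both chunks writable. A quick check of that boundary arithmetic:

#include <cassert>
#include <cstddef>

int main() {
    const std::size_t ChunkSize = 64ULL * 1024 * 1024;
    std::size_t p = ChunkSize - 16;          // 16 bytes before a chunk boundary
    unsigned len = 32;                       // the write crosses into the next chunk
    unsigned a = p / ChunkSize;              // == 0
    unsigned b = (p + len) / ChunkSize;      // == 1
    assert(a == 0 && b == 1);                // chunks 0 and 1 both get protected
    return 0;
}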
- // allows a pointer into any private view of a DurableMappedFile to be resolved to the DurableMappedFile object
- extern PointerToDurableMappedFile privateViews;
+// allows a pointer into any private view of a DurableMappedFile to be resolved to the DurableMappedFile object
+extern PointerToDurableMappedFile privateViews;
}
diff --git a/src/mongo/db/storage/mmap_v1/durop.cpp b/src/mongo/db/storage/mmap_v1/durop.cpp
index 2a049596593..8efd7720c3e 100644
--- a/src/mongo/db/storage/mmap_v1/durop.cpp
+++ b/src/mongo/db/storage/mmap_v1/durop.cpp
@@ -47,134 +47,133 @@
namespace mongo {
- using std::unique_ptr;
- using std::shared_ptr;
- using std::endl;
- using std::string;
-
- namespace dur {
-
- /** read a durop from journal file referenced by br.
- @param opcode the opcode which has already been written from the bufreader
- */
- shared_ptr<DurOp> DurOp::read(unsigned opcode, BufReader& br) {
- shared_ptr<DurOp> op;
- switch( opcode ) {
- case JEntry::OpCode_FileCreated:
- op = shared_ptr<DurOp>( new FileCreatedOp(br) );
- break;
- case JEntry::OpCode_DropDb:
- op = shared_ptr<DurOp>( new DropDbOp(br) );
- break;
- default:
- massert(13546, (str::stream() << "journal recover: unrecognized opcode in journal " << opcode), false);
- }
- return op;
- }
+using std::unique_ptr;
+using std::shared_ptr;
+using std::endl;
+using std::string;
- void DurOp::serialize(AlignedBuilder& ab) {
- ab.appendNum(_opcode);
- _serialize(ab);
- }
+namespace dur {
- DropDbOp::DropDbOp(BufReader& log) : DurOp(JEntry::OpCode_DropDb) {
- unsigned long long reserved;
- log.read(reserved);
- log.read(reserved);
- log.readStr(_db);
- string reservedStr;
- log.readStr(reservedStr);
- }
+/** read a durop from journal file referenced by br.
+ @param opcode the opcode which has already been written from the bufreader
+*/
+shared_ptr<DurOp> DurOp::read(unsigned opcode, BufReader& br) {
+ shared_ptr<DurOp> op;
+ switch (opcode) {
+ case JEntry::OpCode_FileCreated:
+ op = shared_ptr<DurOp>(new FileCreatedOp(br));
+ break;
+ case JEntry::OpCode_DropDb:
+ op = shared_ptr<DurOp>(new DropDbOp(br));
+ break;
+ default:
+ massert(13546,
+ (str::stream() << "journal recover: unrecognized opcode in journal " << opcode),
+ false);
+ }
+ return op;
+}
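DurOp::read is the factory half of a simple tag-dispatch scheme: serialize() writes the opcode first, and recovery reads the opcode back and constructs the matching subclass. A minimal sketch of the same shape, with hypothetical opcode values and an exception standing in for massert (the real opcodes live in dur::JEntry):

#include <memory>
#include <stdexcept>

struct Op {
    virtual ~Op() {}
    virtual void replay() = 0;
};
struct CreateOp : Op { void replay() override {} };
struct DropOp : Op { void replay() override {} };

enum : unsigned { kCreate = 0xfffff002, kDrop = 0xfffff003 };  // hypothetical sentinels

std::shared_ptr<Op> readOp(unsigned opcode) {
    switch (opcode) {
        case kCreate: return std::make_shared<CreateOp>();
        case kDrop:   return std::make_shared<DropOp>();
        default:      throw std::runtime_error("unrecognized opcode in journal");
    }
}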
- void DropDbOp::_serialize(AlignedBuilder& ab) {
- ab.appendNum((unsigned long long) 0); // reserved for future use
- ab.appendNum((unsigned long long) 0); // reserved for future use
- ab.appendStr(_db);
- ab.appendStr(""); // reserved
- }
+void DurOp::serialize(AlignedBuilder& ab) {
+ ab.appendNum(_opcode);
+ _serialize(ab);
+}
- /** throws */
- void DropDbOp::replay() {
- log() << "recover replay drop db " << _db << endl;
- _deleteDataFiles(_db);
- }
+DropDbOp::DropDbOp(BufReader& log) : DurOp(JEntry::OpCode_DropDb) {
+ unsigned long long reserved;
+ log.read(reserved);
+ log.read(reserved);
+ log.readStr(_db);
+ string reservedStr;
+ log.readStr(reservedStr);
+}
- FileCreatedOp::FileCreatedOp(const std::string& f, unsigned long long l) :
- DurOp(JEntry::OpCode_FileCreated) {
- _p = RelativePath::fromFullPath(storageGlobalParams.dbpath, f);
- _len = l;
- }
+void DropDbOp::_serialize(AlignedBuilder& ab) {
+ ab.appendNum((unsigned long long)0); // reserved for future use
+ ab.appendNum((unsigned long long)0); // reserved for future use
+ ab.appendStr(_db);
+ ab.appendStr(""); // reserved
+}
- FileCreatedOp::FileCreatedOp(BufReader& log) : DurOp(JEntry::OpCode_FileCreated) {
- unsigned long long reserved;
- log.read(reserved);
- log.read(reserved);
- log.read(_len); // size of file, not length of name
- string s;
- log.readStr(s);
- _p._p = s;
- }
+/** throws */
+void DropDbOp::replay() {
+ log() << "recover replay drop db " << _db << endl;
+ _deleteDataFiles(_db);
+}
- void FileCreatedOp::_serialize(AlignedBuilder& ab) {
- ab.appendNum((unsigned long long) 0); // reserved for future use
- ab.appendNum((unsigned long long) 0); // reserved for future use
- ab.appendNum(_len);
- ab.appendStr(_p.toString());
- }
+FileCreatedOp::FileCreatedOp(const std::string& f, unsigned long long l)
+ : DurOp(JEntry::OpCode_FileCreated) {
+ _p = RelativePath::fromFullPath(storageGlobalParams.dbpath, f);
+ _len = l;
+}
- string FileCreatedOp::toString() {
- return str::stream() << "FileCreatedOp " << _p.toString() << ' ' << _len/1024.0/1024.0 << "MB";
- }
+FileCreatedOp::FileCreatedOp(BufReader& log) : DurOp(JEntry::OpCode_FileCreated) {
+ unsigned long long reserved;
+ log.read(reserved);
+ log.read(reserved);
+ log.read(_len); // size of file, not length of name
+ string s;
+ log.readStr(s);
+ _p._p = s;
+}
- // if an operation deletes or creates a file (or moves etc.), it may need files closed.
- bool FileCreatedOp::needFilesClosed() {
- return boost::filesystem::exists( _p.asFullPath() );
- }
+void FileCreatedOp::_serialize(AlignedBuilder& ab) {
+ ab.appendNum((unsigned long long)0); // reserved for future use
+ ab.appendNum((unsigned long long)0); // reserved for future use
+ ab.appendNum(_len);
+ ab.appendStr(_p.toString());
+}
- void FileCreatedOp::replay() {
- // i believe the code assumes new files are filled with zeros. thus we have to recreate the file,
- // or rewrite at least, even if it were the right length. perhaps one day we should change that
- // although easier to avoid defects if we assume it is zeros perhaps.
- string full = _p.asFullPath();
- if( boost::filesystem::exists(full) ) {
- try {
- boost::filesystem::remove(full);
- }
- catch(std::exception& e) {
- LOG(1) << "recover info FileCreateOp::replay unlink " << e.what() << endl;
- }
- }
-
- log() << "recover create file " << full << ' ' << _len/1024.0/1024.0 << "MB" << endl;
- if( boost::filesystem::exists(full) ) {
- // first delete if exists.
- try {
- boost::filesystem::remove(full);
- }
- catch(...) {
- log() << "warning could not delete file " << full << endl;
- }
- }
- ensureParentDirCreated(full);
- File f;
- f.open(full.c_str());
- massert(13547, str::stream() << "recover couldn't create file " << full, f.is_open());
- unsigned long long left = _len;
- const unsigned blksz = 64 * 1024;
- unique_ptr<char[]> v( new char[blksz] );
- memset( v.get(), 0, blksz );
- fileofs ofs = 0;
- while( left ) {
- unsigned long long w = left < blksz ? left : blksz;
- f.write(ofs, v.get(), (unsigned) w);
- left -= w;
- ofs += w;
- }
- f.fsync();
- flushMyDirectory(full);
- massert(13628, str::stream() << "recover failure writing file " << full, !f.bad() );
- }
+string FileCreatedOp::toString() {
+ return str::stream() << "FileCreatedOp " << _p.toString() << ' ' << _len / 1024.0 / 1024.0
+ << "MB";
+}
+// if an operation deletes or creates a file (or moves etc.), it may need files closed.
+bool FileCreatedOp::needFilesClosed() {
+ return boost::filesystem::exists(_p.asFullPath());
+}
+
+void FileCreatedOp::replay() {
+    // The code assumes new files are filled with zeros, so we have to recreate the file (or at
+    // least rewrite it) even if it were already the right length. Perhaps one day we should
+    // change that, although assuming zero-filled files makes defects easier to avoid.
+ string full = _p.asFullPath();
+ if (boost::filesystem::exists(full)) {
+ try {
+ boost::filesystem::remove(full);
+ } catch (std::exception& e) {
+ LOG(1) << "recover info FileCreateOp::replay unlink " << e.what() << endl;
+ }
}
+ log() << "recover create file " << full << ' ' << _len / 1024.0 / 1024.0 << "MB" << endl;
+ if (boost::filesystem::exists(full)) {
+ // first delete if exists.
+ try {
+ boost::filesystem::remove(full);
+ } catch (...) {
+ log() << "warning could not delete file " << full << endl;
+ }
+ }
+ ensureParentDirCreated(full);
+ File f;
+ f.open(full.c_str());
+ massert(13547, str::stream() << "recover couldn't create file " << full, f.is_open());
+ unsigned long long left = _len;
+ const unsigned blksz = 64 * 1024;
+ unique_ptr<char[]> v(new char[blksz]);
+ memset(v.get(), 0, blksz);
+ fileofs ofs = 0;
+ while (left) {
+ unsigned long long w = left < blksz ? left : blksz;
+ f.write(ofs, v.get(), (unsigned)w);
+ left -= w;
+ ofs += w;
+ }
+ f.fsync();
+ flushMyDirectory(full);
+ massert(13628, str::stream() << "recover failure writing file " << full, !f.bad());
+}
+}
}
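replay() above recreates the file and then zero-fills it in fixed 64KB blocks, writing min(left, blksz) bytes per iteration until the requested length is reached. The same loop shape in portable C++, with std::ofstream standing in for mongo's File class:

#include <cstring>
#include <fstream>
#include <memory>

void zeroFill(const char* path, unsigned long long len) {
    const unsigned blksz = 64 * 1024;
    std::unique_ptr<char[]> zeros(new char[blksz]);
    std::memset(zeros.get(), 0, blksz);

    std::ofstream f(path, std::ios::binary | std::ios::trunc);
    unsigned long long left = len;
    while (left) {
        unsigned long long w = left < blksz ? left : blksz;
        f.write(zeros.get(), static_cast<std::streamsize>(w));
        left -= w;
    }
    f.flush();  // the real code additionally fsyncs the file and flushes its directory
}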
diff --git a/src/mongo/db/storage/mmap_v1/durop.h b/src/mongo/db/storage/mmap_v1/durop.h
index 9ebddb3dfc0..a798f210616 100644
--- a/src/mongo/db/storage/mmap_v1/durop.h
+++ b/src/mongo/db/storage/mmap_v1/durop.h
@@ -37,86 +37,93 @@
namespace mongo {
- class AlignedBuilder;
-
- namespace dur {
-
- /** DurOp - Operations we journal that aren't just basic writes.
- *
- * Basic writes are logged as JEntry's, and indicated in ram temporarily as struct dur::WriteIntent.
- * We don't make WriteIntent inherit from DurOp to keep it as lean as possible as there will be millions of
- * them (we don't want a vtable for example there).
- *
- * For each op we want to journal, we define a subclass.
- */
- class DurOp { /* copyable */
- public:
- // @param opcode a sentinel value near max unsigned which uniquely identifies the operation.
- // @see dur::JEntry
- DurOp(unsigned opcode) : _opcode(opcode) { }
-
- virtual ~DurOp() { }
-
- /** serialize the op out to a builder which will then be written (presumably) to the journal */
- void serialize(AlignedBuilder& ab);
-
- /** read a durop from journal file referenced by br.
- @param opcode the opcode which has already been written from the bufreader
- */
- static std::shared_ptr<DurOp> read(unsigned opcode, BufReader& br);
-
- /** replay the operation (during recovery)
- throws
-
- For now, these are not replayed during the normal WRITETODATAFILES phase, since these
- operations are handled in other parts of the code. At some point this may change.
- */
- virtual void replay() = 0;
-
- virtual std::string toString() = 0;
-
- /** if the op requires all file to be closed before doing its work, returns true. */
- virtual bool needFilesClosed() { return false; }
-
- protected:
- /** DurOp will have already written the opcode for you */
- virtual void _serialize(AlignedBuilder& ab) = 0;
-
- private:
- const unsigned _opcode;
- };
-
- /** indicates creation of a new file */
- class FileCreatedOp : public DurOp {
- public:
- FileCreatedOp(BufReader& log);
- /** param f filename to create with path */
- FileCreatedOp(const std::string& f, unsigned long long l);
- virtual void replay();
- virtual std::string toString();
- virtual bool needFilesClosed();
- protected:
- virtual void _serialize(AlignedBuilder& ab);
- private:
- RelativePath _p;
- unsigned long long _len; // size of file, not length of name
- };
-
- /** record drop of a database */
- class DropDbOp : public DurOp {
- public:
- DropDbOp(BufReader& log);
- DropDbOp(const std::string& db) :
- DurOp(JEntry::OpCode_DropDb), _db(db) { }
- virtual void replay();
- virtual std::string toString() { return std::string("DropDbOp ") + _db; }
- virtual bool needFilesClosed() { return true; }
- protected:
- virtual void _serialize(AlignedBuilder& ab);
- private:
- std::string _db;
- };
+class AlignedBuilder;
+
+namespace dur {
+
+/** DurOp - Operations we journal that aren't just basic writes.
+ *
+ * Basic writes are logged as JEntry's, and indicated in RAM temporarily as struct dur::WriteIntent.
+ * We don't make WriteIntent inherit from DurOp to keep it as lean as possible as there will be millions of
+ * them (we don't want a vtable for example there).
+ *
+ * For each op we want to journal, we define a subclass.
+ */
+class DurOp { /* copyable */
+public:
+ // @param opcode a sentinel value near max unsigned which uniquely identifies the operation.
+ // @see dur::JEntry
+ DurOp(unsigned opcode) : _opcode(opcode) {}
+
+ virtual ~DurOp() {}
+
+ /** serialize the op out to a builder which will then be written (presumably) to the journal */
+ void serialize(AlignedBuilder& ab);
+
+ /** read a durop from journal file referenced by br.
+ @param opcode the opcode which has already been written from the bufreader
+ */
+ static std::shared_ptr<DurOp> read(unsigned opcode, BufReader& br);
+
+ /** replay the operation (during recovery)
+ throws
+
+ For now, these are not replayed during the normal WRITETODATAFILES phase, since these
+ operations are handled in other parts of the code. At some point this may change.
+ */
+ virtual void replay() = 0;
+
+ virtual std::string toString() = 0;
+
+ /** if the op requires all file to be closed before doing its work, returns true. */
+ virtual bool needFilesClosed() {
+ return false;
+ }
+protected:
+ /** DurOp will have already written the opcode for you */
+ virtual void _serialize(AlignedBuilder& ab) = 0;
+
+private:
+ const unsigned _opcode;
+};
+
+/** indicates creation of a new file */
+class FileCreatedOp : public DurOp {
+public:
+ FileCreatedOp(BufReader& log);
+ /** param f filename to create with path */
+ FileCreatedOp(const std::string& f, unsigned long long l);
+ virtual void replay();
+ virtual std::string toString();
+ virtual bool needFilesClosed();
+
+protected:
+ virtual void _serialize(AlignedBuilder& ab);
+
+private:
+ RelativePath _p;
+ unsigned long long _len; // size of file, not length of name
+};
+
+/** record drop of a database */
+class DropDbOp : public DurOp {
+public:
+ DropDbOp(BufReader& log);
+ DropDbOp(const std::string& db) : DurOp(JEntry::OpCode_DropDb), _db(db) {}
+ virtual void replay();
+ virtual std::string toString() {
+ return std::string("DropDbOp ") + _db;
+ }
+ virtual bool needFilesClosed() {
+ return true;
}
+protected:
+ virtual void _serialize(AlignedBuilder& ab);
+
+private:
+ std::string _db;
+};
+}
}
diff --git a/src/mongo/db/storage/mmap_v1/extent.cpp b/src/mongo/db/storage/mmap_v1/extent.cpp
index 905e4d28a9e..7f6d41cde80 100644
--- a/src/mongo/db/storage/mmap_v1/extent.cpp
+++ b/src/mongo/db/storage/mmap_v1/extent.cpp
@@ -36,82 +36,70 @@
namespace mongo {
- using std::iostream;
- using std::string;
- using std::vector;
+using std::iostream;
+using std::string;
+using std::vector;
- BOOST_STATIC_ASSERT( sizeof(Extent)-4 == 48+128 );
+BOOST_STATIC_ASSERT(sizeof(Extent) - 4 == 48 + 128);
- BSONObj Extent::dump() const {
- return BSON( "loc" << myLoc.toString()
- << "xnext" << xnext.toString()
- << "xprev" << xprev.toString()
- << "nsdiag" << nsDiagnostic.toString()
- << "size" << length
- << "firstRecord"
- << firstRecord.toString()
- << "lastRecord" << lastRecord.toString() );
- }
+BSONObj Extent::dump() const {
+ return BSON("loc" << myLoc.toString() << "xnext" << xnext.toString() << "xprev"
+ << xprev.toString() << "nsdiag" << nsDiagnostic.toString() << "size" << length
+ << "firstRecord" << firstRecord.toString() << "lastRecord"
+ << lastRecord.toString());
+}
- void Extent::dump(iostream& s) const {
- s << " loc:" << myLoc.toString()
- << " xnext:" << xnext.toString()
- << " xprev:" << xprev.toString() << '\n';
- s << " nsdiag:" << nsDiagnostic.toString() << '\n';
- s << " size:" << length
- << " firstRecord:" << firstRecord.toString()
- << " lastRecord:" << lastRecord.toString() << '\n';
- }
+void Extent::dump(iostream& s) const {
+ s << " loc:" << myLoc.toString() << " xnext:" << xnext.toString()
+ << " xprev:" << xprev.toString() << '\n';
+ s << " nsdiag:" << nsDiagnostic.toString() << '\n';
+ s << " size:" << length << " firstRecord:" << firstRecord.toString()
+ << " lastRecord:" << lastRecord.toString() << '\n';
+}
- bool Extent::validates(const DiskLoc diskLoc, vector<string>* errors) const {
- bool extentOk = true;
- if (magic != extentSignature) {
- if (errors) {
- StringBuilder sb;
- sb << "bad extent signature " << integerToHex(magic)
- << " in extent " << diskLoc.toString();
- errors->push_back( sb.str() );
- }
- extentOk = false;
+bool Extent::validates(const DiskLoc diskLoc, vector<string>* errors) const {
+ bool extentOk = true;
+ if (magic != extentSignature) {
+ if (errors) {
+ StringBuilder sb;
+ sb << "bad extent signature " << integerToHex(magic) << " in extent "
+ << diskLoc.toString();
+ errors->push_back(sb.str());
}
- if (myLoc != diskLoc) {
- if (errors) {
- StringBuilder sb;
- sb << "extent " << diskLoc.toString()
- << " self-pointer is " << myLoc.toString();
- errors->push_back( sb.str() );
- }
- extentOk = false;
+ extentOk = false;
+ }
+ if (myLoc != diskLoc) {
+ if (errors) {
+ StringBuilder sb;
+ sb << "extent " << diskLoc.toString() << " self-pointer is " << myLoc.toString();
+ errors->push_back(sb.str());
}
- if (firstRecord.isNull() != lastRecord.isNull()) {
- if (errors) {
- StringBuilder sb;
- if (firstRecord.isNull()) {
- sb << "in extent " << diskLoc.toString()
- << ", firstRecord is null but lastRecord is "
- << lastRecord.toString();
- }
- else {
- sb << "in extent " << diskLoc.toString()
- << ", firstRecord is " << firstRecord.toString()
- << " but lastRecord is null";
- }
- errors->push_back( sb.str() );
+ extentOk = false;
+ }
+ if (firstRecord.isNull() != lastRecord.isNull()) {
+ if (errors) {
+ StringBuilder sb;
+ if (firstRecord.isNull()) {
+ sb << "in extent " << diskLoc.toString()
+ << ", firstRecord is null but lastRecord is " << lastRecord.toString();
+ } else {
+ sb << "in extent " << diskLoc.toString() << ", firstRecord is "
+ << firstRecord.toString() << " but lastRecord is null";
}
- extentOk = false;
+ errors->push_back(sb.str());
}
- static const int minSize = 0x1000;
- if (length < minSize) {
- if (errors) {
- StringBuilder sb;
- sb << "length of extent " << diskLoc.toString()
- << " is " << length
- << ", which is less than minimum length of " << minSize;
- errors->push_back( sb.str() );
- }
- extentOk = false;
+ extentOk = false;
+ }
+ static const int minSize = 0x1000;
+ if (length < minSize) {
+ if (errors) {
+ StringBuilder sb;
+ sb << "length of extent " << diskLoc.toString() << " is " << length
+ << ", which is less than minimum length of " << minSize;
+ errors->push_back(sb.str());
}
- return extentOk;
+ extentOk = false;
}
-
+ return extentOk;
+}
}
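validates() accumulates human-readable problems rather than stopping at the first, which lets a caller report every defect in one pass; passing NULL for the error vector turns it into a cheap yes/no check. A hedged caller sketch (assumes the mmap_v1 headers declaring Extent, DiskLoc, and the log() stream are included; extent and extentLoc would come from the ExtentManager):

#include <string>
#include <vector>

bool checkExtent(const Extent* extent, const DiskLoc extentLoc) {
    std::vector<std::string> errors;
    if (extent->validates(extentLoc, &errors))
        return true;
    for (size_t i = 0; i < errors.size(); i++)
        log() << "extent problem: " << errors[i];  // report every defect, not just the first
    return false;
}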
diff --git a/src/mongo/db/storage/mmap_v1/extent.h b/src/mongo/db/storage/mmap_v1/extent.h
index a25d34c49e0..9d6d3935346 100644
--- a/src/mongo/db/storage/mmap_v1/extent.h
+++ b/src/mongo/db/storage/mmap_v1/extent.h
@@ -39,45 +39,50 @@
namespace mongo {
- /* extents are datafile regions where all the records within the region
- belong to the same namespace.
+/* extents are datafile regions where all the records within the region
+ belong to the same namespace.
- (11:12:35 AM) dm10gen: when the extent is allocated, all its empty space is stuck into one big DeletedRecord
- (11:12:55 AM) dm10gen: and that is placed on the free list
- */
+(11:12:35 AM) dm10gen: when the extent is allocated, all its empty space is stuck into one big DeletedRecord
+(11:12:55 AM) dm10gen: and that is placed on the free list
+*/
#pragma pack(1)
- struct Extent {
- enum { extentSignature = 0x41424344 };
- unsigned magic;
- DiskLoc myLoc;
+struct Extent {
+ enum { extentSignature = 0x41424344 };
+ unsigned magic;
+ DiskLoc myLoc;
- /* next/prev extent for this namespace */
- DiskLoc xnext;
- DiskLoc xprev;
+ /* next/prev extent for this namespace */
+ DiskLoc xnext;
+ DiskLoc xprev;
- /* which namespace this extent is for. this is just for troubleshooting really
- and won't even be correct if the collection were renamed!
- */
- Namespace nsDiagnostic;
+ /* which namespace this extent is for. this is just for troubleshooting really
+ and won't even be correct if the collection were renamed!
+ */
+ Namespace nsDiagnostic;
- int length; /* size of the extent, including these fields */
- DiskLoc firstRecord;
- DiskLoc lastRecord;
- char _extentData[4];
+ int length; /* size of the extent, including these fields */
+ DiskLoc firstRecord;
+ DiskLoc lastRecord;
+ char _extentData[4];
- // -----
+ // -----
- bool validates(const DiskLoc diskLoc, std::vector<std::string>* errors = NULL) const;
+ bool validates(const DiskLoc diskLoc, std::vector<std::string>* errors = NULL) const;
- BSONObj dump() const;
+ BSONObj dump() const;
- void dump(std::iostream& s) const;
+ void dump(std::iostream& s) const;
- bool isOk() const { return magic == extentSignature; }
- void assertOk() const { verify(isOk()); }
+ bool isOk() const {
+ return magic == extentSignature;
+ }
+ void assertOk() const {
+ verify(isOk());
+ }
- static int HeaderSize() { return sizeof(Extent)-4; }
- };
+ static int HeaderSize() {
+ return sizeof(Extent) - 4;
+ }
+};
#pragma pack()
-
}
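With #pragma pack(1) the on-disk layout is exactly the declared fields back to back: 4 (magic) + 8 (myLoc) + 8 + 8 (xnext/xprev) + 128 (Namespace) + 4 (length) + 8 + 8 (firstRecord/lastRecord) + 4 (_extentData), which is the 48 + 128 + 4 bytes pinned down by the BOOST_STATIC_ASSERT in extent.cpp, and why HeaderSize() subtracts the 4 trailing data bytes. A static check of the same idea on a toy struct (fixed-width integers stand in for DiskLoc and Namespace):

#include <cstdint>

#pragma pack(1)
struct ToyExtent {
    std::uint32_t magic;         // 4
    std::uint64_t myLoc;         // 8   (stands in for DiskLoc)
    std::uint64_t xnext, xprev;  // 16
    char nsDiagnostic[128];      // 128 (stands in for Namespace)
    std::int32_t length;         // 4
    std::uint64_t firstRecord;   // 8
    std::uint64_t lastRecord;    // 8
    char _extentData[4];         // 4, excluded from HeaderSize()
};
#pragma pack()

static_assert(sizeof(ToyExtent) - 4 == 48 + 128, "layout matches the assert in extent.cpp");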
diff --git a/src/mongo/db/storage/mmap_v1/extent_manager.cpp b/src/mongo/db/storage/mmap_v1/extent_manager.cpp
index 8efc2cbc50f..15222fac01a 100644
--- a/src/mongo/db/storage/mmap_v1/extent_manager.cpp
+++ b/src/mongo/db/storage/mmap_v1/extent_manager.cpp
@@ -34,66 +34,64 @@
namespace mongo {
- int ExtentManager::quantizeExtentSize( int size ) const {
+int ExtentManager::quantizeExtentSize(int size) const {
+ if (size == maxSize()) {
+ // no point doing quantizing for the entire file
+ return size;
+ }
- if ( size == maxSize() ) {
- // no point doing quantizing for the entire file
- return size;
- }
+ invariant(size <= maxSize());
- invariant( size <= maxSize() );
+ // make sizes align with VM page size
+ int newSize = (size + 0xfff) & 0xfffff000;
- // make sizes align with VM page size
- int newSize = (size + 0xfff) & 0xfffff000;
+ if (newSize > maxSize()) {
+ return maxSize();
+ }
- if ( newSize > maxSize() ) {
- return maxSize();
- }
+ if (newSize < minSize()) {
+ return minSize();
+ }
- if ( newSize < minSize() ) {
- return minSize();
- }
+ return newSize;
+}
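The mask arithmetic rounds up to the next 4KB page: add 0xfff, then clear the low 12 bits. For instance, a 5,000-byte request quantizes to 8,192 bytes, while an already-aligned size stays put (the full function additionally clamps to [minSize(), maxSize()], as shown above). A quick check:

#include <cassert>

int main() {
    auto quantize = [](int size) { return (size + 0xfff) & 0xfffff000; };
    assert(quantize(5000) == 8192);  // rounded up to the next 4KB boundary
    assert(quantize(8192) == 8192);  // already page aligned
    assert(quantize(1) == 4096);     // minimum one page
    return 0;
}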
- return newSize;
+int ExtentManager::followupSize(int len, int lastExtentLen) const {
+ invariant(len < maxSize());
+ int x = initialSize(len);
+ // changed from 1.20 to 1.35 in v2.1.x to get to larger extent size faster
+ int y = (int)(lastExtentLen < 4000000 ? lastExtentLen * 4.0 : lastExtentLen * 1.35);
+ int sz = y > x ? y : x;
+
+ if (sz < lastExtentLen) {
+ // this means there was an int overflow
+ // so we should turn it into maxSize
+ return maxSize();
+ } else if (sz > maxSize()) {
+ return maxSize();
}
- int ExtentManager::followupSize( int len, int lastExtentLen ) const {
- invariant( len < maxSize() );
- int x = initialSize(len);
- // changed from 1.20 to 1.35 in v2.1.x to get to larger extent size faster
- int y = (int) (lastExtentLen < 4000000 ? lastExtentLen * 4.0 : lastExtentLen * 1.35);
- int sz = y > x ? y : x;
-
- if ( sz < lastExtentLen ) {
- // this means there was an int overflow
- // so we should turn it into maxSize
- return maxSize();
- }
- else if ( sz > maxSize() ) {
- return maxSize();
- }
-
- sz = quantizeExtentSize( sz );
- verify( sz >= len );
-
- return sz;
- }
+ sz = quantizeExtentSize(sz);
+ verify(sz >= len);
- int ExtentManager::initialSize( int len ) const {
- invariant( len <= maxSize() );
+ return sz;
+}
- long long sz = len * 16;
- if ( len < 1000 )
- sz = len * 64;
+int ExtentManager::initialSize(int len) const {
+ invariant(len <= maxSize());
- if ( sz >= maxSize() )
- return maxSize();
+ long long sz = len * 16;
+ if (len < 1000)
+ sz = len * 64;
- if ( sz <= minSize() )
- return minSize();
+ if (sz >= maxSize())
+ return maxSize();
- int z = ExtentManager::quantizeExtentSize( sz );
- verify( z >= len );
- return z;
- }
+ if (sz <= minSize())
+ return minSize();
+
+ int z = ExtentManager::quantizeExtentSize(sz);
+ verify(z >= len);
+ return z;
+}
}
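Together initialSize and followupSize implement the exponential-growth policy: a first extent is 16x the record length (64x for records under 1000 bytes), and each follow-up is the larger of initialSize(len) and 4x the previous extent, dropping to 1.35x once extents reach 4MB, with everything clamped and page-quantized. Worked numbers under those rules:

#include <cassert>

int main() {
    // initialSize: len * 64 below 1000 bytes, len * 16 at or above.
    assert(500 * 64 == 32000);   // then quantized up to 32768
    assert(5000 * 16 == 80000);  // then quantized up to 81920

    // followupSize growth: 4.0x until the last extent reaches 4MB, then 1.35x.
    int last = 1000000;  // 1MB
    assert(static_cast<int>(last * 4.0) == 4000000);
    last = 8000000;      // 8MB
    assert(static_cast<int>(last * 1.35) == 10800000);
    return 0;
}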
diff --git a/src/mongo/db/storage/mmap_v1/extent_manager.h b/src/mongo/db/storage/mmap_v1/extent_manager.h
index 54191faa2cf..6151f8e11a2 100644
--- a/src/mongo/db/storage/mmap_v1/extent_manager.h
+++ b/src/mongo/db/storage/mmap_v1/extent_manager.h
@@ -40,141 +40,141 @@
namespace mongo {
- class DataFile;
- class MmapV1RecordHeader;
- class RecordFetcher;
- class OperationContext;
+class DataFile;
+class MmapV1RecordHeader;
+class RecordFetcher;
+class OperationContext;
- struct Extent;
+struct Extent;
+
+/**
+ * ExtentManager basics
+ * - one per database
+ * - responsible for managing <db>.# files
+ * - NOT responsible for .ns file
+ * - gives out extents
+ * - responsible for figuring out how to get a new extent
+ * - can use any method it wants to do so
+ * - this structure is NOT stored on disk
+ * - files will not be removed from the EM
+ * - extent size and loc are immutable
+ * - this class is thread safe, once constructed and init()-ialized
+ */
+class ExtentManager {
+ MONGO_DISALLOW_COPYING(ExtentManager);
+
+public:
+ ExtentManager() {}
+
+ virtual ~ExtentManager() {}
+
+ /**
+ * opens all current files
+ */
+ virtual Status init(OperationContext* txn) = 0;
+
+ virtual int numFiles() const = 0;
+ virtual long long fileSize() const = 0;
+
+ // must call Extent::reuse on the returned extent
+ virtual DiskLoc allocateExtent(OperationContext* txn,
+ bool capped,
+ int size,
+ bool enforceQuota) = 0;
+
+ /**
+ * firstExt has to be == lastExt or a chain
+ */
+ virtual void freeExtents(OperationContext* txn, DiskLoc firstExt, DiskLoc lastExt) = 0;
/**
- * ExtentManager basics
- * - one per database
- * - responsible for managing <db>.# files
- * - NOT responsible for .ns file
- * - gives out extents
- * - responsible for figuring out how to get a new extent
- * - can use any method it wants to do so
- * - this structure is NOT stored on disk
- * - files will not be removed from the EM
- * - extent size and loc are immutable
- * - this class is thread safe, once constructed and init()-ialized
+ * frees a single extent
+ * ignores all fields in the Extent except: magic, myLoc, length
*/
- class ExtentManager {
- MONGO_DISALLOW_COPYING( ExtentManager );
+ virtual void freeExtent(OperationContext* txn, DiskLoc extent) = 0;
+ /**
+     * Retrieve statistics on the free list managed by this ExtentManager.
+ * @param numExtents - non-null pointer to an int that will receive the number of extents
+ * @param totalFreeSizeBytes - non-null pointer to an int64_t receiving the total free
+ * space in the free list.
+ */
+ virtual void freeListStats(OperationContext* txn,
+ int* numExtents,
+ int64_t* totalFreeSizeBytes) const = 0;
+
+ /**
+ * @param loc - has to be for a specific MmapV1RecordHeader
+ * Note(erh): this sadly cannot be removed.
+ * A MmapV1RecordHeader DiskLoc has an offset from a file, while a RecordStore really wants an offset
+ * from an extent. This intrinsically links an original record store to the original extent
+ * manager.
+ */
+ virtual MmapV1RecordHeader* recordForV1(const DiskLoc& loc) const = 0;
+
+ /**
+ * The extent manager tracks accesses to DiskLocs. This returns non-NULL if the DiskLoc has
+ * been recently accessed, and therefore has likely been paged into physical memory.
+ * Returns nullptr if the DiskLoc is Null.
+ *
+ */
+ virtual std::unique_ptr<RecordFetcher> recordNeedsFetch(const DiskLoc& loc) const = 0;
+
+ /**
+ * @param loc - has to be for a specific MmapV1RecordHeader (not an Extent)
+ * Note(erh) see comment on recordFor
+ */
+ virtual Extent* extentForV1(const DiskLoc& loc) const = 0;
+
+ /**
+ * @param loc - has to be for a specific MmapV1RecordHeader (not an Extent)
+ * Note(erh) see comment on recordFor
+ */
+ virtual DiskLoc extentLocForV1(const DiskLoc& loc) const = 0;
+
+ /**
+ * @param loc - has to be for a specific Extent
+ */
+ virtual Extent* getExtent(const DiskLoc& loc, bool doSanityCheck = true) const = 0;
+
+ /**
+ * @return maximum size of an Extent
+ */
+ virtual int maxSize() const = 0;
+
+ /**
+ * @return minimum size of an Extent
+ */
+ virtual int minSize() const {
+ return 0x1000;
+ }
+
+ /**
+ * @param recordLen length of record we need
+ * @param lastExt size of last extent which is a factor in next extent size
+ */
+ virtual int followupSize(int recordLen, int lastExtentLen) const;
+
+ /** get a suggested size for the first extent in a namespace
+ * @param recordLen length of record we need to insert
+ */
+ virtual int initialSize(int recordLen) const;
+
+ /**
+     * quantizes extent size: rounds up to a page boundary, clamped to [minSize(), maxSize()]
+ */
+ virtual int quantizeExtentSize(int size) const;
+
+ // see cacheHint methods
+ enum HintType { Sequential, Random };
+ class CacheHint {
public:
- ExtentManager(){}
-
- virtual ~ExtentManager(){}
-
- /**
- * opens all current files
- */
- virtual Status init(OperationContext* txn) = 0;
-
- virtual int numFiles() const = 0;
- virtual long long fileSize() const = 0;
-
- // must call Extent::reuse on the returned extent
- virtual DiskLoc allocateExtent( OperationContext* txn,
- bool capped,
- int size,
- bool enforceQuota ) = 0;
-
- /**
- * firstExt has to be == lastExt or a chain
- */
- virtual void freeExtents( OperationContext* txn,
- DiskLoc firstExt, DiskLoc lastExt ) = 0;
-
- /**
- * frees a single extent
- * ignores all fields in the Extent except: magic, myLoc, length
- */
- virtual void freeExtent( OperationContext* txn, DiskLoc extent ) = 0;
-
- /**
- * Retrieve statistics on the the free list managed by this ExtentManger.
- * @param numExtents - non-null pointer to an int that will receive the number of extents
- * @param totalFreeSizeBytes - non-null pointer to an int64_t receiving the total free
- * space in the free list.
- */
- virtual void freeListStats(OperationContext* txn,
- int* numExtents,
- int64_t* totalFreeSizeBytes) const = 0;
-
- /**
- * @param loc - has to be for a specific MmapV1RecordHeader
- * Note(erh): this sadly cannot be removed.
- * A MmapV1RecordHeader DiskLoc has an offset from a file, while a RecordStore really wants an offset
- * from an extent. This intrinsically links an original record store to the original extent
- * manager.
- */
- virtual MmapV1RecordHeader* recordForV1( const DiskLoc& loc ) const = 0;
-
- /**
- * The extent manager tracks accesses to DiskLocs. This returns non-NULL if the DiskLoc has
- * been recently accessed, and therefore has likely been paged into physical memory.
- * Returns nullptr if the DiskLoc is Null.
- *
- */
- virtual std::unique_ptr<RecordFetcher> recordNeedsFetch( const DiskLoc& loc ) const = 0;
-
- /**
- * @param loc - has to be for a specific MmapV1RecordHeader (not an Extent)
- * Note(erh) see comment on recordFor
- */
- virtual Extent* extentForV1( const DiskLoc& loc ) const = 0;
-
- /**
- * @param loc - has to be for a specific MmapV1RecordHeader (not an Extent)
- * Note(erh) see comment on recordFor
- */
- virtual DiskLoc extentLocForV1( const DiskLoc& loc ) const = 0;
-
- /**
- * @param loc - has to be for a specific Extent
- */
- virtual Extent* getExtent( const DiskLoc& loc, bool doSanityCheck = true ) const = 0;
-
- /**
- * @return maximum size of an Extent
- */
- virtual int maxSize() const = 0;
-
- /**
- * @return minimum size of an Extent
- */
- virtual int minSize() const { return 0x1000; }
-
- /**
- * @param recordLen length of record we need
- * @param lastExt size of last extent which is a factor in next extent size
- */
- virtual int followupSize( int recordLen, int lastExtentLen ) const;
-
- /** get a suggested size for the first extent in a namespace
- * @param recordLen length of record we need to insert
- */
- virtual int initialSize( int recordLen ) const;
-
- /**
- * quantizes extent size to >= min + page boundary
- */
- virtual int quantizeExtentSize( int size ) const;
-
- // see cacheHint methods
- enum HintType { Sequential, Random };
- class CacheHint {
- public:
- virtual ~CacheHint(){}
- };
- /**
- * Tell the system that for this extent, it will have this kind of disk access.
- * Caller takes owernship of CacheHint
- */
- virtual CacheHint* cacheHint( const DiskLoc& extentLoc, const HintType& hint ) = 0;
+ virtual ~CacheHint() {}
};
-
+ /**
+ * Tell the system that for this extent, it will have this kind of disk access.
+     * Caller takes ownership of the CacheHint
+ */
+ virtual CacheHint* cacheHint(const DiskLoc& extentLoc, const HintType& hint) = 0;
+};
}
diff --git a/src/mongo/db/storage/mmap_v1/file_allocator.cpp b/src/mongo/db/storage/mmap_v1/file_allocator.cpp
index bedd7d9e03d..0500ad43a83 100644
--- a/src/mongo/db/storage/mmap_v1/file_allocator.cpp
+++ b/src/mongo/db/storage/mmap_v1/file_allocator.cpp
@@ -38,16 +38,16 @@
#include <fcntl.h>
#if defined(__FreeBSD__)
-# include <sys/param.h>
-# include <sys/mount.h>
+#include <sys/param.h>
+#include <sys/mount.h>
#endif
#if defined(__linux__)
-# include <sys/vfs.h>
+#include <sys/vfs.h>
#endif
#if defined(_WIN32)
-# include <io.h>
+#include <io.h>
#endif
#include "mongo/db/storage/paths.h"
@@ -71,402 +71,410 @@ using namespace mongoutils;
namespace mongo {
- using std::endl;
- using std::list;
- using std::string;
- using std::stringstream;
+using std::endl;
+using std::list;
+using std::string;
+using std::stringstream;
- // unique number for temporary file names
- unsigned long long FileAllocator::_uniqueNumber = 0;
- static SimpleMutex _uniqueNumberMutex;
+// unique number for temporary file names
+unsigned long long FileAllocator::_uniqueNumber = 0;
+static SimpleMutex _uniqueNumberMutex;
- MONGO_FP_DECLARE(allocateDiskFull);
+MONGO_FP_DECLARE(allocateDiskFull);
- /**
- * Aliases for Win32 CRT functions
- */
+/**
+ * Aliases for Win32 CRT functions
+ */
#if defined(_WIN32)
- static inline long lseek(int fd, long offset, int origin) { return _lseek(fd, offset, origin); }
- static inline int write(int fd, const void *data, int count) { return _write(fd, data, count); }
- static inline int close(int fd) { return _close(fd); }
-
- typedef BOOL (CALLBACK *GetVolumeInformationByHandleWPtr)(HANDLE, LPWSTR, DWORD, LPDWORD, LPDWORD, LPDWORD, LPWSTR, DWORD);
- GetVolumeInformationByHandleWPtr GetVolumeInformationByHandleWFunc;
-
- MONGO_INITIALIZER(InitGetVolumeInformationByHandleW)(InitializerContext *context) {
- HMODULE kernelLib = LoadLibraryA("kernel32.dll");
- if (kernelLib) {
- GetVolumeInformationByHandleWFunc = reinterpret_cast<GetVolumeInformationByHandleWPtr>
- (GetProcAddress(kernelLib, "GetVolumeInformationByHandleW"));
- }
- return Status::OK();
+static inline long lseek(int fd, long offset, int origin) {
+ return _lseek(fd, offset, origin);
+}
+static inline int write(int fd, const void* data, int count) {
+ return _write(fd, data, count);
+}
+static inline int close(int fd) {
+ return _close(fd);
+}
+
+typedef BOOL(CALLBACK* GetVolumeInformationByHandleWPtr)(
+ HANDLE, LPWSTR, DWORD, LPDWORD, LPDWORD, LPDWORD, LPWSTR, DWORD);
+GetVolumeInformationByHandleWPtr GetVolumeInformationByHandleWFunc;
+
+MONGO_INITIALIZER(InitGetVolumeInformationByHandleW)(InitializerContext* context) {
+ HMODULE kernelLib = LoadLibraryA("kernel32.dll");
+ if (kernelLib) {
+ GetVolumeInformationByHandleWFunc = reinterpret_cast<GetVolumeInformationByHandleWPtr>(
+ GetProcAddress(kernelLib, "GetVolumeInformationByHandleW"));
}
+ return Status::OK();
+}
#endif
- boost::filesystem::path ensureParentDirCreated(const boost::filesystem::path& p){
- const boost::filesystem::path parent = p.branch_path();
+boost::filesystem::path ensureParentDirCreated(const boost::filesystem::path& p) {
+ const boost::filesystem::path parent = p.branch_path();
- if (! boost::filesystem::exists(parent)){
- ensureParentDirCreated(parent);
- log() << "creating directory " << parent.string() << endl;
- boost::filesystem::create_directory(parent);
- flushMyDirectory(parent); // flushes grandparent to ensure parent exists after crash
- }
-
- verify(boost::filesystem::is_directory(parent));
- return parent;
+ if (!boost::filesystem::exists(parent)) {
+ ensureParentDirCreated(parent);
+ log() << "creating directory " << parent.string() << endl;
+ boost::filesystem::create_directory(parent);
+ flushMyDirectory(parent); // flushes grandparent to ensure parent exists after crash
}
- FileAllocator::FileAllocator() : _failed() {}
+ verify(boost::filesystem::is_directory(parent));
+ return parent;
+}
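
ensureParentDirCreated() above recurses toward the filesystem root and, crucially, flushes the grandparent after each create so the new directory entry itself survives a crash. A self-contained POSIX sketch of that idiom (simplified and non-recursive; the real code goes through boost::filesystem and flushMyDirectory):

    #include <fcntl.h>
    #include <sys/stat.h>
    #include <unistd.h>
    #include <string>

    // Create 'dir' and fsync its parent so the new directory entry is durable.
    bool createDirDurably(const std::string& dir, const std::string& parent) {
        if (mkdir(dir.c_str(), 0755) != 0)
            return false;
        int pfd = open(parent.c_str(), O_RDONLY);
        if (pfd < 0)
            return false;
        bool ok = fsync(pfd) == 0;  // persist the new entry in 'parent'
        close(pfd);
        return ok;
    }
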
+FileAllocator::FileAllocator() : _failed() {}
- void FileAllocator::start() {
- stdx::thread t( stdx::bind( &FileAllocator::run , this ) );
- }
- void FileAllocator::requestAllocation( const string &name, long &size ) {
- stdx::lock_guard<stdx::mutex> lk( _pendingMutex );
- if ( _failed )
- return;
- long oldSize = prevSize( name );
- if ( oldSize != -1 ) {
- size = oldSize;
- return;
- }
- _pending.push_back( name );
- _pendingSize[ name ] = size;
- _pendingUpdated.notify_all();
- }
-
- void FileAllocator::allocateAsap( const string &name, unsigned long long &size ) {
- stdx::unique_lock<stdx::mutex> lk( _pendingMutex );
-
- // In case the allocator is in failed state, check once before starting so that subsequent
- // requests for the same database would fail fast after the first one has failed.
- checkFailure();
-
- long oldSize = prevSize( name );
- if ( oldSize != -1 ) {
- size = oldSize;
- if ( !inProgress( name ) )
- return;
- }
- checkFailure();
- _pendingSize[ name ] = size;
- if ( _pending.size() == 0 )
- _pending.push_back( name );
- else if ( _pending.front() != name ) {
- _pending.remove( name );
- list< string >::iterator i = _pending.begin();
- ++i;
- _pending.insert( i, name );
- }
- _pendingUpdated.notify_all();
- while( inProgress( name ) ) {
- checkFailure();
- _pendingUpdated.wait(lk);
- }
+void FileAllocator::start() {
+ stdx::thread t(stdx::bind(&FileAllocator::run, this));
+}
+void FileAllocator::requestAllocation(const string& name, long& size) {
+ stdx::lock_guard<stdx::mutex> lk(_pendingMutex);
+ if (_failed)
+ return;
+ long oldSize = prevSize(name);
+ if (oldSize != -1) {
+ size = oldSize;
+ return;
}
-
- void FileAllocator::waitUntilFinished() const {
- if ( _failed )
+ _pending.push_back(name);
+ _pendingSize[name] = size;
+ _pendingUpdated.notify_all();
+}
+
+void FileAllocator::allocateAsap(const string& name, unsigned long long& size) {
+ stdx::unique_lock<stdx::mutex> lk(_pendingMutex);
+
+ // In case the allocator is in failed state, check once before starting so that subsequent
+ // requests for the same database would fail fast after the first one has failed.
+ checkFailure();
+
+ long oldSize = prevSize(name);
+ if (oldSize != -1) {
+ size = oldSize;
+ if (!inProgress(name))
return;
- stdx::unique_lock<stdx::mutex> lk( _pendingMutex );
- while( _pending.size() != 0 )
- _pendingUpdated.wait(lk);
}
-
- // TODO: pull this out to per-OS files once they exist
- static bool useSparseFiles(int fd) {
-
+ checkFailure();
+ _pendingSize[name] = size;
+ if (_pending.size() == 0)
+ _pending.push_back(name);
+ else if (_pending.front() != name) {
+ _pending.remove(name);
+ list<string>::iterator i = _pending.begin();
+ ++i;
+ _pending.insert(i, name);
+ }
+ _pendingUpdated.notify_all();
+ while (inProgress(name)) {
+ checkFailure();
+ _pendingUpdated.wait(lk);
+ }
+}
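
The reordering in allocateAsap() above is subtle: the front of _pending may already be mid-allocation on the worker thread, so an urgent request is spliced into the second slot rather than the first. A self-contained sketch of that invariant using a plain std::list (names here are illustrative, not MongoDB API):

    #include <iostream>
    #include <iterator>
    #include <list>
    #include <string>

    // Move 'name' into the highest-priority slot that does not disturb the
    // front element, which the allocator thread may already be working on.
    void promote(std::list<std::string>& pending, const std::string& name) {
        if (pending.empty()) {
            pending.push_back(name);
        } else if (pending.front() != name) {
            pending.remove(name);  // drop any earlier position
            pending.insert(std::next(pending.begin()), name);  // second slot
        }
    }

    int main() {
        std::list<std::string> pending{"a.0", "b.0", "c.0"};
        promote(pending, "c.0");
        for (const std::string& n : pending)
            std::cout << n << ' ';  // prints: a.0 c.0 b.0
        std::cout << '\n';
    }
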
+
+void FileAllocator::waitUntilFinished() const {
+ if (_failed)
+ return;
+ stdx::unique_lock<stdx::mutex> lk(_pendingMutex);
+ while (_pending.size() != 0)
+ _pendingUpdated.wait(lk);
+}
+
+// TODO: pull this out to per-OS files once they exist
+static bool useSparseFiles(int fd) {
#if defined(__linux__) || defined(__FreeBSD__)
- struct statfs fs_stats;
- int ret = fstatfs(fd, &fs_stats);
- uassert(16062, "fstatfs failed: " + errnoWithDescription(), ret == 0);
+ struct statfs fs_stats;
+ int ret = fstatfs(fd, &fs_stats);
+ uassert(16062, "fstatfs failed: " + errnoWithDescription(), ret == 0);
#endif
#if defined(__linux__)
// these are from <linux/magic.h> but that isn't available on all systems
-# define NFS_SUPER_MAGIC 0x6969
-# define TMPFS_MAGIC 0x01021994
+#define NFS_SUPER_MAGIC 0x6969
+#define TMPFS_MAGIC 0x01021994
- return (fs_stats.f_type == NFS_SUPER_MAGIC)
- || (fs_stats.f_type == TMPFS_MAGIC)
- ;
+ return (fs_stats.f_type == NFS_SUPER_MAGIC) || (fs_stats.f_type == TMPFS_MAGIC);
#elif defined(__FreeBSD__)
- return (str::equals(fs_stats.f_fstypename, "zfs") ||
+ return (str::equals(fs_stats.f_fstypename, "zfs") ||
str::equals(fs_stats.f_fstypename, "nfs") ||
str::equals(fs_stats.f_fstypename, "oldnfs"));
#elif defined(__sun)
- // assume using ZFS which is copy-on-write so no benefit to zero-filling
- // TODO: check which fs we are using like we do elsewhere
- return true;
+ // assume using ZFS which is copy-on-write so no benefit to zero-filling
+ // TODO: check which fs we are using like we do elsewhere
+ return true;
#else
- return false;
+ return false;
#endif
- }
+}
#if defined(_WIN32)
- static bool isFileOnNTFSVolume(int fd) {
- if (!GetVolumeInformationByHandleWFunc) {
- warning() << "Could not retrieve pointer to GetVolumeInformationByHandleW function";
- return false;
- }
-
- HANDLE fileHandle = (HANDLE)_get_osfhandle(fd);
- if (fileHandle == INVALID_HANDLE_VALUE) {
- warning() << "_get_osfhandle() failed with " << _strerror(NULL);
- return false;
- }
+static bool isFileOnNTFSVolume(int fd) {
+ if (!GetVolumeInformationByHandleWFunc) {
+ warning() << "Could not retrieve pointer to GetVolumeInformationByHandleW function";
+ return false;
+ }
- WCHAR fileSystemName[MAX_PATH + 1];
- if (!GetVolumeInformationByHandleWFunc(fileHandle, NULL, 0, NULL, 0, NULL, fileSystemName, sizeof(fileSystemName))) {
- DWORD gle = GetLastError();
- warning() << "GetVolumeInformationByHandleW failed with " << errnoWithDescription(gle);
- return false;
- }
+ HANDLE fileHandle = (HANDLE)_get_osfhandle(fd);
+ if (fileHandle == INVALID_HANDLE_VALUE) {
+ warning() << "_get_osfhandle() failed with " << _strerror(NULL);
+ return false;
+ }
- return lstrcmpW(fileSystemName, L"NTFS") == 0;
+ WCHAR fileSystemName[MAX_PATH + 1];
+ if (!GetVolumeInformationByHandleWFunc(
+ fileHandle, NULL, 0, NULL, 0, NULL, fileSystemName, sizeof(fileSystemName))) {
+ DWORD gle = GetLastError();
+ warning() << "GetVolumeInformationByHandleW failed with " << errnoWithDescription(gle);
+ return false;
}
+
+ return lstrcmpW(fileSystemName, L"NTFS") == 0;
+}
#endif
- void FileAllocator::ensureLength(int fd , long size) {
- // Test running out of disk scenarios
- if (MONGO_FAIL_POINT(allocateDiskFull)) {
- uasserted( 10444 , "File allocation failed due to failpoint.");
- }
+void FileAllocator::ensureLength(int fd, long size) {
+ // Test running out of disk scenarios
+ if (MONGO_FAIL_POINT(allocateDiskFull)) {
+ uasserted(10444, "File allocation failed due to failpoint.");
+ }
#if !defined(_WIN32)
- if (useSparseFiles(fd)) {
- LOG(1) << "using ftruncate to create a sparse file" << endl;
- int ret = ftruncate(fd, size);
- uassert(16063, "ftruncate failed: " + errnoWithDescription(), ret == 0);
- return;
- }
+ if (useSparseFiles(fd)) {
+ LOG(1) << "using ftruncate to create a sparse file" << endl;
+ int ret = ftruncate(fd, size);
+ uassert(16063, "ftruncate failed: " + errnoWithDescription(), ret == 0);
+ return;
+ }
#endif
#if defined(__linux__)
- int ret = posix_fallocate(fd,0,size);
- if ( ret == 0 )
- return;
+ int ret = posix_fallocate(fd, 0, size);
+ if (ret == 0)
+ return;
- log() << "FileAllocator: posix_fallocate failed: " << errnoWithDescription( ret ) << " falling back" << endl;
+ log() << "FileAllocator: posix_fallocate failed: " << errnoWithDescription(ret)
+ << " falling back" << endl;
#endif
- off_t filelen = lseek( fd, 0, SEEK_END );
- if ( filelen < size ) {
- if (filelen != 0) {
- stringstream ss;
- ss << "failure creating new datafile; lseek failed for fd " << fd << " with errno: " << errnoWithDescription();
- uassert( 10440 , ss.str(), filelen == 0 );
- }
- // Check for end of disk.
-
- uassert( 10441 , str::stream() << "Unable to allocate new file of size " << size << ' ' << errnoWithDescription(),
- size - 1 == lseek(fd, size - 1, SEEK_SET) );
- uassert( 10442 , str::stream() << "Unable to allocate new file of size " << size << ' ' << errnoWithDescription(),
- 1 == write(fd, "", 1) );
-
- // File expansion is completed here. Do not do the zeroing out on OS-es where there
- // is no risk of triggering allocation-related bugs such as
- // http://support.microsoft.com/kb/2731284.
- //
- if (!ProcessInfo::isDataFileZeroingNeeded()) {
- return;
- }
+ off_t filelen = lseek(fd, 0, SEEK_END);
+ if (filelen < size) {
+ if (filelen != 0) {
+ stringstream ss;
+ ss << "failure creating new datafile; lseek failed for fd " << fd
+ << " with errno: " << errnoWithDescription();
+ uassert(10440, ss.str(), filelen == 0);
+ }
+ // Check for end of disk.
+
+ uassert(10441,
+ str::stream() << "Unable to allocate new file of size " << size << ' '
+ << errnoWithDescription(),
+ size - 1 == lseek(fd, size - 1, SEEK_SET));
+ uassert(10442,
+ str::stream() << "Unable to allocate new file of size " << size << ' '
+ << errnoWithDescription(),
+ 1 == write(fd, "", 1));
+
+ // File expansion is completed here. Do not do the zeroing out on OS-es where there
+ // is no risk of triggering allocation-related bugs such as
+ // http://support.microsoft.com/kb/2731284.
+ //
+ if (!ProcessInfo::isDataFileZeroingNeeded()) {
+ return;
+ }
#if defined(_WIN32)
- if (!isFileOnNTFSVolume(fd)) {
- log() << "No need to zero out datafile on non-NTFS volume" << endl;
- return;
- }
-#endif
-
- lseek(fd, 0, SEEK_SET);
-
- const long z = 256 * 1024;
- const std::unique_ptr<char[]> buf_holder (new char[z]);
- char* buf = buf_holder.get();
- memset(buf, 0, z);
- long left = size;
- while ( left > 0 ) {
- long towrite = left;
- if ( towrite > z )
- towrite = z;
-
- int written = write( fd , buf , towrite );
- uassert( 10443 , errnoWithPrefix("FileAllocator: file write failed" ), written > 0 );
- left -= written;
- }
+ if (!isFileOnNTFSVolume(fd)) {
+ log() << "No need to zero out datafile on non-NTFS volume" << endl;
+ return;
}
- }
+#endif
- void FileAllocator::checkFailure() {
- if (_failed) {
- // we want to log the problem (diskfull.js expects it) but we do not want to dump a stack trace
- msgassertedNoTrace( 12520, "new file allocation failure" );
+ lseek(fd, 0, SEEK_SET);
+
+ const long z = 256 * 1024;
+ const std::unique_ptr<char[]> buf_holder(new char[z]);
+ char* buf = buf_holder.get();
+ memset(buf, 0, z);
+ long left = size;
+ while (left > 0) {
+ long towrite = left;
+ if (towrite > z)
+ towrite = z;
+
+ int written = write(fd, buf, towrite);
+ uassert(10443, errnoWithPrefix("FileAllocator: file write failed"), written > 0);
+ left -= written;
}
}
+}
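
ensureLength() above tries three strategies in order: a sparse ftruncate() where zero-filling buys nothing, posix_fallocate() on Linux, and finally an explicit seek-write-zero loop. A condensed POSIX sketch of that fallback ladder (error handling reduced to a bool, the already-long-enough check omitted; not the MongoDB code itself):

    #include <fcntl.h>
    #include <unistd.h>
    #include <cstring>
    #include <memory>

    // Grow 'fd' to 'size' bytes: sparse if the filesystem allows it,
    // else a real allocation, else manual zero-fill in 256KB chunks.
    bool ensureLengthSketch(int fd, long size, bool fsAllowsSparse) {
        if (fsAllowsSparse)
            return ftruncate(fd, size) == 0;
    #if defined(__linux__)
        if (posix_fallocate(fd, 0, size) == 0)
            return true;  // allocated without writing zeroes ourselves
    #endif
        if (lseek(fd, size - 1, SEEK_SET) != size - 1 || write(fd, "", 1) != 1)
            return false;  // likely out of disk space
        // Zero the file explicitly (only needed on some OS/filesystem combos).
        lseek(fd, 0, SEEK_SET);
        const long chunk = 256 * 1024;
        std::unique_ptr<char[]> buf(new char[chunk]);
        std::memset(buf.get(), 0, chunk);
        for (long left = size; left > 0;) {
            long n = left < chunk ? left : chunk;
            ssize_t w = write(fd, buf.get(), n);
            if (w <= 0)
                return false;
            left -= w;
        }
        return true;
    }
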
- long FileAllocator::prevSize( const string &name ) const {
- if ( _pendingSize.count( name ) > 0 )
- return _pendingSize[ name ];
- if ( boost::filesystem::exists( name ) )
- return boost::filesystem::file_size( name );
- return -1;
+void FileAllocator::checkFailure() {
+ if (_failed) {
+ // we want to log the problem (diskfull.js expects it) but we do not want to dump a stack trace
+ msgassertedNoTrace(12520, "new file allocation failure");
}
-
- // caller must hold _pendingMutex lock.
- bool FileAllocator::inProgress( const string &name ) const {
- for( list< string >::const_iterator i = _pending.begin(); i != _pending.end(); ++i )
- if ( *i == name )
- return true;
- return false;
- }
-
- string FileAllocator::makeTempFileName( boost::filesystem::path root ) {
- while( 1 ) {
- boost::filesystem::path p = root / "_tmp";
- stringstream ss;
- unsigned long long thisUniqueNumber;
- {
- // increment temporary file name counter
- // TODO: SERVER-6055 -- Unify temporary file name selection
- stdx::lock_guard<SimpleMutex> lk(_uniqueNumberMutex);
- thisUniqueNumber = _uniqueNumber;
- ++_uniqueNumber;
- }
- ss << thisUniqueNumber;
- p /= ss.str();
- string fn = p.string();
- if( !boost::filesystem::exists(p) )
- return fn;
- }
- return "";
- }
-
- void FileAllocator::run( FileAllocator * fa ) {
- setThreadName( "FileAllocator" );
+}
+
+long FileAllocator::prevSize(const string& name) const {
+ if (_pendingSize.count(name) > 0)
+ return _pendingSize[name];
+ if (boost::filesystem::exists(name))
+ return boost::filesystem::file_size(name);
+ return -1;
+}
+
+// caller must hold _pendingMutex lock.
+bool FileAllocator::inProgress(const string& name) const {
+ for (list<string>::const_iterator i = _pending.begin(); i != _pending.end(); ++i)
+ if (*i == name)
+ return true;
+ return false;
+}
+
+string FileAllocator::makeTempFileName(boost::filesystem::path root) {
+ while (1) {
+ boost::filesystem::path p = root / "_tmp";
+ stringstream ss;
+ unsigned long long thisUniqueNumber;
{
- // initialize unique temporary file name counter
+ // increment temporary file name counter
// TODO: SERVER-6055 -- Unify temporary file name selection
stdx::lock_guard<SimpleMutex> lk(_uniqueNumberMutex);
- _uniqueNumber = curTimeMicros64();
+ thisUniqueNumber = _uniqueNumber;
+ ++_uniqueNumber;
}
- while( 1 ) {
+ ss << thisUniqueNumber;
+ p /= ss.str();
+ string fn = p.string();
+ if (!boost::filesystem::exists(p))
+ return fn;
+ }
+ return "";
+}
+
+void FileAllocator::run(FileAllocator* fa) {
+ setThreadName("FileAllocator");
+ {
+ // initialize unique temporary file name counter
+ // TODO: SERVER-6055 -- Unify temporary file name selection
+ stdx::lock_guard<SimpleMutex> lk(_uniqueNumberMutex);
+ _uniqueNumber = curTimeMicros64();
+ }
+ while (1) {
+ {
+ stdx::unique_lock<stdx::mutex> lk(fa->_pendingMutex);
+ if (fa->_pending.size() == 0)
+ fa->_pendingUpdated.wait(lk);
+ }
+ while (1) {
+ string name;
+ long size = 0;
{
- stdx::unique_lock<stdx::mutex> lk( fa->_pendingMutex );
- if ( fa->_pending.size() == 0 )
- fa->_pendingUpdated.wait(lk);
+ stdx::lock_guard<stdx::mutex> lk(fa->_pendingMutex);
+ if (fa->_pending.size() == 0)
+ break;
+ name = fa->_pending.front();
+ size = fa->_pendingSize[name];
}
- while( 1 ) {
- string name;
- long size = 0;
- {
- stdx::lock_guard<stdx::mutex> lk( fa->_pendingMutex );
- if ( fa->_pending.size() == 0 )
- break;
- name = fa->_pending.front();
- size = fa->_pendingSize[ name ];
- }
- string tmp;
- long fd = 0;
- try {
- log() << "allocating new datafile " << name << ", filling with zeroes..." << endl;
-
- boost::filesystem::path parent = ensureParentDirCreated(name);
- tmp = fa->makeTempFileName( parent );
- ensureParentDirCreated(tmp);
+ string tmp;
+ long fd = 0;
+ try {
+ log() << "allocating new datafile " << name << ", filling with zeroes..." << endl;
+
+ boost::filesystem::path parent = ensureParentDirCreated(name);
+ tmp = fa->makeTempFileName(parent);
+ ensureParentDirCreated(tmp);
#if defined(_WIN32)
- fd = _open( tmp.c_str(), _O_RDWR | _O_CREAT | O_NOATIME, _S_IREAD | _S_IWRITE );
+ fd = _open(tmp.c_str(), _O_RDWR | _O_CREAT | O_NOATIME, _S_IREAD | _S_IWRITE);
#else
- fd = open(tmp.c_str(), O_CREAT | O_RDWR | O_NOATIME, S_IRUSR | S_IWUSR);
+ fd = open(tmp.c_str(), O_CREAT | O_RDWR | O_NOATIME, S_IRUSR | S_IWUSR);
#endif
- if ( fd < 0 ) {
- log() << "FileAllocator: couldn't create " << name << " (" << tmp << ") " << errnoWithDescription() << endl;
- uasserted(10439, "");
- }
+ if (fd < 0) {
+ log() << "FileAllocator: couldn't create " << name << " (" << tmp << ") "
+ << errnoWithDescription() << endl;
+ uasserted(10439, "");
+ }
#if defined(POSIX_FADV_DONTNEED)
- if( posix_fadvise(fd, 0, size, POSIX_FADV_DONTNEED) ) {
- log() << "warning: posix_fadvise fails " << name << " (" << tmp << ") " << errnoWithDescription() << endl;
- }
+ if (posix_fadvise(fd, 0, size, POSIX_FADV_DONTNEED)) {
+ log() << "warning: posix_fadvise fails " << name << " (" << tmp << ") "
+ << errnoWithDescription() << endl;
+ }
#endif
- Timer t;
+ Timer t;
- /* make sure the file is the full desired length */
- ensureLength( fd , size );
+ /* make sure the file is the full desired length */
+ ensureLength(fd, size);
- close( fd );
- fd = 0;
+ close(fd);
+ fd = 0;
- if( rename(tmp.c_str(), name.c_str()) ) {
- const string& errStr = errnoWithDescription();
- const string& errMessage = str::stream()
- << "error: couldn't rename " << tmp
- << " to " << name << ' ' << errStr;
- msgasserted(13653, errMessage);
- }
- flushMyDirectory(name);
-
- log() << "done allocating datafile " << name << ", "
- << "size: " << size/1024/1024 << "MB, "
- << " took " << ((double)t.millis())/1000.0 << " secs"
- << endl;
-
- // no longer in a failed state. allow new writers.
- fa->_failed = false;
+ if (rename(tmp.c_str(), name.c_str())) {
+ const string& errStr = errnoWithDescription();
+ const string& errMessage = str::stream() << "error: couldn't rename " << tmp
+ << " to " << name << ' ' << errStr;
+ msgasserted(13653, errMessage);
}
- catch ( const std::exception& e ) {
- log() << "error: failed to allocate new file: " << name
- << " size: " << size << ' ' << e.what()
- << ". will try again in 10 seconds" << endl;
- if ( fd > 0 )
- close( fd );
- try {
- if ( ! tmp.empty() )
- boost::filesystem::remove( tmp );
- boost::filesystem::remove( name );
- } catch ( const std::exception& e ) {
- log() << "error removing files: " << e.what() << endl;
- }
-
- {
- stdx::lock_guard<stdx::mutex> lk(fa->_pendingMutex);
- fa->_failed = true;
-
- // TODO: Should we remove the file from pending?
- fa->_pendingUpdated.notify_all();
- }
-
-
- sleepsecs(10);
- continue;
+ flushMyDirectory(name);
+
+ log() << "done allocating datafile " << name << ", "
+ << "size: " << size / 1024 / 1024 << "MB, "
+ << " took " << ((double)t.millis()) / 1000.0 << " secs" << endl;
+
+ // no longer in a failed state. allow new writers.
+ fa->_failed = false;
+ } catch (const std::exception& e) {
+ log() << "error: failed to allocate new file: " << name << " size: " << size << ' '
+ << e.what() << ". will try again in 10 seconds" << endl;
+ if (fd > 0)
+ close(fd);
+ try {
+ if (!tmp.empty())
+ boost::filesystem::remove(tmp);
+ boost::filesystem::remove(name);
+ } catch (const std::exception& e) {
+ log() << "error removing files: " << e.what() << endl;
}
{
- stdx::lock_guard<stdx::mutex> lk( fa->_pendingMutex );
- fa->_pendingSize.erase( name );
- fa->_pending.pop_front();
+ stdx::lock_guard<stdx::mutex> lk(fa->_pendingMutex);
+ fa->_failed = true;
+
+ // TODO: Should we remove the file from pending?
fa->_pendingUpdated.notify_all();
}
+
+
+ sleepsecs(10);
+ continue;
+ }
+
+ {
+ stdx::lock_guard<stdx::mutex> lk(fa->_pendingMutex);
+ fa->_pendingSize.erase(name);
+ fa->_pending.pop_front();
+ fa->_pendingUpdated.notify_all();
}
}
}
+}
- FileAllocator* FileAllocator::_instance = 0;
+FileAllocator* FileAllocator::_instance = 0;
- FileAllocator* FileAllocator::get(){
- if ( ! _instance )
- _instance = new FileAllocator();
- return _instance;
- }
+FileAllocator* FileAllocator::get() {
+ if (!_instance)
+ _instance = new FileAllocator();
+ return _instance;
+}
-} // namespace mongo
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/file_allocator.h b/src/mongo/db/storage/mmap_v1/file_allocator.h
index d3f9b6cceda..e3e4ad55881 100644
--- a/src/mongo/db/storage/mmap_v1/file_allocator.h
+++ b/src/mongo/db/storage/mmap_v1/file_allocator.h
@@ -37,73 +37,71 @@
namespace mongo {
+/*
+ * Handles allocation of contiguous files on disk. Allocation may be
+ * requested asynchronously or synchronously.
+ * singleton
+ */
+class FileAllocator {
+ MONGO_DISALLOW_COPYING(FileAllocator);
/*
- * Handles allocation of contiguous files on disk. Allocation may be
- * requested asynchronously or synchronously.
- * singleton
+ * The public functions may not be called concurrently. The allocation
+ * functions may be called multiple times per file, but only the first
+ * size specified per file will be used.
+ */
+public:
+ void start();
+
+ /**
+ * May be called if file exists. If file exists, or its allocation has
+ * been requested, size is updated to match existing file size.
*/
- class FileAllocator {
- MONGO_DISALLOW_COPYING(FileAllocator);
- /*
- * The public functions may not be called concurrently. The allocation
- * functions may be called multiple times per file, but only the first
- * size specified per file will be used.
- */
- public:
- void start();
-
- /**
- * May be called if file exists. If file exists, or its allocation has
- * been requested, size is updated to match existing file size.
- */
- void requestAllocation( const std::string &name, long &size );
+ void requestAllocation(const std::string& name, long& size);
- /**
- * Returns when file has been allocated. If file exists, size is
- * updated to match existing file size.
- */
- void allocateAsap( const std::string &name, unsigned long long &size );
-
- void waitUntilFinished() const;
+ /**
+ * Returns when file has been allocated. If file exists, size is
+ * updated to match existing file size.
+ */
+ void allocateAsap(const std::string& name, unsigned long long& size);
- static void ensureLength(int fd, long size);
+ void waitUntilFinished() const;
- /** @return the singleton */
- static FileAllocator * get();
-
- private:
+ static void ensureLength(int fd, long size);
- FileAllocator();
+ /** @return the singleton */
+ static FileAllocator* get();
- void checkFailure();
+private:
+ FileAllocator();
- // caller must hold pendingMutex_ lock. Returns size if allocated or
- // allocation requested, -1 otherwise.
- long prevSize( const std::string &name ) const;
+ void checkFailure();
- // caller must hold pendingMutex_ lock.
- bool inProgress( const std::string &name ) const;
+ // caller must hold pendingMutex_ lock. Returns size if allocated or
+ // allocation requested, -1 otherwise.
+ long prevSize(const std::string& name) const;
- /** called from the worker thread */
- static void run( FileAllocator * fa );
+ // caller must hold pendingMutex_ lock.
+ bool inProgress(const std::string& name) const;
- // generate a unique name for temporary files
- std::string makeTempFileName( boost::filesystem::path root );
+ /** called from the worker thread */
+ static void run(FileAllocator* fa);
- mutable stdx::mutex _pendingMutex;
- mutable stdx::condition_variable _pendingUpdated;
+ // generate a unique name for temporary files
+ std::string makeTempFileName(boost::filesystem::path root);
- std::list< std::string > _pending;
- mutable std::map< std::string, long > _pendingSize;
+ mutable stdx::mutex _pendingMutex;
+ mutable stdx::condition_variable _pendingUpdated;
- // unique number for temporary files
- static unsigned long long _uniqueNumber;
+ std::list<std::string> _pending;
+ mutable std::map<std::string, long> _pendingSize;
- bool _failed;
+ // unique number for temporary files
+ static unsigned long long _uniqueNumber;
- static FileAllocator* _instance;
+ bool _failed;
- };
+ static FileAllocator* _instance;
+};
-} // namespace mongo
+} // namespace mongo
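
Putting the header above together with the implementation: a caller asks the singleton to pre-create a datafile either asynchronously (requestAllocation) or blocking (allocateAsap), and both calls update the size argument if the file already exists or is already queued. A hedged usage sketch (the paths are invented; start() is called once at process startup in the real code):

    FileAllocator* fa = FileAllocator::get();
    fa->start();  // launches the background "FileAllocator" thread

    long nextSize = 64 * 1024 * 1024;
    fa->requestAllocation("/data/db/test.1", nextSize);  // async; returns now

    unsigned long long urgentSize = 64 * 1024 * 1024;
    fa->allocateAsap("/data/db/test.2", urgentSize);  // blocks until allocated

    fa->waitUntilFinished();  // drain the whole queue if needed
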
diff --git a/src/mongo/db/storage/mmap_v1/heap_record_store_btree.cpp b/src/mongo/db/storage/mmap_v1/heap_record_store_btree.cpp
index dfe51554836..934f9807628 100644
--- a/src/mongo/db/storage/mmap_v1/heap_record_store_btree.cpp
+++ b/src/mongo/db/storage/mmap_v1/heap_record_store_btree.cpp
@@ -40,117 +40,117 @@
namespace mongo {
- RecordData HeapRecordStoreBtree::dataFor(OperationContext* txn, const RecordId& loc) const {
- Records::const_iterator it = _records.find(loc);
- invariant(it != _records.end());
- const MmapV1RecordHeader& rec = it->second;
-
- return RecordData(rec.data.get(), rec.dataSize);
- }
-
- bool HeapRecordStoreBtree::findRecord(OperationContext* txn,
- const RecordId& loc, RecordData* out) const {
- Records::const_iterator it = _records.find(loc);
- if ( it == _records.end() )
- return false;
- const MmapV1RecordHeader& rec = it->second;
- *out = RecordData(rec.data.get(), rec.dataSize);
- return true;
- }
-
- void HeapRecordStoreBtree::deleteRecord(OperationContext* txn, const RecordId& loc) {
- invariant(_records.erase(loc) == 1);
- }
-
- StatusWith<RecordId> HeapRecordStoreBtree::insertRecord(OperationContext* txn,
- const char* data,
- int len,
- bool enforceQuota) {
- MmapV1RecordHeader rec(len);
- memcpy(rec.data.get(), data, len);
-
- const RecordId loc = allocateLoc();
- _records[loc] = rec;
-
- HeapRecordStoreBtreeRecoveryUnit::notifyInsert( txn, this, loc );
-
- return StatusWith<RecordId>(loc);
+RecordData HeapRecordStoreBtree::dataFor(OperationContext* txn, const RecordId& loc) const {
+ Records::const_iterator it = _records.find(loc);
+ invariant(it != _records.end());
+ const MmapV1RecordHeader& rec = it->second;
+
+ return RecordData(rec.data.get(), rec.dataSize);
+}
+
+bool HeapRecordStoreBtree::findRecord(OperationContext* txn,
+ const RecordId& loc,
+ RecordData* out) const {
+ Records::const_iterator it = _records.find(loc);
+ if (it == _records.end())
+ return false;
+ const MmapV1RecordHeader& rec = it->second;
+ *out = RecordData(rec.data.get(), rec.dataSize);
+ return true;
+}
+
+void HeapRecordStoreBtree::deleteRecord(OperationContext* txn, const RecordId& loc) {
+ invariant(_records.erase(loc) == 1);
+}
+
+StatusWith<RecordId> HeapRecordStoreBtree::insertRecord(OperationContext* txn,
+ const char* data,
+ int len,
+ bool enforceQuota) {
+ MmapV1RecordHeader rec(len);
+ memcpy(rec.data.get(), data, len);
+
+ const RecordId loc = allocateLoc();
+ _records[loc] = rec;
+
+ HeapRecordStoreBtreeRecoveryUnit::notifyInsert(txn, this, loc);
+
+ return StatusWith<RecordId>(loc);
+}
+
+StatusWith<RecordId> HeapRecordStoreBtree::insertRecord(OperationContext* txn,
+ const DocWriter* doc,
+ bool enforceQuota) {
+ MmapV1RecordHeader rec(doc->documentSize());
+ doc->writeDocument(rec.data.get());
+
+ const RecordId loc = allocateLoc();
+ _records[loc] = rec;
+
+ HeapRecordStoreBtreeRecoveryUnit::notifyInsert(txn, this, loc);
+
+ return StatusWith<RecordId>(loc);
+}
+
+RecordId HeapRecordStoreBtree::allocateLoc() {
+ const int64_t id = _nextId++;
+ // This is a hack, but both the high and low order bits of RecordId offset must be 0, and the
+ // file must fit in 23 bits. This gives us a total of 30 + 23 == 53 bits.
+ invariant(id < (1LL << 53));
+ RecordId dl(int(id >> 30), int((id << 1) & ~(1 << 31)));
+ invariant((dl.repr() & 0x1) == 0);
+ return dl;
+}
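
The packing in allocateLoc() deserves a worked example: the low 30 bits of the counter become the offset (shifted left one bit so the low bit is 0, with bit 31 masked off so the offset stays positive), and the remaining 23 bits become the file number, hence the 53-bit ceiling. A standalone check of that arithmetic in plain C++, independent of RecordId:

    #include <cassert>
    #include <cstdint>

    int main() {
        const int64_t id = (int64_t(5) << 30) | 7;  // pretend: file 5, 7th slot
        assert(id < (1LL << 53));                   // same ceiling as allocateLoc()

        const int file = int(id >> 30);                  // high 23 bits
        const int offset = int((id << 1) & 0x7FFFFFFF);  // low 30 bits, shifted left

        assert(file == 5);
        assert(offset == 14);         // 7 << 1; the low bit is always zero
        assert((offset & 0x1) == 0);  // the invariant checked in allocateLoc()
    }
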
+
+Status HeapRecordStoreBtree::touch(OperationContext* txn, BSONObjBuilder* output) const {
+ // not currently called from the tests, but called from btree_logic.h
+ return Status::OK();
+}
+
+// ---------------------------
+
+void HeapRecordStoreBtreeRecoveryUnit::commitUnitOfWork() {
+ _insertions.clear();
+ _mods.clear();
+}
+
+void HeapRecordStoreBtreeRecoveryUnit::abortUnitOfWork() {
+ // reverse in case we write same area twice
+ for (size_t i = _mods.size(); i > 0; i--) {
+ ModEntry& e = _mods[i - 1];
+ memcpy(e.data, e.old.get(), e.len);
}
- StatusWith<RecordId> HeapRecordStoreBtree::insertRecord(OperationContext* txn,
- const DocWriter* doc,
- bool enforceQuota) {
- MmapV1RecordHeader rec(doc->documentSize());
- doc->writeDocument(rec.data.get());
+ invariant(_insertions.size() == 0); // todo
+}
- const RecordId loc = allocateLoc();
- _records[loc] = rec;
+void* HeapRecordStoreBtreeRecoveryUnit::writingPtr(void* data, size_t len) {
+ ModEntry e = {data, len, boost::shared_array<char>(new char[len])};
+ memcpy(e.old.get(), data, len);
+ _mods.push_back(e);
+ return data;
+}
- HeapRecordStoreBtreeRecoveryUnit::notifyInsert( txn, this, loc );
+void HeapRecordStoreBtreeRecoveryUnit::notifyInsert(HeapRecordStoreBtree* rs, const RecordId& loc) {
+ InsertEntry e = {rs, loc};
+ _insertions.push_back(e);
+}
- return StatusWith<RecordId>(loc);
- }
+void HeapRecordStoreBtreeRecoveryUnit::notifyInsert(OperationContext* ctx,
+ HeapRecordStoreBtree* rs,
+ const RecordId& loc) {
+ if (!ctx)
+ return;
- RecordId HeapRecordStoreBtree::allocateLoc() {
- const int64_t id = _nextId++;
- // This is a hack, but both the high and low order bits of RecordId offset must be 0, and the
- // file must fit in 23 bits. This gives us a total of 30 + 23 == 53 bits.
- invariant(id < (1LL << 53));
- RecordId dl(int(id >> 30), int((id << 1) & ~(1<<31)));
- invariant( (dl.repr() & 0x1) == 0 );
- return dl;
- }
-
- Status HeapRecordStoreBtree::touch(OperationContext* txn, BSONObjBuilder* output) const {
- // not currently called from the tests, but called from btree_logic.h
- return Status::OK();
- }
+ // This dynamic_cast does real work (it filters out non-test recovery units); ideally this should change.
+ HeapRecordStoreBtreeRecoveryUnit* ru =
+ dynamic_cast<HeapRecordStoreBtreeRecoveryUnit*>(ctx->recoveryUnit());
- // ---------------------------
+ if (!ru)
+ return;
- void HeapRecordStoreBtreeRecoveryUnit::commitUnitOfWork() {
- _insertions.clear();
- _mods.clear();
- }
-
- void HeapRecordStoreBtreeRecoveryUnit::abortUnitOfWork() {
- // reverse in case we write same area twice
- for ( size_t i = _mods.size(); i > 0; i-- ) {
- ModEntry& e = _mods[i-1];
- memcpy( e.data, e.old.get(), e.len );
- }
-
- invariant( _insertions.size() == 0 ); // todo
- }
-
- void* HeapRecordStoreBtreeRecoveryUnit::writingPtr(void* data, size_t len) {
- ModEntry e = { data, len, boost::shared_array<char>( new char[len] ) };
- memcpy( e.old.get(), data, len );
- _mods.push_back( e );
- return data;
- }
-
- void HeapRecordStoreBtreeRecoveryUnit::notifyInsert( HeapRecordStoreBtree* rs,
- const RecordId& loc ) {
- InsertEntry e = { rs, loc };
- _insertions.push_back( e );
- }
-
- void HeapRecordStoreBtreeRecoveryUnit::notifyInsert( OperationContext* ctx,
- HeapRecordStoreBtree* rs,
- const RecordId& loc ) {
- if ( !ctx )
- return;
-
- // This dynamic_cast does real work (it filters out non-test recovery units); ideally this should change.
- HeapRecordStoreBtreeRecoveryUnit* ru =
- dynamic_cast<HeapRecordStoreBtreeRecoveryUnit*>( ctx->recoveryUnit() );
-
- if ( !ru )
- return;
-
- ru->notifyInsert( rs, loc );
- }
+ ru->notifyInsert(rs, loc);
+}
-} // namespace mongo
+} // namespace mongo
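
The recovery unit above implements rollback by copy-on-write: writingPtr() snapshots the bytes about to be modified, and abortUnitOfWork() replays the snapshots newest-first so a region written twice restores to its original contents rather than an intermediate value. A self-contained sketch of the same pattern (illustrative names, not the MongoDB classes):

    #include <cstring>
    #include <iostream>
    #include <memory>
    #include <vector>

    struct Mod {
        void* data;
        size_t len;
        std::unique_ptr<char[]> old;
    };

    std::vector<Mod> mods;

    // Declare intent to write [data, data+len); keep a copy of the old bytes.
    void* writingPtr(void* data, size_t len) {
        Mod m{data, len, std::unique_ptr<char[]>(new char[len])};
        std::memcpy(m.old.get(), data, len);
        mods.push_back(std::move(m));
        return data;
    }

    // Undo newest-first, so overlapping writes unwind correctly.
    void abortUnitOfWork() {
        for (size_t i = mods.size(); i > 0; i--)
            std::memcpy(mods[i - 1].data, mods[i - 1].old.get(), mods[i - 1].len);
        mods.clear();
    }

    int main() {
        char buf[4] = "abc";
        std::memcpy(writingPtr(buf, 3), "xyz", 3);  // first write
        std::memcpy(writingPtr(buf, 3), "123", 3);  // same region again
        abortUnitOfWork();
        std::cout << buf << '\n';  // prints "abc"
    }
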
diff --git a/src/mongo/db/storage/mmap_v1/heap_record_store_btree.h b/src/mongo/db/storage/mmap_v1/heap_record_store_btree.h
index c44dcf3f473..aa193549440 100644
--- a/src/mongo/db/storage/mmap_v1/heap_record_store_btree.h
+++ b/src/mongo/db/storage/mmap_v1/heap_record_store_btree.h
@@ -38,174 +38,190 @@
namespace mongo {
- /**
- * A RecordStore that stores all data on the heap. This implementation contains only the
- * functionality necessary to test btree.
- */
- class HeapRecordStoreBtree : public RecordStore {
- struct MmapV1RecordHeader;
-
- public:
- // RecordId(0,0) isn't valid for records.
- explicit HeapRecordStoreBtree(StringData ns): RecordStore(ns), _nextId(1) { }
-
- virtual RecordData dataFor(OperationContext* txn, const RecordId& loc) const;
-
- virtual bool findRecord(OperationContext* txn, const RecordId& loc, RecordData* out) const;
-
- virtual void deleteRecord(OperationContext* txn, const RecordId& dl);
-
- virtual StatusWith<RecordId> insertRecord(OperationContext* txn,
- const char* data,
- int len,
- bool enforceQuota);
-
- virtual StatusWith<RecordId> insertRecord(OperationContext* txn,
- const DocWriter* doc,
- bool enforceQuota);
-
- virtual long long numRecords( OperationContext* txn ) const { return _records.size(); }
-
- virtual Status touch(OperationContext* txn, BSONObjBuilder* output) const;
-
- // public methods below here are not necessary to test btree, and will crash when called.
-
- // ------------------------------
-
- virtual StatusWith<RecordId> updateRecord(OperationContext* txn,
- const RecordId& oldLocation,
- const char* data,
- int len,
- bool enforceQuota,
- UpdateNotifier* notifier) {
- invariant(false);
- }
-
- virtual bool updateWithDamagesSupported() const {
- return true;
- }
-
- virtual Status updateWithDamages(OperationContext* txn,
- const RecordId& loc,
- const RecordData& oldRec,
- const char* damageSource,
- const mutablebson::DamageVector& damages) {
- invariant(false);
- }
-
- std::unique_ptr<RecordCursor> getCursor(OperationContext* txn, bool forward) const final {
- invariant(false);
- }
-
-
- virtual Status truncate(OperationContext* txn) { invariant(false); }
-
- virtual void temp_cappedTruncateAfter(OperationContext* txn,
- RecordId end,
- bool inclusive) {
- invariant(false);
- }
-
- virtual bool compactSupported() const { invariant(false); }
-
- virtual Status validate(OperationContext* txn,
- bool full,
- bool scanData,
- ValidateAdaptor* adaptor,
- ValidateResults* results, BSONObjBuilder* output) {
- invariant(false);
- }
+/**
+ * A RecordStore that stores all data on the heap. This implementation contains only the
+ * functionality necessary to test btree.
+ */
+class HeapRecordStoreBtree : public RecordStore {
+ struct MmapV1RecordHeader;
+
+public:
+ // RecordId(0,0) isn't valid for records.
+ explicit HeapRecordStoreBtree(StringData ns) : RecordStore(ns), _nextId(1) {}
+
+ virtual RecordData dataFor(OperationContext* txn, const RecordId& loc) const;
+
+ virtual bool findRecord(OperationContext* txn, const RecordId& loc, RecordData* out) const;
+
+ virtual void deleteRecord(OperationContext* txn, const RecordId& dl);
+
+ virtual StatusWith<RecordId> insertRecord(OperationContext* txn,
+ const char* data,
+ int len,
+ bool enforceQuota);
+
+ virtual StatusWith<RecordId> insertRecord(OperationContext* txn,
+ const DocWriter* doc,
+ bool enforceQuota);
+
+ virtual long long numRecords(OperationContext* txn) const {
+ return _records.size();
+ }
+
+ virtual Status touch(OperationContext* txn, BSONObjBuilder* output) const;
+
+ // public methods below here are not necessary to test btree, and will crash when called.
+
+ // ------------------------------
+
+ virtual StatusWith<RecordId> updateRecord(OperationContext* txn,
+ const RecordId& oldLocation,
+ const char* data,
+ int len,
+ bool enforceQuota,
+ UpdateNotifier* notifier) {
+ invariant(false);
+ }
+
+ virtual bool updateWithDamagesSupported() const {
+ return true;
+ }
+
+ virtual Status updateWithDamages(OperationContext* txn,
+ const RecordId& loc,
+ const RecordData& oldRec,
+ const char* damageSource,
+ const mutablebson::DamageVector& damages) {
+ invariant(false);
+ }
+
+ std::unique_ptr<RecordCursor> getCursor(OperationContext* txn, bool forward) const final {
+ invariant(false);
+ }
+
+
+ virtual Status truncate(OperationContext* txn) {
+ invariant(false);
+ }
+
+ virtual void temp_cappedTruncateAfter(OperationContext* txn, RecordId end, bool inclusive) {
+ invariant(false);
+ }
+
+ virtual bool compactSupported() const {
+ invariant(false);
+ }
+
+ virtual Status validate(OperationContext* txn,
+ bool full,
+ bool scanData,
+ ValidateAdaptor* adaptor,
+ ValidateResults* results,
+ BSONObjBuilder* output) {
+ invariant(false);
+ }
+
+ virtual void appendCustomStats(OperationContext* txn,
+ BSONObjBuilder* result,
+ double scale) const {
+ invariant(false);
+ }
+
+ virtual void increaseStorageSize(OperationContext* txn, int size, bool enforceQuota) {
+ invariant(false);
+ }
+
+ virtual int64_t storageSize(OperationContext* txn,
+ BSONObjBuilder* extraInfo = NULL,
+ int infoLevel = 0) const {
+ invariant(false);
+ }
+
+ virtual long long dataSize(OperationContext* txn) const {
+ invariant(false);
+ }
+
+ virtual MmapV1RecordHeader* recordFor(const RecordId& loc) const {
+ invariant(false);
+ }
+
+ virtual bool isCapped() const {
+ invariant(false);
+ }
+
+ virtual const char* name() const {
+ invariant(false);
+ }
+
+ virtual void updateStatsAfterRepair(OperationContext* txn,
+ long long numRecords,
+ long long dataSize) {
+ invariant(false);
+ }
+ // more things that we actually care about below
+
+private:
+ struct MmapV1RecordHeader {
+ MmapV1RecordHeader() : dataSize(-1), data() {}
+ explicit MmapV1RecordHeader(int size) : dataSize(size), data(new char[size]) {}
+
+ int dataSize;
+ boost::shared_array<char> data;
+ };
- virtual void appendCustomStats(OperationContext* txn,
- BSONObjBuilder* result,
- double scale) const {
- invariant(false);
- }
+ RecordId allocateLoc();
- virtual void increaseStorageSize(OperationContext* txn, int size, bool enforceQuota) {
- invariant(false);
- }
+ typedef std::map<RecordId, HeapRecordStoreBtree::MmapV1RecordHeader> Records;
+ Records _records;
+ int64_t _nextId;
+};
- virtual int64_t storageSize(OperationContext* txn,
- BSONObjBuilder* extraInfo = NULL,
- int infoLevel = 0) const {
- invariant(false);
- }
+/**
+ * A RecoveryUnit for HeapRecordStoreBtree, this is for testing btree only.
+ */
+class HeapRecordStoreBtreeRecoveryUnit : public RecoveryUnit {
+public:
+ void beginUnitOfWork(OperationContext* opCtx) final{};
+ void commitUnitOfWork() final;
+ void abortUnitOfWork() final;
- virtual long long dataSize(OperationContext* txn) const { invariant(false); }
+ virtual bool waitUntilDurable() {
+ return true;
+ }
- virtual MmapV1RecordHeader* recordFor(const RecordId& loc) const { invariant(false); }
+ virtual void abandonSnapshot() {}
- virtual bool isCapped() const { invariant(false); }
+ virtual void registerChange(Change* change) {
+ change->commit();
+ delete change;
+ }
- virtual const char* name() const { invariant(false); }
+ virtual void* writingPtr(void* data, size_t len);
- virtual void updateStatsAfterRepair(OperationContext* txn,
- long long numRecords,
- long long dataSize) {
- invariant(false);
- }
- // more things that we actually care about below
+ virtual void setRollbackWritesDisabled() {}
- private:
- struct MmapV1RecordHeader {
- MmapV1RecordHeader(): dataSize(-1), data() { }
- explicit MmapV1RecordHeader(int size): dataSize(size), data(new char[size]) { }
+ virtual SnapshotId getSnapshotId() const {
+ return SnapshotId();
+ }
- int dataSize;
- boost::shared_array<char> data;
- };
+ // -----------------------
- RecordId allocateLoc();
+ void notifyInsert(HeapRecordStoreBtree* rs, const RecordId& loc);
+ static void notifyInsert(OperationContext* ctx, HeapRecordStoreBtree* rs, const RecordId& loc);
- typedef std::map<RecordId, HeapRecordStoreBtree::MmapV1RecordHeader> Records;
- Records _records;
- int64_t _nextId;
+private:
+ struct InsertEntry {
+ HeapRecordStoreBtree* rs;
+ RecordId loc;
};
+ std::vector<InsertEntry> _insertions;
- /**
- * A RecoveryUnit for HeapRecordStoreBtree, this is for testing btree only.
- */
- class HeapRecordStoreBtreeRecoveryUnit : public RecoveryUnit {
- public:
- void beginUnitOfWork(OperationContext* opCtx) final { };
- void commitUnitOfWork() final;
- void abortUnitOfWork() final;
-
- virtual bool waitUntilDurable() { return true; }
-
- virtual void abandonSnapshot() {}
-
- virtual void registerChange(Change* change) {
- change->commit();
- delete change;
- }
-
- virtual void* writingPtr(void* data, size_t len);
-
- virtual void setRollbackWritesDisabled() {}
-
- virtual SnapshotId getSnapshotId() const { return SnapshotId(); }
-
- // -----------------------
-
- void notifyInsert( HeapRecordStoreBtree* rs, const RecordId& loc );
- static void notifyInsert( OperationContext* ctx,
- HeapRecordStoreBtree* rs, const RecordId& loc );
-
- private:
- struct InsertEntry {
- HeapRecordStoreBtree* rs;
- RecordId loc;
- };
- std::vector<InsertEntry> _insertions;
-
- struct ModEntry {
- void* data;
- size_t len;
- boost::shared_array<char> old;
- };
- std::vector<ModEntry> _mods;
+ struct ModEntry {
+ void* data;
+ size_t len;
+ boost::shared_array<char> old;
};
+ std::vector<ModEntry> _mods;
+};
-} // namespace mongo
+} // namespace mongo
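
As the comments say, HeapRecordStoreBtree implements just enough of RecordStore to drive the btree tests: inserts and reads work, everything else hits invariant(false). A hedged sketch of the round trip the btree test helpers rely on, assuming the RecordData size()/data() accessors from the storage API (notifyInsert tolerates a null OperationContext, as the .cpp above shows):

    HeapRecordStoreBtree rs("test.btree");

    const char doc[] = "hello";
    StatusWith<RecordId> res =
        rs.insertRecord(nullptr /* txn */, doc, sizeof(doc), false /* quota */);
    invariant(res.isOK());

    RecordData rd = rs.dataFor(nullptr /* txn */, res.getValue());
    invariant(rd.size() == sizeof(doc));
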
diff --git a/src/mongo/db/storage/mmap_v1/journal_latency_test_cmd.cpp b/src/mongo/db/storage/mmap_v1/journal_latency_test_cmd.cpp
index 8c29741ed7e..0f21961d459 100644
--- a/src/mongo/db/storage/mmap_v1/journal_latency_test_cmd.cpp
+++ b/src/mongo/db/storage/mmap_v1/journal_latency_test_cmd.cpp
@@ -54,102 +54,108 @@
namespace mongo {
- using std::max;
- using std::min;
- using std::string;
- using std::stringstream;
+using std::max;
+using std::min;
+using std::string;
+using std::stringstream;
- namespace dur {
- boost::filesystem::path getJournalDir();
- }
+namespace dur {
+boost::filesystem::path getJournalDir();
+}
- // Testing-only, enabled via command line
- class JournalLatencyTestCmd : public Command {
- public:
- JournalLatencyTestCmd() : Command( "journalLatencyTest" ) {}
+// Testing-only, enabled via command line
+class JournalLatencyTestCmd : public Command {
+public:
+ JournalLatencyTestCmd() : Command("journalLatencyTest") {}
- virtual bool slaveOk() const { return true; }
- virtual bool isWriteCommandForConfigServer() const { return false; }
- virtual bool adminOnly() const { return true; }
- virtual void help(stringstream& h) const { h << "test how long to write and fsync to a test file in the journal/ directory"; }
- // No auth needed because it only works when enabled via command line.
- virtual void addRequiredPrivileges(const std::string& dbname,
- const BSONObj& cmdObj,
- std::vector<Privilege>* out) {}
- bool run(OperationContext* txn,
- const string& dbname,
- BSONObj& cmdObj,
- int,
- string& errmsg,
- BSONObjBuilder& result) {
- boost::filesystem::path p = dur::getJournalDir();
- p /= "journalLatencyTest";
-
- // remove file if already present
- try {
- boost::filesystem::remove(p);
- }
- catch(...) { }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual bool isWriteCommandForConfigServer() const {
+ return false;
+ }
+ virtual bool adminOnly() const {
+ return true;
+ }
+ virtual void help(stringstream& h) const {
+ h << "test how long to write and fsync to a test file in the journal/ directory";
+ }
+ // No auth needed because it only works when enabled via command line.
+ virtual void addRequiredPrivileges(const std::string& dbname,
+ const BSONObj& cmdObj,
+ std::vector<Privilege>* out) {}
+ bool run(OperationContext* txn,
+ const string& dbname,
+ BSONObj& cmdObj,
+ int,
+ string& errmsg,
+ BSONObjBuilder& result) {
+ boost::filesystem::path p = dur::getJournalDir();
+ p /= "journalLatencyTest";
- BSONObjBuilder bb[2];
- for( int pass = 0; pass < 2; pass++ ) {
- LogFile f(p.string());
- AlignedBuilder b(1024 * 1024);
- {
- Timer t;
- for( int i = 0 ; i < 100; i++ ) {
- f.synchronousAppend(b.buf(), 8192);
- }
- bb[pass].append("8KB", t.millis() / 100.0);
- }
- {
- const int N = 50;
- Timer t2;
- long long x = 0;
- for( int i = 0 ; i < N; i++ ) {
- Timer t;
- f.synchronousAppend(b.buf(), 8192);
- x += t.micros();
- sleepmillis(4);
- }
- long long y = t2.micros() - 4*N*1000;
- // not really trusting the timer granularity on all platforms so whichever is higher of x and y
- bb[pass].append("8KBWithPauses", max(x,y) / (N*1000.0));
+ // remove file if already present
+ try {
+ boost::filesystem::remove(p);
+ } catch (...) {
+ }
+
+ BSONObjBuilder bb[2];
+ for (int pass = 0; pass < 2; pass++) {
+ LogFile f(p.string());
+ AlignedBuilder b(1024 * 1024);
+ {
+ Timer t;
+ for (int i = 0; i < 100; i++) {
+ f.synchronousAppend(b.buf(), 8192);
}
- {
+ bb[pass].append("8KB", t.millis() / 100.0);
+ }
+ {
+ const int N = 50;
+ Timer t2;
+ long long x = 0;
+ for (int i = 0; i < N; i++) {
Timer t;
- for( int i = 0 ; i < 20; i++ ) {
- f.synchronousAppend(b.buf(), 1024 * 1024);
- }
- bb[pass].append("1MB", t.millis() / 20.0);
+ f.synchronousAppend(b.buf(), 8192);
+ x += t.micros();
+ sleepmillis(4);
}
- // second time around, we are prealloced.
+ long long y = t2.micros() - 4 * N * 1000;
+ // not really trusting the timer granularity on all platforms so whichever is higher of x and y
+ bb[pass].append("8KBWithPauses", max(x, y) / (N * 1000.0));
}
- result.append("timeMillis", bb[0].obj());
- result.append("timeMillisWithPrealloc", bb[1].obj());
-
- try {
- remove(p);
- }
- catch(...) { }
-
- try {
- result.append("onSamePartition", onSamePartition(dur::getJournalDir().string(),
- storageGlobalParams.dbpath));
+ {
+ Timer t;
+ for (int i = 0; i < 20; i++) {
+ f.synchronousAppend(b.buf(), 1024 * 1024);
+ }
+ bb[pass].append("1MB", t.millis() / 20.0);
}
- catch(...) { }
-
- return 1;
- }
- };
- MONGO_INITIALIZER(RegisterJournalLatencyTestCmd)(InitializerContext* context) {
- if (Command::testCommandsEnabled) {
- // Leaked intentionally: a Command registers itself when constructed.
- new JournalLatencyTestCmd();
+ // second time around, we are prealloced.
}
- return Status::OK();
- }
+ result.append("timeMillis", bb[0].obj());
+ result.append("timeMillisWithPrealloc", bb[1].obj());
+ try {
+ remove(p);
+ } catch (...) {
+ }
+ try {
+ result.append(
+ "onSamePartition",
+ onSamePartition(dur::getJournalDir().string(), storageGlobalParams.dbpath));
+ } catch (...) {
+ }
+ return 1;
+ }
+};
+MONGO_INITIALIZER(RegisterJournalLatencyTestCmd)(InitializerContext* context) {
+ if (Command::testCommandsEnabled) {
+ // Leaked intentionally: a Command registers itself when constructed.
+ new JournalLatencyTestCmd();
+ }
+ return Status::OK();
+}
}
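
The "8KBWithPauses" number above guards against coarse timers two ways: x sums per-append microseconds, while y takes wall time for the whole loop minus the 4 ms sleeps (4 * N * 1000 us), and the larger of the two is averaged. A worked check of that arithmetic with invented timings (N = 50, sleepmillis(4) between appends):

    per-append timer reports ~900us each:  x = 50 * 900                 = 45,000us
    wall clock for the loop is 245,000us:  y = 245,000 - 4 * 50 * 1000  = 45,000us
    reported value = max(x, y) / (N * 1000.0) = 45,000 / 50,000 = 0.9 ms/append

On a platform whose per-call timer is too coarse, x collapses toward zero while y stays honest, which is why the larger of the two is reported.
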
diff --git a/src/mongo/db/storage/mmap_v1/logfile.cpp b/src/mongo/db/storage/mmap_v1/logfile.cpp
index 8aa5e32626f..62c3b61bc73 100644
--- a/src/mongo/db/storage/mmap_v1/logfile.cpp
+++ b/src/mongo/db/storage/mmap_v1/logfile.cpp
@@ -53,83 +53,84 @@ using std::string;
namespace mongo {
- LogFile::LogFile(const std::string& name, bool readwrite) : _name(name) {
- _fd = CreateFile(
- toNativeString(name.c_str()).c_str(),
- (readwrite?GENERIC_READ:0)|GENERIC_WRITE,
- FILE_SHARE_READ,
- NULL,
- OPEN_ALWAYS,
- FILE_FLAG_NO_BUFFERING,
- NULL);
- if( _fd == INVALID_HANDLE_VALUE ) {
- DWORD e = GetLastError();
- uasserted(13518, str::stream() << "couldn't open file " << name << " for writing " << errnoWithDescription(e));
- }
- SetFilePointer(_fd, 0, 0, FILE_BEGIN);
+LogFile::LogFile(const std::string& name, bool readwrite) : _name(name) {
+ _fd = CreateFile(toNativeString(name.c_str()).c_str(),
+ (readwrite ? GENERIC_READ : 0) | GENERIC_WRITE,
+ FILE_SHARE_READ,
+ NULL,
+ OPEN_ALWAYS,
+ FILE_FLAG_NO_BUFFERING,
+ NULL);
+ if (_fd == INVALID_HANDLE_VALUE) {
+ DWORD e = GetLastError();
+ uasserted(13518,
+ str::stream() << "couldn't open file " << name << " for writing "
+ << errnoWithDescription(e));
}
+ SetFilePointer(_fd, 0, 0, FILE_BEGIN);
+}
- LogFile::~LogFile() {
- if( _fd != INVALID_HANDLE_VALUE )
- CloseHandle(_fd);
- }
+LogFile::~LogFile() {
+ if (_fd != INVALID_HANDLE_VALUE)
+ CloseHandle(_fd);
+}
- void LogFile::truncate() {
- verify(_fd != INVALID_HANDLE_VALUE);
+void LogFile::truncate() {
+ verify(_fd != INVALID_HANDLE_VALUE);
- if (!SetEndOfFile(_fd)){
- msgasserted(15871, "Couldn't truncate file: " + errnoWithDescription());
- }
+ if (!SetEndOfFile(_fd)) {
+ msgasserted(15871, "Couldn't truncate file: " + errnoWithDescription());
}
+}
- void LogFile::writeAt(unsigned long long offset, const void *_buf, size_t _len) {
-// TODO 64 bit offsets
- OVERLAPPED o;
- memset(&o,0,sizeof(o));
- (unsigned long long&) o.Offset = offset;
- BOOL ok= WriteFile(_fd, _buf, _len, 0, &o);
- verify(ok);
- }
+void LogFile::writeAt(unsigned long long offset, const void* _buf, size_t _len) {
+ // TODO 64 bit offsets
+ OVERLAPPED o;
+ memset(&o, 0, sizeof(o));
+ (unsigned long long&)o.Offset = offset;
+ BOOL ok = WriteFile(_fd, _buf, _len, 0, &o);
+ verify(ok);
+}
- void LogFile::readAt(unsigned long long offset, void *_buf, size_t _len) {
-// TODO 64 bit offsets
- OVERLAPPED o;
- memset(&o,0,sizeof(o));
- (unsigned long long&) o.Offset = offset;
- DWORD nr;
- BOOL ok = ReadFile(_fd, _buf, _len, &nr, &o);
- if( !ok ) {
- string e = errnoWithDescription();
- //DWORD e = GetLastError();
- log() << "LogFile readAt(" << offset << ") len:" << _len << "errno:" << e << endl;
- verify(false);
- }
+void LogFile::readAt(unsigned long long offset, void* _buf, size_t _len) {
+ // TODO 64 bit offsets
+ OVERLAPPED o;
+ memset(&o, 0, sizeof(o));
+ (unsigned long long&)o.Offset = offset;
+ DWORD nr;
+ BOOL ok = ReadFile(_fd, _buf, _len, &nr, &o);
+ if (!ok) {
+ string e = errnoWithDescription();
+ // DWORD e = GetLastError();
+ log() << "LogFile readAt(" << offset << ") len:" << _len << "errno:" << e << endl;
+ verify(false);
}
+}
- void LogFile::synchronousAppend(const void *_buf, size_t _len) {
- const size_t BlockSize = 8 * 1024 * 1024;
- verify(_fd);
- verify(_len % g_minOSPageSizeBytes == 0);
- const char *buf = (const char *) _buf;
- size_t left = _len;
- while( left ) {
- size_t toWrite = std::min(left, BlockSize);
- DWORD written;
- if( !WriteFile(_fd, buf, toWrite, &written, NULL) ) {
- DWORD e = GetLastError();
- if( e == 87 )
- msgasserted(13519, "error 87 appending to file - invalid parameter");
- else
- uasserted(13517, str::stream() << "error appending to file " << _name << ' ' << _len << ' ' << toWrite << ' ' << errnoWithDescription(e));
- }
- else {
- dassert( written == toWrite );
- }
- left -= written;
- buf += written;
+void LogFile::synchronousAppend(const void* _buf, size_t _len) {
+ const size_t BlockSize = 8 * 1024 * 1024;
+ verify(_fd);
+ verify(_len % g_minOSPageSizeBytes == 0);
+ const char* buf = (const char*)_buf;
+ size_t left = _len;
+ while (left) {
+ size_t toWrite = std::min(left, BlockSize);
+ DWORD written;
+ if (!WriteFile(_fd, buf, toWrite, &written, NULL)) {
+ DWORD e = GetLastError();
+ if (e == 87)
+ msgasserted(13519, "error 87 appending to file - invalid parameter");
+ else
+ uasserted(13517,
+ str::stream() << "error appending to file " << _name << ' ' << _len << ' '
+ << toWrite << ' ' << errnoWithDescription(e));
+ } else {
+ dassert(written == toWrite);
}
+ left -= written;
+ buf += written;
}
-
+}
}
#else
@@ -147,124 +148,123 @@ namespace mongo {
namespace mongo {
- LogFile::LogFile(const std::string& name, bool readwrite) : _name(name) {
- int options = O_CREAT
- | (readwrite?O_RDWR:O_WRONLY)
+LogFile::LogFile(const std::string& name, bool readwrite) : _name(name) {
+ int options = O_CREAT | (readwrite ? O_RDWR : O_WRONLY)
#if defined(O_DIRECT)
- | O_DIRECT
+ | O_DIRECT
#endif
#if defined(O_NOATIME)
- | O_NOATIME
+ | O_NOATIME
#endif
- ;
+ ;
- _fd = open(name.c_str(), options, S_IRUSR | S_IWUSR);
- _blkSize = g_minOSPageSizeBytes;
+ _fd = open(name.c_str(), options, S_IRUSR | S_IWUSR);
+ _blkSize = g_minOSPageSizeBytes;
#if defined(O_DIRECT)
- _direct = true;
- if( _fd < 0 ) {
- _direct = false;
- options &= ~O_DIRECT;
- _fd = open(name.c_str(), options, S_IRUSR | S_IWUSR);
- }
+ _direct = true;
+ if (_fd < 0) {
+ _direct = false;
+ options &= ~O_DIRECT;
+ _fd = open(name.c_str(), options, S_IRUSR | S_IWUSR);
+ }
#ifdef __linux__
- ssize_t tmpBlkSize = ioctl(_fd, BLKBSZGET);
- // TODO: We need some sanity checking on tmpBlkSize even if ioctl() did not fail.
- if (tmpBlkSize > 0) {
- _blkSize = (size_t)tmpBlkSize;
- }
+ ssize_t tmpBlkSize = ioctl(_fd, BLKBSZGET);
+ // TODO: We need some sanity checking on tmpBlkSize even if ioctl() did not fail.
+ if (tmpBlkSize > 0) {
+ _blkSize = (size_t)tmpBlkSize;
+ }
#endif
#else
- _direct = false;
+ _direct = false;
#endif
- if( _fd < 0 ) {
- uasserted(13516, str::stream() << "couldn't open file " << name << " for writing " << errnoWithDescription());
- }
-
- flushMyDirectory(name);
+ if (_fd < 0) {
+ uasserted(13516,
+ str::stream() << "couldn't open file " << name << " for writing "
+ << errnoWithDescription());
}
- LogFile::~LogFile() {
- if( _fd >= 0 )
- close(_fd);
- _fd = -1;
- }
+ flushMyDirectory(name);
+}
- void LogFile::truncate() {
- verify(_fd >= 0);
+LogFile::~LogFile() {
+ if (_fd >= 0)
+ close(_fd);
+ _fd = -1;
+}
- BOOST_STATIC_ASSERT(sizeof(off_t) == 8); // we don't want overflow here
- const off_t pos = lseek(_fd, 0, SEEK_CUR); // doesn't actually seek
- if (ftruncate(_fd, pos) != 0){
- msgasserted(15873, "Couldn't truncate file: " + errnoWithDescription());
- }
+void LogFile::truncate() {
+ verify(_fd >= 0);
- fsync(_fd);
+ BOOST_STATIC_ASSERT(sizeof(off_t) == 8); // we don't want overflow here
+ const off_t pos = lseek(_fd, 0, SEEK_CUR); // doesn't actually seek
+ if (ftruncate(_fd, pos) != 0) {
+ msgasserted(15873, "Couldn't truncate file: " + errnoWithDescription());
}
- void LogFile::writeAt(unsigned long long offset, const void *buf, size_t len) {
- verify(((size_t)buf) % g_minOSPageSizeBytes == 0); // aligned
- ssize_t written = pwrite(_fd, buf, len, offset);
- if( written != (ssize_t) len ) {
- log() << "writeAt fails " << errnoWithDescription() << endl;
- }
+ fsync(_fd);
+}
+
+void LogFile::writeAt(unsigned long long offset, const void* buf, size_t len) {
+ verify(((size_t)buf) % g_minOSPageSizeBytes == 0); // aligned
+ ssize_t written = pwrite(_fd, buf, len, offset);
+ if (written != (ssize_t)len) {
+ log() << "writeAt fails " << errnoWithDescription() << endl;
+ }
#if defined(__linux__)
- fdatasync(_fd);
+ fdatasync(_fd);
#else
- fsync(_fd);
+ fsync(_fd);
#endif
- }
-
- void LogFile::readAt(unsigned long long offset, void *_buf, size_t _len) {
- verify(((size_t)_buf) % g_minOSPageSizeBytes == 0); // aligned
- ssize_t rd = pread(_fd, _buf, _len, offset);
- verify( rd != -1 );
- }
+}
- void LogFile::synchronousAppend(const void *b, size_t len) {
+void LogFile::readAt(unsigned long long offset, void* _buf, size_t _len) {
+ verify(((size_t)_buf) % g_minOSPageSizeBytes == 0); // aligned
+ ssize_t rd = pread(_fd, _buf, _len, offset);
+ verify(rd != -1);
+}
- const char *buf = static_cast<const char *>( b );
- ssize_t charsToWrite = static_cast<ssize_t>( len );
+void LogFile::synchronousAppend(const void* b, size_t len) {
+ const char* buf = static_cast<const char*>(b);
+ ssize_t charsToWrite = static_cast<ssize_t>(len);
- fassert( 16144, charsToWrite >= 0 );
- fassert( 16142, _fd >= 0 );
- fassert( 16143, reinterpret_cast<size_t>( buf ) % _blkSize == 0 ); // aligned
+ fassert(16144, charsToWrite >= 0);
+ fassert(16142, _fd >= 0);
+ fassert(16143, reinterpret_cast<size_t>(buf) % _blkSize == 0); // aligned
#ifdef POSIX_FADV_DONTNEED
- const off_t pos = lseek(_fd, 0, SEEK_CUR); // doesn't actually seek, just get current position
+ const off_t pos = lseek(_fd, 0, SEEK_CUR); // doesn't actually seek, just get current position
#endif
- while ( charsToWrite > 0 ) {
- const ssize_t written = write( _fd, buf, static_cast<size_t>( charsToWrite ) );
- if ( -1 == written ) {
- log() << "LogFile::synchronousAppend failed with " << charsToWrite
- << " bytes unwritten out of " << len << " bytes; b=" << b << ' '
- << errnoWithDescription() << std::endl;
- fassertFailed( 13515 );
- }
- buf += written;
- charsToWrite -= written;
+ while (charsToWrite > 0) {
+ const ssize_t written = write(_fd, buf, static_cast<size_t>(charsToWrite));
+ if (-1 == written) {
+ log() << "LogFile::synchronousAppend failed with " << charsToWrite
+ << " bytes unwritten out of " << len << " bytes; b=" << b << ' '
+ << errnoWithDescription() << std::endl;
+ fassertFailed(13515);
}
+ buf += written;
+ charsToWrite -= written;
+ }
- if(
+ if (
#if defined(__linux__)
- fdatasync(_fd) < 0
+ fdatasync(_fd) < 0
#else
- fsync(_fd)
+ fsync(_fd)
#endif
- ) {
- log() << "error appending to file on fsync " << ' ' << errnoWithDescription();
- fassertFailed( 13514 );
- }
+ ) {
+ log() << "error appending to file on fsync " << ' ' << errnoWithDescription();
+ fassertFailed(13514);
+ }
#ifdef POSIX_FADV_DONTNEED
- if (!_direct)
- posix_fadvise(_fd, pos, len, POSIX_FADV_DONTNEED);
+ if (!_direct)
+ posix_fadvise(_fd, pos, len, POSIX_FADV_DONTNEED);
#endif
- }
-
+}
}
#endif
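
A minimal sketch, assuming a POSIX system, of the durable-append pattern that LogFile::synchronousAppend implements above: drain the buffer with write(), retrying on short writes, then force the bytes to stable storage before returning. durableAppend() is a hypothetical helper for illustration, not MongoDB API.

#include <unistd.h>

#include <cstddef>
#include <stdexcept>

void durableAppend(int fd, const void* data, size_t len) {
    const char* p = static_cast<const char*>(data);
    size_t remaining = len;
    while (remaining > 0) {
        ssize_t written = ::write(fd, p, remaining);
        if (written < 0)
            throw std::runtime_error("write failed");  // caller decides how fatal
        p += written;                                  // advance past a short write
        remaining -= static_cast<size_t>(written);
    }
#if defined(__linux__)
    // fdatasync syncs the file data (and the size, if it changed) without
    // forcing unrelated metadata such as timestamps, so it is cheaper than fsync.
    if (::fdatasync(fd) < 0)
#else
    if (::fsync(fd) < 0)
#endif
        throw std::runtime_error("sync failed");
}
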
diff --git a/src/mongo/db/storage/mmap_v1/logfile.h b/src/mongo/db/storage/mmap_v1/logfile.h
index 278b9c162aa..4a3bb5535e2 100644
--- a/src/mongo/db/storage/mmap_v1/logfile.h
+++ b/src/mongo/db/storage/mmap_v1/logfile.h
@@ -35,43 +35,42 @@
namespace mongo {
- class LogFile {
- public:
- /** create the file and open. must not already exist.
- throws UserAssertion on i/o error
- */
- LogFile(const std::string& name, bool readwrite = false);
+class LogFile {
+public:
+ /** create the file and open. must not already exist.
+ throws UserAssertion on i/o error
+ */
+ LogFile(const std::string& name, bool readwrite = false);
- /** closes */
- ~LogFile();
+ /** closes */
+ ~LogFile();
- /** append to file. does not return until sync'd. uses direct i/o when possible.
- throws UserAssertion on an i/o error
- note direct i/o may have alignment requirements
- */
- void synchronousAppend(const void *buf, size_t len);
+ /** append to file. does not return until sync'd. uses direct i/o when possible.
+ throws UserAssertion on an i/o error
+ note direct i/o may have alignment requirements
+ */
+ void synchronousAppend(const void* buf, size_t len);
- /** write at specified offset. must be aligned. noreturn until physically written. thread safe */
- void writeAt(unsigned long long offset, const void *_bug, size_t _len);
+    /** write at specified offset. must be aligned. does not return until physically
+        written. thread safe */
+    void writeAt(unsigned long long offset, const void* _buf, size_t _len);
- void readAt(unsigned long long offset, void *_buf, size_t _len);
+ void readAt(unsigned long long offset, void* _buf, size_t _len);
- const std::string _name;
+ const std::string _name;
- void truncate(); // Removes extra data after current position
+ void truncate(); // Removes extra data after current position
- private:
+private:
#if defined(_WIN32)
- typedef HANDLE fd_type;
+ typedef HANDLE fd_type;
#else
- typedef int fd_type;
+ typedef int fd_type;
#endif
- fd_type _fd;
- bool _direct; // are we using direct I/O
-
- // Block size, in case of direct I/O we need to test alignment against the page size,
- // which can be different than 4kB.
- size_t _blkSize;
- };
+ fd_type _fd;
+ bool _direct; // are we using direct I/O
+ // Block size, in case of direct I/O we need to test alignment against the page size,
+ // which can be different than 4kB.
+ size_t _blkSize;
+};
}
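
A hypothetical usage sketch of the LogFile interface above. Because the class may use direct I/O, buffers must be block-aligned, so the storage comes from posix_memalign(); the 4096-byte alignment, 8192-byte length, and file name are illustrative only (the real requirement is the _blkSize probed at open).

#include <cstdlib>
#include <cstring>

#include "mongo/db/storage/mmap_v1/logfile.h"

void appendExample() {
    mongo::LogFile lf("journal.example");  // creates the file; must not already exist
    const size_t kLen = 8192;              // keep this a multiple of the block size
    void* buf = nullptr;
    if (posix_memalign(&buf, 4096, kLen) != 0)  // block-aligned storage for direct I/O
        return;
    memset(buf, 0, kLen);
    lf.synchronousAppend(buf, kLen);  // returns only after the bytes are synced
    free(buf);
}
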
diff --git a/src/mongo/db/storage/mmap_v1/mmap.cpp b/src/mongo/db/storage/mmap_v1/mmap.cpp
index e9519fc7d94..57559d3038e 100644
--- a/src/mongo/db/storage/mmap_v1/mmap.cpp
+++ b/src/mongo/db/storage/mmap_v1/mmap.cpp
@@ -46,213 +46,220 @@
namespace mongo {
- using std::endl;
- using std::map;
- using std::set;
- using std::string;
- using std::stringstream;
- using std::vector;
-
- void minOSPageSizeBytesTest(size_t minOSPageSizeBytes) {
- fassert( 16325, minOSPageSizeBytes > 0 );
- fassert( 16326, minOSPageSizeBytes < 1000000 );
- // check to see if the page size is a power of 2
- fassert( 16327, (minOSPageSizeBytes & (minOSPageSizeBytes - 1)) == 0);
- }
+using std::endl;
+using std::map;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::vector;
+
+void minOSPageSizeBytesTest(size_t minOSPageSizeBytes) {
+ fassert(16325, minOSPageSizeBytes > 0);
+ fassert(16326, minOSPageSizeBytes < 1000000);
+ // check to see if the page size is a power of 2
+ fassert(16327, (minOSPageSizeBytes & (minOSPageSizeBytes - 1)) == 0);
+}
namespace {
- set<MongoFile*> mmfiles;
- map<string,MongoFile*> pathToFile;
+set<MongoFile*> mmfiles;
+map<string, MongoFile*> pathToFile;
} // namespace
- /* Create. Must not exist.
- @param zero fill file with zeros when true
- */
- void* MemoryMappedFile::create(const std::string& filename, unsigned long long len, bool zero) {
- uassert( 13468, string("can't create file already exists ") + filename, ! boost::filesystem::exists(filename) );
- void *p = map(filename.c_str(), len);
- if( p && zero ) {
- size_t sz = (size_t) len;
- verify( len == sz );
- memset(p, 0, sz);
- }
- return p;
+/* Create. Must not exist.
+@param zero fill file with zeros when true
+*/
+void* MemoryMappedFile::create(const std::string& filename, unsigned long long len, bool zero) {
+ uassert(13468,
+ string("can't create file already exists ") + filename,
+ !boost::filesystem::exists(filename));
+ void* p = map(filename.c_str(), len);
+ if (p && zero) {
+ size_t sz = (size_t)len;
+ verify(len == sz);
+ memset(p, 0, sz);
}
-
- /*static*/ void MemoryMappedFile::updateLength( const char *filename, unsigned long long &length ) {
- if ( !boost::filesystem::exists( filename ) )
- return;
- // make sure we map full length if preexisting file.
- boost::uintmax_t l = boost::filesystem::file_size( filename );
- length = l;
+ return p;
+}
+
+/*static*/ void MemoryMappedFile::updateLength(const char* filename, unsigned long long& length) {
+ if (!boost::filesystem::exists(filename))
+ return;
+ // make sure we map full length if preexisting file.
+ boost::uintmax_t l = boost::filesystem::file_size(filename);
+ length = l;
+}
+
+void* MemoryMappedFile::map(const char* filename) {
+ unsigned long long l;
+ try {
+ l = boost::filesystem::file_size(filename);
+ } catch (boost::filesystem::filesystem_error& e) {
+ uasserted(15922,
+ mongoutils::str::stream() << "couldn't get file length when opening mapping "
+ << filename << ' ' << e.what());
}
-
- void* MemoryMappedFile::map(const char *filename) {
- unsigned long long l;
- try {
- l = boost::filesystem::file_size( filename );
- }
- catch(boost::filesystem::filesystem_error& e) {
- uasserted(15922, mongoutils::str::stream() << "couldn't get file length when opening mapping " << filename << ' ' << e.what() );
- }
- return map( filename , l );
+ return map(filename, l);
+}
+void* MemoryMappedFile::mapWithOptions(const char* filename, int options) {
+ unsigned long long l;
+ try {
+ l = boost::filesystem::file_size(filename);
+ } catch (boost::filesystem::filesystem_error& e) {
+ uasserted(15923,
+ mongoutils::str::stream() << "couldn't get file length when opening mapping "
+ << filename << ' ' << e.what());
}
- void* MemoryMappedFile::mapWithOptions(const char *filename, int options) {
- unsigned long long l;
- try {
- l = boost::filesystem::file_size( filename );
- }
- catch(boost::filesystem::filesystem_error& e) {
- uasserted(15923, mongoutils::str::stream() << "couldn't get file length when opening mapping " << filename << ' ' << e.what() );
- }
- return map( filename , l, options );
+ return map(filename, l, options);
+}
+
+/* --- MongoFile -------------------------------------------------
+ this is the administrative stuff
+*/
+
+RWLockRecursiveNongreedy LockMongoFilesShared::mmmutex("mmmutex", 10 * 60 * 1000 /* 10 minutes */);
+unsigned LockMongoFilesShared::era = 99; // note this rolls over
+
+set<MongoFile*>& MongoFile::getAllFiles() {
+ return mmfiles;
+}
+
+/* subclass must call in destructor (or at close).
+   removes this from pathToFile and other maps
+   safe to call more than once, albeit might be wasted work
+   ideally called near the close, if the close happens well before object destruction
+*/
+void MongoFile::destroyed() {
+ LockMongoFilesShared::assertExclusivelyLocked();
+ mmfiles.erase(this);
+ pathToFile.erase(filename());
+}
+
+/*static*/
+void MongoFile::closeAllFiles(stringstream& message) {
+ static int closingAllFiles = 0;
+ if (closingAllFiles) {
+ message << "warning closingAllFiles=" << closingAllFiles << endl;
+ return;
}
+ ++closingAllFiles;
- /* --- MongoFile -------------------------------------------------
- this is the administrative stuff
- */
-
- RWLockRecursiveNongreedy LockMongoFilesShared::mmmutex("mmmutex",10*60*1000 /* 10 minutes */);
- unsigned LockMongoFilesShared::era = 99; // note this rolls over
-
- set<MongoFile*>& MongoFile::getAllFiles() { return mmfiles; }
+ LockMongoFilesExclusive lk;
- /* subclass must call in destructor (or at close).
- removes this from pathToFile and other maps
- safe to call more than once, albeit might be wasted work
- ideal to call close to the close, if the close is well before object destruction
- */
- void MongoFile::destroyed() {
- LockMongoFilesShared::assertExclusivelyLocked();
- mmfiles.erase(this);
- pathToFile.erase( filename() );
+ ProgressMeter pm(mmfiles.size(), 2, 1, "files", "File Closing Progress");
+ set<MongoFile*> temp = mmfiles;
+ for (set<MongoFile*>::iterator i = temp.begin(); i != temp.end(); i++) {
+ (*i)->close(); // close() now removes from mmfiles
+ pm.hit();
}
+ message << "closeAllFiles() finished";
+ --closingAllFiles;
+}
- /*static*/
- void MongoFile::closeAllFiles( stringstream &message ) {
- static int closingAllFiles = 0;
- if ( closingAllFiles ) {
- message << "warning closingAllFiles=" << closingAllFiles << endl;
- return;
- }
- ++closingAllFiles;
+/*static*/ long long MongoFile::totalMappedLength() {
+ unsigned long long total = 0;
- LockMongoFilesExclusive lk;
+ LockMongoFilesShared lk;
- ProgressMeter pm(mmfiles.size(), 2, 1, "files", "File Closing Progress");
- set<MongoFile*> temp = mmfiles;
- for ( set<MongoFile*>::iterator i = temp.begin(); i != temp.end(); i++ ) {
- (*i)->close(); // close() now removes from mmfiles
- pm.hit();
- }
- message << "closeAllFiles() finished";
- --closingAllFiles;
- }
+ for (set<MongoFile*>::iterator i = mmfiles.begin(); i != mmfiles.end(); i++)
+ total += (*i)->length();
- /*static*/ long long MongoFile::totalMappedLength() {
- unsigned long long total = 0;
+ return total;
+}
- LockMongoFilesShared lk;
-
- for ( set<MongoFile*>::iterator i = mmfiles.begin(); i != mmfiles.end(); i++ )
- total += (*i)->length();
-
- return total;
- }
+void nullFunc() {}
- void nullFunc() { }
+// callback notifications
+void (*MongoFile::notifyPreFlush)() = nullFunc;
+void (*MongoFile::notifyPostFlush)() = nullFunc;
- // callback notifications
- void (*MongoFile::notifyPreFlush)() = nullFunc;
- void (*MongoFile::notifyPostFlush)() = nullFunc;
+/*static*/ int MongoFile::flushAll(bool sync) {
+ if (sync)
+ notifyPreFlush();
+ int x = _flushAll(sync);
+ if (sync)
+ notifyPostFlush();
+ return x;
+}
- /*static*/ int MongoFile::flushAll( bool sync ) {
- if ( sync ) notifyPreFlush();
- int x = _flushAll(sync);
- if ( sync ) notifyPostFlush();
- return x;
- }
+/*static*/ int MongoFile::_flushAll(bool sync) {
+ if (!sync) {
+ int num = 0;
+ LockMongoFilesShared lk;
+ for (set<MongoFile*>::iterator i = mmfiles.begin(); i != mmfiles.end(); i++) {
+ num++;
+ MongoFile* mmf = *i;
+ if (!mmf)
+ continue;
- /*static*/ int MongoFile::_flushAll( bool sync ) {
- if ( ! sync ) {
- int num = 0;
- LockMongoFilesShared lk;
- for ( set<MongoFile*>::iterator i = mmfiles.begin(); i != mmfiles.end(); i++ ) {
- num++;
- MongoFile * mmf = *i;
- if ( ! mmf )
- continue;
-
- mmf->flush( sync );
- }
- return num;
+ mmf->flush(sync);
}
+ return num;
+ }
- // want to do it sync
-
- // get a thread-safe Flushable object for each file first in a single lock
- // so that we can iterate and flush without doing any locking here
- OwnedPointerVector<Flushable> thingsToFlushWrapper;
- vector<Flushable*>& thingsToFlush = thingsToFlushWrapper.mutableVector();
- {
- LockMongoFilesShared lk;
- for ( set<MongoFile*>::iterator i = mmfiles.begin(); i != mmfiles.end(); i++ ) {
- MongoFile* mmf = *i;
- if ( !mmf )
- continue;
- thingsToFlush.push_back( mmf->prepareFlush() );
- }
- }
+ // want to do it sync
- for ( size_t i = 0; i < thingsToFlush.size(); i++ ) {
- thingsToFlush[i]->flush();
+ // get a thread-safe Flushable object for each file first in a single lock
+ // so that we can iterate and flush without doing any locking here
+ OwnedPointerVector<Flushable> thingsToFlushWrapper;
+ vector<Flushable*>& thingsToFlush = thingsToFlushWrapper.mutableVector();
+ {
+ LockMongoFilesShared lk;
+ for (set<MongoFile*>::iterator i = mmfiles.begin(); i != mmfiles.end(); i++) {
+ MongoFile* mmf = *i;
+ if (!mmf)
+ continue;
+ thingsToFlush.push_back(mmf->prepareFlush());
}
-
- return thingsToFlush.size();
- }
-
- void MongoFile::created() {
- LockMongoFilesExclusive lk;
- mmfiles.insert(this);
}
- void MongoFile::setFilename(const std::string& fn) {
- LockMongoFilesExclusive lk;
- verify( _filename.empty() );
- _filename = boost::filesystem::absolute(fn).generic_string();
- MongoFile *&ptf = pathToFile[_filename];
- massert(13617, "MongoFile : multiple opens of same filename", ptf == 0);
- ptf = this;
+ for (size_t i = 0; i < thingsToFlush.size(); i++) {
+ thingsToFlush[i]->flush();
}
- MongoFile* MongoFileFinder::findByPath(const std::string& path) const {
- return mapFindWithDefault(pathToFile,
- boost::filesystem::absolute(path).generic_string(),
- static_cast<MongoFile*>(NULL));
+ return thingsToFlush.size();
+}
+
+void MongoFile::created() {
+ LockMongoFilesExclusive lk;
+ mmfiles.insert(this);
+}
+
+void MongoFile::setFilename(const std::string& fn) {
+ LockMongoFilesExclusive lk;
+ verify(_filename.empty());
+ _filename = boost::filesystem::absolute(fn).generic_string();
+ MongoFile*& ptf = pathToFile[_filename];
+ massert(13617, "MongoFile : multiple opens of same filename", ptf == 0);
+ ptf = this;
+}
+
+MongoFile* MongoFileFinder::findByPath(const std::string& path) const {
+ return mapFindWithDefault(pathToFile,
+ boost::filesystem::absolute(path).generic_string(),
+ static_cast<MongoFile*>(NULL));
+}
+
+
+void printMemInfo(const char* where) {
+ LogstreamBuilder out = log();
+ out << "mem info: ";
+ if (where)
+ out << where << " ";
+
+ ProcessInfo pi;
+ if (!pi.supported()) {
+ out << " not supported";
+ return;
}
+ out << "vsize: " << pi.getVirtualMemorySize() << " resident: " << pi.getResidentSize()
+ << " mapped: " << (MemoryMappedFile::totalMappedLength() / (1024 * 1024));
+}
- void printMemInfo( const char * where ) {
- LogstreamBuilder out = log();
- out << "mem info: ";
- if ( where )
- out << where << " ";
-
- ProcessInfo pi;
- if ( ! pi.supported() ) {
- out << " not supported";
- return;
- }
-
- out << "vsize: " << pi.getVirtualMemorySize()
- << " resident: " << pi.getResidentSize()
- << " mapped: " << ( MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ) );
- }
-
- void dataSyncFailedHandler() {
- log() << "error syncing data to disk, probably a disk error";
- log() << " shutting down immediately to avoid corruption";
- fassertFailed( 17346 );
- }
+void dataSyncFailedHandler() {
+ log() << "error syncing data to disk, probably a disk error";
+ log() << " shutting down immediately to avoid corruption";
+ fassertFailed(17346);
+}
-} // namespace mongo
+} // namespace mongo
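
The sync branch of MongoFile::_flushAll above is a two-phase pattern: snapshot one thread-safe Flushable handle per file while briefly holding the shared lock, then do the slow fsync work with the lock released. A minimal self-contained sketch of that shape; Registry, File, and FlushHandle are illustrative names, not MongoDB types.

#include <memory>
#include <mutex>
#include <vector>

struct FlushHandle {
    virtual ~FlushHandle() {}
    virtual void flush() = 0;  // must fail nicely if the underlying file is gone
};

struct File {
    virtual ~File() {}
    virtual std::unique_ptr<FlushHandle> prepareFlush() = 0;
};

class Registry {
public:
    size_t flushAll() {
        std::vector<std::unique_ptr<FlushHandle>> work;
        {
            std::lock_guard<std::mutex> lk(_mutex);  // phase 1: short critical section
            for (File* f : _files)
                work.push_back(f->prepareFlush());
        }
        for (auto& h : work)  // phase 2: slow I/O runs with the lock released
            h->flush();
        return work.size();
    }

private:
    std::mutex _mutex;
    std::vector<File*> _files;
};
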
diff --git a/src/mongo/db/storage/mmap_v1/mmap.h b/src/mongo/db/storage/mmap_v1/mmap.h
index f70b64c96eb..ae9a0796a4b 100644
--- a/src/mongo/db/storage/mmap_v1/mmap.h
+++ b/src/mongo/db/storage/mmap_v1/mmap.h
@@ -38,225 +38,249 @@
namespace mongo {
#if !defined(_WIN32)
- typedef int HANDLE;
+typedef int HANDLE;
#endif
- extern const size_t g_minOSPageSizeBytes;
- void minOSPageSizeBytesTest(size_t minOSPageSizeBytes); // lame-o
+extern const size_t g_minOSPageSizeBytes;
+void minOSPageSizeBytesTest(size_t minOSPageSizeBytes); // lame-o
- // call this if syncing data fails
- void dataSyncFailedHandler();
+// call this if syncing data fails
+void dataSyncFailedHandler();
- class MAdvise {
- MONGO_DISALLOW_COPYING(MAdvise);
- public:
- enum Advice { Sequential=1 , Random=2 };
- MAdvise(void *p, unsigned len, Advice a);
- ~MAdvise(); // destructor resets the range to MADV_NORMAL
- private:
- void *_p;
- unsigned _len;
- };
+class MAdvise {
+ MONGO_DISALLOW_COPYING(MAdvise);
- // lock order: lock dbMutex before this if you lock both
- class LockMongoFilesShared {
- friend class LockMongoFilesExclusive;
- static RWLockRecursiveNongreedy mmmutex;
- static unsigned era;
- RWLockRecursive::Shared lk;
- public:
- LockMongoFilesShared() : lk(mmmutex) { }
+public:
+ enum Advice { Sequential = 1, Random = 2 };
+ MAdvise(void* p, unsigned len, Advice a);
+ ~MAdvise(); // destructor resets the range to MADV_NORMAL
+private:
+ void* _p;
+ unsigned _len;
+};
- /** era changes anytime memory maps come and go. thus you can use this as a cheap way to check
- if nothing has changed since the last time you locked. Of course you must be shared locked
- at the time of this call, otherwise someone could be in progress.
+// lock order: lock dbMutex before this if you lock both
+class LockMongoFilesShared {
+ friend class LockMongoFilesExclusive;
+ static RWLockRecursiveNongreedy mmmutex;
+ static unsigned era;
+ RWLockRecursive::Shared lk;
- This is used for yielding; see PageFaultException::touch().
- */
- static unsigned getEra() { return era; }
+public:
+ LockMongoFilesShared() : lk(mmmutex) {}
- static void assertExclusivelyLocked() { mmmutex.assertExclusivelyLocked(); }
- static void assertAtLeastReadLocked() { mmmutex.assertAtLeastReadLocked(); }
- };
+ /** era changes anytime memory maps come and go. thus you can use this as a cheap way to check
+ if nothing has changed since the last time you locked. Of course you must be shared locked
+ at the time of this call, otherwise someone could be in progress.
+
+ This is used for yielding; see PageFaultException::touch().
+ */
+ static unsigned getEra() {
+ return era;
+ }
+
+ static void assertExclusivelyLocked() {
+ mmmutex.assertExclusivelyLocked();
+ }
+ static void assertAtLeastReadLocked() {
+ mmmutex.assertAtLeastReadLocked();
+ }
+};
+
+class LockMongoFilesExclusive {
+ RWLockRecursive::Exclusive lk;
- class LockMongoFilesExclusive {
- RWLockRecursive::Exclusive lk;
+public:
+ LockMongoFilesExclusive() : lk(LockMongoFilesShared::mmmutex) {
+ LockMongoFilesShared::era++;
+ }
+};
+
+/* the administrative-ish stuff here */
+class MongoFile {
+ MONGO_DISALLOW_COPYING(MongoFile);
+
+public:
+ /** Flushable has to fail nicely if the underlying object gets killed */
+ class Flushable {
public:
- LockMongoFilesExclusive() : lk(LockMongoFilesShared::mmmutex) {
- LockMongoFilesShared::era++;
- }
+ virtual ~Flushable() {}
+ virtual void flush() = 0;
};
- /* the administrative-ish stuff here */
- class MongoFile {
- MONGO_DISALLOW_COPYING(MongoFile);
- public:
- /** Flushable has to fail nicely if the underlying object gets killed */
- class Flushable {
- public:
- virtual ~Flushable() {}
- virtual void flush() = 0;
- };
-
- MongoFile() {}
- virtual ~MongoFile() {}
-
- enum Options {
- SEQUENTIAL = 1, // hint - e.g. FILE_FLAG_SEQUENTIAL_SCAN on windows
- READONLY = 2 // not contractually guaranteed, but if specified the impl has option to fault writes
- };
-
- /** @param fun is called for each MongoFile.
- called from within a mutex that MongoFile uses. so be careful not to deadlock.
- */
- template < class F >
- static void forEach( F fun );
+ MongoFile() {}
+ virtual ~MongoFile() {}
- /** note: you need to be in mmmutex when using this. forEach (above) handles that for you automatically.
-*/
- static std::set<MongoFile*>& getAllFiles();
+ enum Options {
+ SEQUENTIAL = 1, // hint - e.g. FILE_FLAG_SEQUENTIAL_SCAN on windows
+ READONLY =
+ 2 // not contractually guaranteed, but if specified the impl has option to fault writes
+ };
- // callbacks if you need them
- static void (*notifyPreFlush)();
- static void (*notifyPostFlush)();
+ /** @param fun is called for each MongoFile.
+ called from within a mutex that MongoFile uses. so be careful not to deadlock.
+ */
+ template <class F>
+ static void forEach(F fun);
- static int flushAll( bool sync ); // returns n flushed
- static long long totalMappedLength();
- static void closeAllFiles( std::stringstream &message );
+    /** note: you need to be in mmmutex when using this. forEach (above) handles
+        that for you automatically. */
+ static std::set<MongoFile*>& getAllFiles();
- virtual bool isDurableMappedFile() { return false; }
+ // callbacks if you need them
+ static void (*notifyPreFlush)();
+ static void (*notifyPostFlush)();
- std::string filename() const { return _filename; }
- void setFilename(const std::string& fn);
+ static int flushAll(bool sync); // returns n flushed
+ static long long totalMappedLength();
+ static void closeAllFiles(std::stringstream& message);
- virtual uint64_t getUniqueId() const = 0;
+ virtual bool isDurableMappedFile() {
+ return false;
+ }
- private:
- std::string _filename;
- static int _flushAll( bool sync ); // returns n flushed
- protected:
- virtual void close() = 0;
- virtual void flush(bool sync) = 0;
- /**
- * returns a thread safe object that you can call flush on
- * Flushable has to fail nicely if the underlying object gets killed
- */
- virtual Flushable * prepareFlush() = 0;
+ std::string filename() const {
+ return _filename;
+ }
+ void setFilename(const std::string& fn);
+
+ virtual uint64_t getUniqueId() const = 0;
+
+private:
+ std::string _filename;
+ static int _flushAll(bool sync); // returns n flushed
+protected:
+ virtual void close() = 0;
+ virtual void flush(bool sync) = 0;
+ /**
+ * returns a thread safe object that you can call flush on
+ * Flushable has to fail nicely if the underlying object gets killed
+ */
+ virtual Flushable* prepareFlush() = 0;
+
+ void created(); /* subclass must call after create */
+
+    /* subclass must call in destructor (or at close).
+       removes this from pathToFile and other maps
+       safe to call more than once, albeit might be wasted work
+       ideally called near the close, if the close happens well before object destruction
+    */
+ void destroyed();
- void created(); /* subclass must call after create */
+ virtual unsigned long long length() const = 0;
+};
- /* subclass must call in destructor (or at close).
- removes this from pathToFile and other maps
- safe to call more than once, albeit might be wasted work
- ideal to call close to the close, if the close is well before object destruction
- */
- void destroyed();
+/** look up a MMF by filename. scoped mutex locking convention.
+    example:
+      MongoFileFinder finder;
+      MongoFile *a = finder.findByPath("file_name_a");
+      MongoFile *b = finder.findByPath("file_name_b");
+*/
+class MongoFileFinder {
+ MONGO_DISALLOW_COPYING(MongoFileFinder);
- virtual unsigned long long length() const = 0;
- };
+public:
+ MongoFileFinder() {}
- /** look up a MMF by filename. scoped mutex locking convention.
- example:
- MMFFinderByName finder;
- DurableMappedFile *a = finder.find("file_name_a");
- DurableMappedFile *b = finder.find("file_name_b");
+ /** @return The MongoFile object associated with the specified file name. If no file is open
+ with the specified name, returns null.
*/
- class MongoFileFinder {
- MONGO_DISALLOW_COPYING(MongoFileFinder);
- public:
- MongoFileFinder() { }
+ MongoFile* findByPath(const std::string& path) const;
+
+private:
+ LockMongoFilesShared _lk;
+};
+
+class MemoryMappedFile : public MongoFile {
+protected:
+ virtual void* viewForFlushing() {
+ if (views.size() == 0)
+ return 0;
+ verify(views.size() == 1);
+ return views[0];
+ }
- /** @return The MongoFile object associated with the specified file name. If no file is open
- with the specified name, returns null.
- */
- MongoFile* findByPath(const std::string& path) const;
+public:
+ MemoryMappedFile();
- private:
- LockMongoFilesShared _lk;
- };
+ virtual ~MemoryMappedFile() {
+ LockMongoFilesExclusive lk;
+ close();
+ }
- class MemoryMappedFile : public MongoFile {
- protected:
- virtual void* viewForFlushing() {
- if( views.size() == 0 )
- return 0;
- verify( views.size() == 1 );
- return views[0];
- }
- public:
- MemoryMappedFile();
+ virtual void close();
- virtual ~MemoryMappedFile() {
- LockMongoFilesExclusive lk;
- close();
- }
+ // Throws exception if file doesn't exist. (dm may2010: not sure if this is always true?)
+ void* map(const char* filename);
- virtual void close();
+ /** @param options see MongoFile::Options
+ */
+ void* mapWithOptions(const char* filename, int options);
- // Throws exception if file doesn't exist. (dm may2010: not sure if this is always true?)
- void* map(const char *filename);
+ /* Creates with length if DNE, otherwise uses existing file length,
+ passed length.
+ @param options MongoFile::Options bits
+ */
+ void* map(const char* filename, unsigned long long& length, int options = 0);
- /** @param options see MongoFile::Options
- */
- void* mapWithOptions(const char *filename, int options);
+ /* Create. Must not exist.
+ @param zero fill file with zeros when true
+ */
+ void* create(const std::string& filename, unsigned long long len, bool zero);
- /* Creates with length if DNE, otherwise uses existing file length,
- passed length.
- @param options MongoFile::Options bits
- */
- void* map(const char *filename, unsigned long long &length, int options = 0 );
+ void flush(bool sync);
+ virtual Flushable* prepareFlush();
- /* Create. Must not exist.
- @param zero fill file with zeros when true
+ long shortLength() const {
+ return (long)len;
+ }
+ unsigned long long length() const {
+ return len;
+ }
+ HANDLE getFd() const {
+ return fd;
+ }
+ /** create a new view with the specified properties.
+ automatically cleaned up upon close/destruction of the MemoryMappedFile object.
*/
- void* create(const std::string& filename, unsigned long long len, bool zero);
-
- void flush(bool sync);
- virtual Flushable * prepareFlush();
-
- long shortLength() const { return (long) len; }
- unsigned long long length() const { return len; }
- HANDLE getFd() const { return fd; }
- /** create a new view with the specified properties.
- automatically cleaned up upon close/destruction of the MemoryMappedFile object.
- */
- void* createReadOnlyMap();
- void* createPrivateMap();
-
- virtual uint64_t getUniqueId() const { return _uniqueId; }
-
- private:
- static void updateLength( const char *filename, unsigned long long &length );
-
- HANDLE fd;
- HANDLE maphandle;
- std::vector<void *> views;
- unsigned long long len;
- const uint64_t _uniqueId;
-#ifdef _WIN32
- // flush Mutex
- //
- // Protects:
- // Prevent flush() and close() from concurrently running.
- // It ensures close() cannot complete while flush() is running
- // Lock Ordering:
- // LockMongoFilesShared must be taken before _flushMutex if both are taken
- stdx::mutex _flushMutex;
-#endif
+ void* createReadOnlyMap();
+ void* createPrivateMap();
- protected:
+ virtual uint64_t getUniqueId() const {
+ return _uniqueId;
+ }
- /** close the current private view and open a new replacement */
- void* remapPrivateView(void *oldPrivateAddr);
- };
+private:
+ static void updateLength(const char* filename, unsigned long long& length);
- /** p is called from within a mutex that MongoFile uses. so be careful not to deadlock. */
- template < class F >
- inline void MongoFile::forEach( F p ) {
- LockMongoFilesShared lklk;
- const std::set<MongoFile*>& mmfiles = MongoFile::getAllFiles();
- for ( std::set<MongoFile*>::const_iterator i = mmfiles.begin(); i != mmfiles.end(); i++ )
- p(*i);
- }
+ HANDLE fd;
+ HANDLE maphandle;
+ std::vector<void*> views;
+ unsigned long long len;
+ const uint64_t _uniqueId;
+#ifdef _WIN32
+ // flush Mutex
+ //
+ // Protects:
+ // Prevent flush() and close() from concurrently running.
+ // It ensures close() cannot complete while flush() is running
+ // Lock Ordering:
+ // LockMongoFilesShared must be taken before _flushMutex if both are taken
+ stdx::mutex _flushMutex;
+#endif
-} // namespace mongo
+protected:
+ /** close the current private view and open a new replacement */
+ void* remapPrivateView(void* oldPrivateAddr);
+};
+
+/** p is called from within a mutex that MongoFile uses. so be careful not to deadlock. */
+template <class F>
+inline void MongoFile::forEach(F p) {
+ LockMongoFilesShared lklk;
+ const std::set<MongoFile*>& mmfiles = MongoFile::getAllFiles();
+ for (std::set<MongoFile*>::const_iterator i = mmfiles.begin(); i != mmfiles.end(); i++)
+ p(*i);
+}
+
+} // namespace mongo
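
getEra() above enables a cheap "did anything change?" check: every LockMongoFilesExclusive construction bumps the counter, so a reader that records the era under a shared lock can later compare values instead of re-validating every mapping. A minimal sketch of the idea, using std::shared_mutex as a stand-in for RWLockRecursiveNongreedy; names are illustrative.

#include <shared_mutex>

class EraLock {
public:
    unsigned getEra() const {
        return _era;  // meaningful only while the shared lock is held
    }
    void lockShared() {
        _m.lock_shared();
    }
    void unlockShared() {
        _m.unlock_shared();
    }
    void lockExclusive() {
        _m.lock();
        ++_era;  // mappings may come or go while held exclusively
    }
    void unlockExclusive() {
        _m.unlock();
    }

private:
    std::shared_mutex _m;
    unsigned _era = 0;
};

// Usage: under one shared lock, remember getEra(); if a later shared lock sees
// the same value, no exclusive holder ran in between, so state cached during
// the first critical section is still valid.
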
diff --git a/src/mongo/db/storage/mmap_v1/mmap_posix.cpp b/src/mongo/db/storage/mmap_v1/mmap_posix.cpp
index f7dffae468f..a673d3e5fde 100644
--- a/src/mongo/db/storage/mmap_v1/mmap_posix.cpp
+++ b/src/mongo/db/storage/mmap_v1/mmap_posix.cpp
@@ -53,38 +53,37 @@ using std::vector;
using namespace mongoutils;
namespace {
- mongo::AtomicUInt64 mmfNextId(0);
+mongo::AtomicUInt64 mmfNextId(0);
}
namespace mongo {
- static size_t fetchMinOSPageSizeBytes() {
- size_t minOSPageSizeBytes = sysconf( _SC_PAGESIZE );
- minOSPageSizeBytesTest(minOSPageSizeBytes);
- return minOSPageSizeBytes;
- }
- const size_t g_minOSPageSizeBytes = fetchMinOSPageSizeBytes();
-
-
-
- MemoryMappedFile::MemoryMappedFile() : _uniqueId(mmfNextId.fetchAndAdd(1)) {
- fd = 0;
- maphandle = 0;
- len = 0;
- created();
- }
+static size_t fetchMinOSPageSizeBytes() {
+ size_t minOSPageSizeBytes = sysconf(_SC_PAGESIZE);
+ minOSPageSizeBytesTest(minOSPageSizeBytes);
+ return minOSPageSizeBytes;
+}
+const size_t g_minOSPageSizeBytes = fetchMinOSPageSizeBytes();
- void MemoryMappedFile::close() {
- LockMongoFilesShared::assertExclusivelyLocked();
- for( vector<void*>::iterator i = views.begin(); i != views.end(); i++ ) {
- munmap(*i,len);
- }
- views.clear();
- if ( fd )
- ::close(fd);
- fd = 0;
- destroyed(); // cleans up from the master list of mmaps
+MemoryMappedFile::MemoryMappedFile() : _uniqueId(mmfNextId.fetchAndAdd(1)) {
+ fd = 0;
+ maphandle = 0;
+ len = 0;
+ created();
+}
+
+void MemoryMappedFile::close() {
+ LockMongoFilesShared::assertExclusivelyLocked();
+ for (vector<void*>::iterator i = views.begin(); i != views.end(); i++) {
+ munmap(*i, len);
}
+ views.clear();
+
+ if (fd)
+ ::close(fd);
+ fd = 0;
+ destroyed(); // cleans up from the master list of mmaps
+}
#ifndef O_NOATIME
#define O_NOATIME (0)
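
fetchMinOSPageSizeBytes above queries the page size once during static initialization and validates it via minOSPageSizeBytesTest; the power-of-two check in that test relies on the classic x & (x - 1) == 0 identity. A minimal sketch, assuming POSIX sysconf:

#include <unistd.h>

#include <cassert>
#include <cstddef>

size_t fetchPageSize() {
    long ps = sysconf(_SC_PAGESIZE);
    assert(ps > 0);
    size_t pageSize = static_cast<size_t>(ps);
    // A power of two has exactly one set bit, so clearing the lowest set bit
    // with x & (x - 1) must yield zero.
    assert((pageSize & (pageSize - 1)) == 0);
    return pageSize;
}
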
@@ -94,231 +93,234 @@ namespace mongo {
#define MAP_NORESERVE (0)
#endif
- namespace {
- void* _pageAlign( void* p ) {
- return (void*)((int64_t)p & ~(g_minOSPageSizeBytes-1));
+namespace {
+void* _pageAlign(void* p) {
+ return (void*)((int64_t)p & ~(g_minOSPageSizeBytes - 1));
+}
+
+class PageAlignTest : public StartupTest {
+public:
+ void run() {
+ {
+ int64_t x = g_minOSPageSizeBytes + 123;
+ void* y = _pageAlign(reinterpret_cast<void*>(x));
+ invariant(g_minOSPageSizeBytes == reinterpret_cast<size_t>(y));
}
+ {
+ int64_t a = static_cast<uint64_t>(numeric_limits<int>::max());
+ a = a / g_minOSPageSizeBytes;
+ a = a * g_minOSPageSizeBytes;
+ // a should now be page aligned
- class PageAlignTest : public StartupTest {
- public:
- void run() {
- {
- int64_t x = g_minOSPageSizeBytes + 123;
- void* y = _pageAlign( reinterpret_cast<void*>( x ) );
- invariant( g_minOSPageSizeBytes == reinterpret_cast<size_t>(y) );
- }
- {
- int64_t a = static_cast<uint64_t>( numeric_limits<int>::max() );
- a = a / g_minOSPageSizeBytes;
- a = a * g_minOSPageSizeBytes;
- // a should now be page aligned
-
- // b is not page aligned
- int64_t b = a + 123;
-
- void* y = _pageAlign( reinterpret_cast<void*>( b ) );
- invariant( a == reinterpret_cast<int64_t>(y) );
- }
+ // b is not page aligned
+ int64_t b = a + 123;
- }
- } pageAlignTest;
+ void* y = _pageAlign(reinterpret_cast<void*>(b));
+ invariant(a == reinterpret_cast<int64_t>(y));
+ }
}
+} pageAlignTest;
+}
#if defined(__sun)
- MAdvise::MAdvise(void *,unsigned, Advice) { }
- MAdvise::~MAdvise() { }
+MAdvise::MAdvise(void*, unsigned, Advice) {}
+MAdvise::~MAdvise() {}
#else
- MAdvise::MAdvise(void *p, unsigned len, Advice a) {
+MAdvise::MAdvise(void* p, unsigned len, Advice a) {
+ _p = _pageAlign(p);
- _p = _pageAlign( p );
+ _len = len + static_cast<unsigned>(reinterpret_cast<size_t>(p) - reinterpret_cast<size_t>(_p));
- _len = len + static_cast<unsigned>( reinterpret_cast<size_t>(p) -
- reinterpret_cast<size_t>(_p) );
-
- int advice = 0;
- switch ( a ) {
+ int advice = 0;
+ switch (a) {
case Sequential:
advice = MADV_SEQUENTIAL;
break;
case Random:
advice = MADV_RANDOM;
break;
- }
-
- if ( madvise(_p,_len,advice ) ) {
- error() << "madvise failed: " << errnoWithDescription();
- }
-
}
- MAdvise::~MAdvise() {
- madvise(_p,_len,MADV_NORMAL);
+
+ if (madvise(_p, _len, advice)) {
+ error() << "madvise failed: " << errnoWithDescription();
}
+}
+MAdvise::~MAdvise() {
+ madvise(_p, _len, MADV_NORMAL);
+}
#endif
- void* MemoryMappedFile::map(const char *filename, unsigned long long &length, int options) {
- // length may be updated by callee.
- setFilename(filename);
- FileAllocator::get()->allocateAsap( filename, length );
- len = length;
+void* MemoryMappedFile::map(const char* filename, unsigned long long& length, int options) {
+ // length may be updated by callee.
+ setFilename(filename);
+ FileAllocator::get()->allocateAsap(filename, length);
+ len = length;
- massert( 10446 , str::stream() << "mmap: can't map area of size 0 file: " << filename, length > 0 );
+ massert(
+ 10446, str::stream() << "mmap: can't map area of size 0 file: " << filename, length > 0);
- fd = open(filename, O_RDWR | O_NOATIME);
- if ( fd <= 0 ) {
- log() << "couldn't open " << filename << ' ' << errnoWithDescription() << endl;
- fd = 0; // our sentinel for not opened
- return 0;
- }
+ fd = open(filename, O_RDWR | O_NOATIME);
+ if (fd <= 0) {
+ log() << "couldn't open " << filename << ' ' << errnoWithDescription() << endl;
+ fd = 0; // our sentinel for not opened
+ return 0;
+ }
- unsigned long long filelen = lseek(fd, 0, SEEK_END);
- uassert(10447, str::stream() << "map file alloc failed, wanted: " << length << " filelen: " << filelen << ' ' << sizeof(size_t), filelen == length );
- lseek( fd, 0, SEEK_SET );
-
- void * view = mmap(NULL, length, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
- if ( view == MAP_FAILED ) {
- error() << " mmap() failed for " << filename << " len:" << length << " " << errnoWithDescription() << endl;
- if ( errno == ENOMEM ) {
- if( sizeof(void*) == 4 )
- error() << "mmap failed with out of memory. You are using a 32-bit build and probably need to upgrade to 64" << endl;
- else
- error() << "mmap failed with out of memory. (64 bit build)" << endl;
- }
- return 0;
+ unsigned long long filelen = lseek(fd, 0, SEEK_END);
+ uassert(10447,
+ str::stream() << "map file alloc failed, wanted: " << length << " filelen: " << filelen
+ << ' ' << sizeof(size_t),
+ filelen == length);
+ lseek(fd, 0, SEEK_SET);
+
+ void* view = mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ if (view == MAP_FAILED) {
+ error() << " mmap() failed for " << filename << " len:" << length << " "
+ << errnoWithDescription() << endl;
+ if (errno == ENOMEM) {
+ if (sizeof(void*) == 4)
+ error() << "mmap failed with out of memory. You are using a 32-bit build and "
+ "probably need to upgrade to 64" << endl;
+ else
+ error() << "mmap failed with out of memory. (64 bit build)" << endl;
}
+ return 0;
+ }
#if defined(__sun)
#warning madvise not supported on solaris yet
#else
- if ( options & SEQUENTIAL ) {
- if ( madvise( view , length , MADV_SEQUENTIAL ) ) {
- warning() << "map: madvise failed for " << filename << ' ' << errnoWithDescription() << endl;
- }
+ if (options & SEQUENTIAL) {
+ if (madvise(view, length, MADV_SEQUENTIAL)) {
+ warning() << "map: madvise failed for " << filename << ' ' << errnoWithDescription()
+ << endl;
}
+ }
#endif
- views.push_back( view );
+ views.push_back(view);
- return view;
- }
+ return view;
+}
- void* MemoryMappedFile::createReadOnlyMap() {
- void * x = mmap( /*start*/0 , len , PROT_READ , MAP_SHARED , fd , 0 );
- if( x == MAP_FAILED ) {
- if ( errno == ENOMEM ) {
- if( sizeof(void*) == 4 )
- error() << "mmap ro failed with out of memory. You are using a 32-bit build and probably need to upgrade to 64" << endl;
- else
- error() << "mmap ro failed with out of memory. (64 bit build)" << endl;
- }
- return 0;
+void* MemoryMappedFile::createReadOnlyMap() {
+ void* x = mmap(/*start*/ 0, len, PROT_READ, MAP_SHARED, fd, 0);
+ if (x == MAP_FAILED) {
+ if (errno == ENOMEM) {
+ if (sizeof(void*) == 4)
+ error() << "mmap ro failed with out of memory. You are using a 32-bit build and "
+ "probably need to upgrade to 64" << endl;
+ else
+ error() << "mmap ro failed with out of memory. (64 bit build)" << endl;
}
- return x;
+ return 0;
}
+ return x;
+}
- void* MemoryMappedFile::createPrivateMap() {
- void * x = mmap( /*start*/0 , len , PROT_READ|PROT_WRITE , MAP_PRIVATE|MAP_NORESERVE , fd , 0 );
- if( x == MAP_FAILED ) {
- if ( errno == ENOMEM ) {
- if( sizeof(void*) == 4 ) {
- error() << "mmap private failed with out of memory. You are using a 32-bit build and probably need to upgrade to 64" << endl;
- }
- else {
- error() << "mmap private failed with out of memory. (64 bit build)" << endl;
- }
+void* MemoryMappedFile::createPrivateMap() {
+ void* x = mmap(/*start*/ 0, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_NORESERVE, fd, 0);
+ if (x == MAP_FAILED) {
+ if (errno == ENOMEM) {
+ if (sizeof(void*) == 4) {
+ error() << "mmap private failed with out of memory. You are using a 32-bit build "
+ "and probably need to upgrade to 64" << endl;
+ } else {
+ error() << "mmap private failed with out of memory. (64 bit build)" << endl;
}
- else {
- error() << "mmap private failed " << errnoWithDescription() << endl;
- }
- return 0;
+ } else {
+ error() << "mmap private failed " << errnoWithDescription() << endl;
}
-
- views.push_back(x);
- return x;
+ return 0;
}
- void* MemoryMappedFile::remapPrivateView(void *oldPrivateAddr) {
-#if defined(__sun) // SERVER-8795
- LockMongoFilesExclusive lockMongoFiles;
+ views.push_back(x);
+ return x;
+}
+
+void* MemoryMappedFile::remapPrivateView(void* oldPrivateAddr) {
+#if defined(__sun) // SERVER-8795
+ LockMongoFilesExclusive lockMongoFiles;
#endif
- // don't unmap, just mmap over the old region
- void * x = mmap( oldPrivateAddr, len , PROT_READ|PROT_WRITE , MAP_PRIVATE|MAP_NORESERVE|MAP_FIXED , fd , 0 );
- if( x == MAP_FAILED ) {
- int err = errno;
- error() << "13601 Couldn't remap private view: " << errnoWithDescription(err) << endl;
- log() << "aborting" << endl;
- printMemInfo();
- abort();
- }
- verify( x == oldPrivateAddr );
- return x;
+ // don't unmap, just mmap over the old region
+ void* x = mmap(oldPrivateAddr,
+ len,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_NORESERVE | MAP_FIXED,
+ fd,
+ 0);
+ if (x == MAP_FAILED) {
+ int err = errno;
+ error() << "13601 Couldn't remap private view: " << errnoWithDescription(err) << endl;
+ log() << "aborting" << endl;
+ printMemInfo();
+ abort();
}
+ verify(x == oldPrivateAddr);
+ return x;
+}
- void MemoryMappedFile::flush(bool sync) {
- if ( views.empty() || fd == 0 )
- return;
+void MemoryMappedFile::flush(bool sync) {
+ if (views.empty() || fd == 0)
+ return;
- bool useFsync = sync && !ProcessInfo::preferMsyncOverFSync();
+ bool useFsync = sync && !ProcessInfo::preferMsyncOverFSync();
- if ( useFsync ?
- fsync(fd) != 0 :
- msync(viewForFlushing(), len, sync ? MS_SYNC : MS_ASYNC) ) {
- // msync failed, this is very bad
- log() << (useFsync ? "fsync failed: " : "msync failed: ") << errnoWithDescription()
- << " file: " << filename() << endl;
- dataSyncFailedHandler();
- }
+ if (useFsync ? fsync(fd) != 0 : msync(viewForFlushing(), len, sync ? MS_SYNC : MS_ASYNC)) {
+ // msync failed, this is very bad
+ log() << (useFsync ? "fsync failed: " : "msync failed: ") << errnoWithDescription()
+ << " file: " << filename() << endl;
+ dataSyncFailedHandler();
}
+}
- class PosixFlushable : public MemoryMappedFile::Flushable {
- public:
- PosixFlushable( MemoryMappedFile* theFile, void* view , HANDLE fd , long len)
- : _theFile( theFile ), _view( view ), _fd(fd), _len(len), _id(_theFile->getUniqueId()) {
- }
-
- void flush() {
- if ( _view == NULL || _fd == 0 )
- return;
-
- if ( ProcessInfo::preferMsyncOverFSync() ?
- msync(_view, _len, MS_SYNC ) == 0 :
- fsync(_fd) == 0 ) {
- return;
- }
+class PosixFlushable : public MemoryMappedFile::Flushable {
+public:
+ PosixFlushable(MemoryMappedFile* theFile, void* view, HANDLE fd, long len)
+ : _theFile(theFile), _view(view), _fd(fd), _len(len), _id(_theFile->getUniqueId()) {}
- if ( errno == EBADF ) {
- // ok, we were unlocked, so this file was closed
- return;
- }
+ void flush() {
+ if (_view == NULL || _fd == 0)
+ return;
- // some error, lets see if we're supposed to exist
- LockMongoFilesShared mmfilesLock;
- std::set<MongoFile*> mmfs = MongoFile::getAllFiles();
- std::set<MongoFile*>::const_iterator it = mmfs.find(_theFile);
- if ( (it == mmfs.end()) || ((*it)->getUniqueId() != _id) ) {
- log() << "msync failed with: " << errnoWithDescription()
- << " but file doesn't exist anymore, so ignoring";
- // this was deleted while we were unlocked
- return;
- }
+ if (ProcessInfo::preferMsyncOverFSync() ? msync(_view, _len, MS_SYNC) == 0
+ : fsync(_fd) == 0) {
+ return;
+ }
- // we got an error, and we still exist, so this is bad, we fail
- log() << "msync " << errnoWithDescription() << endl;
- dataSyncFailedHandler();
+ if (errno == EBADF) {
+ // ok, we were unlocked, so this file was closed
+ return;
}
- MemoryMappedFile* _theFile;
- void * _view;
- HANDLE _fd;
- long _len;
- const uint64_t _id;
- };
+ // some error, lets see if we're supposed to exist
+ LockMongoFilesShared mmfilesLock;
+ std::set<MongoFile*> mmfs = MongoFile::getAllFiles();
+ std::set<MongoFile*>::const_iterator it = mmfs.find(_theFile);
+ if ((it == mmfs.end()) || ((*it)->getUniqueId() != _id)) {
+ log() << "msync failed with: " << errnoWithDescription()
+ << " but file doesn't exist anymore, so ignoring";
+ // this was deleted while we were unlocked
+ return;
+ }
- MemoryMappedFile::Flushable * MemoryMappedFile::prepareFlush() {
- return new PosixFlushable( this, viewForFlushing(), fd, len);
+ // we got an error, and we still exist, so this is bad, we fail
+ log() << "msync " << errnoWithDescription() << endl;
+ dataSyncFailedHandler();
}
+ MemoryMappedFile* _theFile;
+ void* _view;
+ HANDLE _fd;
+ long _len;
+ const uint64_t _id;
+};
+
+MemoryMappedFile::Flushable* MemoryMappedFile::prepareFlush() {
+ return new PosixFlushable(this, viewForFlushing(), fd, len);
+}
-} // namespace mongo
+} // namespace mongo
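
_pageAlign above rounds a pointer down to a page boundary by masking its low bits, which is valid only because the page size is a power of two; MAdvise then widens the length by however far the pointer moved so the advised range still covers the caller's bytes. A minimal sketch of that arithmetic; kPageSize is an assumed 4096 where the real code uses sysconf(_SC_PAGESIZE).

#include <cstddef>
#include <cstdint>

const size_t kPageSize = 4096;  // assumed; must be a power of two

void* pageAlign(void* p) {
    // Clearing the low bits rounds down to the enclosing page boundary.
    return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(p) &
                                   ~static_cast<uintptr_t>(kPageSize - 1));
}

size_t widenToCover(void* p, size_t len, void** baseOut) {
    void* base = pageAlign(p);
    *baseOut = base;
    // Grow len by the distance the pointer was rounded back, so the range
    // [base, base + result) still contains every byte of [p, p + len).
    return len + static_cast<size_t>(reinterpret_cast<uintptr_t>(p) -
                                     reinterpret_cast<uintptr_t>(base));
}
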
diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp b/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp
index 6cc9d9cef73..8cdbd4ad7a4 100644
--- a/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp
+++ b/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp
@@ -55,844 +55,782 @@
namespace mongo {
- using std::unique_ptr;
+using std::unique_ptr;
namespace {
- /**
- * Declaration for the "newCollectionsUsePowerOf2Sizes" server parameter, which is now
- * deprecated in 3.0.
- * Note that:
- * - setting to true performs a no-op.
- * - setting to false will fail.
- */
- class NewCollectionsUsePowerOf2SizesParameter : public ExportedServerParameter<bool> {
- public:
- NewCollectionsUsePowerOf2SizesParameter() :
- ExportedServerParameter<bool>(ServerParameterSet::getGlobal(),
- "newCollectionsUsePowerOf2Sizes",
- &newCollectionsUsePowerOf2SizesFlag,
- true,
- true),
- newCollectionsUsePowerOf2SizesFlag(true) {
-
+/**
+ * Declaration for the "newCollectionsUsePowerOf2Sizes" server parameter, which is now
+ * deprecated in 3.0.
+ * Note that:
+ * - setting to true performs a no-op.
+ * - setting to false will fail.
+ */
+class NewCollectionsUsePowerOf2SizesParameter : public ExportedServerParameter<bool> {
+public:
+ NewCollectionsUsePowerOf2SizesParameter()
+ : ExportedServerParameter<bool>(ServerParameterSet::getGlobal(),
+ "newCollectionsUsePowerOf2Sizes",
+ &newCollectionsUsePowerOf2SizesFlag,
+ true,
+ true),
+ newCollectionsUsePowerOf2SizesFlag(true) {}
+
+ virtual Status validate(const bool& potentialNewValue) {
+ if (!potentialNewValue) {
+ return Status(ErrorCodes::BadValue,
+ "newCollectionsUsePowerOf2Sizes cannot be set to false. "
+ "Use noPadding instead during createCollection.");
}
- virtual Status validate(const bool& potentialNewValue) {
- if (!potentialNewValue) {
- return Status(ErrorCodes::BadValue,
- "newCollectionsUsePowerOf2Sizes cannot be set to false. "
- "Use noPadding instead during createCollection.");
- }
+ return Status::OK();
+ }
- return Status::OK();
- }
+private:
+ // Unused, needed for server parameter.
+ bool newCollectionsUsePowerOf2SizesFlag;
- private:
- // Unused, needed for server parameter.
- bool newCollectionsUsePowerOf2SizesFlag;
+} exportedNewCollectionsUsePowerOf2SizesParameter;
- } exportedNewCollectionsUsePowerOf2SizesParameter;
+int _massageExtentSize(const ExtentManager* em, long long size) {
+ if (size < em->minSize())
+ return em->minSize();
+ if (size > em->maxSize())
+ return em->maxSize();
- int _massageExtentSize(const ExtentManager* em, long long size) {
- if (size < em->minSize())
- return em->minSize();
- if (size > em->maxSize())
- return em->maxSize();
+ return static_cast<int>(size);
+}
- return static_cast<int>(size);
- }
+} // namespace
-} // namespace
+/**
+ * Registers the insertion of a new entry in the _collections cache with the RecoveryUnit,
+ * allowing for rollback.
+ */
+class MMAPV1DatabaseCatalogEntry::EntryInsertion : public RecoveryUnit::Change {
+public:
+ EntryInsertion(StringData ns, MMAPV1DatabaseCatalogEntry* entry)
+ : _ns(ns.toString()), _entry(entry) {}
- /**
- * Registers the insertion of a new entry in the _collections cache with the RecoveryUnit,
- * allowing for rollback.
- */
- class MMAPV1DatabaseCatalogEntry::EntryInsertion : public RecoveryUnit::Change {
- public:
- EntryInsertion(StringData ns, MMAPV1DatabaseCatalogEntry* entry)
- : _ns(ns.toString()), _entry(entry) { }
+ void rollback() {
+ _entry->_removeFromCache(NULL, _ns);
+ }
- void rollback() {
- _entry->_removeFromCache(NULL, _ns);
- }
+ void commit() {}
- void commit() { }
- private:
- const std::string _ns;
- MMAPV1DatabaseCatalogEntry* const _entry;
- };
-
- /**
- * Registers the removal of an entry from the _collections cache with the RecoveryUnit,
- * delaying actual deletion of the information until the change is commited. This allows
- * for easy rollback.
- */
- class MMAPV1DatabaseCatalogEntry::EntryRemoval : public RecoveryUnit::Change {
- public:
- // Rollback removing the collection from the cache. Takes ownership of the cachedEntry,
- // and will delete it if removal is final.
- EntryRemoval(StringData ns,
- MMAPV1DatabaseCatalogEntry* catalogEntry,
- Entry *cachedEntry)
- : _ns(ns.toString()), _catalogEntry(catalogEntry), _cachedEntry(cachedEntry) { }
-
- void rollback() {
- _catalogEntry->_collections[_ns] = _cachedEntry;
- }
+private:
+ const std::string _ns;
+ MMAPV1DatabaseCatalogEntry* const _entry;
+};
- void commit() {
- delete _cachedEntry;
- }
+/**
+ * Registers the removal of an entry from the _collections cache with the RecoveryUnit,
+ * delaying actual deletion of the information until the change is committed. This allows
+ * for easy rollback.
+ */
+class MMAPV1DatabaseCatalogEntry::EntryRemoval : public RecoveryUnit::Change {
+public:
+ // Rollback removing the collection from the cache. Takes ownership of the cachedEntry,
+ // and will delete it if removal is final.
+ EntryRemoval(StringData ns, MMAPV1DatabaseCatalogEntry* catalogEntry, Entry* cachedEntry)
+ : _ns(ns.toString()), _catalogEntry(catalogEntry), _cachedEntry(cachedEntry) {}
+
+ void rollback() {
+ _catalogEntry->_collections[_ns] = _cachedEntry;
+ }
- private:
- const std::string _ns;
- MMAPV1DatabaseCatalogEntry* const _catalogEntry;
- Entry* const _cachedEntry;
- };
-
- MMAPV1DatabaseCatalogEntry::MMAPV1DatabaseCatalogEntry( OperationContext* txn,
- StringData name,
- StringData path,
- bool directoryPerDB,
- bool transient )
- : DatabaseCatalogEntry( name ),
- _path( path.toString() ),
- _namespaceIndex(_path, name.toString()),
- _extentManager(name, path, directoryPerDB) {
-
- invariant(txn->lockState()->isDbLockedForMode(name, MODE_X));
-
- try {
- // First init the .ns file. If this fails, we may leak the .ns file, but this is OK
- // because subsequent openDB will go through this code path again.
- _namespaceIndex.init(txn);
-
- // Initialize the extent manager. This will create the first data file (.0) if needed
- // and if this fails we would leak the .ns file above. Leaking the .ns or .0 file is
- // acceptable, because subsequent openDB calls will exercise the code path again.
- Status s = _extentManager.init(txn);
- if (!s.isOK()) {
- msgasserted(16966, str::stream() << "_extentManager.init failed: " << s.toString());
- }
+ void commit() {
+ delete _cachedEntry;
+ }
- // This is the actual loading of the on-disk structures into cache.
- _init( txn );
- }
- catch (const DBException& dbe) {
- warning() << "database " << path << " " << name
- << " could not be opened due to DBException " << dbe.getCode() << ": "
- << dbe.what();
- throw;
- }
- catch (const std::exception& e) {
- warning() << "database " << path << " " << name
- << " could not be opened " << e.what();
- throw;
- }
+private:
+ const std::string _ns;
+ MMAPV1DatabaseCatalogEntry* const _catalogEntry;
+ Entry* const _cachedEntry;
+};
+
+MMAPV1DatabaseCatalogEntry::MMAPV1DatabaseCatalogEntry(
+ OperationContext* txn, StringData name, StringData path, bool directoryPerDB, bool transient)
+ : DatabaseCatalogEntry(name),
+ _path(path.toString()),
+ _namespaceIndex(_path, name.toString()),
+ _extentManager(name, path, directoryPerDB) {
+ invariant(txn->lockState()->isDbLockedForMode(name, MODE_X));
+
+ try {
+ // First init the .ns file. If this fails, we may leak the .ns file, but this is OK
+ // because subsequent openDB will go through this code path again.
+ _namespaceIndex.init(txn);
+
+ // Initialize the extent manager. This will create the first data file (.0) if needed
+ // and if this fails we would leak the .ns file above. Leaking the .ns or .0 file is
+ // acceptable, because subsequent openDB calls will exercise the code path again.
+ Status s = _extentManager.init(txn);
+ if (!s.isOK()) {
+ msgasserted(16966, str::stream() << "_extentManager.init failed: " << s.toString());
+ }
+
+ // This is the actual loading of the on-disk structures into cache.
+ _init(txn);
+ } catch (const DBException& dbe) {
+ warning() << "database " << path << " " << name
+ << " could not be opened due to DBException " << dbe.getCode() << ": "
+ << dbe.what();
+ throw;
+ } catch (const std::exception& e) {
+ warning() << "database " << path << " " << name << " could not be opened " << e.what();
+ throw;
}
+}
- MMAPV1DatabaseCatalogEntry::~MMAPV1DatabaseCatalogEntry() {
- for ( CollectionMap::const_iterator i = _collections.begin();
- i != _collections.end();
- ++i ) {
- delete i->second;
- }
- _collections.clear();
+MMAPV1DatabaseCatalogEntry::~MMAPV1DatabaseCatalogEntry() {
+ for (CollectionMap::const_iterator i = _collections.begin(); i != _collections.end(); ++i) {
+ delete i->second;
}
+ _collections.clear();
+}
- intmax_t dbSize( const string& database ); // from repair_database.cpp
+intmax_t dbSize(const string& database); // from repair_database.cpp
- int64_t MMAPV1DatabaseCatalogEntry::sizeOnDisk( OperationContext* opCtx ) const {
- return static_cast<int64_t>( dbSize( name() ) );
+int64_t MMAPV1DatabaseCatalogEntry::sizeOnDisk(OperationContext* opCtx) const {
+ return static_cast<int64_t>(dbSize(name()));
+}
+
+void MMAPV1DatabaseCatalogEntry::_removeFromCache(RecoveryUnit* ru, StringData ns) {
+ CollectionMap::iterator i = _collections.find(ns.toString());
+ if (i == _collections.end()) {
+ return;
}
- void MMAPV1DatabaseCatalogEntry::_removeFromCache(RecoveryUnit* ru,
- StringData ns) {
- CollectionMap::iterator i = _collections.find(ns.toString());
- if (i == _collections.end()) {
- return;
- }
+ // If there is an operation context, register a rollback to restore the cache entry
+ if (ru) {
+ ru->registerChange(new EntryRemoval(ns, this, i->second));
+ } else {
+ delete i->second;
+ }
+ _collections.erase(i);
+}
- // If there is an operation context, register a rollback to restore the cache entry
- if (ru) {
- ru->registerChange(new EntryRemoval(ns, this, i->second));
- }
- else {
- delete i->second;
- }
- _collections.erase(i);
+Status MMAPV1DatabaseCatalogEntry::dropCollection(OperationContext* txn, StringData ns) {
+ invariant(txn->lockState()->isCollectionLockedForMode(ns, MODE_X));
+ _removeFromCache(txn->recoveryUnit(), ns);
+
+ NamespaceDetails* details = _namespaceIndex.details(ns);
+
+ if (!details) {
+ return Status(ErrorCodes::NamespaceNotFound, str::stream() << "ns not found: " << ns);
}
- Status MMAPV1DatabaseCatalogEntry::dropCollection(OperationContext* txn, StringData ns) {
- invariant(txn->lockState()->isCollectionLockedForMode(ns, MODE_X));
- _removeFromCache(txn->recoveryUnit(), ns);
+ invariant(details->nIndexes == 0); // TODO: delete instead?
+ invariant(details->indexBuildsInProgress == 0); // TODO: delete instead?
- NamespaceDetails* details = _namespaceIndex.details( ns );
+ _removeNamespaceFromNamespaceCollection(txn, ns);
- if ( !details ) {
- return Status( ErrorCodes::NamespaceNotFound, str::stream() << "ns not found: " << ns );
- }
+ // free extents
+ if (!details->firstExtent.isNull()) {
+ _extentManager.freeExtents(txn, details->firstExtent, details->lastExtent);
+ *txn->recoveryUnit()->writing(&details->firstExtent) = DiskLoc().setInvalid();
+ *txn->recoveryUnit()->writing(&details->lastExtent) = DiskLoc().setInvalid();
+ }
- invariant( details->nIndexes == 0 ); // TODO: delete instead?
- invariant( details->indexBuildsInProgress == 0 ); // TODO: delete instead?
+ // remove from the catalog hashtable
+ _namespaceIndex.kill_ns(txn, ns);
- _removeNamespaceFromNamespaceCollection( txn, ns );
+ return Status::OK();
+}
- // free extents
- if( !details->firstExtent.isNull() ) {
- _extentManager.freeExtents(txn, details->firstExtent, details->lastExtent);
- *txn->recoveryUnit()->writing( &details->firstExtent ) = DiskLoc().setInvalid();
- *txn->recoveryUnit()->writing( &details->lastExtent ) = DiskLoc().setInvalid();
- }
- // remove from the catalog hashtable
- _namespaceIndex.kill_ns( txn, ns );
+Status MMAPV1DatabaseCatalogEntry::renameCollection(OperationContext* txn,
+ StringData fromNS,
+ StringData toNS,
+ bool stayTemp) {
+ Status s = _renameSingleNamespace(txn, fromNS, toNS, stayTemp);
+ if (!s.isOK())
+ return s;
- return Status::OK();
- }
+ NamespaceDetails* details = _namespaceIndex.details(toNS);
+ invariant(details);
+ RecordStoreV1Base* systemIndexRecordStore = _getIndexRecordStore();
+ auto cursor = systemIndexRecordStore->getCursor(txn);
+ while (auto record = cursor->next()) {
+ BSONObj oldIndexSpec = record->data.releaseToBson();
+ if (fromNS != oldIndexSpec["ns"].valuestrsafe())
+ continue;
- Status MMAPV1DatabaseCatalogEntry::renameCollection( OperationContext* txn,
- StringData fromNS,
- StringData toNS,
- bool stayTemp ) {
- Status s = _renameSingleNamespace( txn, fromNS, toNS, stayTemp );
- if ( !s.isOK() )
- return s;
-
- NamespaceDetails* details = _namespaceIndex.details( toNS );
- invariant( details );
-
- RecordStoreV1Base* systemIndexRecordStore = _getIndexRecordStore();
- auto cursor = systemIndexRecordStore->getCursor(txn);
- while (auto record = cursor->next()) {
- BSONObj oldIndexSpec = record->data.releaseToBson();
- if ( fromNS != oldIndexSpec["ns"].valuestrsafe() )
- continue;
-
- BSONObj newIndexSpec;
- {
- BSONObjBuilder b;
- BSONObjIterator i( oldIndexSpec );
- while( i.more() ) {
- BSONElement e = i.next();
- if ( strcmp( e.fieldName(), "ns" ) != 0 )
- b.append( e );
- else
- b << "ns" << toNS;
- }
- newIndexSpec = b.obj();
+ BSONObj newIndexSpec;
+ {
+ BSONObjBuilder b;
+ BSONObjIterator i(oldIndexSpec);
+ while (i.more()) {
+ BSONElement e = i.next();
+ if (strcmp(e.fieldName(), "ns") != 0)
+ b.append(e);
+ else
+ b << "ns" << toNS;
}
+ newIndexSpec = b.obj();
+ }
- StatusWith<RecordId> newIndexSpecLoc =
- systemIndexRecordStore->insertRecord( txn,
- newIndexSpec.objdata(),
- newIndexSpec.objsize(),
- false );
- if ( !newIndexSpecLoc.isOK() )
- return newIndexSpecLoc.getStatus();
-
- const string& indexName = oldIndexSpec.getStringField( "name" );
-
- {
- // fix IndexDetails pointer
- NamespaceDetailsCollectionCatalogEntry ce( toNS,
- details,
- _getNamespaceRecordStore(),
- systemIndexRecordStore,
- this );
- int indexI = ce._findIndexNumber( txn, indexName );
-
- IndexDetails& indexDetails = details->idx(indexI);
- *txn->recoveryUnit()->writing(&indexDetails.info) =
- DiskLoc::fromRecordId(newIndexSpecLoc.getValue());
- }
+ StatusWith<RecordId> newIndexSpecLoc = systemIndexRecordStore->insertRecord(
+ txn, newIndexSpec.objdata(), newIndexSpec.objsize(), false);
+ if (!newIndexSpecLoc.isOK())
+ return newIndexSpecLoc.getStatus();
- {
- // move underlying namespac
- string oldIndexNs = IndexDescriptor::makeIndexNamespace( fromNS, indexName );
- string newIndexNs = IndexDescriptor::makeIndexNamespace( toNS, indexName );
+ const string& indexName = oldIndexSpec.getStringField("name");
- Status s = _renameSingleNamespace( txn, oldIndexNs, newIndexNs, false );
- if ( !s.isOK() )
- return s;
- }
+ {
+ // fix IndexDetails pointer
+ NamespaceDetailsCollectionCatalogEntry ce(
+ toNS, details, _getNamespaceRecordStore(), systemIndexRecordStore, this);
+ int indexI = ce._findIndexNumber(txn, indexName);
- systemIndexRecordStore->deleteRecord( txn, record->id );
+ IndexDetails& indexDetails = details->idx(indexI);
+ *txn->recoveryUnit()->writing(&indexDetails.info) =
+ DiskLoc::fromRecordId(newIndexSpecLoc.getValue());
}
- return Status::OK();
+ {
+ // move the underlying index namespace
+ string oldIndexNs = IndexDescriptor::makeIndexNamespace(fromNS, indexName);
+ string newIndexNs = IndexDescriptor::makeIndexNamespace(toNS, indexName);
+
+ Status s = _renameSingleNamespace(txn, oldIndexNs, newIndexNs, false);
+ if (!s.isOK())
+ return s;
+ }
+
+ systemIndexRecordStore->deleteRecord(txn, record->id);
}
- Status MMAPV1DatabaseCatalogEntry::_renameSingleNamespace( OperationContext* txn,
- StringData fromNS,
- StringData toNS,
- bool stayTemp ) {
- // some sanity checking
- NamespaceDetails* fromDetails = _namespaceIndex.details( fromNS );
- if ( !fromDetails )
- return Status( ErrorCodes::BadValue, "from namespace doesn't exist" );
+ return Status::OK();
+}
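+
+// For example, renaming "db.foo" to "db.bar" rewrites every system.indexes
+// spec whose "ns" matches, e.g.
+//   { "ns": "db.foo", "name": "x_1", "key": { "x": 1 } }
+// is re-inserted as
+//   { "ns": "db.bar", "name": "x_1", "key": { "x": 1 } },
+// the IndexDetails.info DiskLoc is repointed at the new record, and the index
+// namespace "db.foo.$x_1" is itself renamed to "db.bar.$x_1" before the old
+// spec record is deleted.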
- if ( _namespaceIndex.details( toNS ) )
- return Status( ErrorCodes::BadValue, "to namespace already exists" );
+Status MMAPV1DatabaseCatalogEntry::_renameSingleNamespace(OperationContext* txn,
+ StringData fromNS,
+ StringData toNS,
+ bool stayTemp) {
+ // some sanity checking
+ NamespaceDetails* fromDetails = _namespaceIndex.details(fromNS);
+ if (!fromDetails)
+ return Status(ErrorCodes::BadValue, "from namespace doesn't exist");
- _removeFromCache(txn->recoveryUnit(), fromNS);
+ if (_namespaceIndex.details(toNS))
+ return Status(ErrorCodes::BadValue, "to namespace already exists");
- // at this point, we haven't done anything destructive yet
+ _removeFromCache(txn->recoveryUnit(), fromNS);
- // ----
- // actually start moving
- // ----
+ // at this point, we haven't done anything destructive yet
- // this could throw, but if it does we're ok
- _namespaceIndex.add_ns( txn, toNS, fromDetails );
- NamespaceDetails* toDetails = _namespaceIndex.details( toNS );
+ // ----
+ // actually start moving
+ // ----
- try {
- toDetails->copyingFrom(txn,
- toNS,
- _namespaceIndex,
- fromDetails); // fixes extraOffset
- }
- catch( DBException& ) {
- // could end up here if .ns is full - if so try to clean up / roll back a little
- _namespaceIndex.kill_ns( txn, toNS );
- throw;
- }
+ // this could throw, but if it does we're ok
+ _namespaceIndex.add_ns(txn, toNS, fromDetails);
+ NamespaceDetails* toDetails = _namespaceIndex.details(toNS);
- // at this point, code .ns stuff moved
+ try {
+ toDetails->copyingFrom(txn, toNS, _namespaceIndex, fromDetails); // fixes extraOffset
+ } catch (DBException&) {
+ // could end up here if .ns is full - if so try to clean up / roll back a little
+ _namespaceIndex.kill_ns(txn, toNS);
+ throw;
+ }
- _namespaceIndex.kill_ns( txn, fromNS );
- fromDetails = NULL;
+ // at this point, the .ns catalog metadata has been moved
- // fix system.namespaces
- BSONObj newSpec;
- RecordId oldSpecLocation;
- {
+ _namespaceIndex.kill_ns(txn, fromNS);
+ fromDetails = NULL;
- BSONObj oldSpec;
- {
- RecordStoreV1Base* rs = _getNamespaceRecordStore();
- auto cursor = rs->getCursor(txn);
- while (auto record = cursor->next()) {
- BSONObj entry = record->data.releaseToBson();
- if ( fromNS == entry["name"].String() ) {
- oldSpecLocation = record->id;
- oldSpec = entry.getOwned();
- break;
- }
+ // fix system.namespaces
+ BSONObj newSpec;
+ RecordId oldSpecLocation;
+ {
+ BSONObj oldSpec;
+ {
+ RecordStoreV1Base* rs = _getNamespaceRecordStore();
+ auto cursor = rs->getCursor(txn);
+ while (auto record = cursor->next()) {
+ BSONObj entry = record->data.releaseToBson();
+ if (fromNS == entry["name"].String()) {
+ oldSpecLocation = record->id;
+ oldSpec = entry.getOwned();
+ break;
}
}
- invariant( !oldSpec.isEmpty() );
- invariant( !oldSpecLocation.isNull() );
+ }
+ invariant(!oldSpec.isEmpty());
+ invariant(!oldSpecLocation.isNull());
- BSONObjBuilder b;
- BSONObjIterator i( oldSpec.getObjectField( "options" ) );
- while( i.more() ) {
- BSONElement e = i.next();
- if ( strcmp( e.fieldName(), "create" ) != 0 ) {
- if (stayTemp || (strcmp(e.fieldName(), "temp") != 0))
- b.append( e );
- }
- else {
- b << "create" << toNS;
- }
+ BSONObjBuilder b;
+ BSONObjIterator i(oldSpec.getObjectField("options"));
+ while (i.more()) {
+ BSONElement e = i.next();
+ if (strcmp(e.fieldName(), "create") != 0) {
+ if (stayTemp || (strcmp(e.fieldName(), "temp") != 0))
+ b.append(e);
+ } else {
+ b << "create" << toNS;
}
- newSpec = b.obj();
}
+ newSpec = b.obj();
+ }
- _addNamespaceToNamespaceCollection( txn, toNS, newSpec.isEmpty() ? 0 : &newSpec );
-
- _getNamespaceRecordStore()->deleteRecord( txn, oldSpecLocation );
+ _addNamespaceToNamespaceCollection(txn, toNS, newSpec.isEmpty() ? 0 : &newSpec);
- Entry*& entry = _collections[toNS.toString()];
- invariant( entry == NULL );
- txn->recoveryUnit()->registerChange(new EntryInsertion(toNS, this));
- entry = new Entry();
- _insertInCache(txn, toNS, entry);
+ _getNamespaceRecordStore()->deleteRecord(txn, oldSpecLocation);
- return Status::OK();
- }
+ Entry*& entry = _collections[toNS.toString()];
+ invariant(entry == NULL);
+ txn->recoveryUnit()->registerChange(new EntryInsertion(toNS, this));
+ entry = new Entry();
+ _insertInCache(txn, toNS, entry);
- void MMAPV1DatabaseCatalogEntry::appendExtraStats( OperationContext* opCtx,
- BSONObjBuilder* output,
- double scale ) const {
- if ( isEmpty() ) {
- output->appendNumber( "fileSize", 0 );
- }
- else {
- output->appendNumber( "fileSize", _extentManager.fileSize() / scale );
- output->appendNumber( "nsSizeMB", static_cast<int>( _namespaceIndex.fileLength() /
- ( 1024 * 1024 ) ) );
+ return Status::OK();
+}
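+
+// For example, with stayTemp=false an old system.namespaces record of
+//   { "name": "db.foo", "options": { "create": "db.foo", "temp": true } }
+// yields new options of { "create": "db.bar" } for the "db.bar" record:
+// "create" is rewritten to the target namespace and "temp" is dropped
+// unless stayTemp is true.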
- int freeListSize = 0;
- int64_t freeListSpace = 0;
- _extentManager.freeListStats(opCtx, &freeListSize, &freeListSpace);
+void MMAPV1DatabaseCatalogEntry::appendExtraStats(OperationContext* opCtx,
+ BSONObjBuilder* output,
+ double scale) const {
+ if (isEmpty()) {
+ output->appendNumber("fileSize", 0);
+ } else {
+ output->appendNumber("fileSize", _extentManager.fileSize() / scale);
+ output->appendNumber("nsSizeMB",
+ static_cast<int>(_namespaceIndex.fileLength() / (1024 * 1024)));
- BSONObjBuilder extentFreeList( output->subobjStart( "extentFreeList" ) );
- extentFreeList.append( "num", freeListSize );
- extentFreeList.appendNumber( "totalSize",
- static_cast<long long>( freeListSpace / scale ) );
- extentFreeList.done();
+ int freeListSize = 0;
+ int64_t freeListSpace = 0;
+ _extentManager.freeListStats(opCtx, &freeListSize, &freeListSpace);
- {
+ BSONObjBuilder extentFreeList(output->subobjStart("extentFreeList"));
+ extentFreeList.append("num", freeListSize);
+ extentFreeList.appendNumber("totalSize", static_cast<long long>(freeListSpace / scale));
+ extentFreeList.done();
- const DataFileVersion version = _extentManager.getFileFormat(opCtx);
+ {
+ const DataFileVersion version = _extentManager.getFileFormat(opCtx);
- BSONObjBuilder dataFileVersion( output->subobjStart( "dataFileVersion" ) );
- dataFileVersion.append( "major", version.majorRaw() );
- dataFileVersion.append( "minor", version.minorRaw() );
- dataFileVersion.done();
- }
+ BSONObjBuilder dataFileVersion(output->subobjStart("dataFileVersion"));
+ dataFileVersion.append("major", version.majorRaw());
+ dataFileVersion.append("minor", version.minorRaw());
+ dataFileVersion.done();
}
-
}
+}
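+
+// The appended stats look roughly like this (values illustrative):
+//   "fileSize": <total data file bytes / scale>, "nsSizeMB": 16,
+//   "extentFreeList": { "num": 3, "totalSize": 196608 },
+//   "dataFileVersion": { "major": 4, "minor": 6 }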
- bool MMAPV1DatabaseCatalogEntry::isOlderThan24( OperationContext* opCtx ) const {
- if ( _extentManager.numFiles() == 0 )
- return false;
+bool MMAPV1DatabaseCatalogEntry::isOlderThan24(OperationContext* opCtx) const {
+ if (_extentManager.numFiles() == 0)
+ return false;
- const DataFileVersion version = _extentManager.getFileFormat(opCtx);
+ const DataFileVersion version = _extentManager.getFileFormat(opCtx);
- invariant(version.isCompatibleWithCurrentCode());
+ invariant(version.isCompatibleWithCurrentCode());
- return !version.is24IndexClean();
- }
+ return !version.is24IndexClean();
+}
- void MMAPV1DatabaseCatalogEntry::markIndexSafe24AndUp( OperationContext* opCtx ) {
- if ( _extentManager.numFiles() == 0 )
- return;
+void MMAPV1DatabaseCatalogEntry::markIndexSafe24AndUp(OperationContext* opCtx) {
+ if (_extentManager.numFiles() == 0)
+ return;
- DataFileVersion version = _extentManager.getFileFormat(opCtx);
+ DataFileVersion version = _extentManager.getFileFormat(opCtx);
- invariant(version.isCompatibleWithCurrentCode());
+ invariant(version.isCompatibleWithCurrentCode());
- if (version.is24IndexClean())
- return; // nothing to do
+ if (version.is24IndexClean())
+ return; // nothing to do
- version.setIs24IndexClean();
- _extentManager.setFileFormat(opCtx, version);
- }
+ version.setIs24IndexClean();
+ _extentManager.setFileFormat(opCtx, version);
+}
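+
+// Together with isOlderThan24() above: once every index is known to be clean,
+// setting the is24IndexClean bit in the data file header makes subsequent
+// isOlderThan24() calls return false, so the upgrade check runs only once.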
- bool MMAPV1DatabaseCatalogEntry::currentFilesCompatible( OperationContext* opCtx ) const {
- if ( _extentManager.numFiles() == 0 )
- return true;
+bool MMAPV1DatabaseCatalogEntry::currentFilesCompatible(OperationContext* opCtx) const {
+ if (_extentManager.numFiles() == 0)
+ return true;
- return _extentManager.getOpenFile( 0 )->getHeader()->version.isCompatibleWithCurrentCode();
- }
+ return _extentManager.getOpenFile(0)->getHeader()->version.isCompatibleWithCurrentCode();
+}
- void MMAPV1DatabaseCatalogEntry::getCollectionNamespaces( std::list<std::string>* tofill ) const {
- _namespaceIndex.getCollectionNamespaces( tofill );
- }
+void MMAPV1DatabaseCatalogEntry::getCollectionNamespaces(std::list<std::string>* tofill) const {
+ _namespaceIndex.getCollectionNamespaces(tofill);
+}
- void MMAPV1DatabaseCatalogEntry::_ensureSystemCollection(OperationContext* txn,
- StringData ns) {
-
- NamespaceDetails* details = _namespaceIndex.details(ns);
- if (details) {
- return;
- }
- _namespaceIndex.add_ns( txn, ns, DiskLoc(), false );
+void MMAPV1DatabaseCatalogEntry::_ensureSystemCollection(OperationContext* txn, StringData ns) {
+ NamespaceDetails* details = _namespaceIndex.details(ns);
+ if (details) {
+ return;
}
+ _namespaceIndex.add_ns(txn, ns, DiskLoc(), false);
+}
- void MMAPV1DatabaseCatalogEntry::_init(OperationContext* txn) {
- WriteUnitOfWork wunit(txn);
+void MMAPV1DatabaseCatalogEntry::_init(OperationContext* txn) {
+ WriteUnitOfWork wunit(txn);
- // Upgrade freelist
- const NamespaceString oldFreeList(name(), "$freelist");
- NamespaceDetails* freeListDetails = _namespaceIndex.details(oldFreeList.ns());
- if (freeListDetails) {
- if (!freeListDetails->firstExtent.isNull()) {
- _extentManager.freeExtents(txn,
- freeListDetails->firstExtent,
- freeListDetails->lastExtent);
- }
-
- _namespaceIndex.kill_ns(txn, oldFreeList.ns());
+ // Upgrade freelist
+ const NamespaceString oldFreeList(name(), "$freelist");
+ NamespaceDetails* freeListDetails = _namespaceIndex.details(oldFreeList.ns());
+ if (freeListDetails) {
+ if (!freeListDetails->firstExtent.isNull()) {
+ _extentManager.freeExtents(
+ txn, freeListDetails->firstExtent, freeListDetails->lastExtent);
}
- DataFileVersion version = _extentManager.getFileFormat(txn);
- if (version.isCompatibleWithCurrentCode() && !version.mayHave28Freelist()) {
- // Any DB that can be opened and written to gets this flag set.
- version.setMayHave28Freelist();
- _extentManager.setFileFormat(txn, version);
- }
+ _namespaceIndex.kill_ns(txn, oldFreeList.ns());
+ }
- const NamespaceString nsi(name(), "system.indexes");
- const NamespaceString nsn(name(), "system.namespaces");
+ DataFileVersion version = _extentManager.getFileFormat(txn);
+ if (version.isCompatibleWithCurrentCode() && !version.mayHave28Freelist()) {
+ // Any DB that can be opened and written to gets this flag set.
+ version.setMayHave28Freelist();
+ _extentManager.setFileFormat(txn, version);
+ }
- bool isSystemNamespacesGoingToBeNew = _namespaceIndex.details(nsn.toString()) == NULL;
- bool isSystemIndexesGoingToBeNew = _namespaceIndex.details(nsi.toString()) == NULL;
+ const NamespaceString nsi(name(), "system.indexes");
+ const NamespaceString nsn(name(), "system.namespaces");
- _ensureSystemCollection(txn, nsn.toString());
- _ensureSystemCollection(txn, nsi.toString());
+ bool isSystemNamespacesGoingToBeNew = _namespaceIndex.details(nsn.toString()) == NULL;
+ bool isSystemIndexesGoingToBeNew = _namespaceIndex.details(nsi.toString()) == NULL;
- if (isSystemNamespacesGoingToBeNew) {
- txn->recoveryUnit()->registerChange(new EntryInsertion(nsn.toString(), this));
- }
- if (isSystemIndexesGoingToBeNew) {
- txn->recoveryUnit()->registerChange(new EntryInsertion(nsi.toString(), this));
- }
+ _ensureSystemCollection(txn, nsn.toString());
+ _ensureSystemCollection(txn, nsi.toString());
- Entry*& indexEntry = _collections[nsi.toString()];
- Entry*& nsEntry = _collections[nsn.toString()];
+ if (isSystemNamespacesGoingToBeNew) {
+ txn->recoveryUnit()->registerChange(new EntryInsertion(nsn.toString(), this));
+ }
+ if (isSystemIndexesGoingToBeNew) {
+ txn->recoveryUnit()->registerChange(new EntryInsertion(nsi.toString(), this));
+ }
- NamespaceDetails* const indexDetails = _namespaceIndex.details(nsi.toString());
- NamespaceDetails* const nsDetails = _namespaceIndex.details(nsn.toString());
+ Entry*& indexEntry = _collections[nsi.toString()];
+ Entry*& nsEntry = _collections[nsn.toString()];
- // order has to be:
- // 1) ns rs
- // 2) i rs
- // 3) catalog entries
+ NamespaceDetails* const indexDetails = _namespaceIndex.details(nsi.toString());
+ NamespaceDetails* const nsDetails = _namespaceIndex.details(nsn.toString());
- if (!nsEntry) {
- nsEntry = new Entry();
+ // order has to be:
+ // 1) ns rs
+ // 2) i rs
+ // 3) catalog entries
- NamespaceDetailsRSV1MetaData* md = new NamespaceDetailsRSV1MetaData(nsn.toString(),
- nsDetails);
- nsEntry->recordStore.reset(new SimpleRecordStoreV1(txn,
- nsn.toString(),
- md,
- &_extentManager,
- false));
- }
+ if (!nsEntry) {
+ nsEntry = new Entry();
- if (!indexEntry) {
- indexEntry = new Entry();
+ NamespaceDetailsRSV1MetaData* md =
+ new NamespaceDetailsRSV1MetaData(nsn.toString(), nsDetails);
+ nsEntry->recordStore.reset(
+ new SimpleRecordStoreV1(txn, nsn.toString(), md, &_extentManager, false));
+ }
- NamespaceDetailsRSV1MetaData* md =
- new NamespaceDetailsRSV1MetaData(nsi.toString(), indexDetails);
+ if (!indexEntry) {
+ indexEntry = new Entry();
- indexEntry->recordStore.reset(new SimpleRecordStoreV1(txn,
- nsi.toString(),
- md,
- &_extentManager,
- true));
- }
+ NamespaceDetailsRSV1MetaData* md =
+ new NamespaceDetailsRSV1MetaData(nsi.toString(), indexDetails);
- if (isSystemIndexesGoingToBeNew) {
- _addNamespaceToNamespaceCollection(txn, nsi.toString(), NULL);
- }
+ indexEntry->recordStore.reset(
+ new SimpleRecordStoreV1(txn, nsi.toString(), md, &_extentManager, true));
+ }
- if (!nsEntry->catalogEntry) {
- nsEntry->catalogEntry.reset(
- new NamespaceDetailsCollectionCatalogEntry(nsn.toString(),
- nsDetails,
- nsEntry->recordStore.get(),
- indexEntry->recordStore.get(),
- this));
- }
+ if (isSystemIndexesGoingToBeNew) {
+ _addNamespaceToNamespaceCollection(txn, nsi.toString(), NULL);
+ }
- if (!indexEntry->catalogEntry) {
- indexEntry->catalogEntry.reset(
- new NamespaceDetailsCollectionCatalogEntry(nsi.toString(),
- indexDetails,
- nsEntry->recordStore.get(),
- indexEntry->recordStore.get(),
- this));
- }
+ if (!nsEntry->catalogEntry) {
+ nsEntry->catalogEntry.reset(
+ new NamespaceDetailsCollectionCatalogEntry(nsn.toString(),
+ nsDetails,
+ nsEntry->recordStore.get(),
+ indexEntry->recordStore.get(),
+ this));
+ }
- wunit.commit();
+ if (!indexEntry->catalogEntry) {
+ indexEntry->catalogEntry.reset(
+ new NamespaceDetailsCollectionCatalogEntry(nsi.toString(),
+ indexDetails,
+ nsEntry->recordStore.get(),
+ indexEntry->recordStore.get(),
+ this));
+ }
- // Now put everything in the cache of namespaces. None of the operations below do any
- // transactional operations.
- std::list<std::string> namespaces;
- _namespaceIndex.getCollectionNamespaces(&namespaces);
+ wunit.commit();
- for (std::list<std::string>::const_iterator i = namespaces.begin();
- i != namespaces.end(); // we add to the list in the loop so can't cache end().
- i++) {
+ // Now put everything in the cache of namespaces. None of the operations below do any
+ // transactional operations.
+ std::list<std::string> namespaces;
+ _namespaceIndex.getCollectionNamespaces(&namespaces);
- const std::string& ns = *i;
- Entry*& entry = _collections[ns];
+ for (std::list<std::string>::const_iterator i = namespaces.begin();
+ i != namespaces.end(); // we add to the list in the loop so we can't cache end().
+ i++) {
+ const std::string& ns = *i;
+ Entry*& entry = _collections[ns];
- // The two cases where entry is not null is for system.indexes and system.namespaces,
- // which we manually instantiated above. It is OK to skip these two collections,
- // because they don't have indexes on them anyway.
- if (entry) {
- continue;
- }
+ // The only two cases where entry is already non-null are system.indexes and
+ // system.namespaces, which we manually instantiated above. It is OK to skip
+ // these two collections because they don't have indexes on them anyway.
+ if (entry) {
+ continue;
+ }
- entry = new Entry();
- _insertInCache(txn, ns, entry);
+ entry = new Entry();
+ _insertInCache(txn, ns, entry);
- // Add the indexes on this namespace to the list of namespaces to load.
- std::vector<std::string> indexNames;
- entry->catalogEntry->getAllIndexes(txn, &indexNames);
+ // Add the indexes on this namespace to the list of namespaces to load.
+ std::vector<std::string> indexNames;
+ entry->catalogEntry->getAllIndexes(txn, &indexNames);
- for (size_t i = 0; i < indexNames.size(); i++) {
- namespaces.push_back(IndexDescriptor::makeIndexNamespace(ns, indexNames[i]));
- }
+ for (size_t i = 0; i < indexNames.size(); i++) {
+ namespaces.push_back(IndexDescriptor::makeIndexNamespace(ns, indexNames[i]));
}
}
+}
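+
+// Note that the cache-fill loop above appends to 'namespaces' while iterating:
+// for a collection "db.foo" with an index named "x_1" it pushes the index
+// namespace "db.foo.$x_1", whose Entry is then built on a later iteration.
+// This is why the loop re-evaluates namespaces.end() on every pass.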
- Status MMAPV1DatabaseCatalogEntry::createCollection( OperationContext* txn,
- StringData ns,
- const CollectionOptions& options,
- bool allocateDefaultSpace ) {
- if ( _namespaceIndex.details( ns ) ) {
- return Status( ErrorCodes::NamespaceExists,
- str::stream() << "namespace already exists: " << ns );
- }
+Status MMAPV1DatabaseCatalogEntry::createCollection(OperationContext* txn,
+ StringData ns,
+ const CollectionOptions& options,
+ bool allocateDefaultSpace) {
+ if (_namespaceIndex.details(ns)) {
+ return Status(ErrorCodes::NamespaceExists,
+ str::stream() << "namespace already exists: " << ns);
+ }
- BSONObj optionsAsBSON = options.toBSON();
- _addNamespaceToNamespaceCollection( txn, ns, &optionsAsBSON );
+ BSONObj optionsAsBSON = options.toBSON();
+ _addNamespaceToNamespaceCollection(txn, ns, &optionsAsBSON);
- _namespaceIndex.add_ns( txn, ns, DiskLoc(), options.capped );
- NamespaceDetails* details = _namespaceIndex.details(ns);
+ _namespaceIndex.add_ns(txn, ns, DiskLoc(), options.capped);
+ NamespaceDetails* details = _namespaceIndex.details(ns);
- // Set the flags.
- NamespaceDetailsRSV1MetaData(ns, details).replaceUserFlags(txn, options.flags);
+ // Set the flags.
+ NamespaceDetailsRSV1MetaData(ns, details).replaceUserFlags(txn, options.flags);
- if (options.capped && options.cappedMaxDocs > 0) {
- txn->recoveryUnit()->writingInt( details->maxDocsInCapped ) = options.cappedMaxDocs;
- }
-
- Entry*& entry = _collections[ns.toString()];
- invariant( !entry );
- txn->recoveryUnit()->registerChange(new EntryInsertion(ns, this));
- entry = new Entry();
- _insertInCache(txn, ns, entry);
+ if (options.capped && options.cappedMaxDocs > 0) {
+ txn->recoveryUnit()->writingInt(details->maxDocsInCapped) = options.cappedMaxDocs;
+ }
- if ( allocateDefaultSpace ) {
- RecordStoreV1Base* rs = _getRecordStore( ns );
- if ( options.initialNumExtents > 0 ) {
- int size = _massageExtentSize( &_extentManager, options.cappedSize );
- for ( int i = 0; i < options.initialNumExtents; i++ ) {
- rs->increaseStorageSize( txn, size, false );
- }
- }
- else if ( !options.initialExtentSizes.empty() ) {
- for ( size_t i = 0; i < options.initialExtentSizes.size(); i++ ) {
- int size = options.initialExtentSizes[i];
- size = _massageExtentSize( &_extentManager, size );
- rs->increaseStorageSize( txn, size, false );
- }
+ Entry*& entry = _collections[ns.toString()];
+ invariant(!entry);
+ txn->recoveryUnit()->registerChange(new EntryInsertion(ns, this));
+ entry = new Entry();
+ _insertInCache(txn, ns, entry);
+
+ if (allocateDefaultSpace) {
+ RecordStoreV1Base* rs = _getRecordStore(ns);
+ if (options.initialNumExtents > 0) {
+ int size = _massageExtentSize(&_extentManager, options.cappedSize);
+ for (int i = 0; i < options.initialNumExtents; i++) {
+ rs->increaseStorageSize(txn, size, false);
}
- else if ( options.capped ) {
- // normal
- do {
- // Must do this at least once, otherwise we leave the collection with no
- // extents, which is invalid.
- int sz = _massageExtentSize( &_extentManager,
- options.cappedSize - rs->storageSize(txn) );
- sz &= 0xffffff00;
- rs->increaseStorageSize( txn, sz, false );
- } while( rs->storageSize(txn) < options.cappedSize );
- }
- else {
- rs->increaseStorageSize( txn, _extentManager.initialSize( 128 ), false );
+ } else if (!options.initialExtentSizes.empty()) {
+ for (size_t i = 0; i < options.initialExtentSizes.size(); i++) {
+ int size = options.initialExtentSizes[i];
+ size = _massageExtentSize(&_extentManager, size);
+ rs->increaseStorageSize(txn, size, false);
}
+ } else if (options.capped) {
+ // capped: keep allocating extents until the requested capped size is reached
+ do {
+ // Must do this at least once, otherwise we leave the collection with no
+ // extents, which is invalid.
+ int sz =
+ _massageExtentSize(&_extentManager, options.cappedSize - rs->storageSize(txn));
+ sz &= 0xffffff00;
+ rs->increaseStorageSize(txn, sz, false);
+ } while (rs->storageSize(txn) < options.cappedSize);
+ } else {
+ rs->increaseStorageSize(txn, _extentManager.initialSize(128), false);
}
-
- return Status::OK();
}
- void MMAPV1DatabaseCatalogEntry::createNamespaceForIndex(OperationContext* txn,
- StringData name) {
- // This is a simplified form of createCollection.
- invariant(!_namespaceIndex.details(name));
-
- _addNamespaceToNamespaceCollection(txn, name, NULL);
- _namespaceIndex.add_ns(txn, name, DiskLoc(), false);
+ return Status::OK();
+}
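+
+// In the capped branch above, each extent request is rounded down to a
+// 256-byte multiple: e.g. a remaining request of 100000 bytes becomes
+// 100000 & 0xffffff00 == 99840, and increaseStorageSize() is repeated until
+// storageSize(txn) reaches options.cappedSize.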
- Entry*& entry = _collections[name.toString()];
- invariant( !entry );
- txn->recoveryUnit()->registerChange(new EntryInsertion(name, this));
- entry = new Entry();
- _insertInCache(txn, name, entry);
- }
+void MMAPV1DatabaseCatalogEntry::createNamespaceForIndex(OperationContext* txn, StringData name) {
+ // This is a simplified form of createCollection.
+ invariant(!_namespaceIndex.details(name));
- CollectionCatalogEntry* MMAPV1DatabaseCatalogEntry::getCollectionCatalogEntry(
- StringData ns ) const {
+ _addNamespaceToNamespaceCollection(txn, name, NULL);
+ _namespaceIndex.add_ns(txn, name, DiskLoc(), false);
- CollectionMap::const_iterator i = _collections.find( ns.toString() );
- if (i == _collections.end()) {
- return NULL;
- }
+ Entry*& entry = _collections[name.toString()];
+ invariant(!entry);
+ txn->recoveryUnit()->registerChange(new EntryInsertion(name, this));
+ entry = new Entry();
+ _insertInCache(txn, name, entry);
+}
- invariant( i->second->catalogEntry.get() );
- return i->second->catalogEntry.get();
+CollectionCatalogEntry* MMAPV1DatabaseCatalogEntry::getCollectionCatalogEntry(StringData ns) const {
+ CollectionMap::const_iterator i = _collections.find(ns.toString());
+ if (i == _collections.end()) {
+ return NULL;
}
- void MMAPV1DatabaseCatalogEntry::_insertInCache(OperationContext* txn,
- StringData ns,
- Entry* entry) {
+ invariant(i->second->catalogEntry.get());
+ return i->second->catalogEntry.get();
+}
- NamespaceDetails* details = _namespaceIndex.details(ns);
- invariant(details);
+void MMAPV1DatabaseCatalogEntry::_insertInCache(OperationContext* txn,
+ StringData ns,
+ Entry* entry) {
+ NamespaceDetails* details = _namespaceIndex.details(ns);
+ invariant(details);
- entry->catalogEntry.reset(
- new NamespaceDetailsCollectionCatalogEntry(ns,
- details,
- _getNamespaceRecordStore(),
- _getIndexRecordStore(),
- this));
+ entry->catalogEntry.reset(new NamespaceDetailsCollectionCatalogEntry(
+ ns, details, _getNamespaceRecordStore(), _getIndexRecordStore(), this));
- unique_ptr<NamespaceDetailsRSV1MetaData> md(new NamespaceDetailsRSV1MetaData(ns, details));
- const NamespaceString nss(ns);
+ unique_ptr<NamespaceDetailsRSV1MetaData> md(new NamespaceDetailsRSV1MetaData(ns, details));
+ const NamespaceString nss(ns);
- if (details->isCapped) {
- entry->recordStore.reset(new CappedRecordStoreV1(txn,
- NULL,
- ns,
- md.release(),
- &_extentManager,
- nss.coll() == "system.indexes"));
- }
- else {
- entry->recordStore.reset(new SimpleRecordStoreV1(txn,
- ns,
- md.release(),
- &_extentManager,
- nss.coll() == "system.indexes"));
- }
+ if (details->isCapped) {
+ entry->recordStore.reset(new CappedRecordStoreV1(
+ txn, NULL, ns, md.release(), &_extentManager, nss.coll() == "system.indexes"));
+ } else {
+ entry->recordStore.reset(new SimpleRecordStoreV1(
+ txn, ns, md.release(), &_extentManager, nss.coll() == "system.indexes"));
}
+}
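+
+// The trailing constructor argument mirrors _init() above: it is true only
+// when the store backs "<db>.system.indexes", per the nss.coll() check here.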
- RecordStore* MMAPV1DatabaseCatalogEntry::getRecordStore( StringData ns ) const {
- return _getRecordStore( ns );
+RecordStore* MMAPV1DatabaseCatalogEntry::getRecordStore(StringData ns) const {
+ return _getRecordStore(ns);
+}
+
+RecordStoreV1Base* MMAPV1DatabaseCatalogEntry::_getRecordStore(StringData ns) const {
+ CollectionMap::const_iterator i = _collections.find(ns.toString());
+ if (i == _collections.end()) {
+ return NULL;
}
- RecordStoreV1Base* MMAPV1DatabaseCatalogEntry::_getRecordStore( StringData ns ) const {
- CollectionMap::const_iterator i = _collections.find( ns.toString() );
- if (i == _collections.end()) {
- return NULL;
- }
+ invariant(i->second->recordStore.get());
+ return i->second->recordStore.get();
+}
- invariant( i->second->recordStore.get() );
- return i->second->recordStore.get();
- }
+IndexAccessMethod* MMAPV1DatabaseCatalogEntry::getIndex(OperationContext* txn,
+ const CollectionCatalogEntry* collection,
+ IndexCatalogEntry* entry) {
+ const string& type = entry->descriptor()->getAccessMethodName();
- IndexAccessMethod* MMAPV1DatabaseCatalogEntry::getIndex( OperationContext* txn,
- const CollectionCatalogEntry* collection,
- IndexCatalogEntry* entry ) {
- const string& type = entry->descriptor()->getAccessMethodName();
+ string ns = collection->ns().ns();
- string ns = collection->ns().ns();
+ RecordStoreV1Base* rs = _getRecordStore(entry->descriptor()->indexNamespace());
+ invariant(rs);
- RecordStoreV1Base* rs = _getRecordStore(entry->descriptor()->indexNamespace());
- invariant(rs);
+ std::unique_ptr<SortedDataInterface> btree(
+ getMMAPV1Interface(entry->headManager(),
+ rs,
+ &rs->savedCursors,
+ entry->ordering(),
+ entry->descriptor()->indexNamespace(),
+ entry->descriptor()->version()));
- std::unique_ptr<SortedDataInterface> btree(
- getMMAPV1Interface(entry->headManager(),
- rs,
- &rs->savedCursors,
- entry->ordering(),
- entry->descriptor()->indexNamespace(),
- entry->descriptor()->version()));
+ if (IndexNames::HASHED == type)
+ return new HashAccessMethod(entry, btree.release());
- if (IndexNames::HASHED == type)
- return new HashAccessMethod( entry, btree.release() );
+ if (IndexNames::GEO_2DSPHERE == type)
+ return new S2AccessMethod(entry, btree.release());
- if (IndexNames::GEO_2DSPHERE == type)
- return new S2AccessMethod( entry, btree.release() );
+ if (IndexNames::TEXT == type)
+ return new FTSAccessMethod(entry, btree.release());
- if (IndexNames::TEXT == type)
- return new FTSAccessMethod( entry, btree.release() );
+ if (IndexNames::GEO_HAYSTACK == type)
+ return new HaystackAccessMethod(entry, btree.release());
- if (IndexNames::GEO_HAYSTACK == type)
- return new HaystackAccessMethod( entry, btree.release() );
+ if ("" == type)
+ return new BtreeAccessMethod(entry, btree.release());
- if ("" == type)
- return new BtreeAccessMethod( entry, btree.release() );
+ if (IndexNames::GEO_2D == type)
+ return new TwoDAccessMethod(entry, btree.release());
- if (IndexNames::GEO_2D == type)
- return new TwoDAccessMethod( entry, btree.release() );
+ log() << "Can't find index for keyPattern " << entry->descriptor()->keyPattern();
+ fassertFailed(17489);
+}
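+
+// Dispatch is keyed on the access method name stored in the index descriptor:
+// e.g. a spec of { "key": { "loc": "2dsphere" } } maps to S2AccessMethod,
+// while the empty string denotes an ordinary btree index.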
- log() << "Can't find index for keyPattern " << entry->descriptor()->keyPattern();
- fassertFailed(17489);
- }
+RecordStoreV1Base* MMAPV1DatabaseCatalogEntry::_getIndexRecordStore() {
+ const NamespaceString nss(name(), "system.indexes");
+ Entry* entry = _collections[nss.toString()];
+ invariant(entry);
- RecordStoreV1Base* MMAPV1DatabaseCatalogEntry::_getIndexRecordStore() {
- const NamespaceString nss(name(), "system.indexes");
- Entry* entry = _collections[nss.toString()];
- invariant( entry );
+ return entry->recordStore.get();
+}
- return entry->recordStore.get();
- }
+RecordStoreV1Base* MMAPV1DatabaseCatalogEntry::_getNamespaceRecordStore() const {
+ const NamespaceString nss(name(), "system.namespaces");
+ CollectionMap::const_iterator i = _collections.find(nss.toString());
+ invariant(i != _collections.end());
- RecordStoreV1Base* MMAPV1DatabaseCatalogEntry::_getNamespaceRecordStore() const {
- const NamespaceString nss( name(), "system.namespaces" );
- CollectionMap::const_iterator i = _collections.find( nss.toString() );
- invariant( i != _collections.end() );
+ return i->second->recordStore.get();
+}
- return i->second->recordStore.get();
+void MMAPV1DatabaseCatalogEntry::_addNamespaceToNamespaceCollection(OperationContext* txn,
+ StringData ns,
+ const BSONObj* options) {
+ if (nsToCollectionSubstring(ns) == "system.namespaces") {
+ // system.namespaces holds all the others, so it is not explicitly listed in the catalog.
+ return;
}
- void MMAPV1DatabaseCatalogEntry::_addNamespaceToNamespaceCollection(OperationContext* txn,
- StringData ns,
- const BSONObj* options) {
+ BSONObjBuilder b;
+ b.append("name", ns);
+ if (options && !options->isEmpty()) {
+ b.append("options", *options);
+ }
- if (nsToCollectionSubstring(ns) == "system.namespaces") {
- // system.namespaces holds all the others, so it is not explicitly listed in the catalog.
- return;
- }
+ const BSONObj obj = b.done();
- BSONObjBuilder b;
- b.append("name", ns);
- if (options && !options->isEmpty()) {
- b.append("options", *options);
- }
+ RecordStoreV1Base* rs = _getNamespaceRecordStore();
+ invariant(rs);
- const BSONObj obj = b.done();
+ StatusWith<RecordId> loc = rs->insertRecord(txn, obj.objdata(), obj.objsize(), false);
+ massertStatusOK(loc.getStatus());
+}
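+
+// Each record inserted here has the shape
+//   { "name": "db.coll" }  or  { "name": "db.coll", "options": { ... } }
+// which is exactly the shape getCollectionOptions() below parses back out.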
- RecordStoreV1Base* rs = _getNamespaceRecordStore();
- invariant( rs );
-
- StatusWith<RecordId> loc = rs->insertRecord( txn, obj.objdata(), obj.objsize(), false );
- massertStatusOK( loc.getStatus() );
+void MMAPV1DatabaseCatalogEntry::_removeNamespaceFromNamespaceCollection(OperationContext* txn,
+ StringData ns) {
+ if (nsToCollectionSubstring(ns) == "system.namespaces") {
+ // system.namespaces holds all the others, so it is not explicitly listed in the catalog.
+ return;
}
- void MMAPV1DatabaseCatalogEntry::_removeNamespaceFromNamespaceCollection(
- OperationContext* txn,
- StringData ns ) {
+ RecordStoreV1Base* rs = _getNamespaceRecordStore();
+ invariant(rs);
- if ( nsToCollectionSubstring( ns ) == "system.namespaces" ) {
- // system.namespaces holds all the others, so it is not explicitly listed in the catalog.
- return;
- }
-
- RecordStoreV1Base* rs = _getNamespaceRecordStore();
- invariant( rs );
-
- auto cursor = rs->getCursor(txn);
- while (auto record = cursor->next()) {
- BSONObj entry = record->data.releaseToBson();
- BSONElement name = entry["name"];
- if ( name.type() == String && name.String() == ns ) {
- rs->deleteRecord( txn, record->id );
- break;
- }
+ auto cursor = rs->getCursor(txn);
+ while (auto record = cursor->next()) {
+ BSONObj entry = record->data.releaseToBson();
+ BSONElement name = entry["name"];
+ if (name.type() == String && name.String() == ns) {
+ rs->deleteRecord(txn, record->id);
+ break;
}
}
+}
- CollectionOptions MMAPV1DatabaseCatalogEntry::getCollectionOptions( OperationContext* txn,
- StringData ns ) const {
- if ( nsToCollectionSubstring( ns ) == "system.namespaces" ) {
- return CollectionOptions();
- }
+CollectionOptions MMAPV1DatabaseCatalogEntry::getCollectionOptions(OperationContext* txn,
+ StringData ns) const {
+ if (nsToCollectionSubstring(ns) == "system.namespaces") {
+ return CollectionOptions();
+ }
- RecordStoreV1Base* rs = _getNamespaceRecordStore();
- invariant( rs );
-
- auto cursor = rs->getCursor(txn);
- while (auto record = cursor->next()) {
- BSONObj entry = record->data.releaseToBson();
- BSONElement name = entry["name"];
- if ( name.type() == String && name.String() == ns ) {
- CollectionOptions options;
- if ( entry["options"].isABSONObj() ) {
- Status status = options.parse( entry["options"].Obj() );
- fassert( 18523, status );
- }
- return options;
+ RecordStoreV1Base* rs = _getNamespaceRecordStore();
+ invariant(rs);
+
+ auto cursor = rs->getCursor(txn);
+ while (auto record = cursor->next()) {
+ BSONObj entry = record->data.releaseToBson();
+ BSONElement name = entry["name"];
+ if (name.type() == String && name.String() == ns) {
+ CollectionOptions options;
+ if (entry["options"].isABSONObj()) {
+ Status status = options.parse(entry["options"].Obj());
+ fassert(18523, status);
}
+ return options;
}
-
- return CollectionOptions();
}
-} // namespace mongo
+
+ return CollectionOptions();
+}
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h b/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h
index 1db5e8a1f87..2a922d3d89c 100644
--- a/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h
+++ b/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h
@@ -39,145 +39,150 @@
namespace mongo {
- class CollectionCatalogEntry;
- struct CollectionOptions;
- class IndexAccessMethod;
- class IndexCatalogEntry;
- class IndexDescriptor;
- class RecordStore;
- class RecordStoreV1Base;
- class RecoveryUnit;
- class OperationContext;
-
- class MMAPV1DatabaseCatalogEntry : public DatabaseCatalogEntry {
- public:
- MMAPV1DatabaseCatalogEntry( OperationContext* txn,
- StringData name,
- StringData path,
- bool directoryperdb,
- bool transient );
-
- virtual ~MMAPV1DatabaseCatalogEntry();
-
- // these two seem the same and yet different
- // TODO(ERH): consolidate into one ideally
- virtual bool exists() const { return _namespaceIndex.pathExists(); }
- virtual bool isEmpty() const { return !_namespaceIndex.allocated(); }
- virtual bool hasUserData() const {
- // The two collections which exist and can't be removed are:
- // system.indexes
- // system.namespaces
- return _collections.size() > 2;
- }
-
- virtual int64_t sizeOnDisk( OperationContext* opCtx ) const;
-
- virtual bool isOlderThan24( OperationContext* opCtx ) const;
- virtual void markIndexSafe24AndUp( OperationContext* opCtx );
-
- virtual bool currentFilesCompatible( OperationContext* opCtx ) const;
+class CollectionCatalogEntry;
+struct CollectionOptions;
+class IndexAccessMethod;
+class IndexCatalogEntry;
+class IndexDescriptor;
+class RecordStore;
+class RecordStoreV1Base;
+class RecoveryUnit;
+class OperationContext;
+
+class MMAPV1DatabaseCatalogEntry : public DatabaseCatalogEntry {
+public:
+ MMAPV1DatabaseCatalogEntry(OperationContext* txn,
+ StringData name,
+ StringData path,
+ bool directoryperdb,
+ bool transient);
+
+ virtual ~MMAPV1DatabaseCatalogEntry();
+
+ // these two seem equivalent and yet behave differently
+ // TODO(ERH): ideally consolidate into one
+ virtual bool exists() const {
+ return _namespaceIndex.pathExists();
+ }
+ virtual bool isEmpty() const {
+ return !_namespaceIndex.allocated();
+ }
+ virtual bool hasUserData() const {
+ // The two collections which exist and can't be removed are:
+ // system.indexes
+ // system.namespaces
+ return _collections.size() > 2;
+ }
+
+ virtual int64_t sizeOnDisk(OperationContext* opCtx) const;
+
+ virtual bool isOlderThan24(OperationContext* opCtx) const;
+ virtual void markIndexSafe24AndUp(OperationContext* opCtx);
+
+ virtual bool currentFilesCompatible(OperationContext* opCtx) const;
+
+ virtual void appendExtraStats(OperationContext* opCtx, BSONObjBuilder* out, double scale) const;
+
+ Status createCollection(OperationContext* txn,
+ StringData ns,
+ const CollectionOptions& options,
+ bool allocateDefaultSpace);
+
+ Status dropCollection(OperationContext* txn, StringData ns);
+
+ Status renameCollection(OperationContext* txn,
+ StringData fromNS,
+ StringData toNS,
+ bool stayTemp);
+
+ void getCollectionNamespaces(std::list<std::string>* tofill) const;
+
+ /**
+ * Returns NULL if 'ns' does not exist.
+ */
+ CollectionCatalogEntry* getCollectionCatalogEntry(StringData ns) const;
+
+ RecordStore* getRecordStore(StringData ns) const;
+
+ IndexAccessMethod* getIndex(OperationContext* txn,
+ const CollectionCatalogEntry* collection,
+ IndexCatalogEntry* index);
+
+ const MmapV1ExtentManager* getExtentManager() const {
+ return &_extentManager;
+ }
+ MmapV1ExtentManager* getExtentManager() {
+ return &_extentManager;
+ }
+
+ CollectionOptions getCollectionOptions(OperationContext* txn, StringData ns) const;
+
+ /**
+ * Creates a CollectionCatalogEntry in the form of an index rather than a collection.
+ * MMAPv1 puts both indexes and collections into CCEs. A namespace named 'name' must not
+ * exist.
+ */
+ void createNamespaceForIndex(OperationContext* txn, StringData name);
+
+private:
+ class EntryInsertion;
+ class EntryRemoval;
+
+ friend class NamespaceDetailsCollectionCatalogEntry;
+
+ // The _collections map is a cache for efficiently looking up namespace information. Access
+ // to the cache is protected by holding the appropriate DB lock. Regular operations
+ // (insert/update/delete/query) hold intent locks on the database and they access the cache
+ // directly. Metadata operations, such as create db/collection, etc acquire exclusive lock
+ // on the database, which protects against concurrent readers of the cache.
+ //
+ // Once initialized, the cache must remain consistent with the data in the memory-mapped
+ // database files through _removeFromCache and _insertInCache. These methods use the
+ // RecoveryUnit to ensure correct handling of rollback.
+
+ struct Entry {
+ std::unique_ptr<CollectionCatalogEntry> catalogEntry;
+ std::unique_ptr<RecordStoreV1Base> recordStore;
+ };
- virtual void appendExtraStats( OperationContext* opCtx,
- BSONObjBuilder* out,
- double scale ) const;
-
- Status createCollection( OperationContext* txn,
- StringData ns,
- const CollectionOptions& options,
- bool allocateDefaultSpace );
-
- Status dropCollection( OperationContext* txn, StringData ns );
-
- Status renameCollection( OperationContext* txn,
- StringData fromNS,
- StringData toNS,
- bool stayTemp );
-
- void getCollectionNamespaces( std::list<std::string>* tofill ) const;
-
- /**
- * will return NULL if ns does not exist
- */
- CollectionCatalogEntry* getCollectionCatalogEntry( StringData ns ) const;
-
- RecordStore* getRecordStore( StringData ns ) const;
-
- IndexAccessMethod* getIndex( OperationContext* txn,
- const CollectionCatalogEntry* collection,
- IndexCatalogEntry* index );
-
- const MmapV1ExtentManager* getExtentManager() const { return &_extentManager; }
- MmapV1ExtentManager* getExtentManager() { return &_extentManager; }
-
- CollectionOptions getCollectionOptions( OperationContext* txn,
- StringData ns ) const;
-
- /**
- * Creates a CollectionCatalogEntry in the form of an index rather than a collection.
- * MMAPv1 puts both indexes and collections into CCEs. A namespace named 'name' must not
- * exist.
- */
- void createNamespaceForIndex(OperationContext* txn, StringData name);
-
- private:
- class EntryInsertion;
- class EntryRemoval;
-
- friend class NamespaceDetailsCollectionCatalogEntry;
-
- // The _collections map is a cache for efficiently looking up namespace information. Access
- // to the cache is protected by holding the appropriate DB lock. Regular operations
- // (insert/update/delete/query) hold intent locks on the database and they access the cache
- // directly. Metadata operations, such as create db/collection, etc acquire exclusive lock
- // on the database, which protects against concurrent readers of the cache.
- //
- // Once initialized, the cache must remain consistent with the data in the memory-mapped
- // database files through _removeFromCache and _insertInCache. These methods use the
- // RecoveryUnit to ensure correct handling of rollback.
-
- struct Entry {
- std::unique_ptr<CollectionCatalogEntry> catalogEntry;
- std::unique_ptr<RecordStoreV1Base> recordStore;
- };
-
- typedef std::map<std::string, Entry*> CollectionMap;
+ typedef std::map<std::string, Entry*> CollectionMap;
- RecordStoreV1Base* _getIndexRecordStore();
- RecordStoreV1Base* _getNamespaceRecordStore() const;
- RecordStoreV1Base* _getRecordStore(StringData ns) const;
+ RecordStoreV1Base* _getIndexRecordStore();
+ RecordStoreV1Base* _getNamespaceRecordStore() const;
+ RecordStoreV1Base* _getRecordStore(StringData ns) const;
- void _addNamespaceToNamespaceCollection(OperationContext* txn,
- StringData ns,
- const BSONObj* options);
+ void _addNamespaceToNamespaceCollection(OperationContext* txn,
+ StringData ns,
+ const BSONObj* options);
- void _removeNamespaceFromNamespaceCollection(OperationContext* txn, StringData ns);
+ void _removeNamespaceFromNamespaceCollection(OperationContext* txn, StringData ns);
- Status _renameSingleNamespace( OperationContext* txn,
- StringData fromNS,
- StringData toNS,
- bool stayTemp );
+ Status _renameSingleNamespace(OperationContext* txn,
+ StringData fromNS,
+ StringData toNS,
+ bool stayTemp);
- void _ensureSystemCollection(OperationContext* txn, StringData ns);
+ void _ensureSystemCollection(OperationContext* txn, StringData ns);
- void _init( OperationContext* txn );
+ void _init(OperationContext* txn);
- /**
- * Populate the _collections cache.
- */
- void _insertInCache(OperationContext* opCtx, StringData ns, Entry* entry);
+ /**
+ * Populate the _collections cache.
+ */
+ void _insertInCache(OperationContext* opCtx, StringData ns, Entry* entry);
- /**
- * Drop cached information for specified namespace. If a RecoveryUnit is specified,
- * use it to allow rollback. When ru is null, removal is unconditional.
- */
- void _removeFromCache(RecoveryUnit* ru, StringData ns);
+ /**
+ * Drop cached information for specified namespace. If a RecoveryUnit is specified,
+ * use it to allow rollback. When ru is null, removal is unconditional.
+ */
+ void _removeFromCache(RecoveryUnit* ru, StringData ns);
- const std::string _path;
+ const std::string _path;
- NamespaceIndex _namespaceIndex;
- MmapV1ExtentManager _extentManager;
- CollectionMap _collections;
- };
+ NamespaceIndex _namespaceIndex;
+ MmapV1ExtentManager _extentManager;
+ CollectionMap _collections;
+};
}
diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_engine.cpp b/src/mongo/db/storage/mmap_v1/mmap_v1_engine.cpp
index b4550f135db..b1fd028a1d5 100644
--- a/src/mongo/db/storage/mmap_v1/mmap_v1_engine.cpp
+++ b/src/mongo/db/storage/mmap_v1/mmap_v1_engine.cpp
@@ -53,304 +53,300 @@
namespace mongo {
- using std::endl;
- using std::ifstream;
- using std::string;
- using std::stringstream;
- using std::vector;
+using std::endl;
+using std::ifstream;
+using std::string;
+using std::stringstream;
+using std::vector;
namespace {
#if !defined(__sun)
- // if doingRepair is true don't consider unclean shutdown an error
- void acquirePathLock(MMAPV1Engine* storageEngine,
- bool doingRepair,
- const StorageEngineLockFile& lockFile) {
- string name = lockFile.getFilespec();
- bool oldFile = lockFile.createdByUncleanShutdown();
-
- if ( oldFile ) {
- // we check this here because we want to see if we can get the lock
- // if we can't, then its probably just another mongod running
-
- string errmsg;
- if (doingRepair && dur::haveJournalFiles()) {
- errmsg = "************** \n"
- "You specified --repair but there are dirty journal files. Please\n"
- "restart without --repair to allow the journal files to be replayed.\n"
- "If you wish to repair all databases, please shutdown cleanly and\n"
- "run with --repair again.\n"
- "**************";
- }
- else if (storageGlobalParams.dur) {
- if (!dur::haveJournalFiles(/*anyFiles=*/true)) {
- // Passing anyFiles=true as we are trying to protect against starting in an
- // unclean state with the journal directory unmounted. If there are any files,
- // even prealloc files, then it means that it is mounted so we can continue.
- // Previously there was an issue (SERVER-5056) where we would fail to start up
- // if killed during prealloc.
-
- vector<string> dbnames;
- storageEngine->listDatabases( &dbnames );
-
- if ( dbnames.size() == 0 ) {
- // this means that mongod crashed
- // between initial startup and when journaling was initialized
- // it is safe to continue
- }
- else {
- errmsg = str::stream()
- << "************** \n"
- << "old lock file: " << name << ". probably means unclean shutdown,\n"
- << "but there are no journal files to recover.\n"
- << "this is likely human error or filesystem corruption.\n"
- << "please make sure that your journal directory is mounted.\n"
- << "found " << dbnames.size() << " dbs.\n"
- << "see: http://dochub.mongodb.org/core/repair for more information\n"
- << "*************";
- }
-
- }
- }
- else {
- if (!dur::haveJournalFiles() && !doingRepair) {
+// if doingRepair is true don't consider unclean shutdown an error
+void acquirePathLock(MMAPV1Engine* storageEngine,
+ bool doingRepair,
+ const StorageEngineLockFile& lockFile) {
+ string name = lockFile.getFilespec();
+ bool oldFile = lockFile.createdByUncleanShutdown();
+
+ if (oldFile) {
+ // we check this here because we want to see if we can get the lock
+ // if we can't, then it's probably just another mongod running
+
+ string errmsg;
+ if (doingRepair && dur::haveJournalFiles()) {
+ errmsg =
+ "************** \n"
+ "You specified --repair but there are dirty journal files. Please\n"
+ "restart without --repair to allow the journal files to be replayed.\n"
+ "If you wish to repair all databases, please shutdown cleanly and\n"
+ "run with --repair again.\n"
+ "**************";
+ } else if (storageGlobalParams.dur) {
+ if (!dur::haveJournalFiles(/*anyFiles=*/true)) {
+ // Passing anyFiles=true as we are trying to protect against starting in an
+ // unclean state with the journal directory unmounted. If there are any files,
+ // even prealloc files, then it means that it is mounted so we can continue.
+ // Previously there was an issue (SERVER-5056) where we would fail to start up
+ // if killed during prealloc.
+
+ vector<string> dbnames;
+ storageEngine->listDatabases(&dbnames);
+
+ if (dbnames.size() == 0) {
+ // this means that mongod crashed
+ // between initial startup and when journaling was initialized
+ // it is safe to continue
+ } else {
errmsg = str::stream()
- << "************** \n"
- << "Unclean shutdown detected.\n"
- << "Please visit http://dochub.mongodb.org/core/repair for recovery instructions.\n"
- << "*************";
+ << "************** \n"
+ << "old lock file: " << name << ". probably means unclean shutdown,\n"
+ << "but there are no journal files to recover.\n"
+ << "this is likely human error or filesystem corruption.\n"
+ << "please make sure that your journal directory is mounted.\n"
+ << "found " << dbnames.size() << " dbs.\n"
+ << "see: http://dochub.mongodb.org/core/repair for more information\n"
+ << "*************";
}
}
-
- if (!errmsg.empty()) {
- log() << errmsg << endl;
- uassert( 12596 , "old lock file" , 0 );
+ } else {
+ if (!dur::haveJournalFiles() && !doingRepair) {
+ errmsg = str::stream() << "************** \n"
+ << "Unclean shutdown detected.\n"
+ << "Please visit http://dochub.mongodb.org/core/repair for "
+ "recovery instructions.\n"
+ << "*************";
}
}
- // Not related to lock file, but this is where we handle unclean shutdown
- if (!storageGlobalParams.dur && dur::haveJournalFiles()) {
- log() << "**************" << endl;
- log() << "Error: journal files are present in journal directory, yet starting without journaling enabled." << endl;
- log() << "It is recommended that you start with journaling enabled so that recovery may occur." << endl;
- log() << "**************" << endl;
- uasserted(13597, "can't start without --journal enabled when journal/ files are present");
+ if (!errmsg.empty()) {
+ log() << errmsg << endl;
+ uassert(12596, "old lock file", 0);
}
}
+
+ // Not related to lock file, but this is where we handle unclean shutdown
+ if (!storageGlobalParams.dur && dur::haveJournalFiles()) {
+ log() << "**************" << endl;
+ log() << "Error: journal files are present in journal directory, yet starting without "
+ "journaling enabled." << endl;
+ log() << "It is recommended that you start with journaling enabled so that recovery may "
+ "occur." << endl;
+ log() << "**************" << endl;
+ uasserted(13597, "can't start without --journal enabled when journal/ files are present");
+ }
+}
#else
- void acquirePathLock(MMAPV1Engine* storageEngine,
- bool doingRepair,
- const StorageEngineLockFile& lockFile) {
- // TODO - this is very bad that the code above not running here.
-
- // Not related to lock file, but this is where we handle unclean shutdown
- if (!storageGlobalParams.dur && dur::haveJournalFiles()) {
- log() << "**************" << endl;
- log() << "Error: journal files are present in journal directory, yet starting without --journal enabled." << endl;
- log() << "It is recommended that you start with journaling enabled so that recovery may occur." << endl;
- log() << "Alternatively (not recommended), you can backup everything, then delete the journal files, and run --repair" << endl;
- log() << "**************" << endl;
- uasserted(13618, "can't start without --journal enabled when journal/ files are present");
- }
+void acquirePathLock(MMAPV1Engine* storageEngine,
+ bool doingRepair,
+ const StorageEngineLockFile& lockFile) {
+ // TODO - it is very bad that the code above does not run here.
+
+ // Not related to lock file, but this is where we handle unclean shutdown
+ if (!storageGlobalParams.dur && dur::haveJournalFiles()) {
+ log() << "**************" << endl;
+ log() << "Error: journal files are present in journal directory, yet starting without "
+ "--journal enabled." << endl;
+ log() << "It is recommended that you start with journaling enabled so that recovery may "
+ "occur." << endl;
+ log() << "Alternatively (not recommended), you can backup everything, then delete the "
+ "journal files, and run --repair" << endl;
+ log() << "**************" << endl;
+ uasserted(13618, "can't start without --journal enabled when journal/ files are present");
}
+}
#endif // !defined(__sun)
- /// warn if readahead > 256KB (gridfs chunk size)
- void checkReadAhead(const string& dir) {
+/// warn if readahead > 256KB (gridfs chunk size)
+void checkReadAhead(const string& dir) {
#ifdef __linux__
- try {
- const dev_t dev = getPartition(dir);
-
- // This path handles the case where the filesystem uses the whole device (including LVM)
- string path = str::stream() <<
- "/sys/dev/block/" << major(dev) << ':' << minor(dev) << "/queue/read_ahead_kb";
-
- if (!boost::filesystem::exists(path)){
- // This path handles the case where the filesystem is on a partition.
- path = str::stream()
- << "/sys/dev/block/" << major(dev) << ':' << minor(dev) // this is a symlink
- << "/.." // parent directory of a partition is for the whole device
- << "/queue/read_ahead_kb";
- }
+ try {
+ const dev_t dev = getPartition(dir);
+
+ // This path handles the case where the filesystem uses the whole device (including LVM)
+ string path = str::stream() << "/sys/dev/block/" << major(dev) << ':' << minor(dev)
+ << "/queue/read_ahead_kb";
+
+ if (!boost::filesystem::exists(path)) {
+ // This path handles the case where the filesystem is on a partition.
+ path =
+ str::stream() << "/sys/dev/block/" << major(dev) << ':'
+ << minor(dev) // this is a symlink
+ << "/.." // parent directory of a partition is for the whole device
+ << "/queue/read_ahead_kb";
+ }
- if (boost::filesystem::exists(path)) {
- ifstream file (path.c_str());
- if (file.is_open()) {
- int kb;
- file >> kb;
- if (kb > 256) {
- log() << startupWarningsLog;
+ if (boost::filesystem::exists(path)) {
+ ifstream file(path.c_str());
+ if (file.is_open()) {
+ int kb;
+ file >> kb;
+ if (kb > 256) {
+ log() << startupWarningsLog;
- log() << "** WARNING: Readahead for " << dir << " is set to " << kb << "KB"
- << startupWarningsLog;
+ log() << "** WARNING: Readahead for " << dir << " is set to " << kb << "KB"
+ << startupWarningsLog;
- log() << "** We suggest setting it to 256KB (512 sectors) or less"
- << startupWarningsLog;
+ log() << "** We suggest setting it to 256KB (512 sectors) or less"
+ << startupWarningsLog;
- log() << "** http://dochub.mongodb.org/core/readahead"
- << startupWarningsLog;
- }
+ log() << "** http://dochub.mongodb.org/core/readahead"
+ << startupWarningsLog;
}
}
}
- catch (const std::exception& e) {
- log() << "unable to validate readahead settings due to error: " << e.what()
- << startupWarningsLog;
- log() << "for more information, see http://dochub.mongodb.org/core/readahead"
- << startupWarningsLog;
- }
-#endif // __linux__
+ } catch (const std::exception& e) {
+ log() << "unable to validate readahead settings due to error: " << e.what()
+ << startupWarningsLog;
+ log() << "for more information, see http://dochub.mongodb.org/core/readahead"
+ << startupWarningsLog;
}
+#endif // __linux__
+}
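+
+// For example, for a whole device with major:minor 8:16 this reads
+// /sys/dev/block/8:16/queue/read_ahead_kb; for a partition it falls back to
+// the parent directory's queue/read_ahead_kb. Values above 256 (KB) trigger
+// the startup warning.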
- // This is unrelated to the _tmp directory in dbpath.
- void clearTmpFiles() {
- boost::filesystem::path path(storageGlobalParams.dbpath);
- for ( boost::filesystem::directory_iterator i( path );
- i != boost::filesystem::directory_iterator(); ++i ) {
- string fileName = boost::filesystem::path(*i).leaf().string();
- if ( boost::filesystem::is_directory( *i ) &&
- fileName.length() && fileName[ 0 ] == '$' )
- boost::filesystem::remove_all( *i );
- }
+// This is unrelated to the _tmp directory in dbpath.
+void clearTmpFiles() {
+ boost::filesystem::path path(storageGlobalParams.dbpath);
+ for (boost::filesystem::directory_iterator i(path);
+ i != boost::filesystem::directory_iterator();
+ ++i) {
+ string fileName = boost::filesystem::path(*i).leaf().string();
+ if (boost::filesystem::is_directory(*i) && fileName.length() && fileName[0] == '$')
+ boost::filesystem::remove_all(*i);
}
-} // namespace
+}
+} // namespace
- MMAPV1Engine::MMAPV1Engine(const StorageEngineLockFile& lockFile) {
- // TODO check non-journal subdirs if using directory-per-db
- checkReadAhead(storageGlobalParams.dbpath);
+MMAPV1Engine::MMAPV1Engine(const StorageEngineLockFile& lockFile) {
+ // TODO check non-journal subdirs if using directory-per-db
+ checkReadAhead(storageGlobalParams.dbpath);
- acquirePathLock(this, storageGlobalParams.repair, lockFile);
+ acquirePathLock(this, storageGlobalParams.repair, lockFile);
- FileAllocator::get()->start();
+ FileAllocator::get()->start();
- MONGO_ASSERT_ON_EXCEPTION_WITH_MSG( clearTmpFiles(), "clear tmp files" );
- }
+ MONGO_ASSERT_ON_EXCEPTION_WITH_MSG(clearTmpFiles(), "clear tmp files");
+}
- void MMAPV1Engine::finishInit() {
- dataFileSync.go();
+void MMAPV1Engine::finishInit() {
+ dataFileSync.go();
- // Replays the journal (if needed) and starts the background thread. This requires the
- // ability to create OperationContexts.
- dur::startup();
- }
+ // Replays the journal (if needed) and starts the background thread. This requires the
+ // ability to create OperationContexts.
+ dur::startup();
+}
- MMAPV1Engine::~MMAPV1Engine() {
- for ( EntryMap::const_iterator it = _entryMap.begin(); it != _entryMap.end(); ++it ) {
- delete it->second;
- }
- _entryMap.clear();
+MMAPV1Engine::~MMAPV1Engine() {
+ for (EntryMap::const_iterator it = _entryMap.begin(); it != _entryMap.end(); ++it) {
+ delete it->second;
}
+ _entryMap.clear();
+}
- RecoveryUnit* MMAPV1Engine::newRecoveryUnit() {
- return new DurRecoveryUnit();
- }
+RecoveryUnit* MMAPV1Engine::newRecoveryUnit() {
+ return new DurRecoveryUnit();
+}
- void MMAPV1Engine::listDatabases( std::vector<std::string>* out ) const {
- _listDatabases( storageGlobalParams.dbpath, out );
- }
+void MMAPV1Engine::listDatabases(std::vector<std::string>* out) const {
+ _listDatabases(storageGlobalParams.dbpath, out);
+}
- DatabaseCatalogEntry* MMAPV1Engine::getDatabaseCatalogEntry( OperationContext* opCtx,
- StringData db ) {
- {
- stdx::lock_guard<stdx::mutex> lk(_entryMapMutex);
- EntryMap::const_iterator iter = _entryMap.find(db.toString());
- if (iter != _entryMap.end()) {
- return iter->second;
- }
+DatabaseCatalogEntry* MMAPV1Engine::getDatabaseCatalogEntry(OperationContext* opCtx,
+ StringData db) {
+ {
+ stdx::lock_guard<stdx::mutex> lk(_entryMapMutex);
+ EntryMap::const_iterator iter = _entryMap.find(db.toString());
+ if (iter != _entryMap.end()) {
+ return iter->second;
}
+ }
- // This is an on-demand database create/open. At this point, we are locked under X lock for
- // the database (MMAPV1DatabaseCatalogEntry's constructor checks that) so no two threads
- // can be creating the same database concurrenty. We need to create the database outside of
- // the _entryMapMutex so we do not deadlock (see SERVER-15880).
- MMAPV1DatabaseCatalogEntry* entry =
- new MMAPV1DatabaseCatalogEntry(opCtx,
- db,
- storageGlobalParams.dbpath,
- storageGlobalParams.directoryperdb,
- false);
+ // This is an on-demand database create/open. At this point, we are locked under X lock for
+ // the database (MMAPV1DatabaseCatalogEntry's constructor checks that) so no two threads
+    // can be creating the same database concurrently. We need to create the database outside of
+ // the _entryMapMutex so we do not deadlock (see SERVER-15880).
+ MMAPV1DatabaseCatalogEntry* entry = new MMAPV1DatabaseCatalogEntry(
+ opCtx, db, storageGlobalParams.dbpath, storageGlobalParams.directoryperdb, false);
- stdx::lock_guard<stdx::mutex> lk(_entryMapMutex);
+ stdx::lock_guard<stdx::mutex> lk(_entryMapMutex);
- // Sanity check that we are not overwriting something
- invariant(_entryMap.insert(EntryMap::value_type(db.toString(), entry)).second);
+ // Sanity check that we are not overwriting something
+ invariant(_entryMap.insert(EntryMap::value_type(db.toString(), entry)).second);
- return entry;
- }
+ return entry;
+}
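
The function above deliberately constructs the catalog entry outside of _entryMapMutex and re-locks only to publish it, relying on the caller's exclusive database lock to rule out duplicate inserts. A minimal standalone sketch of that check/create/publish shape, using hypothetical Entry and EntryCache names rather than anything from this tree:

#include <cassert>
#include <map>
#include <memory>
#include <mutex>
#include <string>

struct Entry {
    explicit Entry(std::string n) : name(std::move(n)) {}
    std::string name;
};

class EntryCache {
public:
    Entry* getOrCreate(const std::string& db) {
        {
            std::lock_guard<std::mutex> lk(_mutex);
            auto it = _map.find(db);
            if (it != _map.end())
                return it->second.get();
        }

        // Construct outside the mutex so a slow constructor, or one that
        // takes other locks, cannot deadlock against concurrent lookups.
        auto entry = std::make_unique<Entry>(db);

        std::lock_guard<std::mutex> lk(_mutex);
        // Assumes the caller holds an exclusive per-db lock, so nobody can
        // have inserted the same key in the meantime (as asserted above).
        auto res = _map.emplace(db, std::move(entry));
        assert(res.second);
        return res.first->second.get();
    }

private:
    std::mutex _mutex;
    std::map<std::string, std::unique_ptr<Entry>> _map;
};
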
- Status MMAPV1Engine::closeDatabase( OperationContext* txn, StringData db ) {
- // Before the files are closed, flush any potentially outstanding changes, which might
- // reference this database. Otherwise we will assert when subsequent applications of the
- // global journal entries occur, which happen to have write intents for the removed files.
- getDur().syncDataAndTruncateJournal(txn);
-
- stdx::lock_guard<stdx::mutex> lk( _entryMapMutex );
- MMAPV1DatabaseCatalogEntry* entry = _entryMap[db.toString()];
- delete entry;
- _entryMap.erase( db.toString() );
- return Status::OK();
- }
+Status MMAPV1Engine::closeDatabase(OperationContext* txn, StringData db) {
+ // Before the files are closed, flush any potentially outstanding changes, which might
+ // reference this database. Otherwise we will assert when subsequent applications of the
+ // global journal entries occur, which happen to have write intents for the removed files.
+ getDur().syncDataAndTruncateJournal(txn);
+
+ stdx::lock_guard<stdx::mutex> lk(_entryMapMutex);
+ MMAPV1DatabaseCatalogEntry* entry = _entryMap[db.toString()];
+ delete entry;
+ _entryMap.erase(db.toString());
+ return Status::OK();
+}
- Status MMAPV1Engine::dropDatabase( OperationContext* txn, StringData db ) {
- Status status = closeDatabase( txn, db );
- if ( !status.isOK() )
- return status;
+Status MMAPV1Engine::dropDatabase(OperationContext* txn, StringData db) {
+ Status status = closeDatabase(txn, db);
+ if (!status.isOK())
+ return status;
- _deleteDataFiles( db.toString() );
+ _deleteDataFiles(db.toString());
- return Status::OK();
- }
+ return Status::OK();
+}
- void MMAPV1Engine::_listDatabases( const std::string& directory,
- std::vector<std::string>* out ) {
- boost::filesystem::path path( directory );
- for ( boost::filesystem::directory_iterator i( path );
- i != boost::filesystem::directory_iterator();
- ++i ) {
- if (storageGlobalParams.directoryperdb) {
- boost::filesystem::path p = *i;
- string dbName = p.leaf().string();
- p /= ( dbName + ".ns" );
- if ( exists( p ) )
- out->push_back( dbName );
- }
- else {
- string fileName = boost::filesystem::path(*i).leaf().string();
- if ( fileName.length() > 3 && fileName.substr( fileName.length() - 3, 3 ) == ".ns" )
- out->push_back( fileName.substr( 0, fileName.length() - 3 ) );
- }
+void MMAPV1Engine::_listDatabases(const std::string& directory, std::vector<std::string>* out) {
+ boost::filesystem::path path(directory);
+ for (boost::filesystem::directory_iterator i(path);
+ i != boost::filesystem::directory_iterator();
+ ++i) {
+ if (storageGlobalParams.directoryperdb) {
+ boost::filesystem::path p = *i;
+ string dbName = p.leaf().string();
+ p /= (dbName + ".ns");
+ if (exists(p))
+ out->push_back(dbName);
+ } else {
+ string fileName = boost::filesystem::path(*i).leaf().string();
+ if (fileName.length() > 3 && fileName.substr(fileName.length() - 3, 3) == ".ns")
+ out->push_back(fileName.substr(0, fileName.length() - 3));
}
}
+}
- int MMAPV1Engine::flushAllFiles( bool sync ) {
- return MongoFile::flushAll( sync );
- }
-
- bool MMAPV1Engine::isDurable() const {
- return getDur().isDurable();
- }
+int MMAPV1Engine::flushAllFiles(bool sync) {
+ return MongoFile::flushAll(sync);
+}
- RecordAccessTracker& MMAPV1Engine::getRecordAccessTracker() {
- return _recordAccessTracker;
- }
+bool MMAPV1Engine::isDurable() const {
+ return getDur().isDurable();
+}
- void MMAPV1Engine::cleanShutdown() {
- // wait until file preallocation finishes
- // we would only hang here if the file_allocator code generates a
- // synchronous signal, which we don't expect
- log() << "shutdown: waiting for fs preallocator..." << endl;
- FileAllocator::get()->waitUntilFinished();
+RecordAccessTracker& MMAPV1Engine::getRecordAccessTracker() {
+ return _recordAccessTracker;
+}
- if (storageGlobalParams.dur) {
- log() << "shutdown: final commit..." << endl;
+void MMAPV1Engine::cleanShutdown() {
+ // wait until file preallocation finishes
+ // we would only hang here if the file_allocator code generates a
+ // synchronous signal, which we don't expect
+ log() << "shutdown: waiting for fs preallocator..." << endl;
+ FileAllocator::get()->waitUntilFinished();
- getDur().commitAndStopDurThread();
- }
+ if (storageGlobalParams.dur) {
+ log() << "shutdown: final commit..." << endl;
- log() << "shutdown: closing all files..." << endl;
- stringstream ss3;
- MemoryMappedFile::closeAllFiles( ss3 );
- log() << ss3.str() << endl;
+ getDur().commitAndStopDurThread();
}
+
+ log() << "shutdown: closing all files..." << endl;
+ stringstream ss3;
+ MemoryMappedFile::closeAllFiles(ss3);
+ log() << ss3.str() << endl;
+}
}
diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_engine.h b/src/mongo/db/storage/mmap_v1/mmap_v1_engine.h
index 4141794c426..25c38500831 100644
--- a/src/mongo/db/storage/mmap_v1/mmap_v1_engine.h
+++ b/src/mongo/db/storage/mmap_v1/mmap_v1_engine.h
@@ -38,68 +38,70 @@
namespace mongo {
- class MMAPV1DatabaseCatalogEntry;
+class MMAPV1DatabaseCatalogEntry;
- class MMAPV1Engine : public StorageEngine {
- public:
- MMAPV1Engine(const StorageEngineLockFile& lockFile);
- virtual ~MMAPV1Engine();
+class MMAPV1Engine : public StorageEngine {
+public:
+ MMAPV1Engine(const StorageEngineLockFile& lockFile);
+ virtual ~MMAPV1Engine();
- void finishInit();
+ void finishInit();
- RecoveryUnit* newRecoveryUnit();
- void listDatabases( std::vector<std::string>* out ) const;
- int flushAllFiles( bool sync );
+ RecoveryUnit* newRecoveryUnit();
+ void listDatabases(std::vector<std::string>* out) const;
+ int flushAllFiles(bool sync);
- DatabaseCatalogEntry* getDatabaseCatalogEntry( OperationContext* opCtx,
- StringData db );
+ DatabaseCatalogEntry* getDatabaseCatalogEntry(OperationContext* opCtx, StringData db);
- virtual bool supportsDocLocking() const { return false; }
- virtual bool isMmapV1() const { return true; }
+ virtual bool supportsDocLocking() const {
+ return false;
+ }
+ virtual bool isMmapV1() const {
+ return true;
+ }
- virtual bool isDurable() const;
+ virtual bool isDurable() const;
- virtual Status closeDatabase(OperationContext* txn, StringData db);
+ virtual Status closeDatabase(OperationContext* txn, StringData db);
- virtual Status dropDatabase(OperationContext* txn, StringData db);
+ virtual Status dropDatabase(OperationContext* txn, StringData db);
- virtual void cleanShutdown();
+ virtual void cleanShutdown();
- // Callers should use repairDatabase instead.
- virtual Status repairRecordStore(OperationContext* txn, const std::string& ns) {
- return Status(ErrorCodes::InternalError, "MMAPv1 doesn't support repairRecordStore");
- }
+ // Callers should use repairDatabase instead.
+ virtual Status repairRecordStore(OperationContext* txn, const std::string& ns) {
+ return Status(ErrorCodes::InternalError, "MMAPv1 doesn't support repairRecordStore");
+ }
- // MMAPv1 specific (non-virtual)
- Status repairDatabase( OperationContext* txn,
- const std::string& dbName,
- bool preserveClonedFilesOnFailure,
- bool backupOriginalFiles );
+ // MMAPv1 specific (non-virtual)
+ Status repairDatabase(OperationContext* txn,
+ const std::string& dbName,
+ bool preserveClonedFilesOnFailure,
+ bool backupOriginalFiles);
- /**
- * Gets a reference to the abstraction used by MMAP v1 to track recently used memory
- * addresses.
- *
- * MMAPv1 specific (non-virtual). This is non-const because callers are allowed to use
- * the returned reference to modify the RecordAccessTracker.
- *
- * The RecordAccessTracker is thread-safe (it uses its own mutex internally).
- */
- RecordAccessTracker& getRecordAccessTracker();
+ /**
+ * Gets a reference to the abstraction used by MMAP v1 to track recently used memory
+ * addresses.
+ *
+ * MMAPv1 specific (non-virtual). This is non-const because callers are allowed to use
+ * the returned reference to modify the RecordAccessTracker.
+ *
+ * The RecordAccessTracker is thread-safe (it uses its own mutex internally).
+ */
+ RecordAccessTracker& getRecordAccessTracker();
- private:
- static void _listDatabases( const std::string& directory,
- std::vector<std::string>* out );
+private:
+ static void _listDatabases(const std::string& directory, std::vector<std::string>* out);
- stdx::mutex _entryMapMutex;
- typedef std::map<std::string,MMAPV1DatabaseCatalogEntry*> EntryMap;
- EntryMap _entryMap;
+ stdx::mutex _entryMapMutex;
+ typedef std::map<std::string, MMAPV1DatabaseCatalogEntry*> EntryMap;
+ EntryMap _entryMap;
- // A record access tracker is essentially a large table which tracks recently used
- // addresses. It is used when higher layers (e.g. the query system) need to ask
- // the storage engine whether data is likely in physical memory.
- RecordAccessTracker _recordAccessTracker;
- };
+ // A record access tracker is essentially a large table which tracks recently used
+ // addresses. It is used when higher layers (e.g. the query system) need to ask
+ // the storage engine whether data is likely in physical memory.
+ RecordAccessTracker _recordAccessTracker;
+};
- void _deleteDataFiles(const std::string& database);
+void _deleteDataFiles(const std::string& database);
}
diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_extent_manager.cpp b/src/mongo/db/storage/mmap_v1/mmap_v1_extent_manager.cpp
index ed4f160e1a9..69d80422e66 100644
--- a/src/mongo/db/storage/mmap_v1/mmap_v1_extent_manager.cpp
+++ b/src/mongo/db/storage/mmap_v1/mmap_v1_extent_manager.cpp
@@ -55,632 +55,612 @@
namespace mongo {
- using std::unique_ptr;
- using std::endl;
- using std::max;
- using std::string;
- using std::stringstream;
-
- // Turn on this failpoint to force the system to yield for a fetch. Setting to "alwaysOn"
- // will cause yields for fetching to occur on every 'kNeedsFetchFailFreq'th call to
- // recordNeedsFetch().
- static const int kNeedsFetchFailFreq = 2;
- static Counter64 needsFetchFailCounter;
- MONGO_FP_DECLARE(recordNeedsFetchFail);
-
- // Used to make sure the compiler doesn't get too smart on us when we're
- // trying to touch records.
- volatile int __record_touch_dummy = 1;
-
- class MmapV1RecordFetcher : public RecordFetcher {
- MONGO_DISALLOW_COPYING(MmapV1RecordFetcher);
- public:
- explicit MmapV1RecordFetcher(const MmapV1RecordHeader* record)
- : _record(record) { }
-
- virtual void setup() {
- invariant(!_filesLock.get());
- _filesLock.reset(new LockMongoFilesShared());
- }
+using std::unique_ptr;
+using std::endl;
+using std::max;
+using std::string;
+using std::stringstream;
+
+// Turn on this failpoint to force the system to yield for a fetch. Setting to "alwaysOn"
+// will cause yields for fetching to occur on every 'kNeedsFetchFailFreq'th call to
+// recordNeedsFetch().
+static const int kNeedsFetchFailFreq = 2;
+static Counter64 needsFetchFailCounter;
+MONGO_FP_DECLARE(recordNeedsFetchFail);
+
+// Used to make sure the compiler doesn't get too smart on us when we're
+// trying to touch records.
+volatile int __record_touch_dummy = 1;
+
+class MmapV1RecordFetcher : public RecordFetcher {
+ MONGO_DISALLOW_COPYING(MmapV1RecordFetcher);
+
+public:
+ explicit MmapV1RecordFetcher(const MmapV1RecordHeader* record) : _record(record) {}
+
+ virtual void setup() {
+ invariant(!_filesLock.get());
+ _filesLock.reset(new LockMongoFilesShared());
+ }
+
+ virtual void fetch() {
+ // It's only legal to touch the record while we're holding a lock on the data files.
+ invariant(_filesLock.get());
+
+ const char* recordChar = reinterpret_cast<const char*>(_record);
+
+        // Here's where we actually dereference a pointer into the record. This is where
+        // we expect a page fault to occur, so we should do this out of the lock.
+ __record_touch_dummy += *recordChar;
+
+ // We're not going to touch the record anymore, so we can give up our
+ // lock on mongo files. We do this here because we have to release the
+ // lock on mongo files prior to reacquiring lock mgr locks.
+ _filesLock.reset();
+ }
+
+private:
+ // The record which needs to be touched in order to page fault. Not owned by us.
+ const MmapV1RecordHeader* _record;
+
+ // This ensures that our MmapV1RecordHeader* does not drop out from under our feet before
+ // we dereference it.
+ std::unique_ptr<LockMongoFilesShared> _filesLock;
+};
+
+MmapV1ExtentManager::MmapV1ExtentManager(StringData dbname, StringData path, bool directoryPerDB)
+ : _dbname(dbname.toString()),
+ _path(path.toString()),
+ _directoryPerDB(directoryPerDB),
+ _rid(RESOURCE_METADATA, dbname) {
+ StorageEngine* engine = getGlobalServiceContext()->getGlobalStorageEngine();
+ invariant(engine->isMmapV1());
+ MMAPV1Engine* mmapEngine = static_cast<MMAPV1Engine*>(engine);
+ _recordAccessTracker = &mmapEngine->getRecordAccessTracker();
+}
- virtual void fetch() {
- // It's only legal to touch the record while we're holding a lock on the data files.
- invariant(_filesLock.get());
+boost::filesystem::path MmapV1ExtentManager::_fileName(int n) const {
+ stringstream ss;
+ ss << _dbname << '.' << n;
+ boost::filesystem::path fullName(_path);
+ if (_directoryPerDB)
+ fullName /= _dbname;
+ fullName /= ss.str();
+ return fullName;
+}
- const char* recordChar = reinterpret_cast<const char*>(_record);
- // Here's where we actually deference a pointer into the record. This is where
- // we expect a page fault to occur, so we should this out of the lock.
- __record_touch_dummy += *recordChar;
+Status MmapV1ExtentManager::init(OperationContext* txn) {
+ invariant(_files.empty());
- // We're not going to touch the record anymore, so we can give up our
- // lock on mongo files. We do this here because we have to release the
- // lock on mongo files prior to reacquiring lock mgr locks.
- _filesLock.reset();
+ for (int n = 0; n < DiskLoc::MaxFiles; n++) {
+ const boost::filesystem::path fullName = _fileName(n);
+ if (!boost::filesystem::exists(fullName)) {
+ break;
}
- private:
- // The record which needs to be touched in order to page fault. Not owned by us.
- const MmapV1RecordHeader* _record;
-
- // This ensures that our MmapV1RecordHeader* does not drop out from under our feet before
- // we dereference it.
- std::unique_ptr<LockMongoFilesShared> _filesLock;
- };
-
- MmapV1ExtentManager::MmapV1ExtentManager(StringData dbname,
- StringData path,
- bool directoryPerDB)
- : _dbname(dbname.toString()),
- _path(path.toString()),
- _directoryPerDB(directoryPerDB),
- _rid(RESOURCE_METADATA, dbname) {
- StorageEngine* engine = getGlobalServiceContext()->getGlobalStorageEngine();
- invariant(engine->isMmapV1());
- MMAPV1Engine* mmapEngine = static_cast<MMAPV1Engine*>(engine);
- _recordAccessTracker = &mmapEngine->getRecordAccessTracker();
- }
+ const std::string fullNameString = fullName.string();
- boost::filesystem::path MmapV1ExtentManager::_fileName(int n) const {
- stringstream ss;
- ss << _dbname << '.' << n;
- boost::filesystem::path fullName( _path );
- if ( _directoryPerDB )
- fullName /= _dbname;
- fullName /= ss.str();
- return fullName;
- }
-
-
- Status MmapV1ExtentManager::init(OperationContext* txn) {
- invariant(_files.empty());
-
- for (int n = 0; n < DiskLoc::MaxFiles; n++) {
- const boost::filesystem::path fullName = _fileName(n);
- if (!boost::filesystem::exists(fullName)) {
+ {
+ // If the file is uninitialized we exit the loop because it is just prealloced. We
+ // do this on a bare File object rather than using the DataFile because closing a
+ // DataFile triggers dur::closingFileNotification() which is fatal if there are any
+ // pending writes. Therefore we must only open files that we know we want to keep.
+ File preview;
+ preview.open(fullNameString.c_str(), /*readOnly*/ true);
+ invariant(preview.is_open());
+
+ // File can't be initialized if too small.
+ if (preview.len() < sizeof(DataFileHeader)) {
break;
}
- const std::string fullNameString = fullName.string();
-
- {
- // If the file is uninitialized we exit the loop because it is just prealloced. We
- // do this on a bare File object rather than using the DataFile because closing a
- // DataFile triggers dur::closingFileNotification() which is fatal if there are any
- // pending writes. Therefore we must only open files that we know we want to keep.
- File preview;
- preview.open(fullNameString.c_str(), /*readOnly*/ true);
- invariant(preview.is_open());
-
- // File can't be initialized if too small.
- if (preview.len() < sizeof(DataFileHeader)) {
- break;
- }
-
- // This is the equivalent of DataFileHeader::uninitialized().
- int version;
- preview.read(0, reinterpret_cast<char*>(&version), sizeof(version));
- invariant(!preview.bad());
- if (version == 0) {
- break;
- }
- }
-
- unique_ptr<DataFile> df(new DataFile(n));
-
- Status s = df->openExisting(fullNameString.c_str());
- if (!s.isOK()) {
- return s;
+ // This is the equivalent of DataFileHeader::uninitialized().
+ int version;
+ preview.read(0, reinterpret_cast<char*>(&version), sizeof(version));
+ invariant(!preview.bad());
+ if (version == 0) {
+ break;
}
+ }
- invariant(!df->getHeader()->uninitialized());
-
- // We only checkUpgrade on files that we are keeping, not preallocs.
- df->getHeader()->checkUpgrade(txn);
+ unique_ptr<DataFile> df(new DataFile(n));
- _files.push_back( df.release() );
+ Status s = df->openExisting(fullNameString.c_str());
+ if (!s.isOK()) {
+ return s;
}
- // If this is a new database being created, instantiate the first file and one extent so
- // we can have a coherent database.
- if (_files.empty()) {
- WriteUnitOfWork wuow(txn);
- _createExtent(txn, initialSize(128), false);
- wuow.commit();
+ invariant(!df->getHeader()->uninitialized());
- // Commit the journal and all changes to disk so that even if exceptions occur during
- // subsequent initialization, we won't have uncommited changes during file close.
- getDur().commitNow(txn);
- }
+ // We only checkUpgrade on files that we are keeping, not preallocs.
+ df->getHeader()->checkUpgrade(txn);
- return Status::OK();
+ _files.push_back(df.release());
}
- const DataFile* MmapV1ExtentManager::_getOpenFile(int fileId) const {
- if (fileId < 0 || fileId >= _files.size()) {
- log() << "_getOpenFile() invalid file index requested " << fileId;
- invariant(false);
- }
+ // If this is a new database being created, instantiate the first file and one extent so
+ // we can have a coherent database.
+ if (_files.empty()) {
+ WriteUnitOfWork wuow(txn);
+ _createExtent(txn, initialSize(128), false);
+ wuow.commit();
- return _files[fileId];
+ // Commit the journal and all changes to disk so that even if exceptions occur during
+        // subsequent initialization, we won't have uncommitted changes during file close.
+ getDur().commitNow(txn);
}
- DataFile* MmapV1ExtentManager::_getOpenFile(int fileId) {
- if (fileId < 0 || fileId >= _files.size()) {
- log() << "_getOpenFile() invalid file index requested " << fileId;
- invariant(false);
- }
+ return Status::OK();
+}
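
init() above peeks at each candidate file with a bare File before constructing a DataFile, because closing a DataFile fires dur::closingFileNotification() and must therefore only happen for files being kept. A sketch of the same peek-before-adopting test, with plain <fstream> standing in for the File class (looksInitialized is a hypothetical helper):

#include <cstdint>
#include <fstream>
#include <string>

bool looksInitialized(const std::string& path, std::streamoff headerSize) {
    std::ifstream f(path, std::ios::binary);
    if (!f)
        return false;

    f.seekg(0, std::ios::end);
    if (f.tellg() < headerSize)
        return false;  // too small to hold a complete header

    // The first int of the header is the version; 0 means the file was
    // preallocated but never initialized, matching the break above.
    f.seekg(0);
    std::int32_t version = 0;
    f.read(reinterpret_cast<char*>(&version), sizeof(version));
    return f.good() && version != 0;
}
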
- return _files[fileId];
+const DataFile* MmapV1ExtentManager::_getOpenFile(int fileId) const {
+ if (fileId < 0 || fileId >= _files.size()) {
+ log() << "_getOpenFile() invalid file index requested " << fileId;
+ invariant(false);
}
- DataFile* MmapV1ExtentManager::_addAFile(OperationContext* txn,
- int sizeNeeded,
- bool preallocateNextFile) {
-
- // Database must be stable and we need to be in some sort of an update operation in order
- // to add a new file.
- invariant(txn->lockState()->isDbLockedForMode(_dbname, MODE_IX));
+ return _files[fileId];
+}
- const int allocFileId = _files.size();
+DataFile* MmapV1ExtentManager::_getOpenFile(int fileId) {
+ if (fileId < 0 || fileId >= _files.size()) {
+ log() << "_getOpenFile() invalid file index requested " << fileId;
+ invariant(false);
+ }
- int minSize = 0;
- if (allocFileId > 0) {
- // Make the next file at least as large as the previous
- minSize = _files[allocFileId - 1]->getHeader()->fileLength;
- }
+ return _files[fileId];
+}
- if (minSize < sizeNeeded + DataFileHeader::HeaderSize) {
- minSize = sizeNeeded + DataFileHeader::HeaderSize;
- }
+DataFile* MmapV1ExtentManager::_addAFile(OperationContext* txn,
+ int sizeNeeded,
+ bool preallocateNextFile) {
+ // Database must be stable and we need to be in some sort of an update operation in order
+ // to add a new file.
+ invariant(txn->lockState()->isDbLockedForMode(_dbname, MODE_IX));
- {
- unique_ptr<DataFile> allocFile(new DataFile(allocFileId));
- const string allocFileName = _fileName(allocFileId).string();
+ const int allocFileId = _files.size();
- Timer t;
+ int minSize = 0;
+ if (allocFileId > 0) {
+ // Make the next file at least as large as the previous
+ minSize = _files[allocFileId - 1]->getHeader()->fileLength;
+ }
- allocFile->open(txn, allocFileName.c_str(), minSize, false);
- if (t.seconds() > 1) {
- log() << "MmapV1ExtentManager took "
- << t.seconds()
- << " seconds to open: "
- << allocFileName;
- }
+ if (minSize < sizeNeeded + DataFileHeader::HeaderSize) {
+ minSize = sizeNeeded + DataFileHeader::HeaderSize;
+ }
- // It's all good
- _files.push_back(allocFile.release());
- }
+ {
+ unique_ptr<DataFile> allocFile(new DataFile(allocFileId));
+ const string allocFileName = _fileName(allocFileId).string();
- // Preallocate is asynchronous
- if (preallocateNextFile) {
- unique_ptr<DataFile> nextFile(new DataFile(allocFileId + 1));
- const string nextFileName = _fileName(allocFileId + 1).string();
+ Timer t;
- nextFile->open(txn, nextFileName.c_str(), minSize, false);
+ allocFile->open(txn, allocFileName.c_str(), minSize, false);
+ if (t.seconds() > 1) {
+ log() << "MmapV1ExtentManager took " << t.seconds()
+ << " seconds to open: " << allocFileName;
}
- // Returns the last file added
- return _files[allocFileId];
- }
-
- int MmapV1ExtentManager::numFiles() const {
- return _files.size();
+ // It's all good
+ _files.push_back(allocFile.release());
}
- long long MmapV1ExtentManager::fileSize() const {
- long long size = 0;
- for (int n = 0; boost::filesystem::exists(_fileName(n)); n++) {
- size += boost::filesystem::file_size(_fileName(n));
- }
+ // Preallocate is asynchronous
+ if (preallocateNextFile) {
+ unique_ptr<DataFile> nextFile(new DataFile(allocFileId + 1));
+ const string nextFileName = _fileName(allocFileId + 1).string();
- return size;
+ nextFile->open(txn, nextFileName.c_str(), minSize, false);
}
- MmapV1RecordHeader* MmapV1ExtentManager::_recordForV1( const DiskLoc& loc ) const {
- loc.assertOk();
- const DataFile* df = _getOpenFile( loc.a() );
+ // Returns the last file added
+ return _files[allocFileId];
+}
- int ofs = loc.getOfs();
- if ( ofs < DataFileHeader::HeaderSize ) {
- df->badOfs(ofs); // will msgassert - external call to keep out of the normal code path
- }
+int MmapV1ExtentManager::numFiles() const {
+ return _files.size();
+}
- return reinterpret_cast<MmapV1RecordHeader*>( df->p() + ofs );
+long long MmapV1ExtentManager::fileSize() const {
+ long long size = 0;
+ for (int n = 0; boost::filesystem::exists(_fileName(n)); n++) {
+ size += boost::filesystem::file_size(_fileName(n));
}
- MmapV1RecordHeader* MmapV1ExtentManager::recordForV1( const DiskLoc& loc ) const {
- MmapV1RecordHeader* record = _recordForV1( loc );
- _recordAccessTracker->markAccessed( record );
- return record;
- }
+ return size;
+}
- std::unique_ptr<RecordFetcher> MmapV1ExtentManager::recordNeedsFetch(const DiskLoc& loc) const {
- if (loc.isNull()) return {};
- MmapV1RecordHeader* record = _recordForV1( loc );
+MmapV1RecordHeader* MmapV1ExtentManager::_recordForV1(const DiskLoc& loc) const {
+ loc.assertOk();
+ const DataFile* df = _getOpenFile(loc.a());
- // For testing: if failpoint is enabled we randomly request fetches without
- // going to the RecordAccessTracker.
- if ( MONGO_FAIL_POINT( recordNeedsFetchFail ) ) {
- needsFetchFailCounter.increment();
- if ( ( needsFetchFailCounter.get() % kNeedsFetchFailFreq ) == 0 ) {
- return stdx::make_unique<MmapV1RecordFetcher>( record );
- }
- }
+ int ofs = loc.getOfs();
+ if (ofs < DataFileHeader::HeaderSize) {
+ df->badOfs(ofs); // will msgassert - external call to keep out of the normal code path
+ }
- if ( !_recordAccessTracker->checkAccessedAndMark( record ) ) {
- return stdx::make_unique<MmapV1RecordFetcher>( record );
- }
+ return reinterpret_cast<MmapV1RecordHeader*>(df->p() + ofs);
+}
+
+MmapV1RecordHeader* MmapV1ExtentManager::recordForV1(const DiskLoc& loc) const {
+ MmapV1RecordHeader* record = _recordForV1(loc);
+ _recordAccessTracker->markAccessed(record);
+ return record;
+}
+std::unique_ptr<RecordFetcher> MmapV1ExtentManager::recordNeedsFetch(const DiskLoc& loc) const {
+ if (loc.isNull())
return {};
+ MmapV1RecordHeader* record = _recordForV1(loc);
+
+ // For testing: if failpoint is enabled we randomly request fetches without
+ // going to the RecordAccessTracker.
+ if (MONGO_FAIL_POINT(recordNeedsFetchFail)) {
+ needsFetchFailCounter.increment();
+ if ((needsFetchFailCounter.get() % kNeedsFetchFailFreq) == 0) {
+ return stdx::make_unique<MmapV1RecordFetcher>(record);
+ }
}
- DiskLoc MmapV1ExtentManager::extentLocForV1( const DiskLoc& loc ) const {
- MmapV1RecordHeader* record = recordForV1( loc );
- return DiskLoc( loc.a(), record->extentOfs() );
+ if (!_recordAccessTracker->checkAccessedAndMark(record)) {
+ return stdx::make_unique<MmapV1RecordFetcher>(record);
}
- Extent* MmapV1ExtentManager::extentForV1( const DiskLoc& loc ) const {
- DiskLoc extentLoc = extentLocForV1( loc );
- return getExtent( extentLoc );
- }
+ return {};
+}
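
The failpoint block above forces a fetch on every kNeedsFetchFailFreq'th call by bumping a shared counter and testing divisibility. The same sampling idiom in isolation (shouldForceFetch is a hypothetical stand-in; the real code uses Counter64):

#include <atomic>

// Returns true for every freq'th caller. The increment is atomic, so
// concurrent callers each observe a distinct counter value.
bool shouldForceFetch(std::atomic<long long>& counter, int freq) {
    const long long n = ++counter;
    return (n % freq) == 0;
}
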
- Extent* MmapV1ExtentManager::getExtent( const DiskLoc& loc, bool doSanityCheck ) const {
- loc.assertOk();
- Extent* e = reinterpret_cast<Extent*>( _getOpenFile( loc.a() )->p() + loc.getOfs() );
- if ( doSanityCheck )
- e->assertOk();
+DiskLoc MmapV1ExtentManager::extentLocForV1(const DiskLoc& loc) const {
+ MmapV1RecordHeader* record = recordForV1(loc);
+ return DiskLoc(loc.a(), record->extentOfs());
+}
- _recordAccessTracker->markAccessed( e );
+Extent* MmapV1ExtentManager::extentForV1(const DiskLoc& loc) const {
+ DiskLoc extentLoc = extentLocForV1(loc);
+ return getExtent(extentLoc);
+}
- return e;
- }
+Extent* MmapV1ExtentManager::getExtent(const DiskLoc& loc, bool doSanityCheck) const {
+ loc.assertOk();
+ Extent* e = reinterpret_cast<Extent*>(_getOpenFile(loc.a())->p() + loc.getOfs());
+ if (doSanityCheck)
+ e->assertOk();
- void _checkQuota( bool enforceQuota, int fileNo ) {
- if ( !enforceQuota )
- return;
+ _recordAccessTracker->markAccessed(e);
- if ( fileNo < mmapv1GlobalOptions.quotaFiles )
- return;
+ return e;
+}
- uasserted(12501, "quota exceeded");
- }
+void _checkQuota(bool enforceQuota, int fileNo) {
+ if (!enforceQuota)
+ return;
- int MmapV1ExtentManager::maxSize() const {
- return DataFile::maxSize() - DataFileHeader::HeaderSize - 16;
- }
+ if (fileNo < mmapv1GlobalOptions.quotaFiles)
+ return;
- DiskLoc MmapV1ExtentManager::_createExtentInFile( OperationContext* txn,
- int fileNo,
- DataFile* f,
- int size,
- bool enforceQuota ) {
+ uasserted(12501, "quota exceeded");
+}
- _checkQuota( enforceQuota, fileNo - 1 );
+int MmapV1ExtentManager::maxSize() const {
+ return DataFile::maxSize() - DataFileHeader::HeaderSize - 16;
+}
- massert( 10358, "bad new extent size", size >= minSize() && size <= maxSize() );
+DiskLoc MmapV1ExtentManager::_createExtentInFile(
+ OperationContext* txn, int fileNo, DataFile* f, int size, bool enforceQuota) {
+ _checkQuota(enforceQuota, fileNo - 1);
- DiskLoc loc = f->allocExtentArea( txn, size );
- loc.assertOk();
+ massert(10358, "bad new extent size", size >= minSize() && size <= maxSize());
- Extent *e = getExtent( loc, false );
- verify( e );
+ DiskLoc loc = f->allocExtentArea(txn, size);
+ loc.assertOk();
- *txn->recoveryUnit()->writing(&e->magic) = Extent::extentSignature;
- *txn->recoveryUnit()->writing(&e->myLoc) = loc;
- *txn->recoveryUnit()->writing(&e->length) = size;
+ Extent* e = getExtent(loc, false);
+ verify(e);
- return loc;
- }
+ *txn->recoveryUnit()->writing(&e->magic) = Extent::extentSignature;
+ *txn->recoveryUnit()->writing(&e->myLoc) = loc;
+ *txn->recoveryUnit()->writing(&e->length) = size;
+ return loc;
+}
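
Note how every on-disk store above goes through txn->recoveryUnit()->writing(ptr): the location is declared to the recovery/journal layer first, and the assignment then happens through the returned pointer. A toy sketch of that declare-then-assign idiom (ToyRecoveryUnit is illustrative only, not the dur:: implementation):

#include <vector>

class ToyRecoveryUnit {
public:
    // Record the address as a pending write intent, then hand the pointer
    // back so the caller performs the store through it.
    template <typename T>
    T* writing(T* p) {
        _intents.push_back(p);  // a real journal would also snapshot the bytes
        return p;
    }

private:
    std::vector<void*> _intents;
};

// Usage, mirroring the extent initialization above:
//   *ru.writing(&e->magic) = Extent::extentSignature;
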
- DiskLoc MmapV1ExtentManager::_createExtent( OperationContext* txn,
- int size,
- bool enforceQuota ) {
- size = quantizeExtentSize( size );
- if ( size > maxSize() )
- size = maxSize();
+DiskLoc MmapV1ExtentManager::_createExtent(OperationContext* txn, int size, bool enforceQuota) {
+ size = quantizeExtentSize(size);
- verify( size < DataFile::maxSize() );
+ if (size > maxSize())
+ size = maxSize();
- for ( int i = numFiles() - 1; i >= 0; i-- ) {
- DataFile* f = _getOpenFile(i);
- invariant(f);
+ verify(size < DataFile::maxSize());
- if ( f->getHeader()->unusedLength >= size ) {
- return _createExtentInFile( txn, i, f, size, enforceQuota );
- }
- }
+ for (int i = numFiles() - 1; i >= 0; i--) {
+ DataFile* f = _getOpenFile(i);
+ invariant(f);
- _checkQuota( enforceQuota, numFiles() );
+ if (f->getHeader()->unusedLength >= size) {
+ return _createExtentInFile(txn, i, f, size, enforceQuota);
+ }
+ }
- // no space in an existing file
- // allocate files until we either get one big enough or hit maxSize
- for ( int i = 0; i < 8; i++ ) {
- DataFile* f = _addAFile( txn, size, false );
+ _checkQuota(enforceQuota, numFiles());
- if ( f->getHeader()->unusedLength >= size ) {
- return _createExtentInFile( txn, numFiles() - 1, f, size, enforceQuota );
- }
+ // no space in an existing file
+ // allocate files until we either get one big enough or hit maxSize
+ for (int i = 0; i < 8; i++) {
+ DataFile* f = _addAFile(txn, size, false);
+ if (f->getHeader()->unusedLength >= size) {
+ return _createExtentInFile(txn, numFiles() - 1, f, size, enforceQuota);
}
-
- // callers don't check for null return code, so assert
- msgasserted(14810, "couldn't allocate space for a new extent" );
}
- DiskLoc MmapV1ExtentManager::_allocFromFreeList( OperationContext* txn,
- int approxSize,
- bool capped ) {
- // setup extent constraints
-
- int low, high;
- if ( capped ) {
- // be strict about the size
- low = approxSize;
- if ( low > 2048 ) low -= 256;
- high = (int) (approxSize * 1.05) + 256;
- }
- else {
- low = (int) (approxSize * 0.8);
- high = (int) (approxSize * 1.4);
- }
- if ( high <= 0 ) {
- // overflowed
- high = max(approxSize, maxSize());
- }
- if ( high <= minSize() ) {
- // the minimum extent size is 4097
- high = minSize() + 1;
- }
-
- // scan free list looking for something suitable
+ // callers don't check for null return code, so assert
+ msgasserted(14810, "couldn't allocate space for a new extent");
+}
- int n = 0;
- Extent *best = 0;
- int bestDiff = 0x7fffffff;
- {
- Timer t;
- DiskLoc L = _getFreeListStart();
- while( !L.isNull() ) {
- Extent* e = getExtent( L );
- if ( e->length >= low && e->length <= high ) {
- int diff = abs(e->length - approxSize);
- if ( diff < bestDiff ) {
- bestDiff = diff;
- best = e;
- if ( ((double) diff) / approxSize < 0.1 ) {
- // close enough
- break;
- }
- if ( t.seconds() >= 2 ) {
- // have spent lots of time in write lock, and we are in [low,high], so close enough
- // could come into play if extent freelist is very long
- break;
- }
+DiskLoc MmapV1ExtentManager::_allocFromFreeList(OperationContext* txn,
+ int approxSize,
+ bool capped) {
+ // setup extent constraints
+
+ int low, high;
+ if (capped) {
+ // be strict about the size
+ low = approxSize;
+ if (low > 2048)
+ low -= 256;
+ high = (int)(approxSize * 1.05) + 256;
+ } else {
+ low = (int)(approxSize * 0.8);
+ high = (int)(approxSize * 1.4);
+ }
+ if (high <= 0) {
+ // overflowed
+ high = max(approxSize, maxSize());
+ }
+ if (high <= minSize()) {
+ // the minimum extent size is 4097
+ high = minSize() + 1;
+ }
+
+ // scan free list looking for something suitable
+
+ int n = 0;
+ Extent* best = 0;
+ int bestDiff = 0x7fffffff;
+ {
+ Timer t;
+ DiskLoc L = _getFreeListStart();
+ while (!L.isNull()) {
+ Extent* e = getExtent(L);
+ if (e->length >= low && e->length <= high) {
+ int diff = abs(e->length - approxSize);
+ if (diff < bestDiff) {
+ bestDiff = diff;
+ best = e;
+ if (((double)diff) / approxSize < 0.1) {
+ // close enough
+ break;
}
- else {
- OCCASIONALLY {
- if ( high < 64 * 1024 && t.seconds() >= 2 ) {
- // be less picky if it is taking a long time
- high = 64 * 1024;
- }
+ if (t.seconds() >= 2) {
+                        // have spent lots of time in write lock, and we are in [low,high],
+                        // so close enough; could come into play if extent freelist is very long
+ break;
+ }
+ } else {
+ OCCASIONALLY {
+ if (high < 64 * 1024 && t.seconds() >= 2) {
+ // be less picky if it is taking a long time
+ high = 64 * 1024;
}
}
}
- L = e->xnext;
- ++n;
- }
- if ( t.seconds() >= 10 ) {
- log() << "warning: slow scan in allocFromFreeList (in write lock)" << endl;
}
+ L = e->xnext;
+ ++n;
}
+ if (t.seconds() >= 10) {
+ log() << "warning: slow scan in allocFromFreeList (in write lock)" << endl;
+ }
+ }
- if ( n > 128 ) { LOG( n < 512 ? 1 : 0 ) << "warning: newExtent " << n << " scanned\n"; }
-
- if ( !best )
- return DiskLoc();
-
- // remove from the free list
- if ( !best->xprev.isNull() )
- *txn->recoveryUnit()->writing(&getExtent( best->xprev )->xnext) = best->xnext;
- if ( !best->xnext.isNull() )
- *txn->recoveryUnit()->writing(&getExtent( best->xnext )->xprev) = best->xprev;
- if ( _getFreeListStart() == best->myLoc )
- _setFreeListStart( txn, best->xnext );
- if ( _getFreeListEnd() == best->myLoc )
- _setFreeListEnd( txn, best->xprev );
-
- return best->myLoc;
+ if (n > 128) {
+ LOG(n < 512 ? 1 : 0) << "warning: newExtent " << n << " scanned\n";
}
- DiskLoc MmapV1ExtentManager::allocateExtent(OperationContext* txn,
- bool capped,
- int size,
- bool enforceQuota) {
- Lock::ResourceLock rlk(txn->lockState(), _rid, MODE_X);
- bool fromFreeList = true;
- DiskLoc eloc = _allocFromFreeList( txn, size, capped );
- if ( eloc.isNull() ) {
- fromFreeList = false;
- eloc = _createExtent( txn, size, enforceQuota );
- }
+ if (!best)
+ return DiskLoc();
- invariant( !eloc.isNull() );
- invariant( eloc.isValid() );
+ // remove from the free list
+ if (!best->xprev.isNull())
+ *txn->recoveryUnit()->writing(&getExtent(best->xprev)->xnext) = best->xnext;
+ if (!best->xnext.isNull())
+ *txn->recoveryUnit()->writing(&getExtent(best->xnext)->xprev) = best->xprev;
+ if (_getFreeListStart() == best->myLoc)
+ _setFreeListStart(txn, best->xnext);
+ if (_getFreeListEnd() == best->myLoc)
+ _setFreeListEnd(txn, best->xprev);
- LOG(1) << "MmapV1ExtentManager::allocateExtent"
- << " desiredSize:" << size
- << " fromFreeList: " << fromFreeList
- << " eloc: " << eloc;
+ return best->myLoc;
+}
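
For concreteness, the [low, high] acceptance window computed at the top of _allocFromFreeList works out as follows: a non-capped request of approxSize = 100000 accepts extents in [80000, 139999] (the 1.4 multiplier truncates to just under 140000 in double arithmetic), while a capped request of the same size is held to the much tighter [99744, 105256]. A sketch of just that arithmetic, omitting the overflow and minSize() clamps (extentWindow is a hypothetical helper):

#include <utility>

std::pair<int, int> extentWindow(int approxSize, bool capped) {
    int low, high;
    if (capped) {
        // be strict: at most 256 bytes under, ~5% plus 256 bytes over
        low = approxSize;
        if (low > 2048)
            low -= 256;
        high = (int)(approxSize * 1.05) + 256;
    } else {
        low = (int)(approxSize * 0.8);   // 80% of requested
        high = (int)(approxSize * 1.4);  // 140% of requested
    }
    return {low, high};
}
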
- return eloc;
+DiskLoc MmapV1ExtentManager::allocateExtent(OperationContext* txn,
+ bool capped,
+ int size,
+ bool enforceQuota) {
+ Lock::ResourceLock rlk(txn->lockState(), _rid, MODE_X);
+ bool fromFreeList = true;
+ DiskLoc eloc = _allocFromFreeList(txn, size, capped);
+ if (eloc.isNull()) {
+ fromFreeList = false;
+ eloc = _createExtent(txn, size, enforceQuota);
}
- void MmapV1ExtentManager::freeExtent(OperationContext* txn, DiskLoc firstExt ) {
- Lock::ResourceLock rlk(txn->lockState(), _rid, MODE_X);
- Extent* e = getExtent( firstExt );
- txn->recoveryUnit()->writing( &e->xnext )->Null();
- txn->recoveryUnit()->writing( &e->xprev )->Null();
- txn->recoveryUnit()->writing( &e->firstRecord )->Null();
- txn->recoveryUnit()->writing( &e->lastRecord )->Null();
-
-
- if( _getFreeListStart().isNull() ) {
- _setFreeListStart( txn, firstExt );
- _setFreeListEnd( txn, firstExt );
- }
- else {
- DiskLoc a = _getFreeListStart();
- invariant( getExtent( a )->xprev.isNull() );
- *txn->recoveryUnit()->writing( &getExtent( a )->xprev ) = firstExt;
- *txn->recoveryUnit()->writing( &getExtent( firstExt )->xnext ) = a;
- _setFreeListStart( txn, firstExt );
- }
+ invariant(!eloc.isNull());
+ invariant(eloc.isValid());
- }
+ LOG(1) << "MmapV1ExtentManager::allocateExtent"
+ << " desiredSize:" << size << " fromFreeList: " << fromFreeList << " eloc: " << eloc;
- void MmapV1ExtentManager::freeExtents(OperationContext* txn, DiskLoc firstExt, DiskLoc lastExt) {
- Lock::ResourceLock rlk(txn->lockState(), _rid, MODE_X);
+ return eloc;
+}
- if ( firstExt.isNull() && lastExt.isNull() )
- return;
+void MmapV1ExtentManager::freeExtent(OperationContext* txn, DiskLoc firstExt) {
+ Lock::ResourceLock rlk(txn->lockState(), _rid, MODE_X);
+ Extent* e = getExtent(firstExt);
+ txn->recoveryUnit()->writing(&e->xnext)->Null();
+ txn->recoveryUnit()->writing(&e->xprev)->Null();
+ txn->recoveryUnit()->writing(&e->firstRecord)->Null();
+ txn->recoveryUnit()->writing(&e->lastRecord)->Null();
- {
- verify( !firstExt.isNull() && !lastExt.isNull() );
- Extent *f = getExtent( firstExt );
- Extent *l = getExtent( lastExt );
- verify( f->xprev.isNull() );
- verify( l->xnext.isNull() );
- verify( f==l || !f->xnext.isNull() );
- verify( f==l || !l->xprev.isNull() );
- }
- if( _getFreeListStart().isNull() ) {
- _setFreeListStart( txn, firstExt );
- _setFreeListEnd( txn, lastExt );
- }
- else {
- DiskLoc a = _getFreeListStart();
- invariant( getExtent( a )->xprev.isNull() );
- *txn->recoveryUnit()->writing( &getExtent( a )->xprev ) = lastExt;
- *txn->recoveryUnit()->writing( &getExtent( lastExt )->xnext ) = a;
- _setFreeListStart( txn, firstExt );
- }
+ if (_getFreeListStart().isNull()) {
+ _setFreeListStart(txn, firstExt);
+ _setFreeListEnd(txn, firstExt);
+ } else {
+ DiskLoc a = _getFreeListStart();
+ invariant(getExtent(a)->xprev.isNull());
+ *txn->recoveryUnit()->writing(&getExtent(a)->xprev) = firstExt;
+ *txn->recoveryUnit()->writing(&getExtent(firstExt)->xnext) = a;
+ _setFreeListStart(txn, firstExt);
}
+}
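
freeExtent above splices the extent onto the head of a doubly linked list threaded through the extents themselves, with freeListStart/freeListEnd kept in the first data file's header. The same prepend on plain pointers (Node and prependToFreeList are illustrative names; the real code routes every store through the recovery unit for journaling):

struct Node {
    Node* xprev = nullptr;
    Node* xnext = nullptr;
};

// head and tail model the freeListStart/freeListEnd header fields.
void prependToFreeList(Node*& head, Node*& tail, Node* ext) {
    ext->xprev = nullptr;
    ext->xnext = nullptr;
    if (head == nullptr) {
        head = tail = ext;  // empty list: the extent becomes both ends
    } else {
        head->xprev = ext;  // old head points back at the new extent
        ext->xnext = head;  // new extent points forward at the old head
        head = ext;         // freeListStart now references the new extent
    }
}
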
- DiskLoc MmapV1ExtentManager::_getFreeListStart() const {
- if ( _files.empty() )
- return DiskLoc();
- const DataFile* file = _getOpenFile(0);
- return file->header()->freeListStart;
- }
+void MmapV1ExtentManager::freeExtents(OperationContext* txn, DiskLoc firstExt, DiskLoc lastExt) {
+ Lock::ResourceLock rlk(txn->lockState(), _rid, MODE_X);
- DiskLoc MmapV1ExtentManager::_getFreeListEnd() const {
- if ( _files.empty() )
- return DiskLoc();
- const DataFile* file = _getOpenFile(0);
- return file->header()->freeListEnd;
- }
+ if (firstExt.isNull() && lastExt.isNull())
+ return;
- void MmapV1ExtentManager::_setFreeListStart( OperationContext* txn, DiskLoc loc ) {
- invariant( !_files.empty() );
- DataFile* file = _files[0];
- *txn->recoveryUnit()->writing( &file->header()->freeListStart ) = loc;
+ {
+ verify(!firstExt.isNull() && !lastExt.isNull());
+ Extent* f = getExtent(firstExt);
+ Extent* l = getExtent(lastExt);
+ verify(f->xprev.isNull());
+ verify(l->xnext.isNull());
+ verify(f == l || !f->xnext.isNull());
+ verify(f == l || !l->xprev.isNull());
}
- void MmapV1ExtentManager::_setFreeListEnd( OperationContext* txn, DiskLoc loc ) {
- invariant( !_files.empty() );
- DataFile* file = _files[0];
- *txn->recoveryUnit()->writing( &file->header()->freeListEnd ) = loc;
+ if (_getFreeListStart().isNull()) {
+ _setFreeListStart(txn, firstExt);
+ _setFreeListEnd(txn, lastExt);
+ } else {
+ DiskLoc a = _getFreeListStart();
+ invariant(getExtent(a)->xprev.isNull());
+ *txn->recoveryUnit()->writing(&getExtent(a)->xprev) = lastExt;
+ *txn->recoveryUnit()->writing(&getExtent(lastExt)->xnext) = a;
+ _setFreeListStart(txn, firstExt);
}
+}
- void MmapV1ExtentManager::freeListStats(OperationContext* txn,
- int* numExtents,
- int64_t* totalFreeSizeBytes) const {
- Lock::ResourceLock rlk(txn->lockState(), _rid, MODE_S);
+DiskLoc MmapV1ExtentManager::_getFreeListStart() const {
+ if (_files.empty())
+ return DiskLoc();
+ const DataFile* file = _getOpenFile(0);
+ return file->header()->freeListStart;
+}
- invariant(numExtents);
- invariant(totalFreeSizeBytes);
+DiskLoc MmapV1ExtentManager::_getFreeListEnd() const {
+ if (_files.empty())
+ return DiskLoc();
+ const DataFile* file = _getOpenFile(0);
+ return file->header()->freeListEnd;
+}
- *numExtents = 0;
- *totalFreeSizeBytes = 0;
+void MmapV1ExtentManager::_setFreeListStart(OperationContext* txn, DiskLoc loc) {
+ invariant(!_files.empty());
+ DataFile* file = _files[0];
+ *txn->recoveryUnit()->writing(&file->header()->freeListStart) = loc;
+}
- DiskLoc a = _getFreeListStart();
- while( !a.isNull() ) {
- Extent *e = getExtent( a );
- (*numExtents)++;
- (*totalFreeSizeBytes) += e->length;
- a = e->xnext;
- }
+void MmapV1ExtentManager::_setFreeListEnd(OperationContext* txn, DiskLoc loc) {
+ invariant(!_files.empty());
+ DataFile* file = _files[0];
+ *txn->recoveryUnit()->writing(&file->header()->freeListEnd) = loc;
+}
- }
+void MmapV1ExtentManager::freeListStats(OperationContext* txn,
+ int* numExtents,
+ int64_t* totalFreeSizeBytes) const {
+ Lock::ResourceLock rlk(txn->lockState(), _rid, MODE_S);
- void MmapV1ExtentManager::printFreeList() const {
- log() << "dump freelist " << _dbname << endl;
+ invariant(numExtents);
+ invariant(totalFreeSizeBytes);
- DiskLoc a = _getFreeListStart();
- while( !a.isNull() ) {
- Extent *e = getExtent( a );
- log() << " extent " << a.toString()
- << " len:" << e->length
- << " prev:" << e->xprev.toString() << endl;
- a = e->xnext;
- }
+ *numExtents = 0;
+ *totalFreeSizeBytes = 0;
- log() << "end freelist" << endl;
+ DiskLoc a = _getFreeListStart();
+ while (!a.isNull()) {
+ Extent* e = getExtent(a);
+ (*numExtents)++;
+ (*totalFreeSizeBytes) += e->length;
+ a = e->xnext;
}
+}
- namespace {
- class CacheHintMadvise : public ExtentManager::CacheHint {
- public:
- CacheHintMadvise(void *p, unsigned len, MAdvise::Advice a)
- : _advice( p, len, a ) {
- }
- private:
- MAdvise _advice;
- };
- }
+void MmapV1ExtentManager::printFreeList() const {
+ log() << "dump freelist " << _dbname << endl;
- ExtentManager::CacheHint* MmapV1ExtentManager::cacheHint( const DiskLoc& extentLoc,
- const ExtentManager::HintType& hint ) {
- invariant ( hint == Sequential );
- Extent* e = getExtent( extentLoc );
- return new CacheHintMadvise( reinterpret_cast<void*>( e ),
- e->length,
- MAdvise::Sequential );
+ DiskLoc a = _getFreeListStart();
+ while (!a.isNull()) {
+ Extent* e = getExtent(a);
+ log() << " extent " << a.toString() << " len:" << e->length
+ << " prev:" << e->xprev.toString() << endl;
+ a = e->xnext;
}
- MmapV1ExtentManager::FilesArray::~FilesArray() {
- for (int i = 0; i < size(); i++) {
- delete _files[i];
- }
- }
+ log() << "end freelist" << endl;
+}
- void MmapV1ExtentManager::FilesArray::push_back(DataFile* val) {
- stdx::lock_guard<stdx::mutex> lk(_writersMutex);
- const int n = _size.load();
- invariant(n < DiskLoc::MaxFiles);
- // Note ordering: _size update must come after updating the _files array
- _files[n] = val;
- _size.store(n + 1);
- }
+namespace {
+class CacheHintMadvise : public ExtentManager::CacheHint {
+public:
+ CacheHintMadvise(void* p, unsigned len, MAdvise::Advice a) : _advice(p, len, a) {}
+
+private:
+ MAdvise _advice;
+};
+}
- DataFileVersion MmapV1ExtentManager::getFileFormat(OperationContext* txn) const {
- if ( numFiles() == 0 )
- return DataFileVersion(0, 0);
+ExtentManager::CacheHint* MmapV1ExtentManager::cacheHint(const DiskLoc& extentLoc,
+ const ExtentManager::HintType& hint) {
+ invariant(hint == Sequential);
+ Extent* e = getExtent(extentLoc);
+ return new CacheHintMadvise(reinterpret_cast<void*>(e), e->length, MAdvise::Sequential);
+}
- // We explicitly only look at the first file.
- return _getOpenFile(0)->getHeader()->version;
+MmapV1ExtentManager::FilesArray::~FilesArray() {
+ for (int i = 0; i < size(); i++) {
+ delete _files[i];
}
+}
- void MmapV1ExtentManager::setFileFormat(OperationContext* txn, DataFileVersion newVersion) {
- invariant(numFiles() > 0);
+void MmapV1ExtentManager::FilesArray::push_back(DataFile* val) {
+ stdx::lock_guard<stdx::mutex> lk(_writersMutex);
+ const int n = _size.load();
+ invariant(n < DiskLoc::MaxFiles);
+ // Note ordering: _size update must come after updating the _files array
+ _files[n] = val;
+ _size.store(n + 1);
+}
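
push_back above stores the new DataFile* into the slot before bumping _size, which is exactly what lets readers index the array without taking _writersMutex. A self-contained sketch of that append-only publication pattern, with the acquire/release ordering spelled out explicitly (the original relies on AtomicInt32's stronger default ordering):

#include <atomic>
#include <cassert>
#include <mutex>

template <typename T, int MaxSlots>
class AppendOnlyArray {
public:
    // Safe to call without the mutex: the size is loaded before the slot is
    // read, and the writer published the slot before bumping the size.
    T* operator[](int n) const {
        assert(n >= 0 && n < size());
        return _slots[n];
    }

    int size() const {
        return _size.load(std::memory_order_acquire);
    }

    void push_back(T* val) {
        std::lock_guard<std::mutex> lk(_writersMutex);
        const int n = _size.load(std::memory_order_relaxed);
        assert(n < MaxSlots);
        _slots[n] = val;                                // publish the slot...
        _size.store(n + 1, std::memory_order_release);  // ...then its visibility
    }

private:
    std::mutex _writersMutex;
    std::atomic<int> _size{0};
    T* _slots[MaxSlots];
};
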
- DataFile* df = _getOpenFile(0);
- invariant(df);
+DataFileVersion MmapV1ExtentManager::getFileFormat(OperationContext* txn) const {
+ if (numFiles() == 0)
+ return DataFileVersion(0, 0);
- *txn->recoveryUnit()->writing(&df->getHeader()->version) = newVersion;
- }
+ // We explicitly only look at the first file.
+ return _getOpenFile(0)->getHeader()->version;
+}
+
+void MmapV1ExtentManager::setFileFormat(OperationContext* txn, DataFileVersion newVersion) {
+ invariant(numFiles() > 0);
+
+ DataFile* df = _getOpenFile(0);
+ invariant(df);
+
+ *txn->recoveryUnit()->writing(&df->getHeader()->version) = newVersion;
+}
}
diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_extent_manager.h b/src/mongo/db/storage/mmap_v1/mmap_v1_extent_manager.h
index 8253d0f87a3..1f7a0963aa1 100644
--- a/src/mongo/db/storage/mmap_v1/mmap_v1_extent_manager.h
+++ b/src/mongo/db/storage/mmap_v1/mmap_v1_extent_manager.h
@@ -45,204 +45,198 @@
namespace mongo {
- class DataFile;
- class DataFileVersion;
- class MmapV1RecordHeader;
- class OperationContext;
+class DataFile;
+class DataFileVersion;
+class MmapV1RecordHeader;
+class OperationContext;
- struct Extent;
+struct Extent;
+/**
+ * ExtentManager basics
+ * - one per database
+ * - responsible for managing <db>.# files
+ * - NOT responsible for .ns file
+ * - gives out extents
+ * - responsible for figuring out how to get a new extent
+ * - can use any method it wants to do so
+ * - this structure is NOT stored on disk
+ * - this class is thread safe, except as indicated below
+ *
+ * Implementation:
+ * - ExtentManager holds a preallocated list of DataFile
+ * - files will not be removed from the EM, so _files access can be lock-free
+ * - extent size and loc are immutable
+ * - Any non-const public operations on an ExtentManager will acquire a MODE_X lock on its
+ *   RESOURCE_MMAPv1_EXTENT_MANAGER resource from the lock-manager, which will extend the
+ *   lock's life through WriteUnitOfWorks that might need rollback. Private methods will only
+ *   be called from public ones.
+ */
+class MmapV1ExtentManager : public ExtentManager {
+ MONGO_DISALLOW_COPYING(MmapV1ExtentManager);
+
+public:
/**
- * ExtentManager basics
- * - one per database
- * - responsible for managing <db>.# files
- * - NOT responsible for .ns file
- * - gives out extents
- * - responsible for figuring out how to get a new extent
- * - can use any method it wants to do so
- * - this structure is NOT stored on disk
- * - this class is thread safe, except as indicated below
- *
- * Implementation:
- * - ExtentManager holds a preallocated list of DataFile
- * - files will not be removed from the EM, so _files access can be lock-free
- * - extent size and loc are immutable
- * - Any non-const public operations on an ExtentManager will acquire an MODE_X lock on its
- * RESOURCE_MMAPv1_EXTENT_MANAGER resource from the lock-manager, which will extend life
- * to during WriteUnitOfWorks that might need rollback. Private methods will only
- * be called from public ones.
+ * @param freeListDetails this is a reference into the .ns file
+ * while a bit odd, this is not a layer violation as extents
+ * are a peer to the .ns file, without any layering
*/
- class MmapV1ExtentManager : public ExtentManager {
- MONGO_DISALLOW_COPYING( MmapV1ExtentManager );
- public:
- /**
- * @param freeListDetails this is a reference into the .ns file
- * while a bit odd, this is not a layer violation as extents
- * are a peer to the .ns file, without any layering
- */
- MmapV1ExtentManager(StringData dbname, StringData path,
- bool directoryPerDB);
+ MmapV1ExtentManager(StringData dbname, StringData path, bool directoryPerDB);
- /**
- * opens all current files, not thread safe
- */
- Status init(OperationContext* txn);
+ /**
+ * opens all current files, not thread safe
+ */
+ Status init(OperationContext* txn);
- int numFiles() const;
- long long fileSize() const;
+ int numFiles() const;
+ long long fileSize() const;
- // must call Extent::reuse on the returned extent
- DiskLoc allocateExtent( OperationContext* txn,
- bool capped,
- int size,
- bool enforceQuota );
+ // must call Extent::reuse on the returned extent
+ DiskLoc allocateExtent(OperationContext* txn, bool capped, int size, bool enforceQuota);
- /**
- * firstExt has to be == lastExt or a chain
- */
- void freeExtents( OperationContext* txn, DiskLoc firstExt, DiskLoc lastExt );
+ /**
+ * firstExt has to be == lastExt or a chain
+ */
+ void freeExtents(OperationContext* txn, DiskLoc firstExt, DiskLoc lastExt);
- /**
- * frees a single extent
- * ignores all fields in the Extent except: magic, myLoc, length
- */
- void freeExtent( OperationContext* txn, DiskLoc extent );
+ /**
+ * frees a single extent
+ * ignores all fields in the Extent except: magic, myLoc, length
+ */
+ void freeExtent(OperationContext* txn, DiskLoc extent);
- // For debug only: not thread safe
- void printFreeList() const;
+ // For debug only: not thread safe
+ void printFreeList() const;
- void freeListStats(OperationContext* txn,
- int* numExtents,
- int64_t* totalFreeSizeBytes) const;
+ void freeListStats(OperationContext* txn, int* numExtents, int64_t* totalFreeSizeBytes) const;
- /**
- * @param loc - has to be for a specific MmapV1RecordHeader
- * Note(erh): this sadly cannot be removed.
- * A MmapV1RecordHeader DiskLoc has an offset from a file, while a RecordStore really wants an offset
- * from an extent. This intrinsically links an original record store to the original extent
- * manager.
- */
- MmapV1RecordHeader* recordForV1( const DiskLoc& loc ) const;
+ /**
+ * @param loc - has to be for a specific MmapV1RecordHeader
+ * Note(erh): this sadly cannot be removed.
+     * An MmapV1RecordHeader DiskLoc has an offset from a file, while a RecordStore really
+     * wants an offset from an extent. This intrinsically links an original record store to
+     * the original extent manager.
+ */
+ MmapV1RecordHeader* recordForV1(const DiskLoc& loc) const;
- std::unique_ptr<RecordFetcher> recordNeedsFetch( const DiskLoc& loc ) const final;
+ std::unique_ptr<RecordFetcher> recordNeedsFetch(const DiskLoc& loc) const final;
- /**
- * @param loc - has to be for a specific MmapV1RecordHeader (not an Extent)
- * Note(erh) see comment on recordFor
- */
- Extent* extentForV1( const DiskLoc& loc ) const;
+ /**
+ * @param loc - has to be for a specific MmapV1RecordHeader (not an Extent)
+ * Note(erh) see comment on recordFor
+ */
+ Extent* extentForV1(const DiskLoc& loc) const;
- /**
- * @param loc - has to be for a specific MmapV1RecordHeader (not an Extent)
- * Note(erh) see comment on recordFor
- */
- DiskLoc extentLocForV1( const DiskLoc& loc ) const;
+ /**
+ * @param loc - has to be for a specific MmapV1RecordHeader (not an Extent)
+ * Note(erh) see comment on recordFor
+ */
+ DiskLoc extentLocForV1(const DiskLoc& loc) const;
- /**
- * @param loc - has to be for a specific Extent
- */
- Extent* getExtent( const DiskLoc& loc, bool doSanityCheck = true ) const;
+ /**
+ * @param loc - has to be for a specific Extent
+ */
+ Extent* getExtent(const DiskLoc& loc, bool doSanityCheck = true) const;
- /**
- * Not thread safe, requires a database exclusive lock
- */
- DataFileVersion getFileFormat(OperationContext* txn) const;
- void setFileFormat(OperationContext* txn, DataFileVersion newVersion);
+ /**
+ * Not thread safe, requires a database exclusive lock
+ */
+ DataFileVersion getFileFormat(OperationContext* txn) const;
+ void setFileFormat(OperationContext* txn, DataFileVersion newVersion);
- const DataFile* getOpenFile( int n ) const { return _getOpenFile( n ); }
+ const DataFile* getOpenFile(int n) const {
+ return _getOpenFile(n);
+ }
- virtual int maxSize() const;
+ virtual int maxSize() const;
- virtual CacheHint* cacheHint( const DiskLoc& extentLoc, const HintType& hint );
+ virtual CacheHint* cacheHint(const DiskLoc& extentLoc, const HintType& hint);
- private:
- /**
- * will return NULL if nothing suitable in free list
- */
- DiskLoc _allocFromFreeList( OperationContext* txn, int approxSize, bool capped );
+private:
+ /**
+ * will return NULL if nothing suitable in free list
+ */
+ DiskLoc _allocFromFreeList(OperationContext* txn, int approxSize, bool capped);
- /* allocate a new Extent, does not check free list
- */
- DiskLoc _createExtent( OperationContext* txn, int approxSize, bool enforceQuota );
+ /* allocate a new Extent, does not check free list
+ */
+ DiskLoc _createExtent(OperationContext* txn, int approxSize, bool enforceQuota);
- DataFile* _addAFile( OperationContext* txn, int sizeNeeded, bool preallocateNextFile );
+ DataFile* _addAFile(OperationContext* txn, int sizeNeeded, bool preallocateNextFile);
- /**
- * Shared record retrieval logic used by the public recordForV1() and likelyInPhysicalMem()
- * above.
- */
- MmapV1RecordHeader* _recordForV1( const DiskLoc& loc ) const;
+ /**
+ * Shared record retrieval logic used by the public recordForV1() and likelyInPhysicalMem()
+ * above.
+ */
+ MmapV1RecordHeader* _recordForV1(const DiskLoc& loc) const;
- DiskLoc _getFreeListStart() const;
- DiskLoc _getFreeListEnd() const;
- void _setFreeListStart( OperationContext* txn, DiskLoc loc );
- void _setFreeListEnd( OperationContext* txn, DiskLoc loc );
+ DiskLoc _getFreeListStart() const;
+ DiskLoc _getFreeListEnd() const;
+ void _setFreeListStart(OperationContext* txn, DiskLoc loc);
+ void _setFreeListEnd(OperationContext* txn, DiskLoc loc);
- const DataFile* _getOpenFile(int fileId) const;
- DataFile* _getOpenFile(int fileId);
+ const DataFile* _getOpenFile(int fileId) const;
+ DataFile* _getOpenFile(int fileId);
- DiskLoc _createExtentInFile( OperationContext* txn,
- int fileNo,
- DataFile* f,
- int size,
- bool enforceQuota );
+ DiskLoc _createExtentInFile(
+ OperationContext* txn, int fileNo, DataFile* f, int size, bool enforceQuota);
- boost::filesystem::path _fileName(int n) const;
+ boost::filesystem::path _fileName(int n) const;
-// -----
+ // -----
- const std::string _dbname; // i.e. "test"
- const std::string _path; // i.e. "/data/db"
- const bool _directoryPerDB;
- const ResourceId _rid;
+ const std::string _dbname; // i.e. "test"
+ const std::string _path; // i.e. "/data/db"
+ const bool _directoryPerDB;
+ const ResourceId _rid;
- // This reference points into the MMAPv1 engine and is only valid as long as the
- // engine is valid. Not owned here.
- RecordAccessTracker* _recordAccessTracker;
+ // This reference points into the MMAPv1 engine and is only valid as long as the
+ // engine is valid. Not owned here.
+ RecordAccessTracker* _recordAccessTracker;
+
+ /**
+ * Simple wrapper around an array object to allow append-only modification of the array,
+ * as well as concurrent read-accesses. This class has a minimal interface to keep
+ * implementation simple and easy to modify.
+ */
+ class FilesArray {
+ public:
+ FilesArray() : _size(0) {}
+ ~FilesArray();
+
+ /**
+ * Returns file at location 'n' in the array, with 'n' less than number of files added.
+ * Will always return the same pointer for a given file.
+ */
+ DataFile* operator[](int n) const {
+ invariant(n >= 0 && n < size());
+ return _files[n];
+ }
/**
- * Simple wrapper around an array object to allow append-only modification of the array,
- * as well as concurrent read-accesses. This class has a minimal interface to keep
- * implementation simple and easy to modify.
+ * Returns true iff no files were added
*/
- class FilesArray {
- public:
- FilesArray() : _size(0) { }
- ~FilesArray();
-
- /**
- * Returns file at location 'n' in the array, with 'n' less than number of files added.
- * Will always return the same pointer for a given file.
- */
- DataFile* operator[](int n) const {
- invariant(n >= 0 && n < size());
- return _files[n];
- }
-
- /**
- * Returns true iff no files were added
- */
- bool empty() const {
- return size() == 0;
- }
-
- /**
- * Returns number of files added to the array
- */
- int size() const {
- return _size.load();
- }
-
- // Appends val to the array, taking ownership of its pointer
- void push_back(DataFile* val);
-
- private:
- stdx::mutex _writersMutex;
- AtomicInt32 _size; // number of files in the array
- DataFile* _files[DiskLoc::MaxFiles];
- };
-
- FilesArray _files;
+ bool empty() const {
+ return size() == 0;
+ }
+
+ /**
+ * Returns number of files added to the array
+ */
+ int size() const {
+ return _size.load();
+ }
+
+ // Appends val to the array, taking ownership of its pointer
+ void push_back(DataFile* val);
+
+ private:
+ stdx::mutex _writersMutex;
+ AtomicInt32 _size; // number of files in the array
+ DataFile* _files[DiskLoc::MaxFiles];
};
+
+ FilesArray _files;
+};
}
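
The FilesArray above supports lock-free readers against serialized writers: push_back() takes _writersMutex, writes the new slot, and only then bumps the atomic _size, so any index below size() refers to a fully written entry. A minimal sketch of that publish-after-write pattern, using standard C++ atomics in place of AtomicInt32 and stdx::mutex (the names here are illustrative, not the real class):

    #include <atomic>
    #include <cassert>
    #include <mutex>

    template <typename T, int MaxFiles>
    class AppendOnlyArray {
    public:
        AppendOnlyArray() : _size(0) {}

        // Safe concurrently with push_back(): the slot is fully written
        // before _size is published, so every index < size() is valid.
        T* operator[](int n) const {
            assert(n >= 0 && n < size());
            return _files[n];
        }

        int size() const {
            return _size.load(std::memory_order_acquire);
        }

        void push_back(T* val) {
            std::lock_guard<std::mutex> lk(_writersMutex);  // one writer at a time
            int n = _size.load(std::memory_order_relaxed);
            assert(n < MaxFiles);
            _files[n] = val;                                // write the slot first...
            _size.store(n + 1, std::memory_order_release);  // ...then publish it
        }

    private:
        std::mutex _writersMutex;
        std::atomic<int> _size;  // number of published entries
        T* _files[MaxFiles];
    };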
diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_init.cpp b/src/mongo/db/storage/mmap_v1/mmap_v1_init.cpp
index 29fb1bc8c97..920a6d89182 100644
--- a/src/mongo/db/storage/mmap_v1/mmap_v1_init.cpp
+++ b/src/mongo/db/storage/mmap_v1/mmap_v1_init.cpp
@@ -38,46 +38,44 @@
namespace mongo {
- namespace {
+namespace {
- class MMAPV1Factory : public StorageEngine::Factory {
- public:
- virtual ~MMAPV1Factory() { }
- virtual StorageEngine* create(const StorageGlobalParams& params,
- const StorageEngineLockFile& lockFile) const {
- return new MMAPV1Engine(lockFile);
- }
-
- virtual StringData getCanonicalName() const {
- return "mmapv1";
- }
+class MMAPV1Factory : public StorageEngine::Factory {
+public:
+ virtual ~MMAPV1Factory() {}
+ virtual StorageEngine* create(const StorageGlobalParams& params,
+ const StorageEngineLockFile& lockFile) const {
+ return new MMAPV1Engine(lockFile);
+ }
- virtual Status validateMetadata(const StorageEngineMetadata& metadata,
- const StorageGlobalParams& params) const {
- Status status = metadata.validateStorageEngineOption(
- "directoryPerDB", params.directoryperdb);
- if (!status.isOK()) {
- return status;
- }
+ virtual StringData getCanonicalName() const {
+ return "mmapv1";
+ }
- return Status::OK();
- }
+ virtual Status validateMetadata(const StorageEngineMetadata& metadata,
+ const StorageGlobalParams& params) const {
+ Status status =
+ metadata.validateStorageEngineOption("directoryPerDB", params.directoryperdb);
+ if (!status.isOK()) {
+ return status;
+ }
- virtual BSONObj createMetadataOptions(const StorageGlobalParams& params) const {
- BSONObjBuilder builder;
- builder.appendBool("directoryPerDB", params.directoryperdb);
- return builder.obj();
- }
- };
+ return Status::OK();
+ }
- } // namespace
+ virtual BSONObj createMetadataOptions(const StorageGlobalParams& params) const {
+ BSONObjBuilder builder;
+ builder.appendBool("directoryPerDB", params.directoryperdb);
+ return builder.obj();
+ }
+};
- MONGO_INITIALIZER_WITH_PREREQUISITES(MMAPV1EngineInit,
- ("SetGlobalEnvironment"))
- (InitializerContext* context) {
+} // namespace
- getGlobalServiceContext()->registerStorageEngine("mmapv1", new MMAPV1Factory());
- return Status::OK();
- }
+MONGO_INITIALIZER_WITH_PREREQUISITES(MMAPV1EngineInit, ("SetGlobalEnvironment"))
+(InitializerContext* context) {
+ getGlobalServiceContext()->registerStorageEngine("mmapv1", new MMAPV1Factory());
+ return Status::OK();
+}
} // namespace mongo
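
The initializer above is the engine's single hook into server startup: it registers an MMAPV1Factory under the canonical name "mmapv1", and the server later calls Factory::create() when that engine is selected. The shape of such a registry, reduced to a hypothetical sketch (this is not the real ServiceContext API):

    #include <map>
    #include <memory>
    #include <string>

    struct Engine { virtual ~Engine() = default; };

    struct Factory {
        virtual ~Factory() = default;
        virtual Engine* create() const = 0;  // mirrors StorageEngine::Factory::create()
    };

    class Registry {
    public:
        // Called once per engine at startup, like registerStorageEngine() above.
        void registerFactory(const std::string& name, std::unique_ptr<Factory> factory) {
            _factories[name] = std::move(factory);
        }

        // Instantiates the engine chosen by configuration, e.g. "mmapv1".
        Engine* create(const std::string& name) const {
            auto it = _factories.find(name);
            return it == _factories.end() ? nullptr : it->second->create();
        }

    private:
        std::map<std::string, std::unique_ptr<Factory>> _factories;
    };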
diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_init_test.cpp b/src/mongo/db/storage/mmap_v1/mmap_v1_init_test.cpp
index 62ecdde5aa1..d5323f1b398 100644
--- a/src/mongo/db/storage/mmap_v1/mmap_v1_init_test.cpp
+++ b/src/mongo/db/storage/mmap_v1/mmap_v1_init_test.cpp
@@ -38,93 +38,89 @@
namespace {
- using namespace mongo;
-
- class MMAPV1FactoryTest : public mongo::unittest::Test {
- private:
- virtual void setUp() {
- ServiceContext* globalEnv = getGlobalServiceContext();
- ASSERT_TRUE(globalEnv);
- ASSERT_TRUE(getGlobalServiceContext()->isRegisteredStorageEngine("mmapv1"));
- std::unique_ptr<StorageFactoriesIterator> sfi(getGlobalServiceContext()->
- makeStorageFactoriesIterator());
- ASSERT_TRUE(sfi);
- bool found = false;
- while (sfi->more()) {
- const StorageEngine::Factory* currentFactory = sfi->next();
- if (currentFactory->getCanonicalName() == "mmapv1") {
- found = true;
- factory = currentFactory;
- break;
- }
+using namespace mongo;
+
+class MMAPV1FactoryTest : public mongo::unittest::Test {
+private:
+ virtual void setUp() {
+ ServiceContext* globalEnv = getGlobalServiceContext();
+ ASSERT_TRUE(globalEnv);
+ ASSERT_TRUE(getGlobalServiceContext()->isRegisteredStorageEngine("mmapv1"));
+ std::unique_ptr<StorageFactoriesIterator> sfi(
+ getGlobalServiceContext()->makeStorageFactoriesIterator());
+ ASSERT_TRUE(sfi);
+ bool found = false;
+ while (sfi->more()) {
+ const StorageEngine::Factory* currentFactory = sfi->next();
+ if (currentFactory->getCanonicalName() == "mmapv1") {
+ found = true;
+ factory = currentFactory;
+ break;
}
- ASSERT_TRUE(found);
- }
-
- virtual void tearDown() {
- factory = NULL;
- }
-
- protected:
- const StorageEngine::Factory* factory;
- };
-
- void _testValidateMetadata(const StorageEngine::Factory* factory,
- const BSONObj& metadataOptions,
- bool directoryPerDB,
- ErrorCodes::Error expectedCode) {
- // It is fine to specify an invalid data directory for the metadata
- // as long as we do not invoke read() or write().
- StorageEngineMetadata metadata("no_such_directory");
- metadata.setStorageEngineOptions(metadataOptions);
-
- StorageGlobalParams storageOptions;
- storageOptions.directoryperdb = directoryPerDB;
-
- Status status = factory->validateMetadata(metadata, storageOptions);
- if (expectedCode != status.code()) {
- FAIL(str::stream()
- << "Unexpected StorageEngine::Factory::validateMetadata result. Expected: "
- << ErrorCodes::errorString(expectedCode) << " but got "
- << status.toString()
- << " instead. metadataOptions: " << metadataOptions
- << "; directoryPerDB: " << directoryPerDB);
}
+ ASSERT_TRUE(found);
}
- // Do not validate fields that are not present in metadata.
- TEST_F(MMAPV1FactoryTest, ValidateMetadataEmptyOptions) {
- _testValidateMetadata(factory, BSONObj(), false, ErrorCodes::OK);
- _testValidateMetadata(factory, BSONObj(), true, ErrorCodes::OK);
- }
-
- TEST_F(MMAPV1FactoryTest, ValidateMetadataDirectoryPerDB) {
- _testValidateMetadata(factory, fromjson("{directoryPerDB: 123}"), false,
- ErrorCodes::FailedToParse);
- _testValidateMetadata(factory, fromjson("{directoryPerDB: false}"), false,
- ErrorCodes::OK);
- _testValidateMetadata(factory, fromjson("{directoryPerDB: false}"), true,
- ErrorCodes::InvalidOptions);
- _testValidateMetadata(factory, fromjson("{directoryPerDB: true}"), false,
- ErrorCodes::InvalidOptions);
- _testValidateMetadata(factory, fromjson("{directoryPerDB: true}"), true,
- ErrorCodes::OK);
- }
-
- void _testCreateMetadataOptions(const StorageEngine::Factory* factory,
- bool directoryPerDB) {
- StorageGlobalParams storageOptions;
- storageOptions.directoryperdb = directoryPerDB;
-
- BSONObj metadataOptions = factory->createMetadataOptions(storageOptions);
- BSONElement directoryPerDBElement = metadataOptions.getField("directoryPerDB");
- ASSERT_TRUE(directoryPerDBElement.isBoolean());
- ASSERT_EQUALS(directoryPerDB, directoryPerDBElement.boolean());
+ virtual void tearDown() {
+ factory = NULL;
}
- TEST_F(MMAPV1FactoryTest, CreateMetadataOptions) {
- _testCreateMetadataOptions(factory, false);
- _testCreateMetadataOptions(factory, true);
+protected:
+ const StorageEngine::Factory* factory;
+};
+
+void _testValidateMetadata(const StorageEngine::Factory* factory,
+ const BSONObj& metadataOptions,
+ bool directoryPerDB,
+ ErrorCodes::Error expectedCode) {
+ // It is fine to specify an invalid data directory for the metadata
+ // as long as we do not invoke read() or write().
+ StorageEngineMetadata metadata("no_such_directory");
+ metadata.setStorageEngineOptions(metadataOptions);
+
+ StorageGlobalParams storageOptions;
+ storageOptions.directoryperdb = directoryPerDB;
+
+ Status status = factory->validateMetadata(metadata, storageOptions);
+ if (expectedCode != status.code()) {
+ FAIL(str::stream()
+ << "Unexpected StorageEngine::Factory::validateMetadata result. Expected: "
+ << ErrorCodes::errorString(expectedCode) << " but got " << status.toString()
+ << " instead. metadataOptions: " << metadataOptions
+ << "; directoryPerDB: " << directoryPerDB);
}
+}
+
+// Do not validate fields that are not present in metadata.
+TEST_F(MMAPV1FactoryTest, ValidateMetadataEmptyOptions) {
+ _testValidateMetadata(factory, BSONObj(), false, ErrorCodes::OK);
+ _testValidateMetadata(factory, BSONObj(), true, ErrorCodes::OK);
+}
+
+TEST_F(MMAPV1FactoryTest, ValidateMetadataDirectoryPerDB) {
+ _testValidateMetadata(
+ factory, fromjson("{directoryPerDB: 123}"), false, ErrorCodes::FailedToParse);
+ _testValidateMetadata(factory, fromjson("{directoryPerDB: false}"), false, ErrorCodes::OK);
+ _testValidateMetadata(
+ factory, fromjson("{directoryPerDB: false}"), true, ErrorCodes::InvalidOptions);
+ _testValidateMetadata(
+ factory, fromjson("{directoryPerDB: true}"), false, ErrorCodes::InvalidOptions);
+ _testValidateMetadata(factory, fromjson("{directoryPerDB: true}"), true, ErrorCodes::OK);
+}
+
+void _testCreateMetadataOptions(const StorageEngine::Factory* factory, bool directoryPerDB) {
+ StorageGlobalParams storageOptions;
+ storageOptions.directoryperdb = directoryPerDB;
+
+ BSONObj metadataOptions = factory->createMetadataOptions(storageOptions);
+ BSONElement directoryPerDBElement = metadataOptions.getField("directoryPerDB");
+ ASSERT_TRUE(directoryPerDBElement.isBoolean());
+ ASSERT_EQUALS(directoryPerDB, directoryPerDBElement.boolean());
+}
+
+TEST_F(MMAPV1FactoryTest, CreateMetadataOptions) {
+ _testCreateMetadataOptions(factory, false);
+ _testCreateMetadataOptions(factory, true);
+}
} // namespace
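
The tests above pin down validateMetadata()'s behavior for directoryPerDB: an absent field is not validated, a non-boolean field fails to parse, and a boolean field must match the current startup option. The same decision table as a standalone sketch (plain enums standing in for the real Status/BSON types):

    enum class Code { OK, FailedToParse, InvalidOptions };

    // Mirrors the directoryPerDB cases exercised by the tests above.
    Code validateDirectoryPerDB(bool fieldPresent,
                                bool fieldIsBool,
                                bool fieldValue,
                                bool startupValue) {
        if (!fieldPresent)
            return Code::OK;             // absent fields are not validated
        if (!fieldIsBool)
            return Code::FailedToParse;  // e.g. {directoryPerDB: 123}
        return fieldValue == startupValue ? Code::OK : Code::InvalidOptions;
    }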
diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_options.cpp b/src/mongo/db/storage/mmap_v1/mmap_v1_options.cpp
index aa5168ea2c6..554a5eafe37 100644
--- a/src/mongo/db/storage/mmap_v1/mmap_v1_options.cpp
+++ b/src/mongo/db/storage/mmap_v1/mmap_v1_options.cpp
@@ -35,65 +35,65 @@
namespace mongo {
- MMAPV1Options mmapv1GlobalOptions;
+MMAPV1Options mmapv1GlobalOptions;
- /**
- * Specify an integer between 1 and 500 signifying the number of milliseconds (ms)
- * between journal commits.
- */
- class JournalCommitIntervalSetting : public ServerParameter {
- public:
- JournalCommitIntervalSetting() :
- ServerParameter(ServerParameterSet::getGlobal(), "journalCommitInterval",
- false, // allowedToChangeAtStartup
- true // allowedToChangeAtRuntime
- ) {}
+/**
+ * Specify an integer strictly between 1 and 500 (i.e. 2-499) signifying the
+ * number of milliseconds between journal commits.
+ */
+class JournalCommitIntervalSetting : public ServerParameter {
+public:
+ JournalCommitIntervalSetting()
+ : ServerParameter(ServerParameterSet::getGlobal(),
+ "journalCommitInterval",
+ false, // allowedToChangeAtStartup
+ true // allowedToChangeAtRuntime
+ ) {}
- virtual void append(OperationContext* txn, BSONObjBuilder& b, const std::string& name) {
- b << name << mmapv1GlobalOptions.journalCommitInterval;
- }
+ virtual void append(OperationContext* txn, BSONObjBuilder& b, const std::string& name) {
+ b << name << mmapv1GlobalOptions.journalCommitInterval;
+ }
- virtual Status set(const BSONElement& newValueElement) {
- long long newValue;
- if (!newValueElement.isNumber()) {
- StringBuilder sb;
- sb << "Expected number type for journalCommitInterval via setParameter command: "
- << newValueElement;
- return Status(ErrorCodes::BadValue, sb.str());
- }
- if (newValueElement.type() == NumberDouble &&
- (newValueElement.numberDouble() - newValueElement.numberLong()) > 0) {
- StringBuilder sb;
- sb << "journalCommitInterval must be a whole number: "
- << newValueElement;
- return Status(ErrorCodes::BadValue, sb.str());
- }
- newValue = newValueElement.numberLong();
- if (newValue <= 1 || newValue >= 500) {
- StringBuilder sb;
- sb << "journalCommitInterval must be between 1 and 500, but attempted to set to: "
- << newValue;
- return Status(ErrorCodes::BadValue, sb.str());
- }
- mmapv1GlobalOptions.journalCommitInterval = static_cast<unsigned>(newValue);
- return Status::OK();
+ virtual Status set(const BSONElement& newValueElement) {
+ long long newValue;
+ if (!newValueElement.isNumber()) {
+ StringBuilder sb;
+ sb << "Expected number type for journalCommitInterval via setParameter command: "
+ << newValueElement;
+ return Status(ErrorCodes::BadValue, sb.str());
+ }
+ if (newValueElement.type() == NumberDouble &&
+ (newValueElement.numberDouble() - newValueElement.numberLong()) > 0) {
+ StringBuilder sb;
+ sb << "journalCommitInterval must be a whole number: " << newValueElement;
+ return Status(ErrorCodes::BadValue, sb.str());
}
+ newValue = newValueElement.numberLong();
+ if (newValue <= 1 || newValue >= 500) {
+ StringBuilder sb;
+ sb << "journalCommitInterval must be between 1 and 500, but attempted to set to: "
+ << newValue;
+ return Status(ErrorCodes::BadValue, sb.str());
+ }
+ mmapv1GlobalOptions.journalCommitInterval = static_cast<unsigned>(newValue);
+ return Status::OK();
+ }
- virtual Status setFromString(const std::string& str) {
- unsigned newValue;
- Status status = parseNumberFromString(str, &newValue);
- if (!status.isOK()) {
- return status;
- }
- if (newValue <= 1 || newValue >= 500) {
- StringBuilder sb;
- sb << "journalCommitInterval must be between 1 and 500, but attempted to set to: "
- << newValue;
- return Status(ErrorCodes::BadValue, sb.str());
- }
- mmapv1GlobalOptions.journalCommitInterval = newValue;
- return Status::OK();
+ virtual Status setFromString(const std::string& str) {
+ unsigned newValue;
+ Status status = parseNumberFromString(str, &newValue);
+ if (!status.isOK()) {
+ return status;
+ }
+ if (newValue <= 1 || newValue >= 500) {
+ StringBuilder sb;
+ sb << "journalCommitInterval must be between 1 and 500, but attempted to set to: "
+ << newValue;
+ return Status(ErrorCodes::BadValue, sb.str());
}
- } journalCommitIntervalSetting;
+ mmapv1GlobalOptions.journalCommitInterval = newValue;
+ return Status::OK();
+ }
+} journalCommitIntervalSetting;
-} // namespace mongo
+} // namespace mongo
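
set() above rejects non-numeric values, numbers with a fractional part, and anything outside the open interval (1, 500); setFromString() repeats the range check after parsing. The same validation distilled into a standalone sketch (plain C++ types standing in for BSONElement and Status):

    #include <cmath>
    #include <optional>
    #include <string>

    // Returns the accepted interval in ms, or std::nullopt with a reason.
    std::optional<unsigned> parseJournalCommitInterval(double value, std::string* whyNot) {
        if (value != std::floor(value)) {
            *whyNot = "journalCommitInterval must be a whole number";
            return std::nullopt;
        }
        const long long n = static_cast<long long>(value);
        if (n <= 1 || n >= 500) {
            *whyNot = "journalCommitInterval must be between 1 and 500";
            return std::nullopt;
        }
        return static_cast<unsigned>(n);  // e.g. 100 -> journal commit every 100 ms
    }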
diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_options.h b/src/mongo/db/storage/mmap_v1/mmap_v1_options.h
index f70dea73af7..d94d46c449e 100644
--- a/src/mongo/db/storage/mmap_v1/mmap_v1_options.h
+++ b/src/mongo/db/storage/mmap_v1/mmap_v1_options.h
@@ -37,60 +37,59 @@
namespace mongo {
- struct MMAPV1Options {
+struct MMAPV1Options {
+ MMAPV1Options()
+ : lenForNewNsFiles(16 * 1024 * 1024),
+ preallocj(true),
+ prealloc(false),
+ journalCommitInterval(0), // 0 means use default
+ quota(false),
+ quotaFiles(8) {}
- MMAPV1Options() :
- lenForNewNsFiles(16 * 1024 * 1024),
- preallocj(true),
- prealloc(false),
- journalCommitInterval(0), // 0 means use default
- quota(false),
- quotaFiles(8) {}
+ // --nssize
+ // Specifies the default size for namespace files, which are files that end in .ns.
+ // Each collection and index counts as a namespace.
+ unsigned lenForNewNsFiles;
- // --nssize
- // Specifies the default size for namespace files, which are files that end in .ns.
- // Each collection and index counts as a namespace.
- unsigned lenForNewNsFiles;
+ bool preallocj; // --nopreallocj no preallocation of journal files
+ bool prealloc; // --noprealloc no preallocation of data files
+ bool smallfiles; // --smallfiles allocate smaller data files
- bool preallocj; // --nopreallocj no preallocation of journal files
- bool prealloc; // --noprealloc no preallocation of data files
- bool smallfiles; // --smallfiles allocate smaller data files
+ // --journalCommitInterval
+ // The maximum amount of time the mongod process allows between journal operations.
+ // Values can range from 2 to 300 milliseconds. Lower values increase the durability
+ // of the journal, at the expense of disk performance.
+ unsigned journalCommitInterval; // group/batch commit interval ms
- // --journalCommitInterval
- // The maximum amount of time the mongod process allows between journal operations.
- // Values can range from 2 to 300 milliseconds. Lower values increase the durability
- // of the journal, at the expense of disk performance.
- unsigned journalCommitInterval; // group/batch commit interval ms
-
- // --journalOptions 7 dump journal and terminate without doing anything further
- // --journalOptions 4 recover and terminate without listening
- enum { // bits to be ORed
- JournalDumpJournal = 1, // dump diagnostics on the journal during recovery
- JournalScanOnly = 2, // don't do any real work, just scan and dump if dump
- // specified
- JournalRecoverOnly = 4, // terminate after recovery step
- JournalParanoid = 8, // paranoid mode enables extra checks
- JournalAlwaysCommit = 16, // do a group commit every time the writelock is released
- JournalAlwaysRemap = 32, // remap the private view after every group commit
- // (may lag to the next write lock acquisition,
- // but will do all files then)
- JournalNoCheckSpace = 64 // don't check that there is enough room for journal files
- // before startup (for diskfull tests)
- };
- int journalOptions; // --journalOptions <n> for debugging
+ // --journalOptions 7 dump journal and terminate without doing anything further
+ // --journalOptions 4 recover and terminate without listening
+ enum { // bits to be ORed
+ JournalDumpJournal = 1, // dump diagnostics on the journal during recovery
+ JournalScanOnly = 2, // don't do any real work, just scan and dump if dump
+ // specified
+ JournalRecoverOnly = 4, // terminate after recovery step
+ JournalParanoid = 8, // paranoid mode enables extra checks
+ JournalAlwaysCommit = 16, // do a group commit every time the writelock is released
+ JournalAlwaysRemap = 32, // remap the private view after every group commit
+ // (may lag to the next write lock acquisition,
+ // but will do all files then)
+ JournalNoCheckSpace = 64 // don't check that there is enough room for journal files
+ // before startup (for diskfull tests)
+ };
+ int journalOptions; // --journalOptions <n> for debugging
- // --quota
- // Enables a maximum limit for the number data files each database can have.
- // When running with the --quota option, MongoDB has a maximum of 8 data files
- // per database. Adjust the quota with --quotaFiles.
- bool quota;
+ // --quota
+    // Enables a maximum limit for the number of data files each database can have.
+ // When running with the --quota option, MongoDB has a maximum of 8 data files
+ // per database. Adjust the quota with --quotaFiles.
+ bool quota;
- // --quotaFiles
- // Modifies the limit on the number of data files per database.
- // --quotaFiles option requires that you set --quota.
- int quotaFiles; // --quotaFiles
- };
+ // --quotaFiles
+ // Modifies the limit on the number of data files per database.
+ // --quotaFiles option requires that you set --quota.
+ int quotaFiles; // --quotaFiles
+};
- extern MMAPV1Options mmapv1GlobalOptions;
+extern MMAPV1Options mmapv1GlobalOptions;
-} // namespace mongo
+} // namespace mongo
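
The journalOptions values are single bits meant to be ORed together, which is why the header documents --journalOptions 7: it is the sum of the first three flags. A usage sketch, assuming this header is included:

    // --journalOptions 7 == JournalDumpJournal | JournalScanOnly | JournalRecoverOnly
    const int opts = MMAPV1Options::JournalDumpJournal   // 1: dump journal diagnostics
                   | MMAPV1Options::JournalScanOnly      // 2: scan and dump, no real work
                   | MMAPV1Options::JournalRecoverOnly;  // 4: terminate after recovery
    const bool recoverOnly = (opts & MMAPV1Options::JournalRecoverOnly) != 0;  // true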
diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_record_store_test.cpp b/src/mongo/db/storage/mmap_v1/mmap_v1_record_store_test.cpp
index 6e2e54d56c5..5a00a5a9a7f 100644
--- a/src/mongo/db/storage/mmap_v1/mmap_v1_record_store_test.cpp
+++ b/src/mongo/db/storage/mmap_v1/mmap_v1_record_store_test.cpp
@@ -41,58 +41,41 @@
namespace mongo {
- class MyHarnessHelper : public HarnessHelper {
- public:
- MyHarnessHelper() {
- }
+class MyHarnessHelper : public HarnessHelper {
+public:
+ MyHarnessHelper() {}
- virtual RecordStore* newNonCappedRecordStore() {
- OperationContextNoop txn;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
- md->setUserFlag( &txn, CollectionOptions::Flag_NoPadding );
- SimpleRecordStoreV1* rs = new SimpleRecordStoreV1( &txn,
- "a.b",
- md,
- &_em,
- false );
- return rs;
- }
-
- virtual RecordStore* newCappedRecordStore( int64_t cappedMaxSize,
- int64_t cappedMaxDocs ) {
- OperationContextNoop txn;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
- CappedRecordStoreV1* rs = new CappedRecordStoreV1( &txn,
- NULL,
- "a.b",
- md,
- &_em,
- false );
-
- LocAndSize records[] = {
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1000), 1000},
- {}
- };
- md->setCapExtent(&txn, DiskLoc(0, 0));
- md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid());
- initializeV1RS(&txn, records, drecs, NULL, &_em, md);
+ virtual RecordStore* newNonCappedRecordStore() {
+ OperationContextNoop txn;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
+ md->setUserFlag(&txn, CollectionOptions::Flag_NoPadding);
+ SimpleRecordStoreV1* rs = new SimpleRecordStoreV1(&txn, "a.b", md, &_em, false);
+ return rs;
+ }
- return rs;
- }
+ virtual RecordStore* newCappedRecordStore(int64_t cappedMaxSize, int64_t cappedMaxDocs) {
+ OperationContextNoop txn;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0);
+ CappedRecordStoreV1* rs = new CappedRecordStoreV1(&txn, NULL, "a.b", md, &_em, false);
- virtual RecoveryUnit* newRecoveryUnit() {
- return new RecoveryUnitNoop();
- }
+ LocAndSize records[] = {{}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1000), 1000}, {}};
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid());
+ initializeV1RS(&txn, records, drecs, NULL, &_em, md);
- private:
- DummyExtentManager _em;
- };
+ return rs;
+ }
- HarnessHelper* newHarnessHelper() {
- return new MyHarnessHelper();
+ virtual RecoveryUnit* newRecoveryUnit() {
+ return new RecoveryUnitNoop();
}
+private:
+ DummyExtentManager _em;
+};
+
+HarnessHelper* newHarnessHelper() {
+ return new MyHarnessHelper();
+}
}
diff --git a/src/mongo/db/storage/mmap_v1/mmap_windows.cpp b/src/mongo/db/storage/mmap_v1/mmap_windows.cpp
index 2969028575a..88abedd9c77 100644
--- a/src/mongo/db/storage/mmap_v1/mmap_windows.cpp
+++ b/src/mongo/db/storage/mmap_v1/mmap_windows.cpp
@@ -43,498 +43,472 @@
namespace mongo {
- using std::endl;
- using std::string;
- using std::vector;
+using std::endl;
+using std::string;
+using std::vector;
- namespace {
- mongo::AtomicUInt64 mmfNextId(0);
- }
+namespace {
+mongo::AtomicUInt64 mmfNextId(0);
+}
- static size_t fetchMinOSPageSizeBytes() {
- SYSTEM_INFO si;
- GetSystemInfo(&si);
- size_t minOSPageSizeBytes = si.dwPageSize;
- minOSPageSizeBytesTest(minOSPageSizeBytes);
- return minOSPageSizeBytes;
- }
- const size_t g_minOSPageSizeBytes = fetchMinOSPageSizeBytes();
-
- // MapViewMutex
- //
- // Protects:
- // 1. Ensures all MapViewOfFile/UnMapViewOfFile operations are serialized to reduce chance of
- // "address in use" errors (error code 487)
- // - These errors can still occur if the memory is used for other purposes
- // (stack storage, heap)
- // 2. Prevents calls to VirtualProtect while we remapping files.
- // Lock Ordering:
- // - If taken, must be after previewViews._m to prevent deadlocks
- stdx::mutex mapViewMutex;
-
- MAdvise::MAdvise(void *,unsigned, Advice) { }
- MAdvise::~MAdvise() { }
-
- const unsigned long long memoryMappedFileLocationFloor = 256LL * 1024LL * 1024LL * 1024LL;
- static unsigned long long _nextMemoryMappedFileLocation = memoryMappedFileLocationFloor;
-
- // nextMemoryMappedFileLocationMutex
- //
- // Protects:
- // Windows 64-bit specific allocation of virtual memory regions for
- // placing memory mapped files in memory
- // Lock Ordering:
- // No restrictions
- static SimpleMutex _nextMemoryMappedFileLocationMutex;
-
- unsigned long long AlignNumber(unsigned long long number, unsigned long long granularity)
- {
- return (number + granularity - 1) & ~(granularity - 1);
+static size_t fetchMinOSPageSizeBytes() {
+ SYSTEM_INFO si;
+ GetSystemInfo(&si);
+ size_t minOSPageSizeBytes = si.dwPageSize;
+ minOSPageSizeBytesTest(minOSPageSizeBytes);
+ return minOSPageSizeBytes;
+}
+const size_t g_minOSPageSizeBytes = fetchMinOSPageSizeBytes();
+
+// MapViewMutex
+//
+// Protects:
+// 1. Ensures all MapViewOfFile/UnMapViewOfFile operations are serialized to reduce chance of
+// "address in use" errors (error code 487)
+// - These errors can still occur if the memory is used for other purposes
+// (stack storage, heap)
+// 2. Prevents calls to VirtualProtect while we are remapping files.
+// Lock Ordering:
+// - If taken, must be after previewViews._m to prevent deadlocks
+stdx::mutex mapViewMutex;
+
+MAdvise::MAdvise(void*, unsigned, Advice) {}
+MAdvise::~MAdvise() {}
+
+const unsigned long long memoryMappedFileLocationFloor = 256LL * 1024LL * 1024LL * 1024LL;
+static unsigned long long _nextMemoryMappedFileLocation = memoryMappedFileLocationFloor;
+
+// nextMemoryMappedFileLocationMutex
+//
+// Protects:
+// Windows 64-bit specific allocation of virtual memory regions for
+// placing memory mapped files in memory
+// Lock Ordering:
+// No restrictions
+static SimpleMutex _nextMemoryMappedFileLocationMutex;
+
+unsigned long long AlignNumber(unsigned long long number, unsigned long long granularity) {
+ return (number + granularity - 1) & ~(granularity - 1);
+}
+
+static void* getNextMemoryMappedFileLocation(unsigned long long mmfSize) {
+ if (4 == sizeof(void*)) {
+ return 0;
}
+ stdx::lock_guard<SimpleMutex> lk(_nextMemoryMappedFileLocationMutex);
- static void* getNextMemoryMappedFileLocation(unsigned long long mmfSize) {
- if (4 == sizeof(void*)) {
- return 0;
- }
- stdx::lock_guard<SimpleMutex> lk(_nextMemoryMappedFileLocationMutex);
+ static unsigned long long granularity = 0;
- static unsigned long long granularity = 0;
+ if (0 == granularity) {
+ SYSTEM_INFO systemInfo;
+ GetSystemInfo(&systemInfo);
+ granularity = static_cast<unsigned long long>(systemInfo.dwAllocationGranularity);
+ }
- if (0 == granularity) {
- SYSTEM_INFO systemInfo;
- GetSystemInfo(&systemInfo);
- granularity = static_cast<unsigned long long>(systemInfo.dwAllocationGranularity);
- }
+ unsigned long long thisMemoryMappedFileLocation = _nextMemoryMappedFileLocation;
- unsigned long long thisMemoryMappedFileLocation = _nextMemoryMappedFileLocation;
+ int current_retry = 1;
- int current_retry = 1;
+ while (true) {
+ MEMORY_BASIC_INFORMATION memInfo;
- while (true) {
- MEMORY_BASIC_INFORMATION memInfo;
-
- if (VirtualQuery(reinterpret_cast<LPCVOID>(thisMemoryMappedFileLocation),
- &memInfo, sizeof(memInfo)) == 0) {
- DWORD gle = GetLastError();
-
- // If we exceed the limits of Virtual Memory
- // - 8TB before Windows 8.1/2012 R2, 128 TB after
- // restart scanning from our memory mapped floor once more
- // This is a linear scan of regions, not of every VM page
- if (gle == ERROR_INVALID_PARAMETER && current_retry == 1) {
- thisMemoryMappedFileLocation = memoryMappedFileLocationFloor;
- ++current_retry;
- continue;
- }
+ if (VirtualQuery(reinterpret_cast<LPCVOID>(thisMemoryMappedFileLocation),
+ &memInfo,
+ sizeof(memInfo)) == 0) {
+ DWORD gle = GetLastError();
- log() << "VirtualQuery of " << thisMemoryMappedFileLocation
- << " failed with error " << errnoWithDescription(gle);
- fassertFailed(17484);
+            // If we exceed the limits of virtual memory (8 TB before
+            // Windows 8.1/2012 R2, 128 TB after), restart scanning from our
+            // memory-mapped floor once more.
+ // This is a linear scan of regions, not of every VM page
+ if (gle == ERROR_INVALID_PARAMETER && current_retry == 1) {
+ thisMemoryMappedFileLocation = memoryMappedFileLocationFloor;
+ ++current_retry;
+ continue;
}
- // Free memory regions that we can use for memory map files
- // 1. Marked MEM_FREE, not MEM_RESERVE
- // 2. Marked as PAGE_NOACCESS, not anything else
- if (memInfo.Protect == PAGE_NOACCESS &&
- memInfo.State == MEM_FREE &&
- memInfo.RegionSize > mmfSize)
- break;
-
- thisMemoryMappedFileLocation = reinterpret_cast<unsigned long long>(memInfo.BaseAddress)
- + memInfo.RegionSize;
+ log() << "VirtualQuery of " << thisMemoryMappedFileLocation << " failed with error "
+ << errnoWithDescription(gle);
+ fassertFailed(17484);
}
- _nextMemoryMappedFileLocation = thisMemoryMappedFileLocation
- + AlignNumber(mmfSize, granularity);
+        // Free memory regions that we can use for memory-mapped files
+ // 1. Marked MEM_FREE, not MEM_RESERVE
+ // 2. Marked as PAGE_NOACCESS, not anything else
+ if (memInfo.Protect == PAGE_NOACCESS && memInfo.State == MEM_FREE &&
+ memInfo.RegionSize > mmfSize)
+ break;
- return reinterpret_cast<void*>(static_cast<uintptr_t>(thisMemoryMappedFileLocation));
+ thisMemoryMappedFileLocation =
+ reinterpret_cast<unsigned long long>(memInfo.BaseAddress) + memInfo.RegionSize;
}
- MemoryMappedFile::MemoryMappedFile()
- : _uniqueId(mmfNextId.fetchAndAdd(1)),
- fd(0),
- maphandle(0),
- len(0) {
+ _nextMemoryMappedFileLocation =
+ thisMemoryMappedFileLocation + AlignNumber(mmfSize, granularity);
- created();
- }
+ return reinterpret_cast<void*>(static_cast<uintptr_t>(thisMemoryMappedFileLocation));
+}
- void MemoryMappedFile::close() {
- LockMongoFilesShared::assertExclusivelyLocked();
+MemoryMappedFile::MemoryMappedFile()
+ : _uniqueId(mmfNextId.fetchAndAdd(1)), fd(0), maphandle(0), len(0) {
+ created();
+}
- // Prevent flush and close from concurrently running
- stdx::lock_guard<stdx::mutex> lk(_flushMutex);
+void MemoryMappedFile::close() {
+ LockMongoFilesShared::assertExclusivelyLocked();
- {
- stdx::lock_guard<stdx::mutex> lk(mapViewMutex);
+ // Prevent flush and close from concurrently running
+ stdx::lock_guard<stdx::mutex> lk(_flushMutex);
- for (vector<void*>::iterator i = views.begin(); i != views.end(); i++) {
- UnmapViewOfFile(*i);
- }
- }
+ {
+ stdx::lock_guard<stdx::mutex> lk(mapViewMutex);
- views.clear();
- if ( maphandle )
- CloseHandle(maphandle);
- maphandle = 0;
- if ( fd )
- CloseHandle(fd);
- fd = 0;
- destroyed(); // cleans up from the master list of mmaps
+ for (vector<void*>::iterator i = views.begin(); i != views.end(); i++) {
+ UnmapViewOfFile(*i);
+ }
}
- unsigned long long mapped = 0;
+ views.clear();
+ if (maphandle)
+ CloseHandle(maphandle);
+ maphandle = 0;
+ if (fd)
+ CloseHandle(fd);
+ fd = 0;
+ destroyed(); // cleans up from the master list of mmaps
+}
- void* MemoryMappedFile::createReadOnlyMap() {
- verify( maphandle );
+unsigned long long mapped = 0;
- stdx::lock_guard<stdx::mutex> lk(mapViewMutex);
+void* MemoryMappedFile::createReadOnlyMap() {
+ verify(maphandle);
- void* readOnlyMapAddress = NULL;
- int current_retry = 0;
+ stdx::lock_guard<stdx::mutex> lk(mapViewMutex);
- while (true) {
+ void* readOnlyMapAddress = NULL;
+ int current_retry = 0;
- LPVOID thisAddress = getNextMemoryMappedFileLocation(len);
+ while (true) {
+ LPVOID thisAddress = getNextMemoryMappedFileLocation(len);
- readOnlyMapAddress = MapViewOfFileEx(
- maphandle, // file mapping handle
- FILE_MAP_READ, // access
- 0, 0, // file offset, high and low
- 0, // bytes to map, 0 == all
- thisAddress); // address to place file
+ readOnlyMapAddress = MapViewOfFileEx(maphandle, // file mapping handle
+ FILE_MAP_READ, // access
+ 0,
+ 0, // file offset, high and low
+ 0, // bytes to map, 0 == all
+ thisAddress); // address to place file
- if (0 == readOnlyMapAddress) {
- DWORD dosError = GetLastError();
-
- ++current_retry;
-
- // If we failed to allocate a memory mapped file, try again in case we picked
- // an address that Windows is also trying to use for some other VM allocations
- if (dosError == ERROR_INVALID_ADDRESS && current_retry < 5) {
- continue;
- }
+ if (0 == readOnlyMapAddress) {
+ DWORD dosError = GetLastError();
- log() << "MapViewOfFileEx for " << filename()
- << " at address " << thisAddress
- << " failed with error " << errnoWithDescription(dosError)
- << " (file size is " << len << ")"
- << " in MemoryMappedFile::createReadOnlyMap"
- << endl;
+ ++current_retry;
- fassertFailed(16165);
+ // If we failed to allocate a memory mapped file, try again in case we picked
+ // an address that Windows is also trying to use for some other VM allocations
+ if (dosError == ERROR_INVALID_ADDRESS && current_retry < 5) {
+ continue;
}
- break;
+ log() << "MapViewOfFileEx for " << filename() << " at address " << thisAddress
+ << " failed with error " << errnoWithDescription(dosError) << " (file size is "
+ << len << ")"
+ << " in MemoryMappedFile::createReadOnlyMap" << endl;
+
+ fassertFailed(16165);
}
- views.push_back( readOnlyMapAddress );
- return readOnlyMapAddress;
+ break;
}
- void* MemoryMappedFile::map(const char *filenameIn, unsigned long long &length, int options) {
- verify( fd == 0 && len == 0 ); // can't open more than once
- setFilename(filenameIn);
- FileAllocator::get()->allocateAsap( filenameIn, length );
- /* big hack here: Babble uses db names with colons. doesn't seem to work on windows. temporary perhaps. */
- char filename[256];
- strncpy(filename, filenameIn, 255);
- filename[255] = 0;
- {
- size_t len = strlen( filename );
- for ( size_t i=len-1; i>=0; i-- ) {
- if ( filename[i] == '/' ||
- filename[i] == '\\' )
- break;
+ views.push_back(readOnlyMapAddress);
+ return readOnlyMapAddress;
+}
- if ( filename[i] == ':' )
- filename[i] = '_';
- }
+void* MemoryMappedFile::map(const char* filenameIn, unsigned long long& length, int options) {
+ verify(fd == 0 && len == 0); // can't open more than once
+ setFilename(filenameIn);
+ FileAllocator::get()->allocateAsap(filenameIn, length);
+ /* big hack here: Babble uses db names with colons. doesn't seem to work on windows. temporary perhaps. */
+ char filename[256];
+ strncpy(filename, filenameIn, 255);
+ filename[255] = 0;
+ {
+ size_t len = strlen(filename);
+ for (size_t i = len - 1; i >= 0; i--) {
+ if (filename[i] == '/' || filename[i] == '\\')
+ break;
+
+ if (filename[i] == ':')
+ filename[i] = '_';
}
+ }
- updateLength( filename, length );
+ updateLength(filename, length);
- {
- DWORD createOptions = FILE_ATTRIBUTE_NORMAL;
- if ( options & SEQUENTIAL )
- createOptions |= FILE_FLAG_SEQUENTIAL_SCAN;
- DWORD rw = GENERIC_READ | GENERIC_WRITE;
- fd = CreateFileW(
- toWideString(filename).c_str(),
- rw, // desired access
- FILE_SHARE_WRITE | FILE_SHARE_READ, // share mode
- NULL, // security
- OPEN_ALWAYS, // create disposition
- createOptions , // flags
- NULL); // hTempl
- if ( fd == INVALID_HANDLE_VALUE ) {
- DWORD dosError = GetLastError();
- log() << "CreateFileW for " << filename
- << " failed with " << errnoWithDescription( dosError )
- << " (file size is " << length << ")"
- << " in MemoryMappedFile::map"
- << endl;
- return 0;
- }
+ {
+ DWORD createOptions = FILE_ATTRIBUTE_NORMAL;
+ if (options & SEQUENTIAL)
+ createOptions |= FILE_FLAG_SEQUENTIAL_SCAN;
+ DWORD rw = GENERIC_READ | GENERIC_WRITE;
+ fd = CreateFileW(toWideString(filename).c_str(),
+ rw, // desired access
+ FILE_SHARE_WRITE | FILE_SHARE_READ, // share mode
+ NULL, // security
+ OPEN_ALWAYS, // create disposition
+ createOptions, // flags
+ NULL); // hTempl
+ if (fd == INVALID_HANDLE_VALUE) {
+ DWORD dosError = GetLastError();
+ log() << "CreateFileW for " << filename << " failed with "
+ << errnoWithDescription(dosError) << " (file size is " << length << ")"
+ << " in MemoryMappedFile::map" << endl;
+ return 0;
}
+ }
- mapped += length;
+ mapped += length;
- {
- DWORD flProtect = PAGE_READWRITE; //(options & READONLY)?PAGE_READONLY:PAGE_READWRITE;
- maphandle = CreateFileMappingW(fd, NULL, flProtect,
- length >> 32 /*maxsizehigh*/,
- (unsigned) length /*maxsizelow*/,
- NULL/*lpName*/);
- if ( maphandle == NULL ) {
- DWORD dosError = GetLastError();
- log() << "CreateFileMappingW for " << filename
- << " failed with " << errnoWithDescription( dosError )
- << " (file size is " << length << ")"
- << " in MemoryMappedFile::map"
- << endl;
- close();
- fassertFailed( 16225 );
- }
+ {
+ DWORD flProtect = PAGE_READWRITE; //(options & READONLY)?PAGE_READONLY:PAGE_READWRITE;
+ maphandle = CreateFileMappingW(fd,
+ NULL,
+ flProtect,
+ length >> 32 /*maxsizehigh*/,
+ (unsigned)length /*maxsizelow*/,
+ NULL /*lpName*/);
+ if (maphandle == NULL) {
+ DWORD dosError = GetLastError();
+ log() << "CreateFileMappingW for " << filename << " failed with "
+ << errnoWithDescription(dosError) << " (file size is " << length << ")"
+ << " in MemoryMappedFile::map" << endl;
+ close();
+ fassertFailed(16225);
}
+ }
- void *view = 0;
- {
- stdx::lock_guard<stdx::mutex> lk(mapViewMutex);
- DWORD access = ( options & READONLY ) ? FILE_MAP_READ : FILE_MAP_ALL_ACCESS;
-
- int current_retry = 0;
- while (true) {
+ void* view = 0;
+ {
+ stdx::lock_guard<stdx::mutex> lk(mapViewMutex);
+ DWORD access = (options & READONLY) ? FILE_MAP_READ : FILE_MAP_ALL_ACCESS;
- LPVOID thisAddress = getNextMemoryMappedFileLocation(length);
+ int current_retry = 0;
+ while (true) {
+ LPVOID thisAddress = getNextMemoryMappedFileLocation(length);
- view = MapViewOfFileEx(
- maphandle, // file mapping handle
- access, // access
- 0, 0, // file offset, high and low
- 0, // bytes to map, 0 == all
- thisAddress); // address to place file
+ view = MapViewOfFileEx(maphandle, // file mapping handle
+ access, // access
+ 0,
+ 0, // file offset, high and low
+ 0, // bytes to map, 0 == all
+ thisAddress); // address to place file
- if (view == 0) {
- DWORD dosError = GetLastError();
+ if (view == 0) {
+ DWORD dosError = GetLastError();
- ++current_retry;
+ ++current_retry;
- // If we failed to allocate a memory mapped file, try again in case we picked
- // an address that Windows is also trying to use for some other VM allocations
- if (dosError == ERROR_INVALID_ADDRESS && current_retry < 5) {
- continue;
- }
+ // If we failed to allocate a memory mapped file, try again in case we picked
+ // an address that Windows is also trying to use for some other VM allocations
+ if (dosError == ERROR_INVALID_ADDRESS && current_retry < 5) {
+ continue;
+ }
#ifndef _WIN64
- // Warn user that if they are running a 32-bit app on 64-bit Windows
- if (dosError == ERROR_NOT_ENOUGH_MEMORY) {
- BOOL wow64Process;
- BOOL retWow64 = IsWow64Process(GetCurrentProcess(), &wow64Process);
- if (retWow64 && wow64Process) {
- log() << "This is a 32-bit MongoDB binary running on a 64-bit"
- " operating system that has run out of virtual memory for"
- " databases. Switch to a 64-bit build of MongoDB to open"
- " the databases.";
- }
+                // Warn the user if they are running a 32-bit app on 64-bit Windows
+ if (dosError == ERROR_NOT_ENOUGH_MEMORY) {
+ BOOL wow64Process;
+ BOOL retWow64 = IsWow64Process(GetCurrentProcess(), &wow64Process);
+ if (retWow64 && wow64Process) {
+ log() << "This is a 32-bit MongoDB binary running on a 64-bit"
+ " operating system that has run out of virtual memory for"
+ " databases. Switch to a 64-bit build of MongoDB to open"
+ " the databases.";
}
+ }
#endif
- log() << "MapViewOfFileEx for " << filename
- << " at address " << thisAddress
- << " failed with " << errnoWithDescription(dosError)
- << " (file size is " << length << ")"
- << " in MemoryMappedFile::map"
- << endl;
-
- close();
- fassertFailed(16166);
- }
+ log() << "MapViewOfFileEx for " << filename << " at address " << thisAddress
+ << " failed with " << errnoWithDescription(dosError) << " (file size is "
+ << length << ")"
+ << " in MemoryMappedFile::map" << endl;
- break;
+ close();
+ fassertFailed(16166);
}
- }
- views.push_back(view);
- len = length;
- return view;
+ break;
+ }
}
- extern stdx::mutex mapViewMutex;
-
- void* MemoryMappedFile::createPrivateMap() {
- verify( maphandle );
-
- stdx::lock_guard<stdx::mutex> lk(mapViewMutex);
+ views.push_back(view);
+ len = length;
+ return view;
+}
- LPVOID thisAddress = getNextMemoryMappedFileLocation( len );
+extern stdx::mutex mapViewMutex;
- void* privateMapAddress = NULL;
- int current_retry = 0;
+void* MemoryMappedFile::createPrivateMap() {
+ verify(maphandle);
- while (true) {
+ stdx::lock_guard<stdx::mutex> lk(mapViewMutex);
- privateMapAddress = MapViewOfFileEx(
- maphandle, // file mapping handle
- FILE_MAP_READ, // access
- 0, 0, // file offset, high and low
- 0, // bytes to map, 0 == all
- thisAddress); // address to place file
+ LPVOID thisAddress = getNextMemoryMappedFileLocation(len);
- if (privateMapAddress == 0) {
- DWORD dosError = GetLastError();
+ void* privateMapAddress = NULL;
+ int current_retry = 0;
- ++current_retry;
+ while (true) {
+ privateMapAddress = MapViewOfFileEx(maphandle, // file mapping handle
+ FILE_MAP_READ, // access
+ 0,
+ 0, // file offset, high and low
+ 0, // bytes to map, 0 == all
+ thisAddress); // address to place file
- // If we failed to allocate a memory mapped file, try again in case we picked
- // an address that Windows is also trying to use for some other VM allocations
- if (dosError == ERROR_INVALID_ADDRESS && current_retry < 5) {
- continue;
- }
+ if (privateMapAddress == 0) {
+ DWORD dosError = GetLastError();
- log() << "MapViewOfFileEx for " << filename()
- << " failed with error " << errnoWithDescription(dosError)
- << " (file size is " << len << ")"
- << " in MemoryMappedFile::createPrivateMap"
- << endl;
+ ++current_retry;
- fassertFailed(16167);
+ // If we failed to allocate a memory mapped file, try again in case we picked
+ // an address that Windows is also trying to use for some other VM allocations
+ if (dosError == ERROR_INVALID_ADDRESS && current_retry < 5) {
+ continue;
}
- break;
+ log() << "MapViewOfFileEx for " << filename() << " failed with error "
+ << errnoWithDescription(dosError) << " (file size is " << len << ")"
+ << " in MemoryMappedFile::createPrivateMap" << endl;
+
+ fassertFailed(16167);
}
- views.push_back( privateMapAddress );
- return privateMapAddress;
+ break;
}
- void* MemoryMappedFile::remapPrivateView(void *oldPrivateAddr) {
- LockMongoFilesExclusive lockMongoFiles;
+ views.push_back(privateMapAddress);
+ return privateMapAddress;
+}
- privateViews.clearWritableBits(oldPrivateAddr, len);
+void* MemoryMappedFile::remapPrivateView(void* oldPrivateAddr) {
+ LockMongoFilesExclusive lockMongoFiles;
- stdx::lock_guard<stdx::mutex> lk(mapViewMutex);
+ privateViews.clearWritableBits(oldPrivateAddr, len);
- if( !UnmapViewOfFile(oldPrivateAddr) ) {
- DWORD dosError = GetLastError();
- log() << "UnMapViewOfFile for " << filename()
- << " failed with error " << errnoWithDescription( dosError )
- << " in MemoryMappedFile::remapPrivateView"
- << endl;
- fassertFailed( 16168 );
- }
+ stdx::lock_guard<stdx::mutex> lk(mapViewMutex);
- void* newPrivateView = MapViewOfFileEx(
- maphandle, // file mapping handle
- FILE_MAP_READ, // access
- 0, 0, // file offset, high and low
- 0, // bytes to map, 0 == all
- oldPrivateAddr ); // we want the same address we had before
- if ( 0 == newPrivateView ) {
- DWORD dosError = GetLastError();
- log() << "MapViewOfFileEx for " << filename()
- << " failed with error " << errnoWithDescription( dosError )
- << " (file size is " << len << ")"
- << " in MemoryMappedFile::remapPrivateView"
- << endl;
- }
- fassert( 16148, newPrivateView == oldPrivateAddr );
- return newPrivateView;
+ if (!UnmapViewOfFile(oldPrivateAddr)) {
+ DWORD dosError = GetLastError();
+ log() << "UnMapViewOfFile for " << filename() << " failed with error "
+ << errnoWithDescription(dosError) << " in MemoryMappedFile::remapPrivateView" << endl;
+ fassertFailed(16168);
}
- class WindowsFlushable : public MemoryMappedFile::Flushable {
- public:
- WindowsFlushable( MemoryMappedFile* theFile,
- void * view,
- HANDLE fd,
- const uint64_t id,
- const std::string& filename,
- stdx::mutex& flushMutex )
- : _theFile(theFile), _view(view), _fd(fd), _id(id), _filename(filename),
- _flushMutex(flushMutex)
- {}
-
- void flush() {
- if (!_view || !_fd)
- return;
+ void* newPrivateView =
+ MapViewOfFileEx(maphandle, // file mapping handle
+ FILE_MAP_READ, // access
+ 0,
+ 0, // file offset, high and low
+ 0, // bytes to map, 0 == all
+ oldPrivateAddr); // we want the same address we had before
+ if (0 == newPrivateView) {
+ DWORD dosError = GetLastError();
+ log() << "MapViewOfFileEx for " << filename() << " failed with error "
+ << errnoWithDescription(dosError) << " (file size is " << len << ")"
+ << " in MemoryMappedFile::remapPrivateView" << endl;
+ }
+ fassert(16148, newPrivateView == oldPrivateAddr);
+ return newPrivateView;
+}
- {
- LockMongoFilesShared mmfilesLock;
+class WindowsFlushable : public MemoryMappedFile::Flushable {
+public:
+ WindowsFlushable(MemoryMappedFile* theFile,
+ void* view,
+ HANDLE fd,
+ const uint64_t id,
+ const std::string& filename,
+ stdx::mutex& flushMutex)
+ : _theFile(theFile),
+ _view(view),
+ _fd(fd),
+ _id(id),
+ _filename(filename),
+ _flushMutex(flushMutex) {}
+
+ void flush() {
+ if (!_view || !_fd)
+ return;
- std::set<MongoFile*> mmfs = MongoFile::getAllFiles();
- std::set<MongoFile*>::const_iterator it = mmfs.find(_theFile);
- if ( it == mmfs.end() || (*it)->getUniqueId() != _id ) {
- // this was deleted while we were unlocked
- return;
- }
+ {
+ LockMongoFilesShared mmfilesLock;
- // Hold the flush mutex to ensure the file is not closed during flush
- _flushMutex.lock();
+ std::set<MongoFile*> mmfs = MongoFile::getAllFiles();
+ std::set<MongoFile*>::const_iterator it = mmfs.find(_theFile);
+ if (it == mmfs.end() || (*it)->getUniqueId() != _id) {
+ // this was deleted while we were unlocked
+ return;
}
- stdx::lock_guard<stdx::mutex> lk(_flushMutex, stdx::adopt_lock);
-
- int loopCount = 0;
- bool success = false;
- bool timeout = false;
- int dosError = ERROR_SUCCESS;
- const int maximumTimeInSeconds = 60 * 15;
- Timer t;
- while ( !success && !timeout ) {
- ++loopCount;
- success = FALSE != FlushViewOfFile( _view, 0 );
- if ( !success ) {
- dosError = GetLastError();
- if ( dosError != ERROR_LOCK_VIOLATION ) {
- break;
- }
- timeout = t.seconds() > maximumTimeInSeconds;
- }
- }
- if ( success && loopCount > 1 ) {
- log() << "FlushViewOfFile for " << _filename
- << " succeeded after " << loopCount
- << " attempts taking " << t.millis()
- << "ms" << endl;
- }
- else if ( !success ) {
- log() << "FlushViewOfFile for " << _filename
- << " failed with error " << dosError
- << " after " << loopCount
- << " attempts taking " << t.millis()
- << "ms" << endl;
- // Abort here to avoid data corruption
- fassert(16387, false);
- }
+ // Hold the flush mutex to ensure the file is not closed during flush
+ _flushMutex.lock();
+ }
- success = FALSE != FlushFileBuffers(_fd);
+ stdx::lock_guard<stdx::mutex> lk(_flushMutex, stdx::adopt_lock);
+
+ int loopCount = 0;
+ bool success = false;
+ bool timeout = false;
+ int dosError = ERROR_SUCCESS;
+ const int maximumTimeInSeconds = 60 * 15;
+ Timer t;
+ while (!success && !timeout) {
+ ++loopCount;
+ success = FALSE != FlushViewOfFile(_view, 0);
if (!success) {
- int err = GetLastError();
- log() << "FlushFileBuffers failed: " << errnoWithDescription( err )
- << " file: " << _filename << endl;
- dataSyncFailedHandler();
+ dosError = GetLastError();
+ if (dosError != ERROR_LOCK_VIOLATION) {
+ break;
+ }
+ timeout = t.seconds() > maximumTimeInSeconds;
}
}
+ if (success && loopCount > 1) {
+ log() << "FlushViewOfFile for " << _filename << " succeeded after " << loopCount
+ << " attempts taking " << t.millis() << "ms" << endl;
+ } else if (!success) {
+ log() << "FlushViewOfFile for " << _filename << " failed with error " << dosError
+ << " after " << loopCount << " attempts taking " << t.millis() << "ms" << endl;
+ // Abort here to avoid data corruption
+ fassert(16387, false);
+ }
- MemoryMappedFile* _theFile; // this may be deleted while we are running
- void * _view;
- HANDLE _fd;
- const uint64_t _id;
- string _filename;
- stdx::mutex& _flushMutex;
- };
-
- void MemoryMappedFile::flush(bool sync) {
- uassert(13056, "Async flushing not supported on windows", sync);
- if( !views.empty() ) {
- WindowsFlushable f(this, viewForFlushing(), fd, _uniqueId, filename(), _flushMutex);
- f.flush();
+ success = FALSE != FlushFileBuffers(_fd);
+ if (!success) {
+ int err = GetLastError();
+ log() << "FlushFileBuffers failed: " << errnoWithDescription(err)
+ << " file: " << _filename << endl;
+ dataSyncFailedHandler();
}
}
- MemoryMappedFile::Flushable * MemoryMappedFile::prepareFlush() {
- return new WindowsFlushable(this, viewForFlushing(), fd, _uniqueId,
- filename(), _flushMutex);
+ MemoryMappedFile* _theFile; // this may be deleted while we are running
+ void* _view;
+ HANDLE _fd;
+ const uint64_t _id;
+ string _filename;
+ stdx::mutex& _flushMutex;
+};
+
+void MemoryMappedFile::flush(bool sync) {
+ uassert(13056, "Async flushing not supported on windows", sync);
+ if (!views.empty()) {
+ WindowsFlushable f(this, viewForFlushing(), fd, _uniqueId, filename(), _flushMutex);
+ f.flush();
}
+}
+MemoryMappedFile::Flushable* MemoryMappedFile::prepareFlush() {
+ return new WindowsFlushable(this, viewForFlushing(), fd, _uniqueId, filename(), _flushMutex);
+}
}
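
AlignNumber() above rounds an address up to the next multiple of the allocation granularity with the classic add-then-mask trick, which requires the granularity to be a power of two (Windows' dwAllocationGranularity is typically 64 KB). A self-contained worked example:

    #include <cassert>

    unsigned long long AlignNumber(unsigned long long number, unsigned long long granularity) {
        // Add (granularity - 1), then clear the low bits: the result is the
        // smallest multiple of granularity that is >= number.
        return (number + granularity - 1) & ~(granularity - 1);
    }

    int main() {
        const unsigned long long g = 64 * 1024;              // typical Windows granularity
        assert(AlignNumber(1, g) == 64 * 1024);              // rounds up
        assert(AlignNumber(64 * 1024, g) == 64 * 1024);      // already aligned
        assert(AlignNumber(64 * 1024 + 1, g) == 128 * 1024); // next boundary
    }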
diff --git a/src/mongo/db/storage/mmap_v1/record.h b/src/mongo/db/storage/mmap_v1/record.h
index 38c0cfd7085..0f3f9ebcdd4 100644
--- a/src/mongo/db/storage/mmap_v1/record.h
+++ b/src/mongo/db/storage/mmap_v1/record.h
@@ -37,98 +37,141 @@
namespace mongo {
- class DeletedRecord;
-
- /* MmapV1RecordHeader is a record in a datafile. DeletedRecord is similar but for deleted space.
-
- *11:03:20 AM) dm10gen: regarding extentOfs...
- (11:03:42 AM) dm10gen: an extent is a continugous disk area, which contains many Records and DeleteRecords
- (11:03:56 AM) dm10gen: a DiskLoc has two pieces, the fileno and ofs. (64 bit total)
- (11:04:16 AM) dm10gen: to keep the headesr small, instead of storing a 64 bit ptr to the full extent address, we keep just the offset
- (11:04:29 AM) dm10gen: we can do this as we know the record's address, and it has the same fileNo
- (11:04:33 AM) dm10gen: see class DiskLoc for more info
- (11:04:43 AM) dm10gen: so that is how MmapV1RecordHeader::myExtent() works
- (11:04:53 AM) dm10gen: on an alloc(), when we build a new MmapV1RecordHeader, we must populate its extentOfs then
- */
+class DeletedRecord;
+
+/* MmapV1RecordHeader is a record in a datafile. DeletedRecord is similar but for deleted space.
+
+*11:03:20 AM) dm10gen: regarding extentOfs...
+(11:03:42 AM) dm10gen: an extent is a contiguous disk area, which contains many Records and DeleteRecords
+(11:03:56 AM) dm10gen: a DiskLoc has two pieces, the fileno and ofs. (64 bit total)
+(11:04:16 AM) dm10gen: to keep the headers small, instead of storing a 64 bit ptr to the full extent address, we keep just the offset
+(11:04:29 AM) dm10gen: we can do this as we know the record's address, and it has the same fileNo
+(11:04:33 AM) dm10gen: see class DiskLoc for more info
+(11:04:43 AM) dm10gen: so that is how MmapV1RecordHeader::myExtent() works
+(11:04:53 AM) dm10gen: on an alloc(), when we build a new MmapV1RecordHeader, we must populate its extentOfs then
+*/
#pragma pack(1)
- class MmapV1RecordHeader {
- public:
- enum HeaderSizeValue { HeaderSize = 16 };
-
- int lengthWithHeaders() const { return _lengthWithHeaders; }
- int& lengthWithHeaders() { return _lengthWithHeaders; }
-
- int extentOfs() const { return _extentOfs; }
- int& extentOfs() { return _extentOfs; }
-
- int nextOfs() const { return _nextOfs; }
- int& nextOfs() { return _nextOfs; }
-
- int prevOfs() const { return _prevOfs; }
- int& prevOfs() { return _prevOfs; }
-
- const char* data() const { return _data; }
- char* data() { return _data; }
-
- // XXX remove
- const char* dataNoThrowing() const { return _data; }
- char* dataNoThrowing() { return _data; }
-
- int netLength() const { return _netLength(); }
-
- /* use this when a record is deleted. basically a union with next/prev fields */
- DeletedRecord& asDeleted() { return *((DeletedRecord*) this); }
-
- DiskLoc myExtentLoc(const DiskLoc& myLoc) const { return DiskLoc(myLoc.a(), extentOfs() ); }
-
- struct NP {
- int nextOfs;
- int prevOfs;
- };
-
- NP* np() { return (NP*) &_nextOfs; }
-
- RecordData toRecordData() const { return RecordData(_data, _netLength()); }
-
- private:
-
- int _netLength() const { return _lengthWithHeaders - HeaderSize; }
-
- int _lengthWithHeaders;
- int _extentOfs;
- int _nextOfs;
- int _prevOfs;
-
- /** be careful when referencing this that your write intent was correct */
- char _data[4];
-
- public:
- static bool MemoryTrackingEnabled;
-
+class MmapV1RecordHeader {
+public:
+ enum HeaderSizeValue { HeaderSize = 16 };
+
+ int lengthWithHeaders() const {
+ return _lengthWithHeaders;
+ }
+ int& lengthWithHeaders() {
+ return _lengthWithHeaders;
+ }
+
+ int extentOfs() const {
+ return _extentOfs;
+ }
+ int& extentOfs() {
+ return _extentOfs;
+ }
+
+ int nextOfs() const {
+ return _nextOfs;
+ }
+ int& nextOfs() {
+ return _nextOfs;
+ }
+
+ int prevOfs() const {
+ return _prevOfs;
+ }
+ int& prevOfs() {
+ return _prevOfs;
+ }
+
+ const char* data() const {
+ return _data;
+ }
+ char* data() {
+ return _data;
+ }
+
+ // XXX remove
+ const char* dataNoThrowing() const {
+ return _data;
+ }
+ char* dataNoThrowing() {
+ return _data;
+ }
+
+ int netLength() const {
+ return _netLength();
+ }
+
+ /* use this when a record is deleted. basically a union with next/prev fields */
+ DeletedRecord& asDeleted() {
+ return *((DeletedRecord*)this);
+ }
+
+ DiskLoc myExtentLoc(const DiskLoc& myLoc) const {
+ return DiskLoc(myLoc.a(), extentOfs());
+ }
+
+ struct NP {
+ int nextOfs;
+ int prevOfs;
};
-#pragma pack()
-
- // TODO: this probably moves to record_store.h
- class DeletedRecord {
- public:
- int lengthWithHeaders() const { return _lengthWithHeaders; }
- int& lengthWithHeaders() { return _lengthWithHeaders; }
+ NP* np() {
+ return (NP*)&_nextOfs;
+ }
- int extentOfs() const { return _extentOfs; }
- int& extentOfs() { return _extentOfs; }
+ RecordData toRecordData() const {
+ return RecordData(_data, _netLength());
+ }
- // TODO: we need to not const_cast here but problem is DiskLoc::writing
- DiskLoc& nextDeleted() const { return const_cast<DiskLoc&>(_nextDeleted); }
+private:
+ int _netLength() const {
+ return _lengthWithHeaders - HeaderSize;
+ }
- private:
- int _lengthWithHeaders;
+ int _lengthWithHeaders;
+ int _extentOfs;
+ int _nextOfs;
+ int _prevOfs;
- int _extentOfs;
+ /** be careful when referencing this that your write intent was correct */
+ char _data[4];
- DiskLoc _nextDeleted;
- };
+public:
+ static bool MemoryTrackingEnabled;
+};
+#pragma pack()
- BOOST_STATIC_ASSERT( 16 == sizeof(DeletedRecord) );
+// TODO: this probably moves to record_store.h
+class DeletedRecord {
+public:
+ int lengthWithHeaders() const {
+ return _lengthWithHeaders;
+ }
+ int& lengthWithHeaders() {
+ return _lengthWithHeaders;
+ }
+
+ int extentOfs() const {
+ return _extentOfs;
+ }
+ int& extentOfs() {
+ return _extentOfs;
+ }
+
+ // TODO: we need to not const_cast here but problem is DiskLoc::writing
+ DiskLoc& nextDeleted() const {
+ return const_cast<DiskLoc&>(_nextDeleted);
+ }
+
+private:
+ int _lengthWithHeaders;
+
+ int _extentOfs;
+
+ DiskLoc _nextDeleted;
+};
+
+BOOST_STATIC_ASSERT(16 == sizeof(DeletedRecord));
} // namespace mongo
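
The #pragma pack(1) header above is exactly four 4-byte fields, which is where HeaderSize = 16 comes from; _data is the first bytes of the payload, and netLength() is lengthWithHeaders() minus those 16 bytes. The extentOfs trick described in the comment works because a record and its extent share a file number, so myExtentLoc() only needs the record's own DiskLoc plus the stored 32-bit offset. A layout sketch under those assumptions (not the real class):

    #include <cstddef>
    #include <cstdint>

    #pragma pack(1)
    struct RecordHeaderSketch {
        int32_t lengthWithHeaders;  // total on-disk length, payload included
        int32_t extentOfs;          // offset of the owning extent in the same file
        int32_t nextOfs;            // intrusive links to neighboring records
        int32_t prevOfs;
        char data[4];               // start of the payload (flexible tail)
    };
    #pragma pack()

    static_assert(offsetof(RecordHeaderSketch, data) == 16, "HeaderSize == 16");
    // netLength() == lengthWithHeaders - 16: the payload size without the header.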
diff --git a/src/mongo/db/storage/mmap_v1/record_access_tracker.cpp b/src/mongo/db/storage/mmap_v1/record_access_tracker.cpp
index ab77ad69b08..ee13b62d456 100644
--- a/src/mongo/db/storage/mmap_v1/record_access_tracker.cpp
+++ b/src/mongo/db/storage/mmap_v1/record_access_tracker.cpp
@@ -42,312 +42,305 @@
namespace mongo {
- namespace {
+namespace {
- static bool blockSupported = false;
+static bool blockSupported = false;
- MONGO_INITIALIZER_WITH_PREREQUISITES(RecordBlockSupported,
- ("SystemInfo"))(InitializerContext* cx) {
- blockSupported = ProcessInfo::blockCheckSupported();
- return Status::OK();
- }
+MONGO_INITIALIZER_WITH_PREREQUISITES(RecordBlockSupported, ("SystemInfo"))(InitializerContext* cx) {
+ blockSupported = ProcessInfo::blockCheckSupported();
+ return Status::OK();
+}
- int hash(size_t region) {
- return
- abs( ( ( 7 + (int)(region & 0xFFFF) )
- * ( 11 + (int)( ( region >> 16 ) & 0xFFFF ) )
+int hash(size_t region) {
+ return abs(((7 + (int)(region & 0xFFFF)) * (11 + (int)((region >> 16) & 0xFFFF))
#if defined(_WIN64) || defined(__amd64__)
- * ( 13 + (int)( ( region >> 32 ) & 0xFFFF ) )
- * ( 17 + (int)( ( region >> 48 ) & 0xFFFF ) )
+ *
+ (13 + (int)((region >> 32) & 0xFFFF)) * (17 + (int)((region >> 48) & 0xFFFF))
#endif
- ) % RecordAccessTracker::SliceSize );
- }
+ ) %
+ RecordAccessTracker::SliceSize);
+}
- int bigHash(size_t region) {
- return hash(region) % RecordAccessTracker::BigHashSize;
- }
+int bigHash(size_t region) {
+ return hash(region) % RecordAccessTracker::BigHashSize;
+}
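To make the hash concrete: hash() mixes the four 16-bit chunks of a region id into a slot in [0, SliceSize), and bigHash() then folds that into one of the BigHashSize Rolling tables. A self-contained copy of the 64-bit variant, with constants taken from record_access_tracker.h (hashRegion is a local name for this sketch, not the real function):

    #include <cstdio>
    #include <cstdlib>

    const int SliceSize = 1024;   // from RecordAccessTracker::Constants
    const int BigHashSize = 128;  // from RecordAccessTracker::Constants

    int hashRegion(size_t region) {
        return std::abs(((7 + (int)(region & 0xFFFF)) * (11 + (int)((region >> 16) & 0xFFFF)) *
                         (13 + (int)((region >> 32) & 0xFFFF)) *
                         (17 + (int)((region >> 48) & 0xFFFF))) %
                        SliceSize);
    }

    int main() {
        // A record address decomposes to a region (see markAccessed below); the
        // hash picks a starting slot in a Slice, bigHash picks a Rolling table.
        size_t region = 0x3f2a7;
        std::printf("slot = %d, rolling = %d\n",
                    hashRegion(region), hashRegion(region) % BigHashSize);
        return 0;
    }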
- namespace PointerTable {
-
- /* A "superpage" is a group of 16 contiguous pages that differ
- * only in the low-order 16 bits. This means that there is
- * enough room in the low-order bits to store a bitmap for each
- * page in the superpage.
- */
- static const size_t superpageMask = ~0xffffLL;
- static const size_t superpageShift = 16;
- static const size_t pageSelectorMask = 0xf000LL; // selects a page in a superpage
- static const int pageSelectorShift = 12;
-
- // Tunables
- static const int capacity = 128; // in superpages
- static const int bucketSize = 4; // half cache line
- static const int buckets = capacity/bucketSize;
-
- struct Data {
- /** organized similar to a CPU cache
- * bucketSize-way set associative
- * least-recently-inserted replacement policy
- */
- size_t _table[buckets][bucketSize];
- long long _lastReset; // time in millis
- };
-
- void reset(Data* data) {
- memset(data->_table, 0, sizeof(data->_table));
- data->_lastReset = Listener::getElapsedTimeMillis();
- }
-
- inline void resetIfNeeded( Data* data ) {
- const long long now = Listener::getElapsedTimeMillis();
- if (MONGO_unlikely((now - data->_lastReset) >
- RecordAccessTracker::RotateTimeSecs*1000)) {
- reset(data);
- }
- }
-
- inline size_t pageBitOf(size_t ptr) {
- return 1LL << ((ptr & pageSelectorMask) >> pageSelectorShift);
- }
-
- inline size_t superpageOf(size_t ptr) {
- return ptr & superpageMask;
- }
-
- inline size_t bucketFor(size_t ptr) {
- return (ptr >> superpageShift) % buckets;
- }
-
- inline bool haveSeenPage(size_t superpage, size_t ptr) {
- return superpage & pageBitOf(ptr);
- }
-
- inline void markPageSeen(size_t& superpage, size_t ptr) {
- superpage |= pageBitOf(ptr);
- }
-
- /** call this to check a page has been seen yet. */
- inline bool seen(Data* data, size_t ptr) {
- resetIfNeeded(data);
-
- // A bucket contains 4 superpages each containing 16 contiguous pages
- // See above for a more detailed explanation of superpages
- size_t* bucket = data->_table[bucketFor(ptr)];
-
- for (int i = 0; i < bucketSize; i++) {
- if (superpageOf(ptr) == superpageOf(bucket[i])) {
- if (haveSeenPage(bucket[i], ptr))
- return true;
-
- markPageSeen(bucket[i], ptr);
- return false;
- }
- }
-
- // superpage isn't in thread-local cache
- // slide bucket forward and add new superpage at front
- for (int i = bucketSize-1; i > 0; i--)
- bucket[i] = bucket[i-1];
-
- bucket[0] = superpageOf(ptr);
- markPageSeen(bucket[0], ptr);
-
- return false;
- }
-
- Data* getData();
-
- }; // namespace PointerTable
-
- } // namespace
-
- //
- // Slice
- //
-
- RecordAccessTracker::Slice::Slice() {
- reset();
- }
+namespace PointerTable {
- void RecordAccessTracker::Slice::reset() {
- memset(_data, 0, sizeof(_data));
- _lastReset = time(0);
+/* A "superpage" is a group of 16 contiguous pages that differ
+ * only in the low-order 16 bits. This means that there is
+ * enough room in the low-order bits to store a bitmap for each
+ * page in the superpage.
+ */
+static const size_t superpageMask = ~0xffffLL;
+static const size_t superpageShift = 16;
+static const size_t pageSelectorMask = 0xf000LL; // selects a page in a superpage
+static const int pageSelectorShift = 12;
+
+// Tunables
+static const int capacity = 128; // in superpages
+static const int bucketSize = 4; // half cache line
+static const int buckets = capacity / bucketSize;
+
+struct Data {
+    /** Organized similarly to a CPU cache:
+     *  bucketSize-way set associative, with a
+     *  least-recently-inserted replacement policy.
+ */
+ size_t _table[buckets][bucketSize];
+ long long _lastReset; // time in millis
+};
+
+void reset(Data* data) {
+ memset(data->_table, 0, sizeof(data->_table));
+ data->_lastReset = Listener::getElapsedTimeMillis();
+}
+
+inline void resetIfNeeded(Data* data) {
+ const long long now = Listener::getElapsedTimeMillis();
+ if (MONGO_unlikely((now - data->_lastReset) > RecordAccessTracker::RotateTimeSecs * 1000)) {
+ reset(data);
}
+}
- RecordAccessTracker::State RecordAccessTracker::Slice::get(int regionHash,
- size_t region,
- short offset) {
- DEV verify(hash(region) == regionHash);
+inline size_t pageBitOf(size_t ptr) {
+ return 1LL << ((ptr & pageSelectorMask) >> pageSelectorShift);
+}
- Entry* e = _get(regionHash, region, false);
- if (!e)
- return Unk;
+inline size_t superpageOf(size_t ptr) {
+ return ptr & superpageMask;
+}
- return (e->value & ( 1ULL << offset ) ) ? In : Out;
- }
+inline size_t bucketFor(size_t ptr) {
+ return (ptr >> superpageShift) % buckets;
+}
- bool RecordAccessTracker::Slice::put(int regionHash, size_t region, short offset) {
- DEV verify(hash(region) == regionHash);
+inline bool haveSeenPage(size_t superpage, size_t ptr) {
+ return superpage & pageBitOf(ptr);
+}
- Entry* e = _get(regionHash, region, true);
- if (!e)
- return false;
+inline void markPageSeen(size_t& superpage, size_t ptr) {
+ superpage |= pageBitOf(ptr);
+}
- e->value |= 1ULL << offset;
- return true;
- }
+/** Call this to check whether a page has been seen yet (and mark it as seen). */
+inline bool seen(Data* data, size_t ptr) {
+ resetIfNeeded(data);
+
+ // A bucket contains 4 superpages each containing 16 contiguous pages
+ // See above for a more detailed explanation of superpages
+ size_t* bucket = data->_table[bucketFor(ptr)];
+
+ for (int i = 0; i < bucketSize; i++) {
+ if (superpageOf(ptr) == superpageOf(bucket[i])) {
+ if (haveSeenPage(bucket[i], ptr))
+ return true;
- time_t RecordAccessTracker::Slice::lastReset() const {
- return _lastReset;
+ markPageSeen(bucket[i], ptr);
+ return false;
+ }
}
- RecordAccessTracker::Entry* RecordAccessTracker::Slice::_get(int start,
- size_t region,
- bool add) {
- for (int i = 0; i < MaxChain; i++) {
- int bucket = (start + i) % SliceSize;
+ // superpage isn't in thread-local cache
+ // slide bucket forward and add new superpage at front
+ for (int i = bucketSize - 1; i > 0; i--)
+ bucket[i] = bucket[i - 1];
- if (_data[bucket].region == 0) {
- if (!add)
- return NULL;
+ bucket[0] = superpageOf(ptr);
+ markPageSeen(bucket[0], ptr);
- _data[bucket].region = region;
- return &_data[bucket];
- }
+ return false;
+}
- if (_data[bucket].region == region) {
- return &_data[bucket];
- }
- }
+Data* getData();
- return NULL;
- }
+}; // namespace PointerTable
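The bit layout above can be checked end to end in a small standalone program: a 64KB superpage covers 16 pages of 4KB, and the low 16 bits of a stored superpage address double as a one-bit-per-page "seen" bitmap:

    #include <cassert>
    #include <cstddef>

    int main() {
        const std::size_t superpageMask = ~0xffffULL;
        const std::size_t pageSelectorMask = 0xf000ULL;
        const int pageSelectorShift = 12;

        std::size_t ptr = 0x12345678;
        std::size_t superpage = ptr & superpageMask;  // 0x12340000
        std::size_t pageBit = 1ULL << ((ptr & pageSelectorMask) >> pageSelectorShift);

        assert(superpage == 0x12340000);
        assert(pageBit == (1ULL << 5));  // 0x5678 lies in page 5 of the superpage
        // Marking the page seen sets that bit in the superpage's low 16 bits:
        superpage |= pageBit;
        assert((superpage & pageBit) != 0);
        return 0;
    }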
- //
- // Rolling
- //
+} // namespace
- RecordAccessTracker::Rolling::Rolling() {
- _curSlice = 0;
- _lastRotate = Listener::getElapsedTimeMillis();
- }
+//
+// Slice
+//
- bool RecordAccessTracker::Rolling::access(size_t region, short offset, bool doHalf) {
- int regionHash = hash(region);
+RecordAccessTracker::Slice::Slice() {
+ reset();
+}
- stdx::lock_guard<SimpleMutex> lk(_lock);
+void RecordAccessTracker::Slice::reset() {
+ memset(_data, 0, sizeof(_data));
+ _lastReset = time(0);
+}
- static int rarelyCount = 0;
- if (rarelyCount++ % (2048 / BigHashSize) == 0) {
- long long now = Listener::getElapsedTimeMillis();
+RecordAccessTracker::State RecordAccessTracker::Slice::get(int regionHash,
+ size_t region,
+ short offset) {
+ DEV verify(hash(region) == regionHash);
- if (now - _lastRotate > (1000 * RotateTimeSecs)) {
- _rotate();
- }
- }
+ Entry* e = _get(regionHash, region, false);
+ if (!e)
+ return Unk;
- for (int i = 0; i < NumSlices / (doHalf ? 2 : 1); i++) {
- int pos = (_curSlice + i) % NumSlices;
- State s = _slices[pos].get(regionHash, region, offset);
+ return (e->value & (1ULL << offset)) ? In : Out;
+}
- if (s == In)
- return true;
+bool RecordAccessTracker::Slice::put(int regionHash, size_t region, short offset) {
+ DEV verify(hash(region) == regionHash);
+
+ Entry* e = _get(regionHash, region, true);
+ if (!e)
+ return false;
+
+ e->value |= 1ULL << offset;
+ return true;
+}
+
+time_t RecordAccessTracker::Slice::lastReset() const {
+ return _lastReset;
+}
+
+RecordAccessTracker::Entry* RecordAccessTracker::Slice::_get(int start, size_t region, bool add) {
+ for (int i = 0; i < MaxChain; i++) {
+ int bucket = (start + i) % SliceSize;
- if (s == Out) {
- _slices[pos].put(regionHash, region, offset);
- return false;
- }
+ if (_data[bucket].region == 0) {
+ if (!add)
+ return NULL;
+
+ _data[bucket].region = region;
+ return &_data[bucket];
+ }
+
+ if (_data[bucket].region == region) {
+ return &_data[bucket];
}
+ }
+
+ return NULL;
+}
+
+//
+// Rolling
+//
+
+RecordAccessTracker::Rolling::Rolling() {
+ _curSlice = 0;
+ _lastRotate = Listener::getElapsedTimeMillis();
+}
+
+bool RecordAccessTracker::Rolling::access(size_t region, short offset, bool doHalf) {
+ int regionHash = hash(region);
- // we weren't in any slice
- // so add to cur
- if (!_slices[_curSlice].put(regionHash, region, offset)) {
+ stdx::lock_guard<SimpleMutex> lk(_lock);
+
+ static int rarelyCount = 0;
+ if (rarelyCount++ % (2048 / BigHashSize) == 0) {
+ long long now = Listener::getElapsedTimeMillis();
+
+ if (now - _lastRotate > (1000 * RotateTimeSecs)) {
_rotate();
- _slices[_curSlice].put(regionHash, region, offset);
}
- return false;
}
- void RecordAccessTracker::Rolling::_rotate() {
- _curSlice = (_curSlice + 1) % NumSlices;
- _slices[_curSlice].reset();
- _lastRotate = Listener::getElapsedTimeMillis();
+ for (int i = 0; i < NumSlices / (doHalf ? 2 : 1); i++) {
+ int pos = (_curSlice + i) % NumSlices;
+ State s = _slices[pos].get(regionHash, region, offset);
+
+ if (s == In)
+ return true;
+
+ if (s == Out) {
+ _slices[pos].put(regionHash, region, offset);
+ return false;
+ }
+ }
+
+ // we weren't in any slice
+ // so add to cur
+ if (!_slices[_curSlice].put(regionHash, region, offset)) {
+ _rotate();
+ _slices[_curSlice].put(regionHash, region, offset);
}
+ return false;
+}
- // These need to be outside the ps namespace due to the way they are defined
+void RecordAccessTracker::Rolling::_rotate() {
+ _curSlice = (_curSlice + 1) % NumSlices;
+ _slices[_curSlice].reset();
+ _lastRotate = Listener::getElapsedTimeMillis();
+}
+
+// These need to be outside the ps namespace due to the way they are defined
#if defined(MONGO_CONFIG_HAVE___THREAD)
- __thread PointerTable::Data _pointerTableData;
- PointerTable::Data* PointerTable::getData() {
- return &_pointerTableData;
- }
+__thread PointerTable::Data _pointerTableData;
+PointerTable::Data* PointerTable::getData() {
+ return &_pointerTableData;
+}
#elif defined(MONGO_CONFIG_HAVE___DECLSPEC_THREAD)
- __declspec( thread ) PointerTable::Data _pointerTableData;
- PointerTable::Data* PointerTable::getData() {
- return &_pointerTableData;
- }
+__declspec(thread) PointerTable::Data _pointerTableData;
+PointerTable::Data* PointerTable::getData() {
+ return &_pointerTableData;
+}
#else
- TSP_DEFINE(PointerTable::Data, _pointerTableData);
- PointerTable::Data* PointerTable::getData() {
- return _pointerTableData.getMake();
- }
+TSP_DEFINE(PointerTable::Data, _pointerTableData);
+PointerTable::Data* PointerTable::getData() {
+ return _pointerTableData.getMake();
+}
#endif
- //
- // RecordAccessTracker
- //
+//
+// RecordAccessTracker
+//
- RecordAccessTracker::RecordAccessTracker()
- : _blockSupported(blockSupported) {
- reset();
- }
+RecordAccessTracker::RecordAccessTracker() : _blockSupported(blockSupported) {
+ reset();
+}
- void RecordAccessTracker::reset() {
- PointerTable::reset(PointerTable::getData());
- _rollingTable.reset(new Rolling[BigHashSize]);
- }
+void RecordAccessTracker::reset() {
+ PointerTable::reset(PointerTable::getData());
+ _rollingTable.reset(new Rolling[BigHashSize]);
+}
- void RecordAccessTracker::markAccessed(const void* record) {
- const size_t page = reinterpret_cast<size_t>(record) >> 12;
- const size_t region = page >> 6;
- const size_t offset = page & 0x3f;
+void RecordAccessTracker::markAccessed(const void* record) {
+ const size_t page = reinterpret_cast<size_t>(record) >> 12;
+ const size_t region = page >> 6;
+ const size_t offset = page & 0x3f;
- const bool seen = PointerTable::seen(PointerTable::getData(),
- reinterpret_cast<size_t>(record));
- if (!seen) {
- _rollingTable[bigHash(region)].access(region, offset , true);
- }
+ const bool seen = PointerTable::seen(PointerTable::getData(), reinterpret_cast<size_t>(record));
+ if (!seen) {
+ _rollingTable[bigHash(region)].access(region, offset, true);
}
+}
- bool RecordAccessTracker::checkAccessedAndMark(const void* record) {
- const size_t page = reinterpret_cast<size_t>(record) >> 12;
- const size_t region = page >> 6;
- const size_t offset = page & 0x3f;
-
- // This is like the "L1 cache". If we're a miss then we fall through and check the
- // "L2 cache". If we're still a miss, then we defer to a system-specific system
- // call (or give up and return false if deferring to the system call is not enabled).
- if (PointerTable::seen(PointerTable::getData(), reinterpret_cast<size_t>(record))) {
- return true;
- }
-
- // We were a miss in the PointerTable. See if we can find 'record' in the Rolling table.
- if (_rollingTable[bigHash(region)].access(region, offset, false)) {
- return true;
- }
+bool RecordAccessTracker::checkAccessedAndMark(const void* record) {
+ const size_t page = reinterpret_cast<size_t>(record) >> 12;
+ const size_t region = page >> 6;
+ const size_t offset = page & 0x3f;
- if (!_blockSupported) {
- // This means we don't fall back to a system call. Instead we assume things aren't
- // in memory. This could mean that we yield too much, but this is much better
- // than the alternative of not yielding through a page fault.
- return false;
- }
+    // This is like the "L1 cache". If we miss there, we fall through and check the
+    // "L2 cache". If we still miss, we defer to a system-specific system call (or
+    // give up and return false if deferring to the system call is not enabled).
+ if (PointerTable::seen(PointerTable::getData(), reinterpret_cast<size_t>(record))) {
+ return true;
+ }
- return ProcessInfo::blockInMemory(const_cast<void*>(record));
+ // We were a miss in the PointerTable. See if we can find 'record' in the Rolling table.
+ if (_rollingTable[bigHash(region)].access(region, offset, false)) {
+ return true;
}
- void RecordAccessTracker::disableSystemBlockInMemCheck() {
- _blockSupported = false;
+ if (!_blockSupported) {
+ // This means we don't fall back to a system call. Instead we assume things aren't
+ // in memory. This could mean that we yield too much, but this is much better
+ // than the alternative of not yielding through a page fault.
+ return false;
}
-} // namespace mongo
+ return ProcessInfo::blockInMemory(const_cast<void*>(record));
+}
+
+void RecordAccessTracker::disableSystemBlockInMemCheck() {
+ _blockSupported = false;
+}
+
+} // namespace mongo
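The address math in markAccessed()/checkAccessedAndMark() decomposes a record pointer into a 4KB page, a 64-page region, and a 6-bit offset indexing the 64-bit Entry::value bitmap. A worked example using the 0x30000 address from the tests below:

    #include <cassert>
    #include <cstddef>

    int main() {
        const std::size_t addr = 0x30000;        // as used in AccessTest below
        const std::size_t page = addr >> 12;     // 4KB pages -> 0x30
        const std::size_t region = page >> 6;    // 64 pages per region -> 0
        const std::size_t offset = page & 0x3f;  // bit index in Entry::value -> 0x30
        assert(page == 0x30 && region == 0 && offset == 0x30);
        return 0;
    }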
diff --git a/src/mongo/db/storage/mmap_v1/record_access_tracker.h b/src/mongo/db/storage/mmap_v1/record_access_tracker.h
index aa98e22230e..a1cb7ab2187 100644
--- a/src/mongo/db/storage/mmap_v1/record_access_tracker.h
+++ b/src/mongo/db/storage/mmap_v1/record_access_tracker.h
@@ -33,127 +33,126 @@
namespace mongo {
- class MmapV1RecordHeader;
+class MmapV1RecordHeader;
+
+/**
+ * Used to implement likelyInPhysicalMemory() for the MMAP v1 storage engine. Since
+ * MMAP v1 holds exclusive collection-level locks, it should yield the locks during a
+ * page fault. The RecordAccessTracker is used to guess at which records are in memory,
+ * so that a yield can be requested unless we're sure that the record has been
+ * recently accessed.
+ */
+class RecordAccessTracker {
+ MONGO_DISALLOW_COPYING(RecordAccessTracker);
+
+public:
+ RecordAccessTracker();
+
+ enum Constants {
+ SliceSize = 1024,
+ MaxChain = 20, // intentionally very low
+ NumSlices = 10,
+ RotateTimeSecs = 90,
+ BigHashSize = 128
+ };
/**
- * Used to implement likelyInPhysicalMemory() for the MMAP v1 storage engine. Since
- * MMAP v1 holds exclusive collection-level locks, it should yield the locks during a
- * page fault. The RecordAccessTracker is used to guess at which records are in memory,
- * so that a yield can be requested unless we're sure that the record has been
- * recently accessed.
+ * Informs this record access tracker that 'record' has been accessed.
*/
- class RecordAccessTracker {
- MONGO_DISALLOW_COPYING(RecordAccessTracker);
- public:
- RecordAccessTracker();
+ void markAccessed(const void* record);
- enum Constants {
- SliceSize = 1024,
- MaxChain = 20, // intentionally very low
- NumSlices = 10,
- RotateTimeSecs = 90,
- BigHashSize = 128
- };
+ /**
+ * @return whether or not 'record' has been marked as accessed recently. A return value
+ * of true means that 'record' is likely in physical memory.
+ *
+ * Also has the side effect of marking 'record' as accessed.
+ */
+ bool checkAccessedAndMark(const void* record);
- /**
- * Informs this record access tracker that 'record' has been accessed.
- */
- void markAccessed(const void* record);
+ /**
+ * Clears out any history of record accesses.
+ */
+ void reset();
- /**
- * @return whether or not 'record' has been marked as accessed recently. A return value
- * of true means that 'record' is likely in physical memory.
- *
- * Also has the side effect of marking 'record' as accessed.
- */
- bool checkAccessedAndMark(const void* record);
+ //
+ // For testing.
+ //
+
+ /**
+ * The accessedRecently() implementation falls back to making a system call if it
+ * appears that the record is not in physical memory. Use this method to disable
+ * the fallback for testing.
+ */
+ void disableSystemBlockInMemCheck();
+
+private:
+ enum State { In, Out, Unk };
+
+ struct Entry {
+ size_t region;
+ unsigned long long value;
+ };
+
+ /**
+     * A simple hash map from region -> status.
+     * Each Slice covers a single window of time.
+     * It uses chaining, but keeps the chains very short.
+ */
+ class Slice {
+ public:
+ Slice();
- /**
- * Clears out any history of record accesses.
- */
void reset();
- //
- // For testing.
- //
+ State get(int regionHash, size_t region, short offset);
/**
- * The accessedRecently() implementation falls back to making a system call if it
- * appears that the record is not in physical memory. Use this method to disable
- * the fallback for testing.
+ * @return true if added, false if full
*/
- void disableSystemBlockInMemCheck();
+ bool put(int regionHash, size_t region, short offset);
+
+ time_t lastReset() const;
private:
- enum State {
- In, Out, Unk
- };
+ Entry* _get(int start, size_t region, bool add);
+
+ Entry _data[SliceSize];
+ time_t _lastReset;
+ };
- struct Entry {
- size_t region;
- unsigned long long value;
- };
+ /**
+     * This holds several time slices. The idea is that memory status is
+     * recorded in the current time slice; after a certain period of time,
+     * it rolls off, so we end up checking again.
+ */
+ class Rolling {
+ public:
+ Rolling();
/**
- * simple hash map for region -> status
- * this constitutes a single region of time
- * it does chaining, but very short chains
+ * After this call, we assume the page is in RAM.
+ *
+     * @param doHalf if this is a known-good access, only the first half of the slices is consulted.
+ *
+ * @return whether we know the page is in RAM
*/
- class Slice {
- public:
- Slice();
-
- void reset();
-
- State get(int regionHash, size_t region, short offset);
+ bool access(size_t region, short offset, bool doHalf);
- /**
- * @return true if added, false if full
- */
- bool put(int regionHash, size_t region, short offset);
+ private:
+ void _rotate();
- time_t lastReset() const;
+ int _curSlice;
+ long long _lastRotate;
+ Slice _slices[NumSlices];
- private:
- Entry* _get(int start, size_t region, bool add);
+ SimpleMutex _lock;
+ };
- Entry _data[SliceSize];
- time_t _lastReset;
- };
+ // Should this record tracker fallback to making a system call?
+ bool _blockSupported;
- /**
- * this contains many slices of times
- * the idea you put mem status in the current time slice
- * and then after a certain period of time, it rolls off so we check again
- */
- class Rolling {
- public:
- Rolling();
-
- /**
- * After this call, we assume the page is in RAM.
- *
- * @param doHalf if this is a known good access, want to put in first half.
- *
- * @return whether we know the page is in RAM
- */
- bool access(size_t region, short offset, bool doHalf);
-
- private:
- void _rotate();
-
- int _curSlice;
- long long _lastRotate;
- Slice _slices[NumSlices];
-
- SimpleMutex _lock;
- };
-
- // Should this record tracker fallback to making a system call?
- bool _blockSupported;
-
- // An array of Rolling instances for tracking record accesses.
- std::unique_ptr<Rolling[]> _rollingTable;
- };
+ // An array of Rolling instances for tracking record accesses.
+ std::unique_ptr<Rolling[]> _rollingTable;
+};
-} // namespace
+} // namespace mongo
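For a sense of scale, the constants above pin down the tracker's footprint: each Entry is 16 bytes on a 64-bit build, so a Slice is about 16KB, a Rolling about 160KB, and the full table of BigHashSize Rollings roughly 20MB. A back-of-the-envelope sketch, ignoring the mutex and other bookkeeping (an assumption, not measured):

    #include <cstdio>

    int main() {
        const long long entry = 8 + 8;         // Entry: size_t region + 64-bit value
        const long long slice = 1024 * entry;  // SliceSize entries ~= 16KB
        const long long rolling = 10 * slice;  // NumSlices slices   ~= 160KB
        const long long table = 128 * rolling; // BigHashSize Rollings ~= 20MB
        std::printf("slice=%lldKB rolling=%lldKB table=%lldMB\n",
                    slice / 1024, rolling / 1024, table / (1024 * 1024));
        return 0;
    }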
diff --git a/src/mongo/db/storage/mmap_v1/record_access_tracker_test.cpp b/src/mongo/db/storage/mmap_v1/record_access_tracker_test.cpp
index 92147a24c55..7cc766f2b13 100644
--- a/src/mongo/db/storage/mmap_v1/record_access_tracker_test.cpp
+++ b/src/mongo/db/storage/mmap_v1/record_access_tracker_test.cpp
@@ -35,108 +35,108 @@ using namespace mongo;
namespace {
- const void* pointerOf(int data) {
- return reinterpret_cast<const void*>(data);
- }
-
- TEST(RecordAccessTrackerTest, TouchRecordTwice) {
- RecordAccessTracker tracker;
- tracker.disableSystemBlockInMemCheck();
-
- const void* record = pointerOf(0x10003);
-
- ASSERT_FALSE(tracker.checkAccessedAndMark(record));
- ASSERT_TRUE(tracker.checkAccessedAndMark(record));
- }
-
- TEST(RecordAccessTrackerTest, TouchPageTwice) {
- RecordAccessTracker tracker;
- tracker.disableSystemBlockInMemCheck();
-
- const void* firstRecord = pointerOf(0x10003);
- const void* secondRecord = pointerOf(0x10004);
-
- ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecord));
- ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecord));
- ASSERT_TRUE(tracker.checkAccessedAndMark(firstRecord));
- ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecord));
- }
-
- TEST(RecordAccessTrackerTest, TouchTwoPagesTwice) {
- RecordAccessTracker tracker;
- tracker.disableSystemBlockInMemCheck();
-
- const void* firstRecordFirstPage = pointerOf(0x11000);
- const void* secondRecordFirstPage = pointerOf(0x11100);
-
- const void* firstRecordSecondPage = pointerOf(0x12000);
- const void* secondRecordSecondPage = pointerOf(0x12100);
-
- ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecordFirstPage));
- ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecordSecondPage));
- ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecordFirstPage));
- ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecordSecondPage));
- }
-
- // Tests RecordAccessTracker::reset().
- TEST(RecordAccessTrackerTest, TouchTwoPagesTwiceWithReset) {
- RecordAccessTracker tracker;
- tracker.disableSystemBlockInMemCheck();
-
- const void* firstRecordFirstPage = pointerOf(0x11000);
- const void* secondRecordFirstPage = pointerOf(0x11100);
-
- const void* firstRecordSecondPage = pointerOf(0x12000);
- const void* secondRecordSecondPage = pointerOf(0x12100);
-
- ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecordFirstPage));
- ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecordSecondPage));
- ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecordFirstPage));
- ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecordSecondPage));
-
- // Now reset and make sure things look as though we have a fresh RecordAccessTracker.
- tracker.reset();
- ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecordFirstPage));
- ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecordSecondPage));
- ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecordFirstPage));
- ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecordSecondPage));
+const void* pointerOf(int data) {
+ return reinterpret_cast<const void*>(data);
+}
+
+TEST(RecordAccessTrackerTest, TouchRecordTwice) {
+ RecordAccessTracker tracker;
+ tracker.disableSystemBlockInMemCheck();
+
+ const void* record = pointerOf(0x10003);
+
+ ASSERT_FALSE(tracker.checkAccessedAndMark(record));
+ ASSERT_TRUE(tracker.checkAccessedAndMark(record));
+}
+
+TEST(RecordAccessTrackerTest, TouchPageTwice) {
+ RecordAccessTracker tracker;
+ tracker.disableSystemBlockInMemCheck();
+
+ const void* firstRecord = pointerOf(0x10003);
+ const void* secondRecord = pointerOf(0x10004);
+
+ ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecord));
+ ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecord));
+ ASSERT_TRUE(tracker.checkAccessedAndMark(firstRecord));
+ ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecord));
+}
+
+TEST(RecordAccessTrackerTest, TouchTwoPagesTwice) {
+ RecordAccessTracker tracker;
+ tracker.disableSystemBlockInMemCheck();
+
+ const void* firstRecordFirstPage = pointerOf(0x11000);
+ const void* secondRecordFirstPage = pointerOf(0x11100);
+
+ const void* firstRecordSecondPage = pointerOf(0x12000);
+ const void* secondRecordSecondPage = pointerOf(0x12100);
+
+ ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecordFirstPage));
+ ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecordSecondPage));
+ ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecordFirstPage));
+ ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecordSecondPage));
+}
+
+// Tests RecordAccessTracker::reset().
+TEST(RecordAccessTrackerTest, TouchTwoPagesTwiceWithReset) {
+ RecordAccessTracker tracker;
+ tracker.disableSystemBlockInMemCheck();
+
+ const void* firstRecordFirstPage = pointerOf(0x11000);
+ const void* secondRecordFirstPage = pointerOf(0x11100);
+
+ const void* firstRecordSecondPage = pointerOf(0x12000);
+ const void* secondRecordSecondPage = pointerOf(0x12100);
+
+ ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecordFirstPage));
+ ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecordSecondPage));
+ ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecordFirstPage));
+ ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecordSecondPage));
+
+ // Now reset and make sure things look as though we have a fresh RecordAccessTracker.
+ tracker.reset();
+ ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecordFirstPage));
+ ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecordSecondPage));
+ ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecordFirstPage));
+ ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecordSecondPage));
+}
+
+// Tests RecordAccessTracker::markAccessed().
+TEST(RecordAccessTrackerTest, AccessTest) {
+ RecordAccessTracker tracker;
+ tracker.disableSystemBlockInMemCheck();
+
+ // Mark the first page in superpage 3 as accessed.
+ const void* record = pointerOf(0x30000);
+ tracker.markAccessed(record);
+
+ // Test that all remaining addresses in the page give true when asked whether they are
+ // recently accessed.
+ for (int i = 0x30001; i < 0x31000; i++) {
+ const void* touchedPageRecord = pointerOf(i);
+ ASSERT_TRUE(tracker.checkAccessedAndMark(touchedPageRecord));
}
-
- // Tests RecordAccessTracker::markAccessed().
- TEST(RecordAccessTrackerTest, AccessTest) {
- RecordAccessTracker tracker;
- tracker.disableSystemBlockInMemCheck();
-
- // Mark the first page in superpage 3 as accessed.
- const void* record = pointerOf(0x30000);
- tracker.markAccessed(record);
-
- // Test that all remaining addresses in the page give true when asked whether they are
- // recently accessed.
- for (int i = 0x30001; i < 0x31000; i++) {
- const void* touchedPageRecord = pointerOf(i);
- ASSERT_TRUE(tracker.checkAccessedAndMark(touchedPageRecord));
- }
+}
+
+// Touch pages in 128 separate superpages, and make sure that they all are reported as
+// recently accessed.
+TEST(RecordAccessTrackerTest, Access128Superpages) {
+ RecordAccessTracker tracker;
+ tracker.disableSystemBlockInMemCheck();
+
+ // Touch the pages.
+ for (int i = 0x00000; i < 0x800000; i += 0x10000) {
+ const void* touchedPageRecord = pointerOf(i);
+ tracker.markAccessed(touchedPageRecord);
}
- // Touch pages in 128 separate superpages, and make sure that they all are reported as
- // recently accessed.
- TEST(RecordAccessTrackerTest, Access128Superpages) {
- RecordAccessTracker tracker;
- tracker.disableSystemBlockInMemCheck();
-
- // Touch the pages.
- for (int i = 0x00000; i < 0x800000; i += 0x10000) {
- const void* touchedPageRecord = pointerOf(i);
- tracker.markAccessed(touchedPageRecord);
- }
-
- // Ensure we know that the pages have all been touched.
- for (int i = 0x00000; i < 0x800000; i += 0x10000) {
- // It should be fine if there is an offset of, say, 0xA, into the page.
- const void* touchedPageRecord = pointerOf(i + 0xA);
- ASSERT_TRUE(tracker.checkAccessedAndMark(touchedPageRecord));
- }
+ // Ensure we know that the pages have all been touched.
+ for (int i = 0x00000; i < 0x800000; i += 0x10000) {
+ // It should be fine if there is an offset of, say, 0xA, into the page.
+ const void* touchedPageRecord = pointerOf(i + 0xA);
+ ASSERT_TRUE(tracker.checkAccessedAndMark(touchedPageRecord));
}
+}
} // namespace
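The stride in Access128Superpages is not arbitrary: 0x10000 is exactly one superpage (16 pages of 4KB), so the loop touches one page in each of 0x800000 / 0x10000 = 128 superpages, matching the PointerTable capacity of 128 superpages. The arithmetic, spelled out:

    #include <cassert>

    int main() {
        const int superpage = 16 * 4096;  // 16 pages of 4KB each
        assert(superpage == 0x10000);     // the loop stride in the test above
        assert(0x800000 / superpage == 128);  // == PointerTable capacity
        return 0;
    }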
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_base.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_base.cpp
index 5862a44a144..cc8cf582ffe 100644
--- a/src/mongo/db/storage/mmap_v1/record_store_v1_base.cpp
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_base.cpp
@@ -48,935 +48,920 @@
namespace mongo {
- using std::unique_ptr;
- using std::set;
- using std::string;
-
- /* Deleted list buckets are used to quickly locate free space based on size. Each bucket
- contains records up to that size (meaning a record with a size exactly equal to
- bucketSizes[n] would go into bucket n+1).
- */
- const int RecordStoreV1Base::bucketSizes[] = {
- 0x20, 0x40, 0x80, 0x100, // 32, 64, 128, 256
- 0x200, 0x400, 0x800, 0x1000, // 512, 1K, 2K, 4K
- 0x2000, 0x4000, 0x8000, 0x10000, // 8K, 16K, 32K, 64K
- 0x20000, 0x40000, 0x80000, 0x100000, // 128K, 256K, 512K, 1M
- 0x200000, 0x400000, 0x600000, 0x800000, // 2M, 4M, 6M, 8M
- 0xA00000, 0xC00000, 0xE00000, // 10M, 12M, 14M,
- MaxAllowedAllocation, // 16.5M
- MaxAllowedAllocation + 1, // Only MaxAllowedAllocation sized records go here.
- INT_MAX, // "oversized" bucket for unused parts of extents.
- };
-
- // If this fails, it means that bucketSizes doesn't have the correct number of entries.
- BOOST_STATIC_ASSERT(sizeof(RecordStoreV1Base::bucketSizes)
- / sizeof(RecordStoreV1Base::bucketSizes[0])
- == RecordStoreV1Base::Buckets);
-
- SavedCursorRegistry::~SavedCursorRegistry() {
- for (SavedCursorSet::iterator it = _cursors.begin(); it != _cursors.end(); it++) {
- (*it)->_registry = NULL; // prevent SavedCursor destructor from accessing this
- }
+using std::unique_ptr;
+using std::set;
+using std::string;
+
+/* Deleted list buckets are used to quickly locate free space based on size. Each bucket
+ contains records up to that size (meaning a record with a size exactly equal to
+ bucketSizes[n] would go into bucket n+1).
+*/
+const int RecordStoreV1Base::bucketSizes[] = {
+ 0x20,
+ 0x40,
+ 0x80,
+ 0x100, // 32, 64, 128, 256
+ 0x200,
+ 0x400,
+ 0x800,
+ 0x1000, // 512, 1K, 2K, 4K
+ 0x2000,
+ 0x4000,
+ 0x8000,
+ 0x10000, // 8K, 16K, 32K, 64K
+ 0x20000,
+ 0x40000,
+ 0x80000,
+ 0x100000, // 128K, 256K, 512K, 1M
+ 0x200000,
+ 0x400000,
+ 0x600000,
+ 0x800000, // 2M, 4M, 6M, 8M
+ 0xA00000,
+ 0xC00000,
+ 0xE00000, // 10M, 12M, 14M,
+ MaxAllowedAllocation, // 16.5M
+ MaxAllowedAllocation + 1, // Only MaxAllowedAllocation sized records go here.
+ INT_MAX, // "oversized" bucket for unused parts of extents.
+};
+
+// If this fails, it means that bucketSizes doesn't have the correct number of entries.
+BOOST_STATIC_ASSERT(sizeof(RecordStoreV1Base::bucketSizes) /
+ sizeof(RecordStoreV1Base::bucketSizes[0]) ==
+ RecordStoreV1Base::Buckets);
+
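Deleted records are filed into the first bucket whose boundary is strictly greater than their size, which is what the "size exactly equal to bucketSizes[n] goes into bucket n+1" comment above means. A sketch of that lookup over a truncated copy of the table (bucketOf is a hypothetical helper for illustration, not the real member function):

    #include <algorithm>
    #include <cassert>

    // Truncated copy of the bucket boundaries above (the real table runs
    // on up to MaxAllowedAllocation and an INT_MAX "oversized" bucket).
    static const int kBucketSizes[] = {0x20,  0x40,  0x80,  0x100,
                                       0x200, 0x400, 0x800, 0x1000};
    static const int kBuckets = sizeof(kBucketSizes) / sizeof(kBucketSizes[0]);

    int bucketOf(int sizeWithHeaders) {
        // First bucket whose boundary is strictly greater than the size.
        const int* end = kBucketSizes + kBuckets;
        return (int)(std::upper_bound(kBucketSizes, end, sizeWithHeaders) - kBucketSizes);
    }

    int main() {
        assert(bucketOf(0x1f) == 0);   // fits the 32-byte bucket
        assert(bucketOf(0x20) == 1);   // exactly 32 bytes spills to the next one
        assert(bucketOf(0x100) == 4);  // exactly 256 bytes -> the 512-byte bucket
        return 0;
    }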
+SavedCursorRegistry::~SavedCursorRegistry() {
+ for (SavedCursorSet::iterator it = _cursors.begin(); it != _cursors.end(); it++) {
+ (*it)->_registry = NULL; // prevent SavedCursor destructor from accessing this
}
+}
- void SavedCursorRegistry::registerCursor(SavedCursor* cursor) {
- invariant(!cursor->_registry);
- cursor->_registry = this;
- scoped_spinlock lock(_mutex);
- _cursors.insert(cursor);
- }
+void SavedCursorRegistry::registerCursor(SavedCursor* cursor) {
+ invariant(!cursor->_registry);
+ cursor->_registry = this;
+ scoped_spinlock lock(_mutex);
+ _cursors.insert(cursor);
+}
- bool SavedCursorRegistry::unregisterCursor(SavedCursor* cursor) {
- if (!cursor->_registry) {
- return false;
- }
- invariant(cursor->_registry == this);
- cursor->_registry = NULL;
- scoped_spinlock lock(_mutex);
- invariant(_cursors.erase(cursor));
- return true;
- }
-
- void SavedCursorRegistry::invalidateCursorsForBucket(DiskLoc bucket) {
- // While this is not strictly necessary as an exclusive collection lock will be held,
- // it's cleaner to just make the SavedCursorRegistry thread-safe. Spinlock is OK here.
- scoped_spinlock lock(_mutex);
- for (SavedCursorSet::iterator it = _cursors.begin(); it != _cursors.end();) {
- if ((*it)->bucket == bucket) {
- (*it)->_registry = NULL; // prevent ~SavedCursor from trying to unregister
- _cursors.erase(it++);
- }
- else {
- it++;
- }
- }
+bool SavedCursorRegistry::unregisterCursor(SavedCursor* cursor) {
+ if (!cursor->_registry) {
+ return false;
}
+ invariant(cursor->_registry == this);
+ cursor->_registry = NULL;
+ scoped_spinlock lock(_mutex);
+ invariant(_cursors.erase(cursor));
+ return true;
+}
- RecordStoreV1Base::RecordStoreV1Base( StringData ns,
- RecordStoreV1MetaData* details,
- ExtentManager* em,
- bool isSystemIndexes )
- : RecordStore( ns ),
- _details( details ),
- _extentManager( em ),
- _isSystemIndexes( isSystemIndexes ) {
+void SavedCursorRegistry::invalidateCursorsForBucket(DiskLoc bucket) {
+ // While this is not strictly necessary as an exclusive collection lock will be held,
+ // it's cleaner to just make the SavedCursorRegistry thread-safe. Spinlock is OK here.
+ scoped_spinlock lock(_mutex);
+ for (SavedCursorSet::iterator it = _cursors.begin(); it != _cursors.end();) {
+ if ((*it)->bucket == bucket) {
+ (*it)->_registry = NULL; // prevent ~SavedCursor from trying to unregister
+ _cursors.erase(it++);
+ } else {
+ it++;
+ }
}
+}
- RecordStoreV1Base::~RecordStoreV1Base() {
- }
+RecordStoreV1Base::RecordStoreV1Base(StringData ns,
+ RecordStoreV1MetaData* details,
+ ExtentManager* em,
+ bool isSystemIndexes)
+ : RecordStore(ns), _details(details), _extentManager(em), _isSystemIndexes(isSystemIndexes) {}
+RecordStoreV1Base::~RecordStoreV1Base() {}
- int64_t RecordStoreV1Base::storageSize( OperationContext* txn,
- BSONObjBuilder* extraInfo,
- int level ) const {
- BSONArrayBuilder extentInfo;
- int64_t total = 0;
- int n = 0;
+int64_t RecordStoreV1Base::storageSize(OperationContext* txn,
+ BSONObjBuilder* extraInfo,
+ int level) const {
+ BSONArrayBuilder extentInfo;
- DiskLoc cur = _details->firstExtent(txn);
+ int64_t total = 0;
+ int n = 0;
- while ( !cur.isNull() ) {
- Extent* e = _extentManager->getExtent( cur );
+ DiskLoc cur = _details->firstExtent(txn);
- total += e->length;
- n++;
+ while (!cur.isNull()) {
+ Extent* e = _extentManager->getExtent(cur);
- if ( extraInfo && level > 0 ) {
- extentInfo.append( BSON( "len" << e->length << "loc: " << e->myLoc.toBSONObj() ) );
- }
- cur = e->xnext;
- }
+ total += e->length;
+ n++;
- if ( extraInfo ) {
- extraInfo->append( "numExtents", n );
- if ( level > 0 )
- extraInfo->append( "extents", extentInfo.arr() );
+ if (extraInfo && level > 0) {
+            extentInfo.append(BSON("len" << e->length << "loc" << e->myLoc.toBSONObj()));
}
-
- return total;
+ cur = e->xnext;
}
- RecordData RecordStoreV1Base::dataFor( OperationContext* txn, const RecordId& loc ) const {
- return recordFor(DiskLoc::fromRecordId(loc))->toRecordData();
+ if (extraInfo) {
+ extraInfo->append("numExtents", n);
+ if (level > 0)
+ extraInfo->append("extents", extentInfo.arr());
}
- bool RecordStoreV1Base::findRecord( OperationContext* txn,
- const RecordId& loc, RecordData* rd ) const {
- // this is a bit odd, as the semantics of using the storage engine imply it _has_ to be.
- // And in fact we can't actually check.
- // So we assume the best.
- MmapV1RecordHeader* rec = recordFor(DiskLoc::fromRecordId(loc));
- if ( !rec ) {
- return false;
- }
- *rd = rec->toRecordData();
- return true;
- }
+ return total;
+}
- MmapV1RecordHeader* RecordStoreV1Base::recordFor( const DiskLoc& loc ) const {
- return _extentManager->recordForV1( loc );
- }
+RecordData RecordStoreV1Base::dataFor(OperationContext* txn, const RecordId& loc) const {
+ return recordFor(DiskLoc::fromRecordId(loc))->toRecordData();
+}
- const DeletedRecord* RecordStoreV1Base::deletedRecordFor( const DiskLoc& loc ) const {
- invariant( loc.a() != -1 );
- return reinterpret_cast<const DeletedRecord*>( recordFor( loc ) );
- }
+bool RecordStoreV1Base::findRecord(OperationContext* txn,
+ const RecordId& loc,
+ RecordData* rd) const {
+    // This is a bit odd: the semantics of the storage engine API imply the
+    // record _has_ to exist, and in fact we can't actually check that here.
+    // So we assume the best.
+ MmapV1RecordHeader* rec = recordFor(DiskLoc::fromRecordId(loc));
+ if (!rec) {
+ return false;
+ }
+ *rd = rec->toRecordData();
+ return true;
+}
- DeletedRecord* RecordStoreV1Base::drec( const DiskLoc& loc ) const {
- invariant( loc.a() != -1 );
- return reinterpret_cast<DeletedRecord*>( recordFor( loc ) );
- }
+MmapV1RecordHeader* RecordStoreV1Base::recordFor(const DiskLoc& loc) const {
+ return _extentManager->recordForV1(loc);
+}
- Extent* RecordStoreV1Base::_getExtent( OperationContext* txn, const DiskLoc& loc ) const {
- return _extentManager->getExtent( loc );
- }
+const DeletedRecord* RecordStoreV1Base::deletedRecordFor(const DiskLoc& loc) const {
+ invariant(loc.a() != -1);
+ return reinterpret_cast<const DeletedRecord*>(recordFor(loc));
+}
- DiskLoc RecordStoreV1Base::_getExtentLocForRecord( OperationContext* txn, const DiskLoc& loc ) const {
- return _extentManager->extentLocForV1( loc );
- }
+DeletedRecord* RecordStoreV1Base::drec(const DiskLoc& loc) const {
+ invariant(loc.a() != -1);
+ return reinterpret_cast<DeletedRecord*>(recordFor(loc));
+}
+Extent* RecordStoreV1Base::_getExtent(OperationContext* txn, const DiskLoc& loc) const {
+ return _extentManager->getExtent(loc);
+}
- DiskLoc RecordStoreV1Base::getNextRecord( OperationContext* txn, const DiskLoc& loc ) const {
- DiskLoc next = getNextRecordInExtent( txn, loc );
- if ( !next.isNull() ) {
- return next;
- }
+DiskLoc RecordStoreV1Base::_getExtentLocForRecord(OperationContext* txn, const DiskLoc& loc) const {
+ return _extentManager->extentLocForV1(loc);
+}
- // now traverse extents
- Extent* e = _getExtent( txn, _getExtentLocForRecord(txn, loc) );
- while ( 1 ) {
- if ( e->xnext.isNull() )
- return DiskLoc(); // end of collection
- e = _getExtent( txn, e->xnext );
- if ( !e->firstRecord.isNull() )
- break;
- // entire extent could be empty, keep looking
- }
- return e->firstRecord;
+DiskLoc RecordStoreV1Base::getNextRecord(OperationContext* txn, const DiskLoc& loc) const {
+ DiskLoc next = getNextRecordInExtent(txn, loc);
+ if (!next.isNull()) {
+ return next;
}
- DiskLoc RecordStoreV1Base::getPrevRecord( OperationContext* txn, const DiskLoc& loc ) const {
- DiskLoc prev = getPrevRecordInExtent( txn, loc );
- if ( !prev.isNull() ) {
- return prev;
- }
+ // now traverse extents
- // now traverse extents
+ Extent* e = _getExtent(txn, _getExtentLocForRecord(txn, loc));
+ while (1) {
+ if (e->xnext.isNull())
+ return DiskLoc(); // end of collection
+ e = _getExtent(txn, e->xnext);
+ if (!e->firstRecord.isNull())
+ break;
+ // entire extent could be empty, keep looking
+ }
+ return e->firstRecord;
+}
- Extent *e = _getExtent(txn, _getExtentLocForRecord(txn, loc));
- while ( 1 ) {
- if ( e->xprev.isNull() )
- return DiskLoc(); // end of collection
- e = _getExtent( txn, e->xprev );
- if ( !e->firstRecord.isNull() )
- break;
- // entire extent could be empty, keep looking
- }
- return e->lastRecord;
-
- }
-
- DiskLoc RecordStoreV1Base::_findFirstSpot( OperationContext* txn,
- const DiskLoc& extDiskLoc,
- Extent* e ) {
- DiskLoc emptyLoc = extDiskLoc;
- emptyLoc.inc( Extent::HeaderSize() );
- int delRecLength = e->length - Extent::HeaderSize();
- if ( delRecLength >= 32*1024 && _ns.find('$') != string::npos && !isCapped() ) {
- // probably an index. so skip forward to keep its records page aligned
- int& ofs = emptyLoc.GETOFS();
- int newOfs = (ofs + 0xfff) & ~0xfff;
- delRecLength -= (newOfs-ofs);
- dassert( delRecLength > 0 );
- ofs = newOfs;
- }
+DiskLoc RecordStoreV1Base::getPrevRecord(OperationContext* txn, const DiskLoc& loc) const {
+ DiskLoc prev = getPrevRecordInExtent(txn, loc);
+ if (!prev.isNull()) {
+ return prev;
+ }
- DeletedRecord* empty = txn->recoveryUnit()->writing(drec(emptyLoc));
- empty->lengthWithHeaders() = delRecLength;
- empty->extentOfs() = e->myLoc.getOfs();
- empty->nextDeleted().Null();
- return emptyLoc;
+ // now traverse extents
+ Extent* e = _getExtent(txn, _getExtentLocForRecord(txn, loc));
+ while (1) {
+ if (e->xprev.isNull())
+ return DiskLoc(); // end of collection
+ e = _getExtent(txn, e->xprev);
+ if (!e->firstRecord.isNull())
+ break;
+ // entire extent could be empty, keep looking
}
+ return e->lastRecord;
+}
- DiskLoc RecordStoreV1Base::getNextRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const {
- int nextOffset = recordFor( loc )->nextOfs();
-
- if ( nextOffset == DiskLoc::NullOfs )
- return DiskLoc();
+DiskLoc RecordStoreV1Base::_findFirstSpot(OperationContext* txn,
+ const DiskLoc& extDiskLoc,
+ Extent* e) {
+ DiskLoc emptyLoc = extDiskLoc;
+ emptyLoc.inc(Extent::HeaderSize());
+ int delRecLength = e->length - Extent::HeaderSize();
+ if (delRecLength >= 32 * 1024 && _ns.find('$') != string::npos && !isCapped()) {
+        // probably an index, so skip forward to keep its records page-aligned
+ int& ofs = emptyLoc.GETOFS();
+ int newOfs = (ofs + 0xfff) & ~0xfff;
+ delRecLength -= (newOfs - ofs);
+ dassert(delRecLength > 0);
+ ofs = newOfs;
+ }
+
+ DeletedRecord* empty = txn->recoveryUnit()->writing(drec(emptyLoc));
+ empty->lengthWithHeaders() = delRecLength;
+ empty->extentOfs() = e->myLoc.getOfs();
+ empty->nextDeleted().Null();
+ return emptyLoc;
+}
- fassert( 17441, abs(nextOffset) >= 8 ); // defensive
- DiskLoc result( loc.a(), nextOffset );
- return result;
- }
+DiskLoc RecordStoreV1Base::getNextRecordInExtent(OperationContext* txn, const DiskLoc& loc) const {
+ int nextOffset = recordFor(loc)->nextOfs();
- DiskLoc RecordStoreV1Base::getPrevRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const {
- int prevOffset = recordFor( loc )->prevOfs();
+ if (nextOffset == DiskLoc::NullOfs)
+ return DiskLoc();
- if ( prevOffset == DiskLoc::NullOfs )
- return DiskLoc();
+ fassert(17441, abs(nextOffset) >= 8); // defensive
+ DiskLoc result(loc.a(), nextOffset);
+ return result;
+}
- fassert( 17442, abs(prevOffset) >= 8 ); // defensive
- DiskLoc result( loc.a(), prevOffset );
- return result;
- }
+DiskLoc RecordStoreV1Base::getPrevRecordInExtent(OperationContext* txn, const DiskLoc& loc) const {
+ int prevOffset = recordFor(loc)->prevOfs();
- StatusWith<RecordId> RecordStoreV1Base::insertRecord( OperationContext* txn,
- const DocWriter* doc,
- bool enforceQuota ) {
- int docSize = doc->documentSize();
- if ( docSize < 4 ) {
- return StatusWith<RecordId>(ErrorCodes::InvalidLength, "record has to be >= 4 bytes");
- }
- const int lenWHdr = docSize + MmapV1RecordHeader::HeaderSize;
- if ( lenWHdr > MaxAllowedAllocation ) {
- return StatusWith<RecordId>(ErrorCodes::InvalidLength, "record has to be <= 16.5MB");
- }
- const int lenToAlloc = (doc->addPadding() && shouldPadInserts())
- ? quantizeAllocationSpace(lenWHdr)
- : lenWHdr;
+ if (prevOffset == DiskLoc::NullOfs)
+ return DiskLoc();
- StatusWith<DiskLoc> loc = allocRecord( txn, lenToAlloc, enforceQuota );
- if ( !loc.isOK() )
- return StatusWith<RecordId>(loc.getStatus());
+ fassert(17442, abs(prevOffset) >= 8); // defensive
+ DiskLoc result(loc.a(), prevOffset);
+ return result;
+}
- MmapV1RecordHeader *r = recordFor( loc.getValue() );
- fassert( 17319, r->lengthWithHeaders() >= lenWHdr );
+StatusWith<RecordId> RecordStoreV1Base::insertRecord(OperationContext* txn,
+ const DocWriter* doc,
+ bool enforceQuota) {
+ int docSize = doc->documentSize();
+ if (docSize < 4) {
+ return StatusWith<RecordId>(ErrorCodes::InvalidLength, "record has to be >= 4 bytes");
+ }
+ const int lenWHdr = docSize + MmapV1RecordHeader::HeaderSize;
+ if (lenWHdr > MaxAllowedAllocation) {
+ return StatusWith<RecordId>(ErrorCodes::InvalidLength, "record has to be <= 16.5MB");
+ }
+ const int lenToAlloc =
+ (doc->addPadding() && shouldPadInserts()) ? quantizeAllocationSpace(lenWHdr) : lenWHdr;
- r = reinterpret_cast<MmapV1RecordHeader*>( txn->recoveryUnit()->writingPtr(r, lenWHdr) );
- doc->writeDocument( r->data() );
+ StatusWith<DiskLoc> loc = allocRecord(txn, lenToAlloc, enforceQuota);
+ if (!loc.isOK())
+ return StatusWith<RecordId>(loc.getStatus());
- _addRecordToRecListInExtent(txn, r, loc.getValue());
+ MmapV1RecordHeader* r = recordFor(loc.getValue());
+ fassert(17319, r->lengthWithHeaders() >= lenWHdr);
- _details->incrementStats( txn, r->netLength(), 1 );
+ r = reinterpret_cast<MmapV1RecordHeader*>(txn->recoveryUnit()->writingPtr(r, lenWHdr));
+ doc->writeDocument(r->data());
- return StatusWith<RecordId>(loc.getValue().toRecordId());
- }
+ _addRecordToRecListInExtent(txn, r, loc.getValue());
+ _details->incrementStats(txn, r->netLength(), 1);
- StatusWith<RecordId> RecordStoreV1Base::insertRecord( OperationContext* txn,
- const char* data,
- int len,
- bool enforceQuota ) {
- if ( len < 4 ) {
- return StatusWith<RecordId>( ErrorCodes::InvalidLength, "record has to be >= 4 bytes" );
- }
+ return StatusWith<RecordId>(loc.getValue().toRecordId());
+}
- if ( len + MmapV1RecordHeader::HeaderSize > MaxAllowedAllocation ) {
- return StatusWith<RecordId>( ErrorCodes::InvalidLength, "record has to be <= 16.5MB" );
- }
- return _insertRecord( txn, data, len, enforceQuota );
+StatusWith<RecordId> RecordStoreV1Base::insertRecord(OperationContext* txn,
+ const char* data,
+ int len,
+ bool enforceQuota) {
+ if (len < 4) {
+ return StatusWith<RecordId>(ErrorCodes::InvalidLength, "record has to be >= 4 bytes");
}
- StatusWith<RecordId> RecordStoreV1Base::_insertRecord( OperationContext* txn,
- const char* data,
- int len,
- bool enforceQuota ) {
+ if (len + MmapV1RecordHeader::HeaderSize > MaxAllowedAllocation) {
+ return StatusWith<RecordId>(ErrorCodes::InvalidLength, "record has to be <= 16.5MB");
+ }
- const int lenWHdr = len + MmapV1RecordHeader::HeaderSize;
- const int lenToAlloc = shouldPadInserts() ? quantizeAllocationSpace(lenWHdr)
- : lenWHdr;
- fassert( 17208, lenToAlloc >= lenWHdr );
+ return _insertRecord(txn, data, len, enforceQuota);
+}
- StatusWith<DiskLoc> loc = allocRecord( txn, lenToAlloc, enforceQuota );
- if ( !loc.isOK() )
- return StatusWith<RecordId>(loc.getStatus());
+StatusWith<RecordId> RecordStoreV1Base::_insertRecord(OperationContext* txn,
+ const char* data,
+ int len,
+ bool enforceQuota) {
+ const int lenWHdr = len + MmapV1RecordHeader::HeaderSize;
+ const int lenToAlloc = shouldPadInserts() ? quantizeAllocationSpace(lenWHdr) : lenWHdr;
+ fassert(17208, lenToAlloc >= lenWHdr);
- MmapV1RecordHeader *r = recordFor( loc.getValue() );
- fassert( 17210, r->lengthWithHeaders() >= lenWHdr );
+ StatusWith<DiskLoc> loc = allocRecord(txn, lenToAlloc, enforceQuota);
+ if (!loc.isOK())
+ return StatusWith<RecordId>(loc.getStatus());
- // copy the data
- r = reinterpret_cast<MmapV1RecordHeader*>( txn->recoveryUnit()->writingPtr(r, lenWHdr) );
- memcpy( r->data(), data, len );
+ MmapV1RecordHeader* r = recordFor(loc.getValue());
+ fassert(17210, r->lengthWithHeaders() >= lenWHdr);
- _addRecordToRecListInExtent(txn, r, loc.getValue());
+ // copy the data
+ r = reinterpret_cast<MmapV1RecordHeader*>(txn->recoveryUnit()->writingPtr(r, lenWHdr));
+ memcpy(r->data(), data, len);
- _details->incrementStats( txn, r->netLength(), 1 );
+ _addRecordToRecListInExtent(txn, r, loc.getValue());
- return StatusWith<RecordId>(loc.getValue().toRecordId());
- }
+ _details->incrementStats(txn, r->netLength(), 1);
- StatusWith<RecordId> RecordStoreV1Base::updateRecord( OperationContext* txn,
- const RecordId& oldLocation,
- const char* data,
- int dataSize,
- bool enforceQuota,
- UpdateNotifier* notifier ) {
- MmapV1RecordHeader* oldRecord = recordFor( DiskLoc::fromRecordId(oldLocation) );
- if ( oldRecord->netLength() >= dataSize ) {
- // Make sure to notify other queries before we do an in-place update.
- if ( notifier ) {
- Status callbackStatus = notifier->recordStoreGoingToUpdateInPlace( txn,
- oldLocation );
- if ( !callbackStatus.isOK() )
- return StatusWith<RecordId>( callbackStatus );
- }
+ return StatusWith<RecordId>(loc.getValue().toRecordId());
+}
- // we fit
- memcpy( txn->recoveryUnit()->writingPtr( oldRecord->data(), dataSize ), data, dataSize );
- return StatusWith<RecordId>( oldLocation );
+StatusWith<RecordId> RecordStoreV1Base::updateRecord(OperationContext* txn,
+ const RecordId& oldLocation,
+ const char* data,
+ int dataSize,
+ bool enforceQuota,
+ UpdateNotifier* notifier) {
+ MmapV1RecordHeader* oldRecord = recordFor(DiskLoc::fromRecordId(oldLocation));
+ if (oldRecord->netLength() >= dataSize) {
+ // Make sure to notify other queries before we do an in-place update.
+ if (notifier) {
+ Status callbackStatus = notifier->recordStoreGoingToUpdateInPlace(txn, oldLocation);
+ if (!callbackStatus.isOK())
+ return StatusWith<RecordId>(callbackStatus);
}
- if ( isCapped() )
- return StatusWith<RecordId>( ErrorCodes::InternalError,
- "failing update: objects in a capped ns cannot grow",
- 10003 );
-
- // we have to move
- if ( dataSize + MmapV1RecordHeader::HeaderSize > MaxAllowedAllocation ) {
- return StatusWith<RecordId>( ErrorCodes::InvalidLength, "record has to be <= 16.5MB" );
- }
+ // we fit
+ memcpy(txn->recoveryUnit()->writingPtr(oldRecord->data(), dataSize), data, dataSize);
+ return StatusWith<RecordId>(oldLocation);
+ }
- StatusWith<RecordId> newLocation = _insertRecord( txn, data, dataSize, enforceQuota );
- if ( !newLocation.isOK() )
- return newLocation;
-
- // insert worked, so we delete old record
- if ( notifier ) {
- Status moveStatus = notifier->recordStoreGoingToMove( txn,
- oldLocation,
- oldRecord->data(),
- oldRecord->netLength() );
- if ( !moveStatus.isOK() )
- return StatusWith<RecordId>( moveStatus );
- }
+ if (isCapped())
+ return StatusWith<RecordId>(
+ ErrorCodes::InternalError, "failing update: objects in a capped ns cannot grow", 10003);
- deleteRecord( txn, oldLocation );
+ // we have to move
+ if (dataSize + MmapV1RecordHeader::HeaderSize > MaxAllowedAllocation) {
+ return StatusWith<RecordId>(ErrorCodes::InvalidLength, "record has to be <= 16.5MB");
+ }
+ StatusWith<RecordId> newLocation = _insertRecord(txn, data, dataSize, enforceQuota);
+ if (!newLocation.isOK())
return newLocation;
- }
- bool RecordStoreV1Base::updateWithDamagesSupported() const {
- return true;
+ // insert worked, so we delete old record
+ if (notifier) {
+ Status moveStatus = notifier->recordStoreGoingToMove(
+ txn, oldLocation, oldRecord->data(), oldRecord->netLength());
+ if (!moveStatus.isOK())
+ return StatusWith<RecordId>(moveStatus);
}
- Status RecordStoreV1Base::updateWithDamages( OperationContext* txn,
- const RecordId& loc,
- const RecordData& oldRec,
- const char* damageSource,
- const mutablebson::DamageVector& damages ) {
- MmapV1RecordHeader* rec = recordFor( DiskLoc::fromRecordId(loc) );
- char* root = rec->data();
+ deleteRecord(txn, oldLocation);
- // All updates were in place. Apply them via durability and writing pointer.
- mutablebson::DamageVector::const_iterator where = damages.begin();
- const mutablebson::DamageVector::const_iterator end = damages.end();
- for( ; where != end; ++where ) {
- const char* sourcePtr = damageSource + where->sourceOffset;
- void* targetPtr = txn->recoveryUnit()->writingPtr(root + where->targetOffset, where->size);
- std::memcpy(targetPtr, sourcePtr, where->size);
- }
+ return newLocation;
+}
- return Status::OK();
- }
+bool RecordStoreV1Base::updateWithDamagesSupported() const {
+ return true;
+}
- void RecordStoreV1Base::deleteRecord( OperationContext* txn, const RecordId& rid ) {
- const DiskLoc dl = DiskLoc::fromRecordId(rid);
+Status RecordStoreV1Base::updateWithDamages(OperationContext* txn,
+ const RecordId& loc,
+ const RecordData& oldRec,
+ const char* damageSource,
+ const mutablebson::DamageVector& damages) {
+ MmapV1RecordHeader* rec = recordFor(DiskLoc::fromRecordId(loc));
+ char* root = rec->data();
- MmapV1RecordHeader* todelete = recordFor( dl );
- invariant( todelete->netLength() >= 4 ); // this is required for defensive code
+    // All updates are in place. Apply each one through the recovery unit's writing pointer so it is journaled.
+ mutablebson::DamageVector::const_iterator where = damages.begin();
+ const mutablebson::DamageVector::const_iterator end = damages.end();
+ for (; where != end; ++where) {
+ const char* sourcePtr = damageSource + where->sourceOffset;
+ void* targetPtr = txn->recoveryUnit()->writingPtr(root + where->targetOffset, where->size);
+ std::memcpy(targetPtr, sourcePtr, where->size);
+ }
- /* remove ourself from the record next/prev chain */
- {
- if ( todelete->prevOfs() != DiskLoc::NullOfs ) {
- DiskLoc prev = getPrevRecordInExtent( txn, dl );
- MmapV1RecordHeader* prevRecord = recordFor( prev );
- txn->recoveryUnit()->writingInt( prevRecord->nextOfs() ) = todelete->nextOfs();
- }
+ return Status::OK();
+}
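A DamageVector is just a list of (sourceOffset, targetOffset, size) copies, so updateWithDamages() above is a journaled scatter of memcpy calls. A minimal standalone model of the loop, minus the recovery unit (the Damage struct here is a hypothetical mirror of mutablebson's damage events):

    #include <cassert>
    #include <cstddef>
    #include <cstring>
    #include <vector>

    struct Damage {
        std::size_t sourceOffset;
        std::size_t targetOffset;
        std::size_t size;
    };

    void applyDamages(char* root, const char* source, const std::vector<Damage>& damages) {
        for (const Damage& d : damages) {
            // The real loop routes the destination through writingPtr() so the
            // change is journaled before the bytes land.
            std::memcpy(root + d.targetOffset, source + d.sourceOffset, d.size);
        }
    }

    int main() {
        char record[] = "hello world";
        const char* patch = "HW";
        std::vector<Damage> damages = {{0, 0, 1}, {1, 6, 1}};
        applyDamages(record, patch, damages);
        assert(std::strcmp(record, "Hello World") == 0);
        return 0;
    }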
- if ( todelete->nextOfs() != DiskLoc::NullOfs ) {
- DiskLoc next = getNextRecord( txn, dl );
- MmapV1RecordHeader* nextRecord = recordFor( next );
- txn->recoveryUnit()->writingInt( nextRecord->prevOfs() ) = todelete->prevOfs();
- }
- }
+void RecordStoreV1Base::deleteRecord(OperationContext* txn, const RecordId& rid) {
+ const DiskLoc dl = DiskLoc::fromRecordId(rid);
- /* remove ourself from extent pointers */
- {
- DiskLoc extentLoc = todelete->myExtentLoc(dl);
- Extent *e = _getExtent( txn, extentLoc );
- if ( e->firstRecord == dl ) {
- txn->recoveryUnit()->writing(&e->firstRecord);
- if ( todelete->nextOfs() == DiskLoc::NullOfs )
- e->firstRecord.Null();
- else
- e->firstRecord.set(dl.a(), todelete->nextOfs() );
- }
- if ( e->lastRecord == dl ) {
- txn->recoveryUnit()->writing(&e->lastRecord);
- if ( todelete->prevOfs() == DiskLoc::NullOfs )
- e->lastRecord.Null();
- else
- e->lastRecord.set(dl.a(), todelete->prevOfs() );
- }
- }
+ MmapV1RecordHeader* todelete = recordFor(dl);
+ invariant(todelete->netLength() >= 4); // this is required for defensive code
- /* add to the free list */
- {
- _details->incrementStats( txn, -1 * todelete->netLength(), -1 );
-
- if ( _isSystemIndexes ) {
- /* temp: if in system.indexes, don't reuse, and zero out: we want to be
- careful until validated more, as IndexDetails has pointers
- to this disk location. so an incorrectly done remove would cause
- a lot of problems.
- */
- memset( txn->recoveryUnit()->writingPtr(todelete, todelete->lengthWithHeaders() ),
- 0, todelete->lengthWithHeaders() );
- }
- else {
- // this is defensive so we can detect if we are still using a location
- // that was deleted
- memset(txn->recoveryUnit()->writingPtr(todelete->data(), 4), 0xee, 4);
- addDeletedRec(txn, dl);
- }
+    /* remove ourselves from the record next/prev chain */
+ {
+ if (todelete->prevOfs() != DiskLoc::NullOfs) {
+ DiskLoc prev = getPrevRecordInExtent(txn, dl);
+ MmapV1RecordHeader* prevRecord = recordFor(prev);
+ txn->recoveryUnit()->writingInt(prevRecord->nextOfs()) = todelete->nextOfs();
}
+ if (todelete->nextOfs() != DiskLoc::NullOfs) {
+ DiskLoc next = getNextRecord(txn, dl);
+ MmapV1RecordHeader* nextRecord = recordFor(next);
+ txn->recoveryUnit()->writingInt(nextRecord->prevOfs()) = todelete->prevOfs();
+ }
}
- std::unique_ptr<RecordCursor> RecordStoreV1Base::getCursorForRepair(
- OperationContext* txn) const {
- return stdx::make_unique<RecordStoreV1RepairCursor>(txn, this);
- }
-
- void RecordStoreV1Base::_addRecordToRecListInExtent(OperationContext* txn,
- MmapV1RecordHeader *r,
- DiskLoc loc) {
- dassert( recordFor(loc) == r );
- DiskLoc extentLoc = _getExtentLocForRecord( txn, loc );
- Extent *e = _getExtent( txn, extentLoc );
- if ( e->lastRecord.isNull() ) {
- *txn->recoveryUnit()->writing(&e->firstRecord) = loc;
- *txn->recoveryUnit()->writing(&e->lastRecord) = loc;
- r->prevOfs() = r->nextOfs() = DiskLoc::NullOfs;
+    /* remove ourselves from the extent's first/last record pointers */
+ {
+ DiskLoc extentLoc = todelete->myExtentLoc(dl);
+ Extent* e = _getExtent(txn, extentLoc);
+ if (e->firstRecord == dl) {
+ txn->recoveryUnit()->writing(&e->firstRecord);
+ if (todelete->nextOfs() == DiskLoc::NullOfs)
+ e->firstRecord.Null();
+ else
+ e->firstRecord.set(dl.a(), todelete->nextOfs());
}
- else {
- MmapV1RecordHeader *oldlast = recordFor(e->lastRecord);
- r->prevOfs() = e->lastRecord.getOfs();
- r->nextOfs() = DiskLoc::NullOfs;
- txn->recoveryUnit()->writingInt(oldlast->nextOfs()) = loc.getOfs();
- *txn->recoveryUnit()->writing(&e->lastRecord) = loc;
+ if (e->lastRecord == dl) {
+ txn->recoveryUnit()->writing(&e->lastRecord);
+ if (todelete->prevOfs() == DiskLoc::NullOfs)
+ e->lastRecord.Null();
+ else
+ e->lastRecord.set(dl.a(), todelete->prevOfs());
}
}
- void RecordStoreV1Base::increaseStorageSize( OperationContext* txn,
- int size,
- bool enforceQuota ) {
- DiskLoc eloc = _extentManager->allocateExtent( txn,
- isCapped(),
- size,
- enforceQuota );
- Extent *e = _extentManager->getExtent( eloc );
- invariant( e );
-
- *txn->recoveryUnit()->writing( &e->nsDiagnostic ) = _ns;
-
- txn->recoveryUnit()->writing( &e->xnext )->Null();
- txn->recoveryUnit()->writing( &e->xprev )->Null();
- txn->recoveryUnit()->writing( &e->firstRecord )->Null();
- txn->recoveryUnit()->writing( &e->lastRecord )->Null();
-
- DiskLoc emptyLoc = _findFirstSpot( txn, eloc, e );
-
- if ( _details->lastExtent(txn).isNull() ) {
- invariant( _details->firstExtent(txn).isNull() );
- _details->setFirstExtent( txn, eloc );
- _details->setLastExtent( txn, eloc );
- _details->setCapExtent( txn, eloc );
- invariant( e->xprev.isNull() );
- invariant( e->xnext.isNull() );
- }
- else {
- invariant( !_details->firstExtent(txn).isNull() );
- *txn->recoveryUnit()->writing(&e->xprev) = _details->lastExtent(txn);
- *txn->recoveryUnit()->writing(&_extentManager->getExtent(_details->lastExtent(txn))->xnext) = eloc;
- _details->setLastExtent( txn, eloc );
+ /* add to the free list */
+ {
+ _details->incrementStats(txn, -1 * todelete->netLength(), -1);
+
+ if (_isSystemIndexes) {
+ /* Temp: if in system.indexes, don't reuse, and zero out: we want to be
+ careful until this is validated more, as IndexDetails has pointers
+ to this disk location, so an incorrectly done remove would cause
+ a lot of problems.
+ */
+ memset(txn->recoveryUnit()->writingPtr(todelete, todelete->lengthWithHeaders()),
+ 0,
+ todelete->lengthWithHeaders());
+ } else {
+ // this is defensive so we can detect if we are still using a location
+ // that was deleted
+ memset(txn->recoveryUnit()->writingPtr(todelete->data(), 4), 0xee, 4);
+ addDeletedRec(txn, dl);
}
+ }
+}
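
For reference, the chain repair in deleteRecord() above is a standard doubly-linked-list unlink, expressed with extent-relative integer offsets rather than pointers. A minimal self-contained sketch of the same fix-up, where Rec and NullOfs are hypothetical stand-ins for MmapV1RecordHeader and DiskLoc::NullOfs rather than the engine's types:

// Sketch only: models the prev/next offset repair performed by deleteRecord().
#include <cassert>
#include <vector>

namespace {
const int NullOfs = -1;  // stand-in for DiskLoc::NullOfs

struct Rec {
    int prevOfs = NullOfs;
    int nextOfs = NullOfs;
};

// Unlink records[i] from the chain, mirroring the two writingInt() fix-ups.
void unlink(std::vector<Rec>& records, int i) {
    if (records[i].prevOfs != NullOfs)
        records[records[i].prevOfs].nextOfs = records[i].nextOfs;
    if (records[i].nextOfs != NullOfs)
        records[records[i].nextOfs].prevOfs = records[i].prevOfs;
}
}  // namespace

int main() {
    std::vector<Rec> records(3);
    records[0].nextOfs = 1;
    records[1].prevOfs = 0;
    records[1].nextOfs = 2;
    records[2].prevOfs = 1;
    unlink(records, 1);  // delete the middle record
    assert(records[0].nextOfs == 2 && records[2].prevOfs == 0);
    return 0;
}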
- _details->setLastExtentSize( txn, e->length );
+std::unique_ptr<RecordCursor> RecordStoreV1Base::getCursorForRepair(OperationContext* txn) const {
+ return stdx::make_unique<RecordStoreV1RepairCursor>(txn, this);
+}
- addDeletedRec(txn, emptyLoc);
+void RecordStoreV1Base::_addRecordToRecListInExtent(OperationContext* txn,
+ MmapV1RecordHeader* r,
+ DiskLoc loc) {
+ dassert(recordFor(loc) == r);
+ DiskLoc extentLoc = _getExtentLocForRecord(txn, loc);
+ Extent* e = _getExtent(txn, extentLoc);
+ if (e->lastRecord.isNull()) {
+ *txn->recoveryUnit()->writing(&e->firstRecord) = loc;
+ *txn->recoveryUnit()->writing(&e->lastRecord) = loc;
+ r->prevOfs() = r->nextOfs() = DiskLoc::NullOfs;
+ } else {
+ MmapV1RecordHeader* oldlast = recordFor(e->lastRecord);
+ r->prevOfs() = e->lastRecord.getOfs();
+ r->nextOfs() = DiskLoc::NullOfs;
+ txn->recoveryUnit()->writingInt(oldlast->nextOfs()) = loc.getOfs();
+ *txn->recoveryUnit()->writing(&e->lastRecord) = loc;
}
+}
- Status RecordStoreV1Base::validate( OperationContext* txn,
- bool full, bool scanData,
- ValidateAdaptor* adaptor,
- ValidateResults* results, BSONObjBuilder* output ) {
+void RecordStoreV1Base::increaseStorageSize(OperationContext* txn, int size, bool enforceQuota) {
+ DiskLoc eloc = _extentManager->allocateExtent(txn, isCapped(), size, enforceQuota);
+ Extent* e = _extentManager->getExtent(eloc);
+ invariant(e);
- // 1) basic status that require no iteration
- // 2) extent level info
- // 3) check extent start and end
- // 4) check each non-deleted record
- // 5) check deleted list
+ *txn->recoveryUnit()->writing(&e->nsDiagnostic) = _ns;
- // -------------
+ txn->recoveryUnit()->writing(&e->xnext)->Null();
+ txn->recoveryUnit()->writing(&e->xprev)->Null();
+ txn->recoveryUnit()->writing(&e->firstRecord)->Null();
+ txn->recoveryUnit()->writing(&e->lastRecord)->Null();
- // 1111111111111111111
- if ( isCapped() ){
- output->appendBool("capped", true);
- output->appendNumber("max", _details->maxCappedDocs());
- }
+ DiskLoc emptyLoc = _findFirstSpot(txn, eloc, e);
- output->appendNumber("datasize", _details->dataSize());
- output->appendNumber("nrecords", _details->numRecords());
- output->appendNumber("lastExtentSize", _details->lastExtentSize(txn));
-
- if ( _details->firstExtent(txn).isNull() )
- output->append( "firstExtent", "null" );
- else
- output->append( "firstExtent",
- str::stream() << _details->firstExtent(txn).toString()
- << " ns:"
- << _getExtent( txn, _details->firstExtent(txn) )->nsDiagnostic.toString());
- if ( _details->lastExtent(txn).isNull() )
- output->append( "lastExtent", "null" );
- else
- output->append( "lastExtent", str::stream() << _details->lastExtent(txn).toString()
- << " ns:"
- << _getExtent( txn, _details->lastExtent(txn) )->nsDiagnostic.toString());
-
- // 22222222222222222222222222
- { // validate extent basics
- BSONArrayBuilder extentData;
- int extentCount = 0;
- DiskLoc extentDiskLoc;
- try {
- if ( !_details->firstExtent(txn).isNull() ) {
- _getExtent( txn, _details->firstExtent(txn) )->assertOk();
- _getExtent( txn, _details->lastExtent(txn) )->assertOk();
- }
+ if (_details->lastExtent(txn).isNull()) {
+ invariant(_details->firstExtent(txn).isNull());
+ _details->setFirstExtent(txn, eloc);
+ _details->setLastExtent(txn, eloc);
+ _details->setCapExtent(txn, eloc);
+ invariant(e->xprev.isNull());
+ invariant(e->xnext.isNull());
+ } else {
+ invariant(!_details->firstExtent(txn).isNull());
+ *txn->recoveryUnit()->writing(&e->xprev) = _details->lastExtent(txn);
+ *txn->recoveryUnit()->writing(
+ &_extentManager->getExtent(_details->lastExtent(txn))->xnext) = eloc;
+ _details->setLastExtent(txn, eloc);
+ }
- extentDiskLoc = _details->firstExtent(txn);
- while (!extentDiskLoc.isNull()) {
- Extent* thisExtent = _getExtent( txn, extentDiskLoc );
- if (full) {
- extentData << thisExtent->dump();
- }
- if (!thisExtent->validates(extentDiskLoc, &results->errors)) {
- results->valid = false;
- }
- DiskLoc nextDiskLoc = thisExtent->xnext;
+ _details->setLastExtentSize(txn, e->length);
- if (extentCount > 0 && !nextDiskLoc.isNull()
- && _getExtent( txn, nextDiskLoc )->xprev != extentDiskLoc) {
- StringBuilder sb;
- sb << "'xprev' pointer " << _getExtent( txn, nextDiskLoc )->xprev.toString()
- << " in extent " << nextDiskLoc.toString()
- << " does not point to extent " << extentDiskLoc.toString();
- results->errors.push_back( sb.str() );
- results->valid = false;
- }
- if (nextDiskLoc.isNull() && extentDiskLoc != _details->lastExtent(txn)) {
- StringBuilder sb;
- sb << "'lastExtent' pointer " << _details->lastExtent(txn).toString()
- << " does not point to last extent in list " << extentDiskLoc.toString();
- results->errors.push_back( sb.str() );
- results->valid = false;
- }
- extentDiskLoc = nextDiskLoc;
- extentCount++;
- txn->checkForInterrupt();
- }
- }
- catch (const DBException& e) {
- StringBuilder sb;
- sb << "exception validating extent " << extentCount
- << ": " << e.what();
- results->errors.push_back( sb.str() );
- results->valid = false;
- return Status::OK();
- }
- output->append("extentCount", extentCount);
+ addDeletedRec(txn, emptyLoc);
+}
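
The extent-linking branch of increaseStorageSize() above is the usual head/tail append on a doubly-linked list, with the empty-list case handled first. A minimal sketch under that reading, where Ext and ExtList are hypothetical models rather than the on-disk Extent structures:

// Sketch only: the firstExtent/lastExtent bookkeeping, with plain pointers.
#include <cassert>

namespace {
struct Ext {
    Ext* xprev = nullptr;
    Ext* xnext = nullptr;
};

struct ExtList {
    Ext* first = nullptr;
    Ext* last = nullptr;

    // Append e at the tail, branching on the empty list exactly as the
    // code above branches on lastExtent().isNull().
    void append(Ext* e) {
        if (!last) {
            assert(!first);  // first and last must be null together
            first = last = e;
        } else {
            e->xprev = last;
            last->xnext = e;
            last = e;
        }
    }
};
}  // namespace

int main() {
    Ext a, b;
    ExtList list;
    list.append(&a);
    list.append(&b);
    assert(list.first == &a && list.last == &b);
    assert(a.xnext == &b && b.xprev == &a);
    return 0;
}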
- if ( full )
- output->appendArray( "extents" , extentData.arr() );
+Status RecordStoreV1Base::validate(OperationContext* txn,
+ bool full,
+ bool scanData,
+ ValidateAdaptor* adaptor,
+ ValidateResults* results,
+ BSONObjBuilder* output) {
+ // 1) basic stats that require no iteration
+ // 2) extent level info
+ // 3) check extent start and end
+ // 4) check each non-deleted record
+ // 5) check deleted list
+
+ // -------------
+
+ // 1111111111111111111
+ if (isCapped()) {
+ output->appendBool("capped", true);
+ output->appendNumber("max", _details->maxCappedDocs());
+ }
+
+ output->appendNumber("datasize", _details->dataSize());
+ output->appendNumber("nrecords", _details->numRecords());
+ output->appendNumber("lastExtentSize", _details->lastExtentSize(txn));
+
+ if (_details->firstExtent(txn).isNull())
+ output->append("firstExtent", "null");
+ else
+ output->append("firstExtent",
+ str::stream()
+ << _details->firstExtent(txn).toString() << " ns:"
+ << _getExtent(txn, _details->firstExtent(txn))->nsDiagnostic.toString());
+ if (_details->lastExtent(txn).isNull())
+ output->append("lastExtent", "null");
+ else
+ output->append("lastExtent",
+ str::stream()
+ << _details->lastExtent(txn).toString() << " ns:"
+ << _getExtent(txn, _details->lastExtent(txn))->nsDiagnostic.toString());
+
+ // 22222222222222222222222222
+ { // validate extent basics
+ BSONArrayBuilder extentData;
+ int extentCount = 0;
+ DiskLoc extentDiskLoc;
+ try {
+ if (!_details->firstExtent(txn).isNull()) {
+ _getExtent(txn, _details->firstExtent(txn))->assertOk();
+ _getExtent(txn, _details->lastExtent(txn))->assertOk();
+ }
+ extentDiskLoc = _details->firstExtent(txn);
+ while (!extentDiskLoc.isNull()) {
+ Extent* thisExtent = _getExtent(txn, extentDiskLoc);
+ if (full) {
+ extentData << thisExtent->dump();
+ }
+ if (!thisExtent->validates(extentDiskLoc, &results->errors)) {
+ results->valid = false;
+ }
+ DiskLoc nextDiskLoc = thisExtent->xnext;
+
+ if (extentCount > 0 && !nextDiskLoc.isNull() &&
+ _getExtent(txn, nextDiskLoc)->xprev != extentDiskLoc) {
+ StringBuilder sb;
+ sb << "'xprev' pointer " << _getExtent(txn, nextDiskLoc)->xprev.toString()
+ << " in extent " << nextDiskLoc.toString() << " does not point to extent "
+ << extentDiskLoc.toString();
+ results->errors.push_back(sb.str());
+ results->valid = false;
+ }
+ if (nextDiskLoc.isNull() && extentDiskLoc != _details->lastExtent(txn)) {
+ StringBuilder sb;
+ sb << "'lastExtent' pointer " << _details->lastExtent(txn).toString()
+ << " does not point to last extent in list " << extentDiskLoc.toString();
+ results->errors.push_back(sb.str());
+ results->valid = false;
+ }
+ extentDiskLoc = nextDiskLoc;
+ extentCount++;
+ txn->checkForInterrupt();
+ }
+ } catch (const DBException& e) {
+ StringBuilder sb;
+ sb << "exception validating extent " << extentCount << ": " << e.what();
+ results->errors.push_back(sb.str());
+ results->valid = false;
+ return Status::OK();
}
+ output->append("extentCount", extentCount);
+
+ if (full)
+ output->appendArray("extents", extentData.arr());
+ }
+ try {
+ // 333333333333333333333333333
+ bool testingLastExtent = false;
try {
- // 333333333333333333333333333
- bool testingLastExtent = false;
- try {
- DiskLoc firstExtentLoc = _details->firstExtent(txn);
- if (firstExtentLoc.isNull()) {
- // this is ok
+ DiskLoc firstExtentLoc = _details->firstExtent(txn);
+ if (firstExtentLoc.isNull()) {
+ // this is ok
+ } else {
+ output->append("firstExtentDetails", _getExtent(txn, firstExtentLoc)->dump());
+ if (!_getExtent(txn, firstExtentLoc)->xprev.isNull()) {
+ StringBuilder sb;
+ sb << "'xprev' pointer in 'firstExtent' "
+ << _details->firstExtent(txn).toString() << " is "
+ << _getExtent(txn, firstExtentLoc)->xprev.toString() << ", should be null";
+ results->errors.push_back(sb.str());
+ results->valid = false;
}
- else {
- output->append("firstExtentDetails", _getExtent(txn, firstExtentLoc)->dump());
- if (!_getExtent(txn, firstExtentLoc)->xprev.isNull()) {
+ }
+ testingLastExtent = true;
+ DiskLoc lastExtentLoc = _details->lastExtent(txn);
+ if (lastExtentLoc.isNull()) {
+ // this is ok
+ } else {
+ if (firstExtentLoc != lastExtentLoc) {
+ output->append("lastExtentDetails", _getExtent(txn, lastExtentLoc)->dump());
+ if (!_getExtent(txn, lastExtentLoc)->xnext.isNull()) {
StringBuilder sb;
- sb << "'xprev' pointer in 'firstExtent' " << _details->firstExtent(txn).toString()
- << " is " << _getExtent(txn, firstExtentLoc)->xprev.toString()
+ sb << "'xnext' pointer in 'lastExtent' " << lastExtentLoc.toString()
+ << " is " << _getExtent(txn, lastExtentLoc)->xnext.toString()
<< ", should be null";
- results->errors.push_back( sb.str() );
+ results->errors.push_back(sb.str());
results->valid = false;
}
}
- testingLastExtent = true;
- DiskLoc lastExtentLoc = _details->lastExtent(txn);
- if (lastExtentLoc.isNull()) {
- // this is ok
- }
- else {
- if (firstExtentLoc != lastExtentLoc) {
- output->append("lastExtentDetails", _getExtent(txn, lastExtentLoc)->dump());
- if (!_getExtent(txn, lastExtentLoc)->xnext.isNull()) {
- StringBuilder sb;
- sb << "'xnext' pointer in 'lastExtent' " << lastExtentLoc.toString()
- << " is " << _getExtent(txn, lastExtentLoc)->xnext.toString()
- << ", should be null";
- results->errors.push_back( sb.str() );
- results->valid = false;
- }
- }
- }
- }
- catch (const DBException& e) {
- StringBuilder sb;
- sb << "exception processing '"
- << (testingLastExtent ? "lastExtent" : "firstExtent")
- << "': " << e.what();
- results->errors.push_back( sb.str() );
- results->valid = false;
}
+ } catch (const DBException& e) {
+ StringBuilder sb;
+ sb << "exception processing '" << (testingLastExtent ? "lastExtent" : "firstExtent")
+ << "': " << e.what();
+ results->errors.push_back(sb.str());
+ results->valid = false;
+ }
- // 4444444444444444444444444
-
- set<DiskLoc> recs;
- if( scanData ) {
- int n = 0;
- int nInvalid = 0;
- long long nQuantizedSize = 0;
- long long len = 0;
- long long nlen = 0;
- long long bsonLen = 0;
- int outOfOrder = 0;
- DiskLoc dl_last;
-
- auto cursor = getCursor(txn);
- while (auto record = cursor->next()) {
- const auto dl = DiskLoc::fromRecordId(record->id);
- n++;
-
- if ( n < 1000000 )
- recs.insert(dl);
- if ( isCapped() ) {
- if ( dl < dl_last )
- outOfOrder++;
- dl_last = dl;
- }
-
- MmapV1RecordHeader *r = recordFor(dl);
- len += r->lengthWithHeaders();
- nlen += r->netLength();
+ // 4444444444444444444444444
+
+ set<DiskLoc> recs;
+ if (scanData) {
+ int n = 0;
+ int nInvalid = 0;
+ long long nQuantizedSize = 0;
+ long long len = 0;
+ long long nlen = 0;
+ long long bsonLen = 0;
+ int outOfOrder = 0;
+ DiskLoc dl_last;
+
+ auto cursor = getCursor(txn);
+ while (auto record = cursor->next()) {
+ const auto dl = DiskLoc::fromRecordId(record->id);
+ n++;
+
+ if (n < 1000000)
+ recs.insert(dl);
+ if (isCapped()) {
+ if (dl < dl_last)
+ outOfOrder++;
+ dl_last = dl;
+ }
- if ( isQuantized( r->lengthWithHeaders() ) ) {
- // Count the number of records having a size consistent with
- // the quantizeAllocationSpace quantization implementation.
- ++nQuantizedSize;
- }
+ MmapV1RecordHeader* r = recordFor(dl);
+ len += r->lengthWithHeaders();
+ nlen += r->netLength();
- if (full){
- size_t dataSize = 0;
- const Status status = adaptor->validate( r->toRecordData(), &dataSize );
- if (!status.isOK()) {
- results->valid = false;
- if (nInvalid == 0) // only log once;
- results->errors.push_back( "invalid object detected (see logs)" );
-
- nInvalid++;
- log() << "Invalid object detected in " << _ns
- << ": " << status.reason();
- }
- else {
- bsonLen += dataSize;
- }
- }
+ if (isQuantized(r->lengthWithHeaders())) {
+ // Count the number of records having a size consistent with
+ // the quantizeAllocationSpace quantization implementation.
+ ++nQuantizedSize;
}
- if ( isCapped() && !_details->capLooped() ) {
- output->append("cappedOutOfOrder", outOfOrder);
- if ( outOfOrder > 1 ) {
+ if (full) {
+ size_t dataSize = 0;
+ const Status status = adaptor->validate(r->toRecordData(), &dataSize);
+ if (!status.isOK()) {
results->valid = false;
- results->errors.push_back( "too many out of order records" );
+ if (nInvalid == 0) // only log once
+ results->errors.push_back("invalid object detected (see logs)");
+
+ nInvalid++;
+ log() << "Invalid object detected in " << _ns << ": " << status.reason();
+ } else {
+ bsonLen += dataSize;
}
}
- output->append("objectsFound", n);
+ }
- if (full) {
- output->append("invalidObjects", nInvalid);
+ if (isCapped() && !_details->capLooped()) {
+ output->append("cappedOutOfOrder", outOfOrder);
+ if (outOfOrder > 1) {
+ results->valid = false;
+ results->errors.push_back("too many out of order records");
}
+ }
+ output->append("objectsFound", n);
- output->appendNumber("nQuantizedSize", nQuantizedSize);
- output->appendNumber("bytesWithHeaders", len);
- output->appendNumber("bytesWithoutHeaders", nlen);
+ if (full) {
+ output->append("invalidObjects", nInvalid);
+ }
- if (full) {
- output->appendNumber("bytesBson", bsonLen);
- }
- } // end scanData
+ output->appendNumber("nQuantizedSize", nQuantizedSize);
+ output->appendNumber("bytesWithHeaders", len);
+ output->appendNumber("bytesWithoutHeaders", nlen);
- // 55555555555555555555555555
- BSONArrayBuilder deletedListArray;
- for ( int i = 0; i < Buckets; i++ ) {
- deletedListArray << _details->deletedListEntry(i).isNull();
+ if (full) {
+ output->appendNumber("bytesBson", bsonLen);
}
+ } // end scanData
+
+ // 55555555555555555555555555
+ BSONArrayBuilder deletedListArray;
+ for (int i = 0; i < Buckets; i++) {
+ deletedListArray << _details->deletedListEntry(i).isNull();
+ }
- int ndel = 0;
- long long delSize = 0;
- BSONArrayBuilder delBucketSizes;
- int incorrect = 0;
- for ( int i = 0; i < Buckets; i++ ) {
- DiskLoc loc = _details->deletedListEntry(i);
- try {
- int k = 0;
- while ( !loc.isNull() ) {
- if ( recs.count(loc) )
- incorrect++;
- ndel++;
-
- if ( loc.questionable() ) {
- if( isCapped() && !loc.isValid() && i == 1 ) {
- /* the constructor for NamespaceDetails intentionally sets deletedList[1] to invalid
- see comments in namespace.h
- */
- break;
- }
-
- string err( str::stream() << "bad pointer in deleted record list: "
- << loc.toString()
- << " bucket: " << i
- << " k: " << k );
- results->errors.push_back( err );
- results->valid = false;
+ int ndel = 0;
+ long long delSize = 0;
+ BSONArrayBuilder delBucketSizes;
+ int incorrect = 0;
+ for (int i = 0; i < Buckets; i++) {
+ DiskLoc loc = _details->deletedListEntry(i);
+ try {
+ int k = 0;
+ while (!loc.isNull()) {
+ if (recs.count(loc))
+ incorrect++;
+ ndel++;
+
+ if (loc.questionable()) {
+ if (isCapped() && !loc.isValid() && i == 1) {
+ /* the constructor for NamespaceDetails intentionally sets deletedList[1]
+ to invalid; see comments in namespace.h
+ */
break;
}
- const DeletedRecord* d = deletedRecordFor(loc);
- delSize += d->lengthWithHeaders();
- loc = d->nextDeleted();
- k++;
- txn->checkForInterrupt();
+ string err(str::stream()
+ << "bad pointer in deleted record list: " << loc.toString()
+ << " bucket: " << i << " k: " << k);
+ results->errors.push_back(err);
+ results->valid = false;
+ break;
}
- delBucketSizes << k;
- }
- catch (...) {
- results->errors.push_back( (string)"exception in deleted chain for bucket " +
- BSONObjBuilder::numStr(i) );
- results->valid = false;
- }
- }
- output->appendNumber("deletedCount", ndel);
- output->appendNumber("deletedSize", delSize);
- if ( full ) {
- output->append( "delBucketSizes", delBucketSizes.arr() );
- }
- if ( incorrect ) {
- results->errors.push_back( BSONObjBuilder::numStr(incorrect) +
- " records from datafile are in deleted list" );
+ const DeletedRecord* d = deletedRecordFor(loc);
+ delSize += d->lengthWithHeaders();
+ loc = d->nextDeleted();
+ k++;
+ txn->checkForInterrupt();
+ }
+ delBucketSizes << k;
+ } catch (...) {
+ results->errors.push_back((string) "exception in deleted chain for bucket " +
+ BSONObjBuilder::numStr(i));
results->valid = false;
}
-
}
- catch (AssertionException) {
- results->errors.push_back( "exception during validate" );
- results->valid = false;
+ output->appendNumber("deletedCount", ndel);
+ output->appendNumber("deletedSize", delSize);
+ if (full) {
+ output->append("delBucketSizes", delBucketSizes.arr());
}
- return Status::OK();
- }
-
- void RecordStoreV1Base::appendCustomStats( OperationContext* txn,
- BSONObjBuilder* result,
- double scale ) const {
- result->append( "lastExtentSize", _details->lastExtentSize(txn) / scale );
- result->append( "paddingFactor", 1.0 ); // hard coded
- result->append( "paddingFactorNote", "paddingFactor is unused and unmaintained in 3.0. It "
- "remains hard coded to 1.0 for compatibility only." );
- result->append( "userFlags", _details->userFlags() );
- result->appendBool( "capped", isCapped() );
- if ( isCapped() ) {
- result->appendNumber( "max", _details->maxCappedDocs() );
- result->appendNumber( "maxSize", static_cast<long long>(storageSize(txn, NULL, 0) /
- scale) );
+ if (incorrect) {
+ results->errors.push_back(BSONObjBuilder::numStr(incorrect) +
+ " records from datafile are in deleted list");
+ results->valid = false;
}
+
+ } catch (const AssertionException&) {
+ results->errors.push_back("exception during validate");
+ results->valid = false;
}
+ return Status::OK();
+}
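
Step 2 of validate() uses a common consistency-check pattern: walk the forward chain, confirm each node's back-pointer, confirm the recorded tail matches the last node reached, and collect errors rather than aborting on the first one. A self-contained sketch of that pattern; Node and checkChain are illustrative, not the validate() API:

// Sketch only: back-pointer and tail verification over a doubly-linked chain.
#include <cassert>
#include <string>
#include <vector>

namespace {
struct Node {
    Node* xprev = nullptr;
    Node* xnext = nullptr;
};

bool checkChain(Node* head, Node* recordedTail, std::vector<std::string>* errors) {
    bool valid = true;
    Node* prev = nullptr;
    for (Node* n = head; n; prev = n, n = n->xnext) {
        if (n->xprev != prev) {
            errors->push_back("'xprev' pointer does not point to previous node");
            valid = false;
        }
    }
    if (prev != recordedTail) {
        errors->push_back("recorded tail does not point to last node in list");
        valid = false;
    }
    return valid;
}
}  // namespace

int main() {
    Node a, b;
    a.xnext = &b;
    b.xprev = &a;
    std::vector<std::string> errors;
    assert(checkChain(&a, &b, &errors) && errors.empty());
    b.xprev = nullptr;  // corrupt the back-pointer
    assert(!checkChain(&a, &b, &errors) && errors.size() == 1);
    return 0;
}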
- namespace {
- struct touch_location {
- const char* root;
- size_t length;
- };
+void RecordStoreV1Base::appendCustomStats(OperationContext* txn,
+ BSONObjBuilder* result,
+ double scale) const {
+ result->append("lastExtentSize", _details->lastExtentSize(txn) / scale);
+ result->append("paddingFactor", 1.0); // hard coded
+ result->append("paddingFactorNote",
+ "paddingFactor is unused and unmaintained in 3.0. It "
+ "remains hard coded to 1.0 for compatibility only.");
+ result->append("userFlags", _details->userFlags());
+ result->appendBool("capped", isCapped());
+ if (isCapped()) {
+ result->appendNumber("max", _details->maxCappedDocs());
+ result->appendNumber("maxSize", static_cast<long long>(storageSize(txn, NULL, 0) / scale));
}
+}
- Status RecordStoreV1Base::touch( OperationContext* txn, BSONObjBuilder* output ) const {
- Timer t;
- std::vector<touch_location> ranges;
- {
- DiskLoc nextLoc = _details->firstExtent(txn);
- Extent* ext = nextLoc.isNull() ? NULL : _getExtent( txn, nextLoc );
- while ( ext ) {
- touch_location tl;
- tl.root = reinterpret_cast<const char*>(ext);
- tl.length = ext->length;
- ranges.push_back(tl);
+namespace {
+struct touch_location {
+ const char* root;
+ size_t length;
+};
+}
- nextLoc = ext->xnext;
- if ( nextLoc.isNull() )
- ext = NULL;
- else
- ext = _getExtent( txn, nextLoc );
- }
- }
+Status RecordStoreV1Base::touch(OperationContext* txn, BSONObjBuilder* output) const {
+ Timer t;
- std::string progress_msg = "touch " + std::string(txn->getNS()) + " extents";
- stdx::unique_lock<Client> lk(*txn->getClient());
- ProgressMeterHolder pm(*txn->setMessage_inlock(progress_msg.c_str(),
- "Touch Progress",
- ranges.size()));
- lk.unlock();
-
- for ( std::vector<touch_location>::iterator it = ranges.begin(); it != ranges.end(); ++it ) {
- touch_pages( it->root, it->length );
- pm.hit();
- txn->checkForInterrupt();
- }
- pm.finished();
+ std::vector<touch_location> ranges;
+ {
+ DiskLoc nextLoc = _details->firstExtent(txn);
+ Extent* ext = nextLoc.isNull() ? NULL : _getExtent(txn, nextLoc);
+ while (ext) {
+ touch_location tl;
+ tl.root = reinterpret_cast<const char*>(ext);
+ tl.length = ext->length;
+ ranges.push_back(tl);
- if ( output ) {
- output->append( "numRanges", static_cast<int>( ranges.size() ) );
- output->append( "millis", t.millis() );
+ nextLoc = ext->xnext;
+ if (nextLoc.isNull())
+ ext = NULL;
+ else
+ ext = _getExtent(txn, nextLoc);
}
-
- return Status::OK();
}
- boost::optional<Record> RecordStoreV1Base::IntraExtentIterator::next() {
- if (_curr.isNull()) return {};
- auto out = _curr.toRecordId();
- advance();
- return {{out, _rs->dataFor(_txn, out)}};
+ std::string progress_msg = "touch " + std::string(txn->getNS()) + " extents";
+ stdx::unique_lock<Client> lk(*txn->getClient());
+ ProgressMeterHolder pm(
+ *txn->setMessage_inlock(progress_msg.c_str(), "Touch Progress", ranges.size()));
+ lk.unlock();
+
+ for (std::vector<touch_location>::iterator it = ranges.begin(); it != ranges.end(); ++it) {
+ touch_pages(it->root, it->length);
+ pm.hit();
+ txn->checkForInterrupt();
}
+ pm.finished();
- boost::optional<Record> RecordStoreV1Base::IntraExtentIterator::seekExact(const RecordId& id) {
- invariant(!"seekExact not supported");
+ if (output) {
+ output->append("numRanges", static_cast<int>(ranges.size()));
+ output->append("millis", t.millis());
}
- void RecordStoreV1Base::IntraExtentIterator::advance() {
- if (_curr.isNull())
- return;
+ return Status::OK();
+}
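
touch() delegates the actual prefaulting to touch_pages(); the underlying idea is simply to read one byte per page so the OS faults the whole range into memory. A minimal sketch of that idea; the hard-coded 4096-byte page size is an assumption here, and real code would query the platform:

// Sketch only: prefault a range by reading one byte per page.
#include <cstddef>
#include <vector>

namespace {
void touchPages(const char* root, size_t length, size_t pageSize = 4096) {
    volatile char sum = 0;  // volatile so the reads are not optimized away
    for (size_t off = 0; off < length; off += pageSize)
        sum = sum + root[off];
    (void)sum;
}
}  // namespace

int main() {
    std::vector<char> buf(1 << 20);  // stands in for a memory-mapped extent
    touchPages(buf.data(), buf.size());
    return 0;
}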
- const MmapV1RecordHeader* rec = recordFor(_curr);
- const int nextOfs = _forward ? rec->nextOfs() : rec->prevOfs();
- _curr = (nextOfs == DiskLoc::NullOfs ? DiskLoc() : DiskLoc(_curr.a(), nextOfs));
- }
+boost::optional<Record> RecordStoreV1Base::IntraExtentIterator::next() {
+ if (_curr.isNull())
+ return {};
+ auto out = _curr.toRecordId();
+ advance();
+ return {{out, _rs->dataFor(_txn, out)}};
+}
- void RecordStoreV1Base::IntraExtentIterator::invalidate(const RecordId& rid) {
- if (rid == _curr.toRecordId()) {
- advance();
- }
- }
+boost::optional<Record> RecordStoreV1Base::IntraExtentIterator::seekExact(const RecordId& id) {
+ invariant(!"seekExact not supported");
+}
+
+void RecordStoreV1Base::IntraExtentIterator::advance() {
+ if (_curr.isNull())
+ return;
- std::unique_ptr<RecordFetcher> RecordStoreV1Base::IntraExtentIterator::fetcherForNext() const {
- return _rs->_extentManager->recordNeedsFetch(_curr);
+ const MmapV1RecordHeader* rec = recordFor(_curr);
+ const int nextOfs = _forward ? rec->nextOfs() : rec->prevOfs();
+ _curr = (nextOfs == DiskLoc::NullOfs ? DiskLoc() : DiskLoc(_curr.a(), nextOfs));
+}
+
+void RecordStoreV1Base::IntraExtentIterator::invalidate(const RecordId& rid) {
+ if (rid == _curr.toRecordId()) {
+ advance();
}
+}
- int RecordStoreV1Base::quantizeAllocationSpace(int allocSize) {
- invariant(allocSize <= MaxAllowedAllocation);
- for ( int i = 0; i < Buckets - 2; i++ ) { // last two bucketSizes are invalid
- if ( bucketSizes[i] >= allocSize ) {
- // Return the size of the first bucket sized >= the requested size.
- return bucketSizes[i];
- }
+std::unique_ptr<RecordFetcher> RecordStoreV1Base::IntraExtentIterator::fetcherForNext() const {
+ return _rs->_extentManager->recordNeedsFetch(_curr);
+}
+
+int RecordStoreV1Base::quantizeAllocationSpace(int allocSize) {
+ invariant(allocSize <= MaxAllowedAllocation);
+ for (int i = 0; i < Buckets - 2; i++) { // last two bucketSizes are invalid
+ if (bucketSizes[i] >= allocSize) {
+ // Return the size of the first bucket sized >= the requested size.
+ return bucketSizes[i];
}
- invariant(false); // prior invariant means we should find something.
}
+ invariant(false); // prior invariant means we should find something.
+}
- bool RecordStoreV1Base::isQuantized(int recordSize) {
- if (recordSize > MaxAllowedAllocation)
- return false;
+bool RecordStoreV1Base::isQuantized(int recordSize) {
+ if (recordSize > MaxAllowedAllocation)
+ return false;
- return recordSize == quantizeAllocationSpace(recordSize);
- }
+ return recordSize == quantizeAllocationSpace(recordSize);
+}
- int RecordStoreV1Base::bucket(int size) {
- for ( int i = 0; i < Buckets; i++ ) {
- if ( bucketSizes[i] > size ) {
- // Return the first bucket sized _larger_ than the requested size. This is important
- // since we want all records in a bucket to be >= the quantized size, therefore the
- // quantized size must be the smallest allowed record per bucket.
- return i;
- }
+int RecordStoreV1Base::bucket(int size) {
+ for (int i = 0; i < Buckets; i++) {
+ if (bucketSizes[i] > size) {
+ // Return the first bucket sized _larger_ than the requested size. This is important
+ // since we want all records in a bucket to be >= the quantized size, therefore the
+ // quantized size must be the smallest allowed record per bucket.
+ return i;
}
- // Technically, this is reachable if size == INT_MAX, but it would be an error to pass that
- // in anyway since it would be impossible to have a record that large given the file and
- // extent headers.
- invariant(false);
}
+ // Technically, this is reachable if size == INT_MAX, but it would be an error to pass that
+ // in anyway since it would be impossible to have a record that large given the file and
+ // extent headers.
+ invariant(false);
+}
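
The three bucket helpers above fit together as follows: quantizeAllocationSpace() rounds a request up to the smallest bucket size that can hold it, bucket() files a size under the first strictly larger bucket size so every record in a bucket is at least that bucket's quantized size, and isQuantized() tests whether a size is exactly a bucket size. A sketch with a small hypothetical bucket table (the real bucketSizes table has Buckets == 26 entries):

// Sketch only: the quantize/bucket/isQuantized relationships.
#include <cassert>

namespace {
const int kBucketSizes[] = {32, 64, 128, 256};
const int kBuckets = 4;

// Smallest bucket size >= allocSize (quantization).
int quantize(int allocSize) {
    for (int i = 0; i < kBuckets; i++) {
        if (kBucketSizes[i] >= allocSize)
            return kBucketSizes[i];
    }
    assert(false);  // caller must respect the maximum allocation
    return -1;
}

// First bucket sized strictly larger than 'size'.
int bucketFor(int size) {
    for (int i = 0; i < kBuckets; i++) {
        if (kBucketSizes[i] > size)
            return i;
    }
    assert(false);
    return -1;
}

bool isQuantized(int size) {
    return size == quantize(size);
}
}  // namespace

int main() {
    assert(quantize(100) == 128);  // first bucket size >= 100
    assert(bucketFor(100) == 2);   // first bucket size strictly > 100
    assert(isQuantized(128) && !isQuantized(100));
    return 0;
}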
}
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_base.h b/src/mongo/db/storage/mmap_v1/record_store_v1_base.h
index 4e1aa8de338..5c0437cce56 100644
--- a/src/mongo/db/storage/mmap_v1/record_store_v1_base.h
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_base.h
@@ -38,312 +38,319 @@
namespace mongo {
- class DeletedRecord;
- class DocWriter;
- class ExtentManager;
- class MmapV1RecordHeader;
- class OperationContext;
+class DeletedRecord;
+class DocWriter;
+class ExtentManager;
+class MmapV1RecordHeader;
+class OperationContext;
- struct Extent;
+struct Extent;
- class RecordStoreV1MetaData {
- public:
- virtual ~RecordStoreV1MetaData(){}
+class RecordStoreV1MetaData {
+public:
+ virtual ~RecordStoreV1MetaData() {}
- virtual const DiskLoc& capExtent() const = 0;
- virtual void setCapExtent( OperationContext* txn, const DiskLoc& loc ) = 0;
+ virtual const DiskLoc& capExtent() const = 0;
+ virtual void setCapExtent(OperationContext* txn, const DiskLoc& loc) = 0;
- virtual const DiskLoc& capFirstNewRecord() const = 0;
- virtual void setCapFirstNewRecord( OperationContext* txn, const DiskLoc& loc ) = 0;
+ virtual const DiskLoc& capFirstNewRecord() const = 0;
+ virtual void setCapFirstNewRecord(OperationContext* txn, const DiskLoc& loc) = 0;
- bool capLooped() const { return capFirstNewRecord().isValid(); }
+ bool capLooped() const {
+ return capFirstNewRecord().isValid();
+ }
- virtual long long dataSize() const = 0;
- virtual long long numRecords() const = 0;
+ virtual long long dataSize() const = 0;
+ virtual long long numRecords() const = 0;
- virtual void incrementStats( OperationContext* txn,
- long long dataSizeIncrement,
- long long numRecordsIncrement ) = 0;
+ virtual void incrementStats(OperationContext* txn,
+ long long dataSizeIncrement,
+ long long numRecordsIncrement) = 0;
- virtual void setStats( OperationContext* txn,
- long long dataSize,
- long long numRecords ) = 0;
+ virtual void setStats(OperationContext* txn, long long dataSize, long long numRecords) = 0;
- virtual DiskLoc deletedListEntry( int bucket ) const = 0;
- virtual void setDeletedListEntry( OperationContext* txn,
- int bucket,
- const DiskLoc& loc ) = 0;
+ virtual DiskLoc deletedListEntry(int bucket) const = 0;
+ virtual void setDeletedListEntry(OperationContext* txn, int bucket, const DiskLoc& loc) = 0;
- virtual DiskLoc deletedListLegacyGrabBag() const = 0;
- virtual void setDeletedListLegacyGrabBag(OperationContext* txn, const DiskLoc& loc) = 0;
+ virtual DiskLoc deletedListLegacyGrabBag() const = 0;
+ virtual void setDeletedListLegacyGrabBag(OperationContext* txn, const DiskLoc& loc) = 0;
- virtual void orphanDeletedList(OperationContext* txn) = 0;
+ virtual void orphanDeletedList(OperationContext* txn) = 0;
- virtual const DiskLoc& firstExtent( OperationContext* txn ) const = 0;
- virtual void setFirstExtent( OperationContext* txn, const DiskLoc& loc ) = 0;
+ virtual const DiskLoc& firstExtent(OperationContext* txn) const = 0;
+ virtual void setFirstExtent(OperationContext* txn, const DiskLoc& loc) = 0;
- virtual const DiskLoc& lastExtent( OperationContext* txn ) const = 0;
- virtual void setLastExtent( OperationContext* txn, const DiskLoc& loc ) = 0;
+ virtual const DiskLoc& lastExtent(OperationContext* txn) const = 0;
+ virtual void setLastExtent(OperationContext* txn, const DiskLoc& loc) = 0;
- virtual bool isCapped() const = 0;
+ virtual bool isCapped() const = 0;
- virtual bool isUserFlagSet( int flag ) const = 0;
- virtual int userFlags() const = 0;
- virtual bool setUserFlag( OperationContext* txn, int flag ) = 0;
- virtual bool clearUserFlag( OperationContext* txn, int flag ) = 0;
- virtual bool replaceUserFlags( OperationContext* txn, int flags ) = 0;
+ virtual bool isUserFlagSet(int flag) const = 0;
+ virtual int userFlags() const = 0;
+ virtual bool setUserFlag(OperationContext* txn, int flag) = 0;
+ virtual bool clearUserFlag(OperationContext* txn, int flag) = 0;
+ virtual bool replaceUserFlags(OperationContext* txn, int flags) = 0;
- virtual int lastExtentSize( OperationContext* txn) const = 0;
- virtual void setLastExtentSize( OperationContext* txn, int newMax ) = 0;
+ virtual int lastExtentSize(OperationContext* txn) const = 0;
+ virtual void setLastExtentSize(OperationContext* txn, int newMax) = 0;
- virtual long long maxCappedDocs() const = 0;
-
- };
+ virtual long long maxCappedDocs() const = 0;
+};
+/**
+ * Class that stores active cursors that have been saved (as part of yielding) to
+ * allow them to be invalidated if the thing they pointed at goes away. The registry is
+ * thread-safe, as readers may concurrently register and remove their cursors. Contention is
+ * expected to be very low, as yielding is infrequent. This logically belongs to the
+ * RecordStore, but is not contained in it to facilitate unit testing.
+ */
+class SavedCursorRegistry {
+public:
/**
- * Class that stores active cursors that have been saved (as part of yielding) to
- * allow them to be invalidated if the thing they pointed at goes away. The registry is
- * thread-safe, as readers may concurrently register and remove their cursors. Contention is
- * expected to be very low, as yielding is infrequent. This logically belongs to the
- * RecordStore, but is not contained in it to facilitate unit testing.
+ * The destructor ensures the cursor is unregistered when an exception is thrown.
+ * Note that the SavedCursor may outlive the registry it was saved in.
*/
- class SavedCursorRegistry {
- public:
- /**
- * The destructor ensures the cursor is unregistered when an exception is thrown.
- * Note that the SavedCursor may outlive the registry it was saved in.
- */
- struct SavedCursor {
- SavedCursor() : _registry(NULL) { }
- virtual ~SavedCursor() { if (_registry) _registry->unregisterCursor(this); }
- DiskLoc bucket;
- BSONObj key;
- DiskLoc loc;
-
- private:
- friend class SavedCursorRegistry;
- // Non-null iff registered. Accessed by owner or writer with MODE_X collection lock
- SavedCursorRegistry* _registry;
- };
-
- ~SavedCursorRegistry();
-
- /**
- * Adds given saved cursor to SavedCursorRegistry. Doesn't take ownership.
- */
- void registerCursor(SavedCursor* cursor);
-
- /**
- * Removes given saved cursor. Returns true if the cursor was still present, and false
- * if it had already been removed due to invalidation. Doesn't take ownership.
- */
- bool unregisterCursor(SavedCursor* cursor);
-
- /**
- * When a btree-bucket disappears due to merge/split or similar, this invalidates all
- * cursors that point at the same bucket by removing them from the registry.
- */
- void invalidateCursorsForBucket(DiskLoc bucket);
+ struct SavedCursor {
+ SavedCursor() : _registry(NULL) {}
+ virtual ~SavedCursor() {
+ if (_registry)
+ _registry->unregisterCursor(this);
+ }
+ DiskLoc bucket;
+ BSONObj key;
+ DiskLoc loc;
private:
- SpinLock _mutex;
- typedef unordered_set<SavedCursor *> SavedCursorSet; // SavedCursor pointers not owned here
- SavedCursorSet _cursors;
+ friend class SavedCursorRegistry;
+ // Non-null iff registered. Accessed by owner or writer with MODE_X collection lock
+ SavedCursorRegistry* _registry;
};
- class RecordStoreV1Base : public RecordStore {
- public:
-
- static const int Buckets = 26;
- static const int MaxAllowedAllocation = 16*1024*1024 + 512*1024;
+ ~SavedCursorRegistry();
- static const int bucketSizes[];
+ /**
+ * Adds given saved cursor to SavedCursorRegistry. Doesn't take ownership.
+ */
+ void registerCursor(SavedCursor* cursor);
- // ------------
+ /**
+ * Removes given saved cursor. Returns true if the cursor was still present, and false
+ * if it had already been removed due to invalidation. Doesn't take ownership.
+ */
+ bool unregisterCursor(SavedCursor* cursor);
- class IntraExtentIterator;
+ /**
+ * When a btree-bucket disappears due to merge/split or similar, this invalidates all
+ * cursors that point at the same bucket by removing them from the registry.
+ */
+ void invalidateCursorsForBucket(DiskLoc bucket);
- /**
- * @param details - takes ownership
- * @param em - does NOT take ownership
- */
- RecordStoreV1Base(StringData ns,
- RecordStoreV1MetaData* details,
- ExtentManager* em,
- bool isSystemIndexes);
+private:
+ SpinLock _mutex;
+ typedef unordered_set<SavedCursor*> SavedCursorSet; // SavedCursor pointers not owned here
+ SavedCursorSet _cursors;
+};
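
The register/unregister/invalidate protocol that the SavedCursorRegistry comment describes can be sketched as below, with std::mutex standing in for the engine's SpinLock and a trimmed-down cursor type; this is an illustrative model, not the real class:

// Sketch only: a thread-safe registry of non-owned cursor pointers.
#include <cassert>
#include <mutex>
#include <unordered_set>

namespace {
class Registry;

struct Cursor {
    int bucket = 0;
    Registry* registry = nullptr;  // non-null iff registered
    ~Cursor();
};

class Registry {
public:
    void registerCursor(Cursor* c) {
        std::lock_guard<std::mutex> lk(_mutex);
        _cursors.insert(c);
        c->registry = this;
    }

    // Returns true if the cursor was still present (not yet invalidated).
    bool unregisterCursor(Cursor* c) {
        std::lock_guard<std::mutex> lk(_mutex);
        c->registry = nullptr;
        return _cursors.erase(c) > 0;
    }

    // Drop every saved cursor pointing at a bucket that is going away.
    void invalidateCursorsForBucket(int bucket) {
        std::lock_guard<std::mutex> lk(_mutex);
        for (auto it = _cursors.begin(); it != _cursors.end();) {
            if ((*it)->bucket == bucket) {
                (*it)->registry = nullptr;
                it = _cursors.erase(it);
            } else {
                ++it;
            }
        }
    }

private:
    std::mutex _mutex;
    std::unordered_set<Cursor*> _cursors;  // pointers not owned here
};

// The destructor unregisters, so a cursor may safely outlive its save point.
Cursor::~Cursor() {
    if (registry)
        registry->unregisterCursor(this);
}
}  // namespace

int main() {
    Registry reg;
    Cursor c;
    c.bucket = 7;
    reg.registerCursor(&c);
    reg.invalidateCursorsForBucket(7);
    assert(!reg.unregisterCursor(&c));  // already removed by invalidation
    return 0;
}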
- virtual ~RecordStoreV1Base();
+class RecordStoreV1Base : public RecordStore {
+public:
+ static const int Buckets = 26;
+ static const int MaxAllowedAllocation = 16 * 1024 * 1024 + 512 * 1024;
- virtual long long dataSize( OperationContext* txn ) const { return _details->dataSize(); }
- virtual long long numRecords( OperationContext* txn ) const { return _details->numRecords(); }
+ static const int bucketSizes[];
- virtual int64_t storageSize( OperationContext* txn,
- BSONObjBuilder* extraInfo = NULL,
- int level = 0 ) const;
+ // ------------
- virtual RecordData dataFor( OperationContext* txn, const RecordId& loc ) const;
+ class IntraExtentIterator;
- virtual bool findRecord( OperationContext* txn, const RecordId& loc, RecordData* rd ) const;
+ /**
+ * @param details - takes ownership
+ * @param em - does NOT take ownership
+ */
+ RecordStoreV1Base(StringData ns,
+ RecordStoreV1MetaData* details,
+ ExtentManager* em,
+ bool isSystemIndexes);
- void deleteRecord( OperationContext* txn,
- const RecordId& dl );
+ virtual ~RecordStoreV1Base();
- StatusWith<RecordId> insertRecord( OperationContext* txn,
- const char* data,
- int len,
- bool enforceQuota );
+ virtual long long dataSize(OperationContext* txn) const {
+ return _details->dataSize();
+ }
+ virtual long long numRecords(OperationContext* txn) const {
+ return _details->numRecords();
+ }
- StatusWith<RecordId> insertRecord( OperationContext* txn,
- const DocWriter* doc,
- bool enforceQuota );
+ virtual int64_t storageSize(OperationContext* txn,
+ BSONObjBuilder* extraInfo = NULL,
+ int level = 0) const;
- virtual StatusWith<RecordId> updateRecord( OperationContext* txn,
- const RecordId& oldLocation,
- const char* data,
- int len,
- bool enforceQuota,
- UpdateNotifier* notifier );
+ virtual RecordData dataFor(OperationContext* txn, const RecordId& loc) const;
- virtual bool updateWithDamagesSupported() const;
+ virtual bool findRecord(OperationContext* txn, const RecordId& loc, RecordData* rd) const;
- virtual Status updateWithDamages( OperationContext* txn,
- const RecordId& loc,
- const RecordData& oldRec,
- const char* damageSource,
- const mutablebson::DamageVector& damages );
+ void deleteRecord(OperationContext* txn, const RecordId& dl);
- virtual std::unique_ptr<RecordCursor> getCursorForRepair( OperationContext* txn ) const;
+ StatusWith<RecordId> insertRecord(OperationContext* txn,
+ const char* data,
+ int len,
+ bool enforceQuota);
- void increaseStorageSize( OperationContext* txn, int size, bool enforceQuota );
+ StatusWith<RecordId> insertRecord(OperationContext* txn,
+ const DocWriter* doc,
+ bool enforceQuota);
- virtual Status validate( OperationContext* txn,
- bool full, bool scanData,
- ValidateAdaptor* adaptor,
- ValidateResults* results, BSONObjBuilder* output );
+ virtual StatusWith<RecordId> updateRecord(OperationContext* txn,
+ const RecordId& oldLocation,
+ const char* data,
+ int len,
+ bool enforceQuota,
+ UpdateNotifier* notifier);
- virtual void appendCustomStats( OperationContext* txn,
- BSONObjBuilder* result,
- double scale ) const;
+ virtual bool updateWithDamagesSupported() const;
- virtual Status touch( OperationContext* txn, BSONObjBuilder* output ) const;
+ virtual Status updateWithDamages(OperationContext* txn,
+ const RecordId& loc,
+ const RecordData& oldRec,
+ const char* damageSource,
+ const mutablebson::DamageVector& damages);
- const RecordStoreV1MetaData* details() const { return _details.get(); }
+ virtual std::unique_ptr<RecordCursor> getCursorForRepair(OperationContext* txn) const;
- // This keeps track of cursors saved during yielding, for invalidation purposes.
- SavedCursorRegistry savedCursors;
+ void increaseStorageSize(OperationContext* txn, int size, bool enforceQuota);
- DiskLoc getExtentLocForRecord( OperationContext* txn, const DiskLoc& loc ) const;
+ virtual Status validate(OperationContext* txn,
+ bool full,
+ bool scanData,
+ ValidateAdaptor* adaptor,
+ ValidateResults* results,
+ BSONObjBuilder* output);
- DiskLoc getNextRecord( OperationContext* txn, const DiskLoc& loc ) const;
- DiskLoc getPrevRecord( OperationContext* txn, const DiskLoc& loc ) const;
+ virtual void appendCustomStats(OperationContext* txn,
+ BSONObjBuilder* result,
+ double scale) const;
- DiskLoc getNextRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const;
- DiskLoc getPrevRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const;
+ virtual Status touch(OperationContext* txn, BSONObjBuilder* output) const;
- /**
- * Quantize 'minSize' to the nearest allocation size.
- */
- static int quantizeAllocationSpace(int minSize);
+ const RecordStoreV1MetaData* details() const {
+ return _details.get();
+ }
- static bool isQuantized(int recordSize);
+ // This keeps track of cursors saved during yielding, for invalidation purposes.
+ SavedCursorRegistry savedCursors;
- /* return which "deleted bucket" for this size object */
- static int bucket(int size);
+ DiskLoc getExtentLocForRecord(OperationContext* txn, const DiskLoc& loc) const;
- virtual void updateStatsAfterRepair(OperationContext* txn,
- long long numRecords,
- long long dataSize) {
- invariant(false); // MMAPv1 has its own repair which doesn't call this.
- }
- protected:
+ DiskLoc getNextRecord(OperationContext* txn, const DiskLoc& loc) const;
+ DiskLoc getPrevRecord(OperationContext* txn, const DiskLoc& loc) const;
- virtual MmapV1RecordHeader* recordFor( const DiskLoc& loc ) const;
+ DiskLoc getNextRecordInExtent(OperationContext* txn, const DiskLoc& loc) const;
+ DiskLoc getPrevRecordInExtent(OperationContext* txn, const DiskLoc& loc) const;
- const DeletedRecord* deletedRecordFor( const DiskLoc& loc ) const;
+ /**
+ * Quantize 'minSize' to the nearest allocation size.
+ */
+ static int quantizeAllocationSpace(int minSize);
- virtual bool isCapped() const = 0;
+ static bool isQuantized(int recordSize);
- virtual bool shouldPadInserts() const = 0;
+ /* returns which "deleted bucket" this size object belongs in */
+ static int bucket(int size);
- virtual StatusWith<DiskLoc> allocRecord( OperationContext* txn,
- int lengthWithHeaders,
- bool enforceQuota ) = 0;
+ virtual void updateStatsAfterRepair(OperationContext* txn,
+ long long numRecords,
+ long long dataSize) {
+ invariant(false); // MMAPv1 has its own repair which doesn't call this.
+ }
- // TODO: document, remove, what have you
- virtual void addDeletedRec( OperationContext* txn, const DiskLoc& dloc) = 0;
+protected:
+ virtual MmapV1RecordHeader* recordFor(const DiskLoc& loc) const;
- // TODO: another sad one
- virtual DeletedRecord* drec( const DiskLoc& loc ) const;
+ const DeletedRecord* deletedRecordFor(const DiskLoc& loc) const;
- // just a wrapper for _extentManager->getExtent( loc );
- Extent* _getExtent( OperationContext* txn, const DiskLoc& loc ) const;
+ virtual bool isCapped() const = 0;
- DiskLoc _getExtentLocForRecord( OperationContext* txn, const DiskLoc& loc ) const;
+ virtual bool shouldPadInserts() const = 0;
- DiskLoc _getNextRecord( OperationContext* txn, const DiskLoc& loc ) const;
- DiskLoc _getPrevRecord( OperationContext* txn, const DiskLoc& loc ) const;
+ virtual StatusWith<DiskLoc> allocRecord(OperationContext* txn,
+ int lengthWithHeaders,
+ bool enforceQuota) = 0;
- DiskLoc _getNextRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const;
- DiskLoc _getPrevRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const;
+ // TODO: document, remove, what have you
+ virtual void addDeletedRec(OperationContext* txn, const DiskLoc& dloc) = 0;
- /**
- * finds the first suitable DiskLoc for data
- * will return the DiskLoc of a newly created DeletedRecord
- */
- DiskLoc _findFirstSpot( OperationContext* txn, const DiskLoc& extDiskLoc, Extent* e );
+ // TODO: another sad one
+ virtual DeletedRecord* drec(const DiskLoc& loc) const;
- /** add a record to the end of the linked list chain within this extent.
- require: you must have already declared write intent for the record header.
- */
- void _addRecordToRecListInExtent(OperationContext* txn, MmapV1RecordHeader* r, DiskLoc loc);
+ // just a wrapper for _extentManager->getExtent(loc);
+ Extent* _getExtent(OperationContext* txn, const DiskLoc& loc) const;
- /**
- * internal
- * doesn't check inputs or change padding
- */
- StatusWith<RecordId> _insertRecord( OperationContext* txn,
- const char* data,
- int len,
- bool enforceQuota );
+ DiskLoc _getExtentLocForRecord(OperationContext* txn, const DiskLoc& loc) const;
- std::unique_ptr<RecordStoreV1MetaData> _details;
- ExtentManager* _extentManager;
- bool _isSystemIndexes;
+ DiskLoc _getNextRecord(OperationContext* txn, const DiskLoc& loc) const;
+ DiskLoc _getPrevRecord(OperationContext* txn, const DiskLoc& loc) const;
- friend class RecordStoreV1RepairCursor;
- };
+ DiskLoc _getNextRecordInExtent(OperationContext* txn, const DiskLoc& loc) const;
+ DiskLoc _getPrevRecordInExtent(OperationContext* txn, const DiskLoc& loc) const;
/**
- * Iterates over all records within a single extent.
- *
- * EOF at end of extent, even if there are more extents.
+ * Finds the first suitable DiskLoc for data;
+ * returns the DiskLoc of a newly created DeletedRecord.
*/
- class RecordStoreV1Base::IntraExtentIterator final : public RecordCursor {
- public:
- IntraExtentIterator(OperationContext* txn,
- DiskLoc start,
- const RecordStoreV1Base* rs,
- bool forward = true)
- : _txn(txn), _curr(start), _rs(rs), _forward(forward) {}
-
- boost::optional<Record> next() final;
- boost::optional<Record> seekExact(const RecordId& id) final;
- void invalidate(const RecordId& dl) final;
- void savePositioned() final {}
- bool restore(OperationContext* txn) final { return true; }
- std::unique_ptr<RecordFetcher> fetcherForNext() const final;
+ DiskLoc _findFirstSpot(OperationContext* txn, const DiskLoc& extDiskLoc, Extent* e);
- private:
- virtual const MmapV1RecordHeader* recordFor( const DiskLoc& loc ) const {
- return _rs->recordFor(loc);
- }
+ /** add a record to the end of the linked list chain within this extent.
+ requires: you must have already declared write intent for the record header.
+ */
+ void _addRecordToRecListInExtent(OperationContext* txn, MmapV1RecordHeader* r, DiskLoc loc);
+
+ /**
+ * internal: doesn't check inputs or change padding
+ */
+ StatusWith<RecordId> _insertRecord(OperationContext* txn,
+ const char* data,
+ int len,
+ bool enforceQuota);
- void advance();
+ std::unique_ptr<RecordStoreV1MetaData> _details;
+ ExtentManager* _extentManager;
+ bool _isSystemIndexes;
- OperationContext* _txn;
- DiskLoc _curr;
- const RecordStoreV1Base* _rs;
- bool _forward;
- };
+ friend class RecordStoreV1RepairCursor;
+};
+/**
+ * Iterates over all records within a single extent.
+ *
+ * EOF at end of extent, even if there are more extents.
+ */
+class RecordStoreV1Base::IntraExtentIterator final : public RecordCursor {
+public:
+ IntraExtentIterator(OperationContext* txn,
+ DiskLoc start,
+ const RecordStoreV1Base* rs,
+ bool forward = true)
+ : _txn(txn), _curr(start), _rs(rs), _forward(forward) {}
+
+ boost::optional<Record> next() final;
+ boost::optional<Record> seekExact(const RecordId& id) final;
+ void invalidate(const RecordId& dl) final;
+ void savePositioned() final {}
+ bool restore(OperationContext* txn) final {
+ return true;
+ }
+ std::unique_ptr<RecordFetcher> fetcherForNext() const final;
+
+private:
+ virtual const MmapV1RecordHeader* recordFor(const DiskLoc& loc) const {
+ return _rs->recordFor(loc);
+ }
+
+ void advance();
+
+ OperationContext* _txn;
+ DiskLoc _curr;
+ const RecordStoreV1Base* _rs;
+ bool _forward;
+};
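
IntraExtentIterator::advance() follows nextOfs (or prevOfs when iterating backward) and hits EOF at the extent boundary rather than crossing into the next extent. A small model of that traversal, with Rec and NullOfs again as hypothetical stand-ins:

// Sketch only: forward/backward traversal that stops at the extent boundary.
#include <cassert>
#include <vector>

namespace {
const int NullOfs = -1;

struct Rec {
    int prevOfs = NullOfs;
    int nextOfs = NullOfs;
};

// Count records reachable from 'start' within one extent.
int countFrom(const std::vector<Rec>& extent, int start, bool forward) {
    int n = 0;
    for (int curr = start; curr != NullOfs;
         curr = forward ? extent[curr].nextOfs : extent[curr].prevOfs)
        n++;
    return n;  // EOF at end of extent, even if other extents exist
}
}  // namespace

int main() {
    std::vector<Rec> extent(3);
    extent[0].nextOfs = 1;
    extent[1].prevOfs = 0;
    extent[1].nextOfs = 2;
    extent[2].prevOfs = 1;
    assert(countFrom(extent, 0, true) == 3);
    assert(countFrom(extent, 2, false) == 3);
    return 0;
}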
}
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_capped.cpp
index a41dd66ab1e..2674861bdb1 100644
--- a/src/mongo/db/storage/mmap_v1/record_store_v1_capped.cpp
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_capped.cpp
@@ -62,658 +62,630 @@
namespace mongo {
- using std::dec;
- using std::endl;
- using std::hex;
- using std::vector;
-
- CappedRecordStoreV1::CappedRecordStoreV1( OperationContext* txn,
- CappedDocumentDeleteCallback* collection,
- StringData ns,
- RecordStoreV1MetaData* details,
- ExtentManager* em,
- bool isSystemIndexes )
- : RecordStoreV1Base( ns, details, em, isSystemIndexes ),
- _deleteCallback( collection ) {
-
- DiskLoc extentLoc = details->firstExtent(txn);
- while ( !extentLoc.isNull() ) {
- _extentAdvice.push_back( _extentManager->cacheHint( extentLoc,
- ExtentManager::Sequential ) );
- Extent* extent = em->getExtent( extentLoc );
- extentLoc = extent->xnext;
- }
-
- // this is for VERY VERY old versions of capped collections
- cappedCheckMigrate(txn);
+using std::dec;
+using std::endl;
+using std::hex;
+using std::vector;
+
+CappedRecordStoreV1::CappedRecordStoreV1(OperationContext* txn,
+ CappedDocumentDeleteCallback* collection,
+ StringData ns,
+ RecordStoreV1MetaData* details,
+ ExtentManager* em,
+ bool isSystemIndexes)
+ : RecordStoreV1Base(ns, details, em, isSystemIndexes), _deleteCallback(collection) {
+ DiskLoc extentLoc = details->firstExtent(txn);
+ while (!extentLoc.isNull()) {
+ _extentAdvice.push_back(_extentManager->cacheHint(extentLoc, ExtentManager::Sequential));
+ Extent* extent = em->getExtent(extentLoc);
+ extentLoc = extent->xnext;
}
- CappedRecordStoreV1::~CappedRecordStoreV1() {
- }
+ // this is for VERY VERY old versions of capped collections
+ cappedCheckMigrate(txn);
+}
- StatusWith<DiskLoc> CappedRecordStoreV1::allocRecord( OperationContext* txn,
- int lenToAlloc,
- bool enforceQuota ) {
- {
- // align very slightly.
- lenToAlloc = (lenToAlloc + 3) & 0xfffffffc;
- }
+CappedRecordStoreV1::~CappedRecordStoreV1() {}
- if ( lenToAlloc > theCapExtent()->length ) {
- // the extent check is a way to try and improve performance
- // since we have to iterate all the extents (for now) to get
- // storage size
- if ( lenToAlloc > storageSize(txn) ) {
- return StatusWith<DiskLoc>( ErrorCodes::DocTooLargeForCapped,
- mongoutils::str::stream()
- << "document is larger than capped size "
- << lenToAlloc << " > " << storageSize(txn),
- 16328 );
- }
+StatusWith<DiskLoc> CappedRecordStoreV1::allocRecord(OperationContext* txn,
+ int lenToAlloc,
+ bool enforceQuota) {
+ {
+ // align very slightly.
+ lenToAlloc = (lenToAlloc + 3) & 0xfffffffc;
+ }
+ if (lenToAlloc > theCapExtent()->length) {
+ // the extent check is a way to try and improve performance
+ // since we have to iterate all the extents (for now) to get
+ // storage size
+ if (lenToAlloc > storageSize(txn)) {
+ return StatusWith<DiskLoc>(ErrorCodes::DocTooLargeForCapped,
+ mongoutils::str::stream()
+ << "document is larger than capped size " << lenToAlloc
+ << " > " << storageSize(txn),
+ 16328);
}
- DiskLoc loc;
- { // do allocation
-
- // signal done allocating new extents.
- if ( !cappedLastDelRecLastExtent().isValid() )
- setLastDelRecLastExtent( txn, DiskLoc() );
+ }
+ DiskLoc loc;
+ { // do allocation
- invariant( lenToAlloc < 400000000 );
- int passes = 0;
+ // signal done allocating new extents.
+ if (!cappedLastDelRecLastExtent().isValid())
+ setLastDelRecLastExtent(txn, DiskLoc());
- // delete records until we have room and the max # objects limit achieved.
+ invariant(lenToAlloc < 400000000);
+ int passes = 0;
- /* this fails on a rename -- that is ok but must keep commented out */
- //invariant( theCapExtent()->ns == ns );
+ // delete records until we have room and the max # objects limit achieved.
- theCapExtent()->assertOk();
- DiskLoc firstEmptyExtent; // This prevents us from infinite looping.
- while ( 1 ) {
- if ( _details->numRecords() < _details->maxCappedDocs() ) {
- loc = __capAlloc( txn, lenToAlloc );
- if ( !loc.isNull() )
- break;
- }
+ /* this fails on a rename -- that is OK, but it must stay commented out */
+ // invariant( theCapExtent()->ns == ns );
- // If on first iteration through extents, don't delete anything.
- if ( !_details->capFirstNewRecord().isValid() ) {
- advanceCapExtent( txn, _ns );
+ theCapExtent()->assertOk();
+ DiskLoc firstEmptyExtent; // This prevents us from infinite looping.
+ while (1) {
+ if (_details->numRecords() < _details->maxCappedDocs()) {
+ loc = __capAlloc(txn, lenToAlloc);
+ if (!loc.isNull())
+ break;
+ }
- if ( _details->capExtent() != _details->firstExtent(txn) )
- _details->setCapFirstNewRecord( txn, DiskLoc().setInvalid() );
- // else signal done with first iteration through extents.
- continue;
- }
+ // If on first iteration through extents, don't delete anything.
+ if (!_details->capFirstNewRecord().isValid()) {
+ advanceCapExtent(txn, _ns);
- if ( !_details->capFirstNewRecord().isNull() &&
- theCapExtent()->firstRecord == _details->capFirstNewRecord() ) {
- // We've deleted all records that were allocated on the previous
- // iteration through this extent.
- advanceCapExtent( txn, _ns );
- continue;
- }
+ if (_details->capExtent() != _details->firstExtent(txn))
+ _details->setCapFirstNewRecord(txn, DiskLoc().setInvalid());
+ // else signal done with first iteration through extents.
+ continue;
+ }
- if ( theCapExtent()->firstRecord.isNull() ) {
- if ( firstEmptyExtent.isNull() )
- firstEmptyExtent = _details->capExtent();
- advanceCapExtent( txn, _ns );
- if ( firstEmptyExtent == _details->capExtent() ) {
- // All records have been deleted but there is still no room for this record.
- // Nothing we can do but fail.
- _maybeComplain( txn, lenToAlloc );
- return StatusWith<DiskLoc>(
- ErrorCodes::DocTooLargeForCapped,
- str::stream() << "document doesn't fit in capped collection."
- << " size: " << lenToAlloc
- << " storageSize:" << storageSize(txn),
- 28575);
- }
- continue;
- }
+ if (!_details->capFirstNewRecord().isNull() &&
+ theCapExtent()->firstRecord == _details->capFirstNewRecord()) {
+ // We've deleted all records that were allocated on the previous
+ // iteration through this extent.
+ advanceCapExtent(txn, _ns);
+ continue;
+ }
- const RecordId fr = theCapExtent()->firstRecord.toRecordId();
- Status status = _deleteCallback->aboutToDeleteCapped( txn, fr, dataFor(txn, fr) );
- if ( !status.isOK() )
- return StatusWith<DiskLoc>( status );
- deleteRecord( txn, fr );
-
- _compact(txn);
- if ((++passes % 5000) == 0) {
- StringBuilder sb;
- log() << "passes = " << passes << " in CappedRecordStoreV1::allocRecord:"
- << " ns: " << _ns
- << ", lenToAlloc: " << lenToAlloc
- << ", maxCappedDocs: " << _details->maxCappedDocs()
- << ", nrecords: " << _details->numRecords()
- << ", datasize: " << _details->dataSize()
- << ". Continuing to delete old records to make room.";
+ if (theCapExtent()->firstRecord.isNull()) {
+ if (firstEmptyExtent.isNull())
+ firstEmptyExtent = _details->capExtent();
+ advanceCapExtent(txn, _ns);
+ if (firstEmptyExtent == _details->capExtent()) {
+ // All records have been deleted but there is still no room for this record.
+ // Nothing we can do but fail.
+ _maybeComplain(txn, lenToAlloc);
+ return StatusWith<DiskLoc>(ErrorCodes::DocTooLargeForCapped,
+ str::stream()
+ << "document doesn't fit in capped collection."
+ << " size: " << lenToAlloc
+ << " storageSize:" << storageSize(txn),
+ 28575);
}
+ continue;
}
- // Remember first record allocated on this iteration through capExtent.
- if ( _details->capFirstNewRecord().isValid() && _details->capFirstNewRecord().isNull() )
- _details->setCapFirstNewRecord( txn, loc );
+ const RecordId fr = theCapExtent()->firstRecord.toRecordId();
+ Status status = _deleteCallback->aboutToDeleteCapped(txn, fr, dataFor(txn, fr));
+ if (!status.isOK())
+ return StatusWith<DiskLoc>(status);
+ deleteRecord(txn, fr);
+
+ _compact(txn);
+ if ((++passes % 5000) == 0) {
+ StringBuilder sb;
+ log() << "passes = " << passes << " in CappedRecordStoreV1::allocRecord:"
+ << " ns: " << _ns << ", lenToAlloc: " << lenToAlloc
+ << ", maxCappedDocs: " << _details->maxCappedDocs()
+ << ", nrecords: " << _details->numRecords()
+ << ", datasize: " << _details->dataSize()
+ << ". Continuing to delete old records to make room.";
+ }
}
- invariant( !loc.isNull() );
+ // Remember first record allocated on this iteration through capExtent.
+ if (_details->capFirstNewRecord().isValid() && _details->capFirstNewRecord().isNull())
+ _details->setCapFirstNewRecord(txn, loc);
+ }
- // possibly slice up if we've allocated too much space
+ invariant(!loc.isNull());
- DeletedRecord *r = drec( loc );
+ // possibly slice up if we've allocated too much space
- /* note we want to grab from the front so our next pointers on disk tend
- to go in a forward direction which is important for performance. */
- int regionlen = r->lengthWithHeaders();
- invariant( r->extentOfs() < loc.getOfs() );
+ DeletedRecord* r = drec(loc);
- int left = regionlen - lenToAlloc;
+ /* note we want to grab from the front so our next pointers on disk tend
+ to go in a forward direction which is important for performance. */
+ int regionlen = r->lengthWithHeaders();
+ invariant(r->extentOfs() < loc.getOfs());
- /* split off some for further use. */
- txn->recoveryUnit()->writingInt(r->lengthWithHeaders()) = lenToAlloc;
- DiskLoc newDelLoc = loc;
- newDelLoc.inc(lenToAlloc);
- DeletedRecord* newDel = drec( newDelLoc );
- DeletedRecord* newDelW = txn->recoveryUnit()->writing(newDel);
- newDelW->extentOfs() = r->extentOfs();
- newDelW->lengthWithHeaders() = left;
- newDelW->nextDeleted().Null();
+ int left = regionlen - lenToAlloc;
- addDeletedRec(txn, newDelLoc);
+ /* split off some for further use. */
+ txn->recoveryUnit()->writingInt(r->lengthWithHeaders()) = lenToAlloc;
+ DiskLoc newDelLoc = loc;
+ newDelLoc.inc(lenToAlloc);
+ DeletedRecord* newDel = drec(newDelLoc);
+ DeletedRecord* newDelW = txn->recoveryUnit()->writing(newDel);
+ newDelW->extentOfs() = r->extentOfs();
+ newDelW->lengthWithHeaders() = left;
+ newDelW->nextDeleted().Null();
- return StatusWith<DiskLoc>( loc );
- }
+ addDeletedRec(txn, newDelLoc);
- Status CappedRecordStoreV1::truncate(OperationContext* txn) {
- setLastDelRecLastExtent( txn, DiskLoc() );
- setListOfAllDeletedRecords( txn, DiskLoc() );
-
- // preserve firstExtent/lastExtent
- _details->setCapExtent( txn, _details->firstExtent(txn) );
- _details->setStats( txn, 0, 0 );
- // preserve lastExtentSize
- // nIndexes preserve 0
- // capped preserve true
- // max preserve
- // paddingFactor is unused
- _details->setCapFirstNewRecord( txn, DiskLoc().setInvalid() );
- setLastDelRecLastExtent( txn, DiskLoc().setInvalid() );
- // dataFileVersion preserve
- // indexFileVersion preserve
-
- // Reset all existing extents and recreate the deleted list.
- Extent* ext;
- for( DiskLoc extLoc = _details->firstExtent(txn);
- !extLoc.isNull();
- extLoc = ext->xnext ) {
- ext = _extentManager->getExtent(extLoc);
-
- txn->recoveryUnit()->writing( &ext->firstRecord )->Null();
- txn->recoveryUnit()->writing( &ext->lastRecord )->Null();
-
- addDeletedRec( txn, _findFirstSpot( txn, extLoc, ext ) );
- }
+ return StatusWith<DiskLoc>(loc);
+}
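
(Aside: the tail-splitting step above is easier to see in isolation. Given a free
region of regionlen bytes and a request for lenToAlloc, the allocator keeps the
front of the region and returns the remainder to the deleted list. A minimal
sketch with hypothetical names, not the mmap_v1 types:)

    struct FreeRegion {
        int offset;
        int length;
    };

    // Keep the first 'want' bytes of 'r' for the allocation and return the
    // tail as a new free region. Assumes want <= r.length, which allocRecord
    // guarantees by only accepting regions found via __capAlloc.
    static FreeRegion splitFront(FreeRegion& r, int want) {
        FreeRegion tail = {r.offset + want, r.length - want};
        r.length = want;  // the allocation keeps the front of the region
        return tail;      // the caller links 'tail' back into the free list
    }
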
- return Status::OK();
+Status CappedRecordStoreV1::truncate(OperationContext* txn) {
+ setLastDelRecLastExtent(txn, DiskLoc());
+ setListOfAllDeletedRecords(txn, DiskLoc());
+
+ // preserve firstExtent/lastExtent
+ _details->setCapExtent(txn, _details->firstExtent(txn));
+ _details->setStats(txn, 0, 0);
+ // preserve lastExtentSize
+ // nIndexes preserve 0
+ // capped preserve true
+ // max preserve
+ // paddingFactor is unused
+ _details->setCapFirstNewRecord(txn, DiskLoc().setInvalid());
+ setLastDelRecLastExtent(txn, DiskLoc().setInvalid());
+ // dataFileVersion preserve
+ // indexFileVersion preserve
+
+ // Reset all existing extents and recreate the deleted list.
+ Extent* ext;
+ for (DiskLoc extLoc = _details->firstExtent(txn); !extLoc.isNull(); extLoc = ext->xnext) {
+ ext = _extentManager->getExtent(extLoc);
+
+ txn->recoveryUnit()->writing(&ext->firstRecord)->Null();
+ txn->recoveryUnit()->writing(&ext->lastRecord)->Null();
+
+ addDeletedRec(txn, _findFirstSpot(txn, extLoc, ext));
}
- void CappedRecordStoreV1::temp_cappedTruncateAfter( OperationContext* txn,
- RecordId end,
- bool inclusive ) {
- cappedTruncateAfter( txn, _ns.c_str(), DiskLoc::fromRecordId(end), inclusive );
- }
+ return Status::OK();
+}
- /* combine adjacent deleted records *for the current extent* of the capped collection
+void CappedRecordStoreV1::temp_cappedTruncateAfter(OperationContext* txn,
+ RecordId end,
+ bool inclusive) {
+ cappedTruncateAfter(txn, _ns.c_str(), DiskLoc::fromRecordId(end), inclusive);
+}
- this is O(n^2) but we call it for capped tables where typically n==1 or 2!
- (or 3...there will be a little unused sliver at the end of the extent.)
- */
- void CappedRecordStoreV1::_compact(OperationContext* txn) {
- DDD( "CappedRecordStoreV1::compact enter" );
+/* combine adjacent deleted records *for the current extent* of the capped collection
- vector<DiskLoc> drecs;
+ this is O(n^2) but we call it for capped tables where typically n==1 or 2!
+ (or 3...there will be a little unused sliver at the end of the extent.)
+*/
+void CappedRecordStoreV1::_compact(OperationContext* txn) {
+ DDD("CappedRecordStoreV1::compact enter");
- // Pull out capExtent's DRs from deletedList
- DiskLoc i = cappedFirstDeletedInCurExtent();
- for (; !i.isNull() && inCapExtent( i ); i = deletedRecordFor( i )->nextDeleted() ) {
- DDD( "\t" << i );
- drecs.push_back( i );
- }
+ vector<DiskLoc> drecs;
+
+ // Pull out capExtent's DRs from deletedList
+ DiskLoc i = cappedFirstDeletedInCurExtent();
+ for (; !i.isNull() && inCapExtent(i); i = deletedRecordFor(i)->nextDeleted()) {
+ DDD("\t" << i);
+ drecs.push_back(i);
+ }
- setFirstDeletedInCurExtent( txn, i );
+ setFirstDeletedInCurExtent(txn, i);
- std::sort( drecs.begin(), drecs.end() );
- DDD( "\t drecs.size(): " << drecs.size() );
+ std::sort(drecs.begin(), drecs.end());
+ DDD("\t drecs.size(): " << drecs.size());
- vector<DiskLoc>::const_iterator j = drecs.begin();
- invariant( j != drecs.end() );
- DiskLoc a = *j;
- while ( 1 ) {
+ vector<DiskLoc>::const_iterator j = drecs.begin();
+ invariant(j != drecs.end());
+ DiskLoc a = *j;
+ while (1) {
+ j++;
+ if (j == drecs.end()) {
+ DDD("\t compact adddelrec");
+ addDeletedRec(txn, a);
+ break;
+ }
+ DiskLoc b = *j;
+ while (a.a() == b.a() && a.getOfs() + drec(a)->lengthWithHeaders() == b.getOfs()) {
+ // a & b are adjacent. merge.
+ txn->recoveryUnit()->writingInt(drec(a)->lengthWithHeaders()) +=
+ drec(b)->lengthWithHeaders();
j++;
- if ( j == drecs.end() ) {
- DDD( "\t compact adddelrec" );
+ if (j == drecs.end()) {
+ DDD("\t compact adddelrec2");
addDeletedRec(txn, a);
- break;
+ return;
}
- DiskLoc b = *j;
- while ( a.a() == b.a() &&
- a.getOfs() + drec( a )->lengthWithHeaders() == b.getOfs() ) {
-
- // a & b are adjacent. merge.
- txn->recoveryUnit()->writingInt( drec(a)->lengthWithHeaders() ) += drec(b)->lengthWithHeaders();
- j++;
- if ( j == drecs.end() ) {
- DDD( "\t compact adddelrec2" );
- addDeletedRec(txn, a);
- return;
- }
- b = *j;
- }
- DDD( "\t compact adddelrec3" );
- addDeletedRec(txn, a);
- a = b;
+ b = *j;
}
-
- }
-
- DiskLoc CappedRecordStoreV1::cappedFirstDeletedInCurExtent() const {
- if ( cappedLastDelRecLastExtent().isNull() )
- return cappedListOfAllDeletedRecords();
- else
- return drec(cappedLastDelRecLastExtent())->nextDeleted();
+ DDD("\t compact adddelrec3");
+ addDeletedRec(txn, a);
+ a = b;
}
+}
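
(Aside: stripped of the journalling and on-disk plumbing, the merge loop in
_compact() is interval coalescing over a sorted list. The real code also
requires both records to live in the same file (a.a() == b.a()), which this
minimal sketch elides:)

    #include <utility>
    #include <vector>

    // Merge physically adjacent free runs. Each pair is (offset, length),
    // pre-sorted by offset like the 'drecs' vector in _compact().
    std::vector<std::pair<int, int>> coalesce(std::vector<std::pair<int, int>> runs) {
        std::vector<std::pair<int, int>> out;
        for (const auto& r : runs) {
            if (!out.empty() && out.back().first + out.back().second == r.first)
                out.back().second += r.second;  // adjacent: grow the previous run
            else
                out.push_back(r);
        }
        return out;
    }
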
- void CappedRecordStoreV1::setFirstDeletedInCurExtent( OperationContext* txn,
- const DiskLoc& loc ) {
- if ( cappedLastDelRecLastExtent().isNull() )
- setListOfAllDeletedRecords( txn, loc );
- else
- *txn->recoveryUnit()->writing( &drec(cappedLastDelRecLastExtent())->nextDeleted() ) = loc;
- }
+DiskLoc CappedRecordStoreV1::cappedFirstDeletedInCurExtent() const {
+ if (cappedLastDelRecLastExtent().isNull())
+ return cappedListOfAllDeletedRecords();
+ else
+ return drec(cappedLastDelRecLastExtent())->nextDeleted();
+}
- void CappedRecordStoreV1::cappedCheckMigrate(OperationContext* txn) {
- // migrate old RecordStoreV1MetaData format
- if ( _details->capExtent().a() == 0 && _details->capExtent().getOfs() == 0 ) {
- WriteUnitOfWork wunit(txn);
- _details->setCapFirstNewRecord( txn, DiskLoc().setInvalid() );
- // put all the DeletedRecords in cappedListOfAllDeletedRecords()
- for ( int i = 1; i < Buckets; ++i ) {
- DiskLoc first = _details->deletedListEntry( i );
- if ( first.isNull() )
- continue;
- DiskLoc last = first;
- for (; !drec(last)->nextDeleted().isNull(); last = drec(last)->nextDeleted() );
- *txn->recoveryUnit()->writing(&drec(last)->nextDeleted()) = cappedListOfAllDeletedRecords();
- setListOfAllDeletedRecords( txn, first );
- _details->setDeletedListEntry(txn, i, DiskLoc());
- }
- // NOTE cappedLastDelRecLastExtent() set to DiskLoc() in above
+void CappedRecordStoreV1::setFirstDeletedInCurExtent(OperationContext* txn, const DiskLoc& loc) {
+ if (cappedLastDelRecLastExtent().isNull())
+ setListOfAllDeletedRecords(txn, loc);
+ else
+ *txn->recoveryUnit()->writing(&drec(cappedLastDelRecLastExtent())->nextDeleted()) = loc;
+}
- // Last, in case we're killed before getting here
- _details->setCapExtent( txn, _details->firstExtent(txn) );
- wunit.commit();
+void CappedRecordStoreV1::cappedCheckMigrate(OperationContext* txn) {
+ // migrate old RecordStoreV1MetaData format
+ if (_details->capExtent().a() == 0 && _details->capExtent().getOfs() == 0) {
+ WriteUnitOfWork wunit(txn);
+ _details->setCapFirstNewRecord(txn, DiskLoc().setInvalid());
+ // put all the DeletedRecords in cappedListOfAllDeletedRecords()
+ for (int i = 1; i < Buckets; ++i) {
+ DiskLoc first = _details->deletedListEntry(i);
+ if (first.isNull())
+ continue;
+ DiskLoc last = first;
+ for (; !drec(last)->nextDeleted().isNull(); last = drec(last)->nextDeleted())
+ ;
+ *txn->recoveryUnit()->writing(&drec(last)->nextDeleted()) =
+ cappedListOfAllDeletedRecords();
+ setListOfAllDeletedRecords(txn, first);
+ _details->setDeletedListEntry(txn, i, DiskLoc());
}
+ // NOTE cappedLastDelRecLastExtent() set to DiskLoc() in above
+
+ // Last, in case we're killed before getting here
+ _details->setCapExtent(txn, _details->firstExtent(txn));
+ wunit.commit();
}
+}
- bool CappedRecordStoreV1::inCapExtent( const DiskLoc &dl ) const {
- invariant( !dl.isNull() );
+bool CappedRecordStoreV1::inCapExtent(const DiskLoc& dl) const {
+ invariant(!dl.isNull());
- if ( dl.a() != _details->capExtent().a() )
- return false;
+ if (dl.a() != _details->capExtent().a())
+ return false;
- if ( dl.getOfs() < _details->capExtent().getOfs() )
- return false;
+ if (dl.getOfs() < _details->capExtent().getOfs())
+ return false;
- const Extent* e = theCapExtent();
- int end = _details->capExtent().getOfs() + e->length;
- return dl.getOfs() <= end;
- }
+ const Extent* e = theCapExtent();
+ int end = _details->capExtent().getOfs() + e->length;
+ return dl.getOfs() <= end;
+}
- bool CappedRecordStoreV1::nextIsInCapExtent( const DiskLoc &dl ) const {
- invariant( !dl.isNull() );
- DiskLoc next = drec(dl)->nextDeleted();
- if ( next.isNull() )
- return false;
- return inCapExtent( next );
+bool CappedRecordStoreV1::nextIsInCapExtent(const DiskLoc& dl) const {
+ invariant(!dl.isNull());
+ DiskLoc next = drec(dl)->nextDeleted();
+ if (next.isNull())
+ return false;
+ return inCapExtent(next);
+}
+
+void CappedRecordStoreV1::advanceCapExtent(OperationContext* txn, StringData ns) {
+ // We want cappedLastDelRecLastExtent() to be the last DeletedRecord of the prev cap extent
+ // (or DiskLoc() if new capExtent == firstExtent)
+ if (_details->capExtent() == _details->lastExtent(txn))
+ setLastDelRecLastExtent(txn, DiskLoc());
+ else {
+ DiskLoc i = cappedFirstDeletedInCurExtent();
+ for (; !i.isNull() && nextIsInCapExtent(i); i = drec(i)->nextDeleted())
+ ;
+ setLastDelRecLastExtent(txn, i);
}
- void CappedRecordStoreV1::advanceCapExtent( OperationContext* txn, StringData ns ) {
- // We want cappedLastDelRecLastExtent() to be the last DeletedRecord of the prev cap extent
- // (or DiskLoc() if new capExtent == firstExtent)
- if ( _details->capExtent() == _details->lastExtent(txn) )
- setLastDelRecLastExtent( txn, DiskLoc() );
- else {
- DiskLoc i = cappedFirstDeletedInCurExtent();
- for (; !i.isNull() && nextIsInCapExtent( i ); i = drec(i)->nextDeleted() );
- setLastDelRecLastExtent( txn, i );
- }
+ _details->setCapExtent(
+ txn, theCapExtent()->xnext.isNull() ? _details->firstExtent(txn) : theCapExtent()->xnext);
- _details->setCapExtent( txn,
- theCapExtent()->xnext.isNull() ? _details->firstExtent(txn)
- : theCapExtent()->xnext );
+    /* this isn't true if a collection has been renamed... that is OK; it is only used for diagnostics */
+ // dassert( theCapExtent()->ns == ns );
-        /* this isn't true if a collection has been renamed... that is OK; it is only used for diagnostics */
- //dassert( theCapExtent()->ns == ns );
+ theCapExtent()->assertOk();
+ _details->setCapFirstNewRecord(txn, DiskLoc());
+}
- theCapExtent()->assertOk();
- _details->setCapFirstNewRecord( txn, DiskLoc() );
+DiskLoc CappedRecordStoreV1::__capAlloc(OperationContext* txn, int len) {
+ DiskLoc prev = cappedLastDelRecLastExtent();
+ DiskLoc i = cappedFirstDeletedInCurExtent();
+ DiskLoc ret;
+ for (; !i.isNull() && inCapExtent(i); prev = i, i = drec(i)->nextDeleted()) {
+ // We need to keep at least one DR per extent in cappedListOfAllDeletedRecords(),
+ // so make sure there's space to create a DR at the end.
+ if (drec(i)->lengthWithHeaders() >= len + 24) {
+ ret = i;
+ break;
+ }
}
- DiskLoc CappedRecordStoreV1::__capAlloc( OperationContext* txn, int len ) {
- DiskLoc prev = cappedLastDelRecLastExtent();
- DiskLoc i = cappedFirstDeletedInCurExtent();
- DiskLoc ret;
- for (; !i.isNull() && inCapExtent( i ); prev = i, i = drec(i)->nextDeleted() ) {
- // We need to keep at least one DR per extent in cappedListOfAllDeletedRecords(),
- // so make sure there's space to create a DR at the end.
- if ( drec(i)->lengthWithHeaders() >= len + 24 ) {
- ret = i;
- break;
- }
- }
+ /* unlink ourself from the deleted list */
+ if (!ret.isNull()) {
+ if (prev.isNull())
+ setListOfAllDeletedRecords(txn, drec(ret)->nextDeleted());
+ else
+ *txn->recoveryUnit()->writing(&drec(prev)->nextDeleted()) = drec(ret)->nextDeleted();
+ *txn->recoveryUnit()->writing(&drec(ret)->nextDeleted()) =
+ DiskLoc().setInvalid(); // defensive.
+ invariant(drec(ret)->extentOfs() < ret.getOfs());
+ }
- /* unlink ourself from the deleted list */
- if ( !ret.isNull() ) {
- if ( prev.isNull() )
- setListOfAllDeletedRecords( txn, drec(ret)->nextDeleted() );
- else
- *txn->recoveryUnit()->writing(&drec(prev)->nextDeleted()) = drec(ret)->nextDeleted();
- *txn->recoveryUnit()->writing(&drec(ret)->nextDeleted()) = DiskLoc().setInvalid(); // defensive.
- invariant( drec(ret)->extentOfs() < ret.getOfs() );
- }
+ return ret;
+}
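
(Aside: __capAlloc() is a first-fit scan over the current extent's deleted
records, with one wrinkle: a region is only accepted if it leaves at least 24
bytes of slack, so a trailing DeletedRecord can always be carved off. The 24
comes from the code above; the rest of this sketch is illustrative:)

    #include <cstddef>
    #include <vector>

    // First-fit with mandatory tail slack, mirroring __capAlloc's policy.
    // Returns the index of the chosen run, or -1 if nothing fits.
    int firstFitWithSlack(const std::vector<int>& runLengths, int len) {
        const int kMinTail = 24;  // room for a trailing DeletedRecord header
        for (std::size_t i = 0; i < runLengths.size(); ++i)
            if (runLengths[i] >= len + kMinTail)
                return static_cast<int>(i);
        return -1;
    }
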
- return ret;
+void CappedRecordStoreV1::cappedTruncateLastDelUpdate(OperationContext* txn) {
+ if (_details->capExtent() == _details->firstExtent(txn)) {
+ // Only one extent of the collection is in use, so there
+ // is no deleted record in a previous extent, so nullify
+ // cappedLastDelRecLastExtent().
+ setLastDelRecLastExtent(txn, DiskLoc());
+ } else {
+ // Scan through all deleted records in the collection
+ // until the last deleted record for the extent prior
+ // to the new capExtent is found. Then set
+ // cappedLastDelRecLastExtent() to that deleted record.
+ DiskLoc i = cappedListOfAllDeletedRecords();
+ for (; !drec(i)->nextDeleted().isNull() && !inCapExtent(drec(i)->nextDeleted());
+ i = drec(i)->nextDeleted())
+ ;
+ // In our capped storage model, every extent must have at least one
+ // deleted record. Here we check that 'i' is not the last deleted
+ // record. (We expect that there will be deleted records in the new
+ // capExtent as well.)
+ invariant(!drec(i)->nextDeleted().isNull());
+ setLastDelRecLastExtent(txn, i);
}
+}
- void CappedRecordStoreV1::cappedTruncateLastDelUpdate(OperationContext* txn) {
- if ( _details->capExtent() == _details->firstExtent(txn) ) {
- // Only one extent of the collection is in use, so there
- // is no deleted record in a previous extent, so nullify
- // cappedLastDelRecLastExtent().
- setLastDelRecLastExtent( txn, DiskLoc() );
+void CappedRecordStoreV1::cappedTruncateAfter(OperationContext* txn,
+ const char* ns,
+ DiskLoc end,
+ bool inclusive) {
+ invariant(cappedLastDelRecLastExtent().isValid());
+
+ // We iteratively remove the newest document until the newest document
+ // is 'end', then we remove 'end' if requested.
+ bool foundLast = false;
+ while (1) {
+ if (foundLast) {
+ // 'end' has been found and removed, so break.
+ break;
}
- else {
- // Scan through all deleted records in the collection
- // until the last deleted record for the extent prior
- // to the new capExtent is found. Then set
- // cappedLastDelRecLastExtent() to that deleted record.
- DiskLoc i = cappedListOfAllDeletedRecords();
- for( ;
- !drec(i)->nextDeleted().isNull() &&
- !inCapExtent( drec(i)->nextDeleted() );
- i = drec(i)->nextDeleted() );
- // In our capped storage model, every extent must have at least one
- // deleted record. Here we check that 'i' is not the last deleted
- // record. (We expect that there will be deleted records in the new
- // capExtent as well.)
- invariant( !drec(i)->nextDeleted().isNull() );
- setLastDelRecLastExtent( txn, i );
- }
- }
-
- void CappedRecordStoreV1::cappedTruncateAfter(OperationContext* txn,
- const char* ns,
- DiskLoc end,
- bool inclusive) {
- invariant( cappedLastDelRecLastExtent().isValid() );
-
- // We iteratively remove the newest document until the newest document
- // is 'end', then we remove 'end' if requested.
- bool foundLast = false;
- while( 1 ) {
- if ( foundLast ) {
- // 'end' has been found and removed, so break.
+ // 'curr' will point to the newest document in the collection.
+ const DiskLoc curr = theCapExtent()->lastRecord;
+ const RecordId currId = curr.toRecordId();
+ invariant(!curr.isNull());
+ if (curr == end) {
+ if (inclusive) {
+ // 'end' has been found, so break next iteration.
+ foundLast = true;
+ } else {
+ // 'end' has been found, so break.
break;
}
- // 'curr' will point to the newest document in the collection.
- const DiskLoc curr = theCapExtent()->lastRecord;
- const RecordId currId = curr.toRecordId();
- invariant( !curr.isNull() );
- if ( curr == end ) {
- if ( inclusive ) {
- // 'end' has been found, so break next iteration.
- foundLast = true;
- }
- else {
- // 'end' has been found, so break.
- break;
- }
- }
-
- // TODO The algorithm used in this function cannot generate an
- // empty collection, but we could call emptyCappedCollection() in
- // this case instead of asserting.
- uassert( 13415, "emptying the collection is not allowed", _details->numRecords() > 1 );
-
- WriteUnitOfWork wunit(txn);
- // Delete the newest record, and coalesce the new deleted
- // record with existing deleted records.
- Status status = _deleteCallback->aboutToDeleteCapped(txn, currId, dataFor(txn, currId));
- uassertStatusOK( status );
- deleteRecord( txn, currId );
- _compact(txn);
-
- // This is the case where we have not yet had to remove any
- // documents to make room for other documents, and we are allocating
- // documents from free space in fresh extents instead of reusing
- // space from familiar extents.
- if ( !_details->capLooped() ) {
-
- // We just removed the last record from the 'capExtent', and
- // the 'capExtent' can't be empty, so we set 'capExtent' to
- // capExtent's prev extent.
- if ( theCapExtent()->lastRecord.isNull() ) {
- invariant( !theCapExtent()->xprev.isNull() );
- // NOTE Because we didn't delete the last document, and
- // capLooped() is false, capExtent is not the first extent
- // so xprev will be nonnull.
- _details->setCapExtent( txn, theCapExtent()->xprev );
- theCapExtent()->assertOk();
-
- // update cappedLastDelRecLastExtent()
- cappedTruncateLastDelUpdate(txn);
- }
- wunit.commit();
- continue;
- }
-
- // This is the case where capLooped() is true, and we just deleted
- // from capExtent, and we just deleted capFirstNewRecord, which was
- // the last record on the fresh side of capExtent.
- // NOTE In this comparison, curr and potentially capFirstNewRecord
- // may point to invalid data, but we can still compare the
- // references themselves.
- if ( curr == _details->capFirstNewRecord() ) {
-
- // Set 'capExtent' to the first nonempty extent prior to the
- // initial capExtent. There must be such an extent because we
- // have not deleted the last document in the collection. It is
- // possible that all extents other than the capExtent are empty.
- // In this case we will keep the initial capExtent and specify
- // that all records contained within are on the fresh rather than
- // stale side of the extent.
- DiskLoc newCapExtent = _details->capExtent();
- do {
- // Find the previous extent, looping if necessary.
- newCapExtent = ( newCapExtent == _details->firstExtent(txn) ) ?
- _details->lastExtent(txn) :
- _extentManager->getExtent(newCapExtent)->xprev;
- _extentManager->getExtent(newCapExtent)->assertOk();
- }
- while ( _extentManager->getExtent(newCapExtent)->firstRecord.isNull() );
- _details->setCapExtent( txn, newCapExtent );
+ }
- // Place all documents in the new capExtent on the fresh side
- // of the capExtent by setting capFirstNewRecord to the first
- // document in the new capExtent.
- _details->setCapFirstNewRecord( txn, theCapExtent()->firstRecord );
+ // TODO The algorithm used in this function cannot generate an
+ // empty collection, but we could call emptyCappedCollection() in
+ // this case instead of asserting.
+ uassert(13415, "emptying the collection is not allowed", _details->numRecords() > 1);
+
+ WriteUnitOfWork wunit(txn);
+ // Delete the newest record, and coalesce the new deleted
+ // record with existing deleted records.
+ Status status = _deleteCallback->aboutToDeleteCapped(txn, currId, dataFor(txn, currId));
+ uassertStatusOK(status);
+ deleteRecord(txn, currId);
+ _compact(txn);
+
+ // This is the case where we have not yet had to remove any
+ // documents to make room for other documents, and we are allocating
+ // documents from free space in fresh extents instead of reusing
+ // space from familiar extents.
+ if (!_details->capLooped()) {
+ // We just removed the last record from the 'capExtent', and
+ // the 'capExtent' can't be empty, so we set 'capExtent' to
+ // capExtent's prev extent.
+ if (theCapExtent()->lastRecord.isNull()) {
+ invariant(!theCapExtent()->xprev.isNull());
+ // NOTE Because we didn't delete the last document, and
+ // capLooped() is false, capExtent is not the first extent
+ // so xprev will be nonnull.
+ _details->setCapExtent(txn, theCapExtent()->xprev);
+ theCapExtent()->assertOk();
// update cappedLastDelRecLastExtent()
cappedTruncateLastDelUpdate(txn);
}
-
wunit.commit();
+ continue;
}
- }
- DiskLoc CappedRecordStoreV1::cappedListOfAllDeletedRecords() const {
- return _details->deletedListEntry(0);
- }
+ // This is the case where capLooped() is true, and we just deleted
+ // from capExtent, and we just deleted capFirstNewRecord, which was
+ // the last record on the fresh side of capExtent.
+ // NOTE In this comparison, curr and potentially capFirstNewRecord
+ // may point to invalid data, but we can still compare the
+ // references themselves.
+ if (curr == _details->capFirstNewRecord()) {
+ // Set 'capExtent' to the first nonempty extent prior to the
+ // initial capExtent. There must be such an extent because we
+ // have not deleted the last document in the collection. It is
+ // possible that all extents other than the capExtent are empty.
+ // In this case we will keep the initial capExtent and specify
+ // that all records contained within are on the fresh rather than
+ // stale side of the extent.
+ DiskLoc newCapExtent = _details->capExtent();
+ do {
+ // Find the previous extent, looping if necessary.
+ newCapExtent = (newCapExtent == _details->firstExtent(txn))
+ ? _details->lastExtent(txn)
+ : _extentManager->getExtent(newCapExtent)->xprev;
+ _extentManager->getExtent(newCapExtent)->assertOk();
+ } while (_extentManager->getExtent(newCapExtent)->firstRecord.isNull());
+ _details->setCapExtent(txn, newCapExtent);
+
+ // Place all documents in the new capExtent on the fresh side
+ // of the capExtent by setting capFirstNewRecord to the first
+ // document in the new capExtent.
+ _details->setCapFirstNewRecord(txn, theCapExtent()->firstRecord);
+
+ // update cappedLastDelRecLastExtent()
+ cappedTruncateLastDelUpdate(txn);
+ }
- void CappedRecordStoreV1::setListOfAllDeletedRecords( OperationContext* txn,
- const DiskLoc& loc ) {
- return _details->setDeletedListEntry(txn, 0, loc);
+ wunit.commit();
}
+}
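
(Aside: cappedTruncateAfter() works backwards: it repeatedly deletes the
newest record until 'end' is on top, then deletes 'end' too if 'inclusive'.
The control skeleton, with the capExtent bookkeeping elided and a plain
vector standing in for the record store:)

    #include <cassert>
    #include <vector>

    // Pop the newest document until 'end' is reached; 'inclusive' decides
    // whether 'end' itself is also removed. 'docs' is ordered oldest..newest.
    template <typename Id>
    void truncateAfterSkeleton(std::vector<Id>& docs, const Id& end, bool inclusive) {
        bool foundLast = false;
        while (!foundLast) {
            const Id curr = docs.back();  // newest document
            if (curr == end) {
                if (!inclusive)
                    break;            // keep 'end' itself and stop
                foundLast = true;     // remove 'end', then stop
            }
            assert(docs.size() > 1);  // stands in for uassert 13415 above
            docs.pop_back();          // deleteRecord() + _compact() stand-in
        }
    }
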
- DiskLoc CappedRecordStoreV1::cappedLastDelRecLastExtent() const {
- return _details->deletedListEntry(1);
- }
+DiskLoc CappedRecordStoreV1::cappedListOfAllDeletedRecords() const {
+ return _details->deletedListEntry(0);
+}
- void CappedRecordStoreV1::setLastDelRecLastExtent( OperationContext* txn,
- const DiskLoc& loc ) {
- return _details->setDeletedListEntry(txn, 1, loc);
- }
+void CappedRecordStoreV1::setListOfAllDeletedRecords(OperationContext* txn, const DiskLoc& loc) {
+ return _details->setDeletedListEntry(txn, 0, loc);
+}
- Extent* CappedRecordStoreV1::theCapExtent() const {
- return _extentManager->getExtent(_details->capExtent());
- }
+DiskLoc CappedRecordStoreV1::cappedLastDelRecLastExtent() const {
+ return _details->deletedListEntry(1);
+}
- void CappedRecordStoreV1::addDeletedRec( OperationContext* txn, const DiskLoc& dloc ) {
- DeletedRecord* d = txn->recoveryUnit()->writing( drec( dloc ) );
-
- if ( !cappedLastDelRecLastExtent().isValid() ) {
- // Initial extent allocation. Insert at end.
- d->nextDeleted() = DiskLoc();
- if ( cappedListOfAllDeletedRecords().isNull() )
- setListOfAllDeletedRecords( txn, dloc );
- else {
- DiskLoc i = cappedListOfAllDeletedRecords();
- for (; !drec(i)->nextDeleted().isNull(); i = drec(i)->nextDeleted() )
- ;
- *txn->recoveryUnit()->writing(&drec(i)->nextDeleted()) = dloc;
- }
- }
+void CappedRecordStoreV1::setLastDelRecLastExtent(OperationContext* txn, const DiskLoc& loc) {
+ return _details->setDeletedListEntry(txn, 1, loc);
+}
+
+Extent* CappedRecordStoreV1::theCapExtent() const {
+ return _extentManager->getExtent(_details->capExtent());
+}
+
+void CappedRecordStoreV1::addDeletedRec(OperationContext* txn, const DiskLoc& dloc) {
+ DeletedRecord* d = txn->recoveryUnit()->writing(drec(dloc));
+
+ if (!cappedLastDelRecLastExtent().isValid()) {
+ // Initial extent allocation. Insert at end.
+ d->nextDeleted() = DiskLoc();
+ if (cappedListOfAllDeletedRecords().isNull())
+ setListOfAllDeletedRecords(txn, dloc);
else {
- d->nextDeleted() = cappedFirstDeletedInCurExtent();
- setFirstDeletedInCurExtent( txn, dloc );
- // always _compact() after this so order doesn't matter
+ DiskLoc i = cappedListOfAllDeletedRecords();
+ for (; !drec(i)->nextDeleted().isNull(); i = drec(i)->nextDeleted())
+ ;
+ *txn->recoveryUnit()->writing(&drec(i)->nextDeleted()) = dloc;
}
+ } else {
+ d->nextDeleted() = cappedFirstDeletedInCurExtent();
+ setFirstDeletedInCurExtent(txn, dloc);
+ // always _compact() after this so order doesn't matter
}
+}
- std::unique_ptr<RecordCursor> CappedRecordStoreV1::getCursor(OperationContext* txn,
- bool forward) const {
-
- return stdx::make_unique<CappedRecordStoreV1Iterator>(txn, this, forward);
- }
+std::unique_ptr<RecordCursor> CappedRecordStoreV1::getCursor(OperationContext* txn,
+ bool forward) const {
+ return stdx::make_unique<CappedRecordStoreV1Iterator>(txn, this, forward);
+}
- vector<std::unique_ptr<RecordCursor>> CappedRecordStoreV1::getManyCursors(
- OperationContext* txn) const {
- vector<std::unique_ptr<RecordCursor>> cursors;
+vector<std::unique_ptr<RecordCursor>> CappedRecordStoreV1::getManyCursors(
+ OperationContext* txn) const {
+ vector<std::unique_ptr<RecordCursor>> cursors;
- if (!_details->capLooped()) {
- // if we haven't looped yet, just spit out all extents (same as non-capped impl)
- const Extent* ext;
- for (DiskLoc extLoc = details()->firstExtent(txn); !extLoc.isNull(); extLoc = ext->xnext) {
- ext = _getExtent(txn, extLoc);
- if (ext->firstRecord.isNull())
- continue;
+ if (!_details->capLooped()) {
+ // if we haven't looped yet, just spit out all extents (same as non-capped impl)
+ const Extent* ext;
+ for (DiskLoc extLoc = details()->firstExtent(txn); !extLoc.isNull(); extLoc = ext->xnext) {
+ ext = _getExtent(txn, extLoc);
+ if (ext->firstRecord.isNull())
+ continue;
- cursors.push_back(stdx::make_unique<RecordStoreV1Base::IntraExtentIterator>(
- txn, ext->firstRecord, this));
- }
+ cursors.push_back(stdx::make_unique<RecordStoreV1Base::IntraExtentIterator>(
+ txn, ext->firstRecord, this));
}
- else {
- // if we've looped we need to iterate the extents, starting and ending with the
- // capExtent
- const DiskLoc capExtent = details()->capExtent();
- invariant(!capExtent.isNull());
- invariant(capExtent.isValid());
-
- // First do the "old" portion of capExtent if there is any
- DiskLoc extLoc = capExtent;
- {
- const Extent* ext = _getExtent(txn, extLoc);
- if (ext->firstRecord != details()->capFirstNewRecord()) {
- // this means there is old data in capExtent
- cursors.push_back(stdx::make_unique<RecordStoreV1Base::IntraExtentIterator>(
- txn, ext->firstRecord, this));
- }
-
- extLoc = ext->xnext.isNull() ? details()->firstExtent(txn) : ext->xnext;
- }
-
- // Next handle all the other extents
- while (extLoc != capExtent) {
- const Extent* ext = _getExtent(txn, extLoc);
+ } else {
+ // if we've looped we need to iterate the extents, starting and ending with the
+ // capExtent
+ const DiskLoc capExtent = details()->capExtent();
+ invariant(!capExtent.isNull());
+ invariant(capExtent.isValid());
+
+ // First do the "old" portion of capExtent if there is any
+ DiskLoc extLoc = capExtent;
+ {
+ const Extent* ext = _getExtent(txn, extLoc);
+ if (ext->firstRecord != details()->capFirstNewRecord()) {
+ // this means there is old data in capExtent
cursors.push_back(stdx::make_unique<RecordStoreV1Base::IntraExtentIterator>(
- txn, ext->firstRecord, this));
-
- extLoc = ext->xnext.isNull() ? details()->firstExtent(txn) : ext->xnext;
+ txn, ext->firstRecord, this));
}
- // Finally handle the "new" data in the capExtent
+ extLoc = ext->xnext.isNull() ? details()->firstExtent(txn) : ext->xnext;
+ }
+
+ // Next handle all the other extents
+ while (extLoc != capExtent) {
+ const Extent* ext = _getExtent(txn, extLoc);
cursors.push_back(stdx::make_unique<RecordStoreV1Base::IntraExtentIterator>(
- txn, details()->capFirstNewRecord(), this));
+ txn, ext->firstRecord, this));
+
+ extLoc = ext->xnext.isNull() ? details()->firstExtent(txn) : ext->xnext;
}
- return cursors;
+ // Finally handle the "new" data in the capExtent
+ cursors.push_back(stdx::make_unique<RecordStoreV1Base::IntraExtentIterator>(
+ txn, details()->capFirstNewRecord(), this));
}
- void CappedRecordStoreV1::_maybeComplain( OperationContext* txn, int len ) const {
- RARELY {
- std::stringstream buf;
- buf << "couldn't make room for record len: " << len << " in capped ns " << _ns << '\n';
- buf << "numRecords: " << numRecords(txn) << '\n';
- int i = 0;
- for ( DiskLoc e = _details->firstExtent(txn);
- !e.isNull();
- e = _extentManager->getExtent( e )->xnext, ++i ) {
- buf << " Extent " << i;
- if ( e == _details->capExtent() )
- buf << " (capExtent)";
- buf << ' ' << e;
- buf << '\n';
-
- buf << " magic: " << hex << _extentManager->getExtent( e )->magic << dec
- << " extent->ns: " << _extentManager->getExtent( e )->nsDiagnostic.toString()
- << '\n';
- buf << " fr: " << _extentManager->getExtent( e )->firstRecord.toString()
- << " lr: " << _extentManager->getExtent( e )->lastRecord.toString()
- << " extent->len: " << _extentManager->getExtent( e )->length << '\n';
- }
-
- warning() << buf.str();
+ return cursors;
+}
-            // assume it is an unusually large record; if not, something is broken
- fassert( 17438, len * 5 > _details->lastExtentSize(txn) );
+void CappedRecordStoreV1::_maybeComplain(OperationContext* txn, int len) const {
+ RARELY {
+ std::stringstream buf;
+ buf << "couldn't make room for record len: " << len << " in capped ns " << _ns << '\n';
+ buf << "numRecords: " << numRecords(txn) << '\n';
+ int i = 0;
+ for (DiskLoc e = _details->firstExtent(txn); !e.isNull();
+ e = _extentManager->getExtent(e)->xnext, ++i) {
+ buf << " Extent " << i;
+ if (e == _details->capExtent())
+ buf << " (capExtent)";
+ buf << ' ' << e;
+ buf << '\n';
+
+ buf << " magic: " << hex << _extentManager->getExtent(e)->magic << dec
+ << " extent->ns: " << _extentManager->getExtent(e)->nsDiagnostic.toString() << '\n';
+ buf << " fr: " << _extentManager->getExtent(e)->firstRecord.toString()
+ << " lr: " << _extentManager->getExtent(e)->lastRecord.toString()
+ << " extent->len: " << _extentManager->getExtent(e)->length << '\n';
}
- }
-
- DiskLoc CappedRecordStoreV1::firstRecord( OperationContext* txn,
- const DiskLoc &startExtent ) const {
- for (DiskLoc i = startExtent.isNull() ? _details->firstExtent(txn) : startExtent;
- !i.isNull();
- i = _extentManager->getExtent( i )->xnext ) {
- Extent* e = _extentManager->getExtent( i );
+ warning() << buf.str();
- if ( !e->firstRecord.isNull() )
- return e->firstRecord;
- }
- return DiskLoc();
+        // assume it is an unusually large record; if not, something is broken
+ fassert(17438, len * 5 > _details->lastExtentSize(txn));
}
+}
- DiskLoc CappedRecordStoreV1::lastRecord( OperationContext* txn,
- const DiskLoc &startExtent ) const {
- for (DiskLoc i = startExtent.isNull() ? _details->lastExtent(txn) : startExtent;
- !i.isNull();
- i = _extentManager->getExtent( i )->xprev ) {
+DiskLoc CappedRecordStoreV1::firstRecord(OperationContext* txn, const DiskLoc& startExtent) const {
+ for (DiskLoc i = startExtent.isNull() ? _details->firstExtent(txn) : startExtent; !i.isNull();
+ i = _extentManager->getExtent(i)->xnext) {
+ Extent* e = _extentManager->getExtent(i);
- Extent* e = _extentManager->getExtent( i );
- if ( !e->lastRecord.isNull() )
- return e->lastRecord;
- }
- return DiskLoc();
+ if (!e->firstRecord.isNull())
+ return e->firstRecord;
}
+ return DiskLoc();
+}
+DiskLoc CappedRecordStoreV1::lastRecord(OperationContext* txn, const DiskLoc& startExtent) const {
+ for (DiskLoc i = startExtent.isNull() ? _details->lastExtent(txn) : startExtent; !i.isNull();
+ i = _extentManager->getExtent(i)->xprev) {
+ Extent* e = _extentManager->getExtent(i);
+ if (!e->lastRecord.isNull())
+ return e->lastRecord;
+ }
+ return DiskLoc();
+}
}
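
(Aside on the accessors above: a capped collection repurposes the first two
slots of the ordinary deleted-record bucket array as dedicated pointers.
Slot 0 is the head of its single deleted list; slot 1 remembers the last
deleted record of the extent before capExtent. The enum names here are
hypothetical; only the indices come from the code:)

    // Capped stores reuse deletedListEntry(0) and deletedListEntry(1).
    enum CappedDeletedListSlot {
        kAllDeletedRecordsHead = 0,   // cappedListOfAllDeletedRecords()
        kLastDelRecOfPrevExtent = 1,  // cappedLastDelRecLastExtent()
    };
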
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped.h b/src/mongo/db/storage/mmap_v1/record_store_v1_capped.h
index 186de786f37..83105fe8ff9 100644
--- a/src/mongo/db/storage/mmap_v1/record_store_v1_capped.h
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_capped.h
@@ -38,95 +38,92 @@
namespace mongo {
- class CappedRecordStoreV1 final : public RecordStoreV1Base {
- public:
- CappedRecordStoreV1( OperationContext* txn,
- CappedDocumentDeleteCallback* collection,
- StringData ns,
- RecordStoreV1MetaData* details,
- ExtentManager* em,
- bool isSystemIndexes );
-
- ~CappedRecordStoreV1() final;
-
- const char* name() const final { return "CappedRecordStoreV1"; }
-
- Status truncate(OperationContext* txn) final;
-
- /**
- * Truncate documents newer than the document at 'end' from the capped
- * collection. The collection cannot be completely emptied using this
- * function. An assertion will be thrown if that is attempted.
- * @param inclusive - Truncate 'end' as well iff true
- * XXX: this will go away soon, just needed to move for now
- */
- void temp_cappedTruncateAfter(OperationContext* txn, RecordId end, bool inclusive) final;
-
- std::unique_ptr<RecordCursor> getCursor(OperationContext* txn, bool forward) const final;
-
- std::vector<std::unique_ptr<RecordCursor>> getManyCursors(
- OperationContext* txn) const final;
-
- // Start from firstExtent by default.
- DiskLoc firstRecord( OperationContext* txn,
- const DiskLoc &startExtent = DiskLoc() ) const;
- // Start from lastExtent by default.
- DiskLoc lastRecord( OperationContext* txn,
- const DiskLoc &startExtent = DiskLoc() ) const;
-
- protected:
-
- bool isCapped() const final { return true; }
- bool shouldPadInserts() const final { return false; }
-
- void setCappedDeleteCallback( CappedDocumentDeleteCallback* cb ) final {
- _deleteCallback = cb;
- }
-
- StatusWith<DiskLoc> allocRecord( OperationContext* txn,
- int lengthWithHeaders,
- bool enforceQuota ) final;
-
- void addDeletedRec(OperationContext* txn, const DiskLoc& dloc) final;
-
- private:
- // -- start copy from cap.cpp --
- void _compact(OperationContext* txn);
- DiskLoc cappedFirstDeletedInCurExtent() const;
- void setFirstDeletedInCurExtent( OperationContext* txn, const DiskLoc& loc );
- void cappedCheckMigrate(OperationContext* txn);
- DiskLoc __capAlloc( OperationContext* txn, int len );
- bool inCapExtent( const DiskLoc &dl ) const;
- DiskLoc cappedListOfAllDeletedRecords() const;
- DiskLoc cappedLastDelRecLastExtent() const;
- void setListOfAllDeletedRecords( OperationContext* txn, const DiskLoc& loc );
- void setLastDelRecLastExtent( OperationContext* txn, const DiskLoc& loc );
- Extent *theCapExtent() const;
- bool nextIsInCapExtent( const DiskLoc &dl ) const;
- void advanceCapExtent( OperationContext* txn, StringData ns );
- void cappedTruncateLastDelUpdate(OperationContext* txn);
-
- /**
- * Truncate documents newer than the document at 'end' from the capped
- * collection. The collection cannot be completely emptied using this
- * function. An assertion will be thrown if that is attempted.
- * @param inclusive - Truncate 'end' as well iff true
- */
- void cappedTruncateAfter(OperationContext* txn,
- const char* ns,
- DiskLoc end,
- bool inclusive);
-
- void _maybeComplain( OperationContext* txn, int len ) const;
-
- // -- end copy from cap.cpp --
-
- CappedDocumentDeleteCallback* _deleteCallback;
-
- OwnedPointerVector<ExtentManager::CacheHint> _extentAdvice;
-
- friend class CappedRecordStoreV1Iterator;
- };
-
-
+class CappedRecordStoreV1 final : public RecordStoreV1Base {
+public:
+ CappedRecordStoreV1(OperationContext* txn,
+ CappedDocumentDeleteCallback* collection,
+ StringData ns,
+ RecordStoreV1MetaData* details,
+ ExtentManager* em,
+ bool isSystemIndexes);
+
+ ~CappedRecordStoreV1() final;
+
+ const char* name() const final {
+ return "CappedRecordStoreV1";
+ }
+
+ Status truncate(OperationContext* txn) final;
+
+ /**
+ * Truncate documents newer than the document at 'end' from the capped
+ * collection. The collection cannot be completely emptied using this
+ * function. An assertion will be thrown if that is attempted.
+ * @param inclusive - Truncate 'end' as well iff true
+ * XXX: this will go away soon, just needed to move for now
+ */
+ void temp_cappedTruncateAfter(OperationContext* txn, RecordId end, bool inclusive) final;
+
+ std::unique_ptr<RecordCursor> getCursor(OperationContext* txn, bool forward) const final;
+
+ std::vector<std::unique_ptr<RecordCursor>> getManyCursors(OperationContext* txn) const final;
+
+ // Start from firstExtent by default.
+ DiskLoc firstRecord(OperationContext* txn, const DiskLoc& startExtent = DiskLoc()) const;
+ // Start from lastExtent by default.
+ DiskLoc lastRecord(OperationContext* txn, const DiskLoc& startExtent = DiskLoc()) const;
+
+protected:
+ bool isCapped() const final {
+ return true;
+ }
+ bool shouldPadInserts() const final {
+ return false;
+ }
+
+ void setCappedDeleteCallback(CappedDocumentDeleteCallback* cb) final {
+ _deleteCallback = cb;
+ }
+
+ StatusWith<DiskLoc> allocRecord(OperationContext* txn,
+ int lengthWithHeaders,
+ bool enforceQuota) final;
+
+ void addDeletedRec(OperationContext* txn, const DiskLoc& dloc) final;
+
+private:
+ // -- start copy from cap.cpp --
+ void _compact(OperationContext* txn);
+ DiskLoc cappedFirstDeletedInCurExtent() const;
+ void setFirstDeletedInCurExtent(OperationContext* txn, const DiskLoc& loc);
+ void cappedCheckMigrate(OperationContext* txn);
+ DiskLoc __capAlloc(OperationContext* txn, int len);
+ bool inCapExtent(const DiskLoc& dl) const;
+ DiskLoc cappedListOfAllDeletedRecords() const;
+ DiskLoc cappedLastDelRecLastExtent() const;
+ void setListOfAllDeletedRecords(OperationContext* txn, const DiskLoc& loc);
+ void setLastDelRecLastExtent(OperationContext* txn, const DiskLoc& loc);
+ Extent* theCapExtent() const;
+ bool nextIsInCapExtent(const DiskLoc& dl) const;
+ void advanceCapExtent(OperationContext* txn, StringData ns);
+ void cappedTruncateLastDelUpdate(OperationContext* txn);
+
+ /**
+ * Truncate documents newer than the document at 'end' from the capped
+ * collection. The collection cannot be completely emptied using this
+ * function. An assertion will be thrown if that is attempted.
+ * @param inclusive - Truncate 'end' as well iff true
+ */
+ void cappedTruncateAfter(OperationContext* txn, const char* ns, DiskLoc end, bool inclusive);
+
+ void _maybeComplain(OperationContext* txn, int len) const;
+
+ // -- end copy from cap.cpp --
+
+ CappedDocumentDeleteCallback* _deleteCallback;
+
+ OwnedPointerVector<ExtentManager::CacheHint> _extentAdvice;
+
+ friend class CappedRecordStoreV1Iterator;
+};
}
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.cpp
index ea77d224488..353a7f39c0c 100644
--- a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.cpp
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.cpp
@@ -36,181 +36,181 @@
namespace mongo {
- //
- // Capped collection traversal
- //
- CappedRecordStoreV1Iterator::CappedRecordStoreV1Iterator(OperationContext* txn,
- const CappedRecordStoreV1* collection,
- bool forward)
- : _txn(txn), _recordStore(collection), _forward(forward) {
-
- const RecordStoreV1MetaData* nsd = _recordStore->details();
-
- // If a start position isn't specified, we fill one out from the start of the
- // collection.
- if (_forward) {
- // Going forwards.
- if (!nsd->capLooped()) {
- // If our capped collection doesn't loop around, the first record is easy.
- _curr = collection->firstRecord(_txn);
- }
- else {
-                // Our capped collection has "looped" around.
- // Copied verbatim from ForwardCappedCursor::init.
- // TODO ELABORATE
- _curr = _getExtent( nsd->capExtent() )->firstRecord;
- if (!_curr.isNull() && _curr == nsd->capFirstNewRecord()) {
- _curr = _getExtent( nsd->capExtent() )->lastRecord;
- _curr = nextLoop(_curr);
- }
- }
- }
- else {
- // Going backwards
- if (!nsd->capLooped()) {
- // Start at the end.
- _curr = collection->lastRecord(_txn);
- }
- else {
- _curr = _getExtent( nsd->capExtent() )->lastRecord;
+//
+// Capped collection traversal
+//
+CappedRecordStoreV1Iterator::CappedRecordStoreV1Iterator(OperationContext* txn,
+ const CappedRecordStoreV1* collection,
+ bool forward)
+ : _txn(txn), _recordStore(collection), _forward(forward) {
+ const RecordStoreV1MetaData* nsd = _recordStore->details();
+
+ // If a start position isn't specified, we fill one out from the start of the
+ // collection.
+ if (_forward) {
+ // Going forwards.
+ if (!nsd->capLooped()) {
+ // If our capped collection doesn't loop around, the first record is easy.
+ _curr = collection->firstRecord(_txn);
+ } else {
+            // Our capped collection has "looped" around.
+ // Copied verbatim from ForwardCappedCursor::init.
+ // TODO ELABORATE
+ _curr = _getExtent(nsd->capExtent())->firstRecord;
+ if (!_curr.isNull() && _curr == nsd->capFirstNewRecord()) {
+ _curr = _getExtent(nsd->capExtent())->lastRecord;
+ _curr = nextLoop(_curr);
}
}
- }
-
- boost::optional<Record> CappedRecordStoreV1Iterator::next() {
- if (isEOF()) return {};
- auto toReturn = _curr.toRecordId();
- _curr = getNextCapped(_curr);
- return {{toReturn, _recordStore->RecordStore::dataFor(_txn, toReturn)}};
- }
-
- boost::optional<Record> CappedRecordStoreV1Iterator::seekExact(const RecordId& id) {
- _curr = getNextCapped(DiskLoc::fromRecordId(id));
- return {{id, _recordStore->RecordStore::dataFor(_txn, id)}};
- }
-
- void CappedRecordStoreV1Iterator::invalidate(const RecordId& id) {
- const DiskLoc dl = DiskLoc::fromRecordId(id);
- if (dl == _curr) {
- // We *could* move to the next thing, since there is actually a next
- // thing, but according to clientcursor.cpp:
- // "note we cannot advance here. if this condition occurs, writes to the oplog
- // have "caught" the reader. skipping ahead, the reader would miss potentially
- // important data."
- _curr = DiskLoc();
- _killedByInvalidate = true;
+ } else {
+ // Going backwards
+ if (!nsd->capLooped()) {
+ // Start at the end.
+ _curr = collection->lastRecord(_txn);
+ } else {
+ _curr = _getExtent(nsd->capExtent())->lastRecord;
}
}
-
- void CappedRecordStoreV1Iterator::savePositioned() {
- _txn = nullptr;
+}
+
+boost::optional<Record> CappedRecordStoreV1Iterator::next() {
+ if (isEOF())
+ return {};
+ auto toReturn = _curr.toRecordId();
+ _curr = getNextCapped(_curr);
+ return {{toReturn, _recordStore->RecordStore::dataFor(_txn, toReturn)}};
+}
+
+boost::optional<Record> CappedRecordStoreV1Iterator::seekExact(const RecordId& id) {
+ _curr = getNextCapped(DiskLoc::fromRecordId(id));
+ return {{id, _recordStore->RecordStore::dataFor(_txn, id)}};
+}
+
+void CappedRecordStoreV1Iterator::invalidate(const RecordId& id) {
+ const DiskLoc dl = DiskLoc::fromRecordId(id);
+ if (dl == _curr) {
+ // We *could* move to the next thing, since there is actually a next
+ // thing, but according to clientcursor.cpp:
+ // "note we cannot advance here. if this condition occurs, writes to the oplog
+ // have "caught" the reader. skipping ahead, the reader would miss potentially
+ // important data."
+ _curr = DiskLoc();
+ _killedByInvalidate = true;
}
+}
- bool CappedRecordStoreV1Iterator::restore(OperationContext* txn) {
- _txn = txn;
- return !_killedByInvalidate;
- }
+void CappedRecordStoreV1Iterator::savePositioned() {
+ _txn = nullptr;
+}
- DiskLoc CappedRecordStoreV1Iterator::getNextCapped(const DiskLoc& dl) {
- invariant(!dl.isNull());
- const RecordStoreV1MetaData* details = _recordStore->details();
+bool CappedRecordStoreV1Iterator::restore(OperationContext* txn) {
+ _txn = txn;
+ return !_killedByInvalidate;
+}
- if (_forward) {
- // If it's not looped, it's easy.
- if (!_recordStore->details()->capLooped()) {
- return _getNextRecord( dl );
- }
+DiskLoc CappedRecordStoreV1Iterator::getNextCapped(const DiskLoc& dl) {
+ invariant(!dl.isNull());
+ const RecordStoreV1MetaData* details = _recordStore->details();
- // TODO ELABORATE
- // EOF.
- if (dl == _getExtent( details->capExtent() )->lastRecord) {
- return DiskLoc();
- }
+ if (_forward) {
+ // If it's not looped, it's easy.
+ if (!_recordStore->details()->capLooped()) {
+ return _getNextRecord(dl);
+ }
- DiskLoc ret = nextLoop(dl);
+ // TODO ELABORATE
+ // EOF.
+ if (dl == _getExtent(details->capExtent())->lastRecord) {
+ return DiskLoc();
+ }
- // If we become capFirstNewRecord from same extent, advance to next extent.
- if (ret == details->capFirstNewRecord() && ret != _getExtent( details->capExtent() )->firstRecord) {
- ret = nextLoop(_getExtent( details->capExtent() )->lastRecord);
- }
+ DiskLoc ret = nextLoop(dl);
-        // If we have just gotten to the beginning of capExtent, skip to capFirstNewRecord
- if (ret == _getExtent( details->capExtent() )->firstRecord) { ret = details->capFirstNewRecord(); }
+ // If we become capFirstNewRecord from same extent, advance to next extent.
+ if (ret == details->capFirstNewRecord() &&
+ ret != _getExtent(details->capExtent())->firstRecord) {
+ ret = nextLoop(_getExtent(details->capExtent())->lastRecord);
+ }
- return ret;
+        // If we have just gotten to the beginning of capExtent, skip to capFirstNewRecord
+ if (ret == _getExtent(details->capExtent())->firstRecord) {
+ ret = details->capFirstNewRecord();
}
- else {
- if (!details->capLooped()) { return _getPrevRecord( dl ); }
- // TODO ELABORATE
- // Last record
- if (details->capFirstNewRecord() == _getExtent( details->capExtent() )->firstRecord) {
- if (dl == nextLoop(_getExtent( details->capExtent() )->lastRecord)) {
- return DiskLoc();
- }
- }
- else {
- if (dl == _getExtent( details->capExtent() )->firstRecord) { return DiskLoc(); }
- }
+ return ret;
+ } else {
+ if (!details->capLooped()) {
+ return _getPrevRecord(dl);
+ }
- DiskLoc ret;
- // If we are capFirstNewRecord, advance to prev extent, otherwise just get prev.
- if (dl == details->capFirstNewRecord()) {
- ret = prevLoop(_getExtent( details->capExtent() )->firstRecord);
- }
- else {
- ret = prevLoop(dl);
+ // TODO ELABORATE
+ // Last record
+ if (details->capFirstNewRecord() == _getExtent(details->capExtent())->firstRecord) {
+ if (dl == nextLoop(_getExtent(details->capExtent())->lastRecord)) {
+ return DiskLoc();
}
-
- // If we just became last in cap extent, advance past capFirstNewRecord
- // (We know ext(capExtent)->firstRecord != capFirstNewRecord, since would
- // have returned DiskLoc() earlier otherwise.)
- if (ret == _getExtent( details->capExtent() )->lastRecord) {
- ret = _getPrevRecord( details->capFirstNewRecord() );
+ } else {
+ if (dl == _getExtent(details->capExtent())->firstRecord) {
+ return DiskLoc();
}
+ }
- return ret;
+ DiskLoc ret;
+ // If we are capFirstNewRecord, advance to prev extent, otherwise just get prev.
+ if (dl == details->capFirstNewRecord()) {
+ ret = prevLoop(_getExtent(details->capExtent())->firstRecord);
+ } else {
+ ret = prevLoop(dl);
}
- }
- DiskLoc CappedRecordStoreV1Iterator::nextLoop(const DiskLoc& prev) {
- // TODO ELABORATE
- DiskLoc next = _getNextRecord( prev );
- if (!next.isNull()) {
- return next;
+ // If we just became last in cap extent, advance past capFirstNewRecord
+ // (We know ext(capExtent)->firstRecord != capFirstNewRecord, since would
+ // have returned DiskLoc() earlier otherwise.)
+ if (ret == _getExtent(details->capExtent())->lastRecord) {
+ ret = _getPrevRecord(details->capFirstNewRecord());
}
- return _recordStore->firstRecord(_txn);
+
+ return ret;
}
+}
- DiskLoc CappedRecordStoreV1Iterator::prevLoop(const DiskLoc& curr) {
- // TODO ELABORATE
- DiskLoc prev = _getPrevRecord( curr );
- if (!prev.isNull()) {
- return prev;
- }
- return _recordStore->lastRecord(_txn);
+DiskLoc CappedRecordStoreV1Iterator::nextLoop(const DiskLoc& prev) {
+ // TODO ELABORATE
+ DiskLoc next = _getNextRecord(prev);
+ if (!next.isNull()) {
+ return next;
+ }
+ return _recordStore->firstRecord(_txn);
+}
+
+DiskLoc CappedRecordStoreV1Iterator::prevLoop(const DiskLoc& curr) {
+ // TODO ELABORATE
+ DiskLoc prev = _getPrevRecord(curr);
+ if (!prev.isNull()) {
+ return prev;
}
+ return _recordStore->lastRecord(_txn);
+}
- Extent* CappedRecordStoreV1Iterator::_getExtent( const DiskLoc& loc ) {
- return _recordStore->_extentManager->getExtent( loc );
- }
+Extent* CappedRecordStoreV1Iterator::_getExtent(const DiskLoc& loc) {
+ return _recordStore->_extentManager->getExtent(loc);
+}
- DiskLoc CappedRecordStoreV1Iterator::_getNextRecord( const DiskLoc& loc ) {
- return _recordStore->getNextRecord( _txn, loc );
- }
+DiskLoc CappedRecordStoreV1Iterator::_getNextRecord(const DiskLoc& loc) {
+ return _recordStore->getNextRecord(_txn, loc);
+}
- DiskLoc CappedRecordStoreV1Iterator::_getPrevRecord( const DiskLoc& loc ) {
- return _recordStore->getPrevRecord( _txn, loc );
- }
+DiskLoc CappedRecordStoreV1Iterator::_getPrevRecord(const DiskLoc& loc) {
+ return _recordStore->getPrevRecord(_txn, loc);
+}
- std::unique_ptr<RecordFetcher> CappedRecordStoreV1Iterator::fetcherForNext() const {
- return _recordStore->_extentManager->recordNeedsFetch(_curr);
- }
+std::unique_ptr<RecordFetcher> CappedRecordStoreV1Iterator::fetcherForNext() const {
+ return _recordStore->_extentManager->recordNeedsFetch(_curr);
+}
- std::unique_ptr<RecordFetcher> CappedRecordStoreV1Iterator::fetcherForId(
- const RecordId& id) const {
- return _recordStore->_extentManager->recordNeedsFetch(DiskLoc::fromRecordId(id));
- }
+std::unique_ptr<RecordFetcher> CappedRecordStoreV1Iterator::fetcherForId(const RecordId& id) const {
+ return _recordStore->_extentManager->recordNeedsFetch(DiskLoc::fromRecordId(id));
+}
} // namespace mongo
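
(Aside: nextLoop()/prevLoop() above implement the wrap-around: advance
normally, and when the extent chain runs out, restart from the collection's
first (or last) record. The same shape over plain indices, purely for
intuition:)

    // Wrap-around successor over slots 0..n-1, shaped like nextLoop().
    int nextLoopIndex(int i, int n) {
        int next = i + 1;  // _getNextRecord(prev)
        if (next < n)
            return next;   // still inside the chain
        return 0;          // wrapped: firstRecord(...)
    }
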
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h
index de2b6fda5e3..0a366d9921a 100644
--- a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h
@@ -33,58 +33,60 @@
namespace mongo {
- class CappedRecordStoreV1;
+class CappedRecordStoreV1;
- struct Extent;
+struct Extent;
+
+/**
+ * This class iterates over a capped collection identified by 'ns'.
+ * The collection must exist when the constructor is called.
+ */
+class CappedRecordStoreV1Iterator final : public RecordCursor {
+public:
+ CappedRecordStoreV1Iterator(OperationContext* txn,
+ const CappedRecordStoreV1* collection,
+ bool forward);
+
+ boost::optional<Record> next() final;
+ boost::optional<Record> seekExact(const RecordId& id) final;
+ void savePositioned() final;
+ bool restore(OperationContext* txn) final;
+ void invalidate(const RecordId& dl) final;
+ std::unique_ptr<RecordFetcher> fetcherForNext() const final;
+ std::unique_ptr<RecordFetcher> fetcherForId(const RecordId& id) const final;
+
+private:
+ void advance();
+ bool isEOF() {
+ return _curr.isNull();
+ }
/**
- * This class iterates over a capped collection identified by 'ns'.
- * The collection must exist when the constructor is called.
+ * Internal collection navigation helper methods.
*/
- class CappedRecordStoreV1Iterator final : public RecordCursor {
- public:
- CappedRecordStoreV1Iterator( OperationContext* txn,
- const CappedRecordStoreV1* collection,
- bool forward );
-
- boost::optional<Record> next() final;
- boost::optional<Record> seekExact(const RecordId& id) final;
- void savePositioned() final;
- bool restore(OperationContext* txn) final;
- void invalidate(const RecordId& dl) final;
- std::unique_ptr<RecordFetcher> fetcherForNext() const final;
- std::unique_ptr<RecordFetcher> fetcherForId(const RecordId& id) const final;
-
- private:
- void advance();
- bool isEOF() { return _curr.isNull(); }
-
- /**
- * Internal collection navigation helper methods.
- */
- DiskLoc getNextCapped(const DiskLoc& dl);
- DiskLoc prevLoop(const DiskLoc& curr);
- DiskLoc nextLoop(const DiskLoc& prev);
-
-        // some helpers - these should probably move to RecordStore
- Extent* _getExtent( const DiskLoc& loc );
- DiskLoc _getNextRecord( const DiskLoc& loc );
- DiskLoc _getPrevRecord( const DiskLoc& loc );
-
- // transactional context for read locks. Not owned by us
- OperationContext* _txn;
-
- // The collection we're iterating over.
- const CappedRecordStoreV1* const _recordStore;
-
- // The result returned on the next call to getNext().
- DiskLoc _curr;
-
- const bool _forward;
-
- // If invalidate kills the DiskLoc we need to move forward, we kill the iterator. See the
- // comment in the body of invalidate(...).
- bool _killedByInvalidate = false;
- };
+ DiskLoc getNextCapped(const DiskLoc& dl);
+ DiskLoc prevLoop(const DiskLoc& curr);
+ DiskLoc nextLoop(const DiskLoc& prev);
+
+    // some helpers - these should probably move to RecordStore
+ Extent* _getExtent(const DiskLoc& loc);
+ DiskLoc _getNextRecord(const DiskLoc& loc);
+ DiskLoc _getPrevRecord(const DiskLoc& loc);
+
+ // transactional context for read locks. Not owned by us
+ OperationContext* _txn;
+
+ // The collection we're iterating over.
+ const CappedRecordStoreV1* const _recordStore;
+
+ // The result returned on the next call to getNext().
+ DiskLoc _curr;
+
+ const bool _forward;
+
+ // If invalidate kills the DiskLoc we need to move forward, we kill the iterator. See the
+ // comment in the body of invalidate(...).
+ bool _killedByInvalidate = false;
+};
} // namespace mongo
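
(Aside: the save/restore contract visible in this header: savePositioned()
detaches the cursor from its OperationContext, and restore() reattaches one,
returning false once invalidate() has killed the current position. A
hypothetical caller, sketched against the interface above rather than as a
compilable translation unit:)

    // Illustrative yield-and-resume around the cursor protocol.
    void yieldAndResume(mongo::RecordCursor* cursor, mongo::OperationContext* txn) {
        cursor->savePositioned();  // detach before dropping locks
        // ... locks released; writers may invalidate our position here ...
        if (!cursor->restore(txn)) {
            // position was invalidated while yielded; do not advance
            return;
        }
        // safe to keep calling next()
    }
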
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_test.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_test.cpp
index 0c369587f9b..1089d243467 100644
--- a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_test.cpp
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_test.cpp
@@ -42,773 +42,671 @@ using namespace mongo;
namespace {
- using std::string;
- using std::vector;
-
- // Provides data to be inserted. Must be large enough for largest possible record.
- // Should be in BSS so unused portions should be free.
- char zeros[20*1024*1024] = {};
-
- class DummyCappedDocumentDeleteCallback : public CappedDocumentDeleteCallback {
- public:
- Status aboutToDeleteCapped( OperationContext* txn, const RecordId& loc, RecordData data) {
- deleted.push_back( DiskLoc::fromRecordId(loc) );
- return Status::OK();
- }
- vector<DiskLoc> deleted;
- };
-
- void simpleInsertTest( const char* buf, int size ) {
-
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
- DummyCappedDocumentDeleteCallback cb;
-
- string myns = "test.simple1";
- CappedRecordStoreV1 rs( &txn, &cb, myns, md, &em, false );
+using std::string;
+using std::vector;
+
+// Provides data to be inserted. Must be large enough for largest possible record.
+// Should be in BSS so unused portions should be free.
+char zeros[20 * 1024 * 1024] = {};
+
+class DummyCappedDocumentDeleteCallback : public CappedDocumentDeleteCallback {
+public:
+ Status aboutToDeleteCapped(OperationContext* txn, const RecordId& loc, RecordData data) {
+ deleted.push_back(DiskLoc::fromRecordId(loc));
+ return Status::OK();
+ }
+ vector<DiskLoc> deleted;
+};
- rs.increaseStorageSize( &txn, 1024, false );
+void simpleInsertTest(const char* buf, int size) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0);
+ DummyCappedDocumentDeleteCallback cb;
- ASSERT_NOT_OK( rs.insertRecord( &txn, buf, 3, 1000 ).getStatus() );
+ string myns = "test.simple1";
+ CappedRecordStoreV1 rs(&txn, &cb, myns, md, &em, false);
- rs.insertRecord( &txn, buf, size, 10000 );
+ rs.increaseStorageSize(&txn, 1024, false);
- {
- BSONObjBuilder b;
- int64_t storageSize = rs.storageSize( &txn, &b );
- BSONObj obj = b.obj();
- ASSERT_EQUALS( 1, obj["numExtents"].numberInt() );
- ASSERT_EQUALS( storageSize, em.quantizeExtentSize( 1024 ) );
- }
+ ASSERT_NOT_OK(rs.insertRecord(&txn, buf, 3, 1000).getStatus());
- for ( int i = 0; i < 1000; i++ ) {
- ASSERT_OK( rs.insertRecord( &txn, buf, size, 10000 ).getStatus() );
- }
+ rs.insertRecord(&txn, buf, size, 10000);
- long long start = md->numRecords();
- for ( int i = 0; i < 1000; i++ ) {
- ASSERT_OK( rs.insertRecord( &txn, buf, size, 10000 ).getStatus() );
- }
- ASSERT_EQUALS( start, md->numRecords() );
- ASSERT_GREATER_THAN( start, 100 );
- ASSERT_LESS_THAN( start, 1000 );
+ {
+ BSONObjBuilder b;
+ int64_t storageSize = rs.storageSize(&txn, &b);
+ BSONObj obj = b.obj();
+ ASSERT_EQUALS(1, obj["numExtents"].numberInt());
+ ASSERT_EQUALS(storageSize, em.quantizeExtentSize(1024));
}
- TEST(CappedRecordStoreV1, SimpleInsertSize4) {
- simpleInsertTest("abcd", 4);
- }
- TEST(CappedRecordStoreV1, SimpleInsertSize8) {
- simpleInsertTest("abcdefgh", 8);
+ for (int i = 0; i < 1000; i++) {
+ ASSERT_OK(rs.insertRecord(&txn, buf, size, 10000).getStatus());
}
- TEST(CappedRecordStoreV1, EmptySingleExtent) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
- DummyCappedDocumentDeleteCallback cb;
- CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
-
- {
- LocAndSize records[] = {
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1000), 1000},
- {}
- };
- md->setCapExtent(&txn, DiskLoc(0, 0));
- md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid());
- initializeV1RS(&txn, records, drecs, NULL, &em, md);
- }
-
- rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
-
- {
- LocAndSize recs[] = {
- {DiskLoc(0, 1000), 100},
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1100), 900},
- {}
- };
- assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
- ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
- ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc().setInvalid()); // unlooped
- }
+ long long start = md->numRecords();
+ for (int i = 0; i < 1000; i++) {
+ ASSERT_OK(rs.insertRecord(&txn, buf, size, 10000).getStatus());
}
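+ // Steady state: the capped store now recycles space, so another 1000 inserts must leave
+ // numRecords unchanged, and the count must have settled strictly between 100 and 1000.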
+ ASSERT_EQUALS(start, md->numRecords());
+ ASSERT_GREATER_THAN(start, 100);
+ ASSERT_LESS_THAN(start, 1000);
+}
- TEST(CappedRecordStoreV1, FirstLoopWithSingleExtentExactSize) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
- DummyCappedDocumentDeleteCallback cb;
- CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
-
- {
- LocAndSize records[] = {
- {DiskLoc(0, 1000), 100},
- {DiskLoc(0, 1100), 100},
- {DiskLoc(0, 1200), 100},
- {DiskLoc(0, 1300), 100},
- {DiskLoc(0, 1400), 100},
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1500), 50},
- {}
- };
- md->setCapExtent(&txn, DiskLoc(0, 0));
- md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); // unlooped
- initializeV1RS(&txn, records, drecs, NULL, &em, md);
- }
-
- rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
+TEST(CappedRecordStoreV1, SimpleInsertSize4) {
+ simpleInsertTest("abcd", 4);
+}
+TEST(CappedRecordStoreV1, SimpleInsertSize8) {
+ simpleInsertTest("abcdefgh", 8);
+}
- {
- LocAndSize recs[] = {
- {DiskLoc(0, 1200), 100}, // first old record
- {DiskLoc(0, 1300), 100},
- {DiskLoc(0, 1400), 100}, // last old record
- {DiskLoc(0, 1000), 100}, // first new record
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1100), 100}, // gap after newest record XXX this is probably a bug
- {DiskLoc(0, 1500), 50}, // gap at end of extent
- {}
- };
- assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
- ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
- ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000));
- }
+TEST(CappedRecordStoreV1, EmptySingleExtent) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0);
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+
+ {
+ LocAndSize records[] = {{}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1000), 1000}, {}};
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid());
+ initializeV1RS(&txn, records, drecs, NULL, &em, md);
}
- TEST(CappedRecordStoreV1, NonFirstLoopWithSingleExtentExactSize) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
- DummyCappedDocumentDeleteCallback cb;
- CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+ rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
- {
- LocAndSize records[] = {
- {DiskLoc(0, 1000), 100},
- {DiskLoc(0, 1100), 100},
- {DiskLoc(0, 1200), 100},
- {DiskLoc(0, 1300), 100},
- {DiskLoc(0, 1400), 100},
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1500), 50},
- {}
- };
- md->setCapExtent(&txn, DiskLoc(0, 0));
- md->setCapFirstNewRecord(&txn, DiskLoc(0, 1000));
- initializeV1RS(&txn, records, drecs, NULL, &em, md);
- }
-
- rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
-
- {
- LocAndSize recs[] = {
- {DiskLoc(0, 1200), 100}, // first old record
- {DiskLoc(0, 1300), 100},
- {DiskLoc(0, 1400), 100}, // last old record
- {DiskLoc(0, 1000), 100}, // first new record
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1100), 100}, // gap after newest record XXX this is probably a bug
- {DiskLoc(0, 1500), 50}, // gap at end of extent
- {}
- };
- assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
- ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
- ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000));
- }
+ {
+ LocAndSize recs[] = {{DiskLoc(0, 1000), 100}, {}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1100), 900}, {}};
+ assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
+ ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
+ ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc().setInvalid()); // unlooped
}
+}
- /**
- * Current code always tries to leave 24 bytes to create a DeletedRecord.
- */
- TEST(CappedRecordStoreV1, WillLoopWithout24SpareBytes) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
- DummyCappedDocumentDeleteCallback cb;
- CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+TEST(CappedRecordStoreV1, FirstLoopWithSingleExtentExactSize) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0);
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+
+ {
+ LocAndSize records[] = {{DiskLoc(0, 1000), 100},
+ {DiskLoc(0, 1100), 100},
+ {DiskLoc(0, 1200), 100},
+ {DiskLoc(0, 1300), 100},
+ {DiskLoc(0, 1400), 100},
+ {}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1500), 50}, {}};
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); // unlooped
+ initializeV1RS(&txn, records, drecs, NULL, &em, md);
+ }
- {
- LocAndSize records[] = {
- {DiskLoc(0, 1000), 100},
- {DiskLoc(0, 1100), 100},
- {DiskLoc(0, 1200), 100},
- {DiskLoc(0, 1300), 100},
- {DiskLoc(0, 1400), 100},
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1500), 123},
- {}
- };
- md->setCapExtent(&txn, DiskLoc(0, 0));
- md->setCapFirstNewRecord(&txn, DiskLoc(0, 1000));
- initializeV1RS(&txn, records, drecs, NULL, &em, md);
- }
+ rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
+
+ {
+ LocAndSize recs[] = {{DiskLoc(0, 1200), 100}, // first old record
+ {DiskLoc(0, 1300), 100},
+ {DiskLoc(0, 1400), 100}, // last old record
+ {DiskLoc(0, 1000), 100}, // first new record
+ {}};
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1100), 100}, // gap after newest record XXX this is probably a bug
+ {DiskLoc(0, 1500), 50}, // gap at end of extent
+ {}};
+ assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
+ ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
+ ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000));
+ }
+}
- rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
+TEST(CappedRecordStoreV1, NonFirstLoopWithSingleExtentExactSize) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0);
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+
+ {
+ LocAndSize records[] = {{DiskLoc(0, 1000), 100},
+ {DiskLoc(0, 1100), 100},
+ {DiskLoc(0, 1200), 100},
+ {DiskLoc(0, 1300), 100},
+ {DiskLoc(0, 1400), 100},
+ {}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1500), 50}, {}};
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc(0, 1000));
+ initializeV1RS(&txn, records, drecs, NULL, &em, md);
+ }
- {
- LocAndSize recs[] = {
- {DiskLoc(0, 1200), 100}, // first old record
- {DiskLoc(0, 1300), 100},
- {DiskLoc(0, 1400), 100}, // last old record
- {DiskLoc(0, 1000), 100}, // first new record
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1100), 100}, // gap after newest record
- {DiskLoc(0, 1500), 123}, // gap at end of extent
- {}
- };
- assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
- ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
- ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000));
- }
+ rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
+
+ {
+ LocAndSize recs[] = {{DiskLoc(0, 1200), 100}, // first old record
+ {DiskLoc(0, 1300), 100},
+ {DiskLoc(0, 1400), 100}, // last old record
+ {DiskLoc(0, 1000), 100}, // first new record
+ {}};
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1100), 100}, // gap after newest record XXX this is probably a bug
+ {DiskLoc(0, 1500), 50}, // gap at end of extent
+ {}};
+ assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
+ ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
+ ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000));
}
+}
- TEST(CappedRecordStoreV1, WontLoopWith24SpareBytes) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
- DummyCappedDocumentDeleteCallback cb;
- CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+/**
+ * Current code always tries to leave 24 bytes to create a DeletedRecord.
+ */
+TEST(CappedRecordStoreV1, WillLoopWithout24SpareBytes) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0);
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+
+ {
+ LocAndSize records[] = {{DiskLoc(0, 1000), 100},
+ {DiskLoc(0, 1100), 100},
+ {DiskLoc(0, 1200), 100},
+ {DiskLoc(0, 1300), 100},
+ {DiskLoc(0, 1400), 100},
+ {}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1500), 123}, {}};
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc(0, 1000));
+ initializeV1RS(&txn, records, drecs, NULL, &em, md);
+ }
- {
- LocAndSize records[] = {
- {DiskLoc(0, 1000), 100},
- {DiskLoc(0, 1100), 100},
- {DiskLoc(0, 1200), 100},
- {DiskLoc(0, 1300), 100},
- {DiskLoc(0, 1400), 100},
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1500), 124},
- {}
- };
- md->setCapExtent(&txn, DiskLoc(0, 0));
- md->setCapFirstNewRecord(&txn, DiskLoc(0, 1000));
- initializeV1RS(&txn, records, drecs, NULL, &em, md);
- }
+ rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
+
+ {
+ LocAndSize recs[] = {{DiskLoc(0, 1200), 100}, // first old record
+ {DiskLoc(0, 1300), 100},
+ {DiskLoc(0, 1400), 100}, // last old record
+ {DiskLoc(0, 1000), 100}, // first new record
+ {}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1100), 100}, // gap after newest record
+ {DiskLoc(0, 1500), 123}, // gap at end of extent
+ {}};
+ assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
+ ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
+ ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000));
+ }
+}
- rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
+TEST(CappedRecordStoreV1, WontLoopWith24SpareBytes) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0);
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+
+ {
+ LocAndSize records[] = {{DiskLoc(0, 1000), 100},
+ {DiskLoc(0, 1100), 100},
+ {DiskLoc(0, 1200), 100},
+ {DiskLoc(0, 1300), 100},
+ {DiskLoc(0, 1400), 100},
+ {}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1500), 124}, {}};
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc(0, 1000));
+ initializeV1RS(&txn, records, drecs, NULL, &em, md);
+ }
- {
- LocAndSize recs[] = {
- {DiskLoc(0, 1000), 100},
- {DiskLoc(0, 1100), 100},
- {DiskLoc(0, 1200), 100},
- {DiskLoc(0, 1300), 100},
- {DiskLoc(0, 1400), 100},
- {DiskLoc(0, 1500), 100},
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1600), 24}, // gap at end of extent
- {}
- };
- assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
- ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
- ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000));
- }
+ rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
+
+ {
+ LocAndSize recs[] = {{DiskLoc(0, 1000), 100},
+ {DiskLoc(0, 1100), 100},
+ {DiskLoc(0, 1200), 100},
+ {DiskLoc(0, 1300), 100},
+ {DiskLoc(0, 1400), 100},
+ {DiskLoc(0, 1500), 100},
+ {}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1600), 24}, // gap at end of extent
+ {}};
+ assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
+ ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
+ ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000));
}
+}
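// The two tests above pin down the 24-byte rule from the comment: carving a
// 100-byte record out of the 123-byte hole would strand 23 unusable bytes, so
// the allocator wraps, while the 124-byte hole leaves exactly enough for a
// DeletedRecord. A minimal sketch of that leftover check, using an
// illustrative helper name and standalone constant rather than the engine's
// actual identifiers:
namespace {
bool fitsWithoutLooping(int holeSize, int recLenWithHeader) {
    const int kMinDeletedRecSize = 24;  // smallest usable DeletedRecord stub
    const int leftover = holeSize - recLenWithHeader;
    // Either consume the hole exactly, or leave room for a full DeletedRecord;
    // 1 to 23 spare bytes would be orphaned, so the allocation is refused.
    return leftover == 0 || leftover >= kMinDeletedRecSize;
}
}  // namespace
// fitsWithoutLooping(123, 100) == false: WillLoopWithout24SpareBytes wraps.
// fitsWithoutLooping(124, 100) == true: WontLoopWith24SpareBytes appends at the end.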
- TEST(CappedRecordStoreV1, MoveToSecondExtentUnLooped) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
- DummyCappedDocumentDeleteCallback cb;
- CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+TEST(CappedRecordStoreV1, MoveToSecondExtentUnLooped) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0);
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+
+ {
+ // Two extents, each with 1000 bytes.
+ LocAndSize records[] = {
+ {DiskLoc(0, 1000), 500}, {DiskLoc(0, 1500), 300}, {DiskLoc(0, 1800), 100}, {}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1900), 100}, {DiskLoc(1, 1000), 1000}, {}};
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid());
+ initializeV1RS(&txn, records, drecs, NULL, &em, md);
+ }
- {
- // Two extents, each with 1000 bytes.
- LocAndSize records[] = {
- {DiskLoc(0, 1000), 500},
- {DiskLoc(0, 1500), 300},
- {DiskLoc(0, 1800), 100},
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1900), 100},
- {DiskLoc(1, 1000), 1000},
- {}
- };
- md->setCapExtent(&txn, DiskLoc(0, 0));
- md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid());
- initializeV1RS(&txn, records, drecs, NULL, &em, md);
- }
+ rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
+ {
+ LocAndSize recs[] = {{DiskLoc(0, 1000), 500},
+ {DiskLoc(0, 1500), 300},
+ {DiskLoc(0, 1800), 100},
- {
- LocAndSize recs[] = {
- {DiskLoc(0, 1000), 500},
- {DiskLoc(0, 1500), 300},
- {DiskLoc(0, 1800), 100},
-
- {DiskLoc(1, 1000), 100},
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1900), 100},
- {DiskLoc(1, 1100), 900},
- {}
- };
- assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
- ASSERT_EQUALS(md->capExtent(), DiskLoc(1, 0));
- ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc().setInvalid()); // unlooped
- }
+ {DiskLoc(1, 1000), 100},
+ {}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1900), 100}, {DiskLoc(1, 1100), 900}, {}};
+ assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
+ ASSERT_EQUALS(md->capExtent(), DiskLoc(1, 0));
+ ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc().setInvalid()); // unlooped
}
+}
- TEST(CappedRecordStoreV1, MoveToSecondExtentLooped) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
- DummyCappedDocumentDeleteCallback cb;
- CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+TEST(CappedRecordStoreV1, MoveToSecondExtentLooped) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0);
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+
+ {
+ // Two extents, each with 1000 bytes.
+ LocAndSize records[] = {{DiskLoc(0, 1800), 100}, // old
+ {DiskLoc(0, 1000), 500}, // first new
+ {DiskLoc(0, 1500), 400},
+
+ {DiskLoc(1, 1000), 300},
+ {DiskLoc(1, 1300), 600},
+ {}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1900), 100}, {DiskLoc(1, 1900), 100}, {}};
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc(0, 1000));
+ initializeV1RS(&txn, records, drecs, NULL, &em, md);
+ }
- {
- // Two extents, each with 1000 bytes.
- LocAndSize records[] = {
- {DiskLoc(0, 1800), 100}, // old
- {DiskLoc(0, 1000), 500}, // first new
- {DiskLoc(0, 1500), 400},
-
- {DiskLoc(1, 1000), 300},
- {DiskLoc(1, 1300), 600},
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1900), 100},
- {DiskLoc(1, 1900), 100},
- {}
- };
- md->setCapExtent(&txn, DiskLoc(0, 0));
- md->setCapFirstNewRecord(&txn, DiskLoc(0, 1000));
- initializeV1RS(&txn, records, drecs, NULL, &em, md);
- }
+ rs.insertRecord(&txn, zeros, 200 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 200 - MmapV1RecordHeader::HeaderSize, false);
+ {
+ LocAndSize recs[] = {{DiskLoc(0, 1000), 500},
+ {DiskLoc(0, 1500), 400},
- {
- LocAndSize recs[] = {
- {DiskLoc(0, 1000), 500},
- {DiskLoc(0, 1500), 400},
-
- {DiskLoc(1, 1300), 600}, // old
- {DiskLoc(1, 1000), 200}, // first new
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1800), 200},
- {DiskLoc(1, 1200), 100},
- {DiskLoc(1, 1900), 100},
- {}
- };
- assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
- ASSERT_EQUALS(md->capExtent(), DiskLoc(1, 0));
- ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(1, 1000));
- }
+ {DiskLoc(1, 1300), 600}, // old
+ {DiskLoc(1, 1000), 200}, // first new
+ {}};
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1800), 200}, {DiskLoc(1, 1200), 100}, {DiskLoc(1, 1900), 100}, {}};
+ assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
+ ASSERT_EQUALS(md->capExtent(), DiskLoc(1, 0));
+ ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(1, 1000));
}
+}
- // Larger than storageSize (fails early)
- TEST(CappedRecordStoreV1, OversizedRecordHuge) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
- DummyCappedDocumentDeleteCallback cb;
- CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+// Larger than storageSize (fails early)
+TEST(CappedRecordStoreV1, OversizedRecordHuge) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0);
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+
+ {
+ LocAndSize records[] = {{}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1000), 1000}, {}};
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid());
+ initializeV1RS(&txn, records, drecs, NULL, &em, md);
+ }
- {
- LocAndSize records[] = {
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1000), 1000},
- {}
- };
- md->setCapExtent(&txn, DiskLoc(0, 0));
- md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid());
- initializeV1RS(&txn, records, drecs, NULL, &em, md);
- }
+ StatusWith<RecordId> status = rs.insertRecord(&txn, zeros, 16000, false);
+ ASSERT_EQUALS(status.getStatus(), ErrorCodes::DocTooLargeForCapped);
+ ASSERT_EQUALS(status.getStatus().location(), 16328);
+}
- StatusWith<RecordId> status = rs.insertRecord(&txn, zeros, 16000, false);
- ASSERT_EQUALS(status.getStatus(), ErrorCodes::DocTooLargeForCapped);
- ASSERT_EQUALS(status.getStatus().location(), 16328);
+// Smaller than storageSize, but larger than usable space (fails late)
+TEST(CappedRecordStoreV1, OversizedRecordMedium) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0);
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+
+ {
+ LocAndSize records[] = {{}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1000), 1000}, {}};
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid());
+ initializeV1RS(&txn, records, drecs, NULL, &em, md);
}
- // Smaller than storageSize, but larger than usable space (fails late)
- TEST(CappedRecordStoreV1, OversizedRecordMedium) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
- DummyCappedDocumentDeleteCallback cb;
- CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+ StatusWith<RecordId> status =
+ rs.insertRecord(&txn, zeros, 1004 - MmapV1RecordHeader::HeaderSize, false);
+ ASSERT_EQUALS(status.getStatus(), ErrorCodes::DocTooLargeForCapped);
+ ASSERT_EQUALS(status.getStatus().location(), 28575);
+}
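// Note: both oversized inserts fail with ErrorCodes::DocTooLargeForCapped; the
// distinct location() values presumably identify which path rejected the
// insert: 16328 for the early check against total storage size, 28575 for the
// allocation loop later discovering that no extent has enough usable space.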
- {
- LocAndSize records[] = {
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1000), 1000},
- {}
- };
- md->setCapExtent(&txn, DiskLoc(0, 0));
- md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid());
- initializeV1RS(&txn, records, drecs, NULL, &em, md);
- }
+//
+// XXX The CappedRecordStoreV1Scrambler suite of tests describes existing behavior that is less
+// than ideal. Any improved implementation will need to be able to handle a collection that has
+// been scrambled like this.
+//
- StatusWith<RecordId> status = rs.insertRecord(&txn, zeros, 1004 - MmapV1RecordHeader::HeaderSize, false);
- ASSERT_EQUALS(status.getStatus(), ErrorCodes::DocTooLargeForCapped);
- ASSERT_EQUALS(status.getStatus().location(), 28575);
+/**
+ * This is a minimal example that shows the current allocator laying out records out-of-order.
+ */
+TEST(CappedRecordStoreV1Scrambler, Minimal) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0);
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+
+ {
+ // Starting with a single empty 1000 byte extent.
+ LocAndSize records[] = {{}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1000), 1000}, {}};
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); // unlooped
+ initializeV1RS(&txn, records, drecs, NULL, &em, md);
}
- //
- // XXX The CappedRecordStoreV1Scrambler suite of tests describe existing behavior that is less
- // than ideal. Any improved implementation will need to be able to handle a collection that has
- // been scrambled like this.
- //
-
- /**
- * This is a minimal example that shows the current allocator laying out records out-of-order.
- */
- TEST(CappedRecordStoreV1Scrambler, Minimal) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
- DummyCappedDocumentDeleteCallback cb;
- CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
-
- {
- // Starting with a single empty 1000 byte extent.
- LocAndSize records[] = {
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1000), 1000},
- {}
- };
- md->setCapExtent(&txn, DiskLoc(0, 0));
- md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); // unlooped
- initializeV1RS(&txn, records, drecs, NULL, &em, md);
- }
-
- rs.insertRecord(&txn, zeros, 500 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 300 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 400 - MmapV1RecordHeader::HeaderSize, false); // won't fit at end so wraps
- rs.insertRecord(&txn, zeros, 120 - MmapV1RecordHeader::HeaderSize, false); // fits at end
- rs.insertRecord(&txn, zeros, 60 - MmapV1RecordHeader::HeaderSize, false); // fits in earlier hole
+ rs.insertRecord(&txn, zeros, 500 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 300 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(
+ &txn, zeros, 400 - MmapV1RecordHeader::HeaderSize, false); // won't fit at end so wraps
+ rs.insertRecord(&txn, zeros, 120 - MmapV1RecordHeader::HeaderSize, false); // fits at end
+ rs.insertRecord(
+ &txn, zeros, 60 - MmapV1RecordHeader::HeaderSize, false); // fits in earlier hole
+
+ {
+ LocAndSize recs[] = {{DiskLoc(0, 1500), 300}, // 2nd insert
+ {DiskLoc(0, 1000), 400}, // 3rd (1st new)
+ {DiskLoc(0, 1800), 120}, // 4th
+ {DiskLoc(0, 1400), 60}, // 5th
+ {}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1460), 40}, {DiskLoc(0, 1920), 80}, {}};
+ assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
+ ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
+ ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000));
+ }
+}
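// A step-by-step trace of the five inserts above against the extent's data
// region [1000, 2000); the hole arithmetic is inferred from the asserted end
// state rather than from the allocator source:
//
//   insert 500 -> [1000, 1500)    free: [1500, 2000)
//   insert 300 -> [1500, 1800)    free: [1800, 2000)
//   insert 400 -> exceeds the 200 bytes left at the end, so the store wraps:
//                 the oldest record at [1000, 1500) is deleted and the new
//                 record lands at [1000, 1400)
//                                 free: [1400, 1500), [1800, 2000)
//   insert 120 -> [1800, 1920)    free: [1400, 1500), [1920, 2000)
//   insert  60 -> [1400, 1460)    free: [1460, 1500), [1920, 2000)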
- {
- LocAndSize recs[] = {
- {DiskLoc(0, 1500), 300}, // 2nd insert
- {DiskLoc(0, 1000), 400}, // 3rd (1st new)
- {DiskLoc(0, 1800), 120}, // 4th
- {DiskLoc(0, 1400), 60}, // 5th
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1460), 40},
- {DiskLoc(0, 1920), 80},
- {}
- };
- assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
- ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
- ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000));
- }
+/**
+ * This tests a specially crafted set of inserts that scrambles a capped collection in a way
+ * that leaves 4 deleted records in a single extent.
+ */
+TEST(CappedRecordStoreV1Scrambler, FourDeletedRecordsInSingleExtent) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0);
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+
+ {
+ // Starting with a single empty 1000 byte extent.
+ LocAndSize records[] = {{}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1000), 1000}, {}};
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); // unlooped
+ initializeV1RS(&txn, records, drecs, NULL, &em, md);
}
- /**
- * This tests a specially crafted set of inserts that scrambles a capped collection in a way
- * that leaves 4 deleted records in a single extent.
- */
- TEST(CappedRecordStoreV1Scrambler, FourDeletedRecordsInSingleExtent) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
- DummyCappedDocumentDeleteCallback cb;
- CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+ // This list of sizes was empirically generated to achieve this outcome. Don't think too
+ // much about them.
+ rs.insertRecord(&txn, zeros, 500 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 300 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 304 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 76 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 96 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 76 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 200 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 200 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 56 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 96 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 104 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 96 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 60 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 60 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 146 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 146 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 40 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 40 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 36 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 96 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 200 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 60 - MmapV1RecordHeader::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 64 - MmapV1RecordHeader::HeaderSize, false);
+
+ {
+ LocAndSize recs[] = {{DiskLoc(0, 1148), 148},
+ {DiskLoc(0, 1936), 40},
+ {DiskLoc(0, 1712), 40},
+ {DiskLoc(0, 1296), 36},
+ {DiskLoc(0, 1752), 100},
+ {DiskLoc(0, 1332), 96},
+ {DiskLoc(0, 1428), 200},
+ {DiskLoc(0, 1852), 60},
+ {DiskLoc(0, 1000), 64}, // (1st new)
+ {}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1064), 84},
+ {DiskLoc(0, 1976), 24},
+ {DiskLoc(0, 1912), 24},
+ {DiskLoc(0, 1628), 84},
+ {}};
+ assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
+ ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
+ ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000));
+ }
+}
- {
- // Starting with a single empty 1000 byte extent.
- LocAndSize records[] = {
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1000), 1000},
- {}
- };
- md->setCapExtent(&txn, DiskLoc(0, 0));
- md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); // unlooped
- initializeV1RS(&txn, records, drecs, NULL, &em, md);
+//
+// The CappedRecordStoreV1QueryStage tests some nitty-gritty capped
+// collection details. Ported and polished from pdfiletests.cpp.
+//
+
+class CollscanHelper {
+public:
+ CollscanHelper(int nExtents)
+ : md(new DummyRecordStoreV1MetaData(true, 0)), rs(&txn, &cb, ns(), md, &em, false) {
+ LocAndSize recs[] = {{}};
+ LocAndSize drecs[8];
+ ASSERT_LESS_THAN(nExtents, 8);
+ for (int j = 0; j < nExtents; ++j) {
+ drecs[j].loc = DiskLoc(j, 1000);
+ drecs[j].size = 1000;
}
+ drecs[nExtents].loc = DiskLoc();
+ drecs[nExtents].size = 0;
- // This list of sizes was empirically generated to achieve this outcome. Don't think too
- // much about them.
- rs.insertRecord(&txn, zeros, 500 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 300 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 304 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 76 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 96 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 76 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 200 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 200 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 56 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 96 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 104 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 96 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 60 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 60 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 146 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 146 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 40 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 40 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 36 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 100 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 96 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 200 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 60 - MmapV1RecordHeader::HeaderSize, false);
- rs.insertRecord(&txn, zeros, 64 - MmapV1RecordHeader::HeaderSize, false);
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); // unlooped
+ initializeV1RS(&txn, recs, drecs, NULL, &em, md);
+ }
- {
- LocAndSize recs[] = {
- {DiskLoc(0, 1148), 148},
- {DiskLoc(0, 1936), 40},
- {DiskLoc(0, 1712), 40},
- {DiskLoc(0, 1296), 36},
- {DiskLoc(0, 1752), 100},
- {DiskLoc(0, 1332), 96},
- {DiskLoc(0, 1428), 200},
- {DiskLoc(0, 1852), 60},
- {DiskLoc(0, 1000), 64}, // (1st new)
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1064), 84},
- {DiskLoc(0, 1976), 24},
- {DiskLoc(0, 1912), 24},
- {DiskLoc(0, 1628), 84},
- {}
- };
- assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
- ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
- ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000));
+ // Insert bypasses standard alloc/insert routines to use the extent we want.
+ // TODO: Directly declare resulting record store state instead of procedurally creating it
+ DiskLoc insert(const DiskLoc& ext, int i) {
+ // Copied verbatim.
+ BSONObjBuilder b;
+ b.append("a", i);
+ BSONObj o = b.done();
+ int len = o.objsize();
+ Extent* e = em.getExtent(ext);
+ e = txn.recoveryUnit()->writing(e);
+ int ofs;
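+ // Place the new record at the start of the extent's data area if the extent
+ // is empty, otherwise directly after its last record.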
+ if (e->lastRecord.isNull()) {
+ ofs = ext.getOfs() + (e->_extentData - (char*)e);
+ } else {
+ ofs = e->lastRecord.getOfs() + em.recordForV1(e->lastRecord)->lengthWithHeaders();
}
+ DiskLoc dl(ext.a(), ofs);
+ MmapV1RecordHeader* r = em.recordForV1(dl);
+ r = (MmapV1RecordHeader*)txn.recoveryUnit()->writingPtr(
+ r, MmapV1RecordHeader::HeaderSize + len);
+ r->lengthWithHeaders() = MmapV1RecordHeader::HeaderSize + len;
+ r->extentOfs() = e->myLoc.getOfs();
+ r->nextOfs() = DiskLoc::NullOfs;
+ r->prevOfs() = e->lastRecord.isNull() ? DiskLoc::NullOfs : e->lastRecord.getOfs();
+ memcpy(r->data(), o.objdata(), len);
+ if (e->firstRecord.isNull())
+ e->firstRecord = dl;
+ else
+ txn.recoveryUnit()->writingInt(em.recordForV1(e->lastRecord)->nextOfs()) = ofs;
+ e->lastRecord = dl;
+ return dl;
}
- //
- // The CappedRecordStoreV1QueryStage tests some nitty-gritty capped
- // collection details. Ported and polished from pdfiletests.cpp.
- //
-
- class CollscanHelper {
- public:
- CollscanHelper(int nExtents)
- : md(new DummyRecordStoreV1MetaData( true, 0 )),
- rs(&txn, &cb, ns(), md, &em, false)
+ // TODO: Directly assert the desired record store state instead of just walking it
+ void walkAndCount(int expectedCount) {
+ // Walk the collection going forward.
{
- LocAndSize recs[] = {
- {}
- };
- LocAndSize drecs[8];
- ASSERT_LESS_THAN(nExtents, 8);
- for (int j = 0; j < nExtents; ++j) {
- drecs[j].loc = DiskLoc(j, 1000);
- drecs[j].size = 1000;
+ CappedRecordStoreV1Iterator cursor(&txn, &rs, /*forward=*/true);
+ int resultCount = 0;
+ while (auto record = cursor.next()) {
+ ++resultCount;
}
- drecs[nExtents].loc = DiskLoc();
- drecs[nExtents].size = 0;
- md->setCapExtent(&txn, DiskLoc(0, 0));
- md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); // unlooped
- initializeV1RS(&txn, recs, drecs, NULL, &em, md);
+ ASSERT_EQUALS(resultCount, expectedCount);
}
- // Insert bypasses standard alloc/insert routines to use the extent we want.
- // TODO: Directly declare resulting record store state instead of procedurally creating it
- DiskLoc insert( const DiskLoc& ext, int i ) {
- // Copied verbatim.
- BSONObjBuilder b;
- b.append( "a", i );
- BSONObj o = b.done();
- int len = o.objsize();
- Extent *e = em.getExtent(ext);
- e = txn.recoveryUnit()->writing(e);
- int ofs;
- if ( e->lastRecord.isNull() ) {
- ofs = ext.getOfs() + ( e->_extentData - (char *)e );
- }
- else {
- ofs = e->lastRecord.getOfs()
- + em.recordForV1(e->lastRecord)->lengthWithHeaders();
- }
- DiskLoc dl( ext.a(), ofs );
- MmapV1RecordHeader *r = em.recordForV1(dl);
- r = (MmapV1RecordHeader*) txn.recoveryUnit()->writingPtr(r, MmapV1RecordHeader::HeaderSize + len);
- r->lengthWithHeaders() = MmapV1RecordHeader::HeaderSize + len;
- r->extentOfs() = e->myLoc.getOfs();
- r->nextOfs() = DiskLoc::NullOfs;
- r->prevOfs() = e->lastRecord.isNull() ? DiskLoc::NullOfs : e->lastRecord.getOfs();
- memcpy( r->data(), o.objdata(), len );
- if ( e->firstRecord.isNull() )
- e->firstRecord = dl;
- else
- txn.recoveryUnit()->writingInt(em.recordForV1(e->lastRecord)->nextOfs()) = ofs;
- e->lastRecord = dl;
- return dl;
- }
-
- // TODO: Directly assert the desired record store state instead of just walking it
- void walkAndCount (int expectedCount) {
- // Walk the collection going forward.
- {
- CappedRecordStoreV1Iterator cursor(&txn, &rs, /*forward=*/true);
- int resultCount = 0;
- while (auto record = cursor.next()) {
- ++resultCount;
- }
-
- ASSERT_EQUALS(resultCount, expectedCount);
+ // Walk the collection going backwards.
+ {
+ CappedRecordStoreV1Iterator cursor(&txn, &rs, /*forward=*/false);
+ int resultCount = expectedCount;
+ while (auto record = cursor.next()) {
+ --resultCount;
}
- // Walk the collection going backwards.
- {
- CappedRecordStoreV1Iterator cursor(&txn, &rs, /*forward=*/false);
- int resultCount = expectedCount;
- while (auto record = cursor.next()) {
- --resultCount;
- }
-
- ASSERT_EQUALS(resultCount, 0);
- }
+ ASSERT_EQUALS(resultCount, 0);
}
+ }
- static const char *ns() { return "unittests.QueryStageCollectionScanCapped"; }
+ static const char* ns() {
+ return "unittests.QueryStageCollectionScanCapped";
+ }
- OperationContextNoop txn;
- DummyRecordStoreV1MetaData* md;
- DummyExtentManager em;
+ OperationContextNoop txn;
+ DummyRecordStoreV1MetaData* md;
+ DummyExtentManager em;
- private:
- DummyCappedDocumentDeleteCallback cb;
- CappedRecordStoreV1 rs;
- };
+private:
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs;
+};
- TEST(CappedRecordStoreV1QueryStage, CollscanCappedBase) {
- CollscanHelper h(1);
- h.walkAndCount(0);
- }
+TEST(CappedRecordStoreV1QueryStage, CollscanCappedBase) {
+ CollscanHelper h(1);
+ h.walkAndCount(0);
+}
- TEST(CappedRecordStoreV1QueryStage, CollscanEmptyLooped) {
- CollscanHelper h(1);
- h.md->setCapFirstNewRecord( &h.txn, DiskLoc() );
- h.walkAndCount(0);
- }
+TEST(CappedRecordStoreV1QueryStage, CollscanEmptyLooped) {
+ CollscanHelper h(1);
+ h.md->setCapFirstNewRecord(&h.txn, DiskLoc());
+ h.walkAndCount(0);
+}
- TEST(CappedRecordStoreV1QueryStage, CollscanEmptyMultiExtentLooped) {
- CollscanHelper h(3);
- h.md->setCapFirstNewRecord( &h.txn, DiskLoc() );
- h.walkAndCount(0);
- }
+TEST(CappedRecordStoreV1QueryStage, CollscanEmptyMultiExtentLooped) {
+ CollscanHelper h(3);
+ h.md->setCapFirstNewRecord(&h.txn, DiskLoc());
+ h.walkAndCount(0);
+}
- TEST(CappedRecordStoreV1QueryStage, CollscanSingle) {
- CollscanHelper h(1);
+TEST(CappedRecordStoreV1QueryStage, CollscanSingle) {
+ CollscanHelper h(1);
- h.md->setCapFirstNewRecord(&h.txn, h.insert( h.md->capExtent(), 0 ));
- h.walkAndCount(1);
- }
+ h.md->setCapFirstNewRecord(&h.txn, h.insert(h.md->capExtent(), 0));
+ h.walkAndCount(1);
+}
- TEST(CappedRecordStoreV1QueryStage, CollscanNewCapFirst) {
- CollscanHelper h(1);
- DiskLoc x = h.insert(h.md->capExtent(), 0 );
- h.md->setCapFirstNewRecord( &h.txn, x );
- h.insert(h.md->capExtent(), 1 );
- h.walkAndCount(2);
- }
+TEST(CappedRecordStoreV1QueryStage, CollscanNewCapFirst) {
+ CollscanHelper h(1);
+ DiskLoc x = h.insert(h.md->capExtent(), 0);
+ h.md->setCapFirstNewRecord(&h.txn, x);
+ h.insert(h.md->capExtent(), 1);
+ h.walkAndCount(2);
+}
- TEST(CappedRecordStoreV1QueryStage, CollscanNewCapMiddle) {
- CollscanHelper h(1);
- h.insert(h.md->capExtent(), 0 );
- h.md->setCapFirstNewRecord(&h.txn, h.insert( h.md->capExtent(), 1 ) );
- h.insert( h.md->capExtent(), 2 );
- h.walkAndCount(3);
- }
+TEST(CappedRecordStoreV1QueryStage, CollscanNewCapMiddle) {
+ CollscanHelper h(1);
+ h.insert(h.md->capExtent(), 0);
+ h.md->setCapFirstNewRecord(&h.txn, h.insert(h.md->capExtent(), 1));
+ h.insert(h.md->capExtent(), 2);
+ h.walkAndCount(3);
+}
- TEST(CappedRecordStoreV1QueryStage, CollscanFirstExtent) {
- CollscanHelper h(2);
- h.insert(h.md->capExtent(), 0 );
- h.insert(h.md->lastExtent(&h.txn), 1 );
- h.md->setCapFirstNewRecord(&h.txn, h.insert( h.md->capExtent(), 2 ) );
- h.insert( h.md->capExtent(), 3 );
- h.walkAndCount(4);
- }
+TEST(CappedRecordStoreV1QueryStage, CollscanFirstExtent) {
+ CollscanHelper h(2);
+ h.insert(h.md->capExtent(), 0);
+ h.insert(h.md->lastExtent(&h.txn), 1);
+ h.md->setCapFirstNewRecord(&h.txn, h.insert(h.md->capExtent(), 2));
+ h.insert(h.md->capExtent(), 3);
+ h.walkAndCount(4);
+}
- TEST(CappedRecordStoreV1QueryStage, CollscanLastExtent) {
- CollscanHelper h(2);
- h.md->setCapExtent( &h.txn, h.md->lastExtent(&h.txn) );
- h.insert( h.md->capExtent(), 0 );
- h.insert( h.md->firstExtent(&h.txn), 1 );
- h.md->setCapFirstNewRecord( &h.txn, h.insert( h.md->capExtent(), 2 ) );
- h.insert( h.md->capExtent(), 3 );
- h.walkAndCount(4);
- }
+TEST(CappedRecordStoreV1QueryStage, CollscanLastExtent) {
+ CollscanHelper h(2);
+ h.md->setCapExtent(&h.txn, h.md->lastExtent(&h.txn));
+ h.insert(h.md->capExtent(), 0);
+ h.insert(h.md->firstExtent(&h.txn), 1);
+ h.md->setCapFirstNewRecord(&h.txn, h.insert(h.md->capExtent(), 2));
+ h.insert(h.md->capExtent(), 3);
+ h.walkAndCount(4);
+}
- TEST(CappedRecordStoreV1QueryStage, CollscanMidExtent) {
- CollscanHelper h(3);
- h.md->setCapExtent( &h.txn, h.em.getExtent(h.md->firstExtent(&h.txn))->xnext );
- h.insert( h.md->capExtent(), 0 );
- h.insert( h.md->lastExtent(&h.txn), 1 );
- h.insert( h.md->firstExtent(&h.txn), 2 );
- h.md->setCapFirstNewRecord( &h.txn, h.insert( h.md->capExtent(), 3 ) );
- h.insert( h.md->capExtent(), 4 );
- h.walkAndCount(5);
- }
+TEST(CappedRecordStoreV1QueryStage, CollscanMidExtent) {
+ CollscanHelper h(3);
+ h.md->setCapExtent(&h.txn, h.em.getExtent(h.md->firstExtent(&h.txn))->xnext);
+ h.insert(h.md->capExtent(), 0);
+ h.insert(h.md->lastExtent(&h.txn), 1);
+ h.insert(h.md->firstExtent(&h.txn), 2);
+ h.md->setCapFirstNewRecord(&h.txn, h.insert(h.md->capExtent(), 3));
+ h.insert(h.md->capExtent(), 4);
+ h.walkAndCount(5);
+}
- TEST(CappedRecordStoreV1QueryStage, CollscanAloneInExtent) {
- CollscanHelper h(3);
- h.md->setCapExtent( &h.txn, h.em.getExtent(h.md->firstExtent(&h.txn))->xnext );
- h.insert( h.md->lastExtent(&h.txn), 0 );
- h.insert( h.md->firstExtent(&h.txn), 1 );
- h.md->setCapFirstNewRecord( &h.txn, h.insert( h.md->capExtent(), 2 ) );
- h.walkAndCount(3);
- }
+TEST(CappedRecordStoreV1QueryStage, CollscanAloneInExtent) {
+ CollscanHelper h(3);
+ h.md->setCapExtent(&h.txn, h.em.getExtent(h.md->firstExtent(&h.txn))->xnext);
+ h.insert(h.md->lastExtent(&h.txn), 0);
+ h.insert(h.md->firstExtent(&h.txn), 1);
+ h.md->setCapFirstNewRecord(&h.txn, h.insert(h.md->capExtent(), 2));
+ h.walkAndCount(3);
+}
- TEST(CappedRecordStoreV1QueryStage, CollscanFirstInExtent) {
- CollscanHelper h(3);
- h.md->setCapExtent( &h.txn, h.em.getExtent(h.md->firstExtent(&h.txn))->xnext );
- h.insert( h.md->lastExtent(&h.txn), 0 );
- h.insert( h.md->firstExtent(&h.txn), 1 );
- h.md->setCapFirstNewRecord( &h.txn, h.insert( h.md->capExtent(), 2 ) );
- h.insert( h.md->capExtent(), 3 );
- h.walkAndCount(4);
- }
+TEST(CappedRecordStoreV1QueryStage, CollscanFirstInExtent) {
+ CollscanHelper h(3);
+ h.md->setCapExtent(&h.txn, h.em.getExtent(h.md->firstExtent(&h.txn))->xnext);
+ h.insert(h.md->lastExtent(&h.txn), 0);
+ h.insert(h.md->firstExtent(&h.txn), 1);
+ h.md->setCapFirstNewRecord(&h.txn, h.insert(h.md->capExtent(), 2));
+ h.insert(h.md->capExtent(), 3);
+ h.walkAndCount(4);
+}
- TEST(CappedRecordStoreV1QueryStage, CollscanLastInExtent) {
- CollscanHelper h(3);
- h.md->setCapExtent( &h.txn, h.em.getExtent(h.md->firstExtent(&h.txn))->xnext );
- h.insert( h.md->capExtent(), 0 );
- h.insert( h.md->lastExtent(&h.txn), 1 );
- h.insert( h.md->firstExtent(&h.txn), 2 );
- h.md->setCapFirstNewRecord( &h.txn, h.insert( h.md->capExtent(), 3 ) );
- h.walkAndCount(4);
- }
+TEST(CappedRecordStoreV1QueryStage, CollscanLastInExtent) {
+ CollscanHelper h(3);
+ h.md->setCapExtent(&h.txn, h.em.getExtent(h.md->firstExtent(&h.txn))->xnext);
+ h.insert(h.md->capExtent(), 0);
+ h.insert(h.md->lastExtent(&h.txn), 1);
+ h.insert(h.md->firstExtent(&h.txn), 2);
+ h.md->setCapFirstNewRecord(&h.txn, h.insert(h.md->capExtent(), 3));
+ h.walkAndCount(4);
+}
}
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.cpp
index a4cb9977fe3..728f07d6013 100644
--- a/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.cpp
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.cpp
@@ -38,49 +38,47 @@
namespace mongo {
- using std::endl;
-
- RecordStoreV1RepairCursor::RecordStoreV1RepairCursor(OperationContext* txn,
- const RecordStoreV1Base* recordStore)
- : _txn(txn), _recordStore(recordStore), _stage(FORWARD_SCAN) {
-
- // Position the iterator at the first record
- //
- advance();
- }
-
- boost::optional<Record> RecordStoreV1RepairCursor::next() {
- if (_currRecord.isNull()) return {};
- auto out = _currRecord.toRecordId();
- advance();
- return {{out, _recordStore->dataFor(_txn, out)}};
- }
-
- boost::optional<Record> RecordStoreV1RepairCursor::seekExact(const RecordId& id) {
- invariant(!"seekExact not supported");
- }
-
- void RecordStoreV1RepairCursor::advance() {
- const ExtentManager* em = _recordStore->_extentManager;
-
- while (true) {
- if (_currRecord.isNull()) {
-
- if (!_advanceToNextValidExtent()) {
- return;
- }
+using std::endl;
+
+RecordStoreV1RepairCursor::RecordStoreV1RepairCursor(OperationContext* txn,
+ const RecordStoreV1Base* recordStore)
+ : _txn(txn), _recordStore(recordStore), _stage(FORWARD_SCAN) {
+ // Position the iterator at the first record
+ //
+ advance();
+}
+
+boost::optional<Record> RecordStoreV1RepairCursor::next() {
+ if (_currRecord.isNull())
+ return {};
+ auto out = _currRecord.toRecordId();
+ advance();
+ return {{out, _recordStore->dataFor(_txn, out)}};
+}
+
+boost::optional<Record> RecordStoreV1RepairCursor::seekExact(const RecordId& id) {
+ invariant(!"seekExact not supported");
+}
+
+void RecordStoreV1RepairCursor::advance() {
+ const ExtentManager* em = _recordStore->_extentManager;
+
+ while (true) {
+ if (_currRecord.isNull()) {
+ if (!_advanceToNextValidExtent()) {
+ return;
+ }
- _seenInCurrentExtent.clear();
+ _seenInCurrentExtent.clear();
- // Otherwise _advanceToNextValidExtent would have returned false
- //
- invariant(!_currExtent.isNull());
+ // Otherwise _advanceToNextValidExtent would have returned false
+ //
+ invariant(!_currExtent.isNull());
- const Extent* e = em->getExtent(_currExtent, false);
- _currRecord = (FORWARD_SCAN == _stage ? e->firstRecord : e->lastRecord);
- }
- else {
- switch (_stage) {
+ const Extent* e = em->getExtent(_currExtent, false);
+ _currRecord = (FORWARD_SCAN == _stage ? e->firstRecord : e->lastRecord);
+ } else {
+ switch (_stage) {
case FORWARD_SCAN:
_currRecord = _recordStore->getNextRecordInExtent(_txn, _currRecord);
break;
@@ -90,37 +88,37 @@ namespace mongo {
default:
invariant(!"This should never be reached.");
break;
- }
- }
-
- if (_currRecord.isNull()) {
- continue;
}
+ }
- // Validate the contents of the record's disk location and deduplicate
- //
- if (!_seenInCurrentExtent.insert(_currRecord).second) {
- error() << "infinite loop in extent, seen: " << _currRecord << " before" << endl;
- _currRecord = DiskLoc();
- continue;
- }
+ if (_currRecord.isNull()) {
+ continue;
+ }
- if (_currRecord.getOfs() <= 0){
- error() << "offset is 0 for record which should be impossible" << endl;
- _currRecord = DiskLoc();
- continue;
- }
+ // Validate the contents of the record's disk location and deduplicate
+ //
+ if (!_seenInCurrentExtent.insert(_currRecord).second) {
+ error() << "infinite loop in extent, seen: " << _currRecord << " before" << endl;
+ _currRecord = DiskLoc();
+ continue;
+ }
- return;
+ if (_currRecord.getOfs() <= 0) {
+ error() << "offset is 0 for record which should be impossible" << endl;
+ _currRecord = DiskLoc();
+ continue;
}
+
+ return;
}
+}
- bool RecordStoreV1RepairCursor::_advanceToNextValidExtent() {
- const ExtentManager* em = _recordStore->_extentManager;
+bool RecordStoreV1RepairCursor::_advanceToNextValidExtent() {
+ const ExtentManager* em = _recordStore->_extentManager;
- while (true) {
- if (_currExtent.isNull()) {
- switch (_stage) {
+ while (true) {
+ if (_currExtent.isNull()) {
+ switch (_stage) {
case FORWARD_SCAN:
_currExtent = _recordStore->details()->firstExtent(_txn);
break;
@@ -130,35 +128,34 @@ namespace mongo {
default:
invariant(DONE == _stage);
return false;
- }
- }
- else {
- // If _currExtent is not NULL, then it must point to a valid extent, so no extra
- // checks here.
- //
- const Extent* e = em->getExtent(_currExtent, false);
- _currExtent = (FORWARD_SCAN == _stage ? e->xnext : e->xprev);
}
-
- bool hasNextExtent = !_currExtent.isNull();
-
- // Sanity checks for the extent's disk location
+ } else {
+ // If _currExtent is not NULL, then it must point to a valid extent, so no extra
+ // checks here.
//
- if (hasNextExtent && (!_currExtent.isValid() || (_currExtent.getOfs() < 0))) {
- error() << "Invalid extent location: " << _currExtent << endl;
+ const Extent* e = em->getExtent(_currExtent, false);
+ _currExtent = (FORWARD_SCAN == _stage ? e->xnext : e->xprev);
+ }
- // Switch the direction of scan
- //
- hasNextExtent = false;
- }
+ bool hasNextExtent = !_currExtent.isNull();
- if (hasNextExtent) {
- break;
- }
+ // Sanity checks for the extent's disk location
+ //
+ if (hasNextExtent && (!_currExtent.isValid() || (_currExtent.getOfs() < 0))) {
+ error() << "Invalid extent location: " << _currExtent << endl;
- // Swap the direction of scan and loop again
+ // Switch the direction of scan
//
- switch (_stage) {
+ hasNextExtent = false;
+ }
+
+ if (hasNextExtent) {
+ break;
+ }
+
+ // Swap the direction of scan and loop again
+ //
+ switch (_stage) {
case FORWARD_SCAN:
_stage = BACKWARD_SCAN;
break;
@@ -168,49 +165,48 @@ namespace mongo {
default:
invariant(!"This should never be reached.");
break;
- }
-
- _currExtent = DiskLoc();
}
+ _currExtent = DiskLoc();
+ }
- // Check _currExtent's contents for validity, but do not count is as failure if they
- // don't check out.
- //
- const Extent* e = em->getExtent(_currExtent, false);
- if (!e->isOk()){
- warning() << "Extent not ok magic: " << e->magic << " going to try to continue"
- << endl;
- }
-
- log() << (FORWARD_SCAN == _stage ? "FORWARD" : "BACKWARD") << " Extent loc: "
- << _currExtent << ", length: " << e->length << endl;
- return true;
+ // Check _currExtent's contents for validity, but do not count it as a failure if they
+ // don't check out.
+ //
+ const Extent* e = em->getExtent(_currExtent, false);
+ if (!e->isOk()) {
+ warning() << "Extent not ok magic: " << e->magic << " going to try to continue" << endl;
}
- void RecordStoreV1RepairCursor::invalidate(const RecordId& id) {
- // If we see this record again it probably means it was reinserted rather than an infinite
- // loop. If we do loop, we should quickly hit another seen record that hasn't been
- // invalidated.
- DiskLoc dl = DiskLoc::fromRecordId(id);
- _seenInCurrentExtent.erase(dl);
+ log() << (FORWARD_SCAN == _stage ? "FORWARD" : "BACKWARD") << " Extent loc: " << _currExtent
+ << ", length: " << e->length << endl;
- if (_currRecord == dl) {
- // The DiskLoc being invalidated is also the one pointed at by this iterator. We
- // advance the iterator so it's not pointing at invalid data.
- advance();
+ return true;
+}
- if (_currRecord == dl) {
- // Even after advancing the iterator, we're still pointing at the DiskLoc being
- // invalidated. This is expected when 'dl' is the last DiskLoc in the FORWARD scan,
- // and the initial call to getNext() moves the iterator to the first loc in the
- // BACKWARDS scan.
- advance();
- }
+void RecordStoreV1RepairCursor::invalidate(const RecordId& id) {
+ // If we see this record again it probably means it was reinserted rather than an infinite
+ // loop. If we do loop, we should quickly hit another seen record that hasn't been
+ // invalidated.
+ DiskLoc dl = DiskLoc::fromRecordId(id);
+ _seenInCurrentExtent.erase(dl);
+
+ if (_currRecord == dl) {
+ // The DiskLoc being invalidated is also the one pointed at by this iterator. We
+ // advance the iterator so it's not pointing at invalid data.
+ advance();
- invariant(_currRecord != dl);
+ if (_currRecord == dl) {
+ // Even after advancing the iterator, we're still pointing at the DiskLoc being
+ // invalidated. This is expected when 'dl' is the last DiskLoc in the FORWARD scan,
+ // and the initial call to getNext() moves the iterator to the first loc in the
+ // BACKWARDS scan.
+ advance();
}
+
+ invariant(_currRecord != dl);
}
+}
} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h b/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h
index 6b93ad5941a..def5178ad8e 100644
--- a/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h
@@ -35,63 +35,60 @@
namespace mongo {
+/**
+ * This iterator will go over the collection twice - once going forward (first extent -> last
+ * extent) and once backwards in an attempt to salvage potentially corrupted or unreachable
+ * records. It is used by the mongodump --repair option.
+ */
+class RecordStoreV1RepairCursor final : public RecordCursor {
+public:
+ RecordStoreV1RepairCursor(OperationContext* txn, const RecordStoreV1Base* recordStore);
+
+ boost::optional<Record> next() final;
+ boost::optional<Record> seekExact(const RecordId& id) final;
+ void invalidate(const RecordId& dl);
+ void savePositioned() final {
+ _txn = nullptr;
+ }
+ bool restore(OperationContext* txn) final {
+ _txn = txn;
+ return true;
+ }
+
+ // Explicitly not supporting fetcherForNext(). The expected use case for this class is a
+ // special offline operation where there are no concurrent operations, so it would be better
+ // to take the pagefault inline with the operation.
+
+private:
+ void advance();
+
/**
- * This iterator will go over the collection twice - once going forward (first extent -> last
- * extent) and once backwards in an attempt to salvage potentially corrupted or unreachable
- * records. It is used by the mongodump --repair option.
+ * Based on the direction of scan, finds the next valid (un-corrupted) extent in the chain
+ * and sets _currExtent to point to that.
+ *
+ * @return true if a valid extent was found (_currExtent will not be null),
+ *         false otherwise (_currExtent will be null)
*/
- class RecordStoreV1RepairCursor final : public RecordCursor {
- public:
- RecordStoreV1RepairCursor(OperationContext* txn,
- const RecordStoreV1Base* recordStore);
-
- boost::optional<Record> next() final;
- boost::optional<Record> seekExact(const RecordId& id) final;
- void invalidate(const RecordId& dl);
- void savePositioned() final { _txn = nullptr; }
- bool restore(OperationContext* txn) final {
- _txn = txn;
- return true;
- }
-
- // Explicitly not supporting fetcherForNext(). The expected use case for this class is a
- // special offline operation where there are no concurrent operations, so it would be better
- // to take the pagefault inline with the operation.
-
- private:
- void advance();
-
- /**
- * Based on the direction of scan, finds the next valid (un-corrupted) extent in the chain
- * and sets _currExtent to point to that.
- *
- * @return true if valid extent was found (_currExtent will not be null)
- * false otherwise and _currExtent will be null
- */
- bool _advanceToNextValidExtent();
-
- // transactional context for read locks. Not owned by us
- OperationContext* _txn;
-
- // Reference to the owning RecordStore. The store must not be deleted while there are
- // active iterators on it.
- //
- const RecordStoreV1Base* _recordStore;
-
- DiskLoc _currExtent;
- DiskLoc _currRecord;
-
- enum Stage {
- FORWARD_SCAN = 0,
- BACKWARD_SCAN = 1,
- DONE = 2
- };
-
- Stage _stage;
-
- // Used to find cycles within an extent. Cleared after each extent has been processed.
- //
- std::set<DiskLoc> _seenInCurrentExtent;
- };
+ bool _advanceToNextValidExtent();
+
+ // transactional context for read locks. Not owned by us
+ OperationContext* _txn;
+
+ // Reference to the owning RecordStore. The store must not be deleted while there are
+ // active iterators on it.
+ //
+ const RecordStoreV1Base* _recordStore;
+
+ DiskLoc _currExtent;
+ DiskLoc _currRecord;
+
+ enum Stage { FORWARD_SCAN = 0, BACKWARD_SCAN = 1, DONE = 2 };
+
+ Stage _stage;
+
+ // Used to find cycles within an extent. Cleared after each extent has been processed.
+ //
+ std::set<DiskLoc> _seenInCurrentExtent;
+};
} // namespace mongo
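
As declared above, the repair cursor is driven like any other RecordCursor: construct it against the record store and call next() until it returns boost::none, which happens only once both the forward and the backward pass are exhausted. A hedged usage sketch (txn and recordStore are assumed to be constructed elsewhere; processRecord is a hypothetical consumer callback):

    // Salvage-style scan covering both passes of the repair cursor.
    RecordStoreV1RepairCursor cursor(txn, recordStore);
    while (boost::optional<Record> rec = cursor.next()) {
        processRecord(rec->id, rec->data);  // hypothetical consumer
    }
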
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_simple.cpp
index 029883254bd..5948553b9af 100644
--- a/src/mongo/db/storage/mmap_v1/record_store_v1_simple.cpp
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_simple.cpp
@@ -53,447 +53,431 @@
namespace mongo {
- using std::endl;
- using std::vector;
-
- static Counter64 freelistAllocs;
- static Counter64 freelistBucketExhausted;
- static Counter64 freelistIterations;
-
- // TODO figure out what to do about these.
- static ServerStatusMetricField<Counter64> dFreelist1( "storage.freelist.search.requests",
- &freelistAllocs );
-
- static ServerStatusMetricField<Counter64> dFreelist2( "storage.freelist.search.bucketExhausted",
- &freelistBucketExhausted );
-
- static ServerStatusMetricField<Counter64> dFreelist3( "storage.freelist.search.scanned",
- &freelistIterations );
-
- SimpleRecordStoreV1::SimpleRecordStoreV1( OperationContext* txn,
- StringData ns,
- RecordStoreV1MetaData* details,
- ExtentManager* em,
- bool isSystemIndexes )
- : RecordStoreV1Base( ns, details, em, isSystemIndexes ) {
-
- invariant( !details->isCapped() );
- _normalCollection = NamespaceString::normal( ns );
- }
+using std::endl;
+using std::vector;
+
+static Counter64 freelistAllocs;
+static Counter64 freelistBucketExhausted;
+static Counter64 freelistIterations;
+
+// TODO figure out what to do about these.
+static ServerStatusMetricField<Counter64> dFreelist1("storage.freelist.search.requests",
+ &freelistAllocs);
+
+static ServerStatusMetricField<Counter64> dFreelist2("storage.freelist.search.bucketExhausted",
+ &freelistBucketExhausted);
+
+static ServerStatusMetricField<Counter64> dFreelist3("storage.freelist.search.scanned",
+ &freelistIterations);
+
+SimpleRecordStoreV1::SimpleRecordStoreV1(OperationContext* txn,
+ StringData ns,
+ RecordStoreV1MetaData* details,
+ ExtentManager* em,
+ bool isSystemIndexes)
+ : RecordStoreV1Base(ns, details, em, isSystemIndexes) {
+ invariant(!details->isCapped());
+ _normalCollection = NamespaceString::normal(ns);
+}
- SimpleRecordStoreV1::~SimpleRecordStoreV1() {
+SimpleRecordStoreV1::~SimpleRecordStoreV1() {}
+
+DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents(OperationContext* txn, int lenToAllocRaw) {
+ // Slowly drain the deletedListLegacyGrabBag by popping one record off and putting it in the
+ // correct deleted list each time we try to allocate a new record. This ensures we won't
+ // orphan any data when upgrading from old versions, without needing a long upgrade phase.
+ // This is done before we try to allocate the new record so we can take advantage of the new
+ // space immediately.
+ {
+ const DiskLoc head = _details->deletedListLegacyGrabBag();
+ if (!head.isNull()) {
+ _details->setDeletedListLegacyGrabBag(txn, drec(head)->nextDeleted());
+ addDeletedRec(txn, head);
+ }
}
- DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents( OperationContext* txn,
- int lenToAllocRaw ) {
-
- // Slowly drain the deletedListLegacyGrabBag by popping one record off and putting it in the
- // correct deleted list each time we try to allocate a new record. This ensures we won't
- // orphan any data when upgrading from old versions, without needing a long upgrade phase.
- // This is done before we try to allocate the new record so we can take advantage of the new
- // space immediately.
- {
- const DiskLoc head = _details->deletedListLegacyGrabBag();
- if (!head.isNull()) {
- _details->setDeletedListLegacyGrabBag(txn, drec(head)->nextDeleted());
- addDeletedRec(txn, head);
+ // align size up to a multiple of 4
+ const int lenToAlloc = (lenToAllocRaw + (4 - 1)) & ~(4 - 1);
+
+ freelistAllocs.increment();
+ DiskLoc loc;
+ DeletedRecord* dr = NULL;
+ {
+ int myBucket;
+ for (myBucket = bucket(lenToAlloc); myBucket < Buckets; myBucket++) {
+ // Only look at the first entry in each bucket. This works because we are either
+ // quantizing or allocating fixed-size blocks.
+ const DiskLoc head = _details->deletedListEntry(myBucket);
+ if (head.isNull())
+ continue;
+ DeletedRecord* const candidate = drec(head);
+ if (candidate->lengthWithHeaders() >= lenToAlloc) {
+ loc = head;
+ dr = candidate;
+ break;
}
}
- // align size up to a multiple of 4
- const int lenToAlloc = (lenToAllocRaw + (4-1)) & ~(4-1);
+ if (!dr)
+ return DiskLoc(); // no space
- freelistAllocs.increment();
- DiskLoc loc;
- DeletedRecord* dr = NULL;
- {
-
- int myBucket;
- for (myBucket = bucket(lenToAlloc); myBucket < Buckets; myBucket++) {
- // Only look at the first entry in each bucket. This works because we are either
- // quantizing or allocating fixed-size blocks.
- const DiskLoc head = _details->deletedListEntry(myBucket);
- if (head.isNull()) continue;
- DeletedRecord* const candidate = drec(head);
- if (candidate->lengthWithHeaders() >= lenToAlloc) {
- loc = head;
- dr = candidate;
- break;
- }
- }
-
- if (!dr)
- return DiskLoc(); // no space
-
- // Unlink ourself from the deleted list
- _details->setDeletedListEntry(txn, myBucket, dr->nextDeleted());
- *txn->recoveryUnit()->writing(&dr->nextDeleted()) = DiskLoc().setInvalid(); // defensive
- }
+        // Unlink ourselves from the deleted list
+ _details->setDeletedListEntry(txn, myBucket, dr->nextDeleted());
+ *txn->recoveryUnit()->writing(&dr->nextDeleted()) = DiskLoc().setInvalid(); // defensive
+ }
- invariant( dr->extentOfs() < loc.getOfs() );
+ invariant(dr->extentOfs() < loc.getOfs());
- // Split the deleted record if it has at least as much left over space as our smallest
- // allocation size. Otherwise, just take the whole DeletedRecord.
- const int remainingLength = dr->lengthWithHeaders() - lenToAlloc;
- if (remainingLength >= bucketSizes[0]) {
- txn->recoveryUnit()->writingInt(dr->lengthWithHeaders()) = lenToAlloc;
- const DiskLoc newDelLoc = DiskLoc(loc.a(), loc.getOfs() + lenToAlloc);
- DeletedRecord* newDel = txn->recoveryUnit()->writing(drec(newDelLoc));
- newDel->extentOfs() = dr->extentOfs();
- newDel->lengthWithHeaders() = remainingLength;
- newDel->nextDeleted().Null();
+ // Split the deleted record if it has at least as much left over space as our smallest
+ // allocation size. Otherwise, just take the whole DeletedRecord.
+ const int remainingLength = dr->lengthWithHeaders() - lenToAlloc;
+ if (remainingLength >= bucketSizes[0]) {
+ txn->recoveryUnit()->writingInt(dr->lengthWithHeaders()) = lenToAlloc;
+ const DiskLoc newDelLoc = DiskLoc(loc.a(), loc.getOfs() + lenToAlloc);
+ DeletedRecord* newDel = txn->recoveryUnit()->writing(drec(newDelLoc));
+ newDel->extentOfs() = dr->extentOfs();
+ newDel->lengthWithHeaders() = remainingLength;
+ newDel->nextDeleted().Null();
- addDeletedRec(txn, newDelLoc);
- }
-
- return loc;
+ addDeletedRec(txn, newDelLoc);
}
- StatusWith<DiskLoc> SimpleRecordStoreV1::allocRecord( OperationContext* txn,
- int lengthWithHeaders,
- bool enforceQuota ) {
- if (lengthWithHeaders > MaxAllowedAllocation) {
- return StatusWith<DiskLoc>(
- ErrorCodes::InvalidLength,
- str::stream() << "Attempting to allocate a record larger than maximum size: "
- << lengthWithHeaders << " > 16.5MB");
- }
+ return loc;
+}
- DiskLoc loc = _allocFromExistingExtents( txn, lengthWithHeaders );
- if ( !loc.isNull() )
- return StatusWith<DiskLoc>( loc );
+StatusWith<DiskLoc> SimpleRecordStoreV1::allocRecord(OperationContext* txn,
+ int lengthWithHeaders,
+ bool enforceQuota) {
+ if (lengthWithHeaders > MaxAllowedAllocation) {
+ return StatusWith<DiskLoc>(
+ ErrorCodes::InvalidLength,
+ str::stream() << "Attempting to allocate a record larger than maximum size: "
+ << lengthWithHeaders << " > 16.5MB");
+ }
- LOG(1) << "allocating new extent";
+ DiskLoc loc = _allocFromExistingExtents(txn, lengthWithHeaders);
+ if (!loc.isNull())
+ return StatusWith<DiskLoc>(loc);
- increaseStorageSize( txn,
- _extentManager->followupSize( lengthWithHeaders,
- _details->lastExtentSize(txn)),
- enforceQuota );
+ LOG(1) << "allocating new extent";
- loc = _allocFromExistingExtents( txn, lengthWithHeaders );
- if ( !loc.isNull() ) {
- // got on first try
- return StatusWith<DiskLoc>( loc );
- }
+ increaseStorageSize(
+ txn,
+ _extentManager->followupSize(lengthWithHeaders, _details->lastExtentSize(txn)),
+ enforceQuota);
- log() << "warning: alloc() failed after allocating new extent. "
- << "lengthWithHeaders: " << lengthWithHeaders << " last extent size:"
- << _details->lastExtentSize(txn) << "; trying again";
+ loc = _allocFromExistingExtents(txn, lengthWithHeaders);
+ if (!loc.isNull()) {
+ // got on first try
+ return StatusWith<DiskLoc>(loc);
+ }
- for ( int z = 0; z < 10 && lengthWithHeaders > _details->lastExtentSize(txn); z++ ) {
- log() << "try #" << z << endl;
+ log() << "warning: alloc() failed after allocating new extent. "
+ << "lengthWithHeaders: " << lengthWithHeaders
+ << " last extent size:" << _details->lastExtentSize(txn) << "; trying again";
- increaseStorageSize( txn,
- _extentManager->followupSize( lengthWithHeaders,
- _details->lastExtentSize(txn)),
- enforceQuota );
+ for (int z = 0; z < 10 && lengthWithHeaders > _details->lastExtentSize(txn); z++) {
+ log() << "try #" << z << endl;
- loc = _allocFromExistingExtents( txn, lengthWithHeaders );
- if ( ! loc.isNull() )
- return StatusWith<DiskLoc>( loc );
- }
+ increaseStorageSize(
+ txn,
+ _extentManager->followupSize(lengthWithHeaders, _details->lastExtentSize(txn)),
+ enforceQuota);
- return StatusWith<DiskLoc>( ErrorCodes::InternalError, "cannot allocate space" );
+ loc = _allocFromExistingExtents(txn, lengthWithHeaders);
+ if (!loc.isNull())
+ return StatusWith<DiskLoc>(loc);
}
- Status SimpleRecordStoreV1::truncate(OperationContext* txn) {
- const DiskLoc firstExtLoc = _details->firstExtent(txn);
- if (firstExtLoc.isNull() || !firstExtLoc.isValid()) {
- // Already empty
- return Status::OK();
- }
-
- // Free all extents except the first.
- Extent* firstExt = _extentManager->getExtent(firstExtLoc);
- if (!firstExt->xnext.isNull()) {
- const DiskLoc extNextLoc = firstExt->xnext;
- const DiskLoc oldLastExtLoc = _details->lastExtent(txn);
- Extent* const nextExt = _extentManager->getExtent(extNextLoc);
+ return StatusWith<DiskLoc>(ErrorCodes::InternalError, "cannot allocate space");
+}
- // Unlink other extents;
- *txn->recoveryUnit()->writing(&nextExt->xprev) = DiskLoc();
- *txn->recoveryUnit()->writing(&firstExt->xnext) = DiskLoc();
- _details->setLastExtent(txn, firstExtLoc);
- _details->setLastExtentSize(txn, firstExt->length);
+Status SimpleRecordStoreV1::truncate(OperationContext* txn) {
+ const DiskLoc firstExtLoc = _details->firstExtent(txn);
+ if (firstExtLoc.isNull() || !firstExtLoc.isValid()) {
+ // Already empty
+ return Status::OK();
+ }
- _extentManager->freeExtents(txn, extNextLoc, oldLastExtLoc);
- }
+ // Free all extents except the first.
+ Extent* firstExt = _extentManager->getExtent(firstExtLoc);
+ if (!firstExt->xnext.isNull()) {
+ const DiskLoc extNextLoc = firstExt->xnext;
+ const DiskLoc oldLastExtLoc = _details->lastExtent(txn);
+ Extent* const nextExt = _extentManager->getExtent(extNextLoc);
- // Make the first (now only) extent a single large deleted record.
- *txn->recoveryUnit()->writing(&firstExt->firstRecord) = DiskLoc();
- *txn->recoveryUnit()->writing(&firstExt->lastRecord) = DiskLoc();
- _details->orphanDeletedList(txn);
- addDeletedRec(txn, _findFirstSpot(txn, firstExtLoc, firstExt));
+        // Unlink the other extents.
+ *txn->recoveryUnit()->writing(&nextExt->xprev) = DiskLoc();
+ *txn->recoveryUnit()->writing(&firstExt->xnext) = DiskLoc();
+ _details->setLastExtent(txn, firstExtLoc);
+ _details->setLastExtentSize(txn, firstExt->length);
- // Make stats reflect that there are now no documents in this record store.
- _details->setStats(txn, 0, 0);
-
- return Status::OK();
+ _extentManager->freeExtents(txn, extNextLoc, oldLastExtLoc);
}
- void SimpleRecordStoreV1::addDeletedRec( OperationContext* txn, const DiskLoc& dloc ) {
- DeletedRecord* d = drec( dloc );
+ // Make the first (now only) extent a single large deleted record.
+ *txn->recoveryUnit()->writing(&firstExt->firstRecord) = DiskLoc();
+ *txn->recoveryUnit()->writing(&firstExt->lastRecord) = DiskLoc();
+ _details->orphanDeletedList(txn);
+ addDeletedRec(txn, _findFirstSpot(txn, firstExtLoc, firstExt));
- int b = bucket(d->lengthWithHeaders());
- *txn->recoveryUnit()->writing(&d->nextDeleted()) = _details->deletedListEntry(b);
- _details->setDeletedListEntry(txn, b, dloc);
- }
+ // Make stats reflect that there are now no documents in this record store.
+ _details->setStats(txn, 0, 0);
- std::unique_ptr<RecordCursor> SimpleRecordStoreV1::getCursor(OperationContext* txn,
- bool forward) const {
- return stdx::make_unique<SimpleRecordStoreV1Iterator>( txn, this, forward );
- }
+ return Status::OK();
+}
- vector<std::unique_ptr<RecordCursor>> SimpleRecordStoreV1::getManyCursors(
- OperationContext* txn) const {
- vector<std::unique_ptr<RecordCursor>> cursors;
- const Extent* ext;
- for (DiskLoc extLoc = details()->firstExtent(txn); !extLoc.isNull(); extLoc = ext->xnext) {
- ext = _getExtent(txn, extLoc);
- if (ext->firstRecord.isNull())
- continue;
- cursors.push_back(
- stdx::make_unique<RecordStoreV1Base::IntraExtentIterator>(txn,
- ext->firstRecord,
- this));
- }
+void SimpleRecordStoreV1::addDeletedRec(OperationContext* txn, const DiskLoc& dloc) {
+ DeletedRecord* d = drec(dloc);
- return cursors;
- }
+ int b = bucket(d->lengthWithHeaders());
+ *txn->recoveryUnit()->writing(&d->nextDeleted()) = _details->deletedListEntry(b);
+ _details->setDeletedListEntry(txn, b, dloc);
+}
- class CompactDocWriter : public DocWriter {
- public:
- /**
- * param allocationSize - allocation size WITH header
- */
- CompactDocWriter( const MmapV1RecordHeader* rec, unsigned dataSize, size_t allocationSize )
- : _rec( rec ),
- _dataSize( dataSize ),
- _allocationSize( allocationSize ) {
- }
+std::unique_ptr<RecordCursor> SimpleRecordStoreV1::getCursor(OperationContext* txn,
+ bool forward) const {
+ return stdx::make_unique<SimpleRecordStoreV1Iterator>(txn, this, forward);
+}
- virtual ~CompactDocWriter() {}
+vector<std::unique_ptr<RecordCursor>> SimpleRecordStoreV1::getManyCursors(
+ OperationContext* txn) const {
+ vector<std::unique_ptr<RecordCursor>> cursors;
+ const Extent* ext;
+ for (DiskLoc extLoc = details()->firstExtent(txn); !extLoc.isNull(); extLoc = ext->xnext) {
+ ext = _getExtent(txn, extLoc);
+ if (ext->firstRecord.isNull())
+ continue;
+ cursors.push_back(
+ stdx::make_unique<RecordStoreV1Base::IntraExtentIterator>(txn, ext->firstRecord, this));
+ }
- virtual void writeDocument( char* buf ) const {
- memcpy( buf, _rec->data(), _dataSize );
- }
+ return cursors;
+}
- virtual size_t documentSize() const {
- return _allocationSize - MmapV1RecordHeader::HeaderSize;
- }
+class CompactDocWriter : public DocWriter {
+public:
+ /**
+ * param allocationSize - allocation size WITH header
+ */
+ CompactDocWriter(const MmapV1RecordHeader* rec, unsigned dataSize, size_t allocationSize)
+ : _rec(rec), _dataSize(dataSize), _allocationSize(allocationSize) {}
- virtual bool addPadding() const {
- return false;
- }
+ virtual ~CompactDocWriter() {}
- private:
- const MmapV1RecordHeader* _rec;
- size_t _dataSize;
- size_t _allocationSize;
- };
+ virtual void writeDocument(char* buf) const {
+ memcpy(buf, _rec->data(), _dataSize);
+ }
- void SimpleRecordStoreV1::_compactExtent(OperationContext* txn,
- const DiskLoc extentLoc,
- int extentNumber,
- RecordStoreCompactAdaptor* adaptor,
- const CompactOptions* compactOptions,
- CompactStats* stats ) {
+ virtual size_t documentSize() const {
+ return _allocationSize - MmapV1RecordHeader::HeaderSize;
+ }
- log() << "compact begin extent #" << extentNumber
- << " for namespace " << _ns << " " << extentLoc;
+ virtual bool addPadding() const {
+ return false;
+ }
- unsigned oldObjSize = 0; // we'll report what the old padding was
- unsigned oldObjSizeWithPadding = 0;
+private:
+ const MmapV1RecordHeader* _rec;
+ size_t _dataSize;
+ size_t _allocationSize;
+};
- Extent* const sourceExtent = _extentManager->getExtent( extentLoc );
- sourceExtent->assertOk();
- fassert( 17437, sourceExtent->validates(extentLoc) );
+void SimpleRecordStoreV1::_compactExtent(OperationContext* txn,
+ const DiskLoc extentLoc,
+ int extentNumber,
+ RecordStoreCompactAdaptor* adaptor,
+ const CompactOptions* compactOptions,
+ CompactStats* stats) {
+ log() << "compact begin extent #" << extentNumber << " for namespace " << _ns << " "
+ << extentLoc;
+
+ unsigned oldObjSize = 0; // we'll report what the old padding was
+ unsigned oldObjSizeWithPadding = 0;
+
+ Extent* const sourceExtent = _extentManager->getExtent(extentLoc);
+ sourceExtent->assertOk();
+ fassert(17437, sourceExtent->validates(extentLoc));
+
+ {
+ // The next/prev MmapV1RecordHeader pointers within the Extent might not be in order so we first
+ // page in the whole Extent sequentially.
+ // TODO benchmark on slow storage to verify this is measurably faster.
+ log() << "compact paging in len=" << sourceExtent->length / 1000000.0 << "MB" << endl;
+ Timer t;
+ size_t length = sourceExtent->length;
+
+ touch_pages(reinterpret_cast<const char*>(sourceExtent), length);
+ int ms = t.millis();
+ if (ms > 1000)
+ log() << "compact end paging in " << ms << "ms "
+ << sourceExtent->length / 1000000.0 / t.seconds() << "MB/sec" << endl;
+ }
- {
- // The next/prev MmapV1RecordHeader pointers within the Extent might not be in order so we first
- // page in the whole Extent sequentially.
- // TODO benchmark on slow storage to verify this is measurably faster.
- log() << "compact paging in len=" << sourceExtent->length/1000000.0 << "MB" << endl;
- Timer t;
- size_t length = sourceExtent->length;
-
- touch_pages( reinterpret_cast<const char*>(sourceExtent), length );
- int ms = t.millis();
- if( ms > 1000 )
- log() << "compact end paging in " << ms << "ms "
- << sourceExtent->length/1000000.0/t.seconds() << "MB/sec" << endl;
- }
+ {
+        // Move each MmapV1RecordHeader out of this extent and insert it into the "new" extents.
+ log() << "compact copying records" << endl;
+ long long totalNetSize = 0;
+ long long nrecords = 0;
+ DiskLoc nextSourceLoc = sourceExtent->firstRecord;
+ while (!nextSourceLoc.isNull()) {
+ txn->checkForInterrupt();
- {
- // Move each MmapV1RecordHeader out of this extent and insert it in to the "new" extents.
- log() << "compact copying records" << endl;
- long long totalNetSize = 0;
- long long nrecords = 0;
- DiskLoc nextSourceLoc = sourceExtent->firstRecord;
- while (!nextSourceLoc.isNull()) {
- txn->checkForInterrupt();
-
- WriteUnitOfWork wunit(txn);
- MmapV1RecordHeader* recOld = recordFor(nextSourceLoc);
- RecordData oldData = recOld->toRecordData();
- nextSourceLoc = getNextRecordInExtent(txn, nextSourceLoc);
-
- if ( compactOptions->validateDocuments && !adaptor->isDataValid( oldData ) ) {
- // object is corrupt!
- log() << "compact removing corrupt document!";
- stats->corruptDocuments++;
- }
- else {
- // How much data is in the record. Excludes padding and MmapV1RecordHeader headers.
- const unsigned rawDataSize = adaptor->dataSize( oldData );
-
- nrecords++;
- oldObjSize += rawDataSize;
- oldObjSizeWithPadding += recOld->netLength();
-
- // Allocation sizes include the headers and possibly some padding.
- const unsigned minAllocationSize = rawDataSize + MmapV1RecordHeader::HeaderSize;
- unsigned allocationSize = minAllocationSize;
- switch( compactOptions->paddingMode ) {
- case CompactOptions::NONE: // default padding
+ WriteUnitOfWork wunit(txn);
+ MmapV1RecordHeader* recOld = recordFor(nextSourceLoc);
+ RecordData oldData = recOld->toRecordData();
+ nextSourceLoc = getNextRecordInExtent(txn, nextSourceLoc);
+
+ if (compactOptions->validateDocuments && !adaptor->isDataValid(oldData)) {
+ // object is corrupt!
+ log() << "compact removing corrupt document!";
+ stats->corruptDocuments++;
+ } else {
+ // How much data is in the record. Excludes padding and MmapV1RecordHeader headers.
+ const unsigned rawDataSize = adaptor->dataSize(oldData);
+
+ nrecords++;
+ oldObjSize += rawDataSize;
+ oldObjSizeWithPadding += recOld->netLength();
+
+ // Allocation sizes include the headers and possibly some padding.
+ const unsigned minAllocationSize = rawDataSize + MmapV1RecordHeader::HeaderSize;
+ unsigned allocationSize = minAllocationSize;
+ switch (compactOptions->paddingMode) {
+ case CompactOptions::NONE: // default padding
if (shouldPadInserts()) {
allocationSize = quantizeAllocationSpace(minAllocationSize);
}
break;
- case CompactOptions::PRESERVE: // keep original padding
+ case CompactOptions::PRESERVE: // keep original padding
allocationSize = recOld->lengthWithHeaders();
break;
- case CompactOptions::MANUAL: // user specified how much padding to use
+ case CompactOptions::MANUAL: // user specified how much padding to use
allocationSize = compactOptions->computeRecordSize(minAllocationSize);
- if (allocationSize < minAllocationSize
- || allocationSize > BSONObjMaxUserSize / 2 ) {
+ if (allocationSize < minAllocationSize ||
+ allocationSize > BSONObjMaxUserSize / 2) {
allocationSize = minAllocationSize;
}
break;
- }
- invariant(allocationSize >= minAllocationSize);
-
- // Copy the data to a new record. Because we orphaned the record freelist at the
- // start of the compact, this insert will allocate a record in a new extent.
- // See the comment in compact() for more details.
- CompactDocWriter writer( recOld, rawDataSize, allocationSize );
- StatusWith<RecordId> status = insertRecord( txn, &writer, false );
- uassertStatusOK( status.getStatus() );
- const MmapV1RecordHeader* newRec = recordFor(DiskLoc::fromRecordId(status.getValue()));
- invariant(unsigned(newRec->netLength()) >= rawDataSize);
- totalNetSize += newRec->netLength();
-
- // Tells the caller that the record has been moved, so it can do things such as
- // add it to indexes.
- adaptor->inserted(newRec->toRecordData(), status.getValue());
- }
-
- // Remove the old record from the linked list of records withing the sourceExtent.
- // The old record is not added to the freelist as we will be freeing the whole
- // extent at the end.
- *txn->recoveryUnit()->writing(&sourceExtent->firstRecord) = nextSourceLoc;
- if (nextSourceLoc.isNull()) {
- // Just moved the last record out of the extent. Mark extent as empty.
- *txn->recoveryUnit()->writing(&sourceExtent->lastRecord) = DiskLoc();
}
- else {
- MmapV1RecordHeader* newFirstRecord = recordFor(nextSourceLoc);
- txn->recoveryUnit()->writingInt(newFirstRecord->prevOfs()) = DiskLoc::NullOfs;
- }
-
- // Adjust the stats to reflect the removal of the old record. The insert above
- // handled adjusting the stats for the new record.
- _details->incrementStats(txn, -(recOld->netLength()), -1);
-
- wunit.commit();
+ invariant(allocationSize >= minAllocationSize);
+
+ // Copy the data to a new record. Because we orphaned the record freelist at the
+ // start of the compact, this insert will allocate a record in a new extent.
+ // See the comment in compact() for more details.
+ CompactDocWriter writer(recOld, rawDataSize, allocationSize);
+ StatusWith<RecordId> status = insertRecord(txn, &writer, false);
+ uassertStatusOK(status.getStatus());
+ const MmapV1RecordHeader* newRec =
+ recordFor(DiskLoc::fromRecordId(status.getValue()));
+ invariant(unsigned(newRec->netLength()) >= rawDataSize);
+ totalNetSize += newRec->netLength();
+
+ // Tells the caller that the record has been moved, so it can do things such as
+ // add it to indexes.
+ adaptor->inserted(newRec->toRecordData(), status.getValue());
}
- // The extent must now be empty.
- invariant(sourceExtent->firstRecord.isNull());
- invariant(sourceExtent->lastRecord.isNull());
+            // Remove the old record from the linked list of records within the sourceExtent.
+ // The old record is not added to the freelist as we will be freeing the whole
+ // extent at the end.
+ *txn->recoveryUnit()->writing(&sourceExtent->firstRecord) = nextSourceLoc;
+ if (nextSourceLoc.isNull()) {
+ // Just moved the last record out of the extent. Mark extent as empty.
+ *txn->recoveryUnit()->writing(&sourceExtent->lastRecord) = DiskLoc();
+ } else {
+ MmapV1RecordHeader* newFirstRecord = recordFor(nextSourceLoc);
+ txn->recoveryUnit()->writingInt(newFirstRecord->prevOfs()) = DiskLoc::NullOfs;
+ }
- // We are still the first extent, but we must not be the only extent.
- invariant( _details->firstExtent(txn) == extentLoc );
- invariant( _details->lastExtent(txn) != extentLoc );
+ // Adjust the stats to reflect the removal of the old record. The insert above
+ // handled adjusting the stats for the new record.
+ _details->incrementStats(txn, -(recOld->netLength()), -1);
- // Remove the newly emptied sourceExtent from the extent linked list and return it to
- // the extent manager.
- WriteUnitOfWork wunit(txn);
- const DiskLoc newFirst = sourceExtent->xnext;
- _details->setFirstExtent( txn, newFirst );
- *txn->recoveryUnit()->writing(&_extentManager->getExtent( newFirst )->xprev) = DiskLoc();
- _extentManager->freeExtent( txn, extentLoc );
wunit.commit();
-
- {
- const double oldPadding = oldObjSize ? double(oldObjSizeWithPadding) / oldObjSize
- : 1.0; // defining 0/0 as 1 for this.
-
- log() << "compact finished extent #" << extentNumber << " containing " << nrecords
- << " documents (" << totalNetSize / (1024*1024.0) << "MB)"
- << " oldPadding: " << oldPadding;
- }
}
- }
+ // The extent must now be empty.
+ invariant(sourceExtent->firstRecord.isNull());
+ invariant(sourceExtent->lastRecord.isNull());
- Status SimpleRecordStoreV1::compact( OperationContext* txn,
- RecordStoreCompactAdaptor* adaptor,
- const CompactOptions* options,
- CompactStats* stats ) {
-
- std::vector<DiskLoc> extents;
- for( DiskLoc extLocation = _details->firstExtent(txn);
- !extLocation.isNull();
- extLocation = _extentManager->getExtent( extLocation )->xnext ) {
- extents.push_back( extLocation );
- }
- log() << "compact " << extents.size() << " extents";
+ // We are still the first extent, but we must not be the only extent.
+ invariant(_details->firstExtent(txn) == extentLoc);
+ invariant(_details->lastExtent(txn) != extentLoc);
- {
- WriteUnitOfWork wunit(txn);
- // Orphaning the deleted lists ensures that all inserts go to new extents rather than
- // the ones that existed before starting the compact. If we abort the operation before
- // completion, any free space in the old extents will be leaked and never reused unless
- // the collection is compacted again or dropped. This is considered an acceptable
- // failure mode as no data will be lost.
- log() << "compact orphan deleted lists" << endl;
- _details->orphanDeletedList(txn);
-
- // Start over from scratch with our extent sizing and growth
- _details->setLastExtentSize( txn, 0 );
-
- // create a new extent so new records go there
- increaseStorageSize( txn, _details->lastExtentSize(txn), true );
- wunit.commit();
- }
+ // Remove the newly emptied sourceExtent from the extent linked list and return it to
+ // the extent manager.
+ WriteUnitOfWork wunit(txn);
+ const DiskLoc newFirst = sourceExtent->xnext;
+ _details->setFirstExtent(txn, newFirst);
+ *txn->recoveryUnit()->writing(&_extentManager->getExtent(newFirst)->xprev) = DiskLoc();
+ _extentManager->freeExtent(txn, extentLoc);
+ wunit.commit();
- stdx::unique_lock<Client> lk(*txn->getClient());
- ProgressMeterHolder pm(*txn->setMessage_inlock("compact extent",
- "Extent Compacting Progress",
- extents.size()));
- lk.unlock();
+ {
+ const double oldPadding = oldObjSize ? double(oldObjSizeWithPadding) / oldObjSize
+ : 1.0; // defining 0/0 as 1 for this.
- // Go through all old extents and move each record to a new set of extents.
- int extentNumber = 0;
- for( std::vector<DiskLoc>::iterator it = extents.begin(); it != extents.end(); it++ ) {
- txn->checkForInterrupt();
- invariant(_details->firstExtent(txn) == *it);
- // empties and removes the first extent
- _compactExtent(txn, *it, extentNumber++, adaptor, options, stats );
- invariant(_details->firstExtent(txn) != *it);
- pm.hit();
+ log() << "compact finished extent #" << extentNumber << " containing " << nrecords
+ << " documents (" << totalNetSize / (1024 * 1024.0) << "MB)"
+ << " oldPadding: " << oldPadding;
}
+ }
+}
- invariant( _extentManager->getExtent( _details->firstExtent(txn) )->xprev.isNull() );
- invariant( _extentManager->getExtent( _details->lastExtent(txn) )->xnext.isNull() );
+Status SimpleRecordStoreV1::compact(OperationContext* txn,
+ RecordStoreCompactAdaptor* adaptor,
+ const CompactOptions* options,
+ CompactStats* stats) {
+ std::vector<DiskLoc> extents;
+ for (DiskLoc extLocation = _details->firstExtent(txn); !extLocation.isNull();
+ extLocation = _extentManager->getExtent(extLocation)->xnext) {
+ extents.push_back(extLocation);
+ }
+ log() << "compact " << extents.size() << " extents";
+
+ {
+ WriteUnitOfWork wunit(txn);
+ // Orphaning the deleted lists ensures that all inserts go to new extents rather than
+ // the ones that existed before starting the compact. If we abort the operation before
+ // completion, any free space in the old extents will be leaked and never reused unless
+ // the collection is compacted again or dropped. This is considered an acceptable
+ // failure mode as no data will be lost.
+ log() << "compact orphan deleted lists" << endl;
+ _details->orphanDeletedList(txn);
- // indexes will do their own progress meter
- pm.finished();
+ // Start over from scratch with our extent sizing and growth
+ _details->setLastExtentSize(txn, 0);
- return Status::OK();
+ // create a new extent so new records go there
+ increaseStorageSize(txn, _details->lastExtentSize(txn), true);
+ wunit.commit();
}
+ stdx::unique_lock<Client> lk(*txn->getClient());
+ ProgressMeterHolder pm(
+ *txn->setMessage_inlock("compact extent", "Extent Compacting Progress", extents.size()));
+ lk.unlock();
+
+ // Go through all old extents and move each record to a new set of extents.
+ int extentNumber = 0;
+ for (std::vector<DiskLoc>::iterator it = extents.begin(); it != extents.end(); it++) {
+ txn->checkForInterrupt();
+ invariant(_details->firstExtent(txn) == *it);
+ // empties and removes the first extent
+ _compactExtent(txn, *it, extentNumber++, adaptor, options, stats);
+ invariant(_details->firstExtent(txn) != *it);
+ pm.hit();
+ }
+
+ invariant(_extentManager->getExtent(_details->firstExtent(txn))->xprev.isNull());
+ invariant(_extentManager->getExtent(_details->lastExtent(txn))->xnext.isNull());
+
+ // indexes will do their own progress meter
+ pm.finished();
+
+ return Status::OK();
+}
}
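
Two small arithmetic rules drive _allocFromExistingExtents() above: the requested length is rounded up to a 4-byte boundary with (len + 3) & ~3, and a deleted record is split only when the leftover piece is at least as large as the smallest bucket size (32 bytes, judging by the quantization tests later in this diff, where a 1-byte request quantizes to 32 and a 31-byte leftover is not split while a 32-byte one is). A self-contained sketch of both rules, with kSmallestBucket standing in for bucketSizes[0]:

    #include <cstdio>

    static const int kSmallestBucket = 32;  // stand-in for bucketSizes[0]

    // Round a raw length up to the next multiple of 4, as the allocator does.
    int alignTo4(int lenRaw) {
        return (lenRaw + (4 - 1)) & ~(4 - 1);
    }

    // A deleted record is split only if the leftover is itself usable.
    bool shouldSplit(int deletedRecLen, int lenToAlloc) {
        return deletedRecLen - lenToAlloc >= kSmallestBucket;
    }

    int main() {
        printf("%d\n", alignTo4(301));               // 304
        printf("%d\n", shouldSplit(512 + 32, 512));  // 1: 32-byte leftover is split off
        printf("%d\n", shouldSplit(512 + 31, 512));  // 0: 31 bytes stays as padding
        return 0;
    }
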
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple.h b/src/mongo/db/storage/mmap_v1/record_store_v1_simple.h
index a108305492a..9ab6ba86f78 100644
--- a/src/mongo/db/storage/mmap_v1/record_store_v1_simple.h
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_simple.h
@@ -36,65 +36,70 @@
namespace mongo {
- class SimpleRecordStoreV1Cursor;
-
- // used by index and original collections
- class SimpleRecordStoreV1 : public RecordStoreV1Base {
- public:
- SimpleRecordStoreV1( OperationContext* txn,
- StringData ns,
- RecordStoreV1MetaData* details,
- ExtentManager* em,
- bool isSystemIndexes );
-
- virtual ~SimpleRecordStoreV1();
-
- const char* name() const { return "SimpleRecordStoreV1"; }
-
- std::unique_ptr<RecordCursor> getCursor(OperationContext* txn, bool forward) const final;
-
- std::vector<std::unique_ptr<RecordCursor>> getManyCursors(
- OperationContext* txn) const final;
-
- virtual Status truncate(OperationContext* txn);
-
- virtual void temp_cappedTruncateAfter(OperationContext* txn, RecordId end, bool inclusive) {
- invariant(!"cappedTruncateAfter not supported");
- }
-
- virtual bool compactSupported() const { return true; }
- virtual bool compactsInPlace() const { return false; }
- virtual Status compact( OperationContext* txn,
- RecordStoreCompactAdaptor* adaptor,
- const CompactOptions* options,
- CompactStats* stats );
-
- protected:
- virtual bool isCapped() const { return false; }
- virtual bool shouldPadInserts() const {
- return !_details->isUserFlagSet(CollectionOptions::Flag_NoPadding);
- }
-
- virtual StatusWith<DiskLoc> allocRecord( OperationContext* txn,
- int lengthWithHeaders,
- bool enforceQuota );
-
- virtual void addDeletedRec(OperationContext* txn,
- const DiskLoc& dloc);
- private:
- DiskLoc _allocFromExistingExtents( OperationContext* txn,
- int lengthWithHeaders );
-
- void _compactExtent(OperationContext* txn,
- const DiskLoc diskloc,
- int extentNumber,
- RecordStoreCompactAdaptor* adaptor,
- const CompactOptions* compactOptions,
- CompactStats* stats );
-
- bool _normalCollection;
-
- friend class SimpleRecordStoreV1Iterator;
- };
-
+class SimpleRecordStoreV1Cursor;
+
+// used by index and original collections
+class SimpleRecordStoreV1 : public RecordStoreV1Base {
+public:
+ SimpleRecordStoreV1(OperationContext* txn,
+ StringData ns,
+ RecordStoreV1MetaData* details,
+ ExtentManager* em,
+ bool isSystemIndexes);
+
+ virtual ~SimpleRecordStoreV1();
+
+ const char* name() const {
+ return "SimpleRecordStoreV1";
+ }
+
+ std::unique_ptr<RecordCursor> getCursor(OperationContext* txn, bool forward) const final;
+
+ std::vector<std::unique_ptr<RecordCursor>> getManyCursors(OperationContext* txn) const final;
+
+ virtual Status truncate(OperationContext* txn);
+
+ virtual void temp_cappedTruncateAfter(OperationContext* txn, RecordId end, bool inclusive) {
+ invariant(!"cappedTruncateAfter not supported");
+ }
+
+ virtual bool compactSupported() const {
+ return true;
+ }
+ virtual bool compactsInPlace() const {
+ return false;
+ }
+ virtual Status compact(OperationContext* txn,
+ RecordStoreCompactAdaptor* adaptor,
+ const CompactOptions* options,
+ CompactStats* stats);
+
+protected:
+ virtual bool isCapped() const {
+ return false;
+ }
+ virtual bool shouldPadInserts() const {
+ return !_details->isUserFlagSet(CollectionOptions::Flag_NoPadding);
+ }
+
+ virtual StatusWith<DiskLoc> allocRecord(OperationContext* txn,
+ int lengthWithHeaders,
+ bool enforceQuota);
+
+ virtual void addDeletedRec(OperationContext* txn, const DiskLoc& dloc);
+
+private:
+ DiskLoc _allocFromExistingExtents(OperationContext* txn, int lengthWithHeaders);
+
+ void _compactExtent(OperationContext* txn,
+ const DiskLoc diskloc,
+ int extentNumber,
+ RecordStoreCompactAdaptor* adaptor,
+ const CompactOptions* compactOptions,
+ CompactStats* stats);
+
+ bool _normalCollection;
+
+ friend class SimpleRecordStoreV1Iterator;
+};
}
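
The compact() declared above feeds _compactExtent(), whose padding arithmetic (seen earlier in this diff) picks an allocation size per record from the CompactOptions padding mode. A standalone sketch of just that selection, under stated assumptions: the function and parameter names are illustrative, quantizedMinAlloc is assumed to be computed elsewhere by the store's quantizer (and to apply only when the store pads inserts), and maxUserSize stands in for BSONObjMaxUserSize.

    enum class PaddingMode { kNone, kPreserve, kManual };

    // minAlloc = raw data size + record header; oldLen = the record's previous
    // on-disk footprint; requestedSize = the user-supplied manual size.
    int chooseAllocationSize(PaddingMode mode, int minAlloc, int quantizedMinAlloc,
                             int oldLen, int requestedSize, int maxUserSize) {
        switch (mode) {
            case PaddingMode::kNone:       // default: quantized padding
                return quantizedMinAlloc;
            case PaddingMode::kPreserve:   // keep the record's original padding
                return oldLen;
            case PaddingMode::kManual:     // user-specified, within sanity bounds
                if (requestedSize < minAlloc || requestedSize > maxUserSize / 2)
                    return minAlloc;
                return requestedSize;
        }
        return minAlloc;  // unreachable; silences -Wreturn-type
    }
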
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.cpp
index ec1e51abe02..babfbcf26ea 100644
--- a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.cpp
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.cpp
@@ -35,100 +35,94 @@
namespace mongo {
- //
- // Regular / non-capped collection traversal
- //
-
- SimpleRecordStoreV1Iterator::SimpleRecordStoreV1Iterator(OperationContext* txn,
- const SimpleRecordStoreV1* collection,
- bool forward)
- : _txn(txn)
- , _recordStore(collection)
- , _forward(forward) {
-
- // Eagerly seek to first Record on creation since it is cheap.
- const ExtentManager* em = _recordStore->_extentManager;
- if ( _recordStore->details()->firstExtent(txn).isNull() ) {
- // nothing in the collection
- verify( _recordStore->details()->lastExtent(txn).isNull() );
+//
+// Regular / non-capped collection traversal
+//
+
+SimpleRecordStoreV1Iterator::SimpleRecordStoreV1Iterator(OperationContext* txn,
+ const SimpleRecordStoreV1* collection,
+ bool forward)
+ : _txn(txn), _recordStore(collection), _forward(forward) {
+ // Eagerly seek to first Record on creation since it is cheap.
+ const ExtentManager* em = _recordStore->_extentManager;
+ if (_recordStore->details()->firstExtent(txn).isNull()) {
+ // nothing in the collection
+ verify(_recordStore->details()->lastExtent(txn).isNull());
+ } else if (_forward) {
+ // Find a non-empty extent and start with the first record in it.
+ Extent* e = em->getExtent(_recordStore->details()->firstExtent(txn));
+
+ while (e->firstRecord.isNull() && !e->xnext.isNull()) {
+ e = em->getExtent(e->xnext);
}
- else if (_forward) {
- // Find a non-empty extent and start with the first record in it.
- Extent* e = em->getExtent( _recordStore->details()->firstExtent(txn) );
- while (e->firstRecord.isNull() && !e->xnext.isNull()) {
- e = em->getExtent( e->xnext );
- }
-
- // _curr may be set to DiskLoc() here if e->lastRecord isNull but there is no
- // valid e->xnext
- _curr = e->firstRecord;
- }
- else {
- // Walk backwards, skipping empty extents, and use the last record in the first
- // non-empty extent we see.
- Extent* e = em->getExtent( _recordStore->details()->lastExtent(txn) );
-
- // TODO ELABORATE
- // Does one of e->lastRecord.isNull(), e.firstRecord.isNull() imply the other?
- while (e->lastRecord.isNull() && !e->xprev.isNull()) {
- e = em->getExtent( e->xprev );
- }
-
- // _curr may be set to DiskLoc() here if e->lastRecord isNull but there is no
- // valid e->xprev
- _curr = e->lastRecord;
+        // _curr may be set to DiskLoc() here if e->firstRecord isNull but there is no
+        // valid e->xnext
+ _curr = e->firstRecord;
+ } else {
+ // Walk backwards, skipping empty extents, and use the last record in the first
+ // non-empty extent we see.
+ Extent* e = em->getExtent(_recordStore->details()->lastExtent(txn));
+
+ // TODO ELABORATE
+ // Does one of e->lastRecord.isNull(), e.firstRecord.isNull() imply the other?
+ while (e->lastRecord.isNull() && !e->xprev.isNull()) {
+ e = em->getExtent(e->xprev);
}
- }
- boost::optional<Record> SimpleRecordStoreV1Iterator::next() {
- if (isEOF()) return {};
- auto toReturn = _curr.toRecordId();
- advance();
- return {{toReturn, _recordStore->RecordStore::dataFor(_txn, toReturn)}};
+ // _curr may be set to DiskLoc() here if e->lastRecord isNull but there is no
+ // valid e->xprev
+ _curr = e->lastRecord;
}
+}
- boost::optional<Record> SimpleRecordStoreV1Iterator::seekExact(const RecordId& id) {
- _curr = DiskLoc::fromRecordId(id);
- advance();
- return {{id, _recordStore->RecordStore::dataFor(_txn, id)}};
- }
+boost::optional<Record> SimpleRecordStoreV1Iterator::next() {
+ if (isEOF())
+ return {};
+ auto toReturn = _curr.toRecordId();
+ advance();
+ return {{toReturn, _recordStore->RecordStore::dataFor(_txn, toReturn)}};
+}
- void SimpleRecordStoreV1Iterator::advance() {
- // Move to the next thing.
- if (!isEOF()) {
- if (_forward) {
- _curr = _recordStore->getNextRecord( _txn, _curr );
- }
- else {
- _curr = _recordStore->getPrevRecord( _txn, _curr );
- }
- }
- }
+boost::optional<Record> SimpleRecordStoreV1Iterator::seekExact(const RecordId& id) {
+ _curr = DiskLoc::fromRecordId(id);
+ advance();
+ return {{id, _recordStore->RecordStore::dataFor(_txn, id)}};
+}
- void SimpleRecordStoreV1Iterator::invalidate(const RecordId& dl) {
- // Just move past the thing being deleted.
- if (dl == _curr.toRecordId()) {
- advance();
+void SimpleRecordStoreV1Iterator::advance() {
+ // Move to the next thing.
+ if (!isEOF()) {
+ if (_forward) {
+ _curr = _recordStore->getNextRecord(_txn, _curr);
+ } else {
+ _curr = _recordStore->getPrevRecord(_txn, _curr);
}
}
+}
- void SimpleRecordStoreV1Iterator::savePositioned() {
- _txn = nullptr;
+void SimpleRecordStoreV1Iterator::invalidate(const RecordId& dl) {
+ // Just move past the thing being deleted.
+ if (dl == _curr.toRecordId()) {
+ advance();
}
+}
- bool SimpleRecordStoreV1Iterator::restore(OperationContext* txn) {
- _txn = txn;
- // if the collection is dropped, then the cursor should be destroyed
- return true;
- }
+void SimpleRecordStoreV1Iterator::savePositioned() {
+ _txn = nullptr;
+}
- std::unique_ptr<RecordFetcher> SimpleRecordStoreV1Iterator::fetcherForNext() const {
- return _recordStore->_extentManager->recordNeedsFetch(_curr);
- }
+bool SimpleRecordStoreV1Iterator::restore(OperationContext* txn) {
+ _txn = txn;
+ // if the collection is dropped, then the cursor should be destroyed
+ return true;
+}
- std::unique_ptr<RecordFetcher> SimpleRecordStoreV1Iterator::fetcherForId(
- const RecordId& id) const {
- return _recordStore->_extentManager->recordNeedsFetch(DiskLoc::fromRecordId(id));
- }
+std::unique_ptr<RecordFetcher> SimpleRecordStoreV1Iterator::fetcherForNext() const {
+ return _recordStore->_extentManager->recordNeedsFetch(_curr);
+}
+
+std::unique_ptr<RecordFetcher> SimpleRecordStoreV1Iterator::fetcherForId(const RecordId& id) const {
+ return _recordStore->_extentManager->recordNeedsFetch(DiskLoc::fromRecordId(id));
+}
}
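
The constructor above positions the cursor by walking the extent chain until it reaches an extent that actually owns a record, in the direction of the scan. A toy sketch of that walk over a singly linked extent list (illustrative struct, not the real Extent layout):

    struct ToyExtent {
        bool hasRecords;    // stands in for !firstRecord.isNull()
        ToyExtent* next;    // stands in for xnext (or xprev when scanning backward)
    };

    // Returns the first extent holding a record, or the tail of the chain if
    // every extent is empty -- in which case the caller's _curr stays null.
    ToyExtent* firstNonEmpty(ToyExtent* e) {
        while (!e->hasRecords && e->next != nullptr) {
            e = e->next;
        }
        return e;
    }
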
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h
index c19c0c386b3..91b0088bf72 100644
--- a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h
@@ -33,39 +33,41 @@
namespace mongo {
- class SimpleRecordStoreV1;
+class SimpleRecordStoreV1;
- /**
- * This class iterates over a non-capped collection identified by 'ns'.
- * The collection must exist when the constructor is called.
- *
- * If start is not DiskLoc(), the iteration begins at that DiskLoc.
- */
- class SimpleRecordStoreV1Iterator final : public RecordCursor {
- public:
- SimpleRecordStoreV1Iterator( OperationContext* txn,
- const SimpleRecordStoreV1* records,
- bool forward);
+/**
+ * This class iterates over a non-capped collection. The collection must exist when the
+ * constructor is called. Iteration starts at the first record of the first non-empty
+ * extent (forward scans) or the last record of the last non-empty extent (backward scans).
+ */
+class SimpleRecordStoreV1Iterator final : public RecordCursor {
+public:
+ SimpleRecordStoreV1Iterator(OperationContext* txn,
+ const SimpleRecordStoreV1* records,
+ bool forward);
- boost::optional<Record> next() final;
- boost::optional<Record> seekExact(const RecordId& id) final;
- void savePositioned() final;
- bool restore(OperationContext* txn) final;
- void invalidate(const RecordId& dl) final;
- std::unique_ptr<RecordFetcher> fetcherForNext() const final;
- std::unique_ptr<RecordFetcher> fetcherForId(const RecordId& id) const final;
+ boost::optional<Record> next() final;
+ boost::optional<Record> seekExact(const RecordId& id) final;
+ void savePositioned() final;
+ bool restore(OperationContext* txn) final;
+ void invalidate(const RecordId& dl) final;
+ std::unique_ptr<RecordFetcher> fetcherForNext() const final;
+ std::unique_ptr<RecordFetcher> fetcherForId(const RecordId& id) const final;
- private:
- void advance();
- bool isEOF() { return _curr.isNull(); }
+private:
+ void advance();
+ bool isEOF() {
+ return _curr.isNull();
+ }
- // for getNext, not owned
- OperationContext* _txn;
+ // for getNext, not owned
+ OperationContext* _txn;
- // The result returned on the next call to getNext().
- DiskLoc _curr;
- const SimpleRecordStoreV1* const _recordStore;
- const bool _forward;
- };
+ // The result returned on the next call to getNext().
+ DiskLoc _curr;
+ const SimpleRecordStoreV1* const _recordStore;
+ const bool _forward;
+};
} // namespace mongo
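
Given the interface above, a plain table scan is just a getCursor()/next() loop. A hedged usage sketch (txn and store are assumed to be constructed elsewhere; getCursor() is declared on SimpleRecordStoreV1 earlier in this diff):

    // Forward scan over a SimpleRecordStoreV1 via the RecordCursor interface.
    std::unique_ptr<RecordCursor> cursor = store->getCursor(txn, /*forward=*/true);
    while (boost::optional<Record> rec = cursor->next()) {
        // rec->id is the RecordId; rec->data is the RecordData payload.
    }
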
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_test.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_test.cpp
index 21ffdf6ef2b..e4e85168b01 100644
--- a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_test.cpp
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_test.cpp
@@ -40,501 +40,413 @@ using namespace mongo;
namespace {
- using std::string;
-
- TEST( SimpleRecordStoreV1, quantizeAllocationSpaceSimple ) {
- ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(33), 64);
- ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(1000), 1024);
- ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(10001), 16*1024);
- ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(100000), 128*1024);
- ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(1000001), 1024*1024);
- ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(10000000), 10*1024*1024);
- ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(14*1024*1024 - 1), 14*1024*1024);
- ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(14*1024*1024), 14*1024*1024);
- ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(14*1024*1024 + 1),
- 16*1024*1024 + 512*1024);
- ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(16*1024*1024 + 512*1024),
- 16*1024*1024 + 512*1024);
- }
+using std::string;
+
+TEST(SimpleRecordStoreV1, quantizeAllocationSpaceSimple) {
+ ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(33), 64);
+ ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(1000), 1024);
+ ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(10001), 16 * 1024);
+ ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(100000), 128 * 1024);
+ ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(1000001), 1024 * 1024);
+ ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(10000000), 10 * 1024 * 1024);
+ ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(14 * 1024 * 1024 - 1),
+ 14 * 1024 * 1024);
+ ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(14 * 1024 * 1024), 14 * 1024 * 1024);
+ ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(14 * 1024 * 1024 + 1),
+ 16 * 1024 * 1024 + 512 * 1024);
+ ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(16 * 1024 * 1024 + 512 * 1024),
+ 16 * 1024 * 1024 + 512 * 1024);
+}
- TEST( SimpleRecordStoreV1, quantizeAllocationMinMaxBound ) {
- const int maxSize = RecordStoreV1Base::MaxAllowedAllocation;
- ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(1), 32);
- ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(maxSize), maxSize);
- }
+TEST(SimpleRecordStoreV1, quantizeAllocationMinMaxBound) {
+ const int maxSize = RecordStoreV1Base::MaxAllowedAllocation;
+ ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(1), 32);
+ ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(maxSize), maxSize);
+}
- /**
- * Tests quantization of sizes around all valid bucket sizes.
- */
- TEST( SimpleRecordStoreV1, quantizeAroundBucketSizes ) {
- for (int bucket = 0; bucket < RecordStoreV1Base::Buckets - 2; bucket++) {
- const int size = RecordStoreV1Base::bucketSizes[bucket];
- const int nextSize = RecordStoreV1Base::bucketSizes[bucket + 1];
-
- // size - 1 is quantized to size.
- ASSERT_EQUALS( size,
- RecordStoreV1Base::quantizeAllocationSpace( size - 1 ) );
-
- // size is quantized to size.
- ASSERT_EQUALS( size,
- RecordStoreV1Base::quantizeAllocationSpace( size ) );
-
- // size + 1 is quantized to nextSize (if it is a valid allocation)
- if (size + 1 <= RecordStoreV1Base::MaxAllowedAllocation) {
- ASSERT_EQUALS( nextSize,
- RecordStoreV1Base::quantizeAllocationSpace( size + 1 ) );
- }
+/**
+ * Tests quantization of sizes around all valid bucket sizes.
+ */
+TEST(SimpleRecordStoreV1, quantizeAroundBucketSizes) {
+ for (int bucket = 0; bucket < RecordStoreV1Base::Buckets - 2; bucket++) {
+ const int size = RecordStoreV1Base::bucketSizes[bucket];
+ const int nextSize = RecordStoreV1Base::bucketSizes[bucket + 1];
+
+ // size - 1 is quantized to size.
+ ASSERT_EQUALS(size, RecordStoreV1Base::quantizeAllocationSpace(size - 1));
+
+ // size is quantized to size.
+ ASSERT_EQUALS(size, RecordStoreV1Base::quantizeAllocationSpace(size));
+
+ // size + 1 is quantized to nextSize (if it is a valid allocation)
+ if (size + 1 <= RecordStoreV1Base::MaxAllowedAllocation) {
+ ASSERT_EQUALS(nextSize, RecordStoreV1Base::quantizeAllocationSpace(size + 1));
}
}
+}
+
+BSONObj docForRecordSize(int size) {
+ BSONObjBuilder b;
+ b.append("_id", 5);
+ b.append("x", string(size - MmapV1RecordHeader::HeaderSize - 22, 'x'));
+ BSONObj x = b.obj();
+ ASSERT_EQUALS(MmapV1RecordHeader::HeaderSize + x.objsize(), size);
+ return x;
+}
+
+class BsonDocWriter : public DocWriter {
+public:
+ BsonDocWriter(const BSONObj& obj, bool padding) : _obj(obj), _padding(padding) {}
- BSONObj docForRecordSize( int size ) {
- BSONObjBuilder b;
- b.append( "_id", 5 );
- b.append( "x", string( size - MmapV1RecordHeader::HeaderSize - 22, 'x' ) );
- BSONObj x = b.obj();
- ASSERT_EQUALS( MmapV1RecordHeader::HeaderSize + x.objsize(), size );
- return x;
+ virtual void writeDocument(char* buf) const {
+ memcpy(buf, _obj.objdata(), _obj.objsize());
+ }
+ virtual size_t documentSize() const {
+ return _obj.objsize();
+ }
+ virtual bool addPadding() const {
+ return _padding;
}
- class BsonDocWriter : public DocWriter {
- public:
- BsonDocWriter(const BSONObj& obj, bool padding) : _obj(obj), _padding(padding) {}
+private:
+ BSONObj _obj;
+ bool _padding;
+};
- virtual void writeDocument(char* buf) const { memcpy(buf, _obj.objdata(), _obj.objsize()); }
- virtual size_t documentSize() const { return _obj.objsize(); }
- virtual bool addPadding() const { return _padding; }
+/** alloc() quantizes the requested size using quantizeAllocationSpace() rules. */
+TEST(SimpleRecordStoreV1, AllocQuantized) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
- private:
- BSONObj _obj;
- bool _padding;
- };
+ string myns = "test.AllocQuantized";
+ SimpleRecordStoreV1 rs(&txn, myns, md, &em, false);
- /** alloc() quantizes the requested size using quantizeAllocationSpace() rules. */
- TEST(SimpleRecordStoreV1, AllocQuantized) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
+ BSONObj obj = docForRecordSize(300);
+ StatusWith<RecordId> result = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false);
+ ASSERT(result.isOK());
- string myns = "test.AllocQuantized";
- SimpleRecordStoreV1 rs( &txn, myns, md, &em, false );
+ // The length of the allocated record is quantized.
+ ASSERT_EQUALS(512, rs.dataFor(&txn, result.getValue()).size() + MmapV1RecordHeader::HeaderSize);
+}
- BSONObj obj = docForRecordSize( 300 );
- StatusWith<RecordId> result = rs.insertRecord( &txn, obj.objdata(), obj.objsize(), false);
- ASSERT( result.isOK() );
+TEST(SimpleRecordStoreV1, AllocNonQuantized) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
+ md->setUserFlag(&txn, CollectionOptions::Flag_NoPadding);
- // The length of the allocated record is quantized.
- ASSERT_EQUALS( 512 , rs.dataFor( &txn, result.getValue() ).size() + MmapV1RecordHeader::HeaderSize );
- }
+ string myns = "test.AllocQuantized";
+ SimpleRecordStoreV1 rs(&txn, myns, md, &em, false);
- TEST(SimpleRecordStoreV1, AllocNonQuantized) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
- md->setUserFlag(&txn, CollectionOptions::Flag_NoPadding);
+ BSONObj obj = docForRecordSize(300);
+ StatusWith<RecordId> result = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false);
+ ASSERT(result.isOK());
- string myns = "test.AllocQuantized";
- SimpleRecordStoreV1 rs( &txn, myns, md, &em, false );
+    // The length of the allocated record is not quantized because padding is disabled.
+ ASSERT_EQUALS(300, rs.dataFor(&txn, result.getValue()).size() + MmapV1RecordHeader::HeaderSize);
+}
- BSONObj obj = docForRecordSize( 300 );
- StatusWith<RecordId> result = rs.insertRecord( &txn, obj.objdata(), obj.objsize(), false);
- ASSERT( result.isOK() );
+TEST(SimpleRecordStoreV1, AllocNonQuantizedStillAligned) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
+ md->setUserFlag(&txn, CollectionOptions::Flag_NoPadding);
- // The length of the allocated record is quantized.
- ASSERT_EQUALS( 300 , rs.dataFor( &txn, result.getValue() ).size() + MmapV1RecordHeader::HeaderSize );
- }
+ string myns = "test.AllocQuantized";
+ SimpleRecordStoreV1 rs(&txn, myns, md, &em, false);
- TEST(SimpleRecordStoreV1, AllocNonQuantizedStillAligned) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
- md->setUserFlag(&txn, CollectionOptions::Flag_NoPadding);
+ BSONObj obj = docForRecordSize(298);
+ StatusWith<RecordId> result = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false);
+ ASSERT(result.isOK());
- string myns = "test.AllocQuantized";
- SimpleRecordStoreV1 rs( &txn, myns, md, &em, false );
+    // The length of the allocated record is not quantized, only aligned up to 4 bytes.
+ ASSERT_EQUALS(300, rs.dataFor(&txn, result.getValue()).size() + MmapV1RecordHeader::HeaderSize);
+}
- BSONObj obj = docForRecordSize( 298 );
- StatusWith<RecordId> result = rs.insertRecord( &txn, obj.objdata(), obj.objsize(), false);
- ASSERT( result.isOK() );
+/** alloc() quantizes the requested size if DocWriter::addPadding() returns true. */
+TEST(SimpleRecordStoreV1, AllocQuantizedWithDocWriter) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
- // The length of the allocated record is quantized.
- ASSERT_EQUALS( 300 , rs.dataFor( &txn, result.getValue() ).size() + MmapV1RecordHeader::HeaderSize );
- }
+ string myns = "test.AllocQuantized";
+ SimpleRecordStoreV1 rs(&txn, myns, md, &em, false);
- /** alloc() quantizes the requested size if DocWriter::addPadding() returns true. */
- TEST(SimpleRecordStoreV1, AllocQuantizedWithDocWriter) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
+ BsonDocWriter docWriter(docForRecordSize(300), true);
+ StatusWith<RecordId> result = rs.insertRecord(&txn, &docWriter, false);
+ ASSERT(result.isOK());
- string myns = "test.AllocQuantized";
- SimpleRecordStoreV1 rs( &txn, myns, md, &em, false );
+ // The length of the allocated record is quantized.
+ ASSERT_EQUALS(512, rs.dataFor(&txn, result.getValue()).size() + MmapV1RecordHeader::HeaderSize);
+}
- BsonDocWriter docWriter(docForRecordSize( 300 ), true);
- StatusWith<RecordId> result = rs.insertRecord(&txn, &docWriter, false);
- ASSERT( result.isOK() );
+/**
+ * alloc() does not quantize records if DocWriter::addPadding() returns false
+ */
+TEST(SimpleRecordStoreV1, AllocNonQuantizedDocWriter) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
- // The length of the allocated record is quantized.
- ASSERT_EQUALS( 512 , rs.dataFor( &txn, result.getValue() ).size() + MmapV1RecordHeader::HeaderSize );
- }
+ string myns = "test.AllocIndexNamespaceNotQuantized";
+ SimpleRecordStoreV1 rs(&txn, myns + "$x", md, &em, false);
- /**
- * alloc() does not quantize records if DocWriter::addPadding() returns false
- */
- TEST(SimpleRecordStoreV1, AllocNonQuantizedDocWriter) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
+ BsonDocWriter docWriter(docForRecordSize(300), false);
+ StatusWith<RecordId> result = rs.insertRecord(&txn, &docWriter, false);
+ ASSERT(result.isOK());
- string myns = "test.AllocIndexNamespaceNotQuantized";
- SimpleRecordStoreV1 rs( &txn, myns + "$x", md, &em, false );
+ // The length of the allocated record is not quantized.
+ ASSERT_EQUALS(300, rs.dataFor(&txn, result.getValue()).size() + MmapV1RecordHeader::HeaderSize);
+}
+
+/** alloc() aligns record sizes up to 4 bytes even if DocWriter::addPadding returns false. */
+TEST(SimpleRecordStoreV1, AllocAlignedDocWriter) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
- BsonDocWriter docWriter(docForRecordSize( 300 ), false);
- StatusWith<RecordId> result = rs.insertRecord(&txn, &docWriter, false);
- ASSERT( result.isOK() );
+ string myns = "test.AllocIndexNamespaceNotQuantized";
+ SimpleRecordStoreV1 rs(&txn, myns + "$x", md, &em, false);
- // The length of the allocated record is not quantized.
- ASSERT_EQUALS( 300, rs.dataFor( &txn, result.getValue() ).size() + MmapV1RecordHeader::HeaderSize );
+ BsonDocWriter docWriter(docForRecordSize(298), false);
+ StatusWith<RecordId> result = rs.insertRecord(&txn, &docWriter, false);
+ ASSERT(result.isOK());
+ ASSERT_EQUALS(300, rs.dataFor(&txn, result.getValue()).size() + MmapV1RecordHeader::HeaderSize);
+}
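The test above pins down a second, weaker rounding rule: even unpadded records are sized to a 4-byte boundary, which is why a 298-byte request comes back as 300. A one-line sketch of that rule (the bit trick assumes a non-negative size):

    // Round up to the next multiple of 4: 298 -> 300, 300 -> 300.
    int alignToFourBytes(int bytes) {
        return (bytes + 3) & ~3;
    }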
+
+/**
+ * alloc() with quantized size doesn't split if not enough room is left over.
+ */
+TEST(SimpleRecordStoreV1, AllocUseQuantizedDeletedRecordWithoutSplit) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
+ SimpleRecordStoreV1 rs(&txn, "test.foo", md, &em, false);
+
+ {
+ LocAndSize drecs[] = {{DiskLoc(0, 1000), 512 + 31}, {}};
+ initializeV1RS(&txn, NULL, drecs, NULL, &em, md);
}
- /** alloc() aligns record sizes up to 4 bytes even if DocWriter::addPadding returns false. */
- TEST(SimpleRecordStoreV1, AllocAlignedDocWriter) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
+ BsonDocWriter docWriter(docForRecordSize(300), true);
+ StatusWith<RecordId> actualLocation = rs.insertRecord(&txn, &docWriter, false);
+ ASSERT_OK(actualLocation.getStatus());
- string myns = "test.AllocIndexNamespaceNotQuantized";
- SimpleRecordStoreV1 rs( &txn, myns + "$x", md, &em, false );
+ {
+ LocAndSize recs[] = {{DiskLoc(0, 1000), 512 + 31}, {}};
+ LocAndSize drecs[] = {{}};
+ assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
+ }
+}
- BsonDocWriter docWriter(docForRecordSize( 298 ), false);
- StatusWith<RecordId> result = rs.insertRecord(&txn, &docWriter, false);
- ASSERT( result.isOK() );
+/**
+ * alloc() with quantized size splits if enough room is left over.
+ */
+TEST(SimpleRecordStoreV1, AllocUseQuantizedDeletedRecordWithSplit) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
+ SimpleRecordStoreV1 rs(&txn, "test.foo", md, &em, false);
+
+ {
+ LocAndSize drecs[] = {{DiskLoc(0, 1000), 512 + 32}, {}};
+ initializeV1RS(&txn, NULL, drecs, NULL, &em, md);
+ }
+
+ BsonDocWriter docWriter(docForRecordSize(300), true);
+ StatusWith<RecordId> actualLocation = rs.insertRecord(&txn, &docWriter, false);
+ ASSERT_OK(actualLocation.getStatus());
- ASSERT_EQUALS( 300, rs.dataFor( &txn, result.getValue() ).size() + MmapV1RecordHeader::HeaderSize );
+ {
+ LocAndSize recs[] = {{DiskLoc(0, 1000), 512}, {}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1512), 32}, {}};
+ assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
}
- /**
- * alloc() with quantized size doesn't split if enough room left over.
- */
- TEST(SimpleRecordStoreV1, AllocUseQuantizedDeletedRecordWithoutSplit) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
- SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
-
- {
- LocAndSize drecs[] = {
- {DiskLoc(0, 1000), 512 + 31},
- {}
- };
- initializeV1RS(&txn, NULL, drecs, NULL, &em, md);
- }
+}
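Taken together, the 512 + 31 and 512 + 32 cases pin down the split threshold: a leftover tail is carved off as its own deleted record only when it is large enough to stand alone. A hypothetical restatement of that decision, assuming 32 bytes is the minimum viable remainder:

    // Sketch of the split rule the two tests above exercise; the 32-byte
    // threshold is inferred from the fixtures, not quoted from the engine.
    bool wouldSplit(int holeSizeWithHeaders, int allocSizeWithHeaders) {
        const int kMinLeftover = 32;
        return holeSizeWithHeaders - allocSizeWithHeaders >= kMinLeftover;
    }
    // wouldSplit(512 + 31, 512) == false -> the record keeps all 543 bytes
    // wouldSplit(512 + 32, 512) == true  -> 512-byte record + 32-byte drec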
- BsonDocWriter docWriter(docForRecordSize( 300 ), true);
- StatusWith<RecordId> actualLocation = rs.insertRecord(&txn, &docWriter, false);
- ASSERT_OK( actualLocation.getStatus() );
-
- {
- LocAndSize recs[] = {
- {DiskLoc(0, 1000), 512 + 31},
- {}
- };
- LocAndSize drecs[] = {
- {}
- };
- assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
- }
+/**
+ * alloc() with a non-quantized size doesn't split if not enough room is left over.
+ */
+TEST(SimpleRecordStoreV1, AllocUseNonQuantizedDeletedRecordWithoutSplit) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
+ SimpleRecordStoreV1 rs(&txn, "test.foo", md, &em, false);
+
+ {
+ LocAndSize drecs[] = {{DiskLoc(0, 1000), 331}, {}};
+ initializeV1RS(&txn, NULL, drecs, NULL, &em, md);
}
- /**
- * alloc() with quantized size splits if enough room left over.
- */
- TEST(SimpleRecordStoreV1, AllocUseQuantizedDeletedRecordWithSplit) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
- SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
-
- {
- LocAndSize drecs[] = {
- {DiskLoc(0, 1000), 512 + 32},
- {}
- };
- initializeV1RS(&txn, NULL, drecs, NULL, &em, md);
- }
+ BsonDocWriter docWriter(docForRecordSize(300), false);
+ StatusWith<RecordId> actualLocation = rs.insertRecord(&txn, &docWriter, false);
+ ASSERT_OK(actualLocation.getStatus());
- BsonDocWriter docWriter(docForRecordSize( 300 ), true);
- StatusWith<RecordId> actualLocation = rs.insertRecord(&txn, &docWriter, false);
- ASSERT_OK( actualLocation.getStatus() );
-
- {
- LocAndSize recs[] = {
- {DiskLoc(0, 1000), 512},
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1512), 32},
- {}
- };
- assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
- }
+ {
+ LocAndSize recs[] = {{DiskLoc(0, 1000), 331}, {}};
+ LocAndSize drecs[] = {{}};
+ assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
}
+}
- /**
- * alloc() with non quantized size doesn't split if enough room left over.
- */
- TEST(SimpleRecordStoreV1, AllocUseNonQuantizedDeletedRecordWithoutSplit) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
- SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
-
- {
- LocAndSize drecs[] = {
- {DiskLoc(0, 1000), 331},
- {}
- };
- initializeV1RS(&txn, NULL, drecs, NULL, &em, md);
- }
-
- BsonDocWriter docWriter(docForRecordSize( 300 ), false);
- StatusWith<RecordId> actualLocation = rs.insertRecord(&txn, &docWriter, false);
- ASSERT_OK( actualLocation.getStatus() );
-
- {
- LocAndSize recs[] = {
- {DiskLoc(0, 1000), 331},
- {}
- };
- LocAndSize drecs[] = {
- {}
- };
- assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
- }
+/**
+ * alloc() with a non-quantized size splits if enough room is left over.
+ */
+TEST(SimpleRecordStoreV1, AllocUseNonQuantizedDeletedRecordWithSplit) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
+ SimpleRecordStoreV1 rs(&txn, "test.foo", md, &em, false);
+
+ {
+ LocAndSize drecs[] = {{DiskLoc(0, 1000), 332}, {}};
+ initializeV1RS(&txn, NULL, drecs, NULL, &em, md);
}
- /**
- * alloc() with non quantized size splits if enough room left over.
- */
- TEST(SimpleRecordStoreV1, AllocUseNonQuantizedDeletedRecordWithSplit) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
- SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
-
- {
- LocAndSize drecs[] = {
- {DiskLoc(0, 1000), 332},
- {}
- };
- initializeV1RS(&txn, NULL, drecs, NULL, &em, md);
- }
+ BsonDocWriter docWriter(docForRecordSize(300), false);
+ StatusWith<RecordId> actualLocation = rs.insertRecord(&txn, &docWriter, false);
+ ASSERT_OK(actualLocation.getStatus());
- BsonDocWriter docWriter(docForRecordSize( 300 ), false);
- StatusWith<RecordId> actualLocation = rs.insertRecord(&txn, &docWriter, false);
- ASSERT_OK( actualLocation.getStatus() );
-
- {
- LocAndSize recs[] = {
- {DiskLoc(0, 1000), 300},
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1300), 32},
- {}
- };
- assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
- }
+ {
+ LocAndSize recs[] = {{DiskLoc(0, 1000), 300}, {}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1300), 32}, {}};
+ assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
}
+}
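The non-quantized pair above applies the same inferred 32-byte remainder rule to the exact, 4-byte-aligned request size: a 331-byte hole is consumed whole by a 300-byte record, while a 332-byte hole splits into a 300-byte record plus a 32-byte deleted record.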
- /**
- * alloc() will use from the legacy grab bag if it can.
- */
- TEST(SimpleRecordStoreV1, GrabBagIsUsed) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
- SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
-
- {
- LocAndSize drecs[] = {
- {}
- };
- LocAndSize grabBag[] = {
- {DiskLoc(0, 1000), 4*1024*1024},
- {DiskLoc(1, 1000), 4*1024*1024},
- {}
- };
- initializeV1RS(&txn, NULL, drecs, grabBag, &em, md);
- }
+/**
+ * alloc() will use from the legacy grab bag if it can.
+ */
+TEST(SimpleRecordStoreV1, GrabBagIsUsed) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
+ SimpleRecordStoreV1 rs(&txn, "test.foo", md, &em, false);
+
+ {
+ LocAndSize drecs[] = {{}};
+ LocAndSize grabBag[] = {
+ {DiskLoc(0, 1000), 4 * 1024 * 1024}, {DiskLoc(1, 1000), 4 * 1024 * 1024}, {}};
+ initializeV1RS(&txn, NULL, drecs, grabBag, &em, md);
+ }
- BsonDocWriter docWriter(docForRecordSize( 256 ), false);
- StatusWith<RecordId> actualLocation = rs.insertRecord(&txn, &docWriter, false);
- ASSERT_OK( actualLocation.getStatus() );
-
- {
- LocAndSize recs[] = {
- {DiskLoc(0, 1000), 256},
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1256), 4*1024*1024 - 256},
- {}
- };
- LocAndSize grabBag[] = {
- {DiskLoc(1, 1000), 4*1024*1024},
- {}
- };
- assertStateV1RS(&txn, recs, drecs, grabBag, &em, md);
- }
+ BsonDocWriter docWriter(docForRecordSize(256), false);
+ StatusWith<RecordId> actualLocation = rs.insertRecord(&txn, &docWriter, false);
+ ASSERT_OK(actualLocation.getStatus());
+
+ {
+ LocAndSize recs[] = {{DiskLoc(0, 1000), 256}, {}};
+ LocAndSize drecs[] = {{DiskLoc(0, 1256), 4 * 1024 * 1024 - 256}, {}};
+ LocAndSize grabBag[] = {{DiskLoc(1, 1000), 4 * 1024 * 1024}, {}};
+ assertStateV1RS(&txn, recs, drecs, grabBag, &em, md);
}
+}
- /**
- * alloc() will pull from the legacy grab bag even if it isn't needed.
- */
- TEST(SimpleRecordStoreV1, GrabBagIsPoppedEvenIfUnneeded) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
- SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
-
- {
- LocAndSize drecs[] = {
- {DiskLoc(0, 1000), 1000},
- {}
- };
- LocAndSize grabBag[] = {
- {DiskLoc(1, 1000), 4*1024*1024},
- {DiskLoc(2, 1000), 4*1024*1024},
- {}
- };
- initializeV1RS(&txn, NULL, drecs, grabBag, &em, md);
- }
+/**
+ * alloc() will pull from the legacy grab bag even if it isn't needed.
+ */
+TEST(SimpleRecordStoreV1, GrabBagIsPoppedEvenIfUnneeded) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
+ SimpleRecordStoreV1 rs(&txn, "test.foo", md, &em, false);
+
+ {
+ LocAndSize drecs[] = {{DiskLoc(0, 1000), 1000}, {}};
+ LocAndSize grabBag[] = {
+ {DiskLoc(1, 1000), 4 * 1024 * 1024}, {DiskLoc(2, 1000), 4 * 1024 * 1024}, {}};
+ initializeV1RS(&txn, NULL, drecs, grabBag, &em, md);
+ }
- BsonDocWriter docWriter(docForRecordSize( 1000 ), false);
- StatusWith<RecordId> actualLocation = rs.insertRecord(&txn, &docWriter, false);
- ASSERT_OK( actualLocation.getStatus() );
-
- {
- LocAndSize recs[] = {
- {DiskLoc(0, 1000), 1000},
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(1, 1000), 4*1024*1024},
- {}
- };
- LocAndSize grabBag[] = {
- {DiskLoc(2, 1000), 4*1024*1024},
- {}
- };
- assertStateV1RS(&txn, recs, drecs, grabBag, &em, md);
- }
+ BsonDocWriter docWriter(docForRecordSize(1000), false);
+ StatusWith<RecordId> actualLocation = rs.insertRecord(&txn, &docWriter, false);
+ ASSERT_OK(actualLocation.getStatus());
+
+ {
+ LocAndSize recs[] = {{DiskLoc(0, 1000), 1000}, {}};
+ LocAndSize drecs[] = {{DiskLoc(1, 1000), 4 * 1024 * 1024}, {}};
+ LocAndSize grabBag[] = {{DiskLoc(2, 1000), 4 * 1024 * 1024}, {}};
+ assertStateV1RS(&txn, recs, drecs, grabBag, &em, md);
}
+}
- /**
- * alloc() will pull from the legacy grab bag even if it can't be used
- */
- TEST(SimpleRecordStoreV1, GrabBagIsPoppedEvenIfUnusable) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
- SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
-
- {
- LocAndSize drecs[] = {
- {DiskLoc(0, 1000), 8*1024*1024},
- {}
- };
- LocAndSize grabBag[] = {
- {DiskLoc(1, 1000), 4*1024*1024},
- {DiskLoc(2, 1000), 4*1024*1024},
- {}
- };
- initializeV1RS(&txn, NULL, drecs, grabBag, &em, md);
- }
+/**
+ * alloc() will pull from the legacy grab bag even if it can't be used
+ */
+TEST(SimpleRecordStoreV1, GrabBagIsPoppedEvenIfUnusable) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
+ SimpleRecordStoreV1 rs(&txn, "test.foo", md, &em, false);
+
+ {
+ LocAndSize drecs[] = {{DiskLoc(0, 1000), 8 * 1024 * 1024}, {}};
+ LocAndSize grabBag[] = {
+ {DiskLoc(1, 1000), 4 * 1024 * 1024}, {DiskLoc(2, 1000), 4 * 1024 * 1024}, {}};
+ initializeV1RS(&txn, NULL, drecs, grabBag, &em, md);
+ }
- BsonDocWriter docWriter(docForRecordSize( 8*1024*1024 ), false);
- StatusWith<RecordId> actualLocation = rs.insertRecord(&txn, &docWriter, false);
- ASSERT_OK( actualLocation.getStatus() );
-
- {
- LocAndSize recs[] = {
- {DiskLoc(0, 1000), 8*1024*1024},
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(1, 1000), 4*1024*1024},
- {}
- };
- LocAndSize grabBag[] = {
- {DiskLoc(2, 1000), 4*1024*1024},
- {}
- };
- assertStateV1RS(&txn, recs, drecs, grabBag, &em, md);
- }
+ BsonDocWriter docWriter(docForRecordSize(8 * 1024 * 1024), false);
+ StatusWith<RecordId> actualLocation = rs.insertRecord(&txn, &docWriter, false);
+ ASSERT_OK(actualLocation.getStatus());
+
+ {
+ LocAndSize recs[] = {{DiskLoc(0, 1000), 8 * 1024 * 1024}, {}};
+ LocAndSize drecs[] = {{DiskLoc(1, 1000), 4 * 1024 * 1024}, {}};
+ LocAndSize grabBag[] = {{DiskLoc(2, 1000), 4 * 1024 * 1024}, {}};
+ assertStateV1RS(&txn, recs, drecs, grabBag, &em, md);
}
+}
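These three tests characterize the legacy grab bag as a strict one-in, one-out queue: each allocation pops exactly one entry into the regular deleted-record list, whether or not that entry can serve the request. A minimal sketch of that migration policy (the container and names are illustrative stand-ins, not the engine's types):

    #include <deque>

    struct FreeSpace {
        std::deque<int> grabBag;   // legacy, unsorted deleted extents
        std::deque<int> freeList;  // stand-in for the bucketed deleted lists
    };

    // Called once per allocation attempt: migrate the head of the grab bag
    // into the normal free list regardless of whether it fits the request.
    void popGrabBag(FreeSpace& fs) {
        if (!fs.grabBag.empty()) {
            fs.freeList.push_back(fs.grabBag.front());
            fs.grabBag.pop_front();
        }
    }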
+
+// -----------------
+
+TEST(SimpleRecordStoreV1, FullSimple1) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
+ SimpleRecordStoreV1 rs(&txn, "test.foo", md, &em, false);
- // -----------------
-
- TEST( SimpleRecordStoreV1, FullSimple1 ) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
- SimpleRecordStoreV1 rs( &txn,
- "test.foo",
- md,
- &em,
- false );
-
-
- ASSERT_EQUALS( 0, md->numRecords() );
- StatusWith<RecordId> result = rs.insertRecord( &txn, "abc", 4, 1000 );
- ASSERT_TRUE( result.isOK() );
- ASSERT_EQUALS( 1, md->numRecords() );
- RecordData recordData = rs.dataFor( &txn, result.getValue() );
- ASSERT_EQUALS( string("abc"), string(recordData.data()) );
+
+ ASSERT_EQUALS(0, md->numRecords());
+ StatusWith<RecordId> result = rs.insertRecord(&txn, "abc", 4, 1000);
+ ASSERT_TRUE(result.isOK());
+ ASSERT_EQUALS(1, md->numRecords());
+ RecordData recordData = rs.dataFor(&txn, result.getValue());
+ ASSERT_EQUALS(string("abc"), string(recordData.data()));
+}
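Note that FullSimple1 passes a length of 4 for the 3-character literal, so the trailing NUL is stored with the record; that is what makes the string(recordData.data()) comparison at the end well defined.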
+
+// -----------------
+
+TEST(SimpleRecordStoreV1, Truncate) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
+ SimpleRecordStoreV1 rs(&txn, "test.foo", md, &em, false);
+
+ {
+ LocAndSize recs[] = {{DiskLoc(0, 1000), 100},
+ {DiskLoc(0, 1100), 100},
+ {DiskLoc(0, 1300), 100},
+ {DiskLoc(2, 1100), 100},
+ {}};
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1200), 100}, {DiskLoc(2, 1000), 100}, {DiskLoc(1, 1000), 1000}, {}};
+
+ initializeV1RS(&txn, recs, drecs, NULL, &em, md);
+
+ ASSERT_EQUALS(em.getExtent(DiskLoc(0, 0))->length, em.minSize());
}
- // -----------------
-
- TEST( SimpleRecordStoreV1, Truncate ) {
- OperationContextNoop txn;
- DummyExtentManager em;
- DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
- SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
-
- {
- LocAndSize recs[] = {
- {DiskLoc(0, 1000), 100},
- {DiskLoc(0, 1100), 100},
- {DiskLoc(0, 1300), 100},
- {DiskLoc(2, 1100), 100},
- {}
- };
- LocAndSize drecs[] = {
- {DiskLoc(0, 1200), 100},
- {DiskLoc(2, 1000), 100},
- {DiskLoc(1, 1000), 1000},
- {}
- };
-
- initializeV1RS(&txn, recs, drecs, NULL, &em, md);
-
- ASSERT_EQUALS(em.getExtent(DiskLoc(0, 0))->length, em.minSize());
- }
+ rs.truncate(&txn);
- rs.truncate(&txn);
-
- {
- LocAndSize recs[] = {
- {}
- };
- LocAndSize drecs[] = {
- // One extent filled with a single deleted record.
- {DiskLoc(0, Extent::HeaderSize()), em.minSize() - Extent::HeaderSize()},
- {}
- };
- assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
- }
+ {
+ LocAndSize recs[] = {{}};
+ LocAndSize drecs[] = {
+ // One extent filled with a single deleted record.
+ {DiskLoc(0, Extent::HeaderSize()), em.minSize() - Extent::HeaderSize()},
+ {}};
+ assertStateV1RS(&txn, recs, drecs, NULL, &em, md);
}
}
+}
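The Truncate test's end state is worth spelling out: after truncate() the record store keeps exactly one extent, tiled by the extent header plus a single deleted record. A sketch of that invariant as a predicate (the actual sizes come from Extent::HeaderSize() and em.minSize(); the function name is hypothetical):

    // The extent header and one deleted record exactly tile the extent.
    bool truncatedStateIsValid(int extentLength, int headerSize,
                               int drecOffset, int drecLength) {
        return drecOffset == headerSize &&
               drecOffset + drecLength == extentLength;
    }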
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.cpp
index 7bfaee1867e..12801124b95 100644
--- a/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.cpp
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.cpp
@@ -47,631 +47,609 @@
namespace mongo {
- using std::numeric_limits;
-
- DummyRecordStoreV1MetaData::DummyRecordStoreV1MetaData( bool capped, int userFlags ) {
- _dataSize = 0;
- _numRecords = 0;
- _capped = capped;
- _userFlags = userFlags;
- _lastExtentSize = 0;
- _paddingFactor = 1;
- _maxCappedDocs = numeric_limits<long long>::max();
- _capFirstNewRecord.setInvalid();
- if ( _capped ) {
- // copied from NamespaceDetails::NamespaceDetails()
- setDeletedListEntry( NULL, 1, DiskLoc().setInvalid() );
- }
- }
-
- const DiskLoc& DummyRecordStoreV1MetaData::capExtent() const {
- return _capExtent;
- }
-
- void DummyRecordStoreV1MetaData::setCapExtent( OperationContext* txn,
- const DiskLoc& loc ) {
- _capExtent = loc;
- }
-
- const DiskLoc& DummyRecordStoreV1MetaData::capFirstNewRecord() const {
- return _capFirstNewRecord;
- }
-
- void DummyRecordStoreV1MetaData::setCapFirstNewRecord( OperationContext* txn,
- const DiskLoc& loc ) {
- _capFirstNewRecord = loc;
- }
-
- long long DummyRecordStoreV1MetaData::dataSize() const {
- return _dataSize;
+using std::numeric_limits;
+
+DummyRecordStoreV1MetaData::DummyRecordStoreV1MetaData(bool capped, int userFlags) {
+ _dataSize = 0;
+ _numRecords = 0;
+ _capped = capped;
+ _userFlags = userFlags;
+ _lastExtentSize = 0;
+ _paddingFactor = 1;
+ _maxCappedDocs = numeric_limits<long long>::max();
+ _capFirstNewRecord.setInvalid();
+ if (_capped) {
+ // copied from NamespaceDetails::NamespaceDetails()
+ setDeletedListEntry(NULL, 1, DiskLoc().setInvalid());
}
+}
- long long DummyRecordStoreV1MetaData::numRecords() const {
- return _numRecords;
- }
+const DiskLoc& DummyRecordStoreV1MetaData::capExtent() const {
+ return _capExtent;
+}
- void DummyRecordStoreV1MetaData::incrementStats( OperationContext* txn,
- long long dataSizeIncrement,
- long long numRecordsIncrement ) {
- _dataSize += dataSizeIncrement;
- _numRecords += numRecordsIncrement;
- }
+void DummyRecordStoreV1MetaData::setCapExtent(OperationContext* txn, const DiskLoc& loc) {
+ _capExtent = loc;
+}
- void DummyRecordStoreV1MetaData::setStats( OperationContext* txn,
- long long dataSize,
- long long numRecords ) {
- _dataSize = dataSize;
- _numRecords = numRecords;
- }
+const DiskLoc& DummyRecordStoreV1MetaData::capFirstNewRecord() const {
+ return _capFirstNewRecord;
+}
- namespace {
- DiskLoc myNull;
- }
+void DummyRecordStoreV1MetaData::setCapFirstNewRecord(OperationContext* txn, const DiskLoc& loc) {
+ _capFirstNewRecord = loc;
+}
- DiskLoc DummyRecordStoreV1MetaData::deletedListEntry( int bucket ) const {
- invariant( bucket >= 0 );
- if ( static_cast<size_t>( bucket ) >= _deletedLists.size() )
- return myNull;
- return _deletedLists[bucket];
- }
+long long DummyRecordStoreV1MetaData::dataSize() const {
+ return _dataSize;
+}
- void DummyRecordStoreV1MetaData::setDeletedListEntry( OperationContext* txn,
- int bucket,
- const DiskLoc& loc ) {
- invariant( bucket >= 0 );
- invariant( bucket < 1000 );
- while ( static_cast<size_t>( bucket ) >= _deletedLists.size() )
- _deletedLists.push_back( DiskLoc() );
- _deletedLists[bucket] = loc;
- }
+long long DummyRecordStoreV1MetaData::numRecords() const {
+ return _numRecords;
+}
- DiskLoc DummyRecordStoreV1MetaData::deletedListLegacyGrabBag() const {
- return _deletedListLegacyGrabBag;
- }
+void DummyRecordStoreV1MetaData::incrementStats(OperationContext* txn,
+ long long dataSizeIncrement,
+ long long numRecordsIncrement) {
+ _dataSize += dataSizeIncrement;
+ _numRecords += numRecordsIncrement;
+}
- void DummyRecordStoreV1MetaData::setDeletedListLegacyGrabBag(OperationContext* txn,
- const DiskLoc& loc) {
- _deletedListLegacyGrabBag = loc;
- }
+void DummyRecordStoreV1MetaData::setStats(OperationContext* txn,
+ long long dataSize,
+ long long numRecords) {
+ _dataSize = dataSize;
+ _numRecords = numRecords;
+}
- void DummyRecordStoreV1MetaData::orphanDeletedList(OperationContext* txn) {
- // They will be recreated on demand.
- _deletedLists.clear();
- }
+namespace {
+DiskLoc myNull;
+}
- const DiskLoc& DummyRecordStoreV1MetaData::firstExtent(OperationContext* txn) const {
- return _firstExtent;
- }
+DiskLoc DummyRecordStoreV1MetaData::deletedListEntry(int bucket) const {
+ invariant(bucket >= 0);
+ if (static_cast<size_t>(bucket) >= _deletedLists.size())
+ return myNull;
+ return _deletedLists[bucket];
+}
- void DummyRecordStoreV1MetaData::setFirstExtent( OperationContext* txn,
- const DiskLoc& loc ) {
- _firstExtent = loc;
- }
+void DummyRecordStoreV1MetaData::setDeletedListEntry(OperationContext* txn,
+ int bucket,
+ const DiskLoc& loc) {
+ invariant(bucket >= 0);
+ invariant(bucket < 1000);
+ while (static_cast<size_t>(bucket) >= _deletedLists.size())
+ _deletedLists.push_back(DiskLoc());
+ _deletedLists[bucket] = loc;
+}
- const DiskLoc& DummyRecordStoreV1MetaData::lastExtent(OperationContext* txn) const {
- return _lastExtent;
- }
+DiskLoc DummyRecordStoreV1MetaData::deletedListLegacyGrabBag() const {
+ return _deletedListLegacyGrabBag;
+}
- void DummyRecordStoreV1MetaData::setLastExtent( OperationContext* txn,
- const DiskLoc& loc ) {
- _lastExtent = loc;
- }
+void DummyRecordStoreV1MetaData::setDeletedListLegacyGrabBag(OperationContext* txn,
+ const DiskLoc& loc) {
+ _deletedListLegacyGrabBag = loc;
+}
- bool DummyRecordStoreV1MetaData::isCapped() const {
- return _capped;
- }
+void DummyRecordStoreV1MetaData::orphanDeletedList(OperationContext* txn) {
+ // They will be recreated on demand.
+ _deletedLists.clear();
+}
- bool DummyRecordStoreV1MetaData::isUserFlagSet( int flag ) const {
- return _userFlags & flag;
- }
+const DiskLoc& DummyRecordStoreV1MetaData::firstExtent(OperationContext* txn) const {
+ return _firstExtent;
+}
- bool DummyRecordStoreV1MetaData::setUserFlag( OperationContext* txn, int flag ) {
- if ( ( _userFlags & flag ) == flag )
- return false;
+void DummyRecordStoreV1MetaData::setFirstExtent(OperationContext* txn, const DiskLoc& loc) {
+ _firstExtent = loc;
+}
- _userFlags |= flag;
- return true;
+const DiskLoc& DummyRecordStoreV1MetaData::lastExtent(OperationContext* txn) const {
+ return _lastExtent;
+}
- }
- bool DummyRecordStoreV1MetaData::clearUserFlag( OperationContext* txn, int flag ) {
- if ( ( _userFlags & flag ) == 0 )
- return false;
+void DummyRecordStoreV1MetaData::setLastExtent(OperationContext* txn, const DiskLoc& loc) {
+ _lastExtent = loc;
+}
- _userFlags &= ~flag;
- return true;
+bool DummyRecordStoreV1MetaData::isCapped() const {
+ return _capped;
+}
- }
- bool DummyRecordStoreV1MetaData::replaceUserFlags( OperationContext* txn, int flags ) {
- if ( _userFlags == flags )
- return false;
- _userFlags = flags;
- return true;
- }
+bool DummyRecordStoreV1MetaData::isUserFlagSet(int flag) const {
+ return _userFlags & flag;
+}
+bool DummyRecordStoreV1MetaData::setUserFlag(OperationContext* txn, int flag) {
+ if ((_userFlags & flag) == flag)
+ return false;
- int DummyRecordStoreV1MetaData::lastExtentSize(OperationContext* txn) const {
- return _lastExtentSize;
- }
+ _userFlags |= flag;
+ return true;
+}
+bool DummyRecordStoreV1MetaData::clearUserFlag(OperationContext* txn, int flag) {
+ if ((_userFlags & flag) == 0)
+ return false;
- void DummyRecordStoreV1MetaData::setLastExtentSize( OperationContext* txn, int newMax ) {
- _lastExtentSize = newMax;
- }
+ _userFlags &= ~flag;
+ return true;
+}
+bool DummyRecordStoreV1MetaData::replaceUserFlags(OperationContext* txn, int flags) {
+ if (_userFlags == flags)
+ return false;
+ _userFlags = flags;
+ return true;
+}
- long long DummyRecordStoreV1MetaData::maxCappedDocs() const {
- return _maxCappedDocs;
- }
- // -----------------------------------------
+int DummyRecordStoreV1MetaData::lastExtentSize(OperationContext* txn) const {
+ return _lastExtentSize;
+}
- DummyExtentManager::~DummyExtentManager() {
- for ( size_t i = 0; i < _extents.size(); i++ ) {
- if ( _extents[i].data )
- free( _extents[i].data );
- }
- }
+void DummyRecordStoreV1MetaData::setLastExtentSize(OperationContext* txn, int newMax) {
+ _lastExtentSize = newMax;
+}
- Status DummyExtentManager::init(OperationContext* txn) {
- return Status::OK();
- }
+long long DummyRecordStoreV1MetaData::maxCappedDocs() const {
+ return _maxCappedDocs;
+}
- int DummyExtentManager::numFiles() const {
- return static_cast<int>( _extents.size() );
- }
+// -----------------------------------------
- long long DummyExtentManager::fileSize() const {
- invariant( false );
- return -1;
+DummyExtentManager::~DummyExtentManager() {
+ for (size_t i = 0; i < _extents.size(); i++) {
+ if (_extents[i].data)
+ free(_extents[i].data);
}
+}
- DiskLoc DummyExtentManager::allocateExtent( OperationContext* txn,
- bool capped,
- int size,
- bool enforceQuota ) {
- size = quantizeExtentSize( size );
+Status DummyExtentManager::init(OperationContext* txn) {
+ return Status::OK();
+}
- ExtentInfo info;
- info.data = static_cast<char*>( mongoMalloc( size ) );
- info.length = size;
+int DummyExtentManager::numFiles() const {
+ return static_cast<int>(_extents.size());
+}
- DiskLoc loc( _extents.size(), 0 );
- _extents.push_back( info );
+long long DummyExtentManager::fileSize() const {
+ invariant(false);
+ return -1;
+}
- Extent* e = getExtent( loc, false );
- e->magic = Extent::extentSignature;
- e->myLoc = loc;
- e->xnext.Null();
- e->xprev.Null();
- e->length = size;
- e->firstRecord.Null();
- e->lastRecord.Null();
+DiskLoc DummyExtentManager::allocateExtent(OperationContext* txn,
+ bool capped,
+ int size,
+ bool enforceQuota) {
+ size = quantizeExtentSize(size);
+
+ ExtentInfo info;
+ info.data = static_cast<char*>(mongoMalloc(size));
+ info.length = size;
+
+ DiskLoc loc(_extents.size(), 0);
+ _extents.push_back(info);
+
+ Extent* e = getExtent(loc, false);
+ e->magic = Extent::extentSignature;
+ e->myLoc = loc;
+ e->xnext.Null();
+ e->xprev.Null();
+ e->length = size;
+ e->firstRecord.Null();
+ e->lastRecord.Null();
+
+ return loc;
+}
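Note the dummy manager's central simplification: every allocateExtent() call appends a fresh malloc'd buffer, so each extent gets its own file number and DiskLoc(a, ofs) addresses byte ofs of extent a. The initializeV1RS() invariants later in this file (loc.a() == i, loc.getOfs() == 0) depend on exactly this behavior.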
- return loc;
+void DummyExtentManager::freeExtents(OperationContext* txn, DiskLoc firstExt, DiskLoc lastExt) {
+ // Intentional no-op; extents are only freed in the destructor.
+}
- }
+void DummyExtentManager::freeExtent(OperationContext* txn, DiskLoc extent) {
+ // Intentional no-op; see freeExtents() above.
+}
+void DummyExtentManager::freeListStats(OperationContext* txn,
+ int* numExtents,
+ int64_t* totalFreeSizeBytes) const {
+ invariant(false);
+}
- void DummyExtentManager::freeExtents( OperationContext* txn,
- DiskLoc firstExt, DiskLoc lastExt ) {
- // XXX
- }
+std::unique_ptr<RecordFetcher> DummyExtentManager::recordNeedsFetch(const DiskLoc& loc) const {
+ return {};
+}
- void DummyExtentManager::freeExtent( OperationContext* txn, DiskLoc extent ) {
- // XXX
- }
- void DummyExtentManager::freeListStats(OperationContext* txn,
- int* numExtents,
- int64_t* totalFreeSizeBytes) const {
- invariant(false);
- }
+MmapV1RecordHeader* DummyExtentManager::recordForV1(const DiskLoc& loc) const {
+ if (static_cast<size_t>(loc.a()) >= _extents.size())
+ return NULL;
+ if (static_cast<size_t>(loc.getOfs()) >= _extents[loc.a()].length)
+ return NULL;
+ char* root = _extents[loc.a()].data;
+ return reinterpret_cast<MmapV1RecordHeader*>(root + loc.getOfs());
+}
- std::unique_ptr<RecordFetcher> DummyExtentManager::recordNeedsFetch(const DiskLoc& loc) const {
- return {};
- }
+Extent* DummyExtentManager::extentForV1(const DiskLoc& loc) const {
+ invariant(false);
+}
- MmapV1RecordHeader* DummyExtentManager::recordForV1( const DiskLoc& loc ) const {
- if ( static_cast<size_t>( loc.a() ) >= _extents.size() )
- return NULL;
- if ( static_cast<size_t>( loc.getOfs() ) >= _extents[loc.a()].length )
- return NULL;
- char* root = _extents[loc.a()].data;
- return reinterpret_cast<MmapV1RecordHeader*>( root + loc.getOfs() );
- }
+DiskLoc DummyExtentManager::extentLocForV1(const DiskLoc& loc) const {
+ return DiskLoc(loc.a(), 0);
+}
- Extent* DummyExtentManager::extentForV1( const DiskLoc& loc ) const {
- invariant( false );
- }
+Extent* DummyExtentManager::getExtent(const DiskLoc& loc, bool doSanityCheck) const {
+ invariant(!loc.isNull());
+ invariant(static_cast<size_t>(loc.a()) < _extents.size());
+ invariant(loc.getOfs() == 0);
+ Extent* ext = reinterpret_cast<Extent*>(_extents[loc.a()].data);
+ if (doSanityCheck)
+ ext->assertOk();
+ return ext;
+}
- DiskLoc DummyExtentManager::extentLocForV1( const DiskLoc& loc ) const {
- return DiskLoc( loc.a(), 0 );
- }
+int DummyExtentManager::maxSize() const {
+ return 1024 * 1024 * 64;
+}
- Extent* DummyExtentManager::getExtent( const DiskLoc& loc, bool doSanityCheck ) const {
- invariant( !loc.isNull() );
- invariant( static_cast<size_t>( loc.a() ) < _extents.size() );
- invariant( loc.getOfs() == 0 );
- Extent* ext = reinterpret_cast<Extent*>( _extents[loc.a()].data );
- if (doSanityCheck)
- ext->assertOk();
- return ext;
- }
+DummyExtentManager::CacheHint* DummyExtentManager::cacheHint(const DiskLoc& extentLoc,
+ const HintType& hint) {
+ return new CacheHint();
+}
- int DummyExtentManager::maxSize() const {
- return 1024 * 1024 * 64;
- }
+namespace {
+void accumulateExtentSizeRequirements(const LocAndSize* las, std::map<int, size_t>* sizes) {
+ if (!las)
+ return;
- DummyExtentManager::CacheHint* DummyExtentManager::cacheHint( const DiskLoc& extentLoc, const HintType& hint ) {
- return new CacheHint();
- }
+ while (!las->loc.isNull()) {
+ // We require passed-in offsets to be at least 1000 to leave room for Extent headers.
+ invariant(Extent::HeaderSize() < 1000);
+ invariant(las->loc.getOfs() >= 1000);
-namespace {
- void accumulateExtentSizeRequirements(const LocAndSize* las, std::map<int, size_t>* sizes) {
- if (!las)
- return;
-
- while (!las->loc.isNull()) {
- // We require passed in offsets to be > 1000 to leave room for Extent headers.
- invariant(Extent::HeaderSize() < 1000);
- invariant(las->loc.getOfs() >= 1000);
-
- const size_t end = las->loc.getOfs() + las->size;
- size_t& sizeNeeded = (*sizes)[las->loc.a()];
- sizeNeeded = std::max(sizeNeeded, end);
- las++;
- }
+ const size_t end = las->loc.getOfs() + las->size;
+ size_t& sizeNeeded = (*sizes)[las->loc.a()];
+ sizeNeeded = std::max(sizeNeeded, end);
+ las++;
}
+}
- void printRecList(OperationContext* txn,
- const ExtentManager* em,
- const RecordStoreV1MetaData* md) {
- log() << " *** BEGIN ACTUAL RECORD LIST *** ";
- DiskLoc extLoc = md->firstExtent(txn);
- std::set<DiskLoc> seenLocs;
- while (!extLoc.isNull()) {
- Extent* ext = em->getExtent(extLoc, true);
- DiskLoc actualLoc = ext->firstRecord;
- while (!actualLoc.isNull()) {
- const MmapV1RecordHeader* actualRec = em->recordForV1(actualLoc);
- const int actualSize = actualRec->lengthWithHeaders();
-
- log() << "loc: " << actualLoc // <--hex
- << " (" << actualLoc.getOfs() << ")"
- << " size: " << actualSize
- << " prev: " << actualRec->prevOfs()
- << " next: " << actualRec->nextOfs()
- << (actualLoc == md->capFirstNewRecord() ? " (CAP_FIRST_NEW)" : "")
- ;
-
- const bool foundCycle = !seenLocs.insert(actualLoc).second;
- invariant(!foundCycle);
-
- const int nextOfs = actualRec->nextOfs();
- actualLoc = (nextOfs == DiskLoc::NullOfs ? DiskLoc()
- : DiskLoc(actualLoc.a(), nextOfs));
- }
- extLoc = ext->xnext;
+void printRecList(OperationContext* txn, const ExtentManager* em, const RecordStoreV1MetaData* md) {
+ log() << " *** BEGIN ACTUAL RECORD LIST *** ";
+ DiskLoc extLoc = md->firstExtent(txn);
+ std::set<DiskLoc> seenLocs;
+ while (!extLoc.isNull()) {
+ Extent* ext = em->getExtent(extLoc, true);
+ DiskLoc actualLoc = ext->firstRecord;
+ while (!actualLoc.isNull()) {
+ const MmapV1RecordHeader* actualRec = em->recordForV1(actualLoc);
+ const int actualSize = actualRec->lengthWithHeaders();
+
+ log() << "loc: " << actualLoc // <--hex
+ << " (" << actualLoc.getOfs() << ")"
+ << " size: " << actualSize << " prev: " << actualRec->prevOfs()
+ << " next: " << actualRec->nextOfs()
+ << (actualLoc == md->capFirstNewRecord() ? " (CAP_FIRST_NEW)" : "");
+
+ const bool foundCycle = !seenLocs.insert(actualLoc).second;
+ invariant(!foundCycle);
+
+ const int nextOfs = actualRec->nextOfs();
+ actualLoc = (nextOfs == DiskLoc::NullOfs ? DiskLoc() : DiskLoc(actualLoc.a(), nextOfs));
}
- log() << " *** END ACTUAL RECORD LIST *** ";
+ extLoc = ext->xnext;
}
+ log() << " *** END ACTUAL RECORD LIST *** ";
+}
- void printDRecList(const ExtentManager* em, const RecordStoreV1MetaData* md) {
- log() << " *** BEGIN ACTUAL DELETED RECORD LIST *** ";
- std::set<DiskLoc> seenLocs;
- for (int bucketIdx = 0; bucketIdx < RecordStoreV1Base::Buckets; bucketIdx++) {
- DiskLoc actualLoc = md->deletedListEntry(bucketIdx);
- while (!actualLoc.isNull()) {
- const DeletedRecord* actualDrec = &em->recordForV1(actualLoc)->asDeleted();
- const int actualSize = actualDrec->lengthWithHeaders();
+void printDRecList(const ExtentManager* em, const RecordStoreV1MetaData* md) {
+ log() << " *** BEGIN ACTUAL DELETED RECORD LIST *** ";
+ std::set<DiskLoc> seenLocs;
+ for (int bucketIdx = 0; bucketIdx < RecordStoreV1Base::Buckets; bucketIdx++) {
+ DiskLoc actualLoc = md->deletedListEntry(bucketIdx);
+ while (!actualLoc.isNull()) {
+ const DeletedRecord* actualDrec = &em->recordForV1(actualLoc)->asDeleted();
+ const int actualSize = actualDrec->lengthWithHeaders();
- log() << "loc: " << actualLoc // <--hex
- << " (" << actualLoc.getOfs() << ")"
- << " size: " << actualSize
- << " bucket: " << bucketIdx
- << " next: " << actualDrec->nextDeleted();
+ log() << "loc: " << actualLoc // <--hex
+ << " (" << actualLoc.getOfs() << ")"
+ << " size: " << actualSize << " bucket: " << bucketIdx
+ << " next: " << actualDrec->nextDeleted();
- const bool foundCycle = !seenLocs.insert(actualLoc).second;
- invariant(!foundCycle);
+ const bool foundCycle = !seenLocs.insert(actualLoc).second;
+ invariant(!foundCycle);
- actualLoc = actualDrec->nextDeleted();
- }
-
- // Only print bucket 0 in capped collections since it contains all deleted records
- if (md->isCapped())
- break;
+ actualLoc = actualDrec->nextDeleted();
}
- log() << " *** END ACTUAL DELETED RECORD LIST *** ";
+
+ // Only print bucket 0 in capped collections since it contains all deleted records
+ if (md->isCapped())
+ break;
}
+ log() << " *** END ACTUAL DELETED RECORD LIST *** ";
+}
}
- void initializeV1RS(OperationContext* txn,
- const LocAndSize* records,
- const LocAndSize* drecs,
- const LocAndSize* legacyGrabBag,
- DummyExtentManager* em,
- DummyRecordStoreV1MetaData* md) {
- invariant(records || drecs); // if both are NULL nothing is being created...
-
- // Need to start with a blank slate
- invariant(em->numFiles() == 0);
- invariant(md->firstExtent(txn).isNull());
-
- // pre-allocate extents (even extents that aren't part of this RS)
- {
- typedef std::map<int, size_t> ExtentSizes;
- ExtentSizes extentSizes;
- accumulateExtentSizeRequirements(records, &extentSizes);
- accumulateExtentSizeRequirements(drecs, &extentSizes);
- accumulateExtentSizeRequirements(legacyGrabBag, &extentSizes);
- invariant(!extentSizes.empty());
-
- const int maxExtent = extentSizes.rbegin()->first;
- for (int i = 0; i <= maxExtent; i++) {
- const size_t size = extentSizes.count(i) ? extentSizes[i] : 0;
- const DiskLoc loc = em->allocateExtent(txn, md->isCapped(), size, 0);
-
- // This function and assertState depend on these details of DummyExtentManager
- invariant(loc.a() == i);
- invariant(loc.getOfs() == 0);
- }
-
- // link together extents that should be part of this RS
- md->setFirstExtent(txn, DiskLoc(extentSizes.begin()->first, 0));
- md->setLastExtent(txn, DiskLoc(extentSizes.rbegin()->first, 0));
- for (ExtentSizes::iterator it = extentSizes.begin();
- boost::next(it) != extentSizes.end(); /* ++it */ ) {
- const int a = it->first;
- ++it;
- const int b = it->first;
- em->getExtent(DiskLoc(a, 0))->xnext = DiskLoc(b, 0);
- em->getExtent(DiskLoc(b, 0))->xprev = DiskLoc(a, 0);
- }
+void initializeV1RS(OperationContext* txn,
+ const LocAndSize* records,
+ const LocAndSize* drecs,
+ const LocAndSize* legacyGrabBag,
+ DummyExtentManager* em,
+ DummyRecordStoreV1MetaData* md) {
+ invariant(records || drecs); // if both are NULL nothing is being created...
+
+ // Need to start with a blank slate
+ invariant(em->numFiles() == 0);
+ invariant(md->firstExtent(txn).isNull());
+
+ // pre-allocate extents (even extents that aren't part of this RS)
+ {
+ typedef std::map<int, size_t> ExtentSizes;
+ ExtentSizes extentSizes;
+ accumulateExtentSizeRequirements(records, &extentSizes);
+ accumulateExtentSizeRequirements(drecs, &extentSizes);
+ accumulateExtentSizeRequirements(legacyGrabBag, &extentSizes);
+ invariant(!extentSizes.empty());
+
+ const int maxExtent = extentSizes.rbegin()->first;
+ for (int i = 0; i <= maxExtent; i++) {
+ const size_t size = extentSizes.count(i) ? extentSizes[i] : 0;
+ const DiskLoc loc = em->allocateExtent(txn, md->isCapped(), size, 0);
+
+ // This function and assertState depend on these details of DummyExtentManager
+ invariant(loc.a() == i);
+ invariant(loc.getOfs() == 0);
+ }
- // This signals "done allocating new extents".
- if (md->isCapped())
- md->setDeletedListEntry(txn, 1, DiskLoc());
+ // link together extents that should be part of this RS
+ md->setFirstExtent(txn, DiskLoc(extentSizes.begin()->first, 0));
+ md->setLastExtent(txn, DiskLoc(extentSizes.rbegin()->first, 0));
+ for (ExtentSizes::iterator it = extentSizes.begin(); boost::next(it) != extentSizes.end();
+ /* ++it */) {
+ const int a = it->first;
+ ++it;
+ const int b = it->first;
+ em->getExtent(DiskLoc(a, 0))->xnext = DiskLoc(b, 0);
+ em->getExtent(DiskLoc(b, 0))->xprev = DiskLoc(a, 0);
}
- if (records && !records[0].loc.isNull()) {
- int recIdx = 0;
- DiskLoc extLoc = md->firstExtent(txn);
- while (!extLoc.isNull()) {
- Extent* ext = em->getExtent(extLoc);
- int prevOfs = DiskLoc::NullOfs;
- while (extLoc.a() == records[recIdx].loc.a()) { // for all records in this extent
- const DiskLoc loc = records[recIdx].loc;
- const int size = records[recIdx].size;;
- invariant(size >= MmapV1RecordHeader::HeaderSize);
+ // This signals "done allocating new extents".
+ if (md->isCapped())
+ md->setDeletedListEntry(txn, 1, DiskLoc());
+ }
- md->incrementStats(txn, size - MmapV1RecordHeader::HeaderSize, 1);
+ if (records && !records[0].loc.isNull()) {
+ int recIdx = 0;
+ DiskLoc extLoc = md->firstExtent(txn);
+ while (!extLoc.isNull()) {
+ Extent* ext = em->getExtent(extLoc);
+ int prevOfs = DiskLoc::NullOfs;
+ while (extLoc.a() == records[recIdx].loc.a()) { // for all records in this extent
+ const DiskLoc loc = records[recIdx].loc;
+ const int size = records[recIdx].size;
+ invariant(size >= MmapV1RecordHeader::HeaderSize);
- if (ext->firstRecord.isNull())
- ext->firstRecord = loc;
+ md->incrementStats(txn, size - MmapV1RecordHeader::HeaderSize, 1);
- MmapV1RecordHeader* rec = em->recordForV1(loc);
- rec->lengthWithHeaders() = size;
- rec->extentOfs() = 0;
+ if (ext->firstRecord.isNull())
+ ext->firstRecord = loc;
- rec->prevOfs() = prevOfs;
- prevOfs = loc.getOfs();
+ MmapV1RecordHeader* rec = em->recordForV1(loc);
+ rec->lengthWithHeaders() = size;
+ rec->extentOfs() = 0;
- const DiskLoc nextLoc = records[recIdx + 1].loc;
- if (nextLoc.a() == loc.a()) { // if next is in same extent
- rec->nextOfs() = nextLoc.getOfs();
- }
- else {
- rec->nextOfs() = DiskLoc::NullOfs;
- ext->lastRecord = loc;
- }
+ rec->prevOfs() = prevOfs;
+ prevOfs = loc.getOfs();
- recIdx++;
+ const DiskLoc nextLoc = records[recIdx + 1].loc;
+ if (nextLoc.a() == loc.a()) { // if next is in same extent
+ rec->nextOfs() = nextLoc.getOfs();
+ } else {
+ rec->nextOfs() = DiskLoc::NullOfs;
+ ext->lastRecord = loc;
}
- extLoc = ext->xnext;
+
+ recIdx++;
}
- invariant(records[recIdx].loc.isNull());
+ extLoc = ext->xnext;
}
-
- if (drecs && !drecs[0].loc.isNull()) {
- int drecIdx = 0;
- DiskLoc* prevNextPtr = NULL;
- int lastBucket = -1;
- while (!drecs[drecIdx].loc.isNull()) {
- const DiskLoc loc = drecs[drecIdx].loc;
- const int size = drecs[drecIdx].size;
- invariant(size >= MmapV1RecordHeader::HeaderSize);
- const int bucket = RecordStoreV1Base::bucket(size);
-
- if (md->isCapped()) {
- // All drecs form a single list in bucket 0
- if (prevNextPtr == NULL) {
- md->setDeletedListEntry(txn, 0, loc);
- }
- else {
- *prevNextPtr = loc;
- }
-
- if (loc.a() < md->capExtent().a()
- && drecs[drecIdx + 1].loc.a() == md->capExtent().a()) {
- // Bucket 1 is known as cappedLastDelRecLastExtent
- md->setDeletedListEntry(txn, 1, loc);
- }
- }
- else if (bucket != lastBucket) {
- invariant(bucket > lastBucket); // if this fails, drecs weren't sorted by bucket
- md->setDeletedListEntry(txn, bucket, loc);
- lastBucket = bucket;
- }
- else {
+ invariant(records[recIdx].loc.isNull());
+ }
+
+ if (drecs && !drecs[0].loc.isNull()) {
+ int drecIdx = 0;
+ DiskLoc* prevNextPtr = NULL;
+ int lastBucket = -1;
+ while (!drecs[drecIdx].loc.isNull()) {
+ const DiskLoc loc = drecs[drecIdx].loc;
+ const int size = drecs[drecIdx].size;
+ invariant(size >= MmapV1RecordHeader::HeaderSize);
+ const int bucket = RecordStoreV1Base::bucket(size);
+
+ if (md->isCapped()) {
+ // All drecs form a single list in bucket 0
+ if (prevNextPtr == NULL) {
+ md->setDeletedListEntry(txn, 0, loc);
+ } else {
*prevNextPtr = loc;
}
- DeletedRecord* drec = &em->recordForV1(loc)->asDeleted();
- drec->lengthWithHeaders() = size;
- drec->extentOfs() = 0;
- drec->nextDeleted() = DiskLoc();
- prevNextPtr = &drec->nextDeleted();
-
- drecIdx++;
+ if (loc.a() < md->capExtent().a() &&
+ drecs[drecIdx + 1].loc.a() == md->capExtent().a()) {
+ // Bucket 1 is known as cappedLastDelRecLastExtent
+ md->setDeletedListEntry(txn, 1, loc);
+ }
+ } else if (bucket != lastBucket) {
+ invariant(bucket > lastBucket); // if this fails, drecs weren't sorted by bucket
+ md->setDeletedListEntry(txn, bucket, loc);
+ lastBucket = bucket;
+ } else {
+ *prevNextPtr = loc;
}
- }
- if (legacyGrabBag && !legacyGrabBag[0].loc.isNull()) {
- invariant(!md->isCapped()); // capped should have an empty legacy grab bag.
+ DeletedRecord* drec = &em->recordForV1(loc)->asDeleted();
+ drec->lengthWithHeaders() = size;
+ drec->extentOfs() = 0;
+ drec->nextDeleted() = DiskLoc();
+ prevNextPtr = &drec->nextDeleted();
- int grabBagIdx = 0;
- DiskLoc* prevNextPtr = NULL;
- while (!legacyGrabBag[grabBagIdx].loc.isNull()) {
- const DiskLoc loc = legacyGrabBag[grabBagIdx].loc;
- const int size = legacyGrabBag[grabBagIdx].size;
- invariant(size >= MmapV1RecordHeader::HeaderSize);
+ drecIdx++;
+ }
+ }
- if (grabBagIdx == 0) {
- md->setDeletedListLegacyGrabBag(txn, loc);
- }
- else {
- *prevNextPtr = loc;
- }
+ if (legacyGrabBag && !legacyGrabBag[0].loc.isNull()) {
+ invariant(!md->isCapped()); // capped should have an empty legacy grab bag.
- DeletedRecord* drec = &em->recordForV1(loc)->asDeleted();
- drec->lengthWithHeaders() = size;
- drec->extentOfs() = 0;
- drec->nextDeleted() = DiskLoc();
- prevNextPtr = &drec->nextDeleted();
+ int grabBagIdx = 0;
+ DiskLoc* prevNextPtr = NULL;
+ while (!legacyGrabBag[grabBagIdx].loc.isNull()) {
+ const DiskLoc loc = legacyGrabBag[grabBagIdx].loc;
+ const int size = legacyGrabBag[grabBagIdx].size;
+ invariant(size >= MmapV1RecordHeader::HeaderSize);
- grabBagIdx++;
+ if (grabBagIdx == 0) {
+ md->setDeletedListLegacyGrabBag(txn, loc);
+ } else {
+ *prevNextPtr = loc;
}
- }
- // Make sure we set everything up as requested.
- assertStateV1RS(txn, records, drecs, legacyGrabBag, em, md);
+ DeletedRecord* drec = &em->recordForV1(loc)->asDeleted();
+ drec->lengthWithHeaders() = size;
+ drec->extentOfs() = 0;
+ drec->nextDeleted() = DiskLoc();
+ prevNextPtr = &drec->nextDeleted();
+
+ grabBagIdx++;
+ }
}
- void assertStateV1RS(OperationContext* txn,
- const LocAndSize* records,
- const LocAndSize* drecs,
- const LocAndSize* legacyGrabBag,
- const ExtentManager* em,
- const DummyRecordStoreV1MetaData* md) {
- invariant(records || drecs); // if both are NULL nothing is being asserted...
-
- try {
- if (records) {
- long long dataSize = 0;
- long long numRecs = 0;
-
- int recIdx = 0;
-
- DiskLoc extLoc = md->firstExtent(txn);
- while (!extLoc.isNull()) { // for each Extent
- Extent* ext = em->getExtent(extLoc, true);
- int expectedPrevOfs = DiskLoc::NullOfs;
- DiskLoc actualLoc = ext->firstRecord;
- while (!actualLoc.isNull()) { // for each MmapV1RecordHeader in this Extent
- const MmapV1RecordHeader* actualRec = em->recordForV1(actualLoc);
- const int actualSize = actualRec->lengthWithHeaders();
-
- dataSize += actualSize - MmapV1RecordHeader::HeaderSize;
- numRecs += 1;
-
- ASSERT_EQUALS(actualLoc, records[recIdx].loc);
- ASSERT_EQUALS(actualSize, records[recIdx].size);
-
- ASSERT_EQUALS(actualRec->extentOfs(), extLoc.getOfs());
- ASSERT_EQUALS(actualRec->prevOfs(), expectedPrevOfs);
- expectedPrevOfs = actualLoc.getOfs();
-
- recIdx++;
- const int nextOfs = actualRec->nextOfs();
- actualLoc = (nextOfs == DiskLoc::NullOfs ? DiskLoc()
- : DiskLoc(actualLoc.a(), nextOfs));
- }
+ // Make sure we set everything up as requested.
+ assertStateV1RS(txn, records, drecs, legacyGrabBag, em, md);
+}
- if (ext->xnext.isNull()) {
- ASSERT_EQUALS(md->lastExtent(txn), extLoc);
- }
+void assertStateV1RS(OperationContext* txn,
+ const LocAndSize* records,
+ const LocAndSize* drecs,
+ const LocAndSize* legacyGrabBag,
+ const ExtentManager* em,
+ const DummyRecordStoreV1MetaData* md) {
+ invariant(records || drecs); // if both are NULL nothing is being asserted...
- extLoc = ext->xnext;
- }
+ try {
+ if (records) {
+ long long dataSize = 0;
+ long long numRecs = 0;
- // both the expected and actual record lists must be done at this point
- ASSERT_EQUALS(records[recIdx].loc, DiskLoc());
+ int recIdx = 0;
- ASSERT_EQUALS(dataSize, md->dataSize());
- ASSERT_EQUALS(numRecs, md->numRecords());
- }
+ DiskLoc extLoc = md->firstExtent(txn);
+ while (!extLoc.isNull()) { // for each Extent
+ Extent* ext = em->getExtent(extLoc, true);
+ int expectedPrevOfs = DiskLoc::NullOfs;
+ DiskLoc actualLoc = ext->firstRecord;
+ while (!actualLoc.isNull()) { // for each MmapV1RecordHeader in this Extent
+ const MmapV1RecordHeader* actualRec = em->recordForV1(actualLoc);
+ const int actualSize = actualRec->lengthWithHeaders();
- if (drecs) {
- int drecIdx = 0;
- for (int bucketIdx = 0; bucketIdx < RecordStoreV1Base::Buckets; bucketIdx++) {
- DiskLoc actualLoc = md->deletedListEntry(bucketIdx);
-
- if (md->isCapped() && bucketIdx == 1) {
- // In capped collections, the 2nd bucket (index 1) points to the drec before
- // the first drec in the capExtent. If the capExtent is the first Extent,
- // it should be Null.
-
- if (md->capExtent() == md->firstExtent(txn)) {
- ASSERT_EQUALS(actualLoc, DiskLoc());
- }
- else {
- ASSERT_NOT_EQUALS(actualLoc.a(), md->capExtent().a());
- const DeletedRecord* actualDrec =
- &em->recordForV1(actualLoc)->asDeleted();
- ASSERT_EQUALS(actualDrec->nextDeleted().a(), md->capExtent().a());
- }
-
- // Don't do normal checking of bucket 1 in capped collections. Checking
- // other buckets to verify that they are Null.
- continue;
- }
+ dataSize += actualSize - MmapV1RecordHeader::HeaderSize;
+ numRecs += 1;
- while (!actualLoc.isNull()) {
- const DeletedRecord* actualDrec = &em->recordForV1(actualLoc)->asDeleted();
- const int actualSize = actualDrec->lengthWithHeaders();
+ ASSERT_EQUALS(actualLoc, records[recIdx].loc);
+ ASSERT_EQUALS(actualSize, records[recIdx].size);
+
+ ASSERT_EQUALS(actualRec->extentOfs(), extLoc.getOfs());
+ ASSERT_EQUALS(actualRec->prevOfs(), expectedPrevOfs);
+ expectedPrevOfs = actualLoc.getOfs();
+
+ recIdx++;
+ const int nextOfs = actualRec->nextOfs();
+ actualLoc =
+ (nextOfs == DiskLoc::NullOfs ? DiskLoc() : DiskLoc(actualLoc.a(), nextOfs));
+ }
+
+ if (ext->xnext.isNull()) {
+ ASSERT_EQUALS(md->lastExtent(txn), extLoc);
+ }
- ASSERT_EQUALS(actualLoc, drecs[drecIdx].loc);
- ASSERT_EQUALS(actualSize, drecs[drecIdx].size);
+ extLoc = ext->xnext;
+ }
- // Make sure the drec is correct
- ASSERT_EQUALS(actualDrec->extentOfs(), 0);
+ // both the expected and actual record lists must be done at this point
+ ASSERT_EQUALS(records[recIdx].loc, DiskLoc());
- // in capped collections all drecs are linked into a single list in bucket 0
- ASSERT_EQUALS(bucketIdx, md->isCapped()
- ? 0
- : RecordStoreV1Base::bucket(actualSize));
+ ASSERT_EQUALS(dataSize, md->dataSize());
+ ASSERT_EQUALS(numRecs, md->numRecords());
+ }
- drecIdx++;
- actualLoc = actualDrec->nextDeleted();
+ if (drecs) {
+ int drecIdx = 0;
+ for (int bucketIdx = 0; bucketIdx < RecordStoreV1Base::Buckets; bucketIdx++) {
+ DiskLoc actualLoc = md->deletedListEntry(bucketIdx);
+
+ if (md->isCapped() && bucketIdx == 1) {
+ // In capped collections, the 2nd bucket (index 1) points to the drec before
+ // the first drec in the capExtent. If the capExtent is the first Extent,
+ // it should be Null.
+
+ if (md->capExtent() == md->firstExtent(txn)) {
+ ASSERT_EQUALS(actualLoc, DiskLoc());
+ } else {
+ ASSERT_NOT_EQUALS(actualLoc.a(), md->capExtent().a());
+ const DeletedRecord* actualDrec = &em->recordForV1(actualLoc)->asDeleted();
+ ASSERT_EQUALS(actualDrec->nextDeleted().a(), md->capExtent().a());
}
+
+ // Don't do normal checking of bucket 1 in capped collections. The other
+ // buckets are still checked to verify that they are Null.
+ continue;
}
- // both the expected and actual deleted lists must be done at this point
- ASSERT_EQUALS(drecs[drecIdx].loc, DiskLoc());
- }
- if (legacyGrabBag) {
- int grabBagIdx = 0;
- DiskLoc actualLoc = md->deletedListLegacyGrabBag();
while (!actualLoc.isNull()) {
const DeletedRecord* actualDrec = &em->recordForV1(actualLoc)->asDeleted();
const int actualSize = actualDrec->lengthWithHeaders();
- ASSERT_EQUALS(actualLoc, legacyGrabBag[grabBagIdx].loc);
- ASSERT_EQUALS(actualSize, legacyGrabBag[grabBagIdx].size);
+ ASSERT_EQUALS(actualLoc, drecs[drecIdx].loc);
+ ASSERT_EQUALS(actualSize, drecs[drecIdx].size);
+
+ // Make sure the drec is correct
+ ASSERT_EQUALS(actualDrec->extentOfs(), 0);
+
+ // in capped collections all drecs are linked into a single list in bucket 0
+ ASSERT_EQUALS(bucketIdx,
+ md->isCapped() ? 0 : RecordStoreV1Base::bucket(actualSize));
- grabBagIdx++;
+ drecIdx++;
actualLoc = actualDrec->nextDeleted();
}
-
- // both the expected and actual deleted lists must be done at this point
- ASSERT_EQUALS(legacyGrabBag[grabBagIdx].loc, DiskLoc());
- }
- else {
- // Unless a test is actually using the grabBag it should be empty
- ASSERT_EQUALS(md->deletedListLegacyGrabBag(), DiskLoc());
}
+ // both the expected and actual deleted lists must be done at this point
+ ASSERT_EQUALS(drecs[drecIdx].loc, DiskLoc());
}
- catch (...) {
- // If a test fails, provide extra info to make debugging easier
- printRecList(txn, em, md);
- printDRecList(em, md);
- throw;
+
+ if (legacyGrabBag) {
+ int grabBagIdx = 0;
+ DiskLoc actualLoc = md->deletedListLegacyGrabBag();
+ while (!actualLoc.isNull()) {
+ const DeletedRecord* actualDrec = &em->recordForV1(actualLoc)->asDeleted();
+ const int actualSize = actualDrec->lengthWithHeaders();
+
+ ASSERT_EQUALS(actualLoc, legacyGrabBag[grabBagIdx].loc);
+ ASSERT_EQUALS(actualSize, legacyGrabBag[grabBagIdx].size);
+
+ grabBagIdx++;
+ actualLoc = actualDrec->nextDeleted();
+ }
+
+ // both the expected and actual deleted lists must be done at this point
+ ASSERT_EQUALS(legacyGrabBag[grabBagIdx].loc, DiskLoc());
+ } else {
+ // Unless a test is actually using the grabBag it should be empty
+ ASSERT_EQUALS(md->deletedListLegacyGrabBag(), DiskLoc());
}
+ } catch (...) {
+ // If a test fails, provide extra info to make debugging easier
+ printRecList(txn, em, md);
+ printDRecList(em, md);
+ throw;
}
}
+}
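assertStateV1RS() checks each non-capped deleted record against RecordStoreV1Base::bucket(actualSize). As a rough mental model only, the bucket function maps a size-with-headers to a size-class index; the cutoffs below are assumed values chosen to make the shape concrete, not the engine's real bucketSizes table:

    // Illustrative size-class mapping; thresholds are assumptions.
    int bucketIndexFor(int sizeWithHeaders) {
        static const int kCutoffs[] = {32, 64, 128, 256, 512, 1024, 2048, 4096};
        const int n = sizeof(kCutoffs) / sizeof(kCutoffs[0]);
        for (int i = 0; i < n; i++) {
            if (sizeWithHeaders <= kCutoffs[i])
                return i;
        }
        return n;  // anything larger lands in the last bucket
    }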
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.h b/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.h
index f37969c1ca6..0a038f9e9f3 100644
--- a/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.h
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.h
@@ -37,169 +37,162 @@
namespace mongo {
- class DummyRecordStoreV1MetaData : public RecordStoreV1MetaData {
- public:
- DummyRecordStoreV1MetaData( bool capped, int userFlags );
- virtual ~DummyRecordStoreV1MetaData(){}
+class DummyRecordStoreV1MetaData : public RecordStoreV1MetaData {
+public:
+ DummyRecordStoreV1MetaData(bool capped, int userFlags);
+ virtual ~DummyRecordStoreV1MetaData() {}
- virtual const DiskLoc& capExtent() const;
- virtual void setCapExtent( OperationContext* txn, const DiskLoc& loc );
+ virtual const DiskLoc& capExtent() const;
+ virtual void setCapExtent(OperationContext* txn, const DiskLoc& loc);
- virtual const DiskLoc& capFirstNewRecord() const;
- virtual void setCapFirstNewRecord( OperationContext* txn, const DiskLoc& loc );
+ virtual const DiskLoc& capFirstNewRecord() const;
+ virtual void setCapFirstNewRecord(OperationContext* txn, const DiskLoc& loc);
- virtual long long dataSize() const;
- virtual long long numRecords() const;
+ virtual long long dataSize() const;
+ virtual long long numRecords() const;
- virtual void incrementStats( OperationContext* txn,
- long long dataSizeIncrement,
- long long numRecordsIncrement );
+ virtual void incrementStats(OperationContext* txn,
+ long long dataSizeIncrement,
+ long long numRecordsIncrement);
- virtual void setStats( OperationContext* txn,
- long long dataSize,
- long long numRecords );
+ virtual void setStats(OperationContext* txn, long long dataSize, long long numRecords);
- virtual DiskLoc deletedListEntry( int bucket ) const;
- virtual void setDeletedListEntry( OperationContext* txn,
- int bucket,
- const DiskLoc& loc );
+ virtual DiskLoc deletedListEntry(int bucket) const;
+ virtual void setDeletedListEntry(OperationContext* txn, int bucket, const DiskLoc& loc);
- virtual DiskLoc deletedListLegacyGrabBag() const;
- virtual void setDeletedListLegacyGrabBag(OperationContext* txn, const DiskLoc& loc);
+ virtual DiskLoc deletedListLegacyGrabBag() const;
+ virtual void setDeletedListLegacyGrabBag(OperationContext* txn, const DiskLoc& loc);
- virtual void orphanDeletedList(OperationContext* txn);
+ virtual void orphanDeletedList(OperationContext* txn);
- virtual const DiskLoc& firstExtent( OperationContext* txn ) const;
- virtual void setFirstExtent( OperationContext* txn, const DiskLoc& loc );
+ virtual const DiskLoc& firstExtent(OperationContext* txn) const;
+ virtual void setFirstExtent(OperationContext* txn, const DiskLoc& loc);
- virtual const DiskLoc& lastExtent( OperationContext* txn ) const;
- virtual void setLastExtent( OperationContext* txn, const DiskLoc& loc );
+ virtual const DiskLoc& lastExtent(OperationContext* txn) const;
+ virtual void setLastExtent(OperationContext* txn, const DiskLoc& loc);
- virtual bool isCapped() const;
+ virtual bool isCapped() const;
- virtual bool isUserFlagSet( int flag ) const;
- virtual int userFlags() const { return _userFlags; }
- virtual bool setUserFlag( OperationContext* txn, int flag );
- virtual bool clearUserFlag( OperationContext* txn, int flag );
- virtual bool replaceUserFlags( OperationContext* txn, int flags );
+ virtual bool isUserFlagSet(int flag) const;
+ virtual int userFlags() const {
+ return _userFlags;
+ }
+ virtual bool setUserFlag(OperationContext* txn, int flag);
+ virtual bool clearUserFlag(OperationContext* txn, int flag);
+ virtual bool replaceUserFlags(OperationContext* txn, int flags);
- virtual int lastExtentSize( OperationContext* txn ) const;
- virtual void setLastExtentSize( OperationContext* txn, int newMax );
+ virtual int lastExtentSize(OperationContext* txn) const;
+ virtual void setLastExtentSize(OperationContext* txn, int newMax);
- virtual long long maxCappedDocs() const;
+ virtual long long maxCappedDocs() const;
- protected:
+protected:
+ DiskLoc _capExtent;
+ DiskLoc _capFirstNewRecord;
- DiskLoc _capExtent;
- DiskLoc _capFirstNewRecord;
+ long long _dataSize;
+ long long _numRecords;
- long long _dataSize;
- long long _numRecords;
+ DiskLoc _firstExtent;
+ DiskLoc _lastExtent;
- DiskLoc _firstExtent;
- DiskLoc _lastExtent;
+ bool _capped;
+ int _userFlags;
+ long long _maxCappedDocs;
- bool _capped;
- int _userFlags;
- long long _maxCappedDocs;
+ int _lastExtentSize;
+ double _paddingFactor;
- int _lastExtentSize;
- double _paddingFactor;
+ std::vector<DiskLoc> _deletedLists;
+ DiskLoc _deletedListLegacyGrabBag;
+};
- std::vector<DiskLoc> _deletedLists;
- DiskLoc _deletedListLegacyGrabBag;
- };
+class DummyExtentManager : public ExtentManager {
+public:
+ virtual ~DummyExtentManager();
- class DummyExtentManager : public ExtentManager {
- public:
- virtual ~DummyExtentManager();
+ virtual Status init(OperationContext* txn);
- virtual Status init(OperationContext* txn);
+ virtual int numFiles() const;
+ virtual long long fileSize() const;
- virtual int numFiles() const;
- virtual long long fileSize() const;
+ virtual DiskLoc allocateExtent(OperationContext* txn, bool capped, int size, bool enforceQuota);
- virtual DiskLoc allocateExtent( OperationContext* txn,
- bool capped,
- int size,
- bool enforceQuota );
+ virtual void freeExtents(OperationContext* txn, DiskLoc firstExt, DiskLoc lastExt);
- virtual void freeExtents( OperationContext* txn,
- DiskLoc firstExt, DiskLoc lastExt );
+ virtual void freeExtent(OperationContext* txn, DiskLoc extent);
- virtual void freeExtent( OperationContext* txn, DiskLoc extent );
+ virtual void freeListStats(OperationContext* txn,
+ int* numExtents,
+ int64_t* totalFreeSizeBytes) const;
- virtual void freeListStats(OperationContext* txn,
- int* numExtents,
- int64_t* totalFreeSizeBytes) const;
+ virtual MmapV1RecordHeader* recordForV1(const DiskLoc& loc) const;
- virtual MmapV1RecordHeader* recordForV1( const DiskLoc& loc ) const;
+ virtual std::unique_ptr<RecordFetcher> recordNeedsFetch(const DiskLoc& loc) const final;
- virtual std::unique_ptr<RecordFetcher> recordNeedsFetch( const DiskLoc& loc ) const final;
+ virtual Extent* extentForV1(const DiskLoc& loc) const;
- virtual Extent* extentForV1( const DiskLoc& loc ) const;
+ virtual DiskLoc extentLocForV1(const DiskLoc& loc) const;
- virtual DiskLoc extentLocForV1( const DiskLoc& loc ) const;
+ virtual Extent* getExtent(const DiskLoc& loc, bool doSanityCheck = true) const;
- virtual Extent* getExtent( const DiskLoc& loc, bool doSanityCheck = true ) const;
+ virtual int maxSize() const;
- virtual int maxSize() const;
+ virtual CacheHint* cacheHint(const DiskLoc& extentLoc, const HintType& hint);
- virtual CacheHint* cacheHint( const DiskLoc& extentLoc, const HintType& hint );
+protected:
+ struct ExtentInfo {
+ char* data;
+ size_t length;
+ };
- protected:
- struct ExtentInfo {
- char* data;
- size_t length;
- };
+ std::vector<ExtentInfo> _extents;
+};
- std::vector<ExtentInfo> _extents;
- };
-
- struct LocAndSize {
- DiskLoc loc;
- int size; // with headers
- };
+struct LocAndSize {
+ DiskLoc loc;
+ int size; // with headers
+};
- /**
- * Creates a V1 storage/mmap_v1 with the passed in records and DeletedRecords (drecs).
- *
- * List of LocAndSize are terminated by a Null DiskLoc. Passing a NULL pointer is shorthand for
- * an empty list. Each extent gets it's own DiskLoc file number. DiskLoc Offsets must be > 1000.
- *
- * records must be sorted by extent/file. offsets within an extent can be in any order.
- *
- * In a simple RS, drecs must be grouped into size-buckets, but the ordering within the size
- * buckets is up to you.
- *
- * In a capped collection, all drecs form a single list and must be grouped by extent, with each
- * extent having at least one drec. capFirstNewRecord() and capExtent() *must* be correctly set
- * on md before calling.
- *
- * You are responsible for ensuring the records and drecs don't overlap.
- *
- * ExtentManager and MetaData must both be empty.
- */
- void initializeV1RS(OperationContext* txn,
- const LocAndSize* records,
- const LocAndSize* drecs,
- const LocAndSize* legacyGrabBag,
- DummyExtentManager* em,
- DummyRecordStoreV1MetaData* md);
-
- /**
- * Asserts that the V1RecordStore defined by md has the passed in records and drecs in the
- * correct order.
- *
- * List of LocAndSize are terminated by a Null DiskLoc. Passing a NULL pointer means don't check
- * that list.
- */
- void assertStateV1RS(OperationContext* txn,
- const LocAndSize* records,
- const LocAndSize* drecs,
- const LocAndSize* legacyGrabBag,
- const ExtentManager* em,
- const DummyRecordStoreV1MetaData* md);
+/**
+ * Creates a V1 record store (storage/mmap_v1) with the passed-in records and
+ * DeletedRecords (drecs).
+ *
+ * Lists of LocAndSize are terminated by a Null DiskLoc. Passing a NULL pointer is shorthand for
+ * an empty list. Each extent gets its own DiskLoc file number. DiskLoc offsets must be > 1000.
+ *
+ * Records must be sorted by extent/file; offsets within an extent can be in any order.
+ *
+ * In a simple RS, drecs must be grouped into size-buckets, but the ordering within the size
+ * buckets is up to you.
+ *
+ * In a capped collection, all drecs form a single list and must be grouped by extent, with each
+ * extent having at least one drec. capFirstNewRecord() and capExtent() *must* be correctly set
+ * on md before calling.
+ *
+ * You are responsible for ensuring the records and drecs don't overlap.
+ *
+ * ExtentManager and MetaData must both be empty.
+ */
+void initializeV1RS(OperationContext* txn,
+ const LocAndSize* records,
+ const LocAndSize* drecs,
+ const LocAndSize* legacyGrabBag,
+ DummyExtentManager* em,
+ DummyRecordStoreV1MetaData* md);
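+//
+// A minimal usage sketch (illustrative only, not part of the original header), assuming an
+// OperationContext* 'txn' plus an empty DummyExtentManager 'em' and an empty
+// DummyRecordStoreV1MetaData 'md', per the requirements above:
+//
+//     const LocAndSize records[] = {{DiskLoc(0, 1004), 100}, {}};  // {} is the Null terminator
+//     const LocAndSize drecs[] = {{DiskLoc(0, 1104), 100}, {}};
+//     initializeV1RS(txn, records, drecs, NULL, &em, &md);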
+
+/**
+ * Asserts that the V1RecordStore defined by md has the passed-in records and drecs in the
+ * correct order.
+ *
+ * Lists of LocAndSize are terminated by a Null DiskLoc. Passing a NULL pointer means don't check
+ * that list.
+ */
+void assertStateV1RS(OperationContext* txn,
+ const LocAndSize* records,
+ const LocAndSize* drecs,
+ const LocAndSize* legacyGrabBag,
+ const ExtentManager* em,
+ const DummyRecordStoreV1MetaData* md);
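+//
+// Illustrative follow-up to the sketch above: after exercising the record store, a test
+// could check that the on-disk layout still matches (passing NULL skips the legacy
+// grab-bag list check):
+//
+//     assertStateV1RS(txn, records, drecs, NULL, &em, &md);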
} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/repair_database.cpp b/src/mongo/db/storage/mmap_v1/repair_database.cpp
index aa83636ae6b..6db0a4e15da 100644
--- a/src/mongo/db/storage/mmap_v1/repair_database.cpp
+++ b/src/mongo/db/storage/mmap_v1/repair_database.cpp
@@ -55,427 +55,415 @@
namespace mongo {
- using std::unique_ptr;
- using std::endl;
- using std::map;
- using std::string;
- using std::stringstream;
- using std::vector;
-
- typedef boost::filesystem::path Path;
-
- // inheritable class to implement an operation that may be applied to all
- // files in a database using _applyOpToDataFiles()
- class FileOp {
- public:
- virtual ~FileOp() {}
- // Return true if file exists and operation successful
- virtual bool apply( const boost::filesystem::path &p ) = 0;
- virtual const char * op() const = 0;
- };
-
- void _applyOpToDataFiles(const string& database, FileOp &fo, bool afterAllocator = false,
- const string& path = storageGlobalParams.dbpath);
-
- void _deleteDataFiles(const std::string& database) {
- if (storageGlobalParams.directoryperdb) {
- FileAllocator::get()->waitUntilFinished();
- MONGO_ASSERT_ON_EXCEPTION_WITH_MSG(
- boost::filesystem::remove_all(
- boost::filesystem::path(storageGlobalParams.dbpath) / database),
- "delete data files with a directoryperdb");
- return;
- }
- class : public FileOp {
- virtual bool apply( const boost::filesystem::path &p ) {
- return boost::filesystem::remove( p );
- }
- virtual const char * op() const {
- return "remove";
- }
- } deleter;
- _applyOpToDataFiles( database, deleter, true );
+using std::unique_ptr;
+using std::endl;
+using std::map;
+using std::string;
+using std::stringstream;
+using std::vector;
+
+typedef boost::filesystem::path Path;
+
+// inheritable class to implement an operation that may be applied to all
+// files in a database using _applyOpToDataFiles()
+class FileOp {
+public:
+ virtual ~FileOp() {}
+    // Return true if the file exists and the operation succeeded
+ virtual bool apply(const boost::filesystem::path& p) = 0;
+ virtual const char* op() const = 0;
+};
+
+void _applyOpToDataFiles(const string& database,
+ FileOp& fo,
+ bool afterAllocator = false,
+ const string& path = storageGlobalParams.dbpath);
+
+void _deleteDataFiles(const std::string& database) {
+ if (storageGlobalParams.directoryperdb) {
+ FileAllocator::get()->waitUntilFinished();
+ MONGO_ASSERT_ON_EXCEPTION_WITH_MSG(
+ boost::filesystem::remove_all(boost::filesystem::path(storageGlobalParams.dbpath) /
+ database),
+ "delete data files with a directoryperdb");
+ return;
}
-
- void boostRenameWrapper( const Path &from, const Path &to ) {
- try {
- boost::filesystem::rename( from, to );
+ class : public FileOp {
+ virtual bool apply(const boost::filesystem::path& p) {
+ return boost::filesystem::remove(p);
}
- catch ( const boost::filesystem::filesystem_error & ) {
- // boost rename doesn't work across partitions
- boost::filesystem::copy_file( from, to);
- boost::filesystem::remove( from );
+ virtual const char* op() const {
+ return "remove";
}
}
+ deleter;
+ _applyOpToDataFiles(database, deleter, true);
+}
- // back up original database files to 'temp' dir
- void _renameForBackup( const std::string& database, const Path &reservedPath ) {
- Path newPath( reservedPath );
- if (storageGlobalParams.directoryperdb)
- newPath /= database;
- class Renamer : public FileOp {
- public:
- Renamer( const Path &newPath ) : newPath_( newPath ) {}
- private:
- const boost::filesystem::path &newPath_;
- virtual bool apply( const Path &p ) {
- if ( !boost::filesystem::exists( p ) )
- return false;
- boostRenameWrapper( p, newPath_ / ( p.leaf().string() + ".bak" ) );
- return true;
- }
- virtual const char * op() const {
- return "renaming";
- }
- } renamer( newPath );
- _applyOpToDataFiles( database, renamer, true );
+void boostRenameWrapper(const Path& from, const Path& to) {
+ try {
+ boost::filesystem::rename(from, to);
+ } catch (const boost::filesystem::filesystem_error&) {
+ // boost rename doesn't work across partitions
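+        // (On POSIX systems rename(2) fails with EXDEV when source and destination are
+        // on different filesystems, so fall back to a full copy followed by a remove.)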
+ boost::filesystem::copy_file(from, to);
+ boost::filesystem::remove(from);
}
+}
- intmax_t dbSize( const string& database ) {
- class SizeAccumulator : public FileOp {
- public:
- SizeAccumulator() : totalSize_( 0 ) {}
- intmax_t size() const {
- return totalSize_;
- }
- private:
- virtual bool apply( const boost::filesystem::path &p ) {
- if ( !boost::filesystem::exists( p ) )
- return false;
- totalSize_ += boost::filesystem::file_size( p );
- return true;
- }
- virtual const char *op() const {
- return "checking size";
- }
- intmax_t totalSize_;
- };
- SizeAccumulator sa;
- _applyOpToDataFiles( database, sa );
- return sa.size();
- }
+// back up original database files to 'temp' dir
+void _renameForBackup(const std::string& database, const Path& reservedPath) {
+ Path newPath(reservedPath);
+ if (storageGlobalParams.directoryperdb)
+ newPath /= database;
+ class Renamer : public FileOp {
+ public:
+ Renamer(const Path& newPath) : newPath_(newPath) {}
- // move temp files to standard data dir
- void _replaceWithRecovered( const string& database, const char *reservedPathString ) {
- Path newPath(storageGlobalParams.dbpath);
- if (storageGlobalParams.directoryperdb)
- newPath /= database;
- class Replacer : public FileOp {
- public:
- Replacer( const Path &newPath ) : newPath_( newPath ) {}
- private:
- const boost::filesystem::path &newPath_;
- virtual bool apply( const Path &p ) {
- if ( !boost::filesystem::exists( p ) )
- return false;
- boostRenameWrapper( p, newPath_ / p.leaf() );
- return true;
- }
- virtual const char * op() const {
- return "renaming";
- }
- } replacer( newPath );
- _applyOpToDataFiles( database, replacer, true, reservedPathString );
- }
+ private:
+ const boost::filesystem::path& newPath_;
+ virtual bool apply(const Path& p) {
+ if (!boost::filesystem::exists(p))
+ return false;
+ boostRenameWrapper(p, newPath_ / (p.leaf().string() + ".bak"));
+ return true;
+ }
+ virtual const char* op() const {
+ return "renaming";
+ }
+ } renamer(newPath);
+ _applyOpToDataFiles(database, renamer, true);
+}
- // generate a directory name for storing temp data files
- Path uniqueReservedPath( const char *prefix ) {
- Path repairPath = Path(storageGlobalParams.repairpath);
- Path reservedPath;
- int i = 0;
- bool exists = false;
- do {
- stringstream ss;
- ss << prefix << "_repairDatabase_" << i++;
- reservedPath = repairPath / ss.str();
- MONGO_ASSERT_ON_EXCEPTION( exists = boost::filesystem::exists( reservedPath ) );
+intmax_t dbSize(const string& database) {
+ class SizeAccumulator : public FileOp {
+ public:
+ SizeAccumulator() : totalSize_(0) {}
+ intmax_t size() const {
+ return totalSize_;
}
- while ( exists );
- return reservedPath;
- }
- void _applyOpToDataFiles( const string& database, FileOp &fo, bool afterAllocator, const string& path ) {
- if ( afterAllocator )
- FileAllocator::get()->waitUntilFinished();
- string c = database;
- c += '.';
- boost::filesystem::path p(path);
- if (storageGlobalParams.directoryperdb)
- p /= database;
- boost::filesystem::path q;
- q = p / (c+"ns");
- bool ok = false;
- MONGO_ASSERT_ON_EXCEPTION( ok = fo.apply( q ) );
- if ( ok ) {
- LOG(2) << fo.op() << " file " << q.string() << endl;
+ private:
+ virtual bool apply(const boost::filesystem::path& p) {
+ if (!boost::filesystem::exists(p))
+ return false;
+ totalSize_ += boost::filesystem::file_size(p);
+ return true;
}
- int i = 0;
- int extra = 10; // should not be necessary, this is defensive in case there are missing files
- while ( 1 ) {
- verify( i <= DiskLoc::MaxFiles );
- stringstream ss;
- ss << c << i;
- q = p / ss.str();
- MONGO_ASSERT_ON_EXCEPTION( ok = fo.apply(q) );
- if ( ok ) {
- if ( extra != 10 ) {
- LOG(1) << fo.op() << " file " << q.string() << endl;
- log() << " _applyOpToDataFiles() warning: extra == " << extra << endl;
- }
- }
- else if ( --extra <= 0 )
- break;
- i++;
+ virtual const char* op() const {
+ return "checking size";
}
- }
+ intmax_t totalSize_;
+ };
+ SizeAccumulator sa;
+ _applyOpToDataFiles(database, sa);
+ return sa.size();
+}
- class RepairFileDeleter {
+// move temp files to standard data dir
+void _replaceWithRecovered(const string& database, const char* reservedPathString) {
+ Path newPath(storageGlobalParams.dbpath);
+ if (storageGlobalParams.directoryperdb)
+ newPath /= database;
+ class Replacer : public FileOp {
public:
- RepairFileDeleter( OperationContext* txn,
- const string& dbName,
- const string& pathString,
- const Path& path )
- : _txn(txn),
- _dbName( dbName ),
- _pathString( pathString ),
- _path( path ),
- _success( false ) {
+ Replacer(const Path& newPath) : newPath_(newPath) {}
+
+ private:
+ const boost::filesystem::path& newPath_;
+ virtual bool apply(const Path& p) {
+ if (!boost::filesystem::exists(p))
+ return false;
+ boostRenameWrapper(p, newPath_ / p.leaf());
+ return true;
}
+ virtual const char* op() const {
+ return "renaming";
+ }
+ } replacer(newPath);
+ _applyOpToDataFiles(database, replacer, true, reservedPathString);
+}
+
+// generate a directory name for storing temp data files
+Path uniqueReservedPath(const char* prefix) {
+ Path repairPath = Path(storageGlobalParams.repairpath);
+ Path reservedPath;
+ int i = 0;
+ bool exists = false;
+ do {
+ stringstream ss;
+ ss << prefix << "_repairDatabase_" << i++;
+ reservedPath = repairPath / ss.str();
+ MONGO_ASSERT_ON_EXCEPTION(exists = boost::filesystem::exists(reservedPath));
+ } while (exists);
+ return reservedPath;
+}
- ~RepairFileDeleter() {
- if ( _success )
- return;
+void _applyOpToDataFiles(const string& database,
+ FileOp& fo,
+ bool afterAllocator,
+ const string& path) {
+ if (afterAllocator)
+ FileAllocator::get()->waitUntilFinished();
+ string c = database;
+ c += '.';
+ boost::filesystem::path p(path);
+ if (storageGlobalParams.directoryperdb)
+ p /= database;
+ boost::filesystem::path q;
+ q = p / (c + "ns");
+ bool ok = false;
+ MONGO_ASSERT_ON_EXCEPTION(ok = fo.apply(q));
+ if (ok) {
+ LOG(2) << fo.op() << " file " << q.string() << endl;
+ }
+ int i = 0;
+    int extra = 10;  // should not be necessary; this is defensive in case there are missing files
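+    // Probe successive <database>.<n> file names; 'extra' lets the walk tolerate gaps in
+    // the numbering, stopping only once ten names in total have come up missing rather
+    // than at the first gap.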
+ while (1) {
+ verify(i <= DiskLoc::MaxFiles);
+ stringstream ss;
+ ss << c << i;
+ q = p / ss.str();
+ MONGO_ASSERT_ON_EXCEPTION(ok = fo.apply(q));
+ if (ok) {
+ if (extra != 10) {
+ LOG(1) << fo.op() << " file " << q.string() << endl;
+ log() << " _applyOpToDataFiles() warning: extra == " << extra << endl;
+ }
+ } else if (--extra <= 0)
+ break;
+ i++;
+ }
+}
- log() << "cleaning up failed repair "
- << "db: " << _dbName << " path: " << _pathString;
+class RepairFileDeleter {
+public:
+ RepairFileDeleter(OperationContext* txn,
+ const string& dbName,
+ const string& pathString,
+ const Path& path)
+ : _txn(txn), _dbName(dbName), _pathString(pathString), _path(path), _success(false) {}
- try {
- getDur().syncDataAndTruncateJournal(_txn);
+ ~RepairFileDeleter() {
+ if (_success)
+ return;
- // need both in case journaling is disabled
- MongoFile::flushAll(true);
+ log() << "cleaning up failed repair "
+ << "db: " << _dbName << " path: " << _pathString;
- MONGO_ASSERT_ON_EXCEPTION( boost::filesystem::remove_all( _path ) );
- }
- catch ( DBException& e ) {
- error() << "RepairFileDeleter failed to cleanup: " << e;
- error() << "aborting";
- fassertFailed( 17402 );
- }
- }
+ try {
+ getDur().syncDataAndTruncateJournal(_txn);
+
+ // need both in case journaling is disabled
+ MongoFile::flushAll(true);
- void success() {
- _success = true;
+ MONGO_ASSERT_ON_EXCEPTION(boost::filesystem::remove_all(_path));
+ } catch (DBException& e) {
+ error() << "RepairFileDeleter failed to cleanup: " << e;
+ error() << "aborting";
+ fassertFailed(17402);
}
+ }
- private:
- OperationContext* _txn;
- string _dbName;
- string _pathString;
- Path _path;
- bool _success;
- };
+ void success() {
+ _success = true;
+ }
- Status MMAPV1Engine::repairDatabase( OperationContext* txn,
- const std::string& dbName,
- bool preserveClonedFilesOnFailure,
- bool backupOriginalFiles ) {
- unique_ptr<RepairFileDeleter> repairFileDeleter;
+private:
+ OperationContext* _txn;
+ string _dbName;
+ string _pathString;
+ Path _path;
+ bool _success;
+};
+
+Status MMAPV1Engine::repairDatabase(OperationContext* txn,
+ const std::string& dbName,
+ bool preserveClonedFilesOnFailure,
+ bool backupOriginalFiles) {
+ unique_ptr<RepairFileDeleter> repairFileDeleter;
+
+ // Must be done before and after repair
+ getDur().syncDataAndTruncateJournal(txn);
+
+ intmax_t totalSize = dbSize(dbName);
+ intmax_t freeSize = File::freeSpace(storageGlobalParams.repairpath);
+
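+    // (A freeSize of -1 appears to mean "unknown"; the disk-space check below is only
+    // enforced when a real figure is available.)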
+ if (freeSize > -1 && freeSize < totalSize) {
+ return Status(ErrorCodes::OutOfDiskSpace,
+ str::stream()
+ << "Cannot repair database " << dbName << " having size: " << totalSize
+ << " (bytes) because free disk space is: " << freeSize << " (bytes)");
+ }
- // Must be done before and after repair
- getDur().syncDataAndTruncateJournal(txn);
+ txn->checkForInterrupt();
- intmax_t totalSize = dbSize( dbName );
- intmax_t freeSize = File::freeSpace(storageGlobalParams.repairpath);
+ Path reservedPath = uniqueReservedPath(
+ (preserveClonedFilesOnFailure || backupOriginalFiles) ? "backup" : "_tmp");
+ bool created = false;
+ MONGO_ASSERT_ON_EXCEPTION(created = boost::filesystem::create_directory(reservedPath));
+ invariant(created);
+ string reservedPathString = reservedPath.string();
- if ( freeSize > -1 && freeSize < totalSize ) {
- return Status( ErrorCodes::OutOfDiskSpace,
- str::stream() << "Cannot repair database " << dbName
- << " having size: " << totalSize
- << " (bytes) because free disk space is: " << freeSize << " (bytes)" );
- }
+ if (!preserveClonedFilesOnFailure)
+ repairFileDeleter.reset(
+ new RepairFileDeleter(txn, dbName, reservedPathString, reservedPath));
- txn->checkForInterrupt();
+ {
+ Database* originalDatabase = dbHolder().openDb(txn, dbName);
+ if (originalDatabase == NULL) {
+ return Status(ErrorCodes::NamespaceNotFound, "database does not exist to repair");
+ }
- Path reservedPath =
- uniqueReservedPath( ( preserveClonedFilesOnFailure || backupOriginalFiles ) ?
- "backup" : "_tmp" );
- bool created = false;
- MONGO_ASSERT_ON_EXCEPTION( created = boost::filesystem::create_directory( reservedPath ) );
- invariant( created );
- string reservedPathString = reservedPath.string();
+ unique_ptr<MMAPV1DatabaseCatalogEntry> dbEntry;
+ unique_ptr<Database> tempDatabase;
- if ( !preserveClonedFilesOnFailure )
- repairFileDeleter.reset( new RepairFileDeleter( txn,
- dbName,
- reservedPathString,
- reservedPath ) );
+ // Must call this before MMAPV1DatabaseCatalogEntry's destructor closes the DB files
+ ON_BLOCK_EXIT(&dur::DurableInterface::syncDataAndTruncateJournal, &getDur(), txn);
{
- Database* originalDatabase = dbHolder().openDb(txn, dbName);
- if (originalDatabase == NULL) {
- return Status(ErrorCodes::NamespaceNotFound, "database does not exist to repair");
- }
-
- unique_ptr<MMAPV1DatabaseCatalogEntry> dbEntry;
- unique_ptr<Database> tempDatabase;
+ dbEntry.reset(new MMAPV1DatabaseCatalogEntry(
+ txn, dbName, reservedPathString, storageGlobalParams.directoryperdb, true));
+ tempDatabase.reset(new Database(txn, dbName, dbEntry.get()));
+ }
- // Must call this before MMAPV1DatabaseCatalogEntry's destructor closes the DB files
- ON_BLOCK_EXIT(&dur::DurableInterface::syncDataAndTruncateJournal, &getDur(), txn);
+ map<string, CollectionOptions> namespacesToCopy;
+ {
+ string ns = dbName + ".system.namespaces";
+ OldClientContext ctx(txn, ns);
+ Collection* coll = originalDatabase->getCollection(ns);
+ if (coll) {
+ auto cursor = coll->getCursor(txn);
+ while (auto record = cursor->next()) {
+ BSONObj obj = record->data.releaseToBson();
- {
- dbEntry.reset(new MMAPV1DatabaseCatalogEntry(txn,
- dbName,
- reservedPathString,
- storageGlobalParams.directoryperdb,
- true));
- tempDatabase.reset( new Database(txn, dbName, dbEntry.get()));
- }
+ string ns = obj["name"].String();
- map<string,CollectionOptions> namespacesToCopy;
- {
- string ns = dbName + ".system.namespaces";
- OldClientContext ctx(txn, ns );
- Collection* coll = originalDatabase->getCollection( ns );
- if ( coll ) {
- auto cursor = coll->getCursor(txn);
- while (auto record = cursor->next()) {
- BSONObj obj = record->data.releaseToBson();
-
- string ns = obj["name"].String();
-
- NamespaceString nss( ns );
- if ( nss.isSystem() ) {
- if ( nss.isSystemDotIndexes() )
- continue;
- if ( nss.coll() == "system.namespaces" )
- continue;
- }
-
- if ( !nss.isNormal() )
+ NamespaceString nss(ns);
+ if (nss.isSystem()) {
+ if (nss.isSystemDotIndexes())
continue;
+ if (nss.coll() == "system.namespaces")
+ continue;
+ }
+
+ if (!nss.isNormal())
+ continue;
- CollectionOptions options;
- if ( obj["options"].isABSONObj() ) {
- Status status = options.parse( obj["options"].Obj() );
- if ( !status.isOK() )
- return status;
- }
- namespacesToCopy[ns] = options;
+ CollectionOptions options;
+ if (obj["options"].isABSONObj()) {
+ Status status = options.parse(obj["options"].Obj());
+ if (!status.isOK())
+ return status;
}
+ namespacesToCopy[ns] = options;
}
}
+ }
- for ( map<string,CollectionOptions>::const_iterator i = namespacesToCopy.begin();
- i != namespacesToCopy.end();
- ++i ) {
- string ns = i->first;
- CollectionOptions options = i->second;
-
- Collection* tempCollection = NULL;
- {
- WriteUnitOfWork wunit(txn);
- tempCollection = tempDatabase->createCollection(txn, ns, options, false);
- wunit.commit();
- }
+ for (map<string, CollectionOptions>::const_iterator i = namespacesToCopy.begin();
+ i != namespacesToCopy.end();
+ ++i) {
+ string ns = i->first;
+ CollectionOptions options = i->second;
- OldClientContext readContext(txn, ns, originalDatabase);
- Collection* originalCollection = originalDatabase->getCollection( ns );
- invariant( originalCollection );
-
- // data
-
- // TODO SERVER-14812 add a mode that drops duplicates rather than failing
- MultiIndexBlock indexer(txn, tempCollection );
- {
- vector<BSONObj> indexes;
- IndexCatalog::IndexIterator ii =
- originalCollection->getIndexCatalog()->getIndexIterator( txn, false );
- while ( ii.more() ) {
- IndexDescriptor* desc = ii.next();
- indexes.push_back( desc->infoObj() );
- }
+ Collection* tempCollection = NULL;
+ {
+ WriteUnitOfWork wunit(txn);
+ tempCollection = tempDatabase->createCollection(txn, ns, options, false);
+ wunit.commit();
+ }
- Status status = indexer.init( indexes );
- if (!status.isOK()) {
- return status;
- }
- }
+ OldClientContext readContext(txn, ns, originalDatabase);
+ Collection* originalCollection = originalDatabase->getCollection(ns);
+ invariant(originalCollection);
- auto cursor = originalCollection->getCursor(txn);
- while (auto record = cursor->next()) {
- BSONObj doc = record->data.releaseToBson();
-
- WriteUnitOfWork wunit(txn);
- StatusWith<RecordId> result = tempCollection->insertDocument(txn,
- doc,
- &indexer,
- false);
- if ( !result.isOK() )
- return result.getStatus();
-
- wunit.commit();
- txn->checkForInterrupt();
- }
-
- Status status = indexer.doneInserting();
- if (!status.isOK())
- return status;
+ // data
- {
- WriteUnitOfWork wunit(txn);
- indexer.commit();
- wunit.commit();
+ // TODO SERVER-14812 add a mode that drops duplicates rather than failing
+ MultiIndexBlock indexer(txn, tempCollection);
+ {
+ vector<BSONObj> indexes;
+ IndexCatalog::IndexIterator ii =
+ originalCollection->getIndexCatalog()->getIndexIterator(txn, false);
+ while (ii.more()) {
+ IndexDescriptor* desc = ii.next();
+ indexes.push_back(desc->infoObj());
}
+ Status status = indexer.init(indexes);
+ if (!status.isOK()) {
+ return status;
+ }
}
- getDur().syncDataAndTruncateJournal(txn);
+ auto cursor = originalCollection->getCursor(txn);
+ while (auto record = cursor->next()) {
+ BSONObj doc = record->data.releaseToBson();
- // need both in case journaling is disabled
- MongoFile::flushAll(true);
+ WriteUnitOfWork wunit(txn);
+ StatusWith<RecordId> result =
+ tempCollection->insertDocument(txn, doc, &indexer, false);
+ if (!result.isOK())
+ return result.getStatus();
+
+ wunit.commit();
+ txn->checkForInterrupt();
+ }
+
+ Status status = indexer.doneInserting();
+ if (!status.isOK())
+ return status;
- txn->checkForInterrupt();
+ {
+ WriteUnitOfWork wunit(txn);
+ indexer.commit();
+ wunit.commit();
+ }
}
- // at this point if we abort, we don't want to delete new files
- // as they might be the only copies
+ getDur().syncDataAndTruncateJournal(txn);
- if ( repairFileDeleter.get() )
- repairFileDeleter->success();
+ // need both in case journaling is disabled
+ MongoFile::flushAll(true);
- // Close the database so we can rename/delete the original data files
- dbHolder().close(txn, dbName);
+ txn->checkForInterrupt();
+ }
- if ( backupOriginalFiles ) {
- _renameForBackup( dbName, reservedPath );
- }
- else {
- // first make new directory before deleting data
- Path newDir = Path(storageGlobalParams.dbpath) / dbName;
- MONGO_ASSERT_ON_EXCEPTION(boost::filesystem::create_directory(newDir));
+ // at this point if we abort, we don't want to delete new files
+ // as they might be the only copies
- // this deletes old files
- _deleteDataFiles( dbName );
+ if (repairFileDeleter.get())
+ repairFileDeleter->success();
- if ( !boost::filesystem::exists(newDir) ) {
- // we deleted because of directoryperdb
- // re-create
- MONGO_ASSERT_ON_EXCEPTION(boost::filesystem::create_directory(newDir));
- }
- }
+ // Close the database so we can rename/delete the original data files
+ dbHolder().close(txn, dbName);
+
+ if (backupOriginalFiles) {
+ _renameForBackup(dbName, reservedPath);
+ } else {
+ // first make new directory before deleting data
+ Path newDir = Path(storageGlobalParams.dbpath) / dbName;
+ MONGO_ASSERT_ON_EXCEPTION(boost::filesystem::create_directory(newDir));
- _replaceWithRecovered( dbName, reservedPathString.c_str() );
+ // this deletes old files
+ _deleteDataFiles(dbName);
- if (!backupOriginalFiles) {
- MONGO_ASSERT_ON_EXCEPTION(boost::filesystem::remove_all(reservedPath));
+ if (!boost::filesystem::exists(newDir)) {
+            // the directory itself was deleted because of directoryperdb; re-create it
+ MONGO_ASSERT_ON_EXCEPTION(boost::filesystem::create_directory(newDir));
}
+ }
- // Reopen the database so it's discoverable
- dbHolder().openDb(txn, dbName);
+ _replaceWithRecovered(dbName, reservedPathString.c_str());
- return Status::OK();
+ if (!backupOriginalFiles) {
+ MONGO_ASSERT_ON_EXCEPTION(boost::filesystem::remove_all(reservedPath));
}
+ // Reopen the database so it's discoverable
+ dbHolder().openDb(txn, dbName);
+ return Status::OK();
+}
}