// mmap_v1_extent_manager.cpp
/**
* Copyright (C) 2014 MongoDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage
#include
#include "mongo/db/storage/mmap_v1/mmap_v1_extent_manager.h"
#include "mongo/base/counter.h"
#include "mongo/db/audit.h"
#include "mongo/db/client.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/service_context.h"
#include "mongo/db/storage/mmap_v1/data_file.h"
#include "mongo/db/storage/mmap_v1/dur.h"
#include "mongo/db/storage/mmap_v1/extent.h"
#include "mongo/db/storage/mmap_v1/extent_manager.h"
#include "mongo/db/storage/mmap_v1/mmap.h"
#include "mongo/db/storage/mmap_v1/mmap_v1_engine.h"
#include "mongo/db/storage/mmap_v1/mmap_v1_options.h"
#include "mongo/db/storage/mmap_v1/record.h"
#include "mongo/db/storage/record_fetcher.h"
#include "mongo/stdx/memory.h"
#include "mongo/util/fail_point_service.h"
#include "mongo/util/file.h"
#include "mongo/util/log.h"
namespace mongo {
using std::unique_ptr;
using std::endl;
using std::max;
using std::string;
using std::stringstream;
// Turn on this failpoint to force the system to yield for a fetch. Setting to "alwaysOn"
// will cause yields for fetching to occur on every 'kNeedsFetchFailFreq'th call to
// recordNeedsFetch().
// With the value 2, every second call made while the failpoint is active requests a fetch.
static const int kNeedsFetchFailFreq = 2;
// Incremented by recordNeedsFetch() each time it runs with the failpoint active; its value
// modulo kNeedsFetchFailFreq decides whether that call forces a fetch.
static Counter64 needsFetchFailCounter;
MONGO_FP_DECLARE(recordNeedsFetchFail);
// Used to make sure the compiler doesn't get too smart on us when we're
// trying to touch records.
// volatile - avoid compiler optimizations for touching a mmap page
volatile int __record_touch_dummy = 1; // NOLINT
class MmapV1RecordFetcher : public RecordFetcher {
MONGO_DISALLOW_COPYING(MmapV1RecordFetcher);
public:
explicit MmapV1RecordFetcher(const MmapV1RecordHeader* record) : _record(record) {}
virtual void setup(OperationContext* opCtx) {
invariant(!_filesLock.get());
_filesLock.reset(new LockMongoFilesShared(opCtx));
}
virtual void fetch() {
// It's only legal to touch the record while we're holding a lock on the data files.
invariant(_filesLock.get());
const char* recordChar = reinterpret_cast(_record);
// Here's where we actually deference a pointer into the record. This is where
// we expect a page fault to occur, so we should this out of the lock.
__record_touch_dummy += *recordChar;
// We're not going to touch the record anymore, so we can give up our
// lock on mongo files. We do this here because we have to release the
// lock on mongo files prior to reacquiring lock mgr locks.
_filesLock.reset();
}
private:
// The record which needs to be touched in order to page fault. Not owned by us.
const MmapV1RecordHeader* _record;
// This ensures that our MmapV1RecordHeader* does not drop out from under our feet before
// we dereference it.
std::unique_ptr _filesLock;
};
/**
 * Builds the extent manager for database 'dbname' whose data files live under 'path'.
 * 'directoryPerDB' is stored and later decides whether file names are nested in a per-database
 * subdirectory.
 *
 * The global storage engine must report itself as MMAPv1; a pointer to its record access
 * tracker is cached in '_recordAccessTracker'.
 */
MmapV1ExtentManager::MmapV1ExtentManager(StringData dbname, StringData path, bool directoryPerDB)
    : _dbname(dbname.toString()),
      _path(path.toString()),
      _directoryPerDB(directoryPerDB),
      _rid(RESOURCE_METADATA, dbname) {
    StorageEngine* engine = getGlobalServiceContext()->getGlobalStorageEngine();
    invariant(engine->isMmapV1());

    // The invariant above established the concrete engine type, so the downcast is valid.
    MMAPV1Engine* mmapEngine = static_cast<MMAPV1Engine*>(engine);
    _recordAccessTracker = &mmapEngine->getRecordAccessTracker();
}
// Factory entry point: forwards the database name, the data path and the directory-per-db flag
// to stdx::make_unique and returns the resulting owning pointer.
// NOTE(review): the template arguments of std::unique_ptr and stdx::make_unique are missing in
// this copy of the file. Presumably the object built here is a MmapV1ExtentManager; confirm
// against the class header and restore both before building.
std::unique_ptr MmapV1ExtentManager::Factory::create(StringData dbname,
StringData path,
bool directoryPerDB) {
return stdx::make_unique(
std::move(dbname), std::move(path), directoryPerDB);
}
/**
 * Returns the path of data file number 'n' for this database: the leaf "<dbname>.<n>" placed
 * under '_path', inside an extra "<dbname>" directory when '_directoryPerDB' is set.
 */
boost::filesystem::path MmapV1ExtentManager::_fileName(int n) const {
    boost::filesystem::path result(_path);
    if (_directoryPerDB) {
        result /= _dbname;
    }

    // Leaf name is the database name, a dot, and the file number.
    stringstream leaf;
    leaf << _dbname << '.' << n;
    result /= leaf.str();

    return result;
}
// Opens the existing, initialized data files of this database in file-number order and appends
// each one to '_files'. The scan stops at the first file number whose file does not exist, is
// smaller than a DataFileHeader, or starts with a zero version word (a preallocated file).
// If no file was opened, one extent is created so that the database is coherent.
// Returns Status::OK() on success, or the failing status from DataFile::openExisting().
// NOTE(review): the template arguments of reinterpret_cast and unique_ptr appear to be missing
// in this copy (presumably a char-pointer cast and a unique_ptr of DataFile); confirm against
// File::read() and restore them before building.
Status MmapV1ExtentManager::init(OperationContext* opCtx) {
// No data file may have been opened by this manager yet.
invariant(_files.empty());
for (int n = 0; n < DiskLoc::MaxFiles; n++) {
const boost::filesystem::path fullName = _fileName(n);
if (!boost::filesystem::exists(fullName)) {
break;
}
const std::string fullNameString = fullName.string();
{
// If the file is uninitialized we exit the loop because it is just prealloced. We
// do this on a bare File object rather than using the DataFile because closing a
// DataFile triggers dur::closingFileNotification() which is fatal if there are any
// pending writes. Therefore we must only open files that we know we want to keep.
File preview;
preview.open(fullNameString.c_str(), /*readOnly*/ true);
invariant(preview.is_open());
// File can't be initialized if too small.
if (preview.len() < sizeof(DataFileHeader)) {
break;
}
// This is the equivalent of DataFileHeader::uninitialized().
// Only the leading int of the file is read and compared against zero.
int version;
preview.read(0, reinterpret_cast(&version), sizeof(version));
invariant(!preview.bad());
if (version == 0) {
break;
}
}
unique_ptr df(new DataFile(opCtx, n));
Status s = df->openExisting(opCtx, fullNameString.c_str());
if (!s.isOK()) {
// Close the file explicitly before reporting the failure to the caller.
df->close(opCtx);
return s;
}
invariant(!df->getHeader()->uninitialized());
// We only checkUpgrade on files that we are keeping, not preallocs.
df->getHeader()->checkUpgrade(opCtx);
// Ownership moves from the unique_ptr into '_files'.
_files.push_back(df.release());
}
// If this is a new database being created, instantiate the first file and one extent so
// we can have a coherent database.
if (_files.empty()) {
WriteUnitOfWork wuow(opCtx);
_createExtent(opCtx, initialSize(128), false);
wuow.commit();
// Commit the journal and all changes to disk so that even if exceptions occur during
// subsequent initialization, we won't have uncommitted changes during file close.
getDur().commitNow(opCtx);
}
return Status::OK();
}
/**
 * Returns the open data file with index 'fileId' (const overload).
 * An index outside [0, _files.size()) is logged and then trips an invariant.
 */
const DataFile* MmapV1ExtentManager::_getOpenFile(int fileId) const {
    const bool inRange = (fileId >= 0) && (fileId < _files.size());
    if (!inRange) {
        log() << "_getOpenFile() invalid file index requested " << fileId;
        invariant(false);
    }

    return _files[fileId];
}
/**
 * Returns the open data file with index 'fileId' (mutable overload).
 * An index outside [0, _files.size()) is logged and then trips an invariant.
 */
DataFile* MmapV1ExtentManager::_getOpenFile(int fileId) {
    const bool inRange = (fileId >= 0) && (fileId < _files.size());
    if (!inRange) {
        log() << "_getOpenFile() invalid file index requested " << fileId;
        invariant(false);
    }

    return _files[fileId];
}
/**
 * Creates and opens the next data file of this database and appends it to '_files'.
 *
 * The file is opened with a minimum size that is at least the previous file's recorded
 * 'fileLength' and at least 'sizeNeeded' plus the data file header size. When
 * 'preallocateNextFile' is set, the file after it is opened with the same minimum size.
 *
 * The database must be locked for MODE_IX. If opening a file throws, that file is closed and
 * the exception is rethrown. Returns the file that was appended.
 */
DataFile* MmapV1ExtentManager::_addAFile(OperationContext* opCtx,
                                         int sizeNeeded,
                                         bool preallocateNextFile) {
    // Database must be stable and we need to be in some sort of an update operation in order
    // to add a new file.
    invariant(opCtx->lockState()->isDbLockedForMode(_dbname, MODE_IX));

    const int allocFileId = _files.size();

    int minSize = 0;
    if (allocFileId > 0) {
        // Make the next file at least as large as the previous
        minSize = _files[allocFileId - 1]->getHeader()->fileLength;
    }

    if (minSize < sizeNeeded + DataFileHeader::HeaderSize) {
        minSize = sizeNeeded + DataFileHeader::HeaderSize;
    }

    {
        unique_ptr<DataFile> allocFile(new DataFile(opCtx, allocFileId));
        const string allocFileName = _fileName(allocFileId).string();

        Timer t;

        try {
            allocFile->open(opCtx, allocFileName.c_str(), minSize, false);
        } catch (...) {
            // The file must be closed explicitly before the exception propagates.
            allocFile->close(opCtx);
            throw;
        }
        if (t.seconds() > 1) {
            log() << "MmapV1ExtentManager took " << t.seconds()
                  << " seconds to open: " << allocFileName;
        }

        // It's all good; '_files' takes over ownership of the raw pointer.
        _files.push_back(allocFile.release());
    }

    // Preallocate is asynchronous
    if (preallocateNextFile) {
        unique_ptr<DataFile> nextFile(new DataFile(opCtx, allocFileId + 1));
        const string nextFileName = _fileName(allocFileId + 1).string();

        try {
            nextFile->open(opCtx, nextFileName.c_str(), minSize, false);
        } catch (...) {
            nextFile->close(opCtx);
            throw;
        }
    }

    // Returns the last file added
    return _files[allocFileId];
}
/**
 * Returns how many data files are currently held in '_files'.
 */
int MmapV1ExtentManager::numFiles() const {
    const int openFileCount = _files.size();
    return openFileCount;
}
long long MmapV1ExtentManager::fileSize() const {
long long size = 0;
for (int n = 0; boost::filesystem::exists(_fileName(n)); n++) {
size += boost::filesystem::file_size(_fileName(n));
}
return size;
}
/**
 * Translates 'loc' into a pointer to the record header inside the mapped data file, without
 * notifying the record access tracker.
 *
 * 'loc' must pass assertOk(). An offset that falls inside the data file header is reported
 * through DataFile::badOfs().
 */
MmapV1RecordHeader* MmapV1ExtentManager::_recordForV1(const DiskLoc& loc) const {
    loc.assertOk();
    const DataFile* df = _getOpenFile(loc.a());

    int ofs = loc.getOfs();
    if (ofs < DataFileHeader::HeaderSize) {
        df->badOfs(ofs);  // will msgassert - external call to keep out of the normal code path
    }

    // The record header lives 'ofs' bytes past the start of the file's mapping.
    return reinterpret_cast<MmapV1RecordHeader*>(df->p() + ofs);
}
/**
 * Resolves 'loc' to its record header and reports the access to the record access tracker
 * before handing the pointer back.
 */
MmapV1RecordHeader* MmapV1ExtentManager::recordForV1(const DiskLoc& loc) const {
    MmapV1RecordHeader* const header = _recordForV1(loc);
    _recordAccessTracker->markAccessed(header);
    return header;
}
// Decides whether the record at 'loc' should be fetched before it is used.
// Returns an empty pointer when 'loc' is null or when checkAccessedAndMark() returns true for
// the record (presumably meaning it was accessed recently; verify against RecordAccessTracker).
// Otherwise returns a fetcher object constructed from the record.
// NOTE(review): the template arguments of std::unique_ptr and stdx::make_unique are missing in
// this copy of the file. Presumably the fetcher built here is MmapV1RecordFetcher; confirm
// against the class header and restore both before building.
std::unique_ptr MmapV1ExtentManager::recordNeedsFetch(const DiskLoc& loc) const {
if (loc.isNull())
return {};
// Uses the non-marking lookup so the tracker is only consulted by the check further down.
MmapV1RecordHeader* record = _recordForV1(loc);
// For testing: if failpoint is enabled we request a fetch on every
// 'kNeedsFetchFailFreq'th call without going to the RecordAccessTracker.
if (MONGO_FAIL_POINT(recordNeedsFetchFail)) {
needsFetchFailCounter.increment();
if ((needsFetchFailCounter.get() % kNeedsFetchFailFreq) == 0) {
return stdx::make_unique(record);
}
}
if (!_recordAccessTracker->checkAccessedAndMark(record)) {
return stdx::make_unique(record);
}
return {};
}
/**
 * Returns the location of the extent containing the record at 'loc': same file number as
 * 'loc', with the offset taken from the record header's extentOfs().
 */
DiskLoc MmapV1ExtentManager::extentLocForV1(const DiskLoc& loc) const {
    MmapV1RecordHeader* const header = recordForV1(loc);
    const DiskLoc owningExtent(loc.a(), header->extentOfs());
    return owningExtent;
}
/**
 * Returns the extent that contains the record at 'loc'.
 */
Extent* MmapV1ExtentManager::extentForV1(const DiskLoc& loc) const {
    return getExtent(extentLocForV1(loc));
}
/**
 * Translates 'loc' into a pointer to the Extent stored at that offset of the mapped data file
 * and reports the access to the record access tracker.
 *
 * 'loc' must pass assertOk(). When 'doSanityCheck' is true the extent's own assertOk() is run
 * before the pointer is returned.
 */
Extent* MmapV1ExtentManager::getExtent(const DiskLoc& loc, bool doSanityCheck) const {
    loc.assertOk();

    // The extent lives 'loc.getOfs()' bytes past the start of the file's mapping.
    Extent* e = reinterpret_cast<Extent*>(_getOpenFile(loc.a())->p() + loc.getOfs());
    if (doSanityCheck)
        e->assertOk();

    _recordAccessTracker->markAccessed(e);
    return e;
}
/**
 * Raises user assertion 12501 when quota enforcement is requested and 'fileNo' is not below
 * mmapv1GlobalOptions.quotaFiles. Does nothing otherwise.
 */
void _checkQuota(bool enforceQuota, int fileNo) {
    const bool overQuota = enforceQuota && (fileNo >= mmapv1GlobalOptions.quotaFiles);
    if (overQuota) {
        uasserted(12501, "quota exceeded");
    }
}
/**
 * Largest extent size this manager accepts: the maximum data file size minus the data file
 * header size and a further 16 bytes.
 */
int MmapV1ExtentManager::maxSize() const {
    const int fileLimit = DataFile::maxSize();
    return fileLimit - DataFileHeader::HeaderSize - 16;
}
/**
 * Carves a new extent of 'size' bytes out of data file 'f' (whose index is 'fileNo') and
 * initializes its magic, location and length fields through the recovery unit.
 *
 * The quota check is made against 'fileNo - 1'. 'size' must lie within [minSize(), maxSize()],
 * otherwise massert 10358 fires. Returns the location of the new extent.
 */
DiskLoc MmapV1ExtentManager::_createExtentInFile(
    OperationContext* opCtx, int fileNo, DataFile* f, int size, bool enforceQuota) {
    _checkQuota(enforceQuota, fileNo - 1);

    const bool sizeInRange = (size >= minSize()) && (size <= maxSize());
    massert(10358, "bad new extent size", sizeInRange);

    const DiskLoc newLoc = f->allocExtentArea(opCtx, size);
    newLoc.assertOk();

    // Skip the extent sanity check: the header fields are only written below.
    Extent* e = getExtent(newLoc, false);
    verify(e);

    auto* const ru = opCtx->recoveryUnit();
    *ru->writing(&e->magic) = Extent::extentSignature;
    *ru->writing(&e->myLoc) = newLoc;
    *ru->writing(&e->length) = size;

    return newLoc;
}
/**
 * Creates a brand new extent of roughly 'size' bytes.
 *
 * The size is quantized and capped at maxSize(). Open files are tried from the highest index
 * down to 0; the first one with enough unused space receives the extent. If none has room,
 * the quota is checked against the current file count and up to 8 new files are added until
 * one is large enough. Fails with msgassert 14810 if no space could be obtained.
 */
DiskLoc MmapV1ExtentManager::_createExtent(OperationContext* opCtx, int size, bool enforceQuota) {
    size = quantizeExtentSize(size);

    if (size > maxSize()) {
        size = maxSize();
    }

    verify(size < DataFile::maxSize());

    // Newest file first.
    for (int remaining = numFiles(); remaining > 0; --remaining) {
        const int fileNo = remaining - 1;
        DataFile* f = _getOpenFile(fileNo);
        invariant(f);

        if (f->getHeader()->unusedLength >= size) {
            return _createExtentInFile(opCtx, fileNo, f, size, enforceQuota);
        }
    }

    _checkQuota(enforceQuota, numFiles());

    // no space in an existing file
    // allocate files until we either get one big enough or hit maxSize
    for (int attempt = 0; attempt < 8; ++attempt) {
        DataFile* added = _addAFile(opCtx, size, false);

        if (added->getHeader()->unusedLength >= size) {
            return _createExtentInFile(opCtx, numFiles() - 1, added, size, enforceQuota);
        }
    }

    // callers don't check for null return code, so assert
    msgasserted(14810, "couldn't allocate space for a new extent");
}
// Searches the free list for an extent whose length is close to 'approxSize', unlinks it from
// the list and returns its location. Returns a null DiskLoc when no extent with a length in
// the acceptable window [low, high] was found.
// 'capped' selects a tight window around 'approxSize'; otherwise the window is 80%..140%.
DiskLoc MmapV1ExtentManager::_allocFromFreeList(OperationContext* opCtx,
int approxSize,
bool capped) {
// setup extent constraints
int low, high;
if (capped) {
// be strict about the size
low = approxSize;
if (low > 2048)
low -= 256;
high = (int)(approxSize * 1.05) + 256;
} else {
low = (int)(approxSize * 0.8);
high = (int)(approxSize * 1.4);
}
if (high <= 0) {
// overflowed
high = max(approxSize, maxSize());
}
if (high <= minSize()) {
// the minimum extent size is 4097
high = minSize() + 1;
}
// scan free list looking for something suitable
// 'n' counts visited extents; 'best' is the closest match so far and 'bestDiff' its distance
// from 'approxSize'.
int n = 0;
Extent* best = 0;
int bestDiff = 0x7fffffff;
{
Timer t;
DiskLoc L = _getFreeListStart();
while (!L.isNull()) {
Extent* e = getExtent(L);
if (e->length >= low && e->length <= high) {
int diff = abs(e->length - approxSize);
if (diff < bestDiff) {
bestDiff = diff;
best = e;
// Stop early once the candidate is within 10% of the requested size.
if (((double)diff) / approxSize < 0.1) {
// close enough
break;
}
if (t.seconds() >= 2) {
// have spent lots of time in write lock, and we are in [low,high], so close
// enough could come into play if extent freelist is very long
break;
}
} else {
OCCASIONALLY {
if (high < 64 * 1024 && t.seconds() >= 2) {
// be less picky if it is taking a long time
high = 64 * 1024;
}
}
}
}
L = e->xnext;
++n;
}
if (t.seconds() >= 10) {
log() << "warning: slow scan in allocFromFreeList (in write lock)" << endl;
}
}
// Report long scans: at log level 1 below 512 visited extents, at level 0 from 512 on.
if (n > 128) {
LOG(n < 512 ? 1 : 0) << "warning: newExtent " << n << " scanned\n";
}
if (!best)
return DiskLoc();
// remove from the free list
// Patch the neighbours' links first, then move the list head/tail if 'best' was at either end.
if (!best->xprev.isNull())
*opCtx->recoveryUnit()->writing(&getExtent(best->xprev)->xnext) = best->xnext;
if (!best->xnext.isNull())
*opCtx->recoveryUnit()->writing(&getExtent(best->xnext)->xprev) = best->xprev;
if (_getFreeListStart() == best->myLoc)
_setFreeListStart(opCtx, best->xnext);
if (_getFreeListEnd() == best->myLoc)
_setFreeListEnd(opCtx, best->xprev);
return best->myLoc;
}
/**
 * Hands out an extent of roughly 'size' bytes while holding the manager's resource lock in
 * MODE_X. The free list is consulted first; only when it yields nothing is a new extent
 * created, in which case 'enforceQuota' applies. The returned location is never null.
 */
DiskLoc MmapV1ExtentManager::allocateExtent(OperationContext* opCtx,
                                            bool capped,
                                            int size,
                                            bool enforceQuota) {
    Lock::ResourceLock rlk(opCtx->lockState(), _rid, MODE_X);

    DiskLoc eloc = _allocFromFreeList(opCtx, size, capped);
    const bool fromFreeList = !eloc.isNull();
    if (!fromFreeList) {
        eloc = _createExtent(opCtx, size, enforceQuota);
    }

    invariant(!eloc.isNull());
    invariant(eloc.isValid());

    LOG(1) << "MmapV1ExtentManager::allocateExtent"
           << " desiredSize:" << size << " fromFreeList: " << fromFreeList << " eloc: " << eloc;

    return eloc;
}
/**
 * Returns the single extent at 'firstExt' to the free list while holding the manager's
 * resource lock in MODE_X. The extent's list links and record pointers are nulled, then the
 * extent becomes the new head of the free list.
 */
void MmapV1ExtentManager::freeExtent(OperationContext* opCtx, DiskLoc firstExt) {
    Lock::ResourceLock rlk(opCtx->lockState(), _rid, MODE_X);

    Extent* e = getExtent(firstExt);
    auto* const ru = opCtx->recoveryUnit();
    ru->writing(&e->xnext)->Null();
    ru->writing(&e->xprev)->Null();
    ru->writing(&e->firstRecord)->Null();
    ru->writing(&e->lastRecord)->Null();

    // Empty free list: the extent is both head and tail.
    if (_getFreeListStart().isNull()) {
        _setFreeListStart(opCtx, firstExt);
        _setFreeListEnd(opCtx, firstExt);
        return;
    }

    // Otherwise link the extent in front of the current head.
    const DiskLoc a = _getFreeListStart();
    invariant(getExtent(a)->xprev.isNull());
    *ru->writing(&getExtent(a)->xprev) = firstExt;
    *ru->writing(&getExtent(firstExt)->xnext) = a;
    _setFreeListStart(opCtx, firstExt);
}
/**
 * Returns the chain of extents running from 'firstExt' to 'lastExt' to the free list while
 * holding the manager's resource lock in MODE_X. Passing two null locations is a no-op;
 * passing exactly one null location fails verification. The chain is placed in front of the
 * existing free list.
 */
void MmapV1ExtentManager::freeExtents(OperationContext* opCtx, DiskLoc firstExt, DiskLoc lastExt) {
    Lock::ResourceLock rlk(opCtx->lockState(), _rid, MODE_X);

    const bool nothingToFree = firstExt.isNull() && lastExt.isNull();
    if (nothingToFree) {
        return;
    }

    // Sanity-check that the arguments really are the two ends of a chain.
    {
        verify(!firstExt.isNull() && !lastExt.isNull());
        Extent* f = getExtent(firstExt);
        Extent* l = getExtent(lastExt);
        verify(f->xprev.isNull());
        verify(l->xnext.isNull());
        verify(f == l || !f->xnext.isNull());
        verify(f == l || !l->xprev.isNull());
    }

    // Empty free list: the chain becomes the whole list.
    if (_getFreeListStart().isNull()) {
        _setFreeListStart(opCtx, firstExt);
        _setFreeListEnd(opCtx, lastExt);
        return;
    }

    // Otherwise splice the chain's tail onto the current head.
    const DiskLoc a = _getFreeListStart();
    invariant(getExtent(a)->xprev.isNull());
    auto* const ru = opCtx->recoveryUnit();
    *ru->writing(&getExtent(a)->xprev) = lastExt;
    *ru->writing(&getExtent(lastExt)->xnext) = a;
    _setFreeListStart(opCtx, firstExt);
}
/**
 * Returns the head of the free list as stored in the header of file 0, or a null DiskLoc when
 * no file is open.
 */
DiskLoc MmapV1ExtentManager::_getFreeListStart() const {
    if (!_files.empty()) {
        const DataFile* firstFile = _getOpenFile(0);
        return firstFile->header()->freeListStart;
    }

    return DiskLoc();
}
/**
 * Returns the tail of the free list as stored in the header of file 0, or a null DiskLoc when
 * no file is open.
 */
DiskLoc MmapV1ExtentManager::_getFreeListEnd() const {
    if (!_files.empty()) {
        const DataFile* firstFile = _getOpenFile(0);
        return firstFile->header()->freeListEnd;
    }

    return DiskLoc();
}
/**
 * Stores 'loc' as the head of the free list in the header of file 0, going through the
 * recovery unit. At least one file must be open.
 */
void MmapV1ExtentManager::_setFreeListStart(OperationContext* opCtx, DiskLoc loc) {
    invariant(!_files.empty());

    auto* const firstHeader = _files[0]->header();
    *opCtx->recoveryUnit()->writing(&firstHeader->freeListStart) = loc;
}
/**
 * Stores 'loc' as the tail of the free list in the header of file 0, going through the
 * recovery unit. At least one file must be open.
 */
void MmapV1ExtentManager::_setFreeListEnd(OperationContext* opCtx, DiskLoc loc) {
    invariant(!_files.empty());

    auto* const firstHeader = _files[0]->header();
    *opCtx->recoveryUnit()->writing(&firstHeader->freeListEnd) = loc;
}
/**
 * Walks the free list under the manager's resource lock in MODE_S and reports, through the two
 * non-null out-parameters, how many extents it holds and the sum of their lengths in bytes.
 * Both outputs are reset to zero before the walk starts.
 */
void MmapV1ExtentManager::freeListStats(OperationContext* opCtx,
                                        int* numExtents,
                                        int64_t* totalFreeSizeBytes) const {
    Lock::ResourceLock rlk(opCtx->lockState(), _rid, MODE_S);

    invariant(numExtents);
    invariant(totalFreeSizeBytes);

    int& extentCount = *numExtents;
    int64_t& freeBytes = *totalFreeSizeBytes;
    extentCount = 0;
    freeBytes = 0;

    for (DiskLoc cursor = _getFreeListStart(); !cursor.isNull();) {
        Extent* current = getExtent(cursor);
        extentCount++;
        freeBytes += current->length;
        cursor = current->xnext;
    }
}
namespace {
/**
 * ExtentManager::CacheHint that holds an MAdvise object, constructed from the given address,
 * length and advice, for as long as the hint itself exists.
 */
class CacheHintMadvise : public ExtentManager::CacheHint {
public:
    CacheHintMadvise(void* start, unsigned length, MAdvise::Advice advice)
        : _advice(start, length, advice) {}

private:
    MAdvise _advice;
};
}  // namespace
/**
 * Creates a cache hint covering the whole extent at 'extentLoc'.
 *
 * Only the 'Sequential' hint type is accepted. The returned object is allocated with 'new'
 * and handed to the caller as a raw pointer.
 */
ExtentManager::CacheHint* MmapV1ExtentManager::cacheHint(const DiskLoc& extentLoc,
                                                         const ExtentManager::HintType& hint) {
    invariant(hint == Sequential);
    Extent* e = getExtent(extentLoc);

    // The hint covers 'e->length' bytes starting at the extent itself.
    return new CacheHintMadvise(reinterpret_cast<void*>(e), e->length, MAdvise::Sequential);
}
/**
 * Deletes every DataFile stored in the array, in index order.
 */
MmapV1ExtentManager::FilesArray::~FilesArray() {
    int idx = 0;
    while (idx < size()) {
        delete _files[idx];
        ++idx;
    }
}
/**
 * Calls close() on every DataFile stored in the array, in index order.
 */
void MmapV1ExtentManager::FilesArray::close(OperationContext* opCtx) {
    int idx = 0;
    while (idx < size()) {
        _files[idx]->close(opCtx);
        ++idx;
    }
}
// Appends 'val' to the array while holding '_writersMutex'. The array must hold fewer than
// DiskLoc::MaxFiles entries.
// NOTE(review): stdx::lock_guard appears without a template argument in this copy of the file;
// presumably it names the type of '_writersMutex' — confirm against the class declaration.
void MmapV1ExtentManager::FilesArray::push_back(DataFile* val) {
stdx::lock_guard lk(_writersMutex);
const int n = _size.load();
invariant(n < DiskLoc::MaxFiles);
// Note ordering: _size update must come after updating the _files array
// so that the slot is already filled when the new size is stored.
_files[n] = val;
_size.store(n + 1);
}
/**
 * Returns the data file version recorded in the header of file 0, or DataFileVersion(0, 0)
 * when no file is open. 'opCtx' is not used.
 */
DataFileVersion MmapV1ExtentManager::getFileFormat(OperationContext* opCtx) const {
    if (numFiles() != 0) {
        // We explicitly only look at the first file.
        return _getOpenFile(0)->getHeader()->version;
    }

    return DataFileVersion(0, 0);
}
/**
 * Writes 'newVersion' into the header of file 0 through the recovery unit. At least one file
 * must be open.
 */
void MmapV1ExtentManager::setFileFormat(OperationContext* opCtx, DataFileVersion newVersion) {
    invariant(numFiles() > 0);

    DataFile* df = _getOpenFile(0);
    invariant(df);

    auto* const versionSlot = opCtx->recoveryUnit()->writing(&df->getHeader()->version);
    *versionSlot = newVersion;
}
}