/**
* Copyright (C) 2016 MongoDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects
* for all of the code used other than as permitted herein. If you modify
* file(s) with this exception, you may extend this exception to your
* version of the file(s), but you are not obligated to do so. If you do not
* wish to do so, delete this exception statement from your version. If you
* delete this exception statement from all source files in the program,
* then also delete it in the license file.
*/
#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kDefault
#include "mongo/platform/basic.h"
#include "mongo/base/init.h"
#include "mongo/base/static_assert.h"
#include "mongo/config.h"
#include "mongo/db/commands/server_status.h"
#include "mongo/db/server_parameters.h"
#include "mongo/util/log.h"
#include "mongo/util/stacktrace.h"
#include <gperftools/malloc_hook.h>
#include <third_party/murmurhash3/MurmurHash3.h>
// for dlfcn.h and backtrace
#if defined(_POSIX_VERSION) && defined(MONGO_CONFIG_HAVE_EXECINFO_BACKTRACE)
#include <cxxabi.h>
#include <dlfcn.h>
#include <execinfo.h>
//
// Sampling heap profiler
//
// Intercepts allocate and free calls to track approximate number of live allocated bytes
// associated with each allocating stack trace at each point in time.
//
// Hooks into tcmalloc via the MallocHook interface, but has no dependency
// on any allocator internals; could be used with any allocator via similar
// hooks, or via shims.
//
// Adds no space overhead to each allocated object - allocated objects
// and associated stack traces are recorded in separate pre-allocated
// fixed-size hash tables. Size of the separate hash tables is configurable
// but something on the order of tens of MB should suffice for most purposes.
//
// Performance overhead is small because it only samples a fraction of the allocations.
//
// Samples allocate calls every sampleIntervalBytes bytes of allocation. For each sampled allocation:
// * a stack trace is obtained, and entered in a stack hash table if it's a new stack trace
// * the number of active bytes charged to that stack trace is increased
// * the allocated object, stack trace, and number of bytes are recorded in an object hash table
// For each free call, if the freed object is found in the object hash table:
// * the number of active bytes charged to the allocating stack trace is decreased
// * the object is removed from the object hash table
//
// Enable at startup time (only) with
// mongod --setParameter heapProfilingEnabled=true
//
// If enabled, adds a heapProfile section to serverStatus as follows:
//
// heapProfile: {
//     stats: {
//         // internal stats related to heap profiling process (collisions, number of stacks, etc.)
//     },
//     stacks: {
//         stack_n_: {             // one for each stack _n_
//             activeBytes: ...,   // number of active bytes allocated by this stack
//             stack: [            // the stack itself
//                 "frame0",
//                 "frame1",
//                 ...
//             ]
//         }
//     }
// }
//
// Each new stack encountered is also logged to mongod log with a message like
// .... stack_n_: {0: "frame0", 1: "frame1", ...}
//
// Can be used in one of two ways:
//
// Via FTDC - strings are not captured by FTDC, so the information
// recorded in FTDC for each sample is essentially of the form
// {stack_n_: activeBytes} for each stack _n_. The timeseries tool
// will present one graph per stack, identified by the label stack_n_,
// showing active bytes that were allocated by that stack at each
// point in time. The mappings from stack_n_ to the actual stack can
// be found in mongod log.
//
// Via serverStatus - the serverStatus section described above
// contains complete information, including the stack trace. It can
// be obtained and examined manually, and can be further processed by
// tools.
//
// We will need about 1 active ObjInfo for every sampleIntervalBytes live bytes,
// so max active memory we can handle is sampleIntervalBytes * kMaxObjInfos.
// With the current defaults of
// kMaxObjInfos = 1024 * 1024
// sampleIntervalBytes = 256 * 1024
// the following information is computed and logged on startup (see HeapProfiler()):
// maxActiveMemory 262144 MB
// objTableSize 72 MB
// stackTableSize 16.6321MB
// So the defaults allow handling very large memories at a reasonable sampling interval
// and acceptable size overhead for the hash tables.
//
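// As a worked example of the sizing arithmetic above (using only the stated defaults):
//     maxActiveMemory = sampleIntervalBytes * kMaxObjInfos
//                     = (256 * 1024) * (1024 * 1024) bytes = 2^38 bytes = 262144 MB (256 GB)
//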
namespace mongo {
namespace {
//
// Simple hash table maps Key->Value.
// All storage is pre-allocated at creation.
// Access functions take a hash specifying a bucket as the first parameter to avoid re-computing
// hash unnecessarily; caller must ensure that hash is correctly computed from the appropriate Key.
// Key must implement operator== to support find().
// Key and Value must both support assignment to allow copying key and value into table on insert.
//
// Concurrency rules:
// Reads (find(), isEmptyBucket(), forEach()) MAY be called concurrently with each other.
// Writes (insert(), remove()) may NOT be called concurrently with each other.
// Concurrency of reads and writes is as follows:
// find() may NOT be called concurrently with any write.
// isEmptyBucket() MAY be called concurrently with any write.
// forEach()
// MAY be called concurrently with insert() but NOT remove()
// does not provide snapshot semantics wrt set of entries traversed
// caller must ensure safety wrt concurrent modification of Value of existing entry
//
using Hash = uint32_t;
template <class Key, class Value>
class HashTable {
MONGO_DISALLOW_COPYING(HashTable);
private:
struct Entry {
Key key{};
Value value{};
std::atomic<Entry*> next{nullptr}; // NOLINT
std::atomic<bool> valid{false}; // NOLINT
Entry() {}
};
const size_t maxEntries; // we allocate storage for this many entries on creation
std::atomic_size_t numEntries; // number of entries currently in use NOLINT
size_t numBuckets; // number of buckets, computed as maxEntries * loadFactor
// pre-allocate buckets and entries
std::unique_ptr<std::atomic<Entry*>[]> buckets; // NOLINT
std::unique_ptr<Entry[]> entries;
std::atomic_size_t nextEntry; // first entry that's never been used NOLINT
Entry* freeEntry; // linked list of entries returned to us by remove()
public:
HashTable(size_t maxEntries, int loadFactor)
: maxEntries(maxEntries),
numEntries(0),
numBuckets(maxEntries * loadFactor),
buckets(new std::atomic<Entry*>[numBuckets]()), // NOLINT
entries(new Entry[maxEntries]()),
nextEntry(0),
freeEntry(nullptr) {}
// Allocate a new entry in the specified hash bucket.
// Stores a copy of the specified Key and Value.
// Returns a pointer to the newly allocated Value, or nullptr if out of space.
Value* insert(Hash hash, const Key& key, const Value& value) {
hash %= numBuckets;
Entry* entry = nullptr;
if (freeEntry) {
entry = freeEntry;
freeEntry = freeEntry->next;
} else if (nextEntry < maxEntries) {
entry = &entries[nextEntry++];
}
if (entry) {
entry->next = buckets[hash].load();
buckets[hash] = entry;
entry->key = key;
entry->value = value;
entry->valid = true; // signal that the entry is well-formed and may be traversed
numEntries++;
return &entry->value;
} else {
return nullptr;
}
}
// Find the entry containing Key in the specified hash bucket.
// Returns a pointer to the corresponding Value object, or nullptr if not found.
Value* find(Hash hash, const Key& key) {
hash %= numBuckets;
for (Entry* entry = buckets[hash]; entry; entry = entry->next)
if (entry->key == key)
return &entry->value;
return nullptr;
}
// Remove an entry specified by key.
void remove(Hash hash, const Key& key) {
hash %= numBuckets;
for (auto nextp = &buckets[hash]; *nextp; nextp = &((*nextp).load()->next)) {
Entry* entry = *nextp;
if (entry->key == key) {
*nextp = entry->next.load();
entry->valid = false; // first signal entry is invalid as it may get reused
entry->next = freeEntry;
freeEntry = entry;
numEntries--;
break;
}
}
}
// Traverse the array of pre-allocated entries, calling f(key, value) on each valid entry.
// This may be called concurrently with insert() but NOT remove().
// The atomic entry.valid flag ensures that only well-formed entries are traversed.
// nextEntry is atomic to guard against torn reads as it is updated.
// Note however that this is not guaranteed to provide snapshot semantics wrt the set of entries,
// and the caller must ensure safety wrt concurrent modification of the Value of an existing entry.
template <typename F>
void forEach(F f) {
for (size_t i = 0; i < nextEntry; i++) {
Entry& entry = entries[i];
if (entry.valid) // only traverse well-formed entries
f(entry.key, entry.value);
}
}
// Determines whether the specified hash bucket is empty. May be called concurrently with
// insert() and remove(). Concurrent visibility on other threads is guaranteed because
// buckets[hash] is atomic.
bool isEmptyBucket(Hash hash) {
hash %= numBuckets;
return buckets[hash] == nullptr;
}
// Number of entries.
size_t size() {
return numEntries;
}
// Highwater mark of number of entries used, for reporting stats.
size_t maxSizeSeen() {
return nextEntry;
}
// Returns total allocated size of the hash table, for reporting stats.
size_t memorySizeBytes() {
return numBuckets * sizeof(buckets[0]) + maxEntries * sizeof(entries[0]);
}
};
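// Illustrative usage sketch (comments only, not part of the build). The names IntKey and
// intKeyHash below are hypothetical; the point is that the caller computes the bucket hash
// from the key and passes that same hash to every access function for that key:
//
//     struct IntKey {
//         int v = 0;
//         bool operator==(const IntKey& that) {
//             return v == that.v;
//         }
//     };
//     Hash intKeyHash(const IntKey& k);  // e.g. MurmurHash3_x86_32 over k.v
//
//     HashTable<IntKey, size_t> table(1024 /*maxEntries*/, 2 /*loadFactor*/);
//     IntKey key{42};
//     Hash h = intKeyHash(key);
//     size_t* count = table.insert(h, key, 1);  // returns nullptr if the table is full
//     if (size_t* found = table.find(h, key))   // must not race with writes (see rules above)
//         ++*found;
//     table.remove(h, key);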
class HeapProfiler {
private:
// 0: sampling internally disabled
// 1: sample every allocation - byte accurate but slow and big
// >1: sample every sampleIntervalBytes bytes allocated - less accurate but fast and small
std::atomic_size_t sampleIntervalBytes; // NOLINT
stdx::mutex hashtable_mutex; // guards updates to both object and stack hash tables
stdx::mutex stackinfo_mutex; // guards against races updating the StackInfo bson representation
// cumulative bytes allocated - determines when samples are taken
std::atomic_size_t bytesAllocated{0}; // NOLINT
// estimated currently active bytes - sum of activeBytes for all stacks
size_t totalActiveBytes = 0;
//
// Hash table of stacks
//
using FrameInfo = void*; // per-frame information is just the IP
static const int kMaxStackInfos = 20000; // max number of unique call sites we handle
static const int kStackHashTableLoadFactor = 2; // keep loading <50%
static const int kMaxFramesPerStack = 100; // max depth of stack
// stack HashTable Key
struct Stack {
int numFrames = 0;
std::array<FrameInfo, kMaxFramesPerStack> frames;
Stack() {}
bool operator==(const Stack& that) {
return this->numFrames == that.numFrames &&
std::equal(frames.begin(), frames.begin() + numFrames, that.frames.begin());
}
Hash hash() {
Hash hash;
MONGO_STATIC_ASSERT_MSG(sizeof(frames) == sizeof(FrameInfo) * kMaxFramesPerStack,
"frames array is not dense");
MurmurHash3_x86_32(frames.data(), numFrames * sizeof(FrameInfo), 0, &hash);
return hash;
}
};
// Stack HashTable Value.
struct StackInfo {
int stackNum = 0; // used for stack short name
BSONObj stackObj{}; // symbolized representation
size_t activeBytes = 0; // number of live allocated bytes charged to this stack
explicit StackInfo(int stackNum) : stackNum(stackNum) {}
StackInfo() {}
};
// The stack HashTable itself.
HashTable<Stack, StackInfo> stackHashTable{kMaxStackInfos, kStackHashTableLoadFactor};
// frames to skip at top and bottom of backtrace when reporting stacks
int skipStartFrames = 0;
int skipEndFrames = 0;
//
// Hash table of allocated objects.
//
static const int kMaxObjInfos = 1024 * 1024; // maximum tracked allocations
static const int kObjHashTableLoadFactor = 4; // keep hash table loading <25%
// Obj HashTable Key.
struct Obj {
const void* objPtr = nullptr;
explicit Obj(const void* objPtr) : objPtr(objPtr) {}
Obj() {}
bool operator==(const Obj& that) {
return this->objPtr == that.objPtr;
}
Hash hash() {
Hash hash = 0;
MurmurHash3_x86_32(&objPtr, sizeof(objPtr), 0, &hash);
return hash;
}
};
// Obj HashTable Value.
struct ObjInfo {
size_t accountedLen = 0;
StackInfo* stackInfo = nullptr;
ObjInfo(size_t accountedLen, StackInfo* stackInfo)
: accountedLen(accountedLen), stackInfo(stackInfo) {}
ObjInfo() {}
};
// The obj HashTable itself.
HashTable<Obj, ObjInfo> objHashTable{kMaxObjInfos, kObjHashTableLoadFactor};
// If we encounter an error that doesn't allow us to proceed, for
// example out of space for new hash table entries, we internally
// disable profiling and then log an error message.
void disable(const char* msg) {
sampleIntervalBytes = 0;
log() << msg;
}
//
// Record an allocation.
//
void _alloc(const void* objPtr, size_t objLen) {
// still profiling?
if (sampleIntervalBytes == 0)
return;
// Sample every sampleIntervalBytes bytes of allocation.
// We charge each sampled stack with the amount of memory allocated since the last sample.
// This could grossly overcharge any given sample, but on average over a large number of
// samples it will be correct.
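// For example (illustrative numbers only): with sampleIntervalBytes = 256KB, an allocation
// that moves bytesAllocated from 300KB to 600KB crosses the 512KB sampling boundary, so it
// is sampled and charged accountedLen = 256KB; an allocation that crosses no boundary gets
// accountedLen == 0 and is skipped below.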
size_t lastSample = bytesAllocated.fetch_add(objLen);
size_t currentSample = lastSample + objLen;
size_t accountedLen = sampleIntervalBytes *
(currentSample / sampleIntervalBytes - lastSample / sampleIntervalBytes);
if (accountedLen == 0)
return;
// Get backtrace.
Stack tempStack;
tempStack.numFrames = backtrace(tempStack.frames.data(), kMaxFramesPerStack);
// Compute backtrace hash.
Hash stackHash = tempStack.hash();
// Now acquire lock.
stdx::lock_guard<stdx::mutex> lk(hashtable_mutex);
// Look up stack in stackHashTable.
StackInfo* stackInfo = stackHashTable.find(stackHash, tempStack);
// If new stack, store in stackHashTable.
if (!stackInfo) {
StackInfo newStackInfo(stackHashTable.size() /*stackNum*/);
stackInfo = stackHashTable.insert(stackHash, tempStack, newStackInfo);
if (!stackInfo) {
disable("too many stacks; disabling heap profiling");
return;
}
}
// Count the bytes.
totalActiveBytes += accountedLen;
stackInfo->activeBytes += accountedLen;
// Enter obj in objHashTable.
Obj obj(objPtr);
ObjInfo objInfo(accountedLen, stackInfo);
if (!objHashTable.insert(obj.hash(), obj, objInfo)) {
disable("too many live objects; disabling heap profiling");
return;
}
}
//
// Record a freed object.
//
void _free(const void* objPtr) {
// still profiling?
if (sampleIntervalBytes == 0)
return;
// Compute hash, quick return before locking if bucket is empty (common case).
// This is crucial for performance because we need to check the hash table on every _free.
// Visibility of the bucket entry if the _alloc was done on a different thread is
// guaranteed because isEmptyBucket consults an atomic pointer.
Obj obj(objPtr);
Hash objHash = obj.hash();
if (objHashTable.isEmptyBucket(objHash))
return;
// Now acquire lock.
stdx::lock_guard<stdx::mutex> lk(hashtable_mutex);
// Remove the object from the hash bucket if present.
ObjInfo* objInfo = objHashTable.find(objHash, obj);
if (objInfo) {
totalActiveBytes -= objInfo->accountedLen;
objInfo->stackInfo->activeBytes -= objInfo->accountedLen;
objHashTable.remove(objHash, obj);
}
}
//
// Generate bson representation of stack.
//
void generateStackIfNeeded(Stack& stack, StackInfo& stackInfo) {
if (!stackInfo.stackObj.isEmpty())
return;
BSONArrayBuilder builder;
for (int j = skipStartFrames; j < stack.numFrames - skipEndFrames; j++) {
Dl_info dli;
StringData frameString;
char* demangled = nullptr;
if (dladdr(stack.frames[j], &dli)) {
if (dli.dli_sname) {
int status;
demangled = abi::__cxa_demangle(dli.dli_sname, 0, 0, &status);
if (demangled) {
// strip off function parameters as they are very verbose and not useful
char* p = strchr(demangled, '(');
if (p)
frameString = StringData(demangled, p - demangled);
else
frameString = demangled;
} else {
frameString = dli.dli_sname;
}
}
}
if (frameString.empty()) {
// No symbol available; emit the raw frame address instead. Append the string directly
// rather than binding a StringData to a temporary.
std::ostringstream s;
s << stack.frames[j];
builder.append(s.str());
} else {
builder.append(frameString);
}
if (demangled)
::free(demangled); // the global free, not the HeapProfiler::free hook
}
stackInfo.stackObj = builder.obj();
log() << "heapProfile stack" << stackInfo.stackNum << ": " << stackInfo.stackObj;
}
//
// Generate serverStatus section.
//
bool logGeneralStats = true; // first time only
// In order to reduce load on FTDC we only emit the stacks we deem important enough.
// Once a stack is deemed "important" it remains important from that point on;
// "important" is a sticky quality that improves the stability of the set of stacks we emit,
// and we always emit them in stackNum order, greatly improving FTDC compression efficiency.
std::set<StackInfo*, bool (*)(StackInfo*, StackInfo*)> importantStacks{
[](StackInfo* a, StackInfo* b) -> bool { return a->stackNum < b->stackNum; }};
int numImportantSamples = 0; // samples currently included in importantStacks
const int kMaxImportantSamples = 4 * 3600; // reset every 4 hours at default 1 sample / sec
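// For example, a stack that was once among the stacks covering 99% of active bytes keeps
// being emitted on subsequent samples (even if its activeBytes have since shrunk) until
// importantStacks is periodically cleared below.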
void _generateServerStatusSection(BSONObjBuilder& builder) {
// compute and log some informational stats first time through
if (logGeneralStats) {
const size_t maxActiveMemory = sampleIntervalBytes * kMaxObjInfos;
const size_t objTableSize = objHashTable.memorySizeBytes();
const size_t stackTableSize = stackHashTable.memorySizeBytes();
const double MB = 1024 * 1024;
log() << "sampleIntervalBytes " << sampleIntervalBytesParameter << "; "
<< "maxActiveMemory " << maxActiveMemory / MB << " MB; "
<< "objTableSize " << objTableSize / MB << " MB; "
<< "stackTableSize " << stackTableSize / MB << " MB";
// print a stack trace so the somap (shared object map) is logged for post-facto symbolization
log() << "following stack trace is for heap profiler informational purposes";
printStackTrace();
logGeneralStats = false;
}
// Stats subsection.
BSONObjBuilder statsBuilder(builder.subobjStart("stats"));
statsBuilder.appendNumber("totalActiveBytes", totalActiveBytes);
statsBuilder.appendNumber("bytesAllocated", bytesAllocated);
statsBuilder.appendNumber("numStacks", stackHashTable.size());
statsBuilder.appendNumber("currentObjEntries", objHashTable.size());
statsBuilder.appendNumber("maxObjEntriesUsed", objHashTable.maxSizeSeen());
statsBuilder.doneFast();
// Guard against races updating the StackInfo bson representation.
stdx::lock_guard<stdx::mutex> lk(stackinfo_mutex);
// Traverse stackHashTable accumulating potential stacks to emit.
// We do this traversal without locking hashtable_mutex because we need to use the heap.
// forEach guarantees this is safe wrt insert(), and we never call remove().
// We use stackinfo_mutex to ensure safety wrt concurrent updates to the StackInfo objects.
// We can get skew between entries, which is ok.
std::vector<StackInfo*> stackInfos;
stackHashTable.forEach([&](Stack& stack, StackInfo& stackInfo) {
if (stackInfo.activeBytes) {
generateStackIfNeeded(stack, stackInfo);
stackInfos.push_back(&stackInfo);
}
});
// Sort the stacks and find enough stacks to account for at least 99% of the active bytes;
// deem any stack that has ever met this criterion as "important".
auto sortByActiveBytes = [](StackInfo* a, StackInfo* b) -> bool {
return a->activeBytes > b->activeBytes;
};
std::stable_sort(stackInfos.begin(), stackInfos.end(), sortByActiveBytes);
size_t threshold = totalActiveBytes * 0.99;
size_t cumulative = 0;
for (auto it = stackInfos.begin(); it != stackInfos.end(); ++it) {
StackInfo* stackInfo = *it;
importantStacks.insert(stackInfo);
cumulative += stackInfo->activeBytes;
if (cumulative > threshold)
break;
}
// Build the stacks subsection by emitting the "important" stacks.
BSONObjBuilder stacksBuilder(builder.subobjStart("stacks"));
for (auto it = importantStacks.begin(); it != importantStacks.end(); ++it) {
StackInfo* stackInfo = *it;
std::ostringstream shortName;
shortName << "stack" << stackInfo->stackNum;
BSONObjBuilder stackBuilder(stacksBuilder.subobjStart(shortName.str()));
stackBuilder.appendNumber("activeBytes", stackInfo->activeBytes);
}
stacksBuilder.doneFast();
// importantStacks grows monotonically, so over time it can accumulate stacks that are no
// longer significant; to bound this we clear it periodically.
if (++numImportantSamples >= kMaxImportantSamples) {
log() << "clearing importantStacks";
importantStacks.clear();
numImportantSamples = 0;
}
}
//
// Static hooks to give to the allocator.
//
static void alloc(const void* obj, size_t objLen) {
heapProfiler->_alloc(obj, objLen);
}
static void free(const void* obj) {
heapProfiler->_free(obj);
}
public:
static HeapProfiler* heapProfiler;
static bool enabledParameter;
static long long sampleIntervalBytesParameter;
HeapProfiler() {
// Set sample interval from the parameter.
sampleIntervalBytes = sampleIntervalBytesParameter;
// This is our only allocator dependency - ifdef and change as
// appropriate for other allocators, using hooks or shims.
// For tcmalloc we skip two frames that are internal to the allocator
// so that the top frame is the public tc_* function.
skipStartFrames = 2;
skipEndFrames = 0;
MallocHook::AddNewHook(alloc);
MallocHook::AddDeleteHook(free);
}
static void generateServerStatusSection(BSONObjBuilder& builder) {
if (heapProfiler)
heapProfiler->_generateServerStatusSection(builder);
}
};
//
// serverStatus section
//
class HeapProfilerServerStatusSection final : public ServerStatusSection {
public:
HeapProfilerServerStatusSection() : ServerStatusSection("heapProfile") {}
bool includeByDefault() const override {
return HeapProfiler::enabledParameter;
}
BSONObj generateSection(OperationContext* opCtx,
const BSONElement& configElement) const override {
BSONObjBuilder builder;
HeapProfiler::generateServerStatusSection(builder);
return builder.obj();
}
} heapProfilerServerStatusSection;
//
// startup
//
HeapProfiler* HeapProfiler::heapProfiler;
bool HeapProfiler::enabledParameter = false;
long long HeapProfiler::sampleIntervalBytesParameter = 256 * 1024;
ExportedServerParameter<bool, ServerParameterType::kStartupOnly> heapProfilingEnabledParameter(
ServerParameterSet::getGlobal(), "heapProfilingEnabled", &HeapProfiler::enabledParameter);
ExportedServerParameter<long long, ServerParameterType::kStartupOnly>
heapProfilingSampleIntervalBytes(ServerParameterSet::getGlobal(),
"heapProfilingSampleIntervalBytes",
&HeapProfiler::sampleIntervalBytesParameter);
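// Example startup invocation (illustrative values): enable profiling and halve the default
// 256KB sampling interval:
//     mongod --setParameter heapProfilingEnabled=true \
//            --setParameter heapProfilingSampleIntervalBytes=131072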
MONGO_INITIALIZER_GENERAL(StartHeapProfiling, ("EndStartupOptionHandling"), ("default"))
(InitializerContext* context) {
if (HeapProfiler::enabledParameter)
HeapProfiler::heapProfiler = new HeapProfiler();
return Status::OK();
}
} // namespace
} // namespace mongo
#endif // defined(_POSIX_VERSION) && defined(MONGO_CONFIG_HAVE_EXECINFO_BACKTRACE)