 src/mongo/util/SConscript        |   1
 src/mongo/util/heap_profiler.cpp | 680
 2 files changed, 681 insertions, 0 deletions
diff --git a/src/mongo/util/SConscript b/src/mongo/util/SConscript
index 83516dcc9da..7b09db3b5ba 100644
--- a/src/mongo/util/SConscript
+++ b/src/mongo/util/SConscript
@@ -294,6 +294,7 @@ if env['MONGO_ALLOCATOR'] == 'tcmalloc':
         source=[
             'tcmalloc_server_status_section.cpp',
             'tcmalloc_set_parameter.cpp',
+            'heap_profiler.cpp',
         ],
         LIBDEPS=[
             # TODO: This is needless, the definition of
diff --git a/src/mongo/util/heap_profiler.cpp b/src/mongo/util/heap_profiler.cpp
new file mode 100644
index 00000000000..33b0eb4f9de
--- /dev/null
+++ b/src/mongo/util/heap_profiler.cpp
@@ -0,0 +1,680 @@
+/**
+ * Copyright (C) 2016 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects
+ * for all of the code used other than as permitted herein. If you modify
+ * file(s) with this exception, you may extend this exception to your
+ * version of the file(s), but you are not obligated to do so. If you do not
+ * wish to do so, delete this exception statement from your version. If you
+ * delete this exception statement from all source files in the program,
+ * then also delete it in the license file.
+ */
+
+#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kDefault
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/base/init.h"
+#include "mongo/db/commands/server_status.h"
+#include "mongo/db/server_parameters.h"
+#include "mongo/util/log.h"
+
+#include <gperftools/malloc_hook.h>
+#include <third_party/murmurhash3/MurmurHash3.h>
+
+// for dlfcn.h and backtrace
+#if defined(_POSIX_VERSION) && defined(MONGO_CONFIG_HAVE_EXECINFO_BACKTRACE)
+
+#include <dlfcn.h>
+#include <execinfo.h>
+
+//
+// Sampling heap profiler
+//
+// Intercepts allocate and free calls to track approximate number of live allocated bytes
+// associated with each allocating stack trace at each point in time.
+//
+// Hooks into tcmalloc via the MallocHook interface, but has no dependency
+// on any allocator internals; could be used with any allocator via similar
+// hooks, or via shims.
+//
+// Adds no space overhead to each allocated object - allocated objects
+// and associated stack traces are recorded in separate pre-allocated
+// fixed-size hash tables. Size of the separate hash tables is configurable
+// but something on the order of tens of MB should suffice for most purposes.
+//
+// Performance overhead is small because it only samples a fraction of the allocations.
+//
+// Samples allocate calls every so many bytes allocated.
+// For each sampled allocate call:
+//     * a stack trace is obtained, and entered in a stack hash table if it's a new stack trace
+//     * the number of active bytes charged to that stack trace is increased
+//     * the allocated object, stack trace, and number of bytes is recorded in an object hash table
+// For each free call, if the freed object is in the object hash table:
+//     * the number of active bytes charged to the allocating stack trace is decreased
+//     * the object is removed from the object hash table
+//
+// Enable at startup time (only) with
+//     mongod --setParameter heapProfilingEnabled=true
+//
+// If enabled, adds a heapProfile section to serverStatus as follows:
+//
+// heapProfile: {
+//     stats: {
+//         // internal stats related to heap profiling process (collisions, number of stacks, etc.)
+//     }
+//     stacks: {
+//         stack_n_: {            // one for each stack _n_
+//             activeBytes: ...,  // number of active bytes allocated by this stack
+//             stack: [           // the stack itself
+//                 "frame0",
+//                 "frame1",
+//                 ...
+//             ]
+//         }
+//     }
+// }
+//
+// Each new stack encountered is also logged to the mongod log with a message like
+//     .... stack_n_: {0: "frame0", 1: "frame1", ...}
+//
+// Can be used in one of two ways:
+//
+// Via FTDC - strings are not captured by FTDC, so the information
+// recorded in FTDC for each sample is essentially of the form
+// {stack_n_: activeBytes} for each stack _n_. The timeseries tool
+// will present one graph per stack, identified by the label stack_n_,
+// showing active bytes that were allocated by that stack at each
+// point in time. The mappings from stack_n_ to the actual stack can
+// be found in the mongod log.
+//
+// Via serverStatus - the serverStatus section described above
+// contains complete information, including the stack trace. It can
+// be obtained and examined manually, and can be further processed by
+// tools.
+//
+// We will need about 1 active ObjInfo for every sampleIntervalBytes live bytes,
+// so the maximum active memory we can handle is sampleIntervalBytes * kMaxObjInfos.
+// With the current defaults of
+//     kMaxObjInfos = 1024 * 1024
+//     sampleIntervalBytes = 256 * 1024
+// the following information is computed and logged on startup (see HeapProfiler()):
+//     maxActiveMemory 262144 MB
+//     objTableSize 72 MB
+//     stackTableSize 16.6321 MB
+// So the defaults allow handling very large memories at a reasonable sampling interval
+// and acceptable size overhead for the hash tables.
+//
+
+namespace mongo {
+namespace {
+
+//
+// Simple hash table maps Key->Value.
+// All storage is pre-allocated at creation.
+// Access functions take a hash specifying a bucket as the first parameter to avoid re-computing
+// the hash unnecessarily; the caller must ensure that the hash is correctly computed from the
+// appropriate Key.
+// Key must implement operator== to support find().
+// Key and Value must both support assignment to allow copying key and value into table on insert.
+//
+// Concurrency rules:
+//     Reads (find(), isEmptyBucket(), forEach()) MAY be called concurrently with each other.
+//     Writes (insert(), remove()) may NOT be called concurrently with each other.
+//     Concurrency of reads and writes is as follows:
+//         find() may NOT be called concurrently with any write.
+//         isEmptyBucket() MAY be called concurrently with any write.
+//         forEach()
+//             MAY be called concurrently with insert() but NOT remove()
+//             does not provide snapshot semantics wrt the set of entries traversed
+//             caller must ensure safety wrt concurrent modification of the Value of an existing entry
+//
+
+using Hash = uint32_t;
+
+template <class Key, class Value>
+class HashTable {
+    MONGO_DISALLOW_COPYING(HashTable);
+
+private:
+    struct Entry {
+        Key key{};
+        Value value{};
+        std::atomic<Entry*> next{nullptr};
+        std::atomic<bool> valid{false};
+        Entry() {}
+    };
+
+    const size_t maxEntries;        // we allocate storage for this many entries on creation
+    std::atomic_size_t numEntries;  // number of entries currently in use
+    size_t numBuckets;              // number of buckets, computed as maxEntries * loadFactor
+
+    // pre-allocate buckets and entries
+    std::unique_ptr<std::atomic<Entry*>[]> buckets;
+    std::unique_ptr<Entry[]> entries;
+
+    std::atomic_size_t nextEntry;  // first entry that's never been used
+    Entry* freeEntry;              // linked list of entries returned to us by remove()
+
+public:
+    HashTable(size_t maxEntries, int loadFactor)
+        : maxEntries(maxEntries),
+          numEntries(0),
+          numBuckets(maxEntries * loadFactor),
+          buckets(new std::atomic<Entry*>[numBuckets]()),
+          entries(new Entry[maxEntries]()),
+          nextEntry(0),
+          freeEntry(nullptr) {}
+
+    // Allocate a new entry in the specified hash bucket.
+    // Stores a copy of the specified Key and Value.
+    // Returns a pointer to the newly allocated Value, or nullptr if out of space.
+    Value* insert(Hash hash, const Key& key, const Value& value) {
+        hash %= numBuckets;
+        Entry* entry = nullptr;
+        if (freeEntry) {
+            entry = freeEntry;
+            freeEntry = freeEntry->next;
+        } else if (nextEntry < maxEntries) {
+            entry = &entries[nextEntry++];
+        }
+        if (entry) {
+            entry->next = buckets[hash].load();
+            buckets[hash] = entry;
+            entry->key = key;
+            entry->value = value;
+            entry->valid = true;  // signal that the entry is well-formed and may be traversed
+            numEntries++;
+            return &entry->value;
+        } else {
+            return nullptr;
+        }
+    }
+
+    // Find the entry containing Key in the specified hash bucket.
+    // Returns a pointer to the corresponding Value object, or nullptr if not found.
+    Value* find(Hash hash, const Key& key) {
+        hash %= numBuckets;
+        for (Entry* entry = buckets[hash]; entry; entry = entry->next)
+            if (entry->key == key)
+                return &entry->value;
+        return nullptr;
+    }
+
+    // Remove an entry specified by key.
+    void remove(Hash hash, const Key& key) {
+        hash %= numBuckets;
+        for (auto nextp = &buckets[hash]; *nextp; nextp = &((*nextp).load()->next)) {
+            Entry* entry = *nextp;
+            if (entry->key == key) {
+                *nextp = entry->next.load();
+                entry->valid = false;  // first signal entry is invalid as it may get reused
+                entry->next = freeEntry;
+                freeEntry = entry;
+                numEntries--;
+                break;
+            }
+        }
+    }
+
+    // Traverse the array of pre-allocated entries, calling f(key, value) on each valid entry.
+    // This may be called concurrently with insert() but not remove():
+    //     atomic entry.valid ensures that it will see only well-formed entries
+    //     nextEntry is atomic to guard against torn reads as nextEntry is updated
+    // Note however it is not guaranteed to provide snapshot semantics wrt the set of entries,
+    // and the caller must ensure safety wrt concurrent updates to the Value of an entry.
+    template <typename F>
+    void forEach(F f) {
+        for (size_t i = 0; i < nextEntry; i++) {
+            Entry& entry = entries[i];
+            if (entry.valid)  // only traverse well-formed entries
+                f(entry.key, entry.value);
+        }
+    }
+
+    // Determines whether the specified hash bucket is empty. May be called concurrently with
+    // insert() and remove(). Concurrent visibility on other threads is guaranteed because
+    // buckets[hash] is atomic.
+    bool isEmptyBucket(Hash hash) {
+        hash %= numBuckets;
+        return buckets[hash] == nullptr;
+    }
+
+    // Number of entries.
+    size_t size() {
+        return numEntries;
+    }
+
+    // High-water mark of the number of entries used, for reporting stats.
+    size_t maxSizeSeen() {
+        return nextEntry;
+    }
+
+    // Returns total allocated size of the hash table, for reporting stats.
+    size_t memorySizeBytes() {
+        return numBuckets * sizeof(buckets[0]) + maxEntries * sizeof(entries[0]);
+    }
+};
+
+
+class HeapProfiler {
+private:
+    // 0: sampling internally disabled
+    // 1: sample every allocation - byte accurate but slow and big
+    // >1: sample every sampleIntervalBytes bytes allocated - less accurate but fast and small
+    std::atomic_size_t sampleIntervalBytes;
+
+    std::mutex hashtable_mutex;  // guards updates to both object and stack hash tables
+    std::mutex stackinfo_mutex;  // guards against races updating the StackInfo bson representation
+
+    // cumulative bytes allocated - determines when samples are taken
+    std::atomic_size_t bytesAllocated{0};
+
+    // estimated currently active bytes - sum of activeBytes for all stacks
+    size_t totalActiveBytes = 0;
+
+    //
+    // Hash table of stacks
+    //
+
+    using FrameInfo = void*;  // per-frame information is just the IP
+
+    static const int kMaxStackInfos = 20000;         // max number of unique call sites we handle
+    static const int kStackHashTableLoadFactor = 2;  // keep loading <50%
+    static const int kMaxFramesPerStack = 100;       // max depth of stack
+
+    // Stack HashTable Key.
+    struct Stack {
+        int numFrames = 0;
+        std::array<FrameInfo, kMaxFramesPerStack> frames;
+        Stack() {}
+
+        bool operator==(const Stack& that) {
+            return this->numFrames == that.numFrames &&
+                std::equal(frames.begin(), frames.begin() + numFrames, that.frames.begin());
+        }
+
+        Hash hash() {
+            Hash hash;
+            static_assert(sizeof(frames) == sizeof(FrameInfo) * kMaxFramesPerStack,
+                          "frames array is not dense");
+            MurmurHash3_x86_32(frames.data(), numFrames * sizeof(FrameInfo), 0, &hash);
+            return hash;
+        }
+    };
+
+    // Stack HashTable Value.
+    struct StackInfo {
+        int stackNum = 0;        // used for stack short name
+        BSONObj stackObj{};      // symbolized representation
+        size_t activeBytes = 0;  // number of live allocated bytes charged to this stack
+        explicit StackInfo(int stackNum) : stackNum(stackNum) {}
+        StackInfo() {}
+    };
+
+    // The stack HashTable itself.
+    HashTable<Stack, StackInfo> stackHashTable{kMaxStackInfos, kStackHashTableLoadFactor};
+
+    // frames to skip at top and bottom of backtrace when reporting stacks
+    int skipStartFrames = 0;
+    int skipEndFrames = 0;
+
+
+    //
+    // Hash table of allocated objects.
+    //
+
+    static const int kMaxObjInfos = 1024 * 1024;   // maximum tracked allocations
+    static const int kObjHashTableLoadFactor = 4;  // keep hash table loading <25%
+
+    // Obj HashTable Key.
+    struct Obj {
+        const void* objPtr = nullptr;
+        explicit Obj(const void* objPtr) : objPtr(objPtr) {}
+        Obj() {}
+
+        bool operator==(const Obj& that) {
+            return this->objPtr == that.objPtr;
+        }
+
+        Hash hash() {
+            Hash hash = 0;
+            MurmurHash3_x86_32(&objPtr, sizeof(objPtr), 0, &hash);
+            return hash;
+        }
+    };
+
+    // Obj HashTable Value.
+    struct ObjInfo {
+        size_t accountedLen = 0;
+        StackInfo* stackInfo = nullptr;
+        ObjInfo(size_t accountedLen, StackInfo* stackInfo)
+            : accountedLen(accountedLen), stackInfo(stackInfo) {}
+        ObjInfo() {}
+    };
+
+    // The obj HashTable itself.
+    HashTable<Obj, ObjInfo> objHashTable{kMaxObjInfos, kObjHashTableLoadFactor};
+
+
+    // If we encounter an error that doesn't allow us to proceed, for
+    // example out of space for new hash table entries, we internally
+    // disable profiling and then log an error message.
+    void disable(const char* msg) {
+        sampleIntervalBytes = 0;
+        log() << msg;
+    }
+
+    //
+    // Record an allocation.
+    //
+    void _alloc(const void* objPtr, size_t objLen) {
+        // still profiling?
+        if (sampleIntervalBytes == 0)
+            return;
+
+        // Sample every sampleIntervalBytes bytes of allocation.
+        // We charge each sampled stack with the amount of memory allocated since the last sample;
+        // this could grossly overcharge any given stack sample, but on average over a large
+        // number of samples it will be correct.
+        size_t lastSample = bytesAllocated.fetch_add(objLen);
+        size_t currentSample = lastSample + objLen;
+        size_t accountedLen = sampleIntervalBytes *
+            (currentSample / sampleIntervalBytes - lastSample / sampleIntervalBytes);
+        if (accountedLen == 0)
+            return;
+
+        // Get backtrace.
+        Stack tempStack;
+        tempStack.numFrames = backtrace(tempStack.frames.data(), kMaxFramesPerStack);
+
+        // Compute backtrace hash.
+        Hash stackHash = tempStack.hash();
+
+        // Now acquire lock.
+        std::lock_guard<std::mutex> lk(hashtable_mutex);
+
+        // Look up stack in stackHashTable.
+        StackInfo* stackInfo = stackHashTable.find(stackHash, tempStack);
+
+        // If new stack, store in stackHashTable.
+        if (!stackInfo) {
+            StackInfo newStackInfo(stackHashTable.size() /*stackNum*/);
+            stackInfo = stackHashTable.insert(stackHash, tempStack, newStackInfo);
+            if (!stackInfo) {
+                disable("too many stacks; disabling heap profiling");
+                return;
+            }
+        }
+
+        // Count the bytes.
+        totalActiveBytes += accountedLen;
+        stackInfo->activeBytes += accountedLen;
+
+        // Enter obj in objHashTable.
+        Obj obj(objPtr);
+        ObjInfo objInfo(accountedLen, stackInfo);
+        if (!objHashTable.insert(obj.hash(), obj, objInfo)) {
+            disable("too many live objects; disabling heap profiling");
+            return;
+        }
+    }
+
+    //
+    // Record a freed object.
+    //
+    void _free(const void* objPtr) {
+        // still profiling?
+        if (sampleIntervalBytes == 0)
+            return;
+
+        // Compute the hash, and quick-return before locking if the bucket is empty (the common
+        // case). This is crucial for performance because we need to check the hash table on
+        // every _free. Visibility of the bucket entry if the _alloc was done on a different
+        // thread is guaranteed because isEmptyBucket consults an atomic pointer.
+        Obj obj(objPtr);
+        Hash objHash = obj.hash();
+        if (objHashTable.isEmptyBucket(objHash))
+            return;
+
+        // Now acquire lock.
+        std::lock_guard<std::mutex> lk(hashtable_mutex);
+
+        // Remove the object from the hash bucket if present.
+        ObjInfo* objInfo = objHashTable.find(objHash, obj);
+        if (objInfo) {
+            totalActiveBytes -= objInfo->accountedLen;
+            objInfo->stackInfo->activeBytes -= objInfo->accountedLen;
+            objHashTable.remove(objHash, obj);
+        }
+    }
+
+    //
+    // Generate bson representation of stack.
+    //
+    void generateStackIfNeeded(Stack& stack, StackInfo& stackInfo) {
+        if (!stackInfo.stackObj.isEmpty())
+            return;
+        BSONArrayBuilder builder;
+        for (int j = skipStartFrames; j < stack.numFrames - skipEndFrames; j++) {
+            Dl_info dli;
+            StringData frameString;
+            char* demangled = nullptr;
+            if (dladdr(stack.frames[j], &dli)) {
+                if (dli.dli_sname) {
+                    int status;
+                    demangled = abi::__cxa_demangle(dli.dli_sname, 0, 0, &status);
+                    if (demangled) {
+                        // strip off function parameters as they are very verbose and not useful
+                        char* p = strchr(demangled, '(');
+                        if (p)
+                            frameString = StringData(demangled, p - demangled);
+                        else
+                            frameString = demangled;
+                    } else {
+                        frameString = dli.dli_sname;
+                    }
+                }
+            }
+            if (frameString.empty()) {
+                // No symbol available: emit the raw frame address, appending the string directly
+                // so frameString never refers to a destroyed temporary.
+                std::ostringstream s;
+                s << stack.frames[j];
+                builder.append(s.str());
+            } else {
+                builder.append(frameString);
+            }
+            if (demangled)
+                free(demangled);
+        }
+        stackInfo.stackObj = builder.obj();
+        log() << "heapProfile stack" << stackInfo.stackNum << ": " << stackInfo.stackObj;
+    }
+
+    //
+    // Generate serverStatus section.
+    //
+
+    bool logGeneralStats = true;  // first time only
+
+    // In order to reduce load on ftdc we track the stacks we deem important enough to emit;
+    // once a stack is deemed "important" it remains important from that point on.
+    // "Important" is a sticky quality to improve the stability of the set of stacks we emit,
+    // and we always emit them in stackNum order, greatly improving ftdc compression efficiency.
+    std::set<StackInfo*, bool (*)(StackInfo*, StackInfo*)> importantStacks{
+        [](StackInfo* a, StackInfo* b) -> bool { return a->stackNum < b->stackNum; }};
+
+    int numImportantSamples = 0;                // samples currently included in importantStacks
+    const int kMaxImportantSamples = 4 * 3600;  // reset every 4 hours at default 1 sample / sec
+
+    void _generateServerStatusSection(BSONObjBuilder& builder) {
+        // compute and log some informational stats first time through
+        if (logGeneralStats) {
+            const size_t maxActiveMemory = sampleIntervalBytes * kMaxObjInfos;
+            const size_t objTableSize = objHashTable.memorySizeBytes();
+            const size_t stackTableSize = stackHashTable.memorySizeBytes();
+            const double MB = 1024 * 1024;
+            log() << "sampleIntervalBytes " << sampleIntervalBytesParameter << "; "
+                  << "maxActiveMemory " << maxActiveMemory / MB << " MB; "
+                  << "objTableSize " << objTableSize / MB << " MB; "
+                  << "stackTableSize " << stackTableSize / MB << " MB";
+            logGeneralStats = false;
+        }
+
+        // Stats subsection.
+        BSONObjBuilder statsBuilder(builder.subobjStart("stats"));
+        statsBuilder.appendNumber("totalActiveBytes", totalActiveBytes);
+        statsBuilder.appendNumber("bytesAllocated", bytesAllocated);
+        statsBuilder.appendNumber("numStacks", stackHashTable.size());
+        statsBuilder.appendNumber("currentObjEntries", objHashTable.size());
+        statsBuilder.appendNumber("maxObjEntriesUsed", objHashTable.maxSizeSeen());
+        statsBuilder.doneFast();
+
+        // Guard against races updating the StackInfo bson representation.
+        std::lock_guard<std::mutex> lk(stackinfo_mutex);
+
+        // Traverse stackHashTable accumulating potential stacks to emit.
+        // We do this traversal without locking hashtable_mutex because we need to use the heap.
+        // forEach guarantees this is safe wrt insert(), and we never call remove().
+        // We use stackinfo_mutex to ensure safety wrt concurrent updates to the StackInfo objects.
+        // We can get skew between entries, which is ok.
+        std::vector<StackInfo*> stackInfos;
+        stackHashTable.forEach([&](Stack& stack, StackInfo& stackInfo) {
+            if (stackInfo.activeBytes) {
+                generateStackIfNeeded(stack, stackInfo);
+                stackInfos.push_back(&stackInfo);
+            }
+        });
+
+        // Sort the stacks and find enough stacks to account for at least 99% of the active bytes;
+        // deem any stack that has ever met this criterion as "important".
+        auto sortByActiveBytes =
+            [](StackInfo* a, StackInfo* b) -> bool { return a->activeBytes > b->activeBytes; };
+        std::stable_sort(stackInfos.begin(), stackInfos.end(), sortByActiveBytes);
+        size_t threshold = totalActiveBytes * 0.99;
+        size_t cumulative = 0;
+        for (auto it = stackInfos.begin(); it != stackInfos.end(); ++it) {
+            StackInfo* stackInfo = *it;
+            importantStacks.insert(stackInfo);
+            cumulative += stackInfo->activeBytes;
+            if (cumulative > threshold)
+                break;
+        }
+
+        // Build the stacks subsection by emitting the "important" stacks.
+        BSONObjBuilder stacksBuilder(builder.subobjStart("stacks"));
+        for (auto it = importantStacks.begin(); it != importantStacks.end(); ++it) {
+            StackInfo* stackInfo = *it;
+            std::ostringstream shortName;
+            shortName << "stack" << stackInfo->stackNum;
+            BSONObjBuilder stackBuilder(stacksBuilder.subobjStart(shortName.str()));
+            stackBuilder.appendNumber("activeBytes", stackInfo->activeBytes);
+            stackBuilder.append("stack", stackInfo->stackObj);
+        }
+        stacksBuilder.doneFast();
+
+        // importantStacks grows monotonically, so it can accumulate unneeded stacks;
+        // we therefore clear it periodically.
+        if (++numImportantSamples >= kMaxImportantSamples) {
+            log() << "clearing importantStacks";
+            importantStacks.clear();
+            numImportantSamples = 0;
+        }
+    }
+
+    //
+    // Static hooks to give to the allocator.
+    //
+
+    static void alloc(const void* obj, size_t objLen) {
+        heapProfiler->_alloc(obj, objLen);
+    }
+
+    static void free(const void* obj) {
+        heapProfiler->_free(obj);
+    }
+
+public:
+    static HeapProfiler* heapProfiler;
+    static bool enabledParameter;
+    static long long sampleIntervalBytesParameter;
+
+    HeapProfiler() {
+        // Set sample interval from the parameter.
+        sampleIntervalBytes = sampleIntervalBytesParameter;
+
+        // This is our only allocator dependency - ifdef and change as
+        // appropriate for other allocators, using hooks or shims.
+        // For tcmalloc we skip two frames that are internal to the allocator
+        // so that the top frame is the public tc_* function.
+        skipStartFrames = 2;
+        skipEndFrames = 0;
+        MallocHook::AddNewHook(alloc);
+        MallocHook::AddDeleteHook(free);
+    }
+
+    static void generateServerStatusSection(BSONObjBuilder& builder) {
+        if (heapProfiler)
+            heapProfiler->_generateServerStatusSection(builder);
+    }
+};
+
+//
+// serverStatus section
+//
+
+class HeapProfilerServerStatusSection final : public ServerStatusSection {
+public:
+    HeapProfilerServerStatusSection() : ServerStatusSection("heapProfile") {}
+
+    bool includeByDefault() const override {
+        return HeapProfiler::enabledParameter;
+    }
+
+    BSONObj generateSection(OperationContext* txn,
+                            const BSONElement& configElement) const override {
+        BSONObjBuilder builder;
+        HeapProfiler::generateServerStatusSection(builder);
+        return builder.obj();
+    }
+} heapProfilerServerStatusSection;
+
+//
+// startup
+//
+
+HeapProfiler* HeapProfiler::heapProfiler;
+bool HeapProfiler::enabledParameter = false;
+long long HeapProfiler::sampleIntervalBytesParameter = 256 * 1024;
+
+ExportedServerParameter<bool, ServerParameterType::kStartupOnly> heapProfilingEnabledParameter(
+    ServerParameterSet::getGlobal(), "heapProfilingEnabled", &HeapProfiler::enabledParameter);
+
+ExportedServerParameter<long long, ServerParameterType::kStartupOnly>
+    heapProfilingSampleIntervalBytes(ServerParameterSet::getGlobal(),
+                                     "heapProfilingSampleIntervalBytes",
+                                     &HeapProfiler::sampleIntervalBytesParameter);
+
+MONGO_INITIALIZER_GENERAL(StartHeapProfiling,
+                          ("EndStartupOptionHandling"),
+                          ("default"))(InitializerContext* context) {
+    if (HeapProfiler::enabledParameter)
+        HeapProfiler::heapProfiler = new HeapProfiler();
+    return Status::OK();
+}
+
+}  // namespace
+}  // namespace mongo
+
+#endif  // MONGO_HAVE_HEAP_PROFILER
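
For reference, the sampling arithmetic in _alloc() charges each sampled allocation with one full sampleIntervalBytes per interval boundary the allocation crosses, so any individual sample may be grossly over- or under-charged while the running totals stay accurate. The following standalone sketch (not part of the patch; all names here are illustrative) can be compiled on its own and demonstrates that property:

#include <atomic>
#include <cstddef>
#include <cstdio>
#include <cstdlib>

namespace {
std::atomic<size_t> bytesAllocated{0};
constexpr size_t kSampleIntervalBytes = 256 * 1024;

// Number of bytes to charge for this allocation (0 if not sampled): one full interval
// for every kSampleIntervalBytes boundary crossed by the cumulative byte counter.
size_t accountedLenFor(size_t objLen) {
    size_t lastSample = bytesAllocated.fetch_add(objLen);
    size_t currentSample = lastSample + objLen;
    return kSampleIntervalBytes *
        (currentSample / kSampleIntervalBytes - lastSample / kSampleIntervalBytes);
}
}  // namespace

int main() {
    size_t actual = 0;
    size_t accounted = 0;
    for (int i = 0; i < 1000000; i++) {
        size_t len = 16 + static_cast<size_t>(std::rand() % 4096);  // pretend allocation sizes
        actual += len;
        accounted += accountedLenFor(len);
    }
    // The two totals agree to within one sample interval, even though any single sampled
    // allocation may have been charged far more than its own size.
    std::printf("actual=%zu accounted=%zu\n", actual, accounted);
    return 0;
}

Built with any C++11 compiler, the two printed totals differ by less than kSampleIntervalBytes, which is why a 256 KB interval keeps per-stack activeBytes usable despite sampling.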
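
The cheap empty-bucket test that lets _free() skip the mutex for untracked pointers can also be illustrated in isolation. The sketch below is not part of the patch (BucketIndex and publish() are invented names); it models only the atomic bucket-head array of the patch's HashTable: writers publish under a lock, readers test emptiness with a single atomic load.

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <memory>

// Hypothetical stand-in for the bucket-head array of the profiler's HashTable.
class BucketIndex {
public:
    explicit BucketIndex(size_t numBuckets)
        : _numBuckets(numBuckets), _heads(new std::atomic<const void*>[numBuckets]()) {}

    // Writer side: record an entry in a bucket (done under the table mutex in the patch).
    // The atomic store is what makes "this bucket is non-empty" visible to lock-free readers.
    void publish(uint32_t hash, const void* entry) {
        _heads[hash % _numBuckets].store(entry, std::memory_order_release);
    }

    // Reader side: the cheap check performed on every free(); no lock is taken, so pointers
    // that were never sampled (the common case) never touch the mutex.
    bool isEmptyBucket(uint32_t hash) const {
        return _heads[hash % _numBuckets].load(std::memory_order_acquire) == nullptr;
    }

private:
    size_t _numBuckets;
    std::unique_ptr<std::atomic<const void*>[]> _heads;
};

int main() {
    BucketIndex index(1024);
    int someObject = 0;
    uint32_t hash = 42;  // stand-in for MurmurHash3 of the object pointer
    std::printf("before publish: empty=%d\n", static_cast<int>(index.isEmptyBucket(hash)));
    index.publish(hash, &someObject);
    std::printf("after publish:  empty=%d\n", static_cast<int>(index.isEmptyBucket(hash)));
    return 0;
}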