/** * Copyright (C) 2018-present MongoDB, Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the Server Side Public License, version 1, * as published by MongoDB, Inc. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * Server Side Public License for more details. * * You should have received a copy of the Server Side Public License * along with this program. If not, see * . * * As a special exception, the copyright holders give permission to link the * code of portions of this program with the OpenSSL library under certain * conditions as described in each individual source file and distribute * linked combinations including the program with the OpenSSL library. You * must comply with the Server Side Public License in all respects for * all of the code used other than as permitted herein. If you modify file(s) * with this exception, you may extend this exception to your version of the * file(s), but you are not obligated to do so. If you do not wish to do so, * delete this exception statement from your version. If you delete this * exception statement from all source files in the program, then also delete * it in the license file. */ #include "mongo/platform/basic.h" #include #include "mongo/util/assert_util.h" #include "mongo/util/processinfo.h" #include "mongo/util/time_support.h" namespace mongo { namespace { // This is a trivial test case to sanity check that "benchmark" runs. // For more information on how benchmarks should be written, please refer to Google Benchmark's // excellent README: https://github.com/google/benchmark/blob/v1.3.0/README.md void BM_Empty(benchmark::State& state) { for (auto keepRunning : state) { // The code inside this for-loop is what's being timed. benchmark::DoNotOptimize(state.iterations()); } } // Register two benchmarks, one runs the "BM_Empty" function in a single thread, the other runs a // copy per CPU core. BENCHMARK(BM_Empty); BENCHMARK(BM_Empty)->ThreadPerCpu(); void BM_CpuLoad(benchmark::State& state) { for (auto keepRunning : state) { uint64_t limit = 10000; uint64_t lresult = 0; uint64_t x = 100; for (uint64_t i = 0; i < limit; i++) { benchmark::DoNotOptimize(x *= 13); } benchmark::DoNotOptimize(lresult = x); } } // This Benchmark is adapted from the `cpuload` command: // https://github.com/mongodb/mongo/blob/r3.7.2/src/mongo/db/commands/cpuload.cpp BENCHMARK(BM_CpuLoad)->Threads(1)->ThreadPerCpu(); void BM_Sleep(benchmark::State& state) { for (auto keepRunning : state) { sleepmillis(100); } } BENCHMARK(BM_Sleep)->Threads(1)->ThreadPerCpu(); // Generate a loop with macros. #define ONE ptrToNextLinkedListNode = reinterpret_cast(*ptrToNextLinkedListNode); #define FIVE ONE ONE ONE ONE ONE #define TEN FIVE FIVE #define FIFTY TEN TEN TEN TEN TEN #define HUNDRED FIFTY FIFTY // Stride is the number of elements to skip when traversing the array. // It should ideally be >= the cache line to avoid side-effects from pre-fetching. const int kStrideBytes = 64; class CacheLatencyTest : public benchmark::Fixture { // Fixture for CPU Cache and RAM latency test. Adapted from lmbench's lat_mem_rd test. public: // Array of pointers used as a linked list. std::unique_ptr data; void SetUp(benchmark::State& state) override { if (state.thread_index == 0) { fassert(data.get() == nullptr, "'data' is not null"); /* * Create a circular list of pointers using a simple striding * algorithm. */ const int arrLength = state.range(0); int counter = 0; data = std::make_unique(arrLength); char** arr = data.get(); /* * This access pattern corresponds to many array/matrix algorithms. * It should be easily and correctly predicted by any decent hardware * prefetch algorithm. */ for (counter = 0; counter < arrLength - kStrideBytes; counter += kStrideBytes) { arr[counter] = reinterpret_cast(&arr[counter + kStrideBytes]); } arr[counter] = reinterpret_cast(&arr[0]); } } void TearDown(benchmark::State& state) override { if (state.thread_index == 0) { fassert(data.get() != nullptr, "'data' is null"); data.reset(); } } }; BENCHMARK_DEFINE_F(CacheLatencyTest, BM_CacheLatency)(benchmark::State& state) { size_t arrLength = state.range(0); size_t counter = arrLength / (kStrideBytes * 100) + 1; for (auto keepRunning : state) { char** dummyResult = nullptr; // Dummy result to prevent the loop from being optimized out. char** ptrToNextLinkedListNode = reinterpret_cast(data.get()[0]); for (size_t i = 0; i < counter; ++i) { HUNDRED; } benchmark::DoNotOptimize(dummyResult = ptrToNextLinkedListNode); } // Record the number of times we accessed the cache so Benchmark can compute the average latency // of each access. This allows comparing access latency across caches of different sizes. state.SetItemsProcessed(state.iterations() * counter * 100); } BENCHMARK_REGISTER_F(CacheLatencyTest, BM_CacheLatency) ->RangeMultiplier(2 * 1024) // Loop over arrays of different sizes to test the L2, L3, and RAM latency. ->Range(256 * 1024, 4096 * 1024) ->ThreadRange(1, ProcessInfo::getNumAvailableCores()); } // namespace } // namespace mongo