/**
* Copyright (C) 2018 MongoDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see .
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the prograxm with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#include "mongo/platform/basic.h"
#include
#include "mongo/util/assert_util.h"
#include "mongo/util/processinfo.h"
#include "mongo/util/time_support.h"
namespace mongo {
namespace {
// This is a trivial test case to sanity check that "benchmark" runs.
// For more information on how benchmarks should be written, please refer to Google Benchmark's
// excellent README: https://github.com/google/benchmark/blob/v1.3.0/README.md
void BM_Empty(benchmark::State& state) {
for (auto keepRunning : state) {
// The code inside this for-loop is what's being timed.
benchmark::DoNotOptimize(state.iterations());
}
}
// Register two benchmarks, one runs the "BM_Empty" function in a single thread, the other runs a
// copy per CPU core.
BENCHMARK(BM_Empty);
BENCHMARK(BM_Empty)->ThreadPerCpu();
void BM_CpuLoad(benchmark::State& state) {
for (auto keepRunning : state) {
uint64_t limit = 10000;
uint64_t lresult = 0;
uint64_t x = 100;
for (uint64_t i = 0; i < limit; i++) {
benchmark::DoNotOptimize(x *= 13);
}
benchmark::DoNotOptimize(lresult = x);
}
}
// This Benchmark is adapted from the `cpuload` command:
// https://github.com/mongodb/mongo/blob/r3.7.2/src/mongo/db/commands/cpuload.cpp
BENCHMARK(BM_CpuLoad)->Threads(1)->ThreadPerCpu();
void BM_Sleep(benchmark::State& state) {
for (auto keepRunning : state) {
sleepmillis(100);
}
}
BENCHMARK(BM_Sleep)->Threads(1)->ThreadPerCpu();
// Generate a loop with macros.
#define ONE ptrToNextLinkedListNode = reinterpret_cast(*ptrToNextLinkedListNode);
#define FIVE ONE ONE ONE ONE ONE
#define TEN FIVE FIVE
#define FIFTY TEN TEN TEN TEN TEN
#define HUNDRED FIFTY FIFTY
// Stride is the number of elements to skip when traversing the array.
// It should ideally be >= the cache line to avoid side-effects from pre-fetching.
const int kStrideBytes = 64;
class CacheLatencyTest : public benchmark::Fixture {
// Fixture for CPU Cache and RAM latency test. Adapted from lmbench's lat_mem_rd test.
public:
// Array of pointers used as a linked list.
std::unique_ptr data;
void SetUp(benchmark::State& state) override {
if (state.thread_index == 0) {
fassert(data.get() == nullptr, "'data' is not null");
/*
* Create a circular list of pointers using a simple striding
* algorithm.
*/
const int arrLength = state.range(0);
int counter = 0;
data = std::make_unique(arrLength);
char** arr = data.get();
/*
* This access pattern corresponds to many array/matrix algorithms.
* It should be easily and correctly predicted by any decent hardware
* prefetch algorithm.
*/
for (counter = 0; counter < arrLength - kStrideBytes; counter += kStrideBytes) {
arr[counter] = reinterpret_cast(&arr[counter + kStrideBytes]);
}
arr[counter] = reinterpret_cast(&arr[0]);
}
}
void TearDown(benchmark::State& state) override {
if (state.thread_index == 0) {
fassert(data.get() != nullptr, "'data' is null");
data.reset();
}
}
};
BENCHMARK_DEFINE_F(CacheLatencyTest, BM_CacheLatency)(benchmark::State& state) {
size_t arrLength = state.range(0);
size_t counter = arrLength / (kStrideBytes * 100) + 1;
for (auto keepRunning : state) {
char** dummyResult = 0; // Dummy result to prevent the loop from being optimized out.
char** ptrToNextLinkedListNode = reinterpret_cast(data.get()[0]);
for (size_t i = 0; i < counter; ++i) {
HUNDRED;
}
benchmark::DoNotOptimize(dummyResult = ptrToNextLinkedListNode);
}
// Record the number of times we accessed the cache so Benchmark can compute the average latency
// of each access. This allows comparing access latency across caches of different sizes.
state.SetItemsProcessed(state.iterations() * counter * 100);
}
BENCHMARK_REGISTER_F(CacheLatencyTest, BM_CacheLatency)
->RangeMultiplier(2 * 1024)
// Loop over arrays of different sizes to test the L2, L3, and RAM latency.
->Range(256 * 1024, 4096 * 1024)
->ThreadRange(1, ProcessInfo::getNumAvailableCores());
} // namespace
} // namespace mongo