diff options
Diffstat (limited to 'src/mongo/db/query/stats/rand_utils_new.h')
-rw-r--r-- | src/mongo/db/query/stats/rand_utils_new.h | 353 |
1 files changed, 353 insertions, 0 deletions
diff --git a/src/mongo/db/query/stats/rand_utils_new.h b/src/mongo/db/query/stats/rand_utils_new.h new file mode 100644 index 00000000000..be77578fc28 --- /dev/null +++ b/src/mongo/db/query/stats/rand_utils_new.h @@ -0,0 +1,353 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include <random> +#include <vector> + +#include "mongo/db/query/stats/value_utils.h" + +namespace mongo::stats { + +class DatasetDescriptorNew; + +/** + * A base class for wrappers of STL random distributions that produce size_t values within a range. + * This class enables polymorphic usage of random distributions, for instance to implement a mix of + * distributions. + */ +class RandomDistribution { +public: + RandomDistribution() = default; + RandomDistribution(const RandomDistribution&) = default; + RandomDistribution(RandomDistribution&&) = default; + RandomDistribution& operator=(const RandomDistribution&) = default; + RandomDistribution& operator=(RandomDistribution&&) = default; + virtual ~RandomDistribution() = default; + + virtual size_t operator()(std::mt19937_64& gen) = 0; +}; + +/** + A uniform random distribution of size_t within a range + */ +class UniformDistr : public RandomDistribution { +public: + UniformDistr(size_t min, size_t max) : _distr{min, max}, _min(min), _max(max) {} + + size_t operator()(std::mt19937_64& gen) override { + size_t result = _distr(gen); + uassert(6660540, "Random index out of range", result >= _min && result <= _max); + return result; + } + +private: + std::uniform_int_distribution<size_t> _distr; + size_t _min; + size_t _max; +}; + +/** + * Wrapper of normal distribution that is guaranteed to produces size_t values within a certain + * range. The STL class normal_distribution takes a median and standard deviation. This class + * computes a suitable median and standard deviation from the required [min,max] boundaries. + */ +class NormalDistr : public RandomDistribution { +public: + NormalDistr(size_t min, size_t max) + : _distr{(double)(min + max) / 2.0, (double)(max - min) / 4.0}, + _backup{min, max}, + _min((double)min), + _max((double)max) {} + + size_t operator()(std::mt19937_64& gen) override { + size_t result = std::round(_distr(gen)); + size_t trials = 0; + // If the result is outside the range (an event with low probability), try 10 more times to + // get a number in the range. + while (!(result >= _min && result <= _max) && trials < 10) { + double randNum = _distr(gen); + if (randNum < _min) { + result = std::ceil(randNum); + } else if (randNum > _max) { + result = std::floor(randNum); + } else { + result = std::round(randNum); + } + ++trials; + } + if (result < _min && result > _max) { + // We couldn't generate a number in [min,max] within 10 attempts. Generate a uniform + // number. + result = _backup(gen); + } + uassert(6660541, "Random index out of range", result >= _min && result <= _max); + return result; + } + +private: + std::normal_distribution<double> _distr; + std::uniform_int_distribution<size_t> _backup; + double _min; + double _max; +}; + +enum class DistrType { kUniform, kNormal }; + +using MixedDistributionDescriptor = std::vector<std::pair<DistrType, double /*weight*/>>; + +/** + * Generator for mixed distribution, where mixing is on the type of distribution, in the + * probabilities specified in distrProbabilites + */ +class MixedDistribution { +public: + MixedDistribution(std::vector<std::unique_ptr<RandomDistribution>> distrMix, + std::vector<double>& distrProbabilities) + : _distrMix(std::move(distrMix)) { + _distDist.param(std::discrete_distribution<size_t>::param_type(distrProbabilities.begin(), + distrProbabilities.end())); + } + + static std::unique_ptr<MixedDistribution> make(MixedDistributionDescriptor& descriptor, + size_t min, + size_t max) { + std::vector<double> distrProbabilities; + std::vector<std::unique_ptr<RandomDistribution>> distrMix; + + for (const auto& [distrType, weight] : descriptor) { + distrProbabilities.push_back(weight); + switch (distrType) { + case DistrType::kUniform: + distrMix.emplace_back(std::make_unique<UniformDistr>(min, max)); + break; + case DistrType::kNormal: + distrMix.emplace_back(std::make_unique<NormalDistr>(min, max)); + break; + default: + MONGO_UNREACHABLE; + } + } + + return std::make_unique<MixedDistribution>(std::move(distrMix), distrProbabilities); + } + + size_t operator()(std::mt19937_64& gen) { + size_t distIdx = _distDist(gen); + size_t result = (*_distrMix.at(distIdx))(gen); + return result; + } + +private: + // Mix of different distributions. There can be instances of the same type of distribution, + // because they can still be defined differently. + std::vector<std::unique_ptr<RandomDistribution>> _distrMix; + // Distribution of distributions - select the current distribution with a certain probability. + std::discrete_distribution<size_t> _distDist; +}; + +/** + * Descriptor of a typed data distribution + */ +class DataTypeDistrNew { +public: + DataTypeDistrNew(MixedDistributionDescriptor distrDescriptor, + sbe::value::TypeTags tag, + double weight, + size_t ndv, + double nullsRatio = 0.0) + : _mixedDistrDescriptor(distrDescriptor), + _tag(tag), + _weight(weight), + _ndv(ndv), + _nullsRatio(nullsRatio) { + uassert(6660542, "NDV must be > 0.", ndv > 0); + uassert(6660543, "nullsRatio must be in [0, 1].", nullsRatio >= 0 && nullsRatio <= 1); + } + + virtual ~DataTypeDistrNew() = default; + + /** + * Generate all unique values that generation chooses from, and store them in '_valSet'. + * Different data types provide different implementations. + * @todo: The 'parentDesc' parameter is used only by array generation. Consider a different way + * of passing it only to that type. + */ + virtual void init(DatasetDescriptorNew* parentDesc, std::mt19937_64& gen) = 0; + + /** + * Generate a single random value, and store it in 'randValues' vector. + */ + void generate(std::vector<SBEValue>& randValues, std::mt19937_64& gen); + + /** + * Generate a single random value, and store it in 'randValueArray' array. + */ + void generate(sbe::value::Array* randValueArray, std::mt19937_64& gen); + + /** + * Custom equality comparison for storage in sets. There can be only datatype in a set. + */ + bool operator==(const DataTypeDistrNew& d) const { + return this->_tag == d._tag; + } + + sbe::value::TypeTags tag() const { + return _tag; + } + + double weight() const { + return _weight; + } + +protected: + MixedDistributionDescriptor _mixedDistrDescriptor; + sbe::value::TypeTags _tag; + // Weight that determines the probability of a value of this type. + const double _weight; + const size_t _ndv; + // A set of (randomly generated) values to choose from when generating random datasets. + std::vector<SBEValue> _valSet; + // Generator of random indexes into a set of values. + // std::uniform_int_distribution<size_t> _idxDist; + std::unique_ptr<MixedDistribution> _idxDist; + // Percent of null values in the dataset. + double _nullsRatio; + std::uniform_real_distribution<double> _nullSelector{0, 1}; + + friend class DatasetDescriptorNew; +}; + +using TypeDistrVector = std::vector<std::unique_ptr<DataTypeDistrNew>>; + +/** + * Integer data distribution. + */ +class IntDistribution : public DataTypeDistrNew { +public: + IntDistribution(MixedDistributionDescriptor distrDescriptor, + double weight, + size_t ndv, + int minInt, + int maxInt, + double nullsRatio = 0); + + /* + * Generate a set of random integers, and store them in _valSet. + */ + void init(DatasetDescriptorNew* parentDesc, std::mt19937_64& gen) override; + +protected: + int _minInt; + int _maxInt; +}; + +/** + * String data distribution. + */ +class StrDistribution : public DataTypeDistrNew { +public: + StrDistribution(MixedDistributionDescriptor distrDescriptor, + double weight, + size_t ndv, + size_t minStrLen, + size_t maxStrLen, + double nullsRatio = 0); + + /* + * Generate a set of random strings, and store them in _valSet. + */ + void init(DatasetDescriptorNew* parentDesc, std::mt19937_64& gen) override; + +protected: + std::string genRandomString(size_t len, std::mt19937_64& gen); + + size_t _minStrLen; + size_t _maxStrLen; + // All strings draw characters from this alphabet. + static const std::string _alphabet; + // Generator of random indexes into the set of characters '_alphabet'. + std::uniform_int_distribution<size_t> _uniformCharIdxDist{0, _alphabet.size() - 1}; +}; + +/** + * SBE array data distribution. + */ +class ArrDistribution : public DataTypeDistrNew { +public: + ArrDistribution(MixedDistributionDescriptor distrDescriptor, + double weight, + size_t ndv, + size_t minArrLen, + size_t maxArrLen, + std::unique_ptr<DatasetDescriptorNew> arrayDataDescriptor, + double reuseScalarsRatio = 0, + double nullsRatio = 0); + +private: + void init(DatasetDescriptorNew* parentDesc, std::mt19937_64& gen) override; + + // Generator of random array sizes. + std::uniform_int_distribution<size_t> _uniformArrSizeDist; + // Descriptor of the dataset within each array. + std::unique_ptr<DatasetDescriptorNew> _arrayDataDescriptor; + // Randomly select a parent or a child distribution when generating random + std::uniform_real_distribution<double> _uniformRandProbability{0.0, 1.0}; + double _reuseScalarsRatio; +}; + +/** + Given a list of tyoed data distibutions, this class is used to generate a vector of values + according to the distribution weights. +*/ +class DatasetDescriptorNew { +public: + DatasetDescriptorNew(TypeDistrVector dataTypeDistributions, std::mt19937_64& gen); + + // Generate a random dataset of 'nElems' according to the data distribution characteristics in + // this object. + std::vector<SBEValue> genRandomDataset(size_t nElems); + +private: + // Select a random value data type. + DataTypeDistrNew* getRandDataTypeDist(); + + // Distribution of different SBE data types. There will be %percent values of each type. + // TODO: is it a better idea to store shared_ptr or raw pointers to enable reuse? + TypeDistrVector _dataTypeDistributions; + // Pseudo-random generator. + std::mt19937_64& _gen; + // Select a random data type distribution. + std::discrete_distribution<size_t> _dataTypeSelector; + + friend class ArrDistribution; +}; + +} // namespace mongo::stats |