diff options
author | Anton Korshunov <anton.korshunov@mongodb.com> | 2022-11-28 14:19:20 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-11-28 15:12:36 +0000 |
commit | b1474e5c22fd2106c1e7c6493052e8fbc450e289 (patch) | |
tree | 84c30cb1b57836a437ebd90b11632a4efbdeb86d /src/mongo/db/query/stats/rand_utils.h | |
parent | e4e1b807a5e079dc9fff098294271b63966930e3 (diff) | |
download | mongo-b1474e5c22fd2106c1e7c6493052e8fbc450e289.tar.gz |
SERVER-71051 Make CE module less dependent on statistics module
Diffstat (limited to 'src/mongo/db/query/stats/rand_utils.h')
-rw-r--r-- | src/mongo/db/query/stats/rand_utils.h | 188 |
1 files changed, 188 insertions, 0 deletions
diff --git a/src/mongo/db/query/stats/rand_utils.h b/src/mongo/db/query/stats/rand_utils.h new file mode 100644 index 00000000000..89e4741fd2a --- /dev/null +++ b/src/mongo/db/query/stats/rand_utils.h @@ -0,0 +1,188 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include <random> +#include <vector> + +#include "mongo/db/query/stats/value_utils.h" + +namespace mongo::stats { +// A simple histogram describing the distribution of values of each data type. +using DataTypeDistribution = std::map<sbe::value::TypeTags, double>; + +/** + Describes the distribution of a dataset according to type and weight. Other ctor parameters + are used to describe the various data types which can be emitted and correspond to the fields + named similarly + */ +class DatasetDescriptor { +public: + DatasetDescriptor(const DataTypeDistribution& dataTypeDistribution, + size_t intNDV, + int minInt, + int maxInt, + size_t strNDV, + size_t minStrLen, + size_t maxStrLen, + std::shared_ptr<DatasetDescriptor> nestedDataDescriptor = nullptr, + double reuseScalarsRatio = 0, + size_t arrNDV = 0, + size_t minArrLen = 0, + size_t maxArrLen = 0); + + // Generate a random dataset of 'nElems' according to the data distribution characteristics in + // this object. + std::vector<SBEValue> genRandomDataset(size_t nElems, DatasetDescriptor* parentDesc = nullptr); + +private: + // Select a random value data type. + sbe::value::TypeTags getRandDataType() { + double key = _uniformRandProbability(_gen); + return (*_dataTypeDistribution.upper_bound(key)).second; + } + + // Generate a random string with size 'len'. + std::string genRandomString(size_t len); + + // Generate a random array with length determined uniformly between minArrLen and maxArrLen + std::vector<SBEValue> genRandomArray(); + + // Generate a set of random arrays that are chosen from when generating array data. + void fillRandomArraySet(); + +private: + using InternalDataTypeDistribution = std::map<double, sbe::value::TypeTags>; + /* + * General distribution charecteristics. + */ + + // Pseudo-random generator. + std::mt19937_64 _gen; + // Random probabilities. Used to: + // - Select Value data types as random indexes in '_dataTypeDistribution'. + // - Select the source of values - either existing scalars or new. + std::uniform_real_distribution<double> _uniformRandProbability{0.0, 1.0}; + // Distribution of different SBE data types. There will be %percent values of each type. + InternalDataTypeDistribution _dataTypeDistribution; + double _reuseScalarsRatio; + + /* + * Integer data parameters. + */ + + // Number of distinct integer values. + const size_t _intNDV; + // A set of integers to choose from while generating random integers. + std::vector<int> _intSet; + // Generator of random integers with uniform distribution. + std::uniform_int_distribution<int> _uniformIntDist; + // Generator of random indexes into the set of integers '_intSet'. + std::uniform_int_distribution<size_t> _uniformIntIdxDist; + + /* + * String data parameters. + */ + + // All strings draw characters from this alphabet. + static const std::string _alphabet; + // A set of random strings to choose from. In theory there can be duplicates, but this is very + // unlikely. We don't care much if there are a few duplicates anyway. + std::vector<std::string> _stringSet; + // Generator of random indexes into the set of characters '_alphabet'. + std::uniform_int_distribution<size_t> _uniformCharIdxDist{0, _alphabet.size() - 1}; + // Generator of random indexes into the set of strings '_stringSet'. + std::uniform_int_distribution<size_t> _uniformStrIdxDist; + + /* + * Array data parameters. + */ + + // Number of distinct arrays. + // TODO: currently not used. The idea is to use it in the same way as arrays - pre-generate + // '_arrNDV' arrays, then select randomly from this initial set. + size_t _arrNDV; + // Set of arrays to pick from when generating random data. + std::vector<std::vector<SBEValue>> _arraySet; + // Generator of random array sizes. + std::uniform_int_distribution<size_t> _uniformArrSizeDist; + // Descriptor of the dataset within each array. + std::shared_ptr<DatasetDescriptor> _nestedDataDescriptor; + // Generator of random indexes into the set of arrays '_arraySet'. + std::uniform_int_distribution<size_t> _uniformArrIdxDist; +}; + +/** + Generate a pseudorandom string of length n + * The alphabet is fixed as [0-9][a-z][A-Z] + * Characters are chosed uniformly from the alphabet + * Randomness is implemented such that it is independent of the platform, + i.e. given the same length and seed on any platform, we will produce the + same string. +*/ +std::string genString(size_t len, size_t seed); + +/** + Generate a set of elements consisting of strings and ints in the + requested ratio. The generated array will contain the same values given the same + inputs on all platforms. + */ +std::vector<SBEValue> genFixedValueArray(size_t nElems, double intRatio, double strRatio); + +/** + Generate a random string of length len. + * The alphabet is fixed as [0-9][a-z][A-Z]. + * Characters are chosed uniformly from the alphabet. + * Generated strings are likely to differ by platform, so derived values depending on them + are also likely to change. + */ +std::string genRandomString(size_t len, std::mt19937_64& gen, size_t seed); + + +/** + Generate a uniformly random set of elements consisting of string and ints in the + requested ratio. The resulting array is very likely to differ between platforms, even + with the same seed. Thus, derived values are also likely to change. + + Prefer genFixedValueArray when comparing derived values against constants. + */ +std::vector<SBEValue> genRandomValueArray(size_t nElems, + double intRatio, + double strRatio, + size_t seed); + +/** + Generate a set up values consisting of half scalars, and half arrays of length 10. + + Values contained in the result will be drawn from the input vector. + */ +std::vector<SBEValue> nestArrays(const std::vector<SBEValue>& input, size_t emptyArrayCount); + +} // namespace mongo::stats |