diff options
-rw-r--r-- | src/mongo/db/query/ce/maxdiff_histogram_test.cpp | 273 | ||||
-rw-r--r-- | src/mongo/db/query/ce/rand_utils.cpp | 88 | ||||
-rw-r--r-- | src/mongo/db/query/ce/rand_utils.h | 36 |
3 files changed, 245 insertions, 152 deletions
diff --git a/src/mongo/db/query/ce/maxdiff_histogram_test.cpp b/src/mongo/db/query/ce/maxdiff_histogram_test.cpp index 142344d10fa..78c6490a0e9 100644 --- a/src/mongo/db/query/ce/maxdiff_histogram_test.cpp +++ b/src/mongo/db/query/ce/maxdiff_histogram_test.cpp @@ -116,148 +116,139 @@ TEST_F(HistogramTest, CreateFixed) { ASSERT_APPROX_EQUAL(50.0, estimateCard(hist, 50 * 10, EstimationType::kLess), kTolerance); } -// TODO SERVER-69857: Uncomment test. -// TEST_F(HistogramTest, MaxDiffTestInt) { -// constexpr size_t nElems = 100; -// constexpr size_t nBuckets = 10; -// constexpr size_t seed = 0; +TEST_F(HistogramTest, MaxDiffTestInt) { + constexpr size_t nElems = 100; + constexpr size_t nBuckets = 10; -// std::vector<SBEValue> randData = genRandomValueArray(nElems, 1.0, 0.0, seed); - -// auto opCtx = makeOperationContext(); -// const size_t actualCard = getActualCard(opCtx.get(), randData, "[{$match: {a: {$lt: 10}}}]"); - -// const ScalarHistogram& hist = makeHistogram(randData, nBuckets); -// std::cout << hist.toString(); - -// ASSERT_GTE(hist.getBuckets().size(), nBuckets); -// const double expectedCard = estimateCard(hist, 10, EstimationType::kLess); - -// ASSERT_EQ(36, actualCard); -// ASSERT_APPROX_EQUAL(32.8667, expectedCard, kTolerance); -// } - -// TODO SERVER-69857: Uncomment test. -// TEST_F(HistogramTest, MaxDiffTestString) { -// constexpr size_t nElems = 100; -// constexpr size_t nBuckets = 10; -// constexpr size_t seed = 0; - -// auto randData = genRandomValueArray(nElems, 0.0, 1.0, seed); -// std::cout << "Generated " << nElems << " random values:\n" -// << printValueArray(randData) << "\n" -// << std::flush; - -// auto opCtx = makeOperationContext(); -// const size_t actualCard = -// getActualCard(opCtx.get(), randData, "[{$match: {a: {$lt: '91YgOvBB'}}}]"); - -// sortValueVector(randData); -// const DataDistribution& dataDistrib = getDataDistribution(randData); - -// const ScalarHistogram& hist = genMaxDiffHistogram(dataDistrib, nBuckets); -// std::cout << hist.toString(); -// ASSERT_GTE(hist.getBuckets().size(), nBuckets); - -// const auto [tag, val] = value::makeNewString("91YgOvBB"_sd); -// value::ValueGuard vg(tag, val); -// const double expectedCard = estimate(hist, tag, val, EstimationType::kLess).card; - -// ASSERT_EQ(13, actualCard); -// ASSERT_APPROX_EQUAL(11.0783, expectedCard, kTolerance); -// } - -// TEST_F(HistogramTest, MaxDiffTestMixedTypes) { -// constexpr size_t nElems = 100; -// constexpr size_t nBuckets = 10; -// constexpr size_t seed = 0; - -// auto randData = genRandomValueArray(nElems, 0.5, 0.5, seed); -// std::cout << "Generated " << nElems << " random values:\n" -// << printValueArray(randData) << "\n" -// << std::flush; - -// auto opCtx = makeOperationContext(); -// const size_t actualCard = getActualCard(opCtx.get(), randData, "[{$match: {a: {$lt: 10}}}]"); - -// sortValueVector(randData); -// const DataDistribution& dataDistrib = getDataDistribution(randData); - -// const ScalarHistogram& hist = genMaxDiffHistogram(dataDistrib, nBuckets); -// std::cout << hist.toString(); -// ASSERT_GTE(hist.getBuckets().size(), nBuckets); -// const double expectedCard = estimateCard(hist, 10, EstimationType::kLess); - -// ASSERT_EQ(18, actualCard); -// ASSERT_APPROX_EQUAL(17.0833, expectedCard, kTolerance); -// } - -// TODO SERVER-69857: Uncomment test. -// TEST_F(HistogramTest, MaxDiffIntArrays) { -// constexpr size_t nElems = 100; -// constexpr size_t nBuckets = 10; -// constexpr size_t seed = 0; - -// auto rawData = genRandomValueArray(nElems, 1.0, 0.0, seed); -// auto arrayData = genRandomValueArrayWithArrays(rawData); - -// ArrayHistogram estimator = createArrayEstimator(arrayData, nBuckets); - -// auto opCtx = makeOperationContext(); -// { -// const size_t actualCard = -// getActualCard(opCtx.get(), arrayData, "[{$match: {a: {$eq: 2}}}]"); - -// const auto [tag, val] = makeInt64Value(2); -// value::ValueGuard vg(tag, val); -// const double expectedCard = estimateCardEq(estimator, tag, val, true /* includeScalar -// */); - -// ASSERT_APPROX_EQUAL(4.0, expectedCard, kTolerance); -// ASSERT_EQ(4, actualCard); -// } - -// { -// const size_t actualCard = -// getActualCard(opCtx.get(), arrayData, "[{$match: {a: {$lt: 3}}}]"); - -// const auto [tag, val] = makeInt64Value(3); -// value::ValueGuard vg(tag, val); -// const double expectedCard = estimateCardRange(estimator, -// false /*lowInclusive*/, -// value::TypeTags::MinKey, -// 0, -// false /*highInclusive*/, -// tag, -// val, -// true /* includeScalar */); - -// ASSERT_APPROX_EQUAL(4.6667, expectedCard, kTolerance); -// ASSERT_EQ(6, actualCard); -// } - -// { -// const size_t actualCard = getActualCard( -// opCtx.get(), arrayData, "[{$match: {a: {$elemMatch: {$gt: 2, $lt: 5}}}}]"); - -// const auto [lowTag, lowVal] = makeInt64Value(2); -// value::ValueGuard vgLow(lowTag, lowVal); -// const auto [highTag, highVal] = makeInt64Value(5); -// value::ValueGuard vgHigh(highTag, highVal); - -// const double expectedCard = estimateCardRange(estimator, -// false /*lowInclusive*/, -// lowTag, -// lowVal, -// false /*highInclusive*/, -// highTag, -// highVal, -// false /* includeScalar */); - -// ASSERT_APPROX_EQUAL(1.76505, expectedCard, kTolerance); -// ASSERT_EQ(3, actualCard); -// } -// } + auto data = genFixedValueArray(nElems, 1.0, 0.0); + auto opCtx = makeOperationContext(); + const size_t actualCard = getActualCard(opCtx.get(), data, "[{$match: {a: {$lt: 10}}}]"); + + const ScalarHistogram& hist = makeHistogram(data, nBuckets); + std::cout << hist.toString(); + + ASSERT_LTE(hist.getBuckets().size(), nBuckets); + const double estimatedCard = estimateCard(hist, 11, EstimationType::kLess); + + ASSERT_EQ(36, actualCard); + ASSERT_APPROX_EQUAL(39.73333, estimatedCard, kTolerance); +} + +TEST_F(HistogramTest, MaxDiffTestString) { + constexpr size_t nElems = 100; + constexpr size_t nBuckets = 10; + + auto randData = genFixedValueArray(nElems, 0.0, 1.0); + std::cout << "Generated " << nElems << " random values:\n" + << printValueArray(randData) << "\n" + << std::flush; + + auto opCtx = makeOperationContext(); + const size_t actualCard = + getActualCard(opCtx.get(), randData, "[{$match: {a: {$lt: '91YgOvBB'}}}]"); + + sortValueVector(randData); + const DataDistribution& dataDistrib = getDataDistribution(randData); + + const ScalarHistogram& hist = genMaxDiffHistogram(dataDistrib, nBuckets); + std::cout << hist.toString(); + ASSERT_LTE(hist.getBuckets().size(), nBuckets); + + const auto [tag, val] = value::makeNewString("91YgOvBB"_sd); + value::ValueGuard vg(tag, val); + const double estimatedCard = estimate(hist, tag, val, EstimationType::kLess).card; + + ASSERT_EQ(15, actualCard); + ASSERT_APPROX_EQUAL(10.9443, estimatedCard, kTolerance); +} + +TEST_F(HistogramTest, MaxDiffTestMixedTypes) { + constexpr size_t nElems = 100; + constexpr size_t nBuckets = 10; + + auto randData = genFixedValueArray(nElems, 0.5, 0.5); + std::cout << "Generated " << nElems << " random values:\n" + << printValueArray(randData) << "\n" + << std::flush; + + auto opCtx = makeOperationContext(); + const size_t actualCard = getActualCard(opCtx.get(), randData, "[{$match: {a: {$lt: 10}}}]"); + + sortValueVector(randData); + const DataDistribution& dataDistrib = getDataDistribution(randData); + + const ScalarHistogram& hist = genMaxDiffHistogram(dataDistrib, nBuckets); + std::cout << hist.toString(); + ASSERT_LTE(hist.getBuckets().size(), nBuckets); + const double estimatedCard = estimateCard(hist, 10, EstimationType::kLess); + + ASSERT_EQ(18, actualCard); + ASSERT_APPROX_EQUAL(18.0, estimatedCard, kTolerance); +} + +TEST_F(HistogramTest, MaxDiffIntArrays) { + constexpr size_t nElems = 100; + constexpr size_t nBuckets = 10; + + auto rawData = genFixedValueArray(nElems, 1.0, 0.0); + auto arrayData = nestArrays(rawData); + + ArrayHistogram estimator = createArrayEstimator(arrayData, nBuckets); + + auto opCtx = makeOperationContext(); + { + const size_t actualCard = + getActualCard(opCtx.get(), arrayData, "[{$match: {a: {$eq: 2}}}]"); + + const auto [tag, val] = makeInt64Value(2); + value::ValueGuard vg(tag, val); + const double estimatedCard = estimateCardEq(estimator, tag, val, true /* includeScalar + */); + + ASSERT_APPROX_EQUAL(4.0, estimatedCard, kTolerance); + ASSERT_EQ(4, actualCard); + } + + { + const size_t actualCard = + getActualCard(opCtx.get(), arrayData, "[{$match: {a: {$lt: 3}}}]"); + + const auto [tag, val] = makeInt64Value(3); + value::ValueGuard vg(tag, val); + const double estimatedCard = estimateCardRange(estimator, + false /*lowInclusive*/, + value::TypeTags::MinKey, + 0, + false /*highInclusive*/, + tag, + val, + true /* includeScalar */); + ASSERT_EQ(6, actualCard); + ASSERT_APPROX_EQUAL(6.0, estimatedCard, kTolerance); + } + + { + const size_t actualCard = getActualCard( + opCtx.get(), arrayData, "[{$match: {a: {$elemMatch: {$gt: 2, $lt: 5}}}}]"); + + const auto [lowTag, lowVal] = makeInt64Value(2); + value::ValueGuard vgLow(lowTag, lowVal); + const auto [highTag, highVal] = makeInt64Value(5); + value::ValueGuard vgHigh(highTag, highVal); + + const double estimatedCard = estimateCardRange(estimator, + false /*lowInclusive*/, + lowTag, + lowVal, + false /*highInclusive*/, + highTag, + highVal, + false /* includeScalar */); + + ASSERT_EQ(2, actualCard); + ASSERT_APPROX_EQUAL(3.15479, estimatedCard, kTolerance); + } +} } // namespace } // namespace mongo::ce::statistics diff --git a/src/mongo/db/query/ce/rand_utils.cpp b/src/mongo/db/query/ce/rand_utils.cpp index e4c4a75616d..05bfcf393df 100644 --- a/src/mongo/db/query/ce/rand_utils.cpp +++ b/src/mongo/db/query/ce/rand_utils.cpp @@ -226,8 +226,11 @@ void DatasetDescriptor::fillRandomArraySet() { } } -///// Old implementation ////// -auto genRandomString(size_t len, std::mt19937_64& gen, size_t seed) { +/** + Generate a random string. It is possible (even expected) that the same parameters + will generate different strings on successive calls +*/ +std::string genRandomString(size_t len, std::mt19937_64& gen, size_t seed) { std::string randStr; randStr.reserve(len); const constexpr char* kAlphabet = @@ -243,6 +246,73 @@ auto genRandomString(size_t len, std::mt19937_64& gen, size_t seed) { return randStr; } +/** + Generate a string. This string will be deterministic in that the same + parameters will always generate the same string, even on different platforms. +*/ +std::string genString(size_t len, size_t seed) { + std::string str; + str.reserve(len); + + const constexpr char* kAlphabet = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + const int kAlphabetLength = strlen(kAlphabet); + + unsigned long long rand = seed; + for (size_t i = 0; i < len; ++i) { + // Library implementations of rand vary by compiler, naturally, Since we still + // want the appearance of randomness, but consistency across compilers, we use a linear + // congruential generator to choose characters for the string. The parameters chosen + // are from Numerical Recipes. We use the upper 32 bits when calculating the character + // index, as the lower 32 are essentially nonrandom -- a weakness of LCGs in general. + rand = 3935559000370003845ULL * rand + 269134368944950781ULL; + + int idx = (rand >> 32) % kAlphabetLength; + str += kAlphabet[idx]; + } + + return str; +} + +/** + Generate an array of values with the required ratio of int to string. This array will be + deterministic in that the same parameters will always generate the same array, even on + different platforms. +*/ +std::vector<SBEValue> genFixedValueArray(size_t nElems, double intRatio, double strRatio) { + + std::vector<SBEValue> values; + + const int intNDV = static_cast<int>(nElems) / 4; + for (size_t i = 0; i < std::round(nElems * intRatio); ++i) { + const auto [tag, val] = makeInt64Value((i % intNDV) + 1); + values.emplace_back(tag, val); + } + + if (strRatio == 0.0) { + return values; + } + + // Generate a set of strings so that each string can be chosen multiple times in the test + // data set. + const size_t strNDV = nElems / 5; + std::vector<std::string> stringSet; + stringSet.reserve(strNDV); + for (size_t i = 0; i < strNDV; ++i) { + const auto randStr = genString(8, i); + stringSet.push_back(randStr); + } + + for (size_t i = 0; i < std::round(nElems * strRatio); ++i) { + size_t idx = i % stringSet.size(); + const auto randStr = stringSet[idx]; + const auto [tag, val] = value::makeNewString(randStr); + values.emplace_back(tag, val); + } + + return values; +} + std::vector<SBEValue> genRandomValueArray(size_t nElems, double intRatio, double strRatio, @@ -273,14 +343,13 @@ std::vector<SBEValue> genRandomValueArray(size_t nElems, size_t idx = idxDistr(gen); const auto randStr = stringSet[idx]; const auto [tag, val] = value::makeNewString(randStr); - const auto [copyTag, copyVal] = value::copyValue(tag, val); - randValues.emplace_back(copyTag, copyVal); + randValues.emplace_back(tag, val); } return randValues; } -std::vector<SBEValue> genRandomValueArrayWithArrays(const std::vector<SBEValue>& input) { +std::vector<SBEValue> nestArrays(const std::vector<SBEValue>& input) { std::vector<SBEValue> result; auto [arrayTag, arrayVal] = value::makeNewArray(); @@ -302,6 +371,15 @@ std::vector<SBEValue> genRandomValueArrayWithArrays(const std::vector<SBEValue>& } } + // It's possible that the array still contains something. If it's empty, + // we can safely release it. If not, append it to the result. + value::Array* arr = value::getArrayView(arrayVal); + if (arr->size() > 0) { + result.emplace_back(arrayTag, arrayVal); + } else { + value::releaseValue(arrayTag, arrayVal); + } + return result; } diff --git a/src/mongo/db/query/ce/rand_utils.h b/src/mongo/db/query/ce/rand_utils.h index f7cc41c58b4..362d2c7eaec 100644 --- a/src/mongo/db/query/ce/rand_utils.h +++ b/src/mongo/db/query/ce/rand_utils.h @@ -143,25 +143,49 @@ private: }; // namespace mongo::ce /** - Generate a random string of length len. + Generate a pseudorandom string of length n * The alphabet is fixed as [0-9][a-z][A-Z] * Characters are chosed uniformly from the alphabet + * Randomness is implemented such that it is independent of the platform, + i.e. given the same length and seed on any platform, we will produce the + same string. +*/ +std::string genString(size_t len, size_t seed); + +/** + Generate a set of elements consisting of strings and ints in the + requested ratio. The generated array will contain the same values given the same + inputs on all platforms. + */ +std::vector<SBEValue> genFixedValueArray(size_t nElems, double intRatio, double strRatio); + +/** + Generate a random string of length len. + * The alphabet is fixed as [0-9][a-z][A-Z]. + * Characters are chosed uniformly from the alphabet. + * Generated strings are likely to differ by platform, so derived values depending on them + are also likely to change. */ -auto genRandomString(size_t len, std::mt19937_64& gen, size_t seed); +std::string genRandomString(size_t len, std::mt19937_64& gen, size_t seed); + /** Generate a uniformly random set of elements consisting of string and ints in the - requested ratio + requested ratio. The resulting array is very likely to differ between platforms, even + with the same seed. Thus, derived values are also likely to change. + + Prefer genFixedValueArray when comparing derived values against constants. */ std::vector<SBEValue> genRandomValueArray(size_t nElems, double intRatio, double strRatio, size_t seed); + /** - Generate a set up values consisting of half scalars, and half arrays of length 10 + Generate a set up values consisting of half scalars, and half arrays of length 10. - Values contained in the result will be drawn from the input vector + Values contained in the result will be drawn from the input vector. */ -std::vector<SBEValue> genRandomValueArrayWithArrays(const std::vector<SBEValue>& input); +std::vector<SBEValue> nestArrays(const std::vector<SBEValue>& input); } // namespace mongo::ce |