summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoel Redman <joel.redman@mongodb.com>2022-09-26 14:53:50 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-09-26 15:23:14 +0000
commitf5b97eee094fb08b59857aa592589c0a6b3089d2 (patch)
tree48ffa056dd9d2408e9194f283e664cb8d57b15bd
parentdefc79cac973ec5c08cbcccb44cfaa16af2a76c7 (diff)
downloadmongo-f5b97eee094fb08b59857aa592589c0a6b3089d2.tar.gz
SERVER-69857 Repair maxdiff tests so they work correctly on Windows
-rw-r--r--src/mongo/db/query/ce/maxdiff_histogram_test.cpp273
-rw-r--r--src/mongo/db/query/ce/rand_utils.cpp88
-rw-r--r--src/mongo/db/query/ce/rand_utils.h36
3 files changed, 245 insertions, 152 deletions
diff --git a/src/mongo/db/query/ce/maxdiff_histogram_test.cpp b/src/mongo/db/query/ce/maxdiff_histogram_test.cpp
index 142344d10fa..78c6490a0e9 100644
--- a/src/mongo/db/query/ce/maxdiff_histogram_test.cpp
+++ b/src/mongo/db/query/ce/maxdiff_histogram_test.cpp
@@ -116,148 +116,139 @@ TEST_F(HistogramTest, CreateFixed) {
ASSERT_APPROX_EQUAL(50.0, estimateCard(hist, 50 * 10, EstimationType::kLess), kTolerance);
}
-// TODO SERVER-69857: Uncomment test.
-// TEST_F(HistogramTest, MaxDiffTestInt) {
-// constexpr size_t nElems = 100;
-// constexpr size_t nBuckets = 10;
-// constexpr size_t seed = 0;
+TEST_F(HistogramTest, MaxDiffTestInt) {
+ constexpr size_t nElems = 100;
+ constexpr size_t nBuckets = 10;
-// std::vector<SBEValue> randData = genRandomValueArray(nElems, 1.0, 0.0, seed);
-
-// auto opCtx = makeOperationContext();
-// const size_t actualCard = getActualCard(opCtx.get(), randData, "[{$match: {a: {$lt: 10}}}]");
-
-// const ScalarHistogram& hist = makeHistogram(randData, nBuckets);
-// std::cout << hist.toString();
-
-// ASSERT_GTE(hist.getBuckets().size(), nBuckets);
-// const double expectedCard = estimateCard(hist, 10, EstimationType::kLess);
-
-// ASSERT_EQ(36, actualCard);
-// ASSERT_APPROX_EQUAL(32.8667, expectedCard, kTolerance);
-// }
-
-// TODO SERVER-69857: Uncomment test.
-// TEST_F(HistogramTest, MaxDiffTestString) {
-// constexpr size_t nElems = 100;
-// constexpr size_t nBuckets = 10;
-// constexpr size_t seed = 0;
-
-// auto randData = genRandomValueArray(nElems, 0.0, 1.0, seed);
-// std::cout << "Generated " << nElems << " random values:\n"
-// << printValueArray(randData) << "\n"
-// << std::flush;
-
-// auto opCtx = makeOperationContext();
-// const size_t actualCard =
-// getActualCard(opCtx.get(), randData, "[{$match: {a: {$lt: '91YgOvBB'}}}]");
-
-// sortValueVector(randData);
-// const DataDistribution& dataDistrib = getDataDistribution(randData);
-
-// const ScalarHistogram& hist = genMaxDiffHistogram(dataDistrib, nBuckets);
-// std::cout << hist.toString();
-// ASSERT_GTE(hist.getBuckets().size(), nBuckets);
-
-// const auto [tag, val] = value::makeNewString("91YgOvBB"_sd);
-// value::ValueGuard vg(tag, val);
-// const double expectedCard = estimate(hist, tag, val, EstimationType::kLess).card;
-
-// ASSERT_EQ(13, actualCard);
-// ASSERT_APPROX_EQUAL(11.0783, expectedCard, kTolerance);
-// }
-
-// TEST_F(HistogramTest, MaxDiffTestMixedTypes) {
-// constexpr size_t nElems = 100;
-// constexpr size_t nBuckets = 10;
-// constexpr size_t seed = 0;
-
-// auto randData = genRandomValueArray(nElems, 0.5, 0.5, seed);
-// std::cout << "Generated " << nElems << " random values:\n"
-// << printValueArray(randData) << "\n"
-// << std::flush;
-
-// auto opCtx = makeOperationContext();
-// const size_t actualCard = getActualCard(opCtx.get(), randData, "[{$match: {a: {$lt: 10}}}]");
-
-// sortValueVector(randData);
-// const DataDistribution& dataDistrib = getDataDistribution(randData);
-
-// const ScalarHistogram& hist = genMaxDiffHistogram(dataDistrib, nBuckets);
-// std::cout << hist.toString();
-// ASSERT_GTE(hist.getBuckets().size(), nBuckets);
-// const double expectedCard = estimateCard(hist, 10, EstimationType::kLess);
-
-// ASSERT_EQ(18, actualCard);
-// ASSERT_APPROX_EQUAL(17.0833, expectedCard, kTolerance);
-// }
-
-// TODO SERVER-69857: Uncomment test.
-// TEST_F(HistogramTest, MaxDiffIntArrays) {
-// constexpr size_t nElems = 100;
-// constexpr size_t nBuckets = 10;
-// constexpr size_t seed = 0;
-
-// auto rawData = genRandomValueArray(nElems, 1.0, 0.0, seed);
-// auto arrayData = genRandomValueArrayWithArrays(rawData);
-
-// ArrayHistogram estimator = createArrayEstimator(arrayData, nBuckets);
-
-// auto opCtx = makeOperationContext();
-// {
-// const size_t actualCard =
-// getActualCard(opCtx.get(), arrayData, "[{$match: {a: {$eq: 2}}}]");
-
-// const auto [tag, val] = makeInt64Value(2);
-// value::ValueGuard vg(tag, val);
-// const double expectedCard = estimateCardEq(estimator, tag, val, true /* includeScalar
-// */);
-
-// ASSERT_APPROX_EQUAL(4.0, expectedCard, kTolerance);
-// ASSERT_EQ(4, actualCard);
-// }
-
-// {
-// const size_t actualCard =
-// getActualCard(opCtx.get(), arrayData, "[{$match: {a: {$lt: 3}}}]");
-
-// const auto [tag, val] = makeInt64Value(3);
-// value::ValueGuard vg(tag, val);
-// const double expectedCard = estimateCardRange(estimator,
-// false /*lowInclusive*/,
-// value::TypeTags::MinKey,
-// 0,
-// false /*highInclusive*/,
-// tag,
-// val,
-// true /* includeScalar */);
-
-// ASSERT_APPROX_EQUAL(4.6667, expectedCard, kTolerance);
-// ASSERT_EQ(6, actualCard);
-// }
-
-// {
-// const size_t actualCard = getActualCard(
-// opCtx.get(), arrayData, "[{$match: {a: {$elemMatch: {$gt: 2, $lt: 5}}}}]");
-
-// const auto [lowTag, lowVal] = makeInt64Value(2);
-// value::ValueGuard vgLow(lowTag, lowVal);
-// const auto [highTag, highVal] = makeInt64Value(5);
-// value::ValueGuard vgHigh(highTag, highVal);
-
-// const double expectedCard = estimateCardRange(estimator,
-// false /*lowInclusive*/,
-// lowTag,
-// lowVal,
-// false /*highInclusive*/,
-// highTag,
-// highVal,
-// false /* includeScalar */);
-
-// ASSERT_APPROX_EQUAL(1.76505, expectedCard, kTolerance);
-// ASSERT_EQ(3, actualCard);
-// }
-// }
+ auto data = genFixedValueArray(nElems, 1.0, 0.0);
+ auto opCtx = makeOperationContext();
+ const size_t actualCard = getActualCard(opCtx.get(), data, "[{$match: {a: {$lt: 10}}}]");
+
+ const ScalarHistogram& hist = makeHistogram(data, nBuckets);
+ std::cout << hist.toString();
+
+ ASSERT_LTE(hist.getBuckets().size(), nBuckets);
+ const double estimatedCard = estimateCard(hist, 11, EstimationType::kLess);
+
+ ASSERT_EQ(36, actualCard);
+ ASSERT_APPROX_EQUAL(39.73333, estimatedCard, kTolerance);
+}
+
+TEST_F(HistogramTest, MaxDiffTestString) {
+ constexpr size_t nElems = 100;
+ constexpr size_t nBuckets = 10;
+
+ auto randData = genFixedValueArray(nElems, 0.0, 1.0);
+ std::cout << "Generated " << nElems << " random values:\n"
+ << printValueArray(randData) << "\n"
+ << std::flush;
+
+ auto opCtx = makeOperationContext();
+ const size_t actualCard =
+ getActualCard(opCtx.get(), randData, "[{$match: {a: {$lt: '91YgOvBB'}}}]");
+
+ sortValueVector(randData);
+ const DataDistribution& dataDistrib = getDataDistribution(randData);
+
+ const ScalarHistogram& hist = genMaxDiffHistogram(dataDistrib, nBuckets);
+ std::cout << hist.toString();
+ ASSERT_LTE(hist.getBuckets().size(), nBuckets);
+
+ const auto [tag, val] = value::makeNewString("91YgOvBB"_sd);
+ value::ValueGuard vg(tag, val);
+ const double estimatedCard = estimate(hist, tag, val, EstimationType::kLess).card;
+
+ ASSERT_EQ(15, actualCard);
+ ASSERT_APPROX_EQUAL(10.9443, estimatedCard, kTolerance);
+}
+
+TEST_F(HistogramTest, MaxDiffTestMixedTypes) {
+ constexpr size_t nElems = 100;
+ constexpr size_t nBuckets = 10;
+
+ auto randData = genFixedValueArray(nElems, 0.5, 0.5);
+ std::cout << "Generated " << nElems << " random values:\n"
+ << printValueArray(randData) << "\n"
+ << std::flush;
+
+ auto opCtx = makeOperationContext();
+ const size_t actualCard = getActualCard(opCtx.get(), randData, "[{$match: {a: {$lt: 10}}}]");
+
+ sortValueVector(randData);
+ const DataDistribution& dataDistrib = getDataDistribution(randData);
+
+ const ScalarHistogram& hist = genMaxDiffHistogram(dataDistrib, nBuckets);
+ std::cout << hist.toString();
+ ASSERT_LTE(hist.getBuckets().size(), nBuckets);
+ const double estimatedCard = estimateCard(hist, 10, EstimationType::kLess);
+
+ ASSERT_EQ(18, actualCard);
+ ASSERT_APPROX_EQUAL(18.0, estimatedCard, kTolerance);
+}
+
+TEST_F(HistogramTest, MaxDiffIntArrays) {
+ constexpr size_t nElems = 100;
+ constexpr size_t nBuckets = 10;
+
+ auto rawData = genFixedValueArray(nElems, 1.0, 0.0);
+ auto arrayData = nestArrays(rawData);
+
+ ArrayHistogram estimator = createArrayEstimator(arrayData, nBuckets);
+
+ auto opCtx = makeOperationContext();
+ {
+ const size_t actualCard =
+ getActualCard(opCtx.get(), arrayData, "[{$match: {a: {$eq: 2}}}]");
+
+ const auto [tag, val] = makeInt64Value(2);
+ value::ValueGuard vg(tag, val);
+ const double estimatedCard = estimateCardEq(estimator, tag, val, true /* includeScalar
+ */);
+
+ ASSERT_APPROX_EQUAL(4.0, estimatedCard, kTolerance);
+ ASSERT_EQ(4, actualCard);
+ }
+
+ {
+ const size_t actualCard =
+ getActualCard(opCtx.get(), arrayData, "[{$match: {a: {$lt: 3}}}]");
+
+ const auto [tag, val] = makeInt64Value(3);
+ value::ValueGuard vg(tag, val);
+ const double estimatedCard = estimateCardRange(estimator,
+ false /*lowInclusive*/,
+ value::TypeTags::MinKey,
+ 0,
+ false /*highInclusive*/,
+ tag,
+ val,
+ true /* includeScalar */);
+ ASSERT_EQ(6, actualCard);
+ ASSERT_APPROX_EQUAL(6.0, estimatedCard, kTolerance);
+ }
+
+ {
+ const size_t actualCard = getActualCard(
+ opCtx.get(), arrayData, "[{$match: {a: {$elemMatch: {$gt: 2, $lt: 5}}}}]");
+
+ const auto [lowTag, lowVal] = makeInt64Value(2);
+ value::ValueGuard vgLow(lowTag, lowVal);
+ const auto [highTag, highVal] = makeInt64Value(5);
+ value::ValueGuard vgHigh(highTag, highVal);
+
+ const double estimatedCard = estimateCardRange(estimator,
+ false /*lowInclusive*/,
+ lowTag,
+ lowVal,
+ false /*highInclusive*/,
+ highTag,
+ highVal,
+ false /* includeScalar */);
+
+ ASSERT_EQ(2, actualCard);
+ ASSERT_APPROX_EQUAL(3.15479, estimatedCard, kTolerance);
+ }
+}
} // namespace
} // namespace mongo::ce::statistics
diff --git a/src/mongo/db/query/ce/rand_utils.cpp b/src/mongo/db/query/ce/rand_utils.cpp
index e4c4a75616d..05bfcf393df 100644
--- a/src/mongo/db/query/ce/rand_utils.cpp
+++ b/src/mongo/db/query/ce/rand_utils.cpp
@@ -226,8 +226,11 @@ void DatasetDescriptor::fillRandomArraySet() {
}
}
-///// Old implementation //////
-auto genRandomString(size_t len, std::mt19937_64& gen, size_t seed) {
+/**
+ Generate a random string. It is possible (even expected) that the same parameters
+ will generate different strings on successive calls
+*/
+std::string genRandomString(size_t len, std::mt19937_64& gen, size_t seed) {
std::string randStr;
randStr.reserve(len);
const constexpr char* kAlphabet =
@@ -243,6 +246,73 @@ auto genRandomString(size_t len, std::mt19937_64& gen, size_t seed) {
return randStr;
}
+/**
+ Generate a string. This string will be deterministic in that the same
+ parameters will always generate the same string, even on different platforms.
+*/
+std::string genString(size_t len, size_t seed) {
+ std::string str;
+ str.reserve(len);
+
+ const constexpr char* kAlphabet =
+ "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+ const int kAlphabetLength = strlen(kAlphabet);
+
+ unsigned long long rand = seed;
+ for (size_t i = 0; i < len; ++i) {
+ // Library implementations of rand vary by compiler, naturally, Since we still
+ // want the appearance of randomness, but consistency across compilers, we use a linear
+ // congruential generator to choose characters for the string. The parameters chosen
+ // are from Numerical Recipes. We use the upper 32 bits when calculating the character
+ // index, as the lower 32 are essentially nonrandom -- a weakness of LCGs in general.
+ rand = 3935559000370003845ULL * rand + 269134368944950781ULL;
+
+ int idx = (rand >> 32) % kAlphabetLength;
+ str += kAlphabet[idx];
+ }
+
+ return str;
+}
+
+/**
+ Generate an array of values with the required ratio of int to string. This array will be
+ deterministic in that the same parameters will always generate the same array, even on
+ different platforms.
+*/
+std::vector<SBEValue> genFixedValueArray(size_t nElems, double intRatio, double strRatio) {
+
+ std::vector<SBEValue> values;
+
+ const int intNDV = static_cast<int>(nElems) / 4;
+ for (size_t i = 0; i < std::round(nElems * intRatio); ++i) {
+ const auto [tag, val] = makeInt64Value((i % intNDV) + 1);
+ values.emplace_back(tag, val);
+ }
+
+ if (strRatio == 0.0) {
+ return values;
+ }
+
+ // Generate a set of strings so that each string can be chosen multiple times in the test
+ // data set.
+ const size_t strNDV = nElems / 5;
+ std::vector<std::string> stringSet;
+ stringSet.reserve(strNDV);
+ for (size_t i = 0; i < strNDV; ++i) {
+ const auto randStr = genString(8, i);
+ stringSet.push_back(randStr);
+ }
+
+ for (size_t i = 0; i < std::round(nElems * strRatio); ++i) {
+ size_t idx = i % stringSet.size();
+ const auto randStr = stringSet[idx];
+ const auto [tag, val] = value::makeNewString(randStr);
+ values.emplace_back(tag, val);
+ }
+
+ return values;
+}
+
std::vector<SBEValue> genRandomValueArray(size_t nElems,
double intRatio,
double strRatio,
@@ -273,14 +343,13 @@ std::vector<SBEValue> genRandomValueArray(size_t nElems,
size_t idx = idxDistr(gen);
const auto randStr = stringSet[idx];
const auto [tag, val] = value::makeNewString(randStr);
- const auto [copyTag, copyVal] = value::copyValue(tag, val);
- randValues.emplace_back(copyTag, copyVal);
+ randValues.emplace_back(tag, val);
}
return randValues;
}
-std::vector<SBEValue> genRandomValueArrayWithArrays(const std::vector<SBEValue>& input) {
+std::vector<SBEValue> nestArrays(const std::vector<SBEValue>& input) {
std::vector<SBEValue> result;
auto [arrayTag, arrayVal] = value::makeNewArray();
@@ -302,6 +371,15 @@ std::vector<SBEValue> genRandomValueArrayWithArrays(const std::vector<SBEValue>&
}
}
+ // It's possible that the array still contains something. If it's empty,
+ // we can safely release it. If not, append it to the result.
+ value::Array* arr = value::getArrayView(arrayVal);
+ if (arr->size() > 0) {
+ result.emplace_back(arrayTag, arrayVal);
+ } else {
+ value::releaseValue(arrayTag, arrayVal);
+ }
+
return result;
}
diff --git a/src/mongo/db/query/ce/rand_utils.h b/src/mongo/db/query/ce/rand_utils.h
index f7cc41c58b4..362d2c7eaec 100644
--- a/src/mongo/db/query/ce/rand_utils.h
+++ b/src/mongo/db/query/ce/rand_utils.h
@@ -143,25 +143,49 @@ private:
}; // namespace mongo::ce
/**
- Generate a random string of length len.
+ Generate a pseudorandom string of length n
* The alphabet is fixed as [0-9][a-z][A-Z]
* Characters are chosed uniformly from the alphabet
+ * Randomness is implemented such that it is independent of the platform,
+ i.e. given the same length and seed on any platform, we will produce the
+ same string.
+*/
+std::string genString(size_t len, size_t seed);
+
+/**
+ Generate a set of elements consisting of strings and ints in the
+ requested ratio. The generated array will contain the same values given the same
+ inputs on all platforms.
+ */
+std::vector<SBEValue> genFixedValueArray(size_t nElems, double intRatio, double strRatio);
+
+/**
+ Generate a random string of length len.
+ * The alphabet is fixed as [0-9][a-z][A-Z].
+ * Characters are chosed uniformly from the alphabet.
+ * Generated strings are likely to differ by platform, so derived values depending on them
+ are also likely to change.
*/
-auto genRandomString(size_t len, std::mt19937_64& gen, size_t seed);
+std::string genRandomString(size_t len, std::mt19937_64& gen, size_t seed);
+
/**
Generate a uniformly random set of elements consisting of string and ints in the
- requested ratio
+ requested ratio. The resulting array is very likely to differ between platforms, even
+ with the same seed. Thus, derived values are also likely to change.
+
+ Prefer genFixedValueArray when comparing derived values against constants.
*/
std::vector<SBEValue> genRandomValueArray(size_t nElems,
double intRatio,
double strRatio,
size_t seed);
+
/**
- Generate a set up values consisting of half scalars, and half arrays of length 10
+ Generate a set up values consisting of half scalars, and half arrays of length 10.
- Values contained in the result will be drawn from the input vector
+ Values contained in the result will be drawn from the input vector.
*/
-std::vector<SBEValue> genRandomValueArrayWithArrays(const std::vector<SBEValue>& input);
+std::vector<SBEValue> nestArrays(const std::vector<SBEValue>& input);
} // namespace mongo::ce