SERVER-69857 Repair maxdiff tests so they work correctly on Windows

author: Joel Redman <joel.redman@mongodb.com> 2022-09-26 14:53:50 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-09-26 15:23:14 +0000
commit: f5b97eee094fb08b59857aa592589c0a6b3089d2 (patch)
tree: 48ffa056dd9d2408e9194f283e664cb8d57b15bd
parent: defc79cac973ec5c08cbcccb44cfaa16af2a76c7 (diff)
download: mongo-f5b97eee094fb08b59857aa592589c0a6b3089d2.tar.gz
3 files changed, 245 insertions, 152 deletions
diff --git a/src/mongo/db/query/ce/maxdiff_histogram_test.cpp b/src/mongo/db/query/ce/maxdiff_histogram_test.cpp
index 142344d10fa..78c6490a0e9 100644
--- a/src/mongo/db/query/ce/maxdiff_histogram_test.cpp
+++ b/src/mongo/db/query/ce/maxdiff_histogram_test.cpp
@@ -116,148 +116,139 @@ TEST_F(HistogramTest, CreateFixed) {
     ASSERT_APPROX_EQUAL(50.0, estimateCard(hist, 50 * 10, EstimationType::kLess), kTolerance);
 }
 
-// TODO SERVER-69857: Uncomment test.
-// TEST_F(HistogramTest, MaxDiffTestInt) {
-//     constexpr size_t nElems = 100;
-//     constexpr size_t nBuckets = 10;
-//     constexpr size_t seed = 0;
+TEST_F(HistogramTest, MaxDiffTestInt) {
+    constexpr size_t nElems = 100;
+    constexpr size_t nBuckets = 10;
 
-//     std::vector<SBEValue> randData = genRandomValueArray(nElems, 1.0, 0.0, seed);
-
-//     auto opCtx = makeOperationContext();
-//     const size_t actualCard = getActualCard(opCtx.get(), randData, "[{$match: {a: {$lt: 10}}}]");
-
-//     const ScalarHistogram& hist = makeHistogram(randData, nBuckets);
-//     std::cout << hist.toString();
-
-//     ASSERT_GTE(hist.getBuckets().size(), nBuckets);
-//     const double expectedCard = estimateCard(hist, 10, EstimationType::kLess);
-
-//     ASSERT_EQ(36, actualCard);
-//     ASSERT_APPROX_EQUAL(32.8667, expectedCard, kTolerance);
-// }
-
-// TODO SERVER-69857: Uncomment test.
-// TEST_F(HistogramTest, MaxDiffTestString) {
-//     constexpr size_t nElems = 100;
-//     constexpr size_t nBuckets = 10;
-//     constexpr size_t seed = 0;
-
-//     auto randData = genRandomValueArray(nElems, 0.0, 1.0, seed);
-//     std::cout << "Generated " << nElems << " random values:\n"
-//               << printValueArray(randData) << "\n"
-//               << std::flush;
-
-//     auto opCtx = makeOperationContext();
-//     const size_t actualCard =
-//         getActualCard(opCtx.get(), randData, "[{$match: {a: {$lt: '91YgOvBB'}}}]");
-
-//     sortValueVector(randData);
-//     const DataDistribution& dataDistrib = getDataDistribution(randData);
-
-//     const ScalarHistogram& hist = genMaxDiffHistogram(dataDistrib, nBuckets);
-//     std::cout << hist.toString();
-//     ASSERT_GTE(hist.getBuckets().size(), nBuckets);
-
-//     const auto [tag, val] = value::makeNewString("91YgOvBB"_sd);
-//     value::ValueGuard vg(tag, val);
-//     const double expectedCard = estimate(hist, tag, val, EstimationType::kLess).card;
-
-//     ASSERT_EQ(13, actualCard);
-//     ASSERT_APPROX_EQUAL(11.0783, expectedCard, kTolerance);
-// }
-
-// TEST_F(HistogramTest, MaxDiffTestMixedTypes) {
-//     constexpr size_t nElems = 100;
-//     constexpr size_t nBuckets = 10;
-//     constexpr size_t seed = 0;
-
-//     auto randData = genRandomValueArray(nElems, 0.5, 0.5, seed);
-//     std::cout << "Generated " << nElems << " random values:\n"
-//               << printValueArray(randData) << "\n"
-//               << std::flush;
-
-//     auto opCtx = makeOperationContext();
-//     const size_t actualCard = getActualCard(opCtx.get(), randData, "[{$match: {a: {$lt: 10}}}]");
-
-//     sortValueVector(randData);
-//     const DataDistribution& dataDistrib = getDataDistribution(randData);
-
-//     const ScalarHistogram& hist = genMaxDiffHistogram(dataDistrib, nBuckets);
-//     std::cout << hist.toString();
-//     ASSERT_GTE(hist.getBuckets().size(), nBuckets);
-//     const double expectedCard = estimateCard(hist, 10, EstimationType::kLess);
-
-//     ASSERT_EQ(18, actualCard);
-//     ASSERT_APPROX_EQUAL(17.0833, expectedCard, kTolerance);
-// }
-
-// TODO SERVER-69857: Uncomment test.
-// TEST_F(HistogramTest, MaxDiffIntArrays) {
-//     constexpr size_t nElems = 100;
-//     constexpr size_t nBuckets = 10;
-//     constexpr size_t seed = 0;
-
-//     auto rawData = genRandomValueArray(nElems, 1.0, 0.0, seed);
-//     auto arrayData = genRandomValueArrayWithArrays(rawData);
-
-//     ArrayHistogram estimator = createArrayEstimator(arrayData, nBuckets);
-
-//     auto opCtx = makeOperationContext();
-//     {
-//         const size_t actualCard =
-//             getActualCard(opCtx.get(), arrayData, "[{$match: {a: {$eq: 2}}}]");
-
-//         const auto [tag, val] = makeInt64Value(2);
-//         value::ValueGuard vg(tag, val);
-//         const double expectedCard = estimateCardEq(estimator, tag, val, true /* includeScalar
-//         */);
-
-//         ASSERT_APPROX_EQUAL(4.0, expectedCard, kTolerance);
-//         ASSERT_EQ(4, actualCard);
-//     }
-
-//     {
-//         const size_t actualCard =
-//             getActualCard(opCtx.get(), arrayData, "[{$match: {a: {$lt: 3}}}]");
-
-//         const auto [tag, val] = makeInt64Value(3);
-//         value::ValueGuard vg(tag, val);
-//         const double expectedCard = estimateCardRange(estimator,
-//                                                       false /*lowInclusive*/,
-//                                                       value::TypeTags::MinKey,
-//                                                       0,
-//                                                       false /*highInclusive*/,
-//                                                       tag,
-//                                                       val,
-//                                                       true /* includeScalar */);
-
-//         ASSERT_APPROX_EQUAL(4.6667, expectedCard, kTolerance);
-//         ASSERT_EQ(6, actualCard);
-//     }
-
-//     {
-//         const size_t actualCard = getActualCard(
-//             opCtx.get(), arrayData, "[{$match: {a: {$elemMatch: {$gt: 2, $lt: 5}}}}]");
-
-//         const auto [lowTag, lowVal] = makeInt64Value(2);
-//         value::ValueGuard vgLow(lowTag, lowVal);
-//         const auto [highTag, highVal] = makeInt64Value(5);
-//         value::ValueGuard vgHigh(highTag, highVal);
-
-//         const double expectedCard = estimateCardRange(estimator,
-//                                                       false /*lowInclusive*/,
-//                                                       lowTag,
-//                                                       lowVal,
-//                                                       false /*highInclusive*/,
-//                                                       highTag,
-//                                                       highVal,
-//                                                       false /* includeScalar */);
-
-//         ASSERT_APPROX_EQUAL(1.76505, expectedCard, kTolerance);
-//         ASSERT_EQ(3, actualCard);
-//     }
-// }
+    auto data = genFixedValueArray(nElems, 1.0, 0.0);
+    auto opCtx = makeOperationContext();
+    const size_t actualCard = getActualCard(opCtx.get(), data, "[{$match: {a: {$lt: 10}}}]");
+
+    const ScalarHistogram& hist = makeHistogram(data, nBuckets);
+    std::cout << hist.toString();
+
+    ASSERT_LTE(hist.getBuckets().size(), nBuckets);
+    const double estimatedCard = estimateCard(hist, 11, EstimationType::kLess);
+
+    ASSERT_EQ(36, actualCard);
+    ASSERT_APPROX_EQUAL(39.73333, estimatedCard, kTolerance);
+}
+
+TEST_F(HistogramTest, MaxDiffTestString) {
+    constexpr size_t nElems = 100;
+    constexpr size_t nBuckets = 10;
+
+    auto randData = genFixedValueArray(nElems, 0.0, 1.0);
+    std::cout << "Generated " << nElems << " random values:\n"
+              << printValueArray(randData) << "\n"
+              << std::flush;
+
+    auto opCtx = makeOperationContext();
+    const size_t actualCard =
+        getActualCard(opCtx.get(), randData, "[{$match: {a: {$lt: '91YgOvBB'}}}]");
+
+    sortValueVector(randData);
+    const DataDistribution& dataDistrib = getDataDistribution(randData);
+
+    const ScalarHistogram& hist = genMaxDiffHistogram(dataDistrib, nBuckets);
+    std::cout << hist.toString();
+    ASSERT_LTE(hist.getBuckets().size(), nBuckets);
+
+    const auto [tag, val] = value::makeNewString("91YgOvBB"_sd);
+    value::ValueGuard vg(tag, val);
+    const double estimatedCard = estimate(hist, tag, val, EstimationType::kLess).card;
+
+    ASSERT_EQ(15, actualCard);
+    ASSERT_APPROX_EQUAL(10.9443, estimatedCard, kTolerance);
+}
+
+TEST_F(HistogramTest, MaxDiffTestMixedTypes) {
+    constexpr size_t nElems = 100;
+    constexpr size_t nBuckets = 10;
+
+    auto randData = genFixedValueArray(nElems, 0.5, 0.5);
+    std::cout << "Generated " << nElems << " random values:\n"
+              << printValueArray(randData) << "\n"
+              << std::flush;
+
+    auto opCtx = makeOperationContext();
+    const size_t actualCard = getActualCard(opCtx.get(), randData, "[{$match: {a: {$lt: 10}}}]");
+
+    sortValueVector(randData);
+    const DataDistribution& dataDistrib = getDataDistribution(randData);
+
+    const ScalarHistogram& hist = genMaxDiffHistogram(dataDistrib, nBuckets);
+    std::cout << hist.toString();
+    ASSERT_LTE(hist.getBuckets().size(), nBuckets);
+    const double estimatedCard = estimateCard(hist, 10, EstimationType::kLess);
+
+    ASSERT_EQ(18, actualCard);
+    ASSERT_APPROX_EQUAL(18.0, estimatedCard, kTolerance);
+}
+
+TEST_F(HistogramTest, MaxDiffIntArrays) {
+    constexpr size_t nElems = 100;
+    constexpr size_t nBuckets = 10;
+
+    auto rawData = genFixedValueArray(nElems, 1.0, 0.0);
+    auto arrayData = nestArrays(rawData);
+
+    ArrayHistogram estimator = createArrayEstimator(arrayData, nBuckets);
+
+    auto opCtx = makeOperationContext();
+    {
+        const size_t actualCard =
+            getActualCard(opCtx.get(), arrayData, "[{$match: {a: {$eq: 2}}}]");
+
+        const auto [tag, val] = makeInt64Value(2);
+        value::ValueGuard vg(tag, val);
+        const double estimatedCard = estimateCardEq(estimator, tag, val, true /* includeScalar
+        */);
+
+        ASSERT_APPROX_EQUAL(4.0, estimatedCard, kTolerance);
+        ASSERT_EQ(4, actualCard);
+    }
+
+    {
+        const size_t actualCard =
+            getActualCard(opCtx.get(), arrayData, "[{$match: {a: {$lt: 3}}}]");
+
+        const auto [tag, val] = makeInt64Value(3);
+        value::ValueGuard vg(tag, val);
+        const double estimatedCard = estimateCardRange(estimator,
+                                                       false /*lowInclusive*/,
+                                                       value::TypeTags::MinKey,
+                                                       0,
+                                                       false /*highInclusive*/,
+                                                       tag,
+                                                       val,
+                                                       true /* includeScalar */);
+        ASSERT_EQ(6, actualCard);
+        ASSERT_APPROX_EQUAL(6.0, estimatedCard, kTolerance);
+    }
+
+    {
+        const size_t actualCard = getActualCard(
+            opCtx.get(), arrayData, "[{$match: {a: {$elemMatch: {$gt: 2, $lt: 5}}}}]");
+
+        const auto [lowTag, lowVal] = makeInt64Value(2);
+        value::ValueGuard vgLow(lowTag, lowVal);
+        const auto [highTag, highVal] = makeInt64Value(5);
+        value::ValueGuard vgHigh(highTag, highVal);
+
+        const double estimatedCard = estimateCardRange(estimator,
+                                                       false /*lowInclusive*/,
+                                                       lowTag,
+                                                       lowVal,
+                                                       false /*highInclusive*/,
+                                                       highTag,
+                                                       highVal,
+                                                       false /* includeScalar */);
+
+        ASSERT_EQ(2, actualCard);
+        ASSERT_APPROX_EQUAL(3.15479, estimatedCard, kTolerance);
+    }
+}
 
 }  // namespace
 }  // namespace mongo::ce::statistics
diff --git a/src/mongo/db/query/ce/rand_utils.cpp b/src/mongo/db/query/ce/rand_utils.cpp
index e4c4a75616d..05bfcf393df 100644
--- a/src/mongo/db/query/ce/rand_utils.cpp
+++ b/src/mongo/db/query/ce/rand_utils.cpp
@@ -226,8 +226,11 @@ void DatasetDescriptor::fillRandomArraySet() {
     }
 }
 
-///// Old implementation //////
-auto genRandomString(size_t len, std::mt19937_64& gen, size_t seed) {
+/**
+    Generate a random string. It is possible (even expected) that the same parameters
+    will generate different strings on successive calls
+*/
+std::string genRandomString(size_t len, std::mt19937_64& gen, size_t seed) {
     std::string randStr;
     randStr.reserve(len);
     const constexpr char* kAlphabet =
@@ -243,6 +246,73 @@ auto genRandomString(size_t len, std::mt19937_64& gen, size_t seed) {
     return randStr;
 }
 
+/**
+    Generate a string. This string will be deterministic in that the same
+    parameters will always generate the same string, even on different platforms.
+*/
+std::string genString(size_t len, size_t seed) {
+    std::string str;
+    str.reserve(len);
+
+    const constexpr char* kAlphabet =
+        "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+    const int kAlphabetLength = strlen(kAlphabet);
+
+    unsigned long long rand = seed;
+    for (size_t i = 0; i < len; ++i) {
+        // Library implementations of rand vary by compiler, naturally, Since we still
+        // want the appearance of randomness, but consistency across compilers, we use a linear
+        // congruential generator to choose characters for the string. The parameters chosen
+        // are from Numerical Recipes. We use the upper 32 bits when calculating the character
+        // index, as the lower 32 are essentially nonrandom -- a weakness of LCGs in general.
+        rand = 3935559000370003845ULL * rand + 269134368944950781ULL;
+
+        int idx = (rand >> 32) % kAlphabetLength;
+        str += kAlphabet[idx];
+    }
+
+    return str;
+}
+
+/**
+    Generate an array of values with the required ratio of int to string. This array will be
+    deterministic in that the same parameters will always generate the same array, even on
+    different platforms.
+*/
+std::vector<SBEValue> genFixedValueArray(size_t nElems, double intRatio, double strRatio) {
+
+    std::vector<SBEValue> values;
+
+    const int intNDV = static_cast<int>(nElems) / 4;
+    for (size_t i = 0; i < std::round(nElems * intRatio); ++i) {
+        const auto [tag, val] = makeInt64Value((i % intNDV) + 1);
+        values.emplace_back(tag, val);
+    }
+
+    if (strRatio == 0.0) {
+        return values;
+    }
+
+    // Generate a set of strings so that each string can be chosen multiple times in the test
+    // data set.
+    const size_t strNDV = nElems / 5;
+    std::vector<std::string> stringSet;
+    stringSet.reserve(strNDV);
+    for (size_t i = 0; i < strNDV; ++i) {
+        const auto randStr = genString(8, i);
+        stringSet.push_back(randStr);
+    }
+
+    for (size_t i = 0; i < std::round(nElems * strRatio); ++i) {
+        size_t idx = i % stringSet.size();
+        const auto randStr = stringSet[idx];
+        const auto [tag, val] = value::makeNewString(randStr);
+        values.emplace_back(tag, val);
+    }
+
+    return values;
+}
+
 std::vector<SBEValue> genRandomValueArray(size_t nElems,
                                           double intRatio,
                                           double strRatio,
@@ -273,14 +343,13 @@ std::vector<SBEValue> genRandomValueArray(size_t nElems,
         size_t idx = idxDistr(gen);
         const auto randStr = stringSet[idx];
         const auto [tag, val] = value::makeNewString(randStr);
-        const auto [copyTag, copyVal] = value::copyValue(tag, val);
-        randValues.emplace_back(copyTag, copyVal);
+        randValues.emplace_back(tag, val);
     }
 
     return randValues;
 }
 
-std::vector<SBEValue> genRandomValueArrayWithArrays(const std::vector<SBEValue>& input) {
+std::vector<SBEValue> nestArrays(const std::vector<SBEValue>& input) {
     std::vector<SBEValue> result;
     auto [arrayTag, arrayVal] = value::makeNewArray();
 
@@ -302,6 +371,15 @@ std::vector<SBEValue> genRandomValueArrayWithArrays(const std::vector<SBEValue>&
         }
     }
 
+    // It's possible that the array still contains something. If it's empty,
+    // we can safely release it. If not, append it to the result.
+    value::Array* arr = value::getArrayView(arrayVal);
+    if (arr->size() > 0) {
+        result.emplace_back(arrayTag, arrayVal);
+    } else {
+        value::releaseValue(arrayTag, arrayVal);
+    }
+
     return result;
 }
 
diff --git a/src/mongo/db/query/ce/rand_utils.h b/src/mongo/db/query/ce/rand_utils.h
index f7cc41c58b4..362d2c7eaec 100644
--- a/src/mongo/db/query/ce/rand_utils.h
+++ b/src/mongo/db/query/ce/rand_utils.h
@@ -143,25 +143,49 @@ private:
 };  // namespace mongo::ce
 
 /**
-    Generate a random string of length len.
+    Generate a pseudorandom string of length n
     * The alphabet is fixed as [0-9][a-z][A-Z]
     * Characters are chosed uniformly from the alphabet
+    * Randomness is implemented such that it is independent of the platform,
+        i.e. given the same length and seed on any platform, we will produce the
+        same string.
+*/
+std::string genString(size_t len, size_t seed);
+
+/**
+    Generate a set of elements consisting of strings and ints in the
+    requested ratio. The generated array will contain the same values given the same
+    inputs on all platforms.
+ */
+std::vector<SBEValue> genFixedValueArray(size_t nElems, double intRatio, double strRatio);
+
+/**
+    Generate a random string of length len.
+    * The alphabet is fixed as [0-9][a-z][A-Z].
+    * Characters are chosed uniformly from the alphabet.
+    * Generated strings are likely to differ by platform, so derived values depending on them
+      are also likely to change.
  */
-auto genRandomString(size_t len, std::mt19937_64& gen, size_t seed);
+std::string genRandomString(size_t len, std::mt19937_64& gen, size_t seed);
+
 
 /**
     Generate a uniformly random set of elements consisting of string and ints in the
-    requested ratio
+    requested ratio. The resulting array is very likely to differ between platforms, even
+    with the same seed. Thus, derived values are also likely to change.
+
+    Prefer genFixedValueArray when comparing derived values against constants.
  */
 std::vector<SBEValue> genRandomValueArray(size_t nElems,
                                           double intRatio,
                                           double strRatio,
                                           size_t seed);
+
 /**
-    Generate a set up values consisting of half scalars, and half arrays of length 10
+    Generate a set up values consisting of half scalars, and half arrays of length 10.
 
-    Values contained in the result will be drawn from the input vector
+    Values contained in the result will be drawn from the input vector.
  */
-std::vector<SBEValue> genRandomValueArrayWithArrays(const std::vector<SBEValue>& input);
+std::vector<SBEValue> nestArrays(const std::vector<SBEValue>& input);
 
 }  // namespace mongo::ce
author	Joel Redman <joel.redman@mongodb.com>	2022-09-26 14:53:50 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2022-09-26 15:23:14 +0000
commit	f5b97eee094fb08b59857aa592589c0a6b3089d2 (patch)
tree	48ffa056dd9d2408e9194f283e664cb8d57b15bd
parent	defc79cac973ec5c08cbcccb44cfaa16af2a76c7 (diff)
download	mongo-f5b97eee094fb08b59857aa592589c0a6b3089d2.tar.gz