summary refs log tree commit diff
path: root/src/mongo/db/pipeline/accumulator_test.cpp
diff options
context:
space:
mode:
author Ruoxin Xu <ruoxin.xu@mongodb.com> 2021-03-09 15:55:19 +0000
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-03-17 23:47:47 +0000
commit 65faf38974f63900b1dcdc6fd1994ce3255911d4 (patch)
tree bc4527e60d6054279b2d4aa7035ec2b7585c0026 /src/mongo/db/pipeline/accumulator_test.cpp
parent 1640bbb7d487ef65722d98ac830090c82b7f7ee5 (diff)
download mongo-65faf38974f63900b1dcdc6fd1994ce3255911d4.tar.gz
SERVER-54240 Implement $covariance accumulator-only
Diffstat (limited to 'src/mongo/db/pipeline/accumulator_test.cpp')
-rw-r--r-- src/mongo/db/pipeline/accumulator_test.cpp 141
1 files changed, 141 insertions, 0 deletions
diff --git a/src/mongo/db/pipeline/accumulator_test.cpp b/src/mongo/db/pipeline/accumulator_test.cpp
index 9a08f5984d2..fe9fa550b6f 100644
--- a/src/mongo/db/pipeline/accumulator_test.cpp
+++ b/src/mongo/db/pipeline/accumulator_test.cpp
@@ -31,6 +31,7 @@
#include "mongo/platform/basic.h"
+#include <cmath>
#include <memory>
#include "mongo/db/exec/document_value/document.h"
@@ -431,6 +432,146 @@ TEST(Accumulators, PushRespectsMaxMemoryConstraint) {
ErrorCodes::ExceededMemoryLimit);
}
+/* ------------------------- AccumulatorCovariance(Samp/Pop) -------------------------- */
+
+// Reference implementation of covariance using the offline (two-pass) algorithm: the first pass
+// computes the means of x and y, the second pass sums the products of the deviations from those
+// means. Each element of 'input' is read as an array whose entries 0 and 1 are x and y. The sum
+// is divided by n - 1 when 'isSamp' is true and by n otherwise.
+double offlineCovariance(const std::vector<Value>& input, bool isSamp) {
+    const auto count = input.size();
+
+    // Fewer than two points yields 0. 'input' should not be empty here; the empty case is tested
+    // elsewhere.
+    if (count < 2) {
+        return 0;
+    }
+
+    // First pass: accumulate the coordinate sums and turn them into means.
+    double sumX = 0;
+    double sumY = 0;
+    for (const auto& point : input) {
+        const auto& xy = point.getArray();
+        sumX += xy[0].coerceToDouble();
+        sumY += xy[1].coerceToDouble();
+    }
+    const double meanX = sumX / count;
+    const double meanY = sumY / count;
+
+    // Second pass: accumulate the co-moment around the means.
+    double coMoment = 0;
+    for (const auto& point : input) {
+        const auto& xy = point.getArray();
+        coMoment += (xy[0].coerceToDouble() - meanX) * (xy[1].coerceToDouble() - meanY);
+    }
+
+    const double divisor = isSamp ? count - 1 : count;
+    return coMoment / divisor;
+}
+
+// Feeds every element of 'input' into a freshly created 'AccName' accumulator and asserts that
+// the accumulator-output covariance (using an online algorithm:
+// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online) is within 1e-10 of
+// the covariance calculated by offlineCovariance() based on the offline algorithm
+// (cov(x,y) = Σ((xi-avg(x))*(yi-avg(y)))/n).
+// If 'result' is given, the accumulator output must additionally be within 1e-5 of it.
+template <typename AccName>
+static void assertCovariance(ExpressionContext* const expCtx,
+                             const std::vector<Value>& input,
+                             boost::optional<double> result = boost::none) {
+    auto accum = AccName::create(expCtx);
+    for (auto&& val : input) {
+        accum->process(val, false);
+    }
+    const double onlineCov = accum->getValue(false).coerceToDouble();
+
+    // The reference implementation divides by n - 1 for the sample accumulator and by n
+    // otherwise, so select the flavour from the accumulator type.
+    const double offlineCov =
+        offlineCovariance(input, std::is_same_v<AccName, AccumulatorCovarianceSamp>);
+
+    // Use the std-qualified overload: <cmath> only guarantees the names in namespace std.
+    ASSERT_LTE(std::fabs(onlineCov - offlineCov), 1e-10);
+    if (result) {
+        ASSERT_LTE(std::fabs(onlineCov - *result), 1e-5);
+    }
+}
+
+// Checks the outputs of both covariance accumulators for empty input, a single point, and points
+// whose coordinates are all NaN.
+TEST(Accumulators, CovarianceEdgeCases) {
+    ExpressionContextForTest expCtx{};
+
+    // One (x, y) observation: the population covariance is expected to be 0, while the sample
+    // covariance is undefined and expected to be null.
+    const std::vector<Value> singlePoint = {
+        Value(std::vector<Value>{Value(0), Value(1)}),
+    };
+
+    // Two observations whose x and y are both NaN.
+    const auto nanValue = Value(numeric_limits<double>::quiet_NaN());
+    const auto nanPoint = Value(std::vector<Value>{nanValue, nanValue});
+    const std::vector<Value> nanPoints = {nanPoint, nanPoint};
+
+    assertExpectedResults<AccumulatorCovariancePop>(
+        &expCtx,
+        {
+            {{}, Value(BSONNULL)},
+            {singlePoint, Value(0.0)},
+            {nanPoints, nanValue},
+        },
+        true /* Covariance accumulators can't be merged */);
+
+    assertExpectedResults<AccumulatorCovarianceSamp>(
+        &expCtx,
+        {
+            {{}, Value(BSONNULL)},
+            {singlePoint, Value(BSONNULL)},
+            {nanPoints, nanValue},
+        },
+        true /* Covariance accumulators can't be merged */);
+}
+
+// Population covariance of three fixed points, checked against both the offline reference
+// computation and a precomputed expected value.
+TEST(Accumulators, PopulationCovariance) {
+    ExpressionContextForTest expCtx{};
+
+    // Three [x, y] observations mixing an integer and doubles.
+    const std::vector<Value> xyPoints = {
+        Value(std::vector<Value>{Value(0), Value(1.5)}),
+        Value(std::vector<Value>{Value(1.4), Value(2.5)}),
+        Value(std::vector<Value>{Value(4.7), Value(3.6)}),
+    };
+
+    // Precomputed population covariance of the points above.
+    const double expectedCovariance = 1.655556;
+    assertCovariance<AccumulatorCovariancePop>(&expCtx, xyPoints, expectedCovariance);
+}
+
+// Sample covariance of three fixed points, checked against both the offline reference
+// computation and a precomputed expected value.
+TEST(Accumulators, SampleCovariance) {
+    auto expCtx = ExpressionContextForTest{};
+
+    // Some doubles as input. The fixture is never modified, so it is declared const like the one
+    // in the population covariance test.
+    const std::vector<Value> multiplePoints = {
+        Value(std::vector<Value>({Value(0), Value(1.5)})),
+        Value(std::vector<Value>({Value(1.4), Value(2.5)})),
+        Value(std::vector<Value>({Value(4.7), Value(3.6)})),
+    };
+
+    // Test both offline and online covariance algorithms with a given result.
+    assertCovariance<AccumulatorCovarianceSamp>(&expCtx, multiplePoints, 2.483334);
+}
+
+// Builds a randomly sized list of [x, y] points for the covariance tests. The PRNG is seeded
+// from the current time and the seed is logged (presumably so that a failing run can be
+// reproduced).
+std::vector<Value> generateRandomVariables() {
+    auto seed = Date_t::now().asInt64();
+    LOGV2(5424001, "Generated new seed is {seed}", "seed"_attr = seed);
+
+    PseudoRandom prng(seed);
+    // The "+ 2" keeps the point count at two or more, assuming nextInt32(1000) is non-negative.
+    const int variableSize = prng.nextInt32(1000) + 2;
+
+    std::vector<Value> output;
+    // The final size is known up front, so allocate once instead of growing repeatedly.
+    output.reserve(variableSize);
+
+    for (int i = 0; i < variableSize; i++) {
+        // Draw x before y so the PRNG is consumed in the same order as before.
+        const auto x = prng.nextCanonicalDouble();
+        const auto y = prng.nextCanonicalDouble();
+        // Build the point as a temporary so no named vector has to be copied into the Value.
+        output.push_back(Value(std::vector<Value>{Value(x), Value(y)}));
+    }
+
+    return output;
+}
+
+// Cross-checks both covariance accumulators against the offline reference computation on
+// randomly generated points. No precomputed expected value exists for random input.
+TEST(Accumulators, CovarianceWithRandomVariables) {
+    ExpressionContextForTest expCtx{};
+
+    // The same random points are fed to the population and the sample accumulator.
+    const std::vector<Value> randomPoints = generateRandomVariables();
+
+    assertCovariance<AccumulatorCovariancePop>(&expCtx, randomPoints, boost::none);
+    assertCovariance<AccumulatorCovarianceSamp>(&expCtx, randomPoints, boost::none);
+}
+
/* ------------------------- AccumulatorMergeObjects -------------------------- */
TEST(AccumulatorMergeObjects, MergingZeroObjectsShouldReturnEmptyDocument) {