SERVER-74297 Initial implementation of $percentile accumulator

author: Irina Yatsenko <irina.yatsenko@mongodb.com> 2023-03-03 17:49:27 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2023-03-03 23:00:36 +0000
commit: 23a26b5dad45d12187b61a1e5e49c60c5be26632 (patch)
tree: 15c189d6cb1488a4fbc4b0d676a8d0792cf0e10d
parent: 4f07b59413fd6053ad8033934c4312fecbcbd5d9 (diff)
download: mongo-23a26b5dad45d12187b61a1e5e49c60c5be26632.tar.gz
7 files changed, 620 insertions, 0 deletions
diff --git a/jstests/aggregation/accumulators/percentiles_approx.js b/jstests/aggregation/accumulators/percentiles_approx.js
new file mode 100644
index 00000000000..227b15ca1df
--- /dev/null
+++ b/jstests/aggregation/accumulators/percentiles_approx.js
@@ -0,0 +1,89 @@
+/**
+ * Tests for the approximate percentile accumulator semantics.
+ * @tags: [
+ *   featureFlagApproxPercentiles,
+ *   # sharded collections aren't supported yet
+ *   assumes_unsharded_collection,
+ * ]
+ */
+(function() {
+"use strict";
+
+load("jstests/aggregation/extras/utils.js");
+
+const coll = db[jsTestName()];
+
+/**
+ * Tests for correctness without grouping. Each group gets its own accumulator so we can validate
+ * the basic $percentile functionality using a single group.
+ */
+function testWithSingleGroup({docs, percentileSpec, expectedResult, msg}) {
+    coll.drop();
+    coll.insertMany(docs);
+    const res = coll.aggregate([{$group: {_id: null, p: percentileSpec}}]).toArray();
+
+    // For $percentile the result should be ordered to match the spec, so assert exact equality.
+    assert.eq(expectedResult, res[0].p, msg + ` result: ${tojson(res)}`);
+}
+
+testWithSingleGroup({
+    docs: [{x: 0}, {x: "non-numeric"}, {x: 1}, {no_x: 0}, {x: 2}],
+    percentileSpec: {$percentile: {p: [0.5], input: "$x", algorithm: "approximate"}},
+    expectedResult: [1],
+    msg: "Non-numeric data should be ignored"
+});
+
+testWithSingleGroup({
+    docs: [{x: "non-numeric"}, {no_x: 0}],
+    percentileSpec: {$percentile: {p: [0.5], input: "$x", algorithm: "approximate"}},
+    expectedResult: null,
+    msg: "Percentile of completely non-numeric data"
+});
+
+testWithSingleGroup({
+    docs: [{x: "non-numeric"}, {no_x: 0}],
+    percentileSpec: {$percentile: {p: [0.5, 0.9], input: "$x", algorithm: "approximate"}},
+    expectedResult: null,
+    msg: "Multiple percentiles of completely non-numeric data"
+});
+
+testWithSingleGroup({
+    docs: [{x: 10}, {x: 5}, {x: 27}],
+    percentileSpec: {$percentile: {p: [0], input: "$x", algorithm: "approximate"}},
+    expectedResult: [5],
+    msg: "Minimun percentile"
+});
+
+testWithSingleGroup({
+    docs: [{x: 10}, {x: 5}, {x: 27}],
+    percentileSpec: {$percentile: {p: [1], input: "$x", algorithm: "approximate"}},
+    expectedResult: [27],
+    msg: "Maximum percentile"
+});
+
+testWithSingleGroup({
+    docs: [{x: 0}, {x: 1}, {x: 2}],
+    percentileSpec: {$percentile: {p: [0.5, 0.9, 0.1], input: "$x", algorithm: "approximate"}},
+    expectedResult: [1, 2, 0],
+    msg: "Multiple percentiles"
+});
+
+function testWithMultipleGroups({docs, percentileSpec, expectedResult, msg}) {
+    coll.drop();
+    coll.insertMany(docs);
+    const res =
+        coll.aggregate([{$group: {_id: "$k", p: percentileSpec}}, {$sort: {_id: 1}}]).toArray();
+
+    // For $percentile the result should be ordered to match the spec, so assert exact equality.
+    for (let i = 0; i < res.length; i++) {
+        assert.eq(expectedResult[i], res[i].p, msg + ` result: ${tojson(res)}`);
+    }
+}
+
+testWithMultipleGroups({
+    docs: [{k: 0, x: 0}, {k: 0, x: 1}, {k: 1, x: 2}, {k: 2}, {k: 0, x: "str"}, {k: 1, x: 0}],
+    percentileSpec: {$percentile: {p: [0.9], input: "$x", algorithm: "approximate"}},
+    expectedResult: [/* k:0 */[1], /* k:1 */[2], /* k:2 */ null],
+    msg: "Multiple groups"
+});
+})();
diff --git a/jstests/aggregation/accumulators/percentiles_syntax.js b/jstests/aggregation/accumulators/percentiles_syntax.js
new file mode 100644
index 00000000000..ed5e1d12e1f
--- /dev/null
+++ b/jstests/aggregation/accumulators/percentiles_syntax.js
@@ -0,0 +1,106 @@
+/**
+ * Tests for the $percentile accumulator syntax.
+ * @tags: [
+ *   featureFlagApproxPercentiles,
+ *   # sharded collections aren't supported yet
+ *   assumes_unsharded_collection,
+ * ]
+ */
+(function() {
+"use strict";
+
+load("jstests/aggregation/extras/utils.js");
+
+const coll = db[jsTestName()];
+coll.drop();
+// These tests don't validate the computed $percentile but we need a result to be produced in
+// order to check its format.
+coll.insert({x: 42});
+
+/**
+ * Tests to check that invalid $percentile specifications are rejected.
+ */
+function assertInvalidSyntax(percentileSpec, msg) {
+    assert.commandFailed(
+        coll.runCommand("aggregate",
+                        {pipeline: [{$group: {_id: null, p: percentileSpec}}], cursor: {}}),
+        msg);
+}
+
+assertInvalidSyntax({$percentile: 0.5}, "Should fail if $percentile is not an object");
+
+assertInvalidSyntax({$percentile: {input: "$x", algorithm: "approximate"}},
+                    "Should fail if $percentile is missing 'p' field");
+
+assertInvalidSyntax({$percentile: {p: [0.5], algorithm: "approximate"}},
+                    "Should fail if $percentile is missing 'input' field");
+
+assertInvalidSyntax({$percentile: {p: [0.5], input: "$x"}},
+                    "Should fail if $percentile is missing 'algorithm' field");
+
+assertInvalidSyntax({$percentile: {p: [0.5], input: "$x", algorithm: "approximate", extras: 42}},
+                    "Should fail if $percentile contains an unexpected field");
+
+assertInvalidSyntax({$percentile: {p: 0.5, input: "$x", algorithm: "approximate"}},
+                    "Should fail if 'p' field in $percentile isn't array");
+
+assertInvalidSyntax({$percentile: {p: [], input: "$x", algorithm: "approximate"}},
+                    "Should fail if 'p' field in $percentile is an empty array");
+
+assertInvalidSyntax(
+    {$percentile: {p: [0.5, "foo"], input: "$x", algorithm: "approximate"}},
+    "Should fail if 'p' field in $percentile is an array with a non-numeric element");
+
+assertInvalidSyntax(
+    {$percentile: {p: [0.5, 10], input: "$x", algorithm: "approximate"}},
+    "Should fail if 'p' field in $percentile is an array with any value outside of [0, 1] range");
+
+assertInvalidSyntax({$percentile: {p: [0.5, 0.7], input: "$x", algorithm: 42}},
+                    "Should fail if 'algorithm' field isn't a string");
+
+assertInvalidSyntax({$percentile: {p: [0.5, 0.7], input: "$x", algorithm: "fancy"}},
+                    "Should fail if 'algorithm' isn't one of _predefined_ strings");
+
+/**
+ * Test that valid $percentile specifications are accepted. The results, i.e. semantics, are tested
+ * elsewhere and would cover all of the cases below, we are providing them here nonetheless for
+ * completeness.
+ */
+function assertValidSyntax(percentileSpec, msg) {
+    assert.commandWorked(
+        coll.runCommand("aggregate",
+                        {pipeline: [{$group: {_id: null, p: percentileSpec}}], cursor: {}}),
+        msg);
+}
+
+assertValidSyntax(
+    {$percentile: {p: [0.0, 0.0001, 0.5, 0.995, 1.0], input: "$x", algorithm: "approximate"}},
+    "Should be able to specify an array of percentiles");
+
+assertValidSyntax(
+    {$percentile: {p: [0.5, 0.9], input: {$divide: ["$x", 2]}, algorithm: "approximate"}},
+    "Should be able to specify 'input' as an expression");
+
+assertValidSyntax({$percentile: {p: [0.5, 0.9], input: "x", algorithm: "approximate"}},
+                  "Non-numeric inputs should be gracefully ignored");
+
+/**
+ * Test that the "arrayness" of the result matches the "arrayness" of the specification.
+ */
+function assertArrayness(percentileSpec, msg) {
+    const pInSpec = percentileSpec.$percentile.p;
+
+    const res = coll.aggregate([{$group: {_id: null, p: percentileSpec}}]).toArray();
+
+    assert.eq(Array.isArray(pInSpec), Array.isArray(res[0].p), msg + ` result: ${tojson(res)}`);
+    if (Array.isArray(pInSpec)) {
+        assert.eq(pInSpec.length, res[0].p.length, msg + ` result: ${tojson(res)}`);
+    }
+}
+
+assertArrayness({$percentile: {p: [0.2, 0.9], input: "$x", algorithm: "approximate"}},
+                "Computed percentiles should be in an array of length 2");
+
+assertArrayness({$percentile: {p: [0.2], input: "$x", algorithm: "approximate"}},
+                "Computed percentiles should be in an array of length 1");
+})();
diff --git a/src/mongo/db/pipeline/SConscript b/src/mongo/db/pipeline/SConscript
index 918878e266e..cf4b9d8e8d9 100644
--- a/src/mongo/db/pipeline/SConscript
+++ b/src/mongo/db/pipeline/SConscript
@@ -116,11 +116,13 @@ env.Library(
         'accumulator_merge_objects.cpp',
         'accumulator_min_max.cpp',
         'accumulator_multi.cpp',
+        'accumulator_percentile.cpp',
         'accumulator_push.cpp',
         'accumulator_rank.cpp',
         'accumulator_std_dev.cpp',
         'accumulator_sum.cpp',
         'map_reduce_options.idl',
+        'percentile_algo_sort_and_rank.cpp',
         'window_function/window_bounds.cpp',
         'window_function/window_function_covariance.cpp',
         'window_function/window_function_count.cpp',
diff --git a/src/mongo/db/pipeline/accumulator_percentile.cpp b/src/mongo/db/pipeline/accumulator_percentile.cpp
new file mode 100644
index 00000000000..659c08d526e
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_percentile.cpp
@@ -0,0 +1,174 @@
+/**
+ *    Copyright (C) 2023-present MongoDB, Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the Server Side Public License, version 1,
+ *    as published by MongoDB, Inc.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    Server Side Public License for more details.
+ *
+ *    You should have received a copy of the Server Side Public License
+ *    along with this program. If not, see
+ *    <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the Server Side Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+
+#include "mongo/db/pipeline/accumulator_percentile.h"
+
+namespace mongo {
+
+using boost::intrusive_ptr;
+
+REGISTER_ACCUMULATOR_WITH_FEATURE_FLAG(percentile,
+                                       AccumulatorPercentile::parseArgs,
+                                       feature_flags::gFeatureFlagApproxPercentiles);
+
+namespace {
+// Checks that 'pv' is an array of valid percentile specifications.
+std::vector<double> validatePercentileArg(const Value& pv) {
+    const auto msg = "'p' must be an array of numeric values from [0.0, 1.0] range, but found "_sd;
+    uassert(7429700, str::stream() << msg << pv.toString(), pv.isArray());
+
+    std::vector<double> ps;
+    for (const Value& p : pv.getArray()) {
+        uassert(7429701, str::stream() << msg << pv.toString(), p.numeric());
+        auto pd = p.coerceToDouble();
+        uassert(7429702, str::stream() << msg << pv.toString(), pd >= 0 && pd <= 1);
+        ps.push_back(pd);
+    }
+    return ps;
+}
+}  // namespace
+
+// TODO SERVER-74556: Move parsing of the args into IDL.
+AccumulationExpression AccumulatorPercentile::parseArgs(ExpressionContext* const expCtx,
+                                                        BSONElement elem,
+                                                        VariablesParseState vps) {
+    expCtx->sbeGroupCompatible = false;
+
+    uassert(7429703,
+            str::stream() << "specification must be an object; found " << elem,
+            elem.type() == BSONType::Object);
+    BSONObj obj = elem.embeddedObject();
+
+    boost::intrusive_ptr<Expression> input;
+    std::vector<double> ps;
+    std::string algo = "";
+    for (auto&& element : obj) {
+        auto fieldName = element.fieldNameStringData();
+        if (fieldName == kFieldNameInput) {
+            input = Expression::parseOperand(expCtx, element, vps);
+        } else if (fieldName == kFieldNameP) {
+            auto pv = Value(element);
+            ps = validatePercentileArg(pv);
+        } else if (fieldName == kFieldNameAlgo) {
+            if (element.type() == BSONType::String) {
+                algo = element.String();
+            }
+            // More types of percentiles will be supported in the future: PM-1883
+            uassert(7429704,
+                    str::stream()
+                        << "the valid specifications for 'algorithm' are: 'approximate'; found "
+                        << element,
+                    algo == "approximate");
+        } else {
+            uasserted(7429705,
+                      str::stream() << "Unknown argument for 'percentile' operator: " << fieldName);
+        }
+    }
+    uassert(7429706, str::stream() << "Missing value for '" << kFieldNameP << "'", !ps.empty());
+    uassert(7429707, str::stream() << "Missing value for '" << kFieldNameInput << "'", input);
+    uassert(
+        7429708, str::stream() << "Missing value for '" << kFieldNameAlgo << "'", !algo.empty());
+
+    auto factory = [expCtx, ps] {
+        // Temporary implementation! To be replaced based on the user's choice of algorithm.
+        auto algo = PercentileAlgorithm::createDiscreteSortAndRank();
+
+        return AccumulatorPercentile::create(expCtx, ps, std::move(algo));
+    };
+
+    return {ExpressionConstant::create(expCtx, Value(BSONNULL)) /*initializer*/,
+            std::move(input) /*argument*/,
+            std::move(factory),
+            "$percentile"_sd /*name*/};
+}
+
+void AccumulatorPercentile::processInternal(const Value& input, bool merging) {
+    if (merging) {
+        // Merging percentiles from different shards. For approximate percentiels 'input' is a
+        // document with a digest.
+        // TODO SERVER-74358: Support sharded collections
+        uasserted(7429709, "Sharded collections are not yet supported by $percentile");
+        return;
+    }
+
+    if (!input.numeric()) {
+        return;
+    }
+
+    _algo->incorporate(input.coerceToDouble());
+
+    // TODO SERVER-74558: Could/should we update the memory usage less often?
+    _memUsageBytes = sizeof(*this) + _algo->memUsageBytes();
+}
+
+Value AccumulatorPercentile::getValue(bool toBeMerged) {
+    if (toBeMerged) {
+        // Return a document, containing the whole digest, because they would need to be merged
+        // on mongos to compute the final result.
+        // TODO SERVER-74358: Support sharded collections
+        uasserted(7429710, "Sharded collections are not yet supported by $percentile");
+        return Value(Document{});
+    }
+
+    // Compute the percentiles for each requested one in the order listed. Computing a percentile
+    // can only fail if there have been no numeric inputs, then all percentiles would fail. Rather
+    // than returning an array of nulls in this case, we return a single null value.
+    std::vector<double> pctls;
+    for (double p : _percentiles) {
+        auto res = _algo->computePercentile(p);
+        if (!res) {
+            return Value(BSONNULL);
+        }
+        pctls.push_back(res.value());
+    }
+
+    return Value(std::vector<Value>(pctls.begin(), pctls.end()));
+}
+
+AccumulatorPercentile::AccumulatorPercentile(ExpressionContext* const expCtx,
+                                             const std::vector<double>& ps,
+                                             std::unique_ptr<PercentileAlgorithm> algo)
+    : AccumulatorState(expCtx), _percentiles(ps), _algo(std::move(algo)) {
+    _memUsageBytes = sizeof(*this) + _algo->memUsageBytes();
+}
+
+void AccumulatorPercentile::reset() {
+    // Temporary implementation! To be replaced based on the user's choice of algorithm.
+    auto algo = PercentileAlgorithm::createDiscreteSortAndRank();
+
+    _memUsageBytes = sizeof(*this) + _algo->memUsageBytes();
+}
+
+intrusive_ptr<AccumulatorState> AccumulatorPercentile::create(
+    ExpressionContext* const expCtx,
+    const std::vector<double>& ps,
+    std::unique_ptr<PercentileAlgorithm> algo) {
+    return new AccumulatorPercentile(expCtx, ps, std::move(algo));
+}
+}  // namespace mongo
diff --git a/src/mongo/db/pipeline/accumulator_percentile.h b/src/mongo/db/pipeline/accumulator_percentile.h
new file mode 100644
index 00000000000..f25724911f0
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_percentile.h
@@ -0,0 +1,82 @@
+/**
+ *    Copyright (C) 2023-present MongoDB, Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the Server Side Public License, version 1,
+ *    as published by MongoDB, Inc.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    Server Side Public License for more details.
+ *
+ *    You should have received a copy of the Server Side Public License
+ *    along with this program. If not, see
+ *    <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the Server Side Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/db/pipeline/accumulation_statement.h"
+#include "mongo/db/pipeline/percentile_algo.h"
+
+namespace mongo {
+/**
+ * Accumulator for computing $percentile.
+ */
+class AccumulatorPercentile : public AccumulatorState {
+public:
+    // {$percentile: {input: <Numeric>, p: <Array Expression>, algorithm: <String>}
+    static constexpr auto kFieldNameInput = "input"_sd;
+    static constexpr auto kFieldNameP = "p"_sd;
+    static constexpr auto kFieldNameAlgo = "algorithm"_sd;
+
+    static constexpr auto kName = "$percentile"_sd;
+    const char* getOpName() const final {
+        return kName.rawData();
+    }
+
+    /**
+     * Parsing and creating the accumulator. A separate accumulator object is created per group.
+     */
+    static AccumulationExpression parseArgs(ExpressionContext* expCtx,
+                                            BSONElement elem,
+                                            VariablesParseState vps);
+
+    static boost::intrusive_ptr<AccumulatorState> create(ExpressionContext* expCtx,
+                                                         const std::vector<double>& ps,
+                                                         std::unique_ptr<PercentileAlgorithm> algo);
+
+    AccumulatorPercentile(ExpressionContext* expCtx,
+                          const std::vector<double>& ps,
+                          std::unique_ptr<PercentileAlgorithm> algo);
+
+    /**
+     * Ingressing values and computing the requested percentiles.
+     */
+    void processInternal(const Value& input, bool merging) final;
+    Value getValue(bool toBeMerged) final;
+
+    /**
+     * Other infra for the accumulators.
+     */
+    void reset() final;
+
+private:
+    std::vector<double> _percentiles;
+    std::unique_ptr<PercentileAlgorithm> _algo;
+};
+
+}  // namespace mongo
diff --git a/src/mongo/db/pipeline/percentile_algo.h b/src/mongo/db/pipeline/percentile_algo.h
new file mode 100644
index 00000000000..ce37ec2db77
--- /dev/null
+++ b/src/mongo/db/pipeline/percentile_algo.h
@@ -0,0 +1,77 @@
+/**
+ *    Copyright (C) 2023-present MongoDB, Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the Server Side Public License, version 1,
+ *    as published by MongoDB, Inc.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    Server Side Public License for more details.
+ *
+ *    You should have received a copy of the Server Side Public License
+ *    along with this program. If not, see
+ *    <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the Server Side Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+
+#pragma once
+
+#include <boost/optional/optional.hpp>
+#include <memory>
+
+namespace mongo {
+
+/**
+ * Eventually we'll be supporting multiple types of percentiles (discrete, continuous, approximate)
+ * and potentially multiple different algorithms for computing the approximate ones.
+ *
+ * The goal is to keep these algorithms MQL- and engine-agnostic with this interface.
+ */
+struct PercentileAlgorithm {
+    virtual ~PercentileAlgorithm() {}
+
+    virtual void incorporate(double input) = 0;
+    virtual void incorporate(const std::vector<double>& inputs) = 0;
+
+    /**
+     * 'p' must be from [0, 1] range.
+     *
+     * It is always valid to call 'computePercentile()', however, if no input has been incorporated
+     * yet, all implementations must return 'boost::none'. It is allowed to incorporate more inputs
+     * after calling 'computePercentile()' and call it again (naturally, the result might differ
+     * depending on the new data).
+     *
+     * Note 1: the implementations are free to either return "none" or throw if they require setting
+     * up for computing a specific percentile but a different one is requested here.
+     *
+     * Note 2: the implementations might have different tradeoffs regarding balancing performance of
+     * ingress vs computing the percentile, so this interface provides no perfomance guarantees.
+     * Refer to the documentation of the specific implementations for details.
+     */
+    virtual boost::optional<double> computePercentile(double p) = 0;
+
+    /*
+     * The owner might need a rough estimate of how much memory the algorithm is using.
+     */
+    virtual long memUsageBytes() = 0;
+
+    /**
+     * Factory methods for instantiating concrete algorithms.
+     */
+    static std::unique_ptr<PercentileAlgorithm> createDiscreteSortAndRank();
+};
+
+}  // namespace mongo
diff --git a/src/mongo/db/pipeline/percentile_algo_sort_and_rank.cpp b/src/mongo/db/pipeline/percentile_algo_sort_and_rank.cpp
new file mode 100644
index 00000000000..921e8845f41
--- /dev/null
+++ b/src/mongo/db/pipeline/percentile_algo_sort_and_rank.cpp
@@ -0,0 +1,90 @@
+/**
+ *    Copyright (C) 2023-present MongoDB, Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the Server Side Public License, version 1,
+ *    as published by MongoDB, Inc.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    Server Side Public License for more details.
+ *
+ *    You should have received a copy of the Server Side Public License
+ *    along with this program. If not, see
+ *    <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the Server Side Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "mongo/db/pipeline/percentile_algo.h"
+
+namespace mongo {
+
+/**
+ * 'SortAndRank' algorithm for computing percentiles is accurate and doesn't require specifying a
+ * percentile in advance but is only suitable for small datasets. It accumulats all data sent to it
+ * and sorts it all when a percentile is requested. Requesting more percentiles after the first one
+ * without incorporating more data is fast as it doesn't need to sort again.
+ */
+class DiscreteSortAndRank : public PercentileAlgorithm {
+public:
+    DiscreteSortAndRank() = default;  // no config required for this algorithm
+
+    void incorporate(double input) final {
+        // Take advantage of already sorted input -- avoid resorting it later.
+        if (!_shouldSort && !_accumulatedValues.empty() && input < _accumulatedValues.back()) {
+            _shouldSort = true;
+        }
+
+        _accumulatedValues.push_back(input);
+    }
+    void incorporate(const std::vector<double>& inputs) final {
+        _shouldSort = true;
+        _accumulatedValues.reserve(_accumulatedValues.size() + inputs.size());
+        _accumulatedValues.insert(_accumulatedValues.end(), inputs.begin(), inputs.end());
+    }
+
+    boost::optional<double> computePercentile(double p) final {
+        if (_accumulatedValues.empty()) {
+            return boost::none;
+        }
+
+        if (_shouldSort) {
+            std::sort(_accumulatedValues.begin(), _accumulatedValues.end());
+            _shouldSort = false;
+        }
+
+        const double n = _accumulatedValues.size();
+        const size_t rank = static_cast<size_t>(std::max<double>(0, std::ceil(p * n) - 1));
+        return _accumulatedValues[rank];
+    }
+
+    long memUsageBytes() final {
+        return _accumulatedValues.capacity() * sizeof(double);
+    }
+
+private:
+    std::vector<double> _accumulatedValues;
+    bool _shouldSort = true;
+};
+
+std::unique_ptr<PercentileAlgorithm> PercentileAlgorithm::createDiscreteSortAndRank() {
+    return std::make_unique<DiscreteSortAndRank>();
+}
+
+}  // namespace mongo
author	Irina Yatsenko <irina.yatsenko@mongodb.com>	2023-03-03 17:49:27 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2023-03-03 23:00:36 +0000
commit	23a26b5dad45d12187b61a1e5e49c60c5be26632 (patch)
tree	15c189d6cb1488a4fbc4b0d676a8d0792cf0e10d
parent	4f07b59413fd6053ad8033934c4312fecbcbd5d9 (diff)
download	mongo-23a26b5dad45d12187b61a1e5e49c60c5be26632.tar.gz